polars-df 0.20.0-x86_64-darwin → 0.21.1-x86_64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +192 -186
  4. data/LICENSE-THIRD-PARTY.txt +1431 -1810
  5. data/LICENSE.txt +1 -1
  6. data/lib/polars/3.2/polars.bundle +0 -0
  7. data/lib/polars/3.3/polars.bundle +0 -0
  8. data/lib/polars/3.4/polars.bundle +0 -0
  9. data/lib/polars/array_expr.rb +382 -3
  10. data/lib/polars/array_name_space.rb +281 -0
  11. data/lib/polars/binary_expr.rb +67 -0
  12. data/lib/polars/binary_name_space.rb +43 -0
  13. data/lib/polars/cat_expr.rb +224 -0
  14. data/lib/polars/cat_name_space.rb +130 -32
  15. data/lib/polars/catalog/unity/catalog_info.rb +20 -0
  16. data/lib/polars/catalog/unity/column_info.rb +31 -0
  17. data/lib/polars/catalog/unity/namespace_info.rb +21 -0
  18. data/lib/polars/catalog/unity/table_info.rb +50 -0
  19. data/lib/polars/catalog.rb +448 -0
  20. data/lib/polars/config.rb +2 -2
  21. data/lib/polars/convert.rb +12 -2
  22. data/lib/polars/data_frame.rb +834 -48
  23. data/lib/polars/data_type_expr.rb +52 -0
  24. data/lib/polars/data_types.rb +61 -5
  25. data/lib/polars/date_time_expr.rb +251 -0
  26. data/lib/polars/date_time_name_space.rb +299 -0
  27. data/lib/polars/exceptions.rb +7 -2
  28. data/lib/polars/expr.rb +1247 -211
  29. data/lib/polars/functions/col.rb +6 -5
  30. data/lib/polars/functions/datatype.rb +21 -0
  31. data/lib/polars/functions/lazy.rb +127 -15
  32. data/lib/polars/functions/repeat.rb +4 -0
  33. data/lib/polars/io/csv.rb +19 -1
  34. data/lib/polars/io/json.rb +16 -0
  35. data/lib/polars/io/ndjson.rb +13 -0
  36. data/lib/polars/io/parquet.rb +70 -66
  37. data/lib/polars/io/scan_options.rb +47 -0
  38. data/lib/polars/lazy_frame.rb +1099 -95
  39. data/lib/polars/list_expr.rb +400 -11
  40. data/lib/polars/list_name_space.rb +321 -5
  41. data/lib/polars/meta_expr.rb +71 -22
  42. data/lib/polars/name_expr.rb +36 -0
  43. data/lib/polars/scan_cast_options.rb +64 -0
  44. data/lib/polars/schema.rb +84 -3
  45. data/lib/polars/selector.rb +210 -0
  46. data/lib/polars/selectors.rb +932 -203
  47. data/lib/polars/series.rb +1083 -63
  48. data/lib/polars/string_expr.rb +435 -9
  49. data/lib/polars/string_name_space.rb +729 -45
  50. data/lib/polars/struct_expr.rb +103 -0
  51. data/lib/polars/struct_name_space.rb +19 -1
  52. data/lib/polars/utils/parse.rb +40 -0
  53. data/lib/polars/utils/various.rb +18 -1
  54. data/lib/polars/utils.rb +9 -1
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars.rb +10 -0
  57. metadata +12 -2
data/lib/polars/schema.rb CHANGED
@@ -1,29 +1,110 @@
1
1
  module Polars
2
2
  class Schema
3
- def initialize(schema, check_dtypes: true)
4
- raise Todo if check_dtypes
5
- @schema = schema.to_h
3
+ # Ordered mapping of column names to their data type.
4
+ #
5
+ # @param schema [Object]
6
+ # The schema definition given by column names and their associated
7
+ # Polars data type. Accepts a mapping or an enumerable of arrays.
8
+ def initialize(schema = nil, check_dtypes: true)
9
+ input = schema || {}
10
+ @schema = {}
11
+ input.each do |name, tp|
12
+ if !check_dtypes
13
+ @schema[name] = tp
14
+ elsif Utils.is_polars_dtype(tp)
15
+ @schema[name] = _check_dtype(tp)
16
+ else
17
+ self[name] = tp
18
+ end
19
+ end
6
20
  end
7
21
 
22
+ # Returns the data type of the column.
23
+ #
24
+ # @return [Object]
8
25
  def [](key)
9
26
  @schema[key]
10
27
  end
11
28
 
29
+ # Sets the data type of the column.
30
+ #
31
+ # @return [Object]
32
+ def []=(name, dtype)
33
+ # TODO check dtype
34
+ @schema[name] = dtype
35
+ end
36
+
37
+ # Get the column names of the schema.
38
+ #
39
+ # @return [Array]
40
+ #
41
+ # @example
42
+ # s = Polars::Schema.new({"x" => Polars::Float64.new, "y" => Polars::Datetime.new(time_zone: "UTC")})
43
+ # s.names
44
+ # # => ["x", "y"]
12
45
  def names
13
46
  @schema.keys
14
47
  end
15
48
 
49
+ # Get the data types of the schema.
50
+ #
51
+ # @return [Array]
52
+ #
53
+ # @example
54
+ # s = Polars::Schema.new({"x" => Polars::UInt8.new, "y" => Polars::List.new(Polars::UInt8)})
55
+ # s.dtypes
56
+ # # => [Polars::UInt8, Polars::List(Polars::UInt8)]
16
57
  def dtypes
17
58
  @schema.values
18
59
  end
19
60
 
61
+ # Get the number of schema entries.
62
+ #
63
+ # @return [Integer]
64
+ #
65
+ # @example
66
+ # s = Polars::Schema.new({"x" => Polars::Int32.new, "y" => Polars::List.new(Polars::String)})
67
+ # s.length
68
+ # # => 2
20
69
  def length
21
70
  @schema.length
22
71
  end
23
72
 
73
+ # Returns a string representing the Schema.
74
+ #
75
+ # @return [String]
24
76
  def to_s
25
77
  "#{self.class.name}(#{@schema})"
26
78
  end
27
79
  alias_method :inspect, :to_s
80
+
81
+ # @private
82
+ def include?(name)
83
+ @schema.include?(name)
84
+ end
85
+
86
+ # @private
87
+ def to_h
88
+ @schema.to_h
89
+ end
90
+
91
+ private
92
+
93
+ def _check_dtype(tp)
94
+ if !tp.is_a?(DataType)
95
+ # note: if nested/decimal, or has signature params, this implies required args
96
+ if tp.nested? || tp.decimal? || _required_init_args(tp)
97
+ msg = "dtypes must be fully-specified, got: #{tp.inspect}"
98
+ raise TypeError, msg
99
+ end
100
+ tp = tp.new
101
+ end
102
+ tp
103
+ end
104
+
105
+ def _required_init_args(tp)
106
+ arity = tp.method(:new).arity
107
+ arity > 0 || arity < -1
108
+ end
28
109
  end
29
110
  end
@@ -0,0 +1,210 @@
1
module Polars
  # Base column selector expression/proxy.
  class Selector < Expr
    # @private
    attr_accessor :_rbselector

    # @private
    #
    # Wrap a native selector, also deriving the expression handle from it.
    def self._from_rbselector(rbselector)
      slf = new
      slf._rbselector = rbselector
      slf._rbexpr = RbExpr.new_selector(rbselector)
      slf
    end

    # Returns a string representing the Selector.
    #
    # @return [String]
    def inspect
      Expr._from_rbexpr(_rbexpr).to_s
    end

    # @private
    #
    # Build a selector matching the given Polars data types.
    # Entries that `Utils.is_polars_dtype` does not recognise are not
    # supported yet and raise `Todo`.
    def self._by_dtype(dtypes)
      concrete_dtypes = []
      dtypes.each do |dt|
        raise Todo unless Utils.is_polars_dtype(dt)

        concrete_dtypes << dt
      end

      _from_rbselector(RbSelector.by_dtype(concrete_dtypes))
    end

    # @private
    def self._by_name(names, strict:)
      _from_rbselector(RbSelector.by_name(names, strict))
    end

    # Invert the selector.
    #
    # @return [Selector]
    def ~
      Selectors.all - self
    end

    # AND.
    #
    # With another selector (or a column expression, which is converted to a
    # by-name selector) the result is the intersection of the two selectors.
    # Any other operand falls back to the expression-level `&`.
    #
    # @param other [Object]
    #   Selector, column expression, or any other expression operand.
    #
    # @return [Selector]
    def &(other)
      if Utils.is_column(other)
        colname = other.meta.output_name
        other = Selectors.by_name(colname)
      end
      if Utils.is_selector(other)
        Selector._from_rbselector(
          _rbselector.intersect(other._rbselector)
        )
      else
        as_expr & other
      end
    end

    # OR.
    #
    # With another selector (or a column expression, which is converted to a
    # by-name selector) the result is the union of the two selectors.
    # Any other operand falls back to the expression-level `|`.
    #
    # @param other [Object]
    #   Selector, column expression, or any other expression operand.
    #
    # @return [Selector]
    def |(other)
      if Utils.is_column(other)
        other = Selectors.by_name(other.meta.output_name)
      end
      if Utils.is_selector(other)
        Selector._from_rbselector(
          _rbselector.union(other._rbselector)
        )
      else
        as_expr | other
      end
    end

    # Difference.
    #
    # With another selector the result is the set difference; any other
    # operand falls back to the expression-level `-`.
    #
    # @param other [Object]
    #   Selector or any other expression operand.
    #
    # @return [Selector]
    def -(other)
      if Utils.is_selector(other)
        Selector._from_rbselector(
          _rbselector.difference(other._rbselector)
        )
      else
        as_expr - other
      end
    end

    # XOR.
    #
    # With another selector (or a column expression, which is converted to a
    # by-name selector) the result is the exclusive-or of the two selectors.
    # Any other operand falls back to the expression-level `^`.
    #
    # @param other [Object]
    #   Selector, column expression, or any other expression operand.
    #
    # @return [Selector]
    def ^(other)
      if Utils.is_column(other)
        other = Selectors.by_name(other.meta.output_name)
      end
      if Utils.is_selector(other)
        Selector._from_rbselector(
          _rbselector.exclusive_or(other._rbselector)
        )
      else
        as_expr ^ other
      end
    end

    # Exclude columns from a multi-column expression.
    #
    # Only works after a wildcard or regex column selection, and you cannot provide
    # both string column names *and* dtypes (you may prefer to use selectors instead).
    #
    # @return [Selector]
    #
    # @param columns [Object]
    #   The name or datatype of the column(s) to exclude. Accepts regular expression
    #   input. Regular expressions should start with `^` and end with `$`.
    # @param more_columns [Array]
    #   Additional names or datatypes of columns to exclude, specified as positional
    #   arguments.
    #
    # @raise [TypeError]
    #   If an item is neither a String nor a Polars data type, or if names
    #   and data types are mixed.
    def exclude(columns, *more_columns)
      exclude_cols = []
      exclude_dtypes = []
      ((columns.is_a?(::Array) ? columns : [columns]) + more_columns).each do |item|
        if item.is_a?(::String)
          exclude_cols << item
        elsif Utils.is_polars_dtype(item)
          exclude_dtypes << item
        else
          msg = (
            "invalid input for `exclude`" +
            "\n\nExpected one or more `str` or `DataType`; found #{item.inspect} instead."
          )
          raise TypeError, msg
        end
      end

      if exclude_cols.any? && exclude_dtypes.any?
        msg = "cannot exclude by both column name and dtype; use a selector instead"
        raise TypeError, msg
      elsif exclude_dtypes.any?
        self - Selectors.by_dtype(exclude_dtypes)
      else
        self - Selectors.by_name(exclude_cols, require_all: false)
      end
    end

    # Materialize the `selector` as a normal expression.
    #
    # This ensures that the operators `|`, `&`, `~` and `-`
    # are applied on the data and not on the selector sets.
    #
    # @return [Expr]
    #
    # @example Inverting the boolean selector will choose the non-boolean columns:
    #   df = Polars::DataFrame.new(
    #     {
    #       "colx" => ["aa", "bb", "cc"],
    #       "coly" => [true, false, true],
    #       "colz" => [1, 2, 3]
    #     }
    #   )
    #   df.select(~Polars.cs.boolean)
    #   # =>
    #   # shape: (3, 2)
    #   # ┌──────┬──────┐
    #   # │ colx ┆ colz │
    #   # │ ---  ┆ ---  │
    #   # │ str  ┆ i64  │
    #   # ╞══════╪══════╡
    #   # │ aa   ┆ 1    │
    #   # │ bb   ┆ 2    │
    #   # │ cc   ┆ 3    │
    #   # └──────┴──────┘
    #
    # @example To invert the *values* in the selected boolean columns, we need to materialize the selector as a standard expression instead:
    #   df.select(~Polars.cs.boolean.as_expr)
    #   # =>
    #   # shape: (3, 1)
    #   # ┌───────┐
    #   # │ coly  │
    #   # │ ---   │
    #   # │ bool  │
    #   # ╞═══════╡
    #   # │ false │
    #   # │ true  │
    #   # │ false │
    #   # └───────┘
    def as_expr
      Expr._from_rbexpr(_rbexpr)
    end
  end
end