polars-df 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +55 -48
  4. data/Cargo.toml +3 -0
  5. data/README.md +12 -0
  6. data/ext/polars/Cargo.toml +22 -11
  7. data/ext/polars/src/batched_csv.rs +4 -4
  8. data/ext/polars/src/catalog/unity.rs +96 -94
  9. data/ext/polars/src/conversion/any_value.rs +26 -30
  10. data/ext/polars/src/conversion/chunked_array.rs +32 -28
  11. data/ext/polars/src/conversion/datetime.rs +11 -0
  12. data/ext/polars/src/conversion/mod.rs +307 -34
  13. data/ext/polars/src/dataframe/construction.rs +4 -3
  14. data/ext/polars/src/dataframe/export.rs +17 -15
  15. data/ext/polars/src/dataframe/general.rs +15 -12
  16. data/ext/polars/src/dataframe/io.rs +1 -2
  17. data/ext/polars/src/dataframe/mod.rs +25 -1
  18. data/ext/polars/src/dataframe/serde.rs +23 -8
  19. data/ext/polars/src/exceptions.rs +8 -4
  20. data/ext/polars/src/expr/array.rs +73 -4
  21. data/ext/polars/src/expr/binary.rs +26 -1
  22. data/ext/polars/src/expr/bitwise.rs +39 -0
  23. data/ext/polars/src/expr/categorical.rs +20 -0
  24. data/ext/polars/src/expr/datatype.rs +24 -1
  25. data/ext/polars/src/expr/datetime.rs +58 -14
  26. data/ext/polars/src/expr/general.rs +87 -15
  27. data/ext/polars/src/expr/list.rs +32 -24
  28. data/ext/polars/src/expr/meta.rs +15 -6
  29. data/ext/polars/src/expr/mod.rs +3 -0
  30. data/ext/polars/src/expr/name.rs +19 -14
  31. data/ext/polars/src/expr/rolling.rs +20 -0
  32. data/ext/polars/src/expr/serde.rs +28 -0
  33. data/ext/polars/src/expr/string.rs +64 -10
  34. data/ext/polars/src/expr/struct.rs +9 -1
  35. data/ext/polars/src/file.rs +15 -9
  36. data/ext/polars/src/functions/business.rs +0 -1
  37. data/ext/polars/src/functions/io.rs +25 -3
  38. data/ext/polars/src/functions/lazy.rs +11 -6
  39. data/ext/polars/src/functions/meta.rs +3 -3
  40. data/ext/polars/src/functions/string_cache.rs +3 -3
  41. data/ext/polars/src/interop/arrow/to_ruby.rs +3 -3
  42. data/ext/polars/src/interop/numo/numo_rs.rs +4 -3
  43. data/ext/polars/src/io/mod.rs +6 -0
  44. data/ext/polars/src/lazyframe/general.rs +59 -9
  45. data/ext/polars/src/lazyframe/mod.rs +16 -1
  46. data/ext/polars/src/lazyframe/optflags.rs +58 -0
  47. data/ext/polars/src/lazyframe/serde.rs +27 -3
  48. data/ext/polars/src/lib.rs +261 -19
  49. data/ext/polars/src/map/dataframe.rs +20 -17
  50. data/ext/polars/src/map/lazy.rs +6 -5
  51. data/ext/polars/src/map/series.rs +8 -7
  52. data/ext/polars/src/on_startup.rs +12 -5
  53. data/ext/polars/src/rb_modules.rs +2 -2
  54. data/ext/polars/src/series/aggregation.rs +85 -28
  55. data/ext/polars/src/series/construction.rs +1 -0
  56. data/ext/polars/src/series/export.rs +37 -33
  57. data/ext/polars/src/series/general.rs +120 -21
  58. data/ext/polars/src/series/mod.rs +29 -4
  59. data/lib/polars/array_expr.rb +382 -3
  60. data/lib/polars/array_name_space.rb +281 -0
  61. data/lib/polars/binary_expr.rb +67 -0
  62. data/lib/polars/binary_name_space.rb +43 -0
  63. data/lib/polars/cat_expr.rb +224 -0
  64. data/lib/polars/cat_name_space.rb +138 -0
  65. data/lib/polars/config.rb +2 -2
  66. data/lib/polars/convert.rb +6 -6
  67. data/lib/polars/data_frame.rb +794 -27
  68. data/lib/polars/data_type_expr.rb +52 -0
  69. data/lib/polars/data_types.rb +26 -5
  70. data/lib/polars/date_time_expr.rb +252 -1
  71. data/lib/polars/date_time_name_space.rb +299 -0
  72. data/lib/polars/expr.rb +1248 -206
  73. data/lib/polars/functions/business.rb +95 -0
  74. data/lib/polars/functions/datatype.rb +21 -0
  75. data/lib/polars/functions/lazy.rb +14 -1
  76. data/lib/polars/io/csv.rb +1 -1
  77. data/lib/polars/io/iceberg.rb +27 -0
  78. data/lib/polars/io/json.rb +4 -4
  79. data/lib/polars/io/ndjson.rb +4 -4
  80. data/lib/polars/io/parquet.rb +32 -7
  81. data/lib/polars/io/scan_options.rb +4 -1
  82. data/lib/polars/lazy_frame.rb +1028 -28
  83. data/lib/polars/list_expr.rb +217 -17
  84. data/lib/polars/list_name_space.rb +231 -22
  85. data/lib/polars/meta_expr.rb +89 -0
  86. data/lib/polars/name_expr.rb +36 -0
  87. data/lib/polars/query_opt_flags.rb +50 -0
  88. data/lib/polars/scan_cast_options.rb +20 -1
  89. data/lib/polars/schema.rb +79 -3
  90. data/lib/polars/selector.rb +72 -0
  91. data/lib/polars/selectors.rb +3 -3
  92. data/lib/polars/series.rb +1053 -54
  93. data/lib/polars/string_expr.rb +436 -32
  94. data/lib/polars/string_name_space.rb +736 -50
  95. data/lib/polars/struct_expr.rb +103 -0
  96. data/lib/polars/struct_name_space.rb +19 -1
  97. data/lib/polars/utils/serde.rb +17 -0
  98. data/lib/polars/utils/various.rb +22 -1
  99. data/lib/polars/utils.rb +5 -1
  100. data/lib/polars/version.rb +1 -1
  101. data/lib/polars.rb +6 -0
  102. metadata +11 -1
@@ -97,6 +97,70 @@ module Polars
97
97
  _rbexpr.meta_is_regex_projection
98
98
  end
99
99
 
100
+ # Indicate if this expression only selects columns (optionally with aliasing).
101
+ #
102
+ # This can include bare columns, columns matched by regex or dtype, selectors
103
+ # and exclude ops, and (optionally) column/expression aliasing.
104
+ #
105
+ # @param allow_aliasing [Boolean]
106
+ # If false (default), any aliasing is not considered to be column selection.
107
+ # Set true to allow for column selection that also includes aliasing.
108
+ #
109
+ # @return [Boolean]
110
+ #
111
+ # @example
112
+ # e = Polars.col("foo")
113
+ # e.meta.is_column_selection
114
+ # # => true
115
+ #
116
+ # @example
117
+ # e = Polars.col("foo").alias("bar")
118
+ # e.meta.is_column_selection
119
+ # # => false
120
+ #
121
+ # @example
122
+ # e.meta.is_column_selection(allow_aliasing: true)
123
+ # # => true
124
+ #
125
+ # @example
126
+ # e = Polars.col("foo") * Polars.col("bar")
127
+ # e.meta.is_column_selection
128
+ # # => false
129
+ #
130
+ # @example
131
+ # e = Polars.cs.starts_with("foo")
132
+ # e.meta.is_column_selection
133
+ # # => true
134
+ #
135
+ # @example
136
+ # e = Polars.cs.starts_with("foo").exclude("foo!")
137
+ # e.meta.is_column_selection
138
+ # # => true
139
+ def is_column_selection(allow_aliasing: false)
140
+ _rbexpr.meta_is_column_selection(allow_aliasing)
141
+ end
142
+
143
+ # Indicate if this expression is a literal value (optionally aliased).
144
+ #
145
+ # @param allow_aliasing [Boolean]
146
+ # If false (default), only a bare literal will match.
147
+ # Set true to also allow for aliased literals.
148
+ #
149
+ # @return [Boolean]
150
+ #
151
+ # @example
152
+ # e = Polars.lit(123)
153
+ # e.meta.is_literal
154
+ # # => true
155
+ #
156
+ # @example
157
+ # e = Polars.lit(987.654321).alias("foo")
158
+ # e.meta.is_literal
159
+ # # => false
160
+ def is_literal(allow_aliasing: false)
161
+ _rbexpr.meta_is_literal(allow_aliasing)
162
+ end
163
+
100
164
  # Get the column name that this expression would produce.
101
165
  #
102
166
  # @return [String]
@@ -184,6 +248,31 @@ module Polars
184
248
  Selector._from_rbselector(_rbexpr.into_selector)
185
249
  end
186
250
 
251
+ # Serialize this expression to a file or string.
252
+ #
253
+ # @param file [Object]
254
+ # File path to which the result should be written. If set to `nil`
255
+ # (default), the output is returned as a string instead.
256
+ #
257
+ # @return [Object]
258
+ #
259
+ # @note
260
+ # Serialization is not stable across Polars versions: an expression serialized
261
+ # in one Polars version may not be deserializable in another Polars version.
262
+ #
263
+ # @example Serialize the expression into a binary representation.
264
+ # expr = Polars.col("foo").sum.over("bar")
265
+ # bytes = expr.meta.serialize
266
+ # Polars::Expr.deserialize(StringIO.new(bytes))
267
+ # # => col("foo").sum().over([col("bar")])
268
+ def serialize(file = nil)
269
+ raise Todo unless _rbexpr.respond_to?(:serialize_binary)
270
+
271
+ serializer = _rbexpr.method(:serialize_binary)
272
+
273
+ Utils.serialize_polars_object(serializer, file)
274
+ end
275
+
187
276
  # Format the expression as a tree.
188
277
  #
189
278
  # @param return_as_string [Boolean]
@@ -194,5 +194,41 @@ module Polars
194
194
  def to_uppercase
195
195
  Utils.wrap_expr(_rbexpr.name_to_uppercase)
196
196
  end
197
+
198
+ # Add a prefix to all field names of a struct.
199
+ #
200
+ # @note
201
+ # This only takes effect for struct columns.
202
+ #
203
+ # @param prefix [String]
204
+ # Prefix to add to the field name.
205
+ #
206
+ # @return [Expr]
207
+ #
208
+ # @example
209
+ # df = Polars::DataFrame.new({"x" => {"a" => 1, "b" => 2}})
210
+ # df.select(Polars.col("x").name.prefix_fields("prefix_")).schema
211
+ # # => {"x"=>Polars::Struct({"prefix_a"=>Polars::Int64, "prefix_b"=>Polars::Int64})}
212
+ def prefix_fields(prefix)
213
+ Utils.wrap_expr(_rbexpr.name_prefix_fields(prefix))
214
+ end
215
+
216
+ # Add a suffix to all field names of a struct.
217
+ #
218
+ # @note
219
+ # This only takes effect for struct columns.
220
+ #
221
+ # @param suffix [String]
222
+ # Suffix to add to the field name.
223
+ #
224
+ # @return [Expr]
225
+ #
226
+ # @example
227
+ # df = Polars::DataFrame.new({"x" => {"a" => 1, "b" => 2}})
228
+ # df.select(Polars.col("x").name.suffix_fields("_suffix")).schema
229
+ # # => {"x"=>Polars::Struct({"a_suffix"=>Polars::Int64, "b_suffix"=>Polars::Int64})}
230
+ def suffix_fields(suffix)
231
+ Utils.wrap_expr(_rbexpr.name_suffix_fields(suffix))
232
+ end
197
233
  end
198
234
  end
@@ -0,0 +1,50 @@
1
+ module Polars
2
+ # The set of optimizations considered during query optimization.
3
+ #
4
+ # @note
5
+ # This functionality is considered **unstable**. It may be changed
6
+ # at any point without it being considered a breaking change.
7
+ class QueryOptFlags
8
+ def initialize(
9
+ predicate_pushdown: nil,
10
+ projection_pushdown: nil,
11
+ simplify_expression: nil,
12
+ slice_pushdown: nil,
13
+ comm_subplan_elim: nil,
14
+ comm_subexpr_elim: nil,
15
+ cluster_with_columns: nil,
16
+ collapse_joins: nil,
17
+ check_order_observe: nil,
18
+ fast_projection: nil
19
+ )
20
+ @_rboptflags = RbOptFlags.default
21
+ update(
22
+ predicate_pushdown: predicate_pushdown,
23
+ projection_pushdown: projection_pushdown,
24
+ simplify_expression: simplify_expression,
25
+ slice_pushdown: slice_pushdown,
26
+ comm_subplan_elim: comm_subplan_elim,
27
+ comm_subexpr_elim: comm_subexpr_elim,
28
+ cluster_with_columns: cluster_with_columns,
29
+ collapse_joins: collapse_joins,
30
+ check_order_observe: check_order_observe,
31
+ fast_projection: fast_projection
32
+ )
33
+ end
34
+
35
+ def update(
36
+ predicate_pushdown: nil,
37
+ projection_pushdown: nil,
38
+ simplify_expression: nil,
39
+ slice_pushdown: nil,
40
+ comm_subplan_elim: nil,
41
+ comm_subexpr_elim: nil,
42
+ cluster_with_columns: nil,
43
+ collapse_joins: nil,
44
+ check_order_observe: nil,
45
+ fast_projection: nil
46
+ )
47
+ raise Todo
48
+ end
49
+ end
50
+ end
@@ -1,6 +1,8 @@
1
1
  module Polars
2
2
  # Options for scanning files.
3
3
  class ScanCastOptions
4
+ attr_reader :integer_cast, :float_cast, :datetime_cast, :missing_struct_fields, :extra_struct_fields
5
+
4
6
  # Common configuration for scanning files.
5
7
  #
6
8
  # @note
@@ -50,6 +52,10 @@ module Polars
50
52
  extra_struct_fields: "raise",
51
53
  _internal_call: false
52
54
  )
55
+ if !_internal_call
56
+ warn "ScanCastOptions is considered unstable."
57
+ end
58
+
53
59
  @integer_cast = integer_cast
54
60
  @float_cast = float_cast
55
61
  @datetime_cast = datetime_cast
@@ -57,8 +63,21 @@ module Polars
57
63
  @extra_struct_fields = extra_struct_fields
58
64
  end
59
65
 
60
- def self.default
66
+ def self._default
61
67
  new(_internal_call: true)
62
68
  end
69
+
70
+ def self._default_iceberg
71
+ @_default_cast_options_iceberg ||= begin
72
+ ScanCastOptions.new(
73
+ integer_cast: "upcast",
74
+ float_cast: ["upcast", "downcast"],
75
+ datetime_cast: ["nanosecond-downcast", "convert-timezone"],
76
+ missing_struct_fields: "insert",
77
+ extra_struct_fields: "ignore",
78
+ _internal_call: true
79
+ )
80
+ end
81
+ end
63
82
  end
64
83
  end
data/lib/polars/schema.rb CHANGED
@@ -1,34 +1,110 @@
1
1
  module Polars
2
2
  class Schema
3
+ # Ordered mapping of column names to their data type.
4
+ #
5
+ # @param schema [Object]
6
+ # The schema definition given by column names and their associated
7
+ # Polars data type. Accepts a mapping or an enumerable of arrays.
3
8
  def initialize(schema = nil, check_dtypes: true)
4
- raise Todo if check_dtypes
5
- @schema = schema.to_h
9
+ input = schema || {}
10
+ @schema = {}
11
+ input.each do |name, tp|
12
+ if !check_dtypes
13
+ @schema[name] = tp
14
+ elsif Utils.is_polars_dtype(tp)
15
+ @schema[name] = _check_dtype(tp)
16
+ else
17
+ self[name] = tp
18
+ end
19
+ end
6
20
  end
7
21
 
22
+ # Returns the data type of the column.
23
+ #
24
+ # @return [Object]
8
25
  def [](key)
9
26
  @schema[key]
10
27
  end
11
28
 
29
+ # Sets the data type of the column.
30
+ #
31
+ # @return [Object]
12
32
  def []=(name, dtype)
13
- # TODO check dtype if needed
33
+ _check_dtype(dtype)
14
34
  @schema[name] = dtype
15
35
  end
16
36
 
37
+ # Get the column names of the schema.
38
+ #
39
+ # @return [Array]
40
+ #
41
+ # @example
42
+ # s = Polars::Schema.new({"x" => Polars::Float64.new, "y" => Polars::Datetime.new(time_zone: "UTC")})
43
+ # s.names
44
+ # # => ["x", "y"]
17
45
  def names
18
46
  @schema.keys
19
47
  end
20
48
 
49
+ # Get the data types of the schema.
50
+ #
51
+ # @return [Array]
52
+ #
53
+ # @example
54
+ # s = Polars::Schema.new({"x" => Polars::UInt8.new, "y" => Polars::List.new(Polars::UInt8)})
55
+ # s.dtypes
56
+ # # => [Polars::UInt8, Polars::List(Polars::UInt8)]
21
57
  def dtypes
22
58
  @schema.values
23
59
  end
24
60
 
61
+ # Get the number of schema entries.
62
+ #
63
+ # @return [Integer]
64
+ #
65
+ # @example
66
+ # s = Polars::Schema.new({"x" => Polars::Int32.new, "y" => Polars::List.new(Polars::String)})
67
+ # s.length
68
+ # # => 2
25
69
  def length
26
70
  @schema.length
27
71
  end
28
72
 
73
+ # Returns a string representing the Schema.
74
+ #
75
+ # @return [String]
29
76
  def to_s
30
77
  "#{self.class.name}(#{@schema})"
31
78
  end
32
79
  alias_method :inspect, :to_s
80
+
81
+ # @private
82
+ def include?(name)
83
+ @schema.include?(name)
84
+ end
85
+
86
+ # @private
87
+ def to_h
88
+ @schema.to_h
89
+ end
90
+
91
+ private
92
+
93
+ def _check_dtype(tp)
94
+ if !tp.is_a?(DataType)
95
+ # note: if nested/decimal, or has signature params, this implies required args
96
+ if tp.nested? || tp.decimal? || _required_init_args(tp)
97
+ msg = "dtypes must be fully-specified, got: #{tp.inspect}"
98
+ raise TypeError, msg
99
+ end
100
+ tp = tp.new
101
+ end
102
+ tp
103
+ end
104
+
105
+ def _required_init_args(tp)
106
+ arity = tp.method(:new).arity
107
+ arity > 0 || arity < -1
108
+ end
33
109
  end
34
110
  end
@@ -12,6 +12,9 @@ module Polars
12
12
  slf
13
13
  end
14
14
 
15
+ # Returns a string representing the Selector.
16
+ #
17
+ # @return [String]
15
18
  def inspect
16
19
  Expr._from_rbexpr(_rbexpr).to_s
17
20
  end
@@ -50,10 +53,16 @@ module Polars
50
53
  _from_rbselector(RbSelector.by_name(names, strict))
51
54
  end
52
55
 
56
+ # Invert the selector.
57
+ #
58
+ # @return [Selector]
53
59
  def ~
54
60
  Selectors.all - self
55
61
  end
56
62
 
63
+ # AND.
64
+ #
65
+ # @return [Selector]
57
66
  def &(other)
58
67
  if Utils.is_column(other)
59
68
  colname = other.meta.output_name
@@ -68,6 +77,9 @@ module Polars
68
77
  end
69
78
  end
70
79
 
80
+ # OR.
81
+ #
82
+ # @return [Selector]
71
83
  def |(other)
72
84
  if Utils.is_column(other)
73
85
  other = by_name(other.meta.output_name)
@@ -81,6 +93,9 @@ module Polars
81
93
  end
82
94
  end
83
95
 
96
+ # Difference.
97
+ #
98
+ # @return [Selector]
84
99
  def -(other)
85
100
  if Utils.is_selector(other)
86
101
  Selector._from_rbselector(
@@ -91,6 +106,9 @@ module Polars
91
106
  end
92
107
  end
93
108
 
109
+ # XOR.
110
+ #
111
+ # @return [Selector]
94
112
  def ^(other)
95
113
  if Utils.is_column(other)
96
114
  other = by_name(other.meta.output_name)
@@ -104,6 +122,19 @@ module Polars
104
122
  end
105
123
  end
106
124
 
125
+ # Exclude columns from a multi-column expression.
126
+ #
127
+ # Only works after a wildcard or regex column selection, and you cannot provide
128
+ # both string column names *and* dtypes (you may prefer to use selectors instead).
129
+ #
130
+ # @return [Selector]
131
+ #
132
+ # @param columns [Object]
133
+ # The name or datatype of the column(s) to exclude. Accepts regular expression
134
+ # input. Regular expressions should start with `^` and end with `$`.
135
+ # @param more_columns [Array]
136
+ # Additional names or datatypes of columns to exclude, specified as positional
137
+ # arguments.
107
138
  def exclude(columns, *more_columns)
108
139
  exclude_cols = []
109
140
  exclude_dtypes = []
@@ -131,6 +162,47 @@ module Polars
131
162
  end
132
163
  end
133
164
 
165
+ # Materialize the `selector` as a normal expression.
166
+ #
167
+ # This ensures that the operators `|`, `&`, `~` and `-`
168
+ # are applied on the data and not on the selector sets.
169
+ #
170
+ # @return [Expr]
171
+ #
172
+ # @example Inverting the boolean selector will choose the non-boolean columns:
173
+ # df = Polars::DataFrame.new(
174
+ # {
175
+ # "colx" => ["aa", "bb", "cc"],
176
+ # "coly" => [true, false, true],
177
+ # "colz" => [1, 2, 3]
178
+ # }
179
+ # )
180
+ # df.select(~Polars.cs.boolean)
181
+ # # =>
182
+ # # shape: (3, 2)
183
+ # # ┌──────┬──────┐
184
+ # # │ colx ┆ colz │
185
+ # # │ --- ┆ --- │
186
+ # # │ str ┆ i64 │
187
+ # # ╞══════╪══════╡
188
+ # # │ aa ┆ 1 │
189
+ # # │ bb ┆ 2 │
190
+ # # │ cc ┆ 3 │
191
+ # # └──────┴──────┘
192
+ #
193
+ # @example To invert the *values* in the selected boolean columns, we need to materialize the selector as a standard expression instead:
194
+ # df.select(~Polars.cs.boolean.as_expr)
195
+ # # =>
196
+ # # shape: (3, 1)
197
+ # # ┌───────┐
198
+ # # │ coly │
199
+ # # │ --- │
200
+ # # │ bool │
201
+ # # ╞═══════╡
202
+ # # │ false │
203
+ # # │ true │
204
+ # # │ false │
205
+ # # └───────┘
134
206
  def as_expr
135
207
  Expr._from_rbexpr(_rbexpr)
136
208
  end
@@ -287,7 +287,7 @@ module Polars
287
287
  # # │ b"hello" ┆ world ┆ b"!" ┆ :) │
288
288
  # # └──────────┴───────┴────────┴─────┘
289
289
  #
290
- # @example Select binary columns and export as a dict:
290
+ # @example Select binary columns and export as a hash:
291
291
  # df.select(Polars.cs.binary).to_h(as_series: false)
292
292
  # # => {"a"=>["hello"], "c"=>["!"]}
293
293
  #
@@ -628,7 +628,7 @@ module Polars
628
628
  # # └──────┘
629
629
  #
630
630
  # @example Select all columns *except* for those that are enum:
631
- # df.select(~Polars.cs.enum())
631
+ # df.select(~Polars.cs.enum)
632
632
  # # =>
633
633
  # # shape: (2, 2)
634
634
  # # ┌─────┬─────┐
@@ -928,7 +928,7 @@ module Polars
928
928
  # # │ 456 ┆ 5.5 │
929
929
  # # └─────┴─────┘
930
930
  def self.categorical
931
- Selector._from_rbselector(RbSelector.categorical())
931
+ Selector._from_rbselector(RbSelector.categorical)
932
932
  end
933
933
 
934
934
  # Select columns whose names contain the given literal substring(s).