polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +127 -1
  3. data/Cargo.lock +72 -58
  4. data/README.md +31 -27
  5. data/ext/polars/Cargo.toml +15 -6
  6. data/ext/polars/src/batched_csv.rs +35 -39
  7. data/ext/polars/src/c_api/allocator.rs +7 -0
  8. data/ext/polars/src/c_api/mod.rs +1 -0
  9. data/ext/polars/src/catalog/unity.rs +123 -101
  10. data/ext/polars/src/conversion/any_value.rs +13 -17
  11. data/ext/polars/src/conversion/chunked_array.rs +5 -5
  12. data/ext/polars/src/conversion/datetime.rs +3 -2
  13. data/ext/polars/src/conversion/mod.rs +50 -45
  14. data/ext/polars/src/dataframe/export.rs +13 -13
  15. data/ext/polars/src/dataframe/general.rs +223 -223
  16. data/ext/polars/src/dataframe/io.rs +27 -141
  17. data/ext/polars/src/dataframe/mod.rs +13 -5
  18. data/ext/polars/src/dataframe/serde.rs +1 -1
  19. data/ext/polars/src/error.rs +44 -7
  20. data/ext/polars/src/exceptions.rs +45 -12
  21. data/ext/polars/src/expr/array.rs +12 -0
  22. data/ext/polars/src/expr/datatype.rs +2 -2
  23. data/ext/polars/src/expr/datetime.rs +4 -5
  24. data/ext/polars/src/expr/general.rs +49 -13
  25. data/ext/polars/src/expr/list.rs +4 -0
  26. data/ext/polars/src/expr/meta.rs +8 -3
  27. data/ext/polars/src/expr/mod.rs +22 -6
  28. data/ext/polars/src/expr/name.rs +19 -8
  29. data/ext/polars/src/expr/rolling.rs +50 -1
  30. data/ext/polars/src/expr/string.rs +0 -1
  31. data/ext/polars/src/expr/struct.rs +7 -2
  32. data/ext/polars/src/file.rs +136 -103
  33. data/ext/polars/src/functions/aggregation.rs +9 -8
  34. data/ext/polars/src/functions/io.rs +81 -10
  35. data/ext/polars/src/functions/lazy.rs +95 -21
  36. data/ext/polars/src/functions/mod.rs +2 -0
  37. data/ext/polars/src/functions/range.rs +19 -3
  38. data/ext/polars/src/functions/strings.rs +6 -0
  39. data/ext/polars/src/functions/utils.rs +6 -0
  40. data/ext/polars/src/interop/arrow/mod.rs +50 -1
  41. data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
  42. data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
  43. data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
  44. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  45. data/ext/polars/src/lazyframe/exitable.rs +39 -0
  46. data/ext/polars/src/lazyframe/general.rs +340 -236
  47. data/ext/polars/src/lazyframe/mod.rs +46 -10
  48. data/ext/polars/src/lazyframe/optflags.rs +5 -4
  49. data/ext/polars/src/lazyframe/serde.rs +11 -3
  50. data/ext/polars/src/lazyframe/sink.rs +10 -5
  51. data/ext/polars/src/lazygroupby.rs +6 -7
  52. data/ext/polars/src/lib.rs +141 -76
  53. data/ext/polars/src/map/dataframe.rs +12 -12
  54. data/ext/polars/src/map/lazy.rs +7 -5
  55. data/ext/polars/src/map/mod.rs +15 -8
  56. data/ext/polars/src/map/series.rs +3 -3
  57. data/ext/polars/src/on_startup.rs +16 -8
  58. data/ext/polars/src/prelude.rs +1 -0
  59. data/ext/polars/src/rb_modules.rs +19 -49
  60. data/ext/polars/src/series/aggregation.rs +79 -140
  61. data/ext/polars/src/series/arithmetic.rs +16 -22
  62. data/ext/polars/src/series/comparison.rs +101 -222
  63. data/ext/polars/src/series/construction.rs +17 -18
  64. data/ext/polars/src/series/export.rs +1 -1
  65. data/ext/polars/src/series/general.rs +254 -289
  66. data/ext/polars/src/series/import.rs +17 -0
  67. data/ext/polars/src/series/map.rs +178 -160
  68. data/ext/polars/src/series/mod.rs +28 -12
  69. data/ext/polars/src/series/scatter.rs +12 -9
  70. data/ext/polars/src/sql.rs +16 -9
  71. data/ext/polars/src/testing/frame.rs +31 -0
  72. data/ext/polars/src/testing/mod.rs +5 -0
  73. data/ext/polars/src/testing/series.rs +31 -0
  74. data/ext/polars/src/timeout.rs +105 -0
  75. data/ext/polars/src/utils.rs +159 -1
  76. data/lib/polars/array_expr.rb +81 -12
  77. data/lib/polars/array_name_space.rb +74 -7
  78. data/lib/polars/batched_csv_reader.rb +21 -21
  79. data/lib/polars/binary_name_space.rb +1 -1
  80. data/lib/polars/cat_expr.rb +7 -7
  81. data/lib/polars/config.rb +1 -1
  82. data/lib/polars/convert.rb +189 -34
  83. data/lib/polars/data_frame.rb +1066 -831
  84. data/lib/polars/data_frame_plot.rb +173 -0
  85. data/lib/polars/data_type_group.rb +1 -0
  86. data/lib/polars/data_types.rb +31 -12
  87. data/lib/polars/date_time_expr.rb +51 -69
  88. data/lib/polars/date_time_name_space.rb +80 -112
  89. data/lib/polars/dynamic_group_by.rb +7 -7
  90. data/lib/polars/exceptions.rb +50 -10
  91. data/lib/polars/expr.rb +470 -517
  92. data/lib/polars/functions/aggregation/horizontal.rb +0 -1
  93. data/lib/polars/functions/aggregation/vertical.rb +2 -3
  94. data/lib/polars/functions/as_datatype.rb +290 -8
  95. data/lib/polars/functions/eager.rb +204 -10
  96. data/lib/polars/functions/escape_regex.rb +21 -0
  97. data/lib/polars/functions/lazy.rb +409 -169
  98. data/lib/polars/functions/lit.rb +17 -1
  99. data/lib/polars/functions/range/int_range.rb +74 -2
  100. data/lib/polars/functions/range/linear_space.rb +77 -0
  101. data/lib/polars/functions/range/time_range.rb +1 -1
  102. data/lib/polars/functions/repeat.rb +3 -12
  103. data/lib/polars/functions/whenthen.rb +2 -2
  104. data/lib/polars/group_by.rb +72 -20
  105. data/lib/polars/iceberg_dataset.rb +1 -6
  106. data/lib/polars/in_process_query.rb +37 -0
  107. data/lib/polars/io/cloud.rb +18 -0
  108. data/lib/polars/io/csv.rb +265 -126
  109. data/lib/polars/io/database.rb +0 -1
  110. data/lib/polars/io/delta.rb +15 -7
  111. data/lib/polars/io/ipc.rb +24 -17
  112. data/lib/polars/io/ndjson.rb +161 -24
  113. data/lib/polars/io/parquet.rb +101 -38
  114. data/lib/polars/lazy_frame.rb +849 -558
  115. data/lib/polars/lazy_group_by.rb +327 -2
  116. data/lib/polars/list_expr.rb +94 -16
  117. data/lib/polars/list_name_space.rb +88 -24
  118. data/lib/polars/meta_expr.rb +42 -1
  119. data/lib/polars/name_expr.rb +41 -4
  120. data/lib/polars/query_opt_flags.rb +198 -2
  121. data/lib/polars/rolling_group_by.rb +3 -3
  122. data/lib/polars/schema.rb +21 -3
  123. data/lib/polars/selector.rb +37 -2
  124. data/lib/polars/selectors.rb +45 -9
  125. data/lib/polars/series.rb +1156 -728
  126. data/lib/polars/series_plot.rb +72 -0
  127. data/lib/polars/slice.rb +1 -1
  128. data/lib/polars/sql_context.rb +11 -4
  129. data/lib/polars/string_expr.rb +59 -68
  130. data/lib/polars/string_name_space.rb +51 -87
  131. data/lib/polars/struct_expr.rb +36 -18
  132. data/lib/polars/testing.rb +24 -273
  133. data/lib/polars/utils/constants.rb +2 -0
  134. data/lib/polars/utils/construction/data_frame.rb +410 -0
  135. data/lib/polars/utils/construction/series.rb +364 -0
  136. data/lib/polars/utils/construction/utils.rb +9 -0
  137. data/lib/polars/utils/deprecation.rb +11 -0
  138. data/lib/polars/utils/serde.rb +8 -3
  139. data/lib/polars/utils/unstable.rb +19 -0
  140. data/lib/polars/utils/various.rb +59 -0
  141. data/lib/polars/utils.rb +46 -47
  142. data/lib/polars/version.rb +1 -1
  143. data/lib/polars.rb +47 -1
  144. metadata +25 -6
  145. data/ext/polars/src/allocator.rs +0 -13
  146. data/lib/polars/plot.rb +0 -109
@@ -0,0 +1,410 @@
1
+ module Polars
2
+ module Utils
3
# Builds an RbDataFrame from a Hash that maps column names to column values.
#
# @param data [Hash] column name => values
# @param schema [Hash, Array, nil] column names, optionally paired with
#   dtypes; when a Hash is given (and data is non-empty) the data is
#   re-keyed to follow the schema's key order
# @param schema_overrides [Hash, nil] dtypes handed to `_unpack_schema`
# @param strict [Boolean] forwarded to the Series constructors
# @param nan_to_null [Boolean, nil] forwarded to the Series constructors
# @return [RbDataFrame]
# @raise [ArgumentError] if `schema` is a Hash that does not declare every
#   key present in `data`
def self.hash_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true, nan_to_null: nil)
  if schema.is_a?(Hash) && !data.empty?
    # Test key membership, not the truthiness of the dtype: a schema entry
    # such as {"a" => nil} still declares column "a" (a nil dtype is simply
    # skipped when the schema is unpacked).
    unless data.all? { |col, _| schema.key?(col) }
      raise ArgumentError, "The given column-schema names do not match the data hash"
    end

    # Re-key the data so that it follows the schema's column order.
    data = schema.to_h { |col, _| [col, data[col]] }
  end

  column_names, schema_overrides = _unpack_schema(
    schema, lookup_names: data.keys, schema_overrides: schema_overrides
  )
  column_names = data.keys if column_names.empty?

  data_series =
    if data.empty? && !schema_overrides.empty?
      # No data at all: build one empty, typed Series per declared column.
      column_names.map do |name|
        Series.new(name, [], dtype: schema_overrides[name], strict: strict, nan_to_null: nan_to_null)._s
      end
    else
      _expand_hash_values(
        data, schema_overrides: schema_overrides, strict: strict, nan_to_null: nan_to_null
      ).values.map(&:_s)
    end

  data_series = _handle_columns_arg(data_series, columns: column_names, from_hash: true)
  RbDataFrame.new(data_series)
end
28
+
29
# Splits a schema into a list of column names and a name => dtype Hash.
#
# `schema` may be nil, a Hash (converted to [name, dtype] pairs), or an Array
# whose entries are String names or indexable [name, dtype] entries.
#
# @param schema [Hash, Array, nil]
# @param schema_overrides [Hash, nil] merged over the dtypes found in `schema`
# @param n_expected [Integer, nil] when no names are found, that many
#   "column_N" placeholder names are generated
# @param lookup_names [Array, nil] zipped positionally with the column names;
#   a non-nil entry replaces the schema name as the key of the dtype Hash
# @param include_overrides_in_columns [Boolean] accepted but not used here
# @return [Array] two elements: the column names and the dtype Hash
def self._unpack_schema(schema, schema_overrides: nil, n_expected: nil, lookup_names: nil, include_overrides_in_columns: false)
  entries = schema.is_a?(Hash) ? schema.to_a : (schema || [])

  column_names = entries.each_with_index.map do |entry, idx|
    # NOTE(review): for a String entry the `||` fallback can never fire
    # (an empty String is truthy in Ruby), so empty names are kept verbatim.
    entry.is_a?(::String) ? (entry || "column_#{idx}") : entry[0]
  end
  if column_names.empty? && n_expected
    column_names = (0...n_expected).map { |idx| "column_#{idx}" }
  end

  # TODO zip_longest
  lookup = column_names.zip(lookup_names || []).to_h

  # Collect dtypes only from non-String entries that carry a truthy dtype.
  column_dtypes = {}
  entries.each do |entry|
    next if entry.is_a?(::String) || !entry[1]

    column_dtypes[lookup[entry[0]] || entry[0]] = entry[1]
  end

  column_dtypes.merge!(schema_overrides) if schema_overrides && schema_overrides.any?

  # Anything that is not already a Polars dtype (and not nil) gets parsed.
  column_dtypes.each do |name, dtype|
    next if Utils.is_polars_dtype(dtype, include_unknown: true) || dtype.nil?

    column_dtypes[name] = Utils.parse_into_dtype(dtype)
  end

  [column_names, column_dtypes]
end
64
+
65
# Reconciles a list of series-like objects with the requested column names.
#
# @param data [Array] objects responding to `name` and `rename`
# @param columns [Array, nil] requested column names
# @param from_hash [Boolean] when true and every requested name matches an
#   existing series name, the series are reordered instead of renamed
# @return [Array] `data` itself, a reordered list, or (for empty `data`) one
#   new series per requested column
# @raise [ArgumentError] if `data` is non-empty and its length differs from
#   that of `columns`
def self._handle_columns_arg(data, columns: nil, from_hash: false)
  return data if columns.nil? || columns.empty?
  return columns.map { |name| Series.new(name, nil)._s } if data.empty?

  unless data.length == columns.length
    raise ArgumentError, "Dimensions of columns arg must match data dimensions."
  end

  if from_hash
    by_name = {}
    data.each { |series| by_name[series.name] = series }
    if columns.all? { |name| by_name.key?(name) }
      return columns.map { |name| by_name[name] }
    end
  end

  # NOTE(review): `rename` is invoked on the caller-supplied objects without
  # copying them first; if it mutates in place, the inputs are renamed too.
  columns.each_with_index { |name, idx| data[idx].rename(name) }
  data
end
89
+
90
# Applies column names and dtype casts to an already-built frame.
#
# @param rbdf frame responding to `columns`, `dtypes`, `set_column_names`
#   and `lazy`
# @param columns [Array, Hash, nil] requested schema; falls back to the
#   frame's current column names
# @param structs [Hash, nil] per-column dtypes that take precedence over the
#   unpacked dtypes (Categorical excepted)
# @param schema_overrides [Hash, nil] handed to `_unpack_schema`
# @param strict [Boolean] forwarded to each cast
# @return the frame, possibly rebuilt through a lazy cast/select pipeline
def self._post_apply_columns(rbdf, columns, structs: nil, schema_overrides: nil, strict: true)
  existing_names = rbdf.columns
  existing_dtypes = rbdf.dtypes
  columns, dtypes = _unpack_schema(
    (columns || existing_names), schema_overrides: schema_overrides
  )

  # Either the requested names are a strict leading subset of the existing
  # ones (select them later) or the frame's columns are renamed outright.
  is_leading_subset =
    columns.length < existing_names.length && columns == existing_names.first(columns.length)
  column_subset = []
  if is_leading_subset
    column_subset = columns
  elsif columns != existing_names
    rbdf.set_column_names(columns)
  end

  column_casts = []
  columns.each_with_index do |col, idx|
    current = existing_dtypes[idx]
    add_cast = lambda do |dtype|
      column_casts << Polars.col(col).cast(dtype, strict: strict)._rbexpr
    end

    if dtypes[col] == Categorical # != rbdf_dtypes[i]
      add_cast.call(Categorical)
    elsif structs&.any? && structs.include?(col) && structs[col] != current
      add_cast.call(structs[col])
    elsif dtypes.include?(col) && dtypes[col] != current
      add_cast.call(dtypes[col])
    end
  end

  return rbdf unless column_casts.any? || column_subset.any?

  lazy = rbdf.lazy
  lazy = lazy.with_columns(column_casts) if column_casts.any?
  if column_subset.any?
    lazy = lazy.select(column_subset.map { |col| Polars.col(col)._rbexpr })
  end
  lazy.collect
end
129
+
130
# Converts every value of a column Hash into a Series, broadcasting scalar
# values to the length of the longest array-like value.
#
# @param data [Hash] column name => values (array-like, Hash, or scalar)
# @param schema_overrides [Hash, nil] column name => dtype
# @param strict [Boolean] forwarded to the Series / DataFrame constructors
# @param order accepted but not used here
# @param nan_to_null [Boolean, nil] forwarded to the Series built from
#   array-like values
# @return [Hash] column name => Series; empty when `data` is empty or when a
#   zero-length input matches neither the all-empty nor the all-scalar case
def self._expand_hash_values(data, schema_overrides: nil, strict: true, order: nil, nan_to_null: false)
  updated_data = {}
  unless data.empty?
    dtypes = schema_overrides || {}
    # Length of the longest array-like value; scalars count as 0.
    array_len = data.values.map { |val| Utils.arrlen(val) || 0 }.max
    if array_len > 0
      data.each do |name, val|
        dtype = dtypes[name]
        if val.is_a?(Hash) && dtype != Struct
          updated_data[name] = DataFrame.new(val, strict: strict).to_struct(name)
        elsif !Utils.arrlen(val).nil?
          # Forward nan_to_null: it was previously accepted by this method
          # but silently dropped for array-like values.
          updated_data[name] = Series.new(::String.new(name), val, dtype: dtype, strict: strict, nan_to_null: nan_to_null)
        elsif val.nil? || [Integer, Float, TrueClass, FalseClass, ::String, ::Date, ::DateTime, ::Time].any? { |cls| val.is_a?(cls) }
          # Scalar: build a one-element Series and pad it to array_len.
          dtype = Polars::Float64 if val.nil? && dtype.nil?
          updated_data[name] = Series.new(::String.new(name), [val], dtype: dtype, strict: strict).extend_constant(val, array_len - 1)
        else
          raise Todo
        end
      end
    elsif data.values.all? { |val| Utils.arrlen(val) == 0 }
      # Every value is an empty array-like.
      data.each do |name, val|
        updated_data[name] = Series.new(name, val, dtype: dtypes[name], strict: strict)
      end
    elsif data.values.all? { |val| Utils.arrlen(val).nil? }
      # Every value is a scalar: one-row frame.
      data.each do |name, val|
        updated_data[name] = Series.new(name, [val], dtype: dtypes[name], strict: strict)
      end
    end
  end
  updated_data
end
161
+
162
# Builds a frame from a sequence, choosing the construction strategy from
# the first non-nil element.
#
# An empty sequence is delegated to `hash_to_rbdf` with an empty Hash, so
# only `schema` and `schema_overrides` are honoured in that case.
#
# @param data sequence responding to `empty?`
# @param schema [Hash, Array, nil]
# @param schema_overrides [Hash, nil]
# @param strict [Boolean]
# @param orient [String, nil]
# @param infer_schema_length [Integer, nil]
# @param nan_to_null [Boolean]
# @return whatever the selected builder returns
def self.sequence_to_rbdf(
  data,
  schema: nil,
  schema_overrides: nil,
  strict: true,
  orient: nil,
  infer_schema_length: N_INFER_DEFAULT,
  nan_to_null: false
)
  unless data.empty?
    return _sequence_to_rbdf_dispatcher(
      get_first_non_none(data),
      data,
      schema,
      schema_overrides: schema_overrides,
      strict: strict,
      orient: orient,
      infer_schema_length: infer_schema_length,
      nan_to_null: nan_to_null
    )
  end

  hash_to_rbdf({}, schema: schema, schema_overrides: schema_overrides)
end
186
+
187
# Selects the builder that matches the type of `first_element` and invokes
# it with the full set of keyword arguments.
#
# Series, Array and Hash elements each have a dedicated builder; anything
# else is handled by `_sequence_of_elements_to_rbdf`.
#
# @param first_element representative element of `data`
# @param data the full sequence
# @param schema [Hash, Array, nil]
# @return whatever the selected builder returns
def self._sequence_to_rbdf_dispatcher(
  first_element,
  data,
  schema,
  schema_overrides: nil,
  strict: true,
  orient: nil,
  infer_schema_length: nil,
  nan_to_null: false
)
  # Ordered [class, builder name] pairs; the first matching class wins.
  builders = [
    [Series, :_sequence_of_series_to_rbdf],
    [::Array, :_sequence_of_sequence_to_rbdf],
    [Hash, :_sequence_of_dict_to_rbdf]
  ]
  _, builder_name = builders.find { |klass, _| first_element.is_a?(klass) }
  builder_name ||= :_sequence_of_elements_to_rbdf

  method(builder_name).call(
    data: data,
    schema: schema,
    schema_overrides: schema_overrides,
    strict: strict,
    orient: orient,
    infer_schema_length: infer_schema_length,
    nan_to_null: nan_to_null,
    first_element: first_element
  )
end
220
+
221
# Builds an RbDataFrame from a sequence of Arrays, interpreted either as
# rows or as columns.
#
# When `orient` is nil it is inferred: without a schema the data is treated
# as columns; with a schema it is treated as rows only if the schema length
# equals the first element's length and differs from the number of elements
# (a warning is issued in that case).
#
# @param first_element [Array] first non-nil element of `data`
# @param data [Array<Array>]
# @param schema [Hash, Array, nil]
# @param schema_overrides [Hash, nil]
# @param strict [Boolean]
# @param orient [String, nil] "row", "col" or nil
# @param infer_schema_length [Integer, nil] passed to `RbDataFrame.from_rows`
# @param nan_to_null [Boolean] forwarded to Series in column orientation
# @return [RbDataFrame]
# @raise [ArgumentError] if `orient` is neither "row", "col" nor nil
def self._sequence_of_sequence_to_rbdf(
  first_element:,
  data:,
  schema:,
  schema_overrides:,
  strict:,
  orient:,
  infer_schema_length:,
  nan_to_null: false
)
  if orient.nil?
    if schema.nil?
      orient = "col"
    else
      # Try to infer orientation from schema length and data dimensions
      is_row_oriented = schema.length == first_element.length && schema.length != data.length
      orient = is_row_oriented ? "row" : "col"

      if is_row_oriented
        Utils.issue_warning(
          "Row orientation inferred during DataFrame construction." +
          ' Explicitly specify the orientation by passing `orient: "row"` to silence this warning.'
        )
      end
    end
  end

  if orient == "row"
    column_names, schema_overrides = _unpack_schema(
      schema, schema_overrides: schema_overrides, n_expected: first_element.length
    )
    local_schema_override =
      if schema_overrides
        _include_unknowns(schema_overrides, column_names)
      else
        {}
      end

    unpack_nested = false
    local_schema_override.each do |col, tp|
      if [Categorical, Enum].include?(tp)
        # Rows are loaded with String here; the requested dtype is applied
        # afterwards by _post_apply_columns.
        local_schema_override[col] = String
      elsif !unpack_nested && [Unknown, Struct].include?(tp.base_type)
        # TODO fix
        unpack_nested = false
      end
    end

    if unpack_nested
      raise Todo
    else
      rbdf = RbDataFrame.from_rows(
        data,
        infer_schema_length,
        local_schema_override
      )
    end
    if column_names.any? || schema_overrides.any?
      rbdf = _post_apply_columns(
        rbdf, column_names, schema_overrides: schema_overrides, strict: strict
      )
    end
    rbdf

  elsif orient == "col"
    column_names, schema_overrides = _unpack_schema(
      schema, schema_overrides: schema_overrides, n_expected: data.length
    )
    data_series =
      data.map.with_index do |element, i|
        Series.new(
          column_names[i],
          element,
          dtype: schema_overrides[column_names[i]],
          strict: strict,
          nan_to_null: nan_to_null
        )._s
      end
    RbDataFrame.new(data_series)

  else
    # The message used to carry Python f-string residue ("{{...}}" and
    # "None"), which Ruby printed literally.
    msg = "`orient` must be one of {'col', 'row', nil}, got #{orient.inspect}"
    raise ArgumentError, msg
  end
end
306
+
307
# Builds an RbDataFrame from a sequence of Series.
#
# Column names come from `schema` when given, otherwise from the Series'
# own names. A Series whose dtype differs from the requested one is cast.
#
# @param first_element unused
# @param data [Array<Series>]
# @param schema [Hash, Array, nil]
# @param schema_overrides [Hash, nil]
# @param strict [Boolean] forwarded to each cast
# @param kwargs remaining dispatcher arguments, ignored here
# @return [RbDataFrame]
def self._sequence_of_series_to_rbdf(
  first_element:,
  data:,
  schema:,
  schema_overrides:,
  strict:,
  **kwargs
)
  own_names = data.map(&:name)
  column_names, schema_overrides = _unpack_schema(
    schema || own_names,
    schema_overrides: schema_overrides,
    n_expected: data.length
  )

  data_series = data.each_with_index.map do |series, idx|
    target_name = column_names[idx]
    # NOTE(review): only a nil name triggers the alias; an empty-string
    # name is truthy and is left as is.
    series = series.alias(target_name) unless series.name

    wanted_dtype = schema_overrides[target_name]
    if wanted_dtype && wanted_dtype != series.dtype
      series = series.cast(wanted_dtype, strict: strict, wrap_numerical: false)
    end
    series._s
  end

  RbDataFrame.new(_handle_columns_arg(data_series, columns: column_names))
end
336
+
337
# Builds a frame from a sequence of Hashes via `RbDataFrame.from_hashes`.
#
# When the schema yields column names, a name => dtype Hash covering each
# of them (Unknown where no dtype was given) is passed along; otherwise nil.
#
# @param first_element unused
# @param data [Array<Hash>]
# @param schema [Hash, Array, nil]
# @param schema_overrides [Hash, nil]
# @param strict [Boolean]
# @param infer_schema_length [Integer, nil]
# @param kwargs remaining dispatcher arguments, ignored here
# @return the result of `RbDataFrame.from_hashes`
def self._sequence_of_dict_to_rbdf(
  first_element:,
  data:,
  schema:,
  schema_overrides:,
  strict:,
  infer_schema_length:,
  **kwargs
)
  column_names, schema_overrides = _unpack_schema(
    schema, schema_overrides: schema_overrides
  )

  # column_names is always an Array here, so it is used directly.
  dicts_schema = nil
  dicts_schema = _include_unknowns(schema_overrides, column_names) if column_names.any?

  RbDataFrame.from_hashes(
    data,
    dicts_schema,
    schema_overrides,
    strict,
    infer_schema_length
  )
end
365
+
366
# Builds a single-column RbDataFrame from a flat sequence of values.
#
# @param first_element unused
# @param data sequence of values for the single column
# @param schema [Hash, Array, nil] supplies the column name (and dtype);
#   without one, `_unpack_schema` is asked for one placeholder name
# @param schema_overrides [Hash, nil]
# @param strict [Boolean] forwarded to the Series constructor
# @param kwargs remaining dispatcher arguments, ignored here
# @return [RbDataFrame]
def self._sequence_of_elements_to_rbdf(
  first_element:,
  data:,
  schema:,
  schema_overrides:,
  strict:,
  **kwargs
)
  column_names, schema_overrides = _unpack_schema(
    schema, schema_overrides: schema_overrides, n_expected: 1
  )

  only_name = column_names[0]
  only_series = Series.new(
    only_name,
    data,
    dtype: schema_overrides[only_name],
    strict: strict
  )

  RbDataFrame.new(_handle_columns_arg([only_series._s], columns: column_names))
end
388
+
389
# Maps each column in `cols` to its entry in `schema`, substituting Unknown
# where the entry is missing (or otherwise falsy).
#
# @param schema [Hash] column name => dtype
# @param cols [Array] column names
# @return [Hash] column name => dtype, in the order of `cols`
def self._include_unknowns(schema, cols)
  cols.each_with_object({}) do |col, filled|
    filled[col] = schema[col] || Unknown
  end
end
392
+
393
# Builds a single-column RbDataFrame from one Series.
#
# The column name comes from `schema` when given, otherwise from the Series.
# If any dtype results from the schema or overrides, the first one is
# applied when it differs from the Series' dtype.
#
# @param data [Series]
# @param schema [Hash, Array, nil]
# @param schema_overrides [Hash, nil]
# @param strict [Boolean] passed positionally to the cast
# @return [RbDataFrame]
def self.series_to_rbdf(data, schema: nil, schema_overrides: nil, strict: true)
  inner = data._s
  fallback_schema = [inner.name]
  column_names, schema_overrides = _unpack_schema(
    schema || fallback_schema, schema_overrides: schema_overrides, n_expected: 1
  )

  unless schema_overrides.empty?
    wanted_dtype = schema_overrides.values.first
    inner = inner.cast(wanted_dtype, strict) if wanted_dtype != data.dtype
  end

  RbDataFrame.new(_handle_columns_arg([inner], columns: column_names))
end
409
+ end
410
+ end