polars-df 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +127 -1
  3. data/Cargo.lock +72 -58
  4. data/README.md +31 -27
  5. data/ext/polars/Cargo.toml +15 -6
  6. data/ext/polars/src/batched_csv.rs +35 -39
  7. data/ext/polars/src/c_api/allocator.rs +7 -0
  8. data/ext/polars/src/c_api/mod.rs +1 -0
  9. data/ext/polars/src/catalog/unity.rs +123 -101
  10. data/ext/polars/src/conversion/any_value.rs +13 -17
  11. data/ext/polars/src/conversion/chunked_array.rs +5 -5
  12. data/ext/polars/src/conversion/datetime.rs +3 -2
  13. data/ext/polars/src/conversion/mod.rs +50 -45
  14. data/ext/polars/src/dataframe/export.rs +13 -13
  15. data/ext/polars/src/dataframe/general.rs +223 -223
  16. data/ext/polars/src/dataframe/io.rs +27 -141
  17. data/ext/polars/src/dataframe/mod.rs +13 -5
  18. data/ext/polars/src/dataframe/serde.rs +1 -1
  19. data/ext/polars/src/error.rs +44 -7
  20. data/ext/polars/src/exceptions.rs +45 -12
  21. data/ext/polars/src/expr/array.rs +12 -0
  22. data/ext/polars/src/expr/datatype.rs +2 -2
  23. data/ext/polars/src/expr/datetime.rs +4 -5
  24. data/ext/polars/src/expr/general.rs +49 -13
  25. data/ext/polars/src/expr/list.rs +4 -0
  26. data/ext/polars/src/expr/meta.rs +8 -3
  27. data/ext/polars/src/expr/mod.rs +22 -6
  28. data/ext/polars/src/expr/name.rs +19 -8
  29. data/ext/polars/src/expr/rolling.rs +50 -1
  30. data/ext/polars/src/expr/string.rs +0 -1
  31. data/ext/polars/src/expr/struct.rs +7 -2
  32. data/ext/polars/src/file.rs +136 -103
  33. data/ext/polars/src/functions/aggregation.rs +9 -8
  34. data/ext/polars/src/functions/io.rs +81 -10
  35. data/ext/polars/src/functions/lazy.rs +95 -21
  36. data/ext/polars/src/functions/mod.rs +2 -0
  37. data/ext/polars/src/functions/range.rs +19 -3
  38. data/ext/polars/src/functions/strings.rs +6 -0
  39. data/ext/polars/src/functions/utils.rs +6 -0
  40. data/ext/polars/src/interop/arrow/mod.rs +50 -1
  41. data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
  42. data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
  43. data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
  44. data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
  45. data/ext/polars/src/lazyframe/exitable.rs +39 -0
  46. data/ext/polars/src/lazyframe/general.rs +340 -236
  47. data/ext/polars/src/lazyframe/mod.rs +46 -10
  48. data/ext/polars/src/lazyframe/optflags.rs +5 -4
  49. data/ext/polars/src/lazyframe/serde.rs +11 -3
  50. data/ext/polars/src/lazyframe/sink.rs +10 -5
  51. data/ext/polars/src/lazygroupby.rs +6 -7
  52. data/ext/polars/src/lib.rs +141 -76
  53. data/ext/polars/src/map/dataframe.rs +12 -12
  54. data/ext/polars/src/map/lazy.rs +7 -5
  55. data/ext/polars/src/map/mod.rs +15 -8
  56. data/ext/polars/src/map/series.rs +3 -3
  57. data/ext/polars/src/on_startup.rs +16 -8
  58. data/ext/polars/src/prelude.rs +1 -0
  59. data/ext/polars/src/rb_modules.rs +19 -49
  60. data/ext/polars/src/series/aggregation.rs +79 -140
  61. data/ext/polars/src/series/arithmetic.rs +16 -22
  62. data/ext/polars/src/series/comparison.rs +101 -222
  63. data/ext/polars/src/series/construction.rs +17 -18
  64. data/ext/polars/src/series/export.rs +1 -1
  65. data/ext/polars/src/series/general.rs +254 -289
  66. data/ext/polars/src/series/import.rs +17 -0
  67. data/ext/polars/src/series/map.rs +178 -160
  68. data/ext/polars/src/series/mod.rs +28 -12
  69. data/ext/polars/src/series/scatter.rs +12 -9
  70. data/ext/polars/src/sql.rs +16 -9
  71. data/ext/polars/src/testing/frame.rs +31 -0
  72. data/ext/polars/src/testing/mod.rs +5 -0
  73. data/ext/polars/src/testing/series.rs +31 -0
  74. data/ext/polars/src/timeout.rs +105 -0
  75. data/ext/polars/src/utils.rs +159 -1
  76. data/lib/polars/array_expr.rb +81 -12
  77. data/lib/polars/array_name_space.rb +74 -7
  78. data/lib/polars/batched_csv_reader.rb +21 -21
  79. data/lib/polars/binary_name_space.rb +1 -1
  80. data/lib/polars/cat_expr.rb +7 -7
  81. data/lib/polars/config.rb +1 -1
  82. data/lib/polars/convert.rb +189 -34
  83. data/lib/polars/data_frame.rb +1066 -831
  84. data/lib/polars/data_frame_plot.rb +173 -0
  85. data/lib/polars/data_type_group.rb +1 -0
  86. data/lib/polars/data_types.rb +31 -12
  87. data/lib/polars/date_time_expr.rb +51 -69
  88. data/lib/polars/date_time_name_space.rb +80 -112
  89. data/lib/polars/dynamic_group_by.rb +7 -7
  90. data/lib/polars/exceptions.rb +50 -10
  91. data/lib/polars/expr.rb +470 -517
  92. data/lib/polars/functions/aggregation/horizontal.rb +0 -1
  93. data/lib/polars/functions/aggregation/vertical.rb +2 -3
  94. data/lib/polars/functions/as_datatype.rb +290 -8
  95. data/lib/polars/functions/eager.rb +204 -10
  96. data/lib/polars/functions/escape_regex.rb +21 -0
  97. data/lib/polars/functions/lazy.rb +409 -169
  98. data/lib/polars/functions/lit.rb +17 -1
  99. data/lib/polars/functions/range/int_range.rb +74 -2
  100. data/lib/polars/functions/range/linear_space.rb +77 -0
  101. data/lib/polars/functions/range/time_range.rb +1 -1
  102. data/lib/polars/functions/repeat.rb +3 -12
  103. data/lib/polars/functions/whenthen.rb +2 -2
  104. data/lib/polars/group_by.rb +72 -20
  105. data/lib/polars/iceberg_dataset.rb +1 -6
  106. data/lib/polars/in_process_query.rb +37 -0
  107. data/lib/polars/io/cloud.rb +18 -0
  108. data/lib/polars/io/csv.rb +265 -126
  109. data/lib/polars/io/database.rb +0 -1
  110. data/lib/polars/io/delta.rb +15 -7
  111. data/lib/polars/io/ipc.rb +24 -17
  112. data/lib/polars/io/ndjson.rb +161 -24
  113. data/lib/polars/io/parquet.rb +101 -38
  114. data/lib/polars/lazy_frame.rb +849 -558
  115. data/lib/polars/lazy_group_by.rb +327 -2
  116. data/lib/polars/list_expr.rb +94 -16
  117. data/lib/polars/list_name_space.rb +88 -24
  118. data/lib/polars/meta_expr.rb +42 -1
  119. data/lib/polars/name_expr.rb +41 -4
  120. data/lib/polars/query_opt_flags.rb +198 -2
  121. data/lib/polars/rolling_group_by.rb +3 -3
  122. data/lib/polars/schema.rb +21 -3
  123. data/lib/polars/selector.rb +37 -2
  124. data/lib/polars/selectors.rb +45 -9
  125. data/lib/polars/series.rb +1156 -728
  126. data/lib/polars/series_plot.rb +72 -0
  127. data/lib/polars/slice.rb +1 -1
  128. data/lib/polars/sql_context.rb +11 -4
  129. data/lib/polars/string_expr.rb +59 -68
  130. data/lib/polars/string_name_space.rb +51 -87
  131. data/lib/polars/struct_expr.rb +36 -18
  132. data/lib/polars/testing.rb +24 -273
  133. data/lib/polars/utils/constants.rb +2 -0
  134. data/lib/polars/utils/construction/data_frame.rb +410 -0
  135. data/lib/polars/utils/construction/series.rb +364 -0
  136. data/lib/polars/utils/construction/utils.rb +9 -0
  137. data/lib/polars/utils/deprecation.rb +11 -0
  138. data/lib/polars/utils/serde.rb +8 -3
  139. data/lib/polars/utils/unstable.rb +19 -0
  140. data/lib/polars/utils/various.rb +59 -0
  141. data/lib/polars/utils.rb +46 -47
  142. data/lib/polars/version.rb +1 -1
  143. data/lib/polars.rb +47 -1
  144. metadata +25 -6
  145. data/ext/polars/src/allocator.rs +0 -13
  146. data/lib/polars/plot.rb +0 -109
@@ -40,50 +40,22 @@ module Polars
40
40
  categorical_as_str: false
41
41
  )
42
42
  lazy = _assert_correct_input_type(left, right)
43
- objects = lazy ? "LazyFrames" : "DataFrames"
44
-
45
- _assert_frame_schema_equal(
46
- left,
47
- right,
48
- check_column_order: check_column_order,
49
- check_dtype: check_dtype,
50
- objects: objects,
51
- )
52
43
 
53
44
  if lazy
54
45
  left, right = left.collect, right.collect
55
46
  end
56
47
 
57
- if left.height != right.height
58
- raise_assertion_error(
59
- objects, "number of rows does not match", left.height, right.height
60
- )
61
- end
62
-
63
- if !check_row_order
64
- left, right = _sort_dataframes(left, right)
65
- end
66
-
67
- left.columns.each do |c|
68
- s_left, s_right = left.get_column(c), right.get_column(c)
69
- begin
70
- _assert_series_values_equal(
71
- s_left,
72
- s_right,
73
- check_exact: check_exact,
74
- rtol: rtol,
75
- atol: atol,
76
- categorical_as_str: categorical_as_str
77
- )
78
- rescue AssertionError
79
- raise_assertion_error(
80
- objects,
81
- "value mismatch for column #{c.inspect}",
82
- s_left.to_a,
83
- s_right.to_a
84
- )
85
- end
86
- end
48
+ Plr.assert_dataframe_equal_rb(
49
+ left._df,
50
+ right._df,
51
+ check_row_order,
52
+ check_column_order,
53
+ check_dtype,
54
+ check_exact,
55
+ rtol,
56
+ atol,
57
+ categorical_as_str,
58
+ )
87
59
  end
88
60
 
89
61
  # Assert that the left and right frame are **not** equal.
@@ -157,6 +129,8 @@ module Polars
157
129
  # Require data types to match.
158
130
  # @param check_names [Boolean]
159
131
  # Require names to match.
132
+ # @param check_order [Boolean]
133
+ # Requires elements to appear in the same order.
160
134
  # @param check_exact [Boolean]
161
135
  # Require float values to match exactly. If set to `false`, values are considered
162
136
  # equal when within tolerance of each other (see `rtol` and `atol`).
@@ -176,6 +150,7 @@ module Polars
176
150
  right,
177
151
  check_dtype: true,
178
152
  check_names: true,
153
+ check_order: true,
179
154
  check_exact: false,
180
155
  rtol: 1e-5,
181
156
  atol: 1e-8,
@@ -190,25 +165,16 @@ module Polars
190
165
  )
191
166
  end
192
167
 
193
- if left.len != right.len
194
- raise_assertion_error("Series", "length mismatch", left.len, right.len)
195
- end
196
-
197
- if check_names && left.name != right.name
198
- raise_assertion_error("Series", "name mismatch", left.name, right.name)
199
- end
200
-
201
- if check_dtype && left.dtype != right.dtype
202
- raise_assertion_error("Series", "dtype mismatch", left.dtype, right.dtype)
203
- end
204
-
205
- _assert_series_values_equal(
206
- left,
207
- right,
208
- check_exact: check_exact,
209
- rtol: rtol,
210
- atol: atol,
211
- categorical_as_str: categorical_as_str
168
+ Plr.assert_series_equal_rb(
169
+ left._s,
170
+ right._s,
171
+ check_dtype,
172
+ check_names,
173
+ check_order,
174
+ check_exact,
175
+ rtol,
176
+ atol,
177
+ categorical_as_str
212
178
  )
213
179
  end
214
180
 
@@ -284,221 +250,6 @@ module Polars
284
250
  end
285
251
  end
286
252
 
287
- def _assert_frame_schema_equal(
288
- left,
289
- right,
290
- check_dtype:,
291
- check_column_order:,
292
- objects:
293
- )
294
- left_schema, right_schema = left.schema, right.schema
295
-
296
- # Fast path for equal frames
297
- if left_schema == right_schema
298
- return
299
- end
300
-
301
- # Special error message for when column names do not match
302
- if left_schema.keys != right_schema.keys
303
- if (left_not_right = right_schema.keys - left_schema.keys).any?
304
- msg = "columns #{left_not_right.inspect} in left #{objects[..-1]}, but not in right"
305
- raise AssertionError, msg
306
- else
307
- right_not_left = right_schema.keys - left_schema.keys
308
- msg = "columns #{right_not_left.inspect} in right #{objects[..-1]}, but not in left"
309
- raise AssertionError, msg
310
- end
311
- end
312
-
313
- if check_column_order
314
- left_columns, right_columns = left_schema.keys, right_schema.keys
315
- if left_columns != right_columns
316
- detail = "columns are not in the same order"
317
- raise_assertion_error(objects, detail, left_columns, right_columns)
318
- end
319
- end
320
-
321
- if check_dtype
322
- left_schema_dict, right_schema_dict = left_schema.to_h, right_schema.to_h
323
- if check_column_order || left_schema_dict != right_schema_dict
324
- detail = "dtypes do not match"
325
- raise_assertion_error(objects, detail, left_schema_dict, right_schema_dict)
326
- end
327
- end
328
- end
329
-
330
- def _sort_dataframes(left, right)
331
- by = left.columns
332
- begin
333
- left = left.sort(by)
334
- right = right.sort(by)
335
- rescue
336
- msg = "cannot set `check_row_order: false` on frame with unsortable columns"
337
- raise InvalidAssert, msg
338
- end
339
- [left, right]
340
- end
341
-
342
- def _assert_series_values_equal(
343
- left,
344
- right,
345
- check_exact:,
346
- rtol:,
347
- atol:,
348
- categorical_as_str:
349
- )
350
- if categorical_as_str
351
- if left.dtype == Categorical
352
- left = left.cast(String)
353
- end
354
- if right.dtype == Categorical
355
- right = right.cast(String)
356
- end
357
- end
358
-
359
- # Determine unequal elements
360
- begin
361
- unequal = left.ne_missing(right)
362
- rescue
363
- raise_assertion_error(
364
- "Series",
365
- "incompatible data types",
366
- left.dtype,
367
- right.dtype
368
- )
369
- end
370
-
371
- # Check nested dtypes in separate function
372
- if _comparing_nested_floats(left.dtype, right.dtype)
373
- begin
374
- _assert_series_nested_values_equal(
375
- left: left.filter(unequal),
376
- right: right.filter(unequal),
377
- check_exact: check_exact,
378
- rtol: rtol,
379
- atol: atol,
380
- categorical_as_str: categorical_as_str
381
- )
382
- rescue AssertionError
383
- raise_assertion_error(
384
- "Series",
385
- "nested value mismatch",
386
- left.to_a,
387
- right.to_a
388
- )
389
- else
390
- return
391
- end
392
- end
393
-
394
- # If no differences found during exact checking, we're done
395
- if !unequal.any
396
- return
397
- end
398
-
399
- # Only do inexact checking for float types
400
- if check_exact || !left.dtype.float? || !right.dtype.float?
401
- raise_assertion_error(
402
- "Series", "exact value mismatch", left.to_a, right.to_a
403
- )
404
- end
405
-
406
- _assert_series_null_values_match(left, right)
407
- _assert_series_nan_values_match(left, right)
408
- _assert_series_values_within_tolerance(
409
- left,
410
- right,
411
- unequal,
412
- rtol: rtol,
413
- atol: atol
414
- )
415
- end
416
-
417
- def _assert_series_nested_values_equal(
418
- left,
419
- right,
420
- check_exact:,
421
- rtol:,
422
- atol:,
423
- categorical_as_str:
424
- )
425
- # compare nested lists element-wise
426
- if _comparing_lists(left.dtype, right.dtype)
427
- left.zip(right) do |s1, s2|
428
- if s1.nil? || s2.nil?
429
- raise_assertion_error("Series", "nested value mismatch", s1, s2)
430
- end
431
-
432
- _assert_series_values_equal(
433
- s1,
434
- s2,
435
- check_exact: check_exact,
436
- rtol: rtol,
437
- atol: atol,
438
- categorical_as_str: categorical_as_str
439
- )
440
- end
441
-
442
- # unnest structs as series and compare
443
- else
444
- ls, rs = left.struct.unnest, right.struct.unnest
445
- ls.zip(rs) do |s1, s2|
446
- _assert_series_values_equal(
447
- s1,
448
- s2,
449
- check_exact: check_exact,
450
- rtol: rtol,
451
- atol: atol,
452
- categorical_as_str: categorical_as_str
453
- )
454
- end
455
- end
456
- end
457
-
458
- def _assert_series_null_values_match(left, right)
459
- null_value_mismatch = left.is_null != right.is_null
460
- if null_value_mismatch.any
461
- raise_assertion_error(
462
- "Series", "null value mismatch", left.to_a, right.to_a
463
- )
464
- end
465
- end
466
-
467
- def _assert_series_nan_values_match(left, right)
468
- if !_comparing_floats(left.dtype, right.dtype)
469
- return
470
- end
471
- nan_value_mismatch = left.is_nan != right.is_nan
472
- if nan_value_mismatch.any
473
- raise_assertion_error(
474
- "Series",
475
- "nan value mismatch",
476
- left.to_a,
477
- right.to_a
478
- )
479
- end
480
- end
481
-
482
- def _comparing_floats(left, right)
483
- left.is_float && right.is_float
484
- end
485
-
486
- def _comparing_lists(left, right)
487
- [List, Array].include?(left) && [List, Array].include?(right)
488
- end
489
-
490
- def _comparing_structs(left, right)
491
- left == Struct && right == Struct
492
- end
493
-
494
- def _comparing_nested_floats(left, right)
495
- if !(_comparing_lists(left, right) || _comparing_structs(left, right))
496
- return false
497
- end
498
-
499
- left.float? && right.float?
500
- end
501
-
502
253
  def raise_assertion_error(objects, detail, left, right)
503
254
  msg = "#{objects} are different (#{detail})\n[left]: #{left}\n[right]: #{right}"
504
255
  raise AssertionError, msg
@@ -1,5 +1,7 @@
1
1
  module Polars
2
2
  module Utils
3
+ U32_MAX = 2**32 - 1
4
+
3
5
  SECONDS_PER_DAY = 86_400
4
6
  SECONDS_PER_HOUR = 3_600
5
7
  NS_PER_SECOND = 1_000_000_000