polars-df 0.13.0-arm64-darwin → 0.15.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -0
  3. data/Cargo.lock +1368 -319
  4. data/LICENSE-THIRD-PARTY.txt +24439 -12853
  5. data/LICENSE.txt +1 -0
  6. data/README.md +1 -2
  7. data/lib/polars/3.1/polars.bundle +0 -0
  8. data/lib/polars/3.2/polars.bundle +0 -0
  9. data/lib/polars/3.3/polars.bundle +0 -0
  10. data/lib/polars/batched_csv_reader.rb +0 -2
  11. data/lib/polars/binary_expr.rb +133 -9
  12. data/lib/polars/binary_name_space.rb +101 -6
  13. data/lib/polars/config.rb +4 -0
  14. data/lib/polars/data_frame.rb +285 -62
  15. data/lib/polars/data_type_group.rb +28 -0
  16. data/lib/polars/data_types.rb +2 -0
  17. data/lib/polars/date_time_expr.rb +244 -0
  18. data/lib/polars/date_time_name_space.rb +87 -0
  19. data/lib/polars/expr.rb +109 -8
  20. data/lib/polars/functions/as_datatype.rb +51 -2
  21. data/lib/polars/functions/col.rb +1 -1
  22. data/lib/polars/functions/eager.rb +1 -3
  23. data/lib/polars/functions/lazy.rb +88 -10
  24. data/lib/polars/functions/range/time_range.rb +21 -21
  25. data/lib/polars/io/csv.rb +14 -16
  26. data/lib/polars/io/database.rb +2 -2
  27. data/lib/polars/io/ipc.rb +14 -12
  28. data/lib/polars/io/ndjson.rb +10 -0
  29. data/lib/polars/io/parquet.rb +168 -111
  30. data/lib/polars/lazy_frame.rb +649 -15
  31. data/lib/polars/list_name_space.rb +169 -0
  32. data/lib/polars/selectors.rb +1144 -0
  33. data/lib/polars/series.rb +470 -40
  34. data/lib/polars/string_cache.rb +27 -1
  35. data/lib/polars/string_expr.rb +0 -1
  36. data/lib/polars/string_name_space.rb +73 -3
  37. data/lib/polars/struct_name_space.rb +31 -7
  38. data/lib/polars/utils/various.rb +5 -1
  39. data/lib/polars/utils.rb +45 -10
  40. data/lib/polars/version.rb +1 -1
  41. data/lib/polars.rb +2 -1
  42. metadata +4 -3
  43. data/lib/polars/functions.rb +0 -57
data/lib/polars/functions/lazy.rb CHANGED
@@ -824,6 +824,29 @@ module Polars
824
824
  # @note
825
825
  # If you simply want the first encountered expression as accumulator,
826
826
  # consider using `cumreduce`.
827
+ #
828
+ # @example
829
+ # df = Polars::DataFrame.new(
830
+ # {
831
+ # "a" => [1, 2, 3],
832
+ # "b" => [3, 4, 5],
833
+ # "c" => [5, 6, 7]
834
+ # }
835
+ # )
836
+ # df.with_columns(
837
+ # Polars.cum_fold(Polars.lit(1), ->(acc, x) { acc + x }, Polars.all)
838
+ # )
839
+ # # =>
840
+ # # shape: (3, 4)
841
+ # # ┌─────┬─────┬─────┬───────────┐
842
+ # # │ a ┆ b ┆ c ┆ cum_fold │
843
+ # # │ --- ┆ --- ┆ --- ┆ --- │
844
+ # # │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
845
+ # # ╞═════╪═════╪═════╪═══════════╡
846
+ # # │ 1 ┆ 3 ┆ 5 ┆ {2,5,10} │
847
+ # # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
848
+ # # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
849
+ # # └─────┴─────┴─────┴───────────┘
827
850
  def cum_fold(acc, f, exprs, include_init: false)
828
851
  acc = Utils.parse_into_expression(acc, str_as_lit: true)
829
852
  if exprs.is_a?(Expr)
@@ -831,7 +854,7 @@ module Polars
831
854
  end
832
855
 
833
856
  exprs = Utils.parse_into_list_of_expressions(exprs)
834
- Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init))
857
+ Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
835
858
  end
836
859
  alias_method :cumfold, :cum_fold
837
860
 
@@ -1024,15 +1047,70 @@ module Polars
1024
1047
  # Default is ascending.
1025
1048
  #
1026
1049
  # @return [Expr]
1027
- def arg_sort_by(exprs, reverse: false)
1028
- if !exprs.is_a?(::Array)
1029
- exprs = [exprs]
1030
- end
1031
- if reverse == true || reverse == false
1032
- reverse = [reverse] * exprs.length
1033
- end
1034
- exprs = Utils.parse_into_list_of_expressions(exprs)
1035
- Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse))
1050
+ #
1051
+ # @example Pass a single column name to compute the arg sort by that column.
1052
+ # df = Polars::DataFrame.new(
1053
+ # {
1054
+ # "a" => [0, 1, 1, 0],
1055
+ # "b" => [3, 2, 3, 2],
1056
+ # "c" => [1, 2, 3, 4]
1057
+ # }
1058
+ # )
1059
+ # df.select(Polars.arg_sort_by("a"))
1060
+ # # =>
1061
+ # # shape: (4, 1)
1062
+ # # ┌─────┐
1063
+ # # │ a │
1064
+ # # │ --- │
1065
+ # # │ u32 │
1066
+ # # ╞═════╡
1067
+ # # │ 0 │
1068
+ # # │ 3 │
1069
+ # # │ 1 │
1070
+ # # │ 2 │
1071
+ # # └─────┘
1072
+ #
1073
+ # @example Compute the arg sort by multiple columns by either passing a list of columns, or by specifying each column as a positional argument.
1074
+ # df.select(Polars.arg_sort_by(["a", "b"], reverse: true))
1075
+ # # =>
1076
+ # # shape: (4, 1)
1077
+ # # ┌─────┐
1078
+ # # │ a │
1079
+ # # │ --- │
1080
+ # # │ u32 │
1081
+ # # ╞═════╡
1082
+ # # │ 2 │
1083
+ # # │ 1 │
1084
+ # # │ 0 │
1085
+ # # │ 3 │
1086
+ # # └─────┘
1087
+ #
1088
+ # @example Use gather to apply the arg sort to other columns.
1089
+ # df.select(Polars.col("c").gather(Polars.arg_sort_by("a")))
1090
+ # # =>
1091
+ # # shape: (4, 1)
1092
+ # # ┌─────┐
1093
+ # # │ c │
1094
+ # # │ --- │
1095
+ # # │ i64 │
1096
+ # # ╞═════╡
1097
+ # # │ 1 │
1098
+ # # │ 4 │
1099
+ # # │ 2 │
1100
+ # # │ 3 │
1101
+ # # └─────┘
1102
+ def arg_sort_by(
1103
+ exprs,
1104
+ *more_exprs,
1105
+ reverse: false,
1106
+ nulls_last: false,
1107
+ multithreaded: true,
1108
+ maintain_order: false
1109
+ )
1110
+ exprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
1111
+ reverse = Utils.extend_bool(reverse, exprs.length, "reverse", "exprs")
1112
+ nulls_last = Utils.extend_bool(nulls_last, exprs.length, "nulls_last", "exprs")
1113
+ Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse, nulls_last, multithreaded, maintain_order))
1036
1114
  end
1037
1115
  alias_method :argsort_by, :arg_sort_by
1038
1116
 
data/lib/polars/functions/range/time_range.rb CHANGED
@@ -18,7 +18,7 @@ module Polars
18
18
  #
19
19
  # @example
20
20
  # Polars.time_range(
21
- # time(14, 0),
21
+ # Time.utc(2000, 1, 1, 14, 0),
22
22
  # nil,
23
23
  # "3h15m",
24
24
  # eager: true
@@ -48,12 +48,12 @@ module Polars
48
48
  end
49
49
 
50
50
  if start.nil?
51
- # start = time(0, 0, 0)
52
- raise Todo
51
+ # date part is ignored
52
+ start = ::Time.utc(2000, 1, 1, 0, 0, 0)
53
53
  end
54
54
  if stop.nil?
55
- # stop = time(23, 59, 59, 999999)
56
- raise Todo
55
+ # date part is ignored
56
+ stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
57
57
  end
58
58
 
59
59
  start_rbexpr = Utils.parse_into_expression(start)
@@ -87,21 +87,21 @@ module Polars
87
87
  # @example
88
88
  # df = Polars::DataFrame.new(
89
89
  # {
90
- # "start" => [time(9, 0), time(10, 0)],
91
- # "end" => time(11, 0)
90
+ # "start" => [Time.utc(2000, 1, 1, 9, 0), Time.utc(2000, 1, 1, 10, 0)],
91
+ # "end" => Time.utc(2000, 1, 1, 11, 0)
92
92
  # }
93
93
  # )
94
- # df.with_columns(time_range: Polars.time_ranges("start", "end"))
94
+ # df.select(time_range: Polars.time_ranges("start", "end"))
95
95
  # # =>
96
- # # shape: (2, 3)
97
- # # ┌──────────┬──────────┬────────────────────────────────┐
98
- # # │ start ┆ end ┆ time_range │
99
- # # │ --- ┆ --- ┆ ---
100
- # # │ time ┆ time ┆ list[time] │
101
- # # ╞══════════╪══════════╪════════════════════════════════╡
102
- # # │ 09:00:00 ┆ 11:00:00 ┆ [09:00:00, 10:00:00, 11:00:00] │
103
- # # │ 10:00:00 ┆ 11:00:00 ┆ [10:00:00, 11:00:00] │
104
- # # └──────────┴──────────┴────────────────────────────────┘
96
+ # # shape: (2, 1)
97
+ # # ┌────────────────────────────────┐
98
+ # # │ time_range │
99
+ # # │ --- │
100
+ # # │ list[time] │
101
+ # # ╞════════════════════════════════╡
102
+ # # │ [09:00:00, 10:00:00, 11:00:00] │
103
+ # # │ [10:00:00, 11:00:00] │
104
+ # # └────────────────────────────────┘
105
105
  def time_ranges(
106
106
  start = nil,
107
107
  stop = nil,
@@ -118,12 +118,12 @@ module Polars
118
118
  end
119
119
 
120
120
  if start.nil?
121
- # start = time(0, 0, 0)
122
- raise Todo
121
+ # date part is ignored
122
+ start = ::Time.utc(2000, 1, 1, 0, 0, 0)
123
123
  end
124
124
  if stop.nil?
125
- # stop = time(23, 59, 59, 999999)
126
- raise Todo
125
+ # date part is ignored
126
+ stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
127
127
  end
128
128
 
129
129
  start_rbexpr = Utils.parse_into_expression(start)
data/lib/polars/io/csv.rb CHANGED
@@ -75,9 +75,6 @@ module Polars
75
75
  # the DataFrame.
76
76
  # @param row_count_offset [Integer]
77
77
  # Offset to start the row_count column (only used if the name is set).
78
- # @param sample_size [Integer]
79
- # Set the sample size. This is used to sample statistics to estimate the
80
- # allocation needed.
81
78
  # @param eol_char [String]
82
79
  # Single byte end of line character.
83
80
  # @param truncate_ragged_lines [Boolean]
@@ -114,7 +111,6 @@ module Polars
114
111
  skip_rows_after_header: 0,
115
112
  row_count_name: nil,
116
113
  row_count_offset: 0,
117
- sample_size: 1024,
118
114
  eol_char: "\n",
119
115
  truncate_ragged_lines: false
120
116
  )
@@ -163,7 +159,6 @@ module Polars
163
159
  skip_rows_after_header: skip_rows_after_header,
164
160
  row_count_name: row_count_name,
165
161
  row_count_offset: row_count_offset,
166
- sample_size: sample_size,
167
162
  eol_char: eol_char,
168
163
  truncate_ragged_lines: truncate_ragged_lines
169
164
  )
@@ -201,7 +196,6 @@ module Polars
201
196
  skip_rows_after_header: 0,
202
197
  row_count_name: nil,
203
198
  row_count_offset: 0,
204
- sample_size: 1024,
205
199
  eol_char: "\n",
206
200
  raise_if_empty: true,
207
201
  truncate_ragged_lines: false,
@@ -305,7 +299,6 @@ module Polars
305
299
  parse_dates,
306
300
  skip_rows_after_header,
307
301
  Utils.parse_row_index_args(row_count_name, row_count_offset),
308
- sample_size,
309
302
  eol_char,
310
303
  raise_if_empty,
311
304
  truncate_ragged_lines,
@@ -392,9 +385,6 @@ module Polars
392
385
  # the DataFrame.
393
386
  # @param row_count_offset [Integer]
394
387
  # Offset to start the row_count column (only used if the name is set).
395
- # @param sample_size [Integer]
396
- # Set the sample size. This is used to sample statistics to estimate the
397
- # allocation needed.
398
388
  # @param eol_char [String]
399
389
  # Single byte end of line character.
400
390
  # @param truncate_ragged_lines [Boolean]
@@ -431,7 +421,6 @@ module Polars
431
421
  skip_rows_after_header: 0,
432
422
  row_count_name: nil,
433
423
  row_count_offset: 0,
434
- sample_size: 1024,
435
424
  eol_char: "\n",
436
425
  raise_if_empty: true,
437
426
  truncate_ragged_lines: false,
@@ -474,7 +463,6 @@ module Polars
474
463
  skip_rows_after_header: skip_rows_after_header,
475
464
  row_count_name: row_count_name,
476
465
  row_count_offset: row_count_offset,
477
- sample_size: sample_size,
478
466
  eol_char: eol_char,
479
467
  new_columns: new_columns,
480
468
  raise_if_empty: raise_if_empty,
@@ -618,7 +606,7 @@ module Polars
618
606
 
619
607
  # @private
620
608
  def _scan_csv_impl(
621
- file,
609
+ source,
622
610
  has_header: true,
623
611
  sep: ",",
624
612
  comment_char: nil,
@@ -650,9 +638,16 @@ module Polars
650
638
  end
651
639
  processed_null_values = Utils._process_null_values(null_values)
652
640
 
641
+ if source.is_a?(::Array)
642
+ sources = source
643
+ source = nil
644
+ else
645
+ sources = []
646
+ end
647
+
653
648
  rblf =
654
649
  RbLazyFrame.new_from_csv(
655
- file,
650
+ source,
656
651
  sep,
657
652
  has_header,
658
653
  ignore_errors,
@@ -672,7 +667,8 @@ module Polars
672
667
  Utils.parse_row_index_args(row_count_name, row_count_offset),
673
668
  parse_dates,
674
669
  eol_char,
675
- truncate_ragged_lines
670
+ truncate_ragged_lines,
671
+ sources
676
672
  )
677
673
  Utils.wrap_ldf(rblf)
678
674
  end
@@ -681,7 +677,9 @@ module Polars
681
677
 
682
678
  def _prepare_file_arg(file)
683
679
  if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
684
- raise ArgumentError, "use URI(...) for remote files"
680
+ require "uri"
681
+
682
+ file = URI(file)
685
683
  end
686
684
 
687
685
  if defined?(URI) && file.is_a?(URI)
data/lib/polars/io/database.rb CHANGED
@@ -18,9 +18,9 @@ module Polars
18
18
  if query.is_a?(ActiveRecord::Result)
19
19
  query
20
20
  elsif query.is_a?(ActiveRecord::Relation)
21
- query.connection.select_all(query.to_sql)
21
+ query.connection_pool.with_connection { |c| c.select_all(query.to_sql) }
22
22
  elsif query.is_a?(::String)
23
- ActiveRecord::Base.connection.select_all(query)
23
+ ActiveRecord::Base.connection_pool.with_connection { |c| c.select_all(query) }
24
24
  else
25
25
  raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
26
26
  end
data/lib/polars/io/ipc.rb CHANGED
@@ -189,10 +189,6 @@ module Polars
189
189
  # Offset to start the row_count column (only use if the name is set).
190
190
  # @param storage_options [Hash]
191
191
  # Extra options that make sense for a particular storage connection.
192
- # @param memory_map [Boolean]
193
- # Try to memory map the file. This can greatly improve performance on repeated
194
- # queries as the OS may cache pages.
195
- # Only uncompressed IPC files can be memory mapped.
196
192
  # @param hive_partitioning [Boolean]
197
193
  # Infer statistics and schema from Hive partitioned URL and use them
198
194
  # to prune reads. This is unset by default (i.e. `nil`), meaning it is
@@ -215,7 +211,6 @@ module Polars
215
211
  row_count_name: nil,
216
212
  row_count_offset: 0,
217
213
  storage_options: nil,
218
- memory_map: true,
219
214
  hive_partitioning: nil,
220
215
  hive_schema: nil,
221
216
  try_parse_hive_dates: true,
@@ -229,7 +224,6 @@ module Polars
229
224
  row_count_name: row_count_name,
230
225
  row_count_offset: row_count_offset,
231
226
  storage_options: storage_options,
232
- memory_map: memory_map,
233
227
  hive_partitioning: hive_partitioning,
234
228
  hive_schema: hive_schema,
235
229
  try_parse_hive_dates: try_parse_hive_dates,
@@ -239,31 +233,39 @@ module Polars
239
233
 
240
234
  # @private
241
235
  def _scan_ipc_impl(
242
- file,
236
+ source,
243
237
  n_rows: nil,
244
238
  cache: true,
245
239
  rechunk: true,
246
240
  row_count_name: nil,
247
241
  row_count_offset: 0,
248
242
  storage_options: nil,
249
- memory_map: true,
250
243
  hive_partitioning: nil,
251
244
  hive_schema: nil,
252
245
  try_parse_hive_dates: true,
253
246
  include_file_paths: nil
254
247
  )
255
- if Utils.pathlike?(file)
256
- file = Utils.normalize_filepath(file)
248
+ sources = []
249
+ if Utils.pathlike?(source)
250
+ source = Utils.normalize_filepath(source)
251
+ elsif source.is_a?(::Array)
252
+ if Utils.is_path_or_str_sequence(source)
253
+ sources = source.map { |s| Utils.normalize_filepath(s) }
254
+ else
255
+ sources = source
256
+ end
257
+
258
+ source = nil
257
259
  end
258
260
 
259
261
  rblf =
260
262
  RbLazyFrame.new_from_ipc(
261
- file,
263
+ source,
264
+ sources,
262
265
  n_rows,
263
266
  cache,
264
267
  rechunk,
265
268
  Utils.parse_row_index_args(row_count_name, row_count_offset),
266
- memory_map,
267
269
  hive_partitioning,
268
270
  hive_schema,
269
271
  try_parse_hive_dates,
data/lib/polars/io/ndjson.rb CHANGED
@@ -60,13 +60,23 @@ module Polars
60
60
  row_count_name: nil,
61
61
  row_count_offset: 0
62
62
  )
63
+ sources = []
63
64
  if Utils.pathlike?(source)
64
65
  source = Utils.normalize_filepath(source)
66
+ elsif source.is_a?(::Array)
67
+ if Utils.is_path_or_str_sequence(source)
68
+ sources = source.map { |s| Utils.normalize_filepath(s) }
69
+ else
70
+ sources = source
71
+ end
72
+
73
+ source = nil
65
74
  end
66
75
 
67
76
  rblf =
68
77
  RbLazyFrame.new_from_ndjson(
69
78
  source,
79
+ sources,
70
80
  infer_schema_length,
71
81
  batch_size,
72
82
  n_rows,