polars-df 0.13.0-x86_64-linux → 0.15.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +30 -0
  3. data/Cargo.lock +1368 -319
  4. data/LICENSE-THIRD-PARTY.txt +24801 -13447
  5. data/LICENSE.txt +1 -0
  6. data/README.md +1 -2
  7. data/lib/polars/3.1/polars.so +0 -0
  8. data/lib/polars/3.2/polars.so +0 -0
  9. data/lib/polars/3.3/polars.so +0 -0
  10. data/lib/polars/batched_csv_reader.rb +0 -2
  11. data/lib/polars/binary_expr.rb +133 -9
  12. data/lib/polars/binary_name_space.rb +101 -6
  13. data/lib/polars/config.rb +4 -0
  14. data/lib/polars/data_frame.rb +285 -62
  15. data/lib/polars/data_type_group.rb +28 -0
  16. data/lib/polars/data_types.rb +2 -0
  17. data/lib/polars/date_time_expr.rb +244 -0
  18. data/lib/polars/date_time_name_space.rb +87 -0
  19. data/lib/polars/expr.rb +109 -8
  20. data/lib/polars/functions/as_datatype.rb +51 -2
  21. data/lib/polars/functions/col.rb +1 -1
  22. data/lib/polars/functions/eager.rb +1 -3
  23. data/lib/polars/functions/lazy.rb +88 -10
  24. data/lib/polars/functions/range/time_range.rb +21 -21
  25. data/lib/polars/io/csv.rb +14 -16
  26. data/lib/polars/io/database.rb +2 -2
  27. data/lib/polars/io/ipc.rb +14 -12
  28. data/lib/polars/io/ndjson.rb +10 -0
  29. data/lib/polars/io/parquet.rb +168 -111
  30. data/lib/polars/lazy_frame.rb +649 -15
  31. data/lib/polars/list_name_space.rb +169 -0
  32. data/lib/polars/selectors.rb +1144 -0
  33. data/lib/polars/series.rb +470 -40
  34. data/lib/polars/string_cache.rb +27 -1
  35. data/lib/polars/string_expr.rb +0 -1
  36. data/lib/polars/string_name_space.rb +73 -3
  37. data/lib/polars/struct_name_space.rb +31 -7
  38. data/lib/polars/utils/various.rb +5 -1
  39. data/lib/polars/utils.rb +45 -10
  40. data/lib/polars/version.rb +1 -1
  41. data/lib/polars.rb +2 -1
  42. metadata +4 -3
  43. data/lib/polars/functions.rb +0 -57
data/lib/polars/functions/lazy.rb CHANGED
@@ -824,6 +824,29 @@ module Polars
824
824
  # @note
825
825
  # If you simply want the first encountered expression as accumulator,
826
826
  # consider using `cumreduce`.
827
+ #
828
+ # @example
829
+ # df = Polars::DataFrame.new(
830
+ # {
831
+ # "a" => [1, 2, 3],
832
+ # "b" => [3, 4, 5],
833
+ # "c" => [5, 6, 7]
834
+ # }
835
+ # )
836
+ # df.with_columns(
837
+ # Polars.cum_fold(Polars.lit(1), ->(acc, x) { acc + x }, Polars.all)
838
+ # )
839
+ # # =>
840
+ # # shape: (3, 4)
841
+ # # ┌─────┬─────┬─────┬───────────┐
842
+ # # │ a ┆ b ┆ c ┆ cum_fold │
843
+ # # │ --- ┆ --- ┆ --- ┆ --- │
844
+ # # │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
845
+ # # ╞═════╪═════╪═════╪═══════════╡
846
+ # # │ 1 ┆ 3 ┆ 5 ┆ {2,5,10} │
847
+ # # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
848
+ # # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
849
+ # # └─────┴─────┴─────┴───────────┘
827
850
  def cum_fold(acc, f, exprs, include_init: false)
828
851
  acc = Utils.parse_into_expression(acc, str_as_lit: true)
829
852
  if exprs.is_a?(Expr)
@@ -831,7 +854,7 @@ module Polars
831
854
  end
832
855
 
833
856
  exprs = Utils.parse_into_list_of_expressions(exprs)
834
- Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init))
857
+ Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
835
858
  end
836
859
  alias_method :cumfold, :cum_fold
837
860
 
@@ -1024,15 +1047,70 @@ module Polars
1024
1047
  # Default is ascending.
1025
1048
  #
1026
1049
  # @return [Expr]
1027
- def arg_sort_by(exprs, reverse: false)
1028
- if !exprs.is_a?(::Array)
1029
- exprs = [exprs]
1030
- end
1031
- if reverse == true || reverse == false
1032
- reverse = [reverse] * exprs.length
1033
- end
1034
- exprs = Utils.parse_into_list_of_expressions(exprs)
1035
- Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse))
1050
+ #
1051
+ # @example Pass a single column name to compute the arg sort by that column.
1052
+ # df = Polars::DataFrame.new(
1053
+ # {
1054
+ # "a" => [0, 1, 1, 0],
1055
+ # "b" => [3, 2, 3, 2],
1056
+ # "c" => [1, 2, 3, 4]
1057
+ # }
1058
+ # )
1059
+ # df.select(Polars.arg_sort_by("a"))
1060
+ # # =>
1061
+ # # shape: (4, 1)
1062
+ # # ┌─────┐
1063
+ # # │ a │
1064
+ # # │ --- │
1065
+ # # │ u32 │
1066
+ # # ╞═════╡
1067
+ # # │ 0 │
1068
+ # # │ 3 │
1069
+ # # │ 1 │
1070
+ # # │ 2 │
1071
+ # # └─────┘
1072
+ #
1073
+ # @example Compute the arg sort by multiple columns by either passing a list of columns, or by specifying each column as a positional argument.
1074
+ # df.select(Polars.arg_sort_by(["a", "b"], reverse: true))
1075
+ # # =>
1076
+ # # shape: (4, 1)
1077
+ # # ┌─────┐
1078
+ # # │ a │
1079
+ # # │ --- │
1080
+ # # │ u32 │
1081
+ # # ╞═════╡
1082
+ # # │ 2 │
1083
+ # # │ 1 │
1084
+ # # │ 0 │
1085
+ # # │ 3 │
1086
+ # # └─────┘
1087
+ #
1088
+ # @example Use gather to apply the arg sort to other columns.
1089
+ # df.select(Polars.col("c").gather(Polars.arg_sort_by("a")))
1090
+ # # =>
1091
+ # # shape: (4, 1)
1092
+ # # ┌─────┐
1093
+ # # │ c │
1094
+ # # │ --- │
1095
+ # # │ i64 │
1096
+ # # ╞═════╡
1097
+ # # │ 1 │
1098
+ # # │ 4 │
1099
+ # # │ 2 │
1100
+ # # │ 3 │
1101
+ # # └─────┘
1102
+ def arg_sort_by(
1103
+ exprs,
1104
+ *more_exprs,
1105
+ reverse: false,
1106
+ nulls_last: false,
1107
+ multithreaded: true,
1108
+ maintain_order: false
1109
+ )
1110
+ exprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
1111
+ reverse = Utils.extend_bool(reverse, exprs.length, "reverse", "exprs")
1112
+ nulls_last = Utils.extend_bool(nulls_last, exprs.length, "nulls_last", "exprs")
1113
+ Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse, nulls_last, multithreaded, maintain_order))
1036
1114
  end
1037
1115
  alias_method :argsort_by, :arg_sort_by
1038
1116
 
data/lib/polars/functions/range/time_range.rb CHANGED
@@ -18,7 +18,7 @@ module Polars
18
18
  #
19
19
  # @example
20
20
  # Polars.time_range(
21
- # time(14, 0),
21
+ # Time.utc(2000, 1, 1, 14, 0),
22
22
  # nil,
23
23
  # "3h15m",
24
24
  # eager: true
@@ -48,12 +48,12 @@ module Polars
48
48
  end
49
49
 
50
50
  if start.nil?
51
- # start = time(0, 0, 0)
52
- raise Todo
51
+ # date part is ignored
52
+ start = ::Time.utc(2000, 1, 1, 0, 0, 0)
53
53
  end
54
54
  if stop.nil?
55
- # stop = time(23, 59, 59, 999999)
56
- raise Todo
55
+ # date part is ignored
56
+ stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
57
57
  end
58
58
 
59
59
  start_rbexpr = Utils.parse_into_expression(start)
@@ -87,21 +87,21 @@ module Polars
87
87
  # @example
88
88
  # df = Polars::DataFrame.new(
89
89
  # {
90
- # "start" => [time(9, 0), time(10, 0)],
91
- # "end" => time(11, 0)
90
+ # "start" => [Time.utc(2000, 1, 1, 9, 0), Time.utc(2000, 1, 1, 10, 0)],
91
+ # "end" => Time.utc(2000, 1, 1, 11, 0)
92
92
  # }
93
93
  # )
94
- # df.with_columns(time_range: Polars.time_ranges("start", "end"))
94
+ # df.select(time_range: Polars.time_ranges("start", "end"))
95
95
  # # =>
96
- # # shape: (2, 3)
97
- # # ┌──────────┬──────────┬────────────────────────────────┐
98
- # # │ start ┆ end ┆ time_range │
99
- # # │ --- ┆ --- ┆ ---
100
- # # │ time ┆ time ┆ list[time] │
101
- # # ╞══════════╪══════════╪════════════════════════════════╡
102
- # # │ 09:00:00 ┆ 11:00:00 ┆ [09:00:00, 10:00:00, 11:00:00] │
103
- # # │ 10:00:00 ┆ 11:00:00 ┆ [10:00:00, 11:00:00] │
104
- # # └──────────┴──────────┴────────────────────────────────┘
96
+ # # shape: (2, 1)
97
+ # # ┌────────────────────────────────┐
98
+ # # │ time_range │
99
+ # # │ --- │
100
+ # # │ list[time] │
101
+ # # ╞════════════════════════════════╡
102
+ # # │ [09:00:00, 10:00:00, 11:00:00] │
103
+ # # │ [10:00:00, 11:00:00] │
104
+ # # └────────────────────────────────┘
105
105
  def time_ranges(
106
106
  start = nil,
107
107
  stop = nil,
@@ -118,12 +118,12 @@ module Polars
118
118
  end
119
119
 
120
120
  if start.nil?
121
- # start = time(0, 0, 0)
122
- raise Todo
121
+ # date part is ignored
122
+ start = ::Time.utc(2000, 1, 1, 0, 0, 0)
123
123
  end
124
124
  if stop.nil?
125
- # stop = time(23, 59, 59, 999999)
126
- raise Todo
125
+ # date part is ignored
126
+ stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
127
127
  end
128
128
 
129
129
  start_rbexpr = Utils.parse_into_expression(start)
data/lib/polars/io/csv.rb CHANGED
@@ -75,9 +75,6 @@ module Polars
75
75
  # the DataFrame.
76
76
  # @param row_count_offset [Integer]
77
77
  # Offset to start the row_count column (only used if the name is set).
78
- # @param sample_size [Integer]
79
- # Set the sample size. This is used to sample statistics to estimate the
80
- # allocation needed.
81
78
  # @param eol_char [String]
82
79
  # Single byte end of line character.
83
80
  # @param truncate_ragged_lines [Boolean]
@@ -114,7 +111,6 @@ module Polars
114
111
  skip_rows_after_header: 0,
115
112
  row_count_name: nil,
116
113
  row_count_offset: 0,
117
- sample_size: 1024,
118
114
  eol_char: "\n",
119
115
  truncate_ragged_lines: false
120
116
  )
@@ -163,7 +159,6 @@ module Polars
163
159
  skip_rows_after_header: skip_rows_after_header,
164
160
  row_count_name: row_count_name,
165
161
  row_count_offset: row_count_offset,
166
- sample_size: sample_size,
167
162
  eol_char: eol_char,
168
163
  truncate_ragged_lines: truncate_ragged_lines
169
164
  )
@@ -201,7 +196,6 @@ module Polars
201
196
  skip_rows_after_header: 0,
202
197
  row_count_name: nil,
203
198
  row_count_offset: 0,
204
- sample_size: 1024,
205
199
  eol_char: "\n",
206
200
  raise_if_empty: true,
207
201
  truncate_ragged_lines: false,
@@ -305,7 +299,6 @@ module Polars
305
299
  parse_dates,
306
300
  skip_rows_after_header,
307
301
  Utils.parse_row_index_args(row_count_name, row_count_offset),
308
- sample_size,
309
302
  eol_char,
310
303
  raise_if_empty,
311
304
  truncate_ragged_lines,
@@ -392,9 +385,6 @@ module Polars
392
385
  # the DataFrame.
393
386
  # @param row_count_offset [Integer]
394
387
  # Offset to start the row_count column (only used if the name is set).
395
- # @param sample_size [Integer]
396
- # Set the sample size. This is used to sample statistics to estimate the
397
- # allocation needed.
398
388
  # @param eol_char [String]
399
389
  # Single byte end of line character.
400
390
  # @param truncate_ragged_lines [Boolean]
@@ -431,7 +421,6 @@ module Polars
431
421
  skip_rows_after_header: 0,
432
422
  row_count_name: nil,
433
423
  row_count_offset: 0,
434
- sample_size: 1024,
435
424
  eol_char: "\n",
436
425
  raise_if_empty: true,
437
426
  truncate_ragged_lines: false,
@@ -474,7 +463,6 @@ module Polars
474
463
  skip_rows_after_header: skip_rows_after_header,
475
464
  row_count_name: row_count_name,
476
465
  row_count_offset: row_count_offset,
477
- sample_size: sample_size,
478
466
  eol_char: eol_char,
479
467
  new_columns: new_columns,
480
468
  raise_if_empty: raise_if_empty,
@@ -618,7 +606,7 @@ module Polars
618
606
 
619
607
  # @private
620
608
  def _scan_csv_impl(
621
- file,
609
+ source,
622
610
  has_header: true,
623
611
  sep: ",",
624
612
  comment_char: nil,
@@ -650,9 +638,16 @@ module Polars
650
638
  end
651
639
  processed_null_values = Utils._process_null_values(null_values)
652
640
 
641
+ if source.is_a?(::Array)
642
+ sources = source
643
+ source = nil
644
+ else
645
+ sources = []
646
+ end
647
+
653
648
  rblf =
654
649
  RbLazyFrame.new_from_csv(
655
- file,
650
+ source,
656
651
  sep,
657
652
  has_header,
658
653
  ignore_errors,
@@ -672,7 +667,8 @@ module Polars
672
667
  Utils.parse_row_index_args(row_count_name, row_count_offset),
673
668
  parse_dates,
674
669
  eol_char,
675
- truncate_ragged_lines
670
+ truncate_ragged_lines,
671
+ sources
676
672
  )
677
673
  Utils.wrap_ldf(rblf)
678
674
  end
@@ -681,7 +677,9 @@ module Polars
681
677
 
682
678
  def _prepare_file_arg(file)
683
679
  if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
684
- raise ArgumentError, "use URI(...) for remote files"
680
+ require "uri"
681
+
682
+ file = URI(file)
685
683
  end
686
684
 
687
685
  if defined?(URI) && file.is_a?(URI)
data/lib/polars/io/database.rb CHANGED
@@ -18,9 +18,9 @@ module Polars
18
18
  if query.is_a?(ActiveRecord::Result)
19
19
  query
20
20
  elsif query.is_a?(ActiveRecord::Relation)
21
- query.connection.select_all(query.to_sql)
21
+ query.connection_pool.with_connection { |c| c.select_all(query.to_sql) }
22
22
  elsif query.is_a?(::String)
23
- ActiveRecord::Base.connection.select_all(query)
23
+ ActiveRecord::Base.connection_pool.with_connection { |c| c.select_all(query) }
24
24
  else
25
25
  raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
26
26
  end
data/lib/polars/io/ipc.rb CHANGED
@@ -189,10 +189,6 @@ module Polars
189
189
  # Offset to start the row_count column (only use if the name is set).
190
190
  # @param storage_options [Hash]
191
191
  # Extra options that make sense for a particular storage connection.
192
- # @param memory_map [Boolean]
193
- # Try to memory map the file. This can greatly improve performance on repeated
194
- # queries as the OS may cache pages.
195
- # Only uncompressed IPC files can be memory mapped.
196
192
  # @param hive_partitioning [Boolean]
197
193
  # Infer statistics and schema from Hive partitioned URL and use them
198
194
  # to prune reads. This is unset by default (i.e. `nil`), meaning it is
@@ -215,7 +211,6 @@ module Polars
215
211
  row_count_name: nil,
216
212
  row_count_offset: 0,
217
213
  storage_options: nil,
218
- memory_map: true,
219
214
  hive_partitioning: nil,
220
215
  hive_schema: nil,
221
216
  try_parse_hive_dates: true,
@@ -229,7 +224,6 @@ module Polars
229
224
  row_count_name: row_count_name,
230
225
  row_count_offset: row_count_offset,
231
226
  storage_options: storage_options,
232
- memory_map: memory_map,
233
227
  hive_partitioning: hive_partitioning,
234
228
  hive_schema: hive_schema,
235
229
  try_parse_hive_dates: try_parse_hive_dates,
@@ -239,31 +233,39 @@ module Polars
239
233
 
240
234
  # @private
241
235
  def _scan_ipc_impl(
242
- file,
236
+ source,
243
237
  n_rows: nil,
244
238
  cache: true,
245
239
  rechunk: true,
246
240
  row_count_name: nil,
247
241
  row_count_offset: 0,
248
242
  storage_options: nil,
249
- memory_map: true,
250
243
  hive_partitioning: nil,
251
244
  hive_schema: nil,
252
245
  try_parse_hive_dates: true,
253
246
  include_file_paths: nil
254
247
  )
255
- if Utils.pathlike?(file)
256
- file = Utils.normalize_filepath(file)
248
+ sources = []
249
+ if Utils.pathlike?(source)
250
+ source = Utils.normalize_filepath(source)
251
+ elsif source.is_a?(::Array)
252
+ if Utils.is_path_or_str_sequence(source)
253
+ sources = source.map { |s| Utils.normalize_filepath(s) }
254
+ else
255
+ sources = source
256
+ end
257
+
258
+ source = nil
257
259
  end
258
260
 
259
261
  rblf =
260
262
  RbLazyFrame.new_from_ipc(
261
- file,
263
+ source,
264
+ sources,
262
265
  n_rows,
263
266
  cache,
264
267
  rechunk,
265
268
  Utils.parse_row_index_args(row_count_name, row_count_offset),
266
- memory_map,
267
269
  hive_partitioning,
268
270
  hive_schema,
269
271
  try_parse_hive_dates,
data/lib/polars/io/ndjson.rb CHANGED
@@ -60,13 +60,23 @@ module Polars
60
60
  row_count_name: nil,
61
61
  row_count_offset: 0
62
62
  )
63
+ sources = []
63
64
  if Utils.pathlike?(source)
64
65
  source = Utils.normalize_filepath(source)
66
+ elsif source.is_a?(::Array)
67
+ if Utils.is_path_or_str_sequence(source)
68
+ sources = source.map { |s| Utils.normalize_filepath(s) }
69
+ else
70
+ sources = source
71
+ end
72
+
73
+ source = nil
65
74
  end
66
75
 
67
76
  rblf =
68
77
  RbLazyFrame.new_from_ndjson(
69
78
  source,
79
+ sources,
70
80
  infer_schema_length,
71
81
  batch_size,
72
82
  n_rows,