polars-df 0.13.0-x86_64-linux → 0.15.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Cargo.lock +1368 -319
- data/LICENSE-THIRD-PARTY.txt +24801 -13447
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +285 -62
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +109 -8
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -12
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +470 -40
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +4 -3
- data/lib/polars/functions.rb +0 -57
@@ -824,6 +824,29 @@ module Polars
|
|
824
824
|
# @note
|
825
825
|
# If you simply want the first encountered expression as accumulator,
|
826
826
|
# consider using `cumreduce`.
|
827
|
+
#
|
828
|
+
# @example
|
829
|
+
# df = Polars::DataFrame.new(
|
830
|
+
# {
|
831
|
+
# "a" => [1, 2, 3],
|
832
|
+
# "b" => [3, 4, 5],
|
833
|
+
# "c" => [5, 6, 7]
|
834
|
+
# }
|
835
|
+
# )
|
836
|
+
# df.with_columns(
|
837
|
+
# Polars.cum_fold(Polars.lit(1), ->(acc, x) { acc + x }, Polars.all)
|
838
|
+
# )
|
839
|
+
# # =>
|
840
|
+
# # shape: (3, 4)
|
841
|
+
# # ┌─────┬─────┬─────┬───────────┐
|
842
|
+
# # │ a ┆ b ┆ c ┆ cum_fold │
|
843
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
844
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
|
845
|
+
# # ╞═════╪═════╪═════╪═══════════╡
|
846
|
+
# # │ 1 ┆ 3 ┆ 5 ┆ {2,5,10} │
|
847
|
+
# # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
|
848
|
+
# # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
|
849
|
+
# # └─────┴─────┴─────┴───────────┘
|
827
850
|
def cum_fold(acc, f, exprs, include_init: false)
|
828
851
|
acc = Utils.parse_into_expression(acc, str_as_lit: true)
|
829
852
|
if exprs.is_a?(Expr)
|
@@ -831,7 +854,7 @@ module Polars
|
|
831
854
|
end
|
832
855
|
|
833
856
|
exprs = Utils.parse_into_list_of_expressions(exprs)
|
834
|
-
Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init))
|
857
|
+
Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
|
835
858
|
end
|
836
859
|
alias_method :cumfold, :cum_fold
|
837
860
|
|
@@ -1024,15 +1047,70 @@ module Polars
|
|
1024
1047
|
# Default is ascending.
|
1025
1048
|
#
|
1026
1049
|
# @return [Expr]
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1050
|
+
#
|
1051
|
+
# @example Pass a single column name to compute the arg sort by that column.
|
1052
|
+
# df = Polars::DataFrame.new(
|
1053
|
+
# {
|
1054
|
+
# "a" => [0, 1, 1, 0],
|
1055
|
+
# "b" => [3, 2, 3, 2],
|
1056
|
+
# "c" => [1, 2, 3, 4]
|
1057
|
+
# }
|
1058
|
+
# )
|
1059
|
+
# df.select(Polars.arg_sort_by("a"))
|
1060
|
+
# # =>
|
1061
|
+
# # shape: (4, 1)
|
1062
|
+
# # ┌─────┐
|
1063
|
+
# # │ a │
|
1064
|
+
# # │ --- │
|
1065
|
+
# # │ u32 │
|
1066
|
+
# # ╞═════╡
|
1067
|
+
# # │ 0 │
|
1068
|
+
# # │ 3 │
|
1069
|
+
# # │ 1 │
|
1070
|
+
# # │ 2 │
|
1071
|
+
# # └─────┘
|
1072
|
+
#
|
1073
|
+
# @example Compute the arg sort by multiple columns by either passing a list of columns, or by specifying each column as a positional argument.
|
1074
|
+
# df.select(Polars.arg_sort_by(["a", "b"], reverse: true))
|
1075
|
+
# # =>
|
1076
|
+
# # shape: (4, 1)
|
1077
|
+
# # ┌─────┐
|
1078
|
+
# # │ a │
|
1079
|
+
# # │ --- │
|
1080
|
+
# # │ u32 │
|
1081
|
+
# # ╞═════╡
|
1082
|
+
# # │ 2 │
|
1083
|
+
# # │ 1 │
|
1084
|
+
# # │ 0 │
|
1085
|
+
# # │ 3 │
|
1086
|
+
# # └─────┘
|
1087
|
+
#
|
1088
|
+
# @example Use gather to apply the arg sort to other columns.
|
1089
|
+
# df.select(Polars.col("c").gather(Polars.arg_sort_by("a")))
|
1090
|
+
# # =>
|
1091
|
+
# # shape: (4, 1)
|
1092
|
+
# # ┌─────┐
|
1093
|
+
# # │ c │
|
1094
|
+
# # │ --- │
|
1095
|
+
# # │ i64 │
|
1096
|
+
# # ╞═════╡
|
1097
|
+
# # │ 1 │
|
1098
|
+
# # │ 4 │
|
1099
|
+
# # │ 2 │
|
1100
|
+
# # │ 3 │
|
1101
|
+
# # └─────┘
|
1102
|
+
def arg_sort_by(
|
1103
|
+
exprs,
|
1104
|
+
*more_exprs,
|
1105
|
+
reverse: false,
|
1106
|
+
nulls_last: false,
|
1107
|
+
multithreaded: true,
|
1108
|
+
maintain_order: false
|
1109
|
+
)
|
1110
|
+
exprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
|
1111
|
+
reverse = Utils.extend_bool(reverse, exprs.length, "reverse", "exprs")
|
1112
|
+
nulls_last = Utils.extend_bool(nulls_last, exprs.length, "nulls_last", "exprs")
|
1113
|
+
Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse, nulls_last, multithreaded, maintain_order))
|
1036
1114
|
end
|
1037
1115
|
alias_method :argsort_by, :arg_sort_by
|
1038
1116
|
|
@@ -18,7 +18,7 @@ module Polars
|
|
18
18
|
#
|
19
19
|
# @example
|
20
20
|
# Polars.time_range(
|
21
|
-
#
|
21
|
+
# Time.utc(2000, 1, 1, 14, 0),
|
22
22
|
# nil,
|
23
23
|
# "3h15m",
|
24
24
|
# eager: true
|
@@ -48,12 +48,12 @@ module Polars
|
|
48
48
|
end
|
49
49
|
|
50
50
|
if start.nil?
|
51
|
-
#
|
52
|
-
|
51
|
+
# date part is ignored
|
52
|
+
start = ::Time.utc(2000, 1, 1, 0, 0, 0)
|
53
53
|
end
|
54
54
|
if stop.nil?
|
55
|
-
#
|
56
|
-
|
55
|
+
# date part is ignored
|
56
|
+
stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
|
57
57
|
end
|
58
58
|
|
59
59
|
start_rbexpr = Utils.parse_into_expression(start)
|
@@ -87,21 +87,21 @@ module Polars
|
|
87
87
|
# @example
|
88
88
|
# df = Polars::DataFrame.new(
|
89
89
|
# {
|
90
|
-
# "start" => [
|
91
|
-
# "end" =>
|
90
|
+
# "start" => [Time.utc(2000, 1, 1, 9, 0), Time.utc(2000, 1, 1, 10, 0)],
|
91
|
+
# "end" => Time.utc(2000, 1, 1, 11, 0)
|
92
92
|
# }
|
93
93
|
# )
|
94
|
-
# df.
|
94
|
+
# df.select(time_range: Polars.time_ranges("start", "end"))
|
95
95
|
# # =>
|
96
|
-
# # shape: (2,
|
97
|
-
# #
|
98
|
-
# # │
|
99
|
-
# # │ ---
|
100
|
-
# # │
|
101
|
-
# #
|
102
|
-
# # │
|
103
|
-
# # │
|
104
|
-
# #
|
96
|
+
# # shape: (2, 1)
|
97
|
+
# # ┌────────────────────────────────┐
|
98
|
+
# # │ time_range │
|
99
|
+
# # │ --- │
|
100
|
+
# # │ list[time] │
|
101
|
+
# # ╞════════════════════════════════╡
|
102
|
+
# # │ [09:00:00, 10:00:00, 11:00:00] │
|
103
|
+
# # │ [10:00:00, 11:00:00] │
|
104
|
+
# # └────────────────────────────────┘
|
105
105
|
def time_ranges(
|
106
106
|
start = nil,
|
107
107
|
stop = nil,
|
@@ -118,12 +118,12 @@ module Polars
|
|
118
118
|
end
|
119
119
|
|
120
120
|
if start.nil?
|
121
|
-
#
|
122
|
-
|
121
|
+
# date part is ignored
|
122
|
+
start = ::Time.utc(2000, 1, 1, 0, 0, 0)
|
123
123
|
end
|
124
124
|
if stop.nil?
|
125
|
-
#
|
126
|
-
|
125
|
+
# date part is ignored
|
126
|
+
stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
|
127
127
|
end
|
128
128
|
|
129
129
|
start_rbexpr = Utils.parse_into_expression(start)
|
data/lib/polars/io/csv.rb
CHANGED
@@ -75,9 +75,6 @@ module Polars
|
|
75
75
|
# the DataFrame.
|
76
76
|
# @param row_count_offset [Integer]
|
77
77
|
# Offset to start the row_count column (only used if the name is set).
|
78
|
-
# @param sample_size [Integer]
|
79
|
-
# Set the sample size. This is used to sample statistics to estimate the
|
80
|
-
# allocation needed.
|
81
78
|
# @param eol_char [String]
|
82
79
|
# Single byte end of line character.
|
83
80
|
# @param truncate_ragged_lines [Boolean]
|
@@ -114,7 +111,6 @@ module Polars
|
|
114
111
|
skip_rows_after_header: 0,
|
115
112
|
row_count_name: nil,
|
116
113
|
row_count_offset: 0,
|
117
|
-
sample_size: 1024,
|
118
114
|
eol_char: "\n",
|
119
115
|
truncate_ragged_lines: false
|
120
116
|
)
|
@@ -163,7 +159,6 @@ module Polars
|
|
163
159
|
skip_rows_after_header: skip_rows_after_header,
|
164
160
|
row_count_name: row_count_name,
|
165
161
|
row_count_offset: row_count_offset,
|
166
|
-
sample_size: sample_size,
|
167
162
|
eol_char: eol_char,
|
168
163
|
truncate_ragged_lines: truncate_ragged_lines
|
169
164
|
)
|
@@ -201,7 +196,6 @@ module Polars
|
|
201
196
|
skip_rows_after_header: 0,
|
202
197
|
row_count_name: nil,
|
203
198
|
row_count_offset: 0,
|
204
|
-
sample_size: 1024,
|
205
199
|
eol_char: "\n",
|
206
200
|
raise_if_empty: true,
|
207
201
|
truncate_ragged_lines: false,
|
@@ -305,7 +299,6 @@ module Polars
|
|
305
299
|
parse_dates,
|
306
300
|
skip_rows_after_header,
|
307
301
|
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
308
|
-
sample_size,
|
309
302
|
eol_char,
|
310
303
|
raise_if_empty,
|
311
304
|
truncate_ragged_lines,
|
@@ -392,9 +385,6 @@ module Polars
|
|
392
385
|
# the DataFrame.
|
393
386
|
# @param row_count_offset [Integer]
|
394
387
|
# Offset to start the row_count column (only used if the name is set).
|
395
|
-
# @param sample_size [Integer]
|
396
|
-
# Set the sample size. This is used to sample statistics to estimate the
|
397
|
-
# allocation needed.
|
398
388
|
# @param eol_char [String]
|
399
389
|
# Single byte end of line character.
|
400
390
|
# @param truncate_ragged_lines [Boolean]
|
@@ -431,7 +421,6 @@ module Polars
|
|
431
421
|
skip_rows_after_header: 0,
|
432
422
|
row_count_name: nil,
|
433
423
|
row_count_offset: 0,
|
434
|
-
sample_size: 1024,
|
435
424
|
eol_char: "\n",
|
436
425
|
raise_if_empty: true,
|
437
426
|
truncate_ragged_lines: false,
|
@@ -474,7 +463,6 @@ module Polars
|
|
474
463
|
skip_rows_after_header: skip_rows_after_header,
|
475
464
|
row_count_name: row_count_name,
|
476
465
|
row_count_offset: row_count_offset,
|
477
|
-
sample_size: sample_size,
|
478
466
|
eol_char: eol_char,
|
479
467
|
new_columns: new_columns,
|
480
468
|
raise_if_empty: raise_if_empty,
|
@@ -618,7 +606,7 @@ module Polars
|
|
618
606
|
|
619
607
|
# @private
|
620
608
|
def _scan_csv_impl(
|
621
|
-
|
609
|
+
source,
|
622
610
|
has_header: true,
|
623
611
|
sep: ",",
|
624
612
|
comment_char: nil,
|
@@ -650,9 +638,16 @@ module Polars
|
|
650
638
|
end
|
651
639
|
processed_null_values = Utils._process_null_values(null_values)
|
652
640
|
|
641
|
+
if source.is_a?(::Array)
|
642
|
+
sources = source
|
643
|
+
source = nil
|
644
|
+
else
|
645
|
+
sources = []
|
646
|
+
end
|
647
|
+
|
653
648
|
rblf =
|
654
649
|
RbLazyFrame.new_from_csv(
|
655
|
-
|
650
|
+
source,
|
656
651
|
sep,
|
657
652
|
has_header,
|
658
653
|
ignore_errors,
|
@@ -672,7 +667,8 @@ module Polars
|
|
672
667
|
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
673
668
|
parse_dates,
|
674
669
|
eol_char,
|
675
|
-
truncate_ragged_lines
|
670
|
+
truncate_ragged_lines,
|
671
|
+
sources
|
676
672
|
)
|
677
673
|
Utils.wrap_ldf(rblf)
|
678
674
|
end
|
@@ -681,7 +677,9 @@ module Polars
|
|
681
677
|
|
682
678
|
def _prepare_file_arg(file)
|
683
679
|
if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
|
684
|
-
|
680
|
+
require "uri"
|
681
|
+
|
682
|
+
file = URI(file)
|
685
683
|
end
|
686
684
|
|
687
685
|
if defined?(URI) && file.is_a?(URI)
|
data/lib/polars/io/database.rb
CHANGED
@@ -18,9 +18,9 @@ module Polars
|
|
18
18
|
if query.is_a?(ActiveRecord::Result)
|
19
19
|
query
|
20
20
|
elsif query.is_a?(ActiveRecord::Relation)
|
21
|
-
query.
|
21
|
+
query.connection_pool.with_connection { |c| c.select_all(query.to_sql) }
|
22
22
|
elsif query.is_a?(::String)
|
23
|
-
ActiveRecord::Base.
|
23
|
+
ActiveRecord::Base.connection_pool.with_connection { |c| c.select_all(query) }
|
24
24
|
else
|
25
25
|
raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
|
26
26
|
end
|
data/lib/polars/io/ipc.rb
CHANGED
@@ -189,10 +189,6 @@ module Polars
|
|
189
189
|
# Offset to start the row_count column (only used if the name is set).
|
190
190
|
# @param storage_options [Hash]
|
191
191
|
# Extra options that make sense for a particular storage connection.
|
192
|
-
# @param memory_map [Boolean]
|
193
|
-
# Try to memory map the file. This can greatly improve performance on repeated
|
194
|
-
# queries as the OS may cache pages.
|
195
|
-
# Only uncompressed IPC files can be memory mapped.
|
196
192
|
# @param hive_partitioning [Boolean]
|
197
193
|
# Infer statistics and schema from Hive partitioned URL and use them
|
198
194
|
# to prune reads. This is unset by default (i.e. `nil`), meaning it is
|
@@ -215,7 +211,6 @@ module Polars
|
|
215
211
|
row_count_name: nil,
|
216
212
|
row_count_offset: 0,
|
217
213
|
storage_options: nil,
|
218
|
-
memory_map: true,
|
219
214
|
hive_partitioning: nil,
|
220
215
|
hive_schema: nil,
|
221
216
|
try_parse_hive_dates: true,
|
@@ -229,7 +224,6 @@ module Polars
|
|
229
224
|
row_count_name: row_count_name,
|
230
225
|
row_count_offset: row_count_offset,
|
231
226
|
storage_options: storage_options,
|
232
|
-
memory_map: memory_map,
|
233
227
|
hive_partitioning: hive_partitioning,
|
234
228
|
hive_schema: hive_schema,
|
235
229
|
try_parse_hive_dates: try_parse_hive_dates,
|
@@ -239,31 +233,39 @@ module Polars
|
|
239
233
|
|
240
234
|
# @private
|
241
235
|
def _scan_ipc_impl(
|
242
|
-
|
236
|
+
source,
|
243
237
|
n_rows: nil,
|
244
238
|
cache: true,
|
245
239
|
rechunk: true,
|
246
240
|
row_count_name: nil,
|
247
241
|
row_count_offset: 0,
|
248
242
|
storage_options: nil,
|
249
|
-
memory_map: true,
|
250
243
|
hive_partitioning: nil,
|
251
244
|
hive_schema: nil,
|
252
245
|
try_parse_hive_dates: true,
|
253
246
|
include_file_paths: nil
|
254
247
|
)
|
255
|
-
|
256
|
-
|
248
|
+
sources = []
|
249
|
+
if Utils.pathlike?(source)
|
250
|
+
source = Utils.normalize_filepath(source)
|
251
|
+
elsif source.is_a?(::Array)
|
252
|
+
if Utils.is_path_or_str_sequence(source)
|
253
|
+
sources = source.map { |s| Utils.normalize_filepath(s) }
|
254
|
+
else
|
255
|
+
sources = source
|
256
|
+
end
|
257
|
+
|
258
|
+
source = nil
|
257
259
|
end
|
258
260
|
|
259
261
|
rblf =
|
260
262
|
RbLazyFrame.new_from_ipc(
|
261
|
-
|
263
|
+
source,
|
264
|
+
sources,
|
262
265
|
n_rows,
|
263
266
|
cache,
|
264
267
|
rechunk,
|
265
268
|
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
266
|
-
memory_map,
|
267
269
|
hive_partitioning,
|
268
270
|
hive_schema,
|
269
271
|
try_parse_hive_dates,
|
data/lib/polars/io/ndjson.rb
CHANGED
@@ -60,13 +60,23 @@ module Polars
|
|
60
60
|
row_count_name: nil,
|
61
61
|
row_count_offset: 0
|
62
62
|
)
|
63
|
+
sources = []
|
63
64
|
if Utils.pathlike?(source)
|
64
65
|
source = Utils.normalize_filepath(source)
|
66
|
+
elsif source.is_a?(::Array)
|
67
|
+
if Utils.is_path_or_str_sequence(source)
|
68
|
+
sources = source.map { |s| Utils.normalize_filepath(s) }
|
69
|
+
else
|
70
|
+
sources = source
|
71
|
+
end
|
72
|
+
|
73
|
+
source = nil
|
65
74
|
end
|
66
75
|
|
67
76
|
rblf =
|
68
77
|
RbLazyFrame.new_from_ndjson(
|
69
78
|
source,
|
79
|
+
sources,
|
70
80
|
infer_schema_length,
|
71
81
|
batch_size,
|
72
82
|
n_rows,
|