polars-df 0.14.0-x86_64-linux → 0.15.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/Cargo.lock +1296 -283
- data/LICENSE-THIRD-PARTY.txt +24793 -13160
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +275 -52
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +103 -2
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -4
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +465 -35
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +4 -3
- data/lib/polars/functions.rb +0 -57
@@ -824,6 +824,29 @@ module Polars
|
|
824
824
|
# @note
|
825
825
|
# If you simply want the first encountered expression as accumulator,
|
826
826
|
# consider using `cumreduce`.
|
827
|
+
#
|
828
|
+
# @example
|
829
|
+
# df = Polars::DataFrame.new(
|
830
|
+
# {
|
831
|
+
# "a" => [1, 2, 3],
|
832
|
+
# "b" => [3, 4, 5],
|
833
|
+
# "c" => [5, 6, 7]
|
834
|
+
# }
|
835
|
+
# )
|
836
|
+
# df.with_columns(
|
837
|
+
# Polars.cum_fold(Polars.lit(1), ->(acc, x) { acc + x }, Polars.all)
|
838
|
+
# )
|
839
|
+
# # =>
|
840
|
+
# # shape: (3, 4)
|
841
|
+
# # ┌─────┬─────┬─────┬───────────┐
|
842
|
+
# # │ a ┆ b ┆ c ┆ cum_fold │
|
843
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
844
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
|
845
|
+
# # ╞═════╪═════╪═════╪═══════════╡
|
846
|
+
# # │ 1 ┆ 3 ┆ 5 ┆ {2,5,10} │
|
847
|
+
# # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
|
848
|
+
# # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
|
849
|
+
# # └─────┴─────┴─────┴───────────┘
|
827
850
|
def cum_fold(acc, f, exprs, include_init: false)
|
828
851
|
acc = Utils.parse_into_expression(acc, str_as_lit: true)
|
829
852
|
if exprs.is_a?(Expr)
|
@@ -831,7 +854,7 @@ module Polars
|
|
831
854
|
end
|
832
855
|
|
833
856
|
exprs = Utils.parse_into_list_of_expressions(exprs)
|
834
|
-
Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init))
|
857
|
+
Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
|
835
858
|
end
|
836
859
|
alias_method :cumfold, :cum_fold
|
837
860
|
|
@@ -1024,15 +1047,70 @@ module Polars
|
|
1024
1047
|
# Default is ascending.
|
1025
1048
|
#
|
1026
1049
|
# @return [Expr]
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1050
|
+
#
|
1051
|
+
# @example Pass a single column name to compute the arg sort by that column.
|
1052
|
+
# df = Polars::DataFrame.new(
|
1053
|
+
# {
|
1054
|
+
# "a" => [0, 1, 1, 0],
|
1055
|
+
# "b" => [3, 2, 3, 2],
|
1056
|
+
# "c" => [1, 2, 3, 4]
|
1057
|
+
# }
|
1058
|
+
# )
|
1059
|
+
# df.select(Polars.arg_sort_by("a"))
|
1060
|
+
# # =>
|
1061
|
+
# # shape: (4, 1)
|
1062
|
+
# # ┌─────┐
|
1063
|
+
# # │ a │
|
1064
|
+
# # │ --- │
|
1065
|
+
# # │ u32 │
|
1066
|
+
# # ╞═════╡
|
1067
|
+
# # │ 0 │
|
1068
|
+
# # │ 3 │
|
1069
|
+
# # │ 1 │
|
1070
|
+
# # │ 2 │
|
1071
|
+
# # └─────┘
|
1072
|
+
#
|
1073
|
+
# @example Compute the arg sort by multiple columns by either passing a list of columns, or by specifying each column as a positional argument.
|
1074
|
+
# df.select(Polars.arg_sort_by(["a", "b"], reverse: true))
|
1075
|
+
# # =>
|
1076
|
+
# # shape: (4, 1)
|
1077
|
+
# # ┌─────┐
|
1078
|
+
# # │ a │
|
1079
|
+
# # │ --- │
|
1080
|
+
# # │ u32 │
|
1081
|
+
# # ╞═════╡
|
1082
|
+
# # │ 2 │
|
1083
|
+
# # │ 1 │
|
1084
|
+
# # │ 0 │
|
1085
|
+
# # │ 3 │
|
1086
|
+
# # └─────┘
|
1087
|
+
#
|
1088
|
+
# @example Use gather to apply the arg sort to other columns.
|
1089
|
+
# df.select(Polars.col("c").gather(Polars.arg_sort_by("a")))
|
1090
|
+
# # =>
|
1091
|
+
# # shape: (4, 1)
|
1092
|
+
# # ┌─────┐
|
1093
|
+
# # │ c │
|
1094
|
+
# # │ --- │
|
1095
|
+
# # │ i64 │
|
1096
|
+
# # ╞═════╡
|
1097
|
+
# # │ 1 │
|
1098
|
+
# # │ 4 │
|
1099
|
+
# # │ 2 │
|
1100
|
+
# # │ 3 │
|
1101
|
+
# # └─────┘
|
1102
|
+
def arg_sort_by(
|
1103
|
+
exprs,
|
1104
|
+
*more_exprs,
|
1105
|
+
reverse: false,
|
1106
|
+
nulls_last: false,
|
1107
|
+
multithreaded: true,
|
1108
|
+
maintain_order: false
|
1109
|
+
)
|
1110
|
+
exprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
|
1111
|
+
reverse = Utils.extend_bool(reverse, exprs.length, "reverse", "exprs")
|
1112
|
+
nulls_last = Utils.extend_bool(nulls_last, exprs.length, "nulls_last", "exprs")
|
1113
|
+
Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse, nulls_last, multithreaded, maintain_order))
|
1036
1114
|
end
|
1037
1115
|
alias_method :argsort_by, :arg_sort_by
|
1038
1116
|
|
@@ -18,7 +18,7 @@ module Polars
|
|
18
18
|
#
|
19
19
|
# @example
|
20
20
|
# Polars.time_range(
|
21
|
-
#
|
21
|
+
# Time.utc(2000, 1, 1, 14, 0),
|
22
22
|
# nil,
|
23
23
|
# "3h15m",
|
24
24
|
# eager: true
|
@@ -48,12 +48,12 @@ module Polars
|
|
48
48
|
end
|
49
49
|
|
50
50
|
if start.nil?
|
51
|
-
#
|
52
|
-
|
51
|
+
# date part is ignored
|
52
|
+
start = ::Time.utc(2000, 1, 1, 0, 0, 0)
|
53
53
|
end
|
54
54
|
if stop.nil?
|
55
|
-
#
|
56
|
-
|
55
|
+
# date part is ignored
|
56
|
+
stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
|
57
57
|
end
|
58
58
|
|
59
59
|
start_rbexpr = Utils.parse_into_expression(start)
|
@@ -87,21 +87,21 @@ module Polars
|
|
87
87
|
# @example
|
88
88
|
# df = Polars::DataFrame.new(
|
89
89
|
# {
|
90
|
-
# "start" => [
|
91
|
-
# "end" =>
|
90
|
+
# "start" => [Time.utc(2000, 1, 1, 9, 0), Time.utc(2000, 1, 1, 10, 0)],
|
91
|
+
# "end" => Time.utc(2000, 1, 1, 11, 0)
|
92
92
|
# }
|
93
93
|
# )
|
94
|
-
# df.
|
94
|
+
# df.select(time_range: Polars.time_ranges("start", "end"))
|
95
95
|
# # =>
|
96
|
-
# # shape: (2,
|
97
|
-
# #
|
98
|
-
# # │
|
99
|
-
# # │ ---
|
100
|
-
# # │
|
101
|
-
# #
|
102
|
-
# # │
|
103
|
-
# # │
|
104
|
-
# #
|
96
|
+
# # shape: (2, 1)
|
97
|
+
# # ┌────────────────────────────────┐
|
98
|
+
# # │ time_range │
|
99
|
+
# # │ --- │
|
100
|
+
# # │ list[time] │
|
101
|
+
# # ╞════════════════════════════════╡
|
102
|
+
# # │ [09:00:00, 10:00:00, 11:00:00] │
|
103
|
+
# # │ [10:00:00, 11:00:00] │
|
104
|
+
# # └────────────────────────────────┘
|
105
105
|
def time_ranges(
|
106
106
|
start = nil,
|
107
107
|
stop = nil,
|
@@ -118,12 +118,12 @@ module Polars
|
|
118
118
|
end
|
119
119
|
|
120
120
|
if start.nil?
|
121
|
-
#
|
122
|
-
|
121
|
+
# date part is ignored
|
122
|
+
start = ::Time.utc(2000, 1, 1, 0, 0, 0)
|
123
123
|
end
|
124
124
|
if stop.nil?
|
125
|
-
#
|
126
|
-
|
125
|
+
# date part is ignored
|
126
|
+
stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
|
127
127
|
end
|
128
128
|
|
129
129
|
start_rbexpr = Utils.parse_into_expression(start)
|
data/lib/polars/io/csv.rb
CHANGED
@@ -75,9 +75,6 @@ module Polars
|
|
75
75
|
# the DataFrame.
|
76
76
|
# @param row_count_offset [Integer]
|
77
77
|
# Offset to start the row_count column (only used if the name is set).
|
78
|
-
# @param sample_size [Integer]
|
79
|
-
# Set the sample size. This is used to sample statistics to estimate the
|
80
|
-
# allocation needed.
|
81
78
|
# @param eol_char [String]
|
82
79
|
# Single byte end of line character.
|
83
80
|
# @param truncate_ragged_lines [Boolean]
|
@@ -114,7 +111,6 @@ module Polars
|
|
114
111
|
skip_rows_after_header: 0,
|
115
112
|
row_count_name: nil,
|
116
113
|
row_count_offset: 0,
|
117
|
-
sample_size: 1024,
|
118
114
|
eol_char: "\n",
|
119
115
|
truncate_ragged_lines: false
|
120
116
|
)
|
@@ -163,7 +159,6 @@ module Polars
|
|
163
159
|
skip_rows_after_header: skip_rows_after_header,
|
164
160
|
row_count_name: row_count_name,
|
165
161
|
row_count_offset: row_count_offset,
|
166
|
-
sample_size: sample_size,
|
167
162
|
eol_char: eol_char,
|
168
163
|
truncate_ragged_lines: truncate_ragged_lines
|
169
164
|
)
|
@@ -201,7 +196,6 @@ module Polars
|
|
201
196
|
skip_rows_after_header: 0,
|
202
197
|
row_count_name: nil,
|
203
198
|
row_count_offset: 0,
|
204
|
-
sample_size: 1024,
|
205
199
|
eol_char: "\n",
|
206
200
|
raise_if_empty: true,
|
207
201
|
truncate_ragged_lines: false,
|
@@ -305,7 +299,6 @@ module Polars
|
|
305
299
|
parse_dates,
|
306
300
|
skip_rows_after_header,
|
307
301
|
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
308
|
-
sample_size,
|
309
302
|
eol_char,
|
310
303
|
raise_if_empty,
|
311
304
|
truncate_ragged_lines,
|
@@ -392,9 +385,6 @@ module Polars
|
|
392
385
|
# the DataFrame.
|
393
386
|
# @param row_count_offset [Integer]
|
394
387
|
# Offset to start the row_count column (only used if the name is set).
|
395
|
-
# @param sample_size [Integer]
|
396
|
-
# Set the sample size. This is used to sample statistics to estimate the
|
397
|
-
# allocation needed.
|
398
388
|
# @param eol_char [String]
|
399
389
|
# Single byte end of line character.
|
400
390
|
# @param truncate_ragged_lines [Boolean]
|
@@ -431,7 +421,6 @@ module Polars
|
|
431
421
|
skip_rows_after_header: 0,
|
432
422
|
row_count_name: nil,
|
433
423
|
row_count_offset: 0,
|
434
|
-
sample_size: 1024,
|
435
424
|
eol_char: "\n",
|
436
425
|
raise_if_empty: true,
|
437
426
|
truncate_ragged_lines: false,
|
@@ -474,7 +463,6 @@ module Polars
|
|
474
463
|
skip_rows_after_header: skip_rows_after_header,
|
475
464
|
row_count_name: row_count_name,
|
476
465
|
row_count_offset: row_count_offset,
|
477
|
-
sample_size: sample_size,
|
478
466
|
eol_char: eol_char,
|
479
467
|
new_columns: new_columns,
|
480
468
|
raise_if_empty: raise_if_empty,
|
@@ -618,7 +606,7 @@ module Polars
|
|
618
606
|
|
619
607
|
# @private
|
620
608
|
def _scan_csv_impl(
|
621
|
-
|
609
|
+
source,
|
622
610
|
has_header: true,
|
623
611
|
sep: ",",
|
624
612
|
comment_char: nil,
|
@@ -650,9 +638,16 @@ module Polars
|
|
650
638
|
end
|
651
639
|
processed_null_values = Utils._process_null_values(null_values)
|
652
640
|
|
641
|
+
if source.is_a?(::Array)
|
642
|
+
sources = source
|
643
|
+
source = nil
|
644
|
+
else
|
645
|
+
sources = []
|
646
|
+
end
|
647
|
+
|
653
648
|
rblf =
|
654
649
|
RbLazyFrame.new_from_csv(
|
655
|
-
|
650
|
+
source,
|
656
651
|
sep,
|
657
652
|
has_header,
|
658
653
|
ignore_errors,
|
@@ -672,7 +667,8 @@ module Polars
|
|
672
667
|
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
673
668
|
parse_dates,
|
674
669
|
eol_char,
|
675
|
-
truncate_ragged_lines
|
670
|
+
truncate_ragged_lines,
|
671
|
+
sources
|
676
672
|
)
|
677
673
|
Utils.wrap_ldf(rblf)
|
678
674
|
end
|
@@ -681,7 +677,9 @@ module Polars
|
|
681
677
|
|
682
678
|
def _prepare_file_arg(file)
|
683
679
|
if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
|
684
|
-
|
680
|
+
require "uri"
|
681
|
+
|
682
|
+
file = URI(file)
|
685
683
|
end
|
686
684
|
|
687
685
|
if defined?(URI) && file.is_a?(URI)
|
data/lib/polars/io/database.rb
CHANGED
@@ -18,9 +18,9 @@ module Polars
|
|
18
18
|
if query.is_a?(ActiveRecord::Result)
|
19
19
|
query
|
20
20
|
elsif query.is_a?(ActiveRecord::Relation)
|
21
|
-
query.
|
21
|
+
query.connection_pool.with_connection { |c| c.select_all(query.to_sql) }
|
22
22
|
elsif query.is_a?(::String)
|
23
|
-
ActiveRecord::Base.
|
23
|
+
ActiveRecord::Base.connection_pool.with_connection { |c| c.select_all(query) }
|
24
24
|
else
|
25
25
|
raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
|
26
26
|
end
|
data/lib/polars/io/ipc.rb
CHANGED
@@ -233,7 +233,7 @@ module Polars
|
|
233
233
|
|
234
234
|
# @private
|
235
235
|
def _scan_ipc_impl(
|
236
|
-
|
236
|
+
source,
|
237
237
|
n_rows: nil,
|
238
238
|
cache: true,
|
239
239
|
rechunk: true,
|
@@ -245,13 +245,23 @@ module Polars
|
|
245
245
|
try_parse_hive_dates: true,
|
246
246
|
include_file_paths: nil
|
247
247
|
)
|
248
|
-
|
249
|
-
|
248
|
+
sources = []
|
249
|
+
if Utils.pathlike?(source)
|
250
|
+
source = Utils.normalize_filepath(source)
|
251
|
+
elsif source.is_a?(::Array)
|
252
|
+
if Utils.is_path_or_str_sequence(source)
|
253
|
+
sources = source.map { |s| Utils.normalize_filepath(s) }
|
254
|
+
else
|
255
|
+
sources = source
|
256
|
+
end
|
257
|
+
|
258
|
+
source = nil
|
250
259
|
end
|
251
260
|
|
252
261
|
rblf =
|
253
262
|
RbLazyFrame.new_from_ipc(
|
254
|
-
|
263
|
+
source,
|
264
|
+
sources,
|
255
265
|
n_rows,
|
256
266
|
cache,
|
257
267
|
rechunk,
|
data/lib/polars/io/ndjson.rb
CHANGED
@@ -60,13 +60,23 @@ module Polars
|
|
60
60
|
row_count_name: nil,
|
61
61
|
row_count_offset: 0
|
62
62
|
)
|
63
|
+
sources = []
|
63
64
|
if Utils.pathlike?(source)
|
64
65
|
source = Utils.normalize_filepath(source)
|
66
|
+
elsif source.is_a?(::Array)
|
67
|
+
if Utils.is_path_or_str_sequence(source)
|
68
|
+
sources = source.map { |s| Utils.normalize_filepath(s) }
|
69
|
+
else
|
70
|
+
sources = source
|
71
|
+
end
|
72
|
+
|
73
|
+
source = nil
|
65
74
|
end
|
66
75
|
|
67
76
|
rblf =
|
68
77
|
RbLazyFrame.new_from_ndjson(
|
69
78
|
source,
|
79
|
+
sources,
|
70
80
|
infer_schema_length,
|
71
81
|
batch_size,
|
72
82
|
n_rows,
|