polars-df 0.10.0-x86_64-linux → 0.12.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +392 -351
- data/LICENSE-THIRD-PARTY.txt +1125 -865
- data/README.md +6 -6
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +4 -4
- data/lib/polars/batched_csv_reader.rb +11 -5
- data/lib/polars/cat_expr.rb +0 -36
- data/lib/polars/cat_name_space.rb +0 -37
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +176 -403
- data/lib/polars/data_types.rb +1 -1
- data/lib/polars/date_time_expr.rb +525 -572
- data/lib/polars/date_time_name_space.rb +263 -460
- data/lib/polars/dynamic_group_by.rb +5 -5
- data/lib/polars/exceptions.rb +7 -0
- data/lib/polars/expr.rb +1394 -243
- data/lib/polars/expr_dispatch.rb +1 -1
- data/lib/polars/functions/aggregation/horizontal.rb +8 -8
- data/lib/polars/functions/as_datatype.rb +63 -40
- data/lib/polars/functions/lazy.rb +63 -14
- data/lib/polars/functions/lit.rb +1 -1
- data/lib/polars/functions/range/date_range.rb +90 -57
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +2 -2
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +1 -1
- data/lib/polars/functions/whenthen.rb +1 -1
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +299 -493
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +227 -0
- data/lib/polars/lazy_frame.rb +143 -272
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +11 -11
- data/lib/polars/list_name_space.rb +5 -1
- data/lib/polars/rolling_group_by.rb +7 -9
- data/lib/polars/series.rb +103 -187
- data/lib/polars/string_expr.rb +78 -102
- data/lib/polars/string_name_space.rb +5 -4
- data/lib/polars/testing.rb +2 -2
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +8 -300
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +6 -6
- data/lib/polars.rb +20 -1
- metadata +17 -4
data/lib/polars/lazy_frame.rb
CHANGED
@@ -27,149 +27,6 @@ module Polars
|
|
27
27
|
ldf
|
28
28
|
end
|
29
29
|
|
30
|
-
# @private
|
31
|
-
def self._scan_csv(
|
32
|
-
file,
|
33
|
-
has_header: true,
|
34
|
-
sep: ",",
|
35
|
-
comment_char: nil,
|
36
|
-
quote_char: '"',
|
37
|
-
skip_rows: 0,
|
38
|
-
dtypes: nil,
|
39
|
-
null_values: nil,
|
40
|
-
ignore_errors: false,
|
41
|
-
cache: true,
|
42
|
-
with_column_names: nil,
|
43
|
-
infer_schema_length: 100,
|
44
|
-
n_rows: nil,
|
45
|
-
encoding: "utf8",
|
46
|
-
low_memory: false,
|
47
|
-
rechunk: true,
|
48
|
-
skip_rows_after_header: 0,
|
49
|
-
row_count_name: nil,
|
50
|
-
row_count_offset: 0,
|
51
|
-
parse_dates: false,
|
52
|
-
eol_char: "\n",
|
53
|
-
truncate_ragged_lines: true
|
54
|
-
)
|
55
|
-
dtype_list = nil
|
56
|
-
if !dtypes.nil?
|
57
|
-
dtype_list = []
|
58
|
-
dtypes.each do |k, v|
|
59
|
-
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
60
|
-
end
|
61
|
-
end
|
62
|
-
processed_null_values = Utils._process_null_values(null_values)
|
63
|
-
|
64
|
-
_from_rbldf(
|
65
|
-
RbLazyFrame.new_from_csv(
|
66
|
-
file,
|
67
|
-
sep,
|
68
|
-
has_header,
|
69
|
-
ignore_errors,
|
70
|
-
skip_rows,
|
71
|
-
n_rows,
|
72
|
-
cache,
|
73
|
-
dtype_list,
|
74
|
-
low_memory,
|
75
|
-
comment_char,
|
76
|
-
quote_char,
|
77
|
-
processed_null_values,
|
78
|
-
infer_schema_length,
|
79
|
-
with_column_names,
|
80
|
-
rechunk,
|
81
|
-
skip_rows_after_header,
|
82
|
-
encoding,
|
83
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
84
|
-
parse_dates,
|
85
|
-
eol_char,
|
86
|
-
truncate_ragged_lines
|
87
|
-
)
|
88
|
-
)
|
89
|
-
end
|
90
|
-
|
91
|
-
# @private
|
92
|
-
def self._scan_parquet(
|
93
|
-
file,
|
94
|
-
n_rows: nil,
|
95
|
-
cache: true,
|
96
|
-
parallel: "auto",
|
97
|
-
rechunk: true,
|
98
|
-
row_count_name: nil,
|
99
|
-
row_count_offset: 0,
|
100
|
-
storage_options: nil,
|
101
|
-
low_memory: false,
|
102
|
-
use_statistics: true,
|
103
|
-
hive_partitioning: true
|
104
|
-
)
|
105
|
-
_from_rbldf(
|
106
|
-
RbLazyFrame.new_from_parquet(
|
107
|
-
file,
|
108
|
-
[],
|
109
|
-
n_rows,
|
110
|
-
cache,
|
111
|
-
parallel,
|
112
|
-
rechunk,
|
113
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
114
|
-
low_memory,
|
115
|
-
use_statistics,
|
116
|
-
hive_partitioning,
|
117
|
-
nil
|
118
|
-
)
|
119
|
-
)
|
120
|
-
end
|
121
|
-
|
122
|
-
# @private
|
123
|
-
def self._scan_ipc(
|
124
|
-
file,
|
125
|
-
n_rows: nil,
|
126
|
-
cache: true,
|
127
|
-
rechunk: true,
|
128
|
-
row_count_name: nil,
|
129
|
-
row_count_offset: 0,
|
130
|
-
storage_options: nil,
|
131
|
-
memory_map: true
|
132
|
-
)
|
133
|
-
if Utils.pathlike?(file)
|
134
|
-
file = Utils.normalise_filepath(file)
|
135
|
-
end
|
136
|
-
|
137
|
-
_from_rbldf(
|
138
|
-
RbLazyFrame.new_from_ipc(
|
139
|
-
file,
|
140
|
-
n_rows,
|
141
|
-
cache,
|
142
|
-
rechunk,
|
143
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
144
|
-
memory_map
|
145
|
-
)
|
146
|
-
)
|
147
|
-
end
|
148
|
-
|
149
|
-
# @private
|
150
|
-
def self._scan_ndjson(
|
151
|
-
file,
|
152
|
-
infer_schema_length: nil,
|
153
|
-
batch_size: nil,
|
154
|
-
n_rows: nil,
|
155
|
-
low_memory: false,
|
156
|
-
rechunk: true,
|
157
|
-
row_count_name: nil,
|
158
|
-
row_count_offset: 0
|
159
|
-
)
|
160
|
-
_from_rbldf(
|
161
|
-
RbLazyFrame.new_from_ndjson(
|
162
|
-
file,
|
163
|
-
infer_schema_length,
|
164
|
-
batch_size,
|
165
|
-
n_rows,
|
166
|
-
low_memory,
|
167
|
-
rechunk,
|
168
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset)
|
169
|
-
)
|
170
|
-
)
|
171
|
-
end
|
172
|
-
|
173
30
|
# def self.from_json
|
174
31
|
# end
|
175
32
|
|
@@ -181,7 +38,7 @@ module Polars
|
|
181
38
|
# @return [LazyFrame]
|
182
39
|
def self.read_json(file)
|
183
40
|
if Utils.pathlike?(file)
|
184
|
-
file = Utils.
|
41
|
+
file = Utils.normalize_filepath(file)
|
185
42
|
end
|
186
43
|
|
187
44
|
Utils.wrap_ldf(RbLazyFrame.read_json(file))
|
@@ -206,7 +63,7 @@ module Polars
|
|
206
63
|
# df.columns
|
207
64
|
# # => ["foo", "bar"]
|
208
65
|
def columns
|
209
|
-
_ldf.
|
66
|
+
_ldf.collect_schema.keys
|
210
67
|
end
|
211
68
|
|
212
69
|
# Get dtypes of columns in LazyFrame.
|
@@ -224,7 +81,7 @@ module Polars
|
|
224
81
|
# lf.dtypes
|
225
82
|
# # => [Polars::Int64, Polars::Float64, Polars::String]
|
226
83
|
def dtypes
|
227
|
-
_ldf.
|
84
|
+
_ldf.collect_schema.values
|
228
85
|
end
|
229
86
|
|
230
87
|
# Get the schema.
|
@@ -242,7 +99,7 @@ module Polars
|
|
242
99
|
# lf.schema
|
243
100
|
# # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
|
244
101
|
def schema
|
245
|
-
_ldf.
|
102
|
+
_ldf.collect_schema
|
246
103
|
end
|
247
104
|
|
248
105
|
# Get the width of the LazyFrame.
|
@@ -254,7 +111,7 @@ module Polars
|
|
254
111
|
# lf.width
|
255
112
|
# # => 2
|
256
113
|
def width
|
257
|
-
_ldf.
|
114
|
+
_ldf.collect_schema.length
|
258
115
|
end
|
259
116
|
|
260
117
|
# Check if LazyFrame includes key.
|
@@ -288,7 +145,7 @@ module Polars
|
|
288
145
|
# @return [nil]
|
289
146
|
def write_json(file)
|
290
147
|
if Utils.pathlike?(file)
|
291
|
-
file = Utils.
|
148
|
+
file = Utils.normalize_filepath(file)
|
292
149
|
end
|
293
150
|
_ldf.write_json(file)
|
294
151
|
nil
|
@@ -404,16 +261,23 @@ module Polars
|
|
404
261
|
# # │ 2 ┆ 7.0 ┆ b │
|
405
262
|
# # │ 1 ┆ 6.0 ┆ a │
|
406
263
|
# # └─────┴─────┴─────┘
|
407
|
-
def sort(by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
|
408
|
-
if by.is_a?(::String)
|
409
|
-
return _from_rbldf(
|
410
|
-
|
411
|
-
|
412
|
-
|
264
|
+
def sort(by, *more_by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
|
265
|
+
if by.is_a?(::String) && more_by.empty?
|
266
|
+
return _from_rbldf(
|
267
|
+
_ldf.sort(
|
268
|
+
by, reverse, nulls_last, maintain_order, multithreaded
|
269
|
+
)
|
270
|
+
)
|
413
271
|
end
|
414
272
|
|
415
|
-
by = Utils.
|
416
|
-
|
273
|
+
by = Utils.parse_into_list_of_expressions(by, *more_by)
|
274
|
+
reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
|
275
|
+
nulls_last = Utils.extend_bool(nulls_last, by.length, "nulls_last", "by")
|
276
|
+
_from_rbldf(
|
277
|
+
_ldf.sort_by_exprs(
|
278
|
+
by, reverse, nulls_last, maintain_order, multithreaded
|
279
|
+
)
|
280
|
+
)
|
417
281
|
end
|
418
282
|
|
419
283
|
# def profile
|
@@ -558,7 +422,7 @@ module Polars
|
|
558
422
|
path,
|
559
423
|
compression: "zstd",
|
560
424
|
compression_level: nil,
|
561
|
-
statistics:
|
425
|
+
statistics: true,
|
562
426
|
row_group_size: nil,
|
563
427
|
data_pagesize_limit: nil,
|
564
428
|
maintain_order: true,
|
@@ -578,6 +442,24 @@ module Polars
|
|
578
442
|
no_optimization: no_optimization
|
579
443
|
)
|
580
444
|
|
445
|
+
if statistics == true
|
446
|
+
statistics = {
|
447
|
+
min: true,
|
448
|
+
max: true,
|
449
|
+
distinct_count: false,
|
450
|
+
null_count: true
|
451
|
+
}
|
452
|
+
elsif statistics == false
|
453
|
+
statistics = {}
|
454
|
+
elsif statistics == "full"
|
455
|
+
statistics = {
|
456
|
+
min: true,
|
457
|
+
max: true,
|
458
|
+
distinct_count: true,
|
459
|
+
null_count: true
|
460
|
+
}
|
461
|
+
end
|
462
|
+
|
581
463
|
lf.sink_parquet(
|
582
464
|
path,
|
583
465
|
compression,
|
@@ -732,6 +614,7 @@ module Polars
|
|
732
614
|
datetime_format: nil,
|
733
615
|
date_format: nil,
|
734
616
|
time_format: nil,
|
617
|
+
float_scientific: nil,
|
735
618
|
float_precision: nil,
|
736
619
|
null_value: nil,
|
737
620
|
quote_style: nil,
|
@@ -766,6 +649,7 @@ module Polars
|
|
766
649
|
datetime_format,
|
767
650
|
date_format,
|
768
651
|
time_format,
|
652
|
+
float_scientific,
|
769
653
|
float_precision,
|
770
654
|
null_value,
|
771
655
|
quote_style,
|
@@ -1050,7 +934,7 @@ module Polars
|
|
1050
934
|
def filter(predicate)
|
1051
935
|
_from_rbldf(
|
1052
936
|
_ldf.filter(
|
1053
|
-
Utils.
|
937
|
+
Utils.parse_into_expression(predicate, str_as_lit: false)
|
1054
938
|
)
|
1055
939
|
)
|
1056
940
|
end
|
@@ -1137,7 +1021,7 @@ module Polars
|
|
1137
1021
|
# # ┌─────────┐
|
1138
1022
|
# # │ literal │
|
1139
1023
|
# # │ --- │
|
1140
|
-
# # │
|
1024
|
+
# # │ i32 │
|
1141
1025
|
# # ╞═════════╡
|
1142
1026
|
# # │ 0 │
|
1143
1027
|
# # │ 0 │
|
@@ -1146,7 +1030,7 @@ module Polars
|
|
1146
1030
|
def select(*exprs, **named_exprs)
|
1147
1031
|
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
|
1148
1032
|
|
1149
|
-
rbexprs = Utils.
|
1033
|
+
rbexprs = Utils.parse_into_list_of_expressions(
|
1150
1034
|
*exprs, **named_exprs, __structify: structify
|
1151
1035
|
)
|
1152
1036
|
_from_rbldf(_ldf.select(rbexprs))
|
@@ -1154,12 +1038,14 @@ module Polars
|
|
1154
1038
|
|
1155
1039
|
# Start a group by operation.
|
1156
1040
|
#
|
1157
|
-
# @param by [
|
1041
|
+
# @param by [Array]
|
1158
1042
|
# Column(s) to group by.
|
1159
1043
|
# @param maintain_order [Boolean]
|
1160
1044
|
# Make sure that the order of the groups remain consistent. This is more
|
1161
1045
|
# expensive than a default group by.
|
1162
|
-
#
|
1046
|
+
# @param named_by [Hash]
|
1047
|
+
# Additional columns to group by, specified as keyword arguments.
|
1048
|
+
# The columns will be renamed to the keyword used.
|
1163
1049
|
# @return [LazyGroupBy]
|
1164
1050
|
#
|
1165
1051
|
# @example
|
@@ -1182,9 +1068,9 @@ module Polars
|
|
1182
1068
|
# # │ b ┆ 11 │
|
1183
1069
|
# # │ c ┆ 6 │
|
1184
1070
|
# # └─────┴─────┘
|
1185
|
-
def group_by(by, maintain_order: false)
|
1186
|
-
|
1187
|
-
lgb = _ldf.group_by(
|
1071
|
+
def group_by(*by, maintain_order: false, **named_by)
|
1072
|
+
exprs = Utils.parse_into_list_of_expressions(*by, **named_by)
|
1073
|
+
lgb = _ldf.group_by(exprs, maintain_order)
|
1188
1074
|
LazyGroupBy.new(lgb)
|
1189
1075
|
end
|
1190
1076
|
alias_method :groupby, :group_by
|
@@ -1238,12 +1124,6 @@ module Polars
|
|
1238
1124
|
# Define whether the temporal window interval is closed or not.
|
1239
1125
|
# @param by [Object]
|
1240
1126
|
# Also group by this column/these columns.
|
1241
|
-
# @param check_sorted [Boolean]
|
1242
|
-
# When the `by` argument is given, polars can not check sortedness
|
1243
|
-
# by the metadata and has to do a full scan on the index column to
|
1244
|
-
# verify data is sorted. This is expensive. If you are sure the
|
1245
|
-
# data within the by groups is sorted, you can set this to `false`.
|
1246
|
-
# Doing so incorrectly will lead to incorrect output
|
1247
1127
|
#
|
1248
1128
|
# @return [LazyFrame]
|
1249
1129
|
#
|
@@ -1285,21 +1165,20 @@ module Polars
|
|
1285
1165
|
period:,
|
1286
1166
|
offset: nil,
|
1287
1167
|
closed: "right",
|
1288
|
-
by: nil
|
1289
|
-
check_sorted: true
|
1168
|
+
by: nil
|
1290
1169
|
)
|
1291
|
-
index_column = Utils.
|
1170
|
+
index_column = Utils.parse_into_expression(index_column)
|
1292
1171
|
if offset.nil?
|
1293
|
-
offset =
|
1172
|
+
offset = Utils.negate_duration_string(Utils.parse_as_duration_string(period))
|
1294
1173
|
end
|
1295
1174
|
|
1296
|
-
rbexprs_by =
|
1297
|
-
|
1298
|
-
offset = Utils._timedelta_to_pl_duration(offset)
|
1299
|
-
|
1300
|
-
lgb = _ldf.rolling(
|
1301
|
-
index_column, period, offset, closed, rbexprs_by, check_sorted
|
1175
|
+
rbexprs_by = (
|
1176
|
+
!by.nil? ? Utils.parse_into_list_of_expressions(by) : []
|
1302
1177
|
)
|
1178
|
+
period = Utils.parse_as_duration_string(period)
|
1179
|
+
offset = Utils.parse_as_duration_string(offset)
|
1180
|
+
|
1181
|
+
lgb = _ldf.rolling(index_column, period, offset, closed, rbexprs_by)
|
1303
1182
|
LazyGroupBy.new(lgb)
|
1304
1183
|
end
|
1305
1184
|
alias_method :group_by_rolling, :rolling
|
@@ -1367,22 +1246,18 @@ module Polars
|
|
1367
1246
|
# Define whether the temporal window interval is closed or not.
|
1368
1247
|
# @param by [Object]
|
1369
1248
|
# Also group by this column/these columns
|
1370
|
-
# @param check_sorted [Boolean]
|
1371
|
-
# When the `by` argument is given, polars can not check sortedness
|
1372
|
-
# by the metadata and has to do a full scan on the index column to
|
1373
|
-
# verify data is sorted. This is expensive. If you are sure the
|
1374
|
-
# data within the by groups is sorted, you can set this to `false`.
|
1375
|
-
# Doing so incorrectly will lead to incorrect output.
|
1376
1249
|
#
|
1377
1250
|
# @return [DataFrame]
|
1378
1251
|
#
|
1379
1252
|
# @example
|
1380
1253
|
# df = Polars::DataFrame.new(
|
1381
1254
|
# {
|
1382
|
-
# "time" => Polars.
|
1255
|
+
# "time" => Polars.datetime_range(
|
1383
1256
|
# DateTime.new(2021, 12, 16),
|
1384
1257
|
# DateTime.new(2021, 12, 16, 3),
|
1385
|
-
# "30m"
|
1258
|
+
# "30m",
|
1259
|
+
# time_unit: "us",
|
1260
|
+
# eager: true
|
1386
1261
|
# ),
|
1387
1262
|
# "n" => 0..6
|
1388
1263
|
# }
|
@@ -1449,16 +1324,16 @@ module Polars
|
|
1449
1324
|
# )
|
1450
1325
|
# # =>
|
1451
1326
|
# # shape: (4, 3)
|
1452
|
-
# #
|
1453
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
1454
|
-
# # │ --- ┆ --- ┆ ---
|
1455
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
1456
|
-
# #
|
1457
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12
|
1458
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12
|
1459
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12
|
1460
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
1461
|
-
# #
|
1327
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────┐
|
1328
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1329
|
+
# # │ --- ┆ --- ┆ --- │
|
1330
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1331
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════╡
|
1332
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-… │
|
1333
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-… │
|
1334
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-… │
|
1335
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1336
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────┘
|
1462
1337
|
#
|
1463
1338
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1464
1339
|
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
@@ -1481,10 +1356,12 @@ module Polars
|
|
1481
1356
|
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
1482
1357
|
# df = Polars::DataFrame.new(
|
1483
1358
|
# {
|
1484
|
-
# "time" => Polars.
|
1359
|
+
# "time" => Polars.datetime_range(
|
1485
1360
|
# DateTime.new(2021, 12, 16),
|
1486
1361
|
# DateTime.new(2021, 12, 16, 3),
|
1487
|
-
# "30m"
|
1362
|
+
# "30m",
|
1363
|
+
# time_unit: "us",
|
1364
|
+
# eager: true
|
1488
1365
|
# ),
|
1489
1366
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
1490
1367
|
# }
|
@@ -1548,14 +1425,13 @@ module Polars
|
|
1548
1425
|
closed: "left",
|
1549
1426
|
label: "left",
|
1550
1427
|
by: nil,
|
1551
|
-
start_by: "window"
|
1552
|
-
check_sorted: true
|
1428
|
+
start_by: "window"
|
1553
1429
|
)
|
1554
1430
|
if !truncate.nil?
|
1555
1431
|
label = truncate ? "left" : "datapoint"
|
1556
1432
|
end
|
1557
1433
|
|
1558
|
-
index_column = Utils.
|
1434
|
+
index_column = Utils.parse_into_expression(index_column, str_as_lit: false)
|
1559
1435
|
if offset.nil?
|
1560
1436
|
offset = period.nil? ? "-#{every}" : "0ns"
|
1561
1437
|
end
|
@@ -1564,13 +1440,13 @@ module Polars
|
|
1564
1440
|
period = every
|
1565
1441
|
end
|
1566
1442
|
|
1567
|
-
period = Utils.
|
1568
|
-
offset = Utils.
|
1569
|
-
every = Utils.
|
1443
|
+
period = Utils.parse_as_duration_string(period)
|
1444
|
+
offset = Utils.parse_as_duration_string(offset)
|
1445
|
+
every = Utils.parse_as_duration_string(every)
|
1570
1446
|
|
1571
|
-
rbexprs_by = by.nil? ? [] : Utils.
|
1447
|
+
rbexprs_by = by.nil? ? [] : Utils.parse_into_list_of_expressions(by)
|
1572
1448
|
lgb = _ldf.group_by_dynamic(
|
1573
|
-
index_column
|
1449
|
+
index_column,
|
1574
1450
|
every,
|
1575
1451
|
period,
|
1576
1452
|
offset,
|
@@ -1578,8 +1454,7 @@ module Polars
|
|
1578
1454
|
include_boundaries,
|
1579
1455
|
closed,
|
1580
1456
|
rbexprs_by,
|
1581
|
-
start_by
|
1582
|
-
check_sorted
|
1457
|
+
start_by
|
1583
1458
|
)
|
1584
1459
|
LazyGroupBy.new(lgb)
|
1585
1460
|
end
|
@@ -1730,7 +1605,7 @@ module Polars
|
|
1730
1605
|
# @param on Object
|
1731
1606
|
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
1732
1607
|
# None.
|
1733
|
-
# @param how ["inner", "left", "
|
1608
|
+
# @param how ["inner", "left", "full", "semi", "anti", "cross"]
|
1734
1609
|
# Join strategy.
|
1735
1610
|
# @param suffix [String]
|
1736
1611
|
# Suffix to append to columns with a duplicate name.
|
@@ -1772,7 +1647,7 @@ module Polars
|
|
1772
1647
|
# # └─────┴─────┴─────┴───────┘
|
1773
1648
|
#
|
1774
1649
|
# @example
|
1775
|
-
# df.join(other_df, on: "ham", how: "
|
1650
|
+
# df.join(other_df, on: "ham", how: "full").collect
|
1776
1651
|
# # =>
|
1777
1652
|
# # shape: (4, 5)
|
1778
1653
|
# # ┌──────┬──────┬──────┬───────┬───────────┐
|
@@ -1839,7 +1714,9 @@ module Polars
|
|
1839
1714
|
raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
|
1840
1715
|
end
|
1841
1716
|
|
1842
|
-
if how == "
|
1717
|
+
if how == "outer"
|
1718
|
+
how = "full"
|
1719
|
+
elsif how == "cross"
|
1843
1720
|
return _from_rbldf(
|
1844
1721
|
_ldf.join(
|
1845
1722
|
other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix
|
@@ -1848,12 +1725,12 @@ module Polars
|
|
1848
1725
|
end
|
1849
1726
|
|
1850
1727
|
if !on.nil?
|
1851
|
-
rbexprs = Utils.
|
1728
|
+
rbexprs = Utils.parse_into_list_of_expressions(on)
|
1852
1729
|
rbexprs_left = rbexprs
|
1853
1730
|
rbexprs_right = rbexprs
|
1854
1731
|
elsif !left_on.nil? && !right_on.nil?
|
1855
|
-
rbexprs_left = Utils.
|
1856
|
-
rbexprs_right = Utils.
|
1732
|
+
rbexprs_left = Utils.parse_into_list_of_expressions(left_on)
|
1733
|
+
rbexprs_right = Utils.parse_into_list_of_expressions(right_on)
|
1857
1734
|
else
|
1858
1735
|
raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
|
1859
1736
|
end
|
@@ -1908,7 +1785,8 @@ module Polars
|
|
1908
1785
|
# # └─────┴──────┴───────┴─────┴──────┴───────┘
|
1909
1786
|
def with_columns(*exprs, **named_exprs)
|
1910
1787
|
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
|
1911
|
-
|
1788
|
+
|
1789
|
+
rbexprs = Utils.parse_into_list_of_expressions(*exprs, **named_exprs, __structify: structify)
|
1912
1790
|
|
1913
1791
|
_from_rbldf(_ldf.with_columns(rbexprs))
|
1914
1792
|
end
|
@@ -2069,9 +1947,9 @@ module Polars
|
|
2069
1947
|
# # └──────┴──────┘
|
2070
1948
|
def shift(n, fill_value: nil)
|
2071
1949
|
if !fill_value.nil?
|
2072
|
-
fill_value = Utils.
|
1950
|
+
fill_value = Utils.parse_into_expression(fill_value, str_as_lit: true)
|
2073
1951
|
end
|
2074
|
-
n = Utils.
|
1952
|
+
n = Utils.parse_into_expression(n)
|
2075
1953
|
_from_rbldf(_ldf.shift(n, fill_value))
|
2076
1954
|
end
|
2077
1955
|
|
@@ -2236,16 +2114,16 @@ module Polars
|
|
2236
2114
|
# df.with_row_index.collect
|
2237
2115
|
# # =>
|
2238
2116
|
# # shape: (3, 3)
|
2239
|
-
# #
|
2240
|
-
# # │
|
2241
|
-
# # │ ---
|
2242
|
-
# # │ u32
|
2243
|
-
# #
|
2244
|
-
# # │ 0
|
2245
|
-
# # │ 1
|
2246
|
-
# # │ 2
|
2247
|
-
# #
|
2248
|
-
def with_row_index(name: "
|
2117
|
+
# # ┌───────┬─────┬─────┐
|
2118
|
+
# # │ index ┆ a ┆ b │
|
2119
|
+
# # │ --- ┆ --- ┆ --- │
|
2120
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
2121
|
+
# # ╞═══════╪═════╪═════╡
|
2122
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
2123
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
2124
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
2125
|
+
# # └───────┴─────┴─────┘
|
2126
|
+
def with_row_index(name: "index", offset: 0)
|
2249
2127
|
_from_rbldf(_ldf.with_row_index(name, offset))
|
2250
2128
|
end
|
2251
2129
|
alias_method :with_row_count, :with_row_index
|
@@ -2268,7 +2146,7 @@ module Polars
|
|
2268
2146
|
# # │ 3 ┆ 7 │
|
2269
2147
|
# # └─────┴─────┘
|
2270
2148
|
def take_every(n)
|
2271
|
-
select(
|
2149
|
+
select(F.col("*").take_every(n))
|
2272
2150
|
end
|
2273
2151
|
|
2274
2152
|
# Fill null values using the specified value or strategy.
|
@@ -2311,7 +2189,7 @@ module Polars
|
|
2311
2189
|
# # └──────┴──────┘
|
2312
2190
|
def fill_nan(fill_value)
|
2313
2191
|
if !fill_value.is_a?(Expr)
|
2314
|
-
fill_value =
|
2192
|
+
fill_value = F.lit(fill_value)
|
2315
2193
|
end
|
2316
2194
|
_from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
|
2317
2195
|
end
|
@@ -2502,8 +2380,8 @@ module Polars
|
|
2502
2380
|
# # │ 3.0 ┆ 1.0 │
|
2503
2381
|
# # └─────┴─────┘
|
2504
2382
|
def quantile(quantile, interpolation: "nearest")
|
2505
|
-
quantile = Utils.
|
2506
|
-
_from_rbldf(_ldf.quantile(quantile
|
2383
|
+
quantile = Utils.parse_into_expression(quantile, str_as_lit: false)
|
2384
|
+
_from_rbldf(_ldf.quantile(quantile, interpolation))
|
2507
2385
|
end
|
2508
2386
|
|
2509
2387
|
# Explode lists to long format.
|
@@ -2535,7 +2413,7 @@ module Polars
|
|
2535
2413
|
# # │ c ┆ 8 │
|
2536
2414
|
# # └─────────┴─────────┘
|
2537
2415
|
def explode(columns)
|
2538
|
-
columns = Utils.
|
2416
|
+
columns = Utils.parse_into_list_of_expressions(columns)
|
2539
2417
|
_from_rbldf(_ldf.explode(columns))
|
2540
2418
|
end
|
2541
2419
|
|
@@ -2598,35 +2476,35 @@ module Polars
|
|
2598
2476
|
# Optionally leaves identifiers set.
|
2599
2477
|
#
|
2600
2478
|
# This function is useful to massage a DataFrame into a format where one or more
|
2601
|
-
# columns are identifier variables (
|
2602
|
-
# measured variables (
|
2479
|
+
# columns are identifier variables (index) while all other columns, considered
|
2480
|
+
# measured variables (on), are "unpivoted" to the row axis leaving just
|
2603
2481
|
# two non-identifier columns, 'variable' and 'value'.
|
2604
2482
|
#
|
2605
|
-
# @param
|
2606
|
-
#
|
2607
|
-
#
|
2608
|
-
#
|
2609
|
-
#
|
2483
|
+
# @param on [Object]
|
2484
|
+
# Column(s) or selector(s) to use as values variables; if `on`
|
2485
|
+
# is empty all columns that are not in `index` will be used.
|
2486
|
+
# @param index [Object]
|
2487
|
+
# Column(s) or selector(s) to use as identifier variables.
|
2610
2488
|
# @param variable_name [String]
|
2611
|
-
# Name to give to the `
|
2489
|
+
# Name to give to the `variable` column. Defaults to "variable"
|
2612
2490
|
# @param value_name [String]
|
2613
2491
|
# Name to give to the `value` column. Defaults to "value"
|
2614
2492
|
# @param streamable [Boolean]
|
2615
2493
|
# Allow this node to run in the streaming engine.
|
2616
|
-
# If this runs in streaming, the output of the
|
2494
|
+
# If this runs in streaming, the output of the unpivot operation
|
2617
2495
|
# will not have a stable ordering.
|
2618
2496
|
#
|
2619
2497
|
# @return [LazyFrame]
|
2620
2498
|
#
|
2621
2499
|
# @example
|
2622
|
-
#
|
2500
|
+
# lf = Polars::LazyFrame.new(
|
2623
2501
|
# {
|
2624
2502
|
# "a" => ["x", "y", "z"],
|
2625
2503
|
# "b" => [1, 3, 5],
|
2626
2504
|
# "c" => [2, 4, 6]
|
2627
2505
|
# }
|
2628
|
-
# )
|
2629
|
-
#
|
2506
|
+
# )
|
2507
|
+
# lf.unpivot(Polars::Selectors.numeric, index: "a").collect
|
2630
2508
|
# # =>
|
2631
2509
|
# # shape: (6, 3)
|
2632
2510
|
# # ┌─────┬──────────┬───────┐
|
@@ -2641,23 +2519,21 @@ module Polars
|
|
2641
2519
|
# # │ y ┆ c ┆ 4 │
|
2642
2520
|
# # │ z ┆ c ┆ 6 │
|
2643
2521
|
# # └─────┴──────────┴───────┘
|
2644
|
-
def
|
2645
|
-
|
2646
|
-
|
2647
|
-
|
2648
|
-
|
2649
|
-
|
2650
|
-
|
2651
|
-
|
2652
|
-
|
2653
|
-
|
2654
|
-
if id_vars.nil?
|
2655
|
-
id_vars = []
|
2656
|
-
end
|
2522
|
+
def unpivot(
|
2523
|
+
on,
|
2524
|
+
index: nil,
|
2525
|
+
variable_name: nil,
|
2526
|
+
value_name: nil,
|
2527
|
+
streamable: true
|
2528
|
+
)
|
2529
|
+
on = on.nil? ? [] : Utils._expand_selectors(self, on)
|
2530
|
+
index = index.nil? ? [] : Utils._expand_selectors(self, index)
|
2531
|
+
|
2657
2532
|
_from_rbldf(
|
2658
|
-
_ldf.
|
2533
|
+
_ldf.unpivot(on, index, value_name, variable_name, streamable)
|
2659
2534
|
)
|
2660
2535
|
end
|
2536
|
+
alias_method :melt, :unpivot
|
2661
2537
|
|
2662
2538
|
# def map
|
2663
2539
|
# end
|
@@ -2688,7 +2564,7 @@ module Polars
|
|
2688
2564
|
# # │ 10.0 ┆ null ┆ 9.0 │
|
2689
2565
|
# # └──────┴──────┴──────────┘
|
2690
2566
|
def interpolate
|
2691
|
-
select(
|
2567
|
+
select(F.col("*").interpolate)
|
2692
2568
|
end
|
2693
2569
|
|
2694
2570
|
# Decompose a struct into its fields.
|
@@ -2795,24 +2671,19 @@ module Polars
|
|
2795
2671
|
#
|
2796
2672
|
# @param column [Object]
|
2797
2673
|
# Columns that are sorted
|
2798
|
-
# @param more_columns [Object]
|
2799
|
-
# Additional columns that are sorted, specified as positional arguments.
|
2800
2674
|
# @param descending [Boolean]
|
2801
2675
|
# Whether the columns are sorted in descending order.
|
2802
2676
|
#
|
2803
2677
|
# @return [LazyFrame]
|
2804
2678
|
def set_sorted(
|
2805
2679
|
column,
|
2806
|
-
*more_columns,
|
2807
2680
|
descending: false
|
2808
2681
|
)
|
2809
|
-
|
2810
|
-
|
2811
|
-
|
2682
|
+
if !Utils.strlike?(column)
|
2683
|
+
msg = "expected a 'str' for argument 'column' in 'set_sorted'"
|
2684
|
+
raise TypeError, msg
|
2812
2685
|
end
|
2813
|
-
with_columns(
|
2814
|
-
columns.map { |e| Utils.wrap_expr(e).set_sorted(descending: descending) }
|
2815
|
-
)
|
2686
|
+
with_columns(F.col(column).set_sorted(descending: descending))
|
2816
2687
|
end
|
2817
2688
|
|
2818
2689
|
# TODO
|