polars-df 0.10.0-x86_64-linux → 0.12.0-x86_64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +392 -351
- data/LICENSE-THIRD-PARTY.txt +1125 -865
- data/README.md +6 -6
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +4 -4
- data/lib/polars/batched_csv_reader.rb +11 -5
- data/lib/polars/cat_expr.rb +0 -36
- data/lib/polars/cat_name_space.rb +0 -37
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +176 -403
- data/lib/polars/data_types.rb +1 -1
- data/lib/polars/date_time_expr.rb +525 -572
- data/lib/polars/date_time_name_space.rb +263 -460
- data/lib/polars/dynamic_group_by.rb +5 -5
- data/lib/polars/exceptions.rb +7 -0
- data/lib/polars/expr.rb +1394 -243
- data/lib/polars/expr_dispatch.rb +1 -1
- data/lib/polars/functions/aggregation/horizontal.rb +8 -8
- data/lib/polars/functions/as_datatype.rb +63 -40
- data/lib/polars/functions/lazy.rb +63 -14
- data/lib/polars/functions/lit.rb +1 -1
- data/lib/polars/functions/range/date_range.rb +90 -57
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +2 -2
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +1 -1
- data/lib/polars/functions/whenthen.rb +1 -1
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +299 -493
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +227 -0
- data/lib/polars/lazy_frame.rb +143 -272
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +11 -11
- data/lib/polars/list_name_space.rb +5 -1
- data/lib/polars/rolling_group_by.rb +7 -9
- data/lib/polars/series.rb +103 -187
- data/lib/polars/string_expr.rb +78 -102
- data/lib/polars/string_name_space.rb +5 -4
- data/lib/polars/testing.rb +2 -2
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +8 -300
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +6 -6
- data/lib/polars.rb +20 -1
- metadata +17 -4
data/lib/polars/lazy_frame.rb
CHANGED
@@ -27,149 +27,6 @@ module Polars
|
|
27
27
|
ldf
|
28
28
|
end
|
29
29
|
|
30
|
-
# @private
|
31
|
-
def self._scan_csv(
|
32
|
-
file,
|
33
|
-
has_header: true,
|
34
|
-
sep: ",",
|
35
|
-
comment_char: nil,
|
36
|
-
quote_char: '"',
|
37
|
-
skip_rows: 0,
|
38
|
-
dtypes: nil,
|
39
|
-
null_values: nil,
|
40
|
-
ignore_errors: false,
|
41
|
-
cache: true,
|
42
|
-
with_column_names: nil,
|
43
|
-
infer_schema_length: 100,
|
44
|
-
n_rows: nil,
|
45
|
-
encoding: "utf8",
|
46
|
-
low_memory: false,
|
47
|
-
rechunk: true,
|
48
|
-
skip_rows_after_header: 0,
|
49
|
-
row_count_name: nil,
|
50
|
-
row_count_offset: 0,
|
51
|
-
parse_dates: false,
|
52
|
-
eol_char: "\n",
|
53
|
-
truncate_ragged_lines: true
|
54
|
-
)
|
55
|
-
dtype_list = nil
|
56
|
-
if !dtypes.nil?
|
57
|
-
dtype_list = []
|
58
|
-
dtypes.each do |k, v|
|
59
|
-
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
60
|
-
end
|
61
|
-
end
|
62
|
-
processed_null_values = Utils._process_null_values(null_values)
|
63
|
-
|
64
|
-
_from_rbldf(
|
65
|
-
RbLazyFrame.new_from_csv(
|
66
|
-
file,
|
67
|
-
sep,
|
68
|
-
has_header,
|
69
|
-
ignore_errors,
|
70
|
-
skip_rows,
|
71
|
-
n_rows,
|
72
|
-
cache,
|
73
|
-
dtype_list,
|
74
|
-
low_memory,
|
75
|
-
comment_char,
|
76
|
-
quote_char,
|
77
|
-
processed_null_values,
|
78
|
-
infer_schema_length,
|
79
|
-
with_column_names,
|
80
|
-
rechunk,
|
81
|
-
skip_rows_after_header,
|
82
|
-
encoding,
|
83
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
84
|
-
parse_dates,
|
85
|
-
eol_char,
|
86
|
-
truncate_ragged_lines
|
87
|
-
)
|
88
|
-
)
|
89
|
-
end
|
90
|
-
|
91
|
-
# @private
|
92
|
-
def self._scan_parquet(
|
93
|
-
file,
|
94
|
-
n_rows: nil,
|
95
|
-
cache: true,
|
96
|
-
parallel: "auto",
|
97
|
-
rechunk: true,
|
98
|
-
row_count_name: nil,
|
99
|
-
row_count_offset: 0,
|
100
|
-
storage_options: nil,
|
101
|
-
low_memory: false,
|
102
|
-
use_statistics: true,
|
103
|
-
hive_partitioning: true
|
104
|
-
)
|
105
|
-
_from_rbldf(
|
106
|
-
RbLazyFrame.new_from_parquet(
|
107
|
-
file,
|
108
|
-
[],
|
109
|
-
n_rows,
|
110
|
-
cache,
|
111
|
-
parallel,
|
112
|
-
rechunk,
|
113
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
114
|
-
low_memory,
|
115
|
-
use_statistics,
|
116
|
-
hive_partitioning,
|
117
|
-
nil
|
118
|
-
)
|
119
|
-
)
|
120
|
-
end
|
121
|
-
|
122
|
-
# @private
|
123
|
-
def self._scan_ipc(
|
124
|
-
file,
|
125
|
-
n_rows: nil,
|
126
|
-
cache: true,
|
127
|
-
rechunk: true,
|
128
|
-
row_count_name: nil,
|
129
|
-
row_count_offset: 0,
|
130
|
-
storage_options: nil,
|
131
|
-
memory_map: true
|
132
|
-
)
|
133
|
-
if Utils.pathlike?(file)
|
134
|
-
file = Utils.normalise_filepath(file)
|
135
|
-
end
|
136
|
-
|
137
|
-
_from_rbldf(
|
138
|
-
RbLazyFrame.new_from_ipc(
|
139
|
-
file,
|
140
|
-
n_rows,
|
141
|
-
cache,
|
142
|
-
rechunk,
|
143
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
144
|
-
memory_map
|
145
|
-
)
|
146
|
-
)
|
147
|
-
end
|
148
|
-
|
149
|
-
# @private
|
150
|
-
def self._scan_ndjson(
|
151
|
-
file,
|
152
|
-
infer_schema_length: nil,
|
153
|
-
batch_size: nil,
|
154
|
-
n_rows: nil,
|
155
|
-
low_memory: false,
|
156
|
-
rechunk: true,
|
157
|
-
row_count_name: nil,
|
158
|
-
row_count_offset: 0
|
159
|
-
)
|
160
|
-
_from_rbldf(
|
161
|
-
RbLazyFrame.new_from_ndjson(
|
162
|
-
file,
|
163
|
-
infer_schema_length,
|
164
|
-
batch_size,
|
165
|
-
n_rows,
|
166
|
-
low_memory,
|
167
|
-
rechunk,
|
168
|
-
Utils._prepare_row_count_args(row_count_name, row_count_offset)
|
169
|
-
)
|
170
|
-
)
|
171
|
-
end
|
172
|
-
|
173
30
|
# def self.from_json
|
174
31
|
# end
|
175
32
|
|
@@ -181,7 +38,7 @@ module Polars
|
|
181
38
|
# @return [LazyFrame]
|
182
39
|
def self.read_json(file)
|
183
40
|
if Utils.pathlike?(file)
|
184
|
-
file = Utils.
|
41
|
+
file = Utils.normalize_filepath(file)
|
185
42
|
end
|
186
43
|
|
187
44
|
Utils.wrap_ldf(RbLazyFrame.read_json(file))
|
@@ -206,7 +63,7 @@ module Polars
|
|
206
63
|
# df.columns
|
207
64
|
# # => ["foo", "bar"]
|
208
65
|
def columns
|
209
|
-
_ldf.
|
66
|
+
_ldf.collect_schema.keys
|
210
67
|
end
|
211
68
|
|
212
69
|
# Get dtypes of columns in LazyFrame.
|
@@ -224,7 +81,7 @@ module Polars
|
|
224
81
|
# lf.dtypes
|
225
82
|
# # => [Polars::Int64, Polars::Float64, Polars::String]
|
226
83
|
def dtypes
|
227
|
-
_ldf.
|
84
|
+
_ldf.collect_schema.values
|
228
85
|
end
|
229
86
|
|
230
87
|
# Get the schema.
|
@@ -242,7 +99,7 @@ module Polars
|
|
242
99
|
# lf.schema
|
243
100
|
# # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
|
244
101
|
def schema
|
245
|
-
_ldf.
|
102
|
+
_ldf.collect_schema
|
246
103
|
end
|
247
104
|
|
248
105
|
# Get the width of the LazyFrame.
|
@@ -254,7 +111,7 @@ module Polars
|
|
254
111
|
# lf.width
|
255
112
|
# # => 2
|
256
113
|
def width
|
257
|
-
_ldf.
|
114
|
+
_ldf.collect_schema.length
|
258
115
|
end
|
259
116
|
|
260
117
|
# Check if LazyFrame includes key.
|
@@ -288,7 +145,7 @@ module Polars
|
|
288
145
|
# @return [nil]
|
289
146
|
def write_json(file)
|
290
147
|
if Utils.pathlike?(file)
|
291
|
-
file = Utils.
|
148
|
+
file = Utils.normalize_filepath(file)
|
292
149
|
end
|
293
150
|
_ldf.write_json(file)
|
294
151
|
nil
|
@@ -404,16 +261,23 @@ module Polars
|
|
404
261
|
# # │ 2 ┆ 7.0 ┆ b │
|
405
262
|
# # │ 1 ┆ 6.0 ┆ a │
|
406
263
|
# # └─────┴─────┴─────┘
|
407
|
-
def sort(by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
|
408
|
-
if by.is_a?(::String)
|
409
|
-
return _from_rbldf(
|
410
|
-
|
411
|
-
|
412
|
-
|
264
|
+
def sort(by, *more_by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
|
265
|
+
if by.is_a?(::String) && more_by.empty?
|
266
|
+
return _from_rbldf(
|
267
|
+
_ldf.sort(
|
268
|
+
by, reverse, nulls_last, maintain_order, multithreaded
|
269
|
+
)
|
270
|
+
)
|
413
271
|
end
|
414
272
|
|
415
|
-
by = Utils.
|
416
|
-
|
273
|
+
by = Utils.parse_into_list_of_expressions(by, *more_by)
|
274
|
+
reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
|
275
|
+
nulls_last = Utils.extend_bool(nulls_last, by.length, "nulls_last", "by")
|
276
|
+
_from_rbldf(
|
277
|
+
_ldf.sort_by_exprs(
|
278
|
+
by, reverse, nulls_last, maintain_order, multithreaded
|
279
|
+
)
|
280
|
+
)
|
417
281
|
end
|
418
282
|
|
419
283
|
# def profile
|
@@ -558,7 +422,7 @@ module Polars
|
|
558
422
|
path,
|
559
423
|
compression: "zstd",
|
560
424
|
compression_level: nil,
|
561
|
-
statistics:
|
425
|
+
statistics: true,
|
562
426
|
row_group_size: nil,
|
563
427
|
data_pagesize_limit: nil,
|
564
428
|
maintain_order: true,
|
@@ -578,6 +442,24 @@ module Polars
|
|
578
442
|
no_optimization: no_optimization
|
579
443
|
)
|
580
444
|
|
445
|
+
if statistics == true
|
446
|
+
statistics = {
|
447
|
+
min: true,
|
448
|
+
max: true,
|
449
|
+
distinct_count: false,
|
450
|
+
null_count: true
|
451
|
+
}
|
452
|
+
elsif statistics == false
|
453
|
+
statistics = {}
|
454
|
+
elsif statistics == "full"
|
455
|
+
statistics = {
|
456
|
+
min: true,
|
457
|
+
max: true,
|
458
|
+
distinct_count: true,
|
459
|
+
null_count: true
|
460
|
+
}
|
461
|
+
end
|
462
|
+
|
581
463
|
lf.sink_parquet(
|
582
464
|
path,
|
583
465
|
compression,
|
@@ -732,6 +614,7 @@ module Polars
|
|
732
614
|
datetime_format: nil,
|
733
615
|
date_format: nil,
|
734
616
|
time_format: nil,
|
617
|
+
float_scientific: nil,
|
735
618
|
float_precision: nil,
|
736
619
|
null_value: nil,
|
737
620
|
quote_style: nil,
|
@@ -766,6 +649,7 @@ module Polars
|
|
766
649
|
datetime_format,
|
767
650
|
date_format,
|
768
651
|
time_format,
|
652
|
+
float_scientific,
|
769
653
|
float_precision,
|
770
654
|
null_value,
|
771
655
|
quote_style,
|
@@ -1050,7 +934,7 @@ module Polars
|
|
1050
934
|
def filter(predicate)
|
1051
935
|
_from_rbldf(
|
1052
936
|
_ldf.filter(
|
1053
|
-
Utils.
|
937
|
+
Utils.parse_into_expression(predicate, str_as_lit: false)
|
1054
938
|
)
|
1055
939
|
)
|
1056
940
|
end
|
@@ -1137,7 +1021,7 @@ module Polars
|
|
1137
1021
|
# # ┌─────────┐
|
1138
1022
|
# # │ literal │
|
1139
1023
|
# # │ --- │
|
1140
|
-
# # │
|
1024
|
+
# # │ i32 │
|
1141
1025
|
# # ╞═════════╡
|
1142
1026
|
# # │ 0 │
|
1143
1027
|
# # │ 0 │
|
@@ -1146,7 +1030,7 @@ module Polars
|
|
1146
1030
|
def select(*exprs, **named_exprs)
|
1147
1031
|
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
|
1148
1032
|
|
1149
|
-
rbexprs = Utils.
|
1033
|
+
rbexprs = Utils.parse_into_list_of_expressions(
|
1150
1034
|
*exprs, **named_exprs, __structify: structify
|
1151
1035
|
)
|
1152
1036
|
_from_rbldf(_ldf.select(rbexprs))
|
@@ -1154,12 +1038,14 @@ module Polars
|
|
1154
1038
|
|
1155
1039
|
# Start a group by operation.
|
1156
1040
|
#
|
1157
|
-
# @param by [
|
1041
|
+
# @param by [Array]
|
1158
1042
|
# Column(s) to group by.
|
1159
1043
|
# @param maintain_order [Boolean]
|
1160
1044
|
# Make sure that the order of the groups remains consistent. This is more
|
1161
1045
|
# expensive than a default group by.
|
1162
|
-
#
|
1046
|
+
# @param named_by [Hash]
|
1047
|
+
# Additional columns to group by, specified as keyword arguments.
|
1048
|
+
# The columns will be renamed to the keyword used.
|
1163
1049
|
# @return [LazyGroupBy]
|
1164
1050
|
#
|
1165
1051
|
# @example
|
@@ -1182,9 +1068,9 @@ module Polars
|
|
1182
1068
|
# # │ b ┆ 11 │
|
1183
1069
|
# # │ c ┆ 6 │
|
1184
1070
|
# # └─────┴─────┘
|
1185
|
-
def group_by(by, maintain_order: false)
|
1186
|
-
|
1187
|
-
lgb = _ldf.group_by(
|
1071
|
+
def group_by(*by, maintain_order: false, **named_by)
|
1072
|
+
exprs = Utils.parse_into_list_of_expressions(*by, **named_by)
|
1073
|
+
lgb = _ldf.group_by(exprs, maintain_order)
|
1188
1074
|
LazyGroupBy.new(lgb)
|
1189
1075
|
end
|
1190
1076
|
alias_method :groupby, :group_by
|
@@ -1238,12 +1124,6 @@ module Polars
|
|
1238
1124
|
# Define whether the temporal window interval is closed or not.
|
1239
1125
|
# @param by [Object]
|
1240
1126
|
# Also group by this column/these columns.
|
1241
|
-
# @param check_sorted [Boolean]
|
1242
|
-
# When the `by` argument is given, polars can not check sortedness
|
1243
|
-
# by the metadata and has to do a full scan on the index column to
|
1244
|
-
# verify data is sorted. This is expensive. If you are sure the
|
1245
|
-
# data within the by groups is sorted, you can set this to `false`.
|
1246
|
-
# Doing so incorrectly will lead to incorrect output
|
1247
1127
|
#
|
1248
1128
|
# @return [LazyGroupBy]
|
1249
1129
|
#
|
@@ -1285,21 +1165,20 @@ module Polars
|
|
1285
1165
|
period:,
|
1286
1166
|
offset: nil,
|
1287
1167
|
closed: "right",
|
1288
|
-
by: nil
|
1289
|
-
check_sorted: true
|
1168
|
+
by: nil
|
1290
1169
|
)
|
1291
|
-
index_column = Utils.
|
1170
|
+
index_column = Utils.parse_into_expression(index_column)
|
1292
1171
|
if offset.nil?
|
1293
|
-
offset =
|
1172
|
+
offset = Utils.negate_duration_string(Utils.parse_as_duration_string(period))
|
1294
1173
|
end
|
1295
1174
|
|
1296
|
-
rbexprs_by =
|
1297
|
-
|
1298
|
-
offset = Utils._timedelta_to_pl_duration(offset)
|
1299
|
-
|
1300
|
-
lgb = _ldf.rolling(
|
1301
|
-
index_column, period, offset, closed, rbexprs_by, check_sorted
|
1175
|
+
rbexprs_by = (
|
1176
|
+
!by.nil? ? Utils.parse_into_list_of_expressions(by) : []
|
1302
1177
|
)
|
1178
|
+
period = Utils.parse_as_duration_string(period)
|
1179
|
+
offset = Utils.parse_as_duration_string(offset)
|
1180
|
+
|
1181
|
+
lgb = _ldf.rolling(index_column, period, offset, closed, rbexprs_by)
|
1303
1182
|
LazyGroupBy.new(lgb)
|
1304
1183
|
end
|
1305
1184
|
alias_method :group_by_rolling, :rolling
|
@@ -1367,22 +1246,18 @@ module Polars
|
|
1367
1246
|
# Define whether the temporal window interval is closed or not.
|
1368
1247
|
# @param by [Object]
|
1369
1248
|
# Also group by this column/these columns
|
1370
|
-
# @param check_sorted [Boolean]
|
1371
|
-
# When the `by` argument is given, polars can not check sortedness
|
1372
|
-
# by the metadata and has to do a full scan on the index column to
|
1373
|
-
# verify data is sorted. This is expensive. If you are sure the
|
1374
|
-
# data within the by groups is sorted, you can set this to `false`.
|
1375
|
-
# Doing so incorrectly will lead to incorrect output.
|
1376
1249
|
#
|
1377
1250
|
# @return [LazyGroupBy]
|
1378
1251
|
#
|
1379
1252
|
# @example
|
1380
1253
|
# df = Polars::DataFrame.new(
|
1381
1254
|
# {
|
1382
|
-
# "time" => Polars.
|
1255
|
+
# "time" => Polars.datetime_range(
|
1383
1256
|
# DateTime.new(2021, 12, 16),
|
1384
1257
|
# DateTime.new(2021, 12, 16, 3),
|
1385
|
-
# "30m"
|
1258
|
+
# "30m",
|
1259
|
+
# time_unit: "us",
|
1260
|
+
# eager: true
|
1386
1261
|
# ),
|
1387
1262
|
# "n" => 0..6
|
1388
1263
|
# }
|
@@ -1449,16 +1324,16 @@ module Polars
|
|
1449
1324
|
# )
|
1450
1325
|
# # =>
|
1451
1326
|
# # shape: (4, 3)
|
1452
|
-
# #
|
1453
|
-
# # │ time ┆ time_count ┆ time_agg_list
|
1454
|
-
# # │ --- ┆ --- ┆ ---
|
1455
|
-
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]]
|
1456
|
-
# #
|
1457
|
-
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12
|
1458
|
-
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12
|
1459
|
-
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12
|
1460
|
-
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00]
|
1461
|
-
# #
|
1327
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────┐
|
1328
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1329
|
+
# # │ --- ┆ --- ┆ --- │
|
1330
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1331
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════╡
|
1332
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-… │
|
1333
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-… │
|
1334
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-… │
|
1335
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1336
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────┘
|
1462
1337
|
#
|
1463
1338
|
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1464
1339
|
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
@@ -1481,10 +1356,12 @@ module Polars
|
|
1481
1356
|
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
1482
1357
|
# df = Polars::DataFrame.new(
|
1483
1358
|
# {
|
1484
|
-
# "time" => Polars.
|
1359
|
+
# "time" => Polars.datetime_range(
|
1485
1360
|
# DateTime.new(2021, 12, 16),
|
1486
1361
|
# DateTime.new(2021, 12, 16, 3),
|
1487
|
-
# "30m"
|
1362
|
+
# "30m",
|
1363
|
+
# time_unit: "us",
|
1364
|
+
# eager: true
|
1488
1365
|
# ),
|
1489
1366
|
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
1490
1367
|
# }
|
@@ -1548,14 +1425,13 @@ module Polars
|
|
1548
1425
|
closed: "left",
|
1549
1426
|
label: "left",
|
1550
1427
|
by: nil,
|
1551
|
-
start_by: "window"
|
1552
|
-
check_sorted: true
|
1428
|
+
start_by: "window"
|
1553
1429
|
)
|
1554
1430
|
if !truncate.nil?
|
1555
1431
|
label = truncate ? "left" : "datapoint"
|
1556
1432
|
end
|
1557
1433
|
|
1558
|
-
index_column = Utils.
|
1434
|
+
index_column = Utils.parse_into_expression(index_column, str_as_lit: false)
|
1559
1435
|
if offset.nil?
|
1560
1436
|
offset = period.nil? ? "-#{every}" : "0ns"
|
1561
1437
|
end
|
@@ -1564,13 +1440,13 @@ module Polars
|
|
1564
1440
|
period = every
|
1565
1441
|
end
|
1566
1442
|
|
1567
|
-
period = Utils.
|
1568
|
-
offset = Utils.
|
1569
|
-
every = Utils.
|
1443
|
+
period = Utils.parse_as_duration_string(period)
|
1444
|
+
offset = Utils.parse_as_duration_string(offset)
|
1445
|
+
every = Utils.parse_as_duration_string(every)
|
1570
1446
|
|
1571
|
-
rbexprs_by = by.nil? ? [] : Utils.
|
1447
|
+
rbexprs_by = by.nil? ? [] : Utils.parse_into_list_of_expressions(by)
|
1572
1448
|
lgb = _ldf.group_by_dynamic(
|
1573
|
-
index_column
|
1449
|
+
index_column,
|
1574
1450
|
every,
|
1575
1451
|
period,
|
1576
1452
|
offset,
|
@@ -1578,8 +1454,7 @@ module Polars
|
|
1578
1454
|
include_boundaries,
|
1579
1455
|
closed,
|
1580
1456
|
rbexprs_by,
|
1581
|
-
start_by
|
1582
|
-
check_sorted
|
1457
|
+
start_by
|
1583
1458
|
)
|
1584
1459
|
LazyGroupBy.new(lgb)
|
1585
1460
|
end
|
@@ -1730,7 +1605,7 @@ module Polars
|
|
1730
1605
|
# @param on [Object]
|
1731
1606
|
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
1732
1607
|
# nil.
|
1733
|
-
# @param how ["inner", "left", "
|
1608
|
+
# @param how ["inner", "left", "full", "semi", "anti", "cross"]
|
1734
1609
|
# Join strategy.
|
1735
1610
|
# @param suffix [String]
|
1736
1611
|
# Suffix to append to columns with a duplicate name.
|
@@ -1772,7 +1647,7 @@ module Polars
|
|
1772
1647
|
# # └─────┴─────┴─────┴───────┘
|
1773
1648
|
#
|
1774
1649
|
# @example
|
1775
|
-
# df.join(other_df, on: "ham", how: "
|
1650
|
+
# df.join(other_df, on: "ham", how: "full").collect
|
1776
1651
|
# # =>
|
1777
1652
|
# # shape: (4, 5)
|
1778
1653
|
# # ┌──────┬──────┬──────┬───────┬───────────┐
|
@@ -1839,7 +1714,9 @@ module Polars
|
|
1839
1714
|
raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
|
1840
1715
|
end
|
1841
1716
|
|
1842
|
-
if how == "
|
1717
|
+
if how == "outer"
|
1718
|
+
how = "full"
|
1719
|
+
elsif how == "cross"
|
1843
1720
|
return _from_rbldf(
|
1844
1721
|
_ldf.join(
|
1845
1722
|
other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix
|
@@ -1848,12 +1725,12 @@ module Polars
|
|
1848
1725
|
end
|
1849
1726
|
|
1850
1727
|
if !on.nil?
|
1851
|
-
rbexprs = Utils.
|
1728
|
+
rbexprs = Utils.parse_into_list_of_expressions(on)
|
1852
1729
|
rbexprs_left = rbexprs
|
1853
1730
|
rbexprs_right = rbexprs
|
1854
1731
|
elsif !left_on.nil? && !right_on.nil?
|
1855
|
-
rbexprs_left = Utils.
|
1856
|
-
rbexprs_right = Utils.
|
1732
|
+
rbexprs_left = Utils.parse_into_list_of_expressions(left_on)
|
1733
|
+
rbexprs_right = Utils.parse_into_list_of_expressions(right_on)
|
1857
1734
|
else
|
1858
1735
|
raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
|
1859
1736
|
end
|
@@ -1908,7 +1785,8 @@ module Polars
|
|
1908
1785
|
# # └─────┴──────┴───────┴─────┴──────┴───────┘
|
1909
1786
|
def with_columns(*exprs, **named_exprs)
|
1910
1787
|
structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"
|
1911
|
-
|
1788
|
+
|
1789
|
+
rbexprs = Utils.parse_into_list_of_expressions(*exprs, **named_exprs, __structify: structify)
|
1912
1790
|
|
1913
1791
|
_from_rbldf(_ldf.with_columns(rbexprs))
|
1914
1792
|
end
|
@@ -2069,9 +1947,9 @@ module Polars
|
|
2069
1947
|
# # └──────┴──────┘
|
2070
1948
|
def shift(n, fill_value: nil)
|
2071
1949
|
if !fill_value.nil?
|
2072
|
-
fill_value = Utils.
|
1950
|
+
fill_value = Utils.parse_into_expression(fill_value, str_as_lit: true)
|
2073
1951
|
end
|
2074
|
-
n = Utils.
|
1952
|
+
n = Utils.parse_into_expression(n)
|
2075
1953
|
_from_rbldf(_ldf.shift(n, fill_value))
|
2076
1954
|
end
|
2077
1955
|
|
@@ -2236,16 +2114,16 @@ module Polars
|
|
2236
2114
|
# df.with_row_index.collect
|
2237
2115
|
# # =>
|
2238
2116
|
# # shape: (3, 3)
|
2239
|
-
# #
|
2240
|
-
# # │
|
2241
|
-
# # │ ---
|
2242
|
-
# # │ u32
|
2243
|
-
# #
|
2244
|
-
# # │ 0
|
2245
|
-
# # │ 1
|
2246
|
-
# # │ 2
|
2247
|
-
# #
|
2248
|
-
def with_row_index(name: "
|
2117
|
+
# # ┌───────┬─────┬─────┐
|
2118
|
+
# # │ index ┆ a ┆ b │
|
2119
|
+
# # │ --- ┆ --- ┆ --- │
|
2120
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
2121
|
+
# # ╞═══════╪═════╪═════╡
|
2122
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
2123
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
2124
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
2125
|
+
# # └───────┴─────┴─────┘
|
2126
|
+
def with_row_index(name: "index", offset: 0)
|
2249
2127
|
_from_rbldf(_ldf.with_row_index(name, offset))
|
2250
2128
|
end
|
2251
2129
|
alias_method :with_row_count, :with_row_index
|
@@ -2268,7 +2146,7 @@ module Polars
|
|
2268
2146
|
# # │ 3 ┆ 7 │
|
2269
2147
|
# # └─────┴─────┘
|
2270
2148
|
def take_every(n)
|
2271
|
-
select(
|
2149
|
+
select(F.col("*").take_every(n))
|
2272
2150
|
end
|
2273
2151
|
|
2274
2152
|
# Fill null values using the specified value or strategy.
|
@@ -2311,7 +2189,7 @@ module Polars
|
|
2311
2189
|
# # └──────┴──────┘
|
2312
2190
|
def fill_nan(fill_value)
|
2313
2191
|
if !fill_value.is_a?(Expr)
|
2314
|
-
fill_value =
|
2192
|
+
fill_value = F.lit(fill_value)
|
2315
2193
|
end
|
2316
2194
|
_from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
|
2317
2195
|
end
|
@@ -2502,8 +2380,8 @@ module Polars
|
|
2502
2380
|
# # │ 3.0 ┆ 1.0 │
|
2503
2381
|
# # └─────┴─────┘
|
2504
2382
|
def quantile(quantile, interpolation: "nearest")
|
2505
|
-
quantile = Utils.
|
2506
|
-
_from_rbldf(_ldf.quantile(quantile
|
2383
|
+
quantile = Utils.parse_into_expression(quantile, str_as_lit: false)
|
2384
|
+
_from_rbldf(_ldf.quantile(quantile, interpolation))
|
2507
2385
|
end
|
2508
2386
|
|
2509
2387
|
# Explode lists to long format.
|
@@ -2535,7 +2413,7 @@ module Polars
|
|
2535
2413
|
# # │ c ┆ 8 │
|
2536
2414
|
# # └─────────┴─────────┘
|
2537
2415
|
def explode(columns)
|
2538
|
-
columns = Utils.
|
2416
|
+
columns = Utils.parse_into_list_of_expressions(columns)
|
2539
2417
|
_from_rbldf(_ldf.explode(columns))
|
2540
2418
|
end
|
2541
2419
|
|
@@ -2598,35 +2476,35 @@ module Polars
|
|
2598
2476
|
# Optionally leaves identifiers set.
|
2599
2477
|
#
|
2600
2478
|
# This function is useful to massage a DataFrame into a format where one or more
|
2601
|
-
# columns are identifier variables (
|
2602
|
-
# measured variables (
|
2479
|
+
# columns are identifier variables (index) while all other columns, considered
|
2480
|
+
# measured variables (on), are "unpivoted" to the row axis leaving just
|
2603
2481
|
# two non-identifier columns, 'variable' and 'value'.
|
2604
2482
|
#
|
2605
|
-
# @param
|
2606
|
-
#
|
2607
|
-
#
|
2608
|
-
#
|
2609
|
-
#
|
2483
|
+
# @param on [Object]
|
2484
|
+
# Column(s) or selector(s) to use as values variables; if `on`
|
2485
|
+
# is empty all columns that are not in `index` will be used.
|
2486
|
+
# @param index [Object]
|
2487
|
+
# Column(s) or selector(s) to use as identifier variables.
|
2610
2488
|
# @param variable_name [String]
|
2611
|
-
# Name to give to the `
|
2489
|
+
# Name to give to the `variable` column. Defaults to "variable"
|
2612
2490
|
# @param value_name [String]
|
2613
2491
|
# Name to give to the `value` column. Defaults to "value"
|
2614
2492
|
# @param streamable [Boolean]
|
2615
2493
|
# Allow this node to run in the streaming engine.
|
2616
|
-
# If this runs in streaming, the output of the
|
2494
|
+
# If this runs in streaming, the output of the unpivot operation
|
2617
2495
|
# will not have a stable ordering.
|
2618
2496
|
#
|
2619
2497
|
# @return [LazyFrame]
|
2620
2498
|
#
|
2621
2499
|
# @example
|
2622
|
-
#
|
2500
|
+
# lf = Polars::LazyFrame.new(
|
2623
2501
|
# {
|
2624
2502
|
# "a" => ["x", "y", "z"],
|
2625
2503
|
# "b" => [1, 3, 5],
|
2626
2504
|
# "c" => [2, 4, 6]
|
2627
2505
|
# }
|
2628
|
-
# )
|
2629
|
-
#
|
2506
|
+
# )
|
2507
|
+
# lf.unpivot(Polars::Selectors.numeric, index: "a").collect
|
2630
2508
|
# # =>
|
2631
2509
|
# # shape: (6, 3)
|
2632
2510
|
# # ┌─────┬──────────┬───────┐
|
@@ -2641,23 +2519,21 @@ module Polars
|
|
2641
2519
|
# # │ y ┆ c ┆ 4 │
|
2642
2520
|
# # │ z ┆ c ┆ 6 │
|
2643
2521
|
# # └─────┴──────────┴───────┘
|
2644
|
-
def
|
2645
|
-
|
2646
|
-
|
2647
|
-
|
2648
|
-
|
2649
|
-
|
2650
|
-
|
2651
|
-
|
2652
|
-
|
2653
|
-
|
2654
|
-
if id_vars.nil?
|
2655
|
-
id_vars = []
|
2656
|
-
end
|
2522
|
+
def unpivot(
|
2523
|
+
on,
|
2524
|
+
index: nil,
|
2525
|
+
variable_name: nil,
|
2526
|
+
value_name: nil,
|
2527
|
+
streamable: true
|
2528
|
+
)
|
2529
|
+
on = on.nil? ? [] : Utils._expand_selectors(self, on)
|
2530
|
+
index = index.nil? ? [] : Utils._expand_selectors(self, index)
|
2531
|
+
|
2657
2532
|
_from_rbldf(
|
2658
|
-
_ldf.
|
2533
|
+
_ldf.unpivot(on, index, value_name, variable_name, streamable)
|
2659
2534
|
)
|
2660
2535
|
end
|
2536
|
+
alias_method :melt, :unpivot
|
2661
2537
|
|
2662
2538
|
# def map
|
2663
2539
|
# end
|
@@ -2688,7 +2564,7 @@ module Polars
|
|
2688
2564
|
# # │ 10.0 ┆ null ┆ 9.0 │
|
2689
2565
|
# # └──────┴──────┴──────────┘
|
2690
2566
|
def interpolate
|
2691
|
-
select(
|
2567
|
+
select(F.col("*").interpolate)
|
2692
2568
|
end
|
2693
2569
|
|
2694
2570
|
# Decompose a struct into its fields.
|
@@ -2795,24 +2671,19 @@ module Polars
|
|
2795
2671
|
#
|
2796
2672
|
# @param column [Object]
|
2797
2673
|
# Columns that are sorted
|
2798
|
-
# @param more_columns [Object]
|
2799
|
-
# Additional columns that are sorted, specified as positional arguments.
|
2800
2674
|
# @param descending [Boolean]
|
2801
2675
|
# Whether the columns are sorted in descending order.
|
2802
2676
|
#
|
2803
2677
|
# @return [LazyFrame]
|
2804
2678
|
def set_sorted(
|
2805
2679
|
column,
|
2806
|
-
*more_columns,
|
2807
2680
|
descending: false
|
2808
2681
|
)
|
2809
|
-
|
2810
|
-
|
2811
|
-
|
2682
|
+
if !Utils.strlike?(column)
|
2683
|
+
msg = "expected a 'str' for argument 'column' in 'set_sorted'"
|
2684
|
+
raise TypeError, msg
|
2812
2685
|
end
|
2813
|
-
with_columns(
|
2814
|
-
columns.map { |e| Utils.wrap_expr(e).set_sorted(descending: descending) }
|
2815
|
-
)
|
2686
|
+
with_columns(F.col(column).set_sorted(descending: descending))
|
2816
2687
|
end
|
2817
2688
|
|
2818
2689
|
# TODO
|