polars-df 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +2 -1
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +139 -6
- data/ext/polars/src/dataframe.rs +360 -15
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +135 -3
- data/ext/polars/src/lazy/dsl.rs +97 -2
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +227 -12
- data/ext/polars/src/series.rs +190 -38
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/data_frame.rb +2813 -100
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +631 -11
- data/lib/polars/expr_dispatch.rb +14 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +517 -0
- data/lib/polars/io.rb +763 -4
- data/lib/polars/lazy_frame.rb +1415 -67
- data/lib/polars/lazy_functions.rb +430 -9
- data/lib/polars/lazy_group_by.rb +79 -0
- data/lib/polars/list_expr.rb +5 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2244 -192
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +663 -2
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/utils.rb +76 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +8 -2
- metadata +12 -2
data/lib/polars/lazy_frame.rb
CHANGED
@@ -1,36 +1,249 @@
|
|
1
1
|
module Polars
|
2
|
+
# Representation of a Lazy computation graph/query against a DataFrame.
|
2
3
|
class LazyFrame
|
4
|
+
# @private
|
3
5
|
attr_accessor :_ldf
|
4
6
|
|
7
|
+
# @private
|
5
8
|
def self._from_rbldf(rb_ldf)
|
6
9
|
ldf = LazyFrame.allocate
|
7
10
|
ldf._ldf = rb_ldf
|
8
11
|
ldf
|
9
12
|
end
|
10
13
|
|
11
|
-
#
|
12
|
-
|
14
|
+
# @private
|
15
|
+
def self._scan_csv(
|
16
|
+
file,
|
17
|
+
has_header: true,
|
18
|
+
sep: ",",
|
19
|
+
comment_char: nil,
|
20
|
+
quote_char: '"',
|
21
|
+
skip_rows: 0,
|
22
|
+
dtypes: nil,
|
23
|
+
null_values: nil,
|
24
|
+
ignore_errors: false,
|
25
|
+
cache: true,
|
26
|
+
with_column_names: nil,
|
27
|
+
infer_schema_length: 100,
|
28
|
+
n_rows: nil,
|
29
|
+
encoding: "utf8",
|
30
|
+
low_memory: false,
|
31
|
+
rechunk: true,
|
32
|
+
skip_rows_after_header: 0,
|
33
|
+
row_count_name: nil,
|
34
|
+
row_count_offset: 0,
|
35
|
+
parse_dates: false,
|
36
|
+
eol_char: "\n"
|
37
|
+
)
|
38
|
+
dtype_list = nil
|
39
|
+
if !dtypes.nil?
|
40
|
+
dtype_list = []
|
41
|
+
dtypes.each do |k, v|
|
42
|
+
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
43
|
+
end
|
44
|
+
end
|
45
|
+
processed_null_values = Utils._process_null_values(null_values)
|
13
46
|
|
14
|
-
|
15
|
-
|
47
|
+
_from_rbldf(
|
48
|
+
RbLazyFrame.new_from_csv(
|
49
|
+
file,
|
50
|
+
sep,
|
51
|
+
has_header,
|
52
|
+
ignore_errors,
|
53
|
+
skip_rows,
|
54
|
+
n_rows,
|
55
|
+
cache,
|
56
|
+
dtype_list,
|
57
|
+
low_memory,
|
58
|
+
comment_char,
|
59
|
+
quote_char,
|
60
|
+
processed_null_values,
|
61
|
+
infer_schema_length,
|
62
|
+
with_column_names,
|
63
|
+
rechunk,
|
64
|
+
skip_rows_after_header,
|
65
|
+
encoding,
|
66
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
67
|
+
parse_dates,
|
68
|
+
eol_char
|
69
|
+
)
|
70
|
+
)
|
71
|
+
end
|
16
72
|
|
17
|
-
#
|
18
|
-
|
73
|
+
# @private
|
74
|
+
def self._scan_parquet(
|
75
|
+
file,
|
76
|
+
n_rows: nil,
|
77
|
+
cache: true,
|
78
|
+
parallel: "auto",
|
79
|
+
rechunk: true,
|
80
|
+
row_count_name: nil,
|
81
|
+
row_count_offset: 0,
|
82
|
+
storage_options: nil,
|
83
|
+
low_memory: false
|
84
|
+
)
|
85
|
+
_from_rbldf(
|
86
|
+
RbLazyFrame.new_from_parquet(
|
87
|
+
file,
|
88
|
+
n_rows,
|
89
|
+
cache,
|
90
|
+
parallel,
|
91
|
+
rechunk,
|
92
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
93
|
+
low_memory
|
94
|
+
)
|
95
|
+
)
|
96
|
+
end
|
97
|
+
|
98
|
+
# @private
|
99
|
+
def self._scan_ipc(
|
100
|
+
file,
|
101
|
+
n_rows: nil,
|
102
|
+
cache: true,
|
103
|
+
rechunk: true,
|
104
|
+
row_count_name: nil,
|
105
|
+
row_count_offset: 0,
|
106
|
+
storage_options: nil,
|
107
|
+
memory_map: true
|
108
|
+
)
|
109
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
110
|
+
file = Utils.format_path(file)
|
111
|
+
end
|
19
112
|
|
20
|
-
|
113
|
+
_from_rbldf(
|
114
|
+
RbLazyFrame.new_from_ipc(
|
115
|
+
file,
|
116
|
+
n_rows,
|
117
|
+
cache,
|
118
|
+
rechunk,
|
119
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
120
|
+
memory_map
|
121
|
+
)
|
122
|
+
)
|
123
|
+
end
|
124
|
+
|
125
|
+
# @private
|
126
|
+
def self._scan_ndjson(
|
127
|
+
file,
|
128
|
+
infer_schema_length: nil,
|
129
|
+
batch_size: nil,
|
130
|
+
n_rows: nil,
|
131
|
+
low_memory: false,
|
132
|
+
rechunk: true,
|
133
|
+
row_count_name: nil,
|
134
|
+
row_count_offset: 0
|
135
|
+
)
|
136
|
+
_from_rbldf(
|
137
|
+
RbLazyFrame.new_from_ndjson(
|
138
|
+
file,
|
139
|
+
infer_schema_length,
|
140
|
+
batch_size,
|
141
|
+
n_rows,
|
142
|
+
low_memory,
|
143
|
+
rechunk,
|
144
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset)
|
145
|
+
)
|
146
|
+
)
|
147
|
+
end
|
148
|
+
|
149
|
+
# def self.from_json
|
21
150
|
# end
|
22
151
|
|
23
|
-
# def
|
152
|
+
# def self.read_json
|
24
153
|
# end
|
25
154
|
|
155
|
+
# Get or set column names.
|
156
|
+
#
|
157
|
+
# @return [Array]
|
158
|
+
#
|
159
|
+
# @example
|
160
|
+
# df = (
|
161
|
+
# Polars::DataFrame.new(
|
162
|
+
# {
|
163
|
+
# "foo" => [1, 2, 3],
|
164
|
+
# "bar" => [6, 7, 8],
|
165
|
+
# "ham" => ["a", "b", "c"]
|
166
|
+
# }
|
167
|
+
# )
|
168
|
+
# .lazy
|
169
|
+
# .select(["foo", "bar"])
|
170
|
+
# )
|
171
|
+
# df.columns
|
172
|
+
# # => ["foo", "bar"]
|
173
|
+
def columns
|
174
|
+
_ldf.columns
|
175
|
+
end
|
176
|
+
|
177
|
+
# Get dtypes of columns in LazyFrame.
|
178
|
+
#
|
179
|
+
# @return [Array]
|
180
|
+
#
|
181
|
+
# @example
|
182
|
+
# lf = Polars::DataFrame.new(
|
183
|
+
# {
|
184
|
+
# "foo" => [1, 2, 3],
|
185
|
+
# "bar" => [6.0, 7.0, 8.0],
|
186
|
+
# "ham" => ["a", "b", "c"]
|
187
|
+
# }
|
188
|
+
# ).lazy
|
189
|
+
# lf.dtypes
|
190
|
+
# # => [:i64, :f64, :str]
|
191
|
+
def dtypes
|
192
|
+
_ldf.dtypes
|
193
|
+
end
|
194
|
+
|
195
|
+
# Get the schema.
|
196
|
+
#
|
197
|
+
# @return [Hash]
|
198
|
+
#
|
199
|
+
# @example
|
200
|
+
# lf = Polars::DataFrame.new(
|
201
|
+
# {
|
202
|
+
# "foo" => [1, 2, 3],
|
203
|
+
# "bar" => [6.0, 7.0, 8.0],
|
204
|
+
# "ham" => ["a", "b", "c"]
|
205
|
+
# }
|
206
|
+
# ).lazy
|
207
|
+
# lf.schema
|
208
|
+
# # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
|
209
|
+
def schema
|
210
|
+
_ldf.schema
|
211
|
+
end
|
212
|
+
|
213
|
+
# Get the width of the LazyFrame.
|
214
|
+
#
|
215
|
+
# @return [Integer]
|
216
|
+
#
|
217
|
+
# @example
|
218
|
+
# lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
|
219
|
+
# lf.width
|
220
|
+
# # => 2
|
221
|
+
def width
|
222
|
+
_ldf.width
|
223
|
+
end
|
224
|
+
|
225
|
+
# Check if LazyFrame includes key.
|
226
|
+
#
|
227
|
+
# @return [Boolean]
|
228
|
+
def include?(key)
|
229
|
+
columns.include?(key)
|
230
|
+
end
|
231
|
+
|
26
232
|
# clone handled by initialize_copy
|
27
233
|
|
28
234
|
# def [](item)
|
29
235
|
# end
|
30
236
|
|
31
|
-
#
|
32
|
-
#
|
33
|
-
#
|
237
|
+
# Returns a string representing the LazyFrame.
|
238
|
+
#
|
239
|
+
# @return [String]
|
240
|
+
def to_s
|
241
|
+
<<~EOS
|
242
|
+
naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)
|
243
|
+
|
244
|
+
#{describe_plan}
|
245
|
+
EOS
|
246
|
+
end
|
34
247
|
|
35
248
|
# def write_json
|
36
249
|
# end
|
@@ -38,21 +251,125 @@ module Polars
|
|
38
251
|
# def pipe
|
39
252
|
# end
|
40
253
|
|
41
|
-
#
|
42
|
-
#
|
254
|
+
# Create a string representation of the unoptimized query plan.
|
255
|
+
#
|
256
|
+
# @return [String]
|
257
|
+
def describe_plan
|
258
|
+
_ldf.describe_plan
|
259
|
+
end
|
43
260
|
|
261
|
+
# Create a string representation of the optimized query plan.
|
262
|
+
#
|
263
|
+
# @return [String]
|
44
264
|
# def describe_optimized_plan
|
45
265
|
# end
|
46
266
|
|
47
267
|
# def show_graph
|
48
268
|
# end
|
49
269
|
|
50
|
-
#
|
51
|
-
#
|
270
|
+
# Sort the DataFrame.
|
271
|
+
#
|
272
|
+
# Sorting can be done by:
|
273
|
+
#
|
274
|
+
# - A single column name
|
275
|
+
# - An expression
|
276
|
+
# - Multiple expressions
|
277
|
+
#
|
278
|
+
# @param by [Object]
|
279
|
+
# Column (expressions) to sort by.
|
280
|
+
# @param reverse [Boolean]
|
281
|
+
# Sort in descending order.
|
282
|
+
# @param nulls_last [Boolean]
|
283
|
+
# Place null values last. Can only be used if sorted by a single column.
|
284
|
+
#
|
285
|
+
# @return [LazyFrame]
|
286
|
+
#
|
287
|
+
# @example
|
288
|
+
# df = Polars::DataFrame.new(
|
289
|
+
# {
|
290
|
+
# "foo" => [1, 2, 3],
|
291
|
+
# "bar" => [6.0, 7.0, 8.0],
|
292
|
+
# "ham" => ["a", "b", "c"]
|
293
|
+
# }
|
294
|
+
# ).lazy
|
295
|
+
# df.sort("foo", reverse: true).collect
|
296
|
+
# # =>
|
297
|
+
# # shape: (3, 3)
|
298
|
+
# # ┌─────┬─────┬─────┐
|
299
|
+
# # │ foo ┆ bar ┆ ham │
|
300
|
+
# # │ --- ┆ --- ┆ --- │
|
301
|
+
# # │ i64 ┆ f64 ┆ str │
|
302
|
+
# # ╞═════╪═════╪═════╡
|
303
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
304
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
305
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
306
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
307
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
308
|
+
# # └─────┴─────┴─────┘
|
309
|
+
def sort(by, reverse: false, nulls_last: false)
|
310
|
+
if by.is_a?(String)
|
311
|
+
_from_rbldf(_ldf.sort(by, reverse, nulls_last))
|
312
|
+
end
|
313
|
+
if Utils.bool?(reverse)
|
314
|
+
reverse = [reverse]
|
315
|
+
end
|
316
|
+
|
317
|
+
by = Utils.selection_to_rbexpr_list(by)
|
318
|
+
_from_rbldf(_ldf.sort_by_exprs(by, reverse, nulls_last))
|
319
|
+
end
|
52
320
|
|
53
321
|
# def profile
|
54
322
|
# end
|
55
323
|
|
324
|
+
# Collect into a DataFrame.
|
325
|
+
#
|
326
|
+
# Note: use {#fetch} if you want to run your query on the first `n` rows
|
327
|
+
# only. This can be a huge time saver in debugging queries.
|
328
|
+
#
|
329
|
+
# @param type_coercion [Boolean]
|
330
|
+
# Do type coercion optimization.
|
331
|
+
# @param predicate_pushdown [Boolean]
|
332
|
+
# Do predicate pushdown optimization.
|
333
|
+
# @param projection_pushdown [Boolean]
|
334
|
+
# Do projection pushdown optimization.
|
335
|
+
# @param simplify_expression [Boolean]
|
336
|
+
# Run simplify expressions optimization.
|
337
|
+
# @param string_cache [Boolean]
|
338
|
+
# This argument is deprecated. Please set the string cache globally.
|
339
|
+
# The argument will be ignored.
|
340
|
+
# @param no_optimization [Boolean]
|
341
|
+
# Turn off (certain) optimizations.
|
342
|
+
# @param slice_pushdown [Boolean]
|
343
|
+
# Slice pushdown optimization.
|
344
|
+
# @param common_subplan_elimination [Boolean]
|
345
|
+
# Will try to cache branching subplans that occur on self-joins or unions.
|
346
|
+
# @param allow_streaming [Boolean]
|
347
|
+
# Run parts of the query in a streaming fashion (this is in an alpha state)
|
348
|
+
#
|
349
|
+
# @return [DataFrame]
|
350
|
+
#
|
351
|
+
# @example
|
352
|
+
# df = Polars::DataFrame.new(
|
353
|
+
# {
|
354
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
355
|
+
# "b" => [1, 2, 3, 4, 5, 6],
|
356
|
+
# "c" => [6, 5, 4, 3, 2, 1]
|
357
|
+
# }
|
358
|
+
# ).lazy
|
359
|
+
# df.groupby("a", maintain_order: true).agg(Polars.all.sum).collect
|
360
|
+
# # =>
|
361
|
+
# # shape: (3, 3)
|
362
|
+
# # ┌─────┬─────┬─────┐
|
363
|
+
# # │ a ┆ b ┆ c │
|
364
|
+
# # │ --- ┆ --- ┆ --- │
|
365
|
+
# # │ str ┆ i64 ┆ i64 │
|
366
|
+
# # ╞═════╪═════╪═════╡
|
367
|
+
# # │ a ┆ 4 ┆ 10 │
|
368
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
369
|
+
# # │ b ┆ 11 ┆ 10 │
|
370
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
371
|
+
# # │ c ┆ 6 ┆ 1 │
|
372
|
+
# # └─────┴─────┴─────┘
|
56
373
|
def collect(
|
57
374
|
type_coercion: true,
|
58
375
|
predicate_pushdown: true,
|
@@ -87,19 +404,184 @@ module Polars
|
|
87
404
|
Utils.wrap_df(ldf.collect)
|
88
405
|
end
|
89
406
|
|
90
|
-
#
|
91
|
-
#
|
407
|
+
# Collect a small number of rows for debugging purposes.
|
408
|
+
#
|
409
|
+
# Fetch is like a {#collect} operation, but it overwrites the number of rows
|
410
|
+
# read by every scan operation. This is a utility that helps debug a query on a
|
411
|
+
# smaller number of rows.
|
412
|
+
#
|
413
|
+
# Note that the fetch does not guarantee the final number of rows in the
|
414
|
+
# DataFrame. Filter, join operations and a lower number of rows available in the
|
415
|
+
# scanned file influence the final number of rows.
|
416
|
+
#
|
417
|
+
# @param n_rows [Integer]
|
418
|
+
# Collect n_rows from the data sources.
|
419
|
+
# @param type_coercion [Boolean]
|
420
|
+
# Run type coercion optimization.
|
421
|
+
# @param predicate_pushdown [Boolean]
|
422
|
+
# Run predicate pushdown optimization.
|
423
|
+
# @param projection_pushdown [Boolean]
|
424
|
+
# Run projection pushdown optimization.
|
425
|
+
# @param simplify_expression [Boolean]
|
426
|
+
# Run simplify expressions optimization.
|
427
|
+
# @param string_cache [Boolean]
|
428
|
+
# This argument is deprecated. Please set the string cache globally.
|
429
|
+
# The argument will be ignored.
|
430
|
+
# @param no_optimization [Boolean]
|
431
|
+
# Turn off optimizations.
|
432
|
+
# @param slice_pushdown [Boolean]
|
433
|
+
# Slice pushdown optimization
|
434
|
+
# @param common_subplan_elimination [Boolean]
|
435
|
+
# Will try to cache branching subplans that occur on self-joins or unions.
|
436
|
+
# @param allow_streaming [Boolean]
|
437
|
+
# Run parts of the query in a streaming fashion (this is in an alpha state)
|
438
|
+
#
|
439
|
+
# @return [DataFrame]
|
440
|
+
#
|
441
|
+
# @example
|
442
|
+
# df = Polars::DataFrame.new(
|
443
|
+
# {
|
444
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
445
|
+
# "b" => [1, 2, 3, 4, 5, 6],
|
446
|
+
# "c" => [6, 5, 4, 3, 2, 1]
|
447
|
+
# }
|
448
|
+
# ).lazy
|
449
|
+
# df.groupby("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
|
450
|
+
# # =>
|
451
|
+
# # shape: (2, 3)
|
452
|
+
# # ┌─────┬─────┬─────┐
|
453
|
+
# # │ a ┆ b ┆ c │
|
454
|
+
# # │ --- ┆ --- ┆ --- │
|
455
|
+
# # │ str ┆ i64 ┆ i64 │
|
456
|
+
# # ╞═════╪═════╪═════╡
|
457
|
+
# # │ a ┆ 1 ┆ 6 │
|
458
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
459
|
+
# # │ b ┆ 2 ┆ 5 │
|
460
|
+
# # └─────┴─────┴─────┘
|
461
|
+
def fetch(
|
462
|
+
n_rows = 500,
|
463
|
+
type_coercion: true,
|
464
|
+
predicate_pushdown: true,
|
465
|
+
projection_pushdown: true,
|
466
|
+
simplify_expression: true,
|
467
|
+
string_cache: false,
|
468
|
+
no_optimization: false,
|
469
|
+
slice_pushdown: true,
|
470
|
+
common_subplan_elimination: true,
|
471
|
+
allow_streaming: false
|
472
|
+
)
|
473
|
+
if no_optimization
|
474
|
+
predicate_pushdown = false
|
475
|
+
projection_pushdown = false
|
476
|
+
slice_pushdown = false
|
477
|
+
common_subplan_elimination = false
|
478
|
+
end
|
92
479
|
|
480
|
+
ldf = _ldf.optimization_toggle(
|
481
|
+
type_coercion,
|
482
|
+
predicate_pushdown,
|
483
|
+
projection_pushdown,
|
484
|
+
simplify_expression,
|
485
|
+
slice_pushdown,
|
486
|
+
common_subplan_elimination,
|
487
|
+
allow_streaming
|
488
|
+
)
|
489
|
+
Utils.wrap_df(ldf.fetch(n_rows))
|
490
|
+
end
|
491
|
+
|
492
|
+
# Return lazy representation, i.e. itself.
|
493
|
+
#
|
494
|
+
# Useful for writing code that expects either a `DataFrame` or
|
495
|
+
# `LazyFrame`.
|
496
|
+
#
|
497
|
+
# @return [LazyFrame]
|
498
|
+
#
|
499
|
+
# @example
|
500
|
+
# df = Polars::DataFrame.new(
|
501
|
+
# {
|
502
|
+
# "a" => [nil, 2, 3, 4],
|
503
|
+
# "b" => [0.5, nil, 2.5, 13],
|
504
|
+
# "c" => [true, true, false, nil]
|
505
|
+
# }
|
506
|
+
# )
|
507
|
+
# df.lazy
|
93
508
|
def lazy
|
94
509
|
self
|
95
510
|
end
|
96
511
|
|
97
|
-
#
|
98
|
-
#
|
512
|
+
# Cache the result once the execution of the physical plan hits this node.
|
513
|
+
#
|
514
|
+
# @return [LazyFrame]
|
515
|
+
def cache
|
516
|
+
_from_rbldf(_ldf.cache)
|
517
|
+
end
|
99
518
|
|
100
|
-
#
|
101
|
-
#
|
519
|
+
# Create an empty copy of the current LazyFrame.
|
520
|
+
#
|
521
|
+
# The copy has an identical schema but no data.
|
522
|
+
#
|
523
|
+
# @return [LazyFrame]
|
524
|
+
#
|
525
|
+
# @example
|
526
|
+
# df = Polars::DataFrame.new(
|
527
|
+
# {
|
528
|
+
# "a" => [nil, 2, 3, 4],
|
529
|
+
# "b" => [0.5, nil, 2.5, 13],
|
530
|
+
# "c" => [true, true, false, nil],
|
531
|
+
# }
|
532
|
+
# ).lazy
|
533
|
+
# df.cleared.fetch
|
534
|
+
# # =>
|
535
|
+
# # shape: (0, 3)
|
536
|
+
# # ┌─────┬─────┬──────┐
|
537
|
+
# # │ a ┆ b ┆ c │
|
538
|
+
# # │ --- ┆ --- ┆ --- │
|
539
|
+
# # │ i64 ┆ f64 ┆ bool │
|
540
|
+
# # ╞═════╪═════╪══════╡
|
541
|
+
# # └─────┴─────┴──────┘
|
542
|
+
def cleared
|
543
|
+
DataFrame.new(columns: schema).lazy
|
544
|
+
end
|
102
545
|
|
546
|
+
# Filter the rows in the DataFrame based on a predicate expression.
|
547
|
+
#
|
548
|
+
# @param predicate [Object]
|
549
|
+
# Expression that evaluates to a boolean Series.
|
550
|
+
#
|
551
|
+
# @return [LazyFrame]
|
552
|
+
#
|
553
|
+
# @example Filter on one condition:
|
554
|
+
# lf = Polars::DataFrame.new(
|
555
|
+
# {
|
556
|
+
# "foo" => [1, 2, 3],
|
557
|
+
# "bar" => [6, 7, 8],
|
558
|
+
# "ham" => ["a", "b", "c"]
|
559
|
+
# }
|
560
|
+
# ).lazy
|
561
|
+
# lf.filter(Polars.col("foo") < 3).collect()
|
562
|
+
# # =>
|
563
|
+
# # shape: (2, 3)
|
564
|
+
# # ┌─────┬─────┬─────┐
|
565
|
+
# # │ foo ┆ bar ┆ ham │
|
566
|
+
# # │ --- ┆ --- ┆ --- │
|
567
|
+
# # │ i64 ┆ i64 ┆ str │
|
568
|
+
# # ╞═════╪═════╪═════╡
|
569
|
+
# # │ 1 ┆ 6 ┆ a │
|
570
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
571
|
+
# # │ 2 ┆ 7 ┆ b │
|
572
|
+
# # └─────┴─────┴─────┘
|
573
|
+
#
|
574
|
+
# @example Filter on multiple conditions:
|
575
|
+
# lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
|
576
|
+
# # =>
|
577
|
+
# # shape: (1, 3)
|
578
|
+
# # ┌─────┬─────┬─────┐
|
579
|
+
# # │ foo ┆ bar ┆ ham │
|
580
|
+
# # │ --- ┆ --- ┆ --- │
|
581
|
+
# # │ i64 ┆ i64 ┆ str │
|
582
|
+
# # ╞═════╪═════╪═════╡
|
583
|
+
# # │ 1 ┆ 6 ┆ a │
|
584
|
+
# # └─────┴─────┴─────┘
|
103
585
|
def filter(predicate)
|
104
586
|
_from_rbldf(
|
105
587
|
_ldf.filter(
|
@@ -108,11 +590,136 @@ module Polars
|
|
108
590
|
)
|
109
591
|
end
|
110
592
|
|
593
|
+
# Select columns from this DataFrame.
|
594
|
+
#
|
595
|
+
# @param exprs [Object]
|
596
|
+
# Column or columns to select.
|
597
|
+
#
|
598
|
+
# @return [LazyFrame]
|
599
|
+
#
|
600
|
+
# @example
|
601
|
+
# df = Polars::DataFrame.new(
|
602
|
+
# {
|
603
|
+
# "foo" => [1, 2, 3],
|
604
|
+
# "bar" => [6, 7, 8],
|
605
|
+
# "ham" => ["a", "b", "c"],
|
606
|
+
# }
|
607
|
+
# ).lazy
|
608
|
+
# df.select("foo").collect
|
609
|
+
# # =>
|
610
|
+
# # shape: (3, 1)
|
611
|
+
# # ┌─────┐
|
612
|
+
# # │ foo │
|
613
|
+
# # │ --- │
|
614
|
+
# # │ i64 │
|
615
|
+
# # ╞═════╡
|
616
|
+
# # │ 1 │
|
617
|
+
# # ├╌╌╌╌╌┤
|
618
|
+
# # │ 2 │
|
619
|
+
# # ├╌╌╌╌╌┤
|
620
|
+
# # │ 3 │
|
621
|
+
# # └─────┘
|
622
|
+
#
|
623
|
+
# @example
|
624
|
+
# df.select(["foo", "bar"]).collect
|
625
|
+
# # =>
|
626
|
+
# # shape: (3, 2)
|
627
|
+
# # ┌─────┬─────┐
|
628
|
+
# # │ foo ┆ bar │
|
629
|
+
# # │ --- ┆ --- │
|
630
|
+
# # │ i64 ┆ i64 │
|
631
|
+
# # ╞═════╪═════╡
|
632
|
+
# # │ 1 ┆ 6 │
|
633
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
634
|
+
# # │ 2 ┆ 7 │
|
635
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
636
|
+
# # │ 3 ┆ 8 │
|
637
|
+
# # └─────┴─────┘
|
638
|
+
#
|
639
|
+
# @example
|
640
|
+
# df.select(Polars.col("foo") + 1).collect
|
641
|
+
# # =>
|
642
|
+
# # shape: (3, 1)
|
643
|
+
# # ┌─────┐
|
644
|
+
# # │ foo │
|
645
|
+
# # │ --- │
|
646
|
+
# # │ i64 │
|
647
|
+
# # ╞═════╡
|
648
|
+
# # │ 2 │
|
649
|
+
# # ├╌╌╌╌╌┤
|
650
|
+
# # │ 3 │
|
651
|
+
# # ├╌╌╌╌╌┤
|
652
|
+
# # │ 4 │
|
653
|
+
# # └─────┘
|
654
|
+
#
|
655
|
+
# @example
|
656
|
+
# df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
|
657
|
+
# # =>
|
658
|
+
# # shape: (3, 2)
|
659
|
+
# # ┌─────┬─────┐
|
660
|
+
# # │ foo ┆ bar │
|
661
|
+
# # │ --- ┆ --- │
|
662
|
+
# # │ i64 ┆ i64 │
|
663
|
+
# # ╞═════╪═════╡
|
664
|
+
# # │ 2 ┆ 7 │
|
665
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
666
|
+
# # │ 3 ┆ 8 │
|
667
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
668
|
+
# # │ 4 ┆ 9 │
|
669
|
+
# # └─────┴─────┘
|
670
|
+
#
|
671
|
+
# @example
|
672
|
+
# df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
|
673
|
+
# # =>
|
674
|
+
# # shape: (3, 1)
|
675
|
+
# # ┌─────────┐
|
676
|
+
# # │ literal │
|
677
|
+
# # │ --- │
|
678
|
+
# # │ i64 │
|
679
|
+
# # ╞═════════╡
|
680
|
+
# # │ 0 │
|
681
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
682
|
+
# # │ 0 │
|
683
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
684
|
+
# # │ 10 │
|
685
|
+
# # └─────────┘
|
111
686
|
def select(exprs)
|
112
687
|
exprs = Utils.selection_to_rbexpr_list(exprs)
|
113
688
|
_from_rbldf(_ldf.select(exprs))
|
114
689
|
end
|
115
690
|
|
691
|
+
# Start a groupby operation.
|
692
|
+
#
|
693
|
+
# @param by [Object]
|
694
|
+
# Column(s) to group by.
|
695
|
+
# @param maintain_order [Boolean]
|
696
|
+
# Make sure that the order of the groups remains consistent. This is more
|
697
|
+
# expensive than a default groupby.
|
698
|
+
#
|
699
|
+
# @return [LazyGroupBy]
|
700
|
+
#
|
701
|
+
# @example
|
702
|
+
# df = Polars::DataFrame.new(
|
703
|
+
# {
|
704
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
705
|
+
# "b" => [1, 2, 3, 4, 5, 6],
|
706
|
+
# "c" => [6, 5, 4, 3, 2, 1]
|
707
|
+
# }
|
708
|
+
# ).lazy
|
709
|
+
# df.groupby("a", maintain_order: true).agg(Polars.col("b").sum).collect
|
710
|
+
# # =>
|
711
|
+
# # shape: (3, 2)
|
712
|
+
# # ┌─────┬─────┐
|
713
|
+
# # │ a ┆ b │
|
714
|
+
# # │ --- ┆ --- │
|
715
|
+
# # │ str ┆ i64 │
|
716
|
+
# # ╞═════╪═════╡
|
717
|
+
# # │ a ┆ 4 │
|
718
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
719
|
+
# # │ b ┆ 11 │
|
720
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
721
|
+
# # │ c ┆ 6 │
|
722
|
+
# # └─────┴─────┘
|
116
723
|
def groupby(by, maintain_order: false)
|
117
724
|
rbexprs_by = Utils.selection_to_rbexpr_list(by)
|
118
725
|
lgb = _ldf.groupby(rbexprs_by, maintain_order)
|
@@ -128,6 +735,116 @@ module Polars
|
|
128
735
|
# def join_asof
|
129
736
|
# end
|
130
737
|
|
738
|
+
# Add a join operation to the Logical Plan.
|
739
|
+
#
|
740
|
+
# @param other [LazyFrame]
|
741
|
+
# Lazy DataFrame to join with.
|
742
|
+
# @param left_on [Object]
|
743
|
+
# Join column of the left DataFrame.
|
744
|
+
# @param right_on [Object]
|
745
|
+
# Join column of the right DataFrame.
|
746
|
+
# @param on [Object]
|
747
|
+
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
748
|
+
# `nil`.
|
749
|
+
# @param how ["inner", "left", "outer", "semi", "anti", "cross"]
|
750
|
+
# Join strategy.
|
751
|
+
# @param suffix [String]
|
752
|
+
# Suffix to append to columns with a duplicate name.
|
753
|
+
# @param allow_parallel [Boolean]
|
754
|
+
# Allow the physical plan to optionally evaluate the computation of both
|
755
|
+
# DataFrames up to the join in parallel.
|
756
|
+
# @param force_parallel [Boolean]
|
757
|
+
# Force the physical plan to evaluate the computation of both DataFrames up to
|
758
|
+
# the join in parallel.
|
759
|
+
#
|
760
|
+
# @return [LazyFrame]
|
761
|
+
#
|
762
|
+
# @example
|
763
|
+
# df = Polars::DataFrame.new(
|
764
|
+
# {
|
765
|
+
# "foo" => [1, 2, 3],
|
766
|
+
# "bar" => [6.0, 7.0, 8.0],
|
767
|
+
# "ham" => ["a", "b", "c"]
|
768
|
+
# }
|
769
|
+
# ).lazy
|
770
|
+
# other_df = Polars::DataFrame.new(
|
771
|
+
# {
|
772
|
+
# "apple" => ["x", "y", "z"],
|
773
|
+
# "ham" => ["a", "b", "d"]
|
774
|
+
# }
|
775
|
+
# ).lazy
|
776
|
+
# df.join(other_df, on: "ham").collect
|
777
|
+
# # =>
|
778
|
+
# # shape: (2, 4)
|
779
|
+
# # ┌─────┬─────┬─────┬───────┐
|
780
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
781
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
782
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
783
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
784
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
785
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
786
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
787
|
+
# # └─────┴─────┴─────┴───────┘
|
788
|
+
#
|
789
|
+
# @example
|
790
|
+
# df.join(other_df, on: "ham", how: "outer").collect
|
791
|
+
# # =>
|
792
|
+
# # shape: (4, 4)
|
793
|
+
# # ┌──────┬──────┬─────┬───────┐
|
794
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
795
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
796
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
797
|
+
# # ╞══════╪══════╪═════╪═══════╡
|
798
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
799
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
800
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
801
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
802
|
+
# # │ null ┆ null ┆ d ┆ z │
|
803
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
804
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null │
|
805
|
+
# # └──────┴──────┴─────┴───────┘
|
806
|
+
#
|
807
|
+
# @example
|
808
|
+
# df.join(other_df, on: "ham", how: "left").collect
|
809
|
+
# # =>
|
810
|
+
# # shape: (3, 4)
|
811
|
+
# # ┌─────┬─────┬─────┬───────┐
|
812
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
813
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
814
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
815
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
816
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
817
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
818
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
819
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
820
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null │
|
821
|
+
# # └─────┴─────┴─────┴───────┘
|
822
|
+
#
|
823
|
+
# @example
|
824
|
+
# df.join(other_df, on: "ham", how: "semi").collect
|
825
|
+
# # =>
|
826
|
+
# # shape: (2, 3)
|
827
|
+
# # ┌─────┬─────┬─────┐
|
828
|
+
# # │ foo ┆ bar ┆ ham │
|
829
|
+
# # │ --- ┆ --- ┆ --- │
|
830
|
+
# # │ i64 ┆ f64 ┆ str │
|
831
|
+
# # ╞═════╪═════╪═════╡
|
832
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
833
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
834
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
835
|
+
# # └─────┴─────┴─────┘
|
836
|
+
#
|
837
|
+
# @example
|
838
|
+
# df.join(other_df, on: "ham", how: "anti").collect
|
839
|
+
# # =>
|
840
|
+
# # shape: (1, 3)
|
841
|
+
# # ┌─────┬─────┬─────┐
|
842
|
+
# # │ foo ┆ bar ┆ ham │
|
843
|
+
# # │ --- ┆ --- ┆ --- │
|
844
|
+
# # │ i64 ┆ f64 ┆ str │
|
845
|
+
# # ╞═════╪═════╪═════╡
|
846
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
847
|
+
# # └─────┴─────┴─────┘
|
131
848
|
def join(
|
132
849
|
other,
|
133
850
|
left_on: nil,
|
@@ -174,6 +891,43 @@ module Polars
|
|
174
891
|
)
|
175
892
|
end
|
176
893
|
|
894
|
+
# Add or overwrite multiple columns in a DataFrame.
|
895
|
+
#
|
896
|
+
# @param exprs [Object]
|
897
|
+
# List of Expressions that evaluate to columns.
|
898
|
+
#
|
899
|
+
# @return [LazyFrame]
|
900
|
+
#
|
901
|
+
# @example
|
902
|
+
# ldf = Polars::DataFrame.new(
|
903
|
+
# {
|
904
|
+
# "a" => [1, 2, 3, 4],
|
905
|
+
# "b" => [0.5, 4, 10, 13],
|
906
|
+
# "c" => [true, true, false, true]
|
907
|
+
# }
|
908
|
+
# ).lazy
|
909
|
+
# ldf.with_columns(
|
910
|
+
# [
|
911
|
+
# (Polars.col("a") ** 2).alias("a^2"),
|
912
|
+
# (Polars.col("b") / 2).alias("b/2"),
|
913
|
+
# (Polars.col("c").is_not()).alias("not c")
|
914
|
+
# ]
|
915
|
+
# ).collect
|
916
|
+
# # =>
|
917
|
+
# # shape: (4, 6)
|
918
|
+
# # ┌─────┬──────┬───────┬──────┬──────┬───────┐
|
919
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
920
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
921
|
+
# # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
|
922
|
+
# # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
|
923
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
|
924
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
925
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
|
926
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
927
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
|
928
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
929
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
|
930
|
+
# # └─────┴──────┴───────┴──────┴──────┴───────┘
|
177
931
|
def with_columns(exprs)
|
178
932
|
exprs =
|
179
933
|
if exprs.nil?
|
@@ -202,55 +956,343 @@ module Polars
|
|
202
956
|
# def with_context
|
203
957
|
# end
|
204
958
|
|
959
|
+
# Add or overwrite column in a DataFrame.
|
960
|
+
#
|
961
|
+
# @param column [Object]
|
962
|
+
# Expression that evaluates to column or a Series to use.
|
963
|
+
#
|
964
|
+
# @return [LazyFrame]
|
965
|
+
#
|
966
|
+
# @example
|
967
|
+
# df = Polars::DataFrame.new(
|
968
|
+
# {
|
969
|
+
# "a" => [1, 3, 5],
|
970
|
+
# "b" => [2, 4, 6]
|
971
|
+
# }
|
972
|
+
# ).lazy
|
973
|
+
# df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
|
974
|
+
# # =>
|
975
|
+
# # shape: (3, 3)
|
976
|
+
# # ┌─────┬─────┬───────────┐
|
977
|
+
# # │ a ┆ b ┆ b_squared │
|
978
|
+
# # │ --- ┆ --- ┆ --- │
|
979
|
+
# # │ i64 ┆ i64 ┆ f64 │
|
980
|
+
# # ╞═════╪═════╪═══════════╡
|
981
|
+
# # │ 1 ┆ 2 ┆ 4.0 │
|
982
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
983
|
+
# # │ 3 ┆ 4 ┆ 16.0 │
|
984
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
985
|
+
# # │ 5 ┆ 6 ┆ 36.0 │
|
986
|
+
# # └─────┴─────┴───────────┘
|
987
|
+
#
|
988
|
+
# @example
|
989
|
+
# df.with_column(Polars.col("a") ** 2).collect
|
990
|
+
# # =>
|
991
|
+
# # shape: (3, 2)
|
992
|
+
# # ┌──────┬─────┐
|
993
|
+
# # │ a ┆ b │
|
994
|
+
# # │ --- ┆ --- │
|
995
|
+
# # │ f64 ┆ i64 │
|
996
|
+
# # ╞══════╪═════╡
|
997
|
+
# # │ 1.0 ┆ 2 │
|
998
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
|
999
|
+
# # │ 9.0 ┆ 4 │
|
1000
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1001
|
+
# # │ 25.0 ┆ 6 │
|
1002
|
+
# # └──────┴─────┘
|
205
1003
|
def with_column(column)
|
206
1004
|
with_columns([column])
|
207
1005
|
end
|
208
1006
|
|
209
|
-
#
|
210
|
-
#
|
1007
|
+
# Remove one or multiple columns from a DataFrame.
|
1008
|
+
#
|
1009
|
+
# @param columns [Object]
|
1010
|
+
# - Name of the column that should be removed.
|
1011
|
+
# - List of column names.
|
1012
|
+
#
|
1013
|
+
# @return [LazyFrame]
|
1014
|
+
def drop(columns)
|
1015
|
+
if columns.is_a?(String)
|
1016
|
+
columns = [columns]
|
1017
|
+
end
|
1018
|
+
_from_rbldf(_ldf.drop_columns(columns))
|
1019
|
+
end
|
211
1020
|
|
1021
|
+
# Rename column names.
|
1022
|
+
#
|
1023
|
+
# @param mapping [Hash]
|
1024
|
+
# Key value pairs that map from old name to new name.
|
1025
|
+
#
|
1026
|
+
# @return [LazyFrame]
|
212
1027
|
def rename(mapping)
|
213
1028
|
existing = mapping.keys
|
214
1029
|
_new = mapping.values
|
215
1030
|
_from_rbldf(_ldf.rename(existing, _new))
|
216
1031
|
end
|
217
1032
|
|
218
|
-
#
|
219
|
-
#
|
1033
|
+
# Reverse the DataFrame.
|
1034
|
+
#
|
1035
|
+
# @return [LazyFrame]
|
1036
|
+
def reverse
|
1037
|
+
_from_rbldf(_ldf.reverse)
|
1038
|
+
end
|
220
1039
|
|
221
|
-
#
|
222
|
-
#
|
1040
|
+
# Shift the values by a given period.
|
1041
|
+
#
|
1042
|
+
# @param periods [Integer]
|
1043
|
+
# Number of places to shift (may be negative).
|
1044
|
+
#
|
1045
|
+
# @return [LazyFrame]
|
1046
|
+
#
|
1047
|
+
# @example
|
1048
|
+
# df = Polars::DataFrame.new(
|
1049
|
+
# {
|
1050
|
+
# "a" => [1, 3, 5],
|
1051
|
+
# "b" => [2, 4, 6]
|
1052
|
+
# }
|
1053
|
+
# ).lazy
|
1054
|
+
# df.shift(1).collect
|
1055
|
+
# # =>
|
1056
|
+
# # shape: (3, 2)
|
1057
|
+
# # ┌──────┬──────┐
|
1058
|
+
# # │ a ┆ b │
|
1059
|
+
# # │ --- ┆ --- │
|
1060
|
+
# # │ i64 ┆ i64 │
|
1061
|
+
# # ╞══════╪══════╡
|
1062
|
+
# # │ null ┆ null │
|
1063
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1064
|
+
# # │ 1 ┆ 2 │
|
1065
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1066
|
+
# # │ 3 ┆ 4 │
|
1067
|
+
# # └──────┴──────┘
|
1068
|
+
#
|
1069
|
+
# @example
|
1070
|
+
# df.shift(-1).collect
|
1071
|
+
# # =>
|
1072
|
+
# # shape: (3, 2)
|
1073
|
+
# # ┌──────┬──────┐
|
1074
|
+
# # │ a ┆ b │
|
1075
|
+
# # │ --- ┆ --- │
|
1076
|
+
# # │ i64 ┆ i64 │
|
1077
|
+
# # ╞══════╪══════╡
|
1078
|
+
# # │ 3 ┆ 4 │
|
1079
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1080
|
+
# # │ 5 ┆ 6 │
|
1081
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1082
|
+
# # │ null ┆ null │
|
1083
|
+
# # └──────┴──────┘
|
1084
|
+
def shift(periods)
  # `periods` is passed through unchanged to the underlying lazy frame.
  shifted = _ldf.shift(periods)
  _from_rbldf(shifted)
end
|
223
1087
|
|
224
|
-
#
|
225
|
-
#
|
1088
|
+
# Shift the values by a given period and fill the resulting null values.
|
1089
|
+
#
|
1090
|
+
# @param periods [Integer]
|
1091
|
+
# Number of places to shift (may be negative).
|
1092
|
+
# @param fill_value [Object]
|
1093
|
+
# Fill `nil` values with the result of this expression.
|
1094
|
+
#
|
1095
|
+
# @return [LazyFrame]
|
1096
|
+
#
|
1097
|
+
# @example
|
1098
|
+
# df = Polars::DataFrame.new(
|
1099
|
+
# {
|
1100
|
+
# "a" => [1, 3, 5],
|
1101
|
+
# "b" => [2, 4, 6]
|
1102
|
+
# }
|
1103
|
+
# ).lazy
|
1104
|
+
# df.shift_and_fill(1, 0).collect
|
1105
|
+
# # =>
|
1106
|
+
# # shape: (3, 2)
|
1107
|
+
# # ┌─────┬─────┐
|
1108
|
+
# # │ a ┆ b │
|
1109
|
+
# # │ --- ┆ --- │
|
1110
|
+
# # │ i64 ┆ i64 │
|
1111
|
+
# # ╞═════╪═════╡
|
1112
|
+
# # │ 0 ┆ 0 │
|
1113
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1114
|
+
# # │ 1 ┆ 2 │
|
1115
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1116
|
+
# # │ 3 ┆ 4 │
|
1117
|
+
# # └─────┴─────┘
|
1118
|
+
#
|
1119
|
+
# @example
|
1120
|
+
# df.shift_and_fill(-1, 0).collect
|
1121
|
+
# # =>
|
1122
|
+
# # shape: (3, 2)
|
1123
|
+
# # ┌─────┬─────┐
|
1124
|
+
# # │ a ┆ b │
|
1125
|
+
# # │ --- ┆ --- │
|
1126
|
+
# # │ i64 ┆ i64 │
|
1127
|
+
# # ╞═════╪═════╡
|
1128
|
+
# # │ 3 ┆ 4 │
|
1129
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1130
|
+
# # │ 5 ┆ 6 │
|
1131
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1132
|
+
# # │ 0 ┆ 0 │
|
1133
|
+
# # └─────┴─────┘
|
1134
|
+
def shift_and_fill(periods, fill_value)
  # Plain Ruby values are promoted to a literal expression before use.
  fill_value = Polars.lit(fill_value) unless fill_value.is_a?(Expr)
  shifted = _ldf.shift_and_fill(periods, fill_value._rbexpr)
  _from_rbldf(shifted)
end
|
226
1140
|
|
227
|
-
#
|
228
|
-
#
|
1141
|
+
# Get a slice of this DataFrame.
|
1142
|
+
#
|
1143
|
+
# @param offset [Integer]
|
1144
|
+
# Start index. Negative indexing is supported.
|
1145
|
+
# @param length [Integer]
|
1146
|
+
# Length of the slice. If set to `nil`, all rows starting at the offset
|
1147
|
+
# will be selected.
|
1148
|
+
#
|
1149
|
+
# @return [LazyFrame]
|
1150
|
+
#
|
1151
|
+
# @example
|
1152
|
+
# df = Polars::DataFrame.new(
|
1153
|
+
# {
|
1154
|
+
# "a" => ["x", "y", "z"],
|
1155
|
+
# "b" => [1, 3, 5],
|
1156
|
+
# "c" => [2, 4, 6]
|
1157
|
+
# }
|
1158
|
+
# ).lazy
|
1159
|
+
# df.slice(1, 2).collect
|
1160
|
+
# # =>
|
1161
|
+
# # shape: (2, 3)
|
1162
|
+
# # ┌─────┬─────┬─────┐
|
1163
|
+
# # │ a ┆ b ┆ c │
|
1164
|
+
# # │ --- ┆ --- ┆ --- │
|
1165
|
+
# # │ str ┆ i64 ┆ i64 │
|
1166
|
+
# # ╞═════╪═════╪═════╡
|
1167
|
+
# # │ y ┆ 3 ┆ 4 │
|
1168
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1169
|
+
# # │ z ┆ 5 ┆ 6 │
|
1170
|
+
# # └─────┴─────┴─────┘
|
1171
|
+
def slice(offset, length = nil)
  # Reject negative lengths up front; a nil length is passed through as-is.
  negative_length = length && length < 0
  raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame" if negative_length

  sliced = _ldf.slice(offset, length)
  _from_rbldf(sliced)
end
|
229
1177
|
|
230
|
-
#
|
231
|
-
#
|
1178
|
+
# Get the first `n` rows.
|
1179
|
+
#
|
1180
|
+
# Alias for {#head}.
|
1181
|
+
#
|
1182
|
+
# @param n [Integer]
|
1183
|
+
# Number of rows to return.
|
1184
|
+
#
|
1185
|
+
# @return [LazyFrame]
|
1186
|
+
#
|
1187
|
+
# @note
|
1188
|
+
# Consider using the {#fetch} operation if you only want to test your
|
1189
|
+
# query. The {#fetch} operation will load the first `n` rows at the scan
|
1190
|
+
# level, whereas the {#head}/{#limit} are applied at the end.
|
1191
|
+
def limit(n = 5)
  # Forward the requested row count to #head so that `limit(n)` honors `n`
  # instead of always returning five rows.
  head(n)
end
|
232
1194
|
|
233
|
-
#
|
234
|
-
#
|
1195
|
+
# Get the first `n` rows.
|
1196
|
+
#
|
1197
|
+
# @param n [Integer]
|
1198
|
+
# Number of rows to return.
|
1199
|
+
#
|
1200
|
+
# @return [LazyFrame]
|
1201
|
+
#
|
1202
|
+
# @note
|
1203
|
+
# Consider using the {#fetch} operation if you only want to test your
|
1204
|
+
# query. The {#fetch} operation will load the first `n` rows at the scan
|
1205
|
+
# level, whereas the {#head}/{#limit} are applied at the end.
|
1206
|
+
def head(n = 5)
  # The first `n` rows are a slice that starts at offset zero.
  start = 0
  slice(start, n)
end
|
235
1209
|
|
236
|
-
#
|
237
|
-
#
|
1210
|
+
# Get the last `n` rows.
|
1211
|
+
#
|
1212
|
+
# @param n [Integer]
|
1213
|
+
# Number of rows.
|
1214
|
+
#
|
1215
|
+
# @return [LazyFrame]
|
1216
|
+
def tail(n = 5)
  # Delegate to the underlying lazy frame and wrap the result via _from_rbldf.
  trailing = _ldf.tail(n)
  _from_rbldf(trailing)
end
|
238
1219
|
|
239
|
-
#
|
240
|
-
#
|
1220
|
+
# Get the last row of the DataFrame.
|
1221
|
+
#
|
1222
|
+
# @return [LazyFrame]
|
1223
|
+
def last
  # The last row is a one-row tail.
  row_count = 1
  tail(row_count)
end
|
241
1226
|
|
242
|
-
#
|
243
|
-
#
|
1227
|
+
# Get the first row of the DataFrame.
|
1228
|
+
#
|
1229
|
+
# @return [LazyFrame]
|
1230
|
+
def first
  # The first row is a one-row slice starting at offset zero.
  start = 0
  row_count = 1
  slice(start, row_count)
end
|
244
1233
|
|
245
1234
|
# def with_row_count
|
246
1235
|
# end
|
247
1236
|
|
248
|
-
#
|
249
|
-
#
|
1237
|
+
# Take every nth row in the LazyFrame and return as a new LazyFrame.
|
1238
|
+
#
|
1239
|
+
# @return [LazyFrame]
|
1240
|
+
#
|
1241
|
+
# @example
|
1242
|
+
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
|
1243
|
+
# s.take_every(2).collect
|
1244
|
+
# # =>
|
1245
|
+
# # shape: (2, 2)
|
1246
|
+
# # ┌─────┬─────┐
|
1247
|
+
# # │ a ┆ b │
|
1248
|
+
# # │ --- ┆ --- │
|
1249
|
+
# # │ i64 ┆ i64 │
|
1250
|
+
# # ╞═════╪═════╡
|
1251
|
+
# # │ 1 ┆ 5 │
|
1252
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1253
|
+
# # │ 3 ┆ 7 │
|
1254
|
+
# # └─────┴─────┘
|
1255
|
+
def take_every(n)
  # Build the expression over all columns ("*"), then select it.
  every_nth = Utils.col("*").take_every(n)
  select(every_nth)
end
|
250
1258
|
|
251
1259
|
# def fill_null
|
252
1260
|
# end
|
253
1261
|
|
1262
|
+
# Fill floating point NaN values.
|
1263
|
+
#
|
1264
|
+
# @param fill_value [Object]
|
1265
|
+
# Value to fill the NaN values with.
|
1266
|
+
#
|
1267
|
+
# @return [LazyFrame]
|
1268
|
+
#
|
1269
|
+
# @note
|
1270
|
+
# Note that floating point NaN (Not a Number) are not missing values!
|
1271
|
+
# To replace missing values, use `fill_null` instead.
|
1272
|
+
#
|
1273
|
+
# @example
|
1274
|
+
# df = Polars::DataFrame.new(
|
1275
|
+
# {
|
1276
|
+
# "a" => [1.5, 2, Float::NAN, 4],
|
1277
|
+
# "b" => [0.5, 4, Float::NAN, 13],
|
1278
|
+
# }
|
1279
|
+
# ).lazy
|
1280
|
+
# df.fill_nan(99).collect
|
1281
|
+
# # =>
|
1282
|
+
# # shape: (4, 2)
|
1283
|
+
# # ┌──────┬──────┐
|
1284
|
+
# # │ a ┆ b │
|
1285
|
+
# # │ --- ┆ --- │
|
1286
|
+
# # │ f64 ┆ f64 │
|
1287
|
+
# # ╞══════╪══════╡
|
1288
|
+
# # │ 1.5 ┆ 0.5 │
|
1289
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1290
|
+
# # │ 2.0 ┆ 4.0 │
|
1291
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1292
|
+
# # │ 99.0 ┆ 99.0 │
|
1293
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1294
|
+
# # │ 4.0 ┆ 13.0 │
|
1295
|
+
# # └──────┴──────┘
|
254
1296
|
def fill_nan(fill_value)
|
255
1297
|
if !fill_value.is_a?(Expr)
|
256
1298
|
fill_value = Utils.lit(fill_value)
|
@@ -258,35 +1300,255 @@ module Polars
|
|
258
1300
|
_from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
|
259
1301
|
end
|
260
1302
|
|
261
|
-
#
|
262
|
-
#
|
1303
|
+
# Aggregate the columns in the DataFrame to their standard deviation value.
|
1304
|
+
#
|
1305
|
+
# @return [LazyFrame]
|
1306
|
+
#
|
1307
|
+
# @example
|
1308
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
1309
|
+
# df.std.collect
|
1310
|
+
# # =>
|
1311
|
+
# # shape: (1, 2)
|
1312
|
+
# # ┌──────────┬─────┐
|
1313
|
+
# # │ a ┆ b │
|
1314
|
+
# # │ --- ┆ --- │
|
1315
|
+
# # │ f64 ┆ f64 │
|
1316
|
+
# # ╞══════════╪═════╡
|
1317
|
+
# # │ 1.290994 ┆ 0.5 │
|
1318
|
+
# # └──────────┴─────┘
|
1319
|
+
#
|
1320
|
+
# @example
|
1321
|
+
# df.std(ddof: 0).collect
|
1322
|
+
# # =>
|
1323
|
+
# # shape: (1, 2)
|
1324
|
+
# # ┌──────────┬──────────┐
|
1325
|
+
# # │ a ┆ b │
|
1326
|
+
# # │ --- ┆ --- │
|
1327
|
+
# # │ f64 ┆ f64 │
|
1328
|
+
# # ╞══════════╪══════════╡
|
1329
|
+
# # │ 1.118034 ┆ 0.433013 │
|
1330
|
+
# # └──────────┴──────────┘
|
1331
|
+
def std(ddof: 1)
  # `ddof` is handed to the underlying lazy frame positionally.
  aggregated = _ldf.std(ddof)
  _from_rbldf(aggregated)
end
|
263
1334
|
|
264
|
-
#
|
265
|
-
#
|
1335
|
+
# Aggregate the columns in the DataFrame to their variance value.
|
1336
|
+
#
|
1337
|
+
# @return [LazyFrame]
|
1338
|
+
#
|
1339
|
+
# @example
|
1340
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
1341
|
+
# df.var.collect
|
1342
|
+
# # =>
|
1343
|
+
# # shape: (1, 2)
|
1344
|
+
# # ┌──────────┬──────┐
|
1345
|
+
# # │ a ┆ b │
|
1346
|
+
# # │ --- ┆ --- │
|
1347
|
+
# # │ f64 ┆ f64 │
|
1348
|
+
# # ╞══════════╪══════╡
|
1349
|
+
# # │ 1.666667 ┆ 0.25 │
|
1350
|
+
# # └──────────┴──────┘
|
1351
|
+
#
|
1352
|
+
# @example
|
1353
|
+
# df.var(ddof: 0).collect
|
1354
|
+
# # =>
|
1355
|
+
# # shape: (1, 2)
|
1356
|
+
# # ┌──────┬────────┐
|
1357
|
+
# # │ a ┆ b │
|
1358
|
+
# # │ --- ┆ --- │
|
1359
|
+
# # │ f64 ┆ f64 │
|
1360
|
+
# # ╞══════╪════════╡
|
1361
|
+
# # │ 1.25 ┆ 0.1875 │
|
1362
|
+
# # └──────┴────────┘
|
1363
|
+
def var(ddof: 1)
  # `ddof` is handed to the underlying lazy frame positionally.
  aggregated = _ldf.var(ddof)
  _from_rbldf(aggregated)
end
|
266
1366
|
|
267
|
-
#
|
268
|
-
#
|
1367
|
+
# Aggregate the columns in the DataFrame to their maximum value.
|
1368
|
+
#
|
1369
|
+
# @return [LazyFrame]
|
1370
|
+
#
|
1371
|
+
# @example
|
1372
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
1373
|
+
# df.max.collect
|
1374
|
+
# # =>
|
1375
|
+
# # shape: (1, 2)
|
1376
|
+
# # ┌─────┬─────┐
|
1377
|
+
# # │ a ┆ b │
|
1378
|
+
# # │ --- ┆ --- │
|
1379
|
+
# # │ i64 ┆ i64 │
|
1380
|
+
# # ╞═════╪═════╡
|
1381
|
+
# # │ 4 ┆ 2 │
|
1382
|
+
# # └─────┴─────┘
|
1383
|
+
def max
  # Delegate to the underlying lazy frame and wrap the result via _from_rbldf.
  aggregated = _ldf.max
  _from_rbldf(aggregated)
end
|
269
1386
|
|
270
|
-
#
|
271
|
-
#
|
1387
|
+
# Aggregate the columns in the DataFrame to their minimum value.
|
1388
|
+
#
|
1389
|
+
# @return [LazyFrame]
|
1390
|
+
#
|
1391
|
+
# @example
|
1392
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
1393
|
+
# df.min.collect
|
1394
|
+
# # =>
|
1395
|
+
# # shape: (1, 2)
|
1396
|
+
# # ┌─────┬─────┐
|
1397
|
+
# # │ a ┆ b │
|
1398
|
+
# # │ --- ┆ --- │
|
1399
|
+
# # │ i64 ┆ i64 │
|
1400
|
+
# # ╞═════╪═════╡
|
1401
|
+
# # │ 1 ┆ 1 │
|
1402
|
+
# # └─────┴─────┘
|
1403
|
+
def min
  # Delegate to the underlying lazy frame and wrap the result via _from_rbldf.
  aggregated = _ldf.min
  _from_rbldf(aggregated)
end
|
272
1406
|
|
273
|
-
#
|
274
|
-
#
|
1407
|
+
# Aggregate the columns in the DataFrame to their sum value.
|
1408
|
+
#
|
1409
|
+
# @return [LazyFrame]
|
1410
|
+
#
|
1411
|
+
# @example
|
1412
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
1413
|
+
# df.sum.collect
|
1414
|
+
# # =>
|
1415
|
+
# # shape: (1, 2)
|
1416
|
+
# # ┌─────┬─────┐
|
1417
|
+
# # │ a ┆ b │
|
1418
|
+
# # │ --- ┆ --- │
|
1419
|
+
# # │ i64 ┆ i64 │
|
1420
|
+
# # ╞═════╪═════╡
|
1421
|
+
# # │ 10 ┆ 5 │
|
1422
|
+
# # └─────┴─────┘
|
1423
|
+
def sum
  # Delegate to the underlying lazy frame and wrap the result via _from_rbldf.
  aggregated = _ldf.sum
  _from_rbldf(aggregated)
end
|
275
1426
|
|
276
|
-
#
|
277
|
-
#
|
1427
|
+
# Aggregate the columns in the DataFrame to their mean value.
|
1428
|
+
#
|
1429
|
+
# @return [LazyFrame]
|
1430
|
+
#
|
1431
|
+
# @example
|
1432
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
1433
|
+
# df.mean.collect
|
1434
|
+
# # =>
|
1435
|
+
# # shape: (1, 2)
|
1436
|
+
# # ┌─────┬──────┐
|
1437
|
+
# # │ a ┆ b │
|
1438
|
+
# # │ --- ┆ --- │
|
1439
|
+
# # │ f64 ┆ f64 │
|
1440
|
+
# # ╞═════╪══════╡
|
1441
|
+
# # │ 2.5 ┆ 1.25 │
|
1442
|
+
# # └─────┴──────┘
|
1443
|
+
def mean
  # Delegate to the underlying lazy frame and wrap the result via _from_rbldf.
  aggregated = _ldf.mean
  _from_rbldf(aggregated)
end
|
278
1446
|
|
279
|
-
#
|
280
|
-
#
|
1447
|
+
# Aggregate the columns in the DataFrame to their median value.
|
1448
|
+
#
|
1449
|
+
# @return [LazyFrame]
|
1450
|
+
#
|
1451
|
+
# @example
|
1452
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
1453
|
+
# df.median.collect
|
1454
|
+
# # =>
|
1455
|
+
# # shape: (1, 2)
|
1456
|
+
# # ┌─────┬─────┐
|
1457
|
+
# # │ a ┆ b │
|
1458
|
+
# # │ --- ┆ --- │
|
1459
|
+
# # │ f64 ┆ f64 │
|
1460
|
+
# # ╞═════╪═════╡
|
1461
|
+
# # │ 2.5 ┆ 1.0 │
|
1462
|
+
# # └─────┴─────┘
|
1463
|
+
def median
  # Delegate to the underlying lazy frame and wrap the result via _from_rbldf.
  aggregated = _ldf.median
  _from_rbldf(aggregated)
end
|
281
1466
|
|
282
|
-
#
|
283
|
-
#
|
1467
|
+
# Aggregate the columns in the DataFrame to their quantile value.
|
1468
|
+
#
|
1469
|
+
# @param quantile [Float]
|
1470
|
+
# Quantile between 0.0 and 1.0.
|
1471
|
+
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
|
1472
|
+
# Interpolation method.
|
1473
|
+
#
|
1474
|
+
# @return [LazyFrame]
|
1475
|
+
#
|
1476
|
+
# @example
|
1477
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
1478
|
+
# df.quantile(0.7).collect
|
1479
|
+
# # =>
|
1480
|
+
# # shape: (1, 2)
|
1481
|
+
# # ┌─────┬─────┐
|
1482
|
+
# # │ a ┆ b │
|
1483
|
+
# # │ --- ┆ --- │
|
1484
|
+
# # │ f64 ┆ f64 │
|
1485
|
+
# # ╞═════╪═════╡
|
1486
|
+
# # │ 3.0 ┆ 1.0 │
|
1487
|
+
# # └─────┴─────┘
|
1488
|
+
def quantile(quantile, interpolation: "nearest")
  # Both arguments are forwarded positionally to the underlying lazy frame.
  aggregated = _ldf.quantile(quantile, interpolation)
  _from_rbldf(aggregated)
end
|
284
1491
|
|
285
|
-
#
|
286
|
-
#
|
1492
|
+
# Explode lists to long format.
|
1493
|
+
#
|
1494
|
+
# @return [LazyFrame]
|
1495
|
+
#
|
1496
|
+
# @example
|
1497
|
+
# df = Polars::DataFrame.new(
|
1498
|
+
# {
|
1499
|
+
# "letters" => ["a", "a", "b", "c"],
|
1500
|
+
# "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
|
1501
|
+
# }
|
1502
|
+
# ).lazy
|
1503
|
+
# df.explode("numbers").collect
|
1504
|
+
# # =>
|
1505
|
+
# # shape: (8, 2)
|
1506
|
+
# # ┌─────────┬─────────┐
|
1507
|
+
# # │ letters ┆ numbers │
|
1508
|
+
# # │ --- ┆ --- │
|
1509
|
+
# # │ str ┆ i64 │
|
1510
|
+
# # ╞═════════╪═════════╡
|
1511
|
+
# # │ a ┆ 1 │
|
1512
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
1513
|
+
# # │ a ┆ 2 │
|
1514
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
1515
|
+
# # │ a ┆ 3 │
|
1516
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
1517
|
+
# # │ b ┆ 4 │
|
1518
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
1519
|
+
# # │ b ┆ 5 │
|
1520
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
1521
|
+
# # │ c ┆ 6 │
|
1522
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
1523
|
+
# # │ c ┆ 7 │
|
1524
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
1525
|
+
# # │ c ┆ 8 │
|
1526
|
+
# # └─────────┴─────────┘
|
1527
|
+
def explode(columns)
  # Convert the selection with Utils.selection_to_rbexpr_list before
  # handing it to the underlying lazy frame.
  rbexprs = Utils.selection_to_rbexpr_list(columns)
  exploded = _ldf.explode(rbexprs)
  _from_rbldf(exploded)
end
|
287
1531
|
|
288
|
-
#
|
289
|
-
#
|
1532
|
+
# Drop duplicate rows from this DataFrame.
|
1533
|
+
#
|
1534
|
+
# Note that this fails if there is a column of type `List` in the DataFrame or
|
1535
|
+
# subset.
|
1536
|
+
#
|
1537
|
+
# @param maintain_order [Boolean]
|
1538
|
+
# Keep the same order as the original DataFrame. This requires more work to
|
1539
|
+
# compute.
|
1540
|
+
# @param subset [Object]
|
1541
|
+
# Subset to use to compare rows.
|
1542
|
+
# @param keep ["first", "last"]
|
1543
|
+
# Which of the duplicate rows to keep.
|
1544
|
+
#
|
1545
|
+
# @return [LazyFrame]
|
1546
|
+
def unique(maintain_order: true, subset: nil, keep: "first")
  # A single non-array subset is wrapped in an array; nil and arrays pass through.
  subset = [subset] unless subset.nil? || subset.is_a?(Array)
  deduplicated = _ldf.unique(maintain_order, subset, keep)
  _from_rbldf(deduplicated)
end
|
290
1552
|
|
291
1553
|
# def drop_nulls
|
292
1554
|
# end
|
@@ -297,11 +1559,97 @@ module Polars
|
|
297
1559
|
# def map
|
298
1560
|
# end
|
299
1561
|
|
300
|
-
#
|
301
|
-
#
|
1562
|
+
# Interpolate intermediate values. The interpolation method is linear.
|
1563
|
+
#
|
1564
|
+
# @return [LazyFrame]
|
1565
|
+
#
|
1566
|
+
# @example
|
1567
|
+
# df = Polars::DataFrame.new(
|
1568
|
+
# {
|
1569
|
+
# "foo" => [1, nil, 9, 10],
|
1570
|
+
# "bar" => [6, 7, 9, nil],
|
1571
|
+
# "baz" => [1, nil, nil, 9]
|
1572
|
+
# }
|
1573
|
+
# ).lazy
|
1574
|
+
# df.interpolate.collect
|
1575
|
+
# # =>
|
1576
|
+
# # shape: (4, 3)
|
1577
|
+
# # ┌─────┬──────┬─────┐
|
1578
|
+
# # │ foo ┆ bar ┆ baz │
|
1579
|
+
# # │ --- ┆ --- ┆ --- │
|
1580
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
1581
|
+
# # ╞═════╪══════╪═════╡
|
1582
|
+
# # │ 1 ┆ 6 ┆ 1 │
|
1583
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1584
|
+
# # │ 5 ┆ 7 ┆ 3 │
|
1585
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1586
|
+
# # │ 9 ┆ 9 ┆ 6 │
|
1587
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1588
|
+
# # │ 10 ┆ null ┆ 9 │
|
1589
|
+
# # └─────┴──────┴─────┘
|
1590
|
+
def interpolate
  # Build the interpolation expression over all columns ("*"), then select it.
  interpolated = Utils.col("*").interpolate
  select(interpolated)
end
|
302
1593
|
|
303
|
-
#
|
304
|
-
#
|
1594
|
+
# Decompose a struct into its fields.
|
1595
|
+
#
|
1596
|
+
# The fields will be inserted into the `DataFrame` on the location of the
|
1597
|
+
# `struct` type.
|
1598
|
+
#
|
1599
|
+
# @param names [Object]
|
1600
|
+
# Names of the struct columns that will be decomposed into their fields.
|
1601
|
+
#
|
1602
|
+
# @return [LazyFrame]
|
1603
|
+
#
|
1604
|
+
# @example
|
1605
|
+
# df = (
|
1606
|
+
# Polars::DataFrame.new(
|
1607
|
+
# {
|
1608
|
+
# "before" => ["foo", "bar"],
|
1609
|
+
# "t_a" => [1, 2],
|
1610
|
+
# "t_b" => ["a", "b"],
|
1611
|
+
# "t_c" => [true, nil],
|
1612
|
+
# "t_d" => [[1, 2], [3]],
|
1613
|
+
# "after" => ["baz", "womp"]
|
1614
|
+
# }
|
1615
|
+
# )
|
1616
|
+
# .lazy
|
1617
|
+
# .select(
|
1618
|
+
# ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
|
1619
|
+
# )
|
1620
|
+
# )
|
1621
|
+
# df.fetch
|
1622
|
+
# # =>
|
1623
|
+
# # shape: (2, 3)
|
1624
|
+
# # ┌────────┬─────────────────────┬───────┐
|
1625
|
+
# # │ before ┆ t_struct ┆ after │
|
1626
|
+
# # │ --- ┆ --- ┆ --- │
|
1627
|
+
# # │ str ┆ struct[4] ┆ str │
|
1628
|
+
# # ╞════════╪═════════════════════╪═══════╡
|
1629
|
+
# # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
|
1630
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1631
|
+
# # │ bar ┆ {2,"b",null,[3]} ┆ womp │
|
1632
|
+
# # └────────┴─────────────────────┴───────┘
|
1633
|
+
#
|
1634
|
+
# @example
|
1635
|
+
# df.unnest("t_struct").fetch
|
1636
|
+
# # =>
|
1637
|
+
# # shape: (2, 6)
|
1638
|
+
# # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
|
1639
|
+
# # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
|
1640
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
1641
|
+
# # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
|
1642
|
+
# # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
|
1643
|
+
# # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
|
1644
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1645
|
+
# # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
|
1646
|
+
# # └────────┴─────┴─────┴──────┴───────────┴───────┘
|
1647
|
+
def unnest(names)
  # A lone column name is wrapped so the native call always receives an array.
  struct_columns = names.is_a?(String) ? [names] : names
  unnested = _ldf.unnest(struct_columns)
  _from_rbldf(unnested)
end
|
305
1653
|
|
306
1654
|
private
|
307
1655
|
|