polars-df 0.13.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +208 -0
- data/Cargo.lock +2556 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +39278 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +104 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +36 -0
- data/lib/polars/cat_name_space.rb +88 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +98 -0
- data/lib/polars/data_frame.rb +5191 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1397 -0
- data/lib/polars/date_time_name_space.rb +1287 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +38 -0
- data/lib/polars/expr.rb +7256 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +271 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1329 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +136 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +613 -0
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/io/csv.rb +696 -0
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +275 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +233 -0
- data/lib/polars/lazy_frame.rb +2708 -0
- data/lib/polars/lazy_group_by.rb +181 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +449 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +4444 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1495 -0
- data/lib/polars/string_name_space.rb +811 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +130 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +91 -0
- metadata +138 -0
@@ -0,0 +1,2708 @@
|
|
1
|
+
module Polars
|
2
|
+
# Representation of a Lazy computation graph/query against a DataFrame.
|
3
|
+
class LazyFrame
|
4
|
+
# @private
|
5
|
+
attr_accessor :_ldf
|
6
|
+
|
7
|
+
# Create a new LazyFrame.
|
8
|
+
def initialize(data = nil, schema: nil, schema_overrides: nil, orient: nil, infer_schema_length: 100, nan_to_null: false)
  # Build an eager DataFrame first, then adopt the internal handle of its
  # lazy representation.
  eager = DataFrame.new(
    data,
    schema: schema,
    schema_overrides: schema_overrides,
    orient: orient,
    infer_schema_length: infer_schema_length,
    nan_to_null: nan_to_null
  )
  self._ldf = eager.lazy._ldf
end
|
22
|
+
|
23
|
+
# @private
|
24
|
+
def self._from_rbldf(rb_ldf)
  # Wrap a native lazy-frame handle without running the public constructor.
  LazyFrame.allocate.tap { |ldf| ldf._ldf = rb_ldf }
end
|
29
|
+
|
30
|
+
# def self.from_json
|
31
|
+
# end
|
32
|
+
|
33
|
+
# Read a logical plan from a JSON file to construct a LazyFrame.
|
34
|
+
#
|
35
|
+
# @param file [String]
|
36
|
+
# Path to a file or a file-like object.
|
37
|
+
#
|
38
|
+
# @return [LazyFrame]
|
39
|
+
def self.read_json(file)
  # Path-like inputs are normalized to a plain file path before handing
  # off to the native reader.
  file = Utils.normalize_filepath(file) if Utils.pathlike?(file)

  Utils.wrap_ldf(RbLazyFrame.read_json(file))
end
|
46
|
+
|
47
|
+
# Get or set column names.
|
48
|
+
#
|
49
|
+
# @return [Array]
|
50
|
+
#
|
51
|
+
# @example
|
52
|
+
# df = (
|
53
|
+
# Polars::DataFrame.new(
|
54
|
+
# {
|
55
|
+
# "foo" => [1, 2, 3],
|
56
|
+
# "bar" => [6, 7, 8],
|
57
|
+
# "ham" => ["a", "b", "c"]
|
58
|
+
# }
|
59
|
+
# )
|
60
|
+
# .lazy
|
61
|
+
# .select(["foo", "bar"])
|
62
|
+
# )
|
63
|
+
# df.columns
|
64
|
+
# # => ["foo", "bar"]
|
65
|
+
def columns
  # Resolving the schema may require computation on the native side.
  resolved_schema = _ldf.collect_schema
  resolved_schema.keys
end
|
68
|
+
|
69
|
+
# Get dtypes of columns in LazyFrame.
|
70
|
+
#
|
71
|
+
# @return [Array]
|
72
|
+
#
|
73
|
+
# @example
|
74
|
+
# lf = Polars::DataFrame.new(
|
75
|
+
# {
|
76
|
+
# "foo" => [1, 2, 3],
|
77
|
+
# "bar" => [6.0, 7.0, 8.0],
|
78
|
+
# "ham" => ["a", "b", "c"]
|
79
|
+
# }
|
80
|
+
# ).lazy
|
81
|
+
# lf.dtypes
|
82
|
+
# # => [Polars::Int64, Polars::Float64, Polars::String]
|
83
|
+
def dtypes
  # Resolving the schema may require computation on the native side.
  resolved_schema = _ldf.collect_schema
  resolved_schema.values
end
|
86
|
+
|
87
|
+
# Get the schema.
|
88
|
+
#
|
89
|
+
# @return [Hash]
|
90
|
+
#
|
91
|
+
# @example
|
92
|
+
# lf = Polars::DataFrame.new(
|
93
|
+
# {
|
94
|
+
# "foo" => [1, 2, 3],
|
95
|
+
# "bar" => [6.0, 7.0, 8.0],
|
96
|
+
# "ham" => ["a", "b", "c"]
|
97
|
+
# }
|
98
|
+
# ).lazy
|
99
|
+
# lf.schema
|
100
|
+
# # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::String}
|
101
|
+
def schema
  # Delegates to the native lazy frame; resolving the schema may require
  # computation on the native side.
  _ldf.collect_schema
end
|
104
|
+
|
105
|
+
# Get the width of the LazyFrame.
|
106
|
+
#
|
107
|
+
# @return [Integer]
|
108
|
+
#
|
109
|
+
# @example
|
110
|
+
# lf = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]}).lazy
|
111
|
+
# lf.width
|
112
|
+
# # => 2
|
113
|
+
def width
  # The width is the number of entries in the resolved schema.
  resolved_schema = _ldf.collect_schema
  resolved_schema.length
end
|
116
|
+
|
117
|
+
# Check if LazyFrame includes key.
|
118
|
+
#
|
119
|
+
# @return [Boolean]
|
120
|
+
def include?(key)
  # Array#member? is an alias of #include? on the column-name list.
  columns.member?(key)
end
|
123
|
+
|
124
|
+
# clone handled by initialize_copy
|
125
|
+
|
126
|
+
# def [](item)
|
127
|
+
# end
|
128
|
+
|
129
|
+
# Returns a string representing the LazyFrame.
|
130
|
+
#
|
131
|
+
# @return [String]
|
132
|
+
def to_s
  # Renders the unoptimized (naive) plan; the heredoc content must stay
  # byte-identical since callers may display it verbatim.
  <<~PLAN
    naive plan: (run LazyFrame#describe_optimized_plan to see the optimized plan)

    #{describe_plan}
  PLAN
end
|
139
|
+
|
140
|
+
# Write the logical plan of this LazyFrame to a file or string in JSON format.
|
141
|
+
#
|
142
|
+
# @param file [String]
|
143
|
+
# File path to which the result should be written.
|
144
|
+
#
|
145
|
+
# @return [nil]
|
146
|
+
def write_json(file)
  # Normalize path-like objects before handing the path to the native layer.
  file = Utils.normalize_filepath(file) if Utils.pathlike?(file)
  _ldf.write_json(file)
  nil
end
|
153
|
+
|
154
|
+
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
155
|
+
#
|
156
|
+
# @param func [Object]
|
157
|
+
# Callable; will receive the frame as the first parameter,
|
158
|
+
# followed by any given args/kwargs.
|
159
|
+
# @param args [Object]
|
160
|
+
# Arguments to pass to the UDF.
|
161
|
+
# @param kwargs [Object]
|
162
|
+
# Keyword arguments to pass to the UDF.
|
163
|
+
#
|
164
|
+
# @return [LazyFrame]
|
165
|
+
#
|
166
|
+
# @example
|
167
|
+
# cast_str_to_int = lambda do |data, col_name:|
|
168
|
+
# data.with_column(Polars.col(col_name).cast(:i64))
|
169
|
+
# end
|
170
|
+
#
|
171
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]}).lazy
|
172
|
+
# df.pipe(cast_str_to_int, col_name: "b").collect
|
173
|
+
# # =>
|
174
|
+
# # shape: (4, 2)
|
175
|
+
# # ┌─────┬─────┐
|
176
|
+
# # │ a ┆ b │
|
177
|
+
# # │ --- ┆ --- │
|
178
|
+
# # │ i64 ┆ i64 │
|
179
|
+
# # ╞═════╪═════╡
|
180
|
+
# # │ 1 ┆ 10 │
|
181
|
+
# # │ 2 ┆ 20 │
|
182
|
+
# # │ 3 ┆ 30 │
|
183
|
+
# # │ 4 ┆ 40 │
|
184
|
+
# # └─────┴─────┘
|
185
|
+
def pipe(func, *args, **kwargs, &block)
  # Invoke the callable with this frame as the first argument, forwarding
  # all remaining positional/keyword arguments and any block.
  func.(self, *args, **kwargs, &block)
end
|
188
|
+
|
189
|
+
# Create a string representation of the unoptimized query plan.
|
190
|
+
#
|
191
|
+
# @return [String]
|
192
|
+
def describe_plan
  # Delegates to the native lazy frame; no optimizations are applied to
  # the plan before rendering.
  _ldf.describe_plan
end
|
195
|
+
|
196
|
+
# Create a string representation of the optimized query plan.
|
197
|
+
#
|
198
|
+
# @return [String]
|
199
|
+
def describe_optimized_plan(
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  slice_pushdown: true,
  common_subplan_elimination: true,
  comm_subexpr_elim: true,
  allow_streaming: false
)
  # Apply the requested optimization flags to the plan. The positional
  # argument order must match the native `optimization_toggle` signature;
  # the final argument is the eager flag, always false here.
  ldf = _ldf.optimization_toggle(
    type_coercion,
    predicate_pushdown,
    projection_pushdown,
    simplify_expression,
    slice_pushdown,
    common_subplan_elimination,
    comm_subexpr_elim,
    allow_streaming,
    false
  )

  ldf.describe_optimized_plan
end
|
223
|
+
|
224
|
+
# def show_graph
|
225
|
+
# end
|
226
|
+
|
227
|
+
# Sort the DataFrame.
|
228
|
+
#
|
229
|
+
# Sorting can be done by:
|
230
|
+
#
|
231
|
+
# - A single column name
|
232
|
+
# - An expression
|
233
|
+
# - Multiple expressions
|
234
|
+
#
|
235
|
+
# @param by [Object]
|
236
|
+
# Column (expressions) to sort by.
|
237
|
+
# @param reverse [Boolean]
|
238
|
+
# Sort in descending order.
|
239
|
+
# @param nulls_last [Boolean]
|
240
|
+
# Place null values last. Can only be used if sorted by a single column.
|
241
|
+
#
|
242
|
+
# @return [LazyFrame]
|
243
|
+
#
|
244
|
+
# @example
|
245
|
+
# df = Polars::DataFrame.new(
|
246
|
+
# {
|
247
|
+
# "foo" => [1, 2, 3],
|
248
|
+
# "bar" => [6.0, 7.0, 8.0],
|
249
|
+
# "ham" => ["a", "b", "c"]
|
250
|
+
# }
|
251
|
+
# ).lazy
|
252
|
+
# df.sort("foo", reverse: true).collect
|
253
|
+
# # =>
|
254
|
+
# # shape: (3, 3)
|
255
|
+
# # ┌─────┬─────┬─────┐
|
256
|
+
# # │ foo ┆ bar ┆ ham │
|
257
|
+
# # │ --- ┆ --- ┆ --- │
|
258
|
+
# # │ i64 ┆ f64 ┆ str │
|
259
|
+
# # ╞═════╪═════╪═════╡
|
260
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
261
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
262
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
263
|
+
# # └─────┴─────┴─────┘
|
264
|
+
def sort(by, *more_by, reverse: false, nulls_last: false, maintain_order: false, multithreaded: true)
  # Fast path: a single plain column name sorts via the simpler native
  # entry point without building expressions.
  if by.is_a?(::String) && more_by.empty?
    return _from_rbldf(
      _ldf.sort(
        by, reverse, nulls_last, maintain_order, multithreaded
      )
    )
  end

  # General path: parse everything into expressions and broadcast the
  # scalar `reverse`/`nulls_last` flags to one entry per sort key.
  by = Utils.parse_into_list_of_expressions(by, *more_by)
  reverse = Utils.extend_bool(reverse, by.length, "reverse", "by")
  nulls_last = Utils.extend_bool(nulls_last, by.length, "nulls_last", "by")
  _from_rbldf(
    _ldf.sort_by_exprs(
      by, reverse, nulls_last, maintain_order, multithreaded
    )
  )
end
|
282
|
+
|
283
|
+
# def profile
|
284
|
+
# end
|
285
|
+
|
286
|
+
# Collect into a DataFrame.
|
287
|
+
#
|
288
|
+
# Note: use {#fetch} if you want to run your query on the first `n` rows
|
289
|
+
# only. This can be a huge time saver in debugging queries.
|
290
|
+
#
|
291
|
+
# @param type_coercion [Boolean]
|
292
|
+
# Do type coercion optimization.
|
293
|
+
# @param predicate_pushdown [Boolean]
|
294
|
+
# Do predicate pushdown optimization.
|
295
|
+
# @param projection_pushdown [Boolean]
|
296
|
+
# Do projection pushdown optimization.
|
297
|
+
# @param simplify_expression [Boolean]
|
298
|
+
# Run simplify expressions optimization.
|
299
|
+
# @param string_cache [Boolean]
|
300
|
+
# This argument is deprecated. Please set the string cache globally.
|
301
|
+
# The argument will be ignored
|
302
|
+
# @param no_optimization [Boolean]
|
303
|
+
# Turn off (certain) optimizations.
|
304
|
+
# @param slice_pushdown [Boolean]
|
305
|
+
# Slice pushdown optimization.
|
306
|
+
# @param common_subplan_elimination [Boolean]
|
307
|
+
# Will try to cache branching subplans that occur on self-joins or unions.
|
308
|
+
# @param allow_streaming [Boolean]
|
309
|
+
# Run parts of the query in a streaming fashion (this is in an alpha state)
|
310
|
+
#
|
311
|
+
# @return [DataFrame]
|
312
|
+
#
|
313
|
+
# @example
|
314
|
+
# df = Polars::DataFrame.new(
|
315
|
+
# {
|
316
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
317
|
+
# "b" => [1, 2, 3, 4, 5, 6],
|
318
|
+
# "c" => [6, 5, 4, 3, 2, 1]
|
319
|
+
# }
|
320
|
+
# ).lazy
|
321
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).collect
|
322
|
+
# # =>
|
323
|
+
# # shape: (3, 3)
|
324
|
+
# # ┌─────┬─────┬─────┐
|
325
|
+
# # │ a ┆ b ┆ c │
|
326
|
+
# # │ --- ┆ --- ┆ --- │
|
327
|
+
# # │ str ┆ i64 ┆ i64 │
|
328
|
+
# # ╞═════╪═════╪═════╡
|
329
|
+
# # │ a ┆ 4 ┆ 10 │
|
330
|
+
# # │ b ┆ 11 ┆ 10 │
|
331
|
+
# # │ c ┆ 6 ┆ 1 │
|
332
|
+
# # └─────┴─────┴─────┘
|
333
|
+
def collect(
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  string_cache: false,
  no_optimization: false,
  slice_pushdown: true,
  common_subplan_elimination: true,
  comm_subexpr_elim: true,
  allow_streaming: false,
  _eager: false
)
  # `no_optimization` is a convenience switch that disables the pushdown
  # and subplan/subexpression elimination passes in one go.
  if no_optimization
    predicate_pushdown = false
    projection_pushdown = false
    slice_pushdown = false
    common_subplan_elimination = false
    comm_subexpr_elim = false
  end

  # Common subplan elimination is incompatible with the streaming engine.
  if allow_streaming
    common_subplan_elimination = false
  end

  # Positional order must match the native `optimization_toggle` signature.
  ldf = _ldf.optimization_toggle(
    type_coercion,
    predicate_pushdown,
    projection_pushdown,
    simplify_expression,
    slice_pushdown,
    common_subplan_elimination,
    comm_subexpr_elim,
    allow_streaming,
    _eager
  )
  Utils.wrap_df(ldf.collect)
end
|
371
|
+
|
372
|
+
# Persists a LazyFrame at the provided path.
|
373
|
+
#
|
374
|
+
# This allows streaming results that are larger than RAM to be written to disk.
|
375
|
+
#
|
376
|
+
# @param path [String]
|
377
|
+
# File path to which the file should be written.
|
378
|
+
# @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
|
379
|
+
# Choose "zstd" for good compression performance.
|
380
|
+
# Choose "lz4" for fast compression/decompression.
|
381
|
+
# Choose "snappy" for more backwards compatibility guarantees
|
382
|
+
# when you deal with older parquet readers.
|
383
|
+
# @param compression_level [Integer]
|
384
|
+
# The level of compression to use. Higher compression means smaller files on
|
385
|
+
# disk.
|
386
|
+
#
|
387
|
+
# - "gzip" : min-level: 0, max-level: 10.
|
388
|
+
# - "brotli" : min-level: 0, max-level: 11.
|
389
|
+
# - "zstd" : min-level: 1, max-level: 22.
|
390
|
+
# @param statistics [Boolean]
|
391
|
+
# Write statistics to the parquet headers. This requires extra compute.
|
392
|
+
# @param row_group_size [Integer]
|
393
|
+
# Size of the row groups in number of rows.
|
394
|
+
# If `nil` (default), the chunks of the `DataFrame` are
|
395
|
+
# used. Writing in smaller chunks may reduce memory pressure and improve
|
396
|
+
# writing speeds.
|
397
|
+
# @param data_pagesize_limit [Integer]
|
398
|
+
# Size limit of individual data pages.
|
399
|
+
# If not set defaults to 1024 * 1024 bytes
|
400
|
+
# @param maintain_order [Boolean]
|
401
|
+
# Maintain the order in which data is processed.
|
402
|
+
# Setting this to `false` will be slightly faster.
|
403
|
+
# @param type_coercion [Boolean]
|
404
|
+
# Do type coercion optimization.
|
405
|
+
# @param predicate_pushdown [Boolean]
|
406
|
+
# Do predicate pushdown optimization.
|
407
|
+
# @param projection_pushdown [Boolean]
|
408
|
+
# Do projection pushdown optimization.
|
409
|
+
# @param simplify_expression [Boolean]
|
410
|
+
# Run simplify expressions optimization.
|
411
|
+
# @param no_optimization [Boolean]
|
412
|
+
# Turn off (certain) optimizations.
|
413
|
+
# @param slice_pushdown [Boolean]
|
414
|
+
# Slice pushdown optimization.
|
415
|
+
#
|
416
|
+
# @return [DataFrame]
|
417
|
+
#
|
418
|
+
# @example
|
419
|
+
# lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
|
420
|
+
# lf.sink_parquet("out.parquet")
|
421
|
+
def sink_parquet(
  path,
  compression: "zstd",
  compression_level: nil,
  statistics: true,
  row_group_size: nil,
  data_pagesize_limit: nil,
  maintain_order: true,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  no_optimization: false,
  slice_pushdown: true
)
  lf = _set_sink_optimizations(
    type_coercion: type_coercion,
    predicate_pushdown: predicate_pushdown,
    projection_pushdown: projection_pushdown,
    simplify_expression: simplify_expression,
    slice_pushdown: slice_pushdown,
    no_optimization: no_optimization
  )

  # Normalize the `statistics` flag into the per-statistic hash the native
  # writer expects: `true` enables the default set (no distinct counts),
  # `false` disables all statistics, and "full" enables everything.
  if statistics == true
    statistics = {
      min: true,
      max: true,
      distinct_count: false,
      null_count: true
    }
  elsif statistics == false
    statistics = {}
  elsif statistics == "full"
    statistics = {
      min: true,
      max: true,
      distinct_count: true,
      null_count: true
    }
  end

  lf.sink_parquet(
    path,
    compression,
    compression_level,
    statistics,
    row_group_size,
    data_pagesize_limit,
    maintain_order
  )
end
|
473
|
+
|
474
|
+
# Evaluate the query in streaming mode and write to an IPC file.
|
475
|
+
#
|
476
|
+
# This allows streaming results that are larger than RAM to be written to disk.
|
477
|
+
#
|
478
|
+
# @param path [String]
|
479
|
+
# File path to which the file should be written.
|
480
|
+
# @param compression ["lz4", "zstd"]
|
481
|
+
# Choose "zstd" for good compression performance.
|
482
|
+
# Choose "lz4" for fast compression/decompression.
|
483
|
+
# @param maintain_order [Boolean]
|
484
|
+
# Maintain the order in which data is processed.
|
485
|
+
# Setting this to `false` will be slightly faster.
|
486
|
+
# @param type_coercion [Boolean]
|
487
|
+
# Do type coercion optimization.
|
488
|
+
# @param predicate_pushdown [Boolean]
|
489
|
+
# Do predicate pushdown optimization.
|
490
|
+
# @param projection_pushdown [Boolean]
|
491
|
+
# Do projection pushdown optimization.
|
492
|
+
# @param simplify_expression [Boolean]
|
493
|
+
# Run simplify expressions optimization.
|
494
|
+
# @param slice_pushdown [Boolean]
|
495
|
+
# Slice pushdown optimization.
|
496
|
+
# @param no_optimization [Boolean]
|
497
|
+
# Turn off (certain) optimizations.
|
498
|
+
#
|
499
|
+
# @return [DataFrame]
|
500
|
+
#
|
501
|
+
# @example
|
502
|
+
# lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
|
503
|
+
# lf.sink_ipc("out.arrow")
|
504
|
+
def sink_ipc(
  path,
  compression: "zstd",
  maintain_order: true,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  slice_pushdown: true,
  no_optimization: false
)
  # Toggle optimizations for streaming execution, then hand off to the
  # native IPC sink.
  lf = _set_sink_optimizations(
    type_coercion: type_coercion,
    predicate_pushdown: predicate_pushdown,
    projection_pushdown: projection_pushdown,
    simplify_expression: simplify_expression,
    slice_pushdown: slice_pushdown,
    no_optimization: no_optimization
  )

  lf.sink_ipc(
    path,
    compression,
    maintain_order
  )
end
|
530
|
+
|
531
|
+
# Evaluate the query in streaming mode and write to a CSV file.
|
532
|
+
#
|
533
|
+
# This allows streaming results that are larger than RAM to be written to disk.
|
534
|
+
#
|
535
|
+
# @param path [String]
|
536
|
+
# File path to which the file should be written.
|
537
|
+
# @param include_bom [Boolean]
|
538
|
+
# Whether to include UTF-8 BOM in the CSV output.
|
539
|
+
# @param include_header [Boolean]
|
540
|
+
# Whether to include header in the CSV output.
|
541
|
+
# @param separator [String]
|
542
|
+
# Separate CSV fields with this symbol.
|
543
|
+
# @param line_terminator [String]
|
544
|
+
# String used to end each row.
|
545
|
+
# @param quote_char [String]
|
546
|
+
# Byte to use as quoting character.
|
547
|
+
# @param batch_size [Integer]
|
548
|
+
# Number of rows that will be processed per thread.
|
549
|
+
# @param datetime_format [String]
|
550
|
+
# A format string, with the specifiers defined by the
|
551
|
+
# `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
552
|
+
# Rust crate. If no format specified, the default fractional-second
|
553
|
+
# precision is inferred from the maximum timeunit found in the frame's
|
554
|
+
# Datetime cols (if any).
|
555
|
+
# @param date_format [String]
|
556
|
+
# A format string, with the specifiers defined by the
|
557
|
+
# `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
558
|
+
# Rust crate.
|
559
|
+
# @param time_format [String]
|
560
|
+
# A format string, with the specifiers defined by the
|
561
|
+
# `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
562
|
+
# Rust crate.
|
563
|
+
# @param float_precision [Integer]
|
564
|
+
# Number of decimal places to write, applied to both `Float32` and
|
565
|
+
# `Float64` datatypes.
|
566
|
+
# @param null_value [String]
|
567
|
+
# A string representing null values (defaulting to the empty string).
|
568
|
+
# @param quote_style ["necessary", "always", "non_numeric", "never"]
|
569
|
+
# Determines the quoting strategy used.
|
570
|
+
#
|
571
|
+
# - necessary (default): This puts quotes around fields only when necessary.
|
572
|
+
# They are necessary when fields contain a quote,
|
573
|
+
# delimiter or record terminator.
|
574
|
+
# Quotes are also necessary when writing an empty record
|
575
|
+
# (which is indistinguishable from a record with one empty field).
|
576
|
+
# This is the default.
|
577
|
+
# - always: This puts quotes around every field. Always.
|
578
|
+
# - never: This never puts quotes around fields, even if that results in
|
579
|
+
# invalid CSV data (e.g.: by not quoting strings containing the
|
580
|
+
# separator).
|
581
|
+
# - non_numeric: This puts quotes around all fields that are non-numeric.
|
582
|
+
# Namely, when writing a field that does not parse as a valid float
|
583
|
+
# or integer, then quotes will be used even if they aren`t strictly
|
584
|
+
# necessary.
|
585
|
+
# @param maintain_order [Boolean]
|
586
|
+
# Maintain the order in which data is processed.
|
587
|
+
# Setting this to `false` will be slightly faster.
|
588
|
+
# @param type_coercion [Boolean]
|
589
|
+
# Do type coercion optimization.
|
590
|
+
# @param predicate_pushdown [Boolean]
|
591
|
+
# Do predicate pushdown optimization.
|
592
|
+
# @param projection_pushdown [Boolean]
|
593
|
+
# Do projection pushdown optimization.
|
594
|
+
# @param simplify_expression [Boolean]
|
595
|
+
# Run simplify expressions optimization.
|
596
|
+
# @param slice_pushdown [Boolean]
|
597
|
+
# Slice pushdown optimization.
|
598
|
+
# @param no_optimization [Boolean]
|
599
|
+
# Turn off (certain) optimizations.
|
600
|
+
#
|
601
|
+
# @return [DataFrame]
|
602
|
+
#
|
603
|
+
# @example
|
604
|
+
# lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
|
605
|
+
# lf.sink_csv("out.csv")
|
606
|
+
def sink_csv(
  path,
  include_bom: false,
  include_header: true,
  separator: ",",
  line_terminator: "\n",
  quote_char: '"',
  batch_size: 1024,
  datetime_format: nil,
  date_format: nil,
  time_format: nil,
  float_scientific: nil,
  float_precision: nil,
  null_value: nil,
  quote_style: nil,
  maintain_order: true,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  slice_pushdown: true,
  no_optimization: false
)
  # The native writer requires single-byte separator and quote characters.
  Utils._check_arg_is_1byte("separator", separator, false)
  Utils._check_arg_is_1byte("quote_char", quote_char, false)

  lf = _set_sink_optimizations(
    type_coercion: type_coercion,
    predicate_pushdown: predicate_pushdown,
    projection_pushdown: projection_pushdown,
    simplify_expression: simplify_expression,
    slice_pushdown: slice_pushdown,
    no_optimization: no_optimization
  )

  # `separator`/`quote_char` are passed as integer codepoints (`.ord`)
  # because the native layer expects raw bytes, not strings.
  lf.sink_csv(
    path,
    include_bom,
    include_header,
    separator.ord,
    line_terminator,
    quote_char.ord,
    batch_size,
    datetime_format,
    date_format,
    time_format,
    float_scientific,
    float_precision,
    null_value,
    quote_style,
    maintain_order
  )
end
|
659
|
+
|
660
|
+
# Evaluate the query in streaming mode and write to an NDJSON file.
|
661
|
+
#
|
662
|
+
# This allows streaming results that are larger than RAM to be written to disk.
|
663
|
+
#
|
664
|
+
# @param path [String]
|
665
|
+
# File path to which the file should be written.
|
666
|
+
# @param maintain_order [Boolean]
|
667
|
+
# Maintain the order in which data is processed.
|
668
|
+
# Setting this to `false` will be slightly faster.
|
669
|
+
# @param type_coercion [Boolean]
|
670
|
+
# Do type coercion optimization.
|
671
|
+
# @param predicate_pushdown [Boolean]
|
672
|
+
# Do predicate pushdown optimization.
|
673
|
+
# @param projection_pushdown [Boolean]
|
674
|
+
# Do projection pushdown optimization.
|
675
|
+
# @param simplify_expression [Boolean]
|
676
|
+
# Run simplify expressions optimization.
|
677
|
+
# @param slice_pushdown [Boolean]
|
678
|
+
# Slice pushdown optimization.
|
679
|
+
# @param no_optimization [Boolean]
|
680
|
+
# Turn off (certain) optimizations.
|
681
|
+
#
|
682
|
+
# @return [DataFrame]
|
683
|
+
#
|
684
|
+
# @example
|
685
|
+
# lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
|
686
|
+
# lf.sink_ndjson("out.ndjson")
|
687
|
+
def sink_ndjson(
  path,
  maintain_order: true,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  slice_pushdown: true,
  no_optimization: false
)
  lf = _set_sink_optimizations(
    type_coercion: type_coercion,
    predicate_pushdown: predicate_pushdown,
    projection_pushdown: projection_pushdown,
    simplify_expression: simplify_expression,
    slice_pushdown: slice_pushdown,
    no_optimization: no_optimization
  )

  # NOTE(review): delegates to the native `sink_json` entry point;
  # presumably that writes newline-delimited JSON — confirm against the
  # Rust binding's method name.
  lf.sink_json(path, maintain_order)
end
|
708
|
+
|
709
|
+
# @private
|
710
|
+
def _set_sink_optimizations(
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  slice_pushdown: true,
  no_optimization: false
)
  # `no_optimization` is a convenience switch that disables the pushdown
  # optimizations in one go.
  if no_optimization
    predicate_pushdown = false
    projection_pushdown = false
    slice_pushdown = false
  end

  # Positional order matches the native `optimization_toggle` signature as
  # used in #collect/#fetch; sinks always disable subplan/subexpression
  # elimination and run with streaming enabled.
  _ldf.optimization_toggle(
    type_coercion,
    predicate_pushdown,
    projection_pushdown,
    simplify_expression,
    slice_pushdown,
    false, # common_subplan_elimination
    false, # comm_subexpr_elim
    true,  # allow_streaming
    false  # _eager
  )
end
|
736
|
+
|
737
|
+
# Collect a small number of rows for debugging purposes.
|
738
|
+
#
|
739
|
+
# Fetch is like a {#collect} operation, but it overwrites the number of rows
|
740
|
+
# read by every scan operation. This is a utility that helps debug a query on a
|
741
|
+
# smaller number of rows.
|
742
|
+
#
|
743
|
+
# Note that the fetch does not guarantee the final number of rows in the
|
744
|
+
# DataFrame. Filter, join operations and a lower number of rows available in the
|
745
|
+
# scanned file influence the final number of rows.
|
746
|
+
#
|
747
|
+
# @param n_rows [Integer]
|
748
|
+
# Collect n_rows from the data sources.
|
749
|
+
# @param type_coercion [Boolean]
|
750
|
+
# Run type coercion optimization.
|
751
|
+
# @param predicate_pushdown [Boolean]
|
752
|
+
# Run predicate pushdown optimization.
|
753
|
+
# @param projection_pushdown [Boolean]
|
754
|
+
# Run projection pushdown optimization.
|
755
|
+
# @param simplify_expression [Boolean]
|
756
|
+
# Run simplify expressions optimization.
|
757
|
+
# @param string_cache [Boolean]
|
758
|
+
# This argument is deprecated. Please set the string cache globally.
|
759
|
+
# The argument will be ignored
|
760
|
+
# @param no_optimization [Boolean]
|
761
|
+
# Turn off optimizations.
|
762
|
+
# @param slice_pushdown [Boolean]
|
763
|
+
# Slice pushdown optimization
|
764
|
+
# @param common_subplan_elimination [Boolean]
|
765
|
+
# Will try to cache branching subplans that occur on self-joins or unions.
|
766
|
+
# @param allow_streaming [Boolean]
|
767
|
+
# Run parts of the query in a streaming fashion (this is in an alpha state)
|
768
|
+
#
|
769
|
+
# @return [DataFrame]
|
770
|
+
#
|
771
|
+
# @example
|
772
|
+
# df = Polars::DataFrame.new(
|
773
|
+
# {
|
774
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
775
|
+
# "b" => [1, 2, 3, 4, 5, 6],
|
776
|
+
# "c" => [6, 5, 4, 3, 2, 1]
|
777
|
+
# }
|
778
|
+
# ).lazy
|
779
|
+
# df.group_by("a", maintain_order: true).agg(Polars.all.sum).fetch(2)
|
780
|
+
# # =>
|
781
|
+
# # shape: (2, 3)
|
782
|
+
# # ┌─────┬─────┬─────┐
|
783
|
+
# # │ a ┆ b ┆ c │
|
784
|
+
# # │ --- ┆ --- ┆ --- │
|
785
|
+
# # │ str ┆ i64 ┆ i64 │
|
786
|
+
# # ╞═════╪═════╪═════╡
|
787
|
+
# # │ a ┆ 1 ┆ 6 │
|
788
|
+
# # │ b ┆ 2 ┆ 5 │
|
789
|
+
# # └─────┴─────┴─────┘
|
790
|
+
def fetch(
  n_rows = 500,
  type_coercion: true,
  predicate_pushdown: true,
  projection_pushdown: true,
  simplify_expression: true,
  string_cache: false,
  no_optimization: false,
  slice_pushdown: true,
  common_subplan_elimination: true,
  comm_subexpr_elim: true,
  allow_streaming: false
)
  # `no_optimization` is a convenience switch that disables the pushdown
  # passes and common subplan elimination in one go.
  if no_optimization
    predicate_pushdown = false
    projection_pushdown = false
    slice_pushdown = false
    common_subplan_elimination = false
  end

  # Positional order must match the native `optimization_toggle` signature;
  # the final argument is the eager flag, always false here.
  ldf = _ldf.optimization_toggle(
    type_coercion,
    predicate_pushdown,
    projection_pushdown,
    simplify_expression,
    slice_pushdown,
    common_subplan_elimination,
    comm_subexpr_elim,
    allow_streaming,
    false
  )
  Utils.wrap_df(ldf.fetch(n_rows))
end
|
823
|
+
|
824
|
+
# Return lazy representation, i.e. itself.
|
825
|
+
#
|
826
|
+
# Useful for writing code that expects either a `DataFrame` or
|
827
|
+
# `LazyFrame`.
|
828
|
+
#
|
829
|
+
# @return [LazyFrame]
|
830
|
+
#
|
831
|
+
# @example
|
832
|
+
# df = Polars::DataFrame.new(
|
833
|
+
# {
|
834
|
+
# "a" => [nil, 2, 3, 4],
|
835
|
+
# "b" => [0.5, nil, 2.5, 13],
|
836
|
+
# "c" => [true, true, false, nil]
|
837
|
+
# }
|
838
|
+
# )
|
839
|
+
# df.lazy
|
840
|
+
# A LazyFrame is already lazy, so this is simply the identity.
# Provided so code can accept either a DataFrame or a LazyFrame
# and call `lazy` on both.
#
# @return [LazyFrame]
def lazy
  self
end
|
843
|
+
|
844
|
+
# Cache the result once the execution of the physical plan hits this node.
|
845
|
+
#
|
846
|
+
# @return [LazyFrame]
|
847
|
+
# Cache the result once the execution of the physical plan hits this node.
#
# @return [LazyFrame] a new lazy frame whose plan contains a cache node
def cache
  cached = _ldf.cache
  _from_rbldf(cached)
end
|
850
|
+
|
851
|
+
# TODO
|
852
|
+
# def cast
|
853
|
+
# end
|
854
|
+
|
855
|
+
# Create an empty copy of the current LazyFrame.
|
856
|
+
#
|
857
|
+
# The copy has an identical schema but no data.
|
858
|
+
#
|
859
|
+
# @return [LazyFrame]
|
860
|
+
#
|
861
|
+
# @example
|
862
|
+
# lf = Polars::LazyFrame.new(
|
863
|
+
# {
|
864
|
+
# "a" => [nil, 2, 3, 4],
|
865
|
+
# "b" => [0.5, nil, 2.5, 13],
|
866
|
+
# "c" => [true, true, false, nil],
|
867
|
+
# }
|
868
|
+
# ).lazy
|
869
|
+
# lf.clear.fetch
|
870
|
+
# # =>
|
871
|
+
# # shape: (0, 3)
|
872
|
+
# # ┌─────┬─────┬──────┐
|
873
|
+
# # │ a ┆ b ┆ c │
|
874
|
+
# # │ --- ┆ --- ┆ --- │
|
875
|
+
# # │ i64 ┆ f64 ┆ bool │
|
876
|
+
# # ╞═════╪═════╪══════╡
|
877
|
+
# # └─────┴─────┴──────┘
|
878
|
+
#
|
879
|
+
# @example
|
880
|
+
# lf.clear(2).fetch
|
881
|
+
# # =>
|
882
|
+
# # shape: (2, 3)
|
883
|
+
# # ┌──────┬──────┬──────┐
|
884
|
+
# # │ a ┆ b ┆ c │
|
885
|
+
# # │ --- ┆ --- ┆ --- │
|
886
|
+
# # │ i64 ┆ f64 ┆ bool │
|
887
|
+
# # ╞══════╪══════╪══════╡
|
888
|
+
# # │ null ┆ null ┆ null │
|
889
|
+
# # │ null ┆ null ┆ null │
|
890
|
+
# # └──────┴──────┴──────┘
|
891
|
+
# Create an empty copy of the current LazyFrame, keeping the schema but no data.
#
# @param n [Integer]
#   Number of (all-null) rows the cleared frame should contain.
#
# @return [LazyFrame]
def clear(n = 0)
  # Materialize an empty frame with the same schema, resize it to n
  # null-filled rows, then return to the lazy domain.
  empty = DataFrame.new(columns: schema)
  empty.clear(n).lazy
end
alias_method :cleared, :clear
|
895
|
+
|
896
|
+
# Filter the rows in the DataFrame based on a predicate expression.
|
897
|
+
#
|
898
|
+
# @param predicate [Object]
|
899
|
+
# Expression that evaluates to a boolean Series.
|
900
|
+
#
|
901
|
+
# @return [LazyFrame]
|
902
|
+
#
|
903
|
+
# @example Filter on one condition:
|
904
|
+
# lf = Polars::DataFrame.new(
|
905
|
+
# {
|
906
|
+
# "foo" => [1, 2, 3],
|
907
|
+
# "bar" => [6, 7, 8],
|
908
|
+
# "ham" => ["a", "b", "c"]
|
909
|
+
# }
|
910
|
+
# ).lazy
|
911
|
+
# lf.filter(Polars.col("foo") < 3).collect
|
912
|
+
# # =>
|
913
|
+
# # shape: (2, 3)
|
914
|
+
# # ┌─────┬─────┬─────┐
|
915
|
+
# # │ foo ┆ bar ┆ ham │
|
916
|
+
# # │ --- ┆ --- ┆ --- │
|
917
|
+
# # │ i64 ┆ i64 ┆ str │
|
918
|
+
# # ╞═════╪═════╪═════╡
|
919
|
+
# # │ 1 ┆ 6 ┆ a │
|
920
|
+
# # │ 2 ┆ 7 ┆ b │
|
921
|
+
# # └─────┴─────┴─────┘
|
922
|
+
#
|
923
|
+
# @example Filter on multiple conditions:
|
924
|
+
# lf.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a")).collect
|
925
|
+
# # =>
|
926
|
+
# # shape: (1, 3)
|
927
|
+
# # ┌─────┬─────┬─────┐
|
928
|
+
# # │ foo ┆ bar ┆ ham │
|
929
|
+
# # │ --- ┆ --- ┆ --- │
|
930
|
+
# # │ i64 ┆ i64 ┆ str │
|
931
|
+
# # ╞═════╪═════╪═════╡
|
932
|
+
# # │ 1 ┆ 6 ┆ a │
|
933
|
+
# # └─────┴─────┴─────┘
|
934
|
+
# Filter the rows in the LazyFrame based on a predicate expression.
#
# @param predicate [Object]
#   Expression that evaluates to a boolean Series. Strings are treated as
#   column names rather than literals.
#
# @return [LazyFrame]
def filter(predicate)
  rbexpr = Utils.parse_into_expression(predicate, str_as_lit: false)
  _from_rbldf(_ldf.filter(rbexpr))
end
|
941
|
+
|
942
|
+
# Select columns from this DataFrame.
|
943
|
+
#
|
944
|
+
# @param exprs [Array]
|
945
|
+
# Column(s) to select, specified as positional arguments.
|
946
|
+
# Accepts expression input. Strings are parsed as column names,
|
947
|
+
# other non-expression inputs are parsed as literals.
|
948
|
+
# @param named_exprs [Hash]
|
949
|
+
# Additional columns to select, specified as keyword arguments.
|
950
|
+
# The columns will be renamed to the keyword used.
|
951
|
+
#
|
952
|
+
# @return [LazyFrame]
|
953
|
+
#
|
954
|
+
# @example
|
955
|
+
# df = Polars::DataFrame.new(
|
956
|
+
# {
|
957
|
+
# "foo" => [1, 2, 3],
|
958
|
+
# "bar" => [6, 7, 8],
|
959
|
+
# "ham" => ["a", "b", "c"],
|
960
|
+
# }
|
961
|
+
# ).lazy
|
962
|
+
# df.select("foo").collect
|
963
|
+
# # =>
|
964
|
+
# # shape: (3, 1)
|
965
|
+
# # ┌─────┐
|
966
|
+
# # │ foo │
|
967
|
+
# # │ --- │
|
968
|
+
# # │ i64 │
|
969
|
+
# # ╞═════╡
|
970
|
+
# # │ 1 │
|
971
|
+
# # │ 2 │
|
972
|
+
# # │ 3 │
|
973
|
+
# # └─────┘
|
974
|
+
#
|
975
|
+
# @example
|
976
|
+
# df.select(["foo", "bar"]).collect
|
977
|
+
# # =>
|
978
|
+
# # shape: (3, 2)
|
979
|
+
# # ┌─────┬─────┐
|
980
|
+
# # │ foo ┆ bar │
|
981
|
+
# # │ --- ┆ --- │
|
982
|
+
# # │ i64 ┆ i64 │
|
983
|
+
# # ╞═════╪═════╡
|
984
|
+
# # │ 1 ┆ 6 │
|
985
|
+
# # │ 2 ┆ 7 │
|
986
|
+
# # │ 3 ┆ 8 │
|
987
|
+
# # └─────┴─────┘
|
988
|
+
#
|
989
|
+
# @example
|
990
|
+
# df.select(Polars.col("foo") + 1).collect
|
991
|
+
# # =>
|
992
|
+
# # shape: (3, 1)
|
993
|
+
# # ┌─────┐
|
994
|
+
# # │ foo │
|
995
|
+
# # │ --- │
|
996
|
+
# # │ i64 │
|
997
|
+
# # ╞═════╡
|
998
|
+
# # │ 2 │
|
999
|
+
# # │ 3 │
|
1000
|
+
# # │ 4 │
|
1001
|
+
# # └─────┘
|
1002
|
+
#
|
1003
|
+
# @example
|
1004
|
+
# df.select([Polars.col("foo") + 1, Polars.col("bar") + 1]).collect
|
1005
|
+
# # =>
|
1006
|
+
# # shape: (3, 2)
|
1007
|
+
# # ┌─────┬─────┐
|
1008
|
+
# # │ foo ┆ bar │
|
1009
|
+
# # │ --- ┆ --- │
|
1010
|
+
# # │ i64 ┆ i64 │
|
1011
|
+
# # ╞═════╪═════╡
|
1012
|
+
# # │ 2 ┆ 7 │
|
1013
|
+
# # │ 3 ┆ 8 │
|
1014
|
+
# # │ 4 ┆ 9 │
|
1015
|
+
# # └─────┴─────┘
|
1016
|
+
#
|
1017
|
+
# @example
|
1018
|
+
# df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0)).collect
|
1019
|
+
# # =>
|
1020
|
+
# # shape: (3, 1)
|
1021
|
+
# # ┌─────────┐
|
1022
|
+
# # │ literal │
|
1023
|
+
# # │ --- │
|
1024
|
+
# # │ i32 │
|
1025
|
+
# # ╞═════════╡
|
1026
|
+
# # │ 0 │
|
1027
|
+
# # │ 0 │
|
1028
|
+
# # │ 10 │
|
1029
|
+
# # └─────────┘
|
1030
|
+
# Select columns from this LazyFrame.
#
# @param exprs [Array]
#   Column(s) to select, specified as positional arguments.
#   Accepts expression input. Strings are parsed as column names,
#   other non-expression inputs are parsed as literals.
# @param named_exprs [Hash]
#   Additional columns to select, specified as keyword arguments.
#   The columns will be renamed to the keyword used.
#
# @return [LazyFrame]
def select(*exprs, **named_exprs)
  # Any value of POLARS_AUTO_STRUCTIFY other than "0" enables auto-structify.
  auto_structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"

  parsed = Utils.parse_into_list_of_expressions(
    *exprs, **named_exprs, __structify: auto_structify
  )

  _from_rbldf(_ldf.select(parsed))
end
|
1038
|
+
|
1039
|
+
# Start a group by operation.
|
1040
|
+
#
|
1041
|
+
# @param by [Array]
|
1042
|
+
# Column(s) to group by.
|
1043
|
+
# @param maintain_order [Boolean]
|
1044
|
+
# Make sure that the order of the groups remain consistent. This is more
|
1045
|
+
# expensive than a default group by.
|
1046
|
+
# @param named_by [Hash]
|
1047
|
+
# Additional columns to group by, specified as keyword arguments.
|
1048
|
+
# The columns will be renamed to the keyword used.
|
1049
|
+
# @return [LazyGroupBy]
|
1050
|
+
#
|
1051
|
+
# @example
|
1052
|
+
# df = Polars::DataFrame.new(
|
1053
|
+
# {
|
1054
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
1055
|
+
# "b" => [1, 2, 3, 4, 5, 6],
|
1056
|
+
# "c" => [6, 5, 4, 3, 2, 1]
|
1057
|
+
# }
|
1058
|
+
# ).lazy
|
1059
|
+
# df.group_by("a", maintain_order: true).agg(Polars.col("b").sum).collect
|
1060
|
+
# # =>
|
1061
|
+
# # shape: (3, 2)
|
1062
|
+
# # ┌─────┬─────┐
|
1063
|
+
# # │ a ┆ b │
|
1064
|
+
# # │ --- ┆ --- │
|
1065
|
+
# # │ str ┆ i64 │
|
1066
|
+
# # ╞═════╪═════╡
|
1067
|
+
# # │ a ┆ 4 │
|
1068
|
+
# # │ b ┆ 11 │
|
1069
|
+
# # │ c ┆ 6 │
|
1070
|
+
# # └─────┴─────┘
|
1071
|
+
# Start a group by operation.
#
# @param by [Array]
#   Column(s) to group by. Accepts expression input; strings are parsed
#   as column names.
# @param maintain_order [Boolean]
#   Make sure that the order of the groups remains consistent. This is
#   more expensive than a default group by.
# @param named_by [Hash]
#   Additional columns to group by, specified as keyword arguments.
#   The columns will be renamed to the keyword used.
#
# @return [LazyGroupBy]
def group_by(*by, maintain_order: false, **named_by)
  key_exprs = Utils.parse_into_list_of_expressions(*by, **named_by)
  LazyGroupBy.new(_ldf.group_by(key_exprs, maintain_order))
end
alias_method :groupby, :group_by
alias_method :group, :group_by
|
1078
|
+
|
1079
|
+
# Create rolling groups based on a time column.
|
1080
|
+
#
|
1081
|
+
# Also works for index values of type `:i32` or `:i64`.
|
1082
|
+
#
|
1083
|
+
# Different from a `dynamic_group_by` the windows are now determined by the
|
1084
|
+
# individual values and are not of constant intervals. For constant intervals
|
1085
|
+
# use *group_by_dynamic*.
|
1086
|
+
#
|
1087
|
+
# The `period` and `offset` arguments are created either from a timedelta, or
|
1088
|
+
# by using the following string language:
|
1089
|
+
#
|
1090
|
+
# - 1ns (1 nanosecond)
|
1091
|
+
# - 1us (1 microsecond)
|
1092
|
+
# - 1ms (1 millisecond)
|
1093
|
+
# - 1s (1 second)
|
1094
|
+
# - 1m (1 minute)
|
1095
|
+
# - 1h (1 hour)
|
1096
|
+
# - 1d (1 day)
|
1097
|
+
# - 1w (1 week)
|
1098
|
+
# - 1mo (1 calendar month)
|
1099
|
+
# - 1y (1 calendar year)
|
1100
|
+
# - 1i (1 index count)
|
1101
|
+
#
|
1102
|
+
# Or combine them:
|
1103
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1104
|
+
#
|
1105
|
+
# In case of a group_by_rolling on an integer column, the windows are defined by:
|
1106
|
+
#
|
1107
|
+
# - "1i" # length 1
|
1108
|
+
# - "10i" # length 10
|
1109
|
+
#
|
1110
|
+
# @param index_column [Object]
|
1111
|
+
# Column used to group based on the time window.
|
1112
|
+
# Often to type Date/Datetime
|
1113
|
+
# This column must be sorted in ascending order. If not the output will not
|
1114
|
+
# make sense.
|
1115
|
+
#
|
1116
|
+
# In case of a rolling group by on indices, dtype needs to be one of
|
1117
|
+
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1118
|
+
# performance matters use an `:i64` column.
|
1119
|
+
# @param period [Object]
|
1120
|
+
# Length of the window.
|
1121
|
+
# @param offset [Object]
|
1122
|
+
# Offset of the window. Default is -period.
|
1123
|
+
# @param closed ["right", "left", "both", "none"]
|
1124
|
+
# Define whether the temporal window interval is closed or not.
|
1125
|
+
# @param by [Object]
|
1126
|
+
# Also group by this column/these columns.
|
1127
|
+
#
|
1128
|
+
# @return [LazyFrame]
|
1129
|
+
#
|
1130
|
+
# @example
|
1131
|
+
# dates = [
|
1132
|
+
# "2020-01-01 13:45:48",
|
1133
|
+
# "2020-01-01 16:42:13",
|
1134
|
+
# "2020-01-01 16:45:09",
|
1135
|
+
# "2020-01-02 18:12:48",
|
1136
|
+
# "2020-01-03 19:45:32",
|
1137
|
+
# "2020-01-08 23:16:43"
|
1138
|
+
# ]
|
1139
|
+
# df = Polars::LazyFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1140
|
+
# Polars.col("dt").str.strptime(Polars::Datetime).set_sorted
|
1141
|
+
# )
|
1142
|
+
# df.rolling(index_column: "dt", period: "2d").agg(
|
1143
|
+
# [
|
1144
|
+
# Polars.sum("a").alias("sum_a"),
|
1145
|
+
# Polars.min("a").alias("min_a"),
|
1146
|
+
# Polars.max("a").alias("max_a")
|
1147
|
+
# ]
|
1148
|
+
# ).collect
|
1149
|
+
# # =>
|
1150
|
+
# # shape: (6, 4)
|
1151
|
+
# # ┌─────────────────────┬───────┬───────┬───────┐
|
1152
|
+
# # │ dt ┆ sum_a ┆ min_a ┆ max_a │
|
1153
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1154
|
+
# # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
|
1155
|
+
# # ╞═════════════════════╪═══════╪═══════╪═══════╡
|
1156
|
+
# # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
|
1157
|
+
# # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
|
1158
|
+
# # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
|
1159
|
+
# # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
|
1160
|
+
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
1161
|
+
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
1162
|
+
# # └─────────────────────┴───────┴───────┴───────┘
|
1163
|
+
# Create rolling groups based on a time (or integer index) column.
#
# Unlike `group_by_dynamic`, window boundaries are determined by the
# individual row values rather than by constant intervals.
#
# @param index_column [Object]
#   Column used to determine the windows; must be sorted ascending.
#   Typically Date/Datetime; for index-based windows, `:i32` or `:i64`.
# @param period [Object]
#   Length of the window, as a duration string (e.g. "2d") or timedelta.
# @param offset [Object]
#   Offset of the window. Defaults to `-period`.
# @param closed ["right", "left", "both", "none"]
#   Define which sides of the temporal window interval are closed.
# @param by [Object]
#   Also group by this column/these columns.
#
# @return [LazyGroupBy]
def rolling(
  index_column:,
  period:,
  offset: nil,
  closed: "right",
  by: nil
)
  idx_expr = Utils.parse_into_expression(index_column)

  # The default offset is the negation of the period.
  if offset.nil?
    offset = Utils.negate_duration_string(Utils.parse_as_duration_string(period))
  end

  grouping_exprs = by.nil? ? [] : Utils.parse_into_list_of_expressions(by)

  lgb = _ldf.rolling(
    idx_expr,
    Utils.parse_as_duration_string(period),
    Utils.parse_as_duration_string(offset),
    closed,
    grouping_exprs
  )
  LazyGroupBy.new(lgb)
end
alias_method :group_by_rolling, :rolling
alias_method :groupby_rolling, :rolling
|
1186
|
+
|
1187
|
+
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
1188
|
+
#
|
1189
|
+
# Time windows are calculated and rows are assigned to windows. Different from a
|
1190
|
+
# normal group by is that a row can be member of multiple groups. The time/index
|
1191
|
+
# window could be seen as a rolling window, with a window size determined by
|
1192
|
+
# dates/times/values instead of slots in the DataFrame.
|
1193
|
+
#
|
1194
|
+
# A window is defined by:
|
1195
|
+
#
|
1196
|
+
# - every: interval of the window
|
1197
|
+
# - period: length of the window
|
1198
|
+
# - offset: offset of the window
|
1199
|
+
#
|
1200
|
+
# The `every`, `period` and `offset` arguments are created with
|
1201
|
+
# the following string language:
|
1202
|
+
#
|
1203
|
+
# - 1ns (1 nanosecond)
|
1204
|
+
# - 1us (1 microsecond)
|
1205
|
+
# - 1ms (1 millisecond)
|
1206
|
+
# - 1s (1 second)
|
1207
|
+
# - 1m (1 minute)
|
1208
|
+
# - 1h (1 hour)
|
1209
|
+
# - 1d (1 day)
|
1210
|
+
# - 1w (1 week)
|
1211
|
+
# - 1mo (1 calendar month)
|
1212
|
+
# - 1y (1 calendar year)
|
1213
|
+
# - 1i (1 index count)
|
1214
|
+
#
|
1215
|
+
# Or combine them:
|
1216
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1217
|
+
#
|
1218
|
+
# In case of a group_by_dynamic on an integer column, the windows are defined by:
|
1219
|
+
#
|
1220
|
+
# - "1i" # length 1
|
1221
|
+
# - "10i" # length 10
|
1222
|
+
#
|
1223
|
+
# @param index_column [Object]
|
1224
|
+
# Column used to group based on the time window.
|
1225
|
+
# Often to type Date/Datetime
|
1226
|
+
# This column must be sorted in ascending order. If not the output will not
|
1227
|
+
# make sense.
|
1228
|
+
#
|
1229
|
+
# In case of a dynamic group by on indices, dtype needs to be one of
|
1230
|
+
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1231
|
+
# performance matters use an `:i64` column.
|
1232
|
+
# @param every [Object]
|
1233
|
+
# Interval of the window.
|
1234
|
+
# @param period [Object]
|
1235
|
+
# Length of the window, if None it is equal to 'every'.
|
1236
|
+
# @param offset [Object]
|
1237
|
+
# Offset of the window if None and period is None it will be equal to negative
|
1238
|
+
# `every`.
|
1239
|
+
# @param truncate [Boolean]
|
1240
|
+
# Truncate the time value to the window lower bound.
|
1241
|
+
# @param include_boundaries [Boolean]
|
1242
|
+
# Add the lower and upper bound of the window to the "_lower_bound" and
|
1243
|
+
# "_upper_bound" columns. This will impact performance because it's harder to
|
1244
|
+
# parallelize
|
1245
|
+
# @param closed ["right", "left", "both", "none"]
|
1246
|
+
# Define whether the temporal window interval is closed or not.
|
1247
|
+
# @param by [Object]
|
1248
|
+
# Also group by this column/these columns
|
1249
|
+
#
|
1250
|
+
# @return [DataFrame]
|
1251
|
+
#
|
1252
|
+
# @example
|
1253
|
+
# df = Polars::DataFrame.new(
|
1254
|
+
# {
|
1255
|
+
# "time" => Polars.datetime_range(
|
1256
|
+
# DateTime.new(2021, 12, 16),
|
1257
|
+
# DateTime.new(2021, 12, 16, 3),
|
1258
|
+
# "30m",
|
1259
|
+
# time_unit: "us",
|
1260
|
+
# eager: true
|
1261
|
+
# ),
|
1262
|
+
# "n" => 0..6
|
1263
|
+
# }
|
1264
|
+
# )
|
1265
|
+
# # =>
|
1266
|
+
# # shape: (7, 2)
|
1267
|
+
# # ┌─────────────────────┬─────┐
|
1268
|
+
# # │ time ┆ n │
|
1269
|
+
# # │ --- ┆ --- │
|
1270
|
+
# # │ datetime[μs] ┆ i64 │
|
1271
|
+
# # ╞═════════════════════╪═════╡
|
1272
|
+
# # │ 2021-12-16 00:00:00 ┆ 0 │
|
1273
|
+
# # │ 2021-12-16 00:30:00 ┆ 1 │
|
1274
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 │
|
1275
|
+
# # │ 2021-12-16 01:30:00 ┆ 3 │
|
1276
|
+
# # │ 2021-12-16 02:00:00 ┆ 4 │
|
1277
|
+
# # │ 2021-12-16 02:30:00 ┆ 5 │
|
1278
|
+
# # │ 2021-12-16 03:00:00 ┆ 6 │
|
1279
|
+
# # └─────────────────────┴─────┘
|
1280
|
+
#
|
1281
|
+
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
1282
|
+
# df.group_by_dynamic("time", every: "1h", closed: "right").agg(
|
1283
|
+
# [
|
1284
|
+
# Polars.col("time").min.alias("time_min"),
|
1285
|
+
# Polars.col("time").max.alias("time_max")
|
1286
|
+
# ]
|
1287
|
+
# )
|
1288
|
+
# # =>
|
1289
|
+
# # shape: (4, 3)
|
1290
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┐
|
1291
|
+
# # │ time ┆ time_min ┆ time_max │
|
1292
|
+
# # │ --- ┆ --- ┆ --- │
|
1293
|
+
# # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
|
1294
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╡
|
1295
|
+
# # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
|
1296
|
+
# # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
|
1297
|
+
# # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
|
1298
|
+
# # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
|
1299
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
1300
|
+
#
|
1301
|
+
# @example The window boundaries can also be added to the aggregation result.
|
1302
|
+
# df.group_by_dynamic(
|
1303
|
+
# "time", every: "1h", include_boundaries: true, closed: "right"
|
1304
|
+
# ).agg([Polars.col("time").count.alias("time_count")])
|
1305
|
+
# # =>
|
1306
|
+
# # shape: (4, 4)
|
1307
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
|
1308
|
+
# # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
|
1309
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1310
|
+
# # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
|
1311
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
|
1312
|
+
# # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
|
1313
|
+
# # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
|
1314
|
+
# # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
|
1315
|
+
# # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
|
1316
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1317
|
+
#
|
1318
|
+
# @example When closed="left", should not include right end of interval.
|
1319
|
+
# df.group_by_dynamic("time", every: "1h", closed: "left").agg(
|
1320
|
+
# [
|
1321
|
+
# Polars.col("time").count.alias("time_count"),
|
1322
|
+
# Polars.col("time").alias("time_agg_list")
|
1323
|
+
# ]
|
1324
|
+
# )
|
1325
|
+
# # =>
|
1326
|
+
# # shape: (4, 3)
|
1327
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────┐
|
1328
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1329
|
+
# # │ --- ┆ --- ┆ --- │
|
1330
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1331
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════╡
|
1332
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-… │
|
1333
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-… │
|
1334
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-… │
|
1335
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1336
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────┘
|
1337
|
+
#
|
1338
|
+
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1339
|
+
# df.group_by_dynamic("time", every: "1h", closed: "both").agg(
|
1340
|
+
# [Polars.col("time").count.alias("time_count")]
|
1341
|
+
# )
|
1342
|
+
# # =>
|
1343
|
+
# # shape: (5, 2)
|
1344
|
+
# # ┌─────────────────────┬────────────┐
|
1345
|
+
# # │ time ┆ time_count │
|
1346
|
+
# # │ --- ┆ --- │
|
1347
|
+
# # │ datetime[μs] ┆ u32 │
|
1348
|
+
# # ╞═════════════════════╪════════════╡
|
1349
|
+
# # │ 2021-12-15 23:00:00 ┆ 1 │
|
1350
|
+
# # │ 2021-12-16 00:00:00 ┆ 3 │
|
1351
|
+
# # │ 2021-12-16 01:00:00 ┆ 3 │
|
1352
|
+
# # │ 2021-12-16 02:00:00 ┆ 3 │
|
1353
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
1354
|
+
# # └─────────────────────┴────────────┘
|
1355
|
+
#
|
1356
|
+
# @example Dynamic group bys can also be combined with grouping on normal keys.
|
1357
|
+
# df = Polars::DataFrame.new(
|
1358
|
+
# {
|
1359
|
+
# "time" => Polars.datetime_range(
|
1360
|
+
# DateTime.new(2021, 12, 16),
|
1361
|
+
# DateTime.new(2021, 12, 16, 3),
|
1362
|
+
# "30m",
|
1363
|
+
# time_unit: "us",
|
1364
|
+
# eager: true
|
1365
|
+
# ),
|
1366
|
+
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
1367
|
+
# }
|
1368
|
+
# )
|
1369
|
+
# df.group_by_dynamic(
|
1370
|
+
# "time",
|
1371
|
+
# every: "1h",
|
1372
|
+
# closed: "both",
|
1373
|
+
# by: "groups",
|
1374
|
+
# include_boundaries: true
|
1375
|
+
# ).agg([Polars.col("time").count.alias("time_count")])
|
1376
|
+
# # =>
|
1377
|
+
# # shape: (7, 5)
|
1378
|
+
# # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
|
1379
|
+
# # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
|
1380
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
1381
|
+
# # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
|
1382
|
+
# # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
|
1383
|
+
# # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
|
1384
|
+
# # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
|
1385
|
+
# # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
|
1386
|
+
# # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
|
1387
|
+
# # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
|
1388
|
+
# # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
|
1389
|
+
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
1390
|
+
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1391
|
+
#
|
1392
|
+
# @example Dynamic group by on an index column.
|
1393
|
+
# df = Polars::DataFrame.new(
|
1394
|
+
# {
|
1395
|
+
# "idx" => Polars.arange(0, 6, eager: true),
|
1396
|
+
# "A" => ["A", "A", "B", "B", "B", "C"]
|
1397
|
+
# }
|
1398
|
+
# )
|
1399
|
+
# df.group_by_dynamic(
|
1400
|
+
# "idx",
|
1401
|
+
# every: "2i",
|
1402
|
+
# period: "3i",
|
1403
|
+
# include_boundaries: true,
|
1404
|
+
# closed: "right"
|
1405
|
+
# ).agg(Polars.col("A").alias("A_agg_list"))
|
1406
|
+
# # =>
|
1407
|
+
# # shape: (4, 4)
|
1408
|
+
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
1409
|
+
# # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
|
1410
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1411
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
|
1412
|
+
# # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
|
1413
|
+
# # │ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
|
1414
|
+
# # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
|
1415
|
+
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
1416
|
+
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
1417
|
+
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
1418
|
+
# Group based on a time value (or index value of type `:i32`/`:i64`).
#
# Time windows are calculated and rows are assigned to windows; a row may
# belong to multiple groups. A window is defined by `every` (interval),
# `period` (length, defaults to `every`) and `offset`.
#
# @param index_column [Object]
#   Column used to group based on the time window; must be sorted ascending.
# @param every [Object]
#   Interval of the window.
# @param period [Object]
#   Length of the window; defaults to `every` when nil.
# @param offset [Object]
#   Offset of the window; when nil it is `-every` if `period` is also nil,
#   otherwise zero.
# @param truncate [Boolean]
#   Deprecated; mapped onto `label` ("left" when true, "datapoint" when false).
# @param include_boundaries [Boolean]
#   Add "_lower_boundary"/"_upper_boundary" columns (impacts performance).
# @param closed ["right", "left", "both", "none"]
#   Define which sides of the temporal window interval are closed.
# @param label [String]
#   Which value to use to label the window.
# @param by [Object]
#   Also group by this column/these columns.
# @param start_by [String]
#   Strategy to determine the start of the first window.
#
# @return [LazyGroupBy]
def group_by_dynamic(
  index_column,
  every:,
  period: nil,
  offset: nil,
  truncate: nil,
  include_boundaries: false,
  closed: "left",
  label: "left",
  by: nil,
  start_by: "window"
)
  # Legacy `truncate` flag overrides `label` when explicitly supplied.
  unless truncate.nil?
    label = truncate ? "left" : "datapoint"
  end

  idx_expr = Utils.parse_into_expression(index_column, str_as_lit: false)

  # NOTE: the offset default must be computed before `period` is defaulted,
  # since it depends on whether an explicit period was given.
  offset = (period.nil? ? "-#{every}" : "0ns") if offset.nil?
  period = every if period.nil?

  period = Utils.parse_as_duration_string(period)
  offset = Utils.parse_as_duration_string(offset)
  every = Utils.parse_as_duration_string(every)

  grouping_exprs = by.nil? ? [] : Utils.parse_into_list_of_expressions(by)
  lgb = _ldf.group_by_dynamic(
    idx_expr,
    every,
    period,
    offset,
    label,
    include_boundaries,
    closed,
    grouping_exprs,
    start_by
  )
  LazyGroupBy.new(lgb)
end
alias_method :groupby_dynamic, :group_by_dynamic
|
1462
|
+
|
1463
|
+
# Perform an asof join.
|
1464
|
+
#
|
1465
|
+
# This is similar to a left-join except that we match on nearest key rather than
|
1466
|
+
# equal keys.
|
1467
|
+
#
|
1468
|
+
# Both DataFrames must be sorted by the join_asof key.
|
1469
|
+
#
|
1470
|
+
# For each row in the left DataFrame:
|
1471
|
+
#
|
1472
|
+
# - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
|
1473
|
+
# - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
|
1474
|
+
#
|
1475
|
+
# The default is "backward".
|
1476
|
+
#
|
1477
|
+
# @param other [LazyFrame]
|
1478
|
+
# Lazy DataFrame to join with.
|
1479
|
+
# @param left_on [String]
|
1480
|
+
# Join column of the left DataFrame.
|
1481
|
+
# @param right_on [String]
|
1482
|
+
# Join column of the right DataFrame.
|
1483
|
+
# @param on [String]
|
1484
|
+
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
1485
|
+
# None.
|
1486
|
+
# @param by [Object]
|
1487
|
+
# Join on these columns before doing asof join.
|
1488
|
+
# @param by_left [Object]
|
1489
|
+
# Join on these columns before doing asof join.
|
1490
|
+
# @param by_right [Object]
|
1491
|
+
# Join on these columns before doing asof join.
|
1492
|
+
# @param strategy ["backward", "forward"]
|
1493
|
+
# Join strategy.
|
1494
|
+
# @param suffix [String]
|
1495
|
+
# Suffix to append to columns with a duplicate name.
|
1496
|
+
# @param tolerance [Object]
|
1497
|
+
# Numeric tolerance. By setting this the join will only be done if the near
|
1498
|
+
# keys are within this distance. If an asof join is done on columns of dtype
|
1499
|
+
# "Date", "Datetime", "Duration" or "Time" you use the following string
|
1500
|
+
# language:
|
1501
|
+
#
|
1502
|
+
# - 1ns (1 nanosecond)
|
1503
|
+
# - 1us (1 microsecond)
|
1504
|
+
# - 1ms (1 millisecond)
|
1505
|
+
# - 1s (1 second)
|
1506
|
+
# - 1m (1 minute)
|
1507
|
+
# - 1h (1 hour)
|
1508
|
+
# - 1d (1 day)
|
1509
|
+
# - 1w (1 week)
|
1510
|
+
# - 1mo (1 calendar month)
|
1511
|
+
# - 1y (1 calendar year)
|
1512
|
+
# - 1i (1 index count)
|
1513
|
+
#
|
1514
|
+
# Or combine them:
|
1515
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1516
|
+
#
|
1517
|
+
# @param allow_parallel [Boolean]
|
1518
|
+
# Allow the physical plan to optionally evaluate the computation of both
|
1519
|
+
# DataFrames up to the join in parallel.
|
1520
|
+
# @param force_parallel [Boolean]
|
1521
|
+
# Force the physical plan to evaluate the computation of both DataFrames up to
|
1522
|
+
# the join in parallel.
|
1523
|
+
#
|
1524
|
+
# @return [LazyFrame]
|
1525
|
+
# Perform an asof join.
#
# Similar to a left-join except that we match on nearest key rather than
# equal keys. Both frames must be sorted by the join_asof key.
#
# For each row in the left frame, a "backward" search selects the last row
# in the right frame whose 'on' key is <= the left key; a "forward" search
# selects the first row whose key is >= the left key. Default: "backward".
#
# @param other [LazyFrame] Lazy DataFrame to join with.
# @param left_on [String] Join column of the left frame.
# @param right_on [String] Join column of the right frame.
# @param on [String] Join column of both frames (mutually exclusive with
#   `left_on`/`right_on`).
# @param by [Object] Join on these columns before doing the asof join.
# @param by_left [Object] Left-side "by" columns.
# @param by_right [Object] Right-side "by" columns.
# @param strategy ["backward", "forward"] Join strategy.
# @param suffix [String] Suffix appended to duplicate column names.
# @param tolerance [Object] Numeric or duration-string tolerance; the join
#   only matches keys within this distance.
# @param allow_parallel [Boolean] Allow parallel evaluation of both frames.
# @param force_parallel [Boolean] Force parallel evaluation of both frames.
#
# @return [LazyFrame]
def join_asof(
  other,
  left_on: nil,
  right_on: nil,
  on: nil,
  by_left: nil,
  by_right: nil,
  by: nil,
  strategy: "backward",
  suffix: "_right",
  tolerance: nil,
  allow_parallel: true,
  force_parallel: false
)
  unless other.is_a?(LazyFrame)
    raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
  end

  if on.is_a?(::String)
    left_on = on
    right_on = on
  end

  if left_on.nil? || right_on.nil?
    raise ArgumentError, "You should pass the column to join on as an argument."
  end

  # Normalize a scalar column/expression to a single-element array.
  listify = ->(v) { (v.is_a?(::String) || v.is_a?(Expr)) ? [v] : v }
  by_left_ = listify.call(by_left)
  by_right_ = listify.call(by_right)

  # A combined `by` overrides the per-side arguments.
  case by
  when ::String
    by_left_ = [by]
    by_right_ = [by]
  when ::Array
    by_left_ = by
    by_right_ = by
  end

  # The native binding takes numeric and string tolerances separately.
  if tolerance.is_a?(::String)
    tolerance_str = tolerance
    tolerance_num = nil
  else
    tolerance_str = nil
    tolerance_num = tolerance
  end

  _from_rbldf(
    _ldf.join_asof(
      other._ldf,
      Polars.col(left_on)._rbexpr,
      Polars.col(right_on)._rbexpr,
      by_left_,
      by_right_,
      allow_parallel,
      force_parallel,
      suffix,
      strategy,
      tolerance_num,
      tolerance_str
    )
  )
end
|
1596
|
+
|
1597
|
+
# Add a join operation to the Logical Plan.
|
1598
|
+
#
|
1599
|
+
# @param other [LazyFrame]
|
1600
|
+
# Lazy DataFrame to join with.
|
1601
|
+
# @param left_on [Object]
|
1602
|
+
# Join column of the left DataFrame.
|
1603
|
+
# @param right_on [Object]
|
1604
|
+
# Join column of the right DataFrame.
|
1605
|
+
# @param on Object
|
1606
|
+
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
1607
|
+
# None.
|
1608
|
+
# @param how ["inner", "left", "full", "semi", "anti", "cross"]
|
1609
|
+
# Join strategy.
|
1610
|
+
# @param suffix [String]
|
1611
|
+
# Suffix to append to columns with a duplicate name.
|
1612
|
+
# @param join_nulls [Boolean]
|
1613
|
+
# Join on null values. By default null values will never produce matches.
|
1614
|
+
# @param allow_parallel [Boolean]
|
1615
|
+
# Allow the physical plan to optionally evaluate the computation of both
|
1616
|
+
# DataFrames up to the join in parallel.
|
1617
|
+
# @param force_parallel [Boolean]
|
1618
|
+
# Force the physical plan to evaluate the computation of both DataFrames up to
|
1619
|
+
# the join in parallel.
|
1620
|
+
#
|
1621
|
+
# @return [LazyFrame]
|
1622
|
+
#
|
1623
|
+
# @example
|
1624
|
+
# df = Polars::DataFrame.new(
|
1625
|
+
# {
|
1626
|
+
# "foo" => [1, 2, 3],
|
1627
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1628
|
+
# "ham" => ["a", "b", "c"]
|
1629
|
+
# }
|
1630
|
+
# ).lazy
|
1631
|
+
# other_df = Polars::DataFrame.new(
|
1632
|
+
# {
|
1633
|
+
# "apple" => ["x", "y", "z"],
|
1634
|
+
# "ham" => ["a", "b", "d"]
|
1635
|
+
# }
|
1636
|
+
# ).lazy
|
1637
|
+
# df.join(other_df, on: "ham").collect
|
1638
|
+
# # =>
|
1639
|
+
# # shape: (2, 4)
|
1640
|
+
# # ┌─────┬─────┬─────┬───────┐
|
1641
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1642
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1643
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
1644
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
1645
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
1646
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
1647
|
+
# # └─────┴─────┴─────┴───────┘
|
1648
|
+
#
|
1649
|
+
# @example
|
1650
|
+
# df.join(other_df, on: "ham", how: "full").collect
|
1651
|
+
# # =>
|
1652
|
+
# # shape: (4, 5)
|
1653
|
+
# # ┌──────┬──────┬──────┬───────┬───────────┐
|
1654
|
+
# # │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
|
1655
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
1656
|
+
# # │ i64 ┆ f64 ┆ str ┆ str ┆ str │
|
1657
|
+
# # ╞══════╪══════╪══════╪═══════╪═══════════╡
|
1658
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x ┆ a │
|
1659
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y ┆ b │
|
1660
|
+
# # │ null ┆ null ┆ null ┆ z ┆ d │
|
1661
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null ┆ null │
|
1662
|
+
# # └──────┴──────┴──────┴───────┴───────────┘
|
1663
|
+
#
|
1664
|
+
# @example
|
1665
|
+
# df.join(other_df, on: "ham", how: "left").collect
|
1666
|
+
# # =>
|
1667
|
+
# # shape: (3, 4)
|
1668
|
+
# # ┌─────┬─────┬─────┬───────┐
|
1669
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1670
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1671
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
1672
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
1673
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
1674
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
1675
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null │
|
1676
|
+
# # └─────┴─────┴─────┴───────┘
|
1677
|
+
#
|
1678
|
+
# @example
|
1679
|
+
# df.join(other_df, on: "ham", how: "semi").collect
|
1680
|
+
# # =>
|
1681
|
+
# # shape: (2, 3)
|
1682
|
+
# # ┌─────┬─────┬─────┐
|
1683
|
+
# # │ foo ┆ bar ┆ ham │
|
1684
|
+
# # │ --- ┆ --- ┆ --- │
|
1685
|
+
# # │ i64 ┆ f64 ┆ str │
|
1686
|
+
# # ╞═════╪═════╪═════╡
|
1687
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
1688
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1689
|
+
# # └─────┴─────┴─────┘
|
1690
|
+
#
|
1691
|
+
# @example
|
1692
|
+
# df.join(other_df, on: "ham", how: "anti").collect
|
1693
|
+
# # =>
|
1694
|
+
# # shape: (1, 3)
|
1695
|
+
# # ┌─────┬─────┬─────┐
|
1696
|
+
# # │ foo ┆ bar ┆ ham │
|
1697
|
+
# # │ --- ┆ --- ┆ --- │
|
1698
|
+
# # │ i64 ┆ f64 ┆ str │
|
1699
|
+
# # ╞═════╪═════╪═════╡
|
1700
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1701
|
+
# # └─────┴─────┴─────┘
|
1702
|
+
def join(
  other,
  left_on: nil,
  right_on: nil,
  on: nil,
  how: "inner",
  suffix: "_right",
  join_nulls: false,
  allow_parallel: true,
  force_parallel: false
)
  # Only lazy frames can be joined lazily; eager frames must call `.lazy` first.
  if !other.is_a?(LazyFrame)
    raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
  end

  if how == "outer"
    # "outer" is a legacy alias for "full".
    how = "full"
  elsif how == "cross"
    # Cross joins have no key columns, so empty key lists are passed.
    # FIX: argument order must match the native binding
    # (allow_parallel, force_parallel, join_nulls) — the previous code swapped
    # `join_nulls` and `force_parallel` here (both booleans, so it failed
    # silently), inconsistent with the non-cross call below.
    return _from_rbldf(
      _ldf.join(
        other._ldf, [], [], allow_parallel, force_parallel, join_nulls, how, suffix
      )
    )
  end

  # Resolve the join keys: either one shared `on`, or explicit left/right keys.
  if !on.nil?
    rbexprs = Utils.parse_into_list_of_expressions(on)
    rbexprs_left = rbexprs
    rbexprs_right = rbexprs
  elsif !left_on.nil? && !right_on.nil?
    rbexprs_left = Utils.parse_into_list_of_expressions(left_on)
    rbexprs_right = Utils.parse_into_list_of_expressions(right_on)
  else
    raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
  end

  _from_rbldf(
    self._ldf.join(
      other._ldf,
      rbexprs_left,
      rbexprs_right,
      allow_parallel,
      force_parallel,
      join_nulls,
      how,
      suffix,
    )
  )
end
|
1751
|
+
|
1752
|
+
# Add or overwrite multiple columns in a DataFrame.
|
1753
|
+
#
|
1754
|
+
# @param exprs [Object]
|
1755
|
+
# List of Expressions that evaluate to columns.
|
1756
|
+
#
|
1757
|
+
# @return [LazyFrame]
|
1758
|
+
#
|
1759
|
+
# @example
|
1760
|
+
# ldf = Polars::DataFrame.new(
|
1761
|
+
# {
|
1762
|
+
# "a" => [1, 2, 3, 4],
|
1763
|
+
# "b" => [0.5, 4, 10, 13],
|
1764
|
+
# "c" => [true, true, false, true]
|
1765
|
+
# }
|
1766
|
+
# ).lazy
|
1767
|
+
# ldf.with_columns(
|
1768
|
+
# [
|
1769
|
+
# (Polars.col("a") ** 2).alias("a^2"),
|
1770
|
+
# (Polars.col("b") / 2).alias("b/2"),
|
1771
|
+
# (Polars.col("c").is_not).alias("not c")
|
1772
|
+
# ]
|
1773
|
+
# ).collect
|
1774
|
+
# # =>
|
1775
|
+
# # shape: (4, 6)
|
1776
|
+
# # ┌─────┬──────┬───────┬─────┬──────┬───────┐
|
1777
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
1778
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
1779
|
+
# # │ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
|
1780
|
+
# # ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
|
1781
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
|
1782
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
|
1783
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
|
1784
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
|
1785
|
+
# # └─────┴──────┴───────┴─────┴──────┴───────┘
|
1786
|
+
def with_columns(*exprs, **named_exprs)
  # Auto-structify is opt-in via the POLARS_AUTO_STRUCTIFY env var
  # (any value other than "0" enables it).
  auto_structify = ENV.fetch("POLARS_AUTO_STRUCTIFY", "0") != "0"

  parsed = Utils.parse_into_list_of_expressions(
    *exprs, **named_exprs, __structify: auto_structify
  )

  _from_rbldf(_ldf.with_columns(parsed))
end
|
1793
|
+
|
1794
|
+
# Add an external context to the computation graph.
|
1795
|
+
#
|
1796
|
+
# This allows expressions to also access columns from DataFrames
|
1797
|
+
# that are not part of this one.
|
1798
|
+
#
|
1799
|
+
# @param other [Object]
|
1800
|
+
# Lazy DataFrame to join with.
|
1801
|
+
#
|
1802
|
+
# @return [LazyFrame]
|
1803
|
+
#
|
1804
|
+
# @example
|
1805
|
+
# df_a = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["a", "c", nil]}).lazy
|
1806
|
+
# df_other = Polars::DataFrame.new({"c" => ["foo", "ham"]})
|
1807
|
+
# (
|
1808
|
+
# df_a.with_context(df_other.lazy).select(
|
1809
|
+
# [Polars.col("b") + Polars.col("c").first]
|
1810
|
+
# )
|
1811
|
+
# ).collect
|
1812
|
+
# # =>
|
1813
|
+
# # shape: (3, 1)
|
1814
|
+
# # ┌──────┐
|
1815
|
+
# # │ b │
|
1816
|
+
# # │ --- │
|
1817
|
+
# # │ str │
|
1818
|
+
# # ╞══════╡
|
1819
|
+
# # │ afoo │
|
1820
|
+
# # │ cfoo │
|
1821
|
+
# # │ null │
|
1822
|
+
# # └──────┘
|
1823
|
+
def with_context(other)
  # Accept a single frame or a list of frames uniformly.
  contexts = other.is_a?(::Array) ? other : [other]
  _from_rbldf(_ldf.with_context(contexts.map(&:_ldf)))
end
|
1830
|
+
|
1831
|
+
# Add or overwrite column in a DataFrame.
|
1832
|
+
#
|
1833
|
+
# @param column [Object]
|
1834
|
+
# Expression that evaluates to column or a Series to use.
|
1835
|
+
#
|
1836
|
+
# @return [LazyFrame]
|
1837
|
+
#
|
1838
|
+
# @example
|
1839
|
+
# df = Polars::DataFrame.new(
|
1840
|
+
# {
|
1841
|
+
# "a" => [1, 3, 5],
|
1842
|
+
# "b" => [2, 4, 6]
|
1843
|
+
# }
|
1844
|
+
# ).lazy
|
1845
|
+
# df.with_column((Polars.col("b") ** 2).alias("b_squared")).collect
|
1846
|
+
# # =>
|
1847
|
+
# # shape: (3, 3)
|
1848
|
+
# # ┌─────┬─────┬───────────┐
|
1849
|
+
# # │ a ┆ b ┆ b_squared │
|
1850
|
+
# # │ --- ┆ --- ┆ --- │
|
1851
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
1852
|
+
# # ╞═════╪═════╪═══════════╡
|
1853
|
+
# # │ 1 ┆ 2 ┆ 4 │
|
1854
|
+
# # │ 3 ┆ 4 ┆ 16 │
|
1855
|
+
# # │ 5 ┆ 6 ┆ 36 │
|
1856
|
+
# # └─────┴─────┴───────────┘
|
1857
|
+
#
|
1858
|
+
# @example
|
1859
|
+
# df.with_column(Polars.col("a") ** 2).collect
|
1860
|
+
# # =>
|
1861
|
+
# # shape: (3, 2)
|
1862
|
+
# # ┌─────┬─────┐
|
1863
|
+
# # │ a ┆ b │
|
1864
|
+
# # │ --- ┆ --- │
|
1865
|
+
# # │ i64 ┆ i64 │
|
1866
|
+
# # ╞═════╪═════╡
|
1867
|
+
# # │ 1 ┆ 2 │
|
1868
|
+
# # │ 9 ┆ 4 │
|
1869
|
+
# # │ 25 ┆ 6 │
|
1870
|
+
# # └─────┴─────┘
|
1871
|
+
def with_column(column)
  # Convenience wrapper: delegates to with_columns with a one-element list.
  with_columns([column])
end
|
1874
|
+
|
1875
|
+
# Remove one or multiple columns from a DataFrame.
|
1876
|
+
#
|
1877
|
+
# @param columns [Object]
|
1878
|
+
# - Name of the column that should be removed.
|
1879
|
+
# - List of column names.
|
1880
|
+
#
|
1881
|
+
# @return [LazyFrame]
|
1882
|
+
def drop(*columns)
  # Selectors (and plain names) are expanded to concrete column names first.
  _from_rbldf(_ldf.drop(Utils._expand_selectors(self, *columns)))
end
|
1886
|
+
|
1887
|
+
# Rename column names.
|
1888
|
+
#
|
1889
|
+
# @param mapping [Hash]
|
1890
|
+
# Key value pairs that map from old name to new name.
|
1891
|
+
#
|
1892
|
+
# @return [LazyFrame]
|
1893
|
+
def rename(mapping)
  # The native binding takes parallel lists of old and new names.
  old_names = mapping.keys
  new_names = mapping.values
  _from_rbldf(_ldf.rename(old_names, new_names))
end
|
1898
|
+
|
1899
|
+
# Reverse the DataFrame.
|
1900
|
+
#
|
1901
|
+
# @return [LazyFrame]
|
1902
|
+
def reverse
  # Recorded lazily; row order is reversed when the plan is collected.
  _from_rbldf(_ldf.reverse)
end
|
1905
|
+
|
1906
|
+
# Shift the values by a given period.
|
1907
|
+
#
|
1908
|
+
# @param n [Integer]
|
1909
|
+
# Number of places to shift (may be negative).
|
1910
|
+
# @param fill_value [Object]
|
1911
|
+
# Fill the resulting null values with this value.
|
1912
|
+
#
|
1913
|
+
# @return [LazyFrame]
|
1914
|
+
#
|
1915
|
+
# @example
|
1916
|
+
# df = Polars::DataFrame.new(
|
1917
|
+
# {
|
1918
|
+
# "a" => [1, 3, 5],
|
1919
|
+
# "b" => [2, 4, 6]
|
1920
|
+
# }
|
1921
|
+
# ).lazy
|
1922
|
+
# df.shift(1).collect
|
1923
|
+
# # =>
|
1924
|
+
# # shape: (3, 2)
|
1925
|
+
# # ┌──────┬──────┐
|
1926
|
+
# # │ a ┆ b │
|
1927
|
+
# # │ --- ┆ --- │
|
1928
|
+
# # │ i64 ┆ i64 │
|
1929
|
+
# # ╞══════╪══════╡
|
1930
|
+
# # │ null ┆ null │
|
1931
|
+
# # │ 1 ┆ 2 │
|
1932
|
+
# # │ 3 ┆ 4 │
|
1933
|
+
# # └──────┴──────┘
|
1934
|
+
#
|
1935
|
+
# @example
|
1936
|
+
# df.shift(-1).collect
|
1937
|
+
# # =>
|
1938
|
+
# # shape: (3, 2)
|
1939
|
+
# # ┌──────┬──────┐
|
1940
|
+
# # │ a ┆ b │
|
1941
|
+
# # │ --- ┆ --- │
|
1942
|
+
# # │ i64 ┆ i64 │
|
1943
|
+
# # ╞══════╪══════╡
|
1944
|
+
# # │ 3 ┆ 4 │
|
1945
|
+
# # │ 5 ┆ 6 │
|
1946
|
+
# # │ null ┆ null │
|
1947
|
+
# # └──────┴──────┘
|
1948
|
+
def shift(n, fill_value: nil)
  # Strings used as fill values are treated as literals, not column names.
  unless fill_value.nil?
    fill_value = Utils.parse_into_expression(fill_value, str_as_lit: true)
  end
  _from_rbldf(_ldf.shift(Utils.parse_into_expression(n), fill_value))
end
|
1955
|
+
|
1956
|
+
# Shift the values by a given period and fill the resulting null values.
|
1957
|
+
#
|
1958
|
+
# @param periods [Integer]
|
1959
|
+
# Number of places to shift (may be negative).
|
1960
|
+
# @param fill_value [Object]
|
1961
|
+
# Fill `nil` values with the result of this expression.
|
1962
|
+
#
|
1963
|
+
# @return [LazyFrame]
|
1964
|
+
#
|
1965
|
+
# @example
|
1966
|
+
# df = Polars::DataFrame.new(
|
1967
|
+
# {
|
1968
|
+
# "a" => [1, 3, 5],
|
1969
|
+
# "b" => [2, 4, 6]
|
1970
|
+
# }
|
1971
|
+
# ).lazy
|
1972
|
+
# df.shift_and_fill(1, 0).collect
|
1973
|
+
# # =>
|
1974
|
+
# # shape: (3, 2)
|
1975
|
+
# # ┌─────┬─────┐
|
1976
|
+
# # │ a ┆ b │
|
1977
|
+
# # │ --- ┆ --- │
|
1978
|
+
# # │ i64 ┆ i64 │
|
1979
|
+
# # ╞═════╪═════╡
|
1980
|
+
# # │ 0 ┆ 0 │
|
1981
|
+
# # │ 1 ┆ 2 │
|
1982
|
+
# # │ 3 ┆ 4 │
|
1983
|
+
# # └─────┴─────┘
|
1984
|
+
#
|
1985
|
+
# @example
|
1986
|
+
# df.shift_and_fill(-1, 0).collect
|
1987
|
+
# # =>
|
1988
|
+
# # shape: (3, 2)
|
1989
|
+
# # ┌─────┬─────┐
|
1990
|
+
# # │ a ┆ b │
|
1991
|
+
# # │ --- ┆ --- │
|
1992
|
+
# # │ i64 ┆ i64 │
|
1993
|
+
# # ╞═════╪═════╡
|
1994
|
+
# # │ 3 ┆ 4 │
|
1995
|
+
# # │ 5 ┆ 6 │
|
1996
|
+
# # │ 0 ┆ 0 │
|
1997
|
+
# # └─────┴─────┘
|
1998
|
+
def shift_and_fill(periods, fill_value)
  # Thin wrapper around #shift with a mandatory fill value.
  shift(periods, fill_value: fill_value)
end
|
2001
|
+
|
2002
|
+
# Get a slice of this DataFrame.
|
2003
|
+
#
|
2004
|
+
# @param offset [Integer]
|
2005
|
+
# Start index. Negative indexing is supported.
|
2006
|
+
# @param length [Integer]
|
2007
|
+
# Length of the slice. If set to `nil`, all rows starting at the offset
|
2008
|
+
# will be selected.
|
2009
|
+
#
|
2010
|
+
# @return [LazyFrame]
|
2011
|
+
#
|
2012
|
+
# @example
|
2013
|
+
# df = Polars::DataFrame.new(
|
2014
|
+
# {
|
2015
|
+
# "a" => ["x", "y", "z"],
|
2016
|
+
# "b" => [1, 3, 5],
|
2017
|
+
# "c" => [2, 4, 6]
|
2018
|
+
# }
|
2019
|
+
# ).lazy
|
2020
|
+
# df.slice(1, 2).collect
|
2021
|
+
# # =>
|
2022
|
+
# # shape: (2, 3)
|
2023
|
+
# # ┌─────┬─────┬─────┐
|
2024
|
+
# # │ a ┆ b ┆ c │
|
2025
|
+
# # │ --- ┆ --- ┆ --- │
|
2026
|
+
# # │ str ┆ i64 ┆ i64 │
|
2027
|
+
# # ╞═════╪═════╪═════╡
|
2028
|
+
# # │ y ┆ 3 ┆ 4 │
|
2029
|
+
# # │ z ┆ 5 ┆ 6 │
|
2030
|
+
# # └─────┴─────┴─────┘
|
2031
|
+
def slice(offset, length = nil)
  # Lazy slices cannot have a negative length (unlike eager DataFrame#slice).
  if length && length < 0
    raise ArgumentError, "Negative slice lengths (#{length}) are invalid for LazyFrame"
  end

  _from_rbldf(_ldf.slice(offset, length))
end
|
2037
|
+
|
2038
|
+
# Get the first `n` rows.
|
2039
|
+
#
|
2040
|
+
# Alias for {#head}.
|
2041
|
+
#
|
2042
|
+
# @param n [Integer]
|
2043
|
+
# Number of rows to return.
|
2044
|
+
#
|
2045
|
+
# @return [LazyFrame]
|
2046
|
+
#
|
2047
|
+
# @note
|
2048
|
+
# Consider using the {#fetch} operation if you only want to test your
|
2049
|
+
# query. The {#fetch} operation will load the first `n` rows at the scan
|
2050
|
+
# level, whereas the {#head}/{#limit} are applied at the end.
|
2051
|
+
def limit(n = 5)
  # FIX: previously hard-coded `head(5)`, silently ignoring the `n` argument.
  head(n)
end
|
2054
|
+
|
2055
|
+
# Get the first `n` rows.
|
2056
|
+
#
|
2057
|
+
# @param n [Integer]
|
2058
|
+
# Number of rows to return.
|
2059
|
+
#
|
2060
|
+
# @return [LazyFrame]
|
2061
|
+
#
|
2062
|
+
# @note
|
2063
|
+
# Consider using the {#fetch} operation if you only want to test your
|
2064
|
+
# query. The {#fetch} operation will load the first `n` rows at the scan
|
2065
|
+
# level, whereas the {#head}/{#limit} are applied at the end.
|
2066
|
+
def head(n = 5)
  # Equivalent to a zero-offset slice of length n.
  slice(0, n)
end
|
2069
|
+
|
2070
|
+
# Get the last `n` rows.
|
2071
|
+
#
|
2072
|
+
# @param n [Integer]
|
2073
|
+
# Number of rows.
|
2074
|
+
#
|
2075
|
+
# @return [LazyFrame]
|
2076
|
+
def tail(n = 5)
  # Recorded lazily; the last n rows are taken when the plan is collected.
  _from_rbldf(_ldf.tail(n))
end
|
2079
|
+
|
2080
|
+
# Get the last row of the DataFrame.
|
2081
|
+
#
|
2082
|
+
# @return [LazyFrame]
|
2083
|
+
def last
  # A one-row tail.
  tail(1)
end
|
2086
|
+
|
2087
|
+
# Get the first row of the DataFrame.
|
2088
|
+
#
|
2089
|
+
# @return [LazyFrame]
|
2090
|
+
def first
  # A one-row slice from the start.
  slice(0, 1)
end
|
2093
|
+
|
2094
|
+
# Add a column at index 0 that counts the rows.
|
2095
|
+
#
|
2096
|
+
# @param name [String]
|
2097
|
+
# Name of the column to add.
|
2098
|
+
# @param offset [Integer]
|
2099
|
+
# Start the row count at this offset.
|
2100
|
+
#
|
2101
|
+
# @return [LazyFrame]
|
2102
|
+
#
|
2103
|
+
# @note
|
2104
|
+
# This can have a negative effect on query performance.
|
2105
|
+
# This may, for instance, block predicate pushdown optimization.
|
2106
|
+
#
|
2107
|
+
# @example
|
2108
|
+
# df = Polars::DataFrame.new(
|
2109
|
+
# {
|
2110
|
+
# "a" => [1, 3, 5],
|
2111
|
+
# "b" => [2, 4, 6]
|
2112
|
+
# }
|
2113
|
+
# ).lazy
|
2114
|
+
# df.with_row_index.collect
|
2115
|
+
# # =>
|
2116
|
+
# # shape: (3, 3)
|
2117
|
+
# # ┌───────┬─────┬─────┐
|
2118
|
+
# # │ index ┆ a ┆ b │
|
2119
|
+
# # │ --- ┆ --- ┆ --- │
|
2120
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
2121
|
+
# # ╞═══════╪═════╪═════╡
|
2122
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
2123
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
2124
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
2125
|
+
# # └───────┴─────┴─────┘
|
2126
|
+
def with_row_index(name: "index", offset: 0)
  # Inserts a row-count column at position 0; counting starts at `offset`.
  _from_rbldf(_ldf.with_row_index(name, offset))
end
|
2129
|
+
alias_method :with_row_count, :with_row_index
|
2130
|
+
|
2131
|
+
# Take every nth row in the LazyFrame and return as a new LazyFrame.
|
2132
|
+
#
|
2133
|
+
# @return [LazyFrame]
|
2134
|
+
#
|
2135
|
+
# @example
|
2136
|
+
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]}).lazy
|
2137
|
+
# s.take_every(2).collect
|
2138
|
+
# # =>
|
2139
|
+
# # shape: (2, 2)
|
2140
|
+
# # ┌─────┬─────┐
|
2141
|
+
# # │ a ┆ b │
|
2142
|
+
# # │ --- ┆ --- │
|
2143
|
+
# # │ i64 ┆ i64 │
|
2144
|
+
# # ╞═════╪═════╡
|
2145
|
+
# # │ 1 ┆ 5 │
|
2146
|
+
# # │ 3 ┆ 7 │
|
2147
|
+
# # └─────┴─────┘
|
2148
|
+
def take_every(n)
  # Implemented as a projection of every column with a take-every expression.
  select(F.col("*").take_every(n))
end
|
2151
|
+
|
2152
|
+
# Fill null values using the specified value or strategy.
|
2153
|
+
#
|
2154
|
+
# @return [LazyFrame]
|
2155
|
+
def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: nil)
  # NOTE(review): `matches_supertype` is accepted (presumably for API parity
  # with the eager DataFrame#fill_null) but is not forwarded anywhere here —
  # confirm whether it should influence the expression below.
  select(Polars.all.fill_null(value, strategy: strategy, limit: limit))
end
|
2158
|
+
|
2159
|
+
# Fill floating point NaN values.
|
2160
|
+
#
|
2161
|
+
# @param fill_value [Object]
|
2162
|
+
# Value to fill the NaN values with.
|
2163
|
+
#
|
2164
|
+
# @return [LazyFrame]
|
2165
|
+
#
|
2166
|
+
# @note
|
2167
|
+
# Note that floating point NaN (Not a Number) are not missing values!
|
2168
|
+
# To replace missing values, use `fill_null` instead.
|
2169
|
+
#
|
2170
|
+
# @example
|
2171
|
+
# df = Polars::DataFrame.new(
|
2172
|
+
# {
|
2173
|
+
# "a" => [1.5, 2, Float::NAN, 4],
|
2174
|
+
# "b" => [0.5, 4, Float::NAN, 13],
|
2175
|
+
# }
|
2176
|
+
# ).lazy
|
2177
|
+
# df.fill_nan(99).collect
|
2178
|
+
# # =>
|
2179
|
+
# # shape: (4, 2)
|
2180
|
+
# # ┌──────┬──────┐
|
2181
|
+
# # │ a ┆ b │
|
2182
|
+
# # │ --- ┆ --- │
|
2183
|
+
# # │ f64 ┆ f64 │
|
2184
|
+
# # ╞══════╪══════╡
|
2185
|
+
# # │ 1.5 ┆ 0.5 │
|
2186
|
+
# # │ 2.0 ┆ 4.0 │
|
2187
|
+
# # │ 99.0 ┆ 99.0 │
|
2188
|
+
# # │ 4.0 ┆ 13.0 │
|
2189
|
+
# # └──────┴──────┘
|
2190
|
+
def fill_nan(fill_value)
  # Plain values are promoted to literal expressions before being passed down.
  expr = fill_value.is_a?(Expr) ? fill_value : F.lit(fill_value)
  _from_rbldf(_ldf.fill_nan(expr._rbexpr))
end
|
2196
|
+
|
2197
|
+
# Aggregate the columns in the DataFrame to their standard deviation value.
|
2198
|
+
#
|
2199
|
+
# @return [LazyFrame]
|
2200
|
+
#
|
2201
|
+
# @example
|
2202
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
2203
|
+
# df.std.collect
|
2204
|
+
# # =>
|
2205
|
+
# # shape: (1, 2)
|
2206
|
+
# # ┌──────────┬─────┐
|
2207
|
+
# # │ a ┆ b │
|
2208
|
+
# # │ --- ┆ --- │
|
2209
|
+
# # │ f64 ┆ f64 │
|
2210
|
+
# # ╞══════════╪═════╡
|
2211
|
+
# # │ 1.290994 ┆ 0.5 │
|
2212
|
+
# # └──────────┴─────┘
|
2213
|
+
#
|
2214
|
+
# @example
|
2215
|
+
# df.std(ddof: 0).collect
|
2216
|
+
# # =>
|
2217
|
+
# # shape: (1, 2)
|
2218
|
+
# # ┌──────────┬──────────┐
|
2219
|
+
# # │ a ┆ b │
|
2220
|
+
# # │ --- ┆ --- │
|
2221
|
+
# # │ f64 ┆ f64 │
|
2222
|
+
# # ╞══════════╪══════════╡
|
2223
|
+
# # │ 1.118034 ┆ 0.433013 │
|
2224
|
+
# # └──────────┴──────────┘
|
2225
|
+
def std(ddof: 1)
  # ddof is the "delta degrees of freedom" divisor adjustment (N - ddof).
  _from_rbldf(_ldf.std(ddof))
end
|
2228
|
+
|
2229
|
+
# Aggregate the columns in the DataFrame to their variance value.
|
2230
|
+
#
|
2231
|
+
# @return [LazyFrame]
|
2232
|
+
#
|
2233
|
+
# @example
|
2234
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
2235
|
+
# df.var.collect
|
2236
|
+
# # =>
|
2237
|
+
# # shape: (1, 2)
|
2238
|
+
# # ┌──────────┬──────┐
|
2239
|
+
# # │ a ┆ b │
|
2240
|
+
# # │ --- ┆ --- │
|
2241
|
+
# # │ f64 ┆ f64 │
|
2242
|
+
# # ╞══════════╪══════╡
|
2243
|
+
# # │ 1.666667 ┆ 0.25 │
|
2244
|
+
# # └──────────┴──────┘
|
2245
|
+
#
|
2246
|
+
# @example
|
2247
|
+
# df.var(ddof: 0).collect
|
2248
|
+
# # =>
|
2249
|
+
# # shape: (1, 2)
|
2250
|
+
# # ┌──────┬────────┐
|
2251
|
+
# # │ a ┆ b │
|
2252
|
+
# # │ --- ┆ --- │
|
2253
|
+
# # │ f64 ┆ f64 │
|
2254
|
+
# # ╞══════╪════════╡
|
2255
|
+
# # │ 1.25 ┆ 0.1875 │
|
2256
|
+
# # └──────┴────────┘
|
2257
|
+
def var(ddof: 1)
  # ddof is the "delta degrees of freedom" divisor adjustment (N - ddof).
  _from_rbldf(_ldf.var(ddof))
end
|
2260
|
+
|
2261
|
+
# Aggregate the columns in the DataFrame to their maximum value.
|
2262
|
+
#
|
2263
|
+
# @return [LazyFrame]
|
2264
|
+
#
|
2265
|
+
# @example
|
2266
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
2267
|
+
# df.max.collect
|
2268
|
+
# # =>
|
2269
|
+
# # shape: (1, 2)
|
2270
|
+
# # ┌─────┬─────┐
|
2271
|
+
# # │ a ┆ b │
|
2272
|
+
# # │ --- ┆ --- │
|
2273
|
+
# # │ i64 ┆ i64 │
|
2274
|
+
# # ╞═════╪═════╡
|
2275
|
+
# # │ 4 ┆ 2 │
|
2276
|
+
# # └─────┴─────┘
|
2277
|
+
def max
  # Recorded lazily; evaluated on collect.
  _from_rbldf(_ldf.max)
end
|
2280
|
+
|
2281
|
+
# Aggregate the columns in the DataFrame to their minimum value.
|
2282
|
+
#
|
2283
|
+
# @return [LazyFrame]
|
2284
|
+
#
|
2285
|
+
# @example
|
2286
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
2287
|
+
# df.min.collect
|
2288
|
+
# # =>
|
2289
|
+
# # shape: (1, 2)
|
2290
|
+
# # ┌─────┬─────┐
|
2291
|
+
# # │ a ┆ b │
|
2292
|
+
# # │ --- ┆ --- │
|
2293
|
+
# # │ i64 ┆ i64 │
|
2294
|
+
# # ╞═════╪═════╡
|
2295
|
+
# # │ 1 ┆ 1 │
|
2296
|
+
# # └─────┴─────┘
|
2297
|
+
def min
  # Recorded lazily; evaluated on collect.
  _from_rbldf(_ldf.min)
end
|
2300
|
+
|
2301
|
+
# Aggregate the columns in the DataFrame to their sum value.
|
2302
|
+
#
|
2303
|
+
# @return [LazyFrame]
|
2304
|
+
#
|
2305
|
+
# @example
|
2306
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
2307
|
+
# df.sum.collect
|
2308
|
+
# # =>
|
2309
|
+
# # shape: (1, 2)
|
2310
|
+
# # ┌─────┬─────┐
|
2311
|
+
# # │ a ┆ b │
|
2312
|
+
# # │ --- ┆ --- │
|
2313
|
+
# # │ i64 ┆ i64 │
|
2314
|
+
# # ╞═════╪═════╡
|
2315
|
+
# # │ 10 ┆ 5 │
|
2316
|
+
# # └─────┴─────┘
|
2317
|
+
def sum
  # Recorded lazily; evaluated on collect.
  _from_rbldf(_ldf.sum)
end
|
2320
|
+
|
2321
|
+
# Aggregate the columns in the DataFrame to their mean value.
|
2322
|
+
#
|
2323
|
+
# @return [LazyFrame]
|
2324
|
+
#
|
2325
|
+
# @example
|
2326
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
2327
|
+
# df.mean.collect
|
2328
|
+
# # =>
|
2329
|
+
# # shape: (1, 2)
|
2330
|
+
# # ┌─────┬──────┐
|
2331
|
+
# # │ a ┆ b │
|
2332
|
+
# # │ --- ┆ --- │
|
2333
|
+
# # │ f64 ┆ f64 │
|
2334
|
+
# # ╞═════╪══════╡
|
2335
|
+
# # │ 2.5 ┆ 1.25 │
|
2336
|
+
# # └─────┴──────┘
|
2337
|
+
def mean
  # Recorded lazily; evaluated on collect.
  _from_rbldf(_ldf.mean)
end
|
2340
|
+
|
2341
|
+
# Aggregate the columns in the DataFrame to their median value.
|
2342
|
+
#
|
2343
|
+
# @return [LazyFrame]
|
2344
|
+
#
|
2345
|
+
# @example
|
2346
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
2347
|
+
# df.median.collect
|
2348
|
+
# # =>
|
2349
|
+
# # shape: (1, 2)
|
2350
|
+
# # ┌─────┬─────┐
|
2351
|
+
# # │ a ┆ b │
|
2352
|
+
# # │ --- ┆ --- │
|
2353
|
+
# # │ f64 ┆ f64 │
|
2354
|
+
# # ╞═════╪═════╡
|
2355
|
+
# # │ 2.5 ┆ 1.0 │
|
2356
|
+
# # └─────┴─────┘
|
2357
|
+
def median
  # Recorded lazily; evaluated on collect.
  _from_rbldf(_ldf.median)
end
|
2360
|
+
|
2361
|
+
# Aggregate the columns in the DataFrame to their quantile value.
|
2362
|
+
#
|
2363
|
+
# @param quantile [Float]
|
2364
|
+
# Quantile between 0.0 and 1.0.
|
2365
|
+
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
|
2366
|
+
# Interpolation method.
|
2367
|
+
#
|
2368
|
+
# @return [LazyFrame]
|
2369
|
+
#
|
2370
|
+
# @example
|
2371
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [1, 2, 1, 1]}).lazy
|
2372
|
+
# df.quantile(0.7).collect
|
2373
|
+
# # =>
|
2374
|
+
# # shape: (1, 2)
|
2375
|
+
# # ┌─────┬─────┐
|
2376
|
+
# # │ a ┆ b │
|
2377
|
+
# # │ --- ┆ --- │
|
2378
|
+
# # │ f64 ┆ f64 │
|
2379
|
+
# # ╞═════╪═════╡
|
2380
|
+
# # │ 3.0 ┆ 1.0 │
|
2381
|
+
# # └─────┴─────┘
|
2382
|
+
def quantile(quantile, interpolation: "nearest")
  # A string here names a column, not a literal (str_as_lit: false).
  q_expr = Utils.parse_into_expression(quantile, str_as_lit: false)
  _from_rbldf(_ldf.quantile(q_expr, interpolation))
end
|
2386
|
+
|
2387
|
+
# Explode lists to long format.
|
2388
|
+
#
|
2389
|
+
# @return [LazyFrame]
|
2390
|
+
#
|
2391
|
+
# @example
|
2392
|
+
# df = Polars::DataFrame.new(
|
2393
|
+
# {
|
2394
|
+
# "letters" => ["a", "a", "b", "c"],
|
2395
|
+
# "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]],
|
2396
|
+
# }
|
2397
|
+
# ).lazy
|
2398
|
+
# df.explode("numbers").collect
|
2399
|
+
# # =>
|
2400
|
+
# # shape: (8, 2)
|
2401
|
+
# # ┌─────────┬─────────┐
|
2402
|
+
# # │ letters ┆ numbers │
|
2403
|
+
# # │ --- ┆ --- │
|
2404
|
+
# # │ str ┆ i64 │
|
2405
|
+
# # ╞═════════╪═════════╡
|
2406
|
+
# # │ a ┆ 1 │
|
2407
|
+
# # │ a ┆ 2 │
|
2408
|
+
# # │ a ┆ 3 │
|
2409
|
+
# # │ b ┆ 4 │
|
2410
|
+
# # │ b ┆ 5 │
|
2411
|
+
# # │ c ┆ 6 │
|
2412
|
+
# # │ c ┆ 7 │
|
2413
|
+
# # │ c ┆ 8 │
|
2414
|
+
# # └─────────┴─────────┘
|
2415
|
+
def explode(columns)
  # Column names / selectors are normalized to a list of expressions.
  exprs = Utils.parse_into_list_of_expressions(columns)
  _from_rbldf(_ldf.explode(exprs))
end
|
2419
|
+
|
2420
|
+
# Drop duplicate rows from this DataFrame.
|
2421
|
+
#
|
2422
|
+
# Note that this fails if there is a column of type `List` in the DataFrame or
|
2423
|
+
# subset.
|
2424
|
+
#
|
2425
|
+
# @param maintain_order [Boolean]
|
2426
|
+
# Keep the same order as the original DataFrame. This requires more work to
|
2427
|
+
# compute.
|
2428
|
+
# @param subset [Object]
|
2429
|
+
# Subset to use to compare rows.
|
2430
|
+
# @param keep ["first", "last"]
|
2431
|
+
# Which of the duplicate rows to keep.
|
2432
|
+
#
|
2433
|
+
# @return [LazyFrame]
|
2434
|
+
def unique(maintain_order: true, subset: nil, keep: "first")
  # Wrap a single column name so the binding always receives nil or an Array.
  unless subset.nil? || subset.is_a?(::Array)
    subset = [subset]
  end
  _from_rbldf(_ldf.unique(maintain_order, subset, keep))
end
|
2440
|
+
|
2441
|
+
# Drop rows with null values from this LazyFrame.
|
2442
|
+
#
|
2443
|
+
# @param subset [Object]
|
2444
|
+
# Subset of column(s) on which `drop_nulls` will be applied.
|
2445
|
+
#
|
2446
|
+
# @return [LazyFrame]
|
2447
|
+
#
|
2448
|
+
# @example
|
2449
|
+
# df = Polars::DataFrame.new(
|
2450
|
+
# {
|
2451
|
+
# "foo" => [1, 2, 3],
|
2452
|
+
# "bar" => [6, nil, 8],
|
2453
|
+
# "ham" => ["a", "b", "c"]
|
2454
|
+
# }
|
2455
|
+
# )
|
2456
|
+
# df.lazy.drop_nulls.collect
|
2457
|
+
# # =>
|
2458
|
+
# # shape: (2, 3)
|
2459
|
+
# # ┌─────┬─────┬─────┐
|
2460
|
+
# # │ foo ┆ bar ┆ ham │
|
2461
|
+
# # │ --- ┆ --- ┆ --- │
|
2462
|
+
# # │ i64 ┆ i64 ┆ str │
|
2463
|
+
# # ╞═════╪═════╪═════╡
|
2464
|
+
# # │ 1 ┆ 6 ┆ a │
|
2465
|
+
# # │ 3 ┆ 8 ┆ c │
|
2466
|
+
# # └─────┴─────┴─────┘
|
2467
|
+
def drop_nulls(subset: nil)
  # Wrap a single column name so the binding always receives nil or an Array.
  unless subset.nil? || subset.is_a?(::Array)
    subset = [subset]
  end
  _from_rbldf(_ldf.drop_nulls(subset))
end
|
2473
|
+
|
2474
|
+
# Unpivot a DataFrame from wide to long format.
|
2475
|
+
#
|
2476
|
+
# Optionally leaves identifiers set.
|
2477
|
+
#
|
2478
|
+
# This function is useful to massage a DataFrame into a format where one or more
|
2479
|
+
# columns are identifier variables (index) while all other columns, considered
|
2480
|
+
# measured variables (on), are "unpivoted" to the row axis leaving just
|
2481
|
+
# two non-identifier columns, 'variable' and 'value'.
|
2482
|
+
#
|
2483
|
+
# @param on [Object]
|
2484
|
+
# Column(s) or selector(s) to use as values variables; if `on`
|
2485
|
+
# is empty all columns that are not in `index` will be used.
|
2486
|
+
# @param index [Object]
|
2487
|
+
# Column(s) or selector(s) to use as identifier variables.
|
2488
|
+
# @param variable_name [String]
|
2489
|
+
# Name to give to the `variable` column. Defaults to "variable"
|
2490
|
+
# @param value_name [String]
|
2491
|
+
# Name to give to the `value` column. Defaults to "value"
|
2492
|
+
# @param streamable [Boolean]
|
2493
|
+
# Allow this node to run in the streaming engine.
|
2494
|
+
# If this runs in streaming, the output of the unpivot operation
|
2495
|
+
# will not have a stable ordering.
|
2496
|
+
#
|
2497
|
+
# @return [LazyFrame]
|
2498
|
+
#
|
2499
|
+
# @example
|
2500
|
+
# lf = Polars::LazyFrame.new(
|
2501
|
+
# {
|
2502
|
+
# "a" => ["x", "y", "z"],
|
2503
|
+
# "b" => [1, 3, 5],
|
2504
|
+
# "c" => [2, 4, 6]
|
2505
|
+
# }
|
2506
|
+
# )
|
2507
|
+
# lf.unpivot(Polars::Selectors.numeric, index: "a").collect
|
2508
|
+
# # =>
|
2509
|
+
# # shape: (6, 3)
|
2510
|
+
# # ┌─────┬──────────┬───────┐
|
2511
|
+
# # │ a ┆ variable ┆ value │
|
2512
|
+
# # │ --- ┆ --- ┆ --- │
|
2513
|
+
# # │ str ┆ str ┆ i64 │
|
2514
|
+
# # ╞═════╪══════════╪═══════╡
|
2515
|
+
# # │ x ┆ b ┆ 1 │
|
2516
|
+
# # │ y ┆ b ┆ 3 │
|
2517
|
+
# # │ z ┆ b ┆ 5 │
|
2518
|
+
# # │ x ┆ c ┆ 2 │
|
2519
|
+
# # │ y ┆ c ┆ 4 │
|
2520
|
+
# # │ z ┆ c ┆ 6 │
|
2521
|
+
# # └─────┴──────────┴───────┘
|
2522
|
+
def unpivot(
  on,
  index: nil,
  variable_name: nil,
  value_name: nil,
  streamable: true
)
  # `streamable` is kept only for backward compatibility.
  warn "The `streamable` parameter for `LazyFrame.unpivot` is deprecated" unless streamable

  # Expand selectors into concrete column lists; nil means "no columns".
  on_cols = on.nil? ? [] : Utils._expand_selectors(self, on)
  index_cols = index.nil? ? [] : Utils._expand_selectors(self, index)

  _from_rbldf(_ldf.unpivot(on_cols, index_cols, value_name, variable_name))
end
|
2540
|
+
alias_method :melt, :unpivot
|
2541
|
+
|
2542
|
+
# def map
|
2543
|
+
# end
|
2544
|
+
|
2545
|
+
# Interpolate intermediate values. The interpolation method is linear.
|
2546
|
+
#
|
2547
|
+
# @return [LazyFrame]
|
2548
|
+
#
|
2549
|
+
# @example
|
2550
|
+
# df = Polars::DataFrame.new(
|
2551
|
+
# {
|
2552
|
+
# "foo" => [1, nil, 9, 10],
|
2553
|
+
# "bar" => [6, 7, 9, nil],
|
2554
|
+
# "baz" => [1, nil, nil, 9]
|
2555
|
+
# }
|
2556
|
+
# ).lazy
|
2557
|
+
# df.interpolate.collect
|
2558
|
+
# # =>
|
2559
|
+
# # shape: (4, 3)
|
2560
|
+
# # ┌──────┬──────┬──────────┐
|
2561
|
+
# # │ foo ┆ bar ┆ baz │
|
2562
|
+
# # │ --- ┆ --- ┆ --- │
|
2563
|
+
# # │ f64 ┆ f64 ┆ f64 │
|
2564
|
+
# # ╞══════╪══════╪══════════╡
|
2565
|
+
# # │ 1.0 ┆ 6.0 ┆ 1.0 │
|
2566
|
+
# # │ 5.0 ┆ 7.0 ┆ 3.666667 │
|
2567
|
+
# # │ 9.0 ┆ 9.0 ┆ 6.333333 │
|
2568
|
+
# # │ 10.0 ┆ null ┆ 9.0 │
|
2569
|
+
# # └──────┴──────┴──────────┘
|
2570
|
+
def interpolate
  # Applies linear interpolation to every column via a wildcard projection.
  select(F.col("*").interpolate)
end
|
2573
|
+
|
2574
|
+
# Decompose a struct into its fields.
|
2575
|
+
#
|
2576
|
+
# The fields will be inserted into the `DataFrame` on the location of the
|
2577
|
+
# `struct` type.
|
2578
|
+
#
|
2579
|
+
# @param names [Object]
|
2580
|
+
# Names of the struct columns that will be decomposed by its fields
|
2581
|
+
#
|
2582
|
+
# @return [LazyFrame]
|
2583
|
+
#
|
2584
|
+
# @example
|
2585
|
+
# df = (
|
2586
|
+
# Polars::DataFrame.new(
|
2587
|
+
# {
|
2588
|
+
# "before" => ["foo", "bar"],
|
2589
|
+
# "t_a" => [1, 2],
|
2590
|
+
# "t_b" => ["a", "b"],
|
2591
|
+
# "t_c" => [true, nil],
|
2592
|
+
# "t_d" => [[1, 2], [3]],
|
2593
|
+
# "after" => ["baz", "womp"]
|
2594
|
+
# }
|
2595
|
+
# )
|
2596
|
+
# .lazy
|
2597
|
+
# .select(
|
2598
|
+
# ["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"]
|
2599
|
+
# )
|
2600
|
+
# )
|
2601
|
+
# df.fetch
|
2602
|
+
# # =>
|
2603
|
+
# # shape: (2, 3)
|
2604
|
+
# # ┌────────┬─────────────────────┬───────┐
|
2605
|
+
# # │ before ┆ t_struct ┆ after │
|
2606
|
+
# # │ --- ┆ --- ┆ --- │
|
2607
|
+
# # │ str ┆ struct[4] ┆ str │
|
2608
|
+
# # ╞════════╪═════════════════════╪═══════╡
|
2609
|
+
# # │ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
|
2610
|
+
# # │ bar ┆ {2,"b",null,[3]} ┆ womp │
|
2611
|
+
# # └────────┴─────────────────────┴───────┘
|
2612
|
+
#
|
2613
|
+
# @example
|
2614
|
+
# df.unnest("t_struct").fetch
|
2615
|
+
# # =>
|
2616
|
+
# # shape: (2, 6)
|
2617
|
+
# # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
|
2618
|
+
# # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
|
2619
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2620
|
+
# # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
|
2621
|
+
# # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
|
2622
|
+
# # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
|
2623
|
+
# # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
|
2624
|
+
# # └────────┴─────┴─────┴──────┴───────────┴───────┘
|
2625
|
+
def unnest(names)
  # A single struct column name is wrapped into a one-element list.
  names = [names] if names.is_a?(::String)
  _from_rbldf(_ldf.unnest(names))
end
|
2631
|
+
|
2632
|
+
# Take two sorted DataFrames and merge them by the sorted key.
|
2633
|
+
#
|
2634
|
+
# The output of this operation will also be sorted.
|
2635
|
+
# It is the callers responsibility that the frames are sorted
|
2636
|
+
# by that key otherwise the output will not make sense.
|
2637
|
+
#
|
2638
|
+
# The schemas of both LazyFrames must be equal.
|
2639
|
+
#
|
2640
|
+
# @param other [DataFrame]
|
2641
|
+
# Other DataFrame that must be merged
|
2642
|
+
# @param key [String]
|
2643
|
+
# Key that is sorted.
|
2644
|
+
#
|
2645
|
+
# @return [LazyFrame]
|
2646
|
+
#
|
2647
|
+
# @example
|
2648
|
+
# df0 = Polars::LazyFrame.new(
|
2649
|
+
# {"name" => ["steve", "elise", "bob"], "age" => [42, 44, 18]}
|
2650
|
+
# ).sort("age")
|
2651
|
+
# df1 = Polars::LazyFrame.new(
|
2652
|
+
# {"name" => ["anna", "megan", "steve", "thomas"], "age" => [21, 33, 42, 20]}
|
2653
|
+
# ).sort("age")
|
2654
|
+
# df0.merge_sorted(df1, "age").collect
|
2655
|
+
# # =>
|
2656
|
+
# # shape: (7, 2)
|
2657
|
+
# # ┌────────┬─────┐
|
2658
|
+
# # │ name ┆ age │
|
2659
|
+
# # │ --- ┆ --- │
|
2660
|
+
# # │ str ┆ i64 │
|
2661
|
+
# # ╞════════╪═════╡
|
2662
|
+
# # │ bob ┆ 18 │
|
2663
|
+
# # │ thomas ┆ 20 │
|
2664
|
+
# # │ anna ┆ 21 │
|
2665
|
+
# # │ megan ┆ 33 │
|
2666
|
+
# # │ steve ┆ 42 │
|
2667
|
+
# # │ steve ┆ 42 │
|
2668
|
+
# # │ elise ┆ 44 │
|
2669
|
+
# # └────────┴─────┘
|
2670
|
+
def merge_sorted(other, key)
  # Caller must ensure both frames are sorted by `key`; schemas must match.
  _from_rbldf(_ldf.merge_sorted(other._ldf, key))
end
|
2673
|
+
|
2674
|
+
# Indicate that one or multiple columns are sorted.
|
2675
|
+
#
|
2676
|
+
# @param column [Object]
|
2677
|
+
# Columns that are sorted
|
2678
|
+
# @param descending [Boolean]
|
2679
|
+
# Whether the columns are sorted in descending order.
|
2680
|
+
#
|
2681
|
+
# @return [LazyFrame]
|
2682
|
+
def set_sorted(
  column,
  descending: false
)
  # Only a single string-like column name is supported.
  raise TypeError, "expected a 'str' for argument 'column' in 'set_sorted'" unless Utils.strlike?(column)

  with_columns(F.col(column).set_sorted(descending: descending))
end
|
2692
|
+
|
2693
|
+
# TODO
|
2694
|
+
# def update
|
2695
|
+
# end
|
2696
|
+
|
2697
|
+
private
|
2698
|
+
|
2699
|
+
def initialize_copy(other)
  super
  # dup/clone must not share the native lazy-frame handle with the source.
  self._ldf = _ldf._clone
end
|
2703
|
+
|
2704
|
+
def _from_rbldf(rb_ldf)
  # Wraps a native lazy-frame handle in a new instance of this class.
  self.class._from_rbldf(rb_ldf)
end
|
2707
|
+
end
|
2708
|
+
end
|