polars-df 0.2.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +33 -0
- data/Cargo.lock +2230 -0
- data/Cargo.toml +10 -0
- data/LICENSE-THIRD-PARTY.txt +38828 -0
- data/LICENSE.txt +20 -0
- data/README.md +91 -0
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +52 -0
- data/lib/polars/cat_name_space.rb +54 -0
- data/lib/polars/convert.rb +100 -0
- data/lib/polars/data_frame.rb +4833 -0
- data/lib/polars/data_types.rb +122 -0
- data/lib/polars/date_time_expr.rb +1418 -0
- data/lib/polars/date_time_name_space.rb +1484 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +5307 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions.rb +453 -0
- data/lib/polars/group_by.rb +558 -0
- data/lib/polars/io.rb +814 -0
- data/lib/polars/lazy_frame.rb +2442 -0
- data/lib/polars/lazy_functions.rb +1195 -0
- data/lib/polars/lazy_group_by.rb +93 -0
- data/lib/polars/list_expr.rb +610 -0
- data/lib/polars/list_name_space.rb +346 -0
- data/lib/polars/meta_expr.rb +54 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +3730 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +972 -0
- data/lib/polars/string_name_space.rb +690 -0
- data/lib/polars/struct_expr.rb +100 -0
- data/lib/polars/struct_name_space.rb +64 -0
- data/lib/polars/utils.rb +192 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/when.rb +16 -0
- data/lib/polars/when_then.rb +19 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +50 -0
- metadata +89 -0
@@ -0,0 +1,4833 @@
|
|
1
|
+
module Polars
|
2
|
+
# Two-dimensional data structure representing data as a table with rows and columns.
|
3
|
+
class DataFrame
|
4
|
+
# @private
# Handle to the backing native RbDataFrame; every DataFrame operation in
# this class delegates to it.
attr_accessor :_df
|
6
|
+
|
7
|
+
# Create a new DataFrame.
#
# @param data [Hash, Array, Series, nil]
#   Two-dimensional data in various forms. Hash must contain Arrays.
#   Array may contain Series.
# @param columns [Array, Hash, nil]
#   Column labels to use for resulting DataFrame. If specified, overrides any
#   labels already present in the data. Must match data dimensions.
# @param orient ["col", "row", nil]
#   Whether to interpret two-dimensional data as columns or as rows. If `nil`,
#   the orientation is inferred by matching the columns and data dimensions. If
#   this does not yield conclusive results, column orientation is used.
def initialize(data = nil, columns: nil, orient: nil)
  # ActiveRecord integration: materialize a Relation/Result into a plain
  # Hash of column name => Array of values before regular construction.
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
    result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
    data = {}
    result.columns.each_with_index do |k, i|
      data[k] = result.rows.map { |r| r[i] }
    end
  end

  # Dispatch on input type; order matters (Hash before Array before Series).
  if data.nil?
    self._df = self.class.hash_to_rbdf({}, columns: columns)
  elsif data.is_a?(Hash)
    # Normalize symbol keys to strings so column names are uniform.
    data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
    self._df = self.class.hash_to_rbdf(data, columns: columns)
  elsif data.is_a?(Array)
    self._df = self.class.sequence_to_rbdf(data, columns: columns, orient: orient)
  elsif data.is_a?(Series)
    self._df = self.class.series_to_rbdf(data, columns: columns)
  else
    raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
  end
end
|
41
|
+
|
42
|
+
# @private
# Wrap a native RbDataFrame in a DataFrame, bypassing #initialize.
#
# @param rb_df [RbDataFrame] native frame to wrap.
# @return [DataFrame]
def self._from_rbdf(rb_df)
  instance = DataFrame.allocate
  instance._df = rb_df
  instance
end
|
48
|
+
|
49
|
+
# @private
# Build a DataFrame from an array of row hashes via the native reader.
#
# @param data [Array<Hash>] rows to read.
# @param infer_schema_length [Integer] rows scanned to infer the schema.
# @param schema [Object, nil] explicit schema, overrides inference.
# @return [DataFrame]
def self._from_hashes(data, infer_schema_length: 100, schema: nil)
  _from_rbdf(RbDataFrame.read_hashes(data, infer_schema_length, schema))
end
|
54
|
+
|
55
|
+
# @private
# Build a DataFrame from a hash of column name => values.
#
# @param data [Hash] column name => Array of values.
# @param columns [Array, Hash, nil] optional column labels/dtypes.
# @return [DataFrame]
def self._from_hash(data, columns: nil)
  rbdf = hash_to_rbdf(data, columns: columns)
  _from_rbdf(rbdf)
end
|
59
|
+
|
60
|
+
# def self._from_records
|
61
|
+
# end
|
62
|
+
|
63
|
+
# def self._from_numo
|
64
|
+
# end
|
65
|
+
|
66
|
+
# no self._from_arrow
|
67
|
+
|
68
|
+
# no self._from_pandas
|
69
|
+
|
70
|
+
# @private
# Read a CSV source into a DataFrame via the native RbDataFrame reader.
#
# @param file [String, Pathname, IO] path or IO-like object to read from.
# @param has_header [Boolean] whether the first row contains column names.
# @param columns [String, Array, nil] column names or indices to select.
# @param sep [String] single-byte field separator.
# @param comment_char [String, nil] character that starts a comment line.
# @param quote_char [String] quoting character.
# @param skip_rows [Integer] rows to skip before the header.
# @param dtypes [Hash, Array, nil] per-column dtype overrides.
# @param null_values [Object, nil] values to interpret as null.
# @param ignore_errors [Boolean] skip rows that fail to parse.
# @param parse_dates [Boolean] attempt date parsing.
# @param n_threads [Integer, nil] reader thread count.
# @param infer_schema_length [Integer] rows scanned for schema inference.
# @param batch_size [Integer] rows per read batch.
# @param n_rows [Integer, nil] stop after this many rows.
# @param encoding [String] "utf8" or similar.
# @param low_memory [Boolean] trade speed for memory.
# @param rechunk [Boolean] make sure chunks are contiguous after reading.
# @param skip_rows_after_header [Integer] rows to skip after the header.
# @param row_count_name [String, nil] add a row-count column with this name.
# @param row_count_offset [Integer] starting value of the row-count column.
# @param sample_size [Integer] bytes sampled for statistics.
# @param eol_char [String] end-of-line character.
# @return [DataFrame]
def self._read_csv(
  file,
  has_header: true,
  columns: nil,
  # Fix: was `sep: str = ","` — a Python type-annotation leftover whose
  # default expression assigned a stray local `str`. Plain "," is the
  # equivalent, intended default for all callers.
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
  infer_schema_length: 100,
  batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n"
)
  # Path-like inputs get a normalized path; other inputs (IO objects) keep
  # path nil and are passed through as-is.
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    path = Utils.format_path(file)
  else
    path = nil
    # if defined?(StringIO) && file.is_a?(StringIO)
    #   file = file.string
    # end
  end

  # Normalize user dtypes into the two shapes the native reader accepts:
  # a list of [name, dtype] pairs, or a positional dtype slice.
  dtype_list = nil
  dtype_slice = nil
  if !dtypes.nil?
    if dtypes.is_a?(Hash)
      dtype_list = []
      dtypes.each do |k, v|
        dtype_list << [k, Utils.rb_type_to_dtype(v)]
      end
    elsif dtypes.is_a?(Array)
      dtype_slice = dtypes
    else
      raise ArgumentError, "dtype arg should be list or dict"
    end
  end

  processed_null_values = Utils._process_null_values(null_values)

  if columns.is_a?(String)
    columns = [columns]
  end
  # Glob patterns are not implemented yet.
  if file.is_a?(String) && file.include?("*")
    raise Todo
  end

  projection, columns = Utils.handle_projection_columns(columns)

  # The native call takes positional arguments; order must match the
  # extension's signature exactly.
  _from_rbdf(
    RbDataFrame.read_csv(
      file,
      infer_schema_length,
      batch_size,
      has_header,
      ignore_errors,
      n_rows,
      skip_rows,
      projection,
      sep,
      rechunk,
      columns,
      encoding,
      n_threads,
      path,
      dtype_list,
      dtype_slice,
      low_memory,
      comment_char,
      quote_char,
      processed_null_values,
      parse_dates,
      skip_rows_after_header,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      sample_size,
      eol_char
    )
  )
end
|
161
|
+
|
162
|
+
# @private
# Read a Parquet file into a DataFrame via the native reader.
#
# @param file [String, Pathname, IO] source to read from.
# @param columns [Array, nil] column names or indices to select.
# @param n_rows [Integer, nil] stop after this many rows.
# @param parallel [String] parallelization strategy ("auto" by default).
# @param row_count_name [String, nil] add a row-count column with this name.
# @param row_count_offset [Integer] starting value of the row-count column.
# @param low_memory [Boolean] trade speed for memory.
# @return [DataFrame]
def self._read_parquet(
  file,
  columns: nil,
  n_rows: nil,
  parallel: "auto",
  row_count_name: nil,
  row_count_offset: 0,
  low_memory: false
)
  pathlike = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if pathlike

  # Glob patterns are not implemented yet.
  raise Todo if file.is_a?(String) && file.include?("*")

  projection, columns = Utils.handle_projection_columns(columns)
  row_count = Utils._prepare_row_count_args(row_count_name, row_count_offset)
  rbdf = RbDataFrame.read_parquet(
    file, columns, projection, n_rows, parallel, row_count, low_memory
  )
  _from_rbdf(rbdf)
end
|
193
|
+
|
194
|
+
# @private
# Read an Apache Avro file into a DataFrame via the native reader.
#
# @param file [String, Pathname, IO] source to read from.
# @param columns [Array, nil] column names or indices to select.
# @param n_rows [Integer, nil] stop after this many rows.
# @return [DataFrame]
def self._read_avro(file, columns: nil, n_rows: nil)
  pathlike = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if pathlike
  projection, columns = Utils.handle_projection_columns(columns)
  rbdf = RbDataFrame.read_avro(file, columns, projection, n_rows)
  _from_rbdf(rbdf)
end
|
202
|
+
|
203
|
+
# @private
# Read an Arrow IPC (Feather) file into a DataFrame via the native reader.
#
# @param file [String, Pathname, IO] source to read from.
# @param columns [String, Array, nil] column names or indices to select.
# @param n_rows [Integer, nil] stop after this many rows.
# @param row_count_name [String, nil] add a row-count column with this name.
# @param row_count_offset [Integer] starting value of the row-count column.
# @param rechunk [Boolean] accepted for API parity; not forwarded to the
#   native call here.
# @param memory_map [Boolean] memory-map the file when possible.
# @return [DataFrame]
def self._read_ipc(
  file,
  columns: nil,
  n_rows: nil,
  row_count_name: nil,
  row_count_offset: 0,
  rechunk: true,
  memory_map: true
)
  pathlike = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if pathlike
  columns = [columns] if columns.is_a?(String)

  # Glob patterns are not implemented yet.
  raise Todo if file.is_a?(String) && file.include?("*")

  projection, columns = Utils.handle_projection_columns(columns)
  row_count = Utils._prepare_row_count_args(row_count_name, row_count_offset)
  rbdf = RbDataFrame.read_ipc(
    file, columns, projection, n_rows, row_count, memory_map
  )
  _from_rbdf(rbdf)
end
|
236
|
+
|
237
|
+
# @private
# Read a JSON file into a DataFrame via the native reader.
#
# @param file [String, Pathname, IO] source to read from.
# @return [DataFrame]
def self._read_json(file)
  pathlike = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if pathlike
  _from_rbdf(RbDataFrame.read_json(file))
end
|
245
|
+
|
246
|
+
# @private
# Read a newline-delimited JSON file into a DataFrame via the native reader.
#
# @param file [String, Pathname, IO] source to read from.
# @return [DataFrame]
def self._read_ndjson(file)
  pathlike = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if pathlike
  _from_rbdf(RbDataFrame.read_ndjson(file))
end
|
254
|
+
|
255
|
+
# Get the shape of the DataFrame.
#
# @return [Array] a `[height, width]` pair.
#
# @example
#   df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
#   df.shape
#   # => [5, 1]
def shape
  # Delegates to the native frame.
  _df.shape
end
|
266
|
+
|
267
|
+
# Get the height (number of rows) of the DataFrame.
#
# @return [Integer]
#
# @example
#   df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
#   df.height
#   # => 5
def height
  # Delegates to the native frame.
  _df.height
end
|
278
|
+
|
279
|
+
# Get the width (number of columns) of the DataFrame.
#
# @return [Integer]
#
# @example
#   df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
#   df.width
#   # => 1
def width
  # Delegates to the native frame.
  _df.width
end
|
290
|
+
|
291
|
+
# Get column names.
#
# @return [Array] column names in frame order.
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [6, 7, 8],
#       "ham" => ["a", "b", "c"]
#     }
#   )
#   df.columns
#   # => ["foo", "bar", "ham"]
def columns
  # Delegates to the native frame.
  _df.columns
end
|
308
|
+
|
309
|
+
# Change the column names of the DataFrame.
#
# @param columns [Array]
#   A list with new names for the DataFrame.
#   The length of the list should be equal to the width of the DataFrame.
#
# @return [Object]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [6, 7, 8],
#       "ham" => ["a", "b", "c"]
#     }
#   )
#   df.columns = ["apple", "banana", "orange"]
#   df.columns
#   # => ["apple", "banana", "orange"]
def columns=(columns)
  # Renames in place on the native frame.
  _df.set_column_names(columns)
end
|
343
|
+
|
344
|
+
# Get dtypes of columns in DataFrame. Dtypes can also be found in column
# headers when printing the DataFrame.
#
# @return [Array] one dtype per column, in frame order.
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [6.0, 7.0, 8.0],
#       "ham" => ["a", "b", "c"]
#     }
#   )
#   df.dtypes
#   # => [Polars::Int64, Polars::Float64, Polars::Utf8]
def dtypes
  # Delegates to the native frame.
  _df.dtypes
end
|
361
|
+
|
362
|
+
# Get the schema: a mapping of column name to dtype.
#
# @return [Hash]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [6.0, 7.0, 8.0],
#       "ham" => ["a", "b", "c"]
#     }
#   )
#   df.schema
#   # => {"foo"=>Polars::Int64, "bar"=>Polars::Float64, "ham"=>Polars::Utf8}
def schema
  names = columns
  types = dtypes
  names.each_with_index.to_h { |name, i| [name, types[i]] }
end
|
379
|
+
|
380
|
+
# Equal.
#
# @return [DataFrame] element-wise comparison result.
def ==(other)
  # Delegates to the shared comparison helper with the "eq" op code.
  _comp(other, "eq")
end
|
386
|
+
|
387
|
+
# Not equal.
#
# @return [DataFrame] element-wise comparison result.
def !=(other)
  # Delegates to the shared comparison helper with the "neq" op code.
  _comp(other, "neq")
end
|
393
|
+
|
394
|
+
# Greater than.
#
# @return [DataFrame] element-wise comparison result.
def >(other)
  # Delegates to the shared comparison helper with the "gt" op code.
  _comp(other, "gt")
end
|
400
|
+
|
401
|
+
# Less than.
#
# @return [DataFrame] element-wise comparison result.
def <(other)
  # Delegates to the shared comparison helper with the "lt" op code.
  _comp(other, "lt")
end
|
407
|
+
|
408
|
+
# Greater than or equal.
#
# @return [DataFrame] element-wise comparison result.
def >=(other)
  # Delegates to the shared comparison helper with the "gt_eq" op code.
  _comp(other, "gt_eq")
end
|
414
|
+
|
415
|
+
# Less than or equal.
#
# @return [DataFrame] element-wise comparison result.
def <=(other)
  # Delegates to the shared comparison helper with the "lt_eq" op code.
  _comp(other, "lt_eq")
end
|
421
|
+
|
422
|
+
# Performs multiplication.
#
# Frame * frame multiplies element-wise; any other operand is coerced to a
# Series and broadcast.
#
# @return [DataFrame]
def *(other)
  return _from_rbdf(_df.mul_df(other._df)) if other.is_a?(DataFrame)

  rhs = _prepare_other_arg(other)
  _from_rbdf(_df.mul(rhs._s))
end
|
433
|
+
|
434
|
+
# Performs division.
#
# Frame / frame divides element-wise; any other operand is coerced to a
# Series and broadcast.
#
# @return [DataFrame]
def /(other)
  return _from_rbdf(_df.div_df(other._df)) if other.is_a?(DataFrame)

  rhs = _prepare_other_arg(other)
  _from_rbdf(_df.div(rhs._s))
end
|
445
|
+
|
446
|
+
# Performs addition.
#
# Frame + frame adds element-wise; any other operand is coerced to a
# Series and broadcast.
#
# @return [DataFrame]
def +(other)
  return _from_rbdf(_df.add_df(other._df)) if other.is_a?(DataFrame)

  rhs = _prepare_other_arg(other)
  _from_rbdf(_df.add(rhs._s))
end
|
457
|
+
|
458
|
+
# Performs subtraction.
#
# Frame - frame subtracts element-wise; any other operand is coerced to a
# Series and broadcast.
#
# @return [DataFrame]
def -(other)
  return _from_rbdf(_df.sub_df(other._df)) if other.is_a?(DataFrame)

  rhs = _prepare_other_arg(other)
  _from_rbdf(_df.sub(rhs._s))
end
|
469
|
+
|
470
|
+
# Returns the modulo.
#
# Frame % frame computes element-wise remainder; any other operand is
# coerced to a Series and broadcast.
#
# @return [DataFrame]
def %(other)
  return _from_rbdf(_df.rem_df(other._df)) if other.is_a?(DataFrame)

  rhs = _prepare_other_arg(other)
  _from_rbdf(_df.rem(rhs._s))
end
|
481
|
+
|
482
|
+
# Returns a string representing the DataFrame.
#
# @return [String]
def to_s
  # Delegates to the native frame's table formatter.
  _df.to_s
end
# The printed table is also used for console inspection.
alias_method :inspect, :to_s
|
489
|
+
|
490
|
+
# Check if DataFrame includes a column with the given name.
#
# @return [Boolean]
def include?(name)
  columns.any? { |col| col == name }
end
|
496
|
+
|
497
|
+
# def each
|
498
|
+
# end
|
499
|
+
|
500
|
+
# Returns subset of the DataFrame.
#
# With two arguments, the first selects rows and the second selects columns
# (by name, index, Range, or Array thereof). With one argument, a String
# selects a column as a Series, an Integer selects a row as a one-row frame,
# a Range slices rows, and an Array of strings selects multiple columns.
#
# @return [Object] a Series, DataFrame, or scalar depending on selection.
def [](*args)
  if args.size == 2
    row_selection, col_selection = args

    # df[.., unknown]
    if row_selection.is_a?(Range)

      # multiple slices
      # df[.., ..]
      if col_selection.is_a?(Range)
        raise Todo
      end
    end

    # df[2, ..] (select row as df)
    if row_selection.is_a?(Integer)
      if col_selection.is_a?(Array)
        df = self[0.., col_selection]
        return df.slice(row_selection, 1)
      end
      # df[2, "a"] -> scalar via column Series
      if col_selection.is_a?(String)
        return self[col_selection][row_selection]
      end
    end

    # column selection can be "a" and ["a", "b"]
    if col_selection.is_a?(String)
      col_selection = [col_selection]
    end

    # df[.., 1]
    if col_selection.is_a?(Integer)
      series = to_series(col_selection)
      return series[row_selection]
    end

    if col_selection.is_a?(Array)
      # df[.., [1, 2]]
      # NOTE(review): is_int_sequence is called without a receiver while
      # similar helpers here go through Utils — verify it resolves at runtime.
      if is_int_sequence(col_selection)
        series_list = col_selection.map { |i| to_series(i) }
        df = self.class.new(series_list)
        return df[row_selection]
      end
    end

    # Fallback: select columns first, then apply the row selection.
    df = self[col_selection]
    return df[row_selection]
  elsif args.size == 1
    item = args[0]

    # select single column
    # df["foo"]
    if item.is_a?(String)
      return Utils.wrap_s(_df.column(item))
    end

    # df[idx] -> single row as a one-row frame
    if item.is_a?(Integer)
      return slice(_pos_idx(item, 0), 1)
    end

    # df[..] -> row slice
    if item.is_a?(Range)
      return Slice.new(self).apply(item)
    end

    if Utils.is_str_sequence(item, allow_str: false)
      # select multiple columns
      # df[["foo", "bar"]]
      return _from_rbdf(_df.select(item))
    end
  end

  # NOTE(review): when args.size is neither 1 nor 2, `item` is never
  # assigned, so this line raises NameError rather than the intended
  # ArgumentError — confirm whether that path matters.
  raise ArgumentError, "Cannot get item of type: #{item.class.name}"
end
|
579
|
+
|
580
|
+
# Set item.
|
581
|
+
#
|
582
|
+
# @return [Object]
|
583
|
+
# def []=(key, value)
|
584
|
+
# if key.is_a?(String)
|
585
|
+
# raise TypeError, "'DataFrame' object does not support 'Series' assignment by index. Use 'DataFrame.with_columns'"
|
586
|
+
# end
|
587
|
+
|
588
|
+
# raise Todo
|
589
|
+
# end
|
590
|
+
|
591
|
+
# no to_arrow
|
592
|
+
|
593
|
+
# Convert DataFrame to a hash mapping column name to values.
#
# @param as_series [Boolean] when true, values are Series; when false,
#   they are plain Arrays.
# @return [Hash]
def to_h(as_series: true)
  get_columns.each_with_object({}) do |s, out|
    out[s.name] = as_series ? s : s.to_a
  end
end
|
603
|
+
|
604
|
+
# Convert every row to a hash of column name => value.
#
# Note that this is slow.
#
# @return [Array<Hash>]
#
# @example
#   df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
#   df.to_hashes
#   # => [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
def to_hashes
  names = columns
  (0...height).map { |i| names.zip(_df.row_tuple(i)).to_h }
end
|
622
|
+
|
623
|
+
# def to_numo
|
624
|
+
# end
|
625
|
+
|
626
|
+
# no to_pandas
|
627
|
+
|
628
|
+
# Select column as Series at index location.
#
# Negative indices count from the end, as with Array.
#
# @param index [Integer]
#   Location of selection.
#
# @return [Series]
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3],
#       "bar" => [6, 7, 8],
#       "ham" => ["a", "b", "c"]
#     }
#   )
#   df.to_series(1)
#   # => Series "bar" containing [6, 7, 8]
def to_series(index = 0)
  idx = index.negative? ? columns.length + index : index
  Utils.wrap_s(_df.select_at_idx(idx))
end
|
658
|
+
|
659
|
+
# Serialize to JSON representation.
#
# @param file [String]
#   File path to which the result should be written.
# @param pretty [Boolean]
#   Pretty serialize json.
# @param row_oriented [Boolean]
#   Write to row oriented json. This is slower, but more common.
#
# @return [nil]
#
# @see #write_ndjson
def write_json(file, pretty: false, row_oriented: false)
  pathlike = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if pathlike

  _df.write_json(file, pretty, row_oriented)
  nil
end
|
683
|
+
|
684
|
+
# Serialize to newline delimited JSON representation.
#
# @param file [String]
#   File path to which the result should be written.
#
# @return [nil]
def write_ndjson(file)
  pathlike = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if pathlike

  _df.write_ndjson(file)
  nil
end
|
698
|
+
|
699
|
+
# Write to comma-separated values (CSV) file.
#
# @param file [String, nil]
#   File path to which the result should be written. If set to `nil`
#   (default), the output is returned as a string instead.
# @param has_header [Boolean]
#   Whether to include header in the CSV output.
# @param sep [String]
#   Separate CSV fields with this symbol (single byte).
# @param quote [String]
#   Byte to use as quoting character.
# @param batch_size [Integer]
#   Number of rows that will be processed per thread.
# @param datetime_format [String, nil]
#   A format string, with the specifiers defined by the
#   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
#   Rust crate. If no format specified, the default fractional-second
#   precision is inferred from the maximum timeunit found in the frame's
#   Datetime cols (if any).
# @param date_format [String, nil]
#   A format string, with the specifiers defined by the
#   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
#   Rust crate.
# @param time_format [String, nil]
#   A format string, with the specifiers defined by the
#   [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
#   Rust crate.
# @param float_precision [Integer, nil]
#   Number of decimal places to write, applied to both `:f32` and
#   `:f64` datatypes.
# @param null_value [String, nil]
#   A string representing null values (defaulting to the empty string).
#
# @return [String, nil] the CSV text when `file` is nil, otherwise nil.
#
# @example
#   df = Polars::DataFrame.new(
#     {
#       "foo" => [1, 2, 3, 4, 5],
#       "bar" => [6, 7, 8, 9, 10],
#       "ham" => ["a", "b", "c", "d", "e"]
#     }
#   )
#   df.write_csv("file.csv")
def write_csv(
  file = nil,
  has_header: true,
  sep: ",",
  quote: '"',
  batch_size: 1024,
  datetime_format: nil,
  date_format: nil,
  time_format: nil,
  float_precision: nil,
  null_value: nil
)
  # Validate single-byte separators; an empty null_value is normalized to
  # nil (the native default).
  if sep.length > 1
    raise ArgumentError, "only single byte separator is allowed"
  elsif quote.length > 1
    raise ArgumentError, "only single byte quote char is allowed"
  elsif null_value == ""
    null_value = nil
  end

  # In-memory path: write into a binary StringIO and return the text.
  if file.nil?
    buffer = StringIO.new
    buffer.set_encoding(Encoding::BINARY)
    _df.write_csv(
      buffer,
      has_header,
      sep.ord,
      quote.ord,
      batch_size,
      datetime_format,
      date_format,
      time_format,
      float_precision,
      null_value
    )
    return buffer.string.force_encoding(Encoding::UTF_8)
  end

  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end

  _df.write_csv(
    file,
    has_header,
    sep.ord,
    quote.ord,
    batch_size,
    datetime_format,
    date_format,
    time_format,
    float_precision,
    null_value,
  )
  nil
end
|
799
|
+
|
800
|
+
# Write to Apache Avro file.
#
# @param file [String]
#   File path to which the file should be written.
# @param compression ["uncompressed", "snappy", "deflate"]
#   Compression method. Defaults to "uncompressed"; nil is treated the same.
#
# @return [nil]
def write_avro(file, compression = "uncompressed")
  compression = "uncompressed" if compression.nil?
  pathlike = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if pathlike

  _df.write_avro(file, compression)
end
|
818
|
+
|
819
|
+
# Write to Arrow IPC binary stream or Feather file.
|
820
|
+
#
|
821
|
+
# @param file [String]
|
822
|
+
# File path to which the file should be written.
|
823
|
+
# @param compression ["uncompressed", "lz4", "zstd"]
|
824
|
+
# Compression method. Defaults to "uncompressed".
|
825
|
+
#
|
826
|
+
# @return [nil]
|
827
|
+
def write_ipc(file, compression: "uncompressed")
|
828
|
+
if compression.nil?
|
829
|
+
compression = "uncompressed"
|
830
|
+
end
|
831
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
832
|
+
file = Utils.format_path(file)
|
833
|
+
end
|
834
|
+
|
835
|
+
_df.write_ipc(file, compression)
|
836
|
+
end
|
837
|
+
|
838
|
+
# Write to Apache Parquet file.
|
839
|
+
#
|
840
|
+
# @param file [String]
|
841
|
+
# File path to which the file should be written.
|
842
|
+
# @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
|
843
|
+
# Choose "zstd" for good compression performance.
|
844
|
+
# Choose "lz4" for fast compression/decompression.
|
845
|
+
# Choose "snappy" for more backwards compatibility guarantees
|
846
|
+
# when you deal with older parquet readers.
|
847
|
+
# @param compression_level [Integer, nil]
|
848
|
+
# The level of compression to use. Higher compression means smaller files on
|
849
|
+
# disk.
|
850
|
+
#
|
851
|
+
# - "gzip" : min-level: 0, max-level: 10.
|
852
|
+
# - "brotli" : min-level: 0, max-level: 11.
|
853
|
+
# - "zstd" : min-level: 1, max-level: 22.
|
854
|
+
# @param statistics [Boolean]
|
855
|
+
# Write statistics to the parquet headers. This requires extra compute.
|
856
|
+
# @param row_group_size [Integer, nil]
|
857
|
+
# Size of the row groups in number of rows.
|
858
|
+
# If `nil` (default), the chunks of the DataFrame are
|
859
|
+
# used. Writing in smaller chunks may reduce memory pressure and improve
|
860
|
+
# writing speeds.
|
861
|
+
#
|
862
|
+
# @return [nil]
|
863
|
+
def write_parquet(
|
864
|
+
file,
|
865
|
+
compression: "zstd",
|
866
|
+
compression_level: nil,
|
867
|
+
statistics: false,
|
868
|
+
row_group_size: nil
|
869
|
+
)
|
870
|
+
if compression.nil?
|
871
|
+
compression = "uncompressed"
|
872
|
+
end
|
873
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
874
|
+
file = Utils.format_path(file)
|
875
|
+
end
|
876
|
+
|
877
|
+
_df.write_parquet(
|
878
|
+
file, compression, compression_level, statistics, row_group_size
|
879
|
+
)
|
880
|
+
end
|
881
|
+
|
882
|
+
# Return an estimation of the total (heap) allocated size of the DataFrame.
|
883
|
+
#
|
884
|
+
# Estimated size is given in the specified unit (bytes by default).
|
885
|
+
#
|
886
|
+
# This estimation is the sum of the size of its buffers, validity, including
|
887
|
+
# nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
|
888
|
+
# size of 2 arrays is not the sum of the sizes computed from this function. In
|
889
|
+
# particular, StructArray's size is an upper bound.
|
890
|
+
#
|
891
|
+
# When an array is sliced, its allocated size remains constant because the buffer
|
892
|
+
# unchanged. However, this function will yield a smaller number. This is because
|
893
|
+
# this function returns the visible size of the buffer, not its total capacity.
|
894
|
+
#
|
895
|
+
# FFI buffers are included in this estimation.
|
896
|
+
#
|
897
|
+
# @param unit ["b", "kb", "mb", "gb", "tb"]
|
898
|
+
# Scale the returned size to the given unit.
|
899
|
+
#
|
900
|
+
# @return [Numeric]
|
901
|
+
#
|
902
|
+
# @example
|
903
|
+
# df = Polars::DataFrame.new(
|
904
|
+
# {
|
905
|
+
# "x" => 1_000_000.times.to_a.reverse,
|
906
|
+
# "y" => 1_000_000.times.map { |v| v / 1000.0 },
|
907
|
+
# "z" => 1_000_000.times.map(&:to_s)
|
908
|
+
# },
|
909
|
+
# columns: {"x" => :u32, "y" => :f64, "z" => :str}
|
910
|
+
# )
|
911
|
+
# df.estimated_size
|
912
|
+
# # => 25888898
|
913
|
+
# df.estimated_size("mb")
|
914
|
+
# # => 24.689577102661133
|
915
|
+
def estimated_size(unit = "b")
|
916
|
+
sz = _df.estimated_size
|
917
|
+
Utils.scale_bytes(sz, to: unit)
|
918
|
+
end
|
919
|
+
|
920
|
+
# Transpose a DataFrame over the diagonal.
|
921
|
+
#
|
922
|
+
# @param include_header [Boolean]
|
923
|
+
# If set, the column names will be added as first column.
|
924
|
+
# @param header_name [String]
|
925
|
+
# If `include_header` is set, this determines the name of the column that will
|
926
|
+
# be inserted.
|
927
|
+
# @param column_names [Array]
|
928
|
+
# Optional generator/iterator that yields column names. Will be used to
|
929
|
+
# replace the columns in the DataFrame.
|
930
|
+
#
|
931
|
+
# @return [DataFrame]
|
932
|
+
#
|
933
|
+
# @note
|
934
|
+
# This is a very expensive operation. Perhaps you can do it differently.
|
935
|
+
#
|
936
|
+
# @example
|
937
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [1, 2, 3]})
|
938
|
+
# df.transpose(include_header: true)
|
939
|
+
# # =>
|
940
|
+
# # shape: (2, 4)
|
941
|
+
# # ┌────────┬──────────┬──────────┬──────────┐
|
942
|
+
# # │ column ┆ column_0 ┆ column_1 ┆ column_2 │
|
943
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
944
|
+
# # │ str ┆ i64 ┆ i64 ┆ i64 │
|
945
|
+
# # ╞════════╪══════════╪══════════╪══════════╡
|
946
|
+
# # │ a ┆ 1 ┆ 2 ┆ 3 │
|
947
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
948
|
+
# # │ b ┆ 1 ┆ 2 ┆ 3 │
|
949
|
+
# # └────────┴──────────┴──────────┴──────────┘
|
950
|
+
#
|
951
|
+
# @example Replace the auto-generated column names with a list
|
952
|
+
# df.transpose(include_header: false, column_names: ["a", "b", "c"])
|
953
|
+
# # =>
|
954
|
+
# # shape: (2, 3)
|
955
|
+
# # ┌─────┬─────┬─────┐
|
956
|
+
# # │ a ┆ b ┆ c │
|
957
|
+
# # │ --- ┆ --- ┆ --- │
|
958
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
959
|
+
# # ╞═════╪═════╪═════╡
|
960
|
+
# # │ 1 ┆ 2 ┆ 3 │
|
961
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
962
|
+
# # │ 1 ┆ 2 ┆ 3 │
|
963
|
+
# # └─────┴─────┴─────┘
|
964
|
+
#
|
965
|
+
# @example Include the header as a separate column
|
966
|
+
# df.transpose(
|
967
|
+
# include_header: true, header_name: "foo", column_names: ["a", "b", "c"]
|
968
|
+
# )
|
969
|
+
# # =>
|
970
|
+
# # shape: (2, 4)
|
971
|
+
# # ┌─────┬─────┬─────┬─────┐
|
972
|
+
# # │ foo ┆ a ┆ b ┆ c │
|
973
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
974
|
+
# # │ str ┆ i64 ┆ i64 ┆ i64 │
|
975
|
+
# # ╞═════╪═════╪═════╪═════╡
|
976
|
+
# # │ a ┆ 1 ┆ 2 ┆ 3 │
|
977
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
978
|
+
# # │ b ┆ 1 ┆ 2 ┆ 3 │
|
979
|
+
# # └─────┴─────┴─────┴─────┘
|
980
|
+
def transpose(include_header: false, header_name: "column", column_names: nil)
  df = _from_rbdf(_df.transpose(include_header, header_name))
  unless column_names.nil?
    # Seed the name list with the header column when one was added, then
    # fill the remaining slots from the user-supplied name source (any
    # enumerable works; we only pull as many names as there are columns).
    names = include_header ? [header_name] : []
    supplied = column_names.each
    (df.width - names.length).times { names << supplied.next }
    df.columns = names
  end
  df
end
|
998
|
+
|
999
|
+
# Reverse the DataFrame.
|
1000
|
+
#
|
1001
|
+
# @return [DataFrame]
|
1002
|
+
#
|
1003
|
+
# @example
|
1004
|
+
# df = Polars::DataFrame.new(
|
1005
|
+
# {
|
1006
|
+
# "key" => ["a", "b", "c"],
|
1007
|
+
# "val" => [1, 2, 3]
|
1008
|
+
# }
|
1009
|
+
# )
|
1010
|
+
# df.reverse
|
1011
|
+
# # =>
|
1012
|
+
# # shape: (3, 2)
|
1013
|
+
# # ┌─────┬─────┐
|
1014
|
+
# # │ key ┆ val │
|
1015
|
+
# # │ --- ┆ --- │
|
1016
|
+
# # │ str ┆ i64 │
|
1017
|
+
# # ╞═════╪═════╡
|
1018
|
+
# # │ c ┆ 3 │
|
1019
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1020
|
+
# # │ b ┆ 2 │
|
1021
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1022
|
+
# # │ a ┆ 1 │
|
1023
|
+
# # └─────┴─────┘
|
1024
|
+
def reverse
  # Selecting every column with a reversed expression flips the row order.
  reversed_everything = Polars.col("*").reverse
  select(reversed_everything)
end
|
1027
|
+
|
1028
|
+
# Rename column names.
|
1029
|
+
#
|
1030
|
+
# @param mapping [Hash]
|
1031
|
+
# Key value pairs that map from old name to new name.
|
1032
|
+
#
|
1033
|
+
# @return [DataFrame]
|
1034
|
+
#
|
1035
|
+
# @example
|
1036
|
+
# df = Polars::DataFrame.new(
|
1037
|
+
# {
|
1038
|
+
# "foo" => [1, 2, 3],
|
1039
|
+
# "bar" => [6, 7, 8],
|
1040
|
+
# "ham" => ["a", "b", "c"]
|
1041
|
+
# }
|
1042
|
+
# )
|
1043
|
+
# df.rename({"foo" => "apple"})
|
1044
|
+
# # =>
|
1045
|
+
# # shape: (3, 3)
|
1046
|
+
# # ┌───────┬─────┬─────┐
|
1047
|
+
# # │ apple ┆ bar ┆ ham │
|
1048
|
+
# # │ --- ┆ --- ┆ --- │
|
1049
|
+
# # │ i64 ┆ i64 ┆ str │
|
1050
|
+
# # ╞═══════╪═════╪═════╡
|
1051
|
+
# # │ 1 ┆ 6 ┆ a │
|
1052
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1053
|
+
# # │ 2 ┆ 7 ┆ b │
|
1054
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1055
|
+
# # │ 3 ┆ 8 ┆ c │
|
1056
|
+
# # └───────┴─────┴─────┘
|
1057
|
+
def rename(mapping)
  # Delegate to the lazy engine; optimization is skipped because a plain
  # rename needs no query rewriting.
  renamed = lazy.rename(mapping)
  renamed.collect(no_optimization: true)
end
|
1060
|
+
|
1061
|
+
# Insert a Series at a certain column index. This operation is in place.
|
1062
|
+
#
|
1063
|
+
# @param index [Integer]
|
1064
|
+
# Column to insert the new `Series` column.
|
1065
|
+
# @param series [Series]
|
1066
|
+
# `Series` to insert.
|
1067
|
+
#
|
1068
|
+
# @return [DataFrame]
|
1069
|
+
#
|
1070
|
+
# @example
|
1071
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
1072
|
+
# s = Polars::Series.new("baz", [97, 98, 99])
|
1073
|
+
# df.insert_at_idx(1, s)
|
1074
|
+
# # =>
|
1075
|
+
# # shape: (3, 3)
|
1076
|
+
# # ┌─────┬─────┬─────┐
|
1077
|
+
# # │ foo ┆ baz ┆ bar │
|
1078
|
+
# # │ --- ┆ --- ┆ --- │
|
1079
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
1080
|
+
# # ╞═════╪═════╪═════╡
|
1081
|
+
# # │ 1 ┆ 97 ┆ 4 │
|
1082
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1083
|
+
# # │ 2 ┆ 98 ┆ 5 │
|
1084
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1085
|
+
# # │ 3 ┆ 99 ┆ 6 │
|
1086
|
+
# # └─────┴─────┴─────┘
|
1087
|
+
#
|
1088
|
+
# @example
|
1089
|
+
# df = Polars::DataFrame.new(
|
1090
|
+
# {
|
1091
|
+
# "a" => [1, 2, 3, 4],
|
1092
|
+
# "b" => [0.5, 4, 10, 13],
|
1093
|
+
# "c" => [true, true, false, true]
|
1094
|
+
# }
|
1095
|
+
# )
|
1096
|
+
# s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
|
1097
|
+
# df.insert_at_idx(3, s)
|
1098
|
+
# # =>
|
1099
|
+
# # shape: (4, 4)
|
1100
|
+
# # ┌─────┬──────┬───────┬──────┐
|
1101
|
+
# # │ a ┆ b ┆ c ┆ d │
|
1102
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1103
|
+
# # │ i64 ┆ f64 ┆ bool ┆ f64 │
|
1104
|
+
# # ╞═════╪══════╪═══════╪══════╡
|
1105
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
|
1106
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1107
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
|
1108
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1109
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
|
1110
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1111
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
|
1112
|
+
# # └─────┴──────┴───────┴──────┘
|
1113
|
+
def insert_at_idx(index, series)
  # Negative positions count backwards from the last column.
  index += columns.length if index < 0
  # Mutates the native frame in place; self is returned for chaining.
  _df.insert_at_idx(index, series._s)
  self
end
|
1120
|
+
|
1121
|
+
# Filter the rows in the DataFrame based on a predicate expression.
|
1122
|
+
#
|
1123
|
+
# @param predicate [Expr]
|
1124
|
+
# Expression that evaluates to a boolean Series.
|
1125
|
+
#
|
1126
|
+
# @return [DataFrame]
|
1127
|
+
#
|
1128
|
+
# @example Filter on one condition:
|
1129
|
+
# df = Polars::DataFrame.new(
|
1130
|
+
# {
|
1131
|
+
# "foo" => [1, 2, 3],
|
1132
|
+
# "bar" => [6, 7, 8],
|
1133
|
+
# "ham" => ["a", "b", "c"]
|
1134
|
+
# }
|
1135
|
+
# )
|
1136
|
+
# df.filter(Polars.col("foo") < 3)
|
1137
|
+
# # =>
|
1138
|
+
# # shape: (2, 3)
|
1139
|
+
# # ┌─────┬─────┬─────┐
|
1140
|
+
# # │ foo ┆ bar ┆ ham │
|
1141
|
+
# # │ --- ┆ --- ┆ --- │
|
1142
|
+
# # │ i64 ┆ i64 ┆ str │
|
1143
|
+
# # ╞═════╪═════╪═════╡
|
1144
|
+
# # │ 1 ┆ 6 ┆ a │
|
1145
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1146
|
+
# # │ 2 ┆ 7 ┆ b │
|
1147
|
+
# # └─────┴─────┴─────┘
|
1148
|
+
#
|
1149
|
+
# @example Filter on multiple conditions:
|
1150
|
+
# df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a"))
|
1151
|
+
# # =>
|
1152
|
+
# # shape: (1, 3)
|
1153
|
+
# # ┌─────┬─────┬─────┐
|
1154
|
+
# # │ foo ┆ bar ┆ ham │
|
1155
|
+
# # │ --- ┆ --- ┆ --- │
|
1156
|
+
# # │ i64 ┆ i64 ┆ str │
|
1157
|
+
# # ╞═════╪═════╪═════╡
|
1158
|
+
# # │ 1 ┆ 6 ┆ a │
|
1159
|
+
# # └─────┴─────┴─────┘
|
1160
|
+
def filter(predicate)
  # Run the predicate through the lazy engine and materialize the result.
  filtered = lazy.filter(predicate)
  filtered.collect
end
|
1163
|
+
|
1164
|
+
# Summary statistics for a DataFrame.
|
1165
|
+
#
|
1166
|
+
# @return [DataFrame]
|
1167
|
+
#
|
1168
|
+
# @example
|
1169
|
+
# df = Polars::DataFrame.new(
|
1170
|
+
# {
|
1171
|
+
# "a" => [1.0, 2.8, 3.0],
|
1172
|
+
# "b" => [4, 5, nil],
|
1173
|
+
# "c" => [true, false, true],
|
1174
|
+
# "d" => [nil, "b", "c"],
|
1175
|
+
# "e" => ["usd", "eur", nil]
|
1176
|
+
# }
|
1177
|
+
# )
|
1178
|
+
# df.describe
|
1179
|
+
# # =>
|
1180
|
+
# # shape: (7, 6)
|
1181
|
+
# # ┌────────────┬──────────┬──────────┬──────────┬──────┬──────┐
|
1182
|
+
# # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
|
1183
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
1184
|
+
# # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
|
1185
|
+
# # ╞════════════╪══════════╪══════════╪══════════╪══════╪══════╡
|
1186
|
+
# # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
|
1187
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1188
|
+
# # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
|
1189
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1190
|
+
# # │ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null │
|
1191
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1192
|
+
# # │ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null │
|
1193
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1194
|
+
# # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
|
1195
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1196
|
+
# # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
|
1197
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1198
|
+
# # │ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null │
|
1199
|
+
# # └────────────┴──────────┴──────────┴──────────┴──────┴──────┘
|
1200
|
+
def describe
  # Cast every column of a one-statistic frame so the stat rows can be
  # vertically concatenated: numeric/boolean columns become f64, all
  # other dtypes are stringified so their stats remain displayable.
  describe_cast = lambda do |stat|
    casted = columns.each_with_index.map do |name, i|
      target = (self[name].is_numeric || self[name].is_boolean) ? :f64 : :str
      stat[0.., i].cast(target)
    end
    self.class.new(casted)
  end

  summary = _from_rbdf(
    Polars.concat(
      [
        # Row count: one frame holding `height` for every column.
        describe_cast.(self.class.new(columns.to_h { |c| [c, [height]] })),
        describe_cast.(null_count),
        describe_cast.(mean),
        describe_cast.(std),
        describe_cast.(min),
        describe_cast.(max),
        describe_cast.(median)
      ]
    )._df
  )
  # Prepend a label column naming each statistic row.
  summary.insert_at_idx(
    0,
    Polars::Series.new(
      "describe",
      ["count", "null_count", "mean", "std", "min", "max", "median"],
    )
  )
  summary
end
|
1239
|
+
|
1240
|
+
# Find the index of a column by name.
#
# @param name [String]
#   Name of the column to find.
#
# @return [Integer]
#
# @example
#   df = Polars::DataFrame.new(
#     {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
#   )
#   df.find_idx_by_name("ham")
#   # => 2
def find_idx_by_name(name)
  # Lookup happens in the native layer; the result is a zero-based
  # column position (see the example above).
  _df.find_idx_by_name(name)
end
|
1256
|
+
|
1257
|
+
# Replace a column at an index location.
|
1258
|
+
#
|
1259
|
+
# @param index [Integer]
|
1260
|
+
# Column index.
|
1261
|
+
# @param series [Series]
|
1262
|
+
# Series that will replace the column.
|
1263
|
+
#
|
1264
|
+
# @return [DataFrame]
|
1265
|
+
#
|
1266
|
+
# @example
|
1267
|
+
# df = Polars::DataFrame.new(
|
1268
|
+
# {
|
1269
|
+
# "foo" => [1, 2, 3],
|
1270
|
+
# "bar" => [6, 7, 8],
|
1271
|
+
# "ham" => ["a", "b", "c"]
|
1272
|
+
# }
|
1273
|
+
# )
|
1274
|
+
# s = Polars::Series.new("apple", [10, 20, 30])
|
1275
|
+
# df.replace_at_idx(0, s)
|
1276
|
+
# # =>
|
1277
|
+
# # shape: (3, 3)
|
1278
|
+
# # ┌───────┬─────┬─────┐
|
1279
|
+
# # │ apple ┆ bar ┆ ham │
|
1280
|
+
# # │ --- ┆ --- ┆ --- │
|
1281
|
+
# # │ i64 ┆ i64 ┆ str │
|
1282
|
+
# # ╞═══════╪═════╪═════╡
|
1283
|
+
# # │ 10 ┆ 6 ┆ a │
|
1284
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1285
|
+
# # │ 20 ┆ 7 ┆ b │
|
1286
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1287
|
+
# # │ 30 ┆ 8 ┆ c │
|
1288
|
+
# # └───────┴─────┴─────┘
|
1289
|
+
def replace_at_idx(index, series)
  # Negative positions count backwards from the last column.
  index += columns.length if index < 0
  # Mutates the native frame in place; self is returned for chaining.
  _df.replace_at_idx(index, series._s)
  self
end
|
1296
|
+
|
1297
|
+
# Sort the DataFrame by column.
|
1298
|
+
#
|
1299
|
+
# @param by [String]
|
1300
|
+
# By which column to sort.
|
1301
|
+
# @param reverse [Boolean]
|
1302
|
+
# Reverse/descending sort.
|
1303
|
+
# @param nulls_last [Boolean]
|
1304
|
+
# Place null values last. Can only be used if sorted by a single column.
|
1305
|
+
#
|
1306
|
+
# @return [DataFrame]
|
1307
|
+
#
|
1308
|
+
# @example
|
1309
|
+
# df = Polars::DataFrame.new(
|
1310
|
+
# {
|
1311
|
+
# "foo" => [1, 2, 3],
|
1312
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1313
|
+
# "ham" => ["a", "b", "c"]
|
1314
|
+
# }
|
1315
|
+
# )
|
1316
|
+
# df.sort("foo", reverse: true)
|
1317
|
+
# # =>
|
1318
|
+
# # shape: (3, 3)
|
1319
|
+
# # ┌─────┬─────┬─────┐
|
1320
|
+
# # │ foo ┆ bar ┆ ham │
|
1321
|
+
# # │ --- ┆ --- ┆ --- │
|
1322
|
+
# # │ i64 ┆ f64 ┆ str │
|
1323
|
+
# # ╞═════╪═════╪═════╡
|
1324
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1325
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1326
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1327
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1328
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
1329
|
+
# # └─────┴─────┴─────┘
|
1330
|
+
#
|
1331
|
+
# @example Sort by multiple columns.
|
1332
|
+
# df.sort(
|
1333
|
+
# [Polars.col("foo"), Polars.col("bar")**2],
|
1334
|
+
# reverse: [true, false]
|
1335
|
+
# )
|
1336
|
+
# # =>
|
1337
|
+
# # shape: (3, 3)
|
1338
|
+
# # ┌─────┬─────┬─────┐
|
1339
|
+
# # │ foo ┆ bar ┆ ham │
|
1340
|
+
# # │ --- ┆ --- ┆ --- │
|
1341
|
+
# # │ i64 ┆ f64 ┆ str │
|
1342
|
+
# # ╞═════╪═════╪═════╡
|
1343
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1344
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1345
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1346
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1347
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
1348
|
+
# # └─────┴─────┴─────┘
|
1349
|
+
def sort(by, reverse: false, nulls_last: false)
  # A plain column name takes the fast native path; expressions or lists
  # of sort criteria are routed through the lazy engine.
  unless by.is_a?(Array) || by.is_a?(Expr)
    return _from_rbdf(_df.sort(by, reverse, nulls_last))
  end
  lazy
    .sort(by, reverse: reverse, nulls_last: nulls_last)
    .collect(no_optimization: true, string_cache: false)
end
|
1358
|
+
|
1359
|
+
# Check if DataFrame is equal to other.
|
1360
|
+
#
|
1361
|
+
# @param other [DataFrame]
|
1362
|
+
# DataFrame to compare with.
|
1363
|
+
# @param null_equal [Boolean]
|
1364
|
+
# Consider null values as equal.
|
1365
|
+
#
|
1366
|
+
# @return [Boolean]
|
1367
|
+
#
|
1368
|
+
# @example
|
1369
|
+
# df1 = Polars::DataFrame.new(
|
1370
|
+
# {
|
1371
|
+
# "foo" => [1, 2, 3],
|
1372
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1373
|
+
# "ham" => ["a", "b", "c"]
|
1374
|
+
# }
|
1375
|
+
# )
|
1376
|
+
# df2 = Polars::DataFrame.new(
|
1377
|
+
# {
|
1378
|
+
# "foo" => [3, 2, 1],
|
1379
|
+
# "bar" => [8.0, 7.0, 6.0],
|
1380
|
+
# "ham" => ["c", "b", "a"]
|
1381
|
+
# }
|
1382
|
+
# )
|
1383
|
+
# df1.frame_equal(df1)
|
1384
|
+
# # => true
|
1385
|
+
# df1.frame_equal(df2)
|
1386
|
+
# # => false
|
1387
|
+
def frame_equal(other, null_equal: true)
  # Comparison is delegated to the native layer on the wrapped frames;
  # with null_equal: true, null values compare equal to each other.
  _df.frame_equal(other._df, null_equal)
end
|
1390
|
+
|
1391
|
+
# Replace a column by a new Series.
|
1392
|
+
#
|
1393
|
+
# @param column [String]
|
1394
|
+
# Column to replace.
|
1395
|
+
# @param new_col [Series]
|
1396
|
+
# New column to insert.
|
1397
|
+
#
|
1398
|
+
# @return [DataFrame]
|
1399
|
+
#
|
1400
|
+
# @example
|
1401
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
1402
|
+
# s = Polars::Series.new([10, 20, 30])
|
1403
|
+
# df.replace("foo", s)
|
1404
|
+
# # =>
|
1405
|
+
# # shape: (3, 2)
|
1406
|
+
# # ┌─────┬─────┐
|
1407
|
+
# # │ foo ┆ bar │
|
1408
|
+
# # │ --- ┆ --- │
|
1409
|
+
# # │ i64 ┆ i64 │
|
1410
|
+
# # ╞═════╪═════╡
|
1411
|
+
# # │ 10 ┆ 4 │
|
1412
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1413
|
+
# # │ 20 ┆ 5 │
|
1414
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1415
|
+
# # │ 30 ┆ 6 │
|
1416
|
+
# # └─────┴─────┘
|
1417
|
+
def replace(column, new_col)
  # In-place operation: swaps the named column for `new_col` on the
  # native frame, then returns self to allow method chaining.
  _df.replace(column, new_col._s)
  self
end
|
1421
|
+
|
1422
|
+
# Get a slice of this DataFrame.
|
1423
|
+
#
|
1424
|
+
# @param offset [Integer]
|
1425
|
+
# Start index. Negative indexing is supported.
|
1426
|
+
# @param length [Integer, nil]
|
1427
|
+
# Length of the slice. If set to `nil`, all rows starting at the offset
|
1428
|
+
# will be selected.
|
1429
|
+
#
|
1430
|
+
# @return [DataFrame]
|
1431
|
+
#
|
1432
|
+
# @example
|
1433
|
+
# df = Polars::DataFrame.new(
|
1434
|
+
# {
|
1435
|
+
# "foo" => [1, 2, 3],
|
1436
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1437
|
+
# "ham" => ["a", "b", "c"]
|
1438
|
+
# }
|
1439
|
+
# )
|
1440
|
+
# df.slice(1, 2)
|
1441
|
+
# # =>
|
1442
|
+
# # shape: (2, 3)
|
1443
|
+
# # ┌─────┬─────┬─────┐
|
1444
|
+
# # │ foo ┆ bar ┆ ham │
|
1445
|
+
# # │ --- ┆ --- ┆ --- │
|
1446
|
+
# # │ i64 ┆ f64 ┆ str │
|
1447
|
+
# # ╞═════╪═════╪═════╡
|
1448
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1449
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1450
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1451
|
+
# # └─────┴─────┴─────┘
|
1452
|
+
def slice(offset, length = nil)
  # A negative length means "stop that many rows before the end";
  # translate it into an absolute row count for the native call.
  # (Integers are always truthy here, so `length &&` only filters nil.)
  length = height - offset + length if length && length < 0
  _from_rbdf(_df.slice(offset, length))
end
|
1458
|
+
|
1459
|
+
# Get the first `n` rows.
|
1460
|
+
#
|
1461
|
+
# Alias for {#head}.
|
1462
|
+
#
|
1463
|
+
# @param n [Integer]
|
1464
|
+
# Number of rows to return.
|
1465
|
+
#
|
1466
|
+
# @return [DataFrame]
|
1467
|
+
#
|
1468
|
+
# @example
|
1469
|
+
# df = Polars::DataFrame.new(
|
1470
|
+
# {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]}
|
1471
|
+
# )
|
1472
|
+
# df.limit(4)
|
1473
|
+
# # =>
|
1474
|
+
# # shape: (4, 2)
|
1475
|
+
# # ┌─────┬─────┐
|
1476
|
+
# # │ foo ┆ bar │
|
1477
|
+
# # │ --- ┆ --- │
|
1478
|
+
# # │ i64 ┆ str │
|
1479
|
+
# # ╞═════╪═════╡
|
1480
|
+
# # │ 1 ┆ a │
|
1481
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1482
|
+
# # │ 2 ┆ b │
|
1483
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1484
|
+
# # │ 3 ┆ c │
|
1485
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1486
|
+
# # │ 4 ┆ d │
|
1487
|
+
# # └─────┴─────┘
|
1488
|
+
def limit(n = 5)
  # Pure delegation: #limit is a SQL-flavored alias for #head.
  head(n)
end
|
1491
|
+
|
1492
|
+
# Get the first `n` rows.
|
1493
|
+
#
|
1494
|
+
# @param n [Integer]
|
1495
|
+
# Number of rows to return.
|
1496
|
+
#
|
1497
|
+
# @return [DataFrame]
|
1498
|
+
#
|
1499
|
+
# @example
|
1500
|
+
# df = Polars::DataFrame.new(
|
1501
|
+
# {
|
1502
|
+
# "foo" => [1, 2, 3, 4, 5],
|
1503
|
+
# "bar" => [6, 7, 8, 9, 10],
|
1504
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
1505
|
+
# }
|
1506
|
+
# )
|
1507
|
+
# df.head(3)
|
1508
|
+
# # =>
|
1509
|
+
# # shape: (3, 3)
|
1510
|
+
# # ┌─────┬─────┬─────┐
|
1511
|
+
# # │ foo ┆ bar ┆ ham │
|
1512
|
+
# # │ --- ┆ --- ┆ --- │
|
1513
|
+
# # │ i64 ┆ i64 ┆ str │
|
1514
|
+
# # ╞═════╪═════╪═════╡
|
1515
|
+
# # │ 1 ┆ 6 ┆ a │
|
1516
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1517
|
+
# # │ 2 ┆ 7 ┆ b │
|
1518
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1519
|
+
# # │ 3 ┆ 8 ┆ c │
|
1520
|
+
# # └─────┴─────┴─────┘
|
1521
|
+
def head(n = 5)
  # Take the first n rows in the native layer and rewrap the result.
  first_rows = _df.head(n)
  _from_rbdf(first_rows)
end
|
1524
|
+
|
1525
|
+
# Get the last `n` rows.
|
1526
|
+
#
|
1527
|
+
# @param n [Integer]
|
1528
|
+
# Number of rows to return.
|
1529
|
+
#
|
1530
|
+
# @return [DataFrame]
|
1531
|
+
#
|
1532
|
+
# @example
|
1533
|
+
# df = Polars::DataFrame.new(
|
1534
|
+
# {
|
1535
|
+
# "foo" => [1, 2, 3, 4, 5],
|
1536
|
+
# "bar" => [6, 7, 8, 9, 10],
|
1537
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
1538
|
+
# }
|
1539
|
+
# )
|
1540
|
+
# df.tail(3)
|
1541
|
+
# # =>
|
1542
|
+
# # shape: (3, 3)
|
1543
|
+
# # ┌─────┬─────┬─────┐
|
1544
|
+
# # │ foo ┆ bar ┆ ham │
|
1545
|
+
# # │ --- ┆ --- ┆ --- │
|
1546
|
+
# # │ i64 ┆ i64 ┆ str │
|
1547
|
+
# # ╞═════╪═════╪═════╡
|
1548
|
+
# # │ 3 ┆ 8 ┆ c │
|
1549
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1550
|
+
# # │ 4 ┆ 9 ┆ d │
|
1551
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1552
|
+
# # │ 5 ┆ 10 ┆ e │
|
1553
|
+
# # └─────┴─────┴─────┘
|
1554
|
+
def tail(n = 5)
  # Take the last n rows in the native layer and rewrap the result.
  last_rows = _df.tail(n)
  _from_rbdf(last_rows)
end
|
1557
|
+
|
1558
|
+
# Return a new DataFrame where the null values are dropped.
|
1559
|
+
#
|
1560
|
+
# @param subset [Object]
|
1561
|
+
# Subset of column(s) on which `drop_nulls` will be applied.
|
1562
|
+
#
|
1563
|
+
# @return [DataFrame]
|
1564
|
+
#
|
1565
|
+
# @example
|
1566
|
+
# df = Polars::DataFrame.new(
|
1567
|
+
# {
|
1568
|
+
# "foo" => [1, 2, 3],
|
1569
|
+
# "bar" => [6, nil, 8],
|
1570
|
+
# "ham" => ["a", "b", "c"]
|
1571
|
+
# }
|
1572
|
+
# )
|
1573
|
+
# df.drop_nulls
|
1574
|
+
# # =>
|
1575
|
+
# # shape: (2, 3)
|
1576
|
+
# # ┌─────┬─────┬─────┐
|
1577
|
+
# # │ foo ┆ bar ┆ ham │
|
1578
|
+
# # │ --- ┆ --- ┆ --- │
|
1579
|
+
# # │ i64 ┆ i64 ┆ str │
|
1580
|
+
# # ╞═════╪═════╪═════╡
|
1581
|
+
# # │ 1 ┆ 6 ┆ a │
|
1582
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1583
|
+
# # │ 3 ┆ 8 ┆ c │
|
1584
|
+
# # └─────┴─────┴─────┘
|
1585
|
+
def drop_nulls(subset: nil)
  # Normalize a single column name to a one-element list; nil is passed
  # through unchanged so the native default (all columns) applies.
  subset = [subset] if subset.is_a?(String)
  _from_rbdf(_df.drop_nulls(subset))
end
|
1591
|
+
|
1592
|
+
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
1593
|
+
#
|
1594
|
+
# @param func [Object]
|
1595
|
+
# Callable; will receive the frame as the first parameter,
|
1596
|
+
# followed by any given args/kwargs.
|
1597
|
+
# @param args [Object]
|
1598
|
+
# Arguments to pass to the UDF.
|
1599
|
+
# @param kwargs [Object]
|
1600
|
+
# Keyword arguments to pass to the UDF.
|
1601
|
+
#
|
1602
|
+
# @return [Object]
|
1603
|
+
#
|
1604
|
+
# @note
|
1605
|
+
# It is recommended to use LazyFrame when piping operations, in order
|
1606
|
+
# to fully take advantage of query optimization and parallelization.
|
1607
|
+
# See {#lazy}.
|
1608
|
+
#
|
1609
|
+
# @example
|
1610
|
+
# cast_str_to_int = lambda do |data, col_name:|
|
1611
|
+
# data.with_column(Polars.col(col_name).cast(:i64))
|
1612
|
+
# end
|
1613
|
+
#
|
1614
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => ["10", "20", "30", "40"]})
|
1615
|
+
# df.pipe(cast_str_to_int, col_name: "b")
|
1616
|
+
# # =>
|
1617
|
+
# # shape: (4, 2)
|
1618
|
+
# # ┌─────┬─────┐
|
1619
|
+
# # │ a ┆ b │
|
1620
|
+
# # │ --- ┆ --- │
|
1621
|
+
# # │ i64 ┆ i64 │
|
1622
|
+
# # ╞═════╪═════╡
|
1623
|
+
# # │ 1 ┆ 10 │
|
1624
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1625
|
+
# # │ 2 ┆ 20 │
|
1626
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1627
|
+
# # │ 3 ┆ 30 │
|
1628
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1629
|
+
# # │ 4 ┆ 40 │
|
1630
|
+
# # └─────┴─────┘
|
1631
|
+
def pipe(func, *args, **kwargs, &block)
  # Hand the frame itself to the callable as the first argument,
  # forwarding all remaining positional/keyword arguments and the block.
  func.(self, *args, **kwargs, &block)
end
|
1634
|
+
|
1635
|
+
# Add a column at index 0 that counts the rows.
|
1636
|
+
#
|
1637
|
+
# @param name [String]
|
1638
|
+
# Name of the column to add.
|
1639
|
+
# @param offset [Integer]
|
1640
|
+
# Start the row count at this offset.
|
1641
|
+
#
|
1642
|
+
# @return [DataFrame]
|
1643
|
+
#
|
1644
|
+
# @example
|
1645
|
+
# df = Polars::DataFrame.new(
|
1646
|
+
# {
|
1647
|
+
# "a" => [1, 3, 5],
|
1648
|
+
# "b" => [2, 4, 6]
|
1649
|
+
# }
|
1650
|
+
# )
|
1651
|
+
# df.with_row_count
|
1652
|
+
# # =>
|
1653
|
+
# # shape: (3, 3)
|
1654
|
+
# # ┌────────┬─────┬─────┐
|
1655
|
+
# # │ row_nr ┆ a ┆ b │
|
1656
|
+
# # │ --- ┆ --- ┆ --- │
|
1657
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
1658
|
+
# # ╞════════╪═════╪═════╡
|
1659
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
1660
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1661
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
1662
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1663
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
1664
|
+
# # └────────┴─────┴─────┘
|
1665
|
+
def with_row_count(name: "row_nr", offset: 0)
  # The native layer prepends a u32 counter column starting at `offset`.
  counted = _df.with_row_count(name, offset)
  _from_rbdf(counted)
end
|
1668
|
+
|
1669
|
+
# Start a groupby operation.
|
1670
|
+
#
|
1671
|
+
# @param by [Object]
|
1672
|
+
# Column(s) to group by.
|
1673
|
+
# @param maintain_order [Boolean]
|
1674
|
+
# Make sure that the order of the groups remain consistent. This is more
|
1675
|
+
# expensive than a default groupby. Note that this only works in expression
|
1676
|
+
# aggregations.
|
1677
|
+
#
|
1678
|
+
# @return [GroupBy]
|
1679
|
+
#
|
1680
|
+
# @example
|
1681
|
+
# df = Polars::DataFrame.new(
|
1682
|
+
# {
|
1683
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
1684
|
+
# "b" => [1, 2, 3, 4, 5, 6],
|
1685
|
+
# "c" => [6, 5, 4, 3, 2, 1]
|
1686
|
+
# }
|
1687
|
+
# )
|
1688
|
+
# df.groupby("a").agg(Polars.col("b").sum).sort("a")
|
1689
|
+
# # =>
|
1690
|
+
# # shape: (3, 2)
|
1691
|
+
# # ┌─────┬─────┐
|
1692
|
+
# # │ a ┆ b │
|
1693
|
+
# # │ --- ┆ --- │
|
1694
|
+
# # │ str ┆ i64 │
|
1695
|
+
# # ╞═════╪═════╡
|
1696
|
+
# # │ a ┆ 4 │
|
1697
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1698
|
+
# # │ b ┆ 11 │
|
1699
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1700
|
+
# # │ c ┆ 6 │
|
1701
|
+
# # └─────┴─────┘
|
1702
|
+
def groupby(by, maintain_order: false)
  # Reject non-boolean flags early so the error points at the caller.
  unless Utils.bool?(maintain_order)
    raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
  end
  # Normalize a single column name into a list of grouping keys.
  by = [by] if by.is_a?(String)
  GroupBy.new(_df, by, self.class, maintain_order: maintain_order)
end
|
1716
|
+
|
1717
|
+
# Create rolling groups based on a time column.
|
1718
|
+
#
|
1719
|
+
# Also works for index values of type `:i32` or `:i64`.
|
1720
|
+
#
|
1721
|
+
# Different from a `dynamic_groupby` the windows are now determined by the
|
1722
|
+
# individual values and are not of constant intervals. For constant intervals use
|
1723
|
+
# *groupby_dynamic*
|
1724
|
+
#
|
1725
|
+
# The `period` and `offset` arguments are created either from a timedelta, or
|
1726
|
+
# by using the following string language:
|
1727
|
+
#
|
1728
|
+
# - 1ns (1 nanosecond)
|
1729
|
+
# - 1us (1 microsecond)
|
1730
|
+
# - 1ms (1 millisecond)
|
1731
|
+
# - 1s (1 second)
|
1732
|
+
# - 1m (1 minute)
|
1733
|
+
# - 1h (1 hour)
|
1734
|
+
# - 1d (1 day)
|
1735
|
+
# - 1w (1 week)
|
1736
|
+
# - 1mo (1 calendar month)
|
1737
|
+
# - 1y (1 calendar year)
|
1738
|
+
# - 1i (1 index count)
|
1739
|
+
#
|
1740
|
+
# Or combine them:
|
1741
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1742
|
+
#
|
1743
|
+
# In case of a groupby_rolling on an integer column, the windows are defined by:
|
1744
|
+
#
|
1745
|
+
# - **"1i" # length 1**
|
1746
|
+
# - **"10i" # length 10**
|
1747
|
+
#
|
1748
|
+
# @param index_column [Object]
|
1749
|
+
# Column used to group based on the time window.
|
1750
|
+
# Often to type Date/Datetime
|
1751
|
+
# This column must be sorted in ascending order. If not the output will not
|
1752
|
+
# make sense.
|
1753
|
+
#
|
1754
|
+
# In case of a rolling groupby on indices, dtype needs to be one of
|
1755
|
+
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1756
|
+
# performance matters use an `:i64` column.
|
1757
|
+
# @param period [Object]
|
1758
|
+
# Length of the window.
|
1759
|
+
# @param offset [Object]
|
1760
|
+
# Offset of the window. Default is -period.
|
1761
|
+
# @param closed ["right", "left", "both", "none"]
|
1762
|
+
# Define whether the temporal window interval is closed or not.
|
1763
|
+
# @param by [Object]
|
1764
|
+
# Also group by this column/these columns.
|
1765
|
+
#
|
1766
|
+
# @return [RollingGroupBy]
|
1767
|
+
#
|
1768
|
+
# @example
|
1769
|
+
# dates = [
|
1770
|
+
# "2020-01-01 13:45:48",
|
1771
|
+
# "2020-01-01 16:42:13",
|
1772
|
+
# "2020-01-01 16:45:09",
|
1773
|
+
# "2020-01-02 18:12:48",
|
1774
|
+
# "2020-01-03 19:45:32",
|
1775
|
+
# "2020-01-08 23:16:43"
|
1776
|
+
# ]
|
1777
|
+
# df = Polars::DataFrame.new({"dt" => dates, "a" => [3, 7, 5, 9, 2, 1]}).with_column(
|
1778
|
+
# Polars.col("dt").str.strptime(:datetime)
|
1779
|
+
# )
|
1780
|
+
# df.groupby_rolling(index_column: "dt", period: "2d").agg(
|
1781
|
+
# [
|
1782
|
+
# Polars.sum("a").alias("sum_a"),
|
1783
|
+
# Polars.min("a").alias("min_a"),
|
1784
|
+
# Polars.max("a").alias("max_a")
|
1785
|
+
# ]
|
1786
|
+
# )
|
1787
|
+
# # =>
|
1788
|
+
# # shape: (6, 4)
|
1789
|
+
# # ┌─────────────────────┬───────┬───────┬───────┐
|
1790
|
+
# # │ dt ┆ sum_a ┆ min_a ┆ max_a │
|
1791
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1792
|
+
# # │ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
|
1793
|
+
# # ╞═════════════════════╪═══════╪═══════╪═══════╡
|
1794
|
+
# # │ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
|
1795
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1796
|
+
# # │ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
|
1797
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1798
|
+
# # │ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
|
1799
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1800
|
+
# # │ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
|
1801
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1802
|
+
# # │ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
|
1803
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1804
|
+
# # │ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
|
1805
|
+
# # └─────────────────────┴───────┴───────┴───────┘
|
1806
|
+
# Create rolling groups based on a time column (see the YARD doc above for
# the full window-string language and examples).
#
# Returns a proxy object; the actual aggregation runs when `agg` is called
# on the returned +RollingGroupBy+.
def groupby_rolling(index_column:, period:, offset: nil, closed: "right", by: nil)
  RollingGroupBy.new(self, index_column, period, offset, closed, by)
end
|
1815
|
+
|
1816
|
+
# Group based on a time value (or index value of type `:i32`, `:i64`).
|
1817
|
+
#
|
1818
|
+
# Time windows are calculated and rows are assigned to windows. Different from a
|
1819
|
+
# normal groupby is that a row can be member of multiple groups. The time/index
|
1820
|
+
# window could be seen as a rolling window, with a window size determined by
|
1821
|
+
# dates/times/values instead of slots in the DataFrame.
|
1822
|
+
#
|
1823
|
+
# A window is defined by:
|
1824
|
+
#
|
1825
|
+
# - every: interval of the window
|
1826
|
+
# - period: length of the window
|
1827
|
+
# - offset: offset of the window
|
1828
|
+
#
|
1829
|
+
# The `every`, `period` and `offset` arguments are created with
|
1830
|
+
# the following string language:
|
1831
|
+
#
|
1832
|
+
# - 1ns (1 nanosecond)
|
1833
|
+
# - 1us (1 microsecond)
|
1834
|
+
# - 1ms (1 millisecond)
|
1835
|
+
# - 1s (1 second)
|
1836
|
+
# - 1m (1 minute)
|
1837
|
+
# - 1h (1 hour)
|
1838
|
+
# - 1d (1 day)
|
1839
|
+
# - 1w (1 week)
|
1840
|
+
# - 1mo (1 calendar month)
|
1841
|
+
# - 1y (1 calendar year)
|
1842
|
+
# - 1i (1 index count)
|
1843
|
+
#
|
1844
|
+
# Or combine them:
|
1845
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
1846
|
+
#
|
1847
|
+
# In case of a groupby_dynamic on an integer column, the windows are defined by:
|
1848
|
+
#
|
1849
|
+
# - "1i" # length 1
|
1850
|
+
# - "10i" # length 10
|
1851
|
+
#
|
1852
|
+
# @param index_column
|
1853
|
+
# Column used to group based on the time window.
|
1854
|
+
# Often to type Date/Datetime
|
1855
|
+
# This column must be sorted in ascending order. If not the output will not
|
1856
|
+
# make sense.
|
1857
|
+
#
|
1858
|
+
# In case of a dynamic groupby on indices, dtype needs to be one of
|
1859
|
+
# `:i32`, `:i64`. Note that `:i32` gets temporarily cast to `:i64`, so if
|
1860
|
+
# performance matters use an `:i64` column.
|
1861
|
+
# @param every
|
1862
|
+
# Interval of the window.
|
1863
|
+
# @param period
|
1864
|
+
# Length of the window, if None it is equal to 'every'.
|
1865
|
+
# @param offset
|
1866
|
+
# Offset of the window if None and period is None it will be equal to negative
|
1867
|
+
# `every`.
|
1868
|
+
# @param truncate
|
1869
|
+
# Truncate the time value to the window lower bound.
|
1870
|
+
# @param include_boundaries
|
1871
|
+
# Add the lower and upper bound of the window to the "_lower_bound" and
|
1872
|
+
# "_upper_bound" columns. This will impact performance because it's harder to
|
1873
|
+
# parallelize
|
1874
|
+
# @param closed ["right", "left", "both", "none"]
|
1875
|
+
# Define whether the temporal window interval is closed or not.
|
1876
|
+
# @param by
|
1877
|
+
# Also group by this column/these columns
|
1878
|
+
#
|
1879
|
+
# @return [DataFrame]
|
1880
|
+
#
|
1881
|
+
# @example
|
1882
|
+
# df = Polars::DataFrame.new(
|
1883
|
+
# {
|
1884
|
+
# "time" => Polars.date_range(
|
1885
|
+
# DateTime.new(2021, 12, 16),
|
1886
|
+
# DateTime.new(2021, 12, 16, 3),
|
1887
|
+
# "30m"
|
1888
|
+
# ),
|
1889
|
+
# "n" => 0..6
|
1890
|
+
# }
|
1891
|
+
# )
|
1892
|
+
# # =>
|
1893
|
+
# # shape: (7, 2)
|
1894
|
+
# # ┌─────────────────────┬─────┐
|
1895
|
+
# # │ time ┆ n │
|
1896
|
+
# # │ --- ┆ --- │
|
1897
|
+
# # │ datetime[μs] ┆ i64 │
|
1898
|
+
# # ╞═════════════════════╪═════╡
|
1899
|
+
# # │ 2021-12-16 00:00:00 ┆ 0 │
|
1900
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1901
|
+
# # │ 2021-12-16 00:30:00 ┆ 1 │
|
1902
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1903
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 │
|
1904
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1905
|
+
# # │ 2021-12-16 01:30:00 ┆ 3 │
|
1906
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1907
|
+
# # │ 2021-12-16 02:00:00 ┆ 4 │
|
1908
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1909
|
+
# # │ 2021-12-16 02:30:00 ┆ 5 │
|
1910
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1911
|
+
# # │ 2021-12-16 03:00:00 ┆ 6 │
|
1912
|
+
# # └─────────────────────┴─────┘
|
1913
|
+
#
|
1914
|
+
# @example Group by windows of 1 hour starting at 2021-12-16 00:00:00.
|
1915
|
+
# df.groupby_dynamic("time", every: "1h", closed: "right").agg(
|
1916
|
+
# [
|
1917
|
+
# Polars.col("time").min.alias("time_min"),
|
1918
|
+
# Polars.col("time").max.alias("time_max")
|
1919
|
+
# ]
|
1920
|
+
# )
|
1921
|
+
# # =>
|
1922
|
+
# # shape: (4, 3)
|
1923
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┐
|
1924
|
+
# # │ time ┆ time_min ┆ time_max │
|
1925
|
+
# # │ --- ┆ --- ┆ --- │
|
1926
|
+
# # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │
|
1927
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╡
|
1928
|
+
# # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 00:00:00 │
|
1929
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1930
|
+
# # │ 2021-12-16 00:00:00 ┆ 2021-12-16 00:30:00 ┆ 2021-12-16 01:00:00 │
|
1931
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1932
|
+
# # │ 2021-12-16 01:00:00 ┆ 2021-12-16 01:30:00 ┆ 2021-12-16 02:00:00 │
|
1933
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1934
|
+
# # │ 2021-12-16 02:00:00 ┆ 2021-12-16 02:30:00 ┆ 2021-12-16 03:00:00 │
|
1935
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┘
|
1936
|
+
#
|
1937
|
+
# @example The window boundaries can also be added to the aggregation result.
|
1938
|
+
# df.groupby_dynamic(
|
1939
|
+
# "time", every: "1h", include_boundaries: true, closed: "right"
|
1940
|
+
# ).agg([Polars.col("time").count.alias("time_count")])
|
1941
|
+
# # =>
|
1942
|
+
# # shape: (4, 4)
|
1943
|
+
# # ┌─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
|
1944
|
+
# # │ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
|
1945
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1946
|
+
# # │ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
|
1947
|
+
# # ╞═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
|
1948
|
+
# # │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
|
1949
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1950
|
+
# # │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 2 │
|
1951
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1952
|
+
# # │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
|
1953
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1954
|
+
# # │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
|
1955
|
+
# # └─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
1956
|
+
#
|
1957
|
+
# @example When closed="left", should not include right end of interval.
|
1958
|
+
# df.groupby_dynamic("time", every: "1h", closed: "left").agg(
|
1959
|
+
# [
|
1960
|
+
# Polars.col("time").count.alias("time_count"),
|
1961
|
+
# Polars.col("time").list.alias("time_agg_list")
|
1962
|
+
# ]
|
1963
|
+
# )
|
1964
|
+
# # =>
|
1965
|
+
# # shape: (4, 3)
|
1966
|
+
# # ┌─────────────────────┬────────────┬─────────────────────────────────────┐
|
1967
|
+
# # │ time ┆ time_count ┆ time_agg_list │
|
1968
|
+
# # │ --- ┆ --- ┆ --- │
|
1969
|
+
# # │ datetime[μs] ┆ u32 ┆ list[datetime[μs]] │
|
1970
|
+
# # ╞═════════════════════╪════════════╪═════════════════════════════════════╡
|
1971
|
+
# # │ 2021-12-16 00:00:00 ┆ 2 ┆ [2021-12-16 00:00:00, 2021-12-16... │
|
1972
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1973
|
+
# # │ 2021-12-16 01:00:00 ┆ 2 ┆ [2021-12-16 01:00:00, 2021-12-16... │
|
1974
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1975
|
+
# # │ 2021-12-16 02:00:00 ┆ 2 ┆ [2021-12-16 02:00:00, 2021-12-16... │
|
1976
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1977
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 ┆ [2021-12-16 03:00:00] │
|
1978
|
+
# # └─────────────────────┴────────────┴─────────────────────────────────────┘
|
1979
|
+
#
|
1980
|
+
# @example When closed="both" the time values at the window boundaries belong to 2 groups.
|
1981
|
+
# df.groupby_dynamic("time", every: "1h", closed: "both").agg(
|
1982
|
+
# [Polars.col("time").count.alias("time_count")]
|
1983
|
+
# )
|
1984
|
+
# # =>
|
1985
|
+
# # shape: (5, 2)
|
1986
|
+
# # ┌─────────────────────┬────────────┐
|
1987
|
+
# # │ time ┆ time_count │
|
1988
|
+
# # │ --- ┆ --- │
|
1989
|
+
# # │ datetime[μs] ┆ u32 │
|
1990
|
+
# # ╞═════════════════════╪════════════╡
|
1991
|
+
# # │ 2021-12-15 23:00:00 ┆ 1 │
|
1992
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1993
|
+
# # │ 2021-12-16 00:00:00 ┆ 3 │
|
1994
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1995
|
+
# # │ 2021-12-16 01:00:00 ┆ 3 │
|
1996
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1997
|
+
# # │ 2021-12-16 02:00:00 ┆ 3 │
|
1998
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
1999
|
+
# # │ 2021-12-16 03:00:00 ┆ 1 │
|
2000
|
+
# # └─────────────────────┴────────────┘
|
2001
|
+
#
|
2002
|
+
# @example Dynamic groupbys can also be combined with grouping on normal keys.
|
2003
|
+
# df = Polars::DataFrame.new(
|
2004
|
+
# {
|
2005
|
+
# "time" => Polars.date_range(
|
2006
|
+
# DateTime.new(2021, 12, 16),
|
2007
|
+
# DateTime.new(2021, 12, 16, 3),
|
2008
|
+
# "30m"
|
2009
|
+
# ),
|
2010
|
+
# "groups" => ["a", "a", "a", "b", "b", "a", "a"]
|
2011
|
+
# }
|
2012
|
+
# )
|
2013
|
+
# df.groupby_dynamic(
|
2014
|
+
# "time",
|
2015
|
+
# every: "1h",
|
2016
|
+
# closed: "both",
|
2017
|
+
# by: "groups",
|
2018
|
+
# include_boundaries: true
|
2019
|
+
# ).agg([Polars.col("time").count.alias("time_count")])
|
2020
|
+
# # =>
|
2021
|
+
# # shape: (7, 5)
|
2022
|
+
# # ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────┐
|
2023
|
+
# # │ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ time_count │
|
2024
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2025
|
+
# # │ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ u32 │
|
2026
|
+
# # ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪════════════╡
|
2027
|
+
# # │ a ┆ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 1 │
|
2028
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2029
|
+
# # │ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 3 │
|
2030
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2031
|
+
# # │ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 1 │
|
2032
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2033
|
+
# # │ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 2 │
|
2034
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2035
|
+
# # │ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ 1 │
|
2036
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2037
|
+
# # │ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 2 │
|
2038
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2039
|
+
# # │ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 1 │
|
2040
|
+
# # └────────┴─────────────────────┴─────────────────────┴─────────────────────┴────────────┘
|
2041
|
+
#
|
2042
|
+
# @example Dynamic groupby on an index column.
|
2043
|
+
# df = Polars::DataFrame.new(
|
2044
|
+
# {
|
2045
|
+
# "idx" => Polars.arange(0, 6, eager: true),
|
2046
|
+
# "A" => ["A", "A", "B", "B", "B", "C"]
|
2047
|
+
# }
|
2048
|
+
# )
|
2049
|
+
# df.groupby_dynamic(
|
2050
|
+
# "idx",
|
2051
|
+
# every: "2i",
|
2052
|
+
# period: "3i",
|
2053
|
+
# include_boundaries: true,
|
2054
|
+
# closed: "right"
|
2055
|
+
# ).agg(Polars.col("A").list.alias("A_agg_list"))
|
2056
|
+
# # =>
|
2057
|
+
# # shape: (3, 4)
|
2058
|
+
# # ┌─────────────────┬─────────────────┬─────┬─────────────────┐
|
2059
|
+
# # │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
|
2060
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
2061
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ list[str] │
|
2062
|
+
# # ╞═════════════════╪═════════════════╪═════╪═════════════════╡
|
2063
|
+
# # │ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
|
2064
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2065
|
+
# # │ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
|
2066
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
2067
|
+
# # │ 4 ┆ 7 ┆ 4 ┆ ["C"] │
|
2068
|
+
# # └─────────────────┴─────────────────┴─────┴─────────────────┘
|
2069
|
+
# Group based on a time value (or index value of type `:i32`/`:i64`).
#
# Returns a proxy object; the aggregation runs when `agg` is called on the
# returned +DynamicGroupBy+. Argument order in the constructor call below is
# positional and must match the DynamicGroupBy definition.
def groupby_dynamic(
  index_column,
  every:,
  period: nil,
  offset: nil,
  truncate: true,
  include_boundaries: false,
  closed: "left",
  by: nil,
  start_by: "window"
)
  DynamicGroupBy.new(
    self, index_column, every, period, offset,
    truncate, include_boundaries, closed, by, start_by
  )
end
|
2093
|
+
|
2094
|
+
# Upsample a DataFrame at a regular frequency.
|
2095
|
+
#
|
2096
|
+
# @param time_column [Object]
|
2097
|
+
# time column will be used to determine a date_range.
|
2098
|
+
# Note that this column has to be sorted for the output to make sense.
|
2099
|
+
# @param every [String]
|
2100
|
+
# interval will start 'every' duration
|
2101
|
+
# @param offset [String]
|
2102
|
+
# change the start of the date_range by this offset.
|
2103
|
+
# @param by [Object]
|
2104
|
+
# First group by these columns and then upsample for every group
|
2105
|
+
# @param maintain_order [Boolean]
|
2106
|
+
# Keep the ordering predictable. This is slower.
|
2107
|
+
#
|
2108
|
+
# The `every` and `offset` arguments are created with
|
2109
|
+
# the following string language:
|
2110
|
+
#
|
2111
|
+
# - 1ns (1 nanosecond)
|
2112
|
+
# - 1us (1 microsecond)
|
2113
|
+
# - 1ms (1 millisecond)
|
2114
|
+
# - 1s (1 second)
|
2115
|
+
# - 1m (1 minute)
|
2116
|
+
# - 1h (1 hour)
|
2117
|
+
# - 1d (1 day)
|
2118
|
+
# - 1w (1 week)
|
2119
|
+
# - 1mo (1 calendar month)
|
2120
|
+
# - 1y (1 calendar year)
|
2121
|
+
# - 1i (1 index count)
|
2122
|
+
#
|
2123
|
+
# Or combine them:
|
2124
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
2125
|
+
#
|
2126
|
+
# @return [DataFrame]
|
2127
|
+
#
|
2128
|
+
# @example Upsample a DataFrame by a certain interval.
|
2129
|
+
# df = Polars::DataFrame.new(
|
2130
|
+
# {
|
2131
|
+
# "time" => [
|
2132
|
+
# DateTime.new(2021, 2, 1),
|
2133
|
+
# DateTime.new(2021, 4, 1),
|
2134
|
+
# DateTime.new(2021, 5, 1),
|
2135
|
+
# DateTime.new(2021, 6, 1)
|
2136
|
+
# ],
|
2137
|
+
# "groups" => ["A", "B", "A", "B"],
|
2138
|
+
# "values" => [0, 1, 2, 3]
|
2139
|
+
# }
|
2140
|
+
# )
|
2141
|
+
# df.upsample(
|
2142
|
+
# time_column: "time", every: "1mo", by: "groups", maintain_order: true
|
2143
|
+
# ).select(Polars.all.forward_fill)
|
2144
|
+
# # =>
|
2145
|
+
# # shape: (7, 3)
|
2146
|
+
# # ┌─────────────────────┬────────┬────────┐
|
2147
|
+
# # │ time ┆ groups ┆ values │
|
2148
|
+
# # │ --- ┆ --- ┆ --- │
|
2149
|
+
# # │ datetime[ns] ┆ str ┆ i64 │
|
2150
|
+
# # ╞═════════════════════╪════════╪════════╡
|
2151
|
+
# # │ 2021-02-01 00:00:00 ┆ A ┆ 0 │
|
2152
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2153
|
+
# # │ 2021-03-01 00:00:00 ┆ A ┆ 0 │
|
2154
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2155
|
+
# # │ 2021-04-01 00:00:00 ┆ A ┆ 0 │
|
2156
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2157
|
+
# # │ 2021-05-01 00:00:00 ┆ A ┆ 2 │
|
2158
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2159
|
+
# # │ 2021-04-01 00:00:00 ┆ B ┆ 1 │
|
2160
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2161
|
+
# # │ 2021-05-01 00:00:00 ┆ B ┆ 1 │
|
2162
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
2163
|
+
# # │ 2021-06-01 00:00:00 ┆ B ┆ 3 │
|
2164
|
+
# # └─────────────────────┴────────┴────────┘
|
2165
|
+
# Upsample a DataFrame at a regular frequency.
#
# @param time_column [Object]
#   Time column used to determine the date range. Must be sorted for the
#   output to make sense.
# @param every [String]
#   Interval at which a new row will start.
# @param offset [String]
#   Shift the start of the date range by this duration (defaults to "0ns").
# @param by [Object]
#   First group by these columns, then upsample within each group.
# @param maintain_order [Boolean]
#   Keep the ordering predictable. This is slower.
#
# @return [DataFrame]
def upsample(
  time_column:,
  every:,
  offset: nil,
  by: nil,
  maintain_order: false
)
  # Normalize arguments: the native call expects an array of group columns
  # and a concrete offset duration.
  by ||= []
  by = [by] if by.is_a?(String)
  offset ||= "0ns"

  every = Utils._timedelta_to_pl_duration(every)
  offset = Utils._timedelta_to_pl_duration(offset)

  _from_rbdf(
    _df.upsample(by, time_column, every, offset, maintain_order)
  )
end
|
2189
|
+
|
2190
|
+
# Perform an asof join.
|
2191
|
+
#
|
2192
|
+
# This is similar to a left-join except that we match on nearest key rather than
|
2193
|
+
# equal keys.
|
2194
|
+
#
|
2195
|
+
# Both DataFrames must be sorted by the asof_join key.
|
2196
|
+
#
|
2197
|
+
# For each row in the left DataFrame:
|
2198
|
+
#
|
2199
|
+
# - A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
|
2200
|
+
# - A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
|
2201
|
+
#
|
2202
|
+
# The default is "backward".
|
2203
|
+
#
|
2204
|
+
# @param other [DataFrame]
|
2205
|
+
# DataFrame to join with.
|
2206
|
+
# @param left_on [String]
|
2207
|
+
# Join column of the left DataFrame.
|
2208
|
+
# @param right_on [String]
|
2209
|
+
# Join column of the right DataFrame.
|
2210
|
+
# @param on [String]
|
2211
|
+
# Join column of both DataFrames. If set, `left_on` and `right_on` should be
|
2212
|
+
# None.
|
2213
|
+
# @param by [Object]
|
2214
|
+
# join on these columns before doing asof join
|
2215
|
+
# @param by_left [Object]
|
2216
|
+
# join on these columns before doing asof join
|
2217
|
+
# @param by_right [Object]
|
2218
|
+
# join on these columns before doing asof join
|
2219
|
+
# @param strategy ["backward", "forward"]
|
2220
|
+
# Join strategy.
|
2221
|
+
# @param suffix [String]
|
2222
|
+
# Suffix to append to columns with a duplicate name.
|
2223
|
+
# @param tolerance [Object]
|
2224
|
+
# Numeric tolerance. By setting this the join will only be done if the near
|
2225
|
+
# keys are within this distance. If an asof join is done on columns of dtype
|
2226
|
+
# "Date", "Datetime", "Duration" or "Time" you use the following string
|
2227
|
+
# language:
|
2228
|
+
#
|
2229
|
+
# - 1ns (1 nanosecond)
|
2230
|
+
# - 1us (1 microsecond)
|
2231
|
+
# - 1ms (1 millisecond)
|
2232
|
+
# - 1s (1 second)
|
2233
|
+
# - 1m (1 minute)
|
2234
|
+
# - 1h (1 hour)
|
2235
|
+
# - 1d (1 day)
|
2236
|
+
# - 1w (1 week)
|
2237
|
+
# - 1mo (1 calendar month)
|
2238
|
+
# - 1y (1 calendar year)
|
2239
|
+
# - 1i (1 index count)
|
2240
|
+
#
|
2241
|
+
# Or combine them:
|
2242
|
+
# "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
|
2243
|
+
#
|
2244
|
+
# @param allow_parallel [Boolean]
|
2245
|
+
# Allow the physical plan to optionally evaluate the computation of both
|
2246
|
+
# DataFrames up to the join in parallel.
|
2247
|
+
# @param force_parallel [Boolean]
|
2248
|
+
# Force the physical plan to evaluate the computation of both DataFrames up to
|
2249
|
+
# the join in parallel.
|
2250
|
+
#
|
2251
|
+
# @return [DataFrame]
|
2252
|
+
#
|
2253
|
+
# @example
|
2254
|
+
# gdp = Polars::DataFrame.new(
|
2255
|
+
# {
|
2256
|
+
# "date" => [
|
2257
|
+
# DateTime.new(2016, 1, 1),
|
2258
|
+
# DateTime.new(2017, 1, 1),
|
2259
|
+
# DateTime.new(2018, 1, 1),
|
2260
|
+
# DateTime.new(2019, 1, 1),
|
2261
|
+
# ], # note record date: Jan 1st (sorted!)
|
2262
|
+
# "gdp" => [4164, 4411, 4566, 4696]
|
2263
|
+
# }
|
2264
|
+
# )
|
2265
|
+
# population = Polars::DataFrame.new(
|
2266
|
+
# {
|
2267
|
+
# "date" => [
|
2268
|
+
# DateTime.new(2016, 5, 12),
|
2269
|
+
# DateTime.new(2017, 5, 12),
|
2270
|
+
# DateTime.new(2018, 5, 12),
|
2271
|
+
# DateTime.new(2019, 5, 12),
|
2272
|
+
# ], # note record date: May 12th (sorted!)
|
2273
|
+
# "population" => [82.19, 82.66, 83.12, 83.52]
|
2274
|
+
# }
|
2275
|
+
# )
|
2276
|
+
# population.join_asof(
|
2277
|
+
# gdp, left_on: "date", right_on: "date", strategy: "backward"
|
2278
|
+
# )
|
2279
|
+
# # =>
|
2280
|
+
# # shape: (4, 3)
|
2281
|
+
# # ┌─────────────────────┬────────────┬──────┐
|
2282
|
+
# # │ date ┆ population ┆ gdp │
|
2283
|
+
# # │ --- ┆ --- ┆ --- │
|
2284
|
+
# # │ datetime[ns] ┆ f64 ┆ i64 │
|
2285
|
+
# # ╞═════════════════════╪════════════╪══════╡
|
2286
|
+
# # │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
|
2287
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2288
|
+
# # │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
|
2289
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2290
|
+
# # │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
|
2291
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2292
|
+
# # │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
|
2293
|
+
# # └─────────────────────┴────────────┴──────┘
|
2294
|
+
# Perform an asof join: match each left row to the nearest right key rather
# than an equal one. Both frames must be sorted by the asof key.
#
# Delegates to the lazy engine and materializes the result eagerly.
def join_asof(
  other,
  left_on: nil,
  right_on: nil,
  on: nil,
  by_left: nil,
  by_right: nil,
  by: nil,
  strategy: "backward",
  suffix: "_right",
  tolerance: nil,
  allow_parallel: true,
  force_parallel: false
)
  joined = lazy.join_asof(
    other.lazy,
    left_on: left_on,
    right_on: right_on,
    on: on,
    by_left: by_left,
    by_right: by_right,
    by: by,
    strategy: strategy,
    suffix: suffix,
    tolerance: tolerance,
    allow_parallel: allow_parallel,
    force_parallel: force_parallel
  )
  # The plan is a single join node; skip query optimization passes.
  joined.collect(no_optimization: true)
end
|
2325
|
+
|
2326
|
+
# Join in SQL-like fashion.
|
2327
|
+
#
|
2328
|
+
# @param other [DataFrame]
|
2329
|
+
# DataFrame to join with.
|
2330
|
+
# @param left_on [Object]
|
2331
|
+
# Name(s) of the left join column(s).
|
2332
|
+
# @param right_on [Object]
|
2333
|
+
# Name(s) of the right join column(s).
|
2334
|
+
# @param on [Object]
|
2335
|
+
# Name(s) of the join columns in both DataFrames.
|
2336
|
+
# @param how ["inner", "left", "outer", "semi", "anti", "cross"]
|
2337
|
+
# Join strategy.
|
2338
|
+
# @param suffix [String]
|
2339
|
+
# Suffix to append to columns with a duplicate name.
|
2340
|
+
#
|
2341
|
+
# @return [DataFrame]
|
2342
|
+
#
|
2343
|
+
# @example
|
2344
|
+
# df = Polars::DataFrame.new(
|
2345
|
+
# {
|
2346
|
+
# "foo" => [1, 2, 3],
|
2347
|
+
# "bar" => [6.0, 7.0, 8.0],
|
2348
|
+
# "ham" => ["a", "b", "c"]
|
2349
|
+
# }
|
2350
|
+
# )
|
2351
|
+
# other_df = Polars::DataFrame.new(
|
2352
|
+
# {
|
2353
|
+
# "apple" => ["x", "y", "z"],
|
2354
|
+
# "ham" => ["a", "b", "d"]
|
2355
|
+
# }
|
2356
|
+
# )
|
2357
|
+
# df.join(other_df, on: "ham")
|
2358
|
+
# # =>
|
2359
|
+
# # shape: (2, 4)
|
2360
|
+
# # ┌─────┬─────┬─────┬───────┐
|
2361
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
2362
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
2363
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
2364
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
2365
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
2366
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2367
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
2368
|
+
# # └─────┴─────┴─────┴───────┘
|
2369
|
+
#
|
2370
|
+
# @example
|
2371
|
+
# df.join(other_df, on: "ham", how: "outer")
|
2372
|
+
# # =>
|
2373
|
+
# # shape: (4, 4)
|
2374
|
+
# # ┌──────┬──────┬─────┬───────┐
|
2375
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
2376
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
2377
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
2378
|
+
# # ╞══════╪══════╪═════╪═══════╡
|
2379
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
2380
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2381
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
2382
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2383
|
+
# # │ null ┆ null ┆ d ┆ z │
|
2384
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2385
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null │
|
2386
|
+
# # └──────┴──────┴─────┴───────┘
|
2387
|
+
#
|
2388
|
+
# @example
|
2389
|
+
# df.join(other_df, on: "ham", how: "left")
|
2390
|
+
# # =>
|
2391
|
+
# # shape: (3, 4)
|
2392
|
+
# # ┌─────┬─────┬─────┬───────┐
|
2393
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
2394
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
2395
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
2396
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
2397
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
2398
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2399
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
2400
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2401
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null │
|
2402
|
+
# # └─────┴─────┴─────┴───────┘
|
2403
|
+
#
|
2404
|
+
# @example
|
2405
|
+
# df.join(other_df, on: "ham", how: "semi")
|
2406
|
+
# # =>
|
2407
|
+
# # shape: (2, 3)
|
2408
|
+
# # ┌─────┬─────┬─────┐
|
2409
|
+
# # │ foo ┆ bar ┆ ham │
|
2410
|
+
# # │ --- ┆ --- ┆ --- │
|
2411
|
+
# # │ i64 ┆ f64 ┆ str │
|
2412
|
+
# # ╞═════╪═════╪═════╡
|
2413
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
2414
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2415
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
2416
|
+
# # └─────┴─────┴─────┘
|
2417
|
+
#
|
2418
|
+
# @example
|
2419
|
+
# df.join(other_df, on: "ham", how: "anti")
|
2420
|
+
# # =>
|
2421
|
+
# # shape: (1, 3)
|
2422
|
+
# # ┌─────┬─────┬─────┐
|
2423
|
+
# # │ foo ┆ bar ┆ ham │
|
2424
|
+
# # │ --- ┆ --- ┆ --- │
|
2425
|
+
# # │ i64 ┆ f64 ┆ str │
|
2426
|
+
# # ╞═════╪═════╪═════╡
|
2427
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
2428
|
+
# # └─────┴─────┴─────┘
|
2429
|
+
# Join in SQL-like fashion by delegating to the lazy engine and collecting
# eagerly (optimizations skipped — the plan is a single join node).
#
# @param other [DataFrame] frame to join with.
# @param left_on [Object] name(s) of the left join column(s).
# @param right_on [Object] name(s) of the right join column(s).
# @param on [Object] name(s) of the join columns in both frames; if set,
#   `left_on` and `right_on` should be nil.
# @param how ["inner", "left", "outer", "semi", "anti", "cross"] join strategy.
# @param suffix [String] appended to right-hand columns with duplicate names.
#
# @return [DataFrame]
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
  lazy
    .join(
      other.lazy,
      left_on: left_on,
      right_on: right_on,
      on: on,
      how: how,
      suffix: suffix
    )
    .collect(no_optimization: true)
end
|
2441
|
+
|
2442
|
+
# Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
|
2443
|
+
#
|
2444
|
+
# The UDF will receive each row as a tuple of values: `udf(row)`.
|
2445
|
+
#
|
2446
|
+
# Implementing logic using a Ruby function is almost always _significantly_
|
2447
|
+
# slower and more memory intensive than implementing the same logic using
|
2448
|
+
# the native expression API because:
|
2449
|
+
#
|
2450
|
+
# - The native expression engine runs in Rust; UDFs run in Ruby.
|
2451
|
+
# - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
|
2452
|
+
# - Polars-native expressions can be parallelised (UDFs cannot).
|
2453
|
+
# - Polars-native expressions can be logically optimised (UDFs cannot).
|
2454
|
+
#
|
2455
|
+
# Wherever possible you should strongly prefer the native expression API
|
2456
|
+
# to achieve the best performance.
|
2457
|
+
#
|
2458
|
+
# @param return_dtype [Symbol]
|
2459
|
+
# Output type of the operation. If none given, Polars tries to infer the type.
|
2460
|
+
# @param inference_size [Integer]
|
2461
|
+
# Only used in the case when the custom function returns rows.
|
2462
|
+
# This uses the first `n` rows to determine the output schema
|
2463
|
+
#
|
2464
|
+
# @return [Object]
|
2465
|
+
#
|
2466
|
+
# @note
|
2467
|
+
# The frame-level `apply` cannot track column names (as the UDF is a black-box
|
2468
|
+
# that may arbitrarily drop, rearrange, transform, or add new columns); if you
|
2469
|
+
# want to apply a UDF such that column names are preserved, you should use the
|
2470
|
+
# expression-level `apply` syntax instead.
|
2471
|
+
#
|
2472
|
+
# @example
|
2473
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [-1, 5, 8]})
|
2474
|
+
#
|
2475
|
+
# @example Return a DataFrame by mapping each row to a tuple:
|
2476
|
+
# df.apply { |t| [t[0] * 2, t[1] * 3] }
|
2477
|
+
# # =>
|
2478
|
+
# # shape: (3, 2)
|
2479
|
+
# # ┌──────────┬──────────┐
|
2480
|
+
# # │ column_0 ┆ column_1 │
|
2481
|
+
# # │ --- ┆ --- │
|
2482
|
+
# # │ i64 ┆ i64 │
|
2483
|
+
# # ╞══════════╪══════════╡
|
2484
|
+
# # │ 2 ┆ -3 │
|
2485
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
2486
|
+
# # │ 4 ┆ 15 │
|
2487
|
+
# # ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
2488
|
+
# # │ 6 ┆ 24 │
|
2489
|
+
# # └──────────┴──────────┘
|
2490
|
+
#
|
2491
|
+
# @example Return a Series by mapping each row to a scalar:
|
2492
|
+
# df.apply { |t| t[0] * 2 + t[1] }
|
2493
|
+
# # =>
|
2494
|
+
# # shape: (3, 1)
|
2495
|
+
# # ┌───────┐
|
2496
|
+
# # │ apply │
|
2497
|
+
# # │ --- │
|
2498
|
+
# # │ i64 │
|
2499
|
+
# # ╞═══════╡
|
2500
|
+
# # │ 1 │
|
2501
|
+
# # ├╌╌╌╌╌╌╌┤
|
2502
|
+
# # │ 9 │
|
2503
|
+
# # ├╌╌╌╌╌╌╌┤
|
2504
|
+
# # │ 14 │
|
2505
|
+
# # └───────┘
|
2506
|
+
# Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
#
# Significantly slower than native expressions; prefer the expression API
# wherever possible (see the YARD doc above).
def apply(return_dtype: nil, inference_size: 256, &f)
  result, frame_returned = _df.apply(f, return_dtype, inference_size)
  # The native call hands back either a frame or a series; wrap a series
  # into a single-column frame so the return type is always a DataFrame.
  return _from_rbdf(result) if frame_returned
  _from_rbdf(Utils.wrap_s(result).to_frame._df)
end
|
2514
|
+
|
2515
|
+
# Return a new DataFrame with the column added or replaced.
|
2516
|
+
#
|
2517
|
+
# @param column [Object]
|
2518
|
+
# Series, where the name of the Series refers to the column in the DataFrame.
|
2519
|
+
#
|
2520
|
+
# @return [DataFrame]
|
2521
|
+
#
|
2522
|
+
# @example Added
|
2523
|
+
# df = Polars::DataFrame.new(
|
2524
|
+
# {
|
2525
|
+
# "a" => [1, 3, 5],
|
2526
|
+
# "b" => [2, 4, 6]
|
2527
|
+
# }
|
2528
|
+
# )
|
2529
|
+
# df.with_column((Polars.col("b") ** 2).alias("b_squared"))
|
2530
|
+
# # =>
|
2531
|
+
# # shape: (3, 3)
|
2532
|
+
# # ┌─────┬─────┬───────────┐
|
2533
|
+
# # │ a ┆ b ┆ b_squared │
|
2534
|
+
# # │ --- ┆ --- ┆ --- │
|
2535
|
+
# # │ i64 ┆ i64 ┆ f64 │
|
2536
|
+
# # ╞═════╪═════╪═══════════╡
|
2537
|
+
# # │ 1 ┆ 2 ┆ 4.0 │
|
2538
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
2539
|
+
# # │ 3 ┆ 4 ┆ 16.0 │
|
2540
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
2541
|
+
# # │ 5 ┆ 6 ┆ 36.0 │
|
2542
|
+
# # └─────┴─────┴───────────┘
|
2543
|
+
#
|
2544
|
+
# @example Replaced
|
2545
|
+
# df.with_column(Polars.col("a") ** 2)
|
2546
|
+
# # =>
|
2547
|
+
# # shape: (3, 2)
|
2548
|
+
# # ┌──────┬─────┐
|
2549
|
+
# # │ a ┆ b │
|
2550
|
+
# # │ --- ┆ --- │
|
2551
|
+
# # │ f64 ┆ i64 │
|
2552
|
+
# # ╞══════╪═════╡
|
2553
|
+
# # │ 1.0 ┆ 2 │
|
2554
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
|
2555
|
+
# # │ 9.0 ┆ 4 │
|
2556
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
|
2557
|
+
# # │ 25.0 ┆ 6 │
|
2558
|
+
# # └──────┴─────┘
|
2559
|
+
# Return a new DataFrame with the column added or replaced.
#
# @param column [Object]
#   Series (or expression) whose name determines the target column.
#
# @return [DataFrame]
def with_column(column)
  lazy.with_column(column).collect(no_optimization: true, string_cache: false)
end
|
2564
|
+
|
2565
|
+
# Return a new DataFrame grown horizontally by stacking multiple Series to it.
|
2566
|
+
#
|
2567
|
+
# @param columns [Object]
|
2568
|
+
# Series to stack.
|
2569
|
+
# @param in_place [Boolean]
|
2570
|
+
# Modify in place.
|
2571
|
+
#
|
2572
|
+
# @return [DataFrame]
|
2573
|
+
#
|
2574
|
+
# @example
|
2575
|
+
# df = Polars::DataFrame.new(
|
2576
|
+
# {
|
2577
|
+
# "foo" => [1, 2, 3],
|
2578
|
+
# "bar" => [6, 7, 8],
|
2579
|
+
# "ham" => ["a", "b", "c"]
|
2580
|
+
# }
|
2581
|
+
# )
|
2582
|
+
# x = Polars::Series.new("apple", [10, 20, 30])
|
2583
|
+
# df.hstack([x])
|
2584
|
+
# # =>
|
2585
|
+
# # shape: (3, 4)
|
2586
|
+
# # ┌─────┬─────┬─────┬───────┐
|
2587
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
2588
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
2589
|
+
# # │ i64 ┆ i64 ┆ str ┆ i64 │
|
2590
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
2591
|
+
# # │ 1 ┆ 6 ┆ a ┆ 10 │
|
2592
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2593
|
+
# # │ 2 ┆ 7 ┆ b ┆ 20 │
|
2594
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2595
|
+
# # │ 3 ┆ 8 ┆ c ┆ 30 │
|
2596
|
+
# # └─────┴─────┴─────┴───────┘
|
2597
|
+
# Return a new DataFrame grown horizontally by stacking multiple Series to it.
#
# @param columns [Object] Series to stack (a DataFrame is also accepted).
# @param in_place [Boolean] Modify in place.
# @return [DataFrame]
def hstack(columns, in_place: false)
  # A non-array argument is assumed to be a DataFrame; use its Series.
  columns = columns.get_columns unless columns.is_a?(Array)
  series_handles = columns.map(&:_s)

  return _from_rbdf(_df.hstack(series_handles)) unless in_place

  _df.hstack_mut(series_handles)
  self
end
|
2608
|
+
|
2609
|
+
# Grow this DataFrame vertically by stacking a DataFrame to it.
|
2610
|
+
#
|
2611
|
+
# @param df [DataFrame]
|
2612
|
+
# DataFrame to stack.
|
2613
|
+
# @param in_place [Boolean]
|
2614
|
+
# Modify in place
|
2615
|
+
#
|
2616
|
+
# @return [DataFrame]
|
2617
|
+
#
|
2618
|
+
# @example
|
2619
|
+
# df1 = Polars::DataFrame.new(
|
2620
|
+
# {
|
2621
|
+
# "foo" => [1, 2],
|
2622
|
+
# "bar" => [6, 7],
|
2623
|
+
# "ham" => ["a", "b"]
|
2624
|
+
# }
|
2625
|
+
# )
|
2626
|
+
# df2 = Polars::DataFrame.new(
|
2627
|
+
# {
|
2628
|
+
# "foo" => [3, 4],
|
2629
|
+
# "bar" => [8, 9],
|
2630
|
+
# "ham" => ["c", "d"]
|
2631
|
+
# }
|
2632
|
+
# )
|
2633
|
+
# df1.vstack(df2)
|
2634
|
+
# # =>
|
2635
|
+
# # shape: (4, 3)
|
2636
|
+
# # ┌─────┬─────┬─────┐
|
2637
|
+
# # │ foo ┆ bar ┆ ham │
|
2638
|
+
# # │ --- ┆ --- ┆ --- │
|
2639
|
+
# # │ i64 ┆ i64 ┆ str │
|
2640
|
+
# # ╞═════╪═════╪═════╡
|
2641
|
+
# # │ 1 ┆ 6 ┆ a │
|
2642
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2643
|
+
# # │ 2 ┆ 7 ┆ b │
|
2644
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2645
|
+
# # │ 3 ┆ 8 ┆ c │
|
2646
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2647
|
+
# # │ 4 ┆ 9 ┆ d │
|
2648
|
+
# # └─────┴─────┴─────┘
|
2649
|
+
# Grow this DataFrame vertically by stacking another DataFrame to it.
#
# @param df [DataFrame] DataFrame to stack.
# @param in_place [Boolean] Modify in place.
# @return [DataFrame]
def vstack(df, in_place: false)
  other = df._df
  return _from_rbdf(_df.vstack(other)) unless in_place

  _df.vstack_mut(other)
  self
end
|
2657
|
+
|
2658
|
+
# Extend the memory backed by this `DataFrame` with the values from `other`.
|
2659
|
+
#
|
2660
|
+
# Different from `vstack` which adds the chunks from `other` to the chunks of this
|
2661
|
+
# `DataFrame` `extend` appends the data from `other` to the underlying memory
|
2662
|
+
# locations and thus may cause a reallocation.
|
2663
|
+
#
|
2664
|
+
# If this does not cause a reallocation, the resulting data structure will not
|
2665
|
+
# have any extra chunks and thus will yield faster queries.
|
2666
|
+
#
|
2667
|
+
# Prefer `extend` over `vstack` when you want to do a query after a single append.
|
2668
|
+
# For instance during online operations where you add `n` rows and rerun a query.
|
2669
|
+
#
|
2670
|
+
# Prefer `vstack` over `extend` when you want to append many times before doing a
|
2671
|
+
# query. For instance when you read in multiple files and when to store them in a
|
2672
|
+
# single `DataFrame`. In the latter case, finish the sequence of `vstack`
|
2673
|
+
# operations with a `rechunk`.
|
2674
|
+
#
|
2675
|
+
# @param other [DataFrame]
|
2676
|
+
# DataFrame to vertically add.
|
2677
|
+
#
|
2678
|
+
# @return [DataFrame]
|
2679
|
+
#
|
2680
|
+
# @example
|
2681
|
+
# df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
2682
|
+
# df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]})
|
2683
|
+
# df1.extend(df2)
|
2684
|
+
# # =>
|
2685
|
+
# # shape: (6, 2)
|
2686
|
+
# # ┌─────┬─────┐
|
2687
|
+
# # │ foo ┆ bar │
|
2688
|
+
# # │ --- ┆ --- │
|
2689
|
+
# # │ i64 ┆ i64 │
|
2690
|
+
# # ╞═════╪═════╡
|
2691
|
+
# # │ 1 ┆ 4 │
|
2692
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2693
|
+
# # │ 2 ┆ 5 │
|
2694
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2695
|
+
# # │ 3 ┆ 6 │
|
2696
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2697
|
+
# # │ 10 ┆ 40 │
|
2698
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2699
|
+
# # │ 20 ┆ 50 │
|
2700
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2701
|
+
# # │ 30 ┆ 60 │
|
2702
|
+
# # └─────┴─────┘
|
2703
|
+
# Extend the memory backed by this DataFrame with the values from `other`.
#
# Unlike `vstack`, this appends to the underlying memory and may reallocate,
# but avoids extra chunks when it doesn't.
#
# @param other [DataFrame] DataFrame to vertically add.
# @return [DataFrame] self, mutated in place.
def extend(other)
  tap { _df.extend(other._df) }
end
|
2707
|
+
|
2708
|
+
# Remove column from DataFrame and return as new.
|
2709
|
+
#
|
2710
|
+
# @param columns [Object]
|
2711
|
+
# Column(s) to drop.
|
2712
|
+
#
|
2713
|
+
# @return [DataFrame]
|
2714
|
+
#
|
2715
|
+
# @example
|
2716
|
+
# df = Polars::DataFrame.new(
|
2717
|
+
# {
|
2718
|
+
# "foo" => [1, 2, 3],
|
2719
|
+
# "bar" => [6.0, 7.0, 8.0],
|
2720
|
+
# "ham" => ["a", "b", "c"]
|
2721
|
+
# }
|
2722
|
+
# )
|
2723
|
+
# df.drop("ham")
|
2724
|
+
# # =>
|
2725
|
+
# # shape: (3, 2)
|
2726
|
+
# # ┌─────┬─────┐
|
2727
|
+
# # │ foo ┆ bar │
|
2728
|
+
# # │ --- ┆ --- │
|
2729
|
+
# # │ i64 ┆ f64 │
|
2730
|
+
# # ╞═════╪═════╡
|
2731
|
+
# # │ 1 ┆ 6.0 │
|
2732
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2733
|
+
# # │ 2 ┆ 7.0 │
|
2734
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2735
|
+
# # │ 3 ┆ 8.0 │
|
2736
|
+
# # └─────┴─────┘
|
2737
|
+
# Remove column(s) from the DataFrame and return the result as a new frame.
#
# @param columns [Object] Column name or array of column names to drop.
# @return [DataFrame]
def drop(columns)
  return _from_rbdf(_df.drop(columns)) unless columns.is_a?(Array)

  # For multiple names, mutate a fresh copy column by column.
  columns.each_with_object(clone) do |name, out|
    out._df.drop_in_place(name)
  end
end
|
2748
|
+
|
2749
|
+
# Drop in place.
|
2750
|
+
#
|
2751
|
+
# @param name [Object]
|
2752
|
+
# Column to drop.
|
2753
|
+
#
|
2754
|
+
# @return [Series]
|
2755
|
+
#
|
2756
|
+
# @example
|
2757
|
+
# df = Polars::DataFrame.new(
|
2758
|
+
# {
|
2759
|
+
# "foo" => [1, 2, 3],
|
2760
|
+
# "bar" => [6, 7, 8],
|
2761
|
+
# "ham" => ["a", "b", "c"]
|
2762
|
+
# }
|
2763
|
+
# )
|
2764
|
+
# df.drop_in_place("ham")
|
2765
|
+
# # =>
|
2766
|
+
# # shape: (3,)
|
2767
|
+
# # Series: 'ham' [str]
|
2768
|
+
# # [
|
2769
|
+
# # "a"
|
2770
|
+
# # "b"
|
2771
|
+
# # "c"
|
2772
|
+
# # ]
|
2773
|
+
# Drop a column in place and return it as a Series.
#
# @param name [Object] Column to drop.
# @return [Series]
def drop_in_place(name)
  removed = _df.drop_in_place(name)
  Utils.wrap_s(removed)
end
|
2776
|
+
|
2777
|
+
# Create an empty copy of the current DataFrame.
|
2778
|
+
#
|
2779
|
+
# Returns a DataFrame with identical schema but no data.
|
2780
|
+
#
|
2781
|
+
# @return [DataFrame]
|
2782
|
+
#
|
2783
|
+
# @example
|
2784
|
+
# df = Polars::DataFrame.new(
|
2785
|
+
# {
|
2786
|
+
# "a" => [nil, 2, 3, 4],
|
2787
|
+
# "b" => [0.5, nil, 2.5, 13],
|
2788
|
+
# "c" => [true, true, false, nil]
|
2789
|
+
# }
|
2790
|
+
# )
|
2791
|
+
# df.cleared
|
2792
|
+
# # =>
|
2793
|
+
# # shape: (0, 3)
|
2794
|
+
# # ┌─────┬─────┬──────┐
|
2795
|
+
# # │ a ┆ b ┆ c │
|
2796
|
+
# # │ --- ┆ --- ┆ --- │
|
2797
|
+
# # │ i64 ┆ f64 ┆ bool │
|
2798
|
+
# # ╞═════╪═════╪══════╡
|
2799
|
+
# # └─────┴─────┴──────┘
|
2800
|
+
# Create an empty copy of the current DataFrame: identical schema, no rows.
#
# @return [DataFrame]
def cleared
  # A frame that is already empty can simply be cloned.
  if height.positive?
    head(0)
  else
    clone
  end
end
|
2803
|
+
|
2804
|
+
# clone handled by initialize_copy
|
2805
|
+
|
2806
|
+
# Get the DataFrame as a Array of Series.
|
2807
|
+
#
|
2808
|
+
# @return [Array]
|
2809
|
+
# Get the DataFrame as an Array of Series.
#
# @return [Array]
def get_columns
  raw_series = _df.get_columns
  raw_series.map { |series| Utils.wrap_s(series) }
end
|
2812
|
+
|
2813
|
+
# Get a single column as Series by name.
|
2814
|
+
#
|
2815
|
+
# @param name [String]
|
2816
|
+
# Name of the column to retrieve.
|
2817
|
+
#
|
2818
|
+
# @return [Series]
|
2819
|
+
#
|
2820
|
+
# @example
|
2821
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
2822
|
+
# df.get_column("foo")
|
2823
|
+
# # =>
|
2824
|
+
# # shape: (3,)
|
2825
|
+
# # Series: 'foo' [i64]
|
2826
|
+
# # [
|
2827
|
+
# # 1
|
2828
|
+
# # 2
|
2829
|
+
# # 3
|
2830
|
+
# # ]
|
2831
|
+
# Get a single column as a Series by name.
#
# @param name [String] Name of the column to retrieve.
# @return [Series]
def get_column(name)
  # Delegates to the indexing operator.
  self.[](name)
end
|
2834
|
+
|
2835
|
+
# Fill null values using the specified value or strategy.
|
2836
|
+
#
|
2837
|
+
# @param value [Numeric]
|
2838
|
+
# Value used to fill null values.
|
2839
|
+
# @param strategy [nil, "forward", "backward", "min", "max", "mean", "zero", "one"]
|
2840
|
+
# Strategy used to fill null values.
|
2841
|
+
# @param limit [Integer]
|
2842
|
+
# Number of consecutive null values to fill when using the 'forward' or
|
2843
|
+
# 'backward' strategy.
|
2844
|
+
# @param matches_supertype [Boolean]
|
2845
|
+
# Fill all matching supertype of the fill `value`.
|
2846
|
+
#
|
2847
|
+
# @return [DataFrame]
|
2848
|
+
#
|
2849
|
+
# @example
|
2850
|
+
# df = Polars::DataFrame.new(
|
2851
|
+
# {
|
2852
|
+
# "a" => [1, 2, nil, 4],
|
2853
|
+
# "b" => [0.5, 4, nil, 13]
|
2854
|
+
# }
|
2855
|
+
# )
|
2856
|
+
# df.fill_null(99)
|
2857
|
+
# # =>
|
2858
|
+
# # shape: (4, 2)
|
2859
|
+
# # ┌─────┬──────┐
|
2860
|
+
# # │ a ┆ b │
|
2861
|
+
# # │ --- ┆ --- │
|
2862
|
+
# # │ i64 ┆ f64 │
|
2863
|
+
# # ╞═════╪══════╡
|
2864
|
+
# # │ 1 ┆ 0.5 │
|
2865
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2866
|
+
# # │ 2 ┆ 4.0 │
|
2867
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2868
|
+
# # │ 99 ┆ 99.0 │
|
2869
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2870
|
+
# # │ 4 ┆ 13.0 │
|
2871
|
+
# # └─────┴──────┘
|
2872
|
+
#
|
2873
|
+
# @example
|
2874
|
+
# df.fill_null(strategy: "forward")
|
2875
|
+
# # =>
|
2876
|
+
# # shape: (4, 2)
|
2877
|
+
# # ┌─────┬──────┐
|
2878
|
+
# # │ a ┆ b │
|
2879
|
+
# # │ --- ┆ --- │
|
2880
|
+
# # │ i64 ┆ f64 │
|
2881
|
+
# # ╞═════╪══════╡
|
2882
|
+
# # │ 1 ┆ 0.5 │
|
2883
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2884
|
+
# # │ 2 ┆ 4.0 │
|
2885
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2886
|
+
# # │ 2 ┆ 4.0 │
|
2887
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2888
|
+
# # │ 4 ┆ 13.0 │
|
2889
|
+
# # └─────┴──────┘
|
2890
|
+
#
|
2891
|
+
# @example
|
2892
|
+
# df.fill_null(strategy: "max")
|
2893
|
+
# # =>
|
2894
|
+
# # shape: (4, 2)
|
2895
|
+
# # ┌─────┬──────┐
|
2896
|
+
# # │ a ┆ b │
|
2897
|
+
# # │ --- ┆ --- │
|
2898
|
+
# # │ i64 ┆ f64 │
|
2899
|
+
# # ╞═════╪══════╡
|
2900
|
+
# # │ 1 ┆ 0.5 │
|
2901
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2902
|
+
# # │ 2 ┆ 4.0 │
|
2903
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2904
|
+
# # │ 4 ┆ 13.0 │
|
2905
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2906
|
+
# # │ 4 ┆ 13.0 │
|
2907
|
+
# # └─────┴──────┘
|
2908
|
+
#
|
2909
|
+
# @example
|
2910
|
+
# df.fill_null(strategy: "zero")
|
2911
|
+
# # =>
|
2912
|
+
# # shape: (4, 2)
|
2913
|
+
# # ┌─────┬──────┐
|
2914
|
+
# # │ a ┆ b │
|
2915
|
+
# # │ --- ┆ --- │
|
2916
|
+
# # │ i64 ┆ f64 │
|
2917
|
+
# # ╞═════╪══════╡
|
2918
|
+
# # │ 1 ┆ 0.5 │
|
2919
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2920
|
+
# # │ 2 ┆ 4.0 │
|
2921
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2922
|
+
# # │ 0 ┆ 0.0 │
|
2923
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2924
|
+
# # │ 4 ┆ 13.0 │
|
2925
|
+
# # └─────┴──────┘
|
2926
|
+
# Fill null values using the specified value or strategy.
#
# @param value [Numeric] Value used to fill null values.
# @param strategy [nil, String] Strategy used to fill null values.
# @param limit [Integer] Max consecutive nulls to fill for forward/backward.
# @param matches_supertype [Boolean] Fill all matching supertype of `value`.
# @return [DataFrame]
def fill_null(value = nil, strategy: nil, limit: nil, matches_supertype: true)
  filled = lazy.fill_null(
    value,
    strategy: strategy,
    limit: limit,
    matches_supertype: matches_supertype
  ).collect(no_optimization: true)
  _from_rbdf(filled._df)
end
|
2934
|
+
|
2935
|
+
# Fill floating point NaN values by an Expression evaluation.
|
2936
|
+
#
|
2937
|
+
# @param fill_value [Object]
|
2938
|
+
# Value to fill NaN with.
|
2939
|
+
#
|
2940
|
+
# @return [DataFrame]
|
2941
|
+
#
|
2942
|
+
# @note
|
2943
|
+
# Note that floating point NaNs (Not a Number) are not missing values!
|
2944
|
+
# To replace missing values, use `fill_null`.
|
2945
|
+
#
|
2946
|
+
# @example
|
2947
|
+
# df = Polars::DataFrame.new(
|
2948
|
+
# {
|
2949
|
+
# "a" => [1.5, 2, Float::NAN, 4],
|
2950
|
+
# "b" => [0.5, 4, Float::NAN, 13]
|
2951
|
+
# }
|
2952
|
+
# )
|
2953
|
+
# df.fill_nan(99)
|
2954
|
+
# # =>
|
2955
|
+
# # shape: (4, 2)
|
2956
|
+
# # ┌──────┬──────┐
|
2957
|
+
# # │ a ┆ b │
|
2958
|
+
# # │ --- ┆ --- │
|
2959
|
+
# # │ f64 ┆ f64 │
|
2960
|
+
# # ╞══════╪══════╡
|
2961
|
+
# # │ 1.5 ┆ 0.5 │
|
2962
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2963
|
+
# # │ 2.0 ┆ 4.0 │
|
2964
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2965
|
+
# # │ 99.0 ┆ 99.0 │
|
2966
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2967
|
+
# # │ 4.0 ┆ 13.0 │
|
2968
|
+
# # └──────┴──────┘
|
2969
|
+
# Fill floating point NaN values. Note that NaNs are not missing values;
# use `fill_null` for those.
#
# @param fill_value [Object] Value to fill NaN with.
# @return [DataFrame]
def fill_nan(fill_value)
  ldf = lazy.fill_nan(fill_value)
  ldf.collect(no_optimization: true)
end
|
2972
|
+
|
2973
|
+
# Explode `DataFrame` to long format by exploding a column with Lists.
|
2974
|
+
#
|
2975
|
+
# @param columns [Object]
|
2976
|
+
# Column of LargeList type.
|
2977
|
+
#
|
2978
|
+
# @return [DataFrame]
|
2979
|
+
#
|
2980
|
+
# @example
|
2981
|
+
# df = Polars::DataFrame.new(
|
2982
|
+
# {
|
2983
|
+
# "letters" => ["a", "a", "b", "c"],
|
2984
|
+
# "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
|
2985
|
+
# }
|
2986
|
+
# )
|
2987
|
+
# df.explode("numbers")
|
2988
|
+
# # =>
|
2989
|
+
# # shape: (8, 2)
|
2990
|
+
# # ┌─────────┬─────────┐
|
2991
|
+
# # │ letters ┆ numbers │
|
2992
|
+
# # │ --- ┆ --- │
|
2993
|
+
# # │ str ┆ i64 │
|
2994
|
+
# # ╞═════════╪═════════╡
|
2995
|
+
# # │ a ┆ 1 │
|
2996
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2997
|
+
# # │ a ┆ 2 │
|
2998
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2999
|
+
# # │ a ┆ 3 │
|
3000
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
3001
|
+
# # │ b ┆ 4 │
|
3002
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
3003
|
+
# # │ b ┆ 5 │
|
3004
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
3005
|
+
# # │ c ┆ 6 │
|
3006
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
3007
|
+
# # │ c ┆ 7 │
|
3008
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
3009
|
+
# # │ c ┆ 8 │
|
3010
|
+
# # └─────────┴─────────┘
|
3011
|
+
# Explode the DataFrame to long format by exploding a column with Lists.
#
# @param columns [Object] Column(s) of List type to explode.
# @return [DataFrame]
def explode(columns)
  exploded = lazy.explode(columns)
  exploded.collect(no_optimization: true)
end
|
3014
|
+
|
3015
|
+
# Create a spreadsheet-style pivot table as a DataFrame.
|
3016
|
+
#
|
3017
|
+
# @param values [Object]
|
3018
|
+
# Column values to aggregate. Can be multiple columns if the *columns*
|
3019
|
+
# arguments contains multiple columns as well
|
3020
|
+
# @param index [Object]
|
3021
|
+
# One or multiple keys to group by
|
3022
|
+
# @param columns [Object]
|
3023
|
+
# Columns whose values will be used as the header of the output DataFrame
|
3024
|
+
# @param aggregate_fn ["first", "sum", "max", "min", "mean", "median", "last", "count"]
|
3025
|
+
# A predefined aggregate function str or an expression.
|
3026
|
+
# @param maintain_order [Object]
|
3027
|
+
# Sort the grouped keys so that the output order is predictable.
|
3028
|
+
# @param sort_columns [Object]
|
3029
|
+
# Sort the transposed columns by name. Default is by order of discovery.
|
3030
|
+
#
|
3031
|
+
# @return [DataFrame]
|
3032
|
+
#
|
3033
|
+
# @example
|
3034
|
+
# df = Polars::DataFrame.new(
|
3035
|
+
# {
|
3036
|
+
# "foo" => ["one", "one", "one", "two", "two", "two"],
|
3037
|
+
# "bar" => ["A", "B", "C", "A", "B", "C"],
|
3038
|
+
# "baz" => [1, 2, 3, 4, 5, 6]
|
3039
|
+
# }
|
3040
|
+
# )
|
3041
|
+
# df.pivot(values: "baz", index: "foo", columns: "bar")
|
3042
|
+
# # =>
|
3043
|
+
# # shape: (2, 4)
|
3044
|
+
# # ┌─────┬─────┬─────┬─────┐
|
3045
|
+
# # │ foo ┆ A ┆ B ┆ C │
|
3046
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
3047
|
+
# # │ str ┆ i64 ┆ i64 ┆ i64 │
|
3048
|
+
# # ╞═════╪═════╪═════╪═════╡
|
3049
|
+
# # │ one ┆ 1 ┆ 2 ┆ 3 │
|
3050
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3051
|
+
# # │ two ┆ 4 ┆ 5 ┆ 6 │
|
3052
|
+
# # └─────┴─────┴─────┴─────┘
|
3053
|
+
# Create a spreadsheet-style pivot table as a DataFrame.
#
# @param values [Object] Column values to aggregate.
# @param index [Object] One or multiple keys to group by.
# @param columns [Object] Columns whose values become output headers.
# @param aggregate_fn [String, Expr] Predefined aggregate name or expression.
# @param maintain_order [Boolean] Sort grouped keys for predictable output.
# @param sort_columns [Boolean] Sort transposed columns by name.
# @return [DataFrame]
def pivot(
  values:,
  index:,
  columns:,
  aggregate_fn: "first",
  maintain_order: true,
  sort_columns: false
)
  values = [values] if values.is_a?(String)
  index = [index] if index.is_a?(String)
  columns = [columns] if columns.is_a?(String)

  if aggregate_fn.is_a?(String)
    aggregate_fn =
      case aggregate_fn
      when "first", "sum", "max", "min", "mean", "median", "last"
        # These aggregations are element-wise expressions of the same name.
        Polars.element.public_send(aggregate_fn)
      when "count"
        Polars.count
      else
        raise ArgumentError, "Argument aggregate fn: '#{aggregate_fn}' was not expected."
      end
  end

  _from_rbdf(
    _df.pivot_expr(
      values,
      index,
      columns,
      aggregate_fn._rbexpr,
      maintain_order,
      sort_columns
    )
  )
end
|
3105
|
+
|
3106
|
+
# Unpivot a DataFrame from wide to long format.
|
3107
|
+
#
|
3108
|
+
# Optionally leaves identifiers set.
|
3109
|
+
#
|
3110
|
+
# This function is useful to massage a DataFrame into a format where one or more
|
3111
|
+
# columns are identifier variables (id_vars), while all other columns, considered
|
3112
|
+
# measured variables (value_vars), are "unpivoted" to the row axis, leaving just
|
3113
|
+
# two non-identifier columns, 'variable' and 'value'.
|
3114
|
+
#
|
3115
|
+
# @param id_vars [Object]
|
3116
|
+
# Columns to use as identifier variables.
|
3117
|
+
# @param value_vars [Object]
|
3118
|
+
# Values to use as identifier variables.
|
3119
|
+
# If `value_vars` is empty all columns that are not in `id_vars` will be used.
|
3120
|
+
# @param variable_name [String]
|
3121
|
+
# Name to give to the `value` column. Defaults to "variable"
|
3122
|
+
# @param value_name [String]
|
3123
|
+
# Name to give to the `value` column. Defaults to "value"
|
3124
|
+
#
|
3125
|
+
# @return [DataFrame]
|
3126
|
+
#
|
3127
|
+
# @example
|
3128
|
+
# df = Polars::DataFrame.new(
|
3129
|
+
# {
|
3130
|
+
# "a" => ["x", "y", "z"],
|
3131
|
+
# "b" => [1, 3, 5],
|
3132
|
+
# "c" => [2, 4, 6]
|
3133
|
+
# }
|
3134
|
+
# )
|
3135
|
+
# df.melt(id_vars: "a", value_vars: ["b", "c"])
|
3136
|
+
# # =>
|
3137
|
+
# # shape: (6, 3)
|
3138
|
+
# # ┌─────┬──────────┬───────┐
|
3139
|
+
# # │ a ┆ variable ┆ value │
|
3140
|
+
# # │ --- ┆ --- ┆ --- │
|
3141
|
+
# # │ str ┆ str ┆ i64 │
|
3142
|
+
# # ╞═════╪══════════╪═══════╡
|
3143
|
+
# # │ x ┆ b ┆ 1 │
|
3144
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3145
|
+
# # │ y ┆ b ┆ 3 │
|
3146
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3147
|
+
# # │ z ┆ b ┆ 5 │
|
3148
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3149
|
+
# # │ x ┆ c ┆ 2 │
|
3150
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3151
|
+
# # │ y ┆ c ┆ 4 │
|
3152
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3153
|
+
# # │ z ┆ c ┆ 6 │
|
3154
|
+
# # └─────┴──────────┴───────┘
|
3155
|
+
# Unpivot a DataFrame from wide to long format, optionally keeping identifiers.
#
# @param id_vars [Object] Columns to use as identifier variables.
# @param value_vars [Object] Columns to unpivot; all non-id columns if empty.
# @param variable_name [String] Name for the variable column.
# @param value_name [String] Name for the value column.
# @return [DataFrame]
def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
  # Normalize nil -> [] and a bare String -> one-element array; leave
  # anything else (e.g. an existing array) untouched.
  normalize = lambda do |vars|
    case vars
    when nil then []
    when String then [vars]
    else vars
    end
  end
  id_vars = normalize.call(id_vars)
  value_vars = normalize.call(value_vars)

  # NOTE: the native melt takes value_name before variable_name.
  _from_rbdf(_df.melt(id_vars, value_vars, value_name, variable_name))
end
|
3172
|
+
|
3173
|
+
# Unstack a long table to a wide form without doing an aggregation.
|
3174
|
+
#
|
3175
|
+
# This can be much faster than a pivot, because it can skip the grouping phase.
|
3176
|
+
#
|
3177
|
+
# @note
|
3178
|
+
# This functionality is experimental and may be subject to changes
|
3179
|
+
# without it being considered a breaking change.
|
3180
|
+
#
|
3181
|
+
# @param step Integer
|
3182
|
+
# Number of rows in the unstacked frame.
|
3183
|
+
# @param how ["vertical", "horizontal"]
|
3184
|
+
# Direction of the unstack.
|
3185
|
+
# @param columns [Object]
|
3186
|
+
# Column to include in the operation.
|
3187
|
+
# @param fill_values [Object]
|
3188
|
+
# Fill values that don't fit the new size with this value.
|
3189
|
+
#
|
3190
|
+
# @return [DataFrame]
|
3191
|
+
#
|
3192
|
+
# @example
|
3193
|
+
# df = Polars::DataFrame.new(
|
3194
|
+
# {
|
3195
|
+
# "col1" => "A".."I",
|
3196
|
+
# "col2" => Polars.arange(0, 9, eager: true)
|
3197
|
+
# }
|
3198
|
+
# )
|
3199
|
+
# # =>
|
3200
|
+
# # shape: (9, 2)
|
3201
|
+
# # ┌──────┬──────┐
|
3202
|
+
# # │ col1 ┆ col2 │
|
3203
|
+
# # │ --- ┆ --- │
|
3204
|
+
# # │ str ┆ i64 │
|
3205
|
+
# # ╞══════╪══════╡
|
3206
|
+
# # │ A ┆ 0 │
|
3207
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3208
|
+
# # │ B ┆ 1 │
|
3209
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3210
|
+
# # │ C ┆ 2 │
|
3211
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3212
|
+
# # │ D ┆ 3 │
|
3213
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3214
|
+
# # │ ... ┆ ... │
|
3215
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3216
|
+
# # │ F ┆ 5 │
|
3217
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3218
|
+
# # │ G ┆ 6 │
|
3219
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3220
|
+
# # │ H ┆ 7 │
|
3221
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3222
|
+
# # │ I ┆ 8 │
|
3223
|
+
# # └──────┴──────┘
|
3224
|
+
#
|
3225
|
+
# @example
|
3226
|
+
# df.unstack(step: 3, how: "vertical")
|
3227
|
+
# # =>
|
3228
|
+
# # shape: (3, 6)
|
3229
|
+
# # ┌────────┬────────┬────────┬────────┬────────┬────────┐
|
3230
|
+
# # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
|
3231
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3232
|
+
# # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
|
3233
|
+
# # ╞════════╪════════╪════════╪════════╪════════╪════════╡
|
3234
|
+
# # │ A ┆ D ┆ G ┆ 0 ┆ 3 ┆ 6 │
|
3235
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3236
|
+
# # │ B ┆ E ┆ H ┆ 1 ┆ 4 ┆ 7 │
|
3237
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3238
|
+
# # │ C ┆ F ┆ I ┆ 2 ┆ 5 ┆ 8 │
|
3239
|
+
# # └────────┴────────┴────────┴────────┴────────┴────────┘
|
3240
|
+
#
|
3241
|
+
# @example
|
3242
|
+
# df.unstack(step: 3, how: "horizontal")
|
3243
|
+
# # =>
|
3244
|
+
# # shape: (3, 6)
|
3245
|
+
# # ┌────────┬────────┬────────┬────────┬────────┬────────┐
|
3246
|
+
# # │ col1_0 ┆ col1_1 ┆ col1_2 ┆ col2_0 ┆ col2_1 ┆ col2_2 │
|
3247
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3248
|
+
# # │ str ┆ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
|
3249
|
+
# # ╞════════╪════════╪════════╪════════╪════════╪════════╡
|
3250
|
+
# # │ A ┆ B ┆ C ┆ 0 ┆ 1 ┆ 2 │
|
3251
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3252
|
+
# # │ D ┆ E ┆ F ┆ 3 ┆ 4 ┆ 5 │
|
3253
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
3254
|
+
# # │ G ┆ H ┆ I ┆ 6 ┆ 7 ┆ 8 │
|
3255
|
+
# # └────────┴────────┴────────┴────────┴────────┴────────┘
|
3256
|
+
# Unstack a long table to a wide form without doing an aggregation.
#
# This can be much faster than a pivot, because it can skip the grouping
# phase. Experimental; may change without notice.
#
# @param step [Integer] Number of rows (vertical) or columns (horizontal).
# @param how ["vertical", "horizontal"] Direction of the unstack.
# @param columns [Object] Columns to include in the operation.
# @param fill_values [Object] Fill values for cells beyond the original data.
# @return [DataFrame]
def unstack(step:, how: "vertical", columns: nil, fill_values: nil)
  df = columns.nil? ? self : select(columns)

  height = df.height
  if how == "vertical"
    n_rows = step
    n_cols = (height / n_rows.to_f).ceil
  else
    n_cols = step
    n_rows = (height / n_cols.to_f).ceil
  end

  # Pad every column so the data exactly fills the n_rows x n_cols grid.
  n_fill = n_cols * n_rows - height
  if n_fill.positive?
    fill_values = [fill_values] * df.width unless fill_values.is_a?(Array)

    padded = df.get_columns.zip(fill_values).map do |series, pad_value|
      series.extend_constant(pad_value, n_fill)
    end
    df = df.select(padded)
  end

  if how == "horizontal"
    # Reorder rows so that consecutive values end up in consecutive columns.
    sort_key = (Polars.arange(0, n_cols * n_rows, eager: true) % n_cols).alias(
      "__sort_order"
    )
    df = df.with_column(sort_key).sort("__sort_order").drop("__sort_order")
  end

  # Zero-pad slice numbers to a fixed width so names sort lexicographically.
  digits = Math.log10(n_cols).floor + 1
  slices =
    df.get_columns.flat_map do |series|
      (0...n_cols).map do |slice_nbr|
        chunk = series.slice(slice_nbr * n_rows, n_rows)
        chunk.alias(format("%s_%0#{digits}d", series.name, slice_nbr))
      end
    end

  _from_rbdf(DataFrame.new(slices)._df)
end
|
3308
|
+
|
3309
|
+
# Split into multiple DataFrames partitioned by groups.
|
3310
|
+
#
|
3311
|
+
# @param groups [Object]
|
3312
|
+
# Groups to partition by.
|
3313
|
+
# @param maintain_order [Boolean]
|
3314
|
+
# Keep predictable output order. This is slower as it requires an extra sort
|
3315
|
+
# operation.
|
3316
|
+
# @param as_dict [Boolean]
|
3317
|
+
# If true, return the partitions in a dictionary keyed by the distinct group
|
3318
|
+
# values instead of a list.
|
3319
|
+
#
|
3320
|
+
# @return [Object]
|
3321
|
+
#
|
3322
|
+
# @example
|
3323
|
+
# df = Polars::DataFrame.new(
|
3324
|
+
# {
|
3325
|
+
# "foo" => ["A", "A", "B", "B", "C"],
|
3326
|
+
# "N" => [1, 2, 2, 4, 2],
|
3327
|
+
# "bar" => ["k", "l", "m", "m", "l"]
|
3328
|
+
# }
|
3329
|
+
# )
|
3330
|
+
# df.partition_by("foo", maintain_order: true)
|
3331
|
+
# # =>
|
3332
|
+
# # [shape: (2, 3)
|
3333
|
+
# # ┌─────┬─────┬─────┐
|
3334
|
+
# # │ foo ┆ N ┆ bar │
|
3335
|
+
# # │ --- ┆ --- ┆ --- │
|
3336
|
+
# # │ str ┆ i64 ┆ str │
|
3337
|
+
# # ╞═════╪═════╪═════╡
|
3338
|
+
# # │ A ┆ 1 ┆ k │
|
3339
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3340
|
+
# # │ A ┆ 2 ┆ l │
|
3341
|
+
# # └─────┴─────┴─────┘, shape: (2, 3)
|
3342
|
+
# # ┌─────┬─────┬─────┐
|
3343
|
+
# # │ foo ┆ N ┆ bar │
|
3344
|
+
# # │ --- ┆ --- ┆ --- │
|
3345
|
+
# # │ str ┆ i64 ┆ str │
|
3346
|
+
# # ╞═════╪═════╪═════╡
|
3347
|
+
# # │ B ┆ 2 ┆ m │
|
3348
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3349
|
+
# # │ B ┆ 4 ┆ m │
|
3350
|
+
# # └─────┴─────┴─────┘, shape: (1, 3)
|
3351
|
+
# # ┌─────┬─────┬─────┐
|
3352
|
+
# # │ foo ┆ N ┆ bar │
|
3353
|
+
# # │ --- ┆ --- ┆ --- │
|
3354
|
+
# # │ str ┆ i64 ┆ str │
|
3355
|
+
# # ╞═════╪═════╪═════╡
|
3356
|
+
# # │ C ┆ 2 ┆ l │
|
3357
|
+
# # └─────┴─────┴─────┘]
|
3358
|
+
#
|
3359
|
+
# @example
|
3360
|
+
# df.partition_by("foo", maintain_order: true, as_dict: true)
|
3361
|
+
# # =>
|
3362
|
+
# # {"A"=>shape: (2, 3)
|
3363
|
+
# # ┌─────┬─────┬─────┐
|
3364
|
+
# # │ foo ┆ N ┆ bar │
|
3365
|
+
# # │ --- ┆ --- ┆ --- │
|
3366
|
+
# # │ str ┆ i64 ┆ str │
|
3367
|
+
# # ╞═════╪═════╪═════╡
|
3368
|
+
# # │ A ┆ 1 ┆ k │
|
3369
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3370
|
+
# # │ A ┆ 2 ┆ l │
|
3371
|
+
# # └─────┴─────┴─────┘, "B"=>shape: (2, 3)
|
3372
|
+
# # ┌─────┬─────┬─────┐
|
3373
|
+
# # │ foo ┆ N ┆ bar │
|
3374
|
+
# # │ --- ┆ --- ┆ --- │
|
3375
|
+
# # │ str ┆ i64 ┆ str │
|
3376
|
+
# # ╞═════╪═════╪═════╡
|
3377
|
+
# # │ B ┆ 2 ┆ m │
|
3378
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3379
|
+
# # │ B ┆ 4 ┆ m │
|
3380
|
+
# # └─────┴─────┴─────┘, "C"=>shape: (1, 3)
|
3381
|
+
# # ┌─────┬─────┬─────┐
|
3382
|
+
# # │ foo ┆ N ┆ bar │
|
3383
|
+
# # │ --- ┆ --- ┆ --- │
|
3384
|
+
# # │ str ┆ i64 ┆ str │
|
3385
|
+
# # ╞═════╪═════╪═════╡
|
3386
|
+
# # │ C ┆ 2 ┆ l │
|
3387
|
+
# # └─────┴─────┴─────┘}
|
3388
|
+
# Split into multiple DataFrames partitioned by groups.
#
# @param groups [Object] Groups to partition by.
# @param maintain_order [Boolean] Keep predictable output order (slower).
# @param as_dict [Boolean] Return a hash keyed by distinct group values
#   instead of an array.
# @return [Object] Array of DataFrames, or Hash when `as_dict` is true.
def partition_by(groups, maintain_order: true, as_dict: false)
  groups = [groups] if groups.is_a?(String)
  groups = Array(groups) unless groups.is_a?(Array)

  parts = _df.partition_by(groups, maintain_order).map { |rbdf| _from_rbdf(rbdf) }
  return parts unless as_dict

  # Single group column: key by the scalar value. Multiple: key by the row tuple.
  single_key = groups.length == 1
  parts.each_with_object({}) do |part, out|
    key = single_key ? part[groups][0, 0] : part[groups].row(0)
    out[key] = part
  end
end
|
3413
|
+
|
3414
|
+
# Shift values by the given period.
|
3415
|
+
#
|
3416
|
+
# @param periods [Integer]
|
3417
|
+
# Number of places to shift (may be negative).
|
3418
|
+
#
|
3419
|
+
# @return [DataFrame]
|
3420
|
+
#
|
3421
|
+
# @example
|
3422
|
+
# df = Polars::DataFrame.new(
|
3423
|
+
# {
|
3424
|
+
# "foo" => [1, 2, 3],
|
3425
|
+
# "bar" => [6, 7, 8],
|
3426
|
+
# "ham" => ["a", "b", "c"]
|
3427
|
+
# }
|
3428
|
+
# )
|
3429
|
+
# df.shift(1)
|
3430
|
+
# # =>
|
3431
|
+
# # shape: (3, 3)
|
3432
|
+
# # ┌──────┬──────┬──────┐
|
3433
|
+
# # │ foo ┆ bar ┆ ham │
|
3434
|
+
# # │ --- ┆ --- ┆ --- │
|
3435
|
+
# # │ i64 ┆ i64 ┆ str │
|
3436
|
+
# # ╞══════╪══════╪══════╡
|
3437
|
+
# # │ null ┆ null ┆ null │
|
3438
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3439
|
+
# # │ 1 ┆ 6 ┆ a │
|
3440
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3441
|
+
# # │ 2 ┆ 7 ┆ b │
|
3442
|
+
# # └──────┴──────┴──────┘
|
3443
|
+
#
|
3444
|
+
# @example
|
3445
|
+
# df.shift(-1)
|
3446
|
+
# # =>
|
3447
|
+
# # shape: (3, 3)
|
3448
|
+
# # ┌──────┬──────┬──────┐
|
3449
|
+
# # │ foo ┆ bar ┆ ham │
|
3450
|
+
# # │ --- ┆ --- ┆ --- │
|
3451
|
+
# # │ i64 ┆ i64 ┆ str │
|
3452
|
+
# # ╞══════╪══════╪══════╡
|
3453
|
+
# # │ 2 ┆ 7 ┆ b │
|
3454
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3455
|
+
# # │ 3 ┆ 8 ┆ c │
|
3456
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
3457
|
+
# # │ null ┆ null ┆ null │
|
3458
|
+
# # └──────┴──────┴──────┘
|
3459
|
+
# Shift values by the given period.
#
# @param periods [Integer] Number of places to shift (may be negative).
# @return [DataFrame]
def shift(periods)
  shifted = _df.shift(periods)
  _from_rbdf(shifted)
end
|
3462
|
+
|
3463
|
+
# Shift the values by a given period and fill the resulting null values.
|
3464
|
+
#
|
3465
|
+
# @param periods [Integer]
|
3466
|
+
# Number of places to shift (may be negative).
|
3467
|
+
# @param fill_value [Object]
|
3468
|
+
# fill nil values with this value.
|
3469
|
+
#
|
3470
|
+
# @return [DataFrame]
|
3471
|
+
#
|
3472
|
+
# @example
|
3473
|
+
# df = Polars::DataFrame.new(
|
3474
|
+
# {
|
3475
|
+
# "foo" => [1, 2, 3],
|
3476
|
+
# "bar" => [6, 7, 8],
|
3477
|
+
# "ham" => ["a", "b", "c"]
|
3478
|
+
# }
|
3479
|
+
# )
|
3480
|
+
# df.shift_and_fill(1, 0)
|
3481
|
+
# # =>
|
3482
|
+
# # shape: (3, 3)
|
3483
|
+
# # ┌─────┬─────┬─────┐
|
3484
|
+
# # │ foo ┆ bar ┆ ham │
|
3485
|
+
# # │ --- ┆ --- ┆ --- │
|
3486
|
+
# # │ i64 ┆ i64 ┆ str │
|
3487
|
+
# # ╞═════╪═════╪═════╡
|
3488
|
+
# # │ 0 ┆ 0 ┆ 0 │
|
3489
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3490
|
+
# # │ 1 ┆ 6 ┆ a │
|
3491
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
3492
|
+
# # │ 2 ┆ 7 ┆ b │
|
3493
|
+
# # └─────┴─────┴─────┘
|
3494
|
+
# Shift the values by a given period and fill the resulting null values.
#
# @param periods [Integer] Number of places to shift (may be negative).
# @param fill_value [Object] Fill nil values with this value.
# @return [DataFrame]
def shift_and_fill(periods, fill_value)
  ldf = lazy.shift_and_fill(periods, fill_value)
  ldf.collect(no_optimization: true, string_cache: false)
end
|
3499
|
+
|
3500
|
+
# Get a mask of all duplicated rows in this DataFrame.
|
3501
|
+
#
|
3502
|
+
# @return [Series]
|
3503
|
+
#
|
3504
|
+
# @example
|
3505
|
+
# df = Polars::DataFrame.new(
|
3506
|
+
# {
|
3507
|
+
# "a" => [1, 2, 3, 1],
|
3508
|
+
# "b" => ["x", "y", "z", "x"],
|
3509
|
+
# }
|
3510
|
+
# )
|
3511
|
+
# df.is_duplicated
|
3512
|
+
# # =>
|
3513
|
+
# # shape: (4,)
|
3514
|
+
# # Series: '' [bool]
|
3515
|
+
# # [
|
3516
|
+
# # true
|
3517
|
+
# # false
|
3518
|
+
# # false
|
3519
|
+
# # true
|
3520
|
+
# # ]
|
3521
|
+
# Get a mask of all duplicated rows in this DataFrame.
#
# @return [Series]
def is_duplicated
  mask = _df.is_duplicated
  Utils.wrap_s(mask)
end
|
3524
|
+
|
3525
|
+
# Get a mask of all unique rows in this DataFrame.
|
3526
|
+
#
|
3527
|
+
# @return [Series]
|
3528
|
+
#
|
3529
|
+
# @example
|
3530
|
+
# df = Polars::DataFrame.new(
|
3531
|
+
# {
|
3532
|
+
# "a" => [1, 2, 3, 1],
|
3533
|
+
# "b" => ["x", "y", "z", "x"]
|
3534
|
+
# }
|
3535
|
+
# )
|
3536
|
+
# df.is_unique
|
3537
|
+
# # =>
|
3538
|
+
# # shape: (4,)
|
3539
|
+
# # Series: '' [bool]
|
3540
|
+
# # [
|
3541
|
+
# # false
|
3542
|
+
# # true
|
3543
|
+
# # true
|
3544
|
+
# # false
|
3545
|
+
# # ]
|
3546
|
+
# Get a mask of all unique rows in this DataFrame.
#
# @return [Series]
def is_unique
  mask = _df.is_unique
  Utils.wrap_s(mask)
end
|
3549
|
+
|
3550
|
+
# Start a lazy query from this point.
|
3551
|
+
#
|
3552
|
+
# @return [LazyFrame]
|
3553
|
+
def lazy
  # Hand the native lazy frame to the Ruby-level wrapper.
  ldf = _df.lazy
  wrap_ldf(ldf)
end
|
3556
|
+
|
3557
|
+
# Select columns from this DataFrame.
|
3558
|
+
#
|
3559
|
+
# @param exprs [Object]
|
3560
|
+
# Column or columns to select.
|
3561
|
+
#
|
3562
|
+
# @return [DataFrame]
|
3563
|
+
#
|
3564
|
+
# @example
|
3565
|
+
# df = Polars::DataFrame.new(
|
3566
|
+
# {
|
3567
|
+
# "foo" => [1, 2, 3],
|
3568
|
+
# "bar" => [6, 7, 8],
|
3569
|
+
# "ham" => ["a", "b", "c"]
|
3570
|
+
# }
|
3571
|
+
# )
|
3572
|
+
# df.select("foo")
|
3573
|
+
# # =>
|
3574
|
+
# # shape: (3, 1)
|
3575
|
+
# # ┌─────┐
|
3576
|
+
# # │ foo │
|
3577
|
+
# # │ --- │
|
3578
|
+
# # │ i64 │
|
3579
|
+
# # ╞═════╡
|
3580
|
+
# # │ 1 │
|
3581
|
+
# # ├╌╌╌╌╌┤
|
3582
|
+
# # │ 2 │
|
3583
|
+
# # ├╌╌╌╌╌┤
|
3584
|
+
# # │ 3 │
|
3585
|
+
# # └─────┘
|
3586
|
+
#
|
3587
|
+
# @example
|
3588
|
+
# df.select(["foo", "bar"])
|
3589
|
+
# # =>
|
3590
|
+
# # shape: (3, 2)
|
3591
|
+
# # ┌─────┬─────┐
|
3592
|
+
# # │ foo ┆ bar │
|
3593
|
+
# # │ --- ┆ --- │
|
3594
|
+
# # │ i64 ┆ i64 │
|
3595
|
+
# # ╞═════╪═════╡
|
3596
|
+
# # │ 1 ┆ 6 │
|
3597
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
3598
|
+
# # │ 2 ┆ 7 │
|
3599
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
3600
|
+
# # │ 3 ┆ 8 │
|
3601
|
+
# # └─────┴─────┘
|
3602
|
+
#
|
3603
|
+
# @example
|
3604
|
+
# df.select(Polars.col("foo") + 1)
|
3605
|
+
# # =>
|
3606
|
+
# # shape: (3, 1)
|
3607
|
+
# # ┌─────┐
|
3608
|
+
# # │ foo │
|
3609
|
+
# # │ --- │
|
3610
|
+
# # │ i64 │
|
3611
|
+
# # ╞═════╡
|
3612
|
+
# # │ 2 │
|
3613
|
+
# # ├╌╌╌╌╌┤
|
3614
|
+
# # │ 3 │
|
3615
|
+
# # ├╌╌╌╌╌┤
|
3616
|
+
# # │ 4 │
|
3617
|
+
# # └─────┘
|
3618
|
+
#
|
3619
|
+
# @example
|
3620
|
+
# df.select([Polars.col("foo") + 1, Polars.col("bar") + 1])
|
3621
|
+
# # =>
|
3622
|
+
# # shape: (3, 2)
|
3623
|
+
# # ┌─────┬─────┐
|
3624
|
+
# # │ foo ┆ bar │
|
3625
|
+
# # │ --- ┆ --- │
|
3626
|
+
# # │ i64 ┆ i64 │
|
3627
|
+
# # ╞═════╪═════╡
|
3628
|
+
# # │ 2 ┆ 7 │
|
3629
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
3630
|
+
# # │ 3 ┆ 8 │
|
3631
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
3632
|
+
# # │ 4 ┆ 9 │
|
3633
|
+
# # └─────┴─────┘
|
3634
|
+
#
|
3635
|
+
# @example
|
3636
|
+
# df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0))
|
3637
|
+
# # =>
|
3638
|
+
# # shape: (3, 1)
|
3639
|
+
# # ┌─────────┐
|
3640
|
+
# # │ literal │
|
3641
|
+
# # │ --- │
|
3642
|
+
# # │ i64 │
|
3643
|
+
# # ╞═════════╡
|
3644
|
+
# # │ 0 │
|
3645
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
3646
|
+
# # │ 0 │
|
3647
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
3648
|
+
# # │ 10 │
|
3649
|
+
# # └─────────┘
|
3650
|
+
def select(exprs)
  # Run the projection through the lazy engine (no optimization needed for
  # a single eager step), then rewrap the collected native frame.
  collected = lazy.select(exprs).collect(no_optimization: true, string_cache: false)
  _from_rbdf(collected._df)
end
|
3658
|
+
|
3659
|
+
# Add or overwrite multiple columns in a DataFrame.
|
3660
|
+
#
|
3661
|
+
# @param exprs [Array]
|
3662
|
+
# Array of Expressions that evaluate to columns.
|
3663
|
+
#
|
3664
|
+
# @return [DataFrame]
|
3665
|
+
#
|
3666
|
+
# @example
|
3667
|
+
# df = Polars::DataFrame.new(
|
3668
|
+
# {
|
3669
|
+
# "a" => [1, 2, 3, 4],
|
3670
|
+
# "b" => [0.5, 4, 10, 13],
|
3671
|
+
# "c" => [true, true, false, true]
|
3672
|
+
# }
|
3673
|
+
# )
|
3674
|
+
# df.with_columns(
|
3675
|
+
# [
|
3676
|
+
# (Polars.col("a") ** 2).alias("a^2"),
|
3677
|
+
# (Polars.col("b") / 2).alias("b/2"),
|
3678
|
+
# (Polars.col("c").is_not).alias("not c")
|
3679
|
+
# ]
|
3680
|
+
# )
|
3681
|
+
# # =>
|
3682
|
+
# # shape: (4, 6)
|
3683
|
+
# # ┌─────┬──────┬───────┬──────┬──────┬───────┐
|
3684
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
3685
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3686
|
+
# # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
|
3687
|
+
# # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
|
3688
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
|
3689
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3690
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
|
3691
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3692
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
|
3693
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3694
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
|
3695
|
+
# # └─────┴──────┴───────┴──────┴──────┴───────┘
|
3696
|
+
def with_columns(exprs)
  # A single bare expression is promoted to a one-element array; nil is
  # passed through untouched.
  exprs = [exprs] unless exprs.nil? || exprs.is_a?(Array)
  lazy.with_columns(exprs).collect(no_optimization: true, string_cache: false)
end
|
3704
|
+
|
3705
|
+
# Get number of chunks used by the ChunkedArrays of this DataFrame.
|
3706
|
+
#
|
3707
|
+
# @param strategy ["first", "all"]
|
3708
|
+
# Return the number of chunks of the 'first' column,
|
3709
|
+
# or 'all' columns in this DataFrame.
|
3710
|
+
#
|
3711
|
+
# @return [Object]
|
3712
|
+
#
|
3713
|
+
# @example
|
3714
|
+
# df = Polars::DataFrame.new(
|
3715
|
+
# {
|
3716
|
+
# "a" => [1, 2, 3, 4],
|
3717
|
+
# "b" => [0.5, 4, 10, 13],
|
3718
|
+
# "c" => [true, true, false, true]
|
3719
|
+
# }
|
3720
|
+
# )
|
3721
|
+
# df.n_chunks
|
3722
|
+
# # => 1
|
3723
|
+
# df.n_chunks(strategy: "all")
|
3724
|
+
# # => [1, 1, 1]
|
3725
|
+
# Get number of chunks, either for the first column or for every column.
#
# @param strategy ["first", "all"] which columns to report on.
# @return [Integer, Array] an Integer for "first", an Array for "all".
# @raise [ArgumentError] if strategy is not recognized.
def n_chunks(strategy: "first")
  case strategy
  when "first"
    _df.n_chunks
  when "all"
    get_columns.map(&:n_chunks)
  else
    # Fix: the original message contained literal Python-style "{strategy}"
    # placeholders that were never substituted; use Ruby interpolation.
    raise ArgumentError, "Strategy: '#{strategy}' not understood. Choose one of 'first', 'all'"
  end
end
|
3734
|
+
|
3735
|
+
# Aggregate the columns of this DataFrame to their maximum value.
|
3736
|
+
#
|
3737
|
+
# @return [DataFrame]
|
3738
|
+
#
|
3739
|
+
# @example
|
3740
|
+
# df = Polars::DataFrame.new(
|
3741
|
+
# {
|
3742
|
+
# "foo" => [1, 2, 3],
|
3743
|
+
# "bar" => [6, 7, 8],
|
3744
|
+
# "ham" => ["a", "b", "c"]
|
3745
|
+
# }
|
3746
|
+
# )
|
3747
|
+
# df.max
|
3748
|
+
# # =>
|
3749
|
+
# # shape: (1, 3)
|
3750
|
+
# # ┌─────┬─────┬─────┐
|
3751
|
+
# # │ foo ┆ bar ┆ ham │
|
3752
|
+
# # │ --- ┆ --- ┆ --- │
|
3753
|
+
# # │ i64 ┆ i64 ┆ str │
|
3754
|
+
# # ╞═════╪═════╪═════╡
|
3755
|
+
# # │ 3 ┆ 8 ┆ c │
|
3756
|
+
# # └─────┴─────┴─────┘
|
3757
|
+
def max(axis: 0)
  # axis 0: column-wise maxima as a 1-row frame; axis 1: row-wise maxima
  # as a Series.
  case axis
  when 0 then _from_rbdf(_df.max)
  when 1 then Utils.wrap_s(_df.hmax)
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end
|
3766
|
+
|
3767
|
+
# Aggregate the columns of this DataFrame to their minimum value.
|
3768
|
+
#
|
3769
|
+
# @return [DataFrame]
|
3770
|
+
#
|
3771
|
+
# @example
|
3772
|
+
# df = Polars::DataFrame.new(
|
3773
|
+
# {
|
3774
|
+
# "foo" => [1, 2, 3],
|
3775
|
+
# "bar" => [6, 7, 8],
|
3776
|
+
# "ham" => ["a", "b", "c"]
|
3777
|
+
# }
|
3778
|
+
# )
|
3779
|
+
# df.min
|
3780
|
+
# # =>
|
3781
|
+
# # shape: (1, 3)
|
3782
|
+
# # ┌─────┬─────┬─────┐
|
3783
|
+
# # │ foo ┆ bar ┆ ham │
|
3784
|
+
# # │ --- ┆ --- ┆ --- │
|
3785
|
+
# # │ i64 ┆ i64 ┆ str │
|
3786
|
+
# # ╞═════╪═════╪═════╡
|
3787
|
+
# # │ 1 ┆ 6 ┆ a │
|
3788
|
+
# # └─────┴─────┴─────┘
|
3789
|
+
def min(axis: 0)
  # axis 0: column-wise minima as a 1-row frame; axis 1: row-wise minima
  # as a Series.
  case axis
  when 0 then _from_rbdf(_df.min)
  when 1 then Utils.wrap_s(_df.hmin)
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end
|
3798
|
+
|
3799
|
+
# Aggregate the columns of this DataFrame to their sum value.
|
3800
|
+
#
|
3801
|
+
# @param axis [Integer]
|
3802
|
+
# Either 0 or 1.
|
3803
|
+
# @param null_strategy ["ignore", "propagate"]
|
3804
|
+
# This argument is only used if axis == 1.
|
3805
|
+
#
|
3806
|
+
# @return [DataFrame]
|
3807
|
+
#
|
3808
|
+
# @example
|
3809
|
+
# df = Polars::DataFrame.new(
|
3810
|
+
# {
|
3811
|
+
# "foo" => [1, 2, 3],
|
3812
|
+
# "bar" => [6, 7, 8],
|
3813
|
+
# "ham" => ["a", "b", "c"],
|
3814
|
+
# }
|
3815
|
+
# )
|
3816
|
+
# df.sum
|
3817
|
+
# # =>
|
3818
|
+
# # shape: (1, 3)
|
3819
|
+
# # ┌─────┬─────┬──────┐
|
3820
|
+
# # │ foo ┆ bar ┆ ham │
|
3821
|
+
# # │ --- ┆ --- ┆ --- │
|
3822
|
+
# # │ i64 ┆ i64 ┆ str │
|
3823
|
+
# # ╞═════╪═════╪══════╡
|
3824
|
+
# # │ 6 ┆ 21 ┆ null │
|
3825
|
+
# # └─────┴─────┴──────┘
|
3826
|
+
#
|
3827
|
+
# @example
|
3828
|
+
# df.sum(axis: 1)
|
3829
|
+
# # =>
|
3830
|
+
# # shape: (3,)
|
3831
|
+
# # Series: 'foo' [str]
|
3832
|
+
# # [
|
3833
|
+
# # "16a"
|
3834
|
+
# # "27b"
|
3835
|
+
# # "38c"
|
3836
|
+
# # ]
|
3837
|
+
def sum(axis: 0, null_strategy: "ignore")
  # axis 0 sums each column; axis 1 sums across each row, honoring the
  # given null strategy.
  if axis == 0
    _from_rbdf(_df.sum)
  elsif axis == 1
    Utils.wrap_s(_df.hsum(null_strategy))
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end
|
3847
|
+
|
3848
|
+
# Aggregate the columns of this DataFrame to their mean value.
|
3849
|
+
#
|
3850
|
+
# @param axis [Integer]
|
3851
|
+
# Either 0 or 1.
|
3852
|
+
# @param null_strategy ["ignore", "propagate"]
|
3853
|
+
# This argument is only used if axis == 1.
|
3854
|
+
#
|
3855
|
+
# @return [DataFrame]
|
3856
|
+
#
|
3857
|
+
# @example
|
3858
|
+
# df = Polars::DataFrame.new(
|
3859
|
+
# {
|
3860
|
+
# "foo" => [1, 2, 3],
|
3861
|
+
# "bar" => [6, 7, 8],
|
3862
|
+
# "ham" => ["a", "b", "c"]
|
3863
|
+
# }
|
3864
|
+
# )
|
3865
|
+
# df.mean
|
3866
|
+
# # =>
|
3867
|
+
# # shape: (1, 3)
|
3868
|
+
# # ┌─────┬─────┬──────┐
|
3869
|
+
# # │ foo ┆ bar ┆ ham │
|
3870
|
+
# # │ --- ┆ --- ┆ --- │
|
3871
|
+
# # │ f64 ┆ f64 ┆ str │
|
3872
|
+
# # ╞═════╪═════╪══════╡
|
3873
|
+
# # │ 2.0 ┆ 7.0 ┆ null │
|
3874
|
+
# # └─────┴─────┴──────┘
|
3875
|
+
def mean(axis: 0, null_strategy: "ignore")
  # axis 0 averages each column; axis 1 averages across each row, honoring
  # the given null strategy.
  if axis == 0
    _from_rbdf(_df.mean)
  elsif axis == 1
    Utils.wrap_s(_df.hmean(null_strategy))
  else
    raise ArgumentError, "Axis should be 0 or 1."
  end
end
|
3885
|
+
|
3886
|
+
# Aggregate the columns of this DataFrame to their standard deviation value.
|
3887
|
+
#
|
3888
|
+
# @param ddof [Integer]
|
3889
|
+
# Degrees of freedom
|
3890
|
+
#
|
3891
|
+
# @return [DataFrame]
|
3892
|
+
#
|
3893
|
+
# @example
|
3894
|
+
# df = Polars::DataFrame.new(
|
3895
|
+
# {
|
3896
|
+
# "foo" => [1, 2, 3],
|
3897
|
+
# "bar" => [6, 7, 8],
|
3898
|
+
# "ham" => ["a", "b", "c"]
|
3899
|
+
# }
|
3900
|
+
# )
|
3901
|
+
# df.std
|
3902
|
+
# # =>
|
3903
|
+
# # shape: (1, 3)
|
3904
|
+
# # ┌─────┬─────┬──────┐
|
3905
|
+
# # │ foo ┆ bar ┆ ham │
|
3906
|
+
# # │ --- ┆ --- ┆ --- │
|
3907
|
+
# # │ f64 ┆ f64 ┆ str │
|
3908
|
+
# # ╞═════╪═════╪══════╡
|
3909
|
+
# # │ 1.0 ┆ 1.0 ┆ null │
|
3910
|
+
# # └─────┴─────┴──────┘
|
3911
|
+
#
|
3912
|
+
# @example
|
3913
|
+
# df.std(ddof: 0)
|
3914
|
+
# # =>
|
3915
|
+
# # shape: (1, 3)
|
3916
|
+
# # ┌──────────┬──────────┬──────┐
|
3917
|
+
# # │ foo ┆ bar ┆ ham │
|
3918
|
+
# # │ --- ┆ --- ┆ --- │
|
3919
|
+
# # │ f64 ┆ f64 ┆ str │
|
3920
|
+
# # ╞══════════╪══════════╪══════╡
|
3921
|
+
# # │ 0.816497 ┆ 0.816497 ┆ null │
|
3922
|
+
# # └──────────┴──────────┴──────┘
|
3923
|
+
def std(ddof: 1)
  # Column-wise standard deviation with the requested degrees of freedom.
  result = _df.std(ddof)
  _from_rbdf(result)
end
|
3926
|
+
|
3927
|
+
# Aggregate the columns of this DataFrame to their variance value.
|
3928
|
+
#
|
3929
|
+
# @param ddof [Integer]
|
3930
|
+
# Degrees of freedom
|
3931
|
+
#
|
3932
|
+
# @return [DataFrame]
|
3933
|
+
#
|
3934
|
+
# @example
|
3935
|
+
# df = Polars::DataFrame.new(
|
3936
|
+
# {
|
3937
|
+
# "foo" => [1, 2, 3],
|
3938
|
+
# "bar" => [6, 7, 8],
|
3939
|
+
# "ham" => ["a", "b", "c"]
|
3940
|
+
# }
|
3941
|
+
# )
|
3942
|
+
# df.var
|
3943
|
+
# # =>
|
3944
|
+
# # shape: (1, 3)
|
3945
|
+
# # ┌─────┬─────┬──────┐
|
3946
|
+
# # │ foo ┆ bar ┆ ham │
|
3947
|
+
# # │ --- ┆ --- ┆ --- │
|
3948
|
+
# # │ f64 ┆ f64 ┆ str │
|
3949
|
+
# # ╞═════╪═════╪══════╡
|
3950
|
+
# # │ 1.0 ┆ 1.0 ┆ null │
|
3951
|
+
# # └─────┴─────┴──────┘
|
3952
|
+
#
|
3953
|
+
# @example
|
3954
|
+
# df.var(ddof: 0)
|
3955
|
+
# # =>
|
3956
|
+
# # shape: (1, 3)
|
3957
|
+
# # ┌──────────┬──────────┬──────┐
|
3958
|
+
# # │ foo ┆ bar ┆ ham │
|
3959
|
+
# # │ --- ┆ --- ┆ --- │
|
3960
|
+
# # │ f64 ┆ f64 ┆ str │
|
3961
|
+
# # ╞══════════╪══════════╪══════╡
|
3962
|
+
# # │ 0.666667 ┆ 0.666667 ┆ null │
|
3963
|
+
# # └──────────┴──────────┴──────┘
|
3964
|
+
def var(ddof: 1)
  # Column-wise variance with the requested degrees of freedom.
  result = _df.var(ddof)
  _from_rbdf(result)
end
|
3967
|
+
|
3968
|
+
# Aggregate the columns of this DataFrame to their median value.
|
3969
|
+
#
|
3970
|
+
# @return [DataFrame]
|
3971
|
+
#
|
3972
|
+
# @example
|
3973
|
+
# df = Polars::DataFrame.new(
|
3974
|
+
# {
|
3975
|
+
# "foo" => [1, 2, 3],
|
3976
|
+
# "bar" => [6, 7, 8],
|
3977
|
+
# "ham" => ["a", "b", "c"]
|
3978
|
+
# }
|
3979
|
+
# )
|
3980
|
+
# df.median
|
3981
|
+
# # =>
|
3982
|
+
# # shape: (1, 3)
|
3983
|
+
# # ┌─────┬─────┬──────┐
|
3984
|
+
# # │ foo ┆ bar ┆ ham │
|
3985
|
+
# # │ --- ┆ --- ┆ --- │
|
3986
|
+
# # │ f64 ┆ f64 ┆ str │
|
3987
|
+
# # ╞═════╪═════╪══════╡
|
3988
|
+
# # │ 2.0 ┆ 7.0 ┆ null │
|
3989
|
+
# # └─────┴─────┴──────┘
|
3990
|
+
def median
  # Column-wise median as a 1-row frame.
  result = _df.median
  _from_rbdf(result)
end
|
3993
|
+
|
3994
|
+
# Aggregate the columns of this DataFrame to their product values.
|
3995
|
+
#
|
3996
|
+
# @return [DataFrame]
|
3997
|
+
#
|
3998
|
+
# @example
|
3999
|
+
# df = Polars::DataFrame.new(
|
4000
|
+
# {
|
4001
|
+
# "a" => [1, 2, 3],
|
4002
|
+
# "b" => [0.5, 4, 10],
|
4003
|
+
# "c" => [true, true, false]
|
4004
|
+
# }
|
4005
|
+
# )
|
4006
|
+
# df.product
|
4007
|
+
# # =>
|
4008
|
+
# # shape: (1, 3)
|
4009
|
+
# # ┌─────┬──────┬─────┐
|
4010
|
+
# # │ a ┆ b ┆ c │
|
4011
|
+
# # │ --- ┆ --- ┆ --- │
|
4012
|
+
# # │ i64 ┆ f64 ┆ i64 │
|
4013
|
+
# # ╞═════╪══════╪═════╡
|
4014
|
+
# # │ 6 ┆ 20.0 ┆ 0 │
|
4015
|
+
# # └─────┴──────┴─────┘
|
4016
|
+
def product
  # Product of every column, expressed as a selection over all columns.
  everything = Polars.all
  select(everything.product)
end
|
4019
|
+
|
4020
|
+
# Aggregate the columns of this DataFrame to their quantile value.
|
4021
|
+
#
|
4022
|
+
# @param quantile [Float]
|
4023
|
+
# Quantile between 0.0 and 1.0.
|
4024
|
+
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
|
4025
|
+
# Interpolation method.
|
4026
|
+
#
|
4027
|
+
# @return [DataFrame]
|
4028
|
+
#
|
4029
|
+
# @example
|
4030
|
+
# df = Polars::DataFrame.new(
|
4031
|
+
# {
|
4032
|
+
# "foo" => [1, 2, 3],
|
4033
|
+
# "bar" => [6, 7, 8],
|
4034
|
+
# "ham" => ["a", "b", "c"]
|
4035
|
+
# }
|
4036
|
+
# )
|
4037
|
+
# df.quantile(0.5, interpolation: "nearest")
|
4038
|
+
# # =>
|
4039
|
+
# # shape: (1, 3)
|
4040
|
+
# # ┌─────┬─────┬──────┐
|
4041
|
+
# # │ foo ┆ bar ┆ ham │
|
4042
|
+
# # │ --- ┆ --- ┆ --- │
|
4043
|
+
# # │ f64 ┆ f64 ┆ str │
|
4044
|
+
# # ╞═════╪═════╪══════╡
|
4045
|
+
# # │ 2.0 ┆ 7.0 ┆ null │
|
4046
|
+
# # └─────┴─────┴──────┘
|
4047
|
+
def quantile(quantile, interpolation: "nearest")
  # Column-wise quantile using the chosen interpolation method.
  result = _df.quantile(quantile, interpolation)
  _from_rbdf(result)
end
|
4050
|
+
|
4051
|
+
# Get one hot encoded dummy variables.
|
4052
|
+
#
|
4053
|
+
# @param columns
|
4054
|
+
# A subset of columns to convert to dummy variables. `nil` means
|
4055
|
+
# "all columns".
|
4056
|
+
#
|
4057
|
+
# @return [DataFrame]
|
4058
|
+
#
|
4059
|
+
# @example
|
4060
|
+
# df = Polars::DataFrame.new(
|
4061
|
+
# {
|
4062
|
+
# "foo" => [1, 2],
|
4063
|
+
# "bar" => [3, 4],
|
4064
|
+
# "ham" => ["a", "b"]
|
4065
|
+
# }
|
4066
|
+
# )
|
4067
|
+
# df.to_dummies
|
4068
|
+
# # =>
|
4069
|
+
# # shape: (2, 6)
|
4070
|
+
# # ┌───────┬───────┬───────┬───────┬───────┬───────┐
|
4071
|
+
# # │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
|
4072
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
4073
|
+
# # │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
|
4074
|
+
# # ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
|
4075
|
+
# # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
|
4076
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
4077
|
+
# # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
|
4078
|
+
# # └───────┴───────┴───────┴───────┴───────┴───────┘
|
4079
|
+
def to_dummies(columns: nil)
  # A single column name is wrapped in an array; nil means all columns.
  columns = [columns] if columns.is_a?(String)
  _from_rbdf(_df.to_dummies(columns))
end
|
4085
|
+
|
4086
|
+
# Drop duplicate rows from this DataFrame.
|
4087
|
+
#
|
4088
|
+
# @param maintain_order [Boolean]
|
4089
|
+
# Keep the same order as the original DataFrame. This requires more work to
|
4090
|
+
# compute.
|
4091
|
+
# @param subset [Object]
|
4092
|
+
# Subset to use to compare rows.
|
4093
|
+
# @param keep ["first", "last"]
|
4094
|
+
# Which of the duplicate rows to keep (in conjunction with `subset`).
|
4095
|
+
#
|
4096
|
+
# @return [DataFrame]
|
4097
|
+
#
|
4098
|
+
# @note
|
4099
|
+
# Note that this fails if there is a column of type `List` in the DataFrame or
|
4100
|
+
# subset.
|
4101
|
+
#
|
4102
|
+
# @example
|
4103
|
+
# df = Polars::DataFrame.new(
|
4104
|
+
# {
|
4105
|
+
# "a" => [1, 1, 2, 3, 4, 5],
|
4106
|
+
# "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
|
4107
|
+
# "c" => [true, true, true, false, true, true]
|
4108
|
+
# }
|
4109
|
+
# )
|
4110
|
+
# df.unique
|
4111
|
+
# # =>
|
4112
|
+
# # shape: (5, 3)
|
4113
|
+
# # ┌─────┬─────┬───────┐
|
4114
|
+
# # │ a ┆ b ┆ c │
|
4115
|
+
# # │ --- ┆ --- ┆ --- │
|
4116
|
+
# # │ i64 ┆ f64 ┆ bool │
|
4117
|
+
# # ╞═════╪═════╪═══════╡
|
4118
|
+
# # │ 1 ┆ 0.5 ┆ true │
|
4119
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
4120
|
+
# # │ 2 ┆ 1.0 ┆ true │
|
4121
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
4122
|
+
# # │ 3 ┆ 2.0 ┆ false │
|
4123
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
4124
|
+
# # │ 4 ┆ 3.0 ┆ true │
|
4125
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
4126
|
+
# # │ 5 ┆ 3.0 ┆ true │
|
4127
|
+
# # └─────┴─────┴───────┘
|
4128
|
+
def unique(maintain_order: true, subset: nil, keep: "first")
  # Normalize subset: nil passes through, a String becomes a one-element
  # array, anything else non-Array is coerced via to_a.
  unless subset.nil?
    subset =
      if subset.is_a?(String)
        [subset]
      elsif subset.is_a?(Array)
        subset
      else
        subset.to_a
      end
  end

  _from_rbdf(_df.unique(maintain_order, subset, keep))
end
|
4139
|
+
|
4140
|
+
# Return the number of unique rows, or the number of unique row-subsets.
|
4141
|
+
#
|
4142
|
+
# @param subset [Object]
|
4143
|
+
# One or more columns/expressions that define what to count;
|
4144
|
+
# omit to return the count of unique rows.
|
4145
|
+
#
|
4146
|
+
# @return [DataFrame]
|
4147
|
+
#
|
4148
|
+
# @example
|
4149
|
+
# df = Polars::DataFrame.new(
|
4150
|
+
# {
|
4151
|
+
# "a" => [1, 1, 2, 3, 4, 5],
|
4152
|
+
# "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
|
4153
|
+
# "c" => [true, true, true, false, true, true]
|
4154
|
+
# }
|
4155
|
+
# )
|
4156
|
+
# df.n_unique
|
4157
|
+
# # => 5
|
4158
|
+
#
|
4159
|
+
# @example Simple columns subset
|
4160
|
+
# df.n_unique(subset: ["b", "c"])
|
4161
|
+
# # => 4
|
4162
|
+
#
|
4163
|
+
# @example Expression subset
|
4164
|
+
# df.n_unique(
|
4165
|
+
# subset: [
|
4166
|
+
# (Polars.col("a").floordiv(2)),
|
4167
|
+
# (Polars.col("c") | (Polars.col("b") >= 2))
|
4168
|
+
# ]
|
4169
|
+
# )
|
4170
|
+
# # => 3
|
4171
|
+
# Return the number of unique rows, or of unique row-subsets.
#
# @param subset [Object] column name, expression, or array of either;
#   nil counts unique whole rows.
# @return [Integer]
def n_unique(subset: nil)
  # Fix: the original tested `subset.is_a?(StringIO)`, so a plain String
  # column name was never converted to a column expression. It should be
  # `String` (mirrors `isinstance(subset, str)` in the Python source).
  if subset.is_a?(String)
    subset = [Polars.col(subset)]
  elsif subset.is_a?(Expr)
    subset = [subset]
  end

  # A single-item subset counts uniques of that expression directly;
  # multiple items (or nil => all columns) are packed into a struct so
  # row-wise combinations are counted.
  if subset.is_a?(Array) && subset.length == 1
    expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
  else
    struct_fields = subset.nil? ? Polars.all : subset
    expr = Polars.struct(struct_fields)
  end

  df = lazy.select(expr.n_unique).collect
  df.is_empty ? 0 : df.row(0)[0]
end
|
4188
|
+
|
4189
|
+
# Rechunk the data in this DataFrame to a contiguous allocation.
|
4190
|
+
|
4191
|
+
# This will make sure all subsequent operations have optimal and predictable
|
4192
|
+
# performance.
|
4193
|
+
#
|
4194
|
+
# @return [DataFrame]
|
4195
|
+
def rechunk
  # Ask the native frame for a contiguous copy and rewrap it.
  contiguous = _df.rechunk
  _from_rbdf(contiguous)
end
|
4198
|
+
|
4199
|
+
# Create a new DataFrame that shows the null counts per column.
|
4200
|
+
#
|
4201
|
+
# @return [DataFrame]
|
4202
|
+
#
|
4203
|
+
# @example
|
4204
|
+
# df = Polars::DataFrame.new(
|
4205
|
+
# {
|
4206
|
+
# "foo" => [1, nil, 3],
|
4207
|
+
# "bar" => [6, 7, nil],
|
4208
|
+
# "ham" => ["a", "b", "c"]
|
4209
|
+
# }
|
4210
|
+
# )
|
4211
|
+
# df.null_count
|
4212
|
+
# # =>
|
4213
|
+
# # shape: (1, 3)
|
4214
|
+
# # ┌─────┬─────┬─────┐
|
4215
|
+
# # │ foo ┆ bar ┆ ham │
|
4216
|
+
# # │ --- ┆ --- ┆ --- │
|
4217
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
4218
|
+
# # ╞═════╪═════╪═════╡
|
4219
|
+
# # │ 1 ┆ 1 ┆ 0 │
|
4220
|
+
# # └─────┴─────┴─────┘
|
4221
|
+
def null_count
  # Per-column null counts as a 1-row frame.
  counts = _df.null_count
  _from_rbdf(counts)
end
|
4224
|
+
|
4225
|
+
# Sample from this DataFrame.
|
4226
|
+
#
|
4227
|
+
# @param n [Integer]
|
4228
|
+
# Number of items to return. Cannot be used with `frac`. Defaults to 1 if
|
4229
|
+
# `frac` is nil.
|
4230
|
+
# @param frac [Float]
|
4231
|
+
# Fraction of items to return. Cannot be used with `n`.
|
4232
|
+
# @param with_replacement [Boolean]
|
4233
|
+
# Allow values to be sampled more than once.
|
4234
|
+
# @param shuffle [Boolean]
|
4235
|
+
# Shuffle the order of sampled data points.
|
4236
|
+
# @param seed [Integer]
|
4237
|
+
# Seed for the random number generator. If set to nil (default), a random
|
4238
|
+
# seed is used.
|
4239
|
+
#
|
4240
|
+
# @return [DataFrame]
|
4241
|
+
#
|
4242
|
+
# @example
|
4243
|
+
# df = Polars::DataFrame.new(
|
4244
|
+
# {
|
4245
|
+
# "foo" => [1, 2, 3],
|
4246
|
+
# "bar" => [6, 7, 8],
|
4247
|
+
# "ham" => ["a", "b", "c"]
|
4248
|
+
# }
|
4249
|
+
# )
|
4250
|
+
# df.sample(n: 2, seed: 0)
|
4251
|
+
# # =>
|
4252
|
+
# # shape: (2, 3)
|
4253
|
+
# # ┌─────┬─────┬─────┐
|
4254
|
+
# # │ foo ┆ bar ┆ ham │
|
4255
|
+
# # │ --- ┆ --- ┆ --- │
|
4256
|
+
# # │ i64 ┆ i64 ┆ str │
|
4257
|
+
# # ╞═════╪═════╪═════╡
|
4258
|
+
# # │ 3 ┆ 8 ┆ c │
|
4259
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
4260
|
+
# # │ 2 ┆ 7 ┆ b │
|
4261
|
+
# # └─────┴─────┴─────┘
|
4262
|
+
# Sample rows from this DataFrame, by count (`n`) or fraction (`frac`).
#
# @param n [Integer] number of rows; defaults to 1 when frac is nil.
# @param frac [Float] fraction of rows; mutually exclusive with n.
# @param with_replacement [Boolean] allow repeated rows.
# @param shuffle [Boolean] shuffle the sampled rows.
# @param seed [Integer] RNG seed; nil uses a random seed.
# @return [DataFrame]
# @raise [ArgumentError] if both n and frac are given.
def sample(
  n: nil,
  frac: nil,
  with_replacement: false,
  shuffle: false,
  seed: nil
)
  if !n.nil? && !frac.nil?
    raise ArgumentError, "cannot specify both `n` and `frac`"
  end

  # Fix: the original computed the fraction-based sample but never
  # returned it, falling through to sample n (defaulted to 1) rows
  # and discarding the fractional result.
  if n.nil? && !frac.nil?
    return _from_rbdf(
      _df.sample_frac(frac, with_replacement, shuffle, seed)
    )
  end

  n = 1 if n.nil?
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
end
|
4284
|
+
|
4285
|
+
# Apply a horizontal reduction on a DataFrame.
|
4286
|
+
#
|
4287
|
+
# This can be used to effectively determine aggregations on a row level, and can
|
4288
|
+
# be applied to any DataType that can be supercasted (casted to a similar parent
|
4289
|
+
# type).
|
4290
|
+
#
|
4291
|
+
# An example of the supercast rules when applying an arithmetic operation on two
|
4292
|
+
# DataTypes are for instance:
|
4293
|
+
#
|
4294
|
+
# i8 + str = str
|
4295
|
+
# f32 + i64 = f32
|
4296
|
+
# f32 + f64 = f64
|
4297
|
+
#
|
4298
|
+
# @return [Series]
|
4299
|
+
#
|
4300
|
+
# @example A horizontal sum operation:
|
4301
|
+
# df = Polars::DataFrame.new(
|
4302
|
+
# {
|
4303
|
+
# "a" => [2, 1, 3],
|
4304
|
+
# "b" => [1, 2, 3],
|
4305
|
+
# "c" => [1.0, 2.0, 3.0]
|
4306
|
+
# }
|
4307
|
+
# )
|
4308
|
+
# df.fold { |s1, s2| s1 + s2 }
|
4309
|
+
# # =>
|
4310
|
+
# # shape: (3,)
|
4311
|
+
# # Series: 'a' [f64]
|
4312
|
+
# # [
|
4313
|
+
# # 4.0
|
4314
|
+
# # 5.0
|
4315
|
+
# # 9.0
|
4316
|
+
# # ]
|
4317
|
+
#
|
4318
|
+
# @example A horizontal minimum operation:
|
4319
|
+
# df = Polars::DataFrame.new({"a" => [2, 1, 3], "b" => [1, 2, 3], "c" => [1.0, 2.0, 3.0]})
|
4320
|
+
# df.fold { |s1, s2| s1.zip_with(s1 < s2, s2) }
|
4321
|
+
# # =>
|
4322
|
+
# # shape: (3,)
|
4323
|
+
# # Series: 'a' [f64]
|
4324
|
+
# # [
|
4325
|
+
# # 1.0
|
4326
|
+
# # 1.0
|
4327
|
+
# # 3.0
|
4328
|
+
# # ]
|
4329
|
+
#
|
4330
|
+
# @example A horizontal string concatenation:
|
4331
|
+
# df = Polars::DataFrame.new(
|
4332
|
+
# {
|
4333
|
+
# "a" => ["foo", "bar", 2],
|
4334
|
+
# "b" => [1, 2, 3],
|
4335
|
+
# "c" => [1.0, 2.0, 3.0]
|
4336
|
+
# }
|
4337
|
+
# )
|
4338
|
+
# df.fold { |s1, s2| s1 + s2 }
|
4339
|
+
# # =>
|
4340
|
+
# # shape: (3,)
|
4341
|
+
# # Series: 'a' [str]
|
4342
|
+
# # [
|
4343
|
+
# # "foo11.0"
|
4344
|
+
# # "bar22.0"
|
4345
|
+
# # null
|
4346
|
+
# # ]
|
4347
|
+
#
|
4348
|
+
# @example A horizontal boolean or, similar to a row-wise .any():
|
4349
|
+
# df = Polars::DataFrame.new(
|
4350
|
+
# {
|
4351
|
+
# "a" => [false, false, true],
|
4352
|
+
# "b" => [false, true, false]
|
4353
|
+
# }
|
4354
|
+
# )
|
4355
|
+
# df.fold { |s1, s2| s1 | s2 }
|
4356
|
+
# # =>
|
4357
|
+
# # shape: (3,)
|
4358
|
+
# # Series: 'a' [bool]
|
4359
|
+
# # [
|
4360
|
+
# # false
|
4361
|
+
# # true
|
4362
|
+
# # true
|
4363
|
+
# # ]
|
4364
|
+
def fold(&operation)
  # Seed the reduction with the first column and fold every remaining
  # column through the caller's block, left to right.
  (1...width).reduce(to_series(0)) do |acc, i|
    operation.call(acc, to_series(i))
  end
end
|
4372
|
+
|
4373
|
+
# Get a row as tuple, either by index or by predicate.
|
4374
|
+
#
|
4375
|
+
# @param index [Object]
|
4376
|
+
# Row index.
|
4377
|
+
# @param by_predicate [Object]
|
4378
|
+
# Select the row according to a given expression/predicate.
|
4379
|
+
#
|
4380
|
+
# @return [Object]
|
4381
|
+
#
|
4382
|
+
# @note
|
4383
|
+
# The `index` and `by_predicate` params are mutually exclusive. Additionally,
|
4384
|
+
# to ensure clarity, the `by_predicate` parameter must be supplied by keyword.
|
4385
|
+
#
|
4386
|
+
# When using `by_predicate` it is an error condition if anything other than
|
4387
|
+
# one row is returned; more than one row raises `TooManyRowsReturned`, and
|
4388
|
+
# zero rows will raise `NoRowsReturned` (both inherit from `RowsException`).
|
4389
|
+
#
|
4390
|
+
# @example Return the row at the given index
|
4391
|
+
# df = Polars::DataFrame.new(
|
4392
|
+
# {
|
4393
|
+
# "foo" => [1, 2, 3],
|
4394
|
+
# "bar" => [6, 7, 8],
|
4395
|
+
# "ham" => ["a", "b", "c"]
|
4396
|
+
# }
|
4397
|
+
# )
|
4398
|
+
# df.row(2)
|
4399
|
+
# # => [3, 8, "c"]
|
4400
|
+
#
|
4401
|
+
# @example Return the row that matches the given predicate
|
4402
|
+
# df.row(by_predicate: Polars.col("ham") == "b")
|
4403
|
+
# # => [2, 7, "b"]
|
4404
|
+
# Get a single row, either by integer index or by a predicate expression.
#
# @param index [Integer] row index.
# @param by_predicate [Expr] expression that must match exactly one row.
# @return [Array]
# @raise [ArgumentError] if both or neither selector is given.
# @raise [TooManyRowsReturned] if the predicate matches more than one row.
# @raise [NoRowsReturned] if the predicate matches no rows.
def row(index = nil, by_predicate: nil)
  if !index.nil? && !by_predicate.nil?
    raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive"
  elsif index.is_a?(Expr)
    raise TypeError, "Expressions should be passed to the 'by_predicate' param"
  elsif index.is_a?(Integer)
    _df.row_tuple(index)
  elsif by_predicate.is_a?(Expr)
    rows = filter(by_predicate).rows
    n_rows = rows.length
    if n_rows > 1
      raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows"
    elsif n_rows == 0
      # Fix: the original message kept a Python f-string placeholder
      # "<{by_predicate!s}>" verbatim; use Ruby interpolation instead.
      raise NoRowsReturned, "Predicate #{by_predicate} returned no rows"
    end
    rows[0]
  else
    raise ArgumentError, "One of 'index' or 'by_predicate' must be set"
  end
end
|
4424
|
+
|
4425
|
+
# Convert columnar data to rows as Ruby arrays.
|
4426
|
+
#
|
4427
|
+
# @return [Array]
|
4428
|
+
#
|
4429
|
+
# @example
|
4430
|
+
# df = Polars::DataFrame.new(
|
4431
|
+
# {
|
4432
|
+
# "a" => [1, 3, 5],
|
4433
|
+
# "b" => [2, 4, 6]
|
4434
|
+
# }
|
4435
|
+
# )
|
4436
|
+
# df.rows
|
4437
|
+
# # => [[1, 2], [3, 4], [5, 6]]
|
4438
|
+
def rows
  # Materialize every row as a Ruby array via the native frame.
  tuples = _df.row_tuples
  tuples
end
|
4441
|
+
|
4442
|
+
# Shrink DataFrame memory usage.
|
4443
|
+
#
|
4444
|
+
# Shrinks to fit the exact capacity needed to hold the data.
|
4445
|
+
#
|
4446
|
+
# @return [DataFrame]
|
4447
|
+
def shrink_to_fit(in_place: false)
  # In-place shrinks self; otherwise shrink a clone and leave self intact.
  target = in_place ? self : clone
  target._df.shrink_to_fit
  target
end
|
4457
|
+
|
4458
|
+
# Take every nth row in the DataFrame and return as a new DataFrame.
|
4459
|
+
#
|
4460
|
+
# @return [DataFrame]
|
4461
|
+
#
|
4462
|
+
# @example
|
4463
|
+
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
|
4464
|
+
# s.take_every(2)
|
4465
|
+
# # =>
|
4466
|
+
# # shape: (2, 2)
|
4467
|
+
# # ┌─────┬─────┐
|
4468
|
+
# # │ a ┆ b │
|
4469
|
+
# # │ --- ┆ --- │
|
4470
|
+
# # │ i64 ┆ i64 │
|
4471
|
+
# # ╞═════╪═════╡
|
4472
|
+
# # │ 1 ┆ 5 │
|
4473
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
4474
|
+
# # │ 3 ┆ 7 │
|
4475
|
+
# # └─────┴─────┘
|
4476
|
+
def take_every(n)
  # Expressed as a wildcard selection with a stride of n.
  every_nth = Utils.col("*").take_every(n)
  select(every_nth)
end
|
4479
|
+
|
4480
|
+
# Hash and combine the rows in this DataFrame.
|
4481
|
+
#
|
4482
|
+
# The hash value is of type `:u64`.
|
4483
|
+
#
|
4484
|
+
# @param seed [Integer]
|
4485
|
+
# Random seed parameter. Defaults to 0.
|
4486
|
+
# @param seed_1 [Integer]
|
4487
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
4488
|
+
# @param seed_2 [Integer]
|
4489
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
4490
|
+
# @param seed_3 [Integer]
|
4491
|
+
# Random seed parameter. Defaults to `seed` if not set.
|
4492
|
+
#
|
4493
|
+
# @return [Series]
|
4494
|
+
#
|
4495
|
+
# @example
|
4496
|
+
# df = Polars::DataFrame.new(
|
4497
|
+
# {
|
4498
|
+
# "foo" => [1, nil, 3, 4],
|
4499
|
+
# "ham" => ["a", "b", nil, "d"]
|
4500
|
+
# }
|
4501
|
+
# )
|
4502
|
+
# df.hash_rows(seed: 42)
|
4503
|
+
# # =>
|
4504
|
+
# # shape: (4,)
|
4505
|
+
# # Series: '' [u64]
|
4506
|
+
# # [
|
4507
|
+
# # 4238614331852490969
|
4508
|
+
# # 17976148875586754089
|
4509
|
+
# # 4702262519505526977
|
4510
|
+
# # 18144177983981041107
|
4511
|
+
# # ]
|
4512
|
+
def hash_rows(seed: 0, seed_1: nil, seed_2: nil, seed_3: nil)
  # Each unset secondary seed falls back to the primary seed.
  k1, k2, k3 = [seed_1, seed_2, seed_3].map { |s| s.nil? ? seed : s }
  Utils.wrap_s(_df.hash_rows(seed, k1, k2, k3))
end
|
4519
|
+
|
4520
|
+
# Interpolate intermediate values. The interpolation method is linear.
|
4521
|
+
#
|
4522
|
+
# @return [DataFrame]
|
4523
|
+
#
|
4524
|
+
# @example
|
4525
|
+
# df = Polars::DataFrame.new(
|
4526
|
+
# {
|
4527
|
+
# "foo" => [1, nil, 9, 10],
|
4528
|
+
# "bar" => [6, 7, 9, nil],
|
4529
|
+
# "baz" => [1, nil, nil, 9]
|
4530
|
+
# }
|
4531
|
+
# )
|
4532
|
+
# df.interpolate
|
4533
|
+
# # =>
|
4534
|
+
# # shape: (4, 3)
|
4535
|
+
# # ┌─────┬──────┬─────┐
|
4536
|
+
# # │ foo ┆ bar ┆ baz │
|
4537
|
+
# # │ --- ┆ --- ┆ --- │
|
4538
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
4539
|
+
# # ╞═════╪══════╪═════╡
|
4540
|
+
# # │ 1 ┆ 6 ┆ 1 │
|
4541
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
4542
|
+
# # │ 5 ┆ 7 ┆ 3 │
|
4543
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
4544
|
+
# # │ 9 ┆ 9 ┆ 6 │
|
4545
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
4546
|
+
# # │ 10 ┆ null ┆ 9 │
|
4547
|
+
# # └─────┴──────┴─────┘
|
4548
|
+
def interpolate
  # Linear interpolation applied to every column via a wildcard selection.
  all_interpolated = Utils.col("*").interpolate
  select(all_interpolated)
end
|
4551
|
+
|
4552
|
+
# Check if the dataframe is empty.
|
4553
|
+
#
|
4554
|
+
# @return [Boolean]
|
4555
|
+
#
|
4556
|
+
# @example
|
4557
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
4558
|
+
# df.is_empty
|
4559
|
+
# # => false
|
4560
|
+
# df.filter(Polars.col("foo") > 99).is_empty
|
4561
|
+
# # => true
|
4562
|
+
def is_empty
  # A frame with zero rows is empty regardless of its column count.
  height.zero?
end
alias_method :empty?, :is_empty
|
4566
|
+
|
4567
|
+
# Convert a `DataFrame` to a `Series` of type `Struct`.
|
4568
|
+
#
|
4569
|
+
# @param name [String]
|
4570
|
+
# Name for the struct Series
|
4571
|
+
#
|
4572
|
+
# @return [Series]
|
4573
|
+
#
|
4574
|
+
# @example
|
4575
|
+
# df = Polars::DataFrame.new(
|
4576
|
+
# {
|
4577
|
+
# "a" => [1, 2, 3, 4, 5],
|
4578
|
+
# "b" => ["one", "two", "three", "four", "five"]
|
4579
|
+
# }
|
4580
|
+
# )
|
4581
|
+
# df.to_struct("nums")
|
4582
|
+
# # =>
|
4583
|
+
# # shape: (5,)
|
4584
|
+
# # Series: 'nums' [struct[2]]
|
4585
|
+
# # [
|
4586
|
+
# # {1,"one"}
|
4587
|
+
# # {2,"two"}
|
4588
|
+
# # {3,"three"}
|
4589
|
+
# # {4,"four"}
|
4590
|
+
# # {5,"five"}
|
4591
|
+
# # ]
|
4592
|
+
def to_struct(name)
|
4593
|
+
Utils.wrap_s(_df.to_struct(name))
|
4594
|
+
end
|
4595
|
+
|
4596
|
+
# Decompose a struct into its fields.
|
4597
|
+
#
|
4598
|
+
# The fields will be inserted into the `DataFrame` on the location of the
|
4599
|
+
# `struct` type.
|
4600
|
+
#
|
4601
|
+
# @param names [Object]
|
4602
|
+
# Names of the struct columns that will be decomposed by its fields
|
4603
|
+
#
|
4604
|
+
# @return [DataFrame]
|
4605
|
+
#
|
4606
|
+
# @example
|
4607
|
+
# df = Polars::DataFrame.new(
|
4608
|
+
# {
|
4609
|
+
# "before" => ["foo", "bar"],
|
4610
|
+
# "t_a" => [1, 2],
|
4611
|
+
# "t_b" => ["a", "b"],
|
4612
|
+
# "t_c" => [true, nil],
|
4613
|
+
# "t_d" => [[1, 2], [3]],
|
4614
|
+
# "after" => ["baz", "womp"]
|
4615
|
+
# }
|
4616
|
+
# ).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"])
|
4617
|
+
# df.unnest("t_struct")
|
4618
|
+
# # =>
|
4619
|
+
# # shape: (2, 6)
|
4620
|
+
# # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
|
4621
|
+
# # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
|
4622
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
4623
|
+
# # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
|
4624
|
+
# # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
|
4625
|
+
# # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
|
4626
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
4627
|
+
# # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
|
4628
|
+
# # └────────┴─────┴─────┴──────┴───────────┴───────┘
|
4629
|
+
def unnest(names)
|
4630
|
+
if names.is_a?(String)
|
4631
|
+
names = [names]
|
4632
|
+
end
|
4633
|
+
_from_rbdf(_df.unnest(names))
|
4634
|
+
end
|
4635
|
+
|
4636
|
+
private
|
4637
|
+
|
4638
|
+
    # Hook invoked by `dup`/`clone`: after the default field copy, replace the
    # shared native handle with a fresh clone so the copy does not alias the
    # original's underlying Rust DataFrame.
    def initialize_copy(other)
      super
      self._df = _df._clone
    end
|
4642
|
+
|
4643
|
+
def _pos_idx(idx, dim)
|
4644
|
+
if idx >= 0
|
4645
|
+
idx
|
4646
|
+
else
|
4647
|
+
shape[dim] + idx
|
4648
|
+
end
|
4649
|
+
end
|
4650
|
+
|
4651
|
+
# def _pos_idxs
|
4652
|
+
# end
|
4653
|
+
|
4654
|
+
    # @private
    # Build an RbDataFrame (native Rust frame) from a Hash of
    # column name => values.
    #
    # When `columns` is provided it may carry dtype overrides; each column is
    # then materialized as a Series with the requested dtype and renamed via
    # `_handle_columns_arg`. Without `columns`, the Hash goes straight to the
    # native reader.
    def self.hash_to_rbdf(data, columns: nil)
      if !columns.nil?
        columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)

        # NOTE(review): `dtypes` is a Hash and therefore always truthy, so the
        # `&& dtypes` guard never filters — any empty `data` takes this branch.
        # Harmless in practice (missing keys look up as nil dtype), but confirm
        # this matches the intent of the upstream Python code.
        if data.empty? && dtypes
          data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
        else
          data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
        end
        data_series = _handle_columns_arg(data_series, columns: columns)
        return RbDataFrame.new(data_series)
      end

      RbDataFrame.read_hash(data)
    end
|
4670
|
+
|
4671
|
+
# @private
|
4672
|
+
def self._unpack_columns(columns, lookup_names: nil, n_expected: nil)
|
4673
|
+
if columns.is_a?(Hash)
|
4674
|
+
columns = columns.to_a
|
4675
|
+
end
|
4676
|
+
column_names =
|
4677
|
+
(columns || []).map.with_index do |col, i|
|
4678
|
+
if col.is_a?(String)
|
4679
|
+
col || "column_#{i}"
|
4680
|
+
else
|
4681
|
+
col[0]
|
4682
|
+
end
|
4683
|
+
end
|
4684
|
+
if column_names.empty? && n_expected
|
4685
|
+
column_names = n_expected.times.map { |i| "column_#{i}" }
|
4686
|
+
end
|
4687
|
+
# TODO zip_longest
|
4688
|
+
lookup = column_names.zip(lookup_names || []).to_h
|
4689
|
+
|
4690
|
+
[
|
4691
|
+
column_names,
|
4692
|
+
(columns || []).select { |col| !col.is_a?(String) && col[1] }.to_h do |col|
|
4693
|
+
[lookup[col[0]] || col[0], col[1]]
|
4694
|
+
end
|
4695
|
+
]
|
4696
|
+
end
|
4697
|
+
|
4698
|
+
def self._handle_columns_arg(data, columns: nil)
|
4699
|
+
if columns.nil?
|
4700
|
+
data
|
4701
|
+
else
|
4702
|
+
if data.empty?
|
4703
|
+
columns.map { |c| Series.new(c, nil)._s }
|
4704
|
+
elsif data.length == columns.length
|
4705
|
+
columns.each_with_index do |c, i|
|
4706
|
+
# not in-place?
|
4707
|
+
data[i].rename(c)
|
4708
|
+
end
|
4709
|
+
data
|
4710
|
+
else
|
4711
|
+
raise ArgumentError, "Dimensions of columns arg must match data dimensions."
|
4712
|
+
end
|
4713
|
+
end
|
4714
|
+
end
|
4715
|
+
|
4716
|
+
    # @private
    # Build an RbDataFrame from an Array of columns/rows.
    #
    # Only an Array of Series is fully implemented; Array-of-Array input
    # ("row"/"col" orientation) still raises Todo in this version.
    def self.sequence_to_rbdf(data, columns: nil, orient: nil)
      if data.length == 0
        # Empty input: delegate to the Hash constructor so `columns` (and any
        # dtype overrides) are still honored.
        return hash_to_rbdf({}, columns: columns)
      end

      if data[0].is_a?(Series)
        # series_names = data.map(&:name)
        # columns, dtypes = _unpack_columns(columns || series_names, n_expected: data.length)
        data_series = []
        data.each do |s|
          data_series << s._s
        end
      elsif data[0].is_a?(Array)
        # Infer orientation from the relative lengths when not specified.
        if orient.nil? && !columns.nil?
          orient = columns.length == data.length ? "col" : "row"
        end

        if orient == "row"
          raise Todo
        elsif orient == "col" || orient.nil?
          raise Todo
        else
          raise ArgumentError, "orient must be one of {{'col', 'row', nil}}, got #{orient} instead."
        end
      end

      # NOTE(review): if data[0] is neither Series nor Array, `data_series` is
      # nil here and the call below fails with NoMethodError rather than a
      # clear error — confirm whether an explicit ArgumentError is wanted.
      data_series = _handle_columns_arg(data_series, columns: columns)
      RbDataFrame.new(data_series)
    end
|
4746
|
+
|
4747
|
+
# @private
|
4748
|
+
def self.series_to_rbdf(data, columns: nil)
|
4749
|
+
if columns
|
4750
|
+
raise Todo
|
4751
|
+
end
|
4752
|
+
RbDataFrame.new([data._s])
|
4753
|
+
end
|
4754
|
+
|
4755
|
+
    # Wrap a native lazy-frame handle in a Polars::LazyFrame.
    def wrap_ldf(ldf)
      LazyFrame._from_rbldf(ldf)
    end
|
4758
|
+
|
4759
|
+
    # Wrap a native frame handle in a new instance of this class
    # (instance-level convenience for the class-level constructor).
    def _from_rbdf(rb_df)
      self.class._from_rbdf(rb_df)
    end
|
4762
|
+
|
4763
|
+
def _comp(other, op)
|
4764
|
+
if other.is_a?(DataFrame)
|
4765
|
+
_compare_to_other_df(other, op)
|
4766
|
+
else
|
4767
|
+
_compare_to_non_df(other, op)
|
4768
|
+
end
|
4769
|
+
end
|
4770
|
+
|
4771
|
+
def _compare_to_other_df(other, op)
|
4772
|
+
if columns != other.columns
|
4773
|
+
raise ArgmentError, "DataFrame columns do not match"
|
4774
|
+
end
|
4775
|
+
if shape != other.shape
|
4776
|
+
raise ArgmentError, "DataFrame dimensions do not match"
|
4777
|
+
end
|
4778
|
+
|
4779
|
+
suffix = "__POLARS_CMP_OTHER"
|
4780
|
+
other_renamed = other.select(Polars.all.suffix(suffix))
|
4781
|
+
combined = Polars.concat([self, other_renamed], how: "horizontal")
|
4782
|
+
|
4783
|
+
expr = case op
|
4784
|
+
when "eq"
|
4785
|
+
columns.map { |n| Polars.col(n) == Polars.col("#{n}#{suffix}") }
|
4786
|
+
when "neq"
|
4787
|
+
columns.map { |n| Polars.col(n) != Polars.col("#{n}#{suffix}") }
|
4788
|
+
when "gt"
|
4789
|
+
columns.map { |n| Polars.col(n) > Polars.col("#{n}#{suffix}") }
|
4790
|
+
when "lt"
|
4791
|
+
columns.map { |n| Polars.col(n) < Polars.col("#{n}#{suffix}") }
|
4792
|
+
when "gt_eq"
|
4793
|
+
columns.map { |n| Polars.col(n) >= Polars.col("#{n}#{suffix}") }
|
4794
|
+
when "lt_eq"
|
4795
|
+
columns.map { |n| Polars.col(n) <= Polars.col("#{n}#{suffix}") }
|
4796
|
+
else
|
4797
|
+
raise ArgumentError, "got unexpected comparison operator: #{op}"
|
4798
|
+
end
|
4799
|
+
|
4800
|
+
combined.select(expr)
|
4801
|
+
end
|
4802
|
+
|
4803
|
+
def _compare_to_non_df(other, op)
|
4804
|
+
case op
|
4805
|
+
when "eq"
|
4806
|
+
select(Polars.all == other)
|
4807
|
+
when "neq"
|
4808
|
+
select(Polars.all != other)
|
4809
|
+
when "gt"
|
4810
|
+
select(Polars.all > other)
|
4811
|
+
when "lt"
|
4812
|
+
select(Polars.all < other)
|
4813
|
+
when "gt_eq"
|
4814
|
+
select(Polars.all >= other)
|
4815
|
+
when "lt_eq"
|
4816
|
+
select(Polars.all <= other)
|
4817
|
+
else
|
4818
|
+
raise ArgumentError, "got unexpected comparison operator: #{op}"
|
4819
|
+
end
|
4820
|
+
end
|
4821
|
+
|
4822
|
+
def _prepare_other_arg(other)
|
4823
|
+
if !other.is_a?(Series)
|
4824
|
+
if other.is_a?(Array)
|
4825
|
+
raise ArgumentError, "Operation not supported."
|
4826
|
+
end
|
4827
|
+
|
4828
|
+
other = Series.new("", [other])
|
4829
|
+
end
|
4830
|
+
other
|
4831
|
+
end
|
4832
|
+
end
|
4833
|
+
end
|