polars-df 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +2 -1
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +139 -6
- data/ext/polars/src/dataframe.rs +360 -15
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +135 -3
- data/ext/polars/src/lazy/dsl.rs +97 -2
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +227 -12
- data/ext/polars/src/series.rs +190 -38
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/data_frame.rb +2813 -100
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +631 -11
- data/lib/polars/expr_dispatch.rb +14 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +517 -0
- data/lib/polars/io.rb +763 -4
- data/lib/polars/lazy_frame.rb +1415 -67
- data/lib/polars/lazy_functions.rb +430 -9
- data/lib/polars/lazy_group_by.rb +79 -0
- data/lib/polars/list_expr.rb +5 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2244 -192
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +663 -2
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/utils.rb +76 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +8 -2
- metadata +12 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -1,8 +1,22 @@
|
|
1
1
|
module Polars
|
2
|
+
# Two-dimensional data structure representing data as a table with rows and columns.
|
2
3
|
class DataFrame
|
4
|
+
# @private
|
3
5
|
attr_accessor :_df
|
4
6
|
|
5
|
-
|
7
|
+
# Create a new DataFrame.
|
8
|
+
#
|
9
|
+
# @param data [Hash, Array, Series, nil]
|
10
|
+
# Two-dimensional data in various forms. Hash must contain Arrays.
|
11
|
+
# Array may contain Series.
|
12
|
+
# @param columns [Array, Hash, nil]
|
13
|
+
# Column labels to use for resulting DataFrame. If specified, overrides any
|
14
|
+
# labels already present in the data. Must match data dimensions.
|
15
|
+
# @param orient ["col", "row", nil]
|
16
|
+
# Whether to interpret two-dimensional data as columns or as rows. If `nil`,
|
17
|
+
# the orientation is inferred by matching the columns and data dimensions. If
|
18
|
+
# this does not yield conclusive results, column orientation is used.
|
19
|
+
def initialize(data = nil, columns: nil, orient: nil)
|
6
20
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
7
21
|
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
|
8
22
|
data = {}
|
@@ -12,41 +26,204 @@ module Polars
|
|
12
26
|
end
|
13
27
|
|
14
28
|
if data.nil?
|
15
|
-
self._df = hash_to_rbdf({})
|
29
|
+
self._df = hash_to_rbdf({}, columns: columns)
|
16
30
|
elsif data.is_a?(Hash)
|
17
31
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
18
|
-
self._df = hash_to_rbdf(data)
|
32
|
+
self._df = hash_to_rbdf(data, columns: columns)
|
19
33
|
elsif data.is_a?(Array)
|
20
|
-
self._df = sequence_to_rbdf(data)
|
34
|
+
self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
|
21
35
|
elsif data.is_a?(Series)
|
22
|
-
self._df = series_to_rbdf(data)
|
36
|
+
self._df = series_to_rbdf(data, columns: columns)
|
23
37
|
else
|
24
38
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
25
39
|
end
|
26
40
|
end
|
27
41
|
|
42
|
+
# @private
|
28
43
|
def self._from_rbdf(rb_df)
|
29
44
|
df = DataFrame.allocate
|
30
45
|
df._df = rb_df
|
31
46
|
df
|
32
47
|
end
|
33
48
|
|
34
|
-
def self.
|
49
|
+
# def self._from_hashes
|
50
|
+
# end
|
51
|
+
|
52
|
+
# def self._from_hash
|
53
|
+
# end
|
54
|
+
|
55
|
+
# def self._from_records
|
56
|
+
# end
|
57
|
+
|
58
|
+
# def self._from_numo
|
59
|
+
# end
|
60
|
+
|
61
|
+
# no self._from_arrow
|
62
|
+
|
63
|
+
# no self._from_pandas
|
64
|
+
|
65
|
+
# @private
|
66
|
+
def self._read_csv(
|
67
|
+
file,
|
68
|
+
has_header: true,
|
69
|
+
columns: nil,
|
70
|
+
sep: str = ",",
|
71
|
+
comment_char: nil,
|
72
|
+
quote_char: '"',
|
73
|
+
skip_rows: 0,
|
74
|
+
dtypes: nil,
|
75
|
+
null_values: nil,
|
76
|
+
ignore_errors: false,
|
77
|
+
parse_dates: false,
|
78
|
+
n_threads: nil,
|
79
|
+
infer_schema_length: 100,
|
80
|
+
batch_size: 8192,
|
81
|
+
n_rows: nil,
|
82
|
+
encoding: "utf8",
|
83
|
+
low_memory: false,
|
84
|
+
rechunk: true,
|
85
|
+
skip_rows_after_header: 0,
|
86
|
+
row_count_name: nil,
|
87
|
+
row_count_offset: 0,
|
88
|
+
sample_size: 1024,
|
89
|
+
eol_char: "\n"
|
90
|
+
)
|
91
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
92
|
+
path = Utils.format_path(file)
|
93
|
+
else
|
94
|
+
path = nil
|
95
|
+
# if defined?(StringIO) && file.is_a?(StringIO)
|
96
|
+
# file = file.string
|
97
|
+
# end
|
98
|
+
end
|
99
|
+
|
100
|
+
dtype_list = nil
|
101
|
+
dtype_slice = nil
|
102
|
+
if !dtypes.nil?
|
103
|
+
if dtypes.is_a?(Hash)
|
104
|
+
dtype_list = []
|
105
|
+
dtypes.each do|k, v|
|
106
|
+
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
107
|
+
end
|
108
|
+
elsif dtypes.is_a?(Array)
|
109
|
+
dtype_slice = dtypes
|
110
|
+
else
|
111
|
+
raise ArgumentError, "dtype arg should be list or dict"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
processed_null_values = Utils._process_null_values(null_values)
|
116
|
+
|
117
|
+
if columns.is_a?(String)
|
118
|
+
columns = [columns]
|
119
|
+
end
|
120
|
+
if file.is_a?(String) && file.include?("*")
|
121
|
+
raise Todo
|
122
|
+
end
|
123
|
+
|
124
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
125
|
+
|
126
|
+
_from_rbdf(
|
127
|
+
RbDataFrame.read_csv(
|
128
|
+
file,
|
129
|
+
infer_schema_length,
|
130
|
+
batch_size,
|
131
|
+
has_header,
|
132
|
+
ignore_errors,
|
133
|
+
n_rows,
|
134
|
+
skip_rows,
|
135
|
+
projection,
|
136
|
+
sep,
|
137
|
+
rechunk,
|
138
|
+
columns,
|
139
|
+
encoding,
|
140
|
+
n_threads,
|
141
|
+
path,
|
142
|
+
dtype_list,
|
143
|
+
dtype_slice,
|
144
|
+
low_memory,
|
145
|
+
comment_char,
|
146
|
+
quote_char,
|
147
|
+
processed_null_values,
|
148
|
+
parse_dates,
|
149
|
+
skip_rows_after_header,
|
150
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
151
|
+
sample_size,
|
152
|
+
eol_char
|
153
|
+
)
|
154
|
+
)
|
155
|
+
end
|
156
|
+
|
157
|
+
# @private
|
158
|
+
def self._read_parquet(
|
159
|
+
file,
|
160
|
+
columns: nil,
|
161
|
+
n_rows: nil,
|
162
|
+
parallel: "auto",
|
163
|
+
row_count_name: nil,
|
164
|
+
row_count_offset: 0,
|
165
|
+
low_memory: false
|
166
|
+
)
|
35
167
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
36
168
|
file = Utils.format_path(file)
|
37
169
|
end
|
38
170
|
|
39
|
-
|
171
|
+
if file.is_a?(String) && file.include?("*")
|
172
|
+
raise Todo
|
173
|
+
end
|
174
|
+
|
175
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
176
|
+
_from_rbdf(
|
177
|
+
RbDataFrame.read_parquet(
|
178
|
+
file,
|
179
|
+
columns,
|
180
|
+
projection,
|
181
|
+
n_rows,
|
182
|
+
parallel,
|
183
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
184
|
+
low_memory
|
185
|
+
)
|
186
|
+
)
|
40
187
|
end
|
41
188
|
|
42
|
-
def self.
|
189
|
+
# def self._read_avro
|
190
|
+
# end
|
191
|
+
|
192
|
+
# @private
|
193
|
+
def self._read_ipc(
|
194
|
+
file,
|
195
|
+
columns: nil,
|
196
|
+
n_rows: nil,
|
197
|
+
row_count_name: nil,
|
198
|
+
row_count_offset: 0,
|
199
|
+
rechunk: true,
|
200
|
+
memory_map: true
|
201
|
+
)
|
43
202
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
44
203
|
file = Utils.format_path(file)
|
45
204
|
end
|
205
|
+
if columns.is_a?(String)
|
206
|
+
columns = [columns]
|
207
|
+
end
|
208
|
+
|
209
|
+
if file.is_a?(String) && file.include?("*")
|
210
|
+
raise Todo
|
211
|
+
end
|
46
212
|
|
47
|
-
|
213
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
214
|
+
_from_rbdf(
|
215
|
+
RbDataFrame.read_ipc(
|
216
|
+
file,
|
217
|
+
columns,
|
218
|
+
projection,
|
219
|
+
n_rows,
|
220
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
221
|
+
memory_map
|
222
|
+
)
|
223
|
+
)
|
48
224
|
end
|
49
225
|
|
226
|
+
# @private
|
50
227
|
def self._read_json(file)
|
51
228
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
52
229
|
file = Utils.format_path(file)
|
@@ -55,6 +232,7 @@ module Polars
|
|
55
232
|
_from_rbdf(RbDataFrame.read_json(file))
|
56
233
|
end
|
57
234
|
|
235
|
+
# @private
|
58
236
|
def self._read_ndjson(file)
|
59
237
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
60
238
|
file = Utils.format_path(file)
|
@@ -63,83 +241,339 @@ module Polars
|
|
63
241
|
_from_rbdf(RbDataFrame.read_ndjson(file))
|
64
242
|
end
|
65
243
|
|
244
|
+
# Get the shape of the DataFrame.
|
245
|
+
#
|
246
|
+
# @return [Array]
|
247
|
+
#
|
248
|
+
# @example
|
249
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
250
|
+
# df.shape
|
251
|
+
# # => [5, 1]
|
66
252
|
def shape
|
67
253
|
_df.shape
|
68
254
|
end
|
69
255
|
|
256
|
+
# Get the height of the DataFrame.
|
257
|
+
#
|
258
|
+
# @return [Integer]
|
259
|
+
#
|
260
|
+
# @example
|
261
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
262
|
+
# df.height
|
263
|
+
# # => 5
|
70
264
|
def height
|
71
265
|
_df.height
|
72
266
|
end
|
73
267
|
|
268
|
+
# Get the width of the DataFrame.
|
269
|
+
#
|
270
|
+
# @return [Integer]
|
271
|
+
#
|
272
|
+
# @example
|
273
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
274
|
+
# df.width
|
275
|
+
# # => 1
|
74
276
|
def width
|
75
277
|
_df.width
|
76
278
|
end
|
77
279
|
|
280
|
+
# Get column names.
|
281
|
+
#
|
282
|
+
# @return [Array]
|
283
|
+
#
|
284
|
+
# @example
|
285
|
+
# df = Polars::DataFrame.new(
|
286
|
+
# {
|
287
|
+
# "foo" => [1, 2, 3],
|
288
|
+
# "bar" => [6, 7, 8],
|
289
|
+
# "ham" => ["a", "b", "c"]
|
290
|
+
# }
|
291
|
+
# )
|
292
|
+
# df.columns
|
293
|
+
# # => ["foo", "bar", "ham"]
|
78
294
|
def columns
|
79
295
|
_df.columns
|
80
296
|
end
|
81
297
|
|
298
|
+
# Change the column names of the DataFrame.
|
299
|
+
#
|
300
|
+
# @param columns [Array]
|
301
|
+
# A list with new names for the DataFrame.
|
302
|
+
# The length of the list should be equal to the width of the DataFrame.
|
303
|
+
#
|
304
|
+
# @return [Object]
|
305
|
+
#
|
306
|
+
# @example
|
307
|
+
# df = Polars::DataFrame.new(
|
308
|
+
# {
|
309
|
+
# "foo" => [1, 2, 3],
|
310
|
+
# "bar" => [6, 7, 8],
|
311
|
+
# "ham" => ["a", "b", "c"]
|
312
|
+
# }
|
313
|
+
# )
|
314
|
+
# df.columns = ["apple", "banana", "orange"]
|
315
|
+
# df
|
316
|
+
# # =>
|
317
|
+
# # shape: (3, 3)
|
318
|
+
# # ┌───────┬────────┬────────┐
|
319
|
+
# # │ apple ┆ banana ┆ orange │
|
320
|
+
# # │ --- ┆ --- ┆ --- │
|
321
|
+
# # │ i64 ┆ i64 ┆ str │
|
322
|
+
# # ╞═══════╪════════╪════════╡
|
323
|
+
# # │ 1 ┆ 6 ┆ a │
|
324
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
325
|
+
# # │ 2 ┆ 7 ┆ b │
|
326
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
327
|
+
# # │ 3 ┆ 8 ┆ c │
|
328
|
+
# # └───────┴────────┴────────┘
|
82
329
|
def columns=(columns)
|
83
330
|
_df.set_column_names(columns)
|
84
331
|
end
|
85
332
|
|
333
|
+
# Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
|
334
|
+
#
|
335
|
+
# @return [Array]
|
336
|
+
#
|
337
|
+
# @example
|
338
|
+
# df = Polars::DataFrame.new(
|
339
|
+
# {
|
340
|
+
# "foo" => [1, 2, 3],
|
341
|
+
# "bar" => [6.0, 7.0, 8.0],
|
342
|
+
# "ham" => ["a", "b", "c"]
|
343
|
+
# }
|
344
|
+
# )
|
345
|
+
# df.dtypes
|
346
|
+
# # => [:i64, :f64, :str]
|
86
347
|
def dtypes
|
87
|
-
_df.dtypes
|
348
|
+
_df.dtypes
|
88
349
|
end
|
89
350
|
|
351
|
+
# Get the schema.
|
352
|
+
#
|
353
|
+
# @return [Hash]
|
354
|
+
#
|
355
|
+
# @example
|
356
|
+
# df = Polars::DataFrame.new(
|
357
|
+
# {
|
358
|
+
# "foo" => [1, 2, 3],
|
359
|
+
# "bar" => [6.0, 7.0, 8.0],
|
360
|
+
# "ham" => ["a", "b", "c"]
|
361
|
+
# }
|
362
|
+
# )
|
363
|
+
# df.schema
|
364
|
+
# # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
|
90
365
|
def schema
|
91
366
|
columns.zip(dtypes).to_h
|
92
367
|
end
|
93
368
|
|
94
|
-
#
|
95
|
-
#
|
369
|
+
# Equal.
|
370
|
+
#
|
371
|
+
# @return [DataFrame]
|
372
|
+
def ==(other)
|
373
|
+
_comp(other, "eq")
|
374
|
+
end
|
96
375
|
|
97
|
-
#
|
98
|
-
#
|
376
|
+
# Not equal.
|
377
|
+
#
|
378
|
+
# @return [DataFrame]
|
379
|
+
def !=(other)
|
380
|
+
_comp(other, "neq")
|
381
|
+
end
|
99
382
|
|
100
|
-
#
|
101
|
-
#
|
383
|
+
# Greater than.
|
384
|
+
#
|
385
|
+
# @return [DataFrame]
|
386
|
+
def >(other)
|
387
|
+
_comp(other, "gt")
|
388
|
+
end
|
102
389
|
|
103
|
-
#
|
104
|
-
#
|
390
|
+
# Less than.
|
391
|
+
#
|
392
|
+
# @return [DataFrame]
|
393
|
+
def <(other)
|
394
|
+
_comp(other, "lt")
|
395
|
+
end
|
105
396
|
|
106
|
-
#
|
107
|
-
#
|
397
|
+
# Greater than or equal.
|
398
|
+
#
|
399
|
+
# @return [DataFrame]
|
400
|
+
def >=(other)
|
401
|
+
_comp(other, "gt_eq")
|
402
|
+
end
|
108
403
|
|
109
|
-
#
|
110
|
-
#
|
404
|
+
# Less than or equal.
|
405
|
+
#
|
406
|
+
# @return [DataFrame]
|
407
|
+
def <=(other)
|
408
|
+
_comp(other, "lt_eq")
|
409
|
+
end
|
111
410
|
|
112
|
-
#
|
113
|
-
#
|
411
|
+
# Performs multiplication.
|
412
|
+
#
|
413
|
+
# @return [DataFrame]
|
414
|
+
def *(other)
|
415
|
+
if other.is_a?(DataFrame)
|
416
|
+
return _from_rbdf(_df.mul_df(other._df))
|
417
|
+
end
|
114
418
|
|
115
|
-
|
116
|
-
|
419
|
+
other = _prepare_other_arg(other)
|
420
|
+
_from_rbdf(_df.mul(other._s))
|
421
|
+
end
|
117
422
|
|
118
|
-
#
|
119
|
-
#
|
423
|
+
# Performs division.
|
424
|
+
#
|
425
|
+
# @return [DataFrame]
|
426
|
+
def /(other)
|
427
|
+
if other.is_a?(DataFrame)
|
428
|
+
return _from_rbdf(_df.div_df(other._df))
|
429
|
+
end
|
120
430
|
|
121
|
-
|
122
|
-
|
431
|
+
other = _prepare_other_arg(other)
|
432
|
+
_from_rbdf(_df.div(other._s))
|
433
|
+
end
|
123
434
|
|
124
|
-
#
|
125
|
-
#
|
435
|
+
# Performs addition.
|
436
|
+
#
|
437
|
+
# @return [DataFrame]
|
438
|
+
def +(other)
|
439
|
+
if other.is_a?(DataFrame)
|
440
|
+
return _from_rbdf(_df.add_df(other._df))
|
441
|
+
end
|
442
|
+
|
443
|
+
other = _prepare_other_arg(other)
|
444
|
+
_from_rbdf(_df.add(other._s))
|
445
|
+
end
|
446
|
+
|
447
|
+
# Performs subtraction.
|
448
|
+
#
|
449
|
+
# @return [DataFrame]
|
450
|
+
def -(other)
|
451
|
+
if other.is_a?(DataFrame)
|
452
|
+
return _from_rbdf(_df.sub_df(other._df))
|
453
|
+
end
|
454
|
+
|
455
|
+
other = _prepare_other_arg(other)
|
456
|
+
_from_rbdf(_df.sub(other._s))
|
457
|
+
end
|
458
|
+
|
459
|
+
# Returns the modulo.
|
460
|
+
#
|
461
|
+
# @return [DataFrame]
|
462
|
+
def %(other)
|
463
|
+
if other.is_a?(DataFrame)
|
464
|
+
return _from_rbdf(_df.rem_df(other._df))
|
465
|
+
end
|
466
|
+
|
467
|
+
other = _prepare_other_arg(other)
|
468
|
+
_from_rbdf(_df.rem(other._s))
|
469
|
+
end
|
126
470
|
|
471
|
+
# Returns a string representing the DataFrame.
|
472
|
+
#
|
473
|
+
# @return [String]
|
127
474
|
def to_s
|
128
475
|
_df.to_s
|
129
476
|
end
|
130
477
|
alias_method :inspect, :to_s
|
131
478
|
|
479
|
+
# Check if DataFrame includes column.
|
480
|
+
#
|
481
|
+
# @return [Boolean]
|
132
482
|
def include?(name)
|
133
483
|
columns.include?(name)
|
134
484
|
end
|
135
485
|
|
136
|
-
def
|
137
|
-
|
486
|
+
# def each
|
487
|
+
# end
|
488
|
+
|
489
|
+
# def _pos_idx
|
490
|
+
# end
|
491
|
+
|
492
|
+
# def _pos_idxs
|
493
|
+
# end
|
494
|
+
|
495
|
+
# Returns subset of the DataFrame.
|
496
|
+
#
|
497
|
+
# @return [Object]
|
498
|
+
def [](*args)
|
499
|
+
if args.size == 2
|
500
|
+
row_selection, col_selection = args
|
501
|
+
|
502
|
+
# df[.., unknown]
|
503
|
+
if row_selection.is_a?(Range)
|
504
|
+
|
505
|
+
# multiple slices
|
506
|
+
# df[.., ..]
|
507
|
+
if col_selection.is_a?(Range)
|
508
|
+
raise Todo
|
509
|
+
end
|
510
|
+
end
|
511
|
+
|
512
|
+
# df[2, ..] (select row as df)
|
513
|
+
if row_selection.is_a?(Integer)
|
514
|
+
if col_selection.is_a?(Array)
|
515
|
+
df = self[0.., col_selection]
|
516
|
+
return df.slice(row_selection, 1)
|
517
|
+
end
|
518
|
+
# df[2, "a"]
|
519
|
+
if col_selection.is_a?(String)
|
520
|
+
return self[col_selection][row_selection]
|
521
|
+
end
|
522
|
+
end
|
523
|
+
|
524
|
+
# column selection can be "a" and ["a", "b"]
|
525
|
+
if col_selection.is_a?(String)
|
526
|
+
col_selection = [col_selection]
|
527
|
+
end
|
528
|
+
|
529
|
+
# df[.., 1]
|
530
|
+
if col_selection.is_a?(Integer)
|
531
|
+
series = to_series(col_selection)
|
532
|
+
return series[row_selection]
|
533
|
+
end
|
534
|
+
|
535
|
+
if col_selection.is_a?(Array)
|
536
|
+
# df[.., [1, 2]]
|
537
|
+
if is_int_sequence(col_selection)
|
538
|
+
series_list = col_selection.map { |i| to_series(i) }
|
539
|
+
df = self.class.new(series_list)
|
540
|
+
return df[row_selection]
|
541
|
+
end
|
542
|
+
end
|
543
|
+
|
544
|
+
df = self[col_selection]
|
545
|
+
return df[row_selection]
|
546
|
+
elsif args.size == 1
|
547
|
+
item = args[0]
|
548
|
+
|
549
|
+
# select single column
|
550
|
+
# df["foo"]
|
551
|
+
if item.is_a?(String)
|
552
|
+
return Utils.wrap_s(_df.column(item))
|
553
|
+
end
|
554
|
+
|
555
|
+
# df[idx]
|
556
|
+
if item.is_a?(Integer)
|
557
|
+
return slice(_pos_idx(item, dim: 0), 1)
|
558
|
+
end
|
559
|
+
|
560
|
+
# df[..]
|
561
|
+
if item.is_a?(Range)
|
562
|
+
return Slice.new(self).apply(item)
|
563
|
+
end
|
564
|
+
end
|
565
|
+
|
566
|
+
raise ArgumentError, "Cannot get item of type: #{item.class.name}"
|
138
567
|
end
|
139
568
|
|
140
569
|
# def []=(key, value)
|
141
570
|
# end
|
142
571
|
|
572
|
+
# no to_arrow
|
573
|
+
|
574
|
+
# Convert DataFrame to a hash mapping column name to values.
|
575
|
+
#
|
576
|
+
# @return [Hash]
|
143
577
|
def to_h(as_series: true)
|
144
578
|
if as_series
|
145
579
|
get_columns.to_h { |s| [s.name, s] }
|
@@ -148,7 +582,7 @@ module Polars
|
|
148
582
|
end
|
149
583
|
end
|
150
584
|
|
151
|
-
# def
|
585
|
+
# def to_hashes / to_a
|
152
586
|
# end
|
153
587
|
|
154
588
|
# def to_numo
|
@@ -156,6 +590,30 @@ module Polars
|
|
156
590
|
|
157
591
|
# no to_pandas
|
158
592
|
|
593
|
+
# Select column as Series at index location.
|
594
|
+
#
|
595
|
+
# @param index [Integer]
|
596
|
+
# Location of selection.
|
597
|
+
#
|
598
|
+
# @return [Series]
|
599
|
+
#
|
600
|
+
# @example
|
601
|
+
# df = Polars::DataFrame.new(
|
602
|
+
# {
|
603
|
+
# "foo" => [1, 2, 3],
|
604
|
+
# "bar" => [6, 7, 8],
|
605
|
+
# "ham" => ["a", "b", "c"]
|
606
|
+
# }
|
607
|
+
# )
|
608
|
+
# df.to_series(1)
|
609
|
+
# # =>
|
610
|
+
# # shape: (3,)
|
611
|
+
# # Series: 'bar' [i64]
|
612
|
+
# # [
|
613
|
+
# # 6
|
614
|
+
# # 7
|
615
|
+
# # 8
|
616
|
+
# # ]
|
159
617
|
def to_series(index = 0)
|
160
618
|
if index < 0
|
161
619
|
index = columns.length + index
|
@@ -163,6 +621,18 @@ module Polars
|
|
163
621
|
Utils.wrap_s(_df.select_at_idx(index))
|
164
622
|
end
|
165
623
|
|
624
|
+
# Serialize to JSON representation.
|
625
|
+
#
|
626
|
+
# @return [nil]
|
627
|
+
#
|
628
|
+
# @param file [String]
|
629
|
+
# File path to which the result should be written.
|
630
|
+
# @param pretty [Boolean]
|
631
|
+
# Pretty-print the serialized JSON.
|
632
|
+
# @param row_oriented [Boolean]
|
633
|
+
# Write row-oriented JSON. This is slower, but more common.
|
634
|
+
#
|
635
|
+
# @see #write_ndjson
|
166
636
|
def write_json(
|
167
637
|
file,
|
168
638
|
pretty: false,
|
@@ -176,6 +646,12 @@ module Polars
|
|
176
646
|
nil
|
177
647
|
end
|
178
648
|
|
649
|
+
# Serialize to newline delimited JSON representation.
|
650
|
+
#
|
651
|
+
# @param file [String]
|
652
|
+
# File path to which the result should be written.
|
653
|
+
#
|
654
|
+
# @return [nil]
|
179
655
|
def write_ndjson(file)
|
180
656
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
181
657
|
file = Utils.format_path(file)
|
@@ -185,6 +661,50 @@ module Polars
|
|
185
661
|
nil
|
186
662
|
end
|
187
663
|
|
664
|
+
# Write to comma-separated values (CSV) file.
|
665
|
+
#
|
666
|
+
# @param file [String, nil]
|
667
|
+
# File path to which the result should be written. If set to `nil`
|
668
|
+
# (default), the output is returned as a string instead.
|
669
|
+
# @param has_header [Boolean]
|
670
|
+
# Whether to include header in the CSV output.
|
671
|
+
# @param sep [String]
|
672
|
+
# Separate CSV fields with this symbol.
|
673
|
+
# @param quote [String]
|
674
|
+
# Byte to use as quoting character.
|
675
|
+
# @param batch_size [Integer]
|
676
|
+
# Number of rows that will be processed per thread.
|
677
|
+
# @param datetime_format [String, nil]
|
678
|
+
# A format string, with the specifiers defined by the
|
679
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
680
|
+
# Rust crate. If no format specified, the default fractional-second
|
681
|
+
# precision is inferred from the maximum timeunit found in the frame's
|
682
|
+
# Datetime cols (if any).
|
683
|
+
# @param date_format [String, nil]
|
684
|
+
# A format string, with the specifiers defined by the
|
685
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
686
|
+
# Rust crate.
|
687
|
+
# @param time_format [String, nil]
|
688
|
+
# A format string, with the specifiers defined by the
|
689
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
690
|
+
# Rust crate.
|
691
|
+
# @param float_precision [Integer, nil]
|
692
|
+
# Number of decimal places to write, applied to both `:f32` and
|
693
|
+
# `:f64` datatypes.
|
694
|
+
# @param null_value [String, nil]
|
695
|
+
# A string representing null values (defaulting to the empty string).
|
696
|
+
#
|
697
|
+
# @return [String, nil]
|
698
|
+
#
|
699
|
+
# @example
|
700
|
+
# df = Polars::DataFrame.new(
|
701
|
+
# {
|
702
|
+
# "foo" => [1, 2, 3, 4, 5],
|
703
|
+
# "bar" => [6, 7, 8, 9, 10],
|
704
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
705
|
+
# }
|
706
|
+
# )
|
707
|
+
# df.write_csv("file.csv")
|
188
708
|
def write_csv(
|
189
709
|
file = nil,
|
190
710
|
has_header: true,
|
@@ -220,8 +740,7 @@ module Polars
|
|
220
740
|
float_precision,
|
221
741
|
null_value
|
222
742
|
)
|
223
|
-
buffer.
|
224
|
-
return buffer.read.force_encoding(Encoding::UTF_8)
|
743
|
+
return buffer.string.force_encoding(Encoding::UTF_8)
|
225
744
|
end
|
226
745
|
|
227
746
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
@@ -246,9 +765,50 @@ module Polars
|
|
246
765
|
# def write_avro
|
247
766
|
# end
|
248
767
|
|
249
|
-
#
|
250
|
-
#
|
768
|
+
# Write to Arrow IPC binary stream or Feather file.
|
769
|
+
#
|
770
|
+
# @param file [String]
|
771
|
+
# File path to which the file should be written.
|
772
|
+
# @param compression ["uncompressed", "lz4", "zstd"]
|
773
|
+
# Compression method. Defaults to "uncompressed".
|
774
|
+
#
|
775
|
+
# @return [nil]
|
776
|
+
def write_ipc(file, compression: "uncompressed")
|
777
|
+
if compression.nil?
|
778
|
+
compression = "uncompressed"
|
779
|
+
end
|
780
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
781
|
+
file = Utils.format_path(file)
|
782
|
+
end
|
251
783
|
|
784
|
+
_df.write_ipc(file, compression)
|
785
|
+
end
|
786
|
+
|
787
|
+
# Write to Apache Parquet file.
|
788
|
+
#
|
789
|
+
# @param file [String]
|
790
|
+
# File path to which the file should be written.
|
791
|
+
# @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
|
792
|
+
# Choose "zstd" for good compression performance.
|
793
|
+
# Choose "lz4" for fast compression/decompression.
|
794
|
+
# Choose "snappy" for more backwards compatibility guarantees
|
795
|
+
# when you deal with older parquet readers.
|
796
|
+
# @param compression_level [Integer, nil]
|
797
|
+
# The level of compression to use. Higher compression means smaller files on
|
798
|
+
# disk.
|
799
|
+
#
|
800
|
+
# - "gzip" : min-level: 0, max-level: 10.
|
801
|
+
# - "brotli" : min-level: 0, max-level: 11.
|
802
|
+
# - "zstd" : min-level: 1, max-level: 22.
|
803
|
+
# @param statistics [Boolean]
|
804
|
+
# Write statistics to the parquet headers. This requires extra compute.
|
805
|
+
# @param row_group_size [Integer, nil]
|
806
|
+
# Size of the row groups in number of rows.
|
807
|
+
# If `nil` (default), the chunks of the DataFrame are
|
808
|
+
# used. Writing in smaller chunks may reduce memory pressure and improve
|
809
|
+
# writing speeds.
|
810
|
+
#
|
811
|
+
# @return [nil]
|
252
812
|
def write_parquet(
|
253
813
|
file,
|
254
814
|
compression: "zstd",
|
@@ -268,6 +828,39 @@ module Polars
|
|
268
828
|
)
|
269
829
|
end
|
270
830
|
|
831
|
+
# Return an estimation of the total (heap) allocated size of the DataFrame.
|
832
|
+
#
|
833
|
+
# Estimated size is given in the specified unit (bytes by default).
|
834
|
+
#
|
835
|
+
# This estimation is the sum of the sizes of its buffers and validity bitmaps, including
|
836
|
+
# nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
|
837
|
+
# size of 2 arrays is not the sum of the sizes computed from this function. In
|
838
|
+
# particular, StructArray's size is an upper bound.
|
839
|
+
#
|
840
|
+
# When an array is sliced, its allocated size remains constant because the buffer
|
841
|
+
# is unchanged. However, this function will yield a smaller number. This is because
|
842
|
+
# this function returns the visible size of the buffer, not its total capacity.
|
843
|
+
#
|
844
|
+
# FFI buffers are included in this estimation.
|
845
|
+
#
|
846
|
+
# @param unit ["b", "kb", "mb", "gb", "tb"]
|
847
|
+
# Scale the returned size to the given unit.
|
848
|
+
#
|
849
|
+
# @return [Numeric]
|
850
|
+
#
|
851
|
+
# @example
|
852
|
+
# df = Polars::DataFrame.new(
|
853
|
+
# {
|
854
|
+
# "x" => 1_000_000.times.to_a.reverse,
|
855
|
+
# "y" => 1_000_000.times.map { |v| v / 1000.0 },
|
856
|
+
# "z" => 1_000_000.times.map(&:to_s)
|
857
|
+
# },
|
858
|
+
# columns: {"x" => :u32, "y" => :f64, "z" => :str}
|
859
|
+
# )
|
860
|
+
# df.estimated_size
|
861
|
+
# # => 25888898
|
862
|
+
# df.estimated_size("mb")
|
863
|
+
# # => 24.689577102661133
|
271
864
|
def estimated_size(unit = "b")
|
272
865
|
sz = _df.estimated_size
|
273
866
|
Utils.scale_bytes(sz, to: unit)
|
@@ -276,14 +869,120 @@ module Polars
|
|
276
869
|
# def transpose
|
277
870
|
# end
|
278
871
|
|
872
|
+
# Reverse the DataFrame.
|
873
|
+
#
|
874
|
+
# @return [DataFrame]
|
875
|
+
#
|
876
|
+
# @example
|
877
|
+
# df = Polars::DataFrame.new(
|
878
|
+
# {
|
879
|
+
# "key" => ["a", "b", "c"],
|
880
|
+
# "val" => [1, 2, 3]
|
881
|
+
# }
|
882
|
+
# )
|
883
|
+
# df.reverse()
|
884
|
+
# # =>
|
885
|
+
# # shape: (3, 2)
|
886
|
+
# # ┌─────┬─────┐
|
887
|
+
# # │ key ┆ val │
|
888
|
+
# # │ --- ┆ --- │
|
889
|
+
# # │ str ┆ i64 │
|
890
|
+
# # ╞═════╪═════╡
|
891
|
+
# # │ c ┆ 3 │
|
892
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
893
|
+
# # │ b ┆ 2 │
|
894
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
895
|
+
# # │ a ┆ 1 │
|
896
|
+
# # └─────┴─────┘
|
279
897
|
def reverse
|
280
898
|
select(Polars.col("*").reverse)
|
281
899
|
end
|
282
900
|
|
901
|
+
# Rename column names.
|
902
|
+
#
|
903
|
+
# @param mapping [Hash]
|
904
|
+
# Key-value pairs that map from old name to new name.
|
905
|
+
#
|
906
|
+
# @return [DataFrame]
|
907
|
+
#
|
908
|
+
# @example
|
909
|
+
# df = Polars::DataFrame.new(
|
910
|
+
# {
|
911
|
+
# "foo" => [1, 2, 3],
|
912
|
+
# "bar" => [6, 7, 8],
|
913
|
+
# "ham" => ["a", "b", "c"]
|
914
|
+
# }
|
915
|
+
# )
|
916
|
+
# df.rename({"foo" => "apple"})
|
917
|
+
# # =>
|
918
|
+
# # shape: (3, 3)
|
919
|
+
# # ┌───────┬─────┬─────┐
|
920
|
+
# # │ apple ┆ bar ┆ ham │
|
921
|
+
# # │ --- ┆ --- ┆ --- │
|
922
|
+
# # │ i64 ┆ i64 ┆ str │
|
923
|
+
# # ╞═══════╪═════╪═════╡
|
924
|
+
# # │ 1 ┆ 6 ┆ a │
|
925
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
926
|
+
# # │ 2 ┆ 7 ┆ b │
|
927
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
928
|
+
# # │ 3 ┆ 8 ┆ c │
|
929
|
+
# # └───────┴─────┴─────┘
|
283
930
|
def rename(mapping)
|
284
931
|
lazy.rename(mapping).collect(no_optimization: true)
|
285
932
|
end
|
286
933
|
|
934
|
+
# Insert a Series at a certain column index. This operation is in place.
|
935
|
+
#
|
936
|
+
# @param index [Integer]
|
937
|
+
# Column index at which to insert the new `Series`.
|
938
|
+
# @param series [Series]
|
939
|
+
# `Series` to insert.
|
940
|
+
#
|
941
|
+
# @return [DataFrame]
|
942
|
+
#
|
943
|
+
# @example
|
944
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
945
|
+
# s = Polars::Series.new("baz", [97, 98, 99])
|
946
|
+
# df.insert_at_idx(1, s)
|
947
|
+
# # =>
|
948
|
+
# # shape: (3, 3)
|
949
|
+
# # ┌─────┬─────┬─────┐
|
950
|
+
# # │ foo ┆ baz ┆ bar │
|
951
|
+
# # │ --- ┆ --- ┆ --- │
|
952
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
953
|
+
# # ╞═════╪═════╪═════╡
|
954
|
+
# # │ 1 ┆ 97 ┆ 4 │
|
955
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
956
|
+
# # │ 2 ┆ 98 ┆ 5 │
|
957
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
958
|
+
# # │ 3 ┆ 99 ┆ 6 │
|
959
|
+
# # └─────┴─────┴─────┘
|
960
|
+
#
|
961
|
+
# @example
|
962
|
+
# df = Polars::DataFrame.new(
|
963
|
+
# {
|
964
|
+
# "a" => [1, 2, 3, 4],
|
965
|
+
# "b" => [0.5, 4, 10, 13],
|
966
|
+
# "c" => [true, true, false, true]
|
967
|
+
# }
|
968
|
+
# )
|
969
|
+
# s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
|
970
|
+
# df.insert_at_idx(3, s)
|
971
|
+
# # =>
|
972
|
+
# # shape: (4, 4)
|
973
|
+
# # ┌─────┬──────┬───────┬──────┐
|
974
|
+
# # │ a ┆ b ┆ c ┆ d │
|
975
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
976
|
+
# # │ i64 ┆ f64 ┆ bool ┆ f64 │
|
977
|
+
# # ╞═════╪══════╪═══════╪══════╡
|
978
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
|
979
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
980
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
|
981
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
982
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
|
983
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
984
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
|
985
|
+
# # └─────┴──────┴───────┴──────┘
|
287
986
|
def insert_at_idx(index, series)
|
288
987
|
if index < 0
|
289
988
|
index = columns.length + index
|
@@ -292,30 +991,337 @@ module Polars
|
|
292
991
|
self
|
293
992
|
end
|
294
993
|
|
994
|
+
# Filter the rows in the DataFrame based on a predicate expression.
|
995
|
+
#
|
996
|
+
# @param predicate [Expr]
|
997
|
+
# Expression that evaluates to a boolean Series.
|
998
|
+
#
|
999
|
+
# @return [DataFrame]
|
1000
|
+
#
|
1001
|
+
# @example Filter on one condition:
|
1002
|
+
# df = Polars::DataFrame.new(
|
1003
|
+
# {
|
1004
|
+
# "foo" => [1, 2, 3],
|
1005
|
+
# "bar" => [6, 7, 8],
|
1006
|
+
# "ham" => ["a", "b", "c"]
|
1007
|
+
# }
|
1008
|
+
# )
|
1009
|
+
# df.filter(Polars.col("foo") < 3)
|
1010
|
+
# # =>
|
1011
|
+
# # shape: (2, 3)
|
1012
|
+
# # ┌─────┬─────┬─────┐
|
1013
|
+
# # │ foo ┆ bar ┆ ham │
|
1014
|
+
# # │ --- ┆ --- ┆ --- │
|
1015
|
+
# # │ i64 ┆ i64 ┆ str │
|
1016
|
+
# # ╞═════╪═════╪═════╡
|
1017
|
+
# # │ 1 ┆ 6 ┆ a │
|
1018
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1019
|
+
# # │ 2 ┆ 7 ┆ b │
|
1020
|
+
# # └─────┴─────┴─────┘
|
1021
|
+
#
|
1022
|
+
# @example Filter on multiple conditions:
|
1023
|
+
# df.filter((Polars.col("foo") < 3) & (Polars.col("ham") == "a"))
|
1024
|
+
# # =>
|
1025
|
+
# # shape: (1, 3)
|
1026
|
+
# # ┌─────┬─────┬─────┐
|
1027
|
+
# # │ foo ┆ bar ┆ ham │
|
1028
|
+
# # │ --- ┆ --- ┆ --- │
|
1029
|
+
# # │ i64 ┆ i64 ┆ str │
|
1030
|
+
# # ╞═════╪═════╪═════╡
|
1031
|
+
# # │ 1 ┆ 6 ┆ a │
|
1032
|
+
# # └─────┴─────┴─────┘
|
295
1033
|
def filter(predicate)
|
296
1034
|
lazy.filter(predicate).collect
|
297
1035
|
end
|
298
1036
|
|
299
|
-
#
|
300
|
-
#
|
1037
|
+
# Summary statistics for a DataFrame.
|
1038
|
+
#
|
1039
|
+
# @return [DataFrame]
|
1040
|
+
#
|
1041
|
+
# @example
|
1042
|
+
# df = Polars::DataFrame.new(
|
1043
|
+
# {
|
1044
|
+
# "a" => [1.0, 2.8, 3.0],
|
1045
|
+
# "b" => [4, 5, nil],
|
1046
|
+
# "c" => [true, false, true],
|
1047
|
+
# "d" => [nil, "b", "c"],
|
1048
|
+
# "e" => ["usd", "eur", nil]
|
1049
|
+
# }
|
1050
|
+
# )
|
1051
|
+
# df.describe
|
1052
|
+
# # =>
|
1053
|
+
# # shape: (7, 6)
|
1054
|
+
# # ┌────────────┬──────────┬──────────┬──────┬──────┬──────┐
|
1055
|
+
# # │ describe ┆ a ┆ b ┆ c ┆ d ┆ e │
|
1056
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
1057
|
+
# # │ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str │
|
1058
|
+
# # ╞════════════╪══════════╪══════════╪══════╪══════╪══════╡
|
1059
|
+
# # │ count ┆ 3.0 ┆ 3.0 ┆ 3.0 ┆ 3 ┆ 3 │
|
1060
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1061
|
+
# # │ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 1 ┆ 1 │
|
1062
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1063
|
+
# # │ mean ┆ 2.266667 ┆ 4.5 ┆ null ┆ null ┆ null │
|
1064
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1065
|
+
# # │ std ┆ 1.101514 ┆ 0.707107 ┆ null ┆ null ┆ null │
|
1066
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1067
|
+
# # │ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur │
|
1068
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1069
|
+
# # │ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd │
|
1070
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
1071
|
+
# # │ median ┆ 2.8 ┆ 4.5 ┆ null ┆ null ┆ null │
|
1072
|
+
# # └────────────┴──────────┴──────────┴──────┴──────┴──────┘
|
1073
|
+
def describe
|
1074
|
+
describe_cast = lambda do |stat|
|
1075
|
+
columns = []
|
1076
|
+
self.columns.each_with_index do |s, i|
|
1077
|
+
if self[s].is_numeric || self[s].is_boolean
|
1078
|
+
columns << stat[0.., i].cast(:f64)
|
1079
|
+
else
|
1080
|
+
# for dates, strings, etc, we cast to string so that all
|
1081
|
+
# statistics can be shown
|
1082
|
+
columns << stat[0.., i].cast(:str)
|
1083
|
+
end
|
1084
|
+
end
|
1085
|
+
self.class.new(columns)
|
1086
|
+
end
|
301
1087
|
|
302
|
-
|
303
|
-
|
1088
|
+
summary = _from_rbdf(
|
1089
|
+
Polars.concat(
|
1090
|
+
[
|
1091
|
+
describe_cast.(
|
1092
|
+
self.class.new(columns.to_h { |c| [c, [height]] })
|
1093
|
+
),
|
1094
|
+
describe_cast.(null_count),
|
1095
|
+
describe_cast.(mean),
|
1096
|
+
describe_cast.(std),
|
1097
|
+
describe_cast.(min),
|
1098
|
+
describe_cast.(max),
|
1099
|
+
describe_cast.(median)
|
1100
|
+
]
|
1101
|
+
)._df
|
1102
|
+
)
|
1103
|
+
summary.insert_at_idx(
|
1104
|
+
0,
|
1105
|
+
Polars::Series.new(
|
1106
|
+
"describe",
|
1107
|
+
["count", "null_count", "mean", "std", "min", "max", "median"],
|
1108
|
+
)
|
1109
|
+
)
|
1110
|
+
summary
|
1111
|
+
end
|
304
1112
|
|
305
|
-
#
|
306
|
-
#
|
1113
|
+
# Find the index of a column by name.
|
1114
|
+
#
|
1115
|
+
# @param name [String]
|
1116
|
+
# Name of the column to find.
|
1117
|
+
#
|
1118
|
+
# @return [Integer]
|
1119
|
+
#
|
1120
|
+
# @example
|
1121
|
+
# df = Polars::DataFrame.new(
|
1122
|
+
# {"foo" => [1, 2, 3], "bar" => [6, 7, 8], "ham" => ["a", "b", "c"]}
|
1123
|
+
# )
|
1124
|
+
# df.find_idx_by_name("ham")
|
1125
|
+
# # => 2
|
1126
|
+
def find_idx_by_name(name)
|
1127
|
+
_df.find_idx_by_name(name)
|
1128
|
+
end
|
1129
|
+
|
1130
|
+
# Replace a column at an index location.
|
1131
|
+
#
|
1132
|
+
# @param index [Integer]
|
1133
|
+
# Column index.
|
1134
|
+
# @param series [Series]
|
1135
|
+
# Series that will replace the column.
|
1136
|
+
#
|
1137
|
+
# @return [DataFrame]
|
1138
|
+
#
|
1139
|
+
# @example
|
1140
|
+
# df = Polars::DataFrame.new(
|
1141
|
+
# {
|
1142
|
+
# "foo" => [1, 2, 3],
|
1143
|
+
# "bar" => [6, 7, 8],
|
1144
|
+
# "ham" => ["a", "b", "c"]
|
1145
|
+
# }
|
1146
|
+
# )
|
1147
|
+
# s = Polars::Series.new("apple", [10, 20, 30])
|
1148
|
+
# df.replace_at_idx(0, s)
|
1149
|
+
# # =>
|
1150
|
+
# # shape: (3, 3)
|
1151
|
+
# # ┌───────┬─────┬─────┐
|
1152
|
+
# # │ apple ┆ bar ┆ ham │
|
1153
|
+
# # │ --- ┆ --- ┆ --- │
|
1154
|
+
# # │ i64 ┆ i64 ┆ str │
|
1155
|
+
# # ╞═══════╪═════╪═════╡
|
1156
|
+
# # │ 10 ┆ 6 ┆ a │
|
1157
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1158
|
+
# # │ 20 ┆ 7 ┆ b │
|
1159
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1160
|
+
# # │ 30 ┆ 8 ┆ c │
|
1161
|
+
# # └───────┴─────┴─────┘
|
1162
|
+
def replace_at_idx(index, series)
|
1163
|
+
if index < 0
|
1164
|
+
index = columns.length + index
|
1165
|
+
end
|
1166
|
+
_df.replace_at_idx(index, series._s)
|
1167
|
+
self
|
1168
|
+
end
|
307
1169
|
|
1170
|
+
# Sort the DataFrame by column.
|
1171
|
+
#
|
1172
|
+
# @param by [String, Expr, Array]
|
1173
|
+
# By which column to sort.
|
1174
|
+
# @param reverse [Boolean]
|
1175
|
+
# Reverse/descending sort.
|
1176
|
+
# @param nulls_last [Boolean]
|
1177
|
+
# Place null values last. Can only be used if sorted by a single column.
|
1178
|
+
#
|
1179
|
+
# @return [DataFrame]
|
1180
|
+
#
|
1181
|
+
# @example
|
1182
|
+
# df = Polars::DataFrame.new(
|
1183
|
+
# {
|
1184
|
+
# "foo" => [1, 2, 3],
|
1185
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1186
|
+
# "ham" => ["a", "b", "c"]
|
1187
|
+
# }
|
1188
|
+
# )
|
1189
|
+
# df.sort("foo", reverse: true)
|
1190
|
+
# # =>
|
1191
|
+
# # shape: (3, 3)
|
1192
|
+
# # ┌─────┬─────┬─────┐
|
1193
|
+
# # │ foo ┆ bar ┆ ham │
|
1194
|
+
# # │ --- ┆ --- ┆ --- │
|
1195
|
+
# # │ i64 ┆ f64 ┆ str │
|
1196
|
+
# # ╞═════╪═════╪═════╡
|
1197
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1198
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1199
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1200
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1201
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
1202
|
+
# # └─────┴─────┴─────┘
|
1203
|
+
#
|
1204
|
+
# @example Sort by multiple columns.
|
1205
|
+
# df.sort(
|
1206
|
+
# [Polars.col("foo"), Polars.col("bar")**2],
|
1207
|
+
# reverse: [true, false]
|
1208
|
+
# )
|
1209
|
+
# # =>
|
1210
|
+
# # shape: (3, 3)
|
1211
|
+
# # ┌─────┬─────┬─────┐
|
1212
|
+
# # │ foo ┆ bar ┆ ham │
|
1213
|
+
# # │ --- ┆ --- ┆ --- │
|
1214
|
+
# # │ i64 ┆ f64 ┆ str │
|
1215
|
+
# # ╞═════╪═════╪═════╡
|
1216
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1217
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1218
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1219
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1220
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
1221
|
+
# # └─────┴─────┴─────┘
|
308
1222
|
def sort(by, reverse: false, nulls_last: false)
|
309
|
-
|
1223
|
+
if by.is_a?(Array) || by.is_a?(Expr)
|
1224
|
+
lazy
|
1225
|
+
.sort(by, reverse: reverse, nulls_last: nulls_last)
|
1226
|
+
.collect(no_optimization: true, string_cache: false)
|
1227
|
+
else
|
1228
|
+
_from_rbdf(_df.sort(by, reverse, nulls_last))
|
1229
|
+
end
|
310
1230
|
end
|
311
1231
|
|
1232
|
+
# Check if DataFrame is equal to other.
|
1233
|
+
#
|
1234
|
+
# @param other [DataFrame]
|
1235
|
+
# DataFrame to compare with.
|
1236
|
+
# @param null_equal [Boolean]
|
1237
|
+
# Consider null values as equal.
|
1238
|
+
#
|
1239
|
+
# @return [Boolean]
|
1240
|
+
#
|
1241
|
+
# @example
|
1242
|
+
# df1 = Polars::DataFrame.new(
|
1243
|
+
# {
|
1244
|
+
# "foo" => [1, 2, 3],
|
1245
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1246
|
+
# "ham" => ["a", "b", "c"]
|
1247
|
+
# }
|
1248
|
+
# )
|
1249
|
+
# df2 = Polars::DataFrame.new(
|
1250
|
+
# {
|
1251
|
+
# "foo" => [3, 2, 1],
|
1252
|
+
# "bar" => [8.0, 7.0, 6.0],
|
1253
|
+
# "ham" => ["c", "b", "a"]
|
1254
|
+
# }
|
1255
|
+
# )
|
1256
|
+
# df1.frame_equal(df1)
|
1257
|
+
# # => true
|
1258
|
+
# df1.frame_equal(df2)
|
1259
|
+
# # => false
|
312
1260
|
def frame_equal(other, null_equal: true)
|
313
1261
|
_df.frame_equal(other._df, null_equal)
|
314
1262
|
end
|
315
1263
|
|
316
|
-
#
|
317
|
-
#
|
1264
|
+
# Replace a column by a new Series.
|
1265
|
+
#
|
1266
|
+
# @param column [String]
|
1267
|
+
# Column to replace.
|
1268
|
+
# @param new_col [Series]
|
1269
|
+
# New column to insert.
|
1270
|
+
#
|
1271
|
+
# @return [DataFrame]
|
1272
|
+
#
|
1273
|
+
# @example
|
1274
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
1275
|
+
# s = Polars::Series.new([10, 20, 30])
|
1276
|
+
# df.replace("foo", s)
|
1277
|
+
# # =>
|
1278
|
+
# # shape: (3, 2)
|
1279
|
+
# # ┌─────┬─────┐
|
1280
|
+
# # │ foo ┆ bar │
|
1281
|
+
# # │ --- ┆ --- │
|
1282
|
+
# # │ i64 ┆ i64 │
|
1283
|
+
# # ╞═════╪═════╡
|
1284
|
+
# # │ 10 ┆ 4 │
|
1285
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1286
|
+
# # │ 20 ┆ 5 │
|
1287
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1288
|
+
# # │ 30 ┆ 6 │
|
1289
|
+
# # └─────┴─────┘
|
1290
|
+
def replace(column, new_col)
|
1291
|
+
_df.replace(column, new_col._s)
|
1292
|
+
self
|
1293
|
+
end
|
318
1294
|
|
1295
|
+
# Get a slice of this DataFrame.
|
1296
|
+
#
|
1297
|
+
# @param offset [Integer]
|
1298
|
+
# Start index. Negative indexing is supported.
|
1299
|
+
# @param length [Integer, nil]
|
1300
|
+
# Length of the slice. If set to `nil`, all rows starting at the offset
|
1301
|
+
# will be selected.
|
1302
|
+
#
|
1303
|
+
# @return [DataFrame]
|
1304
|
+
#
|
1305
|
+
# @example
|
1306
|
+
# df = Polars::DataFrame.new(
|
1307
|
+
# {
|
1308
|
+
# "foo" => [1, 2, 3],
|
1309
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1310
|
+
# "ham" => ["a", "b", "c"]
|
1311
|
+
# }
|
1312
|
+
# )
|
1313
|
+
# df.slice(1, 2)
|
1314
|
+
# # =>
|
1315
|
+
# # shape: (2, 3)
|
1316
|
+
# # ┌─────┬─────┬─────┐
|
1317
|
+
# # │ foo ┆ bar ┆ ham │
|
1318
|
+
# # │ --- ┆ --- ┆ --- │
|
1319
|
+
# # │ i64 ┆ f64 ┆ str │
|
1320
|
+
# # ╞═════╪═════╪═════╡
|
1321
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1322
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1323
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1324
|
+
# # └─────┴─────┴─────┘
|
319
1325
|
def slice(offset, length = nil)
|
320
1326
|
if !length.nil? && length < 0
|
321
1327
|
length = height - offset + length
|
@@ -323,29 +1329,222 @@ module Polars
|
|
323
1329
|
_from_rbdf(_df.slice(offset, length))
|
324
1330
|
end
|
325
1331
|
|
1332
|
+
# Get the first `n` rows.
|
1333
|
+
#
|
1334
|
+
# Alias for {#head}.
|
1335
|
+
#
|
1336
|
+
# @param n [Integer]
|
1337
|
+
# Number of rows to return.
|
1338
|
+
#
|
1339
|
+
# @return [DataFrame]
|
1340
|
+
#
|
1341
|
+
# @example
|
1342
|
+
# df = Polars::DataFrame.new(
|
1343
|
+
# {"foo" => [1, 2, 3, 4, 5, 6], "bar" => ["a", "b", "c", "d", "e", "f"]}
|
1344
|
+
# )
|
1345
|
+
# df.limit(4)
|
1346
|
+
# # =>
|
1347
|
+
# # shape: (4, 2)
|
1348
|
+
# # ┌─────┬─────┐
|
1349
|
+
# # │ foo ┆ bar │
|
1350
|
+
# # │ --- ┆ --- │
|
1351
|
+
# # │ i64 ┆ str │
|
1352
|
+
# # ╞═════╪═════╡
|
1353
|
+
# # │ 1 ┆ a │
|
1354
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1355
|
+
# # │ 2 ┆ b │
|
1356
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1357
|
+
# # │ 3 ┆ c │
|
1358
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1359
|
+
# # │ 4 ┆ d │
|
1360
|
+
# # └─────┴─────┘
|
326
1361
|
def limit(n = 5)
|
327
1362
|
head(n)
|
328
1363
|
end
|
329
1364
|
|
1365
|
+
# Get the first `n` rows.
|
1366
|
+
#
|
1367
|
+
# @param n [Integer]
|
1368
|
+
# Number of rows to return.
|
1369
|
+
#
|
1370
|
+
# @return [DataFrame]
|
1371
|
+
#
|
1372
|
+
# @example
|
1373
|
+
# df = Polars::DataFrame.new(
|
1374
|
+
# {
|
1375
|
+
# "foo" => [1, 2, 3, 4, 5],
|
1376
|
+
# "bar" => [6, 7, 8, 9, 10],
|
1377
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
1378
|
+
# }
|
1379
|
+
# )
|
1380
|
+
# df.head(3)
|
1381
|
+
# # =>
|
1382
|
+
# # shape: (3, 3)
|
1383
|
+
# # ┌─────┬─────┬─────┐
|
1384
|
+
# # │ foo ┆ bar ┆ ham │
|
1385
|
+
# # │ --- ┆ --- ┆ --- │
|
1386
|
+
# # │ i64 ┆ i64 ┆ str │
|
1387
|
+
# # ╞═════╪═════╪═════╡
|
1388
|
+
# # │ 1 ┆ 6 ┆ a │
|
1389
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1390
|
+
# # │ 2 ┆ 7 ┆ b │
|
1391
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1392
|
+
# # │ 3 ┆ 8 ┆ c │
|
1393
|
+
# # └─────┴─────┴─────┘
|
330
1394
|
def head(n = 5)
|
331
1395
|
_from_rbdf(_df.head(n))
|
332
1396
|
end
|
333
1397
|
|
1398
|
+
# Get the last `n` rows.
|
1399
|
+
#
|
1400
|
+
# @param n [Integer]
|
1401
|
+
# Number of rows to return.
|
1402
|
+
#
|
1403
|
+
# @return [DataFrame]
|
1404
|
+
#
|
1405
|
+
# @example
|
1406
|
+
# df = Polars::DataFrame.new(
|
1407
|
+
# {
|
1408
|
+
# "foo" => [1, 2, 3, 4, 5],
|
1409
|
+
# "bar" => [6, 7, 8, 9, 10],
|
1410
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
1411
|
+
# }
|
1412
|
+
# )
|
1413
|
+
# df.tail(3)
|
1414
|
+
# # =>
|
1415
|
+
# # shape: (3, 3)
|
1416
|
+
# # ┌─────┬─────┬─────┐
|
1417
|
+
# # │ foo ┆ bar ┆ ham │
|
1418
|
+
# # │ --- ┆ --- ┆ --- │
|
1419
|
+
# # │ i64 ┆ i64 ┆ str │
|
1420
|
+
# # ╞═════╪═════╪═════╡
|
1421
|
+
# # │ 3 ┆ 8 ┆ c │
|
1422
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1423
|
+
# # │ 4 ┆ 9 ┆ d │
|
1424
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1425
|
+
# # │ 5 ┆ 10 ┆ e │
|
1426
|
+
# # └─────┴─────┴─────┘
|
334
1427
|
def tail(n = 5)
|
335
1428
|
_from_rbdf(_df.tail(n))
|
336
1429
|
end
|
337
1430
|
|
338
|
-
#
|
339
|
-
#
|
1431
|
+
# Return a new DataFrame where the null values are dropped.
|
1432
|
+
#
|
1433
|
+
# @param subset [Object]
|
1434
|
+
# Subset of column(s) on which `drop_nulls` will be applied.
|
1435
|
+
#
|
1436
|
+
# @return [DataFrame]
|
1437
|
+
#
|
1438
|
+
# @example
|
1439
|
+
# df = Polars::DataFrame.new(
|
1440
|
+
# {
|
1441
|
+
# "foo" => [1, 2, 3],
|
1442
|
+
# "bar" => [6, nil, 8],
|
1443
|
+
# "ham" => ["a", "b", "c"]
|
1444
|
+
# }
|
1445
|
+
# )
|
1446
|
+
# df.drop_nulls
|
1447
|
+
# # =>
|
1448
|
+
# # shape: (2, 3)
|
1449
|
+
# # ┌─────┬─────┬─────┐
|
1450
|
+
# # │ foo ┆ bar ┆ ham │
|
1451
|
+
# # │ --- ┆ --- ┆ --- │
|
1452
|
+
# # │ i64 ┆ i64 ┆ str │
|
1453
|
+
# # ╞═════╪═════╪═════╡
|
1454
|
+
# # │ 1 ┆ 6 ┆ a │
|
1455
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1456
|
+
# # │ 3 ┆ 8 ┆ c │
|
1457
|
+
# # └─────┴─────┴─────┘
|
1458
|
+
def drop_nulls(subset: nil)
|
1459
|
+
if subset.is_a?(String)
|
1460
|
+
subset = [subset]
|
1461
|
+
end
|
1462
|
+
_from_rbdf(_df.drop_nulls(subset))
|
1463
|
+
end
|
340
1464
|
|
341
1465
|
# def pipe
|
342
1466
|
# end
|
343
1467
|
|
344
|
-
#
|
345
|
-
#
|
1468
|
+
# Add a column at index 0 that counts the rows.
|
1469
|
+
#
|
1470
|
+
# @param name [String]
|
1471
|
+
# Name of the column to add.
|
1472
|
+
# @param offset [Integer]
|
1473
|
+
# Start the row count at this offset.
|
1474
|
+
#
|
1475
|
+
# @return [DataFrame]
|
1476
|
+
#
|
1477
|
+
# @example
|
1478
|
+
# df = Polars::DataFrame.new(
|
1479
|
+
# {
|
1480
|
+
# "a" => [1, 3, 5],
|
1481
|
+
# "b" => [2, 4, 6]
|
1482
|
+
# }
|
1483
|
+
# )
|
1484
|
+
# df.with_row_count
|
1485
|
+
# # =>
|
1486
|
+
# # shape: (3, 3)
|
1487
|
+
# # ┌────────┬─────┬─────┐
|
1488
|
+
# # │ row_nr ┆ a ┆ b │
|
1489
|
+
# # │ --- ┆ --- ┆ --- │
|
1490
|
+
# # │ u32 ┆ i64 ┆ i64 │
|
1491
|
+
# # ╞════════╪═════╪═════╡
|
1492
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
1493
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1494
|
+
# # │ 1 ┆ 3 ┆ 4 │
|
1495
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1496
|
+
# # │ 2 ┆ 5 ┆ 6 │
|
1497
|
+
# # └────────┴─────┴─────┘
|
1498
|
+
def with_row_count(name: "row_nr", offset: 0)
|
1499
|
+
_from_rbdf(_df.with_row_count(name, offset))
|
1500
|
+
end
|
346
1501
|
|
1502
|
+
# Start a groupby operation.
|
1503
|
+
#
|
1504
|
+
# @param by [Object]
|
1505
|
+
# Column(s) to group by.
|
1506
|
+
# @param maintain_order [Boolean]
|
1507
|
+
# Make sure that the order of the groups remains consistent. This is more
|
1508
|
+
# expensive than a default groupby. Note that this only works in expression
|
1509
|
+
# aggregations.
|
1510
|
+
#
|
1511
|
+
# @return [GroupBy]
|
1512
|
+
#
|
1513
|
+
# @example
|
1514
|
+
# df = Polars::DataFrame.new(
|
1515
|
+
# {
|
1516
|
+
# "a" => ["a", "b", "a", "b", "b", "c"],
|
1517
|
+
# "b" => [1, 2, 3, 4, 5, 6],
|
1518
|
+
# "c" => [6, 5, 4, 3, 2, 1]
|
1519
|
+
# }
|
1520
|
+
# )
|
1521
|
+
# df.groupby("a").agg(Polars.col("b").sum).sort("a")
|
1522
|
+
# # =>
|
1523
|
+
# # shape: (3, 2)
|
1524
|
+
# # ┌─────┬─────┐
|
1525
|
+
# # │ a ┆ b │
|
1526
|
+
# # │ --- ┆ --- │
|
1527
|
+
# # │ str ┆ i64 │
|
1528
|
+
# # ╞═════╪═════╡
|
1529
|
+
# # │ a ┆ 4 │
|
1530
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1531
|
+
# # │ b ┆ 11 │
|
1532
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1533
|
+
# # │ c ┆ 6 │
|
1534
|
+
# # └─────┴─────┘
|
347
1535
|
def groupby(by, maintain_order: false)
|
348
|
-
|
1536
|
+
if !Utils.bool?(maintain_order)
|
1537
|
+
raise TypeError, "invalid input for groupby arg `maintain_order`: #{maintain_order}."
|
1538
|
+
end
|
1539
|
+
if by.is_a?(String)
|
1540
|
+
by = [by]
|
1541
|
+
end
|
1542
|
+
GroupBy.new(
|
1543
|
+
_df,
|
1544
|
+
by,
|
1545
|
+
self.class,
|
1546
|
+
maintain_order: maintain_order
|
1547
|
+
)
|
349
1548
|
end
|
350
1549
|
|
351
1550
|
# def groupby_rolling
|
@@ -360,6 +1559,109 @@ module Polars
|
|
360
1559
|
# def join_asof
|
361
1560
|
# end
|
362
1561
|
|
1562
|
+
# Join in SQL-like fashion.
|
1563
|
+
#
|
1564
|
+
# @param other [DataFrame]
|
1565
|
+
# DataFrame to join with.
|
1566
|
+
# @param left_on [Object]
|
1567
|
+
# Name(s) of the left join column(s).
|
1568
|
+
# @param right_on [Object]
|
1569
|
+
# Name(s) of the right join column(s).
|
1570
|
+
# @param on [Object]
|
1571
|
+
# Name(s) of the join columns in both DataFrames.
|
1572
|
+
# @param how ["inner", "left", "outer", "semi", "anti", "cross"]
|
1573
|
+
# Join strategy.
|
1574
|
+
# @param suffix [String]
|
1575
|
+
# Suffix to append to columns with a duplicate name.
|
1576
|
+
#
|
1577
|
+
# @return [DataFrame]
|
1578
|
+
#
|
1579
|
+
# @example
|
1580
|
+
# df = Polars::DataFrame.new(
|
1581
|
+
# {
|
1582
|
+
# "foo" => [1, 2, 3],
|
1583
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1584
|
+
# "ham" => ["a", "b", "c"]
|
1585
|
+
# }
|
1586
|
+
# )
|
1587
|
+
# other_df = Polars::DataFrame.new(
|
1588
|
+
# {
|
1589
|
+
# "apple" => ["x", "y", "z"],
|
1590
|
+
# "ham" => ["a", "b", "d"]
|
1591
|
+
# }
|
1592
|
+
# )
|
1593
|
+
# df.join(other_df, on: "ham")
|
1594
|
+
# # =>
|
1595
|
+
# # shape: (2, 4)
|
1596
|
+
# # ┌─────┬─────┬─────┬───────┐
|
1597
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1598
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1599
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
1600
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
1601
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
1602
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1603
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
1604
|
+
# # └─────┴─────┴─────┴───────┘
|
1605
|
+
#
|
1606
|
+
# @example
|
1607
|
+
# df.join(other_df, on: "ham", how: "outer")
|
1608
|
+
# # =>
|
1609
|
+
# # shape: (4, 4)
|
1610
|
+
# # ┌──────┬──────┬─────┬───────┐
|
1611
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1612
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1613
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
1614
|
+
# # ╞══════╪══════╪═════╪═══════╡
|
1615
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
1616
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1617
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
1618
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1619
|
+
# # │ null ┆ null ┆ d ┆ z │
|
1620
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1621
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null │
|
1622
|
+
# # └──────┴──────┴─────┴───────┘
|
1623
|
+
#
|
1624
|
+
# @example
|
1625
|
+
# df.join(other_df, on: "ham", how: "left")
|
1626
|
+
# # =>
|
1627
|
+
# # shape: (3, 4)
|
1628
|
+
# # ┌─────┬─────┬─────┬───────┐
|
1629
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1630
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1631
|
+
# # │ i64 ┆ f64 ┆ str ┆ str │
|
1632
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
1633
|
+
# # │ 1 ┆ 6.0 ┆ a ┆ x │
|
1634
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1635
|
+
# # │ 2 ┆ 7.0 ┆ b ┆ y │
|
1636
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1637
|
+
# # │ 3 ┆ 8.0 ┆ c ┆ null │
|
1638
|
+
# # └─────┴─────┴─────┴───────┘
|
1639
|
+
#
|
1640
|
+
# @example
|
1641
|
+
# df.join(other_df, on: "ham", how: "semi")
|
1642
|
+
# # =>
|
1643
|
+
# # shape: (2, 3)
|
1644
|
+
# # ┌─────┬─────┬─────┐
|
1645
|
+
# # │ foo ┆ bar ┆ ham │
|
1646
|
+
# # │ --- ┆ --- ┆ --- │
|
1647
|
+
# # │ i64 ┆ f64 ┆ str │
|
1648
|
+
# # ╞═════╪═════╪═════╡
|
1649
|
+
# # │ 1 ┆ 6.0 ┆ a │
|
1650
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1651
|
+
# # │ 2 ┆ 7.0 ┆ b │
|
1652
|
+
# # └─────┴─────┴─────┘
|
1653
|
+
#
|
1654
|
+
# @example
|
1655
|
+
# df.join(other_df, on: "ham", how: "anti")
|
1656
|
+
# # =>
|
1657
|
+
# # shape: (1, 3)
|
1658
|
+
# # ┌─────┬─────┬─────┐
|
1659
|
+
# # │ foo ┆ bar ┆ ham │
|
1660
|
+
# # │ --- ┆ --- ┆ --- │
|
1661
|
+
# # │ i64 ┆ f64 ┆ str │
|
1662
|
+
# # ╞═════╪═════╪═════╡
|
1663
|
+
# # │ 3 ┆ 8.0 ┆ c │
|
1664
|
+
# # └─────┴─────┴─────┘
|
363
1665
|
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
|
364
1666
|
lazy
|
365
1667
|
.join(
|
@@ -376,36 +1678,322 @@ module Polars
|
|
376
1678
|
# def apply
|
377
1679
|
# end
|
378
1680
|
|
1681
|
+
# Return a new DataFrame with the column added or replaced.
|
1682
|
+
#
|
1683
|
+
# @param column [Object]
|
1684
|
+
# Series, where the name of the Series refers to the column in the DataFrame.
|
1685
|
+
#
|
1686
|
+
# @return [DataFrame]
|
1687
|
+
#
|
1688
|
+
# @example Added
|
1689
|
+
# df = Polars::DataFrame.new(
|
1690
|
+
# {
|
1691
|
+
# "a" => [1, 3, 5],
|
1692
|
+
# "b" => [2, 4, 6]
|
1693
|
+
# }
|
1694
|
+
# )
|
1695
|
+
# df.with_column((Polars.col("b") ** 2).alias("b_squared"))
|
1696
|
+
# # =>
|
1697
|
+
# # shape: (3, 3)
|
1698
|
+
# # ┌─────┬─────┬───────────┐
|
1699
|
+
# # │ a ┆ b ┆ b_squared │
|
1700
|
+
# # │ --- ┆ --- ┆ --- │
|
1701
|
+
# # │ i64 ┆ i64 ┆ f64 │
|
1702
|
+
# # ╞═════╪═════╪═══════════╡
|
1703
|
+
# # │ 1 ┆ 2 ┆ 4.0 │
|
1704
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
1705
|
+
# # │ 3 ┆ 4 ┆ 16.0 │
|
1706
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
|
1707
|
+
# # │ 5 ┆ 6 ┆ 36.0 │
|
1708
|
+
# # └─────┴─────┴───────────┘
|
1709
|
+
#
|
1710
|
+
# @example Replaced
|
1711
|
+
# df.with_column(Polars.col("a") ** 2)
|
1712
|
+
# # =>
|
1713
|
+
# # shape: (3, 2)
|
1714
|
+
# # ┌──────┬─────┐
|
1715
|
+
# # │ a ┆ b │
|
1716
|
+
# # │ --- ┆ --- │
|
1717
|
+
# # │ f64 ┆ i64 │
|
1718
|
+
# # ╞══════╪═════╡
|
1719
|
+
# # │ 1.0 ┆ 2 │
|
1720
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1721
|
+
# # │ 9.0 ┆ 4 │
|
1722
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌┤
|
1723
|
+
# # │ 25.0 ┆ 6 │
|
1724
|
+
# # └──────┴─────┘
|
379
1725
|
def with_column(column)
|
380
1726
|
lazy
|
381
1727
|
.with_column(column)
|
382
1728
|
.collect(no_optimization: true, string_cache: false)
|
383
1729
|
end
|
384
1730
|
|
385
|
-
#
|
386
|
-
#
|
1731
|
+
# Return a new DataFrame grown horizontally by stacking multiple Series to it.
|
1732
|
+
#
|
1733
|
+
# @param columns [Object]
|
1734
|
+
# Series to stack.
|
1735
|
+
# @param in_place [Boolean]
|
1736
|
+
# Modify in place.
|
1737
|
+
#
|
1738
|
+
# @return [DataFrame]
|
1739
|
+
#
|
1740
|
+
# @example
|
1741
|
+
# df = Polars::DataFrame.new(
|
1742
|
+
# {
|
1743
|
+
# "foo" => [1, 2, 3],
|
1744
|
+
# "bar" => [6, 7, 8],
|
1745
|
+
# "ham" => ["a", "b", "c"]
|
1746
|
+
# }
|
1747
|
+
# )
|
1748
|
+
# x = Polars::Series.new("apple", [10, 20, 30])
|
1749
|
+
# df.hstack([x])
|
1750
|
+
# # =>
|
1751
|
+
# # shape: (3, 4)
|
1752
|
+
# # ┌─────┬─────┬─────┬───────┐
|
1753
|
+
# # │ foo ┆ bar ┆ ham ┆ apple │
|
1754
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
1755
|
+
# # │ i64 ┆ i64 ┆ str ┆ i64 │
|
1756
|
+
# # ╞═════╪═════╪═════╪═══════╡
|
1757
|
+
# # │ 1 ┆ 6 ┆ a ┆ 10 │
|
1758
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1759
|
+
# # │ 2 ┆ 7 ┆ b ┆ 20 │
|
1760
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
1761
|
+
# # │ 3 ┆ 8 ┆ c ┆ 30 │
|
1762
|
+
# # └─────┴─────┴─────┴───────┘
|
1763
|
+
def hstack(columns, in_place: false)
|
1764
|
+
if !columns.is_a?(Array)
|
1765
|
+
columns = columns.get_columns
|
1766
|
+
end
|
1767
|
+
if in_place
|
1768
|
+
_df.hstack_mut(columns.map(&:_s))
|
1769
|
+
self
|
1770
|
+
else
|
1771
|
+
_from_rbdf(_df.hstack(columns.map(&:_s)))
|
1772
|
+
end
|
1773
|
+
end
|
387
1774
|
|
388
|
-
#
|
389
|
-
#
|
1775
|
+
# Grow this DataFrame vertically by stacking a DataFrame to it.
|
1776
|
+
#
|
1777
|
+
# @param df [DataFrame]
|
1778
|
+
# DataFrame to stack.
|
1779
|
+
# @param in_place [Boolean]
|
1780
|
+
# Modify in place.
|
1781
|
+
#
|
1782
|
+
# @return [DataFrame]
|
1783
|
+
#
|
1784
|
+
# @example
|
1785
|
+
# df1 = Polars::DataFrame.new(
|
1786
|
+
# {
|
1787
|
+
# "foo" => [1, 2],
|
1788
|
+
# "bar" => [6, 7],
|
1789
|
+
# "ham" => ["a", "b"]
|
1790
|
+
# }
|
1791
|
+
# )
|
1792
|
+
# df2 = Polars::DataFrame.new(
|
1793
|
+
# {
|
1794
|
+
# "foo" => [3, 4],
|
1795
|
+
# "bar" => [8, 9],
|
1796
|
+
# "ham" => ["c", "d"]
|
1797
|
+
# }
|
1798
|
+
# )
|
1799
|
+
# df1.vstack(df2)
|
1800
|
+
# # =>
|
1801
|
+
# # shape: (4, 3)
|
1802
|
+
# # ┌─────┬─────┬─────┐
|
1803
|
+
# # │ foo ┆ bar ┆ ham │
|
1804
|
+
# # │ --- ┆ --- ┆ --- │
|
1805
|
+
# # │ i64 ┆ i64 ┆ str │
|
1806
|
+
# # ╞═════╪═════╪═════╡
|
1807
|
+
# # │ 1 ┆ 6 ┆ a │
|
1808
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1809
|
+
# # │ 2 ┆ 7 ┆ b │
|
1810
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1811
|
+
# # │ 3 ┆ 8 ┆ c │
|
1812
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
1813
|
+
# # │ 4 ┆ 9 ┆ d │
|
1814
|
+
# # └─────┴─────┴─────┘
|
1815
|
+
def vstack(df, in_place: false)
|
1816
|
+
if in_place
|
1817
|
+
_df.vstack_mut(df._df)
|
1818
|
+
self
|
1819
|
+
else
|
1820
|
+
_from_rbdf(_df.vstack(df._df))
|
1821
|
+
end
|
1822
|
+
end
|
390
1823
|
|
391
|
-
#
|
392
|
-
#
|
1824
|
+
# Extend the memory backed by this `DataFrame` with the values from `other`.
|
1825
|
+
#
|
1826
|
+
# Different from `vstack` which adds the chunks from `other` to the chunks of this
|
1827
|
+
# `DataFrame`, `extend` appends the data from `other` to the underlying memory
|
1828
|
+
# locations and thus may cause a reallocation.
|
1829
|
+
#
|
1830
|
+
# If this does not cause a reallocation, the resulting data structure will not
|
1831
|
+
# have any extra chunks and thus will yield faster queries.
|
1832
|
+
#
|
1833
|
+
# Prefer `extend` over `vstack` when you want to do a query after a single append.
|
1834
|
+
# For instance during online operations where you add `n` rows and rerun a query.
|
1835
|
+
#
|
1836
|
+
# Prefer `vstack` over `extend` when you want to append many times before doing a
|
1837
|
+
# query. For instance when you read in multiple files and want to store them in a
|
1838
|
+
# single `DataFrame`. In the latter case, finish the sequence of `vstack`
|
1839
|
+
# operations with a `rechunk`.
|
1840
|
+
#
|
1841
|
+
# @param other [DataFrame]
|
1842
|
+
# DataFrame to vertically add.
|
1843
|
+
#
|
1844
|
+
# @return [DataFrame]
|
1845
|
+
#
|
1846
|
+
# @example
|
1847
|
+
# df1 = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
1848
|
+
# df2 = Polars::DataFrame.new({"foo" => [10, 20, 30], "bar" => [40, 50, 60]})
|
1849
|
+
# df1.extend(df2)
|
1850
|
+
# # =>
|
1851
|
+
# # shape: (6, 2)
|
1852
|
+
# # ┌─────┬─────┐
|
1853
|
+
# # │ foo ┆ bar │
|
1854
|
+
# # │ --- ┆ --- │
|
1855
|
+
# # │ i64 ┆ i64 │
|
1856
|
+
# # ╞═════╪═════╡
|
1857
|
+
# # │ 1 ┆ 4 │
|
1858
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1859
|
+
# # │ 2 ┆ 5 │
|
1860
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1861
|
+
# # │ 3 ┆ 6 │
|
1862
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1863
|
+
# # │ 10 ┆ 40 │
|
1864
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1865
|
+
# # │ 20 ┆ 50 │
|
1866
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1867
|
+
# # │ 30 ┆ 60 │
|
1868
|
+
# # └─────┴─────┘
|
1869
|
+
def extend(other)
|
1870
|
+
_df.extend(other._df)
|
1871
|
+
self
|
1872
|
+
end
|
393
1873
|
|
394
|
-
#
|
395
|
-
#
|
1874
|
+
# Remove column(s) from the DataFrame and return the result as a new DataFrame.
|
1875
|
+
#
|
1876
|
+
# @param columns [Object]
|
1877
|
+
# Column(s) to drop.
|
1878
|
+
#
|
1879
|
+
# @return [DataFrame]
|
1880
|
+
#
|
1881
|
+
# @example
|
1882
|
+
# df = Polars::DataFrame.new(
|
1883
|
+
# {
|
1884
|
+
# "foo" => [1, 2, 3],
|
1885
|
+
# "bar" => [6.0, 7.0, 8.0],
|
1886
|
+
# "ham" => ["a", "b", "c"]
|
1887
|
+
# }
|
1888
|
+
# )
|
1889
|
+
# df.drop("ham")
|
1890
|
+
# # =>
|
1891
|
+
# # shape: (3, 2)
|
1892
|
+
# # ┌─────┬─────┐
|
1893
|
+
# # │ foo ┆ bar │
|
1894
|
+
# # │ --- ┆ --- │
|
1895
|
+
# # │ i64 ┆ f64 │
|
1896
|
+
# # ╞═════╪═════╡
|
1897
|
+
# # │ 1 ┆ 6.0 │
|
1898
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1899
|
+
# # │ 2 ┆ 7.0 │
|
1900
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
1901
|
+
# # │ 3 ┆ 8.0 │
|
1902
|
+
# # └─────┴─────┘
|
1903
|
+
def drop(columns)
|
1904
|
+
if columns.is_a?(Array)
|
1905
|
+
df = clone
|
1906
|
+
columns.each do |n|
|
1907
|
+
df._df.drop_in_place(n)
|
1908
|
+
end
|
1909
|
+
df
|
1910
|
+
else
|
1911
|
+
_from_rbdf(_df.drop(columns))
|
1912
|
+
end
|
1913
|
+
end
|
396
1914
|
|
397
|
-
#
|
398
|
-
#
|
1915
|
+
# Drop in place.
|
1916
|
+
#
|
1917
|
+
# @param name [Object]
|
1918
|
+
# Column to drop.
|
1919
|
+
#
|
1920
|
+
# @return [Series]
|
1921
|
+
#
|
1922
|
+
# @example
|
1923
|
+
# df = Polars::DataFrame.new(
|
1924
|
+
# {
|
1925
|
+
# "foo" => [1, 2, 3],
|
1926
|
+
# "bar" => [6, 7, 8],
|
1927
|
+
# "ham" => ["a", "b", "c"]
|
1928
|
+
# }
|
1929
|
+
# )
|
1930
|
+
# df.drop_in_place("ham")
|
1931
|
+
# # =>
|
1932
|
+
# # shape: (3,)
|
1933
|
+
# # Series: 'ham' [str]
|
1934
|
+
# # [
|
1935
|
+
# # "a"
|
1936
|
+
# # "b"
|
1937
|
+
# # "c"
|
1938
|
+
# # ]
|
1939
|
+
def drop_in_place(name)
|
1940
|
+
Utils.wrap_s(_df.drop_in_place(name))
|
1941
|
+
end
|
399
1942
|
|
400
|
-
#
|
401
|
-
#
|
1943
|
+
# Create an empty copy of the current DataFrame.
|
1944
|
+
#
|
1945
|
+
# Returns a DataFrame with identical schema but no data.
|
1946
|
+
#
|
1947
|
+
# @return [DataFrame]
|
1948
|
+
#
|
1949
|
+
# @example
|
1950
|
+
# df = Polars::DataFrame.new(
|
1951
|
+
# {
|
1952
|
+
# "a" => [nil, 2, 3, 4],
|
1953
|
+
# "b" => [0.5, nil, 2.5, 13],
|
1954
|
+
# "c" => [true, true, false, nil]
|
1955
|
+
# }
|
1956
|
+
# )
|
1957
|
+
# df.cleared
|
1958
|
+
# # =>
|
1959
|
+
# # shape: (0, 3)
|
1960
|
+
# # ┌─────┬─────┬──────┐
|
1961
|
+
# # │ a ┆ b ┆ c │
|
1962
|
+
# # │ --- ┆ --- ┆ --- │
|
1963
|
+
# # │ i64 ┆ f64 ┆ bool │
|
1964
|
+
# # ╞═════╪═════╪══════╡
|
1965
|
+
# # └─────┴─────┴──────┘
|
1966
|
+
def cleared
|
1967
|
+
height > 0 ? head(0) : clone
|
1968
|
+
end
|
402
1969
|
|
403
1970
|
# clone handled by initialize_copy
|
404
1971
|
|
1972
|
+
# Get the DataFrame as an Array of Series.
|
1973
|
+
#
|
1974
|
+
# @return [Array]
|
405
1975
|
def get_columns
|
406
1976
|
_df.get_columns.map { |s| Utils.wrap_s(s) }
|
407
1977
|
end
|
408
1978
|
|
1979
|
+
# Get a single column as Series by name.
|
1980
|
+
#
|
1981
|
+
# @param name [String]
|
1982
|
+
# Name of the column to retrieve.
|
1983
|
+
#
|
1984
|
+
# @return [Series]
|
1985
|
+
#
|
1986
|
+
# @example
|
1987
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
1988
|
+
# df.get_column("foo")
|
1989
|
+
# # =>
|
1990
|
+
# # shape: (3,)
|
1991
|
+
# # Series: 'foo' [i64]
|
1992
|
+
# # [
|
1993
|
+
# # 1
|
1994
|
+
# # 2
|
1995
|
+
# # 3
|
1996
|
+
# # ]
|
409
1997
|
def get_column(name)
|
410
1998
|
self[name]
|
411
1999
|
end
|
@@ -413,12 +2001,85 @@ module Polars
|
|
413
2001
|
# def fill_null
|
414
2002
|
# end
|
415
2003
|
|
2004
|
+
# Fill floating point NaN values by an Expression evaluation.
|
2005
|
+
#
|
2006
|
+
# @param fill_value [Object]
|
2007
|
+
# Value to fill NaN with.
|
2008
|
+
#
|
2009
|
+
# @return [DataFrame]
|
2010
|
+
#
|
2011
|
+
# @note
|
2012
|
+
# Note that floating point NaNs (Not a Number) are not missing values!
|
2013
|
+
# To replace missing values, use `fill_null`.
|
2014
|
+
#
|
2015
|
+
# @example
|
2016
|
+
# df = Polars::DataFrame.new(
|
2017
|
+
# {
|
2018
|
+
# "a" => [1.5, 2, Float::NAN, 4],
|
2019
|
+
# "b" => [0.5, 4, Float::NAN, 13]
|
2020
|
+
# }
|
2021
|
+
# )
|
2022
|
+
# df.fill_nan(99)
|
2023
|
+
# # =>
|
2024
|
+
# # shape: (4, 2)
|
2025
|
+
# # ┌──────┬──────┐
|
2026
|
+
# # │ a ┆ b │
|
2027
|
+
# # │ --- ┆ --- │
|
2028
|
+
# # │ f64 ┆ f64 │
|
2029
|
+
# # ╞══════╪══════╡
|
2030
|
+
# # │ 1.5 ┆ 0.5 │
|
2031
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2032
|
+
# # │ 2.0 ┆ 4.0 │
|
2033
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2034
|
+
# # │ 99.0 ┆ 99.0 │
|
2035
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2036
|
+
# # │ 4.0 ┆ 13.0 │
|
2037
|
+
# # └──────┴──────┘
|
416
2038
|
def fill_nan(fill_value)
  # Build the fill as a lazy query, then materialize it with optimizations off.
  query = lazy.fill_nan(fill_value)
  query.collect(no_optimization: true)
end
|
419
2041
|
|
420
|
-
#
|
421
|
-
#
|
2042
|
+
# Explode `DataFrame` to long format by exploding a column with Lists.
|
2043
|
+
#
|
2044
|
+
# @param columns [Object]
|
2045
|
+
# Column of LargeList type.
|
2046
|
+
#
|
2047
|
+
# @return [DataFrame]
|
2048
|
+
#
|
2049
|
+
# @example
|
2050
|
+
# df = Polars::DataFrame.new(
|
2051
|
+
# {
|
2052
|
+
# "letters" => ["a", "a", "b", "c"],
|
2053
|
+
# "numbers" => [[1], [2, 3], [4, 5], [6, 7, 8]]
|
2054
|
+
# }
|
2055
|
+
# )
|
2056
|
+
# df.explode("numbers")
|
2057
|
+
# # =>
|
2058
|
+
# # shape: (8, 2)
|
2059
|
+
# # ┌─────────┬─────────┐
|
2060
|
+
# # │ letters ┆ numbers │
|
2061
|
+
# # │ --- ┆ --- │
|
2062
|
+
# # │ str ┆ i64 │
|
2063
|
+
# # ╞═════════╪═════════╡
|
2064
|
+
# # │ a ┆ 1 │
|
2065
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2066
|
+
# # │ a ┆ 2 │
|
2067
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2068
|
+
# # │ a ┆ 3 │
|
2069
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2070
|
+
# # │ b ┆ 4 │
|
2071
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2072
|
+
# # │ b ┆ 5 │
|
2073
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2074
|
+
# # │ c ┆ 6 │
|
2075
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2076
|
+
# # │ c ┆ 7 │
|
2077
|
+
# # ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
2078
|
+
# # │ c ┆ 8 │
|
2079
|
+
# # └─────────┴─────────┘
|
2080
|
+
def explode(columns)
  # Build the explode as a lazy query, then materialize it with optimizations off.
  query = lazy.explode(columns)
  query.collect(no_optimization: true)
end
|
422
2083
|
|
423
2084
|
# def pivot
|
424
2085
|
# end
|
@@ -432,24 +2093,242 @@ module Polars
|
|
432
2093
|
# def partition_by
|
433
2094
|
# end
|
434
2095
|
|
435
|
-
#
|
436
|
-
#
|
2096
|
+
# Shift values by the given period.
|
2097
|
+
#
|
2098
|
+
# @param periods [Integer]
|
2099
|
+
# Number of places to shift (may be negative).
|
2100
|
+
#
|
2101
|
+
# @return [DataFrame]
|
2102
|
+
#
|
2103
|
+
# @example
|
2104
|
+
# df = Polars::DataFrame.new(
|
2105
|
+
# {
|
2106
|
+
# "foo" => [1, 2, 3],
|
2107
|
+
# "bar" => [6, 7, 8],
|
2108
|
+
# "ham" => ["a", "b", "c"]
|
2109
|
+
# }
|
2110
|
+
# )
|
2111
|
+
# df.shift(1)
|
2112
|
+
# # =>
|
2113
|
+
# # shape: (3, 3)
|
2114
|
+
# # ┌──────┬──────┬──────┐
|
2115
|
+
# # │ foo ┆ bar ┆ ham │
|
2116
|
+
# # │ --- ┆ --- ┆ --- │
|
2117
|
+
# # │ i64 ┆ i64 ┆ str │
|
2118
|
+
# # ╞══════╪══════╪══════╡
|
2119
|
+
# # │ null ┆ null ┆ null │
|
2120
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2121
|
+
# # │ 1 ┆ 6 ┆ a │
|
2122
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2123
|
+
# # │ 2 ┆ 7 ┆ b │
|
2124
|
+
# # └──────┴──────┴──────┘
|
2125
|
+
#
|
2126
|
+
# @example
|
2127
|
+
# df.shift(-1)
|
2128
|
+
# # =>
|
2129
|
+
# # shape: (3, 3)
|
2130
|
+
# # ┌──────┬──────┬──────┐
|
2131
|
+
# # │ foo ┆ bar ┆ ham │
|
2132
|
+
# # │ --- ┆ --- ┆ --- │
|
2133
|
+
# # │ i64 ┆ i64 ┆ str │
|
2134
|
+
# # ╞══════╪══════╪══════╡
|
2135
|
+
# # │ 2 ┆ 7 ┆ b │
|
2136
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2137
|
+
# # │ 3 ┆ 8 ┆ c │
|
2138
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
2139
|
+
# # │ null ┆ null ┆ null │
|
2140
|
+
# # └──────┴──────┴──────┘
|
2141
|
+
def shift(periods)
  # The shift itself runs on the native frame; the result is re-wrapped.
  shifted = _df.shift(periods)
  _from_rbdf(shifted)
end
|
437
2144
|
|
438
|
-
#
|
439
|
-
#
|
2145
|
+
# Shift the values by a given period and fill the resulting null values.
|
2146
|
+
#
|
2147
|
+
# @param periods [Integer]
|
2148
|
+
# Number of places to shift (may be negative).
|
2149
|
+
# @param fill_value [Object]
|
2150
|
+
# Fill nil values with this value.
|
2151
|
+
#
|
2152
|
+
# @return [DataFrame]
|
2153
|
+
#
|
2154
|
+
# @example
|
2155
|
+
# df = Polars::DataFrame.new(
|
2156
|
+
# {
|
2157
|
+
# "foo" => [1, 2, 3],
|
2158
|
+
# "bar" => [6, 7, 8],
|
2159
|
+
# "ham" => ["a", "b", "c"]
|
2160
|
+
# }
|
2161
|
+
# )
|
2162
|
+
# df.shift_and_fill(1, 0)
|
2163
|
+
# # =>
|
2164
|
+
# # shape: (3, 3)
|
2165
|
+
# # ┌─────┬─────┬─────┐
|
2166
|
+
# # │ foo ┆ bar ┆ ham │
|
2167
|
+
# # │ --- ┆ --- ┆ --- │
|
2168
|
+
# # │ i64 ┆ i64 ┆ str │
|
2169
|
+
# # ╞═════╪═════╪═════╡
|
2170
|
+
# # │ 0 ┆ 0 ┆ 0 │
|
2171
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2172
|
+
# # │ 1 ┆ 6 ┆ a │
|
2173
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2174
|
+
# # │ 2 ┆ 7 ┆ b │
|
2175
|
+
# # └─────┴─────┴─────┘
|
2176
|
+
def shift_and_fill(periods, fill_value)
  # Build the shift-and-fill lazily, then collect without optimizations
  # and without the string cache.
  query = lazy.shift_and_fill(periods, fill_value)
  query.collect(no_optimization: true, string_cache: false)
end
|
440
2181
|
|
2182
|
+
# Get a mask of all duplicated rows in this DataFrame.
|
2183
|
+
#
|
2184
|
+
# @return [Series]
|
2185
|
+
#
|
2186
|
+
# @example
|
2187
|
+
# df = Polars::DataFrame.new(
|
2188
|
+
# {
|
2189
|
+
# "a" => [1, 2, 3, 1],
|
2190
|
+
# "b" => ["x", "y", "z", "x"],
|
2191
|
+
# }
|
2192
|
+
# )
|
2193
|
+
# df.is_duplicated
|
2194
|
+
# # =>
|
2195
|
+
# # shape: (4,)
|
2196
|
+
# # Series: '' [bool]
|
2197
|
+
# # [
|
2198
|
+
# # true
|
2199
|
+
# # false
|
2200
|
+
# # false
|
2201
|
+
# # true
|
2202
|
+
# # ]
|
441
2203
|
def is_duplicated
  # The native frame computes the mask; wrap it as a Series.
  mask = _df.is_duplicated
  Utils.wrap_s(mask)
end
|
444
2206
|
|
2207
|
+
# Get a mask of all unique rows in this DataFrame.
|
2208
|
+
#
|
2209
|
+
# @return [Series]
|
2210
|
+
#
|
2211
|
+
# @example
|
2212
|
+
# df = Polars::DataFrame.new(
|
2213
|
+
# {
|
2214
|
+
# "a" => [1, 2, 3, 1],
|
2215
|
+
# "b" => ["x", "y", "z", "x"]
|
2216
|
+
# }
|
2217
|
+
# )
|
2218
|
+
# df.is_unique
|
2219
|
+
# # =>
|
2220
|
+
# # shape: (4,)
|
2221
|
+
# # Series: '' [bool]
|
2222
|
+
# # [
|
2223
|
+
# # false
|
2224
|
+
# # true
|
2225
|
+
# # true
|
2226
|
+
# # false
|
2227
|
+
# # ]
|
445
2228
|
def is_unique
  # The native frame computes the mask; wrap it as a Series.
  mask = _df.is_unique
  Utils.wrap_s(mask)
end
|
448
2231
|
|
2232
|
+
# Start a lazy query from this point.
|
2233
|
+
#
|
2234
|
+
# @return [LazyFrame]
|
449
2235
|
def lazy
  # Ask the native frame for its lazy counterpart and wrap it.
  native_lazy = _df.lazy
  wrap_ldf(native_lazy)
end
|
452
2238
|
|
2239
|
+
# Select columns from this DataFrame.
|
2240
|
+
#
|
2241
|
+
# @param exprs [Object]
|
2242
|
+
# Column or columns to select.
|
2243
|
+
#
|
2244
|
+
# @return [DataFrame]
|
2245
|
+
#
|
2246
|
+
# @example
|
2247
|
+
# df = Polars::DataFrame.new(
|
2248
|
+
# {
|
2249
|
+
# "foo" => [1, 2, 3],
|
2250
|
+
# "bar" => [6, 7, 8],
|
2251
|
+
# "ham" => ["a", "b", "c"]
|
2252
|
+
# }
|
2253
|
+
# )
|
2254
|
+
# df.select("foo")
|
2255
|
+
# # =>
|
2256
|
+
# # shape: (3, 1)
|
2257
|
+
# # ┌─────┐
|
2258
|
+
# # │ foo │
|
2259
|
+
# # │ --- │
|
2260
|
+
# # │ i64 │
|
2261
|
+
# # ╞═════╡
|
2262
|
+
# # │ 1 │
|
2263
|
+
# # ├╌╌╌╌╌┤
|
2264
|
+
# # │ 2 │
|
2265
|
+
# # ├╌╌╌╌╌┤
|
2266
|
+
# # │ 3 │
|
2267
|
+
# # └─────┘
|
2268
|
+
#
|
2269
|
+
# @example
|
2270
|
+
# df.select(["foo", "bar"])
|
2271
|
+
# # =>
|
2272
|
+
# # shape: (3, 2)
|
2273
|
+
# # ┌─────┬─────┐
|
2274
|
+
# # │ foo ┆ bar │
|
2275
|
+
# # │ --- ┆ --- │
|
2276
|
+
# # │ i64 ┆ i64 │
|
2277
|
+
# # ╞═════╪═════╡
|
2278
|
+
# # │ 1 ┆ 6 │
|
2279
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2280
|
+
# # │ 2 ┆ 7 │
|
2281
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2282
|
+
# # │ 3 ┆ 8 │
|
2283
|
+
# # └─────┴─────┘
|
2284
|
+
#
|
2285
|
+
# @example
|
2286
|
+
# df.select(Polars.col("foo") + 1)
|
2287
|
+
# # =>
|
2288
|
+
# # shape: (3, 1)
|
2289
|
+
# # ┌─────┐
|
2290
|
+
# # │ foo │
|
2291
|
+
# # │ --- │
|
2292
|
+
# # │ i64 │
|
2293
|
+
# # ╞═════╡
|
2294
|
+
# # │ 2 │
|
2295
|
+
# # ├╌╌╌╌╌┤
|
2296
|
+
# # │ 3 │
|
2297
|
+
# # ├╌╌╌╌╌┤
|
2298
|
+
# # │ 4 │
|
2299
|
+
# # └─────┘
|
2300
|
+
#
|
2301
|
+
# @example
|
2302
|
+
# df.select([Polars.col("foo") + 1, Polars.col("bar") + 1])
|
2303
|
+
# # =>
|
2304
|
+
# # shape: (3, 2)
|
2305
|
+
# # ┌─────┬─────┐
|
2306
|
+
# # │ foo ┆ bar │
|
2307
|
+
# # │ --- ┆ --- │
|
2308
|
+
# # │ i64 ┆ i64 │
|
2309
|
+
# # ╞═════╪═════╡
|
2310
|
+
# # │ 2 ┆ 7 │
|
2311
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2312
|
+
# # │ 3 ┆ 8 │
|
2313
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
2314
|
+
# # │ 4 ┆ 9 │
|
2315
|
+
# # └─────┴─────┘
|
2316
|
+
#
|
2317
|
+
# @example
|
2318
|
+
# df.select(Polars.when(Polars.col("foo") > 2).then(10).otherwise(0))
|
2319
|
+
# # =>
|
2320
|
+
# # shape: (3, 1)
|
2321
|
+
# # ┌─────────┐
|
2322
|
+
# # │ literal │
|
2323
|
+
# # │ --- │
|
2324
|
+
# # │ i64 │
|
2325
|
+
# # ╞═════════╡
|
2326
|
+
# # │ 0 │
|
2327
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
2328
|
+
# # │ 0 │
|
2329
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
2330
|
+
# # │ 10 │
|
2331
|
+
# # └─────────┘
|
453
2332
|
def select(exprs)
|
454
2333
|
_from_rbdf(
|
455
2334
|
lazy
|
@@ -459,6 +2338,43 @@ module Polars
|
|
459
2338
|
)
|
460
2339
|
end
|
461
2340
|
|
2341
|
+
# Add or overwrite multiple columns in a DataFrame.
|
2342
|
+
#
|
2343
|
+
# @param exprs [Array]
|
2344
|
+
# Array of Expressions that evaluate to columns.
|
2345
|
+
#
|
2346
|
+
# @return [DataFrame]
|
2347
|
+
#
|
2348
|
+
# @example
|
2349
|
+
# df = Polars::DataFrame.new(
|
2350
|
+
# {
|
2351
|
+
# "a" => [1, 2, 3, 4],
|
2352
|
+
# "b" => [0.5, 4, 10, 13],
|
2353
|
+
# "c" => [true, true, false, true]
|
2354
|
+
# }
|
2355
|
+
# )
|
2356
|
+
# df.with_columns(
|
2357
|
+
# [
|
2358
|
+
# (Polars.col("a") ** 2).alias("a^2"),
|
2359
|
+
# (Polars.col("b") / 2).alias("b/2"),
|
2360
|
+
# (Polars.col("c").is_not()).alias("not c")
|
2361
|
+
# ]
|
2362
|
+
# )
|
2363
|
+
# # =>
|
2364
|
+
# # shape: (4, 6)
|
2365
|
+
# # ┌─────┬──────┬───────┬──────┬──────┬───────┐
|
2366
|
+
# # │ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
|
2367
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2368
|
+
# # │ i64 ┆ f64 ┆ bool ┆ f64 ┆ f64 ┆ bool │
|
2369
|
+
# # ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
|
2370
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ 1.0 ┆ 0.25 ┆ false │
|
2371
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2372
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 4.0 ┆ 2.0 ┆ false │
|
2373
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2374
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 9.0 ┆ 5.0 ┆ true │
|
2375
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2376
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
|
2377
|
+
# # └─────┴──────┴───────┴──────┴──────┴───────┘
|
462
2378
|
def with_columns(exprs)
|
463
2379
|
if !exprs.nil? && !exprs.is_a?(Array)
|
464
2380
|
exprs = [exprs]
|
@@ -468,6 +2384,26 @@ module Polars
|
|
468
2384
|
.collect(no_optimization: true, string_cache: false)
|
469
2385
|
end
|
470
2386
|
|
2387
|
+
# Get number of chunks used by the ChunkedArrays of this DataFrame.
|
2388
|
+
#
|
2389
|
+
# @param strategy ["first", "all"]
|
2390
|
+
# Return the number of chunks of the 'first' column,
|
2391
|
+
# or 'all' columns in this DataFrame.
|
2392
|
+
#
|
2393
|
+
# @return [Object]
|
2394
|
+
#
|
2395
|
+
# @example
|
2396
|
+
# df = Polars::DataFrame.new(
|
2397
|
+
# {
|
2398
|
+
# "a" => [1, 2, 3, 4],
|
2399
|
+
# "b" => [0.5, 4, 10, 13],
|
2400
|
+
# "c" => [true, true, false, true]
|
2401
|
+
# }
|
2402
|
+
# )
|
2403
|
+
# df.n_chunks
|
2404
|
+
# # => 1
|
2405
|
+
# df.n_chunks(strategy: "all")
|
2406
|
+
# # => [1, 1, 1]
|
471
2407
|
def n_chunks(strategy: "first")
|
472
2408
|
if strategy == "first"
|
473
2409
|
_df.n_chunks
|
@@ -478,6 +2414,28 @@ module Polars
|
|
478
2414
|
end
|
479
2415
|
end
|
480
2416
|
|
2417
|
+
# Aggregate the columns of this DataFrame to their maximum value.
|
2418
|
+
#
|
2419
|
+
# @return [DataFrame]
|
2420
|
+
#
|
2421
|
+
# @example
|
2422
|
+
# df = Polars::DataFrame.new(
|
2423
|
+
# {
|
2424
|
+
# "foo" => [1, 2, 3],
|
2425
|
+
# "bar" => [6, 7, 8],
|
2426
|
+
# "ham" => ["a", "b", "c"]
|
2427
|
+
# }
|
2428
|
+
# )
|
2429
|
+
# df.max
|
2430
|
+
# # =>
|
2431
|
+
# # shape: (1, 3)
|
2432
|
+
# # ┌─────┬─────┬─────┐
|
2433
|
+
# # │ foo ┆ bar ┆ ham │
|
2434
|
+
# # │ --- ┆ --- ┆ --- │
|
2435
|
+
# # │ i64 ┆ i64 ┆ str │
|
2436
|
+
# # ╞═════╪═════╪═════╡
|
2437
|
+
# # │ 3 ┆ 8 ┆ c │
|
2438
|
+
# # └─────┴─────┴─────┘
|
481
2439
|
def max(axis: 0)
|
482
2440
|
if axis == 0
|
483
2441
|
_from_rbdf(_df.max)
|
@@ -488,6 +2446,28 @@ module Polars
|
|
488
2446
|
end
|
489
2447
|
end
|
490
2448
|
|
2449
|
+
# Aggregate the columns of this DataFrame to their minimum value.
|
2450
|
+
#
|
2451
|
+
# @return [DataFrame]
|
2452
|
+
#
|
2453
|
+
# @example
|
2454
|
+
# df = Polars::DataFrame.new(
|
2455
|
+
# {
|
2456
|
+
# "foo" => [1, 2, 3],
|
2457
|
+
# "bar" => [6, 7, 8],
|
2458
|
+
# "ham" => ["a", "b", "c"]
|
2459
|
+
# }
|
2460
|
+
# )
|
2461
|
+
# df.min
|
2462
|
+
# # =>
|
2463
|
+
# # shape: (1, 3)
|
2464
|
+
# # ┌─────┬─────┬─────┐
|
2465
|
+
# # │ foo ┆ bar ┆ ham │
|
2466
|
+
# # │ --- ┆ --- ┆ --- │
|
2467
|
+
# # │ i64 ┆ i64 ┆ str │
|
2468
|
+
# # ╞═════╪═════╪═════╡
|
2469
|
+
# # │ 1 ┆ 6 ┆ a │
|
2470
|
+
# # └─────┴─────┴─────┘
|
491
2471
|
def min(axis: 0)
|
492
2472
|
if axis == 0
|
493
2473
|
_from_rbdf(_df.min)
|
@@ -498,6 +2478,44 @@ module Polars
|
|
498
2478
|
end
|
499
2479
|
end
|
500
2480
|
|
2481
|
+
# Aggregate the columns of this DataFrame to their sum value.
|
2482
|
+
#
|
2483
|
+
# @param axis [Integer]
|
2484
|
+
# Either 0 or 1.
|
2485
|
+
# @param null_strategy ["ignore", "propagate"]
|
2486
|
+
# This argument is only used if axis == 1.
|
2487
|
+
#
|
2488
|
+
# @return [DataFrame]
|
2489
|
+
#
|
2490
|
+
# @example
|
2491
|
+
# df = Polars::DataFrame.new(
|
2492
|
+
# {
|
2493
|
+
# "foo" => [1, 2, 3],
|
2494
|
+
# "bar" => [6, 7, 8],
|
2495
|
+
# "ham" => ["a", "b", "c"],
|
2496
|
+
# }
|
2497
|
+
# )
|
2498
|
+
# df.sum
|
2499
|
+
# # =>
|
2500
|
+
# # shape: (1, 3)
|
2501
|
+
# # ┌─────┬─────┬──────┐
|
2502
|
+
# # │ foo ┆ bar ┆ ham │
|
2503
|
+
# # │ --- ┆ --- ┆ --- │
|
2504
|
+
# # │ i64 ┆ i64 ┆ str │
|
2505
|
+
# # ╞═════╪═════╪══════╡
|
2506
|
+
# # │ 6 ┆ 21 ┆ null │
|
2507
|
+
# # └─────┴─────┴──────┘
|
2508
|
+
#
|
2509
|
+
# @example
|
2510
|
+
# df.sum(axis: 1)
|
2511
|
+
# # =>
|
2512
|
+
# # shape: (3,)
|
2513
|
+
# # Series: 'foo' [str]
|
2514
|
+
# # [
|
2515
|
+
# # "16a"
|
2516
|
+
# # "27b"
|
2517
|
+
# # "38c"
|
2518
|
+
# # ]
|
501
2519
|
def sum(axis: 0, null_strategy: "ignore")
|
502
2520
|
case axis
|
503
2521
|
when 0
|
@@ -509,6 +2527,33 @@ module Polars
|
|
509
2527
|
end
|
510
2528
|
end
|
511
2529
|
|
2530
|
+
# Aggregate the columns of this DataFrame to their mean value.
|
2531
|
+
#
|
2532
|
+
# @param axis [Integer]
|
2533
|
+
# Either 0 or 1.
|
2534
|
+
# @param null_strategy ["ignore", "propagate"]
|
2535
|
+
# This argument is only used if axis == 1.
|
2536
|
+
#
|
2537
|
+
# @return [DataFrame]
|
2538
|
+
#
|
2539
|
+
# @example
|
2540
|
+
# df = Polars::DataFrame.new(
|
2541
|
+
# {
|
2542
|
+
# "foo" => [1, 2, 3],
|
2543
|
+
# "bar" => [6, 7, 8],
|
2544
|
+
# "ham" => ["a", "b", "c"]
|
2545
|
+
# }
|
2546
|
+
# )
|
2547
|
+
# df.mean
|
2548
|
+
# # =>
|
2549
|
+
# # shape: (1, 3)
|
2550
|
+
# # ┌─────┬─────┬──────┐
|
2551
|
+
# # │ foo ┆ bar ┆ ham │
|
2552
|
+
# # │ --- ┆ --- ┆ --- │
|
2553
|
+
# # │ f64 ┆ f64 ┆ str │
|
2554
|
+
# # ╞═════╪═════╪══════╡
|
2555
|
+
# # │ 2.0 ┆ 7.0 ┆ null │
|
2556
|
+
# # └─────┴─────┴──────┘
|
512
2557
|
def mean(axis: 0, null_strategy: "ignore")
|
513
2558
|
case axis
|
514
2559
|
when 0
|
@@ -520,75 +2565,633 @@ module Polars
|
|
520
2565
|
end
|
521
2566
|
end
|
522
2567
|
|
2568
|
+
# Aggregate the columns of this DataFrame to their standard deviation value.
|
2569
|
+
#
|
2570
|
+
# @param ddof [Integer]
|
2571
|
+
# Degrees of freedom
|
2572
|
+
#
|
2573
|
+
# @return [DataFrame]
|
2574
|
+
#
|
2575
|
+
# @example
|
2576
|
+
# df = Polars::DataFrame.new(
|
2577
|
+
# {
|
2578
|
+
# "foo" => [1, 2, 3],
|
2579
|
+
# "bar" => [6, 7, 8],
|
2580
|
+
# "ham" => ["a", "b", "c"]
|
2581
|
+
# }
|
2582
|
+
# )
|
2583
|
+
# df.std
|
2584
|
+
# # =>
|
2585
|
+
# # shape: (1, 3)
|
2586
|
+
# # ┌─────┬─────┬──────┐
|
2587
|
+
# # │ foo ┆ bar ┆ ham │
|
2588
|
+
# # │ --- ┆ --- ┆ --- │
|
2589
|
+
# # │ f64 ┆ f64 ┆ str │
|
2590
|
+
# # ╞═════╪═════╪══════╡
|
2591
|
+
# # │ 1.0 ┆ 1.0 ┆ null │
|
2592
|
+
# # └─────┴─────┴──────┘
|
2593
|
+
#
|
2594
|
+
# @example
|
2595
|
+
# df.std(ddof: 0)
|
2596
|
+
# # =>
|
2597
|
+
# # shape: (1, 3)
|
2598
|
+
# # ┌──────────┬──────────┬──────┐
|
2599
|
+
# # │ foo ┆ bar ┆ ham │
|
2600
|
+
# # │ --- ┆ --- ┆ --- │
|
2601
|
+
# # │ f64 ┆ f64 ┆ str │
|
2602
|
+
# # ╞══════════╪══════════╪══════╡
|
2603
|
+
# # │ 0.816497 ┆ 0.816497 ┆ null │
|
2604
|
+
# # └──────────┴──────────┴──────┘
|
523
2605
|
def std(ddof: 1)
  # `ddof` is handed to the native frame positionally.
  aggregated = _df.std(ddof)
  _from_rbdf(aggregated)
end
|
526
2608
|
|
2609
|
+
# Aggregate the columns of this DataFrame to their variance value.
|
2610
|
+
#
|
2611
|
+
# @param ddof [Integer]
|
2612
|
+
# Degrees of freedom
|
2613
|
+
#
|
2614
|
+
# @return [DataFrame]
|
2615
|
+
#
|
2616
|
+
# @example
|
2617
|
+
# df = Polars::DataFrame.new(
|
2618
|
+
# {
|
2619
|
+
# "foo" => [1, 2, 3],
|
2620
|
+
# "bar" => [6, 7, 8],
|
2621
|
+
# "ham" => ["a", "b", "c"]
|
2622
|
+
# }
|
2623
|
+
# )
|
2624
|
+
# df.var
|
2625
|
+
# # =>
|
2626
|
+
# # shape: (1, 3)
|
2627
|
+
# # ┌─────┬─────┬──────┐
|
2628
|
+
# # │ foo ┆ bar ┆ ham │
|
2629
|
+
# # │ --- ┆ --- ┆ --- │
|
2630
|
+
# # │ f64 ┆ f64 ┆ str │
|
2631
|
+
# # ╞═════╪═════╪══════╡
|
2632
|
+
# # │ 1.0 ┆ 1.0 ┆ null │
|
2633
|
+
# # └─────┴─────┴──────┘
|
2634
|
+
#
|
2635
|
+
# @example
|
2636
|
+
# df.var(ddof: 0)
|
2637
|
+
# # =>
|
2638
|
+
# # shape: (1, 3)
|
2639
|
+
# # ┌──────────┬──────────┬──────┐
|
2640
|
+
# # │ foo ┆ bar ┆ ham │
|
2641
|
+
# # │ --- ┆ --- ┆ --- │
|
2642
|
+
# # │ f64 ┆ f64 ┆ str │
|
2643
|
+
# # ╞══════════╪══════════╪══════╡
|
2644
|
+
# # │ 0.666667 ┆ 0.666667 ┆ null │
|
2645
|
+
# # └──────────┴──────────┴──────┘
|
527
2646
|
def var(ddof: 1)
  # `ddof` is handed to the native frame positionally.
  aggregated = _df.var(ddof)
  _from_rbdf(aggregated)
end
|
530
2649
|
|
2650
|
+
# Aggregate the columns of this DataFrame to their median value.
|
2651
|
+
#
|
2652
|
+
# @return [DataFrame]
|
2653
|
+
#
|
2654
|
+
# @example
|
2655
|
+
# df = Polars::DataFrame.new(
|
2656
|
+
# {
|
2657
|
+
# "foo" => [1, 2, 3],
|
2658
|
+
# "bar" => [6, 7, 8],
|
2659
|
+
# "ham" => ["a", "b", "c"]
|
2660
|
+
# }
|
2661
|
+
# )
|
2662
|
+
# df.median
|
2663
|
+
# # =>
|
2664
|
+
# # shape: (1, 3)
|
2665
|
+
# # ┌─────┬─────┬──────┐
|
2666
|
+
# # │ foo ┆ bar ┆ ham │
|
2667
|
+
# # │ --- ┆ --- ┆ --- │
|
2668
|
+
# # │ f64 ┆ f64 ┆ str │
|
2669
|
+
# # ╞═════╪═════╪══════╡
|
2670
|
+
# # │ 2.0 ┆ 7.0 ┆ null │
|
2671
|
+
# # └─────┴─────┴──────┘
|
531
2672
|
def median
  # Aggregation happens on the native frame; the result is re-wrapped.
  aggregated = _df.median
  _from_rbdf(aggregated)
end
|
534
2675
|
|
535
|
-
#
|
536
|
-
#
|
2676
|
+
# Aggregate the columns of this DataFrame to their product values.
|
2677
|
+
#
|
2678
|
+
# @return [DataFrame]
|
2679
|
+
#
|
2680
|
+
# @example
|
2681
|
+
# df = Polars::DataFrame.new(
|
2682
|
+
# {
|
2683
|
+
# "a" => [1, 2, 3],
|
2684
|
+
# "b" => [0.5, 4, 10],
|
2685
|
+
# "c" => [true, true, false]
|
2686
|
+
# }
|
2687
|
+
# )
|
2688
|
+
# df.product
|
2689
|
+
# # =>
|
2690
|
+
# # shape: (1, 3)
|
2691
|
+
# # ┌─────┬──────┬─────┐
|
2692
|
+
# # │ a ┆ b ┆ c │
|
2693
|
+
# # │ --- ┆ --- ┆ --- │
|
2694
|
+
# # │ i64 ┆ f64 ┆ i64 │
|
2695
|
+
# # ╞═════╪══════╪═════╡
|
2696
|
+
# # │ 6 ┆ 20.0 ┆ 0 │
|
2697
|
+
# # └─────┴──────┴─────┘
|
2698
|
+
def product
  # Expressed as a selection of the product over all columns.
  product_of_all = Polars.all.product
  select(product_of_all)
end
|
537
2701
|
|
538
|
-
#
|
539
|
-
#
|
2702
|
+
# Aggregate the columns of this DataFrame to their quantile value.
|
2703
|
+
#
|
2704
|
+
# @param quantile [Float]
|
2705
|
+
# Quantile between 0.0 and 1.0.
|
2706
|
+
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
|
2707
|
+
# Interpolation method.
|
2708
|
+
#
|
2709
|
+
# @return [DataFrame]
|
2710
|
+
#
|
2711
|
+
# @example
|
2712
|
+
# df = Polars::DataFrame.new(
|
2713
|
+
# {
|
2714
|
+
# "foo" => [1, 2, 3],
|
2715
|
+
# "bar" => [6, 7, 8],
|
2716
|
+
# "ham" => ["a", "b", "c"]
|
2717
|
+
# }
|
2718
|
+
# )
|
2719
|
+
# df.quantile(0.5, interpolation: "nearest")
|
2720
|
+
# # =>
|
2721
|
+
# # shape: (1, 3)
|
2722
|
+
# # ┌─────┬─────┬──────┐
|
2723
|
+
# # │ foo ┆ bar ┆ ham │
|
2724
|
+
# # │ --- ┆ --- ┆ --- │
|
2725
|
+
# # │ f64 ┆ f64 ┆ str │
|
2726
|
+
# # ╞═════╪═════╪══════╡
|
2727
|
+
# # │ 2.0 ┆ 7.0 ┆ null │
|
2728
|
+
# # └─────┴─────┴──────┘
|
2729
|
+
def quantile(quantile, interpolation: "nearest")
  # Both arguments are handed to the native frame positionally.
  aggregated = _df.quantile(quantile, interpolation)
  _from_rbdf(aggregated)
end
|
540
2732
|
|
541
|
-
#
|
542
|
-
#
|
2733
|
+
# Get one hot encoded dummy variables.
|
2734
|
+
#
|
2735
|
+
# @param columns [Object]
|
2736
|
+
# A subset of columns to convert to dummy variables. `nil` means
|
2737
|
+
# "all columns".
|
2738
|
+
#
|
2739
|
+
# @return [DataFrame]
|
2740
|
+
#
|
2741
|
+
# @example
|
2742
|
+
# df = Polars::DataFrame.new(
|
2743
|
+
# {
|
2744
|
+
# "foo" => [1, 2],
|
2745
|
+
# "bar" => [3, 4],
|
2746
|
+
# "ham" => ["a", "b"]
|
2747
|
+
# }
|
2748
|
+
# )
|
2749
|
+
# df.to_dummies
|
2750
|
+
# # =>
|
2751
|
+
# # shape: (2, 6)
|
2752
|
+
# # ┌───────┬───────┬───────┬───────┬───────┬───────┐
|
2753
|
+
# # │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
|
2754
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
2755
|
+
# # │ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
|
2756
|
+
# # ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
|
2757
|
+
# # │ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
|
2758
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2759
|
+
# # │ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
|
2760
|
+
# # └───────┴───────┴───────┴───────┴───────┴───────┘
|
2761
|
+
def to_dummies(columns: nil)
  # A lone column name is promoted to a one-element Array.
  columns = [columns] if columns.is_a?(String)
  encoded = _df.to_dummies(columns)
  _from_rbdf(encoded)
end
|
543
2767
|
|
544
|
-
#
|
545
|
-
#
|
2768
|
+
# Drop duplicate rows from this DataFrame.
|
2769
|
+
#
|
2770
|
+
# @param maintain_order [Boolean]
|
2771
|
+
# Keep the same order as the original DataFrame. This requires more work to
|
2772
|
+
# compute.
|
2773
|
+
# @param subset [Object]
|
2774
|
+
# Subset to use to compare rows.
|
2775
|
+
# @param keep ["first", "last"]
|
2776
|
+
# Which of the duplicate rows to keep (in conjunction with `subset`).
|
2777
|
+
#
|
2778
|
+
# @return [DataFrame]
|
2779
|
+
#
|
2780
|
+
# @note
|
2781
|
+
# Note that this fails if there is a column of type `List` in the DataFrame or
|
2782
|
+
# subset.
|
2783
|
+
#
|
2784
|
+
# @example
|
2785
|
+
# df = Polars::DataFrame.new(
|
2786
|
+
# {
|
2787
|
+
# "a" => [1, 1, 2, 3, 4, 5],
|
2788
|
+
# "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
|
2789
|
+
# "c" => [true, true, true, false, true, true]
|
2790
|
+
# }
|
2791
|
+
# )
|
2792
|
+
# df.unique
|
2793
|
+
# # =>
|
2794
|
+
# # shape: (5, 3)
|
2795
|
+
# # ┌─────┬─────┬───────┐
|
2796
|
+
# # │ a ┆ b ┆ c │
|
2797
|
+
# # │ --- ┆ --- ┆ --- │
|
2798
|
+
# # │ i64 ┆ f64 ┆ bool │
|
2799
|
+
# # ╞═════╪═════╪═══════╡
|
2800
|
+
# # │ 1 ┆ 0.5 ┆ true │
|
2801
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2802
|
+
# # │ 2 ┆ 1.0 ┆ true │
|
2803
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2804
|
+
# # │ 3 ┆ 2.0 ┆ false │
|
2805
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2806
|
+
# # │ 4 ┆ 3.0 ┆ true │
|
2807
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
2808
|
+
# # │ 5 ┆ 3.0 ┆ true │
|
2809
|
+
# # └─────┴─────┴───────┘
|
2810
|
+
def unique(maintain_order: true, subset: nil, keep: "first")
  # Normalize `subset`: a single name becomes a one-element Array, any other
  # non-nil, non-Array value is converted with `to_a`; nil and Arrays pass through.
  if subset.is_a?(String)
    subset = [subset]
  elsif !subset.nil? && !subset.is_a?(Array)
    subset = subset.to_a
  end

  deduplicated = _df.unique(maintain_order, subset, keep)
  _from_rbdf(deduplicated)
end
|
2821
|
+
|
2822
|
+
# Return the number of unique rows, or the number of unique row-subsets.
|
2823
|
+
#
|
2824
|
+
# @param subset [Object]
|
2825
|
+
# One or more columns/expressions that define what to count;
|
2826
|
+
# omit to return the count of unique rows.
|
2827
|
+
#
|
2828
|
+
# @return [DataFrame]
|
2829
|
+
#
|
2830
|
+
# @example
|
2831
|
+
# df = Polars::DataFrame.new(
|
2832
|
+
# {
|
2833
|
+
# "a" => [1, 1, 2, 3, 4, 5],
|
2834
|
+
# "b" => [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
|
2835
|
+
# "c" => [true, true, true, false, true, true]
|
2836
|
+
# }
|
2837
|
+
# )
|
2838
|
+
# df.n_unique
|
2839
|
+
# # => 5
|
2840
|
+
#
|
2841
|
+
# @example Simple columns subset
|
2842
|
+
# df.n_unique(subset: ["b", "c"])
|
2843
|
+
# # => 4
|
2844
|
+
#
|
2845
|
+
# @example Expression subset
|
2846
|
+
# df.n_unique(
|
2847
|
+
# subset: [
|
2848
|
+
# (Polars.col("a").floordiv(2)),
|
2849
|
+
# (Polars.col("c") | (Polars.col("b") >= 2))
|
2850
|
+
# ]
|
2851
|
+
# )
|
2852
|
+
# # => 3
|
2853
|
+
def n_unique(subset: nil)
  # Normalize a lone column name or a lone expression to a one-element Array.
  # The name check must be against String: it was previously written against
  # StringIO, so plain column names were never turned into column expressions.
  if subset.is_a?(String)
    subset = [Polars.col(subset)]
  elsif subset.is_a?(Expr)
    subset = [subset]
  end

  if subset.is_a?(Array) && subset.length == 1
    # A single entry is counted directly.
    expr = Utils.expr_to_lit_or_expr(subset[0], str_to_lit: false)
  else
    # Several entries (or every column when no subset is given) are combined
    # into one struct expression so that uniqueness is evaluated per row.
    struct_fields = subset.nil? ? Polars.all : subset
    expr = Polars.struct(struct_fields)
  end

  df = lazy.select(expr.n_unique).collect
  # An empty result frame means there is nothing to count.
  df.is_empty ? 0 : df.row(0)[0]
end
|
549
2870
|
|
2871
|
+
# Rechunk the data in this DataFrame to a contiguous allocation.
|
2872
|
+
#
2873
|
+
# This will make sure all subsequent operations have optimal and predictable
|
2874
|
+
# performance.
|
2875
|
+
#
|
2876
|
+
# @return [DataFrame]
|
550
2877
|
def rechunk
  # Rechunking happens on the native frame; the result is re-wrapped.
  rechunked = _df.rechunk
  _from_rbdf(rechunked)
end
|
553
2880
|
|
2881
|
+
# Create a new DataFrame that shows the null counts per column.
|
2882
|
+
#
|
2883
|
+
# @return [DataFrame]
|
2884
|
+
#
|
2885
|
+
# @example
|
2886
|
+
# df = Polars::DataFrame.new(
|
2887
|
+
# {
|
2888
|
+
# "foo" => [1, nil, 3],
|
2889
|
+
# "bar" => [6, 7, nil],
|
2890
|
+
# "ham" => ["a", "b", "c"]
|
2891
|
+
# }
|
2892
|
+
# )
|
2893
|
+
# df.null_count
|
2894
|
+
# # =>
|
2895
|
+
# # shape: (1, 3)
|
2896
|
+
# # ┌─────┬─────┬─────┐
|
2897
|
+
# # │ foo ┆ bar ┆ ham │
|
2898
|
+
# # │ --- ┆ --- ┆ --- │
|
2899
|
+
# # │ u32 ┆ u32 ┆ u32 │
|
2900
|
+
# # ╞═════╪═════╪═════╡
|
2901
|
+
# # │ 1 ┆ 1 ┆ 0 │
|
2902
|
+
# # └─────┴─────┴─────┘
|
554
2903
|
def null_count
  # Counting happens on the native frame; the result is re-wrapped.
  counts = _df.null_count
  _from_rbdf(counts)
end
|
557
2906
|
|
558
|
-
#
|
559
|
-
#
|
2907
|
+
# Sample from this DataFrame.
|
2908
|
+
#
|
2909
|
+
# @param n [Integer]
|
2910
|
+
# Number of items to return. Cannot be used with `frac`. Defaults to 1 if
|
2911
|
+
# `frac` is nil.
|
2912
|
+
# @param frac [Float]
|
2913
|
+
# Fraction of items to return. Cannot be used with `n`.
|
2914
|
+
# @param with_replacement [Boolean]
|
2915
|
+
# Allow values to be sampled more than once.
|
2916
|
+
# @param shuffle [Boolean]
|
2917
|
+
# Shuffle the order of sampled data points.
|
2918
|
+
# @param seed [Integer]
|
2919
|
+
# Seed for the random number generator. If set to nil (default), a random
|
2920
|
+
# seed is used.
|
2921
|
+
#
|
2922
|
+
# @return [DataFrame]
|
2923
|
+
#
|
2924
|
+
# @example
|
2925
|
+
# df = Polars::DataFrame.new(
|
2926
|
+
# {
|
2927
|
+
# "foo" => [1, 2, 3],
|
2928
|
+
# "bar" => [6, 7, 8],
|
2929
|
+
# "ham" => ["a", "b", "c"]
|
2930
|
+
# }
|
2931
|
+
# )
|
2932
|
+
# df.sample(n: 2, seed: 0)
|
2933
|
+
# # =>
|
2934
|
+
# # shape: (2, 3)
|
2935
|
+
# # ┌─────┬─────┬─────┐
|
2936
|
+
# # │ foo ┆ bar ┆ ham │
|
2937
|
+
# # │ --- ┆ --- ┆ --- │
|
2938
|
+
# # │ i64 ┆ i64 ┆ str │
|
2939
|
+
# # ╞═════╪═════╪═════╡
|
2940
|
+
# # │ 3 ┆ 8 ┆ c │
|
2941
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
2942
|
+
# # │ 2 ┆ 7 ┆ b │
|
2943
|
+
# # └─────┴─────┴─────┘
|
2944
|
+
def sample(
  n: nil,
  frac: nil,
  with_replacement: false,
  shuffle: false,
  seed: nil
)
  # `n` and `frac` are mutually exclusive ways of sizing the sample.
  if !n.nil? && !frac.nil?
    raise ArgumentError, "cannot specify both `n` and `frac`"
  end

  # Fractional sampling must return here. Without the explicit return the
  # result was discarded and execution fell through to `sample_n` with n = 1.
  if n.nil? && !frac.nil?
    return _from_rbdf(
      _df.sample_frac(frac, with_replacement, shuffle, seed)
    )
  end

  # Neither argument given: sample a single row.
  n = 1 if n.nil?
  _from_rbdf(_df.sample_n(n, with_replacement, shuffle, seed))
end
|
560
2966
|
|
561
2967
|
# def fold
|
562
2968
|
# end
|
563
2969
|
|
564
|
-
#
|
565
|
-
#
|
2970
|
+
# Get a row as tuple, either by index or by predicate.
|
2971
|
+
#
|
2972
|
+
# @param index [Object]
|
2973
|
+
# Row index.
|
2974
|
+
# @param by_predicate [Object]
|
2975
|
+
# Select the row according to a given expression/predicate.
|
2976
|
+
#
|
2977
|
+
# @return [Object]
|
2978
|
+
#
|
2979
|
+
# @note
|
2980
|
+
# The `index` and `by_predicate` params are mutually exclusive. Additionally,
|
2981
|
+
# to ensure clarity, the `by_predicate` parameter must be supplied by keyword.
|
2982
|
+
#
|
2983
|
+
# When using `by_predicate` it is an error condition if anything other than
|
2984
|
+
# one row is returned; more than one row raises `TooManyRowsReturned`, and
|
2985
|
+
# zero rows will raise `NoRowsReturned` (both inherit from `RowsException`).
|
2986
|
+
#
|
2987
|
+
# @example Return the row at the given index
|
2988
|
+
# df = Polars::DataFrame.new(
|
2989
|
+
# {
|
2990
|
+
# "foo" => [1, 2, 3],
|
2991
|
+
# "bar" => [6, 7, 8],
|
2992
|
+
# "ham" => ["a", "b", "c"]
|
2993
|
+
# }
|
2994
|
+
# )
|
2995
|
+
# df.row(2)
|
2996
|
+
# # => [3, 8, "c"]
|
2997
|
+
#
|
2998
|
+
# @example Return the row that matches the given predicate
|
2999
|
+
# df.row(by_predicate: Polars.col("ham") == "b")
|
3000
|
+
# # => [2, 7, "b"]
|
3001
|
+
def row(index = nil, by_predicate: nil)
  if !index.nil? && !by_predicate.nil?
    raise ArgumentError, "Cannot set both 'index' and 'by_predicate'; mutually exclusive"
  elsif index.is_a?(Expr)
    # An expression in the positional slot is almost certainly a misplaced predicate.
    raise TypeError, "Expressions should be passed to the 'by_predicate' param"
  elsif index.is_a?(Integer)
    _df.row_tuple(index)
  elsif by_predicate.is_a?(Expr)
    rows = filter(by_predicate).rows
    n_rows = rows.length
    # A predicate lookup must match exactly one row.
    if n_rows > 1
      raise TooManyRowsReturned, "Predicate #{by_predicate} returned #{n_rows} rows"
    elsif n_rows == 0
      # Interpolate the predicate; the message previously contained a literal
      # "{by_predicate!s}" placeholder that was never substituted.
      raise NoRowsReturned, "Predicate <#{by_predicate}> returned no rows"
    end
    rows[0]
  else
    raise ArgumentError, "One of 'index' or 'by_predicate' must be set"
  end
end
|
566
3021
|
|
567
|
-
#
|
568
|
-
#
|
3022
|
+
# Convert columnar data to rows as Ruby arrays.
|
3023
|
+
#
|
3024
|
+
# @return [Array]
|
3025
|
+
#
|
3026
|
+
# @example
|
3027
|
+
# df = Polars::DataFrame.new(
|
3028
|
+
# {
|
3029
|
+
# "a" => [1, 3, 5],
|
3030
|
+
# "b" => [2, 4, 6]
|
3031
|
+
# }
|
3032
|
+
# )
|
3033
|
+
# df.rows
|
3034
|
+
# # => [[1, 2], [3, 4], [5, 6]]
|
3035
|
+
def rows
  # Row conversion is done entirely by the native frame.
  tuples = _df.row_tuples
  tuples
end
|
569
3038
|
|
570
|
-
#
|
571
|
-
#
|
3039
|
+
# Shrink DataFrame memory usage.
|
3040
|
+
#
|
3041
|
+
# Shrinks to fit the exact capacity needed to hold the data.
|
3042
|
+
#
|
3043
|
+
# @return [DataFrame]
|
3044
|
+
def shrink_to_fit(in_place: false)
  # Shrink this frame directly when in place; otherwise shrink a clone.
  target = in_place ? self : clone
  target._df.shrink_to_fit
  target
end
|
572
3054
|
|
573
|
-
#
|
574
|
-
#
|
3055
|
+
# Take every nth row in the DataFrame and return as a new DataFrame.
|
3056
|
+
#
|
3057
|
+
# @return [DataFrame]
|
3058
|
+
#
|
3059
|
+
# @example
|
3060
|
+
# s = Polars::DataFrame.new({"a" => [1, 2, 3, 4], "b" => [5, 6, 7, 8]})
|
3061
|
+
# s.take_every(2)
|
3062
|
+
# # =>
|
3063
|
+
# # shape: (2, 2)
|
3064
|
+
# # ┌─────┬─────┐
|
3065
|
+
# # │ a ┆ b │
|
3066
|
+
# # │ --- ┆ --- │
|
3067
|
+
# # │ i64 ┆ i64 │
|
3068
|
+
# # ╞═════╪═════╡
|
3069
|
+
# # │ 1 ┆ 5 │
|
3070
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
3071
|
+
# # │ 3 ┆ 7 │
|
3072
|
+
# # └─────┴─────┘
|
3073
|
+
def take_every(n)
  # Apply the expression-level take_every to every column ("*").
  every_nth = Utils.col("*").take_every(n)
  select(every_nth)
end
|
575
3076
|
|
576
3077
|
# def hash_rows
|
577
3078
|
# end
|
578
3079
|
|
579
|
-
#
|
580
|
-
#
|
3080
|
+
# Interpolate intermediate values. The interpolation method is linear.
|
3081
|
+
#
|
3082
|
+
# @return [DataFrame]
|
3083
|
+
#
|
3084
|
+
# @example
|
3085
|
+
# df = Polars::DataFrame.new(
|
3086
|
+
# {
|
3087
|
+
# "foo" => [1, nil, 9, 10],
|
3088
|
+
# "bar" => [6, 7, 9, nil],
|
3089
|
+
# "baz" => [1, nil, nil, 9]
|
3090
|
+
# }
|
3091
|
+
# )
|
3092
|
+
# df.interpolate
|
3093
|
+
# # =>
|
3094
|
+
# # shape: (4, 3)
|
3095
|
+
# # ┌─────┬──────┬─────┐
|
3096
|
+
# # │ foo ┆ bar ┆ baz │
|
3097
|
+
# # │ --- ┆ --- ┆ --- │
|
3098
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
3099
|
+
# # ╞═════╪══════╪═════╡
|
3100
|
+
# # │ 1 ┆ 6 ┆ 1 │
|
3101
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
3102
|
+
# # │ 5 ┆ 7 ┆ 3 │
|
3103
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
3104
|
+
# # │ 9 ┆ 9 ┆ 6 │
|
3105
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┤
|
3106
|
+
# # │ 10 ┆ null ┆ 9 │
|
3107
|
+
# # └─────┴──────┴─────┘
|
3108
|
+
def interpolate
  # Apply the expression-level interpolate to every column ("*").
  interpolated = Utils.col("*").interpolate
  select(interpolated)
end
|
581
3111
|
|
3112
|
+
# Check if the dataframe is empty.
|
3113
|
+
#
|
3114
|
+
# @return [Boolean]
|
3115
|
+
#
|
3116
|
+
# @example
|
3117
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
3118
|
+
# df.is_empty
|
3119
|
+
# # => false
|
3120
|
+
# df.filter(Polars.col("foo") > 99).is_empty
|
3121
|
+
# # => true
|
582
3122
|
def is_empty
  # Emptiness is decided by the row count (height) alone.
  height.zero?
end
alias_method :empty?, :is_empty
|
586
3126
|
|
587
|
-
#
|
588
|
-
#
|
3127
|
+
# Convert a `DataFrame` to a `Series` of type `Struct`.
|
3128
|
+
#
|
3129
|
+
# @param name [String]
|
3130
|
+
# Name for the struct Series
|
3131
|
+
#
|
3132
|
+
# @return [Series]
|
3133
|
+
#
|
3134
|
+
# @example
|
3135
|
+
# df = Polars::DataFrame.new(
|
3136
|
+
# {
|
3137
|
+
# "a" => [1, 2, 3, 4, 5],
|
3138
|
+
# "b" => ["one", "two", "three", "four", "five"]
|
3139
|
+
# }
|
3140
|
+
# )
|
3141
|
+
# df.to_struct("nums")
|
3142
|
+
# # =>
|
3143
|
+
# # shape: (5,)
|
3144
|
+
# # Series: 'nums' [struct[2]]
|
3145
|
+
# # [
|
3146
|
+
# # {1,"one"}
|
3147
|
+
# # {2,"two"}
|
3148
|
+
# # {3,"three"}
|
3149
|
+
# # {4,"four"}
|
3150
|
+
# # {5,"five"}
|
3151
|
+
# # ]
|
3152
|
+
def to_struct(name)
  # The native frame builds the struct series; wrap it in the Ruby-level type.
  native_series = _df.to_struct(name)
  Utils.wrap_s(native_series)
end
|
589
3155
|
|
590
|
-
#
|
591
|
-
#
|
3156
|
+
# Decompose a struct into its fields.
|
3157
|
+
#
|
3158
|
+
# The fields will be inserted into the `DataFrame` on the location of the
|
3159
|
+
# `struct` type.
|
3160
|
+
#
|
3161
|
+
# @param names [Object]
|
3162
|
+
# Names of the struct columns that will be decomposed by its fields
|
3163
|
+
#
|
3164
|
+
# @return [DataFrame]
|
3165
|
+
#
|
3166
|
+
# @example
|
3167
|
+
# df = Polars::DataFrame.new(
|
3168
|
+
# {
|
3169
|
+
# "before" => ["foo", "bar"],
|
3170
|
+
# "t_a" => [1, 2],
|
3171
|
+
# "t_b" => ["a", "b"],
|
3172
|
+
# "t_c" => [true, nil],
|
3173
|
+
# "t_d" => [[1, 2], [3]],
|
3174
|
+
# "after" => ["baz", "womp"]
|
3175
|
+
# }
|
3176
|
+
# ).select(["before", Polars.struct(Polars.col("^t_.$")).alias("t_struct"), "after"])
|
3177
|
+
# df.unnest("t_struct")
|
3178
|
+
# # =>
|
3179
|
+
# # shape: (2, 6)
|
3180
|
+
# # ┌────────┬─────┬─────┬──────┬───────────┬───────┐
|
3181
|
+
# # │ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
|
3182
|
+
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
3183
|
+
# # │ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
|
3184
|
+
# # ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
|
3185
|
+
# # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
|
3186
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
|
3187
|
+
# # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
|
3188
|
+
# # └────────┴─────┴─────┴──────┴───────────┴───────┘
|
3189
|
+
def unnest(names)
  # A single column name is passed on as a one-element list.
  names = [names] if names.is_a?(String)
  _from_rbdf(_df.unnest(names))
end
|
592
3195
|
|
593
3196
|
private
|
594
3197
|
|
@@ -597,15 +3200,55 @@ module Polars
|
|
597
3200
|
self._df = _df._clone
|
598
3201
|
end
|
599
3202
|
|
600
|
-
def hash_to_rbdf(data)
|
3203
|
+
def hash_to_rbdf(data, columns: nil)
  # Without a columns spec the native extension reads the hash directly.
  return RbDataFrame.read_hash(data) if columns.nil?

  names, dtypes = _unpack_columns(columns, lookup_names: data.keys)

  series =
    if data.empty? && dtypes
      # No data: build one empty, typed series per requested column.
      names.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
    else
      data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
    end

  RbDataFrame.new(_handle_columns_arg(series, columns: names))
end
|
603
3218
|
|
604
|
-
def
|
3219
|
+
# Split a `columns` specification into column names and a dtype lookup.
#
# @param columns [Hash, Array]
#   Either a name => dtype Hash, or an Array whose entries are plain names
#   or [name, dtype] pairs.
# @param lookup_names [Array, nil]
#   Names of the underlying data, positionally aligned with `columns`.
#   For the Array form, dtypes are keyed by these names when available.
#
# @return [Array]
#   Two elements: the list of column names and a name => dtype Hash.
def _unpack_columns(columns, lookup_names: nil)
  # Hash form: keys are the names and the Hash itself is the dtype lookup.
  return [columns.keys, columns] if columns.is_a?(Hash)

  # Array form: normalize every entry to a [name, dtype] pair.
  pairs = columns.map { |col| col.is_a?(Array) ? col : [col, nil] }
  names = pairs.map(&:first)

  # Map each requested name to the data name in the same position (if any).
  lookup = names.zip(lookup_names || []).to_h
  dtypes = pairs.each_with_object({}) do |(name, dtype), acc|
    acc[lookup[name] || name] = dtype unless dtype.nil?
  end

  [names, dtypes]
end
|
3222
|
+
|
3223
|
+
# Reconcile a list of native series with an optional list of column names.
def _handle_columns_arg(data, columns: nil)
  return data if columns.nil?

  # No series at all: create one series per requested name.
  return columns.map { |c| Series.new(c, nil)._s } if data.empty?

  unless data.length == columns.length
    raise ArgumentError, "Dimensions of columns arg must match data dimensions."
  end

  # NOTE(review): the return value of rename is discarded, so this relies on
  # rename mutating the series in place (the original carried a
  # "not in-place?" remark) - confirm against the native extension.
  columns.each_with_index { |c, i| data[i].rename(c) }
  data
end
|
3240
|
+
|
3241
|
+
# Build a native frame from a sequence of Series objects.
def sequence_to_rbdf(data, columns: nil, orient: nil)
  # Neither option is implemented yet for sequence input.
  raise Todo if columns || orient

  native_series = data.map { |series| series._s }
  RbDataFrame.new(native_series)
end
|
607
3247
|
|
608
|
-
def series_to_rbdf(data)
|
3248
|
+
# Build a single-column native frame from one Series.
def series_to_rbdf(data, columns: nil)
  # Column overrides are not implemented yet for Series input.
  raise Todo if columns

  native_series = data._s
  RbDataFrame.new([native_series])
end
|
611
3254
|
|
@@ -616,5 +3259,75 @@ module Polars
|
|
616
3259
|
# Instance-level shortcut that forwards to the class-level _from_rbdf.
def _from_rbdf(rb_df)
  klass = self.class
  klass._from_rbdf(rb_df)
end
|
3262
|
+
|
3263
|
+
# Route a comparison to the frame-vs-frame or frame-vs-other implementation.
def _comp(other, op)
  return _compare_to_other_df(other, op) if other.is_a?(DataFrame)

  _compare_to_non_df(other, op)
end
|
3270
|
+
|
3271
|
+
# Compare this frame column-by-column with another DataFrame.
#
# The other frame's columns are given a suffix, concatenated horizontally
# onto this frame, and each original column is then compared with its
# suffixed counterpart.
#
# @param other [DataFrame]
#   Frame whose `columns` and `shape` must equal this frame's.
# @param op [String]
#   One of "eq", "neq", "gt", "lt", "gt_eq", "lt_eq".
#
# @return [Object]
#   Whatever `select` returns for the list of per-column comparisons.
#
# @raise [ArgumentError]
#   If the columns or shapes differ, or `op` is not a known operator.
def _compare_to_other_df(other, op)
  raise ArgumentError, "DataFrame columns do not match" if columns != other.columns
  raise ArgumentError, "DataFrame dimensions do not match" if shape != other.shape

  # Resolve the operator up front so an unknown op fails before any work.
  operator = {
    "eq" => :==,
    "neq" => :!=,
    "gt" => :>,
    "lt" => :<,
    "gt_eq" => :>=,
    "lt_eq" => :<=
  }.fetch(op) { raise ArgumentError, "got unexpected comparison operator: #{op}" }

  suffix = "__POLARS_CMP_OTHER"
  other_renamed = other.select(Polars.all.suffix(suffix))
  combined = Polars.concat([self, other_renamed], how: "horizontal")

  comparisons = columns.map do |n|
    Polars.col(n).public_send(operator, Polars.col("#{n}#{suffix}"))
  end

  combined.select(comparisons)
end
|
3302
|
+
|
3303
|
+
# Compare every column of this frame against a non-DataFrame value.
def _compare_to_non_df(other, op)
  # Operator names map onto the expression comparison methods.
  operator = {
    "eq" => :==,
    "neq" => :!=,
    "gt" => :>,
    "lt" => :<,
    "gt_eq" => :>=,
    "lt_eq" => :<=
  }.fetch(op) { raise ArgumentError, "got unexpected comparison operator: #{op}" }

  select(Polars.all.public_send(operator, other))
end
|
3321
|
+
|
3322
|
+
# Coerce the right-hand operand into a Series.
def _prepare_other_arg(other)
  # A Series is passed through untouched.
  return other if other.is_a?(Series)

  raise ArgumentError, "Operation not supported." if other.is_a?(Array)

  # Any other value becomes a one-element, unnamed Series.
  Series.new("", [other])
end
|
619
3332
|
end
|
620
3333
|
end
|