polars-df 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +1946 -0
- data/Cargo.toml +5 -0
- data/ext/polars/Cargo.toml +31 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +336 -42
- data/ext/polars/src/dataframe.rs +409 -4
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +436 -10
- data/ext/polars/src/lazy/dsl.rs +1134 -5
- data/ext/polars/src/lazy/meta.rs +41 -0
- data/ext/polars/src/lazy/mod.rs +2 -0
- data/ext/polars/src/lib.rs +390 -3
- data/ext/polars/src/series.rs +175 -13
- data/lib/polars/batched_csv_reader.rb +95 -0
- data/lib/polars/cat_expr.rb +13 -0
- data/lib/polars/data_frame.rb +892 -21
- data/lib/polars/date_time_expr.rb +143 -0
- data/lib/polars/expr.rb +503 -0
- data/lib/polars/io.rb +342 -2
- data/lib/polars/lazy_frame.rb +338 -6
- data/lib/polars/lazy_functions.rb +158 -11
- data/lib/polars/list_expr.rb +108 -0
- data/lib/polars/meta_expr.rb +33 -0
- data/lib/polars/series.rb +1304 -14
- data/lib/polars/string_expr.rb +117 -0
- data/lib/polars/struct_expr.rb +27 -0
- data/lib/polars/utils.rb +60 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -1
- metadata +13 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -1,8 +1,22 @@
|
|
1
1
|
module Polars
|
2
|
+
# Two-dimensional data structure representing data as a table with rows and columns.
|
2
3
|
class DataFrame
|
4
|
+
# @private
|
3
5
|
attr_accessor :_df
|
4
6
|
|
5
|
-
|
7
|
+
# Create a new DataFrame.
|
8
|
+
#
|
9
|
+
# @param data [Hash, Array, Series, nil]
|
10
|
+
# Two-dimensional data in various forms. Hash must contain Arrays.
|
11
|
+
# Array may contain Series.
|
12
|
+
# @param columns [Array, Hash, nil]
|
13
|
+
# Column labels to use for resulting DataFrame. If specified, overrides any
|
14
|
+
# labels already present in the data. Must match data dimensions.
|
15
|
+
# @param orient ["col", "row", nil]
|
16
|
+
# Whether to interpret two-dimensional data as columns or as rows. If `nil`,
|
17
|
+
# the orientation is inferred by matching the columns and data dimensions. If
|
18
|
+
# this does not yield conclusive results, column orientation is used.
|
19
|
+
def initialize(data = nil, columns: nil, orient: nil)
|
6
20
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
7
21
|
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
|
8
22
|
data = {}
|
@@ -12,33 +26,135 @@ module Polars
|
|
12
26
|
end
|
13
27
|
|
14
28
|
if data.nil?
|
15
|
-
self._df = hash_to_rbdf({})
|
29
|
+
self._df = hash_to_rbdf({}, columns: columns)
|
16
30
|
elsif data.is_a?(Hash)
|
17
31
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
18
|
-
self._df = hash_to_rbdf(data)
|
32
|
+
self._df = hash_to_rbdf(data, columns: columns)
|
19
33
|
elsif data.is_a?(Array)
|
20
|
-
self._df = sequence_to_rbdf(data)
|
34
|
+
self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
|
21
35
|
elsif data.is_a?(Series)
|
22
|
-
self._df = series_to_rbdf(data)
|
36
|
+
self._df = series_to_rbdf(data, columns: columns)
|
23
37
|
else
|
24
38
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
25
39
|
end
|
26
40
|
end
|
27
41
|
|
42
|
+
# @private
|
28
43
|
def self._from_rbdf(rb_df)
|
29
44
|
df = DataFrame.allocate
|
30
45
|
df._df = rb_df
|
31
46
|
df
|
32
47
|
end
|
33
48
|
|
34
|
-
def self.
|
49
|
+
# def self._from_hashes
|
50
|
+
# end
|
51
|
+
|
52
|
+
# def self._from_hash
|
53
|
+
# end
|
54
|
+
|
55
|
+
# def self._from_records
|
56
|
+
# end
|
57
|
+
|
58
|
+
# def self._from_numo
|
59
|
+
# end
|
60
|
+
|
61
|
+
# no self._from_arrow
|
62
|
+
|
63
|
+
# no self._from_pandas
|
64
|
+
|
65
|
+
# @private
|
66
|
+
# Read a CSV source into a DataFrame by delegating to `RbDataFrame.read_csv`.
#
# The keyword options are normalized and then forwarded positionally to the
# native reader, in the exact order that reader is called with below.
#
# @param file [Object]
#   CSV source. A String or Pathname is additionally normalized through
#   `Utils.format_path` and forwarded as `path`; any other object is
#   forwarded as `file` only (with `path` set to `nil`).
# @param columns [Object, nil]
#   A single String is wrapped into a one-element Array before being split
#   into `projection`/`columns` by `Utils.handle_projection_columns`.
# @param dtypes [Hash, Array, nil]
#   A Hash is turned into `[name, dtype]` pairs (each value goes through
#   `Utils.rb_type_to_dtype`); an Array is forwarded unchanged.
#
# @return [DataFrame]
#
# @raise [ArgumentError] if `dtypes` is neither `nil`, a Hash nor an Array.
# @raise [Todo] if `file` is a String containing `*` (glob patterns).
def self._read_csv(
  file,
  has_header: true,
  columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
  infer_schema_length: 100,
  batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n"
)
  # Only path-like inputs produce a `path`; everything else leaves it nil.
  path =
    if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
      Utils.format_path(file)
    end

  # Exactly one of dtype_list / dtype_slice is set, depending on how the
  # caller expressed the dtype overrides.
  dtype_list = nil
  dtype_slice = nil
  case dtypes
  when nil
    # no dtype overrides requested
  when Hash
    dtype_list = dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
  when Array
    dtype_slice = dtypes
  else
    raise ArgumentError, "dtypes arg should be an Array or Hash"
  end

  processed_null_values = Utils._process_null_values(null_values)

  columns = [columns] if columns.is_a?(String)

  # Glob patterns are not implemented yet.
  raise Todo if file.is_a?(String) && file.include?("*")

  projection, columns = Utils.handle_projection_columns(columns)

  _from_rbdf(
    RbDataFrame.read_csv(
      file,
      infer_schema_length,
      batch_size,
      has_header,
      ignore_errors,
      n_rows,
      skip_rows,
      projection,
      sep,
      rechunk,
      columns,
      encoding,
      n_threads,
      path,
      dtype_list,
      dtype_slice,
      low_memory,
      comment_char,
      quote_char,
      processed_null_values,
      parse_dates,
      skip_rows_after_header,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      sample_size,
      eol_char
    )
  )
end
|
41
156
|
|
157
|
+
# @private
|
42
158
|
def self._read_parquet(file)
|
43
159
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
44
160
|
file = Utils.format_path(file)
|
@@ -47,6 +163,44 @@ module Polars
|
|
47
163
|
_from_rbdf(RbDataFrame.read_parquet(file))
|
48
164
|
end
|
49
165
|
|
166
|
+
# def self._read_avro
|
167
|
+
# end
|
168
|
+
|
169
|
+
# @private
|
170
|
+
def self._read_ipc(
|
171
|
+
file,
|
172
|
+
columns: nil,
|
173
|
+
n_rows: nil,
|
174
|
+
row_count_name: nil,
|
175
|
+
row_count_offset: 0,
|
176
|
+
rechunk: true,
|
177
|
+
memory_map: true
|
178
|
+
)
|
179
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
180
|
+
file = Utils.format_path(file)
|
181
|
+
end
|
182
|
+
if columns.is_a?(String)
|
183
|
+
columns = [columns]
|
184
|
+
end
|
185
|
+
|
186
|
+
if file.is_a?(String) && file.include?("*")
|
187
|
+
raise Todo
|
188
|
+
end
|
189
|
+
|
190
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
191
|
+
_from_rbdf(
|
192
|
+
RbDataFrame.read_ipc(
|
193
|
+
file,
|
194
|
+
columns,
|
195
|
+
projection,
|
196
|
+
n_rows,
|
197
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
198
|
+
memory_map
|
199
|
+
)
|
200
|
+
)
|
201
|
+
end
|
202
|
+
|
203
|
+
# @private
|
50
204
|
def self._read_json(file)
|
51
205
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
52
206
|
file = Utils.format_path(file)
|
@@ -55,6 +209,7 @@ module Polars
|
|
55
209
|
_from_rbdf(RbDataFrame.read_json(file))
|
56
210
|
end
|
57
211
|
|
212
|
+
# @private
|
58
213
|
def self._read_ndjson(file)
|
59
214
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
60
215
|
file = Utils.format_path(file)
|
@@ -63,26 +218,157 @@ module Polars
|
|
63
218
|
_from_rbdf(RbDataFrame.read_ndjson(file))
|
64
219
|
end
|
65
220
|
|
221
|
+
# Get the shape of the DataFrame.
|
222
|
+
#
|
223
|
+
# @return [Array]
|
224
|
+
#
|
225
|
+
# @example
|
226
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
227
|
+
# df.shape
|
228
|
+
# # => [5, 1]
|
66
229
|
def shape
|
67
230
|
_df.shape
|
68
231
|
end
|
69
232
|
|
233
|
+
# Get the height of the DataFrame.
|
234
|
+
#
|
235
|
+
# @return [Integer]
|
236
|
+
#
|
237
|
+
# @example
|
238
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
239
|
+
# df.height
|
240
|
+
# # => 5
|
70
241
|
def height
|
71
242
|
_df.height
|
72
243
|
end
|
73
244
|
|
245
|
+
# Get the width of the DataFrame.
|
246
|
+
#
|
247
|
+
# @return [Integer]
|
248
|
+
#
|
249
|
+
# @example
|
250
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
251
|
+
# df.width
|
252
|
+
# # => 1
|
74
253
|
def width
|
75
254
|
_df.width
|
76
255
|
end
|
77
256
|
|
257
|
+
# Get column names.
|
258
|
+
#
|
259
|
+
# @return [Array]
|
260
|
+
#
|
261
|
+
# @example
|
262
|
+
# df = Polars::DataFrame.new({
|
263
|
+
# "foo" => [1, 2, 3],
|
264
|
+
# "bar" => [6, 7, 8],
|
265
|
+
# "ham" => ["a", "b", "c"]
|
266
|
+
# })
|
267
|
+
# df.columns
|
268
|
+
# # => ["foo", "bar", "ham"]
|
78
269
|
def columns
|
79
270
|
_df.columns
|
80
271
|
end
|
81
272
|
|
273
|
+
# Change the column names of the DataFrame.
|
274
|
+
#
|
275
|
+
# @param columns [Array]
|
276
|
+
# A list with new names for the DataFrame.
|
277
|
+
# The length of the list should be equal to the width of the DataFrame.
|
278
|
+
#
|
279
|
+
# @return [Object]
|
280
|
+
#
|
281
|
+
# @example
|
282
|
+
# df = Polars::DataFrame.new({
|
283
|
+
# "foo" => [1, 2, 3],
|
284
|
+
# "bar" => [6, 7, 8],
|
285
|
+
# "ham" => ["a", "b", "c"]
|
286
|
+
# })
|
287
|
+
# df.columns = ["apple", "banana", "orange"]
|
288
|
+
# df
|
289
|
+
# # =>
|
290
|
+
# # shape: (3, 3)
|
291
|
+
# # ┌───────┬────────┬────────┐
|
292
|
+
# # │ apple ┆ banana ┆ orange │
|
293
|
+
# # │ --- ┆ --- ┆ --- │
|
294
|
+
# # │ i64 ┆ i64 ┆ str │
|
295
|
+
# # ╞═══════╪════════╪════════╡
|
296
|
+
# # │ 1 ┆ 6 ┆ a │
|
297
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
298
|
+
# # │ 2 ┆ 7 ┆ b │
|
299
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
300
|
+
# # │ 3 ┆ 8 ┆ c │
|
301
|
+
# # └───────┴────────┴────────┘
|
302
|
+
def columns=(columns)
|
303
|
+
_df.set_column_names(columns)
|
304
|
+
end
|
305
|
+
|
306
|
+
# Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
|
307
|
+
#
|
308
|
+
# @return [Array]
|
309
|
+
#
|
310
|
+
# @example
|
311
|
+
# df = Polars::DataFrame.new({
|
312
|
+
# "foo" => [1, 2, 3],
|
313
|
+
# "bar" => [6.0, 7.0, 8.0],
|
314
|
+
# "ham" => ["a", "b", "c"]
|
315
|
+
# })
|
316
|
+
# df.dtypes
|
317
|
+
# # => [:i64, :f64, :str]
|
82
318
|
def dtypes
|
83
|
-
_df.dtypes
|
319
|
+
_df.dtypes
|
320
|
+
end
|
321
|
+
|
322
|
+
# Get the schema.
|
323
|
+
#
|
324
|
+
# @return [Hash]
|
325
|
+
#
|
326
|
+
# @example
|
327
|
+
# df = Polars::DataFrame.new({
|
328
|
+
# "foo" => [1, 2, 3],
|
329
|
+
# "bar" => [6.0, 7.0, 8.0],
|
330
|
+
# "ham" => ["a", "b", "c"]
|
331
|
+
# })
|
332
|
+
# df.schema
|
333
|
+
# # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
|
334
|
+
def schema
|
335
|
+
columns.zip(dtypes).to_h
|
84
336
|
end
|
85
337
|
|
338
|
+
# def ==(other)
|
339
|
+
# end
|
340
|
+
|
341
|
+
# def !=(other)
|
342
|
+
# end
|
343
|
+
|
344
|
+
# def >(other)
|
345
|
+
# end
|
346
|
+
|
347
|
+
# def <(other)
|
348
|
+
# end
|
349
|
+
|
350
|
+
# def >=(other)
|
351
|
+
# end
|
352
|
+
|
353
|
+
# def <=(other)
|
354
|
+
# end
|
355
|
+
|
356
|
+
# def *(other)
|
357
|
+
# end
|
358
|
+
|
359
|
+
# def /(other)
|
360
|
+
# end
|
361
|
+
|
362
|
+
# def +(other)
|
363
|
+
# end
|
364
|
+
|
365
|
+
# def -(other)
|
366
|
+
# end
|
367
|
+
|
368
|
+
# def %(other)
|
369
|
+
# end
|
370
|
+
|
371
|
+
#
|
86
372
|
def to_s
|
87
373
|
_df.to_s
|
88
374
|
end
|
@@ -92,10 +378,64 @@ module Polars
|
|
92
378
|
columns.include?(name)
|
93
379
|
end
|
94
380
|
|
381
|
+
# def each
|
382
|
+
# end
|
383
|
+
|
384
|
+
# def _pos_idx
|
385
|
+
# end
|
386
|
+
|
387
|
+
# def _pos_idxs
|
388
|
+
# end
|
389
|
+
|
390
|
+
#
|
95
391
|
def [](name)
|
96
392
|
Utils.wrap_s(_df.column(name))
|
97
393
|
end
|
98
394
|
|
395
|
+
# def []=(key, value)
|
396
|
+
# end
|
397
|
+
|
398
|
+
# no to_arrow
|
399
|
+
|
400
|
+
#
|
401
|
+
def to_h(as_series: true)
|
402
|
+
if as_series
|
403
|
+
get_columns.to_h { |s| [s.name, s] }
|
404
|
+
else
|
405
|
+
get_columns.to_h { |s| [s.name, s.to_a] }
|
406
|
+
end
|
407
|
+
end
|
408
|
+
|
409
|
+
# def to_hashes / to_a
|
410
|
+
# end
|
411
|
+
|
412
|
+
# def to_numo
|
413
|
+
# end
|
414
|
+
|
415
|
+
# no to_pandas
|
416
|
+
|
417
|
+
# Select column as Series at index location.
|
418
|
+
#
|
419
|
+
# @param index [Integer]
|
420
|
+
# Location of selection.
|
421
|
+
#
|
422
|
+
# @return [Series]
|
423
|
+
#
|
424
|
+
# @example
|
425
|
+
# df = Polars::DataFrame.new({
|
426
|
+
# "foo" => [1, 2, 3],
|
427
|
+
# "bar" => [6, 7, 8],
|
428
|
+
# "ham" => ["a", "b", "c"]
|
429
|
+
# })
|
430
|
+
# df.to_series(1)
|
431
|
+
# # =>
|
432
|
+
# # shape: (3,)
|
433
|
+
# # Series: 'bar' [i64]
|
434
|
+
# # [
|
435
|
+
# # 6
|
436
|
+
# # 7
|
437
|
+
# # 8
|
438
|
+
# # ]
|
99
439
|
def to_series(index = 0)
|
100
440
|
if index < 0
|
101
441
|
index = columns.length + index
|
@@ -103,6 +443,18 @@ module Polars
|
|
103
443
|
Utils.wrap_s(_df.select_at_idx(index))
|
104
444
|
end
|
105
445
|
|
446
|
+
# Serialize to JSON representation.
|
447
|
+
#
|
448
|
+
# @return [nil]
|
449
|
+
#
|
450
|
+
# @param file [String]
|
451
|
+
# File path to which the result should be written.
|
452
|
+
# @param pretty [Boolean]
|
453
|
+
# Pretty serialize json.
|
454
|
+
# @param row_oriented [Boolean]
|
455
|
+
# Write to row oriented json. This is slower, but more common.
|
456
|
+
#
|
457
|
+
# @see #write_ndjson
|
106
458
|
def write_json(
|
107
459
|
file,
|
108
460
|
pretty: false,
|
@@ -116,6 +468,12 @@ module Polars
|
|
116
468
|
nil
|
117
469
|
end
|
118
470
|
|
471
|
+
# Serialize to newline delimited JSON representation.
|
472
|
+
#
|
473
|
+
# @param file [String]
|
474
|
+
# File path to which the result should be written.
|
475
|
+
#
|
476
|
+
# @return [nil]
|
119
477
|
def write_ndjson(file)
|
120
478
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
121
479
|
file = Utils.format_path(file)
|
@@ -125,6 +483,48 @@ module Polars
|
|
125
483
|
nil
|
126
484
|
end
|
127
485
|
|
486
|
+
# Write to comma-separated values (CSV) file.
|
487
|
+
#
|
488
|
+
# @param file [String, nil]
|
489
|
+
# File path to which the result should be written. If set to `nil`
|
490
|
+
# (default), the output is returned as a string instead.
|
491
|
+
# @param has_header [Boolean]
|
492
|
+
# Whether to include header in the CSV output.
|
493
|
+
# @param sep [String]
|
494
|
+
# Separate CSV fields with this symbol.
|
495
|
+
# @param quote [String]
|
496
|
+
# Byte to use as quoting character.
|
497
|
+
# @param batch_size [Integer]
|
498
|
+
# Number of rows that will be processed per thread.
|
499
|
+
# @param datetime_format [String, nil]
|
500
|
+
# A format string, with the specifiers defined by the
|
501
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
502
|
+
# Rust crate. If no format specified, the default fractional-second
|
503
|
+
# precision is inferred from the maximum timeunit found in the frame's
|
504
|
+
# Datetime cols (if any).
|
505
|
+
# @param date_format [String, nil]
|
506
|
+
# A format string, with the specifiers defined by the
|
507
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
508
|
+
# Rust crate.
|
509
|
+
# @param time_format [String, nil]
|
510
|
+
# A format string, with the specifiers defined by the
|
511
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
512
|
+
# Rust crate.
|
513
|
+
# @param float_precision [Integer, nil]
|
514
|
+
# Number of decimal places to write, applied to both `:f32` and
|
515
|
+
# `:f64` datatypes.
|
516
|
+
# @param null_value [String, nil]
|
517
|
+
# A string representing null values (defaulting to the empty string).
|
518
|
+
#
|
519
|
+
# @return [String, nil]
|
520
|
+
#
|
521
|
+
# @example
|
522
|
+
# df = Polars::DataFrame.new({
|
523
|
+
# "foo" => [1, 2, 3, 4, 5],
|
524
|
+
# "bar" => [6, 7, 8, 9, 10],
|
525
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
526
|
+
# })
|
527
|
+
# df.write_csv("file.csv")
|
128
528
|
def write_csv(
|
129
529
|
file = nil,
|
130
530
|
has_header: true,
|
@@ -160,8 +560,7 @@ module Polars
|
|
160
560
|
float_precision,
|
161
561
|
null_value
|
162
562
|
)
|
163
|
-
buffer.
|
164
|
-
return buffer.read.force_encoding(Encoding::UTF_8)
|
563
|
+
return buffer.string.force_encoding(Encoding::UTF_8)
|
165
564
|
end
|
166
565
|
|
167
566
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
@@ -183,6 +582,53 @@ module Polars
|
|
183
582
|
nil
|
184
583
|
end
|
185
584
|
|
585
|
+
# def write_avro
|
586
|
+
# end
|
587
|
+
|
588
|
+
# Write to Arrow IPC binary stream or Feather file.
|
589
|
+
#
|
590
|
+
# @param file [String]
|
591
|
+
# File path to which the file should be written.
|
592
|
+
# @param compression ["uncompressed", "lz4", "zstd"]
|
593
|
+
# Compression method. Defaults to "uncompressed".
|
594
|
+
#
|
595
|
+
# @return [nil]
|
596
|
+
def write_ipc(file, compression: "uncompressed")
|
597
|
+
if compression.nil?
|
598
|
+
compression = "uncompressed"
|
599
|
+
end
|
600
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
601
|
+
file = Utils.format_path(file)
|
602
|
+
end
|
603
|
+
|
604
|
+
_df.write_ipc(file, compression)
|
605
|
+
end
|
606
|
+
|
607
|
+
# Write to Apache Parquet file.
|
608
|
+
#
|
609
|
+
# @param file [String]
|
610
|
+
# File path to which the file should be written.
|
611
|
+
# @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
|
612
|
+
# Choose "zstd" for good compression performance.
|
613
|
+
# Choose "lz4" for fast compression/decompression.
|
614
|
+
# Choose "snappy" for more backwards compatibility guarantees
|
615
|
+
# when you deal with older parquet readers.
|
616
|
+
# @param compression_level [Integer, nil]
|
617
|
+
# The level of compression to use. Higher compression means smaller files on
|
618
|
+
# disk.
|
619
|
+
#
|
620
|
+
# - "gzip" : min-level: 0, max-level: 10.
|
621
|
+
# - "brotli" : min-level: 0, max-level: 11.
|
622
|
+
# - "zstd" : min-level: 1, max-level: 22.
|
623
|
+
# @param statistics [Boolean]
|
624
|
+
# Write statistics to the parquet headers. This requires extra compute.
|
625
|
+
# @param row_group_size [Integer, nil]
|
626
|
+
# Size of the row groups in number of rows.
|
627
|
+
# If `nil` (default), the chunks of the DataFrame are
|
628
|
+
# used. Writing in smaller chunks may reduce memory pressure and improve
|
629
|
+
# writing speeds.
|
630
|
+
#
|
631
|
+
# @return [nil]
|
186
632
|
def write_parquet(
|
187
633
|
file,
|
188
634
|
compression: "zstd",
|
@@ -202,10 +648,177 @@ module Polars
|
|
202
648
|
)
|
203
649
|
end
|
204
650
|
|
651
|
+
# Return an estimation of the total (heap) allocated size of the DataFrame.
|
652
|
+
#
|
653
|
+
# Estimated size is given in the specified unit (bytes by default).
|
654
|
+
#
|
655
|
+
# This estimation is the sum of the size of its buffers, validity, including
|
656
|
+
# nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
|
657
|
+
# size of 2 arrays is not the sum of the sizes computed from this function. In
|
658
|
+
# particular, StructArray's size is an upper bound.
|
659
|
+
#
|
660
|
+
# When an array is sliced, its allocated size remains constant because the buffer
|
661
|
+
# is unchanged. However, this function will yield a smaller number. This is because
|
662
|
+
# this function returns the visible size of the buffer, not its total capacity.
|
663
|
+
#
|
664
|
+
# FFI buffers are included in this estimation.
|
665
|
+
#
|
666
|
+
# @param unit ["b", "kb", "mb", "gb", "tb"]
|
667
|
+
# Scale the returned size to the given unit.
|
668
|
+
#
|
669
|
+
# @return [Numeric]
|
670
|
+
#
|
671
|
+
# @example
|
672
|
+
# df = Polars::DataFrame.new(
|
673
|
+
# {
|
674
|
+
# "x" => 1_000_000.times.to_a.reverse,
|
675
|
+
# "y" => 1_000_000.times.map { |v| v / 1000.0 },
|
676
|
+
# "z" => 1_000_000.times.map(&:to_s)
|
677
|
+
# },
|
678
|
+
# columns: {"x" => :u32, "y" => :f64, "z" => :str}
|
679
|
+
# )
|
680
|
+
# df.estimated_size
|
681
|
+
# # => 25888898
|
682
|
+
# df.estimated_size("mb")
|
683
|
+
# # => 24.689577102661133
|
684
|
+
def estimated_size(unit = "b")
|
685
|
+
sz = _df.estimated_size
|
686
|
+
Utils.scale_bytes(sz, to: unit)
|
687
|
+
end
|
688
|
+
|
689
|
+
# def transpose
|
690
|
+
# end
|
691
|
+
|
692
|
+
# Reverse the DataFrame.
|
693
|
+
#
|
694
|
+
# @return [DataFrame]
|
695
|
+
#
|
696
|
+
# @example
|
697
|
+
# df = Polars::DataFrame.new({
|
698
|
+
# "key" => ["a", "b", "c"],
|
699
|
+
# "val" => [1, 2, 3]
|
700
|
+
# })
|
701
|
+
# df.reverse()
|
702
|
+
# # =>
|
703
|
+
# # shape: (3, 2)
|
704
|
+
# # ┌─────┬─────┐
|
705
|
+
# # │ key ┆ val │
|
706
|
+
# # │ --- ┆ --- │
|
707
|
+
# # │ str ┆ i64 │
|
708
|
+
# # ╞═════╪═════╡
|
709
|
+
# # │ c ┆ 3 │
|
710
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
711
|
+
# # │ b ┆ 2 │
|
712
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
713
|
+
# # │ a ┆ 1 │
|
714
|
+
# # └─────┴─────┘
|
715
|
+
def reverse
|
716
|
+
select(Polars.col("*").reverse)
|
717
|
+
end
|
718
|
+
|
719
|
+
# Rename column names.
|
720
|
+
#
|
721
|
+
# @param mapping [Hash]
|
722
|
+
# Key value pairs that map from old name to new name.
|
723
|
+
#
|
724
|
+
# @return [DataFrame]
|
725
|
+
#
|
726
|
+
# @example
|
727
|
+
# df = Polars::DataFrame.new({
|
728
|
+
# "foo" => [1, 2, 3],
|
729
|
+
# "bar" => [6, 7, 8],
|
730
|
+
# "ham" => ["a", "b", "c"]
|
731
|
+
# })
|
732
|
+
# df.rename({"foo" => "apple"})
|
733
|
+
# # =>
|
734
|
+
# # shape: (3, 3)
|
735
|
+
# # ┌───────┬─────┬─────┐
|
736
|
+
# # │ apple ┆ bar ┆ ham │
|
737
|
+
# # │ --- ┆ --- ┆ --- │
|
738
|
+
# # │ i64 ┆ i64 ┆ str │
|
739
|
+
# # ╞═══════╪═════╪═════╡
|
740
|
+
# # │ 1 ┆ 6 ┆ a │
|
741
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
742
|
+
# # │ 2 ┆ 7 ┆ b │
|
743
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
744
|
+
# # │ 3 ┆ 8 ┆ c │
|
745
|
+
# # └───────┴─────┴─────┘
|
746
|
+
def rename(mapping)
|
747
|
+
lazy.rename(mapping).collect(no_optimization: true)
|
748
|
+
end
|
749
|
+
|
750
|
+
# Insert a Series at a certain column index. This operation is in place.
|
751
|
+
#
|
752
|
+
# @param index [Integer]
|
753
|
+
# Column to insert the new `Series` column.
|
754
|
+
# @param series [Series]
|
755
|
+
# `Series` to insert.
|
756
|
+
#
|
757
|
+
# @return [DataFrame]
|
758
|
+
#
|
759
|
+
# @example
|
760
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
761
|
+
# s = Polars::Series.new("baz", [97, 98, 99])
|
762
|
+
# df.insert_at_idx(1, s)
|
763
|
+
# # =>
|
764
|
+
# # shape: (3, 3)
|
765
|
+
# # ┌─────┬─────┬─────┐
|
766
|
+
# # │ foo ┆ baz ┆ bar │
|
767
|
+
# # │ --- ┆ --- ┆ --- │
|
768
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
769
|
+
# # ╞═════╪═════╪═════╡
|
770
|
+
# # │ 1 ┆ 97 ┆ 4 │
|
771
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
772
|
+
# # │ 2 ┆ 98 ┆ 5 │
|
773
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
774
|
+
# # │ 3 ┆ 99 ┆ 6 │
|
775
|
+
# # └─────┴─────┴─────┘
|
776
|
+
#
|
777
|
+
# @example
|
778
|
+
# df = Polars::DataFrame.new({
|
779
|
+
# "a" => [1, 2, 3, 4],
|
780
|
+
# "b" => [0.5, 4, 10, 13],
|
781
|
+
# "c" => [true, true, false, true]
|
782
|
+
# })
|
783
|
+
# s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
|
784
|
+
# df.insert_at_idx(3, s)
|
785
|
+
# # =>
|
786
|
+
# # shape: (4, 4)
|
787
|
+
# # ┌─────┬──────┬───────┬──────┐
|
788
|
+
# # │ a ┆ b ┆ c ┆ d │
|
789
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
790
|
+
# # │ i64 ┆ f64 ┆ bool ┆ f64 │
|
791
|
+
# # ╞═════╪══════╪═══════╪══════╡
|
792
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
|
793
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
794
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
|
795
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
796
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
|
797
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
798
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
|
799
|
+
# # └─────┴──────┴───────┴──────┘
|
800
|
+
def insert_at_idx(index, series)
|
801
|
+
if index < 0
|
802
|
+
index = columns.length + index
|
803
|
+
end
|
804
|
+
_df.insert_at_idx(index, series._s)
|
805
|
+
self
|
806
|
+
end
|
807
|
+
|
205
808
|
def filter(predicate)
|
206
809
|
lazy.filter(predicate).collect
|
207
810
|
end
|
208
811
|
|
812
|
+
# def describe
|
813
|
+
# end
|
814
|
+
|
815
|
+
# def find_idx_by_name
|
816
|
+
# end
|
817
|
+
|
818
|
+
# def replace_at_idx
|
819
|
+
# end
|
820
|
+
|
821
|
+
#
|
209
822
|
def sort(by, reverse: false, nulls_last: false)
|
210
823
|
_from_rbdf(_df.sort(by, reverse, nulls_last))
|
211
824
|
end
|
@@ -214,6 +827,17 @@ module Polars
|
|
214
827
|
_df.frame_equal(other._df, null_equal)
|
215
828
|
end
|
216
829
|
|
830
|
+
# def replace
|
831
|
+
# end
|
832
|
+
|
833
|
+
#
|
834
|
+
def slice(offset, length = nil)
|
835
|
+
if !length.nil? && length < 0
|
836
|
+
length = height - offset + length
|
837
|
+
end
|
838
|
+
_from_rbdf(_df.slice(offset, length))
|
839
|
+
end
|
840
|
+
|
217
841
|
def limit(n = 5)
|
218
842
|
head(n)
|
219
843
|
end
|
@@ -226,10 +850,33 @@ module Polars
|
|
226
850
|
_from_rbdf(_df.tail(n))
|
227
851
|
end
|
228
852
|
|
853
|
+
# def drop_nulls
|
854
|
+
# end
|
855
|
+
|
856
|
+
# def pipe
|
857
|
+
# end
|
858
|
+
|
859
|
+
# def with_row_count
|
860
|
+
# end
|
861
|
+
|
862
|
+
#
|
229
863
|
def groupby(by, maintain_order: false)
|
230
864
|
lazy.groupby(by, maintain_order: maintain_order)
|
231
865
|
end
|
232
866
|
|
867
|
+
# def groupby_rolling
|
868
|
+
# end
|
869
|
+
|
870
|
+
# def groupby_dynamic
|
871
|
+
# end
|
872
|
+
|
873
|
+
# def upsample
|
874
|
+
# end
|
875
|
+
|
876
|
+
# def join_asof
|
877
|
+
# end
|
878
|
+
|
879
|
+
#
|
233
880
|
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
|
234
881
|
lazy
|
235
882
|
.join(
|
@@ -243,12 +890,86 @@ module Polars
|
|
243
890
|
.collect(no_optimization: true)
|
244
891
|
end
|
245
892
|
|
893
|
+
# def apply
|
894
|
+
# end
|
895
|
+
|
896
|
+
#
|
246
897
|
def with_column(column)
|
247
898
|
lazy
|
248
899
|
.with_column(column)
|
249
900
|
.collect(no_optimization: true, string_cache: false)
|
250
901
|
end
|
251
902
|
|
903
|
+
# def hstack
|
904
|
+
# end
|
905
|
+
|
906
|
+
# def vstack
|
907
|
+
# end
|
908
|
+
|
909
|
+
#
|
910
|
+
def extend(other)
|
911
|
+
_df.extend(other._df)
|
912
|
+
self
|
913
|
+
end
|
914
|
+
|
915
|
+
# def drop
|
916
|
+
# end
|
917
|
+
|
918
|
+
# def drop_in_place
|
919
|
+
# end
|
920
|
+
|
921
|
+
# def cleared
|
922
|
+
# end
|
923
|
+
|
924
|
+
# clone handled by initialize_copy
|
925
|
+
|
926
|
+
#
|
927
|
+
def get_columns
|
928
|
+
_df.get_columns.map { |s| Utils.wrap_s(s) }
|
929
|
+
end
|
930
|
+
|
931
|
+
def get_column(name)
|
932
|
+
self[name]
|
933
|
+
end
|
934
|
+
|
935
|
+
# def fill_null
|
936
|
+
# end
|
937
|
+
|
938
|
+
#
|
939
|
+
def fill_nan(fill_value)
|
940
|
+
lazy.fill_nan(fill_value).collect(no_optimization: true)
|
941
|
+
end
|
942
|
+
|
943
|
+
# def explode
|
944
|
+
# end
|
945
|
+
|
946
|
+
# def pivot
|
947
|
+
# end
|
948
|
+
|
949
|
+
# def melt
|
950
|
+
# end
|
951
|
+
|
952
|
+
# def unstack
|
953
|
+
# end
|
954
|
+
|
955
|
+
# def partition_by
|
956
|
+
# end
|
957
|
+
|
958
|
+
# def shift
|
959
|
+
# end
|
960
|
+
|
961
|
+
# def shift_and_fill
|
962
|
+
# end
|
963
|
+
|
964
|
+
#
|
965
|
+
def is_duplicated
|
966
|
+
Utils.wrap_s(_df.is_duplicated)
|
967
|
+
end
|
968
|
+
|
969
|
+
def is_unique
|
970
|
+
Utils.wrap_s(_df.is_unique)
|
971
|
+
end
|
972
|
+
|
252
973
|
def lazy
|
253
974
|
wrap_ldf(_df.lazy)
|
254
975
|
end
|
@@ -262,6 +983,56 @@ module Polars
|
|
262
983
|
)
|
263
984
|
end
|
264
985
|
|
986
|
+
def with_columns(exprs)
|
987
|
+
if !exprs.nil? && !exprs.is_a?(Array)
|
988
|
+
exprs = [exprs]
|
989
|
+
end
|
990
|
+
lazy
|
991
|
+
.with_columns(exprs)
|
992
|
+
.collect(no_optimization: true, string_cache: false)
|
993
|
+
end
|
994
|
+
|
995
|
+
# Get the number of chunks used by this DataFrame.
#
# @param strategy ["first", "all"]
#   "first" returns whatever `_df.n_chunks` reports; "all" returns an Array
#   with the `n_chunks` of every column, in column order.
#
# @return [Object]
#
# @raise [ArgumentError] if `strategy` is neither "first" nor "all".
def n_chunks(strategy: "first")
  case strategy
  when "first"
    _df.n_chunks
  when "all"
    get_columns.map(&:n_chunks)
  else
    # Ruby interpolates with #{...}; Python-style `{strategy}` / `{{...}}`
    # placeholders would be emitted verbatim and hide the offending value.
    raise ArgumentError, "Strategy: '#{strategy}' not understood. Choose one of {'first', 'all'}"
  end
end
|
1004
|
+
|
1005
|
+
def max(axis: 0)
|
1006
|
+
if axis == 0
|
1007
|
+
_from_rbdf(_df.max)
|
1008
|
+
elsif axis == 1
|
1009
|
+
Utils.wrap_s(_df.hmax)
|
1010
|
+
else
|
1011
|
+
raise ArgumentError, "Axis should be 0 or 1."
|
1012
|
+
end
|
1013
|
+
end
|
1014
|
+
|
1015
|
+
def min(axis: 0)
|
1016
|
+
if axis == 0
|
1017
|
+
_from_rbdf(_df.min)
|
1018
|
+
elsif axis == 1
|
1019
|
+
Utils.wrap_s(_df.hmin)
|
1020
|
+
else
|
1021
|
+
raise ArgumentError, "Axis should be 0 or 1."
|
1022
|
+
end
|
1023
|
+
end
|
1024
|
+
|
1025
|
+
def sum(axis: 0, null_strategy: "ignore")
|
1026
|
+
case axis
|
1027
|
+
when 0
|
1028
|
+
_from_rbdf(_df.sum)
|
1029
|
+
when 1
|
1030
|
+
Utils.wrap_s(_df.hsum(null_strategy))
|
1031
|
+
else
|
1032
|
+
raise ArgumentError, "Axis should be 0 or 1."
|
1033
|
+
end
|
1034
|
+
end
|
1035
|
+
|
265
1036
|
def mean(axis: 0, null_strategy: "ignore")
|
266
1037
|
case axis
|
267
1038
|
when 0
|
@@ -273,15 +1044,34 @@ module Polars
|
|
273
1044
|
end
|
274
1045
|
end
|
275
1046
|
|
276
|
-
def
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
.collect(no_optimization: true, string_cache: false)
|
1047
|
+
def std(ddof: 1)
|
1048
|
+
_from_rbdf(_df.std(ddof))
|
1049
|
+
end
|
1050
|
+
|
1051
|
+
def var(ddof: 1)
|
1052
|
+
_from_rbdf(_df.var(ddof))
|
283
1053
|
end
|
284
1054
|
|
1055
|
+
def median
|
1056
|
+
_from_rbdf(_df.median)
|
1057
|
+
end
|
1058
|
+
|
1059
|
+
# def product
|
1060
|
+
# end
|
1061
|
+
|
1062
|
+
# def quantile(quantile, interpolation: "nearest")
|
1063
|
+
# end
|
1064
|
+
|
1065
|
+
# def to_dummies
|
1066
|
+
# end
|
1067
|
+
|
1068
|
+
# def unique
|
1069
|
+
# end
|
1070
|
+
|
1071
|
+
# def n_unique
|
1072
|
+
# end
|
1073
|
+
|
1074
|
+
#
|
285
1075
|
def rechunk
|
286
1076
|
_from_rbdf(_df.rechunk)
|
287
1077
|
end
|
@@ -290,17 +1080,98 @@ module Polars
|
|
290
1080
|
_from_rbdf(_df.null_count)
|
291
1081
|
end
|
292
1082
|
|
1083
|
+
# def sample
|
1084
|
+
# end
|
1085
|
+
|
1086
|
+
# def fold
|
1087
|
+
# end
|
1088
|
+
|
1089
|
+
# def row
|
1090
|
+
# end
|
1091
|
+
|
1092
|
+
# def rows
|
1093
|
+
# end
|
1094
|
+
|
1095
|
+
# def shrink_to_fit
|
1096
|
+
# end
|
1097
|
+
|
1098
|
+
# def take_every
|
1099
|
+
# end
|
1100
|
+
|
1101
|
+
# def hash_rows
|
1102
|
+
# end
|
1103
|
+
|
1104
|
+
# def interpolate
|
1105
|
+
# end
|
1106
|
+
|
1107
|
+
#
|
1108
|
+
def is_empty
|
1109
|
+
height == 0
|
1110
|
+
end
|
1111
|
+
alias_method :empty?, :is_empty
|
1112
|
+
|
1113
|
+
# def to_struct(name)
|
1114
|
+
# end
|
1115
|
+
|
1116
|
+
# def unnest
|
1117
|
+
# end
|
1118
|
+
|
293
1119
|
private
|
294
1120
|
|
295
|
-
def
|
1121
|
+
def initialize_copy(other)
|
1122
|
+
super
|
1123
|
+
self._df = _df._clone
|
1124
|
+
end
|
1125
|
+
|
1126
|
+
# Build the native frame from a Hash of column name => values.
#
# @param data [Hash]
#   Column name => values.
# @param columns [Object, nil]
#   Optional column specification, unpacked by `_unpack_columns` into column
#   names and a name => dtype lookup.
#
# @return [RbDataFrame]
def hash_to_rbdf(data, columns: nil)
  # Without a column specification the native reader takes the Hash as-is.
  return RbDataFrame.read_hash(data) if columns.nil?

  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)

  data_series =
    if data.empty? && dtypes
      # Schema only: create one empty, typed Series per requested column.
      # An empty Hash is truthy in Ruby, so emptiness must be tested
      # explicitly; `!data` never selects this branch.
      columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
    else
      data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
    end

  RbDataFrame.new(_handle_columns_arg(data_series, columns: columns))
end
|
298
1141
|
|
299
|
-
def
|
1142
|
+
# Split a column specification into column names and a name => dtype lookup.
#
# @param columns [Hash, Array, Object]
#   A Hash of name => dtype, or an Array whose entries are either a bare
#   column name or a `[name, dtype]` pair. Any other single value is treated
#   as one column name.
# @param lookup_names [Array, nil]
#   Accepted for interface compatibility; not used by this implementation.
#
# @return [Array]
#   `[names, dtypes]`. For a Hash, `dtypes` is the Hash itself; otherwise it
#   is a new Hash holding only the names that were given a dtype.
def _unpack_columns(columns, lookup_names: nil)
  return [columns.keys, columns] if columns.is_a?(Hash)

  dtypes = {}
  names = Array(columns).map do |entry|
    # A bare name destructures to [name, nil]; a pair to [name, dtype].
    name, dtype = entry
    dtypes[name] = dtype unless dtype.nil?
    name
  end
  [names, dtypes]
end
|
1145
|
+
|
1146
|
+
# Reconcile a list of native series with the requested column names.
#
# @param data [Array, nil]
#   Native series objects (each must respond to `rename`).
# @param columns [Array, nil]
#   Requested column names.
#
# @return [Array]
#   `data` itself when `columns` is nil or when the series were renamed;
#   newly created series (one per column name) when `data` is nil or empty.
#
# @raise [ArgumentError] if `data` is non-empty and its length differs from
#   the number of column names.
def _handle_columns_arg(data, columns: nil)
  return data if columns.nil?

  # An empty Array is truthy in Ruby, so emptiness is tested explicitly;
  # `!data` alone never selects this branch for `[]`.
  if data.nil? || data.empty?
    columns.map { |name| Series.new(name, nil)._s }
  elsif data.length == columns.length
    # NOTE(review): the return value of `rename` is discarded, so this relies
    # on the native series being renamed in place - confirm.
    data.zip(columns) { |series, name| series.rename(name) }
    data
  else
    raise ArgumentError, "Dimensions of columns arg must match data dimensions."
  end
end
|
1163
|
+
|
1164
|
+
def sequence_to_rbdf(data, columns: nil, orient: nil)
|
1165
|
+
if columns || orient
|
1166
|
+
raise Todo
|
1167
|
+
end
|
300
1168
|
RbDataFrame.new(data.map(&:_s))
|
301
1169
|
end
|
302
1170
|
|
303
|
-
def series_to_rbdf(data)
|
1171
|
+
def series_to_rbdf(data, columns: nil)
|
1172
|
+
if columns
|
1173
|
+
raise Todo
|
1174
|
+
end
|
304
1175
|
RbDataFrame.new([data._s])
|
305
1176
|
end
|
306
1177
|
|