polars-df 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +1 -1
- data/ext/polars/Cargo.toml +1 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +105 -5
- data/ext/polars/src/dataframe.rs +132 -4
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +132 -0
- data/ext/polars/src/lazy/dsl.rs +38 -0
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +77 -3
- data/ext/polars/src/series.rs +8 -9
- data/lib/polars/batched_csv_reader.rb +95 -0
- data/lib/polars/data_frame.rb +585 -19
- data/lib/polars/expr.rb +17 -2
- data/lib/polars/io.rb +342 -2
- data/lib/polars/lazy_frame.rb +156 -2
- data/lib/polars/lazy_functions.rb +154 -11
- data/lib/polars/series.rb +806 -18
- data/lib/polars/utils.rb +33 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +9 -0
- metadata +5 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -1,8 +1,22 @@
|
|
1
1
|
module Polars
|
2
|
+
# Two-dimensional data structure representing data as a table with rows and columns.
|
2
3
|
class DataFrame
|
4
|
+
# @private
|
3
5
|
attr_accessor :_df
|
4
6
|
|
5
|
-
|
7
|
+
# Create a new DataFrame.
|
8
|
+
#
|
9
|
+
# @param data [Hash, Array, Series, nil]
|
10
|
+
# Two-dimensional data in various forms. Hash must contain Arrays.
|
11
|
+
# Array may contain Series.
|
12
|
+
# @param columns [Array, Hash, nil]
|
13
|
+
# Column labels to use for resulting DataFrame. If specified, overrides any
|
14
|
+
# labels already present in the data. Must match data dimensions.
|
15
|
+
# @param orient ["col", "row", nil]
|
16
|
+
# Whether to interpret two-dimensional data as columns or as rows. If `nil`,
|
17
|
+
# the orientation is inferred by matching the columns and data dimensions. If
|
18
|
+
# this does not yield conclusive results, column orientation is used.
|
19
|
+
def initialize(data = nil, columns: nil, orient: nil)
|
6
20
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
7
21
|
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
|
8
22
|
data = {}
|
@@ -12,33 +26,135 @@ module Polars
|
|
12
26
|
end
|
13
27
|
|
14
28
|
if data.nil?
|
15
|
-
self._df = hash_to_rbdf({})
|
29
|
+
self._df = hash_to_rbdf({}, columns: columns)
|
16
30
|
elsif data.is_a?(Hash)
|
17
31
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
18
|
-
self._df = hash_to_rbdf(data)
|
32
|
+
self._df = hash_to_rbdf(data, columns: columns)
|
19
33
|
elsif data.is_a?(Array)
|
20
|
-
self._df = sequence_to_rbdf(data)
|
34
|
+
self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
|
21
35
|
elsif data.is_a?(Series)
|
22
|
-
self._df = series_to_rbdf(data)
|
36
|
+
self._df = series_to_rbdf(data, columns: columns)
|
23
37
|
else
|
24
38
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
25
39
|
end
|
26
40
|
end
|
27
41
|
|
42
|
+
# @private
|
28
43
|
def self._from_rbdf(rb_df)
|
29
44
|
df = DataFrame.allocate
|
30
45
|
df._df = rb_df
|
31
46
|
df
|
32
47
|
end
|
33
48
|
|
34
|
-
def self.
|
49
|
+
# def self._from_hashes
|
50
|
+
# end
|
51
|
+
|
52
|
+
# def self._from_hash
|
53
|
+
# end
|
54
|
+
|
55
|
+
# def self._from_records
|
56
|
+
# end
|
57
|
+
|
58
|
+
# def self._from_numo
|
59
|
+
# end
|
60
|
+
|
61
|
+
# no self._from_arrow
|
62
|
+
|
63
|
+
# no self._from_pandas
|
64
|
+
|
65
|
+
# @private
|
66
|
+
# Read a CSV source into a DataFrame through the native `RbDataFrame.read_csv`.
#
# All keyword options are forwarded positionally to the native reader, so the
# order of the arguments in the call at the bottom must not be changed.
#
# @param file [String, Pathname, Object]
#   Path-like values are normalised with `Utils.format_path` and passed as
#   `path`; any other object is handed to the native reader unchanged with a
#   `nil` path.
# @param dtypes [Hash, Array, nil]
#   A Hash of column name => type (each type converted with
#   `Utils.rb_type_to_dtype`) or an Array that is forwarded as-is.
# @param columns [Array, String, nil]
#   A single String is wrapped into a one-element Array before being split
#   by `Utils.handle_projection_columns`.
#
# @return [DataFrame]
#
# @raise [ArgumentError] if `dtypes` is neither a Hash, an Array, nor `nil`.
# @raise [Todo] if `file` is a String containing `*` (glob patterns are not
#   handled here).
def self._read_csv(
  file,
  has_header: true,
  columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
  infer_schema_length: 100,
  batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n"
)
  path = nil
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    path = Utils.format_path(file)
  end

  # Exactly one of dtype_list / dtype_slice is set, depending on the shape of `dtypes`.
  dtype_list = nil
  dtype_slice = nil
  case dtypes
  when nil
    # no dtype overrides requested
  when Hash
    dtype_list = dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
  when Array
    dtype_slice = dtypes
  else
    raise ArgumentError, "dtype arg should be list or dict"
  end

  processed_null_values = Utils._process_null_values(null_values)

  columns = [columns] if columns.is_a?(String)
  raise Todo if file.is_a?(String) && file.include?("*")

  projection, columns = Utils.handle_projection_columns(columns)

  _from_rbdf(
    RbDataFrame.read_csv(
      file,
      infer_schema_length,
      batch_size,
      has_header,
      ignore_errors,
      n_rows,
      skip_rows,
      projection,
      sep,
      rechunk,
      columns,
      encoding,
      n_threads,
      path,
      dtype_list,
      dtype_slice,
      low_memory,
      comment_char,
      quote_char,
      processed_null_values,
      parse_dates,
      skip_rows_after_header,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      sample_size,
      eol_char
    )
  )
end
|
41
156
|
|
157
|
+
# @private
|
42
158
|
def self._read_parquet(file)
|
43
159
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
44
160
|
file = Utils.format_path(file)
|
@@ -47,6 +163,44 @@ module Polars
|
|
47
163
|
_from_rbdf(RbDataFrame.read_parquet(file))
|
48
164
|
end
|
49
165
|
|
166
|
+
# def self._read_avro
|
167
|
+
# end
|
168
|
+
|
169
|
+
# @private
|
170
|
+
def self._read_ipc(
|
171
|
+
file,
|
172
|
+
columns: nil,
|
173
|
+
n_rows: nil,
|
174
|
+
row_count_name: nil,
|
175
|
+
row_count_offset: 0,
|
176
|
+
rechunk: true,
|
177
|
+
memory_map: true
|
178
|
+
)
|
179
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
180
|
+
file = Utils.format_path(file)
|
181
|
+
end
|
182
|
+
if columns.is_a?(String)
|
183
|
+
columns = [columns]
|
184
|
+
end
|
185
|
+
|
186
|
+
if file.is_a?(String) && file.include?("*")
|
187
|
+
raise Todo
|
188
|
+
end
|
189
|
+
|
190
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
191
|
+
_from_rbdf(
|
192
|
+
RbDataFrame.read_ipc(
|
193
|
+
file,
|
194
|
+
columns,
|
195
|
+
projection,
|
196
|
+
n_rows,
|
197
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
198
|
+
memory_map
|
199
|
+
)
|
200
|
+
)
|
201
|
+
end
|
202
|
+
|
203
|
+
# @private
|
50
204
|
def self._read_json(file)
|
51
205
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
52
206
|
file = Utils.format_path(file)
|
@@ -55,6 +209,7 @@ module Polars
|
|
55
209
|
_from_rbdf(RbDataFrame.read_json(file))
|
56
210
|
end
|
57
211
|
|
212
|
+
# @private
|
58
213
|
def self._read_ndjson(file)
|
59
214
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
60
215
|
file = Utils.format_path(file)
|
@@ -63,30 +218,119 @@ module Polars
|
|
63
218
|
_from_rbdf(RbDataFrame.read_ndjson(file))
|
64
219
|
end
|
65
220
|
|
221
|
+
# Get the shape of the DataFrame.
|
222
|
+
#
|
223
|
+
# @return [Array]
|
224
|
+
#
|
225
|
+
# @example
|
226
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
227
|
+
# df.shape
|
228
|
+
# # => [5, 1]
|
66
229
|
def shape
|
67
230
|
_df.shape
|
68
231
|
end
|
69
232
|
|
233
|
+
# Get the height of the DataFrame.
|
234
|
+
#
|
235
|
+
# @return [Integer]
|
236
|
+
#
|
237
|
+
# @example
|
238
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
239
|
+
# df.height
|
240
|
+
# # => 5
|
70
241
|
def height
|
71
242
|
_df.height
|
72
243
|
end
|
73
244
|
|
245
|
+
# Get the width of the DataFrame.
|
246
|
+
#
|
247
|
+
# @return [Integer]
|
248
|
+
#
|
249
|
+
# @example
|
250
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
251
|
+
# df.width
|
252
|
+
# # => 1
|
74
253
|
def width
|
75
254
|
_df.width
|
76
255
|
end
|
77
256
|
|
257
|
+
# Get column names.
|
258
|
+
#
|
259
|
+
# @return [Array]
|
260
|
+
#
|
261
|
+
# @example
|
262
|
+
# df = Polars::DataFrame.new({
|
263
|
+
# "foo" => [1, 2, 3],
|
264
|
+
# "bar" => [6, 7, 8],
|
265
|
+
# "ham" => ["a", "b", "c"]
|
266
|
+
# })
|
267
|
+
# df.columns
|
268
|
+
# # => ["foo", "bar", "ham"]
|
78
269
|
def columns
|
79
270
|
_df.columns
|
80
271
|
end
|
81
272
|
|
273
|
+
# Change the column names of the DataFrame.
|
274
|
+
#
|
275
|
+
# @param columns [Array]
|
276
|
+
# A list with new names for the DataFrame.
|
277
|
+
# The length of the list should be equal to the width of the DataFrame.
|
278
|
+
#
|
279
|
+
# @return [Object]
|
280
|
+
#
|
281
|
+
# @example
|
282
|
+
# df = Polars::DataFrame.new({
|
283
|
+
# "foo" => [1, 2, 3],
|
284
|
+
# "bar" => [6, 7, 8],
|
285
|
+
# "ham" => ["a", "b", "c"]
|
286
|
+
# })
|
287
|
+
# df.columns = ["apple", "banana", "orange"]
|
288
|
+
# df
|
289
|
+
# # =>
|
290
|
+
# # shape: (3, 3)
|
291
|
+
# # ┌───────┬────────┬────────┐
|
292
|
+
# # │ apple ┆ banana ┆ orange │
|
293
|
+
# # │ --- ┆ --- ┆ --- │
|
294
|
+
# # │ i64 ┆ i64 ┆ str │
|
295
|
+
# # ╞═══════╪════════╪════════╡
|
296
|
+
# # │ 1 ┆ 6 ┆ a │
|
297
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
298
|
+
# # │ 2 ┆ 7 ┆ b │
|
299
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
300
|
+
# # │ 3 ┆ 8 ┆ c │
|
301
|
+
# # └───────┴────────┴────────┘
|
82
302
|
def columns=(columns)
|
83
303
|
_df.set_column_names(columns)
|
84
304
|
end
|
85
305
|
|
306
|
+
# Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
|
307
|
+
#
|
308
|
+
# @return [Array]
|
309
|
+
#
|
310
|
+
# @example
|
311
|
+
# df = Polars::DataFrame.new({
|
312
|
+
# "foo" => [1, 2, 3],
|
313
|
+
# "bar" => [6.0, 7.0, 8.0],
|
314
|
+
# "ham" => ["a", "b", "c"]
|
315
|
+
# })
|
316
|
+
# df.dtypes
|
317
|
+
# # => [:i64, :f64, :str]
|
86
318
|
def dtypes
|
87
|
-
_df.dtypes
|
319
|
+
_df.dtypes
|
88
320
|
end
|
89
321
|
|
322
|
+
# Get the schema.
|
323
|
+
#
|
324
|
+
# @return [Hash]
|
325
|
+
#
|
326
|
+
# @example
|
327
|
+
# df = Polars::DataFrame.new({
|
328
|
+
# "foo" => [1, 2, 3],
|
329
|
+
# "bar" => [6.0, 7.0, 8.0],
|
330
|
+
# "ham" => ["a", "b", "c"]
|
331
|
+
# })
|
332
|
+
# df.schema
|
333
|
+
# # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
|
90
334
|
def schema
|
91
335
|
columns.zip(dtypes).to_h
|
92
336
|
end
|
@@ -124,6 +368,7 @@ module Polars
|
|
124
368
|
# def %(other)
|
125
369
|
# end
|
126
370
|
|
371
|
+
#
|
127
372
|
def to_s
|
128
373
|
_df.to_s
|
129
374
|
end
|
@@ -133,6 +378,16 @@ module Polars
|
|
133
378
|
columns.include?(name)
|
134
379
|
end
|
135
380
|
|
381
|
+
# def each
|
382
|
+
# end
|
383
|
+
|
384
|
+
# def _pos_idx
|
385
|
+
# end
|
386
|
+
|
387
|
+
# def _pos_idxs
|
388
|
+
# end
|
389
|
+
|
390
|
+
#
|
136
391
|
def [](name)
|
137
392
|
Utils.wrap_s(_df.column(name))
|
138
393
|
end
|
@@ -140,6 +395,9 @@ module Polars
|
|
140
395
|
# def []=(key, value)
|
141
396
|
# end
|
142
397
|
|
398
|
+
# no to_arrow
|
399
|
+
|
400
|
+
#
|
143
401
|
def to_h(as_series: true)
|
144
402
|
if as_series
|
145
403
|
get_columns.to_h { |s| [s.name, s] }
|
@@ -148,7 +406,7 @@ module Polars
|
|
148
406
|
end
|
149
407
|
end
|
150
408
|
|
151
|
-
# def
|
409
|
+
# def to_hashes / to_a
|
152
410
|
# end
|
153
411
|
|
154
412
|
# def to_numo
|
@@ -156,6 +414,28 @@ module Polars
|
|
156
414
|
|
157
415
|
# no to_pandas
|
158
416
|
|
417
|
+
# Select column as Series at index location.
|
418
|
+
#
|
419
|
+
# @param index [Integer]
|
420
|
+
# Location of selection.
|
421
|
+
#
|
422
|
+
# @return [Series]
|
423
|
+
#
|
424
|
+
# @example
|
425
|
+
# df = Polars::DataFrame.new({
|
426
|
+
# "foo" => [1, 2, 3],
|
427
|
+
# "bar" => [6, 7, 8],
|
428
|
+
# "ham" => ["a", "b", "c"]
|
429
|
+
# })
|
430
|
+
# df.to_series(1)
|
431
|
+
# # =>
|
432
|
+
# # shape: (3,)
|
433
|
+
# # Series: 'bar' [i64]
|
434
|
+
# # [
|
435
|
+
# # 6
|
436
|
+
# # 7
|
437
|
+
# # 8
|
438
|
+
# # ]
|
159
439
|
def to_series(index = 0)
|
160
440
|
if index < 0
|
161
441
|
index = columns.length + index
|
@@ -163,6 +443,18 @@ module Polars
|
|
163
443
|
Utils.wrap_s(_df.select_at_idx(index))
|
164
444
|
end
|
165
445
|
|
446
|
+
# Serialize to JSON representation.
|
447
|
+
#
|
448
|
+
# @return [nil]
|
449
|
+
#
|
450
|
+
# @param file [String]
|
451
|
+
# File path to which the result should be written.
|
452
|
+
# @param pretty [Boolean]
|
453
|
+
# Pretty serialize json.
|
454
|
+
# @param row_oriented [Boolean]
|
455
|
+
# Write to row oriented json. This is slower, but more common.
|
456
|
+
#
|
457
|
+
# @see #write_ndjson
|
166
458
|
def write_json(
|
167
459
|
file,
|
168
460
|
pretty: false,
|
@@ -176,6 +468,12 @@ module Polars
|
|
176
468
|
nil
|
177
469
|
end
|
178
470
|
|
471
|
+
# Serialize to newline delimited JSON representation.
|
472
|
+
#
|
473
|
+
# @param file [String]
|
474
|
+
# File path to which the result should be written.
|
475
|
+
#
|
476
|
+
# @return [nil]
|
179
477
|
def write_ndjson(file)
|
180
478
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
181
479
|
file = Utils.format_path(file)
|
@@ -185,6 +483,48 @@ module Polars
|
|
185
483
|
nil
|
186
484
|
end
|
187
485
|
|
486
|
+
# Write to comma-separated values (CSV) file.
|
487
|
+
#
|
488
|
+
# @param file [String, nil]
|
489
|
+
# File path to which the result should be written. If set to `nil`
|
490
|
+
# (default), the output is returned as a string instead.
|
491
|
+
# @param has_header [Boolean]
|
492
|
+
# Whether to include header in the CSV output.
|
493
|
+
# @param sep [String]
|
494
|
+
# Separate CSV fields with this symbol.
|
495
|
+
# @param quote [String]
|
496
|
+
# Byte to use as quoting character.
|
497
|
+
# @param batch_size [Integer]
|
498
|
+
# Number of rows that will be processed per thread.
|
499
|
+
# @param datetime_format [String, nil]
|
500
|
+
# A format string, with the specifiers defined by the
|
501
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
502
|
+
# Rust crate. If no format specified, the default fractional-second
|
503
|
+
# precision is inferred from the maximum timeunit found in the frame's
|
504
|
+
# Datetime cols (if any).
|
505
|
+
# @param date_format [String, nil]
|
506
|
+
# A format string, with the specifiers defined by the
|
507
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
508
|
+
# Rust crate.
|
509
|
+
# @param time_format [String, nil]
|
510
|
+
# A format string, with the specifiers defined by the
|
511
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
512
|
+
# Rust crate.
|
513
|
+
# @param float_precision [Integer, nil]
|
514
|
+
# Number of decimal places to write, applied to both `:f32` and
|
515
|
+
# `:f64` datatypes.
|
516
|
+
# @param null_value [String, nil]
|
517
|
+
# A string representing null values (defaulting to the empty string).
|
518
|
+
#
|
519
|
+
# @return [String, nil]
|
520
|
+
#
|
521
|
+
# @example
|
522
|
+
# df = Polars::DataFrame.new({
|
523
|
+
# "foo" => [1, 2, 3, 4, 5],
|
524
|
+
# "bar" => [6, 7, 8, 9, 10],
|
525
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
526
|
+
# })
|
527
|
+
# df.write_csv("file.csv")
|
188
528
|
def write_csv(
|
189
529
|
file = nil,
|
190
530
|
has_header: true,
|
@@ -220,8 +560,7 @@ module Polars
|
|
220
560
|
float_precision,
|
221
561
|
null_value
|
222
562
|
)
|
223
|
-
buffer.
|
224
|
-
return buffer.read.force_encoding(Encoding::UTF_8)
|
563
|
+
return buffer.string.force_encoding(Encoding::UTF_8)
|
225
564
|
end
|
226
565
|
|
227
566
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
@@ -246,9 +585,50 @@ module Polars
|
|
246
585
|
# def write_avro
|
247
586
|
# end
|
248
587
|
|
249
|
-
#
|
250
|
-
#
|
588
|
+
# Write to Arrow IPC binary stream or Feather file.
|
589
|
+
#
|
590
|
+
# @param file [String]
|
591
|
+
# File path to which the file should be written.
|
592
|
+
# @param compression ["uncompressed", "lz4", "zstd"]
|
593
|
+
# Compression method. Defaults to "uncompressed".
|
594
|
+
#
|
595
|
+
# @return [nil]
|
596
|
+
# Write the frame through the native `write_ipc` call.
#
# @param file [String, Pathname, Object]
#   Path-like values are normalised with `Utils.format_path`; anything else
#   is passed to the native writer unchanged.
# @param compression [String, nil]
#   Compression name; `nil` is treated as "uncompressed".
#
# @return [nil]
def write_ipc(file, compression: "uncompressed")
  compression = "uncompressed" if compression.nil?

  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end

  _df.write_ipc(file, compression)
  # Return nil explicitly, as the other write_* methods do, so the native
  # call's return value never leaks to callers.
  nil
end
|
251
606
|
|
607
|
+
# Write to Apache Parquet file.
|
608
|
+
#
|
609
|
+
# @param file [String]
|
610
|
+
# File path to which the file should be written.
|
611
|
+
# @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
|
612
|
+
# Choose "zstd" for good compression performance.
|
613
|
+
# Choose "lz4" for fast compression/decompression.
|
614
|
+
# Choose "snappy" for more backwards compatibility guarantees
|
615
|
+
# when you deal with older parquet readers.
|
616
|
+
# @param compression_level [Integer, nil]
|
617
|
+
# The level of compression to use. Higher compression means smaller files on
|
618
|
+
# disk.
|
619
|
+
#
|
620
|
+
# - "gzip" : min-level: 0, max-level: 10.
|
621
|
+
# - "brotli" : min-level: 0, max-level: 11.
|
622
|
+
# - "zstd" : min-level: 1, max-level: 22.
|
623
|
+
# @param statistics [Boolean]
|
624
|
+
# Write statistics to the parquet headers. This requires extra compute.
|
625
|
+
# @param row_group_size [Integer, nil]
|
626
|
+
# Size of the row groups in number of rows.
|
627
|
+
# If `nil` (default), the chunks of the DataFrame are
|
628
|
+
# used. Writing in smaller chunks may reduce memory pressure and improve
|
629
|
+
# writing speeds.
|
630
|
+
#
|
631
|
+
# @return [nil]
|
252
632
|
def write_parquet(
|
253
633
|
file,
|
254
634
|
compression: "zstd",
|
@@ -268,6 +648,39 @@ module Polars
|
|
268
648
|
)
|
269
649
|
end
|
270
650
|
|
651
|
+
# Return an estimation of the total (heap) allocated size of the DataFrame.
|
652
|
+
#
|
653
|
+
# Estimated size is given in the specified unit (bytes by default).
|
654
|
+
#
|
655
|
+
# This estimation is the sum of the size of its buffers, validity, including
|
656
|
+
# nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
|
657
|
+
# size of 2 arrays is not the sum of the sizes computed from this function. In
|
658
|
+
# particular, StructArray's size is an upper bound.
|
659
|
+
#
|
660
|
+
# When an array is sliced, its allocated size remains constant because the buffer
|
661
|
+
# is unchanged. However, this function will yield a smaller number. This is because
|
662
|
+
# this function returns the visible size of the buffer, not its total capacity.
|
663
|
+
#
|
664
|
+
# FFI buffers are included in this estimation.
|
665
|
+
#
|
666
|
+
# @param unit ["b", "kb", "mb", "gb", "tb"]
|
667
|
+
# Scale the returned size to the given unit.
|
668
|
+
#
|
669
|
+
# @return [Numeric]
|
670
|
+
#
|
671
|
+
# @example
|
672
|
+
# df = Polars::DataFrame.new(
|
673
|
+
# {
|
674
|
+
# "x" => 1_000_000.times.to_a.reverse,
|
675
|
+
# "y" => 1_000_000.times.map { |v| v / 1000.0 },
|
676
|
+
# "z" => 1_000_000.times.map(&:to_s)
|
677
|
+
# },
|
678
|
+
# columns: {"x" => :u32, "y" => :f64, "z" => :str}
|
679
|
+
# )
|
680
|
+
# df.estimated_size
|
681
|
+
# # => 25888898
|
682
|
+
# df.estimated_size("mb")
|
683
|
+
# # => 24.689577102661133
|
271
684
|
def estimated_size(unit = "b")
|
272
685
|
sz = _df.estimated_size
|
273
686
|
Utils.scale_bytes(sz, to: unit)
|
@@ -276,14 +689,114 @@ module Polars
|
|
276
689
|
# def transpose
|
277
690
|
# end
|
278
691
|
|
692
|
+
# Reverse the DataFrame.
|
693
|
+
#
|
694
|
+
# @return [DataFrame]
|
695
|
+
#
|
696
|
+
# @example
|
697
|
+
# df = Polars::DataFrame.new({
|
698
|
+
# "key" => ["a", "b", "c"],
|
699
|
+
# "val" => [1, 2, 3]
|
700
|
+
# })
|
701
|
+
# df.reverse()
|
702
|
+
# # =>
|
703
|
+
# # shape: (3, 2)
|
704
|
+
# # ┌─────┬─────┐
|
705
|
+
# # │ key ┆ val │
|
706
|
+
# # │ --- ┆ --- │
|
707
|
+
# # │ str ┆ i64 │
|
708
|
+
# # ╞═════╪═════╡
|
709
|
+
# # │ c ┆ 3 │
|
710
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
711
|
+
# # │ b ┆ 2 │
|
712
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
713
|
+
# # │ a ┆ 1 │
|
714
|
+
# # └─────┴─────┘
|
279
715
|
def reverse
|
280
716
|
select(Polars.col("*").reverse)
|
281
717
|
end
|
282
718
|
|
719
|
+
# Rename column names.
|
720
|
+
#
|
721
|
+
# @param mapping [Hash]
|
722
|
+
# Key value pairs that map from old name to new name.
|
723
|
+
#
|
724
|
+
# @return [DataFrame]
|
725
|
+
#
|
726
|
+
# @example
|
727
|
+
# df = Polars::DataFrame.new({
|
728
|
+
# "foo" => [1, 2, 3],
|
729
|
+
# "bar" => [6, 7, 8],
|
730
|
+
# "ham" => ["a", "b", "c"]
|
731
|
+
# })
|
732
|
+
# df.rename({"foo" => "apple"})
|
733
|
+
# # =>
|
734
|
+
# # shape: (3, 3)
|
735
|
+
# # ┌───────┬─────┬─────┐
|
736
|
+
# # │ apple ┆ bar ┆ ham │
|
737
|
+
# # │ --- ┆ --- ┆ --- │
|
738
|
+
# # │ i64 ┆ i64 ┆ str │
|
739
|
+
# # ╞═══════╪═════╪═════╡
|
740
|
+
# # │ 1 ┆ 6 ┆ a │
|
741
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
742
|
+
# # │ 2 ┆ 7 ┆ b │
|
743
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
744
|
+
# # │ 3 ┆ 8 ┆ c │
|
745
|
+
# # └───────┴─────┴─────┘
|
283
746
|
def rename(mapping)
|
284
747
|
lazy.rename(mapping).collect(no_optimization: true)
|
285
748
|
end
|
286
749
|
|
750
|
+
# Insert a Series at a certain column index. This operation is in place.
|
751
|
+
#
|
752
|
+
# @param index [Integer]
|
753
|
+
# Column index at which to insert the new `Series`.
|
754
|
+
# @param series [Series]
|
755
|
+
# `Series` to insert.
|
756
|
+
#
|
757
|
+
# @return [DataFrame]
|
758
|
+
#
|
759
|
+
# @example
|
760
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
761
|
+
# s = Polars::Series.new("baz", [97, 98, 99])
|
762
|
+
# df.insert_at_idx(1, s)
|
763
|
+
# # =>
|
764
|
+
# # shape: (3, 3)
|
765
|
+
# # ┌─────┬─────┬─────┐
|
766
|
+
# # │ foo ┆ baz ┆ bar │
|
767
|
+
# # │ --- ┆ --- ┆ --- │
|
768
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
769
|
+
# # ╞═════╪═════╪═════╡
|
770
|
+
# # │ 1 ┆ 97 ┆ 4 │
|
771
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
772
|
+
# # │ 2 ┆ 98 ┆ 5 │
|
773
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
774
|
+
# # │ 3 ┆ 99 ┆ 6 │
|
775
|
+
# # └─────┴─────┴─────┘
|
776
|
+
#
|
777
|
+
# @example
|
778
|
+
# df = Polars::DataFrame.new({
|
779
|
+
# "a" => [1, 2, 3, 4],
|
780
|
+
# "b" => [0.5, 4, 10, 13],
|
781
|
+
# "c" => [true, true, false, true]
|
782
|
+
# })
|
783
|
+
# s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
|
784
|
+
# df.insert_at_idx(3, s)
|
785
|
+
# # =>
|
786
|
+
# # shape: (4, 4)
|
787
|
+
# # ┌─────┬──────┬───────┬──────┐
|
788
|
+
# # │ a ┆ b ┆ c ┆ d │
|
789
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
790
|
+
# # │ i64 ┆ f64 ┆ bool ┆ f64 │
|
791
|
+
# # ╞═════╪══════╪═══════╪══════╡
|
792
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
|
793
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
794
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
|
795
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
796
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
|
797
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
798
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
|
799
|
+
# # └─────┴──────┴───────┴──────┘
|
287
800
|
def insert_at_idx(index, series)
|
288
801
|
if index < 0
|
289
802
|
index = columns.length + index
|
@@ -305,6 +818,7 @@ module Polars
|
|
305
818
|
# def replace_at_idx
|
306
819
|
# end
|
307
820
|
|
821
|
+
#
|
308
822
|
def sort(by, reverse: false, nulls_last: false)
|
309
823
|
_from_rbdf(_df.sort(by, reverse, nulls_last))
|
310
824
|
end
|
@@ -316,6 +830,7 @@ module Polars
|
|
316
830
|
# def replace
|
317
831
|
# end
|
318
832
|
|
833
|
+
#
|
319
834
|
def slice(offset, length = nil)
|
320
835
|
if !length.nil? && length < 0
|
321
836
|
length = height - offset + length
|
@@ -344,6 +859,7 @@ module Polars
|
|
344
859
|
# def with_row_count
|
345
860
|
# end
|
346
861
|
|
862
|
+
#
|
347
863
|
def groupby(by, maintain_order: false)
|
348
864
|
lazy.groupby(by, maintain_order: maintain_order)
|
349
865
|
end
|
@@ -360,6 +876,7 @@ module Polars
|
|
360
876
|
# def join_asof
|
361
877
|
# end
|
362
878
|
|
879
|
+
#
|
363
880
|
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
|
364
881
|
lazy
|
365
882
|
.join(
|
@@ -376,6 +893,7 @@ module Polars
|
|
376
893
|
# def apply
|
377
894
|
# end
|
378
895
|
|
896
|
+
#
|
379
897
|
def with_column(column)
|
380
898
|
lazy
|
381
899
|
.with_column(column)
|
@@ -388,8 +906,11 @@ module Polars
|
|
388
906
|
# def vstack
|
389
907
|
# end
|
390
908
|
|
391
|
-
#
|
392
|
-
|
909
|
+
#
|
910
|
+
def extend(other)
|
911
|
+
_df.extend(other._df)
|
912
|
+
self
|
913
|
+
end
|
393
914
|
|
394
915
|
# def drop
|
395
916
|
# end
|
@@ -402,6 +923,7 @@ module Polars
|
|
402
923
|
|
403
924
|
# clone handled by initialize_copy
|
404
925
|
|
926
|
+
#
|
405
927
|
def get_columns
|
406
928
|
_df.get_columns.map { |s| Utils.wrap_s(s) }
|
407
929
|
end
|
@@ -413,6 +935,7 @@ module Polars
|
|
413
935
|
# def fill_null
|
414
936
|
# end
|
415
937
|
|
938
|
+
#
|
416
939
|
def fill_nan(fill_value)
|
417
940
|
lazy.fill_nan(fill_value).collect(no_optimization: true)
|
418
941
|
end
|
@@ -438,6 +961,7 @@ module Polars
|
|
438
961
|
# def shift_and_fill
|
439
962
|
# end
|
440
963
|
|
964
|
+
#
|
441
965
|
def is_duplicated
|
442
966
|
Utils.wrap_s(_df.is_duplicated)
|
443
967
|
end
|
@@ -547,6 +1071,7 @@ module Polars
|
|
547
1071
|
# def n_unique
|
548
1072
|
# end
|
549
1073
|
|
1074
|
+
#
|
550
1075
|
def rechunk
|
551
1076
|
_from_rbdf(_df.rechunk)
|
552
1077
|
end
|
@@ -579,6 +1104,7 @@ module Polars
|
|
579
1104
|
# def interpolate
|
580
1105
|
# end
|
581
1106
|
|
1107
|
+
#
|
582
1108
|
def is_empty
|
583
1109
|
height == 0
|
584
1110
|
end
|
@@ -597,15 +1123,55 @@ module Polars
|
|
597
1123
|
self._df = _df._clone
|
598
1124
|
end
|
599
1125
|
|
600
|
-
def hash_to_rbdf(data)
|
1126
|
+
# Build the native frame from a Hash of column name => values.
#
# @param data [Hash]
#   Column names mapped to their values. May be empty.
# @param columns [Object, nil]
#   Optional column specification, unpacked by `_unpack_columns` into names
#   and a name => dtype lookup. When `nil`, `data` goes straight to
#   `RbDataFrame.read_hash`.
#
# @return [Object] the result of `RbDataFrame.read_hash` or `RbDataFrame.new`.
def hash_to_rbdf(data, columns: nil)
  return RbDataFrame.read_hash(data) if columns.nil?

  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
  dtypes ||= {}

  # An empty Hash is truthy in Ruby, so emptiness must be tested explicitly;
  # `!data` would never select this branch.
  data_series =
    if data.empty? && !dtypes.empty?
      # No data but dtypes were given: create one empty, typed Series per column.
      columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
    else
      data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
    end

  RbDataFrame.new(_handle_columns_arg(data_series, columns: columns))
end
|
603
1141
|
|
604
|
-
def
|
1142
|
+
# Split a column specification into column names and a name => dtype lookup.
#
# @param columns [Hash, Array, nil]
#   Either a Hash of name => dtype (returned as the lookup unchanged), or an
#   Array whose entries are a bare name or a `[name, dtype]` pair.
# @param lookup_names [Array, nil]
#   Accepted for interface compatibility; not used by this implementation.
#
# @return [Array] a two-element Array `[names, dtypes]`.
def _unpack_columns(columns, lookup_names: nil)
  return [columns.keys, columns] if columns.is_a?(Hash)

  names = []
  dtypes = {}
  Array(columns).each do |col|
    # A bare name destructures to (name, nil); a pair to (name, dtype).
    name, dtype = col
    names << name
    dtypes[name] = dtype unless dtype.nil?
  end
  [names, dtypes]
end
|
1145
|
+
|
1146
|
+
# Reconcile already-built column objects with an explicit list of column names.
#
# @param data [Array, nil]
#   Column objects; each must respond to `rename`.
# @param columns [Array, nil]
#   Desired column names. When `nil`, `data` is returned untouched.
#
# @return [Array] the column objects to build the frame from.
#
# @raise [ArgumentError] if `data` is non-empty and its length differs from
#   the number of column names.
def _handle_columns_arg(data, columns: nil)
  return data if columns.nil?

  # An empty Array is truthy in Ruby, so `!data` never detects "no data";
  # test for emptiness explicitly.
  if data.nil? || data.empty?
    columns.map { |c| Series.new(c, nil)._s }
  elsif data.length == columns.length
    # NOTE(review): relies on `rename` mutating the column object in place,
    # as its return value is discarded — confirm against the native binding.
    columns.each_with_index { |c, i| data[i].rename(c) }
    data
  else
    raise ArgumentError, "Dimensions of columns arg must match data dimensions."
  end
end
|
1163
|
+
|
1164
|
+
def sequence_to_rbdf(data, columns: nil, orient: nil)
|
1165
|
+
if columns || orient
|
1166
|
+
raise Todo
|
1167
|
+
end
|
605
1168
|
RbDataFrame.new(data.map(&:_s))
|
606
1169
|
end
|
607
1170
|
|
608
|
-
def series_to_rbdf(data)
|
1171
|
+
def series_to_rbdf(data, columns: nil)
|
1172
|
+
if columns
|
1173
|
+
raise Todo
|
1174
|
+
end
|
609
1175
|
RbDataFrame.new([data._s])
|
610
1176
|
end
|
611
1177
|
|