polars-df 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +1 -1
- data/ext/polars/Cargo.toml +1 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +105 -5
- data/ext/polars/src/dataframe.rs +132 -4
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +132 -0
- data/ext/polars/src/lazy/dsl.rs +38 -0
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +77 -3
- data/ext/polars/src/series.rs +8 -9
- data/lib/polars/batched_csv_reader.rb +95 -0
- data/lib/polars/data_frame.rb +585 -19
- data/lib/polars/expr.rb +17 -2
- data/lib/polars/io.rb +342 -2
- data/lib/polars/lazy_frame.rb +156 -2
- data/lib/polars/lazy_functions.rb +154 -11
- data/lib/polars/series.rb +806 -18
- data/lib/polars/utils.rb +33 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +9 -0
- metadata +5 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -1,8 +1,22 @@
|
|
1
1
|
module Polars
|
2
|
+
# Two-dimensional data structure representing data as a table with rows and columns.
|
2
3
|
class DataFrame
|
4
|
+
# @private
|
3
5
|
attr_accessor :_df
|
4
6
|
|
5
|
-
|
7
|
+
# Create a new DataFrame.
|
8
|
+
#
|
9
|
+
# @param data [Hash, Array, Series, nil]
|
10
|
+
# Two-dimensional data in various forms. Hash must contain Arrays.
|
11
|
+
# Array may contain Series.
|
12
|
+
# @param columns [Array, Hash, nil]
|
13
|
+
# Column labels to use for resulting DataFrame. If specified, overrides any
|
14
|
+
# labels already present in the data. Must match data dimensions.
|
15
|
+
# @param orient ["col", "row", nil]
|
16
|
+
# Whether to interpret two-dimensional data as columns or as rows. If `nil`,
|
17
|
+
# the orientation is inferred by matching the columns and data dimensions. If
|
18
|
+
# this does not yield conclusive results, column orientation is used.
|
19
|
+
def initialize(data = nil, columns: nil, orient: nil)
|
6
20
|
if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
|
7
21
|
result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
|
8
22
|
data = {}
|
@@ -12,33 +26,135 @@ module Polars
|
|
12
26
|
end
|
13
27
|
|
14
28
|
if data.nil?
|
15
|
-
self._df = hash_to_rbdf({})
|
29
|
+
self._df = hash_to_rbdf({}, columns: columns)
|
16
30
|
elsif data.is_a?(Hash)
|
17
31
|
data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
|
18
|
-
self._df = hash_to_rbdf(data)
|
32
|
+
self._df = hash_to_rbdf(data, columns: columns)
|
19
33
|
elsif data.is_a?(Array)
|
20
|
-
self._df = sequence_to_rbdf(data)
|
34
|
+
self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
|
21
35
|
elsif data.is_a?(Series)
|
22
|
-
self._df = series_to_rbdf(data)
|
36
|
+
self._df = series_to_rbdf(data, columns: columns)
|
23
37
|
else
|
24
38
|
raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
|
25
39
|
end
|
26
40
|
end
|
27
41
|
|
42
|
+
# @private
|
28
43
|
def self._from_rbdf(rb_df)
|
29
44
|
df = DataFrame.allocate
|
30
45
|
df._df = rb_df
|
31
46
|
df
|
32
47
|
end
|
33
48
|
|
34
|
-
def self.
|
49
|
+
# def self._from_hashes
|
50
|
+
# end
|
51
|
+
|
52
|
+
# def self._from_hash
|
53
|
+
# end
|
54
|
+
|
55
|
+
# def self._from_records
|
56
|
+
# end
|
57
|
+
|
58
|
+
# def self._from_numo
|
59
|
+
# end
|
60
|
+
|
61
|
+
# no self._from_arrow
|
62
|
+
|
63
|
+
# no self._from_pandas
|
64
|
+
|
65
|
+
# @private
|
66
|
+
def self._read_csv(
|
67
|
+
file,
|
68
|
+
has_header: true,
|
69
|
+
columns: nil,
|
70
|
+
sep: str = ",",
|
71
|
+
comment_char: nil,
|
72
|
+
quote_char: '"',
|
73
|
+
skip_rows: 0,
|
74
|
+
dtypes: nil,
|
75
|
+
null_values: nil,
|
76
|
+
ignore_errors: false,
|
77
|
+
parse_dates: false,
|
78
|
+
n_threads: nil,
|
79
|
+
infer_schema_length: 100,
|
80
|
+
batch_size: 8192,
|
81
|
+
n_rows: nil,
|
82
|
+
encoding: "utf8",
|
83
|
+
low_memory: false,
|
84
|
+
rechunk: true,
|
85
|
+
skip_rows_after_header: 0,
|
86
|
+
row_count_name: nil,
|
87
|
+
row_count_offset: 0,
|
88
|
+
sample_size: 1024,
|
89
|
+
eol_char: "\n"
|
90
|
+
)
|
35
91
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
36
|
-
|
92
|
+
path = Utils.format_path(file)
|
93
|
+
else
|
94
|
+
path = nil
|
95
|
+
# if defined?(StringIO) && file.is_a?(StringIO)
|
96
|
+
# file = file.string
|
97
|
+
# end
|
98
|
+
end
|
99
|
+
|
100
|
+
dtype_list = nil
|
101
|
+
dtype_slice = nil
|
102
|
+
if !dtypes.nil?
|
103
|
+
if dtypes.is_a?(Hash)
|
104
|
+
dtype_list = []
|
105
|
+
dtypes.each do|k, v|
|
106
|
+
dtype_list << [k, Utils.rb_type_to_dtype(v)]
|
107
|
+
end
|
108
|
+
elsif dtypes.is_a?(Array)
|
109
|
+
dtype_slice = dtypes
|
110
|
+
else
|
111
|
+
raise ArgumentError, "dtype arg should be list or dict"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
processed_null_values = Utils._process_null_values(null_values)
|
116
|
+
|
117
|
+
if columns.is_a?(String)
|
118
|
+
columns = [columns]
|
119
|
+
end
|
120
|
+
if file.is_a?(String) && file.include?("*")
|
121
|
+
raise Todo
|
37
122
|
end
|
38
123
|
|
39
|
-
|
124
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
125
|
+
|
126
|
+
_from_rbdf(
|
127
|
+
RbDataFrame.read_csv(
|
128
|
+
file,
|
129
|
+
infer_schema_length,
|
130
|
+
batch_size,
|
131
|
+
has_header,
|
132
|
+
ignore_errors,
|
133
|
+
n_rows,
|
134
|
+
skip_rows,
|
135
|
+
projection,
|
136
|
+
sep,
|
137
|
+
rechunk,
|
138
|
+
columns,
|
139
|
+
encoding,
|
140
|
+
n_threads,
|
141
|
+
path,
|
142
|
+
dtype_list,
|
143
|
+
dtype_slice,
|
144
|
+
low_memory,
|
145
|
+
comment_char,
|
146
|
+
quote_char,
|
147
|
+
processed_null_values,
|
148
|
+
parse_dates,
|
149
|
+
skip_rows_after_header,
|
150
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
151
|
+
sample_size,
|
152
|
+
eol_char
|
153
|
+
)
|
154
|
+
)
|
40
155
|
end
|
41
156
|
|
157
|
+
# @private
|
42
158
|
def self._read_parquet(file)
|
43
159
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
44
160
|
file = Utils.format_path(file)
|
@@ -47,6 +163,44 @@ module Polars
|
|
47
163
|
_from_rbdf(RbDataFrame.read_parquet(file))
|
48
164
|
end
|
49
165
|
|
166
|
+
# def self._read_avro
|
167
|
+
# end
|
168
|
+
|
169
|
+
# @private
|
170
|
+
def self._read_ipc(
|
171
|
+
file,
|
172
|
+
columns: nil,
|
173
|
+
n_rows: nil,
|
174
|
+
row_count_name: nil,
|
175
|
+
row_count_offset: 0,
|
176
|
+
rechunk: true,
|
177
|
+
memory_map: true
|
178
|
+
)
|
179
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
180
|
+
file = Utils.format_path(file)
|
181
|
+
end
|
182
|
+
if columns.is_a?(String)
|
183
|
+
columns = [columns]
|
184
|
+
end
|
185
|
+
|
186
|
+
if file.is_a?(String) && file.include?("*")
|
187
|
+
raise Todo
|
188
|
+
end
|
189
|
+
|
190
|
+
projection, columns = Utils.handle_projection_columns(columns)
|
191
|
+
_from_rbdf(
|
192
|
+
RbDataFrame.read_ipc(
|
193
|
+
file,
|
194
|
+
columns,
|
195
|
+
projection,
|
196
|
+
n_rows,
|
197
|
+
Utils._prepare_row_count_args(row_count_name, row_count_offset),
|
198
|
+
memory_map
|
199
|
+
)
|
200
|
+
)
|
201
|
+
end
|
202
|
+
|
203
|
+
# @private
|
50
204
|
def self._read_json(file)
|
51
205
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
52
206
|
file = Utils.format_path(file)
|
@@ -55,6 +209,7 @@ module Polars
|
|
55
209
|
_from_rbdf(RbDataFrame.read_json(file))
|
56
210
|
end
|
57
211
|
|
212
|
+
# @private
|
58
213
|
def self._read_ndjson(file)
|
59
214
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
60
215
|
file = Utils.format_path(file)
|
@@ -63,30 +218,119 @@ module Polars
|
|
63
218
|
_from_rbdf(RbDataFrame.read_ndjson(file))
|
64
219
|
end
|
65
220
|
|
221
|
+
# Get the shape of the DataFrame.
|
222
|
+
#
|
223
|
+
# @return [Array]
|
224
|
+
#
|
225
|
+
# @example
|
226
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
227
|
+
# df.shape
|
228
|
+
# # => [5, 1]
|
66
229
|
def shape
|
67
230
|
_df.shape
|
68
231
|
end
|
69
232
|
|
233
|
+
# Get the height of the DataFrame.
|
234
|
+
#
|
235
|
+
# @return [Integer]
|
236
|
+
#
|
237
|
+
# @example
|
238
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
239
|
+
# df.height
|
240
|
+
# # => 5
|
70
241
|
def height
|
71
242
|
_df.height
|
72
243
|
end
|
73
244
|
|
245
|
+
# Get the width of the DataFrame.
|
246
|
+
#
|
247
|
+
# @return [Integer]
|
248
|
+
#
|
249
|
+
# @example
|
250
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
|
251
|
+
# df.width
|
252
|
+
# # => 1
|
74
253
|
def width
|
75
254
|
_df.width
|
76
255
|
end
|
77
256
|
|
257
|
+
# Get column names.
|
258
|
+
#
|
259
|
+
# @return [Array]
|
260
|
+
#
|
261
|
+
# @example
|
262
|
+
# df = Polars::DataFrame.new({
|
263
|
+
# "foo" => [1, 2, 3],
|
264
|
+
# "bar" => [6, 7, 8],
|
265
|
+
# "ham" => ["a", "b", "c"]
|
266
|
+
# })
|
267
|
+
# df.columns
|
268
|
+
# # => ["foo", "bar", "ham"]
|
78
269
|
def columns
|
79
270
|
_df.columns
|
80
271
|
end
|
81
272
|
|
273
|
+
# Change the column names of the DataFrame.
|
274
|
+
#
|
275
|
+
# @param columns [Array]
|
276
|
+
# A list with new names for the DataFrame.
|
277
|
+
# The length of the list should be equal to the width of the DataFrame.
|
278
|
+
#
|
279
|
+
# @return [Object]
|
280
|
+
#
|
281
|
+
# @example
|
282
|
+
# df = Polars::DataFrame.new({
|
283
|
+
# "foo" => [1, 2, 3],
|
284
|
+
# "bar" => [6, 7, 8],
|
285
|
+
# "ham" => ["a", "b", "c"]
|
286
|
+
# })
|
287
|
+
# df.columns = ["apple", "banana", "orange"]
|
288
|
+
# df
|
289
|
+
# # =>
|
290
|
+
# # shape: (3, 3)
|
291
|
+
# # ┌───────┬────────┬────────┐
|
292
|
+
# # │ apple ┆ banana ┆ orange │
|
293
|
+
# # │ --- ┆ --- ┆ --- │
|
294
|
+
# # │ i64 ┆ i64 ┆ str │
|
295
|
+
# # ╞═══════╪════════╪════════╡
|
296
|
+
# # │ 1 ┆ 6 ┆ a │
|
297
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
298
|
+
# # │ 2 ┆ 7 ┆ b │
|
299
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
300
|
+
# # │ 3 ┆ 8 ┆ c │
|
301
|
+
# # └───────┴────────┴────────┘
|
82
302
|
def columns=(columns)
|
83
303
|
_df.set_column_names(columns)
|
84
304
|
end
|
85
305
|
|
306
|
+
# Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
|
307
|
+
#
|
308
|
+
# @return [Array]
|
309
|
+
#
|
310
|
+
# @example
|
311
|
+
# df = Polars::DataFrame.new({
|
312
|
+
# "foo" => [1, 2, 3],
|
313
|
+
# "bar" => [6.0, 7.0, 8.0],
|
314
|
+
# "ham" => ["a", "b", "c"]
|
315
|
+
# })
|
316
|
+
# df.dtypes
|
317
|
+
# # => [:i64, :f64, :str]
|
86
318
|
def dtypes
|
87
|
-
_df.dtypes
|
319
|
+
_df.dtypes
|
88
320
|
end
|
89
321
|
|
322
|
+
# Get the schema.
|
323
|
+
#
|
324
|
+
# @return [Hash]
|
325
|
+
#
|
326
|
+
# @example
|
327
|
+
# df = Polars::DataFrame.new({
|
328
|
+
# "foo" => [1, 2, 3],
|
329
|
+
# "bar" => [6.0, 7.0, 8.0],
|
330
|
+
# "ham" => ["a", "b", "c"]
|
331
|
+
# })
|
332
|
+
# df.schema
|
333
|
+
# # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
|
90
334
|
def schema
|
91
335
|
columns.zip(dtypes).to_h
|
92
336
|
end
|
@@ -124,6 +368,7 @@ module Polars
|
|
124
368
|
# def %(other)
|
125
369
|
# end
|
126
370
|
|
371
|
+
#
|
127
372
|
def to_s
|
128
373
|
_df.to_s
|
129
374
|
end
|
@@ -133,6 +378,16 @@ module Polars
|
|
133
378
|
columns.include?(name)
|
134
379
|
end
|
135
380
|
|
381
|
+
# def each
|
382
|
+
# end
|
383
|
+
|
384
|
+
# def _pos_idx
|
385
|
+
# end
|
386
|
+
|
387
|
+
# def _pos_idxs
|
388
|
+
# end
|
389
|
+
|
390
|
+
#
|
136
391
|
def [](name)
|
137
392
|
Utils.wrap_s(_df.column(name))
|
138
393
|
end
|
@@ -140,6 +395,9 @@ module Polars
|
|
140
395
|
# def []=(key, value)
|
141
396
|
# end
|
142
397
|
|
398
|
+
# no to_arrow
|
399
|
+
|
400
|
+
#
|
143
401
|
def to_h(as_series: true)
|
144
402
|
if as_series
|
145
403
|
get_columns.to_h { |s| [s.name, s] }
|
@@ -148,7 +406,7 @@ module Polars
|
|
148
406
|
end
|
149
407
|
end
|
150
408
|
|
151
|
-
# def
|
409
|
+
# def to_hashes / to_a
|
152
410
|
# end
|
153
411
|
|
154
412
|
# def to_numo
|
@@ -156,6 +414,28 @@ module Polars
|
|
156
414
|
|
157
415
|
# no to_pandas
|
158
416
|
|
417
|
+
# Select column as Series at index location.
|
418
|
+
#
|
419
|
+
# @param index [Integer]
|
420
|
+
# Location of selection.
|
421
|
+
#
|
422
|
+
# @return [Series]
|
423
|
+
#
|
424
|
+
# @example
|
425
|
+
# df = Polars::DataFrame.new({
|
426
|
+
# "foo" => [1, 2, 3],
|
427
|
+
# "bar" => [6, 7, 8],
|
428
|
+
# "ham" => ["a", "b", "c"]
|
429
|
+
# })
|
430
|
+
# df.to_series(1)
|
431
|
+
# # =>
|
432
|
+
# # shape: (3,)
|
433
|
+
# # Series: 'bar' [i64]
|
434
|
+
# # [
|
435
|
+
# # 6
|
436
|
+
# # 7
|
437
|
+
# # 8
|
438
|
+
# # ]
|
159
439
|
def to_series(index = 0)
|
160
440
|
if index < 0
|
161
441
|
index = columns.length + index
|
@@ -163,6 +443,18 @@ module Polars
|
|
163
443
|
Utils.wrap_s(_df.select_at_idx(index))
|
164
444
|
end
|
165
445
|
|
446
|
+
# Serialize to JSON representation.
|
447
|
+
#
|
448
|
+
# @return [nil]
|
449
|
+
#
|
450
|
+
# @param file [String]
|
451
|
+
# File path to which the result should be written.
|
452
|
+
# @param pretty [Boolean]
|
453
|
+
# Pretty serialize json.
|
454
|
+
# @param row_oriented [Boolean]
|
455
|
+
# Write to row oriented json. This is slower, but more common.
|
456
|
+
#
|
457
|
+
# @see #write_ndjson
|
166
458
|
def write_json(
|
167
459
|
file,
|
168
460
|
pretty: false,
|
@@ -176,6 +468,12 @@ module Polars
|
|
176
468
|
nil
|
177
469
|
end
|
178
470
|
|
471
|
+
# Serialize to newline delimited JSON representation.
|
472
|
+
#
|
473
|
+
# @param file [String]
|
474
|
+
# File path to which the result should be written.
|
475
|
+
#
|
476
|
+
# @return [nil]
|
179
477
|
def write_ndjson(file)
|
180
478
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
181
479
|
file = Utils.format_path(file)
|
@@ -185,6 +483,48 @@ module Polars
|
|
185
483
|
nil
|
186
484
|
end
|
187
485
|
|
486
|
+
# Write to comma-separated values (CSV) file.
|
487
|
+
#
|
488
|
+
# @param file [String, nil]
|
489
|
+
# File path to which the result should be written. If set to `nil`
|
490
|
+
# (default), the output is returned as a string instead.
|
491
|
+
# @param has_header [Boolean]
|
492
|
+
# Whether to include header in the CSV output.
|
493
|
+
# @param sep [String]
|
494
|
+
# Separate CSV fields with this symbol.
|
495
|
+
# @param quote [String]
|
496
|
+
# Byte to use as quoting character.
|
497
|
+
# @param batch_size [Integer]
|
498
|
+
# Number of rows that will be processed per thread.
|
499
|
+
# @param datetime_format [String, nil]
|
500
|
+
# A format string, with the specifiers defined by the
|
501
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
502
|
+
# Rust crate. If no format specified, the default fractional-second
|
503
|
+
# precision is inferred from the maximum timeunit found in the frame's
|
504
|
+
# Datetime cols (if any).
|
505
|
+
# @param date_format [String, nil]
|
506
|
+
# A format string, with the specifiers defined by the
|
507
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
508
|
+
# Rust crate.
|
509
|
+
# @param time_format [String, nil]
|
510
|
+
# A format string, with the specifiers defined by the
|
511
|
+
# [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
512
|
+
# Rust crate.
|
513
|
+
# @param float_precision [Integer, nil]
|
514
|
+
# Number of decimal places to write, applied to both `:f32` and
|
515
|
+
# `:f64` datatypes.
|
516
|
+
# @param null_value [String, nil]
|
517
|
+
# A string representing null values (defaulting to the empty string).
|
518
|
+
#
|
519
|
+
# @return [String, nil]
|
520
|
+
#
|
521
|
+
# @example
|
522
|
+
# df = Polars::DataFrame.new({
|
523
|
+
# "foo" => [1, 2, 3, 4, 5],
|
524
|
+
# "bar" => [6, 7, 8, 9, 10],
|
525
|
+
# "ham" => ["a", "b", "c", "d", "e"]
|
526
|
+
# })
|
527
|
+
# df.write_csv("file.csv")
|
188
528
|
def write_csv(
|
189
529
|
file = nil,
|
190
530
|
has_header: true,
|
@@ -220,8 +560,7 @@ module Polars
|
|
220
560
|
float_precision,
|
221
561
|
null_value
|
222
562
|
)
|
223
|
-
buffer.
|
224
|
-
return buffer.read.force_encoding(Encoding::UTF_8)
|
563
|
+
return buffer.string.force_encoding(Encoding::UTF_8)
|
225
564
|
end
|
226
565
|
|
227
566
|
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
@@ -246,9 +585,50 @@ module Polars
|
|
246
585
|
# def write_avro
|
247
586
|
# end
|
248
587
|
|
249
|
-
#
|
250
|
-
#
|
588
|
+
# Write to Arrow IPC binary stream or Feather file.
|
589
|
+
#
|
590
|
+
# @param file [String]
|
591
|
+
# File path to which the file should be written.
|
592
|
+
# @param compression ["uncompressed", "lz4", "zstd"]
|
593
|
+
# Compression method. Defaults to "uncompressed".
|
594
|
+
#
|
595
|
+
# @return [nil]
|
596
|
+
def write_ipc(file, compression: "uncompressed")
|
597
|
+
if compression.nil?
|
598
|
+
compression = "uncompressed"
|
599
|
+
end
|
600
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
601
|
+
file = Utils.format_path(file)
|
602
|
+
end
|
603
|
+
|
604
|
+
_df.write_ipc(file, compression)
|
605
|
+
end
|
251
606
|
|
607
|
+
# Write to Apache Parquet file.
|
608
|
+
#
|
609
|
+
# @param file [String]
|
610
|
+
# File path to which the file should be written.
|
611
|
+
# @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
|
612
|
+
# Choose "zstd" for good compression performance.
|
613
|
+
# Choose "lz4" for fast compression/decompression.
|
614
|
+
# Choose "snappy" for more backwards compatibility guarantees
|
615
|
+
# when you deal with older parquet readers.
|
616
|
+
# @param compression_level [Integer, nil]
|
617
|
+
# The level of compression to use. Higher compression means smaller files on
|
618
|
+
# disk.
|
619
|
+
#
|
620
|
+
# - "gzip" : min-level: 0, max-level: 10.
|
621
|
+
# - "brotli" : min-level: 0, max-level: 11.
|
622
|
+
# - "zstd" : min-level: 1, max-level: 22.
|
623
|
+
# @param statistics [Boolean]
|
624
|
+
# Write statistics to the parquet headers. This requires extra compute.
|
625
|
+
# @param row_group_size [Integer, nil]
|
626
|
+
# Size of the row groups in number of rows.
|
627
|
+
# If `nil` (default), the chunks of the DataFrame are
|
628
|
+
# used. Writing in smaller chunks may reduce memory pressure and improve
|
629
|
+
# writing speeds.
|
630
|
+
#
|
631
|
+
# @return [nil]
|
252
632
|
def write_parquet(
|
253
633
|
file,
|
254
634
|
compression: "zstd",
|
@@ -268,6 +648,39 @@ module Polars
|
|
268
648
|
)
|
269
649
|
end
|
270
650
|
|
651
|
+
# Return an estimation of the total (heap) allocated size of the DataFrame.
|
652
|
+
#
|
653
|
+
# Estimated size is given in the specified unit (bytes by default).
|
654
|
+
#
|
655
|
+
# This estimation is the sum of the size of its buffers, validity, including
|
656
|
+
# nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
|
657
|
+
# size of 2 arrays is not the sum of the sizes computed from this function. In
|
658
|
+
# particular, StructArray's size is an upper bound.
|
659
|
+
#
|
660
|
+
# When an array is sliced, its allocated size remains constant because the buffer is
|
661
|
+
# unchanged. However, this function will yield a smaller number. This is because
|
662
|
+
# this function returns the visible size of the buffer, not its total capacity.
|
663
|
+
#
|
664
|
+
# FFI buffers are included in this estimation.
|
665
|
+
#
|
666
|
+
# @param unit ["b", "kb", "mb", "gb", "tb"]
|
667
|
+
# Scale the returned size to the given unit.
|
668
|
+
#
|
669
|
+
# @return [Numeric]
|
670
|
+
#
|
671
|
+
# @example
|
672
|
+
# df = Polars::DataFrame.new(
|
673
|
+
# {
|
674
|
+
# "x" => 1_000_000.times.to_a.reverse,
|
675
|
+
# "y" => 1_000_000.times.map { |v| v / 1000.0 },
|
676
|
+
# "z" => 1_000_000.times.map(&:to_s)
|
677
|
+
# },
|
678
|
+
# columns: {"x" => :u32, "y" => :f64, "z" => :str}
|
679
|
+
# )
|
680
|
+
# df.estimated_size
|
681
|
+
# # => 25888898
|
682
|
+
# df.estimated_size("mb")
|
683
|
+
# # => 24.689577102661133
|
271
684
|
def estimated_size(unit = "b")
|
272
685
|
sz = _df.estimated_size
|
273
686
|
Utils.scale_bytes(sz, to: unit)
|
@@ -276,14 +689,114 @@ module Polars
|
|
276
689
|
# def transpose
|
277
690
|
# end
|
278
691
|
|
692
|
+
# Reverse the DataFrame.
|
693
|
+
#
|
694
|
+
# @return [DataFrame]
|
695
|
+
#
|
696
|
+
# @example
|
697
|
+
# df = Polars::DataFrame.new({
|
698
|
+
# "key" => ["a", "b", "c"],
|
699
|
+
# "val" => [1, 2, 3]
|
700
|
+
# })
|
701
|
+
# df.reverse
|
702
|
+
# # =>
|
703
|
+
# # shape: (3, 2)
|
704
|
+
# # ┌─────┬─────┐
|
705
|
+
# # │ key ┆ val │
|
706
|
+
# # │ --- ┆ --- │
|
707
|
+
# # │ str ┆ i64 │
|
708
|
+
# # ╞═════╪═════╡
|
709
|
+
# # │ c ┆ 3 │
|
710
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
711
|
+
# # │ b ┆ 2 │
|
712
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
713
|
+
# # │ a ┆ 1 │
|
714
|
+
# # └─────┴─────┘
|
279
715
|
def reverse
|
280
716
|
select(Polars.col("*").reverse)
|
281
717
|
end
|
282
718
|
|
719
|
+
# Rename column names.
|
720
|
+
#
|
721
|
+
# @param mapping [Hash]
|
722
|
+
# Key value pairs that map from old name to new name.
|
723
|
+
#
|
724
|
+
# @return [DataFrame]
|
725
|
+
#
|
726
|
+
# @example
|
727
|
+
# df = Polars::DataFrame.new({
|
728
|
+
# "foo" => [1, 2, 3],
|
729
|
+
# "bar" => [6, 7, 8],
|
730
|
+
# "ham" => ["a", "b", "c"]
|
731
|
+
# })
|
732
|
+
# df.rename({"foo" => "apple"})
|
733
|
+
# # =>
|
734
|
+
# # shape: (3, 3)
|
735
|
+
# # ┌───────┬─────┬─────┐
|
736
|
+
# # │ apple ┆ bar ┆ ham │
|
737
|
+
# # │ --- ┆ --- ┆ --- │
|
738
|
+
# # │ i64 ┆ i64 ┆ str │
|
739
|
+
# # ╞═══════╪═════╪═════╡
|
740
|
+
# # │ 1 ┆ 6 ┆ a │
|
741
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
742
|
+
# # │ 2 ┆ 7 ┆ b │
|
743
|
+
# # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
744
|
+
# # │ 3 ┆ 8 ┆ c │
|
745
|
+
# # └───────┴─────┴─────┘
|
283
746
|
def rename(mapping)
|
284
747
|
lazy.rename(mapping).collect(no_optimization: true)
|
285
748
|
end
|
286
749
|
|
750
|
+
# Insert a Series at a certain column index. This operation is in place.
|
751
|
+
#
|
752
|
+
# @param index [Integer]
|
753
|
+
# Column index at which to insert the new `Series`.
|
754
|
+
# @param series [Series]
|
755
|
+
# `Series` to insert.
|
756
|
+
#
|
757
|
+
# @return [DataFrame]
|
758
|
+
#
|
759
|
+
# @example
|
760
|
+
# df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
|
761
|
+
# s = Polars::Series.new("baz", [97, 98, 99])
|
762
|
+
# df.insert_at_idx(1, s)
|
763
|
+
# # =>
|
764
|
+
# # shape: (3, 3)
|
765
|
+
# # ┌─────┬─────┬─────┐
|
766
|
+
# # │ foo ┆ baz ┆ bar │
|
767
|
+
# # │ --- ┆ --- ┆ --- │
|
768
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
769
|
+
# # ╞═════╪═════╪═════╡
|
770
|
+
# # │ 1 ┆ 97 ┆ 4 │
|
771
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
772
|
+
# # │ 2 ┆ 98 ┆ 5 │
|
773
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
|
774
|
+
# # │ 3 ┆ 99 ┆ 6 │
|
775
|
+
# # └─────┴─────┴─────┘
|
776
|
+
#
|
777
|
+
# @example
|
778
|
+
# df = Polars::DataFrame.new({
|
779
|
+
# "a" => [1, 2, 3, 4],
|
780
|
+
# "b" => [0.5, 4, 10, 13],
|
781
|
+
# "c" => [true, true, false, true]
|
782
|
+
# })
|
783
|
+
# s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
|
784
|
+
# df.insert_at_idx(3, s)
|
785
|
+
# # =>
|
786
|
+
# # shape: (4, 4)
|
787
|
+
# # ┌─────┬──────┬───────┬──────┐
|
788
|
+
# # │ a ┆ b ┆ c ┆ d │
|
789
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
790
|
+
# # │ i64 ┆ f64 ┆ bool ┆ f64 │
|
791
|
+
# # ╞═════╪══════╪═══════╪══════╡
|
792
|
+
# # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
|
793
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
794
|
+
# # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
|
795
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
796
|
+
# # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
|
797
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
|
798
|
+
# # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
|
799
|
+
# # └─────┴──────┴───────┴──────┘
|
287
800
|
def insert_at_idx(index, series)
|
288
801
|
if index < 0
|
289
802
|
index = columns.length + index
|
@@ -305,6 +818,7 @@ module Polars
|
|
305
818
|
# def replace_at_idx
|
306
819
|
# end
|
307
820
|
|
821
|
+
#
|
308
822
|
def sort(by, reverse: false, nulls_last: false)
|
309
823
|
_from_rbdf(_df.sort(by, reverse, nulls_last))
|
310
824
|
end
|
@@ -316,6 +830,7 @@ module Polars
|
|
316
830
|
# def replace
|
317
831
|
# end
|
318
832
|
|
833
|
+
#
|
319
834
|
def slice(offset, length = nil)
|
320
835
|
if !length.nil? && length < 0
|
321
836
|
length = height - offset + length
|
@@ -344,6 +859,7 @@ module Polars
|
|
344
859
|
# def with_row_count
|
345
860
|
# end
|
346
861
|
|
862
|
+
#
|
347
863
|
def groupby(by, maintain_order: false)
|
348
864
|
lazy.groupby(by, maintain_order: maintain_order)
|
349
865
|
end
|
@@ -360,6 +876,7 @@ module Polars
|
|
360
876
|
# def join_asof
|
361
877
|
# end
|
362
878
|
|
879
|
+
#
|
363
880
|
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
|
364
881
|
lazy
|
365
882
|
.join(
|
@@ -376,6 +893,7 @@ module Polars
|
|
376
893
|
# def apply
|
377
894
|
# end
|
378
895
|
|
896
|
+
#
|
379
897
|
def with_column(column)
|
380
898
|
lazy
|
381
899
|
.with_column(column)
|
@@ -388,8 +906,11 @@ module Polars
|
|
388
906
|
# def vstack
|
389
907
|
# end
|
390
908
|
|
391
|
-
#
|
392
|
-
|
909
|
+
#
|
910
|
+
def extend(other)
|
911
|
+
_df.extend(other._df)
|
912
|
+
self
|
913
|
+
end
|
393
914
|
|
394
915
|
# def drop
|
395
916
|
# end
|
@@ -402,6 +923,7 @@ module Polars
|
|
402
923
|
|
403
924
|
# clone handled by initialize_copy
|
404
925
|
|
926
|
+
#
|
405
927
|
def get_columns
|
406
928
|
_df.get_columns.map { |s| Utils.wrap_s(s) }
|
407
929
|
end
|
@@ -413,6 +935,7 @@ module Polars
|
|
413
935
|
# def fill_null
|
414
936
|
# end
|
415
937
|
|
938
|
+
#
|
416
939
|
def fill_nan(fill_value)
|
417
940
|
lazy.fill_nan(fill_value).collect(no_optimization: true)
|
418
941
|
end
|
@@ -438,6 +961,7 @@ module Polars
|
|
438
961
|
# def shift_and_fill
|
439
962
|
# end
|
440
963
|
|
964
|
+
#
|
441
965
|
def is_duplicated
|
442
966
|
Utils.wrap_s(_df.is_duplicated)
|
443
967
|
end
|
@@ -547,6 +1071,7 @@ module Polars
|
|
547
1071
|
# def n_unique
|
548
1072
|
# end
|
549
1073
|
|
1074
|
+
#
|
550
1075
|
def rechunk
|
551
1076
|
_from_rbdf(_df.rechunk)
|
552
1077
|
end
|
@@ -579,6 +1104,7 @@ module Polars
|
|
579
1104
|
# def interpolate
|
580
1105
|
# end
|
581
1106
|
|
1107
|
+
#
|
582
1108
|
def is_empty
|
583
1109
|
height == 0
|
584
1110
|
end
|
@@ -597,15 +1123,55 @@ module Polars
|
|
597
1123
|
self._df = _df._clone
|
598
1124
|
end
|
599
1125
|
|
600
|
-
def hash_to_rbdf(data)
|
1126
|
+
def hash_to_rbdf(data, columns: nil)
|
1127
|
+
if !columns.nil?
|
1128
|
+
columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
|
1129
|
+
|
1130
|
+
if !data && dtypes
|
1131
|
+
data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
|
1132
|
+
else
|
1133
|
+
data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
|
1134
|
+
end
|
1135
|
+
data_series = _handle_columns_arg(data_series, columns: columns)
|
1136
|
+
return RbDataFrame.new(data_series)
|
1137
|
+
end
|
1138
|
+
|
601
1139
|
RbDataFrame.read_hash(data)
|
602
1140
|
end
|
603
1141
|
|
604
|
-
def
|
1142
|
+
def _unpack_columns(columns, lookup_names: nil)
|
1143
|
+
[columns.keys, columns]
|
1144
|
+
end
|
1145
|
+
|
1146
|
+
def _handle_columns_arg(data, columns: nil)
|
1147
|
+
if columns.nil?
|
1148
|
+
data
|
1149
|
+
else
|
1150
|
+
if !data
|
1151
|
+
columns.map { |c| Series.new(c, nil)._s }
|
1152
|
+
elsif data.length == columns.length
|
1153
|
+
columns.each_with_index do |c, i|
|
1154
|
+
# TODO: confirm that rename mutates the series in place; its return value is discarded here
|
1155
|
+
data[i].rename(c)
|
1156
|
+
end
|
1157
|
+
data
|
1158
|
+
else
|
1159
|
+
raise ArgumentError, "Dimensions of columns arg must match data dimensions."
|
1160
|
+
end
|
1161
|
+
end
|
1162
|
+
end
|
1163
|
+
|
1164
|
+
def sequence_to_rbdf(data, columns: nil, orient: nil)
|
1165
|
+
if columns || orient
|
1166
|
+
raise Todo
|
1167
|
+
end
|
605
1168
|
RbDataFrame.new(data.map(&:_s))
|
606
1169
|
end
|
607
1170
|
|
608
|
-
def series_to_rbdf(data)
|
1171
|
+
def series_to_rbdf(data, columns: nil)
|
1172
|
+
if columns
|
1173
|
+
raise Todo
|
1174
|
+
end
|
609
1175
|
RbDataFrame.new([data._s])
|
610
1176
|
end
|
611
1177
|
|