polars-df 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Cargo.lock +588 -456
- data/README.md +37 -2
- data/ext/polars/Cargo.toml +7 -7
- data/ext/polars/src/conversion/mod.rs +31 -21
- data/ext/polars/src/dataframe/general.rs +1 -48
- data/ext/polars/src/dataframe/io.rs +13 -9
- data/ext/polars/src/expr/general.rs +3 -0
- data/ext/polars/src/expr/meta.rs +6 -2
- data/ext/polars/src/file.rs +21 -3
- data/ext/polars/src/functions/aggregation.rs +4 -4
- data/ext/polars/src/functions/io.rs +34 -13
- data/ext/polars/src/functions/lazy.rs +5 -4
- data/ext/polars/src/functions/meta.rs +1 -1
- data/ext/polars/src/interop/arrow/to_ruby.rs +2 -2
- data/ext/polars/src/lazyframe/general.rs +48 -5
- data/ext/polars/src/lib.rs +11 -15
- data/ext/polars/src/series/general.rs +3 -15
- data/ext/polars/src/series/import.rs +1 -1
- data/lib/polars/data_frame.rb +179 -51
- data/lib/polars/data_types.rb +1 -1
- data/lib/polars/functions/aggregation/horizontal.rb +10 -4
- data/lib/polars/functions/lazy.rb +7 -3
- data/lib/polars/io/delta.rb +126 -0
- data/lib/polars/lazy_frame.rb +35 -5
- data/lib/polars/selectors.rb +85 -3
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +15 -0
- metadata +5 -8
@@ -0,0 +1,126 @@
|
|
1
|
+
module Polars
  module IO
    # Reads into a DataFrame from a Delta lake table.
    #
    # @param source [Object]
    #   DeltaTable or a Path or URI to the root of the Delta lake table.
    # @param version [Object]
    #   Numerical version or timestamp version of the Delta lake table.
    # @param columns [Array]
    #   Columns to select. Accepts a list of column names.
    # @param rechunk [Boolean]
    #   Make sure that all columns are contiguous in memory by
    #   aggregating the chunks into a single array.
    # @param storage_options [Hash]
    #   Extra options for the storage backends supported by `deltalake-rb`.
    # @param delta_table_options [Hash]
    #   Additional keyword arguments while reading a Delta lake Table.
    #
    # @return [DataFrame]
    def read_delta(
      source,
      version: nil,
      columns: nil,
      rechunk: false,
      storage_options: nil,
      delta_table_options: nil
    )
      dl_tbl =
        _get_delta_lake_table(
          source,
          version: version,
          storage_options: storage_options,
          delta_table_options: delta_table_options
        )

      dl_tbl.to_polars(columns: columns, rechunk: rechunk)
    end

    # Lazily read from a Delta lake table.
    #
    # @param source [Object]
    #   DeltaTable or a Path or URI to the root of the Delta lake table.
    # @param version [Object]
    #   Numerical version or timestamp version of the Delta lake table.
    # @param storage_options [Hash]
    #   Extra options for the storage backends supported by `deltalake-rb`.
    # @param delta_table_options [Hash]
    #   Additional keyword arguments while reading a Delta lake Table.
    #
    # @return [LazyFrame]
    def scan_delta(
      source,
      version: nil,
      storage_options: nil,
      delta_table_options: nil
    )
      dl_tbl =
        _get_delta_lake_table(
          source,
          version: version,
          storage_options: storage_options,
          delta_table_options: delta_table_options
        )

      dl_tbl.to_polars(eager: false)
    end

    private

    # Turns a scheme-less table location into an absolute local path and
    # leaves anything that carries a URI scheme (e.g. `s3://...`) untouched.
    #
    # @param table_uri [Object]
    #   String or Pathname pointing at the root of the table.
    # @param strict [Boolean]
    #   Currently unused; kept so existing callers keep working.
    #
    # @return [String]
    def _resolve_delta_lake_uri(table_uri, strict: true)
      require "uri"

      # Pathname and friends are accepted; URI() only understands strings.
      table_uri = table_uri.to_s

      scheme =
        begin
          URI(table_uri).scheme
        rescue ::URI::InvalidURIError
          # Plain local paths (spaces, backslashes, ...) are often not valid
          # URIs; treat them as scheme-less instead of failing.
          nil
        end

      # URI#scheme is nil (never "") when the string has no scheme.
      if scheme.nil? || scheme.empty?
        # The table root is a directory, so only expand the path here.
        # NOTE(review): Utils.normalize_filepath was called in this branch
        # before, but the branch was unreachable; confirm that helper accepts
        # directories before switching back to it.
        ::File.expand_path(table_uri)
      else
        table_uri
      end
    end

    # Builds (or passes through) the DeltaLake::Table for a read/scan call.
    #
    # @param table_path [Object]
    #   An existing DeltaLake::Table (returned as is) or a path/URI.
    # @param version [Object]
    #   String and Time values are applied with `load_as_version` after the
    #   table is opened; any other value (including nil) is handed to the
    #   constructor as `version:`.
    # @param storage_options [Hash]
    #   Forwarded to `DeltaLake::Table.new`.
    # @param delta_table_options [Hash]
    #   Extra keyword arguments forwarded to `DeltaLake::Table.new`.
    #
    # @return [DeltaLake::Table]
    # @raise [Error] if the DeltaLake constant is not defined.
    def _get_delta_lake_table(
      table_path,
      version: nil,
      storage_options: nil,
      delta_table_options: nil
    )
      _check_if_delta_available

      return table_path if table_path.is_a?(DeltaLake::Table)

      delta_table_options ||= {}
      resolved_uri = _resolve_delta_lake_uri(table_path)

      if version.is_a?(::String) || version.is_a?(::Time)
        dl_tbl =
          DeltaLake::Table.new(
            resolved_uri,
            storage_options: storage_options,
            **delta_table_options
          )
        dl_tbl.load_as_version(version)
        dl_tbl
      else
        DeltaLake::Table.new(
          resolved_uri,
          version: version,
          storage_options: storage_options,
          **delta_table_options
        )
      end
    end

    # Raises unless the DeltaLake constant has been loaded by the caller.
    #
    # @return [nil]
    # @raise [Error]
    def _check_if_delta_available
      raise Error, "Delta Lake not available" unless defined?(DeltaLake)
    end
  end
end
|
data/lib/polars/lazy_frame.rb
CHANGED
@@ -431,7 +431,9 @@ module Polars
|
|
431
431
|
projection_pushdown: true,
|
432
432
|
simplify_expression: true,
|
433
433
|
no_optimization: false,
|
434
|
-
slice_pushdown: true
|
434
|
+
slice_pushdown: true,
|
435
|
+
storage_options: nil,
|
436
|
+
retries: 2
|
435
437
|
)
|
436
438
|
lf = _set_sink_optimizations(
|
437
439
|
type_coercion: type_coercion,
|
@@ -460,6 +462,12 @@ module Polars
|
|
460
462
|
}
|
461
463
|
end
|
462
464
|
|
465
|
+
if storage_options&.any?
|
466
|
+
storage_options = storage_options.to_a
|
467
|
+
else
|
468
|
+
storage_options = nil
|
469
|
+
end
|
470
|
+
|
463
471
|
lf.sink_parquet(
|
464
472
|
path,
|
465
473
|
compression,
|
@@ -467,7 +475,9 @@ module Polars
|
|
467
475
|
statistics,
|
468
476
|
row_group_size,
|
469
477
|
data_pagesize_limit,
|
470
|
-
maintain_order
|
478
|
+
maintain_order,
|
479
|
+
storage_options,
|
480
|
+
retries
|
471
481
|
)
|
472
482
|
end
|
473
483
|
|
@@ -512,6 +522,10 @@ module Polars
|
|
512
522
|
slice_pushdown: true,
|
513
523
|
no_optimization: false
|
514
524
|
)
|
525
|
+
# TODO support storage options in Rust
|
526
|
+
storage_options = nil
|
527
|
+
retries = 2
|
528
|
+
|
515
529
|
lf = _set_sink_optimizations(
|
516
530
|
type_coercion: type_coercion,
|
517
531
|
predicate_pushdown: predicate_pushdown,
|
@@ -521,10 +535,18 @@ module Polars
|
|
521
535
|
no_optimization: no_optimization
|
522
536
|
)
|
523
537
|
|
538
|
+
if storage_options&.any?
|
539
|
+
storage_options = storage_options.to_a
|
540
|
+
else
|
541
|
+
storage_options = nil
|
542
|
+
end
|
543
|
+
|
524
544
|
lf.sink_ipc(
|
525
545
|
path,
|
526
546
|
compression,
|
527
|
-
maintain_order
|
547
|
+
maintain_order,
|
548
|
+
storage_options,
|
549
|
+
retries
|
528
550
|
)
|
529
551
|
end
|
530
552
|
|
@@ -692,7 +714,9 @@ module Polars
|
|
692
714
|
projection_pushdown: true,
|
693
715
|
simplify_expression: true,
|
694
716
|
slice_pushdown: true,
|
695
|
-
no_optimization: false
|
717
|
+
no_optimization: false,
|
718
|
+
storage_options: nil,
|
719
|
+
retries: 2
|
696
720
|
)
|
697
721
|
lf = _set_sink_optimizations(
|
698
722
|
type_coercion: type_coercion,
|
@@ -703,7 +727,13 @@ module Polars
|
|
703
727
|
no_optimization: no_optimization
|
704
728
|
)
|
705
729
|
|
706
|
-
|
730
|
+
if storage_options&.any?
|
731
|
+
storage_options = storage_options.to_a
|
732
|
+
else
|
733
|
+
storage_options = nil
|
734
|
+
end
|
735
|
+
|
736
|
+
lf.sink_json(path, maintain_order, storage_options, retries)
|
707
737
|
end
|
708
738
|
|
709
739
|
# @private
|
data/lib/polars/selectors.rb
CHANGED
@@ -372,9 +372,91 @@ module Polars
|
|
372
372
|
# def by_index
|
373
373
|
# end
|
374
374
|
|
375
|
-
#
|
376
|
-
#
|
377
|
-
#
|
375
|
+
# Select all columns matching the given names.
|
376
|
+
#
|
377
|
+
# @param names [Array]
|
378
|
+
# One or more names of columns to select.
|
379
|
+
# @param require_all [Boolean]
|
380
|
+
# Whether to match *all* names (the default) or *any* of the names.
|
381
|
+
#
|
382
|
+
# @return [SelectorProxy]
|
383
|
+
#
|
384
|
+
# @note
|
385
|
+
# Matching columns are returned in the order in which they are declared in
|
386
|
+
# the selector, not the underlying schema order.
|
387
|
+
#
|
388
|
+
# @example
|
389
|
+
# df = Polars::DataFrame.new(
|
390
|
+
# {
|
391
|
+
# "foo" => ["x", "y"],
|
392
|
+
# "bar" => [123, 456],
|
393
|
+
# "baz" => [2.0, 5.5],
|
394
|
+
# "zap" => [false, true]
|
395
|
+
# }
|
396
|
+
# )
|
397
|
+
#
|
398
|
+
# @example Select columns by name:
|
399
|
+
# df.select(Polars.cs.by_name("foo", "bar"))
|
400
|
+
# # =>
|
401
|
+
# # shape: (2, 2)
|
402
|
+
# # ┌─────┬─────┐
|
403
|
+
# # │ foo ┆ bar │
|
404
|
+
# # │ --- ┆ --- │
|
405
|
+
# # │ str ┆ i64 │
|
406
|
+
# # ╞═════╪═════╡
|
407
|
+
# # │ x ┆ 123 │
|
408
|
+
# # │ y ┆ 456 │
|
409
|
+
# # └─────┴─────┘
|
410
|
+
#
|
411
|
+
# @example Match *any* of the given columns by name:
|
412
|
+
# df.select(Polars.cs.by_name("baz", "moose", "foo", "bear", require_all: false))
|
413
|
+
# # =>
|
414
|
+
# # shape: (2, 2)
|
415
|
+
# # ┌─────┬─────┐
|
416
|
+
# # │ foo ┆ baz │
|
417
|
+
# # │ --- ┆ --- │
|
418
|
+
# # │ str ┆ f64 │
|
419
|
+
# # ╞═════╪═════╡
|
420
|
+
# # │ x ┆ 2.0 │
|
421
|
+
# # │ y ┆ 5.5 │
|
422
|
+
# # └─────┴─────┘
|
423
|
+
#
|
424
|
+
# @example Match all columns *except* for those given:
|
425
|
+
# df.select(~Polars.cs.by_name("foo", "bar"))
|
426
|
+
# # =>
|
427
|
+
# # shape: (2, 2)
|
428
|
+
# # ┌─────┬───────┐
|
429
|
+
# # │ baz ┆ zap │
|
430
|
+
# # │ --- ┆ --- │
|
431
|
+
# # │ f64 ┆ bool │
|
432
|
+
# # ╞═════╪═══════╡
|
433
|
+
# # │ 2.0 ┆ false │
|
434
|
+
# # │ 5.5 ┆ true │
|
435
|
+
# # └─────┴───────┘
|
436
|
+
def self.by_name(*names, require_all: true)
  # Names may be given individually or as arrays of names; arrays are
  # expanded one level so `by_name(["a", "b"])` behaves like `by_name("a", "b")`.
  all_names = names.flat_map { |nm| nm.is_a?(::Array) ? nm : [nm] }

  all_names.each do |nm|
    raise TypeError, "invalid name: #{nm.inspect}" unless nm.is_a?(::String)
  end

  selector_params = {"*names" => all_names}
  match_cols = all_names
  unless require_all
    # Matching *any* name is expressed as one anchored alternation pattern
    # (escaped so names are taken literally) handed to F.col.
    escaped = all_names.map { |nm| Utils.re_escape(nm) }
    match_cols = "^(#{escaped.join("|")})$"
    selector_params["require_all"] = require_all
  end

  _selector_proxy_(
    F.col(match_cols),
    name: "by_name",
    parameters: selector_params
  )
end
|
378
460
|
|
379
461
|
# Select all categorical columns.
|
380
462
|
#
|
data/lib/polars/version.rb
CHANGED
data/lib/polars.rb
CHANGED
@@ -49,6 +49,7 @@ require_relative "polars/group_by"
|
|
49
49
|
require_relative "polars/io/avro"
|
50
50
|
require_relative "polars/io/csv"
|
51
51
|
require_relative "polars/io/database"
|
52
|
+
require_relative "polars/io/delta"
|
52
53
|
require_relative "polars/io/ipc"
|
53
54
|
require_relative "polars/io/json"
|
54
55
|
require_relative "polars/io/ndjson"
|
@@ -89,4 +90,18 @@ module Polars
|
|
89
90
|
|
90
91
|
# @private
|
91
92
|
N_INFER_DEFAULT = 100
|
93
|
+
|
94
|
+
# @private
|
95
|
+
class ArrowArrayStream
  # Returns the receiver itself when asked for an Arrow C stream.
  #
  # @return [ArrowArrayStream]
  def arrow_c_stream
    itself
  end
end
|
100
|
+
|
101
|
+
# Return the number of threads in the Polars thread pool.
|
102
|
+
#
|
103
|
+
# @return [Integer]
|
104
|
+
def self.thread_pool_size
  # Delegates to Plr and hands its answer back unchanged.
  pool_size = Plr.thread_pool_size
  pool_size
end
|
92
107
|
end
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polars-df
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.0
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date: 2024-
|
10
|
+
date: 2024-12-29 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: bigdecimal
|
@@ -38,7 +37,6 @@ dependencies:
|
|
38
37
|
- - ">="
|
39
38
|
- !ruby/object:Gem::Version
|
40
39
|
version: '0'
|
41
|
-
description:
|
42
40
|
email: andrew@ankane.org
|
43
41
|
executables: []
|
44
42
|
extensions:
|
@@ -160,6 +158,7 @@ files:
|
|
160
158
|
- lib/polars/io/avro.rb
|
161
159
|
- lib/polars/io/csv.rb
|
162
160
|
- lib/polars/io/database.rb
|
161
|
+
- lib/polars/io/delta.rb
|
163
162
|
- lib/polars/io/ipc.rb
|
164
163
|
- lib/polars/io/json.rb
|
165
164
|
- lib/polars/io/ndjson.rb
|
@@ -194,7 +193,6 @@ homepage: https://github.com/ankane/ruby-polars
|
|
194
193
|
licenses:
|
195
194
|
- MIT
|
196
195
|
metadata: {}
|
197
|
-
post_install_message:
|
198
196
|
rdoc_options: []
|
199
197
|
require_paths:
|
200
198
|
- lib
|
@@ -202,15 +200,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
202
200
|
requirements:
|
203
201
|
- - ">="
|
204
202
|
- !ruby/object:Gem::Version
|
205
|
-
version: '3.
|
203
|
+
version: '3.2'
|
206
204
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
207
205
|
requirements:
|
208
206
|
- - ">="
|
209
207
|
- !ruby/object:Gem::Version
|
210
208
|
version: '0'
|
211
209
|
requirements: []
|
212
|
-
rubygems_version: 3.
|
213
|
-
signing_key:
|
210
|
+
rubygems_version: 3.6.2
|
214
211
|
specification_version: 4
|
215
212
|
summary: Blazingly fast DataFrames for Ruby
|
216
213
|
test_files: []
|