polars-df 0.21.1-aarch64-linux-musl → 0.23.0-aarch64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/Cargo.lock +120 -90
- data/Cargo.toml +3 -0
- data/LICENSE-THIRD-PARTY.txt +1835 -1836
- data/README.md +20 -7
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/3.4/polars.so +0 -0
- data/lib/polars/array_expr.rb +1 -1
- data/lib/polars/data_frame.rb +119 -15
- data/lib/polars/data_types.rb +23 -6
- data/lib/polars/date_time_expr.rb +36 -15
- data/lib/polars/expr.rb +41 -32
- data/lib/polars/functions/business.rb +95 -0
- data/lib/polars/functions/lazy.rb +1 -1
- data/lib/polars/iceberg_dataset.rb +113 -0
- data/lib/polars/io/iceberg.rb +34 -0
- data/lib/polars/io/ipc.rb +28 -49
- data/lib/polars/io/parquet.rb +7 -4
- data/lib/polars/io/scan_options.rb +12 -3
- data/lib/polars/io/utils.rb +17 -0
- data/lib/polars/lazy_frame.rb +97 -10
- data/lib/polars/list_expr.rb +21 -13
- data/lib/polars/list_name_space.rb +33 -21
- data/lib/polars/meta_expr.rb +25 -0
- data/lib/polars/query_opt_flags.rb +50 -0
- data/lib/polars/scan_cast_options.rb +23 -1
- data/lib/polars/schema.rb +1 -1
- data/lib/polars/selectors.rb +8 -8
- data/lib/polars/series.rb +26 -2
- data/lib/polars/string_expr.rb +27 -28
- data/lib/polars/string_name_space.rb +18 -5
- data/lib/polars/utils/convert.rb +2 -2
- data/lib/polars/utils/serde.rb +17 -0
- data/lib/polars/utils/various.rb +4 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +8 -2
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
module Polars
  # Adapter that turns an Iceberg Ruby table (`source`) into a Polars lazy scan.
  #
  # @private
  class IcebergDataset
    # Iceberg primitive type name => type name written into the column
    # mapping that is passed to `Polars.scan_parquet`.
    ARROW_TYPES = {
      "boolean" => "boolean",
      "int" => "int32",
      "long" => "int64",
      "float" => "float32",
      "double" => "float64"
    }.freeze
    private_constant :ARROW_TYPES

    # @param source [Object]
    #   The Iceberg table object to read from.
    # @param snapshot_id [Integer, nil]
    #   Snapshot forwarded to the table scan.
    # @param storage_options [Hash, nil]
    #   Options forwarded unchanged to the underlying reader.
    def initialize(source, snapshot_id:, storage_options:)
      @source = source
      @snapshot_id = snapshot_id
      @storage_options = storage_options
    end

    # Build the lazy query for the table.
    #
    # Three paths exist:
    # * the source has no `scan` method: delegate to `source.to_polars`;
    # * the scan plans no files: return an empty `LazyFrame` that only
    #   carries the table schema;
    # * otherwise: scan the planned Parquet files, passing along the
    #   column mapping and the position-delete files.
    #
    # @return [LazyFrame]
    def to_lazyframe
      # for iceberg < 0.1.3
      unless @source.respond_to?(:scan)
        return @source.to_polars(snapshot_id: @snapshot_id, storage_options: @storage_options)
      end

      scan = @source.scan(snapshot_id: @snapshot_id)
      files = scan.plan_files

      table = scan.table
      snapshot = scan.snapshot
      # Use the schema the snapshot was written with when there is one.
      schema = snapshot ? table.schema_by_id(snapshot[:schema_id]) : table.current_schema

      return empty_lazyframe(schema) if files.empty?

      sources = files.map { |file| file[:data_file_path] }
      column_mapping = ["iceberg-column-mapping", arrow_schema(schema)]
      deletion_files = ["iceberg-position-delete", position_deletes(files)]

      Polars.scan_parquet(
        sources,
        storage_options: @storage_options,
        cast_options: Polars::ScanCastOptions._default_iceberg,
        allow_missing_columns: true,
        extra_columns: "ignore",
        _column_mapping: column_mapping,
        _deletion_files: deletion_files
      )
    end

    private

    # Empty frame whose schema mirrors the Iceberg schema.
    def empty_lazyframe(schema)
      # TODO improve
      polars_schema = schema.fields.to_h { |field| [field[:name], polars_dtype(field[:type])] }
      LazyFrame.new(schema: polars_schema)
    end

    # Polars dtype for an Iceberg primitive type name.
    # Resolved at call time, so no Polars constant is touched while this
    # file is being loaded. Raises `Todo` for any type not listed here.
    def polars_dtype(type)
      case type
      when "boolean" then Polars::Boolean
      when "int" then Polars::Int32
      when "long" then Polars::Int64
      when "float" then Polars::Float32
      when "double" then Polars::Float64
      when "string" then Polars::String
      when "timestamp" then Polars::Datetime
      else raise Todo
      end
    end

    # Hash of data-file index (position within the planned files) =>
    # paths of that file's delete files. Files without deletes are omitted.
    def position_deletes(files)
      files.each_with_index
        .select { |file, _index| file[:deletes].any? }
        .to_h { |file, index| [index, file[:deletes].map { |delete| delete[:file_path] }] }
    end

    # Describe the Iceberg schema as `{fields: [...]}`, tagging every field
    # with its Iceberg field id under the "PARQUET:field_id" metadata key.
    # Raises `Todo` for a type missing from ARROW_TYPES.
    def arrow_schema(schema)
      fields =
        schema.fields.map do |field|
          {
            name: field[:name],
            type: ARROW_TYPES.fetch(field[:type]) { raise Todo },
            nullable: !field[:required],
            metadata: {
              "PARQUET:field_id" => field[:id].to_s
            }
          }
        end

      {fields: fields}
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
module Polars
  module IO
    # Lazily read from an Apache Iceberg table.
    #
    # @param source [Object]
    #   An Iceberg Ruby table (`Iceberg::Table`). Any other kind of source
    #   (for example a path to the metadata) currently raises `Todo`.
    # @param snapshot_id [Integer]
    #   The snapshot ID to scan from.
    # @param storage_options [Hash]
    #   Extra options for the storage backends.
    #
    # @return [LazyFrame]
    def scan_iceberg(
      source,
      snapshot_id: nil,
      storage_options: nil
    )
      # Required at call time rather than when this file is loaded.
      require "iceberg"

      raise Todo unless source.is_a?(Iceberg::Table)

      IcebergDataset
        .new(source, snapshot_id: snapshot_id, storage_options: storage_options)
        .to_lazyframe
    end
  end
end
|
data/lib/polars/io/ipc.rb
CHANGED
|
@@ -187,8 +187,16 @@ module Polars
|
|
|
187
187
|
# DataFrame.
|
|
188
188
|
# @param row_count_offset [Integer]
|
|
189
189
|
# Offset to start the row_count column (only use if the name is set).
|
|
190
|
+
# @param glob [Boolean]
|
|
191
|
+
# Expand path given via globbing rules.
|
|
190
192
|
# @param storage_options [Hash]
|
|
191
193
|
# Extra options that make sense for a particular storage connection.
|
|
194
|
+
# @param retries [Integer]
|
|
195
|
+
# Number of retries if accessing a cloud instance fails.
|
|
196
|
+
# @param file_cache_ttl [Integer]
|
|
197
|
+
# Amount of time to keep downloaded cloud files since their last access time,
|
|
198
|
+
# in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
|
|
199
|
+
# (which defaults to 1 hour) if not given.
|
|
192
200
|
# @param hive_partitioning [Boolean]
|
|
193
201
|
# Infer statistics and schema from Hive partitioned URL and use them
|
|
194
202
|
# to prune reads. This is unset by default (i.e. `nil`), meaning it is
|
|
@@ -210,66 +218,37 @@ module Polars
|
|
|
210
218
|
rechunk: true,
|
|
211
219
|
row_count_name: nil,
|
|
212
220
|
row_count_offset: 0,
|
|
221
|
+
glob: true,
|
|
213
222
|
storage_options: nil,
|
|
223
|
+
retries: 2,
|
|
224
|
+
file_cache_ttl: nil,
|
|
214
225
|
hive_partitioning: nil,
|
|
215
226
|
hive_schema: nil,
|
|
216
227
|
try_parse_hive_dates: true,
|
|
217
228
|
include_file_paths: nil
|
|
218
229
|
)
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
n_rows: n_rows,
|
|
222
|
-
cache: cache,
|
|
223
|
-
rechunk: rechunk,
|
|
224
|
-
row_count_name: row_count_name,
|
|
225
|
-
row_count_offset: row_count_offset,
|
|
226
|
-
storage_options: storage_options,
|
|
227
|
-
hive_partitioning: hive_partitioning,
|
|
228
|
-
hive_schema: hive_schema,
|
|
229
|
-
try_parse_hive_dates: try_parse_hive_dates,
|
|
230
|
-
include_file_paths: include_file_paths
|
|
231
|
-
)
|
|
232
|
-
end
|
|
233
|
-
|
|
234
|
-
# @private
|
|
235
|
-
def _scan_ipc_impl(
|
|
236
|
-
source,
|
|
237
|
-
n_rows: nil,
|
|
238
|
-
cache: true,
|
|
239
|
-
rechunk: true,
|
|
240
|
-
row_count_name: nil,
|
|
241
|
-
row_count_offset: 0,
|
|
242
|
-
storage_options: nil,
|
|
243
|
-
hive_partitioning: nil,
|
|
244
|
-
hive_schema: nil,
|
|
245
|
-
try_parse_hive_dates: true,
|
|
246
|
-
include_file_paths: nil
|
|
247
|
-
)
|
|
248
|
-
sources = []
|
|
249
|
-
if Utils.pathlike?(source)
|
|
250
|
-
source = Utils.normalize_filepath(source)
|
|
251
|
-
elsif source.is_a?(::Array)
|
|
252
|
-
if Utils.is_path_or_str_sequence(source)
|
|
253
|
-
sources = source.map { |s| Utils.normalize_filepath(s) }
|
|
254
|
-
else
|
|
255
|
-
sources = source
|
|
256
|
-
end
|
|
230
|
+
row_index_name = row_count_name
|
|
231
|
+
row_index_offset = row_count_offset
|
|
257
232
|
|
|
258
|
-
|
|
259
|
-
end
|
|
233
|
+
sources = get_sources(source)
|
|
260
234
|
|
|
261
235
|
rblf =
|
|
262
236
|
RbLazyFrame.new_from_ipc(
|
|
263
|
-
source,
|
|
264
237
|
sources,
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
238
|
+
ScanOptions.new(
|
|
239
|
+
row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
|
|
240
|
+
pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
|
|
241
|
+
include_file_paths: include_file_paths,
|
|
242
|
+
glob: glob,
|
|
243
|
+
hive_partitioning: hive_partitioning,
|
|
244
|
+
hive_schema: hive_schema,
|
|
245
|
+
try_parse_hive_dates: try_parse_hive_dates,
|
|
246
|
+
rechunk: rechunk,
|
|
247
|
+
cache: cache,
|
|
248
|
+
storage_options: !storage_options.nil? ? storage_options.to_a : nil,
|
|
249
|
+
retries: retries
|
|
250
|
+
),
|
|
251
|
+
file_cache_ttl
|
|
273
252
|
)
|
|
274
253
|
Utils.wrap_ldf(rblf)
|
|
275
254
|
end
|
data/lib/polars/io/parquet.rb
CHANGED
|
@@ -117,14 +117,13 @@ module Polars
|
|
|
117
117
|
# @param source [Object]
|
|
118
118
|
# Path to a file or a file-like object.
|
|
119
119
|
#
|
|
120
|
-
# @return [
|
|
120
|
+
# @return [Schema]
|
|
121
121
|
def read_parquet_schema(source)
|
|
122
122
|
if Utils.pathlike?(source)
|
|
123
123
|
source = Utils.normalize_filepath(source)
|
|
124
124
|
end
|
|
125
125
|
|
|
126
|
-
|
|
127
|
-
scan_parquet(source).collect_schema.to_h
|
|
126
|
+
scan_parquet(source).collect_schema
|
|
128
127
|
end
|
|
129
128
|
|
|
130
129
|
# Get file-level custom metadata of a Parquet file without reading data.
|
|
@@ -207,6 +206,9 @@ module Polars
|
|
|
207
206
|
# defined schema are encountered in the data:
|
|
208
207
|
# * `ignore`: Silently ignores.
|
|
209
208
|
# * `raise`: Raises an error.
|
|
209
|
+
# @param cast_options [Object]
|
|
210
|
+
# Configuration for column type-casting during scans. Useful for datasets
|
|
211
|
+
# containing files that have differing schemas.
|
|
210
212
|
#
|
|
211
213
|
# @return [LazyFrame]
|
|
212
214
|
def scan_parquet(
|
|
@@ -230,6 +232,7 @@ module Polars
|
|
|
230
232
|
include_file_paths: nil,
|
|
231
233
|
allow_missing_columns: false,
|
|
232
234
|
extra_columns: "raise",
|
|
235
|
+
cast_options: nil,
|
|
233
236
|
_column_mapping: nil,
|
|
234
237
|
_deletion_files: nil
|
|
235
238
|
)
|
|
@@ -268,7 +271,7 @@ module Polars
|
|
|
268
271
|
ScanOptions.new(
|
|
269
272
|
row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
|
|
270
273
|
pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
|
|
271
|
-
|
|
274
|
+
cast_options: cast_options,
|
|
272
275
|
extra_columns: extra_columns,
|
|
273
276
|
missing_columns: missing_columns,
|
|
274
277
|
include_file_paths: include_file_paths,
|
|
@@ -2,8 +2,9 @@ module Polars
|
|
|
2
2
|
module IO
|
|
3
3
|
class ScanOptions
|
|
4
4
|
attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
|
|
5
|
-
:include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
|
|
6
|
-
:rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
|
|
5
|
+
:include_file_paths, :glob, :hidden_file_prefix, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
|
|
6
|
+
:rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
|
|
7
|
+
:default_values, :deletion_files, :table_statistics, :row_count
|
|
7
8
|
|
|
8
9
|
def initialize(
|
|
9
10
|
row_index: nil,
|
|
@@ -13,6 +14,7 @@ module Polars
|
|
|
13
14
|
missing_columns: "raise",
|
|
14
15
|
include_file_paths: nil,
|
|
15
16
|
glob: true,
|
|
17
|
+
hidden_file_prefix: nil,
|
|
16
18
|
hive_partitioning: nil,
|
|
17
19
|
hive_schema: nil,
|
|
18
20
|
try_parse_hive_dates: true,
|
|
@@ -22,7 +24,10 @@ module Polars
|
|
|
22
24
|
credential_provider: nil,
|
|
23
25
|
retries: 2,
|
|
24
26
|
column_mapping: nil,
|
|
25
|
-
|
|
27
|
+
default_values: nil,
|
|
28
|
+
deletion_files: nil,
|
|
29
|
+
table_statistics: nil,
|
|
30
|
+
row_count: nil
|
|
26
31
|
)
|
|
27
32
|
@row_index = row_index
|
|
28
33
|
@pre_slice = pre_slice
|
|
@@ -31,6 +36,7 @@ module Polars
|
|
|
31
36
|
@missing_columns = missing_columns
|
|
32
37
|
@include_file_paths = include_file_paths
|
|
33
38
|
@glob = glob
|
|
39
|
+
@hidden_file_prefix = hidden_file_prefix
|
|
34
40
|
@hive_partitioning = hive_partitioning
|
|
35
41
|
@hive_schema = hive_schema
|
|
36
42
|
@try_parse_hive_dates = try_parse_hive_dates
|
|
@@ -40,7 +46,10 @@ module Polars
|
|
|
40
46
|
@credential_provider = credential_provider
|
|
41
47
|
@retries = retries
|
|
42
48
|
@column_mapping = column_mapping
|
|
49
|
+
@default_values = default_values
|
|
43
50
|
@deletion_files = deletion_files
|
|
51
|
+
@table_statistics = table_statistics
|
|
52
|
+
@row_count = row_count
|
|
44
53
|
end
|
|
45
54
|
end
|
|
46
55
|
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Polars
  module IO
    private

    # Normalize a scan source into an array of sources.
    #
    # A single path-like value, or a sequence of paths/strings, is passed
    # through `Utils.normalize_filepath` with the directory check disabled.
    # Any other value is left untouched. A value that is not already an
    # array is wrapped in one.
    def get_sources(source)
      normalize = ->(path) { Utils.normalize_filepath(path, check_not_directory: false) }

      resolved =
        if Utils.pathlike?(source)
          normalize.(source)
        elsif Utils.is_path_or_str_sequence(source)
          source.map { |item| normalize.(item) }
        else
          source
        end

      resolved.is_a?(::Array) ? resolved : [resolved]
    end
  end
end
|
data/lib/polars/lazy_frame.rb
CHANGED
|
@@ -27,9 +27,6 @@ module Polars
|
|
|
27
27
|
ldf
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
# def self.from_json
|
|
31
|
-
# end
|
|
32
|
-
|
|
33
30
|
# Read a logical plan from a JSON file to construct a LazyFrame.
|
|
34
31
|
#
|
|
35
32
|
# @param file [String]
|
|
@@ -41,7 +38,49 @@ module Polars
|
|
|
41
38
|
file = Utils.normalize_filepath(file)
|
|
42
39
|
end
|
|
43
40
|
|
|
44
|
-
Utils.wrap_ldf(RbLazyFrame.
|
|
41
|
+
Utils.wrap_ldf(RbLazyFrame.deserialize_json(file))
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# Read a logical plan from a file to construct a LazyFrame.
|
|
45
|
+
#
|
|
46
|
+
# @param source [Object]
|
|
47
|
+
# Path to a file or a file-like object (by file-like object, we refer to
|
|
48
|
+
# objects that have a `read` method, such as a file handler or `StringIO`).
|
|
49
|
+
#
|
|
50
|
+
# @return [LazyFrame]
|
|
51
|
+
#
|
|
52
|
+
# @note
|
|
53
|
+
# This function uses marshaling if the logical plan contains Ruby UDFs,
|
|
54
|
+
# and as such inherits the security implications. Deserializing can execute
|
|
55
|
+
# arbitrary code, so it should only be attempted on trusted data.
|
|
56
|
+
#
|
|
57
|
+
# @note
|
|
58
|
+
# Serialization is not stable across Polars versions: a LazyFrame serialized
|
|
59
|
+
# in one Polars version may not be deserializable in another Polars version.
|
|
60
|
+
#
|
|
61
|
+
# @example
|
|
62
|
+
# lf = Polars::LazyFrame.new({"a" => [1, 2, 3]}).sum
|
|
63
|
+
# bytes = lf.serialize
|
|
64
|
+
# Polars::LazyFrame.deserialize(StringIO.new(bytes)).collect
|
|
65
|
+
# # =>
|
|
66
|
+
# # shape: (1, 1)
|
|
67
|
+
# # ┌─────┐
|
|
68
|
+
# # │ a │
|
|
69
|
+
# # │ --- │
|
|
70
|
+
# # │ i64 │
|
|
71
|
+
# # ╞═════╡
|
|
72
|
+
# # │ 6 │
|
|
73
|
+
# # └─────┘
|
|
74
|
+
def self.deserialize(source)
  # Not available when the native class lacks binary deserialization.
  raise Todo unless RbLazyFrame.respond_to?(:deserialize_binary)

  # Path-like sources are normalized; anything else is handed over as given.
  source = Utils.normalize_filepath(source) if Utils.pathlike?(source)

  _from_rbldf(RbLazyFrame.deserialize_binary(source))
end
|
|
46
85
|
|
|
47
86
|
# Get or set column names.
|
|
@@ -151,6 +190,38 @@ module Polars
|
|
|
151
190
|
nil
|
|
152
191
|
end
|
|
153
192
|
|
|
193
|
+
# Serialize the logical plan of this LazyFrame to a file or string.
|
|
194
|
+
#
|
|
195
|
+
# @param file [Object]
|
|
196
|
+
# File path to which the result should be written. If set to `nil`
|
|
197
|
+
# (default), the output is returned as a string instead.
|
|
198
|
+
#
|
|
199
|
+
# @return [Object]
|
|
200
|
+
#
|
|
201
|
+
# @note
|
|
202
|
+
# Serialization is not stable across Polars versions: a LazyFrame serialized
|
|
203
|
+
# in one Polars version may not be deserializable in another Polars version.
|
|
204
|
+
#
|
|
205
|
+
# @example Serialize the logical plan into a binary representation.
|
|
206
|
+
# lf = Polars::LazyFrame.new({"a" => [1, 2, 3]}).sum
|
|
207
|
+
# bytes = lf.serialize
|
|
208
|
+
# Polars::LazyFrame.deserialize(StringIO.new(bytes)).collect
|
|
209
|
+
# # =>
|
|
210
|
+
# # shape: (1, 1)
|
|
211
|
+
# # ┌─────┐
|
|
212
|
+
# # │ a │
|
|
213
|
+
# # │ --- │
|
|
214
|
+
# # │ i64 │
|
|
215
|
+
# # ╞═════╡
|
|
216
|
+
# # │ 6 │
|
|
217
|
+
# # └─────┘
|
|
218
|
+
def serialize(file = nil)
  if _ldf.respond_to?(:serialize_binary)
    # Hand the bound native method to the shared helper together with the
    # optional target file.
    Utils.serialize_polars_object(_ldf.method(:serialize_binary), file)
  else
    raise Todo
  end
end
|
|
224
|
+
|
|
154
225
|
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
|
155
226
|
#
|
|
156
227
|
# @param func [Object]
|
|
@@ -774,6 +845,21 @@ module Polars
|
|
|
774
845
|
# @param maintain_order [Boolean]
|
|
775
846
|
# Maintain the order in which data is processed.
|
|
776
847
|
# Setting this to `false` will be slightly faster.
|
|
848
|
+
# @param storage_options [Hash]
|
|
849
|
+
# Options that indicate how to connect to a cloud provider.
|
|
850
|
+
#
|
|
851
|
+
# The cloud providers currently supported are AWS, GCP, and Azure.
|
|
852
|
+
# See supported keys here:
|
|
853
|
+
#
|
|
854
|
+
# * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
|
|
855
|
+
# * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
|
|
856
|
+
# * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
|
|
857
|
+
# * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
|
858
|
+
#
|
|
859
|
+
# If `storage_options` is not provided, Polars will try to infer the
|
|
860
|
+
# information from environment variables.
|
|
861
|
+
# @param retries [Integer]
|
|
862
|
+
# Number of retries if accessing a cloud instance fails.
|
|
777
863
|
# @param type_coercion [Boolean]
|
|
778
864
|
# Do type coercion optimization.
|
|
779
865
|
# @param predicate_pushdown [Boolean]
|
|
@@ -806,6 +892,8 @@ module Polars
|
|
|
806
892
|
path,
|
|
807
893
|
compression: "zstd",
|
|
808
894
|
maintain_order: true,
|
|
895
|
+
storage_options: nil,
|
|
896
|
+
retries: 2,
|
|
809
897
|
type_coercion: true,
|
|
810
898
|
predicate_pushdown: true,
|
|
811
899
|
projection_pushdown: true,
|
|
@@ -816,10 +904,6 @@ module Polars
|
|
|
816
904
|
mkdir: false,
|
|
817
905
|
lazy: false
|
|
818
906
|
)
|
|
819
|
-
# TODO support storage options in Rust
|
|
820
|
-
storage_options = nil
|
|
821
|
-
retries = 2
|
|
822
|
-
|
|
823
907
|
lf = _set_sink_optimizations(
|
|
824
908
|
type_coercion: type_coercion,
|
|
825
909
|
predicate_pushdown: predicate_pushdown,
|
|
@@ -4059,6 +4143,9 @@ module Polars
|
|
|
4059
4143
|
# Names of the struct columns that will be decomposed by its fields
|
|
4060
4144
|
# @param more_columns [Array]
|
|
4061
4145
|
# Additional columns to unnest, specified as positional arguments.
|
|
4146
|
+
# @param separator [String]
|
|
4147
|
+
# Rename output column names as combination of the struct column name,
|
|
4148
|
+
# name separator and field name.
|
|
4062
4149
|
#
|
|
4063
4150
|
# @return [LazyFrame]
|
|
4064
4151
|
#
|
|
@@ -4103,11 +4190,11 @@ module Polars
|
|
|
4103
4190
|
# # │ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
|
|
4104
4191
|
# # │ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
|
|
4105
4192
|
# # └────────┴─────┴─────┴──────┴───────────┴───────┘
|
|
4106
|
-
def unnest(columns, *more_columns)
|
|
4193
|
+
def unnest(columns, *more_columns, separator: nil)
|
|
4107
4194
|
subset = Utils.parse_list_into_selector(columns) | Utils.parse_list_into_selector(
|
|
4108
4195
|
more_columns
|
|
4109
4196
|
)
|
|
4110
|
-
_from_rbldf(_ldf.unnest(subset._rbselector))
|
|
4197
|
+
_from_rbldf(_ldf.unnest(subset._rbselector, separator))
|
|
4111
4198
|
end
|
|
4112
4199
|
|
|
4113
4200
|
# Take two sorted DataFrames and merge them by the sorted key.
|
data/lib/polars/list_expr.rb
CHANGED
|
@@ -925,7 +925,7 @@ module Polars
|
|
|
925
925
|
# Convert the series of type `List` to a series of type `Struct`.
|
|
926
926
|
#
|
|
927
927
|
# @param n_field_strategy ["first_non_null", "max_width"]
|
|
928
|
-
#
|
|
928
|
+
# Deprecated and ignored.
|
|
929
929
|
# @param fields [Object]
|
|
930
930
|
# If the name and number of the desired fields is known in advance
|
|
931
931
|
# a list of field names can be given, which will be assigned by index.
|
|
@@ -945,20 +945,28 @@ module Polars
|
|
|
945
945
|
# @return [Expr]
|
|
946
946
|
#
|
|
947
947
|
# @example
|
|
948
|
-
# df = Polars::DataFrame.new({"
|
|
949
|
-
# df.
|
|
948
|
+
# df = Polars::DataFrame.new({"n" => [[0, 1], [0, 1, 2]]})
|
|
949
|
+
# df.with_columns(struct: Polars.col("n").list.to_struct(upper_bound: 2))
|
|
950
950
|
# # =>
|
|
951
|
-
# # shape: (2,
|
|
952
|
-
# #
|
|
953
|
-
# # │
|
|
954
|
-
# # │ ---
|
|
955
|
-
# # │ struct[
|
|
956
|
-
# #
|
|
957
|
-
# # │ {1
|
|
958
|
-
# # │
|
|
959
|
-
# #
|
|
951
|
+
# # shape: (2, 2)
|
|
952
|
+
# # ┌───────────┬───────────┐
|
|
953
|
+
# # │ n ┆ struct │
|
|
954
|
+
# # │ --- ┆ --- │
|
|
955
|
+
# # │ list[i64] ┆ struct[2] │
|
|
956
|
+
# # ╞═══════════╪═══════════╡
|
|
957
|
+
# # │ [0, 1] ┆ {0,1} │
|
|
958
|
+
# # │ [0, 1, 2] ┆ {0,1} │
|
|
959
|
+
# # └───────────┴───────────┘
|
|
960
960
|
def to_struct(n_field_strategy: "first_non_null", fields: nil, upper_bound: nil)
  # `n_field_strategy` is not used below; only `fields` and `upper_bound`
  # decide the struct layout.
  unless fields.is_a?(::Array)
    # Without an explicit array of names, the number of struct fields can
    # only come from `upper_bound`, so fail clearly instead of calling
    # `times` on nil.
    if upper_bound.nil?
      raise ArgumentError, "`to_struct` requires either `fields` to be an array or `upper_bound` to be set"
    end

    fields =
      if fields.nil?
        upper_bound.times.map { |i| "field_#{i}" }
      else
        # `fields` is a callable that maps a field index to its name.
        upper_bound.times.map { |i| fields.(i) }
      end
  end

  Utils.wrap_expr(_rbexpr.list_to_struct(fields))
end
|
|
963
971
|
|
|
964
972
|
# Run any polars expression against the lists' elements.
|
|
@@ -755,27 +755,39 @@ module Polars
|
|
|
755
755
|
#
|
|
756
756
|
# @param n_field_strategy ["first_non_null", "max_width"]
|
|
757
757
|
# Strategy to determine the number of fields of the struct.
|
|
758
|
-
# @param
|
|
759
|
-
#
|
|
760
|
-
#
|
|
761
|
-
#
|
|
762
|
-
#
|
|
763
|
-
#
|
|
764
|
-
# @
|
|
765
|
-
#
|
|
766
|
-
#
|
|
767
|
-
#
|
|
768
|
-
#
|
|
769
|
-
# #
|
|
770
|
-
# #
|
|
771
|
-
# #
|
|
772
|
-
# # │
|
|
773
|
-
# #
|
|
774
|
-
# # │
|
|
775
|
-
# #
|
|
776
|
-
# #
|
|
777
|
-
|
|
778
|
-
|
|
758
|
+
# @param fields [Object]
|
|
759
|
+
# If the name and number of the desired fields is known in advance
|
|
760
|
+
# a list of field names can be given, which will be assigned by index.
|
|
761
|
+
# Otherwise, to dynamically assign field names, a custom function can be
|
|
762
|
+
# used; if neither are set, fields will be `field_0, field_1 .. field_n`.
|
|
763
|
+
#
|
|
764
|
+
# @return [Series]
|
|
765
|
+
#
|
|
766
|
+
# @example Convert list to struct with field name assignment by index from a list of names:
|
|
767
|
+
# s1 = Polars::Series.new("n", [[0, 1, 2], [0, 1]])
|
|
768
|
+
# s1.list.to_struct(fields: ["one", "two", "three"]).struct.unnest
|
|
769
|
+
# # =>
|
|
770
|
+
# # shape: (2, 3)
|
|
771
|
+
# # ┌─────┬─────┬───────┐
|
|
772
|
+
# # │ one ┆ two ┆ three │
|
|
773
|
+
# # │ --- ┆ --- ┆ --- │
|
|
774
|
+
# # │ i64 ┆ i64 ┆ i64 │
|
|
775
|
+
# # ╞═════╪═════╪═══════╡
|
|
776
|
+
# # │ 0 ┆ 1 ┆ 2 │
|
|
777
|
+
# # │ 0 ┆ 1 ┆ null │
|
|
778
|
+
# # └─────┴─────┴───────┘
|
|
779
|
+
def to_struct(n_field_strategy: "first_non_null", fields: nil)
  # Only an explicit array of field names is handled so far.
  raise Todo unless fields.is_a?(::Array)

  # Route through the expression API: wrap the series in a one-column
  # frame, apply the list-to-struct expression, and unwrap the result.
  series = Utils.wrap_s(_s)
  frame = series.to_frame
  frame.select_seq(F.col(series.name).list.to_struct(fields: fields)).to_series
  # Utils.wrap_s(_s.list_to_struct(n_field_strategy, fields))
end
|
|
780
792
|
|
|
781
793
|
# Run any polars expression against the lists' elements.
|
data/lib/polars/meta_expr.rb
CHANGED
|
@@ -248,6 +248,31 @@ module Polars
|
|
|
248
248
|
Selector._from_rbselector(_rbexpr.into_selector)
|
|
249
249
|
end
|
|
250
250
|
|
|
251
|
+
# Serialize this expression to a file or string.
|
|
252
|
+
#
|
|
253
|
+
# @param file [Object]
|
|
254
|
+
# File path to which the result should be written. If set to `nil`
|
|
255
|
+
# (default), the output is returned as a string instead.
|
|
256
|
+
#
|
|
257
|
+
# @return [Object]
|
|
258
|
+
#
|
|
259
|
+
# @note
|
|
260
|
+
# Serialization is not stable across Polars versions: an expression serialized
|
|
261
|
+
# in one Polars version may not be deserializable in another Polars version.
|
|
262
|
+
#
|
|
263
|
+
# @example Serialize the expression into a binary representation.
|
|
264
|
+
# expr = Polars.col("foo").sum.over("bar")
|
|
265
|
+
# bytes = expr.meta.serialize
|
|
266
|
+
# Polars::Expr.deserialize(StringIO.new(bytes))
|
|
267
|
+
# # => col("foo").sum().over([col("bar")])
|
|
268
|
+
def serialize(file = nil)
  # Bail out when the native expression has no binary serializer.
  supported = _rbexpr.respond_to?(:serialize_binary)
  raise Todo unless supported

  Utils.serialize_polars_object(_rbexpr.method(:serialize_binary), file)
end
|
|
275
|
+
|
|
251
276
|
# Format the expression as a tree.
|
|
252
277
|
#
|
|
253
278
|
# @param return_as_string [Boolean]
|