polars-df 0.13.0-x86_64-linux-musl → 0.15.0-x86_64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Cargo.lock +1368 -319
- data/LICENSE-THIRD-PARTY.txt +24801 -13447
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +285 -62
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +109 -8
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -12
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +470 -40
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +4 -3
- data/lib/polars/functions.rb +0 -57
data/lib/polars/io/parquet.rb
CHANGED
@@ -2,120 +2,108 @@ module Polars
  2   2 |   module IO
  3   3 |   # Read into a DataFrame from a parquet file.
  4   4 |   #
  5     | - # @param source [
      5 | + # @param source [Object]
  6   6 |   # Path to a file or a file-like object.
  7   7 |   # @param columns [Object]
  8   8 |   # Columns to select. Accepts a list of column indices (starting at zero) or a list
  9   9 |   # of column names.
 10  10 |   # @param n_rows [Integer]
 11  11 |   # Stop reading from parquet file after reading `n_rows`.
 12     | - # @param storage_options [Hash]
 13     | - # Extra options that make sense for a particular storage connection.
 14     | - # @param parallel ["auto", "columns", "row_groups", "none"]
 15     | - # This determines the direction of parallelism. 'auto' will try to determine the
 16     | - # optimal direction.
 17  12 |   # @param row_count_name [String]
 18  13 |   # If not nil, this will insert a row count column with give name into the
 19  14 |   # DataFrame.
 20  15 |   # @param row_count_offset [Integer]
 21  16 |   # Offset to start the row_count column (only use if the name is set).
 22     | - # @param
 23     | - #
     17 | + # @param parallel ["auto", "columns", "row_groups", "none"]
     18 | + # This determines the direction of parallelism. 'auto' will try to determine the
     19 | + # optimal direction.
 24  20 |   # @param use_statistics [Boolean]
 25  21 |   # Use statistics in the parquet to determine if pages
 26  22 |   # can be skipped from reading.
     23 | + # @param hive_partitioning [Boolean]
     24 | + # Infer statistics and schema from hive partitioned URL and use them
     25 | + # to prune reads.
     26 | + # @param glob [Boolean]
     27 | + # Expand path given via globbing rules.
     28 | + # @param schema [Object]
     29 | + # Specify the datatypes of the columns. The datatypes must match the
     30 | + # datatypes in the file(s). If there are extra columns that are not in the
     31 | + # file(s), consider also enabling `allow_missing_columns`.
     32 | + # @param hive_schema [Object]
     33 | + # The column names and data types of the columns by which the data is partitioned.
     34 | + # If set to `nil` (default), the schema of the Hive partitions is inferred.
     35 | + # @param try_parse_hive_dates [Boolean]
     36 | + # Whether to try parsing hive values as date/datetime types.
 27  37 |   # @param rechunk [Boolean]
 28     | - #
 29     | - #
     38 | + # In case of reading multiple files via a glob pattern rechunk the final DataFrame
     39 | + # into contiguous memory chunks.
     40 | + # @param low_memory [Boolean]
     41 | + # Reduce memory pressure at the expense of performance.
     42 | + # @param storage_options [Hash]
     43 | + # Extra options that make sense for a particular storage connection.
     44 | + # @param credential_provider [Object]
     45 | + # Provide a function that can be called to provide cloud storage
     46 | + # credentials. The function is expected to return a dictionary of
     47 | + # credential keys along with an optional credential expiry time.
     48 | + # @param retries [Integer]
     49 | + # Number of retries if accessing a cloud instance fails.
     50 | + # @param include_file_paths [String]
     51 | + # Include the path of the source file(s) as a column with this name.
 30  52 |   #
 31  53 |   # @return [DataFrame]
 32     | - #
 33     | - # @note
 34     | - # This operation defaults to a `rechunk` operation at the end, meaning that
 35     | - # all data will be stored continuously in memory.
 36     | - # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
 37     | - # an expensive operation.
 38  54 |   def read_parquet(
 39  55 |   source,
 40  56 |   columns: nil,
 41  57 |   n_rows: nil,
 42     | - storage_options: nil,
 43     | - parallel: "auto",
 44  58 |   row_count_name: nil,
 45  59 |   row_count_offset: 0,
 46     | -
     60 | + parallel: "auto",
 47  61 |   use_statistics: true,
 48     | -
     62 | + hive_partitioning: nil,
     63 | + glob: true,
     64 | + schema: nil,
     65 | + hive_schema: nil,
     66 | + try_parse_hive_dates: true,
     67 | + rechunk: false,
     68 | + low_memory: false,
     69 | + storage_options: nil,
     70 | + credential_provider: nil,
     71 | + retries: 2,
     72 | + include_file_paths: nil,
     73 | + allow_missing_columns: false
 49  74 |   )
 50     | -
 51     | -
 52     | -
 53     | - columns: columns,
     75 | + lf =
     76 | + scan_parquet(
     77 | + source,
 54  78 |   n_rows: n_rows,
 55     | - parallel: parallel,
 56  79 |   row_count_name: row_count_name,
 57  80 |   row_count_offset: row_count_offset,
 58     | -
     81 | + parallel: parallel,
 59  82 |   use_statistics: use_statistics,
 60     | -
     83 | + hive_partitioning: hive_partitioning,
     84 | + schema: schema,
     85 | + hive_schema: hive_schema,
     86 | + try_parse_hive_dates: try_parse_hive_dates,
     87 | + rechunk: rechunk,
     88 | + low_memory: low_memory,
     89 | + cache: false,
     90 | + storage_options: storage_options,
     91 | + credential_provider: credential_provider,
     92 | + retries: retries,
     93 | + glob: glob,
     94 | + include_file_paths: include_file_paths,
     95 | + allow_missing_columns: allow_missing_columns
 61  96 |   )
 62     | - end
 63     | - end
 64  97 |
 65     | -
 66     | -
 67     | -
 68     | - columns: nil,
 69     | - n_rows: nil,
 70     | - parallel: "auto",
 71     | - row_count_name: nil,
 72     | - row_count_offset: 0,
 73     | - low_memory: false,
 74     | - use_statistics: true,
 75     | - rechunk: true
 76     | - )
 77     | - if Utils.pathlike?(source)
 78     | - source = Utils.normalize_filepath(source)
 79     | - end
 80     | - if columns.is_a?(::String)
 81     | - columns = [columns]
 82     | - end
 83     | -
 84     | - if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
 85     | - scan =
 86     | - scan_parquet(
 87     | - source,
 88     | - n_rows: n_rows,
 89     | - rechunk: true,
 90     | - parallel: parallel,
 91     | - row_count_name: row_count_name,
 92     | - row_count_offset: row_count_offset,
 93     | - low_memory: low_memory
 94     | - )
 95     | -
 96     | - if columns.nil?
 97     | - return scan.collect
 98     | - elsif Utils.is_str_sequence(columns, allow_str: false)
 99     | - return scan.select(columns).collect
     98 | + if !columns.nil?
     99 | + if Utils.is_int_sequence(columns)
    100 | + lf = lf.select(F.nth(columns))
100 101 |   else
101     | -
    102 | + lf = lf.select(columns)
102 103 |   end
103 104 |   end
104 105 |
105     | -
106     | - rbdf =
107     | - RbDataFrame.read_parquet(
108     | - source,
109     | - columns,
110     | - projection,
111     | - n_rows,
112     | - parallel,
113     | - Utils.parse_row_index_args(row_count_name, row_count_offset),
114     | - low_memory,
115     | - use_statistics,
116     | - rechunk
117     | - )
118     | - Utils.wrap_df(rbdf)
    106 | + lf.collect
119 107 |   end
120 108 |
121 109 |   # Get a schema of the Parquet file without reading data.
@@ -137,46 +125,83 @@ module Polars
137 125 |   # This allows the query optimizer to push down predicates and projections to the scan
138 126 |   # level, thereby potentially reducing memory overhead.
139 127 |   #
140     | - # @param source [
141     | - # Path to a file.
    128 | + # @param source [Object]
    129 | + # Path to a file or a file-like object.
142 130 |   # @param n_rows [Integer]
143 131 |   # Stop reading from parquet file after reading `n_rows`.
144     | - # @param
145     | - #
    132 | + # @param row_count_name [String]
    133 | + # If not nil, this will insert a row count column with give name into the
    134 | + # DataFrame.
    135 | + # @param row_count_offset [Integer]
    136 | + # Offset to start the row_count column (only use if the name is set).
146 137 |   # @param parallel ["auto", "columns", "row_groups", "none"]
147 138 |   # This determines the direction of parallelism. 'auto' will try to determine the
148 139 |   # optimal direction.
    140 | + # @param use_statistics [Boolean]
    141 | + # Use statistics in the parquet to determine if pages
    142 | + # can be skipped from reading.
    143 | + # @param hive_partitioning [Boolean]
    144 | + # Infer statistics and schema from hive partitioned URL and use them
    145 | + # to prune reads.
    146 | + # @param glob [Boolean]
    147 | + # Expand path given via globbing rules.
    148 | + # @param schema [Object]
    149 | + # Specify the datatypes of the columns. The datatypes must match the
    150 | + # datatypes in the file(s). If there are extra columns that are not in the
    151 | + # file(s), consider also enabling `allow_missing_columns`.
    152 | + # @param hive_schema [Object]
    153 | + # The column names and data types of the columns by which the data is partitioned.
    154 | + # If set to `nil` (default), the schema of the Hive partitions is inferred.
    155 | + # @param try_parse_hive_dates [Boolean]
    156 | + # Whether to try parsing hive values as date/datetime types.
149 157 |   # @param rechunk [Boolean]
150 158 |   # In case of reading multiple files via a glob pattern rechunk the final DataFrame
151 159 |   # into contiguous memory chunks.
152     | - # @param row_count_name [String]
153     | - # If not nil, this will insert a row count column with give name into the
154     | - # DataFrame.
155     | - # @param row_count_offset [Integer]
156     | - # Offset to start the row_count column (only use if the name is set).
157     | - # @param storage_options [Hash]
158     | - # Extra options that make sense for a particular storage connection.
159 160 |   # @param low_memory [Boolean]
160 161 |   # Reduce memory pressure at the expense of performance.
    162 | + # @param cache [Boolean]
    163 | + # Cache the result after reading.
    164 | + # @param storage_options [Hash]
    165 | + # Extra options that make sense for a particular storage connection.
    166 | + # @param credential_provider [Object]
    167 | + # Provide a function that can be called to provide cloud storage
    168 | + # credentials. The function is expected to return a dictionary of
    169 | + # credential keys along with an optional credential expiry time.
    170 | + # @param retries [Integer]
    171 | + # Number of retries if accessing a cloud instance fails.
161 172 |   # @param include_file_paths [String]
162     | - #
    173 | + # Include the path of the source file(s) as a column with this name.
163 174 |   #
164 175 |   # @return [LazyFrame]
165 176 |   def scan_parquet(
166 177 |   source,
167 178 |   n_rows: nil,
168     | - cache: true,
169     | - parallel: "auto",
170     | - glob: true,
171     | - rechunk: true,
172 179 |   row_count_name: nil,
173 180 |   row_count_offset: 0,
174     | -
    181 | + parallel: "auto",
    182 | + use_statistics: true,
    183 | + hive_partitioning: nil,
    184 | + glob: true,
    185 | + schema: nil,
    186 | + hive_schema: nil,
    187 | + try_parse_hive_dates: true,
    188 | + rechunk: false,
175 189 |   low_memory: false,
176     | -
    190 | + cache: true,
    191 | + storage_options: nil,
    192 | + credential_provider: nil,
    193 | + retries: 2,
    194 | + include_file_paths: nil,
    195 | + allow_missing_columns: false
177 196 |   )
178 197 |   if Utils.pathlike?(source)
179     | - source = Utils.normalize_filepath(source)
    198 | + source = Utils.normalize_filepath(source, check_not_directory: false)
    199 | + elsif Utils.is_path_or_str_sequence(source)
    200 | + source = source.map { |s| Utils.normalize_filepath(s, check_not_directory: false) }
    201 | + end
    202 | +
    203 | + if credential_provider
    204 | + raise Todo
180 205 |   end
181 206 |
182 207 |   _scan_parquet_impl(
@@ -185,47 +210,79 @@ module Polars
185 210 |   cache: cache,
186 211 |   parallel: parallel,
187 212 |   rechunk: rechunk,
188     | -
189     | -
    213 | + row_index_name: row_count_name,
    214 | + row_index_offset: row_count_offset,
190 215 |   storage_options: storage_options,
    216 | + credential_provider: credential_provider,
191 217 |   low_memory: low_memory,
    218 | + use_statistics: use_statistics,
    219 | + hive_partitioning: hive_partitioning,
    220 | + schema: schema,
    221 | + hive_schema: hive_schema,
    222 | + try_parse_hive_dates: try_parse_hive_dates,
    223 | + retries: retries,
192 224 |   glob: glob,
193     | - include_file_paths: include_file_paths
    225 | + include_file_paths: include_file_paths,
    226 | + allow_missing_columns: allow_missing_columns
194 227 |   )
195 228 |   end
196 229 |
197 230 |   # @private
198 231 |   def _scan_parquet_impl(
199     | -
    232 | + source,
200 233 |   n_rows: nil,
201 234 |   cache: true,
202 235 |   parallel: "auto",
203 236 |   rechunk: true,
204     | -
205     | -
    237 | + row_index_name: nil,
    238 | + row_index_offset: 0,
206 239 |   storage_options: nil,
    240 | + credential_provider: nil,
207 241 |   low_memory: false,
208 242 |   use_statistics: true,
209 243 |   hive_partitioning: nil,
210 244 |   glob: true,
211     | -
    245 | + schema: nil,
    246 | + hive_schema: nil,
    247 | + try_parse_hive_dates: true,
    248 | + retries: 2,
    249 | + include_file_paths: nil,
    250 | + allow_missing_columns: false
212 251 |   )
    252 | + if source.is_a?(::Array)
    253 | + sources = source
    254 | + source = nil
    255 | + else
    256 | + sources = []
    257 | + end
    258 | +
    259 | + if storage_options
    260 | + storage_options = storage_options.map { |k, v| [k.to_s, v.to_s] }
    261 | + else
    262 | + storage_options = nil
    263 | + end
    264 | +
213 265 |   rblf =
214 266 |   RbLazyFrame.new_from_parquet(
215     | -
216     | -
    267 | + source,
    268 | + sources,
217 269 |   n_rows,
218 270 |   cache,
219 271 |   parallel,
220 272 |   rechunk,
221     | - Utils.parse_row_index_args(
    273 | + Utils.parse_row_index_args(row_index_name, row_index_offset),
222 274 |   low_memory,
    275 | + storage_options,
    276 | + credential_provider,
223 277 |   use_statistics,
224 278 |   hive_partitioning,
225     | -
226     | -
    279 | + schema,
    280 | + hive_schema,
    281 | + try_parse_hive_dates,
    282 | + retries,
227 283 |   glob,
228     | - include_file_paths
    284 | + include_file_paths,
    285 | + allow_missing_columns
229 286 |   )
230 287 |   Utils.wrap_ldf(rblf)
231 288 |   end
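
For orientation: per the diff above, read_parquet in 0.15.0 now builds a scan_parquet query and collects it, so the two methods share most keyword arguments. Below is a minimal usage sketch of that API; the file paths and column names are hypothetical placeholders, and only options that appear in the signatures above are used.

require "polars-df"

# Eager read; the path and column names here are illustrative only.
df = Polars.read_parquet(
  "data/*.parquet",
  columns: ["id", "ts"],   # column names, or a list of integer indices
  glob: true,              # expand the wildcard via globbing rules
  rechunk: false,          # 0.15.0 default; set true for contiguous memory chunks
  retries: 2               # only relevant when reading from cloud storage
)

# Lazy scan; reading is deferred until collect, so predicates and projections
# can be pushed down to the Parquet scan.
lf = Polars.scan_parquet("data/*.parquet", include_file_paths: "path")
df2 = lf.filter(Polars.col("id") > 100).collect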