polars-df 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +127 -1
- data/Cargo.lock +72 -58
- data/README.md +31 -27
- data/ext/polars/Cargo.toml +15 -6
- data/ext/polars/src/batched_csv.rs +35 -39
- data/ext/polars/src/c_api/allocator.rs +7 -0
- data/ext/polars/src/c_api/mod.rs +1 -0
- data/ext/polars/src/catalog/unity.rs +123 -101
- data/ext/polars/src/conversion/any_value.rs +13 -17
- data/ext/polars/src/conversion/chunked_array.rs +5 -5
- data/ext/polars/src/conversion/datetime.rs +3 -2
- data/ext/polars/src/conversion/mod.rs +50 -45
- data/ext/polars/src/dataframe/export.rs +13 -13
- data/ext/polars/src/dataframe/general.rs +223 -223
- data/ext/polars/src/dataframe/io.rs +27 -141
- data/ext/polars/src/dataframe/mod.rs +13 -5
- data/ext/polars/src/dataframe/serde.rs +1 -1
- data/ext/polars/src/error.rs +44 -7
- data/ext/polars/src/exceptions.rs +45 -12
- data/ext/polars/src/expr/array.rs +12 -0
- data/ext/polars/src/expr/datatype.rs +2 -2
- data/ext/polars/src/expr/datetime.rs +4 -5
- data/ext/polars/src/expr/general.rs +49 -13
- data/ext/polars/src/expr/list.rs +4 -0
- data/ext/polars/src/expr/meta.rs +8 -3
- data/ext/polars/src/expr/mod.rs +22 -6
- data/ext/polars/src/expr/name.rs +19 -8
- data/ext/polars/src/expr/rolling.rs +50 -1
- data/ext/polars/src/expr/string.rs +0 -1
- data/ext/polars/src/expr/struct.rs +7 -2
- data/ext/polars/src/file.rs +136 -103
- data/ext/polars/src/functions/aggregation.rs +9 -8
- data/ext/polars/src/functions/io.rs +81 -10
- data/ext/polars/src/functions/lazy.rs +95 -21
- data/ext/polars/src/functions/mod.rs +2 -0
- data/ext/polars/src/functions/range.rs +19 -3
- data/ext/polars/src/functions/strings.rs +6 -0
- data/ext/polars/src/functions/utils.rs +6 -0
- data/ext/polars/src/interop/arrow/mod.rs +50 -1
- data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
- data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
- data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
- data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
- data/ext/polars/src/lazyframe/exitable.rs +39 -0
- data/ext/polars/src/lazyframe/general.rs +340 -236
- data/ext/polars/src/lazyframe/mod.rs +46 -10
- data/ext/polars/src/lazyframe/optflags.rs +5 -4
- data/ext/polars/src/lazyframe/serde.rs +11 -3
- data/ext/polars/src/lazyframe/sink.rs +10 -5
- data/ext/polars/src/lazygroupby.rs +6 -7
- data/ext/polars/src/lib.rs +141 -76
- data/ext/polars/src/map/dataframe.rs +12 -12
- data/ext/polars/src/map/lazy.rs +7 -5
- data/ext/polars/src/map/mod.rs +15 -8
- data/ext/polars/src/map/series.rs +3 -3
- data/ext/polars/src/on_startup.rs +16 -8
- data/ext/polars/src/prelude.rs +1 -0
- data/ext/polars/src/rb_modules.rs +19 -49
- data/ext/polars/src/series/aggregation.rs +79 -140
- data/ext/polars/src/series/arithmetic.rs +16 -22
- data/ext/polars/src/series/comparison.rs +101 -222
- data/ext/polars/src/series/construction.rs +17 -18
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +254 -289
- data/ext/polars/src/series/import.rs +17 -0
- data/ext/polars/src/series/map.rs +178 -160
- data/ext/polars/src/series/mod.rs +28 -12
- data/ext/polars/src/series/scatter.rs +12 -9
- data/ext/polars/src/sql.rs +16 -9
- data/ext/polars/src/testing/frame.rs +31 -0
- data/ext/polars/src/testing/mod.rs +5 -0
- data/ext/polars/src/testing/series.rs +31 -0
- data/ext/polars/src/timeout.rs +105 -0
- data/ext/polars/src/utils.rs +159 -1
- data/lib/polars/array_expr.rb +81 -12
- data/lib/polars/array_name_space.rb +74 -7
- data/lib/polars/batched_csv_reader.rb +21 -21
- data/lib/polars/binary_name_space.rb +1 -1
- data/lib/polars/cat_expr.rb +7 -7
- data/lib/polars/config.rb +1 -1
- data/lib/polars/convert.rb +189 -34
- data/lib/polars/data_frame.rb +1066 -831
- data/lib/polars/data_frame_plot.rb +173 -0
- data/lib/polars/data_type_group.rb +1 -0
- data/lib/polars/data_types.rb +31 -12
- data/lib/polars/date_time_expr.rb +51 -69
- data/lib/polars/date_time_name_space.rb +80 -112
- data/lib/polars/dynamic_group_by.rb +7 -7
- data/lib/polars/exceptions.rb +50 -10
- data/lib/polars/expr.rb +470 -517
- data/lib/polars/functions/aggregation/horizontal.rb +0 -1
- data/lib/polars/functions/aggregation/vertical.rb +2 -3
- data/lib/polars/functions/as_datatype.rb +290 -8
- data/lib/polars/functions/eager.rb +204 -10
- data/lib/polars/functions/escape_regex.rb +21 -0
- data/lib/polars/functions/lazy.rb +409 -169
- data/lib/polars/functions/lit.rb +17 -1
- data/lib/polars/functions/range/int_range.rb +74 -2
- data/lib/polars/functions/range/linear_space.rb +77 -0
- data/lib/polars/functions/range/time_range.rb +1 -1
- data/lib/polars/functions/repeat.rb +3 -12
- data/lib/polars/functions/whenthen.rb +2 -2
- data/lib/polars/group_by.rb +72 -20
- data/lib/polars/iceberg_dataset.rb +1 -6
- data/lib/polars/in_process_query.rb +37 -0
- data/lib/polars/io/cloud.rb +18 -0
- data/lib/polars/io/csv.rb +265 -126
- data/lib/polars/io/database.rb +0 -1
- data/lib/polars/io/delta.rb +15 -7
- data/lib/polars/io/ipc.rb +24 -17
- data/lib/polars/io/ndjson.rb +161 -24
- data/lib/polars/io/parquet.rb +101 -38
- data/lib/polars/lazy_frame.rb +849 -558
- data/lib/polars/lazy_group_by.rb +327 -2
- data/lib/polars/list_expr.rb +94 -16
- data/lib/polars/list_name_space.rb +88 -24
- data/lib/polars/meta_expr.rb +42 -1
- data/lib/polars/name_expr.rb +41 -4
- data/lib/polars/query_opt_flags.rb +198 -2
- data/lib/polars/rolling_group_by.rb +3 -3
- data/lib/polars/schema.rb +21 -3
- data/lib/polars/selector.rb +37 -2
- data/lib/polars/selectors.rb +45 -9
- data/lib/polars/series.rb +1156 -728
- data/lib/polars/series_plot.rb +72 -0
- data/lib/polars/slice.rb +1 -1
- data/lib/polars/sql_context.rb +11 -4
- data/lib/polars/string_expr.rb +59 -68
- data/lib/polars/string_name_space.rb +51 -87
- data/lib/polars/struct_expr.rb +36 -18
- data/lib/polars/testing.rb +24 -273
- data/lib/polars/utils/constants.rb +2 -0
- data/lib/polars/utils/construction/data_frame.rb +410 -0
- data/lib/polars/utils/construction/series.rb +364 -0
- data/lib/polars/utils/construction/utils.rb +9 -0
- data/lib/polars/utils/deprecation.rb +11 -0
- data/lib/polars/utils/serde.rb +8 -3
- data/lib/polars/utils/unstable.rb +19 -0
- data/lib/polars/utils/various.rb +59 -0
- data/lib/polars/utils.rb +46 -47
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +47 -1
- metadata +25 -6
- data/ext/polars/src/allocator.rs +0 -13
- data/lib/polars/plot.rb +0 -109
data/lib/polars/io/parquet.rb
CHANGED
|
@@ -9,10 +9,10 @@ module Polars
|
|
|
9
9
|
# of column names.
|
|
10
10
|
# @param n_rows [Integer]
|
|
11
11
|
# Stop reading from parquet file after reading `n_rows`.
|
|
12
|
-
# @param
|
|
12
|
+
# @param row_index_name [String]
|
|
13
13
|
# If not nil, this will insert a row count column with the given name into the
|
|
14
14
|
# DataFrame.
|
|
15
|
-
# @param
|
|
15
|
+
# @param row_index_offset [Integer]
|
|
16
16
|
# Offset to start the row_count column (only use if the name is set).
|
|
17
17
|
# @param parallel ["auto", "columns", "row_groups", "none"]
|
|
18
18
|
# This determines the direction of parallelism. 'auto' will try to determine the
|
|
@@ -49,6 +49,12 @@ module Polars
|
|
|
49
49
|
# Number of retries if accessing a cloud instance fails.
|
|
50
50
|
# @param include_file_paths [String]
|
|
51
51
|
# Include the path of the source file(s) as a column with this name.
|
|
52
|
+
# @param missing_columns ['insert', 'raise']
|
|
53
|
+
# Configuration for behavior when columns defined in the schema
|
|
54
|
+
# are missing from the data:
|
|
55
|
+
#
|
|
56
|
+
# * `insert`: Inserts the missing columns using NULLs as the row values.
|
|
57
|
+
# * `raise`: Raises an error.
|
|
52
58
|
# @param allow_missing_columns [Boolean]
|
|
53
59
|
# When reading a list of parquet files, if a column existing in the first
|
|
54
60
|
# file cannot be found in subsequent files, the default behavior is to
|
|
@@ -61,8 +67,8 @@ module Polars
|
|
|
61
67
|
source,
|
|
62
68
|
columns: nil,
|
|
63
69
|
n_rows: nil,
|
|
64
|
-
|
|
65
|
-
|
|
70
|
+
row_index_name: nil,
|
|
71
|
+
row_index_offset: 0,
|
|
66
72
|
parallel: "auto",
|
|
67
73
|
use_statistics: true,
|
|
68
74
|
hive_partitioning: nil,
|
|
@@ -73,17 +79,18 @@ module Polars
|
|
|
73
79
|
rechunk: false,
|
|
74
80
|
low_memory: false,
|
|
75
81
|
storage_options: nil,
|
|
76
|
-
credential_provider:
|
|
82
|
+
credential_provider: "auto",
|
|
77
83
|
retries: 2,
|
|
78
84
|
include_file_paths: nil,
|
|
79
|
-
|
|
85
|
+
missing_columns: "raise",
|
|
86
|
+
allow_missing_columns: nil
|
|
80
87
|
)
|
|
81
88
|
lf =
|
|
82
89
|
scan_parquet(
|
|
83
90
|
source,
|
|
84
91
|
n_rows: n_rows,
|
|
85
|
-
|
|
86
|
-
|
|
92
|
+
row_index_name: row_index_name,
|
|
93
|
+
row_index_offset: row_index_offset,
|
|
87
94
|
parallel: parallel,
|
|
88
95
|
use_statistics: use_statistics,
|
|
89
96
|
hive_partitioning: hive_partitioning,
|
|
@@ -98,6 +105,7 @@ module Polars
|
|
|
98
105
|
retries: retries,
|
|
99
106
|
glob: glob,
|
|
100
107
|
include_file_paths: include_file_paths,
|
|
108
|
+
missing_columns: missing_columns,
|
|
101
109
|
allow_missing_columns: allow_missing_columns
|
|
102
110
|
)
|
|
103
111
|
|
|
@@ -134,14 +142,40 @@ module Polars
|
|
|
134
142
|
#
|
|
135
143
|
# @param source [Object]
|
|
136
144
|
# Path to a file or a file-like object.
|
|
145
|
+
# @param storage_options [Hash]
|
|
146
|
+
# Extra options that make sense for a particular storage connection.
|
|
147
|
+
# @param credential_provider [Object]
|
|
148
|
+
# Provide a function that can be called to provide cloud storage
|
|
149
|
+
# credentials. The function is expected to return a hash of
|
|
150
|
+
# credential keys along with an optional credential expiry time.
|
|
151
|
+
# @param retries [Integer]
|
|
152
|
+
# Number of retries if accessing a cloud instance fails.
|
|
137
153
|
#
|
|
138
154
|
# @return [Hash]
|
|
139
|
-
def read_parquet_metadata(
|
|
155
|
+
def read_parquet_metadata(
|
|
156
|
+
source,
|
|
157
|
+
storage_options: nil,
|
|
158
|
+
credential_provider: "auto",
|
|
159
|
+
retries: 2
|
|
160
|
+
)
|
|
161
|
+
if storage_options
|
|
162
|
+
raise Todo
|
|
163
|
+
end
|
|
164
|
+
|
|
140
165
|
if Utils.pathlike?(source)
|
|
141
166
|
source = Utils.normalize_filepath(source, check_not_directory: false)
|
|
142
167
|
end
|
|
143
168
|
|
|
144
|
-
|
|
169
|
+
credential_provider_builder = _init_credential_provider_builder(
|
|
170
|
+
credential_provider, source, storage_options, "scan_parquet"
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
Plr.read_parquet_metadata(
|
|
174
|
+
source,
|
|
175
|
+
storage_options&.any? ? storage_options.map { |k, v| [k.to_s, v.to_s] } : nil,
|
|
176
|
+
credential_provider_builder,
|
|
177
|
+
retries
|
|
178
|
+
)
|
|
145
179
|
end
|
|
146
180
|
|
|
147
181
|
# Lazily read from a parquet file or multiple files via glob patterns.
|
|
@@ -153,10 +187,10 @@ module Polars
|
|
|
153
187
|
# Path to a file or a file-like object.
|
|
154
188
|
# @param n_rows [Integer]
|
|
155
189
|
# Stop reading from parquet file after reading `n_rows`.
|
|
156
|
-
# @param
|
|
190
|
+
# @param row_index_name [String]
|
|
157
191
|
# If not nil, this will insert a row count column with the given name into the
|
|
158
192
|
# DataFrame.
|
|
159
|
-
# @param
|
|
193
|
+
# @param row_index_offset [Integer]
|
|
160
194
|
# Offset to start the row_count column (only use if the name is set).
|
|
161
195
|
# @param parallel ["auto", "columns", "row_groups", "none"]
|
|
162
196
|
# This determines the direction of parallelism. 'auto' will try to determine the
|
|
@@ -169,6 +203,8 @@ module Polars
|
|
|
169
203
|
# to prune reads.
|
|
170
204
|
# @param glob [Boolean]
|
|
171
205
|
# Expand path given via globbing rules.
|
|
206
|
+
# @param hidden_file_prefix [String, Array<String>]
|
|
207
|
+
# Skip reading files whose names begin with the specified prefixes.
|
|
172
208
|
# @param schema [Object]
|
|
173
209
|
# Specify the datatypes of the columns. The datatypes must match the
|
|
174
210
|
# datatypes in the file(s). If there are extra columns that are not in the
|
|
@@ -195,6 +231,12 @@ module Polars
|
|
|
195
231
|
# Number of retries if accessing a cloud instance fails.
|
|
196
232
|
# @param include_file_paths [String]
|
|
197
233
|
# Include the path of the source file(s) as a column with this name.
|
|
234
|
+
# @param missing_columns ['insert', 'raise']
|
|
235
|
+
# Configuration for behavior when columns defined in the schema
|
|
236
|
+
# are missing from the data:
|
|
237
|
+
#
|
|
238
|
+
# * `insert`: Inserts the missing columns using NULLs as the row values.
|
|
239
|
+
# * `raise`: Raises an error.
|
|
198
240
|
# @param allow_missing_columns [Boolean]
|
|
199
241
|
# When reading a list of parquet files, if a column existing in the first
|
|
200
242
|
# file cannot be found in subsequent files, the default behavior is to
|
|
@@ -214,12 +256,13 @@ module Polars
|
|
|
214
256
|
def scan_parquet(
|
|
215
257
|
source,
|
|
216
258
|
n_rows: nil,
|
|
217
|
-
|
|
218
|
-
|
|
259
|
+
row_index_name: nil,
|
|
260
|
+
row_index_offset: 0,
|
|
219
261
|
parallel: "auto",
|
|
220
262
|
use_statistics: true,
|
|
221
263
|
hive_partitioning: nil,
|
|
222
264
|
glob: true,
|
|
265
|
+
hidden_file_prefix: nil,
|
|
223
266
|
schema: nil,
|
|
224
267
|
hive_schema: nil,
|
|
225
268
|
try_parse_hive_dates: true,
|
|
@@ -227,42 +270,58 @@ module Polars
|
|
|
227
270
|
low_memory: false,
|
|
228
271
|
cache: true,
|
|
229
272
|
storage_options: nil,
|
|
230
|
-
credential_provider:
|
|
273
|
+
credential_provider: "auto",
|
|
231
274
|
retries: 2,
|
|
232
275
|
include_file_paths: nil,
|
|
233
|
-
|
|
276
|
+
missing_columns: "raise",
|
|
277
|
+
allow_missing_columns: nil,
|
|
234
278
|
extra_columns: "raise",
|
|
235
279
|
cast_options: nil,
|
|
236
280
|
_column_mapping: nil,
|
|
237
|
-
|
|
281
|
+
_default_values: nil,
|
|
282
|
+
_deletion_files: nil,
|
|
283
|
+
_table_statistics: nil,
|
|
284
|
+
_row_count: nil
|
|
238
285
|
)
|
|
239
|
-
|
|
286
|
+
if !schema.nil?
|
|
287
|
+
msg = "the `schema` parameter of `scan_parquet` is considered unstable."
|
|
288
|
+
Utils.issue_unstable_warning(msg)
|
|
289
|
+
end
|
|
240
290
|
|
|
241
|
-
if
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
source = source.map { |s| Utils.normalize_filepath(s, check_not_directory: false) }
|
|
291
|
+
if !hive_schema.nil?
|
|
292
|
+
msg = "the `hive_schema` parameter of `scan_parquet` is considered unstable."
|
|
293
|
+
Utils.issue_unstable_warning(msg)
|
|
245
294
|
end
|
|
246
295
|
|
|
247
|
-
if
|
|
248
|
-
|
|
296
|
+
if !cast_options.nil?
|
|
297
|
+
msg = "The `cast_options` parameter of `scan_parquet` is considered unstable."
|
|
298
|
+
Utils.issue_unstable_warning(msg)
|
|
249
299
|
end
|
|
250
300
|
|
|
251
|
-
if
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
else
|
|
255
|
-
sources = [source]
|
|
301
|
+
if !hidden_file_prefix.nil?
|
|
302
|
+
msg = "The `hidden_file_prefix` parameter of `scan_parquet` is considered unstable."
|
|
303
|
+
Utils.issue_unstable_warning(msg)
|
|
256
304
|
end
|
|
257
305
|
|
|
258
|
-
if
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
306
|
+
if !allow_missing_columns.nil?
|
|
307
|
+
Utils.issue_deprecation_warning(
|
|
308
|
+
"the parameter `allow_missing_columns` for `scan_parquet` is deprecated. " +
|
|
309
|
+
"Use the parameter `missing_columns` instead and pass one of " +
|
|
310
|
+
"`('insert', 'raise')`."
|
|
311
|
+
)
|
|
312
|
+
|
|
313
|
+
missing_columns = allow_missing_columns ? "insert" : "raise"
|
|
262
314
|
end
|
|
263
315
|
|
|
264
|
-
|
|
265
|
-
|
|
316
|
+
sources = get_sources(source)
|
|
317
|
+
|
|
318
|
+
credential_provider_builder =
|
|
319
|
+
_init_credential_provider_builder(
|
|
320
|
+
credential_provider,
|
|
321
|
+
sources,
|
|
322
|
+
storage_options,
|
|
323
|
+
"scan_parquet"
|
|
324
|
+
)
|
|
266
325
|
|
|
267
326
|
rblf =
|
|
268
327
|
RbLazyFrame.new_from_parquet(
|
|
@@ -276,16 +335,20 @@ module Polars
|
|
|
276
335
|
missing_columns: missing_columns,
|
|
277
336
|
include_file_paths: include_file_paths,
|
|
278
337
|
glob: glob,
|
|
338
|
+
hidden_file_prefix: hidden_file_prefix.is_a?(::String) ? [hidden_file_prefix] : hidden_file_prefix,
|
|
279
339
|
hive_partitioning: hive_partitioning,
|
|
280
340
|
hive_schema: hive_schema,
|
|
281
341
|
try_parse_hive_dates: try_parse_hive_dates,
|
|
282
342
|
rechunk: rechunk,
|
|
283
343
|
cache: cache,
|
|
284
|
-
storage_options: storage_options,
|
|
285
|
-
|
|
344
|
+
storage_options: storage_options ? storage_options.map { |k, v| [k.to_s, v.to_s] } : nil,
|
|
345
|
+
credential_provider: credential_provider_builder,
|
|
286
346
|
retries: retries,
|
|
347
|
+
column_mapping: _column_mapping,
|
|
348
|
+
default_values: _default_values,
|
|
287
349
|
deletion_files: _deletion_files,
|
|
288
|
-
|
|
350
|
+
table_statistics: _table_statistics,
|
|
351
|
+
row_count: _row_count
|
|
289
352
|
),
|
|
290
353
|
parallel,
|
|
291
354
|
low_memory,
|