polars-df 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +127 -1
- data/Cargo.lock +72 -58
- data/README.md +31 -27
- data/ext/polars/Cargo.toml +15 -6
- data/ext/polars/src/batched_csv.rs +35 -39
- data/ext/polars/src/c_api/allocator.rs +7 -0
- data/ext/polars/src/c_api/mod.rs +1 -0
- data/ext/polars/src/catalog/unity.rs +123 -101
- data/ext/polars/src/conversion/any_value.rs +13 -17
- data/ext/polars/src/conversion/chunked_array.rs +5 -5
- data/ext/polars/src/conversion/datetime.rs +3 -2
- data/ext/polars/src/conversion/mod.rs +50 -45
- data/ext/polars/src/dataframe/export.rs +13 -13
- data/ext/polars/src/dataframe/general.rs +223 -223
- data/ext/polars/src/dataframe/io.rs +27 -141
- data/ext/polars/src/dataframe/mod.rs +13 -5
- data/ext/polars/src/dataframe/serde.rs +1 -1
- data/ext/polars/src/error.rs +44 -7
- data/ext/polars/src/exceptions.rs +45 -12
- data/ext/polars/src/expr/array.rs +12 -0
- data/ext/polars/src/expr/datatype.rs +2 -2
- data/ext/polars/src/expr/datetime.rs +4 -5
- data/ext/polars/src/expr/general.rs +49 -13
- data/ext/polars/src/expr/list.rs +4 -0
- data/ext/polars/src/expr/meta.rs +8 -3
- data/ext/polars/src/expr/mod.rs +22 -6
- data/ext/polars/src/expr/name.rs +19 -8
- data/ext/polars/src/expr/rolling.rs +50 -1
- data/ext/polars/src/expr/string.rs +0 -1
- data/ext/polars/src/expr/struct.rs +7 -2
- data/ext/polars/src/file.rs +136 -103
- data/ext/polars/src/functions/aggregation.rs +9 -8
- data/ext/polars/src/functions/io.rs +81 -10
- data/ext/polars/src/functions/lazy.rs +95 -21
- data/ext/polars/src/functions/mod.rs +2 -0
- data/ext/polars/src/functions/range.rs +19 -3
- data/ext/polars/src/functions/strings.rs +6 -0
- data/ext/polars/src/functions/utils.rs +6 -0
- data/ext/polars/src/interop/arrow/mod.rs +50 -1
- data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
- data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
- data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
- data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
- data/ext/polars/src/lazyframe/exitable.rs +39 -0
- data/ext/polars/src/lazyframe/general.rs +340 -236
- data/ext/polars/src/lazyframe/mod.rs +46 -10
- data/ext/polars/src/lazyframe/optflags.rs +5 -4
- data/ext/polars/src/lazyframe/serde.rs +11 -3
- data/ext/polars/src/lazyframe/sink.rs +10 -5
- data/ext/polars/src/lazygroupby.rs +6 -7
- data/ext/polars/src/lib.rs +141 -76
- data/ext/polars/src/map/dataframe.rs +12 -12
- data/ext/polars/src/map/lazy.rs +7 -5
- data/ext/polars/src/map/mod.rs +15 -8
- data/ext/polars/src/map/series.rs +3 -3
- data/ext/polars/src/on_startup.rs +16 -8
- data/ext/polars/src/prelude.rs +1 -0
- data/ext/polars/src/rb_modules.rs +19 -49
- data/ext/polars/src/series/aggregation.rs +79 -140
- data/ext/polars/src/series/arithmetic.rs +16 -22
- data/ext/polars/src/series/comparison.rs +101 -222
- data/ext/polars/src/series/construction.rs +17 -18
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +254 -289
- data/ext/polars/src/series/import.rs +17 -0
- data/ext/polars/src/series/map.rs +178 -160
- data/ext/polars/src/series/mod.rs +28 -12
- data/ext/polars/src/series/scatter.rs +12 -9
- data/ext/polars/src/sql.rs +16 -9
- data/ext/polars/src/testing/frame.rs +31 -0
- data/ext/polars/src/testing/mod.rs +5 -0
- data/ext/polars/src/testing/series.rs +31 -0
- data/ext/polars/src/timeout.rs +105 -0
- data/ext/polars/src/utils.rs +159 -1
- data/lib/polars/array_expr.rb +81 -12
- data/lib/polars/array_name_space.rb +74 -7
- data/lib/polars/batched_csv_reader.rb +21 -21
- data/lib/polars/binary_name_space.rb +1 -1
- data/lib/polars/cat_expr.rb +7 -7
- data/lib/polars/config.rb +1 -1
- data/lib/polars/convert.rb +189 -34
- data/lib/polars/data_frame.rb +1066 -831
- data/lib/polars/data_frame_plot.rb +173 -0
- data/lib/polars/data_type_group.rb +1 -0
- data/lib/polars/data_types.rb +31 -12
- data/lib/polars/date_time_expr.rb +51 -69
- data/lib/polars/date_time_name_space.rb +80 -112
- data/lib/polars/dynamic_group_by.rb +7 -7
- data/lib/polars/exceptions.rb +50 -10
- data/lib/polars/expr.rb +470 -517
- data/lib/polars/functions/aggregation/horizontal.rb +0 -1
- data/lib/polars/functions/aggregation/vertical.rb +2 -3
- data/lib/polars/functions/as_datatype.rb +290 -8
- data/lib/polars/functions/eager.rb +204 -10
- data/lib/polars/functions/escape_regex.rb +21 -0
- data/lib/polars/functions/lazy.rb +409 -169
- data/lib/polars/functions/lit.rb +17 -1
- data/lib/polars/functions/range/int_range.rb +74 -2
- data/lib/polars/functions/range/linear_space.rb +77 -0
- data/lib/polars/functions/range/time_range.rb +1 -1
- data/lib/polars/functions/repeat.rb +3 -12
- data/lib/polars/functions/whenthen.rb +2 -2
- data/lib/polars/group_by.rb +72 -20
- data/lib/polars/iceberg_dataset.rb +1 -6
- data/lib/polars/in_process_query.rb +37 -0
- data/lib/polars/io/cloud.rb +18 -0
- data/lib/polars/io/csv.rb +265 -126
- data/lib/polars/io/database.rb +0 -1
- data/lib/polars/io/delta.rb +15 -7
- data/lib/polars/io/ipc.rb +24 -17
- data/lib/polars/io/ndjson.rb +161 -24
- data/lib/polars/io/parquet.rb +101 -38
- data/lib/polars/lazy_frame.rb +849 -558
- data/lib/polars/lazy_group_by.rb +327 -2
- data/lib/polars/list_expr.rb +94 -16
- data/lib/polars/list_name_space.rb +88 -24
- data/lib/polars/meta_expr.rb +42 -1
- data/lib/polars/name_expr.rb +41 -4
- data/lib/polars/query_opt_flags.rb +198 -2
- data/lib/polars/rolling_group_by.rb +3 -3
- data/lib/polars/schema.rb +21 -3
- data/lib/polars/selector.rb +37 -2
- data/lib/polars/selectors.rb +45 -9
- data/lib/polars/series.rb +1156 -728
- data/lib/polars/series_plot.rb +72 -0
- data/lib/polars/slice.rb +1 -1
- data/lib/polars/sql_context.rb +11 -4
- data/lib/polars/string_expr.rb +59 -68
- data/lib/polars/string_name_space.rb +51 -87
- data/lib/polars/struct_expr.rb +36 -18
- data/lib/polars/testing.rb +24 -273
- data/lib/polars/utils/constants.rb +2 -0
- data/lib/polars/utils/construction/data_frame.rb +410 -0
- data/lib/polars/utils/construction/series.rb +364 -0
- data/lib/polars/utils/construction/utils.rb +9 -0
- data/lib/polars/utils/deprecation.rb +11 -0
- data/lib/polars/utils/serde.rb +8 -3
- data/lib/polars/utils/unstable.rb +19 -0
- data/lib/polars/utils/various.rb +59 -0
- data/lib/polars/utils.rb +46 -47
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +47 -1
- metadata +25 -6
- data/ext/polars/src/allocator.rs +0 -13
- data/lib/polars/plot.rb +0 -109
data/lib/polars/io/database.rb
CHANGED
data/lib/polars/io/delta.rb
CHANGED
|
@@ -21,19 +21,23 @@ module Polars
|
|
|
21
21
|
source,
|
|
22
22
|
version: nil,
|
|
23
23
|
columns: nil,
|
|
24
|
-
rechunk:
|
|
24
|
+
rechunk: nil,
|
|
25
25
|
storage_options: nil,
|
|
26
26
|
delta_table_options: nil
|
|
27
27
|
)
|
|
28
|
-
|
|
29
|
-
|
|
28
|
+
df =
|
|
29
|
+
scan_delta(
|
|
30
30
|
source,
|
|
31
31
|
version: version,
|
|
32
32
|
storage_options: storage_options,
|
|
33
|
-
delta_table_options: delta_table_options
|
|
33
|
+
delta_table_options: delta_table_options,
|
|
34
|
+
rechunk: rechunk
|
|
34
35
|
)
|
|
35
36
|
|
|
36
|
-
|
|
37
|
+
if !columns.nil?
|
|
38
|
+
df = df.select(columns)
|
|
39
|
+
end
|
|
40
|
+
df.collect
|
|
37
41
|
end
|
|
38
42
|
|
|
39
43
|
# Lazily read from a Delta lake table.
|
|
@@ -46,13 +50,17 @@ module Polars
|
|
|
46
50
|
# Extra options for the storage backends supported by `deltalake-rb`.
|
|
47
51
|
# @param delta_table_options [Hash]
|
|
48
52
|
# Additional keyword arguments while reading a Delta lake Table.
|
|
53
|
+
# @param rechunk [Boolean]
|
|
54
|
+
# Make sure that all columns are contiguous in memory by
|
|
55
|
+
# aggregating the chunks into a single array.
|
|
49
56
|
#
|
|
50
57
|
# @return [LazyFrame]
|
|
51
58
|
def scan_delta(
|
|
52
59
|
source,
|
|
53
60
|
version: nil,
|
|
54
61
|
storage_options: nil,
|
|
55
|
-
delta_table_options: nil
|
|
62
|
+
delta_table_options: nil,
|
|
63
|
+
rechunk: nil
|
|
56
64
|
)
|
|
57
65
|
dl_tbl =
|
|
58
66
|
_get_delta_lake_table(
|
|
@@ -62,7 +70,7 @@ module Polars
|
|
|
62
70
|
delta_table_options: delta_table_options
|
|
63
71
|
)
|
|
64
72
|
|
|
65
|
-
dl_tbl.to_polars(eager: false)
|
|
73
|
+
dl_tbl.to_polars(eager: false, rechunk: rechunk || false)
|
|
66
74
|
end
|
|
67
75
|
|
|
68
76
|
private
|
data/lib/polars/io/ipc.rb
CHANGED
|
@@ -15,10 +15,10 @@ module Polars
|
|
|
15
15
|
# Only uncompressed IPC files can be memory mapped.
|
|
16
16
|
# @param storage_options [Hash]
|
|
17
17
|
# Extra options that make sense for a particular storage connection.
|
|
18
|
-
# @param
|
|
18
|
+
# @param row_index_name [String]
|
|
19
19
|
# If not nil, this will insert a row count column with the given name into the
|
|
20
20
|
# DataFrame.
|
|
21
|
-
# @param
|
|
21
|
+
# @param row_index_offset [Integer]
|
|
22
22
|
# Offset to start the row_count column (only use if the name is set).
|
|
23
23
|
# @param rechunk [Boolean]
|
|
24
24
|
# Make sure that all data is contiguous.
|
|
@@ -30,8 +30,8 @@ module Polars
|
|
|
30
30
|
n_rows: nil,
|
|
31
31
|
memory_map: true,
|
|
32
32
|
storage_options: nil,
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
row_index_name: nil,
|
|
34
|
+
row_index_offset: 0,
|
|
35
35
|
rechunk: true
|
|
36
36
|
)
|
|
37
37
|
storage_options ||= {}
|
|
@@ -40,8 +40,8 @@ module Polars
|
|
|
40
40
|
data,
|
|
41
41
|
columns: columns,
|
|
42
42
|
n_rows: n_rows,
|
|
43
|
-
|
|
44
|
-
|
|
43
|
+
row_index_name: row_index_name,
|
|
44
|
+
row_index_offset: row_index_offset,
|
|
45
45
|
rechunk: rechunk,
|
|
46
46
|
memory_map: memory_map
|
|
47
47
|
)
|
|
@@ -53,8 +53,8 @@ module Polars
|
|
|
53
53
|
file,
|
|
54
54
|
columns: nil,
|
|
55
55
|
n_rows: nil,
|
|
56
|
-
|
|
57
|
-
|
|
56
|
+
row_index_name: nil,
|
|
57
|
+
row_index_offset: 0,
|
|
58
58
|
rechunk: true,
|
|
59
59
|
memory_map: true
|
|
60
60
|
)
|
|
@@ -76,7 +76,7 @@ module Polars
|
|
|
76
76
|
columns,
|
|
77
77
|
projection,
|
|
78
78
|
n_rows,
|
|
79
|
-
Utils.parse_row_index_args(
|
|
79
|
+
Utils.parse_row_index_args(row_index_name, row_index_offset),
|
|
80
80
|
memory_map
|
|
81
81
|
)
|
|
82
82
|
Utils.wrap_df(rbdf)
|
|
@@ -182,15 +182,19 @@ module Polars
|
|
|
182
182
|
# Cache the result after reading.
|
|
183
183
|
# @param rechunk [Boolean]
|
|
184
184
|
# Reallocate to contiguous memory when all chunks/ files are parsed.
|
|
185
|
-
# @param
|
|
185
|
+
# @param row_index_name [String]
|
|
186
186
|
# If not nil, this will insert a row count column with the given name into the
|
|
187
187
|
# DataFrame.
|
|
188
|
-
# @param
|
|
188
|
+
# @param row_index_offset [Integer]
|
|
189
189
|
# Offset to start the row_count column (only use if the name is set).
|
|
190
190
|
# @param glob [Boolean]
|
|
191
191
|
# Expand path given via globbing rules.
|
|
192
192
|
# @param storage_options [Hash]
|
|
193
193
|
# Extra options that make sense for a particular storage connection.
|
|
194
|
+
# @param credential_provider [Object]
|
|
195
|
+
# Provide a function that can be called to provide cloud storage
|
|
196
|
+
# credentials. The function is expected to return a hash of
|
|
197
|
+
# credential keys along with an optional credential expiry time.
|
|
194
198
|
# @param retries [Integer]
|
|
195
199
|
# Number of retries if accessing a cloud instance fails.
|
|
196
200
|
# @param file_cache_ttl [Integer]
|
|
@@ -215,11 +219,12 @@ module Polars
|
|
|
215
219
|
source,
|
|
216
220
|
n_rows: nil,
|
|
217
221
|
cache: true,
|
|
218
|
-
rechunk:
|
|
219
|
-
|
|
220
|
-
|
|
222
|
+
rechunk: false,
|
|
223
|
+
row_index_name: nil,
|
|
224
|
+
row_index_offset: 0,
|
|
221
225
|
glob: true,
|
|
222
226
|
storage_options: nil,
|
|
227
|
+
credential_provider: "auto",
|
|
223
228
|
retries: 2,
|
|
224
229
|
file_cache_ttl: nil,
|
|
225
230
|
hive_partitioning: nil,
|
|
@@ -227,11 +232,12 @@ module Polars
|
|
|
227
232
|
try_parse_hive_dates: true,
|
|
228
233
|
include_file_paths: nil
|
|
229
234
|
)
|
|
230
|
-
row_index_name = row_count_name
|
|
231
|
-
row_index_offset = row_count_offset
|
|
232
|
-
|
|
233
235
|
sources = get_sources(source)
|
|
234
236
|
|
|
237
|
+
credential_provider_builder = _init_credential_provider_builder(
|
|
238
|
+
credential_provider, sources, storage_options, "scan_parquet"
|
|
239
|
+
)
|
|
240
|
+
|
|
235
241
|
rblf =
|
|
236
242
|
RbLazyFrame.new_from_ipc(
|
|
237
243
|
sources,
|
|
@@ -246,6 +252,7 @@ module Polars
|
|
|
246
252
|
rechunk: rechunk,
|
|
247
253
|
cache: cache,
|
|
248
254
|
storage_options: !storage_options.nil? ? storage_options.to_a : nil,
|
|
255
|
+
credential_provider: credential_provider_builder,
|
|
249
256
|
retries: retries
|
|
250
257
|
),
|
|
251
258
|
file_cache_ttl
|
data/lib/polars/io/ndjson.rb
CHANGED
|
@@ -2,41 +2,106 @@ module Polars
|
|
|
2
2
|
module IO
|
|
3
3
|
# Read into a DataFrame from a newline delimited JSON file.
|
|
4
4
|
#
|
|
5
|
-
# @param source [
|
|
6
|
-
# Path to a file
|
|
5
|
+
# @param source [String]
|
|
6
|
+
# Path to a file.
|
|
7
7
|
# @param schema [Object]
|
|
8
8
|
# The DataFrame schema may be declared in several ways:
|
|
9
9
|
#
|
|
10
|
-
# * As a
|
|
11
|
-
# * As
|
|
12
|
-
# * As
|
|
10
|
+
# * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
|
|
11
|
+
# * As a list of column names; in this case types are automatically inferred.
|
|
12
|
+
# * As a list of (name,type) pairs; this is equivalent to the hash form.
|
|
13
13
|
#
|
|
14
|
-
# If you supply
|
|
14
|
+
# If you supply a list of column names that does not match the names in the
|
|
15
15
|
# underlying data, the names given here will overwrite them. The number
|
|
16
16
|
# of names given in the schema should match the underlying data dimensions.
|
|
17
17
|
# @param schema_overrides [Hash]
|
|
18
18
|
# Support type specification or override of one or more columns; note that
|
|
19
19
|
# any dtypes inferred from the schema param will be overridden.
|
|
20
|
+
# @param infer_schema_length [Integer]
|
|
21
|
+
# Infer the schema length from the first `infer_schema_length` rows.
|
|
22
|
+
# @param batch_size [Integer]
|
|
23
|
+
# Number of rows to read in each batch.
|
|
24
|
+
# @param n_rows [Integer]
|
|
25
|
+
# Stop reading from JSON file after reading `n_rows`.
|
|
26
|
+
# @param low_memory [Boolean]
|
|
27
|
+
# Reduce memory pressure at the expense of performance.
|
|
28
|
+
# @param rechunk [Boolean]
|
|
29
|
+
# Reallocate to contiguous memory when all chunks/ files are parsed.
|
|
30
|
+
# @param row_index_name [String]
|
|
31
|
+
# If not nil, this will insert a row count column with the given name into the
|
|
32
|
+
# DataFrame.
|
|
33
|
+
# @param row_index_offset [Integer]
|
|
34
|
+
# Offset to start the row_count column (only use if the name is set).
|
|
35
|
+
# @param ignore_errors [Boolean]
|
|
36
|
+
# Return `Null` if parsing fails because of schema mismatches.
|
|
37
|
+
# @param storage_options [Hash]
|
|
38
|
+
# Options that indicate how to connect to a cloud provider.
|
|
39
|
+
#
|
|
40
|
+
# The cloud providers currently supported are AWS, GCP, and Azure.
|
|
41
|
+
# See supported keys here:
|
|
42
|
+
#
|
|
43
|
+
# * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
|
|
44
|
+
# * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
|
|
45
|
+
# * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
|
|
46
|
+
# * Hugging Face (`hf://`): Accepts an API key under the `token` parameter:
|
|
47
|
+
# `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
|
48
|
+
#
|
|
49
|
+
# If `storage_options` is not provided, Polars will try to infer the information
|
|
50
|
+
# from environment variables.
|
|
51
|
+
# @param credential_provider [Object]
|
|
52
|
+
# Provide a function that can be called to provide cloud storage
|
|
53
|
+
# credentials. The function is expected to return a hash of
|
|
54
|
+
# credential keys along with an optional credential expiry time.
|
|
55
|
+
# @param retries [Integer]
|
|
56
|
+
# Number of retries if accessing a cloud instance fails.
|
|
57
|
+
# @param file_cache_ttl [Integer]
|
|
58
|
+
# Amount of time to keep downloaded cloud files since their last access time,
|
|
59
|
+
# in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
|
|
60
|
+
# (which defaults to 1 hour) if not given.
|
|
61
|
+
# @param include_file_paths [String]
|
|
62
|
+
# Include the path of the source file(s) as a column with this name.
|
|
20
63
|
#
|
|
21
64
|
# @return [DataFrame]
|
|
22
65
|
def read_ndjson(
|
|
23
66
|
source,
|
|
24
67
|
schema: nil,
|
|
25
68
|
schema_overrides: nil,
|
|
26
|
-
|
|
69
|
+
infer_schema_length: N_INFER_DEFAULT,
|
|
70
|
+
batch_size: 1024,
|
|
71
|
+
n_rows: nil,
|
|
72
|
+
low_memory: false,
|
|
73
|
+
rechunk: false,
|
|
74
|
+
row_index_name: nil,
|
|
75
|
+
row_index_offset: 0,
|
|
76
|
+
ignore_errors: false,
|
|
77
|
+
storage_options: nil,
|
|
78
|
+
credential_provider: "auto",
|
|
79
|
+
retries: 2,
|
|
80
|
+
file_cache_ttl: nil,
|
|
81
|
+
include_file_paths: nil
|
|
27
82
|
)
|
|
28
|
-
|
|
29
|
-
source
|
|
30
|
-
|
|
83
|
+
credential_provider_builder = _init_credential_provider_builder(
|
|
84
|
+
credential_provider, source, storage_options, "read_ndjson"
|
|
85
|
+
)
|
|
31
86
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
87
|
+
scan_ndjson(
|
|
88
|
+
source,
|
|
89
|
+
schema: schema,
|
|
90
|
+
schema_overrides: schema_overrides,
|
|
91
|
+
infer_schema_length: infer_schema_length,
|
|
92
|
+
batch_size: batch_size,
|
|
93
|
+
n_rows: n_rows,
|
|
94
|
+
low_memory: low_memory,
|
|
95
|
+
rechunk: rechunk,
|
|
96
|
+
row_index_name: row_index_name,
|
|
97
|
+
row_index_offset: row_index_offset,
|
|
98
|
+
ignore_errors: ignore_errors,
|
|
99
|
+
include_file_paths: include_file_paths,
|
|
100
|
+
retries: retries,
|
|
101
|
+
storage_options: storage_options,
|
|
102
|
+
credential_provider: credential_provider_builder,
|
|
103
|
+
file_cache_ttl: file_cache_ttl,
|
|
104
|
+
).collect
|
|
40
105
|
end
|
|
41
106
|
|
|
42
107
|
# Lazily read from a newline delimited JSON file.
|
|
@@ -46,6 +111,19 @@ module Polars
|
|
|
46
111
|
#
|
|
47
112
|
# @param source [String]
|
|
48
113
|
# Path to a file.
|
|
114
|
+
# @param schema [Object]
|
|
115
|
+
# The DataFrame schema may be declared in several ways:
|
|
116
|
+
#
|
|
117
|
+
# * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
|
|
118
|
+
# * As a list of column names; in this case types are automatically inferred.
|
|
119
|
+
# * As a list of (name,type) pairs; this is equivalent to the hash form.
|
|
120
|
+
#
|
|
121
|
+
# If you supply a list of column names that does not match the names in the
|
|
122
|
+
# underlying data, the names given here will overwrite them. The number
|
|
123
|
+
# of names given in the schema should match the underlying data dimensions.
|
|
124
|
+
# @param schema_overrides [Hash]
|
|
125
|
+
# Support type specification or override of one or more columns; note that
|
|
126
|
+
# any dtypes inferred from the schema param will be overridden.
|
|
49
127
|
# @param infer_schema_length [Integer]
|
|
50
128
|
# Infer the schema length from the first `infer_schema_length` rows.
|
|
51
129
|
# @param batch_size [Integer]
|
|
@@ -56,22 +134,58 @@ module Polars
|
|
|
56
134
|
# Reduce memory pressure at the expense of performance.
|
|
57
135
|
# @param rechunk [Boolean]
|
|
58
136
|
# Reallocate to contiguous memory when all chunks/ files are parsed.
|
|
59
|
-
# @param
|
|
137
|
+
# @param row_index_name [String]
|
|
60
138
|
# If not nil, this will insert a row count column with the given name into the
|
|
61
139
|
# DataFrame.
|
|
62
|
-
# @param
|
|
140
|
+
# @param row_index_offset [Integer]
|
|
63
141
|
# Offset to start the row_count column (only use if the name is set).
|
|
142
|
+
# @param ignore_errors [Boolean]
|
|
143
|
+
# Return `Null` if parsing fails because of schema mismatches.
|
|
144
|
+
# @param storage_options [Hash]
|
|
145
|
+
# Options that indicate how to connect to a cloud provider.
|
|
146
|
+
#
|
|
147
|
+
# The cloud providers currently supported are AWS, GCP, and Azure.
|
|
148
|
+
# See supported keys here:
|
|
149
|
+
#
|
|
150
|
+
# * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
|
|
151
|
+
# * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
|
|
152
|
+
# * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
|
|
153
|
+
# * Hugging Face (`hf://`): Accepts an API key under the `token` parameter:
|
|
154
|
+
# `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
|
155
|
+
#
|
|
156
|
+
# If `storage_options` is not provided, Polars will try to infer the information
|
|
157
|
+
# from environment variables.
|
|
158
|
+
# @param credential_provider [Object]
|
|
159
|
+
# Provide a function that can be called to provide cloud storage
|
|
160
|
+
# credentials. The function is expected to return a hash of
|
|
161
|
+
# credential keys along with an optional credential expiry time.
|
|
162
|
+
# @param retries [Integer]
|
|
163
|
+
# Number of retries if accessing a cloud instance fails.
|
|
164
|
+
# @param file_cache_ttl [Integer]
|
|
165
|
+
# Amount of time to keep downloaded cloud files since their last access time,
|
|
166
|
+
# in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
|
|
167
|
+
# (which defaults to 1 hour) if not given.
|
|
168
|
+
# @param include_file_paths [String]
|
|
169
|
+
# Include the path of the source file(s) as a column with this name.
|
|
64
170
|
#
|
|
65
171
|
# @return [LazyFrame]
|
|
66
172
|
def scan_ndjson(
|
|
67
173
|
source,
|
|
174
|
+
schema: nil,
|
|
175
|
+
schema_overrides: nil,
|
|
68
176
|
infer_schema_length: N_INFER_DEFAULT,
|
|
69
177
|
batch_size: 1024,
|
|
70
178
|
n_rows: nil,
|
|
71
179
|
low_memory: false,
|
|
72
|
-
rechunk:
|
|
73
|
-
|
|
74
|
-
|
|
180
|
+
rechunk: false,
|
|
181
|
+
row_index_name: nil,
|
|
182
|
+
row_index_offset: 0,
|
|
183
|
+
ignore_errors: false,
|
|
184
|
+
storage_options: nil,
|
|
185
|
+
credential_provider: "auto",
|
|
186
|
+
retries: 2,
|
|
187
|
+
file_cache_ttl: nil,
|
|
188
|
+
include_file_paths: nil
|
|
75
189
|
)
|
|
76
190
|
sources = []
|
|
77
191
|
if Utils.pathlike?(source)
|
|
@@ -86,16 +200,39 @@ module Polars
|
|
|
86
200
|
source = nil
|
|
87
201
|
end
|
|
88
202
|
|
|
203
|
+
if infer_schema_length == 0
|
|
204
|
+
msg = "'infer_schema_length' should be positive"
|
|
205
|
+
raise ArgumentError, msg
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
credential_provider_builder = _init_credential_provider_builder(
|
|
209
|
+
credential_provider, source, storage_options, "scan_ndjson"
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
if storage_options&.any?
|
|
213
|
+
storage_options = storage_options.map { |k, v| [k.to_s, v.to_s] }
|
|
214
|
+
else
|
|
215
|
+
storage_options = nil
|
|
216
|
+
end
|
|
217
|
+
|
|
89
218
|
rblf =
|
|
90
219
|
RbLazyFrame.new_from_ndjson(
|
|
91
220
|
source,
|
|
92
221
|
sources,
|
|
93
222
|
infer_schema_length,
|
|
223
|
+
schema,
|
|
224
|
+
schema_overrides,
|
|
94
225
|
batch_size,
|
|
95
226
|
n_rows,
|
|
96
227
|
low_memory,
|
|
97
228
|
rechunk,
|
|
98
|
-
Utils.parse_row_index_args(
|
|
229
|
+
Utils.parse_row_index_args(row_index_name, row_index_offset),
|
|
230
|
+
ignore_errors,
|
|
231
|
+
include_file_paths,
|
|
232
|
+
storage_options,
|
|
233
|
+
credential_provider_builder,
|
|
234
|
+
retries,
|
|
235
|
+
file_cache_ttl
|
|
99
236
|
)
|
|
100
237
|
Utils.wrap_ldf(rblf)
|
|
101
238
|
end
|