polars-df 0.14.0-x86_64-linux → 0.15.0-x86_64-linux
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/Cargo.lock +1296 -283
- data/LICENSE-THIRD-PARTY.txt +24793 -13160
- data/LICENSE.txt +1 -0
- data/README.md +1 -2
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +275 -52
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +2 -0
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +103 -2
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +88 -10
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/ipc.rb +14 -4
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +649 -15
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1144 -0
- data/lib/polars/series.rb +465 -35
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -1
- metadata +4 -3
- data/lib/polars/functions.rb +0 -57
data/lib/polars/io/parquet.rb
CHANGED
@@ -2,120 +2,108 @@ module Polars
   module IO
     # Read into a DataFrame from a parquet file.
     #
-    # @param source [
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param columns [Object]
     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
     #   of column names.
     # @param n_rows [Integer]
     #   Stop reading from parquet file after reading `n_rows`.
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
-    # @param parallel ["auto", "columns", "row_groups", "none"]
-    #   This determines the direction of parallelism. 'auto' will try to determine the
-    #   optimal direction.
     # @param row_count_name [String]
     #   If not nil, this will insert a row count column with give name into the
     #   DataFrame.
     # @param row_count_offset [Integer]
     #   Offset to start the row_count column (only use if the name is set).
-    # @param
-    #
+    # @param parallel ["auto", "columns", "row_groups", "none"]
+    #   This determines the direction of parallelism. 'auto' will try to determine the
+    #   optimal direction.
     # @param use_statistics [Boolean]
     #   Use statistics in the parquet to determine if pages
     #   can be skipped from reading.
+    # @param hive_partitioning [Boolean]
+    #   Infer statistics and schema from hive partitioned URL and use them
+    #   to prune reads.
+    # @param glob [Boolean]
+    #   Expand path given via globbing rules.
+    # @param schema [Object]
+    #   Specify the datatypes of the columns. The datatypes must match the
+    #   datatypes in the file(s). If there are extra columns that are not in the
+    #   file(s), consider also enabling `allow_missing_columns`.
+    # @param hive_schema [Object]
+    #   The column names and data types of the columns by which the data is partitioned.
+    #   If set to `nil` (default), the schema of the Hive partitions is inferred.
+    # @param try_parse_hive_dates [Boolean]
+    #   Whether to try parsing hive values as date/datetime types.
     # @param rechunk [Boolean]
-    #
-    #
+    #   In case of reading multiple files via a glob pattern rechunk the final DataFrame
+    #   into contiguous memory chunks.
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a dictionary of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [DataFrame]
-    #
-    # @note
-    #   This operation defaults to a `rechunk` operation at the end, meaning that
-    #   all data will be stored continuously in memory.
-    #   Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
-    #   an expensive operation.
     def read_parquet(
       source,
       columns: nil,
       n_rows: nil,
-      storage_options: nil,
-      parallel: "auto",
       row_count_name: nil,
       row_count_offset: 0,
-
+      parallel: "auto",
       use_statistics: true,
-
+      hive_partitioning: nil,
+      glob: true,
+      schema: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      rechunk: false,
+      low_memory: false,
+      storage_options: nil,
+      credential_provider: nil,
+      retries: 2,
+      include_file_paths: nil,
+      allow_missing_columns: false
     )
-
-
-
-          columns: columns,
+      lf =
+        scan_parquet(
+          source,
           n_rows: n_rows,
-          parallel: parallel,
           row_count_name: row_count_name,
           row_count_offset: row_count_offset,
-
+          parallel: parallel,
           use_statistics: use_statistics,
-
+          hive_partitioning: hive_partitioning,
+          schema: schema,
+          hive_schema: hive_schema,
+          try_parse_hive_dates: try_parse_hive_dates,
+          rechunk: rechunk,
+          low_memory: low_memory,
+          cache: false,
+          storage_options: storage_options,
+          credential_provider: credential_provider,
+          retries: retries,
+          glob: glob,
+          include_file_paths: include_file_paths,
+          allow_missing_columns: allow_missing_columns
         )
-      end
-    end
 
-
-
-
-      columns: nil,
-      n_rows: nil,
-      parallel: "auto",
-      row_count_name: nil,
-      row_count_offset: 0,
-      low_memory: false,
-      use_statistics: true,
-      rechunk: true
-    )
-      if Utils.pathlike?(source)
-        source = Utils.normalize_filepath(source)
-      end
-      if columns.is_a?(::String)
-        columns = [columns]
-      end
-
-      if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
-        scan =
-          scan_parquet(
-            source,
-            n_rows: n_rows,
-            rechunk: true,
-            parallel: parallel,
-            row_count_name: row_count_name,
-            row_count_offset: row_count_offset,
-            low_memory: low_memory
-          )
-
-        if columns.nil?
-          return scan.collect
-        elsif Utils.is_str_sequence(columns, allow_str: false)
-          return scan.select(columns).collect
+      if !columns.nil?
+        if Utils.is_int_sequence(columns)
+          lf = lf.select(F.nth(columns))
         else
-
+          lf = lf.select(columns)
         end
       end
 
-
-      rbdf =
-        RbDataFrame.read_parquet(
-          source,
-          columns,
-          projection,
-          n_rows,
-          parallel,
-          Utils.parse_row_index_args(row_count_name, row_count_offset),
-          low_memory,
-          use_statistics,
-          rechunk
-        )
-      Utils.wrap_df(rbdf)
+      lf.collect
     end
 
     # Get a schema of the Parquet file without reading data.
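For illustration only (an annotation, not part of the diff): in 0.15.0 `read_parquet` becomes a thin wrapper that builds a LazyFrame via `scan_parquet` (with `cache: false`) and collects it, so column selection is applied on the LazyFrame before `collect`. A minimal usage sketch of the new keyword arguments, with hypothetical file paths and column names:

require "polars-df"

# Hypothetical files and column names, used only to show the new options;
# glob expansion (glob: true) is the default.
df = Polars.read_parquet(
  "data/events-*.parquet",
  columns: ["id", "ts", "value"],    # names, or integer indices (applied via F.nth)
  n_rows: 10_000,
  rechunk: false,                    # 0.15.0 default; the old reader rechunked by default
  include_file_paths: "source_file"  # adds a column holding the originating file path
)
puts df.shape
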
@@ -137,46 +125,83 @@ module Polars
     # This allows the query optimizer to push down predicates and projections to the scan
     # level, thereby potentially reducing memory overhead.
     #
-    # @param source [
-    #   Path to a file.
+    # @param source [Object]
+    #   Path to a file or a file-like object.
     # @param n_rows [Integer]
     #   Stop reading from parquet file after reading `n_rows`.
-    # @param
-    #
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with give name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only use if the name is set).
     # @param parallel ["auto", "columns", "row_groups", "none"]
     #   This determines the direction of parallelism. 'auto' will try to determine the
     #   optimal direction.
+    # @param use_statistics [Boolean]
+    #   Use statistics in the parquet to determine if pages
+    #   can be skipped from reading.
+    # @param hive_partitioning [Boolean]
+    #   Infer statistics and schema from hive partitioned URL and use them
+    #   to prune reads.
+    # @param glob [Boolean]
+    #   Expand path given via globbing rules.
+    # @param schema [Object]
+    #   Specify the datatypes of the columns. The datatypes must match the
+    #   datatypes in the file(s). If there are extra columns that are not in the
+    #   file(s), consider also enabling `allow_missing_columns`.
+    # @param hive_schema [Object]
+    #   The column names and data types of the columns by which the data is partitioned.
+    #   If set to `nil` (default), the schema of the Hive partitions is inferred.
+    # @param try_parse_hive_dates [Boolean]
+    #   Whether to try parsing hive values as date/datetime types.
     # @param rechunk [Boolean]
     #   In case of reading multiple files via a glob pattern rechunk the final DataFrame
     #   into contiguous memory chunks.
-    # @param row_count_name [String]
-    #   If not nil, this will insert a row count column with give name into the
-    #   DataFrame.
-    # @param row_count_offset [Integer]
-    #   Offset to start the row_count column (only use if the name is set).
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
     # @param low_memory [Boolean]
     #   Reduce memory pressure at the expense of performance.
+    # @param cache [Boolean]
+    #   Cache the result after reading.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param credential_provider [Object]
+    #   Provide a function that can be called to provide cloud storage
+    #   credentials. The function is expected to return a dictionary of
+    #   credential keys along with an optional credential expiry time.
+    # @param retries [Integer]
+    #   Number of retries if accessing a cloud instance fails.
     # @param include_file_paths [String]
-    #
+    #   Include the path of the source file(s) as a column with this name.
     #
     # @return [LazyFrame]
     def scan_parquet(
       source,
       n_rows: nil,
-      cache: true,
-      parallel: "auto",
-      glob: true,
-      rechunk: true,
       row_count_name: nil,
       row_count_offset: 0,
-
+      parallel: "auto",
+      use_statistics: true,
+      hive_partitioning: nil,
+      glob: true,
+      schema: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      rechunk: false,
       low_memory: false,
-
+      cache: true,
+      storage_options: nil,
+      credential_provider: nil,
+      retries: 2,
+      include_file_paths: nil,
+      allow_missing_columns: false
     )
       if Utils.pathlike?(source)
-        source = Utils.normalize_filepath(source)
+        source = Utils.normalize_filepath(source, check_not_directory: false)
+      elsif Utils.is_path_or_str_sequence(source)
+        source = source.map { |s| Utils.normalize_filepath(s, check_not_directory: false) }
+      end
+
+      if credential_provider
+        raise Todo
       end
 
       _scan_parquet_impl(
@@ -185,47 +210,79 @@ module Polars
         cache: cache,
         parallel: parallel,
         rechunk: rechunk,
-
-
+        row_index_name: row_count_name,
+        row_index_offset: row_count_offset,
         storage_options: storage_options,
+        credential_provider: credential_provider,
         low_memory: low_memory,
+        use_statistics: use_statistics,
+        hive_partitioning: hive_partitioning,
+        schema: schema,
+        hive_schema: hive_schema,
+        try_parse_hive_dates: try_parse_hive_dates,
+        retries: retries,
         glob: glob,
-        include_file_paths: include_file_paths
+        include_file_paths: include_file_paths,
+        allow_missing_columns: allow_missing_columns
       )
     end
 
     # @private
     def _scan_parquet_impl(
-
+      source,
       n_rows: nil,
       cache: true,
       parallel: "auto",
       rechunk: true,
-
-
+      row_index_name: nil,
+      row_index_offset: 0,
       storage_options: nil,
+      credential_provider: nil,
       low_memory: false,
       use_statistics: true,
       hive_partitioning: nil,
       glob: true,
-
+      schema: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      retries: 2,
+      include_file_paths: nil,
+      allow_missing_columns: false
     )
+      if source.is_a?(::Array)
+        sources = source
+        source = nil
+      else
+        sources = []
+      end
+
+      if storage_options
+        storage_options = storage_options.map { |k, v| [k.to_s, v.to_s] }
+      else
+        storage_options = nil
+      end
+
       rblf =
         RbLazyFrame.new_from_parquet(
-
-
+          source,
+          sources,
           n_rows,
           cache,
           parallel,
           rechunk,
-          Utils.parse_row_index_args(
+          Utils.parse_row_index_args(row_index_name, row_index_offset),
           low_memory,
+          storage_options,
+          credential_provider,
           use_statistics,
           hive_partitioning,
-
-
+          schema,
+          hive_schema,
+          try_parse_hive_dates,
+          retries,
           glob,
-          include_file_paths
+          include_file_paths,
+          allow_missing_columns
         )
       Utils.wrap_ldf(rblf)
     end
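
For illustration only (an annotation, not part of the diff): `scan_parquet` now exposes hive-partitioning, schema, retry, and multi-file options, and `_scan_parquet_impl` splits an Array source into a `sources` list for the native reader. A usage sketch against a hypothetical hive-partitioned dataset:

require "polars-df"

# Hypothetical layout: data/year=2024/month=01/part-0.parquet, ...
lf = Polars.scan_parquet(
  "data/**/*.parquet",
  hive_partitioning: true,     # infer partition columns (e.g. year, month) from the paths
  try_parse_hive_dates: true,
  include_file_paths: "path"
)

# Predicates and projections are pushed down to the scan at collect time.
df = lf.filter(Polars.col("value") > 0).select(["year", "value", "path"]).collect

# An Array of paths is also accepted in 0.15.0; each entry is normalized and
# forwarded to the native reader as the `sources` list.
lf2 = Polars.scan_parquet(["data/a.parquet", "data/b.parquet"])

Note that `credential_provider` currently raises `Todo` in the Ruby binding, so cloud credential callbacks are not usable from this path yet.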