polars-df 0.13.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +208 -0
- data/Cargo.lock +2556 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +39278 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +104 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +36 -0
- data/lib/polars/cat_name_space.rb +88 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +98 -0
- data/lib/polars/data_frame.rb +5191 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1397 -0
- data/lib/polars/date_time_name_space.rb +1287 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +38 -0
- data/lib/polars/expr.rb +7256 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +271 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1329 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +136 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +613 -0
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/io/csv.rb +696 -0
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +275 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +233 -0
- data/lib/polars/lazy_frame.rb +2708 -0
- data/lib/polars/lazy_group_by.rb +181 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +449 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +4444 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1495 -0
- data/lib/polars/string_name_space.rb +811 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +130 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +91 -0
- metadata +138 -0
data/lib/polars/io/database.rb
@@ -0,0 +1,73 @@
+module Polars
+  module IO
+    # Read a SQL query into a DataFrame.
+    #
+    # @param query [Object]
+    #   ActiveRecord::Relation or ActiveRecord::Result.
+    # @param schema_overrides [Hash]
+    #   A hash mapping column names to dtypes, used to override the schema
+    #   inferred from the query.
+    #
+    # @return [DataFrame]
+    def read_database(query, schema_overrides: nil)
+      if !defined?(ActiveRecord)
+        raise Error, "Active Record not available"
+      end
+
+      result =
+        if query.is_a?(ActiveRecord::Result)
+          query
+        elsif query.is_a?(ActiveRecord::Relation)
+          query.connection.select_all(query.to_sql)
+        elsif query.is_a?(::String)
+          ActiveRecord::Base.connection.select_all(query)
+        else
+          raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
+        end
+
+      data = {}
+      schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
+
+      result.columns.each_with_index do |k, i|
+        column_type = result.column_types[i]
+
+        data[k] =
+          if column_type
+            result.rows.map { |r| column_type.deserialize(r[i]) }
+          else
+            result.rows.map { |r| r[i] }
+          end
+
+        polars_type =
+          case column_type&.type
+          when :binary
+            Binary
+          when :boolean
+            Boolean
+          when :date
+            Date
+          when :datetime, :timestamp
+            Datetime
+          when :decimal
+            Decimal
+          when :float
+            Float64
+          when :integer
+            Int64
+          when :string, :text
+            String
+          when :time
+            Time
+          # TODO fix issue with null
+          # when :json, :jsonb
+          #   Struct
+          end
+
+        schema_overrides[k] ||= polars_type if polars_type
+      end
+
+      DataFrame.new(data, schema_overrides: schema_overrides)
+    end
+    alias_method :read_sql, :read_database
+  end
+end
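A minimal usage sketch for the read_database / read_sql reader added above. It assumes an Active Record connection is already established; the users table, its columns, and the User model are hypothetical:

    require "polars-df"

    # A raw SQL string goes through ActiveRecord::Base.connection.select_all.
    df = Polars.read_database("SELECT id, name, created_at FROM users")

    # Column dtypes are mapped from the Active Record column types; individual
    # columns can be overridden explicitly.
    df = Polars.read_database(
      "SELECT id, name, created_at FROM users",
      schema_overrides: {"id" => Polars::Int32}
    )

    # An ActiveRecord::Relation works as well, and read_sql is an alias.
    df = Polars.read_sql(User.where(active: true))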
data/lib/polars/io/ipc.rb
@@ -0,0 +1,275 @@
+module Polars
+  module IO
+    # Read into a DataFrame from an Arrow IPC (Feather v2) file.
+    #
+    # @param source [Object]
+    #   Path to a file or a file-like object.
+    # @param columns [Object]
+    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+    #   of column names.
+    # @param n_rows [Integer]
+    #   Stop reading from IPC file after reading `n_rows`.
+    # @param memory_map [Boolean]
+    #   Try to memory map the file. This can greatly improve performance on repeated
+    #   queries as the OS may cache pages.
+    #   Only uncompressed IPC files can be memory mapped.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param rechunk [Boolean]
+    #   Make sure that all data is contiguous.
+    #
+    # @return [DataFrame]
+    def read_ipc(
+      source,
+      columns: nil,
+      n_rows: nil,
+      memory_map: true,
+      storage_options: nil,
+      row_count_name: nil,
+      row_count_offset: 0,
+      rechunk: true
+    )
+      storage_options ||= {}
+      _prepare_file_arg(source, **storage_options) do |data|
+        _read_ipc_impl(
+          data,
+          columns: columns,
+          n_rows: n_rows,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          rechunk: rechunk,
+          memory_map: memory_map
+        )
+      end
+    end
+
+    # @private
+    def _read_ipc_impl(
+      file,
+      columns: nil,
+      n_rows: nil,
+      row_count_name: nil,
+      row_count_offset: 0,
+      rechunk: true,
+      memory_map: true
+    )
+      if Utils.pathlike?(file)
+        file = Utils.normalize_filepath(file)
+      end
+      if columns.is_a?(::String)
+        columns = [columns]
+      end
+
+      if file.is_a?(::String) && file.include?("*")
+        raise Todo
+      end
+
+      projection, columns = Utils.handle_projection_columns(columns)
+      rbdf =
+        RbDataFrame.read_ipc(
+          file,
+          columns,
+          projection,
+          n_rows,
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          memory_map
+        )
+      Utils.wrap_df(rbdf)
+    end
+
+    # Read into a DataFrame from an Arrow IPC record batch stream.
+    #
+    # See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html.
+    #
+    # @param source [Object]
+    #   Path to a file or a file-like object.
+    # @param columns [Array]
+    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+    #   of column names.
+    # @param n_rows [Integer]
+    #   Stop reading from IPC stream after reading `n_rows`.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param row_index_name [String]
+    #   Insert a row index column with the given name into the DataFrame as the first
+    #   column. If set to `nil` (default), no row index column is created.
+    # @param row_index_offset [Integer]
+    #   Start the row index at this offset. Cannot be negative.
+    #   Only used if `row_index_name` is set.
+    # @param rechunk [Boolean]
+    #   Make sure that all data is contiguous.
+    #
+    # @return [DataFrame]
+    def read_ipc_stream(
+      source,
+      columns: nil,
+      n_rows: nil,
+      storage_options: nil,
+      row_index_name: nil,
+      row_index_offset: 0,
+      rechunk: true
+    )
+      storage_options ||= {}
+      _prepare_file_arg(source, **storage_options) do |data|
+        _read_ipc_stream_impl(
+          data,
+          columns: columns,
+          n_rows: n_rows,
+          row_index_name: row_index_name,
+          row_index_offset: row_index_offset,
+          rechunk: rechunk
+        )
+      end
+    end
+
+    # @private
+    def _read_ipc_stream_impl(
+      source,
+      columns: nil,
+      n_rows: nil,
+      row_index_name: nil,
+      row_index_offset: 0,
+      rechunk: true
+    )
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source)
+      end
+      if columns.is_a?(String)
+        columns = [columns]
+      end
+
+      projection, columns = Utils.handle_projection_columns(columns)
+      pydf = RbDataFrame.read_ipc_stream(
+        source,
+        columns,
+        projection,
+        n_rows,
+        Utils.parse_row_index_args(row_index_name, row_index_offset),
+        rechunk
+      )
+      Utils.wrap_df(pydf)
+    end
+
+    # Get a schema of the IPC file without reading data.
+    #
+    # @param source [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [Hash]
+    def read_ipc_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source)
+      end
+
+      Plr.ipc_schema(source)
+    end
+
+    # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
+    #
+    # This allows the query optimizer to push down predicates and projections to the scan
+    # level, thereby potentially reducing memory overhead.
+    #
+    # @param source [String]
+    #   Path to an IPC file.
+    # @param n_rows [Integer]
+    #   Stop reading from IPC file after reading `n_rows`.
+    # @param cache [Boolean]
+    #   Cache the result after reading.
+    # @param rechunk [Boolean]
+    #   Reallocate to contiguous memory when all chunks/files are parsed.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param memory_map [Boolean]
+    #   Try to memory map the file. This can greatly improve performance on repeated
+    #   queries as the OS may cache pages.
+    #   Only uncompressed IPC files can be memory mapped.
+    # @param hive_partitioning [Boolean]
+    #   Infer statistics and schema from Hive partitioned URL and use them
+    #   to prune reads. This is unset by default (i.e. `nil`), meaning it is
+    #   automatically enabled when a single directory is passed, and otherwise
+    #   disabled.
+    # @param hive_schema [Hash]
+    #   The column names and data types of the columns by which the data is partitioned.
+    #   If set to `nil` (default), the schema of the Hive partitions is inferred.
+    # @param try_parse_hive_dates [Boolean]
+    #   Whether to try parsing hive values as date/datetime types.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
+    #
+    # @return [LazyFrame]
+    def scan_ipc(
+      source,
+      n_rows: nil,
+      cache: true,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      memory_map: true,
+      hive_partitioning: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      include_file_paths: nil
+    )
+      _scan_ipc_impl(
+        source,
+        n_rows: n_rows,
+        cache: cache,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        memory_map: memory_map,
+        hive_partitioning: hive_partitioning,
+        hive_schema: hive_schema,
+        try_parse_hive_dates: try_parse_hive_dates,
+        include_file_paths: include_file_paths
+      )
+    end
+
+    # @private
+    def _scan_ipc_impl(
+      file,
+      n_rows: nil,
+      cache: true,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      memory_map: true,
+      hive_partitioning: nil,
+      hive_schema: nil,
+      try_parse_hive_dates: true,
+      include_file_paths: nil
+    )
+      if Utils.pathlike?(file)
+        file = Utils.normalize_filepath(file)
+      end
+
+      rblf =
+        RbLazyFrame.new_from_ipc(
+          file,
+          n_rows,
+          cache,
+          rechunk,
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          memory_map,
+          hive_partitioning,
+          hive_schema,
+          try_parse_hive_dates,
+          include_file_paths
+        )
+      Utils.wrap_ldf(rblf)
+    end
+  end
+end
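A short usage sketch for the IPC readers defined above; the file names and column names are placeholders:

    require "polars-df"

    # Inspect the schema without reading any data.
    Polars.read_ipc_schema("data.arrow")

    # Eager read of an uncompressed Feather v2 file (memory-mapped by default).
    df = Polars.read_ipc("data.arrow", columns: ["id", "ts"], n_rows: 1_000)

    # Record batch stream format instead of the random-access file format.
    df = Polars.read_ipc_stream("data_stream.arrow", row_index_name: "row_nr")

    # Lazy scan, so projections and predicates are pushed down to the scan level.
    lf = Polars.scan_ipc("data.arrow", row_count_name: "row_nr")
    df = lf.select(["row_nr", "id"]).collect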
data/lib/polars/io/json.rb
@@ -0,0 +1,29 @@
+module Polars
+  module IO
+    # Read into a DataFrame from a JSON file.
+    #
+    # @param source [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [DataFrame]
+    def read_json(
+      source,
+      schema: nil,
+      schema_overrides: nil,
+      infer_schema_length: N_INFER_DEFAULT
+    )
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source)
+      end
+
+      rbdf =
+        RbDataFrame.read_json(
+          source,
+          infer_schema_length,
+          schema,
+          schema_overrides
+        )
+      Utils.wrap_df(rbdf)
+    end
+  end
+end
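A brief usage sketch of read_json; the file and column names are placeholders:

    require "polars-df"

    # The schema is inferred from the file unless given or overridden.
    df = Polars.read_json("events.json")
    df = Polars.read_json("events.json", schema_overrides: {"amount" => Polars::Float64})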
data/lib/polars/io/ndjson.rb
@@ -0,0 +1,80 @@
+module Polars
+  module IO
+    # Read into a DataFrame from a newline delimited JSON file.
+    #
+    # @param source [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [DataFrame]
+    def read_ndjson(
+      source,
+      schema: nil,
+      schema_overrides: nil,
+      ignore_errors: false
+    )
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source)
+      end
+
+      rbdf =
+        RbDataFrame.read_ndjson(
+          source,
+          ignore_errors,
+          schema,
+          schema_overrides
+        )
+      Utils.wrap_df(rbdf)
+    end
+
+    # Lazily read from a newline delimited JSON file.
+    #
+    # This allows the query optimizer to push down predicates and projections to the scan
+    # level, thereby potentially reducing memory overhead.
+    #
+    # @param source [String]
+    #   Path to a file.
+    # @param infer_schema_length [Integer]
+    #   Infer the schema length from the first `infer_schema_length` rows.
+    # @param batch_size [Integer]
+    #   Number of rows to read in each batch.
+    # @param n_rows [Integer]
+    #   Stop reading from JSON file after reading `n_rows`.
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    # @param rechunk [Boolean]
+    #   Reallocate to contiguous memory when all chunks/files are parsed.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    #
+    # @return [LazyFrame]
+    def scan_ndjson(
+      source,
+      infer_schema_length: N_INFER_DEFAULT,
+      batch_size: 1024,
+      n_rows: nil,
+      low_memory: false,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0
+    )
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source)
+      end
+
+      rblf =
+        RbLazyFrame.new_from_ndjson(
+          source,
+          infer_schema_length,
+          batch_size,
+          n_rows,
+          low_memory,
+          rechunk,
+          Utils.parse_row_index_args(row_count_name, row_count_offset)
+        )
+      Utils.wrap_ldf(rblf)
+    end
+  end
+end
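A usage sketch for the NDJSON readers above; the file and column names are placeholders:

    require "polars-df"

    # Eager read, skipping lines that fail to parse.
    df = Polars.read_ndjson("logs.ndjson", ignore_errors: true)

    # Lazy scan with a row count column; only the selected columns are materialized on collect.
    lf = Polars.scan_ndjson("logs.ndjson", batch_size: 4096, row_count_name: "row_nr")
    df = lf.select(["row_nr", "message"]).collect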
data/lib/polars/io/parquet.rb
@@ -0,0 +1,233 @@
+module Polars
+  module IO
+    # Read into a DataFrame from a parquet file.
+    #
+    # @param source [String, Pathname, StringIO]
+    #   Path to a file or a file-like object.
+    # @param columns [Object]
+    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+    #   of column names.
+    # @param n_rows [Integer]
+    #   Stop reading from parquet file after reading `n_rows`.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param parallel ["auto", "columns", "row_groups", "none"]
+    #   This determines the direction of parallelism. 'auto' will try to determine the
+    #   optimal direction.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    # @param use_statistics [Boolean]
+    #   Use statistics in the parquet to determine if pages
+    #   can be skipped from reading.
+    # @param rechunk [Boolean]
+    #   Make sure that all columns are contiguous in memory by
+    #   aggregating the chunks into a single array.
+    #
+    # @return [DataFrame]
+    #
+    # @note
+    #   This operation defaults to a `rechunk` operation at the end, meaning that
+    #   all data will be stored continuously in memory.
+    #   Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
+    #   an expensive operation.
+    def read_parquet(
+      source,
+      columns: nil,
+      n_rows: nil,
+      storage_options: nil,
+      parallel: "auto",
+      row_count_name: nil,
+      row_count_offset: 0,
+      low_memory: false,
+      use_statistics: true,
+      rechunk: true
+    )
+      _prepare_file_arg(source) do |data|
+        _read_parquet_impl(
+          data,
+          columns: columns,
+          n_rows: n_rows,
+          parallel: parallel,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          low_memory: low_memory,
+          use_statistics: use_statistics,
+          rechunk: rechunk
+        )
+      end
+    end
+
+    # @private
+    def _read_parquet_impl(
+      source,
+      columns: nil,
+      n_rows: nil,
+      parallel: "auto",
+      row_count_name: nil,
+      row_count_offset: 0,
+      low_memory: false,
+      use_statistics: true,
+      rechunk: true
+    )
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source)
+      end
+      if columns.is_a?(::String)
+        columns = [columns]
+      end
+
+      if source.is_a?(::String) && source.include?("*") && Utils.local_file?(source)
+        scan =
+          scan_parquet(
+            source,
+            n_rows: n_rows,
+            rechunk: true,
+            parallel: parallel,
+            row_count_name: row_count_name,
+            row_count_offset: row_count_offset,
+            low_memory: low_memory
+          )
+
+        if columns.nil?
+          return scan.collect
+        elsif Utils.is_str_sequence(columns, allow_str: false)
+          return scan.select(columns).collect
+        else
+          raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: Array[String]"
+        end
+      end
+
+      projection, columns = Utils.handle_projection_columns(columns)
+      rbdf =
+        RbDataFrame.read_parquet(
+          source,
+          columns,
+          projection,
+          n_rows,
+          parallel,
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          low_memory,
+          use_statistics,
+          rechunk
+        )
+      Utils.wrap_df(rbdf)
+    end
+
+    # Get a schema of the Parquet file without reading data.
+    #
+    # @param source [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [Hash]
+    def read_parquet_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source)
+      end
+
+      Plr.parquet_schema(source)
+    end
+
+    # Lazily read from a parquet file or multiple files via glob patterns.
+    #
+    # This allows the query optimizer to push down predicates and projections to the scan
+    # level, thereby potentially reducing memory overhead.
+    #
+    # @param source [String]
+    #   Path to a file.
+    # @param n_rows [Integer]
+    #   Stop reading from parquet file after reading `n_rows`.
+    # @param cache [Boolean]
+    #   Cache the result after reading.
+    # @param parallel ["auto", "columns", "row_groups", "none"]
+    #   This determines the direction of parallelism. 'auto' will try to determine the
+    #   optimal direction.
+    # @param rechunk [Boolean]
+    #   In case of reading multiple files via a glob pattern rechunk the final DataFrame
+    #   into contiguous memory chunks.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    # @param include_file_paths [String]
+    #   Include the path of the source file(s) as a column with this name.
+    #
+    # @return [LazyFrame]
+    def scan_parquet(
+      source,
+      n_rows: nil,
+      cache: true,
+      parallel: "auto",
+      glob: true,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      low_memory: false,
+      include_file_paths: nil
+    )
+      if Utils.pathlike?(source)
+        source = Utils.normalize_filepath(source)
+      end
+
+      _scan_parquet_impl(
+        source,
+        n_rows: n_rows,
+        cache: cache,
+        parallel: parallel,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        low_memory: low_memory,
+        glob: glob,
+        include_file_paths: include_file_paths
+      )
+    end
+
+    # @private
+    def _scan_parquet_impl(
+      file,
+      n_rows: nil,
+      cache: true,
+      parallel: "auto",
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      low_memory: false,
+      use_statistics: true,
+      hive_partitioning: nil,
+      glob: true,
+      include_file_paths: nil
+    )
+      rblf =
+        RbLazyFrame.new_from_parquet(
+          file,
+          [],
+          n_rows,
+          cache,
+          parallel,
+          rechunk,
+          Utils.parse_row_index_args(row_count_name, row_count_offset),
+          low_memory,
+          use_statistics,
+          hive_partitioning,
+          nil,
+          true,
+          glob,
+          include_file_paths
+        )
+      Utils.wrap_ldf(rblf)
+    end
+  end
+end
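A usage sketch for the Parquet readers above; paths and column names are placeholders:

    require "polars-df"

    # Eager read; rechunk: false skips the final (expensive) rechunk, e.g. when benchmarking.
    df = Polars.read_parquet("data.parquet", columns: ["id", "value"], rechunk: false)

    # For local glob patterns, read_parquet delegates to scan_parquet, so columns
    # must be selected by name rather than by index.
    df = Polars.read_parquet("data/*.parquet", columns: ["id", "value"])

    # Lazy scan so predicate and projection pushdown can prune row groups.
    lf = Polars.scan_parquet("data/*.parquet", low_memory: true)
    df = lf.filter(Polars.col("value") > 0).select(["id", "value"]).collect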