polars-df 0.9.0-arm64-darwin → 0.11.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/Cargo.lock +144 -57
- data/LICENSE-THIRD-PARTY.txt +629 -29
- data/README.md +7 -6
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/3.3/polars.bundle +0 -0
- data/lib/polars/array_expr.rb +6 -2
- data/lib/polars/batched_csv_reader.rb +11 -3
- data/lib/polars/convert.rb +6 -1
- data/lib/polars/data_frame.rb +225 -370
- data/lib/polars/date_time_expr.rb +11 -4
- data/lib/polars/date_time_name_space.rb +14 -4
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/exceptions.rb +4 -0
- data/lib/polars/expr.rb +1171 -54
- data/lib/polars/functions/lazy.rb +3 -3
- data/lib/polars/functions/range/date_range.rb +92 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/whenthen.rb +74 -5
- data/lib/polars/group_by.rb +88 -23
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/{io.rb → io/csv.rb} +307 -489
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +247 -0
- data/lib/polars/io/json.rb +18 -0
- data/lib/polars/io/ndjson.rb +69 -0
- data/lib/polars/io/parquet.rb +226 -0
- data/lib/polars/lazy_frame.rb +55 -195
- data/lib/polars/lazy_group_by.rb +100 -3
- data/lib/polars/list_expr.rb +6 -2
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +14 -12
- data/lib/polars/string_expr.rb +38 -36
- data/lib/polars/utils.rb +89 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars.rb +10 -3
- metadata +13 -6
- data/lib/polars/when.rb +0 -16
- data/lib/polars/when_then.rb +0 -19
data/lib/polars/{io.rb → io/csv.rb}

```diff
@@ -80,6 +80,8 @@ module Polars
     #   allocation needed.
     # @param eol_char [String]
     #   Single byte end of line character.
+    # @param truncate_ragged_lines [Boolean]
+    #   Truncate lines that are longer than the schema.
     #
     # @return [DataFrame]
     #
```
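The new `truncate_ragged_lines` flag gives `read_csv` a lenient mode for rows that carry more fields than the header. A minimal sketch (the file name and contents here are illustrative):

```ruby
require "polars-df"

# The third line has one field more than the two-column header.
File.write("events.csv", "id,name\n1,a\n2,b,extra\n")

# With truncate_ragged_lines: true the surplus field is cut off instead of
# the read failing; the default (false) keeps the strict behavior.
df = Polars.read_csv("events.csv", truncate_ragged_lines: true)
p df.shape  # => [2, 2]
```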
```diff
@@ -113,7 +115,8 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0,
       sample_size: 1024,
-      eol_char: "\n"
+      eol_char: "\n",
+      truncate_ragged_lines: false
     )
       Utils._check_arg_is_1byte("sep", sep, false)
       Utils._check_arg_is_1byte("comment_char", comment_char, false)
```
```diff
@@ -138,7 +141,7 @@ module Polars
 
       df = nil
       _prepare_file_arg(source) do |data|
-        df = DataFrame._read_csv(
+        df = _read_csv_impl(
           data,
           has_header: has_header,
           columns: columns || projection,
```
```diff
@@ -161,7 +164,8 @@ module Polars
           row_count_name: row_count_name,
           row_count_offset: row_count_offset,
           sample_size: sample_size,
-          eol_char: eol_char
+          eol_char: eol_char,
+          truncate_ragged_lines: truncate_ragged_lines
         )
       end
 
```
```diff
@@ -172,88 +176,24 @@ module Polars
       end
     end
 
-    # Lazily read from a CSV file or multiple files via glob patterns.
-    #
-    # This allows the query optimizer to push down predicates and
-    # projections to the scan level, thereby potentially reducing
-    # memory overhead.
-    #
-    # @param source [Object]
-    #   Path to a file.
-    # @param has_header [Boolean]
-    #   Indicate if the first row of dataset is a header or not.
-    #   If set to false, column names will be autogenerated in the
-    #   following format: `column_x`, with `x` being an
-    #   enumeration over every column in the dataset starting at 1.
-    # @param sep [String]
-    #   Single byte character to use as delimiter in the file.
-    # @param comment_char [String]
-    #   Single byte character that indicates the start of a comment line,
-    #   for instance `#`.
-    # @param quote_char [String]
-    #   Single byte character used for csv quoting.
-    #   Set to None to turn off special handling and escaping of quotes.
-    # @param skip_rows [Integer]
-    #   Start reading after `skip_rows` lines. The header will be parsed at this
-    #   offset.
-    # @param dtypes [Object]
-    #   Overwrite dtypes during inference.
-    # @param null_values [Object]
-    #   Values to interpret as null values. You can provide a:
-    #
-    #   - `String`: All values equal to this string will be null.
-    #   - `Array`: All values equal to any string in this array will be null.
-    #   - `Hash`: A hash that maps column name to a null value string.
-    # @param ignore_errors [Boolean]
-    #   Try to keep reading lines if some lines yield errors.
-    #   First try `infer_schema_length: 0` to read all columns as
-    #   `:str` to check which values might cause an issue.
-    # @param cache [Boolean]
-    #   Cache the result after reading.
-    # @param with_column_names [Object]
-    #   Apply a function over the column names.
-    #   This can be used to update a schema just in time, thus before
-    #   scanning.
-    # @param infer_schema_length [Integer]
-    #   Maximum number of lines to read to infer schema.
-    #   If set to 0, all columns will be read as `:str`.
-    #   If set to `nil`, a full table scan will be done (slow).
-    # @param n_rows [Integer]
-    #   Stop reading from CSV file after reading `n_rows`.
-    # @param encoding ["utf8", "utf8-lossy"]
-    #   Lossy means that invalid utf8 values are replaced with `�`
-    #   characters.
-    # @param low_memory [Boolean]
-    #   Reduce memory usage in expense of performance.
-    # @param rechunk [Boolean]
-    #   Reallocate to contiguous memory when all chunks/ files are parsed.
-    # @param skip_rows_after_header [Integer]
-    #   Skip this number of rows when the header is parsed.
-    # @param row_count_name [String]
-    #   If not nil, this will insert a row count column with the given name into
-    #   the DataFrame.
-    # @param row_count_offset [Integer]
-    #   Offset to start the row_count column (only used if the name is set).
-    # @param parse_dates [Boolean]
-    #   Try to automatically parse dates. If this does not succeed,
-    #   the column remains of data type `:str`.
-    # @param eol_char [String]
-    #   Single byte end of line character.
-    #
-    # @return [LazyFrame]
-    def scan_csv(
-      source,
+    # @private
+    def _read_csv_impl(
+      file,
       has_header: true,
+      columns: nil,
       sep: ",",
       comment_char: nil,
       quote_char: '"',
       skip_rows: 0,
       dtypes: nil,
+      schema: nil,
       null_values: nil,
+      missing_utf8_is_empty_string: false,
       ignore_errors: false,
-      cache: true,
-      with_column_names: nil,
+      parse_dates: false,
+      n_threads: nil,
       infer_schema_length: 100,
+      batch_size: 8192,
       n_rows: nil,
       encoding: "utf8",
       low_memory: false,
```
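`_read_csv_impl` is tagged `@private`, so the supported entry point stays `Polars.read_csv`; new plumbing such as `batch_size` and `schema` lives behind the impl while documented keywords keep passing straight through. A small sketch using only options documented in the hunks above (file name illustrative):

```ruby
# infer_schema_length: 0 reads every column as :str, which the docs above
# recommend for isolating values that break stricter dtypes.
df = Polars.read_csv(
  "events.csv",
  infer_schema_length: 0,
  null_values: ["", "NA"]  # Array form: any matching value becomes null
)
```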
```diff
@@ -261,417 +201,119 @@ module Polars
       skip_rows_after_header: 0,
       row_count_name: nil,
       row_count_offset: 0,
-      parse_dates: false,
-      eol_char: "\n"
-    )
-      Utils._check_arg_is_1byte("sep", sep, false)
-      Utils._check_arg_is_1byte("comment_char", comment_char, false)
-      Utils._check_arg_is_1byte("quote_char", quote_char, true)
-
-      if Utils.pathlike?(source)
-        source = Utils.normalise_filepath(source)
-      end
-
-      LazyFrame._scan_csv(
-        source,
-        has_header: has_header,
-        sep: sep,
-        comment_char: comment_char,
-        quote_char: quote_char,
-        skip_rows: skip_rows,
-        dtypes: dtypes,
-        null_values: null_values,
-        ignore_errors: ignore_errors,
-        cache: cache,
-        with_column_names: with_column_names,
-        infer_schema_length: infer_schema_length,
-        n_rows: n_rows,
-        low_memory: low_memory,
-        rechunk: rechunk,
-        skip_rows_after_header: skip_rows_after_header,
-        encoding: encoding,
-        row_count_name: row_count_name,
-        row_count_offset: row_count_offset,
-        parse_dates: parse_dates,
-        eol_char: eol_char,
-      )
-    end
-
-    # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
-    #
-    # This allows the query optimizer to push down predicates and projections to the scan
-    # level, thereby potentially reducing memory overhead.
-    #
-    # @param source [String]
-    #   Path to a IPC file.
-    # @param n_rows [Integer]
-    #   Stop reading from IPC file after reading `n_rows`.
-    # @param cache [Boolean]
-    #   Cache the result after reading.
-    # @param rechunk [Boolean]
-    #   Reallocate to contiguous memory when all chunks/ files are parsed.
-    # @param row_count_name [String]
-    #   If not nil, this will insert a row count column with give name into the
-    #   DataFrame.
-    # @param row_count_offset [Integer]
-    #   Offset to start the row_count column (only use if the name is set).
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
-    # @param memory_map [Boolean]
-    #   Try to memory map the file. This can greatly improve performance on repeated
-    #   queries as the OS may cache pages.
-    #   Only uncompressed IPC files can be memory mapped.
-    #
-    # @return [LazyFrame]
-    def scan_ipc(
-      source,
-      n_rows: nil,
-      cache: true,
-      rechunk: true,
-      row_count_name: nil,
-      row_count_offset: 0,
-      storage_options: nil,
-      memory_map: true
-    )
-      LazyFrame._scan_ipc(
-        source,
-        n_rows: n_rows,
-        cache: cache,
-        rechunk: rechunk,
-        row_count_name: row_count_name,
-        row_count_offset: row_count_offset,
-        storage_options: storage_options,
-        memory_map: memory_map
-      )
-    end
-
-    # Lazily read from a parquet file or multiple files via glob patterns.
-    #
-    # This allows the query optimizer to push down predicates and projections to the scan
-    # level, thereby potentially reducing memory overhead.
-    #
-    # @param source [String]
-    #   Path to a file.
-    # @param n_rows [Integer]
-    #   Stop reading from parquet file after reading `n_rows`.
-    # @param cache [Boolean]
-    #   Cache the result after reading.
-    # @param parallel ["auto", "columns", "row_groups", "none"]
-    #   This determines the direction of parallelism. 'auto' will try to determine the
-    #   optimal direction.
-    # @param rechunk [Boolean]
-    #   In case of reading multiple files via a glob pattern rechunk the final DataFrame
-    #   into contiguous memory chunks.
-    # @param row_count_name [String]
-    #   If not nil, this will insert a row count column with give name into the
-    #   DataFrame.
-    # @param row_count_offset [Integer]
-    #   Offset to start the row_count column (only use if the name is set).
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
-    # @param low_memory [Boolean]
-    #   Reduce memory pressure at the expense of performance.
-    #
-    # @return [LazyFrame]
-    def scan_parquet(
-      source,
-      n_rows: nil,
-      cache: true,
-      parallel: "auto",
-      rechunk: true,
-      row_count_name: nil,
-      row_count_offset: 0,
-      storage_options: nil,
-      low_memory: false
+      sample_size: 1024,
+      eol_char: "\n",
+      raise_if_empty: true,
+      truncate_ragged_lines: false,
+      decimal_comma: false,
+      glob: true
     )
-      if Utils.pathlike?(source)
-        source = Utils.normalise_filepath(source)
+      if Utils.pathlike?(file)
+        path = Utils.normalize_filepath(file)
+      else
+        path = nil
+        # if defined?(StringIO) && file.is_a?(StringIO)
+        #   file = file.string
+        # end
       end
 
-      LazyFrame._scan_parquet(
-        source,
-        n_rows: n_rows,
-        cache: cache,
-        parallel: parallel,
-        rechunk: rechunk,
-        row_count_name: row_count_name,
-        row_count_offset: row_count_offset,
-        storage_options: storage_options,
-        low_memory: low_memory
-      )
-    end
-
-    # Lazily read from a newline delimited JSON file.
-    #
-    # This allows the query optimizer to push down predicates and projections to the scan
-    # level, thereby potentially reducing memory overhead.
-    #
-    # @param source [String]
-    #   Path to a file.
-    # @param infer_schema_length [Integer]
-    #   Infer the schema length from the first `infer_schema_length` rows.
-    # @param batch_size [Integer]
-    #   Number of rows to read in each batch.
-    # @param n_rows [Integer]
-    #   Stop reading from JSON file after reading `n_rows`.
-    # @param low_memory [Boolean]
-    #   Reduce memory pressure at the expense of performance.
-    # @param rechunk [Boolean]
-    #   Reallocate to contiguous memory when all chunks/ files are parsed.
-    # @param row_count_name [String]
-    #   If not nil, this will insert a row count column with give name into the
-    #   DataFrame.
-    # @param row_count_offset [Integer]
-    #   Offset to start the row_count column (only use if the name is set).
-    #
-    # @return [LazyFrame]
-    def scan_ndjson(
-      source,
-      infer_schema_length: 100,
-      batch_size: 1024,
-      n_rows: nil,
-      low_memory: false,
-      rechunk: true,
-      row_count_name: nil,
-      row_count_offset: 0
-    )
-      if Utils.pathlike?(source)
-        source = Utils.normalise_filepath(source)
+      dtype_list = nil
+      dtype_slice = nil
+      if !dtypes.nil?
+        if dtypes.is_a?(Hash)
+          dtype_list = []
+          dtypes.each do |k, v|
+            dtype_list << [k, Utils.rb_type_to_dtype(v)]
+          end
+        elsif dtypes.is_a?(::Array)
+          dtype_slice = dtypes
+        else
+          raise ArgumentError, "dtype arg should be list or dict"
+        end
       end
 
-      LazyFrame._scan_ndjson(
-        source,
-        infer_schema_length: infer_schema_length,
-        batch_size: batch_size,
-        n_rows: n_rows,
-        low_memory: low_memory,
-        rechunk: rechunk,
-        row_count_name: row_count_name,
-        row_count_offset: row_count_offset,
-      )
-    end
+      processed_null_values = Utils._process_null_values(null_values)
 
-    # Read into a DataFrame from Apache Avro format.
-    #
-    # @param source [Object]
-    #   Path to a file or a file-like object.
-    # @param columns [Object]
-    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
-    #   of column names.
-    # @param n_rows [Integer]
-    #   Stop reading from Apache Avro file after reading ``n_rows``.
-    #
-    # @return [DataFrame]
-    def read_avro(source, columns: nil, n_rows: nil)
-      if Utils.pathlike?(source)
-        source = Utils.normalise_filepath(source)
+      if columns.is_a?(::String)
+        columns = [columns]
       end
-
-      DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
-    end
-
-    # Read into a DataFrame from Arrow IPC (Feather v2) file.
-    #
-    # @param source [Object]
-    #   Path to a file or a file-like object.
-    # @param columns [Object]
-    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
-    #   of column names.
-    # @param n_rows [Integer]
-    #   Stop reading from IPC file after reading `n_rows`.
-    # @param memory_map [Boolean]
-    #   Try to memory map the file. This can greatly improve performance on repeated
-    #   queries as the OS may cache pages.
-    #   Only uncompressed IPC files can be memory mapped.
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
-    # @param row_count_name [String]
-    #   If not nil, this will insert a row count column with give name into the
-    #   DataFrame.
-    # @param row_count_offset [Integer]
-    #   Offset to start the row_count column (only use if the name is set).
-    # @param rechunk [Boolean]
-    #   Make sure that all data is contiguous.
-    #
-    # @return [DataFrame]
-    def read_ipc(
-      source,
-      columns: nil,
-      n_rows: nil,
-      memory_map: true,
-      storage_options: nil,
-      row_count_name: nil,
-      row_count_offset: 0,
-      rechunk: true
-    )
-      storage_options ||= {}
-      _prepare_file_arg(source, **storage_options) do |data|
-        DataFrame._read_ipc(
-          data,
-          columns: columns,
+
+      if file.is_a?(::String) && file.include?("*")
+        dtypes_dict = nil
+        if !dtype_list.nil?
+          dtypes_dict = dtype_list.to_h
+        end
+        if !dtype_slice.nil?
+          raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
+        end
+        scan = scan_csv(
+          file,
+          has_header: has_header,
+          sep: sep,
+          comment_char: comment_char,
+          quote_char: quote_char,
+          skip_rows: skip_rows,
+          dtypes: dtypes_dict,
+          null_values: null_values,
+          missing_utf8_is_empty_string: missing_utf8_is_empty_string,
+          ignore_errors: ignore_errors,
+          infer_schema_length: infer_schema_length,
           n_rows: n_rows,
-          row_count_name: row_count_name,
-          row_count_offset: row_count_offset,
+          low_memory: low_memory,
           rechunk: rechunk,
-          memory_map: memory_map
-        )
-      end
-    end
-
-    # Read into a DataFrame from a parquet file.
-    #
-    # @param source [Object]
-    #   Path to a file or a file-like object.
-    # @param columns [Object]
-    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
-    #   of column names.
-    # @param n_rows [Integer]
-    #   Stop reading from parquet file after reading `n_rows`.
-    # @param storage_options [Hash]
-    #   Extra options that make sense for a particular storage connection.
-    # @param parallel ["auto", "columns", "row_groups", "none"]
-    #   This determines the direction of parallelism. 'auto' will try to determine the
-    #   optimal direction.
-    # @param row_count_name [String]
-    #   If not nil, this will insert a row count column with give name into the
-    #   DataFrame.
-    # @param row_count_offset [Integer]
-    #   Offset to start the row_count column (only use if the name is set).
-    # @param low_memory [Boolean]
-    #   Reduce memory pressure at the expense of performance.
-    # @param use_statistics [Boolean]
-    #   Use statistics in the parquet to determine if pages
-    #   can be skipped from reading.
-    # @param rechunk [Boolean]
-    #   Make sure that all columns are contiguous in memory by
-    #   aggregating the chunks into a single array.
-    #
-    # @return [DataFrame]
-    #
-    # @note
-    #   This operation defaults to a `rechunk` operation at the end, meaning that
-    #   all data will be stored continuously in memory.
-    #   Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
-    #   an expensive operation.
-    def read_parquet(
-      source,
-      columns: nil,
-      n_rows: nil,
-      storage_options: nil,
-      parallel: "auto",
-      row_count_name: nil,
-      row_count_offset: 0,
-      low_memory: false,
-      use_statistics: true,
-      rechunk: true
-    )
-      _prepare_file_arg(source) do |data|
-        DataFrame._read_parquet(
-          data,
-          columns: columns,
-          n_rows: n_rows,
-          parallel: parallel,
+          skip_rows_after_header: skip_rows_after_header,
           row_count_name: row_count_name,
           row_count_offset: row_count_offset,
-          low_memory: low_memory,
-          use_statistics: use_statistics,
-          rechunk: rechunk
+          eol_char: eol_char,
+          truncate_ragged_lines: truncate_ragged_lines,
+          decimal_comma: decimal_comma,
+          glob: glob
         )
-      end
-    end
-
-    # Read into a DataFrame from a JSON file.
-    #
-    # @param source [Object]
-    #   Path to a file or a file-like object.
-    #
-    # @return [DataFrame]
-    def read_json(source)
-      DataFrame._read_json(source)
-    end
-
-    # Read into a DataFrame from a newline delimited JSON file.
-    #
-    # @param source [Object]
-    #   Path to a file or a file-like object.
-    #
-    # @return [DataFrame]
-    def read_ndjson(source)
-      DataFrame._read_ndjson(source)
-    end
-
-    # Read a SQL query into a DataFrame.
-    #
-    # @param query [Object]
-    #   ActiveRecord::Relation or ActiveRecord::Result.
-    # @param schema_overrides [Hash]
-    #   A hash mapping column names to dtypes, used to override the schema
-    #   inferred from the query.
-    #
-    # @return [DataFrame]
-    def read_database(query, schema_overrides: nil)
-      if !defined?(ActiveRecord)
-        raise Error, "Active Record not available"
-      end
-
-      result =
-        if query.is_a?(ActiveRecord::Result)
-          query
-        elsif query.is_a?(ActiveRecord::Relation)
-          query.connection.select_all(query.to_sql)
-        elsif query.is_a?(::String)
-          ActiveRecord::Base.connection.select_all(query)
+        if columns.nil?
+          return scan.collect
+        elsif is_str_sequence(columns, allow_str: false)
+          return scan.select(columns).collect
         else
-          raise ArgumentError, "…"
+          raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
         end
-
-      data = {}
-      schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
-
-      result.columns.each_with_index do |k, i|
-        column_type = result.column_types[i]
-
-        data[k] =
-          if column_type
-            result.rows.map { |r| column_type.deserialize(r[i]) }
-          else
-            result.rows.map { |r| r[i] }
-          end
-
-        polars_type =
-          case column_type&.type
-          when :binary
-            Binary
-          when :boolean
-            Boolean
-          when :date
-            Date
-          when :datetime, :timestamp
-            Datetime
-          when :decimal
-            Decimal
-          when :float
-            Float64
-          when :integer
-            Int64
-          when :string, :text
-            String
-          when :time
-            Time
-          # TODO fix issue with null
-          # when :json, :jsonb
-          #   Struct
-          end
-
-        schema_overrides[k] ||= polars_type if polars_type
       end
 
-      DataFrame.new(data, schema_overrides: schema_overrides)
-    end
-    alias_method :read_sql, :read_database
+      projection, columns = Utils.handle_projection_columns(columns)
 
-    […]
+      rbdf =
+        RbDataFrame.read_csv(
+          file,
+          infer_schema_length,
+          batch_size,
+          has_header,
+          ignore_errors,
+          n_rows,
+          skip_rows,
+          projection,
+          sep,
+          rechunk,
+          columns,
+          encoding,
+          n_threads,
+          path,
+          dtype_list,
+          dtype_slice,
+          low_memory,
+          comment_char,
+          quote_char,
+          processed_null_values,
+          missing_utf8_is_empty_string,
+          parse_dates,
+          skip_rows_after_header,
+          Utils._prepare_row_count_args(row_count_name, row_count_offset),
+          sample_size,
+          eol_char,
+          raise_if_empty,
+          truncate_ragged_lines,
+          decimal_comma,
+          schema
+        )
+      Utils.wrap_df(rbdf)
+    end
 
     # Read a CSV file in batches.
     #
```
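On the glob branch above, a `*` in a string path reroutes `_read_csv_impl` through `scan_csv` and collects the result, which is why `dtypes` must be a name-keyed Hash and `columns` must be names rather than indices on that path. A sketch (paths and dtypes are illustrative):

```ruby
# Named dtypes and string column selections work when reading via a glob:
df = Polars.read_csv(
  "data/*.csv",
  dtypes: {"id" => Polars::Int64},
  columns: ["id", "name"]
)

# Positional forms raise ArgumentError on the glob path:
# Polars.read_csv("data/*.csv", columns: [0, 1])
# Polars.read_csv("data/*.csv", dtypes: [Polars::Int64])
```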
```diff
@@ -755,6 +397,8 @@ module Polars
     #   allocation needed.
     # @param eol_char [String]
     #   Single byte end of line character.
+    # @param truncate_ragged_lines [Boolean]
+    #   Truncate lines that are longer than the schema.
     #
     # @return [BatchedCsvReader]
     #
```
```diff
@@ -774,6 +418,7 @@ module Polars
       skip_rows: 0,
       dtypes: nil,
       null_values: nil,
+      missing_utf8_is_empty_string: false,
       ignore_errors: false,
       parse_dates: false,
       n_threads: nil,
```
```diff
@@ -787,7 +432,10 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0,
       sample_size: 1024,
-      eol_char: "\n"
+      eol_char: "\n",
+      raise_if_empty: true,
+      truncate_ragged_lines: false,
+      decimal_comma: false
     )
       projection, columns = Utils.handle_projection_columns(columns)
 
```
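The batched reader (`read_csv_batched`, per the `BatchedCsvReader` return type above) picks up the same resilience knobs in this release. A sketch; `next_batches` is assumed here to mirror the Python polars API, returning an array of DataFrames and nil once the input is exhausted:

```ruby
reader = Polars.read_csv_batched(
  "events.csv",
  missing_utf8_is_empty_string: true,  # new: missing string fields become "" instead of null
  truncate_ragged_lines: true,
  raise_if_empty: false                # empty input yields an empty frame rather than an error
)

while (batches = reader.next_batches(5))
  batches.each { |df| p df.shape }
end
```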
```diff
@@ -813,6 +461,7 @@ module Polars
         skip_rows: skip_rows,
         dtypes: dtypes,
         null_values: null_values,
+        missing_utf8_is_empty_string: missing_utf8_is_empty_string,
         ignore_errors: ignore_errors,
         parse_dates: parse_dates,
         n_threads: n_threads,
```
```diff
@@ -827,36 +476,205 @@ module Polars
         row_count_offset: row_count_offset,
         sample_size: sample_size,
         eol_char: eol_char,
-        new_columns: new_columns
+        new_columns: new_columns,
+        raise_if_empty: raise_if_empty,
+        truncate_ragged_lines: truncate_ragged_lines,
+        decimal_comma: decimal_comma
       )
     end
 
-    # […]
+    # Lazily read from a CSV file or multiple files via glob patterns.
+    #
+    # This allows the query optimizer to push down predicates and
+    # projections to the scan level, thereby potentially reducing
+    # memory overhead.
     #
     # @param source [Object]
-    #   Path to a file […]
+    #   Path to a file.
+    # @param has_header [Boolean]
+    #   Indicate if the first row of dataset is a header or not.
+    #   If set to false, column names will be autogenerated in the
+    #   following format: `column_x`, with `x` being an
+    #   enumeration over every column in the dataset starting at 1.
+    # @param sep [String]
+    #   Single byte character to use as delimiter in the file.
+    # @param comment_char [String]
+    #   Single byte character that indicates the start of a comment line,
+    #   for instance `#`.
+    # @param quote_char [String]
+    #   Single byte character used for csv quoting.
+    #   Set to None to turn off special handling and escaping of quotes.
+    # @param skip_rows [Integer]
+    #   Start reading after `skip_rows` lines. The header will be parsed at this
+    #   offset.
+    # @param dtypes [Object]
+    #   Overwrite dtypes during inference.
+    # @param null_values [Object]
+    #   Values to interpret as null values. You can provide a:
+    #
+    #   - `String`: All values equal to this string will be null.
+    #   - `Array`: All values equal to any string in this array will be null.
+    #   - `Hash`: A hash that maps column name to a null value string.
+    # @param ignore_errors [Boolean]
+    #   Try to keep reading lines if some lines yield errors.
+    #   First try `infer_schema_length: 0` to read all columns as
+    #   `:str` to check which values might cause an issue.
+    # @param cache [Boolean]
+    #   Cache the result after reading.
+    # @param with_column_names [Object]
+    #   Apply a function over the column names.
+    #   This can be used to update a schema just in time, thus before
+    #   scanning.
+    # @param infer_schema_length [Integer]
+    #   Maximum number of lines to read to infer schema.
+    #   If set to 0, all columns will be read as `:str`.
+    #   If set to `nil`, a full table scan will be done (slow).
+    # @param n_rows [Integer]
+    #   Stop reading from CSV file after reading `n_rows`.
+    # @param encoding ["utf8", "utf8-lossy"]
+    #   Lossy means that invalid utf8 values are replaced with `�`
+    #   characters.
+    # @param low_memory [Boolean]
+    #   Reduce memory usage in expense of performance.
+    # @param rechunk [Boolean]
+    #   Reallocate to contiguous memory when all chunks/ files are parsed.
+    # @param skip_rows_after_header [Integer]
+    #   Skip this number of rows when the header is parsed.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into
+    #   the DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param parse_dates [Boolean]
+    #   Try to automatically parse dates. If this does not succeed,
+    #   the column remains of data type `:str`.
+    # @param eol_char [String]
+    #   Single byte end of line character.
+    # @param truncate_ragged_lines [Boolean]
+    #   Truncate lines that are longer than the schema.
     #
-    # @return […]
-    def […]
+    # @return [LazyFrame]
+    def scan_csv(
+      source,
+      has_header: true,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      missing_utf8_is_empty_string: false,
+      ignore_errors: false,
+      cache: true,
+      with_column_names: nil,
+      infer_schema_length: 100,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      parse_dates: false,
+      eol_char: "\n",
+      raise_if_empty: true,
+      truncate_ragged_lines: false,
+      decimal_comma: false,
+      glob: true
+    )
+      Utils._check_arg_is_1byte("sep", sep, false)
+      Utils._check_arg_is_1byte("comment_char", comment_char, false)
+      Utils._check_arg_is_1byte("quote_char", quote_char, true)
+
       if Utils.pathlike?(source)
-        source = Utils.normalise_filepath(source)
+        source = Utils.normalize_filepath(source)
       end
 
-      […]
+      _scan_csv_impl(
+        source,
+        has_header: has_header,
+        sep: sep,
+        comment_char: comment_char,
+        quote_char: quote_char,
+        skip_rows: skip_rows,
+        dtypes: dtypes,
+        null_values: null_values,
+        ignore_errors: ignore_errors,
+        cache: cache,
+        with_column_names: with_column_names,
+        infer_schema_length: infer_schema_length,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        skip_rows_after_header: skip_rows_after_header,
+        encoding: encoding,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        parse_dates: parse_dates,
+        eol_char: eol_char,
+        truncate_ragged_lines: truncate_ragged_lines
+      )
     end
 
-    # […]
-    […]
+    # @private
+    def _scan_csv_impl(
+      file,
+      has_header: true,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      cache: true,
+      with_column_names: nil,
+      infer_schema_length: 100,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      parse_dates: false,
+      eol_char: "\n",
+      truncate_ragged_lines: true
+    )
+      dtype_list = nil
+      if !dtypes.nil?
+        dtype_list = []
+        dtypes.each do |k, v|
+          dtype_list << [k, Utils.rb_type_to_dtype(v)]
+        end
       end
-      […]
+      processed_null_values = Utils._process_null_values(null_values)
+
+      rblf =
+        RbLazyFrame.new_from_csv(
+          file,
+          sep,
+          has_header,
+          ignore_errors,
+          skip_rows,
+          n_rows,
+          cache,
+          dtype_list,
+          low_memory,
+          comment_char,
+          quote_char,
+          processed_null_values,
+          infer_schema_length,
+          with_column_names,
+          rechunk,
+          skip_rows_after_header,
+          encoding,
+          Utils._prepare_row_count_args(row_count_name, row_count_offset),
+          parse_dates,
+          eol_char,
+          truncate_ragged_lines
+        )
+      Utils.wrap_ldf(rblf)
     end
 
     private
```
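With `scan_csv` now defined at the end of the new `io/csv.rb`, CSV input joins the lazy pipeline its docstring describes: predicates and projections are pushed down into the scan rather than applied after a full read. A short sketch (file pattern illustrative):

```ruby
lazy = Polars.scan_csv("events*.csv", truncate_ragged_lines: true)

result =
  lazy
    .filter(Polars.col("id") > 100)  # predicate pushed down to the reader
    .select(["id", "name"])          # only these columns are materialized
    .collect

p result.shape
```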