polars-df 0.10.0-x86_64-linux → 0.11.0-x86_64-linux

@@ -141,7 +141,7 @@ module Polars
 
  df = nil
  _prepare_file_arg(source) do |data|
- df = DataFrame._read_csv(
+ df = _read_csv_impl(
  data,
  has_header: has_header,
  columns: columns || projection,
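
For context, the public entry point is unchanged by this refactor; a call like the following sketch (the file path and column names are hypothetical) still goes through Polars.read_csv, which now forwards to the private _read_csv_impl introduced in the hunks below:

    # Hypothetical usage; "data.csv" and the column names are illustrative only.
    require "polars-df"

    df = Polars.read_csv(
      "data.csv",
      has_header: true,
      columns: ["id", "name"]  # same keyword arguments as before the refactor
    )
    puts df.shape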
@@ -176,90 +176,24 @@ module Polars
  end
  end
 
- # Lazily read from a CSV file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and
- # projections to the scan level, thereby potentially reducing
- # memory overhead.
- #
- # @param source [Object]
- # Path to a file.
- # @param has_header [Boolean]
- # Indicate if the first row of dataset is a header or not.
- # If set to false, column names will be autogenerated in the
- # following format: `column_x`, with `x` being an
- # enumeration over every column in the dataset starting at 1.
- # @param sep [String]
- # Single byte character to use as delimiter in the file.
- # @param comment_char [String]
- # Single byte character that indicates the start of a comment line,
- # for instance `#`.
- # @param quote_char [String]
- # Single byte character used for csv quoting.
- # Set to None to turn off special handling and escaping of quotes.
- # @param skip_rows [Integer]
- # Start reading after `skip_rows` lines. The header will be parsed at this
- # offset.
- # @param dtypes [Object]
- # Overwrite dtypes during inference.
- # @param null_values [Object]
- # Values to interpret as null values. You can provide a:
- #
- # - `String`: All values equal to this string will be null.
- # - `Array`: All values equal to any string in this array will be null.
- # - `Hash`: A hash that maps column name to a null value string.
- # @param ignore_errors [Boolean]
- # Try to keep reading lines if some lines yield errors.
- # First try `infer_schema_length: 0` to read all columns as
- # `:str` to check which values might cause an issue.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param with_column_names [Object]
- # Apply a function over the column names.
- # This can be used to update a schema just in time, thus before
- # scanning.
- # @param infer_schema_length [Integer]
- # Maximum number of lines to read to infer schema.
- # If set to 0, all columns will be read as `:str`.
- # If set to `nil`, a full table scan will be done (slow).
- # @param n_rows [Integer]
- # Stop reading from CSV file after reading `n_rows`.
- # @param encoding ["utf8", "utf8-lossy"]
- # Lossy means that invalid utf8 values are replaced with `�`
- # characters.
- # @param low_memory [Boolean]
- # Reduce memory usage in expense of performance.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param skip_rows_after_header [Integer]
- # Skip this number of rows when the header is parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with the given name into
- # the DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only used if the name is set).
- # @param parse_dates [Boolean]
- # Try to automatically parse dates. If this does not succeed,
- # the column remains of data type `:str`.
- # @param eol_char [String]
- # Single byte end of line character.
- # @param truncate_ragged_lines [Boolean]
- # Truncate lines that are longer than the schema.
- #
- # @return [LazyFrame]
- def scan_csv(
- source,
+ # @private
+ def _read_csv_impl(
+ file,
  has_header: true,
+ columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
+ schema: nil,
  null_values: nil,
+ missing_utf8_is_empty_string: false,
  ignore_errors: false,
- cache: true,
- with_column_names: nil,
+ parse_dates: false,
+ n_threads: nil,
  infer_schema_length: 100,
+ batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
@@ -267,419 +201,119 @@ module Polars
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
- parse_dates: false,
+ sample_size: 1024,
  eol_char: "\n",
- truncate_ragged_lines: false
- )
- Utils._check_arg_is_1byte("sep", sep, false)
- Utils._check_arg_is_1byte("comment_char", comment_char, false)
- Utils._check_arg_is_1byte("quote_char", quote_char, true)
-
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
- end
-
- LazyFrame._scan_csv(
- source,
- has_header: has_header,
- sep: sep,
- comment_char: comment_char,
- quote_char: quote_char,
- skip_rows: skip_rows,
- dtypes: dtypes,
- null_values: null_values,
- ignore_errors: ignore_errors,
- cache: cache,
- with_column_names: with_column_names,
- infer_schema_length: infer_schema_length,
- n_rows: n_rows,
- low_memory: low_memory,
- rechunk: rechunk,
- skip_rows_after_header: skip_rows_after_header,
- encoding: encoding,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- parse_dates: parse_dates,
- eol_char: eol_char,
- truncate_ragged_lines: truncate_ragged_lines
- )
- end
-
- # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a IPC file.
- # @param n_rows [Integer]
- # Stop reading from IPC file after reading `n_rows`.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param memory_map [Boolean]
- # Try to memory map the file. This can greatly improve performance on repeated
- # queries as the OS may cache pages.
- # Only uncompressed IPC files can be memory mapped.
- #
- # @return [LazyFrame]
- def scan_ipc(
- source,
- n_rows: nil,
- cache: true,
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0,
- storage_options: nil,
- memory_map: true
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false,
+ glob: true
  )
- LazyFrame._scan_ipc(
- source,
- n_rows: n_rows,
- cache: cache,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- storage_options: storage_options,
- memory_map: memory_map
- )
- end
-
- # Lazily read from a parquet file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a file.
- # @param n_rows [Integer]
- # Stop reading from parquet file after reading `n_rows`.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param parallel ["auto", "columns", "row_groups", "none"]
- # This determines the direction of parallelism. 'auto' will try to determine the
- # optimal direction.
- # @param rechunk [Boolean]
- # In case of reading multiple files via a glob pattern rechunk the final DataFrame
- # into contiguous memory chunks.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- #
- # @return [LazyFrame]
- def scan_parquet(
- source,
- n_rows: nil,
- cache: true,
- parallel: "auto",
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0,
- storage_options: nil,
- low_memory: false
- )
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ if Utils.pathlike?(file)
+ path = Utils.normalize_filepath(file)
+ else
+ path = nil
+ # if defined?(StringIO) && file.is_a?(StringIO)
+ # file = file.string
+ # end
  end
 
- LazyFrame._scan_parquet(
- source,
- n_rows:n_rows,
- cache: cache,
- parallel: parallel,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- storage_options: storage_options,
- low_memory: low_memory
- )
- end
-
- # Lazily read from a newline delimited JSON file.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a file.
- # @param infer_schema_length [Integer]
- # Infer the schema length from the first `infer_schema_length` rows.
- # @param batch_size [Integer]
- # Number of rows to read in each batch.
- # @param n_rows [Integer]
- # Stop reading from JSON file after reading `n_rows`.
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- #
- # @return [LazyFrame]
- def scan_ndjson(
- source,
- infer_schema_length: 100,
- batch_size: 1024,
- n_rows: nil,
- low_memory: false,
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0
- )
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ dtype_list = nil
+ dtype_slice = nil
+ if !dtypes.nil?
+ if dtypes.is_a?(Hash)
+ dtype_list = []
+ dtypes.each do|k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
+ elsif dtypes.is_a?(::Array)
+ dtype_slice = dtypes
+ else
+ raise ArgumentError, "dtype arg should be list or dict"
+ end
  end
 
- LazyFrame._scan_ndjson(
- source,
- infer_schema_length: infer_schema_length,
- batch_size: batch_size,
- n_rows: n_rows,
- low_memory: low_memory,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- )
- end
+ processed_null_values = Utils._process_null_values(null_values)
 
- # Read into a DataFrame from Apache Avro format.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from Apache Avro file after reading ``n_rows``.
- #
- # @return [DataFrame]
- def read_avro(source, columns: nil, n_rows: nil)
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ if columns.is_a?(::String)
+ columns = [columns]
  end
-
- DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
- end
-
- # Read into a DataFrame from Arrow IPC (Feather v2) file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from IPC file after reading `n_rows`.
- # @param memory_map [Boolean]
- # Try to memory map the file. This can greatly improve performance on repeated
- # queries as the OS may cache pages.
- # Only uncompressed IPC files can be memory mapped.
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param rechunk [Boolean]
- # Make sure that all data is contiguous.
- #
- # @return [DataFrame]
- def read_ipc(
- source,
- columns: nil,
- n_rows: nil,
- memory_map: true,
- storage_options: nil,
- row_count_name: nil,
- row_count_offset: 0,
- rechunk: true
- )
- storage_options ||= {}
- _prepare_file_arg(source, **storage_options) do |data|
- DataFrame._read_ipc(
- data,
- columns: columns,
+ if file.is_a?(::String) && file.include?("*")
+ dtypes_dict = nil
+ if !dtype_list.nil?
+ dtypes_dict = dtype_list.to_h
+ end
+ if !dtype_slice.nil?
+ raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
+ end
+ scan = scan_csv(
+ file,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes_dict,
+ null_values: null_values,
+ missing_utf8_is_empty_string: missing_utf8_is_empty_string,
+ ignore_errors: ignore_errors,
+ infer_schema_length: infer_schema_length,
  n_rows: n_rows,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
+ low_memory: low_memory,
  rechunk: rechunk,
- memory_map: memory_map
- )
- end
- end
-
- # Read into a DataFrame from a parquet file.
- #
- # @param source [String, Pathname, StringIO]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from parquet file after reading `n_rows`.
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param parallel ["auto", "columns", "row_groups", "none"]
- # This determines the direction of parallelism. 'auto' will try to determine the
- # optimal direction.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- # @param use_statistics [Boolean]
- # Use statistics in the parquet to determine if pages
- # can be skipped from reading.
- # @param rechunk [Boolean]
- # Make sure that all columns are contiguous in memory by
- # aggregating the chunks into a single array.
- #
- # @return [DataFrame]
- #
- # @note
- # This operation defaults to a `rechunk` operation at the end, meaning that
- # all data will be stored continuously in memory.
- # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
- # an expensive operation.
- def read_parquet(
- source,
- columns: nil,
- n_rows: nil,
- storage_options: nil,
- parallel: "auto",
- row_count_name: nil,
- row_count_offset: 0,
- low_memory: false,
- use_statistics: true,
- rechunk: true
- )
- _prepare_file_arg(source) do |data|
- DataFrame._read_parquet(
- data,
- columns: columns,
- n_rows: n_rows,
- parallel: parallel,
+ skip_rows_after_header: skip_rows_after_header,
  row_count_name: row_count_name,
  row_count_offset: row_count_offset,
- low_memory: low_memory,
- use_statistics: use_statistics,
- rechunk: rechunk
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines,
+ decimal_comma: decimal_comma,
+ glob: glob
  )
- end
- end
-
- # Read into a DataFrame from a JSON file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [DataFrame]
- def read_json(source)
- DataFrame._read_json(source)
- end
-
- # Read into a DataFrame from a newline delimited JSON file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [DataFrame]
- def read_ndjson(source)
- DataFrame._read_ndjson(source)
- end
-
- # Read a SQL query into a DataFrame.
- #
- # @param query [Object]
- # ActiveRecord::Relation or ActiveRecord::Result.
- # @param schema_overrides [Hash]
- # A hash mapping column names to dtypes, used to override the schema
- # inferred from the query.
- #
- # @return [DataFrame]
- def read_database(query, schema_overrides: nil)
- if !defined?(ActiveRecord)
- raise Error, "Active Record not available"
- end
-
- result =
- if query.is_a?(ActiveRecord::Result)
- query
- elsif query.is_a?(ActiveRecord::Relation)
- query.connection.select_all(query.to_sql)
- elsif query.is_a?(::String)
- ActiveRecord::Base.connection.select_all(query)
+ if columns.nil?
+ return scan.collect
+ elsif is_str_sequence(columns, allow_str: false)
+ return scan.select(columns).collect
  else
- raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
+ raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
  end
-
- data = {}
- schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
-
- result.columns.each_with_index do |k, i|
- column_type = result.column_types[i]
-
- data[k] =
- if column_type
- result.rows.map { |r| column_type.deserialize(r[i]) }
- else
- result.rows.map { |r| r[i] }
- end
-
- polars_type =
- case column_type&.type
- when :binary
- Binary
- when :boolean
- Boolean
- when :date
- Date
- when :datetime, :timestamp
- Datetime
- when :decimal
- Decimal
- when :float
- Float64
- when :integer
- Int64
- when :string, :text
- String
- when :time
- Time
- # TODO fix issue with null
- # when :json, :jsonb
- # Struct
- end
-
- schema_overrides[k] ||= polars_type if polars_type
  end
 
- DataFrame.new(data, schema_overrides: schema_overrides)
- end
- alias_method :read_sql, :read_database
+ projection, columns = Utils.handle_projection_columns(columns)
 
- # def read_excel
- # end
+ rbdf =
+ RbDataFrame.read_csv(
+ file,
+ infer_schema_length,
+ batch_size,
+ has_header,
+ ignore_errors,
+ n_rows,
+ skip_rows,
+ projection,
+ sep,
+ rechunk,
+ columns,
+ encoding,
+ n_threads,
+ path,
+ dtype_list,
+ dtype_slice,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ missing_utf8_is_empty_string,
+ parse_dates,
+ skip_rows_after_header,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ sample_size,
+ eol_char,
+ raise_if_empty,
+ truncate_ragged_lines,
+ decimal_comma,
+ schema
+ )
+ Utils.wrap_df(rbdf)
+ end
 
  # Read a CSV file in batches.
  #
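
A short sketch of how the new glob branch in _read_csv_impl behaves from the caller's side (the paths and column names below are hypothetical): named dtypes and string column selections are accepted, while positional dtypes or integer projections raise ArgumentError, because a glob pattern is collected through scan_csv rather than the eager reader.

    # Hypothetical usage; "logs/*.csv" and the names are illustrative only.
    df = Polars.read_csv(
      "logs/*.csv",
      dtypes: {"status" => Polars::Int64},  # Hash of name => dtype works with globs
      columns: ["timestamp", "status"]      # string column names work with globs
    )

    # Both of these would raise ArgumentError on a glob pattern:
    # Polars.read_csv("logs/*.csv", dtypes: [Polars::Int64])  # unnamed dtypes
    # Polars.read_csv("logs/*.csv", columns: [0, 1])          # integer projection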
@@ -784,6 +418,7 @@ module Polars
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
+ missing_utf8_is_empty_string: false,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
@@ -798,7 +433,9 @@ module Polars
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n",
- truncate_ragged_lines: false
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false
  )
  projection, columns = Utils.handle_projection_columns(columns)
 
@@ -824,6 +461,7 @@ module Polars
  skip_rows: skip_rows,
  dtypes: dtypes,
  null_values: null_values,
+ missing_utf8_is_empty_string: missing_utf8_is_empty_string,
  ignore_errors: ignore_errors,
  parse_dates: parse_dates,
  n_threads: n_threads,
@@ -839,36 +477,204 @@ module Polars
  sample_size: sample_size,
  eol_char: eol_char,
  new_columns: new_columns,
- truncate_ragged_lines: truncate_ragged_lines
+ raise_if_empty: raise_if_empty,
+ truncate_ragged_lines: truncate_ragged_lines,
+ decimal_comma: decimal_comma
  )
  end
 
- # Get a schema of the IPC file without reading data.
+ # Lazily read from a CSV file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and
+ # projections to the scan level, thereby potentially reducing
+ # memory overhead.
  #
  # @param source [Object]
- # Path to a file or a file-like object.
+ # Path to a file.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to None to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines. The header will be parsed at this
+ # offset.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
  #
- # @return [Hash]
- def read_ipc_schema(source)
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param with_column_names [Object]
+ # Apply a function over the column names.
+ # This can be used to update a schema just in time, thus before
+ # scanning.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters.
+ # @param low_memory [Boolean]
+ # Reduce memory usage in expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/ files are parsed.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
+ #
+ # @return [LazyFrame]
+ def scan_csv(
+ source,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ missing_utf8_is_empty_string: false,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n",
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false,
+ glob: true
+ )
+ Utils._check_arg_is_1byte("sep", sep, false)
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
+
  if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ source = Utils.normalize_filepath(source)
  end
 
- Plr.ipc_schema(source)
+ _scan_csv_impl(
+ source,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ cache: cache,
+ with_column_names: with_column_names,
+ infer_schema_length: infer_schema_length,
+ n_rows: n_rows,
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ encoding: encoding,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ parse_dates: parse_dates,
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines
+ )
  end
 
- # Get a schema of the Parquet file without reading data.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [Hash]
- def read_parquet_schema(source)
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ # @private
+ def _scan_csv_impl(
+ file,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n",
+ truncate_ragged_lines: true
+ )
+ dtype_list = nil
+ if !dtypes.nil?
+ dtype_list = []
+ dtypes.each do |k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
  end
-
- Plr.parquet_schema(source)
+ processed_null_values = Utils._process_null_values(null_values)
+
+ rblf =
+ RbLazyFrame.new_from_csv(
+ file,
+ sep,
+ has_header,
+ ignore_errors,
+ skip_rows,
+ n_rows,
+ cache,
+ dtype_list,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ infer_schema_length,
+ with_column_names,
+ rechunk,
+ skip_rows_after_header,
+ encoding,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ parse_dates,
+ eol_char,
+ truncate_ragged_lines
+ )
+ Utils.wrap_ldf(rblf)
  end
 
  private
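
To close with a usage sketch of the relocated scan_csv (the file path, column names, and filter value are hypothetical): the LazyFrame it returns lets the query optimizer push the filter and projection down to the CSV scan before anything is materialized.

    # Hypothetical usage; "events.csv" and the column names are illustrative only.
    lf = Polars.scan_csv("events.csv", parse_dates: true)
    df =
      lf
        .filter(Polars.col("status") == "ok")  # predicate pushed down to the scan
        .select(["timestamp", "status"])       # projection pushed down as well
        .collect
    puts df.head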