polars-df 0.10.0-arm64-darwin → 0.11.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
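The diff below refactors the CSV reading path in the Polars Ruby module: `read_csv` now delegates to a new private `_read_csv_impl`, `scan_csv` is rebuilt on top of a private `_scan_csv_impl`, and options such as `missing_utf8_is_empty_string`, `raise_if_empty`, `decimal_comma`, and `glob` are threaded through. The public entry points keep their names, so typical callers are unaffected. A minimal usage sketch follows; the require line, file name, and option values are illustrative assumptions and not part of the diff:

require "polars-df"

# Eager read; in 0.11.0 this path routes through the private _read_csv_impl.
df = Polars.read_csv("data.csv", has_header: true, sep: ",")

# Lazy scan plus collect; this path routes through the private _scan_csv_impl.
lf = Polars.scan_csv("data.csv", infer_schema_length: 100)
df = lf.collect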
@@ -141,7 +141,7 @@ module Polars
 
  df = nil
  _prepare_file_arg(source) do |data|
- df = DataFrame._read_csv(
+ df = _read_csv_impl(
  data,
  has_header: has_header,
  columns: columns || projection,
@@ -176,90 +176,24 @@ module Polars
  end
  end
 
- # Lazily read from a CSV file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and
- # projections to the scan level, thereby potentially reducing
- # memory overhead.
- #
- # @param source [Object]
- # Path to a file.
- # @param has_header [Boolean]
- # Indicate if the first row of dataset is a header or not.
- # If set to false, column names will be autogenerated in the
- # following format: `column_x`, with `x` being an
- # enumeration over every column in the dataset starting at 1.
- # @param sep [String]
- # Single byte character to use as delimiter in the file.
- # @param comment_char [String]
- # Single byte character that indicates the start of a comment line,
- # for instance `#`.
- # @param quote_char [String]
- # Single byte character used for csv quoting.
- # Set to None to turn off special handling and escaping of quotes.
- # @param skip_rows [Integer]
- # Start reading after `skip_rows` lines. The header will be parsed at this
- # offset.
- # @param dtypes [Object]
- # Overwrite dtypes during inference.
- # @param null_values [Object]
- # Values to interpret as null values. You can provide a:
- #
- # - `String`: All values equal to this string will be null.
- # - `Array`: All values equal to any string in this array will be null.
- # - `Hash`: A hash that maps column name to a null value string.
- # @param ignore_errors [Boolean]
- # Try to keep reading lines if some lines yield errors.
- # First try `infer_schema_length: 0` to read all columns as
- # `:str` to check which values might cause an issue.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param with_column_names [Object]
- # Apply a function over the column names.
- # This can be used to update a schema just in time, thus before
- # scanning.
- # @param infer_schema_length [Integer]
- # Maximum number of lines to read to infer schema.
- # If set to 0, all columns will be read as `:str`.
- # If set to `nil`, a full table scan will be done (slow).
- # @param n_rows [Integer]
- # Stop reading from CSV file after reading `n_rows`.
- # @param encoding ["utf8", "utf8-lossy"]
- # Lossy means that invalid utf8 values are replaced with `�`
- # characters.
- # @param low_memory [Boolean]
- # Reduce memory usage in expense of performance.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param skip_rows_after_header [Integer]
- # Skip this number of rows when the header is parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with the given name into
- # the DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only used if the name is set).
- # @param parse_dates [Boolean]
- # Try to automatically parse dates. If this does not succeed,
- # the column remains of data type `:str`.
- # @param eol_char [String]
- # Single byte end of line character.
- # @param truncate_ragged_lines [Boolean]
- # Truncate lines that are longer than the schema.
- #
- # @return [LazyFrame]
- def scan_csv(
- source,
+ # @private
+ def _read_csv_impl(
+ file,
  has_header: true,
+ columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
+ schema: nil,
  null_values: nil,
+ missing_utf8_is_empty_string: false,
  ignore_errors: false,
- cache: true,
- with_column_names: nil,
+ parse_dates: false,
+ n_threads: nil,
  infer_schema_length: 100,
+ batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
@@ -267,419 +201,119 @@ module Polars
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
- parse_dates: false,
+ sample_size: 1024,
  eol_char: "\n",
- truncate_ragged_lines: false
- )
- Utils._check_arg_is_1byte("sep", sep, false)
- Utils._check_arg_is_1byte("comment_char", comment_char, false)
- Utils._check_arg_is_1byte("quote_char", quote_char, true)
-
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
- end
-
- LazyFrame._scan_csv(
- source,
- has_header: has_header,
- sep: sep,
- comment_char: comment_char,
- quote_char: quote_char,
- skip_rows: skip_rows,
- dtypes: dtypes,
- null_values: null_values,
- ignore_errors: ignore_errors,
- cache: cache,
- with_column_names: with_column_names,
- infer_schema_length: infer_schema_length,
- n_rows: n_rows,
- low_memory: low_memory,
- rechunk: rechunk,
- skip_rows_after_header: skip_rows_after_header,
- encoding: encoding,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- parse_dates: parse_dates,
- eol_char: eol_char,
- truncate_ragged_lines: truncate_ragged_lines
- )
- end
-
- # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a IPC file.
- # @param n_rows [Integer]
- # Stop reading from IPC file after reading `n_rows`.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param memory_map [Boolean]
- # Try to memory map the file. This can greatly improve performance on repeated
- # queries as the OS may cache pages.
- # Only uncompressed IPC files can be memory mapped.
- #
- # @return [LazyFrame]
- def scan_ipc(
- source,
- n_rows: nil,
- cache: true,
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0,
- storage_options: nil,
- memory_map: true
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false,
+ glob: true
  )
- LazyFrame._scan_ipc(
- source,
- n_rows: n_rows,
- cache: cache,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- storage_options: storage_options,
- memory_map: memory_map
- )
- end
-
- # Lazily read from a parquet file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a file.
- # @param n_rows [Integer]
- # Stop reading from parquet file after reading `n_rows`.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param parallel ["auto", "columns", "row_groups", "none"]
- # This determines the direction of parallelism. 'auto' will try to determine the
- # optimal direction.
- # @param rechunk [Boolean]
- # In case of reading multiple files via a glob pattern rechunk the final DataFrame
- # into contiguous memory chunks.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- #
- # @return [LazyFrame]
- def scan_parquet(
- source,
- n_rows: nil,
- cache: true,
- parallel: "auto",
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0,
- storage_options: nil,
- low_memory: false
- )
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ if Utils.pathlike?(file)
+ path = Utils.normalize_filepath(file)
+ else
+ path = nil
+ # if defined?(StringIO) && file.is_a?(StringIO)
+ # file = file.string
+ # end
  end
 
- LazyFrame._scan_parquet(
- source,
- n_rows:n_rows,
- cache: cache,
- parallel: parallel,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- storage_options: storage_options,
- low_memory: low_memory
- )
- end
-
- # Lazily read from a newline delimited JSON file.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a file.
- # @param infer_schema_length [Integer]
- # Infer the schema length from the first `infer_schema_length` rows.
- # @param batch_size [Integer]
- # Number of rows to read in each batch.
- # @param n_rows [Integer]
- # Stop reading from JSON file after reading `n_rows`.
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- #
- # @return [LazyFrame]
- def scan_ndjson(
- source,
- infer_schema_length: 100,
- batch_size: 1024,
- n_rows: nil,
- low_memory: false,
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0
- )
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ dtype_list = nil
+ dtype_slice = nil
+ if !dtypes.nil?
+ if dtypes.is_a?(Hash)
+ dtype_list = []
+ dtypes.each do|k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
+ elsif dtypes.is_a?(::Array)
+ dtype_slice = dtypes
+ else
+ raise ArgumentError, "dtype arg should be list or dict"
+ end
  end
 
- LazyFrame._scan_ndjson(
- source,
- infer_schema_length: infer_schema_length,
- batch_size: batch_size,
- n_rows: n_rows,
- low_memory: low_memory,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- )
- end
+ processed_null_values = Utils._process_null_values(null_values)
 
- # Read into a DataFrame from Apache Avro format.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from Apache Avro file after reading ``n_rows``.
- #
- # @return [DataFrame]
- def read_avro(source, columns: nil, n_rows: nil)
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ if columns.is_a?(::String)
+ columns = [columns]
  end
-
- DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
- end
-
- # Read into a DataFrame from Arrow IPC (Feather v2) file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from IPC file after reading `n_rows`.
- # @param memory_map [Boolean]
- # Try to memory map the file. This can greatly improve performance on repeated
- # queries as the OS may cache pages.
- # Only uncompressed IPC files can be memory mapped.
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param rechunk [Boolean]
- # Make sure that all data is contiguous.
- #
- # @return [DataFrame]
- def read_ipc(
- source,
- columns: nil,
- n_rows: nil,
- memory_map: true,
- storage_options: nil,
- row_count_name: nil,
- row_count_offset: 0,
- rechunk: true
- )
- storage_options ||= {}
- _prepare_file_arg(source, **storage_options) do |data|
- DataFrame._read_ipc(
- data,
- columns: columns,
+ if file.is_a?(::String) && file.include?("*")
+ dtypes_dict = nil
+ if !dtype_list.nil?
+ dtypes_dict = dtype_list.to_h
+ end
+ if !dtype_slice.nil?
+ raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
+ end
+ scan = scan_csv(
+ file,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes_dict,
+ null_values: null_values,
+ missing_utf8_is_empty_string: missing_utf8_is_empty_string,
+ ignore_errors: ignore_errors,
+ infer_schema_length: infer_schema_length,
  n_rows: n_rows,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
+ low_memory: low_memory,
  rechunk: rechunk,
- memory_map: memory_map
- )
- end
- end
-
- # Read into a DataFrame from a parquet file.
- #
- # @param source [String, Pathname, StringIO]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from parquet file after reading `n_rows`.
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param parallel ["auto", "columns", "row_groups", "none"]
- # This determines the direction of parallelism. 'auto' will try to determine the
- # optimal direction.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- # @param use_statistics [Boolean]
- # Use statistics in the parquet to determine if pages
- # can be skipped from reading.
- # @param rechunk [Boolean]
- # Make sure that all columns are contiguous in memory by
- # aggregating the chunks into a single array.
- #
- # @return [DataFrame]
- #
- # @note
- # This operation defaults to a `rechunk` operation at the end, meaning that
- # all data will be stored continuously in memory.
- # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
- # an expensive operation.
- def read_parquet(
- source,
- columns: nil,
- n_rows: nil,
- storage_options: nil,
- parallel: "auto",
- row_count_name: nil,
- row_count_offset: 0,
- low_memory: false,
- use_statistics: true,
- rechunk: true
- )
- _prepare_file_arg(source) do |data|
- DataFrame._read_parquet(
- data,
- columns: columns,
- n_rows: n_rows,
- parallel: parallel,
+ skip_rows_after_header: skip_rows_after_header,
  row_count_name: row_count_name,
  row_count_offset: row_count_offset,
- low_memory: low_memory,
- use_statistics: use_statistics,
- rechunk: rechunk
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines,
+ decimal_comma: decimal_comma,
+ glob: glob
  )
- end
- end
-
- # Read into a DataFrame from a JSON file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [DataFrame]
- def read_json(source)
- DataFrame._read_json(source)
- end
-
- # Read into a DataFrame from a newline delimited JSON file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [DataFrame]
- def read_ndjson(source)
- DataFrame._read_ndjson(source)
- end
-
- # Read a SQL query into a DataFrame.
- #
- # @param query [Object]
- # ActiveRecord::Relation or ActiveRecord::Result.
- # @param schema_overrides [Hash]
- # A hash mapping column names to dtypes, used to override the schema
- # inferred from the query.
- #
- # @return [DataFrame]
- def read_database(query, schema_overrides: nil)
- if !defined?(ActiveRecord)
- raise Error, "Active Record not available"
- end
-
- result =
- if query.is_a?(ActiveRecord::Result)
- query
- elsif query.is_a?(ActiveRecord::Relation)
- query.connection.select_all(query.to_sql)
- elsif query.is_a?(::String)
- ActiveRecord::Base.connection.select_all(query)
+ if columns.nil?
+ return scan.collect
+ elsif is_str_sequence(columns, allow_str: false)
+ return scan.select(columns).collect
  else
- raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
+ raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
  end
-
- data = {}
- schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
-
- result.columns.each_with_index do |k, i|
- column_type = result.column_types[i]
-
- data[k] =
- if column_type
- result.rows.map { |r| column_type.deserialize(r[i]) }
- else
- result.rows.map { |r| r[i] }
- end
-
- polars_type =
- case column_type&.type
- when :binary
- Binary
- when :boolean
- Boolean
- when :date
- Date
- when :datetime, :timestamp
- Datetime
- when :decimal
- Decimal
- when :float
- Float64
- when :integer
- Int64
- when :string, :text
- String
- when :time
- Time
- # TODO fix issue with null
- # when :json, :jsonb
- # Struct
- end
-
- schema_overrides[k] ||= polars_type if polars_type
  end
 
- DataFrame.new(data, schema_overrides: schema_overrides)
- end
- alias_method :read_sql, :read_database
+ projection, columns = Utils.handle_projection_columns(columns)
 
- # def read_excel
- # end
+ rbdf =
+ RbDataFrame.read_csv(
+ file,
+ infer_schema_length,
+ batch_size,
+ has_header,
+ ignore_errors,
+ n_rows,
+ skip_rows,
+ projection,
+ sep,
+ rechunk,
+ columns,
+ encoding,
+ n_threads,
+ path,
+ dtype_list,
+ dtype_slice,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ missing_utf8_is_empty_string,
+ parse_dates,
+ skip_rows_after_header,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ sample_size,
+ eol_char,
+ raise_if_empty,
+ truncate_ragged_lines,
+ decimal_comma,
+ schema
+ )
+ Utils.wrap_df(rbdf)
+ end
 
  # Read a CSV file in batches.
  #
@@ -784,6 +418,7 @@ module Polars
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
+ missing_utf8_is_empty_string: false,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
@@ -798,7 +433,9 @@ module Polars
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n",
- truncate_ragged_lines: false
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false
  )
  projection, columns = Utils.handle_projection_columns(columns)
 
@@ -824,6 +461,7 @@ module Polars
  skip_rows: skip_rows,
  dtypes: dtypes,
  null_values: null_values,
+ missing_utf8_is_empty_string: missing_utf8_is_empty_string,
  ignore_errors: ignore_errors,
  parse_dates: parse_dates,
  n_threads: n_threads,
@@ -839,36 +477,204 @@ module Polars
  sample_size: sample_size,
  eol_char: eol_char,
  new_columns: new_columns,
- truncate_ragged_lines: truncate_ragged_lines
+ raise_if_empty: raise_if_empty,
+ truncate_ragged_lines: truncate_ragged_lines,
+ decimal_comma: decimal_comma
  )
  end
 
- # Get a schema of the IPC file without reading data.
+ # Lazily read from a CSV file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and
+ # projections to the scan level, thereby potentially reducing
+ # memory overhead.
  #
  # @param source [Object]
- # Path to a file or a file-like object.
+ # Path to a file.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to None to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines. The header will be parsed at this
+ # offset.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
  #
- # @return [Hash]
- def read_ipc_schema(source)
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param with_column_names [Object]
+ # Apply a function over the column names.
+ # This can be used to update a schema just in time, thus before
+ # scanning.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters.
+ # @param low_memory [Boolean]
+ # Reduce memory usage in expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/ files are parsed.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
+ #
+ # @return [LazyFrame]
+ def scan_csv(
+ source,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ missing_utf8_is_empty_string: false,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n",
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false,
+ glob: true
+ )
+ Utils._check_arg_is_1byte("sep", sep, false)
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
+
  if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ source = Utils.normalize_filepath(source)
  end
 
- Plr.ipc_schema(source)
+ _scan_csv_impl(
+ source,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ cache: cache,
+ with_column_names: with_column_names,
+ infer_schema_length: infer_schema_length,
+ n_rows: n_rows,
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ encoding: encoding,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ parse_dates: parse_dates,
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines
+ )
  end
 
- # Get a schema of the Parquet file without reading data.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [Hash]
- def read_parquet_schema(source)
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ # @private
+ def _scan_csv_impl(
+ file,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n",
+ truncate_ragged_lines: true
+ )
+ dtype_list = nil
+ if !dtypes.nil?
+ dtype_list = []
+ dtypes.each do |k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
  end
-
- Plr.parquet_schema(source)
+ processed_null_values = Utils._process_null_values(null_values)
+
+ rblf =
+ RbLazyFrame.new_from_csv(
+ file,
+ sep,
+ has_header,
+ ignore_errors,
+ skip_rows,
+ n_rows,
+ cache,
+ dtype_list,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ infer_schema_length,
+ with_column_names,
+ rechunk,
+ skip_rows_after_header,
+ encoding,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ parse_dates,
+ eol_char,
+ truncate_ragged_lines
+ )
+ Utils.wrap_ldf(rblf)
  end
 
  private