polars-df 0.9.0-x86_64-linux → 0.11.0-x86_64-linux

Files changed (43)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +23 -0
  3. data/Cargo.lock +144 -57
  4. data/LICENSE-THIRD-PARTY.txt +629 -29
  5. data/README.md +7 -6
  6. data/lib/polars/3.1/polars.so +0 -0
  7. data/lib/polars/3.2/polars.so +0 -0
  8. data/lib/polars/3.3/polars.so +0 -0
  9. data/lib/polars/array_expr.rb +6 -2
  10. data/lib/polars/batched_csv_reader.rb +11 -3
  11. data/lib/polars/convert.rb +6 -1
  12. data/lib/polars/data_frame.rb +225 -370
  13. data/lib/polars/date_time_expr.rb +11 -4
  14. data/lib/polars/date_time_name_space.rb +14 -4
  15. data/lib/polars/dynamic_group_by.rb +2 -2
  16. data/lib/polars/exceptions.rb +4 -0
  17. data/lib/polars/expr.rb +1171 -54
  18. data/lib/polars/functions/lazy.rb +3 -3
  19. data/lib/polars/functions/range/date_range.rb +92 -0
  20. data/lib/polars/functions/range/datetime_range.rb +149 -0
  21. data/lib/polars/functions/range/time_range.rb +141 -0
  22. data/lib/polars/functions/whenthen.rb +74 -5
  23. data/lib/polars/group_by.rb +88 -23
  24. data/lib/polars/io/avro.rb +24 -0
  25. data/lib/polars/{io.rb → io/csv.rb} +307 -489
  26. data/lib/polars/io/database.rb +73 -0
  27. data/lib/polars/io/ipc.rb +247 -0
  28. data/lib/polars/io/json.rb +18 -0
  29. data/lib/polars/io/ndjson.rb +69 -0
  30. data/lib/polars/io/parquet.rb +226 -0
  31. data/lib/polars/lazy_frame.rb +55 -195
  32. data/lib/polars/lazy_group_by.rb +100 -3
  33. data/lib/polars/list_expr.rb +6 -2
  34. data/lib/polars/rolling_group_by.rb +2 -2
  35. data/lib/polars/series.rb +14 -12
  36. data/lib/polars/string_expr.rb +38 -36
  37. data/lib/polars/utils.rb +89 -1
  38. data/lib/polars/version.rb +1 -1
  39. data/lib/polars/whenthen.rb +83 -0
  40. data/lib/polars.rb +10 -3
  41. metadata +13 -6
  42. data/lib/polars/when.rb +0 -16
  43. data/lib/polars/when_then.rb +0 -19
@@ -80,6 +80,8 @@ module Polars
  # allocation needed.
  # @param eol_char [String]
  # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
  #
  # @return [DataFrame]
  #
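As an aside, a minimal sketch of how the new `truncate_ragged_lines` keyword behaves from the caller's side (the file name and contents here are made up for illustration):

    require "polars-df"

    # A ragged CSV: the second data row carries one field more than the header declares.
    File.write("ragged.csv", "a,b\n1,2\n3,4,5\n")

    # By default the extra field is an error; with the new keyword it is simply dropped.
    df = Polars.read_csv("ragged.csv", truncate_ragged_lines: true)
    p df.shape  # => [2, 2]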
@@ -113,7 +115,8 @@ module Polars
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
- eol_char: "\n"
+ eol_char: "\n",
+ truncate_ragged_lines: false
  )
  Utils._check_arg_is_1byte("sep", sep, false)
  Utils._check_arg_is_1byte("comment_char", comment_char, false)
@@ -138,7 +141,7 @@ module Polars

  df = nil
  _prepare_file_arg(source) do |data|
- df = DataFrame._read_csv(
+ df = _read_csv_impl(
  data,
  has_header: has_header,
  columns: columns || projection,
@@ -161,7 +164,8 @@ module Polars
  row_count_name: row_count_name,
  row_count_offset: row_count_offset,
  sample_size: sample_size,
- eol_char: eol_char
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines
  )
  end

@@ -172,88 +176,24 @@ module Polars
  end
  end

- # Lazily read from a CSV file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and
- # projections to the scan level, thereby potentially reducing
- # memory overhead.
- #
- # @param source [Object]
- # Path to a file.
- # @param has_header [Boolean]
- # Indicate if the first row of dataset is a header or not.
- # If set to false, column names will be autogenerated in the
- # following format: `column_x`, with `x` being an
- # enumeration over every column in the dataset starting at 1.
- # @param sep [String]
- # Single byte character to use as delimiter in the file.
- # @param comment_char [String]
- # Single byte character that indicates the start of a comment line,
- # for instance `#`.
- # @param quote_char [String]
- # Single byte character used for csv quoting.
- # Set to None to turn off special handling and escaping of quotes.
- # @param skip_rows [Integer]
- # Start reading after `skip_rows` lines. The header will be parsed at this
- # offset.
- # @param dtypes [Object]
- # Overwrite dtypes during inference.
- # @param null_values [Object]
- # Values to interpret as null values. You can provide a:
- #
- # - `String`: All values equal to this string will be null.
- # - `Array`: All values equal to any string in this array will be null.
- # - `Hash`: A hash that maps column name to a null value string.
- # @param ignore_errors [Boolean]
- # Try to keep reading lines if some lines yield errors.
- # First try `infer_schema_length: 0` to read all columns as
- # `:str` to check which values might cause an issue.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param with_column_names [Object]
- # Apply a function over the column names.
- # This can be used to update a schema just in time, thus before
- # scanning.
- # @param infer_schema_length [Integer]
- # Maximum number of lines to read to infer schema.
- # If set to 0, all columns will be read as `:str`.
- # If set to `nil`, a full table scan will be done (slow).
- # @param n_rows [Integer]
- # Stop reading from CSV file after reading `n_rows`.
- # @param encoding ["utf8", "utf8-lossy"]
- # Lossy means that invalid utf8 values are replaced with `�`
- # characters.
- # @param low_memory [Boolean]
- # Reduce memory usage in expense of performance.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param skip_rows_after_header [Integer]
- # Skip this number of rows when the header is parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with the given name into
- # the DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only used if the name is set).
- # @param parse_dates [Boolean]
- # Try to automatically parse dates. If this does not succeed,
- # the column remains of data type `:str`.
- # @param eol_char [String]
- # Single byte end of line character.
- #
- # @return [LazyFrame]
- def scan_csv(
- source,
+ # @private
+ def _read_csv_impl(
+ file,
  has_header: true,
+ columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
+ schema: nil,
  null_values: nil,
+ missing_utf8_is_empty_string: false,
  ignore_errors: false,
- cache: true,
- with_column_names: nil,
+ parse_dates: false,
+ n_threads: nil,
  infer_schema_length: 100,
+ batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
@@ -261,417 +201,119 @@ module Polars
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
- parse_dates: false,
- eol_char: "\n"
- )
- Utils._check_arg_is_1byte("sep", sep, false)
- Utils._check_arg_is_1byte("comment_char", comment_char, false)
- Utils._check_arg_is_1byte("quote_char", quote_char, true)
-
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
- end
-
- LazyFrame._scan_csv(
- source,
- has_header: has_header,
- sep: sep,
- comment_char: comment_char,
- quote_char: quote_char,
- skip_rows: skip_rows,
- dtypes: dtypes,
- null_values: null_values,
- ignore_errors: ignore_errors,
- cache: cache,
- with_column_names: with_column_names,
- infer_schema_length: infer_schema_length,
- n_rows: n_rows,
- low_memory: low_memory,
- rechunk: rechunk,
- skip_rows_after_header: skip_rows_after_header,
- encoding: encoding,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- parse_dates: parse_dates,
- eol_char: eol_char,
- )
- end
-
- # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a IPC file.
- # @param n_rows [Integer]
- # Stop reading from IPC file after reading `n_rows`.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param memory_map [Boolean]
- # Try to memory map the file. This can greatly improve performance on repeated
- # queries as the OS may cache pages.
- # Only uncompressed IPC files can be memory mapped.
- #
- # @return [LazyFrame]
- def scan_ipc(
- source,
- n_rows: nil,
- cache: true,
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0,
- storage_options: nil,
- memory_map: true
- )
- LazyFrame._scan_ipc(
- source,
- n_rows: n_rows,
- cache: cache,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- storage_options: storage_options,
- memory_map: memory_map
- )
- end
-
- # Lazily read from a parquet file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a file.
- # @param n_rows [Integer]
- # Stop reading from parquet file after reading `n_rows`.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param parallel ["auto", "columns", "row_groups", "none"]
- # This determines the direction of parallelism. 'auto' will try to determine the
- # optimal direction.
- # @param rechunk [Boolean]
- # In case of reading multiple files via a glob pattern rechunk the final DataFrame
- # into contiguous memory chunks.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- #
- # @return [LazyFrame]
- def scan_parquet(
- source,
- n_rows: nil,
- cache: true,
- parallel: "auto",
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0,
- storage_options: nil,
- low_memory: false
+ sample_size: 1024,
+ eol_char: "\n",
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false,
+ glob: true
  )
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ if Utils.pathlike?(file)
+ path = Utils.normalize_filepath(file)
+ else
+ path = nil
+ # if defined?(StringIO) && file.is_a?(StringIO)
+ # file = file.string
+ # end
  end

- LazyFrame._scan_parquet(
- source,
- n_rows:n_rows,
- cache: cache,
- parallel: parallel,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- storage_options: storage_options,
- low_memory: low_memory
- )
- end
-
- # Lazily read from a newline delimited JSON file.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a file.
- # @param infer_schema_length [Integer]
- # Infer the schema length from the first `infer_schema_length` rows.
- # @param batch_size [Integer]
- # Number of rows to read in each batch.
- # @param n_rows [Integer]
- # Stop reading from JSON file after reading `n_rows`.
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- #
- # @return [LazyFrame]
- def scan_ndjson(
- source,
- infer_schema_length: 100,
- batch_size: 1024,
- n_rows: nil,
- low_memory: false,
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0
- )
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ dtype_list = nil
+ dtype_slice = nil
+ if !dtypes.nil?
+ if dtypes.is_a?(Hash)
+ dtype_list = []
+ dtypes.each do|k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
+ elsif dtypes.is_a?(::Array)
+ dtype_slice = dtypes
+ else
+ raise ArgumentError, "dtype arg should be list or dict"
+ end
  end

- LazyFrame._scan_ndjson(
- source,
- infer_schema_length: infer_schema_length,
- batch_size: batch_size,
- n_rows: n_rows,
- low_memory: low_memory,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- )
- end
+ processed_null_values = Utils._process_null_values(null_values)

- # Read into a DataFrame from Apache Avro format.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from Apache Avro file after reading ``n_rows``.
- #
- # @return [DataFrame]
- def read_avro(source, columns: nil, n_rows: nil)
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ if columns.is_a?(::String)
+ columns = [columns]
  end
-
- DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
- end
-
- # Read into a DataFrame from Arrow IPC (Feather v2) file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from IPC file after reading `n_rows`.
- # @param memory_map [Boolean]
- # Try to memory map the file. This can greatly improve performance on repeated
- # queries as the OS may cache pages.
- # Only uncompressed IPC files can be memory mapped.
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param rechunk [Boolean]
- # Make sure that all data is contiguous.
- #
- # @return [DataFrame]
- def read_ipc(
- source,
- columns: nil,
- n_rows: nil,
- memory_map: true,
- storage_options: nil,
- row_count_name: nil,
- row_count_offset: 0,
- rechunk: true
- )
- storage_options ||= {}
- _prepare_file_arg(source, **storage_options) do |data|
- DataFrame._read_ipc(
- data,
- columns: columns,
+ if file.is_a?(::String) && file.include?("*")
+ dtypes_dict = nil
+ if !dtype_list.nil?
+ dtypes_dict = dtype_list.to_h
+ end
+ if !dtype_slice.nil?
+ raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
+ end
+ scan = scan_csv(
+ file,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes_dict,
+ null_values: null_values,
+ missing_utf8_is_empty_string: missing_utf8_is_empty_string,
+ ignore_errors: ignore_errors,
+ infer_schema_length: infer_schema_length,
  n_rows: n_rows,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
+ low_memory: low_memory,
  rechunk: rechunk,
- memory_map: memory_map
- )
- end
- end
-
- # Read into a DataFrame from a parquet file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from parquet file after reading `n_rows`.
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param parallel ["auto", "columns", "row_groups", "none"]
- # This determines the direction of parallelism. 'auto' will try to determine the
- # optimal direction.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- # @param use_statistics [Boolean]
- # Use statistics in the parquet to determine if pages
- # can be skipped from reading.
- # @param rechunk [Boolean]
- # Make sure that all columns are contiguous in memory by
- # aggregating the chunks into a single array.
- #
- # @return [DataFrame]
- #
- # @note
- # This operation defaults to a `rechunk` operation at the end, meaning that
- # all data will be stored continuously in memory.
- # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
- # an expensive operation.
- def read_parquet(
- source,
- columns: nil,
- n_rows: nil,
- storage_options: nil,
- parallel: "auto",
- row_count_name: nil,
- row_count_offset: 0,
- low_memory: false,
- use_statistics: true,
- rechunk: true
- )
- _prepare_file_arg(source) do |data|
- DataFrame._read_parquet(
- data,
- columns: columns,
- n_rows: n_rows,
- parallel: parallel,
+ skip_rows_after_header: skip_rows_after_header,
  row_count_name: row_count_name,
  row_count_offset: row_count_offset,
- low_memory: low_memory,
- use_statistics: use_statistics,
- rechunk: rechunk
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines,
+ decimal_comma: decimal_comma,
+ glob: glob
  )
- end
- end
-
- # Read into a DataFrame from a JSON file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [DataFrame]
- def read_json(source)
- DataFrame._read_json(source)
- end
-
- # Read into a DataFrame from a newline delimited JSON file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [DataFrame]
- def read_ndjson(source)
- DataFrame._read_ndjson(source)
- end
-
- # Read a SQL query into a DataFrame.
- #
- # @param query [Object]
- # ActiveRecord::Relation or ActiveRecord::Result.
- # @param schema_overrides [Hash]
- # A hash mapping column names to dtypes, used to override the schema
- # inferred from the query.
- #
- # @return [DataFrame]
- def read_database(query, schema_overrides: nil)
- if !defined?(ActiveRecord)
- raise Error, "Active Record not available"
- end
-
- result =
- if query.is_a?(ActiveRecord::Result)
- query
- elsif query.is_a?(ActiveRecord::Relation)
- query.connection.select_all(query.to_sql)
- elsif query.is_a?(::String)
- ActiveRecord::Base.connection.select_all(query)
+ if columns.nil?
+ return scan.collect
+ elsif is_str_sequence(columns, allow_str: false)
+ return scan.select(columns).collect
  else
- raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
+ raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
  end
-
- data = {}
- schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
-
- result.columns.each_with_index do |k, i|
- column_type = result.column_types[i]
-
- data[k] =
- if column_type
- result.rows.map { |r| column_type.deserialize(r[i]) }
- else
- result.rows.map { |r| r[i] }
- end
-
- polars_type =
- case column_type&.type
- when :binary
- Binary
- when :boolean
- Boolean
- when :date
- Date
- when :datetime, :timestamp
- Datetime
- when :decimal
- Decimal
- when :float
- Float64
- when :integer
- Int64
- when :string, :text
- String
- when :time
- Time
- # TODO fix issue with null
- # when :json, :jsonb
- # Struct
- end
-
- schema_overrides[k] ||= polars_type if polars_type
  end

- DataFrame.new(data, schema_overrides: schema_overrides)
- end
- alias_method :read_sql, :read_database
+ projection, columns = Utils.handle_projection_columns(columns)

- # def read_excel
- # end
+ rbdf =
+ RbDataFrame.read_csv(
+ file,
+ infer_schema_length,
+ batch_size,
+ has_header,
+ ignore_errors,
+ n_rows,
+ skip_rows,
+ projection,
+ sep,
+ rechunk,
+ columns,
+ encoding,
+ n_threads,
+ path,
+ dtype_list,
+ dtype_slice,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ missing_utf8_is_empty_string,
+ parse_dates,
+ skip_rows_after_header,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ sample_size,
+ eol_char,
+ raise_if_empty,
+ truncate_ragged_lines,
+ decimal_comma,
+ schema
+ )
+ Utils.wrap_df(rbdf)
+ end

  # Read a CSV file in batches.
  #
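The glob branch in `_read_csv_impl` above means that a wildcard path is answered by `scan_csv(...).collect` rather than the native reader, and only name-based column selection is allowed in that case. A rough sketch of the resulting user-facing behaviour (paths and column names are hypothetical):

    require "polars-df"

    # All matching files are scanned lazily and collected into one DataFrame.
    df = Polars.read_csv("data/part-*.csv", columns: ["id", "name"])

    # Integer projections are rejected for glob sources, per the error raised above:
    # Polars.read_csv("data/part-*.csv", columns: [0, 1])  # => ArgumentError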
@@ -755,6 +397,8 @@ module Polars
  # allocation needed.
  # @param eol_char [String]
  # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
  #
  # @return [BatchedCsvReader]
  #
@@ -774,6 +418,7 @@ module Polars
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
+ missing_utf8_is_empty_string: false,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
@@ -787,7 +432,10 @@ module Polars
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
- eol_char: "\n"
+ eol_char: "\n",
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false
  )
  projection, columns = Utils.handle_projection_columns(columns)

@@ -813,6 +461,7 @@ module Polars
  skip_rows: skip_rows,
  dtypes: dtypes,
  null_values: null_values,
+ missing_utf8_is_empty_string: missing_utf8_is_empty_string,
  ignore_errors: ignore_errors,
  parse_dates: parse_dates,
  n_threads: n_threads,
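For the batched reader, a rough usage sketch with the newly threaded-through keywords; the path is hypothetical, and `next_batches` is assumed to mirror the Python API by returning nil once the file is exhausted:

    require "polars-df"

    reader = Polars.read_csv_batched(
      "big.csv",                    # hypothetical path
      truncate_ragged_lines: true,  # drop fields beyond the schema instead of erroring
      raise_if_empty: false         # an empty file yields an empty frame instead of raising
    )

    while (batches = reader.next_batches(5))
      batches.each { |df| puts df.shape.inspect }
    end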
@@ -827,36 +476,205 @@ module Polars
  row_count_offset: row_count_offset,
  sample_size: sample_size,
  eol_char: eol_char,
- new_columns: new_columns
+ new_columns: new_columns,
+ raise_if_empty: raise_if_empty,
+ truncate_ragged_lines: truncate_ragged_lines,
+ decimal_comma: decimal_comma
  )
  end

- # Get a schema of the IPC file without reading data.
+ # Lazily read from a CSV file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and
+ # projections to the scan level, thereby potentially reducing
+ # memory overhead.
  #
  # @param source [Object]
- # Path to a file or a file-like object.
+ # Path to a file.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to None to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines. The header will be parsed at this
+ # offset.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param with_column_names [Object]
+ # Apply a function over the column names.
+ # This can be used to update a schema just in time, thus before
+ # scanning.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters.
+ # @param low_memory [Boolean]
+ # Reduce memory usage in expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/ files are parsed.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
  #
- # @return [Hash]
- def read_ipc_schema(source)
+ # @return [LazyFrame]
+ def scan_csv(
+ source,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ missing_utf8_is_empty_string: false,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n",
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false,
+ glob: true
+ )
+ Utils._check_arg_is_1byte("sep", sep, false)
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
+
  if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ source = Utils.normalize_filepath(source)
  end

- Plr.ipc_schema(source)
+ _scan_csv_impl(
+ source,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ cache: cache,
+ with_column_names: with_column_names,
+ infer_schema_length: infer_schema_length,
+ n_rows: n_rows,
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ encoding: encoding,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ parse_dates: parse_dates,
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines
+ )
  end

- # Get a schema of the Parquet file without reading data.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [Hash]
- def read_parquet_schema(source)
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ # @private
+ def _scan_csv_impl(
+ file,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n",
+ truncate_ragged_lines: true
+ )
+ dtype_list = nil
+ if !dtypes.nil?
+ dtype_list = []
+ dtypes.each do |k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
  end
-
- Plr.parquet_schema(source)
+ processed_null_values = Utils._process_null_values(null_values)
+
+ rblf =
+ RbLazyFrame.new_from_csv(
+ file,
+ sep,
+ has_header,
+ ignore_errors,
+ skip_rows,
+ n_rows,
+ cache,
+ dtype_list,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ infer_schema_length,
+ with_column_names,
+ rechunk,
+ skip_rows_after_header,
+ encoding,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ parse_dates,
+ eol_char,
+ truncate_ragged_lines
+ )
+ Utils.wrap_ldf(rblf)
  end

  private
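Finally, a minimal sketch of the lazy path that the relocated `scan_csv` documents: predicates and projections applied before `collect` can be pushed down into the scan (file and column names are hypothetical):

    require "polars-df"

    lf = Polars.scan_csv("events-*.csv", parse_dates: true, truncate_ragged_lines: true)

    df =
      lf
        .filter(Polars.col("status") == "ok")                  # predicate pushdown
        .select([Polars.col("id"), Polars.col("created_at")])  # projection pushdown
        .collect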