polars-df 0.9.0-x86_64-darwin → 0.11.0-x86_64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +23 -0
  3. data/Cargo.lock +144 -57
  4. data/LICENSE-THIRD-PARTY.txt +629 -29
  5. data/README.md +7 -6
  6. data/lib/polars/3.1/polars.bundle +0 -0
  7. data/lib/polars/3.2/polars.bundle +0 -0
  8. data/lib/polars/3.3/polars.bundle +0 -0
  9. data/lib/polars/array_expr.rb +6 -2
  10. data/lib/polars/batched_csv_reader.rb +11 -3
  11. data/lib/polars/convert.rb +6 -1
  12. data/lib/polars/data_frame.rb +225 -370
  13. data/lib/polars/date_time_expr.rb +11 -4
  14. data/lib/polars/date_time_name_space.rb +14 -4
  15. data/lib/polars/dynamic_group_by.rb +2 -2
  16. data/lib/polars/exceptions.rb +4 -0
  17. data/lib/polars/expr.rb +1171 -54
  18. data/lib/polars/functions/lazy.rb +3 -3
  19. data/lib/polars/functions/range/date_range.rb +92 -0
  20. data/lib/polars/functions/range/datetime_range.rb +149 -0
  21. data/lib/polars/functions/range/time_range.rb +141 -0
  22. data/lib/polars/functions/whenthen.rb +74 -5
  23. data/lib/polars/group_by.rb +88 -23
  24. data/lib/polars/io/avro.rb +24 -0
  25. data/lib/polars/{io.rb → io/csv.rb} +307 -489
  26. data/lib/polars/io/database.rb +73 -0
  27. data/lib/polars/io/ipc.rb +247 -0
  28. data/lib/polars/io/json.rb +18 -0
  29. data/lib/polars/io/ndjson.rb +69 -0
  30. data/lib/polars/io/parquet.rb +226 -0
  31. data/lib/polars/lazy_frame.rb +55 -195
  32. data/lib/polars/lazy_group_by.rb +100 -3
  33. data/lib/polars/list_expr.rb +6 -2
  34. data/lib/polars/rolling_group_by.rb +2 -2
  35. data/lib/polars/series.rb +14 -12
  36. data/lib/polars/string_expr.rb +38 -36
  37. data/lib/polars/utils.rb +89 -1
  38. data/lib/polars/version.rb +1 -1
  39. data/lib/polars/whenthen.rb +83 -0
  40. data/lib/polars.rb +10 -3
  41. metadata +13 -6
  42. data/lib/polars/when.rb +0 -16
  43. data/lib/polars/when_then.rb +0 -19
data/lib/polars/{io.rb → io/csv.rb}
@@ -80,6 +80,8 @@ module Polars
  # allocation needed.
  # @param eol_char [String]
  # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
  #
  # @return [DataFrame]
  #
@@ -113,7 +115,8 @@ module Polars
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
- eol_char: "\n"
+ eol_char: "\n",
+ truncate_ragged_lines: false
  )
  Utils._check_arg_is_1byte("sep", sep, false)
  Utils._check_arg_is_1byte("comment_char", comment_char, false)
@@ -138,7 +141,7 @@ module Polars

  df = nil
  _prepare_file_arg(source) do |data|
- df = DataFrame._read_csv(
+ df = _read_csv_impl(
  data,
  has_header: has_header,
  columns: columns || projection,
@@ -161,7 +164,8 @@ module Polars
  row_count_name: row_count_name,
  row_count_offset: row_count_offset,
  sample_size: sample_size,
- eol_char: eol_char
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines
  )
  end

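The hunks above thread the new `truncate_ragged_lines` keyword through `Polars.read_csv` down to the reader. A minimal usage sketch (the file path is illustrative; the keyword argument itself is taken from this diff):

    require "polars-df"

    # With truncate_ragged_lines: true, rows that carry more fields than the
    # header/schema are cut off instead of aborting the parse.
    df = Polars.read_csv("events.csv", truncate_ragged_lines: true)
    puts df.shape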
@@ -172,88 +176,24 @@ module Polars
  end
  end

- # Lazily read from a CSV file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and
- # projections to the scan level, thereby potentially reducing
- # memory overhead.
- #
- # @param source [Object]
- # Path to a file.
- # @param has_header [Boolean]
- # Indicate if the first row of dataset is a header or not.
- # If set to false, column names will be autogenerated in the
- # following format: `column_x`, with `x` being an
- # enumeration over every column in the dataset starting at 1.
- # @param sep [String]
- # Single byte character to use as delimiter in the file.
- # @param comment_char [String]
- # Single byte character that indicates the start of a comment line,
- # for instance `#`.
- # @param quote_char [String]
- # Single byte character used for csv quoting.
- # Set to None to turn off special handling and escaping of quotes.
- # @param skip_rows [Integer]
- # Start reading after `skip_rows` lines. The header will be parsed at this
- # offset.
- # @param dtypes [Object]
- # Overwrite dtypes during inference.
- # @param null_values [Object]
- # Values to interpret as null values. You can provide a:
- #
- # - `String`: All values equal to this string will be null.
- # - `Array`: All values equal to any string in this array will be null.
- # - `Hash`: A hash that maps column name to a null value string.
- # @param ignore_errors [Boolean]
- # Try to keep reading lines if some lines yield errors.
- # First try `infer_schema_length: 0` to read all columns as
- # `:str` to check which values might cause an issue.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param with_column_names [Object]
- # Apply a function over the column names.
- # This can be used to update a schema just in time, thus before
- # scanning.
- # @param infer_schema_length [Integer]
- # Maximum number of lines to read to infer schema.
- # If set to 0, all columns will be read as `:str`.
- # If set to `nil`, a full table scan will be done (slow).
- # @param n_rows [Integer]
- # Stop reading from CSV file after reading `n_rows`.
- # @param encoding ["utf8", "utf8-lossy"]
- # Lossy means that invalid utf8 values are replaced with `�`
- # characters.
- # @param low_memory [Boolean]
- # Reduce memory usage in expense of performance.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param skip_rows_after_header [Integer]
- # Skip this number of rows when the header is parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with the given name into
- # the DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only used if the name is set).
- # @param parse_dates [Boolean]
- # Try to automatically parse dates. If this does not succeed,
- # the column remains of data type `:str`.
- # @param eol_char [String]
- # Single byte end of line character.
- #
- # @return [LazyFrame]
- def scan_csv(
- source,
+ # @private
+ def _read_csv_impl(
+ file,
  has_header: true,
+ columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
+ schema: nil,
  null_values: nil,
+ missing_utf8_is_empty_string: false,
  ignore_errors: false,
- cache: true,
- with_column_names: nil,
+ parse_dates: false,
+ n_threads: nil,
  infer_schema_length: 100,
+ batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
@@ -261,417 +201,119 @@ module Polars
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
- parse_dates: false,
- eol_char: "\n"
- )
- Utils._check_arg_is_1byte("sep", sep, false)
- Utils._check_arg_is_1byte("comment_char", comment_char, false)
- Utils._check_arg_is_1byte("quote_char", quote_char, true)
-
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
- end
-
- LazyFrame._scan_csv(
- source,
- has_header: has_header,
- sep: sep,
- comment_char: comment_char,
- quote_char: quote_char,
- skip_rows: skip_rows,
- dtypes: dtypes,
- null_values: null_values,
- ignore_errors: ignore_errors,
- cache: cache,
- with_column_names: with_column_names,
- infer_schema_length: infer_schema_length,
- n_rows: n_rows,
- low_memory: low_memory,
- rechunk: rechunk,
- skip_rows_after_header: skip_rows_after_header,
- encoding: encoding,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- parse_dates: parse_dates,
- eol_char: eol_char,
- )
- end
-
- # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a IPC file.
- # @param n_rows [Integer]
- # Stop reading from IPC file after reading `n_rows`.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param memory_map [Boolean]
- # Try to memory map the file. This can greatly improve performance on repeated
- # queries as the OS may cache pages.
- # Only uncompressed IPC files can be memory mapped.
- #
- # @return [LazyFrame]
- def scan_ipc(
- source,
- n_rows: nil,
- cache: true,
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0,
- storage_options: nil,
- memory_map: true
- )
- LazyFrame._scan_ipc(
- source,
- n_rows: n_rows,
- cache: cache,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- storage_options: storage_options,
- memory_map: memory_map
- )
- end
-
- # Lazily read from a parquet file or multiple files via glob patterns.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a file.
- # @param n_rows [Integer]
- # Stop reading from parquet file after reading `n_rows`.
- # @param cache [Boolean]
- # Cache the result after reading.
- # @param parallel ["auto", "columns", "row_groups", "none"]
- # This determines the direction of parallelism. 'auto' will try to determine the
- # optimal direction.
- # @param rechunk [Boolean]
- # In case of reading multiple files via a glob pattern rechunk the final DataFrame
- # into contiguous memory chunks.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- #
- # @return [LazyFrame]
- def scan_parquet(
- source,
- n_rows: nil,
- cache: true,
- parallel: "auto",
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0,
- storage_options: nil,
- low_memory: false
+ sample_size: 1024,
+ eol_char: "\n",
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false,
+ glob: true
  )
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ if Utils.pathlike?(file)
+ path = Utils.normalize_filepath(file)
+ else
+ path = nil
+ # if defined?(StringIO) && file.is_a?(StringIO)
+ # file = file.string
+ # end
  end

- LazyFrame._scan_parquet(
- source,
- n_rows:n_rows,
- cache: cache,
- parallel: parallel,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- storage_options: storage_options,
- low_memory: low_memory
- )
- end
-
- # Lazily read from a newline delimited JSON file.
- #
- # This allows the query optimizer to push down predicates and projections to the scan
- # level, thereby potentially reducing memory overhead.
- #
- # @param source [String]
- # Path to a file.
- # @param infer_schema_length [Integer]
- # Infer the schema length from the first `infer_schema_length` rows.
- # @param batch_size [Integer]
- # Number of rows to read in each batch.
- # @param n_rows [Integer]
- # Stop reading from JSON file after reading `n_rows`.
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- # @param rechunk [Boolean]
- # Reallocate to contiguous memory when all chunks/ files are parsed.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- #
- # @return [LazyFrame]
- def scan_ndjson(
- source,
- infer_schema_length: 100,
- batch_size: 1024,
- n_rows: nil,
- low_memory: false,
- rechunk: true,
- row_count_name: nil,
- row_count_offset: 0
- )
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ dtype_list = nil
+ dtype_slice = nil
+ if !dtypes.nil?
+ if dtypes.is_a?(Hash)
+ dtype_list = []
+ dtypes.each do|k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
+ elsif dtypes.is_a?(::Array)
+ dtype_slice = dtypes
+ else
+ raise ArgumentError, "dtype arg should be list or dict"
+ end
  end

- LazyFrame._scan_ndjson(
- source,
- infer_schema_length: infer_schema_length,
- batch_size: batch_size,
- n_rows: n_rows,
- low_memory: low_memory,
- rechunk: rechunk,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
- )
- end
+ processed_null_values = Utils._process_null_values(null_values)

- # Read into a DataFrame from Apache Avro format.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from Apache Avro file after reading ``n_rows``.
- #
- # @return [DataFrame]
- def read_avro(source, columns: nil, n_rows: nil)
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ if columns.is_a?(::String)
+ columns = [columns]
  end
-
- DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
- end
-
- # Read into a DataFrame from Arrow IPC (Feather v2) file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from IPC file after reading `n_rows`.
- # @param memory_map [Boolean]
- # Try to memory map the file. This can greatly improve performance on repeated
- # queries as the OS may cache pages.
- # Only uncompressed IPC files can be memory mapped.
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param rechunk [Boolean]
- # Make sure that all data is contiguous.
- #
- # @return [DataFrame]
- def read_ipc(
- source,
- columns: nil,
- n_rows: nil,
- memory_map: true,
- storage_options: nil,
- row_count_name: nil,
- row_count_offset: 0,
- rechunk: true
- )
- storage_options ||= {}
- _prepare_file_arg(source, **storage_options) do |data|
- DataFrame._read_ipc(
- data,
- columns: columns,
+ if file.is_a?(::String) && file.include?("*")
+ dtypes_dict = nil
+ if !dtype_list.nil?
+ dtypes_dict = dtype_list.to_h
+ end
+ if !dtype_slice.nil?
+ raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
+ end
+ scan = scan_csv(
+ file,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes_dict,
+ null_values: null_values,
+ missing_utf8_is_empty_string: missing_utf8_is_empty_string,
+ ignore_errors: ignore_errors,
+ infer_schema_length: infer_schema_length,
  n_rows: n_rows,
- row_count_name: row_count_name,
- row_count_offset: row_count_offset,
+ low_memory: low_memory,
  rechunk: rechunk,
- memory_map: memory_map
- )
- end
- end
-
- # Read into a DataFrame from a parquet file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- # @param columns [Object]
- # Columns to select. Accepts a list of column indices (starting at zero) or a list
- # of column names.
- # @param n_rows [Integer]
- # Stop reading from parquet file after reading `n_rows`.
- # @param storage_options [Hash]
- # Extra options that make sense for a particular storage connection.
- # @param parallel ["auto", "columns", "row_groups", "none"]
- # This determines the direction of parallelism. 'auto' will try to determine the
- # optimal direction.
- # @param row_count_name [String]
- # If not nil, this will insert a row count column with give name into the
- # DataFrame.
- # @param row_count_offset [Integer]
- # Offset to start the row_count column (only use if the name is set).
- # @param low_memory [Boolean]
- # Reduce memory pressure at the expense of performance.
- # @param use_statistics [Boolean]
- # Use statistics in the parquet to determine if pages
- # can be skipped from reading.
- # @param rechunk [Boolean]
- # Make sure that all columns are contiguous in memory by
- # aggregating the chunks into a single array.
- #
- # @return [DataFrame]
- #
- # @note
- # This operation defaults to a `rechunk` operation at the end, meaning that
- # all data will be stored continuously in memory.
- # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
- # an expensive operation.
- def read_parquet(
- source,
- columns: nil,
- n_rows: nil,
- storage_options: nil,
- parallel: "auto",
- row_count_name: nil,
- row_count_offset: 0,
- low_memory: false,
- use_statistics: true,
- rechunk: true
- )
- _prepare_file_arg(source) do |data|
- DataFrame._read_parquet(
- data,
- columns: columns,
- n_rows: n_rows,
- parallel: parallel,
+ skip_rows_after_header: skip_rows_after_header,
  row_count_name: row_count_name,
  row_count_offset: row_count_offset,
- low_memory: low_memory,
- use_statistics: use_statistics,
- rechunk: rechunk
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines,
+ decimal_comma: decimal_comma,
+ glob: glob
  )
- end
- end
-
- # Read into a DataFrame from a JSON file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [DataFrame]
- def read_json(source)
- DataFrame._read_json(source)
- end
-
- # Read into a DataFrame from a newline delimited JSON file.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [DataFrame]
- def read_ndjson(source)
- DataFrame._read_ndjson(source)
- end
-
- # Read a SQL query into a DataFrame.
- #
- # @param query [Object]
- # ActiveRecord::Relation or ActiveRecord::Result.
- # @param schema_overrides [Hash]
- # A hash mapping column names to dtypes, used to override the schema
- # inferred from the query.
- #
- # @return [DataFrame]
- def read_database(query, schema_overrides: nil)
- if !defined?(ActiveRecord)
- raise Error, "Active Record not available"
- end
-
- result =
- if query.is_a?(ActiveRecord::Result)
- query
- elsif query.is_a?(ActiveRecord::Relation)
- query.connection.select_all(query.to_sql)
- elsif query.is_a?(::String)
- ActiveRecord::Base.connection.select_all(query)
+ if columns.nil?
+ return scan.collect
+ elsif is_str_sequence(columns, allow_str: false)
+ return scan.select(columns).collect
  else
- raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
+ raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
  end
-
- data = {}
- schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
-
- result.columns.each_with_index do |k, i|
- column_type = result.column_types[i]
-
- data[k] =
- if column_type
- result.rows.map { |r| column_type.deserialize(r[i]) }
- else
- result.rows.map { |r| r[i] }
- end
-
- polars_type =
- case column_type&.type
- when :binary
- Binary
- when :boolean
- Boolean
- when :date
- Date
- when :datetime, :timestamp
- Datetime
- when :decimal
- Decimal
- when :float
- Float64
- when :integer
- Int64
- when :string, :text
- String
- when :time
- Time
- # TODO fix issue with null
- # when :json, :jsonb
- # Struct
- end
-
- schema_overrides[k] ||= polars_type if polars_type
  end

- DataFrame.new(data, schema_overrides: schema_overrides)
- end
- alias_method :read_sql, :read_database
+ projection, columns = Utils.handle_projection_columns(columns)

- # def read_excel
- # end
+ rbdf =
+ RbDataFrame.read_csv(
+ file,
+ infer_schema_length,
+ batch_size,
+ has_header,
+ ignore_errors,
+ n_rows,
+ skip_rows,
+ projection,
+ sep,
+ rechunk,
+ columns,
+ encoding,
+ n_threads,
+ path,
+ dtype_list,
+ dtype_slice,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ missing_utf8_is_empty_string,
+ parse_dates,
+ skip_rows_after_header,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ sample_size,
+ eol_char,
+ raise_if_empty,
+ truncate_ragged_lines,
+ decimal_comma,
+ schema
+ )
+ Utils.wrap_df(rbdf)
+ end

  # Read a CSV file in batches.
  #
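The new `_read_csv_impl` also shows how glob sources are handled: a path containing `*` is routed through `scan_csv` and collected, so `dtypes` must be a name-keyed Hash and `columns` must be column names rather than integer indices. A hedged sketch of that path (the path and column names are illustrative):

    # Eager read over a glob pattern; delegated to the lazy scanner internally.
    df = Polars.read_csv(
      "logs/2024-*.csv",
      dtypes: {"status" => Polars::Int64},
      columns: ["timestamp", "status"]
    )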
@@ -755,6 +397,8 @@ module Polars
  # allocation needed.
  # @param eol_char [String]
  # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
  #
  # @return [BatchedCsvReader]
  #
@@ -774,6 +418,7 @@ module Polars
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
+ missing_utf8_is_empty_string: false,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
@@ -787,7 +432,10 @@ module Polars
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
- eol_char: "\n"
+ eol_char: "\n",
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false
  )
  projection, columns = Utils.handle_projection_columns(columns)

@@ -813,6 +461,7 @@ module Polars
  skip_rows: skip_rows,
  dtypes: dtypes,
  null_values: null_values,
+ missing_utf8_is_empty_string: missing_utf8_is_empty_string,
  ignore_errors: ignore_errors,
  parse_dates: parse_dates,
  n_threads: n_threads,
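The batched reader picks up the same new keywords (`missing_utf8_is_empty_string`, `raise_if_empty`, `truncate_ragged_lines`, `decimal_comma`). A sketch of batched consumption, assuming the reader is exposed as `Polars.read_csv_batched` and batches are pulled with `next_batches`, as in upstream Polars (the file name is illustrative):

    reader = Polars.read_csv_batched("big.csv", truncate_ragged_lines: true)
    while (batches = reader.next_batches(5))
      # Each batch is a DataFrame; process and discard to keep memory bounded.
      batches.each { |df| puts df.height }
    end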
@@ -827,36 +476,205 @@ module Polars
  row_count_offset: row_count_offset,
  sample_size: sample_size,
  eol_char: eol_char,
- new_columns: new_columns
+ new_columns: new_columns,
+ raise_if_empty: raise_if_empty,
+ truncate_ragged_lines: truncate_ragged_lines,
+ decimal_comma: decimal_comma
  )
  end

- # Get a schema of the IPC file without reading data.
+ # Lazily read from a CSV file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and
+ # projections to the scan level, thereby potentially reducing
+ # memory overhead.
  #
  # @param source [Object]
- # Path to a file or a file-like object.
+ # Path to a file.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to None to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines. The header will be parsed at this
+ # offset.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param with_column_names [Object]
+ # Apply a function over the column names.
+ # This can be used to update a schema just in time, thus before
+ # scanning.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters.
+ # @param low_memory [Boolean]
+ # Reduce memory usage in expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/ files are parsed.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
  #
- # @return [Hash]
- def read_ipc_schema(source)
+ # @return [LazyFrame]
+ def scan_csv(
+ source,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ missing_utf8_is_empty_string: false,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n",
+ raise_if_empty: true,
+ truncate_ragged_lines: false,
+ decimal_comma: false,
+ glob: true
+ )
+ Utils._check_arg_is_1byte("sep", sep, false)
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
+
  if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ source = Utils.normalize_filepath(source)
  end

- Plr.ipc_schema(source)
+ _scan_csv_impl(
+ source,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ cache: cache,
+ with_column_names: with_column_names,
+ infer_schema_length: infer_schema_length,
+ n_rows: n_rows,
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ encoding: encoding,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ parse_dates: parse_dates,
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines
+ )
  end

- # Get a schema of the Parquet file without reading data.
- #
- # @param source [Object]
- # Path to a file or a file-like object.
- #
- # @return [Hash]
- def read_parquet_schema(source)
- if Utils.pathlike?(source)
- source = Utils.normalise_filepath(source)
+ # @private
+ def _scan_csv_impl(
+ file,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n",
+ truncate_ragged_lines: true
+ )
+ dtype_list = nil
+ if !dtypes.nil?
+ dtype_list = []
+ dtypes.each do |k, v|
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
+ end
  end
-
- Plr.parquet_schema(source)
+ processed_null_values = Utils._process_null_values(null_values)
+
+ rblf =
+ RbLazyFrame.new_from_csv(
+ file,
+ sep,
+ has_header,
+ ignore_errors,
+ skip_rows,
+ n_rows,
+ cache,
+ dtype_list,
+ low_memory,
+ comment_char,
+ quote_char,
+ processed_null_values,
+ infer_schema_length,
+ with_column_names,
+ rechunk,
+ skip_rows_after_header,
+ encoding,
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
+ parse_dates,
+ eol_char,
+ truncate_ragged_lines
+ )
+ Utils.wrap_ldf(rblf)
  end

  private
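`scan_csv` keeps its lazy contract while delegating to the private `_scan_csv_impl`, so filters and projections are pushed into the CSV scan before anything is materialized. A minimal sketch with an illustrative file and columns:

    lf = Polars.scan_csv("sales.csv", truncate_ragged_lines: true)
    df = lf
      .filter(Polars.col("amount") > 100)
      .select(["region", "amount"])
      .collect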