polars-df 0.1.2 → 0.1.3

data/lib/polars/io.rb CHANGED
@@ -1,5 +1,93 @@
 module Polars
   module IO
+    # Read a CSV file into a DataFrame.
+    #
+    # @param file [Object]
+    #   Path to a file or a file-like object.
+    # @param has_header [Boolean]
+    #   Indicate if the first row of the dataset is a header or not.
+    #   If set to false, column names will be autogenerated in the
+    #   following format: `column_x`, with `x` being an
+    #   enumeration over every column in the dataset starting at 1.
+    # @param columns [Object]
+    #   Columns to select. Accepts a list of column indices (starting
+    #   at zero) or a list of column names.
+    # @param new_columns [Object]
+    #   Rename columns right after parsing the CSV file. If the given
+    #   list is shorter than the width of the DataFrame, the remaining
+    #   columns will keep their original names.
+    # @param sep [String]
+    #   Single byte character to use as delimiter in the file.
+    # @param comment_char [String]
+    #   Single byte character that indicates the start of a comment line,
+    #   for instance `#`.
+    # @param quote_char [String]
+    #   Single byte character used for CSV quoting.
+    #   Set to nil to turn off special handling and escaping of quotes.
+    # @param skip_rows [Integer]
+    #   Start reading after `skip_rows` lines.
+    # @param dtypes [Object]
+    #   Overwrite dtypes during inference.
+    # @param null_values [Object]
+    #   Values to interpret as null values. You can provide a:
+    #
+    #   - `String`: All values equal to this string will be null.
+    #   - `Array`: All values equal to any string in this array will be null.
+    #   - `Hash`: A hash that maps column name to a null value string.
+    # @param ignore_errors [Boolean]
+    #   Try to keep reading lines if some lines yield errors.
+    #   First try `infer_schema_length: 0` to read all columns as
+    #   `:str` to check which values might cause an issue.
+    # @param parse_dates [Boolean]
+    #   Try to automatically parse dates. If this does not succeed,
+    #   the column remains of data type `:str`.
+    # @param n_threads [Integer]
+    #   Number of threads to use in CSV parsing.
+    #   Defaults to the number of physical CPUs of your system.
+    # @param infer_schema_length [Integer]
+    #   Maximum number of lines to read to infer schema.
+    #   If set to 0, all columns will be read as `:str`.
+    #   If set to `nil`, a full table scan will be done (slow).
+    # @param batch_size [Integer]
+    #   Number of lines to read into the buffer at once.
+    #   Modify this to change performance.
+    # @param n_rows [Integer]
+    #   Stop reading from the CSV file after reading `n_rows`.
+    #   During multi-threaded parsing, an upper bound of `n_rows`
+    #   rows cannot be guaranteed.
+    # @param encoding ["utf8", "utf8-lossy"]
+    #   Lossy means that invalid UTF-8 values are replaced with `�`
+    #   characters. When using encodings other than `utf8` or
+    #   `utf8-lossy`, the input is first decoded in memory with Ruby.
+    # @param low_memory [Boolean]
+    #   Reduce memory usage at the expense of performance.
+    # @param rechunk [Boolean]
+    #   Make sure that all columns are contiguous in memory by
+    #   aggregating the chunks into a single array.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a
+    #   particular storage connection.
+    # @param skip_rows_after_header [Integer]
+    #   Skip this number of rows when the header is parsed.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into
+    #   the DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param sample_size [Integer]
+    #   Set the sample size. This is used to sample statistics to estimate the
+    #   allocation needed.
+    # @param eol_char [String]
+    #   Single byte end-of-line character.
+    #
+    # @return [DataFrame]
+    #
+    # @note
+    #   This operation defaults to a `rechunk` operation at the end, meaning that
+    #   all data will be stored contiguously in memory.
+    #   Set `rechunk: false` if you are benchmarking the CSV reader. A `rechunk` is
+    #   an expensive operation.
     def read_csv(
       file,
       has_header: true,
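For orientation, a minimal sketch of how the newly documented keywords combine; the path and column names here are hypothetical:

    # Semicolon-delimited file; treat "NA" as null, parse date columns,
    # and keep only two columns by name.
    df = Polars.read_csv(
      "data/measurements.csv",
      sep: ";",
      null_values: "NA",
      parse_dates: true,
      columns: ["station", "temp"]
    )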
@@ -84,6 +172,75 @@ module Polars
       end
     end
 
+    # Lazily read from a CSV file or multiple files via glob patterns.
+    #
+    # This allows the query optimizer to push down predicates and
+    # projections to the scan level, thereby potentially reducing
+    # memory overhead.
+    #
+    # @param file [Object]
+    #   Path to a file.
+    # @param has_header [Boolean]
+    #   Indicate if the first row of the dataset is a header or not.
+    #   If set to false, column names will be autogenerated in the
+    #   following format: `column_x`, with `x` being an
+    #   enumeration over every column in the dataset starting at 1.
+    # @param sep [String]
+    #   Single byte character to use as delimiter in the file.
+    # @param comment_char [String]
+    #   Single byte character that indicates the start of a comment line,
+    #   for instance `#`.
+    # @param quote_char [String]
+    #   Single byte character used for CSV quoting.
+    #   Set to nil to turn off special handling and escaping of quotes.
+    # @param skip_rows [Integer]
+    #   Start reading after `skip_rows` lines. The header will be parsed at this
+    #   offset.
+    # @param dtypes [Object]
+    #   Overwrite dtypes during inference.
+    # @param null_values [Object]
+    #   Values to interpret as null values. You can provide a:
+    #
+    #   - `String`: All values equal to this string will be null.
+    #   - `Array`: All values equal to any string in this array will be null.
+    #   - `Hash`: A hash that maps column name to a null value string.
+    # @param ignore_errors [Boolean]
+    #   Try to keep reading lines if some lines yield errors.
+    #   First try `infer_schema_length: 0` to read all columns as
+    #   `:str` to check which values might cause an issue.
+    # @param cache [Boolean]
+    #   Cache the result after reading.
+    # @param with_column_names [Object]
+    #   Apply a function over the column names.
+    #   This can be used to update a schema just in time, thus before
+    #   scanning.
+    # @param infer_schema_length [Integer]
+    #   Maximum number of lines to read to infer schema.
+    #   If set to 0, all columns will be read as `:str`.
+    #   If set to `nil`, a full table scan will be done (slow).
+    # @param n_rows [Integer]
+    #   Stop reading from the CSV file after reading `n_rows`.
+    # @param encoding ["utf8", "utf8-lossy"]
+    #   Lossy means that invalid UTF-8 values are replaced with `�`
+    #   characters.
+    # @param low_memory [Boolean]
+    #   Reduce memory usage at the expense of performance.
+    # @param rechunk [Boolean]
+    #   Reallocate to contiguous memory when all chunks/files are parsed.
+    # @param skip_rows_after_header [Integer]
+    #   Skip this number of rows when the header is parsed.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into
+    #   the DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param parse_dates [Boolean]
+    #   Try to automatically parse dates. If this does not succeed,
+    #   the column remains of data type `:str`.
+    # @param eol_char [String]
+    #   Single byte end-of-line character.
+    #
+    # @return [LazyFrame]
     def scan_csv(
       file,
       has_header: true,
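A minimal sketch of the lazy pattern this enables (path and column names hypothetical): the filter and select below are pushed down into the scan, so only matching rows and the named columns are materialized on `collect`:

    lazy = Polars.scan_csv("logs/*.csv")
    errors = lazy
      .filter(Polars.col("status") > 499)   # predicate pushed to the scan
      .select([Polars.col("path"), Polars.col("status")])
      .collect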
@@ -140,6 +297,32 @@ module Polars
       )
     end
 
+    # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
+    #
+    # This allows the query optimizer to push down predicates and projections to the scan
+    # level, thereby potentially reducing memory overhead.
+    #
+    # @param file [String]
+    #   Path to an IPC file.
+    # @param n_rows [Integer]
+    #   Stop reading from the IPC file after reading `n_rows`.
+    # @param cache [Boolean]
+    #   Cache the result after reading.
+    # @param rechunk [Boolean]
+    #   Reallocate to contiguous memory when all chunks/files are parsed.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param memory_map [Boolean]
+    #   Try to memory map the file. This can greatly improve performance on repeated
+    #   queries as the OS may cache pages.
+    #   Only uncompressed IPC files can be memory mapped.
+    #
+    # @return [LazyFrame]
     def scan_ipc(
       file,
       n_rows: nil,
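Sketch of a memory-mapped lazy scan with a row count column added at scan time (file name hypothetical):

    lazy = Polars.scan_ipc("events.arrow", memory_map: true, row_count_name: "row_nr")
    lazy.collect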
@@ -162,6 +345,34 @@ module Polars
       )
     end
 
+    # Lazily read from a parquet file or multiple files via glob patterns.
+    #
+    # This allows the query optimizer to push down predicates and projections to the scan
+    # level, thereby potentially reducing memory overhead.
+    #
+    # @param file [String]
+    #   Path to a file.
+    # @param n_rows [Integer]
+    #   Stop reading from the parquet file after reading `n_rows`.
+    # @param cache [Boolean]
+    #   Cache the result after reading.
+    # @param parallel ["auto", "columns", "row_groups", "none"]
+    #   This determines the direction of parallelism. `auto` will try to determine the
+    #   optimal direction.
+    # @param rechunk [Boolean]
+    #   In case of reading multiple files via a glob pattern, rechunk the final DataFrame
+    #   into contiguous memory chunks.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    #
+    # @return [LazyFrame]
     def scan_parquet(
       file,
       n_rows: nil,
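Sketch with a glob pattern and explicit parallelism (paths and column name hypothetical):

    lazy = Polars.scan_parquet("warehouse/part-*.parquet", parallel: "row_groups", rechunk: true)
    lazy.filter(Polars.col("amount") > 100).collect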
@@ -190,6 +401,30 @@ module Polars
       )
     end
 
+    # Lazily read from a newline-delimited JSON file.
+    #
+    # This allows the query optimizer to push down predicates and projections to the scan
+    # level, thereby potentially reducing memory overhead.
+    #
+    # @param file [String]
+    #   Path to a file.
+    # @param infer_schema_length [Integer]
+    #   Infer the schema from the first `infer_schema_length` rows.
+    # @param batch_size [Integer]
+    #   Number of rows to read in each batch.
+    # @param n_rows [Integer]
+    #   Stop reading from the JSON file after reading `n_rows`.
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    # @param rechunk [Boolean]
+    #   Reallocate to contiguous memory when all chunks/files are parsed.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    #
+    # @return [LazyFrame]
     def scan_ndjson(
       file,
       infer_schema_length: 100,
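Sketch for a quick look at a large NDJSON file, capping both schema inference and total rows (file name hypothetical):

    lazy = Polars.scan_ndjson("events.ndjson", infer_schema_length: 1000, n_rows: 10_000)
    lazy.collect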
@@ -219,6 +454,30 @@ module Polars
     # def read_avro
     # end
 
+    # Read into a DataFrame from an Arrow IPC (Feather v2) file.
+    #
+    # @param file [Object]
+    #   Path to a file or a file-like object.
+    # @param columns [Object]
+    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+    #   of column names.
+    # @param n_rows [Integer]
+    #   Stop reading from the IPC file after reading `n_rows`.
+    # @param memory_map [Boolean]
+    #   Try to memory map the file. This can greatly improve performance on repeated
+    #   queries as the OS may cache pages.
+    #   Only uncompressed IPC files can be memory mapped.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param rechunk [Boolean]
+    #   Make sure that all data is contiguous.
+    #
+    # @return [DataFrame]
     def read_ipc(
       file,
       columns: nil,
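Sketch of an eager IPC read projecting two columns while memory mapping the file (file and column names hypothetical):

    df = Polars.read_ipc("table.arrow", columns: ["id", "name"], memory_map: true)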
@@ -243,16 +502,74 @@ module Polars
       end
     end
 
-    def read_parquet(file)
+    # Read into a DataFrame from a parquet file.
+    #
+    # @param file [Object]
+    #   Path to a file or a file-like object.
+    # @param columns [Object]
+    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
+    #   of column names.
+    # @param n_rows [Integer]
+    #   Stop reading from the parquet file after reading `n_rows`.
+    # @param storage_options [Hash]
+    #   Extra options that make sense for a particular storage connection.
+    # @param parallel ["auto", "columns", "row_groups", "none"]
+    #   This determines the direction of parallelism. `auto` will try to determine the
+    #   optimal direction.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into the
+    #   DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param low_memory [Boolean]
+    #   Reduce memory pressure at the expense of performance.
+    #
+    # @return [DataFrame]
+    #
+    # @note
+    #   This operation defaults to a `rechunk` operation at the end, meaning that
+    #   all data will be stored contiguously in memory.
+    #   Set `rechunk: false` if you are benchmarking the parquet reader. A `rechunk` is
+    #   an expensive operation.
+    def read_parquet(
+      file,
+      columns: nil,
+      n_rows: nil,
+      storage_options: nil,
+      parallel: "auto",
+      row_count_name: nil,
+      row_count_offset: 0,
+      low_memory: false
+    )
       _prepare_file_arg(file) do |data|
-        DataFrame._read_parquet(data)
+        DataFrame._read_parquet(
+          data,
+          columns: columns,
+          n_rows: n_rows,
+          parallel: parallel,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          low_memory: low_memory
+        )
       end
     end
 
+    # Read into a DataFrame from a JSON file.
+    #
+    # @param file [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [DataFrame]
     def read_json(file)
       DataFrame._read_json(file)
     end
 
+    # Read into a DataFrame from a newline-delimited JSON file.
+    #
+    # @param file [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [DataFrame]
     def read_ndjson(file)
       DataFrame._read_ndjson(file)
     end
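With the expanded signature, the eager parquet reader now accepts the same projection and row-limit options as the lazy scanner; a sketch using the new keywords (file and column names hypothetical):

    df = Polars.read_parquet(
      "metrics.parquet",
      columns: ["ts", "value"],
      n_rows: 1_000,
      parallel: "columns"
    )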
@@ -263,6 +580,96 @@ module Polars
     # def read_excel
     # end
 
+    # Read a CSV file in batches.
+    #
+    # Upon creation of the `BatchedCsvReader`, Polars will gather statistics
+    # and determine the file chunks. After that, work will only be done
+    # if `next_batches` is called.
+    #
+    # @param file [Object]
+    #   Path to a file or a file-like object.
+    # @param has_header [Boolean]
+    #   Indicate if the first row of the dataset is a header or not.
+    #   If set to false, column names will be autogenerated in the
+    #   following format: `column_x`, with `x` being an
+    #   enumeration over every column in the dataset starting at 1.
+    # @param columns [Object]
+    #   Columns to select. Accepts a list of column indices (starting
+    #   at zero) or a list of column names.
+    # @param new_columns [Object]
+    #   Rename columns right after parsing the CSV file. If the given
+    #   list is shorter than the width of the DataFrame, the remaining
+    #   columns will keep their original names.
+    # @param sep [String]
+    #   Single byte character to use as delimiter in the file.
+    # @param comment_char [String]
+    #   Single byte character that indicates the start of a comment line,
+    #   for instance `#`.
+    # @param quote_char [String]
+    #   Single byte character used for CSV quoting, default = `"`.
+    #   Set to nil to turn off special handling and escaping of quotes.
+    # @param skip_rows [Integer]
+    #   Start reading after `skip_rows` lines.
+    # @param dtypes [Object]
+    #   Overwrite dtypes during inference.
+    # @param null_values [Object]
+    #   Values to interpret as null values. You can provide a:
+    #
+    #   - `String`: All values equal to this string will be null.
+    #   - `Array`: All values equal to any string in this array will be null.
+    #   - `Hash`: A hash that maps column name to a null value string.
+    # @param ignore_errors [Boolean]
+    #   Try to keep reading lines if some lines yield errors.
+    #   First try `infer_schema_length: 0` to read all columns as
+    #   `:str` to check which values might cause an issue.
+    # @param parse_dates [Boolean]
+    #   Try to automatically parse dates. If this does not succeed,
+    #   the column remains of data type `:str`.
+    # @param n_threads [Integer]
+    #   Number of threads to use in CSV parsing.
+    #   Defaults to the number of physical CPUs of your system.
+    # @param infer_schema_length [Integer]
+    #   Maximum number of lines to read to infer schema.
+    #   If set to 0, all columns will be read as `:str`.
+    #   If set to `nil`, a full table scan will be done (slow).
+    # @param batch_size [Integer]
+    #   Number of lines to read into the buffer at once.
+    #   Modify this to change performance.
+    # @param n_rows [Integer]
+    #   Stop reading from the CSV file after reading `n_rows`.
+    #   During multi-threaded parsing, an upper bound of `n_rows`
+    #   rows cannot be guaranteed.
+    # @param encoding ["utf8", "utf8-lossy"]
+    #   Lossy means that invalid UTF-8 values are replaced with `�`
+    #   characters. When using encodings other than `utf8` or
+    #   `utf8-lossy`, the input is first decoded in memory with
+    #   Ruby. Defaults to `utf8`.
+    # @param low_memory [Boolean]
+    #   Reduce memory usage at the expense of performance.
+    # @param rechunk [Boolean]
+    #   Make sure that all columns are contiguous in memory by
+    #   aggregating the chunks into a single array.
+    # @param skip_rows_after_header [Integer]
+    #   Skip this number of rows when the header is parsed.
+    # @param row_count_name [String]
+    #   If not nil, this will insert a row count column with the given name into
+    #   the DataFrame.
+    # @param row_count_offset [Integer]
+    #   Offset to start the row_count column (only used if the name is set).
+    # @param sample_size [Integer]
+    #   Set the sample size. This is used to sample statistics to estimate the
+    #   allocation needed.
+    # @param eol_char [String]
+    #   Single byte end-of-line character.
+    #
+    # @return [BatchedCsvReader]
+    #
+    # @example
+    #   reader = Polars.read_csv_batched(
+    #     "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
+    #   )
+    #   reader.next_batches(5)
     def read_csv_batched(
       file,
       has_header: true,
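Beyond the one-shot example above, a batched reader is typically drained in a loop. A sketch, assuming `next_batches` returns nil once the file is exhausted (mirroring the Python API), with `process` standing in for user code:

    reader = Polars.read_csv_batched("big.csv", sep: "|")
    while (batches = reader.next_batches(5))
      batches.each { |df| process(df) }
    end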
@@ -331,6 +738,12 @@ module Polars
       )
     end
 
+    # Get the schema of an IPC file without reading data.
+    #
+    # @param file [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [Hash]
     def read_ipc_schema(file)
       if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
         file = Utils.format_path(file)
@@ -339,6 +752,12 @@ module Polars
       _ipc_schema(file)
     end
 
+    # Get the schema of a Parquet file without reading data.
+    #
+    # @param file [Object]
+    #   Path to a file or a file-like object.
+    #
+    # @return [Hash]
     def read_parquet_schema(file)
       if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
         file = Utils.format_path(file)
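Both schema helpers return a Hash mapping column names to dtypes without materializing any rows; a sketch with hypothetical files (the exact dtype representation shown is an assumption):

    Polars.read_ipc_schema("table.arrow")          # => {"id" => :i64, "name" => :str, ...}
    Polars.read_parquet_schema("metrics.parquet")  # same shape, read from parquet metadata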