polars-df 0.3.0-arm64-darwin → 0.4.0-arm64-darwin

data/lib/polars/io.rb CHANGED
@@ -2,7 +2,7 @@ module Polars
   module IO
     # Read a CSV file into a DataFrame.
     #
-    # @param file [Object]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param has_header [Boolean]
     #   Indicate if the first row of dataset is a header or not.
@@ -89,7 +89,7 @@ module Polars
     #   Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
     #   an expensive operation.
     def read_csv(
-      file,
+      source,
       has_header: true,
       columns: nil,
       new_columns: nil,
@@ -137,7 +137,7 @@ module Polars
       end

       df = nil
-      _prepare_file_arg(file) do |data|
+      _prepare_file_arg(source) do |data|
         df = DataFrame._read_csv(
           data,
           has_header: has_header,
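
Since the path is the first positional argument of `read_csv`, the `file` → `source` rename is a documentation-level change and existing positional calls keep working. A minimal sketch (file name and data are illustrative):

    require "polars-df"
    require "stringio"

    # positional call, unaffected by the rename
    df = Polars.read_csv("data.csv", has_header: true)

    # file-like objects also pass through _prepare_file_arg
    df = Polars.read_csv(StringIO.new("a,b\n1,2\n3,4"))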
@@ -178,7 +178,7 @@ module Polars
     #   projections to the scan level, thereby potentially reducing
     #   memory overhead.
     #
-    # @param file [Object]
+    # @param source [Object]
     #   Path to a file.
     # @param has_header [Boolean]
     #   Indicate if the first row of dataset is a header or not.
@@ -242,7 +242,7 @@ module Polars
     #
     # @return [LazyFrame]
     def scan_csv(
-      file,
+      source,
       has_header: true,
       sep: ",",
       comment_char: nil,
@@ -268,12 +268,12 @@ module Polars
       _check_arg_is_1byte("comment_char", comment_char, false)
       _check_arg_is_1byte("quote_char", quote_char, true)

-      if Utils.pathlike?(file)
-        file = Utils.format_path(file)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

       LazyFrame._scan_csv(
-        file,
+        source,
         has_header: has_header,
         sep: sep,
         comment_char: comment_char,
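
Internally, `Utils.format_path` is now `Utils.normalise_filepath`; callers should see no behavior change, since anything passing the `Utils.pathlike?` check is still normalized before the scan. A sketch of the lazy path (file name illustrative):

    require "pathname"

    # String and Pathname sources are both path-like
    lf = Polars.scan_csv(Pathname.new("data.csv"), sep: ",")
    df = lf.filter(Polars.col("a") > 1).collect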
@@ -302,7 +302,7 @@ module Polars
     # This allows the query optimizer to push down predicates and projections to the scan
     # level, thereby potentially reducing memory overhead.
     #
-    # @param file [String]
+    # @param source [String]
     #   Path to an IPC file.
     # @param n_rows [Integer]
     #   Stop reading from IPC file after reading `n_rows`.
@@ -324,7 +324,7 @@ module Polars
     #
     # @return [LazyFrame]
     def scan_ipc(
-      file,
+      source,
       n_rows: nil,
       cache: true,
       rechunk: true,
@@ -334,7 +334,7 @@ module Polars
       memory_map: true
     )
       LazyFrame._scan_ipc(
-        file,
+        source,
         n_rows: n_rows,
         cache: cache,
         rechunk: rechunk,
@@ -350,7 +350,7 @@ module Polars
     # This allows the query optimizer to push down predicates and projections to the scan
     # level, thereby potentially reducing memory overhead.
     #
-    # @param file [String]
+    # @param source [String]
     #   Path to a file.
     # @param n_rows [Integer]
     #   Stop reading from parquet file after reading `n_rows`.
@@ -374,7 +374,7 @@ module Polars
     #
     # @return [LazyFrame]
     def scan_parquet(
-      file,
+      source,
       n_rows: nil,
       cache: true,
       parallel: "auto",
@@ -384,12 +384,12 @@ module Polars
       storage_options: nil,
       low_memory: false
     )
-      if Utils.pathlike?(file)
-        file = Utils.format_path(file)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

       LazyFrame._scan_parquet(
-        file,
+        source,
         n_rows: n_rows,
         cache: cache,
         parallel: parallel,
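
`scan_parquet` follows the same pattern: only the parameter name and the internal path helper changed. A hedged usage sketch (file and column names are hypothetical):

    lf = Polars.scan_parquet("events.parquet", parallel: "auto")
    # predicate and projection are pushed down to the scan
    df = lf.filter(Polars.col("status") == "ok").select(["id", "status"]).collect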
@@ -406,7 +406,7 @@ module Polars
     # This allows the query optimizer to push down predicates and projections to the scan
     # level, thereby potentially reducing memory overhead.
     #
-    # @param file [String]
+    # @param source [String]
     #   Path to a file.
     # @param infer_schema_length [Integer]
     #   Infer the schema length from the first `infer_schema_length` rows.
@@ -426,7 +426,7 @@ module Polars
     #
     # @return [LazyFrame]
     def scan_ndjson(
-      file,
+      source,
       infer_schema_length: 100,
       batch_size: 1024,
       n_rows: nil,
@@ -435,12 +435,12 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0
     )
-      if Utils.pathlike?(file)
-        file = Utils.format_path(file)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

       LazyFrame._scan_ndjson(
-        file,
+        source,
         infer_schema_length: infer_schema_length,
         batch_size: batch_size,
         n_rows: n_rows,
@@ -453,7 +453,7 @@ module Polars

     # Read into a DataFrame from Apache Avro format.
     #
-    # @param file [Object]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param columns [Object]
     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
@@ -462,17 +462,17 @@ module Polars
     #   Stop reading from Apache Avro file after reading ``n_rows``.
     #
     # @return [DataFrame]
-    def read_avro(file, columns: nil, n_rows: nil)
-      if Utils.pathlike?(file)
-        file = Utils.format_path(file)
+    def read_avro(source, columns: nil, n_rows: nil)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

-      DataFrame._read_avro(file, n_rows: n_rows, columns: columns)
+      DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
     end

     # Read into a DataFrame from Arrow IPC (Feather v2) file.
     #
-    # @param file [Object]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param columns [Object]
     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
@@ -495,7 +495,7 @@ module Polars
     #
     # @return [DataFrame]
     def read_ipc(
-      file,
+      source,
       columns: nil,
       n_rows: nil,
       memory_map: true,
@@ -505,7 +505,7 @@ module Polars
       rechunk: true
     )
       storage_options ||= {}
-      _prepare_file_arg(file, **storage_options) do |data|
+      _prepare_file_arg(source, **storage_options) do |data|
         DataFrame._read_ipc(
           data,
           columns: columns,
@@ -520,8 +520,8 @@ module Polars

     # Read into a DataFrame from a parquet file.
     #
-    # @param file [Object]
-    #   Path to a file, or a file-like object.
+    # @param source [Object]
+    #   Path to a file or a file-like object.
     # @param columns [Object]
     #   Columns to select. Accepts a list of column indices (starting at zero) or a list
     #   of column names.
@@ -539,6 +539,12 @@ module Polars
     #   Offset to start the row_count column (only use if the name is set).
     # @param low_memory [Boolean]
     #   Reduce memory pressure at the expense of performance.
+    # @param use_statistics [Boolean]
+    #   Use statistics in the parquet to determine if pages
+    #   can be skipped from reading.
+    # @param rechunk [Boolean]
+    #   Make sure that all columns are contiguous in memory by
+    #   aggregating the chunks into a single array.
     #
     # @return [DataFrame]
     #
@@ -548,16 +554,18 @@ module Polars
     #   Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
     #   an expensive operation.
     def read_parquet(
-      file,
+      source,
       columns: nil,
       n_rows: nil,
       storage_options: nil,
       parallel: "auto",
       row_count_name: nil,
       row_count_offset: 0,
-      low_memory: false
+      low_memory: false,
+      use_statistics: true,
+      rechunk: true
     )
-      _prepare_file_arg(file) do |data|
+      _prepare_file_arg(source) do |data|
         DataFrame._read_parquet(
           data,
           columns: columns,
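
The new `use_statistics` and `rechunk` keywords default to `true`, mirroring the documented behavior above. A sketch (file name hypothetical):

    df = Polars.read_parquet(
      "events.parquet",
      use_statistics: true, # let min/max statistics skip non-matching pages
      rechunk: false        # skip the extra copy, e.g. when benchmarking the reader
    )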
@@ -565,49 +573,51 @@ module Polars
           parallel: parallel,
           row_count_name: row_count_name,
           row_count_offset: row_count_offset,
-          low_memory: low_memory
+          low_memory: low_memory,
+          use_statistics: use_statistics,
+          rechunk: rechunk
         )
       end
     end

     # Read into a DataFrame from a JSON file.
     #
-    # @param file [Object]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     #
     # @return [DataFrame]
-    def read_json(file)
-      DataFrame._read_json(file)
+    def read_json(source)
+      DataFrame._read_json(source)
     end

     # Read into a DataFrame from a newline delimited JSON file.
     #
-    # @param file [Object]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     #
     # @return [DataFrame]
-    def read_ndjson(file)
-      DataFrame._read_ndjson(file)
+    def read_ndjson(source)
+      DataFrame._read_ndjson(source)
     end

     # Read a SQL query into a DataFrame.
     #
-    # @param sql [Object]
+    # @param query [Object]
     #   ActiveRecord::Relation or ActiveRecord::Result.
     #
     # @return [DataFrame]
-    def read_sql(sql)
+    def read_database(query)
       if !defined?(ActiveRecord)
         raise Error, "Active Record not available"
       end

       result =
-        if sql.is_a?(ActiveRecord::Result)
-          sql
-        elsif sql.is_a?(ActiveRecord::Relation)
-          sql.connection.select_all(sql.to_sql)
-        elsif sql.is_a?(String)
-          ActiveRecord::Base.connection.select_all(sql)
+        if query.is_a?(ActiveRecord::Result)
+          query
+        elsif query.is_a?(ActiveRecord::Relation)
+          query.connection.select_all(query.to_sql)
+        elsif query.is_a?(String)
+          ActiveRecord::Base.connection.select_all(query)
         else
           raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
         end
@@ -617,6 +627,7 @@ module Polars
       end
       DataFrame.new(data)
     end
+    alias_method :read_sql, :read_database

     # def read_excel
     # end
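
Because of the `alias_method`, `read_sql` remains available, so the rename to `read_database` is non-breaking. A sketch assuming a hypothetical ActiveRecord model `User`:

    # new name, accepts a Relation, a Result, or a SQL string
    df = Polars.read_database(User.where(active: true))
    df = Polars.read_database("SELECT id, name FROM users")

    # old name still resolves through the alias
    df = Polars.read_sql(User.all)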
@@ -628,7 +639,7 @@ module Polars
     #   file chunks. After that work will only be done
     #   if `next_batches` is called.
     #
-    # @param file [Object]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     # @param has_header [Boolean]
     #   Indicate if the first row of dataset is a header or not.
@@ -712,7 +723,7 @@ module Polars
     #   )
     #   reader.next_batches(5)
     def read_csv_batched(
-      file,
+      source,
       has_header: true,
       columns: nil,
       new_columns: nil,
@@ -752,7 +763,7 @@ module Polars
       end

       BatchedCsvReader.new(
-        file,
+        source,
         has_header: has_header,
         columns: columns || projection,
         sep: sep,
@@ -781,30 +792,30 @@ module Polars

     # Get a schema of the IPC file without reading data.
     #
-    # @param file [Object]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     #
     # @return [Hash]
-    def read_ipc_schema(file)
-      if Utils.pathlike?(file)
-        file = Utils.format_path(file)
+    def read_ipc_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

-      _ipc_schema(file)
+      _ipc_schema(source)
     end

     # Get a schema of the Parquet file without reading data.
     #
-    # @param file [Object]
+    # @param source [Object]
     #   Path to a file or a file-like object.
     #
     # @return [Hash]
-    def read_parquet_schema(file)
-      if Utils.pathlike?(file)
-        file = Utils.format_path(file)
+    def read_parquet_schema(source)
+      if Utils.pathlike?(source)
+        source = Utils.normalise_filepath(source)
       end

-      _parquet_schema(file)
+      _parquet_schema(source)
     end

     private
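
Both schema helpers return a Hash mapping column names to dtypes without reading any rows, which makes them a cheap way to validate files up front. A sketch (file names hypothetical):

    schema = Polars.read_parquet_schema("events.parquet")
    # keys are column names, values are dtypes; the exact dtype
    # representation depends on the library version
    schema.key?("id")

    Polars.read_ipc_schema("events.arrow")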

data/lib/polars/lazy_frame.rb CHANGED
@@ -80,7 +80,8 @@ module Polars
       row_count_name: nil,
       row_count_offset: 0,
       storage_options: nil,
-      low_memory: false
+      low_memory: false,
+      use_statistics: true
     )
       _from_rbldf(
         RbLazyFrame.new_from_parquet(
@@ -90,7 +91,8 @@ module Polars
           parallel,
           rechunk,
           Utils._prepare_row_count_args(row_count_name, row_count_offset),
-          low_memory
+          low_memory,
+          use_statistics
         )
       )
     end
@@ -107,7 +109,7 @@ module Polars
       memory_map: true
     )
       if Utils.pathlike?(file)
-        file = Utils.format_path(file)
+        file = Utils.normalise_filepath(file)
       end

       _from_rbldf(
@@ -157,7 +159,7 @@ module Polars
     # @return [LazyFrame]
     def self.read_json(file)
       if Utils.pathlike?(file)
-        file = Utils.format_path(file)
+        file = Utils.normalise_filepath(file)
       end

       Utils.wrap_ldf(RbLazyFrame.read_json(file))
@@ -264,7 +266,7 @@ module Polars
     # @return [nil]
     def write_json(file)
       if Utils.pathlike?(file)
-        file = Utils.format_path(file)
+        file = Utils.normalise_filepath(file)
       end
       _ldf.write_json(file)
       nil
@@ -473,6 +475,96 @@ module Polars
       Utils.wrap_df(ldf.collect)
     end

+    # Persists a LazyFrame at the provided path.
+    #
+    # This allows streaming results that are larger than RAM to be written to disk.
+    #
+    # @param path [String]
+    #   File path to which the file should be written.
+    # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
+    #   Choose "zstd" for good compression performance.
+    #   Choose "lz4" for fast compression/decompression.
+    #   Choose "snappy" for more backwards compatibility guarantees
+    #   when you deal with older parquet readers.
+    # @param compression_level [Integer]
+    #   The level of compression to use. Higher compression means smaller files on
+    #   disk.
+    #
+    #   - "gzip" : min-level: 0, max-level: 10.
+    #   - "brotli" : min-level: 0, max-level: 11.
+    #   - "zstd" : min-level: 1, max-level: 22.
+    # @param statistics [Boolean]
+    #   Write statistics to the parquet headers. This requires extra compute.
+    # @param row_group_size [Integer]
+    #   Size of the row groups in number of rows.
+    #   If `nil` (default), the chunks of the `DataFrame` are
+    #   used. Writing in smaller chunks may reduce memory pressure and improve
+    #   writing speeds.
+    # @param data_pagesize_limit [Integer]
+    #   Size limit of individual data pages.
+    #   If not set, defaults to 1024 * 1024 bytes.
+    # @param maintain_order [Boolean]
+    #   Maintain the order in which data is processed.
+    #   Setting this to `false` will be slightly faster.
+    # @param type_coercion [Boolean]
+    #   Do type coercion optimization.
+    # @param predicate_pushdown [Boolean]
+    #   Do predicate pushdown optimization.
+    # @param projection_pushdown [Boolean]
+    #   Do projection pushdown optimization.
+    # @param simplify_expression [Boolean]
+    #   Run simplify expressions optimization.
+    # @param no_optimization [Boolean]
+    #   Turn off (certain) optimizations.
+    # @param slice_pushdown [Boolean]
+    #   Slice pushdown optimization.
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   lf = Polars.scan_csv("/path/to/my_larger_than_ram_file.csv")
+    #   lf.sink_parquet("out.parquet")
+    def sink_parquet(
+      path,
+      compression: "zstd",
+      compression_level: nil,
+      statistics: false,
+      row_group_size: nil,
+      data_pagesize_limit: nil,
+      maintain_order: true,
+      type_coercion: true,
+      predicate_pushdown: true,
+      projection_pushdown: true,
+      simplify_expression: true,
+      no_optimization: false,
+      slice_pushdown: true
+    )
+      if no_optimization
+        predicate_pushdown = false
+        projection_pushdown = false
+        slice_pushdown = false
+      end
+
+      lf = _ldf.optimization_toggle(
+        type_coercion,
+        predicate_pushdown,
+        projection_pushdown,
+        simplify_expression,
+        slice_pushdown,
+        false,
+        true
+      )
+      lf.sink_parquet(
+        path,
+        compression,
+        compression_level,
+        statistics,
+        row_group_size,
+        data_pagesize_limit,
+        maintain_order
+      )
+    end
+
     # Collect a small number of rows for debugging purposes.
     #
     # Fetch is like a {#collect} operation, but it overwrites the number of rows
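
A usage sketch of the new sink, following the @example above: the plan runs on the streaming engine and is written to disk without collecting the result into memory (paths illustrative):

    lf = Polars.scan_csv("my_larger_than_ram_file.csv")
    lf.sink_parquet(
      "out.parquet",
      compression: "zstd", # good size/speed trade-off per the docs above
      statistics: true     # extra compute now, faster selective reads later
    )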
@@ -2192,6 +2284,10 @@ module Polars
     #   Name to give to the `variable` column. Defaults to "variable"
     # @param value_name [String]
     #   Name to give to the `value` column. Defaults to "value"
+    # @param streamable [Boolean]
+    #   Allow this node to run in the streaming engine.
+    #   If this runs in streaming, the output of the melt operation
+    #   will not have a stable ordering.
     #
     # @return [LazyFrame]
     #
@@ -2218,7 +2314,7 @@ module Polars
     #   # │ y   ┆ c        ┆ 4     │
     #   # │ z   ┆ c        ┆ 6     │
     #   # └─────┴──────────┴───────┘
-    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil)
+    def melt(id_vars: nil, value_vars: nil, variable_name: nil, value_name: nil, streamable: true)
       if value_vars.is_a?(String)
         value_vars = [value_vars]
       end
@@ -2232,7 +2328,7 @@ module Polars
         id_vars = []
       end
       _from_rbldf(
-        _ldf.melt(id_vars, value_vars, value_name, variable_name)
+        _ldf.melt(id_vars, value_vars, value_name, variable_name, streamable)
       )
     end

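The new `streamable` flag defaults to `true`; per the docs above, a streamed melt may return rows in an unstable order, so pass `streamable: false` when downstream code depends on row order. Sketch:

    df = Polars::DataFrame.new({"a" => ["x", "y"], "b" => [1, 3], "c" => [2, 4]})
    out = df.lazy.melt(id_vars: "a", value_vars: ["b", "c"], streamable: false).collect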

data/lib/polars/lazy_functions.rb CHANGED
@@ -657,7 +657,7 @@ module Polars
     #   Default is ascending.
     #
     # @return [Expr]
-    def argsort_by(exprs, reverse: false)
+    def arg_sort_by(exprs, reverse: false)
       if !exprs.is_a?(Array)
         exprs = [exprs]
       end
@@ -665,8 +665,9 @@ module Polars
         reverse = [reverse] * exprs.length
       end
       exprs = Utils.selection_to_rbexpr_list(exprs)
-      Utils.wrap_expr(RbExpr.argsort_by(exprs, reverse))
+      Utils.wrap_expr(RbExpr.arg_sort_by(exprs, reverse))
     end
+    alias_method :argsort_by, :arg_sort_by

     # Create polars `Duration` from distinct time components.
     #
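
The rename brings the Ruby API in line with upstream Polars' `arg_sort_by`, and the `alias_method` keeps the old spelling working. Sketch:

    df = Polars::DataFrame.new({"a" => [3, 1, 2], "b" => [6, 5, 4]})

    # indices that would sort by "a" ascending, then "b" descending
    df.select(Polars.arg_sort_by(["a", "b"], reverse: [false, true]))

    Polars.argsort_by("a") # still available via the alias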

data/lib/polars/list_expr.rb CHANGED
@@ -426,7 +426,7 @@ module Polars
     #   # shape: (2,)
     #   # Series: 'a' [list[i64]]
     #   # [
-    #   #     [null, 1, ... 1]
+    #   #     [null, 1, 1]
     #   #     [null, -8, -1]
     #   # ]
     def diff(n: 1, null_behavior: "ignore")
@@ -447,7 +447,7 @@ module Polars
     #   # shape: (2,)
     #   # Series: 'a' [list[i64]]
     #   # [
-    #   #     [null, 1, ... 3]
+    #   #     [null, 1, 3]
     #   #     [null, 10, 2]
     #   # ]
     def shift(periods = 1)

data/lib/polars/list_name_space.rb CHANGED
@@ -185,7 +185,7 @@ module Polars
     #   # shape: (2,)
     #   # Series: 'a' [list[i64]]
     #   # [
-    #   #     [null, 1, ... 1]
+    #   #     [null, 1, 1]
     #   #     [null, -8, -1]
     #   # ]
     def diff(n: 1, null_behavior: "ignore")
@@ -206,7 +206,7 @@ module Polars
     #   # shape: (2,)
     #   # Series: 'a' [list[i64]]
     #   # [
-    #   #     [null, 1, ... 3]
+    #   #     [null, 1, 3]
     #   #     [null, 10, 2]
     #   # ]
     def shift(periods = 1)