polars-df 0.20.0-x64-mingw-ucrt → 0.21.1-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/Cargo.lock +192 -186
  4. data/LICENSE-THIRD-PARTY.txt +2153 -2532
  5. data/LICENSE.txt +1 -1
  6. data/lib/polars/3.2/polars.so +0 -0
  7. data/lib/polars/3.3/polars.so +0 -0
  8. data/lib/polars/3.4/polars.so +0 -0
  9. data/lib/polars/array_expr.rb +382 -3
  10. data/lib/polars/array_name_space.rb +281 -0
  11. data/lib/polars/binary_expr.rb +67 -0
  12. data/lib/polars/binary_name_space.rb +43 -0
  13. data/lib/polars/cat_expr.rb +224 -0
  14. data/lib/polars/cat_name_space.rb +130 -32
  15. data/lib/polars/catalog/unity/catalog_info.rb +20 -0
  16. data/lib/polars/catalog/unity/column_info.rb +31 -0
  17. data/lib/polars/catalog/unity/namespace_info.rb +21 -0
  18. data/lib/polars/catalog/unity/table_info.rb +50 -0
  19. data/lib/polars/catalog.rb +448 -0
  20. data/lib/polars/config.rb +2 -2
  21. data/lib/polars/convert.rb +12 -2
  22. data/lib/polars/data_frame.rb +834 -48
  23. data/lib/polars/data_type_expr.rb +52 -0
  24. data/lib/polars/data_types.rb +61 -5
  25. data/lib/polars/date_time_expr.rb +251 -0
  26. data/lib/polars/date_time_name_space.rb +299 -0
  27. data/lib/polars/exceptions.rb +7 -2
  28. data/lib/polars/expr.rb +1247 -211
  29. data/lib/polars/functions/col.rb +6 -5
  30. data/lib/polars/functions/datatype.rb +21 -0
  31. data/lib/polars/functions/lazy.rb +127 -15
  32. data/lib/polars/functions/repeat.rb +4 -0
  33. data/lib/polars/io/csv.rb +19 -1
  34. data/lib/polars/io/json.rb +16 -0
  35. data/lib/polars/io/ndjson.rb +13 -0
  36. data/lib/polars/io/parquet.rb +70 -66
  37. data/lib/polars/io/scan_options.rb +47 -0
  38. data/lib/polars/lazy_frame.rb +1099 -95
  39. data/lib/polars/list_expr.rb +400 -11
  40. data/lib/polars/list_name_space.rb +321 -5
  41. data/lib/polars/meta_expr.rb +71 -22
  42. data/lib/polars/name_expr.rb +36 -0
  43. data/lib/polars/scan_cast_options.rb +64 -0
  44. data/lib/polars/schema.rb +84 -3
  45. data/lib/polars/selector.rb +210 -0
  46. data/lib/polars/selectors.rb +932 -203
  47. data/lib/polars/series.rb +1083 -63
  48. data/lib/polars/string_expr.rb +435 -9
  49. data/lib/polars/string_name_space.rb +729 -45
  50. data/lib/polars/struct_expr.rb +103 -0
  51. data/lib/polars/struct_name_space.rb +19 -1
  52. data/lib/polars/utils/parse.rb +40 -0
  53. data/lib/polars/utils/various.rb +18 -1
  54. data/lib/polars/utils.rb +9 -1
  55. data/lib/polars/version.rb +1 -1
  56. data/lib/polars.rb +10 -0
  57. metadata +12 -2
data/lib/polars/functions/col.rb CHANGED
@@ -8,11 +8,11 @@ module Polars
      if Utils.strlike?(name)
        names_str = [name]
        names_str.concat(more_names)
-       return Utils.wrap_expr(Plr.cols(names_str.map(&:to_s)))
+       return Selector._by_name(names_str.map(&:to_s), strict: true).as_expr
      elsif Utils.is_polars_dtype(name)
        dtypes = [name]
        dtypes.concat(more_names)
-       return Utils.wrap_expr(Plr.dtype_cols(dtypes))
+       return Selector._by_dtype(dtypes).as_expr
      else
        msg = "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
        raise TypeError, msg
@@ -22,7 +22,8 @@ module Polars
      if Utils.strlike?(name)
        Utils.wrap_expr(Plr.col(name.to_s))
      elsif Utils.is_polars_dtype(name)
-       Utils.wrap_expr(Plr.dtype_cols([name]))
+       dtypes = [name]
+       Selector._by_dtype(dtypes).as_expr
      elsif name.is_a?(::Array) || name.is_a?(::Set)
        names = Array(name)
        if names.empty?
@@ -31,9 +32,9 @@ module Polars

        item = names[0]
        if Utils.strlike?(item)
-         Utils.wrap_expr(Plr.cols(names.map(&:to_s)))
+         Selector._by_name(names.map(&:to_s), strict: true).as_expr
        elsif Utils.is_polars_dtype(item)
-         Utils.wrap_expr(Plr.dtype_cols(names))
+         Selector._by_dtype(names).as_expr
        else
          msg = "invalid input for `col`\n\nExpected iterable of type `str` or `DataType`, got iterable of type #{item.class.name}."
          raise TypeError, msg
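
Note: both `col` call styles keep their public behavior; only the backing implementation moved to selectors. A minimal sketch, assuming nothing beyond the signatures shown above:

```ruby
df = Polars::DataFrame.new({"a" => [1, 2], "b" => ["x", "y"]})

df.select(Polars.col("a", "b"))       # by name  -> Selector._by_name under the hood
df.select(Polars.col(Polars::Int64))  # by dtype -> Selector._by_dtype under the hood
```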
data/lib/polars/functions/datatype.rb ADDED
@@ -0,0 +1,21 @@
+ module Polars
+   module Functions
+     # Get a lazily evaluated :class:`DataType` of a column or expression.
+     #
+     # @note
+     #   This functionality is considered **unstable**. It may be changed
+     #   at any point without it being considered a breaking change.
+     #
+     # @return [DataTypeExpr]
+     def dtype_of(col_or_expr)
+       e = nil
+       if col_or_expr.is_a?(::String)
+         e = F.col(col_or_expr)
+       else
+         e = col_or_expr
+       end
+
+       DataTypeExpr._from_rbdatatype_expr(RbDataTypeExpr.of_expr(e._rbexpr))
+     end
+   end
+ end
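
The new `dtype_of` returns a lazily evaluated `DataTypeExpr`. A sketch of one way it can compose; that `Expr#cast` accepts a `DataTypeExpr` mirrors upstream Polars and is an assumption here:

```ruby
df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => ["7", "8", "9"]})

# Cast "b" to whatever dtype "a" resolves to at plan time
# (assumes Expr#cast accepts a DataTypeExpr, as in upstream Polars)
df.select(Polars.col("b").cast(Polars.dtype_of("a")))
```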
data/lib/polars/functions/lazy.rb CHANGED
@@ -1,5 +1,18 @@
  module Polars
    module Functions
+     # Select a field in the current `struct.with_fields` scope.
+     #
+     # @param name [Object]
+     #   Name of the field(s) to select.
+     #
+     # @return [Expr]
+     def field(name)
+       if name.is_a?(::String)
+         name = [name]
+       end
+       Utils.wrap_expr(Plr.field(name))
+     end
+
      # Alias for an element in evaluated in an `eval` expression.
      #
      # @return [Expr]
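
Per the docstring, `field` only has meaning inside a `struct.with_fields` scope (`with_fields` itself lands in this release's struct_expr.rb changes). A hedged sketch:

```ruby
df = Polars::DataFrame.new({"coords" => [{"x" => 1, "y" => 2}]})

# Inside with_fields, Polars.field refers to fields of the enclosing struct
df.select(
  Polars.col("coords").struct.with_fields(
    (Polars.field("x") * 10).alias("x10")
  )
)
```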
@@ -458,7 +471,7 @@ module Polars
    # # └─────┴─────┘
    def first(*columns)
      if columns.empty?
-       return Utils.wrap_expr(Plr.first)
+       return cs.first.as_expr
      end

      col(*columns).first
@@ -518,7 +531,7 @@ module Polars
    # # └─────┴─────┘
    def last(*columns)
      if columns.empty?
-       return Utils.wrap_expr(Plr.last)
+       return cs.last.as_expr
      end

      col(*columns).last
@@ -565,12 +578,8 @@ module Polars
    # # │ bar ┆ 8   │
    # # │ baz ┆ 3   │
    # # └─────┴─────┘
-   def nth(*indices)
-     if indices.length == 1 && indices[0].is_a?(Array)
-       indices = indices[0]
-     end
-
-     Utils.wrap_expr(Plr.index_cols(indices))
+   def nth(*indices, strict: true)
+     cs.by_index(*indices, require_all: strict).as_expr
    end

    # Get the first `n` rows.
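
`nth` now delegates to `cs.by_index` with `require_all: strict`. A small sketch of the new keyword:

```ruby
df = Polars::DataFrame.new({"a" => [1], "b" => [2]})

df.select(Polars.nth(1))                    # second column; raises if out of range
df.select(Polars.nth(0, 5, strict: false))  # presumed: out-of-range indices no longer raise
```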
@@ -675,12 +684,12 @@ module Polars
    #   Column name or Expression.
    # @param b [Object]
    #   Column name or Expression.
+   # @param method ["pearson", "spearman"]
+   #   Correlation method.
    # @param ddof [Integer]
    #   "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
    #   where N represents the number of elements.
    #   By default ddof is 1.
-   # @param method ["pearson", "spearman"]
-   #   Correlation method.
    # @param propagate_nans [Boolean]
    #   If `true` any `NaN` encountered will lead to `NaN` in the output.
    #   Defaults to `False` where `NaN` are regarded as larger than any finite number
@@ -795,14 +804,82 @@ module Polars
    # Accumulate over multiple columns horizontally/row wise with a left fold.
    #
    # @return [Expr]
-   def fold(acc, f, exprs)
+   #
+   # @example Horizontally sum over all columns and add 1.
+   #   df = Polars::DataFrame.new(
+   #     {
+   #       "a" => [1, 2, 3],
+   #       "b" => [3, 4, 5],
+   #       "c" => [5, 6, 7]
+   #     }
+   #   )
+   #   df.select(
+   #     Polars.fold(
+   #       Polars.lit(1), ->(acc, x) { acc + x }, Polars.col("*")
+   #     ).alias("sum")
+   #   )
+   #   # =>
+   #   # shape: (3, 1)
+   #   # ┌─────┐
+   #   # │ sum │
+   #   # │ --- │
+   #   # │ i64 │
+   #   # ╞═════╡
+   #   # │ 10  │
+   #   # │ 13  │
+   #   # │ 16  │
+   #   # └─────┘
+   #
+   # @example You can also apply a condition/predicate on all columns:
+   #   df = Polars::DataFrame.new(
+   #     {
+   #       "a" => [1, 2, 3],
+   #       "b" => [0, 1, 2]
+   #     }
+   #   )
+   #   df.filter(
+   #     Polars.fold(
+   #       Polars.lit(true),
+   #       ->(acc, x) { acc & x },
+   #       Polars.col("*") > 1
+   #     )
+   #   )
+   #   # =>
+   #   # shape: (1, 2)
+   #   # ┌─────┬─────┐
+   #   # │ a   ┆ b   │
+   #   # │ --- ┆ --- │
+   #   # │ i64 ┆ i64 │
+   #   # ╞═════╪═════╡
+   #   # │ 3   ┆ 2   │
+   #   # └─────┴─────┘
+   def fold(
+     acc,
+     function,
+     exprs,
+     returns_scalar: false,
+     return_dtype: nil
+   )
      acc = Utils.parse_into_expression(acc, str_as_lit: true)
      if exprs.is_a?(Expr)
        exprs = [exprs]
      end

+     rt = nil
+     if !return_dtype.nil?
+       rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
+     end
+
      exprs = Utils.parse_into_list_of_expressions(exprs)
-     Utils.wrap_expr(Plr.fold(acc, f, exprs))
+     Utils.wrap_expr(
+       Plr.fold(
+         acc,
+         function,
+         exprs,
+         returns_scalar,
+         rt
+       )
+     )
    end

    # def reduce
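
A sketch exercising the new `fold` keywords; passing a plain dtype for `return_dtype` assumes `Utils.parse_into_datatype_expr` accepts one, mirroring how upstream Polars parses this argument:

```ruby
df = Polars::DataFrame.new({"a" => [1, 2], "b" => [3, 4]})

df.select(
  Polars.fold(
    Polars.lit(0),
    ->(acc, x) { acc + x },
    Polars.col("*"),
    return_dtype: Polars::Int32  # pin the output dtype instead of inferring from acc
  ).alias("row_sum")
)
```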
@@ -815,11 +892,17 @@ module Polars
    # @param acc [Object]
    #   Accumulator Expression. This is the value that will be initialized when the fold
    #   starts. For a sum this could for instance be lit(0).
-   # @param f [Object]
+   # @param function [Object]
    #   Function to apply over the accumulator and the value.
    #   Fn(acc, value) -> new_value
    # @param exprs [Object]
    #   Expressions to aggregate over. May also be a wildcard expression.
+   # @param returns_scalar [Boolean]
+   #   Whether or not `function` applied returns a scalar. This must be set correctly
+   #   by the user.
+   # @param return_dtype [Object]
+   #   Output datatype.
+   #   If not set, the dtype will be inferred based on the dtype of the accumulator.
    # @param include_init [Boolean]
    #   Include the initial accumulator state as struct field.
    #
@@ -851,14 +934,35 @@ module Polars
    # # │ 2   ┆ 4   ┆ 6   ┆ {3,7,13}  │
    # # │ 3   ┆ 5   ┆ 7   ┆ {4,9,16}  │
    # # └─────┴─────┴─────┴───────────┘
-   def cum_fold(acc, f, exprs, include_init: false)
+   def cum_fold(
+     acc,
+     function,
+     exprs,
+     returns_scalar: false,
+     return_dtype: nil,
+     include_init: false
+   )
      acc = Utils.parse_into_expression(acc, str_as_lit: true)
      if exprs.is_a?(Expr)
        exprs = [exprs]
      end

+     rt = nil
+     if !return_dtype.nil?
+       rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
+     end
+
      exprs = Utils.parse_into_list_of_expressions(exprs)
-     Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
+     Utils.wrap_expr(
+       Plr.cum_fold(
+         acc,
+         function,
+         exprs,
+         returns_scalar,
+         rt,
+         include_init
+       )._alias("cum_fold")
+     )
    end
    alias_method :cumfold, :cum_fold

@@ -1047,8 +1151,16 @@ module Polars
    #
    # @param exprs [Object]
    #   Columns use to determine the ordering.
+   # @param more_exprs [Array]
+   #   Additional columns to arg sort by, specified as positional arguments.
    # @param reverse [Boolean]
    #   Default is ascending.
+   # @param nulls_last [Boolean]
+   #   Place null values last.
+   # @param multithreaded [Boolean]
+   #   Sort using multiple threads.
+   # @param maintain_order [Boolean]
+   #   Whether the order should be maintained if elements are equal.
    #
    # @return [Expr]
    #
data/lib/polars/functions/repeat.rb CHANGED
@@ -6,6 +6,10 @@ module Polars
    #   Value to repeat.
    # @param n [Integer]
    #   Repeat `n` times.
+   # @param dtype [Object]
+   #   Data type of the resulting column. If set to `nil` (default), data type is
+   #   inferred from the given value. Defaults to Int32 for integer values, unless
+   #   Int64 is required to fit the given value. Defaults to Float64 for float values.
    # @param eager [Boolean]
    #   Run eagerly and collect into a `Series`.
    # @param name [String]
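
A one-line sketch of the documented `dtype` override:

```ruby
# Without dtype: this would infer Int32; the keyword pins Int8 instead
Polars.repeat(1, 5, dtype: Polars::Int8, eager: true)
```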
data/lib/polars/io/csv.rb CHANGED
@@ -347,6 +347,9 @@ module Polars
    #   - `String`: All values equal to this string will be null.
    #   - `Array`: All values equal to any string in this array will be null.
    #   - `Hash`: A hash that maps column name to a null value string.
+   # @param missing_utf8_is_empty_string [Boolean]
+   #   By default a missing value is considered to be null; if you would prefer missing
+   #   utf8 values to be treated as the empty string you can set this param true.
    # @param ignore_errors [Boolean]
    #   Try to keep reading lines if some lines yield errors.
    #   First try `infer_schema_length: 0` to read all columns as
@@ -387,8 +390,13 @@ module Polars
    #   Offset to start the row_count column (only used if the name is set).
    # @param eol_char [String]
    #   Single byte end of line character.
+   # @param raise_if_empty [Boolean]
+   #   When there is no data in the source, `NoDataError` is raised. If this parameter
+   #   is set to false, `nil` will be returned from `next_batches(n)` instead.
    # @param truncate_ragged_lines [Boolean]
    #   Truncate lines that are longer than the schema.
+   # @param decimal_comma [Boolean]
+   #   Parse floats using a comma as the decimal separator instead of a period.
    #
    # @return [BatchedCsvReader]
    #
@@ -491,7 +499,7 @@ module Polars
    #   for instance `#`.
    # @param quote_char [String]
    #   Single byte character used for csv quoting.
-   #   Set to None to turn off special handling and escaping of quotes.
+   #   Set to nil to turn off special handling and escaping of quotes.
    # @param skip_rows [Integer]
    #   Start reading after `skip_rows` lines. The header will be parsed at this
    #   offset.
@@ -503,6 +511,9 @@ module Polars
    #   - `String`: All values equal to this string will be null.
    #   - `Array`: All values equal to any string in this array will be null.
    #   - `Hash`: A hash that maps column name to a null value string.
+   # @param missing_utf8_is_empty_string [Boolean]
+   #   By default a missing value is considered to be null; if you would prefer missing
+   #   utf8 values to be treated as the empty string you can set this param true.
    # @param ignore_errors [Boolean]
    #   Try to keep reading lines if some lines yield errors.
    #   First try `infer_schema_length: 0` to read all columns as
@@ -538,8 +549,15 @@ module Polars
    #   the column remains of data type `:str`.
    # @param eol_char [String]
    #   Single byte end of line character.
+   # @param raise_if_empty [Boolean]
+   #   When there is no data in the source, `NoDataError` is raised. If this parameter
+   #   is set to false, an empty LazyFrame (with no columns) is returned instead.
    # @param truncate_ragged_lines [Boolean]
    #   Truncate lines that are longer than the schema.
+   # @param decimal_comma [Boolean]
+   #   Parse floats using a comma as the decimal separator instead of a period.
+   # @param glob [Boolean]
+   #   Expand path given via globbing rules.
    #
    # @return [LazyFrame]
    def scan_csv(
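
A sketch of the newly documented scan flags; the `sep:` keyword comes from the gem's existing CSV API (an assumption here) and matters because a comma cannot serve as both separator and decimal mark:

```ruby
lf = Polars.scan_csv(
  "data.csv",
  sep: ";",              # assumed existing keyword; frees the comma for decimals
  decimal_comma: true,   # "1,5" parses as 1.5
  raise_if_empty: false  # empty source -> empty LazyFrame instead of NoDataError
)
```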
data/lib/polars/io/json.rb CHANGED
@@ -4,6 +4,22 @@ module Polars
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
+   # @param schema [Object]
+   #   The DataFrame schema may be declared in several ways:
+   #
+   #   * As a hash of \{name:type} pairs; if type is nil, it will be auto-inferred.
+   #   * As an array of column names; in this case types are automatically inferred.
+   #   * As an array of [name,type] pairs; this is equivalent to the hash form.
+   #
+   #   If you supply an array of column names that does not match the names in the
+   #   underlying data, the names given here will overwrite them. The number
+   #   of names given in the schema should match the underlying data dimensions.
+   # @param schema_overrides [Hash]
+   #   Support type specification or override of one or more columns; note that
+   #   any dtypes inferred from the schema param will be overridden.
+   # @param infer_schema_length [Integer]
+   #   The maximum number of rows to scan for schema inference.
+   #   If set to `nil`, the full data may be scanned *(this is slow)*.
    #
    # @return [DataFrame]
    def read_json(
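
A sketch of the newly documented schema parameters:

```ruby
# Declare types up front instead of inferring them from the data
Polars.read_json(
  "data.json",
  schema: {"id" => Polars::Int64, "name" => Polars::Utf8}
)
```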
data/lib/polars/io/ndjson.rb CHANGED
@@ -4,6 +4,19 @@ module Polars
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
+   # @param schema [Object]
+   #   The DataFrame schema may be declared in several ways:
+   #
+   #   * As a hash of \{name:type} pairs; if type is nil, it will be auto-inferred.
+   #   * As an array of column names; in this case types are automatically inferred.
+   #   * As an array of [name,type] pairs; this is equivalent to the hash form.
+   #
+   #   If you supply an array of column names that does not match the names in the
+   #   underlying data, the names given here will overwrite them. The number
+   #   of names given in the schema should match the underlying data dimensions.
+   # @param schema_overrides [Hash]
+   #   Support type specification or override of one or more columns; note that
+   #   any dtypes inferred from the schema param will be overridden.
    #
    # @return [DataFrame]
    def read_ndjson(
data/lib/polars/io/parquet.rb CHANGED
@@ -43,12 +43,18 @@ module Polars
    #   Extra options that make sense for a particular storage connection.
    # @param credential_provider [Object]
    #   Provide a function that can be called to provide cloud storage
-   #   credentials. The function is expected to return a dictionary of
+   #   credentials. The function is expected to return a hash of
    #   credential keys along with an optional credential expiry time.
    # @param retries [Integer]
    #   Number of retries if accessing a cloud instance fails.
    # @param include_file_paths [String]
    #   Include the path of the source file(s) as a column with this name.
+   # @param allow_missing_columns [Boolean]
+   #   When reading a list of parquet files, if a column existing in the first
+   #   file cannot be found in subsequent files, the default behavior is to
+   #   raise an error. However, if `allow_missing_columns` is set to
+   #   `true`, a full-NULL column is returned instead of erroring for the files
+   #   that do not contain the column.
    #
    # @return [DataFrame]
    def read_parquet(
@@ -117,7 +123,26 @@ module Polars
        source = Utils.normalize_filepath(source)
      end

-     Plr.parquet_schema(source)
+     # TODO return Schema
+     scan_parquet(source).collect_schema.to_h
+   end
+
+   # Get file-level custom metadata of a Parquet file without reading data.
+   #
+   # @note
+   #   This functionality is considered **experimental**. It may be removed or
+   #   changed at any point without it being considered a breaking change.
+   #
+   # @param source [Object]
+   #   Path to a file or a file-like object.
+   #
+   # @return [Hash]
+   def read_parquet_metadata(source)
+     if Utils.pathlike?(source)
+       source = Utils.normalize_filepath(source, check_not_directory: false)
+     end
+
+     Plr.read_parquet_metadata(source)
    end

    # Lazily read from a parquet file or multiple files via glob patterns.
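
Usage of the new experimental helper; the returned keys depend entirely on what the writing application stored:

```ruby
# File-level custom key/value metadata; no row data is read
meta = Polars.read_parquet_metadata("data.parquet")
meta.class # => Hash
```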
@@ -165,12 +190,23 @@ module Polars
    #   Extra options that make sense for a particular storage connection.
    # @param credential_provider [Object]
    #   Provide a function that can be called to provide cloud storage
-   #   credentials. The function is expected to return a dictionary of
+   #   credentials. The function is expected to return a hash of
    #   credential keys along with an optional credential expiry time.
    # @param retries [Integer]
    #   Number of retries if accessing a cloud instance fails.
    # @param include_file_paths [String]
    #   Include the path of the source file(s) as a column with this name.
+   # @param allow_missing_columns [Boolean]
+   #   When reading a list of parquet files, if a column existing in the first
+   #   file cannot be found in subsequent files, the default behavior is to
+   #   raise an error. However, if `allow_missing_columns` is set to
+   #   `true`, a full-NULL column is returned instead of erroring for the files
+   #   that do not contain the column.
+   # @param extra_columns ['ignore', 'raise']
+   #   Configuration for behavior when extra columns outside of the
+   #   defined schema are encountered in the data:
+   #   * `ignore`: Silently ignores.
+   #   * `raise`: Raises an error.
    #
    # @return [LazyFrame]
    def scan_parquet(
@@ -192,8 +228,13 @@ module Polars
      credential_provider: nil,
      retries: 2,
      include_file_paths: nil,
-     allow_missing_columns: false
+     allow_missing_columns: false,
+     extra_columns: "raise",
+     _column_mapping: nil,
+     _deletion_files: nil
    )
+     missing_columns = allow_missing_columns ? "insert" : "raise"
+
      if Utils.pathlike?(source)
        source = Utils.normalize_filepath(source, check_not_directory: false)
      elsif Utils.is_path_or_str_sequence(source)
@@ -204,56 +245,11 @@ module Polars
        raise Todo
      end

-     _scan_parquet_impl(
-       source,
-       n_rows: n_rows,
-       cache: cache,
-       parallel: parallel,
-       rechunk: rechunk,
-       row_index_name: row_count_name,
-       row_index_offset: row_count_offset,
-       storage_options: storage_options,
-       credential_provider: credential_provider,
-       low_memory: low_memory,
-       use_statistics: use_statistics,
-       hive_partitioning: hive_partitioning,
-       schema: schema,
-       hive_schema: hive_schema,
-       try_parse_hive_dates: try_parse_hive_dates,
-       retries: retries,
-       glob: glob,
-       include_file_paths: include_file_paths,
-       allow_missing_columns: allow_missing_columns
-     )
-   end
-
-   # @private
-   def _scan_parquet_impl(
-     source,
-     n_rows: nil,
-     cache: true,
-     parallel: "auto",
-     rechunk: true,
-     row_index_name: nil,
-     row_index_offset: 0,
-     storage_options: nil,
-     credential_provider: nil,
-     low_memory: false,
-     use_statistics: true,
-     hive_partitioning: nil,
-     glob: true,
-     schema: nil,
-     hive_schema: nil,
-     try_parse_hive_dates: true,
-     retries: 2,
-     include_file_paths: nil,
-     allow_missing_columns: false
-   )
      if source.is_a?(::Array)
        sources = source
        source = nil
      else
-       sources = []
+       sources = [source]
      end

      if storage_options
@@ -262,27 +258,35 @@ module Polars
        storage_options = nil
      end

+     row_index_name = row_count_name
+     row_index_offset = row_count_offset
+
      rblf =
        RbLazyFrame.new_from_parquet(
-         source,
          sources,
-         n_rows,
-         cache,
+         schema,
+         ScanOptions.new(
+           row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
+           pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
+           # cast_options: cast_options,
+           extra_columns: extra_columns,
+           missing_columns: missing_columns,
+           include_file_paths: include_file_paths,
+           glob: glob,
+           hive_partitioning: hive_partitioning,
+           hive_schema: hive_schema,
+           try_parse_hive_dates: try_parse_hive_dates,
+           rechunk: rechunk,
+           cache: cache,
+           storage_options: storage_options,
+           # credential_provider: credential_provider_builder,
+           retries: retries,
+           deletion_files: _deletion_files,
+           column_mapping: _column_mapping
+         ),
          parallel,
-         rechunk,
-         Utils.parse_row_index_args(row_index_name, row_index_offset),
          low_memory,
-         storage_options,
-         credential_provider,
-         use_statistics,
-         hive_partitioning,
-         schema,
-         hive_schema,
-         try_parse_hive_dates,
-         retries,
-         glob,
-         include_file_paths,
-         allow_missing_columns
+         use_statistics
        )
      Utils.wrap_ldf(rblf)
    end
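
The old boolean now maps onto the string-valued `missing_columns` policy internally ("insert" vs "raise"). A sketch of the user-facing knobs:

```ruby
lf = Polars.scan_parquet(
  ["part-0.parquet", "part-1.parquet"],
  allow_missing_columns: true,  # becomes missing_columns: "insert" -> full-null columns
  extra_columns: "ignore"       # silently ignore columns outside the expected schema
)
```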
data/lib/polars/io/scan_options.rb ADDED
@@ -0,0 +1,47 @@
+ module Polars
+   module IO
+     class ScanOptions
+       attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
+         :include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
+         :rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping, :deletion_files
+
+       def initialize(
+         row_index: nil,
+         pre_slice: nil,
+         cast_options: nil,
+         extra_columns: "raise",
+         missing_columns: "raise",
+         include_file_paths: nil,
+         glob: true,
+         hive_partitioning: nil,
+         hive_schema: nil,
+         try_parse_hive_dates: true,
+         rechunk: false,
+         cache: true,
+         storage_options: nil,
+         credential_provider: nil,
+         retries: 2,
+         column_mapping: nil,
+         deletion_files: nil
+       )
+         @row_index = row_index
+         @pre_slice = pre_slice
+         @cast_options = cast_options
+         @extra_columns = extra_columns
+         @missing_columns = missing_columns
+         @include_file_paths = include_file_paths
+         @glob = glob
+         @hive_partitioning = hive_partitioning
+         @hive_schema = hive_schema
+         @try_parse_hive_dates = try_parse_hive_dates
+         @rechunk = rechunk
+         @cache = cache
+         @storage_options = storage_options
+         @credential_provider = credential_provider
+         @retries = retries
+         @column_mapping = column_mapping
+         @deletion_files = deletion_files
+       end
+     end
+   end
+ end
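
`ScanOptions` is a plain keyword-initialized options bag with readers, consumed by `RbLazyFrame.new_from_parquet` as shown in the parquet hunk above. A minimal sketch:

```ruby
opts = Polars::IO::ScanOptions.new(glob: false, retries: 5)
opts.glob             # => false
opts.retries          # => 5
opts.missing_columns  # => "raise" (default)
```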