polars-df 0.20.0-x64-mingw-ucrt → 0.21.1-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/Cargo.lock +192 -186
- data/LICENSE-THIRD-PARTY.txt +2153 -2532
- data/LICENSE.txt +1 -1
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/3.4/polars.so +0 -0
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +130 -32
- data/lib/polars/catalog/unity/catalog_info.rb +20 -0
- data/lib/polars/catalog/unity/column_info.rb +31 -0
- data/lib/polars/catalog/unity/namespace_info.rb +21 -0
- data/lib/polars/catalog/unity/table_info.rb +50 -0
- data/lib/polars/catalog.rb +448 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +12 -2
- data/lib/polars/data_frame.rb +834 -48
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +61 -5
- data/lib/polars/date_time_expr.rb +251 -0
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/exceptions.rb +7 -2
- data/lib/polars/expr.rb +1247 -211
- data/lib/polars/functions/col.rb +6 -5
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +127 -15
- data/lib/polars/functions/repeat.rb +4 -0
- data/lib/polars/io/csv.rb +19 -1
- data/lib/polars/io/json.rb +16 -0
- data/lib/polars/io/ndjson.rb +13 -0
- data/lib/polars/io/parquet.rb +70 -66
- data/lib/polars/io/scan_options.rb +47 -0
- data/lib/polars/lazy_frame.rb +1099 -95
- data/lib/polars/list_expr.rb +400 -11
- data/lib/polars/list_name_space.rb +321 -5
- data/lib/polars/meta_expr.rb +71 -22
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/scan_cast_options.rb +64 -0
- data/lib/polars/schema.rb +84 -3
- data/lib/polars/selector.rb +210 -0
- data/lib/polars/selectors.rb +932 -203
- data/lib/polars/series.rb +1083 -63
- data/lib/polars/string_expr.rb +435 -9
- data/lib/polars/string_name_space.rb +729 -45
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/parse.rb +40 -0
- data/lib/polars/utils/various.rb +18 -1
- data/lib/polars/utils.rb +9 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +10 -0
- metadata +12 -2
data/lib/polars/functions/col.rb
CHANGED
@@ -8,11 +8,11 @@ module Polars
|
|
8
8
|
if Utils.strlike?(name)
|
9
9
|
names_str = [name]
|
10
10
|
names_str.concat(more_names)
|
11
|
-
return
|
11
|
+
return Selector._by_name(names_str.map(&:to_s), strict: true).as_expr
|
12
12
|
elsif Utils.is_polars_dtype(name)
|
13
13
|
dtypes = [name]
|
14
14
|
dtypes.concat(more_names)
|
15
|
-
return
|
15
|
+
return Selector._by_type(dtypes).as_expr
|
16
16
|
else
|
17
17
|
msg = "invalid input for `col`\n\nExpected `str` or `DataType`, got #{name.class.name}."
|
18
18
|
raise TypeError, msg
|
@@ -22,7 +22,8 @@ module Polars
|
|
22
22
|
if Utils.strlike?(name)
|
23
23
|
Utils.wrap_expr(Plr.col(name.to_s))
|
24
24
|
elsif Utils.is_polars_dtype(name)
|
25
|
-
|
25
|
+
dtypes = [name]
|
26
|
+
Selector._by_dtype(dtypes).as_expr
|
26
27
|
elsif name.is_a?(::Array) || name.is_a?(::Set)
|
27
28
|
names = Array(name)
|
28
29
|
if names.empty?
|
@@ -31,9 +32,9 @@ module Polars
|
|
31
32
|
|
32
33
|
item = names[0]
|
33
34
|
if Utils.strlike?(item)
|
34
|
-
|
35
|
+
Selector._by_name(names.map(&:to_s), strict: true).as_expr
|
35
36
|
elsif Utils.is_polars_dtype(item)
|
36
|
-
|
37
|
+
Selector._by_dtype(names).as_expr
|
37
38
|
else
|
38
39
|
msg = "invalid input for `col`\n\nExpected iterable of type `str` or `DataType`, got iterable of type #{item.class.name}."
|
39
40
|
raise TypeError, msg
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Polars
|
2
|
+
module Functions
|
3
|
+
# Get a lazily evaluated `DataType` of a column or expression.
|
4
|
+
#
|
5
|
+
# @note
|
6
|
+
# This functionality is considered **unstable**. It may be changed
|
7
|
+
# at any point without it being considered a breaking change.
|
8
|
+
#
|
9
|
+
# @return [DataTypeExpr]
|
10
|
+
def dtype_of(col_or_expr)
|
11
|
+
e = nil
|
12
|
+
if col_or_expr.is_a?(::String)
|
13
|
+
e = F.col(col_or_expr)
|
14
|
+
else
|
15
|
+
e = col_or_expr
|
16
|
+
end
|
17
|
+
|
18
|
+
DataTypeExpr._from_rbdatatype_expr(RbDataTypeExpr.of_expr(e._rbexpr))
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -1,5 +1,18 @@
|
|
1
1
|
module Polars
|
2
2
|
module Functions
|
3
|
+
# Select a field in the current `struct.with_fields` scope.
|
4
|
+
#
|
5
|
+
# @param name [Object]
|
6
|
+
# Name of the field(s) to select.
|
7
|
+
#
|
8
|
+
# @return [Expr]
|
9
|
+
def field(name)
|
10
|
+
if name.is_a?(::String)
|
11
|
+
name = [name]
|
12
|
+
end
|
13
|
+
Utils.wrap_expr(Plr.field(name))
|
14
|
+
end
|
15
|
+
|
3
16
|
# Alias for an element being evaluated in an `eval` expression.
|
4
17
|
#
|
5
18
|
# @return [Expr]
|
@@ -458,7 +471,7 @@ module Polars
|
|
458
471
|
# # └─────┴─────┘
|
459
472
|
def first(*columns)
|
460
473
|
if columns.empty?
|
461
|
-
return
|
474
|
+
return cs.first.as_expr
|
462
475
|
end
|
463
476
|
|
464
477
|
col(*columns).first
|
@@ -518,7 +531,7 @@ module Polars
|
|
518
531
|
# # └─────┴─────┘
|
519
532
|
def last(*columns)
|
520
533
|
if columns.empty?
|
521
|
-
return
|
534
|
+
return cs.last.as_expr
|
522
535
|
end
|
523
536
|
|
524
537
|
col(*columns).last
|
@@ -565,12 +578,8 @@ module Polars
|
|
565
578
|
# # │ bar ┆ 8 │
|
566
579
|
# # │ baz ┆ 3 │
|
567
580
|
# # └─────┴─────┘
|
568
|
-
def nth(*indices)
|
569
|
-
|
570
|
-
indices = indices[0]
|
571
|
-
end
|
572
|
-
|
573
|
-
Utils.wrap_expr(Plr.index_cols(indices))
|
581
|
+
def nth(*indices, strict: true)
|
582
|
+
cs.by_index(*indices, require_all: strict).as_expr
|
574
583
|
end
|
575
584
|
|
576
585
|
# Get the first `n` rows.
|
@@ -675,12 +684,12 @@ module Polars
|
|
675
684
|
# Column name or Expression.
|
676
685
|
# @param b [Object]
|
677
686
|
# Column name or Expression.
|
687
|
+
# @param method ["pearson", "spearman"]
|
688
|
+
# Correlation method.
|
678
689
|
# @param ddof [Integer]
|
679
690
|
# "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
|
680
691
|
# where N represents the number of elements.
|
681
692
|
# By default ddof is 1.
|
682
|
-
# @param method ["pearson", "spearman"]
|
683
|
-
# Correlation method.
|
684
693
|
# @param propagate_nans [Boolean]
|
685
694
|
# If `true` any `NaN` encountered will lead to `NaN` in the output.
|
686
695
|
# Defaults to `false` where `NaN` are regarded as larger than any finite number
|
@@ -795,14 +804,82 @@ module Polars
|
|
795
804
|
# Accumulate over multiple columns horizontally/row wise with a left fold.
|
796
805
|
#
|
797
806
|
# @return [Expr]
|
798
|
-
|
807
|
+
#
|
808
|
+
# @example Horizontally sum over all columns and add 1.
|
809
|
+
# df = Polars::DataFrame.new(
|
810
|
+
# {
|
811
|
+
# "a" => [1, 2, 3],
|
812
|
+
# "b" => [3, 4, 5],
|
813
|
+
# "c" => [5, 6, 7]
|
814
|
+
# }
|
815
|
+
# )
|
816
|
+
# df.select(
|
817
|
+
# Polars.fold(
|
818
|
+
# Polars.lit(1), ->(acc, x) { acc + x }, Polars.col("*")
|
819
|
+
# ).alias("sum")
|
820
|
+
# )
|
821
|
+
# # =>
|
822
|
+
# # shape: (3, 1)
|
823
|
+
# # ┌─────┐
|
824
|
+
# # │ sum │
|
825
|
+
# # │ --- │
|
826
|
+
# # │ i64 │
|
827
|
+
# # ╞═════╡
|
828
|
+
# # │ 10 │
|
829
|
+
# # │ 13 │
|
830
|
+
# # │ 16 │
|
831
|
+
# # └─────┘
|
832
|
+
#
|
833
|
+
# @example You can also apply a condition/predicate on all columns:
|
834
|
+
# df = Polars::DataFrame.new(
|
835
|
+
# {
|
836
|
+
# "a" => [1, 2, 3],
|
837
|
+
# "b" => [0, 1, 2]
|
838
|
+
# }
|
839
|
+
# )
|
840
|
+
# df.filter(
|
841
|
+
# Polars.fold(
|
842
|
+
# Polars.lit(true),
|
843
|
+
# ->(acc, x) { acc & x },
|
844
|
+
# Polars.col("*") > 1
|
845
|
+
# )
|
846
|
+
# )
|
847
|
+
# # =>
|
848
|
+
# # shape: (1, 2)
|
849
|
+
# # ┌─────┬─────┐
|
850
|
+
# # │ a ┆ b │
|
851
|
+
# # │ --- ┆ --- │
|
852
|
+
# # │ i64 ┆ i64 │
|
853
|
+
# # ╞═════╪═════╡
|
854
|
+
# # │ 3 ┆ 2 │
|
855
|
+
# # └─────┴─────┘
|
856
|
+
def fold(
|
857
|
+
acc,
|
858
|
+
function,
|
859
|
+
exprs,
|
860
|
+
returns_scalar: false,
|
861
|
+
return_dtype: nil
|
862
|
+
)
|
799
863
|
acc = Utils.parse_into_expression(acc, str_as_lit: true)
|
800
864
|
if exprs.is_a?(Expr)
|
801
865
|
exprs = [exprs]
|
802
866
|
end
|
803
867
|
|
868
|
+
rt = nil
|
869
|
+
if !return_dtype.nil?
|
870
|
+
rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
|
871
|
+
end
|
872
|
+
|
804
873
|
exprs = Utils.parse_into_list_of_expressions(exprs)
|
805
|
-
Utils.wrap_expr(
|
874
|
+
Utils.wrap_expr(
|
875
|
+
Plr.fold(
|
876
|
+
acc,
|
877
|
+
function,
|
878
|
+
exprs,
|
879
|
+
returns_scalar,
|
880
|
+
rt
|
881
|
+
)
|
882
|
+
)
|
806
883
|
end
|
807
884
|
|
808
885
|
# def reduce
|
@@ -815,11 +892,17 @@ module Polars
|
|
815
892
|
# @param acc [Object]
|
816
893
|
# Accumulator Expression. This is the value that will be initialized when the fold
|
817
894
|
# starts. For a sum this could for instance be lit(0).
|
818
|
-
# @param
|
895
|
+
# @param function [Object]
|
819
896
|
# Function to apply over the accumulator and the value.
|
820
897
|
# Fn(acc, value) -> new_value
|
821
898
|
# @param exprs [Object]
|
822
899
|
# Expressions to aggregate over. May also be a wildcard expression.
|
900
|
+
# @param returns_scalar [Boolean]
|
901
|
+
# Whether or not `function` applied returns a scalar. This must be set correctly
|
902
|
+
# by the user.
|
903
|
+
# @param return_dtype [Object]
|
904
|
+
# Output datatype.
|
905
|
+
# If not set, the dtype will be inferred based on the dtype of the accumulator.
|
823
906
|
# @param include_init [Boolean]
|
824
907
|
# Include the initial accumulator state as struct field.
|
825
908
|
#
|
@@ -851,14 +934,35 @@ module Polars
|
|
851
934
|
# # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
|
852
935
|
# # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
|
853
936
|
# # └─────┴─────┴─────┴───────────┘
|
854
|
-
def cum_fold(
|
937
|
+
def cum_fold(
|
938
|
+
acc,
|
939
|
+
function,
|
940
|
+
exprs,
|
941
|
+
returns_scalar: false,
|
942
|
+
return_dtype: nil,
|
943
|
+
include_init: false
|
944
|
+
)
|
855
945
|
acc = Utils.parse_into_expression(acc, str_as_lit: true)
|
856
946
|
if exprs.is_a?(Expr)
|
857
947
|
exprs = [exprs]
|
858
948
|
end
|
859
949
|
|
950
|
+
rt = nil
|
951
|
+
if !return_dtype.nil?
|
952
|
+
rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
|
953
|
+
end
|
954
|
+
|
860
955
|
exprs = Utils.parse_into_list_of_expressions(exprs)
|
861
|
-
Utils.wrap_expr(
|
956
|
+
Utils.wrap_expr(
|
957
|
+
Plr.cum_fold(
|
958
|
+
acc,
|
959
|
+
function,
|
960
|
+
exprs,
|
961
|
+
returns_scalar,
|
962
|
+
rt,
|
963
|
+
include_init
|
964
|
+
)._alias("cum_fold")
|
965
|
+
)
|
862
966
|
end
|
863
967
|
alias_method :cumfold, :cum_fold
|
864
968
|
|
@@ -1047,8 +1151,16 @@ module Polars
|
|
1047
1151
|
#
|
1048
1152
|
# @param exprs [Object]
|
1049
1153
|
# Columns used to determine the ordering.
|
1154
|
+
# @param more_exprs [Array]
|
1155
|
+
# Additional columns to arg sort by, specified as positional arguments.
|
1050
1156
|
# @param reverse [Boolean]
|
1051
1157
|
# Default is ascending.
|
1158
|
+
# @param nulls_last [Boolean]
|
1159
|
+
# Place null values last.
|
1160
|
+
# @param multithreaded [Boolean]
|
1161
|
+
# Sort using multiple threads.
|
1162
|
+
# @param maintain_order [Boolean]
|
1163
|
+
# Whether the order should be maintained if elements are equal.
|
1052
1164
|
#
|
1053
1165
|
# @return [Expr]
|
1054
1166
|
#
|
@@ -6,6 +6,10 @@ module Polars
|
|
6
6
|
# Value to repeat.
|
7
7
|
# @param n [Integer]
|
8
8
|
# Repeat `n` times.
|
9
|
+
# @param dtype [Object]
|
10
|
+
# Data type of the resulting column. If set to `nil` (default), data type is
|
11
|
+
# inferred from the given value. Defaults to Int32 for integer values, unless
|
12
|
+
# Int64 is required to fit the given value. Defaults to Float64 for float values.
|
9
13
|
# @param eager [Boolean]
|
10
14
|
# Run eagerly and collect into a `Series`.
|
11
15
|
# @param name [String]
|
data/lib/polars/io/csv.rb
CHANGED
@@ -347,6 +347,9 @@ module Polars
|
|
347
347
|
# - `String`: All values equal to this string will be null.
|
348
348
|
# - `Array`: All values equal to any string in this array will be null.
|
349
349
|
# - `Hash`: A hash that maps column name to a null value string.
|
350
|
+
# @param missing_utf8_is_empty_string [Boolean]
|
351
|
+
# By default a missing value is considered to be null; if you would prefer missing
|
352
|
+
# utf8 values to be treated as the empty string you can set this param true.
|
350
353
|
# @param ignore_errors [Boolean]
|
351
354
|
# Try to keep reading lines if some lines yield errors.
|
352
355
|
# First try `infer_schema_length: 0` to read all columns as
|
@@ -387,8 +390,13 @@ module Polars
|
|
387
390
|
# Offset to start the row_count column (only used if the name is set).
|
388
391
|
# @param eol_char [String]
|
389
392
|
# Single byte end of line character.
|
393
|
+
# @param raise_if_empty [Boolean]
|
394
|
+
# When there is no data in the source, `NoDataError` is raised. If this parameter
|
395
|
+
# is set to false, `nil` will be returned from `next_batches(n)` instead.
|
390
396
|
# @param truncate_ragged_lines [Boolean]
|
391
397
|
# Truncate lines that are longer than the schema.
|
398
|
+
# @param decimal_comma [Boolean]
|
399
|
+
# Parse floats using a comma as the decimal separator instead of a period.
|
392
400
|
#
|
393
401
|
# @return [BatchedCsvReader]
|
394
402
|
#
|
@@ -491,7 +499,7 @@ module Polars
|
|
491
499
|
# for instance `#`.
|
492
500
|
# @param quote_char [String]
|
493
501
|
# Single byte character used for csv quoting.
|
494
|
-
# Set to
|
502
|
+
# Set to nil to turn off special handling and escaping of quotes.
|
495
503
|
# @param skip_rows [Integer]
|
496
504
|
# Start reading after `skip_rows` lines. The header will be parsed at this
|
497
505
|
# offset.
|
@@ -503,6 +511,9 @@ module Polars
|
|
503
511
|
# - `String`: All values equal to this string will be null.
|
504
512
|
# - `Array`: All values equal to any string in this array will be null.
|
505
513
|
# - `Hash`: A hash that maps column name to a null value string.
|
514
|
+
# @param missing_utf8_is_empty_string [Boolean]
|
515
|
+
# By default a missing value is considered to be null; if you would prefer missing
|
516
|
+
# utf8 values to be treated as the empty string you can set this param true.
|
506
517
|
# @param ignore_errors [Boolean]
|
507
518
|
# Try to keep reading lines if some lines yield errors.
|
508
519
|
# First try `infer_schema_length: 0` to read all columns as
|
@@ -538,8 +549,15 @@ module Polars
|
|
538
549
|
# the column remains of data type `:str`.
|
539
550
|
# @param eol_char [String]
|
540
551
|
# Single byte end of line character.
|
552
|
+
# @param raise_if_empty [Boolean]
|
553
|
+
# When there is no data in the source, `NoDataError` is raised. If this parameter
|
554
|
+
# is set to false, an empty LazyFrame (with no columns) is returned instead.
|
541
555
|
# @param truncate_ragged_lines [Boolean]
|
542
556
|
# Truncate lines that are longer than the schema.
|
557
|
+
# @param decimal_comma [Boolean]
|
558
|
+
# Parse floats using a comma as the decimal separator instead of a period.
|
559
|
+
# @param glob [Boolean]
|
560
|
+
# Expand path given via globbing rules.
|
543
561
|
#
|
544
562
|
# @return [LazyFrame]
|
545
563
|
def scan_csv(
|
data/lib/polars/io/json.rb
CHANGED
@@ -4,6 +4,22 @@ module Polars
|
|
4
4
|
#
|
5
5
|
# @param source [Object]
|
6
6
|
# Path to a file or a file-like object.
|
7
|
+
# @param schema [Object]
|
8
|
+
# The DataFrame schema may be declared in several ways:
|
9
|
+
#
|
10
|
+
# * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
|
11
|
+
# * As an array of column names; in this case types are automatically inferred.
|
12
|
+
# * As an array of [name,type] pairs; this is equivalent to the hash form.
|
13
|
+
#
|
14
|
+
# If you supply an array of column names that does not match the names in the
|
15
|
+
# underlying data, the names given here will overwrite them. The number
|
16
|
+
# of names given in the schema should match the underlying data dimensions.
|
17
|
+
# @param schema_overrides [Hash]
|
18
|
+
# Support type specification or override of one or more columns; note that
|
19
|
+
# any dtypes inferred from the schema param will be overridden.
|
20
|
+
# @param infer_schema_length [Integer]
|
21
|
+
# The maximum number of rows to scan for schema inference.
|
22
|
+
# If set to `nil`, the full data may be scanned *(this is slow)*.
|
7
23
|
#
|
8
24
|
# @return [DataFrame]
|
9
25
|
def read_json(
|
data/lib/polars/io/ndjson.rb
CHANGED
@@ -4,6 +4,19 @@ module Polars
|
|
4
4
|
#
|
5
5
|
# @param source [Object]
|
6
6
|
# Path to a file or a file-like object.
|
7
|
+
# @param schema [Object]
|
8
|
+
# The DataFrame schema may be declared in several ways:
|
9
|
+
#
|
10
|
+
# * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
|
11
|
+
# * As an array of column names; in this case types are automatically inferred.
|
12
|
+
# * As an array of [name,type] pairs; this is equivalent to the hash form.
|
13
|
+
#
|
14
|
+
# If you supply an array of column names that does not match the names in the
|
15
|
+
# underlying data, the names given here will overwrite them. The number
|
16
|
+
# of names given in the schema should match the underlying data dimensions.
|
17
|
+
# @param schema_overrides [Hash]
|
18
|
+
# Support type specification or override of one or more columns; note that
|
19
|
+
# any dtypes inferred from the schema param will be overridden.
|
7
20
|
#
|
8
21
|
# @return [DataFrame]
|
9
22
|
def read_ndjson(
|
data/lib/polars/io/parquet.rb
CHANGED
@@ -43,12 +43,18 @@ module Polars
|
|
43
43
|
# Extra options that make sense for a particular storage connection.
|
44
44
|
# @param credential_provider [Object]
|
45
45
|
# Provide a function that can be called to provide cloud storage
|
46
|
-
# credentials. The function is expected to return a
|
46
|
+
# credentials. The function is expected to return a hash of
|
47
47
|
# credential keys along with an optional credential expiry time.
|
48
48
|
# @param retries [Integer]
|
49
49
|
# Number of retries if accessing a cloud instance fails.
|
50
50
|
# @param include_file_paths [String]
|
51
51
|
# Include the path of the source file(s) as a column with this name.
|
52
|
+
# @param allow_missing_columns [Boolean]
|
53
|
+
# When reading a list of parquet files, if a column existing in the first
|
54
|
+
# file cannot be found in subsequent files, the default behavior is to
|
55
|
+
# raise an error. However, if `allow_missing_columns` is set to
|
56
|
+
# `true`, a full-NULL column is returned instead of erroring for the files
|
57
|
+
# that do not contain the column.
|
52
58
|
#
|
53
59
|
# @return [DataFrame]
|
54
60
|
def read_parquet(
|
@@ -117,7 +123,26 @@ module Polars
|
|
117
123
|
source = Utils.normalize_filepath(source)
|
118
124
|
end
|
119
125
|
|
120
|
-
|
126
|
+
# TODO return Schema
|
127
|
+
scan_parquet(source).collect_schema.to_h
|
128
|
+
end
|
129
|
+
|
130
|
+
# Get file-level custom metadata of a Parquet file without reading data.
|
131
|
+
#
|
132
|
+
# @note
|
133
|
+
# This functionality is considered **experimental**. It may be removed or
|
134
|
+
# changed at any point without it being considered a breaking change.
|
135
|
+
#
|
136
|
+
# @param source [Object]
|
137
|
+
# Path to a file or a file-like object.
|
138
|
+
#
|
139
|
+
# @return [Hash]
|
140
|
+
def read_parquet_metadata(source)
|
141
|
+
if Utils.pathlike?(source)
|
142
|
+
source = Utils.normalize_filepath(source, check_not_directory: false)
|
143
|
+
end
|
144
|
+
|
145
|
+
Plr.read_parquet_metadata(source)
|
121
146
|
end
|
122
147
|
|
123
148
|
# Lazily read from a parquet file or multiple files via glob patterns.
|
@@ -165,12 +190,23 @@ module Polars
|
|
165
190
|
# Extra options that make sense for a particular storage connection.
|
166
191
|
# @param credential_provider [Object]
|
167
192
|
# Provide a function that can be called to provide cloud storage
|
168
|
-
# credentials. The function is expected to return a
|
193
|
+
# credentials. The function is expected to return a hash of
|
169
194
|
# credential keys along with an optional credential expiry time.
|
170
195
|
# @param retries [Integer]
|
171
196
|
# Number of retries if accessing a cloud instance fails.
|
172
197
|
# @param include_file_paths [String]
|
173
198
|
# Include the path of the source file(s) as a column with this name.
|
199
|
+
# @param allow_missing_columns [Boolean]
|
200
|
+
# When reading a list of parquet files, if a column existing in the first
|
201
|
+
# file cannot be found in subsequent files, the default behavior is to
|
202
|
+
# raise an error. However, if `allow_missing_columns` is set to
|
203
|
+
# `true`, a full-NULL column is returned instead of erroring for the files
|
204
|
+
# that do not contain the column.
|
205
|
+
# @param extra_columns ['ignore', 'raise']
|
206
|
+
# Configuration for behavior when extra columns outside of the
|
207
|
+
# defined schema are encountered in the data:
|
208
|
+
# * `ignore`: Silently ignores.
|
209
|
+
# * `raise`: Raises an error.
|
174
210
|
#
|
175
211
|
# @return [LazyFrame]
|
176
212
|
def scan_parquet(
|
@@ -192,8 +228,13 @@ module Polars
|
|
192
228
|
credential_provider: nil,
|
193
229
|
retries: 2,
|
194
230
|
include_file_paths: nil,
|
195
|
-
allow_missing_columns: false
|
231
|
+
allow_missing_columns: false,
|
232
|
+
extra_columns: "raise",
|
233
|
+
_column_mapping: nil,
|
234
|
+
_deletion_files: nil
|
196
235
|
)
|
236
|
+
missing_columns = allow_missing_columns ? "insert" : "raise"
|
237
|
+
|
197
238
|
if Utils.pathlike?(source)
|
198
239
|
source = Utils.normalize_filepath(source, check_not_directory: false)
|
199
240
|
elsif Utils.is_path_or_str_sequence(source)
|
@@ -204,56 +245,11 @@ module Polars
|
|
204
245
|
raise Todo
|
205
246
|
end
|
206
247
|
|
207
|
-
_scan_parquet_impl(
|
208
|
-
source,
|
209
|
-
n_rows: n_rows,
|
210
|
-
cache: cache,
|
211
|
-
parallel: parallel,
|
212
|
-
rechunk: rechunk,
|
213
|
-
row_index_name: row_count_name,
|
214
|
-
row_index_offset: row_count_offset,
|
215
|
-
storage_options: storage_options,
|
216
|
-
credential_provider: credential_provider,
|
217
|
-
low_memory: low_memory,
|
218
|
-
use_statistics: use_statistics,
|
219
|
-
hive_partitioning: hive_partitioning,
|
220
|
-
schema: schema,
|
221
|
-
hive_schema: hive_schema,
|
222
|
-
try_parse_hive_dates: try_parse_hive_dates,
|
223
|
-
retries: retries,
|
224
|
-
glob: glob,
|
225
|
-
include_file_paths: include_file_paths,
|
226
|
-
allow_missing_columns: allow_missing_columns
|
227
|
-
)
|
228
|
-
end
|
229
|
-
|
230
|
-
# @private
|
231
|
-
def _scan_parquet_impl(
|
232
|
-
source,
|
233
|
-
n_rows: nil,
|
234
|
-
cache: true,
|
235
|
-
parallel: "auto",
|
236
|
-
rechunk: true,
|
237
|
-
row_index_name: nil,
|
238
|
-
row_index_offset: 0,
|
239
|
-
storage_options: nil,
|
240
|
-
credential_provider: nil,
|
241
|
-
low_memory: false,
|
242
|
-
use_statistics: true,
|
243
|
-
hive_partitioning: nil,
|
244
|
-
glob: true,
|
245
|
-
schema: nil,
|
246
|
-
hive_schema: nil,
|
247
|
-
try_parse_hive_dates: true,
|
248
|
-
retries: 2,
|
249
|
-
include_file_paths: nil,
|
250
|
-
allow_missing_columns: false
|
251
|
-
)
|
252
248
|
if source.is_a?(::Array)
|
253
249
|
sources = source
|
254
250
|
source = nil
|
255
251
|
else
|
256
|
-
sources = []
|
252
|
+
sources = [source]
|
257
253
|
end
|
258
254
|
|
259
255
|
if storage_options
|
@@ -262,27 +258,35 @@ module Polars
|
|
262
258
|
storage_options = nil
|
263
259
|
end
|
264
260
|
|
261
|
+
row_index_name = row_count_name
|
262
|
+
row_index_offset = row_count_offset
|
263
|
+
|
265
264
|
rblf =
|
266
265
|
RbLazyFrame.new_from_parquet(
|
267
|
-
source,
|
268
266
|
sources,
|
269
|
-
|
270
|
-
|
267
|
+
schema,
|
268
|
+
ScanOptions.new(
|
269
|
+
row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
|
270
|
+
pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
|
271
|
+
# cast_options: cast_options,
|
272
|
+
extra_columns: extra_columns,
|
273
|
+
missing_columns: missing_columns,
|
274
|
+
include_file_paths: include_file_paths,
|
275
|
+
glob: glob,
|
276
|
+
hive_partitioning: hive_partitioning,
|
277
|
+
hive_schema: hive_schema,
|
278
|
+
try_parse_hive_dates: try_parse_hive_dates,
|
279
|
+
rechunk: rechunk,
|
280
|
+
cache: cache,
|
281
|
+
storage_options: storage_options,
|
282
|
+
# credential_provider: credential_provider_builder,
|
283
|
+
retries: retries,
|
284
|
+
deletion_files: _deletion_files,
|
285
|
+
column_mapping: _column_mapping
|
286
|
+
),
|
271
287
|
parallel,
|
272
|
-
rechunk,
|
273
|
-
Utils.parse_row_index_args(row_index_name, row_index_offset),
|
274
288
|
low_memory,
|
275
|
-
|
276
|
-
credential_provider,
|
277
|
-
use_statistics,
|
278
|
-
hive_partitioning,
|
279
|
-
schema,
|
280
|
-
hive_schema,
|
281
|
-
try_parse_hive_dates,
|
282
|
-
retries,
|
283
|
-
glob,
|
284
|
-
include_file_paths,
|
285
|
-
allow_missing_columns
|
289
|
+
use_statistics
|
286
290
|
)
|
287
291
|
Utils.wrap_ldf(rblf)
|
288
292
|
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Polars
|
2
|
+
module IO
|
3
|
+
class ScanOptions
|
4
|
+
attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
|
5
|
+
:include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
|
6
|
+
:rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping, :deletion_files
|
7
|
+
|
8
|
+
def initialize(
|
9
|
+
row_index: nil,
|
10
|
+
pre_slice: nil,
|
11
|
+
cast_options: nil,
|
12
|
+
extra_columns: "raise",
|
13
|
+
missing_columns: "raise",
|
14
|
+
include_file_paths: nil,
|
15
|
+
glob: true,
|
16
|
+
hive_partitioning: nil,
|
17
|
+
hive_schema: nil,
|
18
|
+
try_parse_hive_dates: true,
|
19
|
+
rechunk: false,
|
20
|
+
cache: true,
|
21
|
+
storage_options: nil,
|
22
|
+
credential_provider: nil,
|
23
|
+
retries: 2,
|
24
|
+
column_mapping: nil,
|
25
|
+
deletion_files: nil
|
26
|
+
)
|
27
|
+
@row_index = row_index
|
28
|
+
@pre_slice = pre_slice
|
29
|
+
@cast_options = cast_options
|
30
|
+
@extra_columns = extra_columns
|
31
|
+
@missing_columns = missing_columns
|
32
|
+
@include_file_paths = include_file_paths
|
33
|
+
@glob = glob
|
34
|
+
@hive_partitioning = hive_partitioning
|
35
|
+
@hive_schema = hive_schema
|
36
|
+
@try_parse_hive_dates = try_parse_hive_dates
|
37
|
+
@rechunk = rechunk
|
38
|
+
@cache = cache
|
39
|
+
@storage_options = storage_options
|
40
|
+
@credential_provider = credential_provider
|
41
|
+
@retries = retries
|
42
|
+
@column_mapping = column_mapping
|
43
|
+
@deletion_files = deletion_files
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|