polars-df 0.14.0 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -0
- data/Cargo.lock +1523 -378
- data/LICENSE.txt +1 -0
- data/README.md +38 -4
- data/ext/polars/Cargo.toml +15 -5
- data/ext/polars/src/batched_csv.rs +7 -10
- data/ext/polars/src/conversion/any_value.rs +31 -21
- data/ext/polars/src/conversion/mod.rs +155 -48
- data/ext/polars/src/dataframe/construction.rs +0 -3
- data/ext/polars/src/dataframe/export.rs +9 -2
- data/ext/polars/src/dataframe/general.rs +15 -57
- data/ext/polars/src/dataframe/io.rs +77 -169
- data/ext/polars/src/dataframe/mod.rs +1 -0
- data/ext/polars/src/dataframe/serde.rs +15 -0
- data/ext/polars/src/error.rs +31 -48
- data/ext/polars/src/exceptions.rs +24 -0
- data/ext/polars/src/expr/binary.rs +4 -42
- data/ext/polars/src/expr/datetime.rs +5 -4
- data/ext/polars/src/expr/general.rs +16 -22
- data/ext/polars/src/expr/list.rs +18 -11
- data/ext/polars/src/expr/meta.rs +6 -2
- data/ext/polars/src/expr/rolling.rs +6 -7
- data/ext/polars/src/expr/string.rs +9 -36
- data/ext/polars/src/file.rs +78 -23
- data/ext/polars/src/functions/aggregation.rs +4 -4
- data/ext/polars/src/functions/business.rs +15 -0
- data/ext/polars/src/functions/io.rs +34 -13
- data/ext/polars/src/functions/lazy.rs +22 -12
- data/ext/polars/src/functions/meta.rs +1 -1
- data/ext/polars/src/functions/mod.rs +1 -0
- data/ext/polars/src/interop/arrow/mod.rs +1 -0
- data/ext/polars/src/interop/arrow/to_ruby.rs +83 -0
- data/ext/polars/src/interop/mod.rs +1 -0
- data/ext/polars/src/lazyframe/general.rs +920 -0
- data/ext/polars/src/lazyframe/mod.rs +3 -827
- data/ext/polars/src/lazyframe/serde.rs +31 -0
- data/ext/polars/src/lib.rs +54 -27
- data/ext/polars/src/map/dataframe.rs +10 -6
- data/ext/polars/src/map/lazy.rs +65 -4
- data/ext/polars/src/map/mod.rs +9 -8
- data/ext/polars/src/on_startup.rs +1 -1
- data/ext/polars/src/series/aggregation.rs +1 -5
- data/ext/polars/src/series/arithmetic.rs +10 -10
- data/ext/polars/src/series/construction.rs +2 -2
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +631 -0
- data/ext/polars/src/series/import.rs +55 -0
- data/ext/polars/src/series/mod.rs +11 -638
- data/ext/polars/src/series/scatter.rs +2 -2
- data/ext/polars/src/utils.rs +0 -20
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +452 -101
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +3 -1
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +103 -2
- data/lib/polars/functions/aggregation/horizontal.rb +10 -4
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +95 -13
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/delta.rb +126 -0
- data/lib/polars/io/ipc.rb +14 -4
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +684 -20
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1226 -0
- data/lib/polars/series.rb +465 -35
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +17 -1
- metadata +16 -9
- data/lib/polars/functions.rb +0 -57
@@ -86,8 +86,57 @@ module Polars
|
|
86
86
|
# Concat the arrays in a Series dtype List in linear time.
|
87
87
|
#
|
88
88
|
# @return [Expr]
|
89
|
-
|
90
|
-
|
89
|
+
#
|
90
|
+
# @example Concatenate two existing list columns. Null values are propagated.
|
91
|
+
# df = Polars::DataFrame.new({"a" => [[1, 2], [3], [4, 5]], "b" => [[4], [], nil]})
|
92
|
+
# df.with_columns(concat_list: Polars.concat_list("a", "b"))
|
93
|
+
# # =>
|
94
|
+
# # shape: (3, 3)
|
95
|
+
# # ┌───────────┬───────────┬─────────────┐
|
96
|
+
# # │ a ┆ b ┆ concat_list │
|
97
|
+
# # │ --- ┆ --- ┆ --- │
|
98
|
+
# # │ list[i64] ┆ list[i64] ┆ list[i64] │
|
99
|
+
# # ╞═══════════╪═══════════╪═════════════╡
|
100
|
+
# # │ [1, 2] ┆ [4] ┆ [1, 2, 4] │
|
101
|
+
# # │ [3] ┆ [] ┆ [3] │
|
102
|
+
# # │ [4, 5] ┆ null ┆ null │
|
103
|
+
# # └───────────┴───────────┴─────────────┘
|
104
|
+
#
|
105
|
+
# @example Non-list columns are cast to a list before concatenation. The output data type is the supertype of the concatenated columns.
|
106
|
+
# df.select("a", concat_list: Polars.concat_list("a", Polars.lit("x")))
|
107
|
+
# # =>
|
108
|
+
# # shape: (3, 2)
|
109
|
+
# # ┌───────────┬─────────────────┐
|
110
|
+
# # │ a ┆ concat_list │
|
111
|
+
# # │ --- ┆ --- │
|
112
|
+
# # │ list[i64] ┆ list[str] │
|
113
|
+
# # ╞═══════════╪═════════════════╡
|
114
|
+
# # │ [1, 2] ┆ ["1", "2", "x"] │
|
115
|
+
# # │ [3] ┆ ["3", "x"] │
|
116
|
+
# # │ [4, 5] ┆ ["4", "5", "x"] │
|
117
|
+
# # └───────────┴─────────────────┘
|
118
|
+
#
|
119
|
+
# @example Create lagged columns and collect them into a list. This mimics a rolling window.
|
120
|
+
# df = Polars::DataFrame.new({"A" => [1.0, 2.0, 9.0, 2.0, 13.0]})
|
121
|
+
# df = df.select(3.times.map { |i| Polars.col("A").shift(i).alias("A_lag_#{i}") })
|
122
|
+
# df.select(
|
123
|
+
# Polars.concat_list(3.times.map { |i| "A_lag_#{i}" }.reverse).alias("A_rolling")
|
124
|
+
# )
|
125
|
+
# # =>
|
126
|
+
# # shape: (5, 1)
|
127
|
+
# # ┌───────────────────┐
|
128
|
+
# # │ A_rolling │
|
129
|
+
# # │ --- │
|
130
|
+
# # │ list[f64] │
|
131
|
+
# # ╞═══════════════════╡
|
132
|
+
# # │ [null, null, 1.0] │
|
133
|
+
# # │ [null, 1.0, 2.0] │
|
134
|
+
# # │ [1.0, 2.0, 9.0] │
|
135
|
+
# # │ [2.0, 9.0, 2.0] │
|
136
|
+
# # │ [9.0, 2.0, 13.0] │
|
137
|
+
# # └───────────────────┘
|
138
|
+
def concat_list(exprs, *more_exprs)
|
139
|
+
exprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
|
91
140
|
Utils.wrap_expr(Plr.concat_list(exprs))
|
92
141
|
end
|
93
142
|
|
data/lib/polars/functions/col.rb
CHANGED
@@ -23,7 +23,7 @@ module Polars
|
|
23
23
|
Utils.wrap_expr(Plr.col(name.to_s))
|
24
24
|
elsif Utils.is_polars_dtype(name)
|
25
25
|
Utils.wrap_expr(Plr.dtype_cols([name]))
|
26
|
-
elsif name.is_a?(::Array)
|
26
|
+
elsif name.is_a?(::Array) || name.is_a?(::Set)
|
27
27
|
names = Array(name)
|
28
28
|
if names.empty?
|
29
29
|
return Utils.wrap_expr(Plr.cols(names))
|
@@ -127,7 +127,7 @@ module Polars
|
|
127
127
|
# af1, af2, af3 = Polars.align_frames(
|
128
128
|
# df1, df2, df3, on: "dt", select: ["x", "y"]
|
129
129
|
# )
|
130
|
-
# (af1 * af2 * af3).fill_null(0).select(Polars.
|
130
|
+
# (af1 * af2 * af3).fill_null(0).select(Polars.sum_horizontal("*").alias("dot"))
|
131
131
|
# # =>
|
132
132
|
# # shape: (3, 1)
|
133
133
|
# # ┌───────┐
|
@@ -136,9 +136,7 @@ module Polars
|
|
136
136
|
# # │ f64 │
|
137
137
|
# # ╞═══════╡
|
138
138
|
# # │ 0.0 │
|
139
|
-
# # ├╌╌╌╌╌╌╌┤
|
140
139
|
# # │ 167.5 │
|
141
|
-
# # ├╌╌╌╌╌╌╌┤
|
142
140
|
# # │ 47.0 │
|
143
141
|
# # └───────┘
|
144
142
|
def align_frames(
|
@@ -729,16 +729,20 @@ module Polars
|
|
729
729
|
a,
|
730
730
|
b,
|
731
731
|
method: "pearson",
|
732
|
-
ddof:
|
732
|
+
ddof: nil,
|
733
733
|
propagate_nans: false
|
734
734
|
)
|
735
|
+
if !ddof.nil?
|
736
|
+
warn "The `ddof` parameter has no effect. Do not use it."
|
737
|
+
end
|
738
|
+
|
735
739
|
a = Utils.parse_into_expression(a)
|
736
740
|
b = Utils.parse_into_expression(b)
|
737
741
|
|
738
742
|
if method == "pearson"
|
739
|
-
Utils.wrap_expr(Plr.pearson_corr(a, b
|
743
|
+
Utils.wrap_expr(Plr.pearson_corr(a, b))
|
740
744
|
elsif method == "spearman"
|
741
|
-
Utils.wrap_expr(Plr.spearman_rank_corr(a, b,
|
745
|
+
Utils.wrap_expr(Plr.spearman_rank_corr(a, b, propagate_nans))
|
742
746
|
else
|
743
747
|
msg = "method must be one of {{'pearson', 'spearman'}}, got #{method}"
|
744
748
|
raise ArgumentError, msg
|
@@ -824,6 +828,29 @@ module Polars
|
|
824
828
|
# @note
|
825
829
|
# If you simply want the first encountered expression as accumulator,
|
826
830
|
# consider using `cumreduce`.
|
831
|
+
#
|
832
|
+
# @example
|
833
|
+
# df = Polars::DataFrame.new(
|
834
|
+
# {
|
835
|
+
# "a" => [1, 2, 3],
|
836
|
+
# "b" => [3, 4, 5],
|
837
|
+
# "c" => [5, 6, 7]
|
838
|
+
# }
|
839
|
+
# )
|
840
|
+
# df.with_columns(
|
841
|
+
# Polars.cum_fold(Polars.lit(1), ->(acc, x) { acc + x }, Polars.all)
|
842
|
+
# )
|
843
|
+
# # =>
|
844
|
+
# # shape: (3, 4)
|
845
|
+
# # ┌─────┬─────┬─────┬───────────┐
|
846
|
+
# # │ a ┆ b ┆ c ┆ cum_fold │
|
847
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
848
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
|
849
|
+
# # ╞═════╪═════╪═════╪═══════════╡
|
850
|
+
# # │ 1 ┆ 3 ┆ 5 ┆ {2,5,10} │
|
851
|
+
# # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
|
852
|
+
# # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
|
853
|
+
# # └─────┴─────┴─────┴───────────┘
|
827
854
|
def cum_fold(acc, f, exprs, include_init: false)
|
828
855
|
acc = Utils.parse_into_expression(acc, str_as_lit: true)
|
829
856
|
if exprs.is_a?(Expr)
|
@@ -831,7 +858,7 @@ module Polars
|
|
831
858
|
end
|
832
859
|
|
833
860
|
exprs = Utils.parse_into_list_of_expressions(exprs)
|
834
|
-
Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init))
|
861
|
+
Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
|
835
862
|
end
|
836
863
|
alias_method :cumfold, :cum_fold
|
837
864
|
|
@@ -1024,15 +1051,70 @@ module Polars
|
|
1024
1051
|
# Default is ascending.
|
1025
1052
|
#
|
1026
1053
|
# @return [Expr]
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1054
|
+
#
|
1055
|
+
# @example Pass a single column name to compute the arg sort by that column.
|
1056
|
+
# df = Polars::DataFrame.new(
|
1057
|
+
# {
|
1058
|
+
# "a" => [0, 1, 1, 0],
|
1059
|
+
# "b" => [3, 2, 3, 2],
|
1060
|
+
# "c" => [1, 2, 3, 4]
|
1061
|
+
# }
|
1062
|
+
# )
|
1063
|
+
# df.select(Polars.arg_sort_by("a"))
|
1064
|
+
# # =>
|
1065
|
+
# # shape: (4, 1)
|
1066
|
+
# # ┌─────┐
|
1067
|
+
# # │ a │
|
1068
|
+
# # │ --- │
|
1069
|
+
# # │ u32 │
|
1070
|
+
# # ╞═════╡
|
1071
|
+
# # │ 0 │
|
1072
|
+
# # │ 3 │
|
1073
|
+
# # │ 1 │
|
1074
|
+
# # │ 2 │
|
1075
|
+
# # └─────┘
|
1076
|
+
#
|
1077
|
+
# @example Compute the arg sort by multiple columns by either passing a list of columns, or by specifying each column as a positional argument.
|
1078
|
+
# df.select(Polars.arg_sort_by(["a", "b"], reverse: true))
|
1079
|
+
# # =>
|
1080
|
+
# # shape: (4, 1)
|
1081
|
+
# # ┌─────┐
|
1082
|
+
# # │ a │
|
1083
|
+
# # │ --- │
|
1084
|
+
# # │ u32 │
|
1085
|
+
# # ╞═════╡
|
1086
|
+
# # │ 2 │
|
1087
|
+
# # │ 1 │
|
1088
|
+
# # │ 0 │
|
1089
|
+
# # │ 3 │
|
1090
|
+
# # └─────┘
|
1091
|
+
#
|
1092
|
+
# @example Use gather to apply the arg sort to other columns.
|
1093
|
+
# df.select(Polars.col("c").gather(Polars.arg_sort_by("a")))
|
1094
|
+
# # =>
|
1095
|
+
# # shape: (4, 1)
|
1096
|
+
# # ┌─────┐
|
1097
|
+
# # │ c │
|
1098
|
+
# # │ --- │
|
1099
|
+
# # │ i64 │
|
1100
|
+
# # ╞═════╡
|
1101
|
+
# # │ 1 │
|
1102
|
+
# # │ 4 │
|
1103
|
+
# # │ 2 │
|
1104
|
+
# # │ 3 │
|
1105
|
+
# # └─────┘
|
1106
|
+
def arg_sort_by(
|
1107
|
+
exprs,
|
1108
|
+
*more_exprs,
|
1109
|
+
reverse: false,
|
1110
|
+
nulls_last: false,
|
1111
|
+
multithreaded: true,
|
1112
|
+
maintain_order: false
|
1113
|
+
)
|
1114
|
+
exprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
|
1115
|
+
reverse = Utils.extend_bool(reverse, exprs.length, "reverse", "exprs")
|
1116
|
+
nulls_last = Utils.extend_bool(nulls_last, exprs.length, "nulls_last", "exprs")
|
1117
|
+
Utils.wrap_expr(Plr.arg_sort_by(exprs, reverse, nulls_last, multithreaded, maintain_order))
|
1036
1118
|
end
|
1037
1119
|
alias_method :argsort_by, :arg_sort_by
|
1038
1120
|
|
@@ -18,7 +18,7 @@ module Polars
|
|
18
18
|
#
|
19
19
|
# @example
|
20
20
|
# Polars.time_range(
|
21
|
-
#
|
21
|
+
# Time.utc(2000, 1, 1, 14, 0),
|
22
22
|
# nil,
|
23
23
|
# "3h15m",
|
24
24
|
# eager: true
|
@@ -48,12 +48,12 @@ module Polars
|
|
48
48
|
end
|
49
49
|
|
50
50
|
if start.nil?
|
51
|
-
#
|
52
|
-
|
51
|
+
# date part is ignored
|
52
|
+
start = ::Time.utc(2000, 1, 1, 0, 0, 0)
|
53
53
|
end
|
54
54
|
if stop.nil?
|
55
|
-
#
|
56
|
-
|
55
|
+
# date part is ignored
|
56
|
+
stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
|
57
57
|
end
|
58
58
|
|
59
59
|
start_rbexpr = Utils.parse_into_expression(start)
|
@@ -87,21 +87,21 @@ module Polars
|
|
87
87
|
# @example
|
88
88
|
# df = Polars::DataFrame.new(
|
89
89
|
# {
|
90
|
-
# "start" => [
|
91
|
-
# "end" =>
|
90
|
+
# "start" => [Time.utc(2000, 1, 1, 9, 0), Time.utc(2000, 1, 1, 10, 0)],
|
91
|
+
# "end" => Time.utc(2000, 1, 1, 11, 0)
|
92
92
|
# }
|
93
93
|
# )
|
94
|
-
# df.
|
94
|
+
# df.select(time_range: Polars.time_ranges("start", "end"))
|
95
95
|
# # =>
|
96
|
-
# # shape: (2,
|
97
|
-
# #
|
98
|
-
# # │
|
99
|
-
# # │ ---
|
100
|
-
# # │
|
101
|
-
# #
|
102
|
-
# # │
|
103
|
-
# # │
|
104
|
-
# #
|
96
|
+
# # shape: (2, 1)
|
97
|
+
# # ┌────────────────────────────────┐
|
98
|
+
# # │ time_range │
|
99
|
+
# # │ --- │
|
100
|
+
# # │ list[time] │
|
101
|
+
# # ╞════════════════════════════════╡
|
102
|
+
# # │ [09:00:00, 10:00:00, 11:00:00] │
|
103
|
+
# # │ [10:00:00, 11:00:00] │
|
104
|
+
# # └────────────────────────────────┘
|
105
105
|
def time_ranges(
|
106
106
|
start = nil,
|
107
107
|
stop = nil,
|
@@ -118,12 +118,12 @@ module Polars
|
|
118
118
|
end
|
119
119
|
|
120
120
|
if start.nil?
|
121
|
-
#
|
122
|
-
|
121
|
+
# date part is ignored
|
122
|
+
start = ::Time.utc(2000, 1, 1, 0, 0, 0)
|
123
123
|
end
|
124
124
|
if stop.nil?
|
125
|
-
#
|
126
|
-
|
125
|
+
# date part is ignored
|
126
|
+
stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
|
127
127
|
end
|
128
128
|
|
129
129
|
start_rbexpr = Utils.parse_into_expression(start)
|
data/lib/polars/io/csv.rb
CHANGED
@@ -75,9 +75,6 @@ module Polars
|
|
75
75
|
# the DataFrame.
|
76
76
|
# @param row_count_offset [Integer]
|
77
77
|
# Offset to start the row_count column (only used if the name is set).
|
78
|
-
# @param sample_size [Integer]
|
79
|
-
# Set the sample size. This is used to sample statistics to estimate the
|
80
|
-
# allocation needed.
|
81
78
|
# @param eol_char [String]
|
82
79
|
# Single byte end of line character.
|
83
80
|
# @param truncate_ragged_lines [Boolean]
|
@@ -114,7 +111,6 @@ module Polars
|
|
114
111
|
skip_rows_after_header: 0,
|
115
112
|
row_count_name: nil,
|
116
113
|
row_count_offset: 0,
|
117
|
-
sample_size: 1024,
|
118
114
|
eol_char: "\n",
|
119
115
|
truncate_ragged_lines: false
|
120
116
|
)
|
@@ -163,7 +159,6 @@ module Polars
|
|
163
159
|
skip_rows_after_header: skip_rows_after_header,
|
164
160
|
row_count_name: row_count_name,
|
165
161
|
row_count_offset: row_count_offset,
|
166
|
-
sample_size: sample_size,
|
167
162
|
eol_char: eol_char,
|
168
163
|
truncate_ragged_lines: truncate_ragged_lines
|
169
164
|
)
|
@@ -201,7 +196,6 @@ module Polars
|
|
201
196
|
skip_rows_after_header: 0,
|
202
197
|
row_count_name: nil,
|
203
198
|
row_count_offset: 0,
|
204
|
-
sample_size: 1024,
|
205
199
|
eol_char: "\n",
|
206
200
|
raise_if_empty: true,
|
207
201
|
truncate_ragged_lines: false,
|
@@ -305,7 +299,6 @@ module Polars
|
|
305
299
|
parse_dates,
|
306
300
|
skip_rows_after_header,
|
307
301
|
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
308
|
-
sample_size,
|
309
302
|
eol_char,
|
310
303
|
raise_if_empty,
|
311
304
|
truncate_ragged_lines,
|
@@ -392,9 +385,6 @@ module Polars
|
|
392
385
|
# the DataFrame.
|
393
386
|
# @param row_count_offset [Integer]
|
394
387
|
# Offset to start the row_count column (only used if the name is set).
|
395
|
-
# @param sample_size [Integer]
|
396
|
-
# Set the sample size. This is used to sample statistics to estimate the
|
397
|
-
# allocation needed.
|
398
388
|
# @param eol_char [String]
|
399
389
|
# Single byte end of line character.
|
400
390
|
# @param truncate_ragged_lines [Boolean]
|
@@ -431,7 +421,6 @@ module Polars
|
|
431
421
|
skip_rows_after_header: 0,
|
432
422
|
row_count_name: nil,
|
433
423
|
row_count_offset: 0,
|
434
|
-
sample_size: 1024,
|
435
424
|
eol_char: "\n",
|
436
425
|
raise_if_empty: true,
|
437
426
|
truncate_ragged_lines: false,
|
@@ -474,7 +463,6 @@ module Polars
|
|
474
463
|
skip_rows_after_header: skip_rows_after_header,
|
475
464
|
row_count_name: row_count_name,
|
476
465
|
row_count_offset: row_count_offset,
|
477
|
-
sample_size: sample_size,
|
478
466
|
eol_char: eol_char,
|
479
467
|
new_columns: new_columns,
|
480
468
|
raise_if_empty: raise_if_empty,
|
@@ -618,7 +606,7 @@ module Polars
|
|
618
606
|
|
619
607
|
# @private
|
620
608
|
def _scan_csv_impl(
|
621
|
-
|
609
|
+
source,
|
622
610
|
has_header: true,
|
623
611
|
sep: ",",
|
624
612
|
comment_char: nil,
|
@@ -650,9 +638,16 @@ module Polars
|
|
650
638
|
end
|
651
639
|
processed_null_values = Utils._process_null_values(null_values)
|
652
640
|
|
641
|
+
if source.is_a?(::Array)
|
642
|
+
sources = source
|
643
|
+
source = nil
|
644
|
+
else
|
645
|
+
sources = []
|
646
|
+
end
|
647
|
+
|
653
648
|
rblf =
|
654
649
|
RbLazyFrame.new_from_csv(
|
655
|
-
|
650
|
+
source,
|
656
651
|
sep,
|
657
652
|
has_header,
|
658
653
|
ignore_errors,
|
@@ -672,7 +667,8 @@ module Polars
|
|
672
667
|
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
673
668
|
parse_dates,
|
674
669
|
eol_char,
|
675
|
-
truncate_ragged_lines
|
670
|
+
truncate_ragged_lines,
|
671
|
+
sources
|
676
672
|
)
|
677
673
|
Utils.wrap_ldf(rblf)
|
678
674
|
end
|
@@ -681,7 +677,9 @@ module Polars
|
|
681
677
|
|
682
678
|
def _prepare_file_arg(file)
|
683
679
|
if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
|
684
|
-
|
680
|
+
require "uri"
|
681
|
+
|
682
|
+
file = URI(file)
|
685
683
|
end
|
686
684
|
|
687
685
|
if defined?(URI) && file.is_a?(URI)
|
data/lib/polars/io/database.rb
CHANGED
@@ -18,9 +18,9 @@ module Polars
|
|
18
18
|
if query.is_a?(ActiveRecord::Result)
|
19
19
|
query
|
20
20
|
elsif query.is_a?(ActiveRecord::Relation)
|
21
|
-
query.
|
21
|
+
query.connection_pool.with_connection { |c| c.select_all(query.to_sql) }
|
22
22
|
elsif query.is_a?(::String)
|
23
|
-
ActiveRecord::Base.
|
23
|
+
ActiveRecord::Base.connection_pool.with_connection { |c| c.select_all(query) }
|
24
24
|
else
|
25
25
|
raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
|
26
26
|
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
module Polars
|
2
|
+
module IO
|
3
|
+
# Reads into a DataFrame from a Delta lake table.
|
4
|
+
#
|
5
|
+
# @param source [Object]
|
6
|
+
# DeltaTable or a Path or URI to the root of the Delta lake table.
|
7
|
+
# @param version [Object]
|
8
|
+
# Numerical version or timestamp version of the Delta lake table.
|
9
|
+
# @param columns [Array]
|
10
|
+
# Columns to select. Accepts a list of column names.
|
11
|
+
# @param rechunk [Boolean]
|
12
|
+
# Make sure that all columns are contiguous in memory by
|
13
|
+
# aggregating the chunks into a single array.
|
14
|
+
# @param storage_options [Hash]
|
15
|
+
# Extra options for the storage backends supported by `deltalake-rb`.
|
16
|
+
# @param delta_table_options [Hash]
|
17
|
+
# Additional keyword arguments while reading a Delta lake Table.
|
18
|
+
#
|
19
|
+
# @return [DataFrame]
|
20
|
+
def read_delta(
|
21
|
+
source,
|
22
|
+
version: nil,
|
23
|
+
columns: nil,
|
24
|
+
rechunk: false,
|
25
|
+
storage_options: nil,
|
26
|
+
delta_table_options: nil
|
27
|
+
)
|
28
|
+
dl_tbl =
|
29
|
+
_get_delta_lake_table(
|
30
|
+
source,
|
31
|
+
version: version,
|
32
|
+
storage_options: storage_options,
|
33
|
+
delta_table_options: delta_table_options
|
34
|
+
)
|
35
|
+
|
36
|
+
dl_tbl.to_polars(columns: columns, rechunk: rechunk)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Lazily read from a Delta lake table.
|
40
|
+
#
|
41
|
+
# @param source [Object]
|
42
|
+
# DeltaTable or a Path or URI to the root of the Delta lake table.
|
43
|
+
# @param version [Object]
|
44
|
+
# Numerical version or timestamp version of the Delta lake table.
|
45
|
+
# @param storage_options [Hash]
|
46
|
+
# Extra options for the storage backends supported by `deltalake-rb`.
|
47
|
+
# @param delta_table_options [Hash]
|
48
|
+
# Additional keyword arguments while reading a Delta lake Table.
|
49
|
+
#
|
50
|
+
# @return [LazyFrame]
|
51
|
+
def scan_delta(
|
52
|
+
source,
|
53
|
+
version: nil,
|
54
|
+
storage_options: nil,
|
55
|
+
delta_table_options: nil
|
56
|
+
)
|
57
|
+
dl_tbl =
|
58
|
+
_get_delta_lake_table(
|
59
|
+
source,
|
60
|
+
version: version,
|
61
|
+
storage_options: storage_options,
|
62
|
+
delta_table_options: delta_table_options
|
63
|
+
)
|
64
|
+
|
65
|
+
dl_tbl.to_polars(eager: false)
|
66
|
+
end
|
67
|
+
|
68
|
+
private
|
69
|
+
|
70
|
+
def _resolve_delta_lake_uri(table_uri, strict: true)
|
71
|
+
require "uri"
|
72
|
+
|
73
|
+
parsed_result = URI(table_uri)
|
74
|
+
|
75
|
+
resolved_uri =
|
76
|
+
if parsed_result.scheme == ""
|
77
|
+
Utils.normalize_filepath(table_uri)
|
78
|
+
else
|
79
|
+
table_uri
|
80
|
+
end
|
81
|
+
|
82
|
+
resolved_uri
|
83
|
+
end
|
84
|
+
|
85
|
+
def _get_delta_lake_table(
|
86
|
+
table_path,
|
87
|
+
version: nil,
|
88
|
+
storage_options: nil,
|
89
|
+
delta_table_options: nil
|
90
|
+
)
|
91
|
+
_check_if_delta_available
|
92
|
+
|
93
|
+
if table_path.is_a?(DeltaLake::Table)
|
94
|
+
return table_path
|
95
|
+
end
|
96
|
+
delta_table_options ||= {}
|
97
|
+
resolved_uri = _resolve_delta_lake_uri(table_path)
|
98
|
+
if !version.is_a?(::String) && !version.is_a?(::Time)
|
99
|
+
dl_tbl =
|
100
|
+
DeltaLake::Table.new(
|
101
|
+
resolved_uri,
|
102
|
+
version: version,
|
103
|
+
storage_options: storage_options,
|
104
|
+
**delta_table_options
|
105
|
+
)
|
106
|
+
else
|
107
|
+
dl_tbl =
|
108
|
+
DeltaLake::Table.new(
|
109
|
+
resolved_uri,
|
110
|
+
storage_options: storage_options,
|
111
|
+
**delta_table_options
|
112
|
+
)
|
113
|
+
dl_tbl.load_as_version(version)
|
114
|
+
end
|
115
|
+
|
116
|
+
dl_tbl = DeltaLake::Table.new(table_path)
|
117
|
+
dl_tbl
|
118
|
+
end
|
119
|
+
|
120
|
+
def _check_if_delta_available
|
121
|
+
if !defined?(DeltaLake)
|
122
|
+
raise Error, "Delta Lake not available"
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
data/lib/polars/io/ipc.rb
CHANGED
@@ -233,7 +233,7 @@ module Polars
|
|
233
233
|
|
234
234
|
# @private
|
235
235
|
def _scan_ipc_impl(
|
236
|
-
|
236
|
+
source,
|
237
237
|
n_rows: nil,
|
238
238
|
cache: true,
|
239
239
|
rechunk: true,
|
@@ -245,13 +245,23 @@ module Polars
|
|
245
245
|
try_parse_hive_dates: true,
|
246
246
|
include_file_paths: nil
|
247
247
|
)
|
248
|
-
|
249
|
-
|
248
|
+
sources = []
|
249
|
+
if Utils.pathlike?(source)
|
250
|
+
source = Utils.normalize_filepath(source)
|
251
|
+
elsif source.is_a?(::Array)
|
252
|
+
if Utils.is_path_or_str_sequence(source)
|
253
|
+
sources = source.map { |s| Utils.normalize_filepath(s) }
|
254
|
+
else
|
255
|
+
sources = source
|
256
|
+
end
|
257
|
+
|
258
|
+
source = nil
|
250
259
|
end
|
251
260
|
|
252
261
|
rblf =
|
253
262
|
RbLazyFrame.new_from_ipc(
|
254
|
-
|
263
|
+
source,
|
264
|
+
sources,
|
255
265
|
n_rows,
|
256
266
|
cache,
|
257
267
|
rechunk,
|
data/lib/polars/io/ndjson.rb
CHANGED
@@ -60,13 +60,23 @@ module Polars
|
|
60
60
|
row_count_name: nil,
|
61
61
|
row_count_offset: 0
|
62
62
|
)
|
63
|
+
sources = []
|
63
64
|
if Utils.pathlike?(source)
|
64
65
|
source = Utils.normalize_filepath(source)
|
66
|
+
elsif source.is_a?(::Array)
|
67
|
+
if Utils.is_path_or_str_sequence(source)
|
68
|
+
sources = source.map { |s| Utils.normalize_filepath(s) }
|
69
|
+
else
|
70
|
+
sources = source
|
71
|
+
end
|
72
|
+
|
73
|
+
source = nil
|
65
74
|
end
|
66
75
|
|
67
76
|
rblf =
|
68
77
|
RbLazyFrame.new_from_ndjson(
|
69
78
|
source,
|
79
|
+
sources,
|
70
80
|
infer_schema_length,
|
71
81
|
batch_size,
|
72
82
|
n_rows,
|