polars-df 0.14.0-aarch64-linux-musl → 0.16.0-aarch64-linux-musl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -0
- data/Cargo.lock +1523 -378
- data/LICENSE-THIRD-PARTY.txt +23495 -12923
- data/LICENSE.txt +1 -0
- data/README.md +38 -4
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/{3.1 → 3.4}/polars.so +0 -0
- data/lib/polars/batched_csv_reader.rb +0 -2
- data/lib/polars/binary_expr.rb +133 -9
- data/lib/polars/binary_name_space.rb +101 -6
- data/lib/polars/config.rb +4 -0
- data/lib/polars/data_frame.rb +452 -101
- data/lib/polars/data_type_group.rb +28 -0
- data/lib/polars/data_types.rb +3 -1
- data/lib/polars/date_time_expr.rb +244 -0
- data/lib/polars/date_time_name_space.rb +87 -0
- data/lib/polars/expr.rb +103 -2
- data/lib/polars/functions/aggregation/horizontal.rb +10 -4
- data/lib/polars/functions/as_datatype.rb +51 -2
- data/lib/polars/functions/col.rb +1 -1
- data/lib/polars/functions/eager.rb +1 -3
- data/lib/polars/functions/lazy.rb +95 -13
- data/lib/polars/functions/range/time_range.rb +21 -21
- data/lib/polars/io/csv.rb +14 -16
- data/lib/polars/io/database.rb +2 -2
- data/lib/polars/io/delta.rb +126 -0
- data/lib/polars/io/ipc.rb +14 -4
- data/lib/polars/io/ndjson.rb +10 -0
- data/lib/polars/io/parquet.rb +168 -111
- data/lib/polars/lazy_frame.rb +684 -20
- data/lib/polars/list_name_space.rb +169 -0
- data/lib/polars/selectors.rb +1226 -0
- data/lib/polars/series.rb +465 -35
- data/lib/polars/string_cache.rb +27 -1
- data/lib/polars/string_expr.rb +0 -1
- data/lib/polars/string_name_space.rb +73 -3
- data/lib/polars/struct_name_space.rb +31 -7
- data/lib/polars/utils/various.rb +5 -1
- data/lib/polars/utils.rb +45 -10
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +17 -1
- metadata +10 -9
- data/lib/polars/functions.rb +0 -57
@@ -86,8 +86,57 @@ module Polars
|
|
86
86
|
# Concat the arrays in a Series dtype List in linear time.
|
87
87
|
#
|
88
88
|
# @return [Expr]
|
89
|
-
|
90
|
-
|
89
|
+
#
|
90
|
+
# @example Concatenate two existing list columns. Null values are propagated.
|
91
|
+
# df = Polars::DataFrame.new({"a" => [[1, 2], [3], [4, 5]], "b" => [[4], [], nil]})
|
92
|
+
# df.with_columns(concat_list: Polars.concat_list("a", "b"))
|
93
|
+
# # =>
|
94
|
+
# # shape: (3, 3)
|
95
|
+
# # ┌───────────┬───────────┬─────────────┐
|
96
|
+
# # │ a ┆ b ┆ concat_list │
|
97
|
+
# # │ --- ┆ --- ┆ --- │
|
98
|
+
# # │ list[i64] ┆ list[i64] ┆ list[i64] │
|
99
|
+
# # ╞═══════════╪═══════════╪═════════════╡
|
100
|
+
# # │ [1, 2] ┆ [4] ┆ [1, 2, 4] │
|
101
|
+
# # │ [3] ┆ [] ┆ [3] │
|
102
|
+
# # │ [4, 5] ┆ null ┆ null │
|
103
|
+
# # └───────────┴───────────┴─────────────┘
|
104
|
+
#
|
105
|
+
# @example Non-list columns are cast to a list before concatenation. The output data type is the supertype of the concatenated columns.
|
106
|
+
# df.select("a", concat_list: Polars.concat_list("a", Polars.lit("x")))
|
107
|
+
# # =>
|
108
|
+
# # shape: (3, 2)
|
109
|
+
# # ┌───────────┬─────────────────┐
|
110
|
+
# # │ a ┆ concat_list │
|
111
|
+
# # │ --- ┆ --- │
|
112
|
+
# # │ list[i64] ┆ list[str] │
|
113
|
+
# # ╞═══════════╪═════════════════╡
|
114
|
+
# # │ [1, 2] ┆ ["1", "2", "x"] │
|
115
|
+
# # │ [3] ┆ ["3", "x"] │
|
116
|
+
# # │ [4, 5] ┆ ["4", "5", "x"] │
|
117
|
+
# # └───────────┴─────────────────┘
|
118
|
+
#
|
119
|
+
# @example Create lagged columns and collect them into a list. This mimics a rolling window.
|
120
|
+
# df = Polars::DataFrame.new({"A" => [1.0, 2.0, 9.0, 2.0, 13.0]})
|
121
|
+
# df = df.select(3.times.map { |i| Polars.col("A").shift(i).alias("A_lag_#{i}") })
|
122
|
+
# df.select(
|
123
|
+
# Polars.concat_list(3.times.map { |i| "A_lag_#{i}" }.reverse).alias("A_rolling")
|
124
|
+
# )
|
125
|
+
# # =>
|
126
|
+
# # shape: (5, 1)
|
127
|
+
# # ┌───────────────────┐
|
128
|
+
# # │ A_rolling │
|
129
|
+
# # │ --- │
|
130
|
+
# # │ list[f64] │
|
131
|
+
# # ╞═══════════════════╡
|
132
|
+
# # │ [null, null, 1.0] │
|
133
|
+
# # │ [null, 1.0, 2.0] │
|
134
|
+
# # │ [1.0, 2.0, 9.0] │
|
135
|
+
# # │ [2.0, 9.0, 2.0] │
|
136
|
+
# # │ [9.0, 2.0, 13.0] │
|
137
|
+
# # └───────────────────┘
|
138
|
+
def concat_list(exprs, *more_exprs)
  # Normalize the first argument plus any extra positional arguments into one
  # list of native expressions before handing them to the Plr binding.
  rbexprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
  concatenated = Plr.concat_list(rbexprs)
  Utils.wrap_expr(concatenated)
end
|
93
142
|
|
data/lib/polars/functions/col.rb
CHANGED
@@ -23,7 +23,7 @@ module Polars
|
|
23
23
|
Utils.wrap_expr(Plr.col(name.to_s))
|
24
24
|
elsif Utils.is_polars_dtype(name)
|
25
25
|
Utils.wrap_expr(Plr.dtype_cols([name]))
|
26
|
-
elsif name.is_a?(::Array)
|
26
|
+
elsif name.is_a?(::Array) || name.is_a?(::Set)
|
27
27
|
names = Array(name)
|
28
28
|
if names.empty?
|
29
29
|
return Utils.wrap_expr(Plr.cols(names))
|
@@ -127,7 +127,7 @@ module Polars
|
|
127
127
|
# af1, af2, af3 = Polars.align_frames(
|
128
128
|
# df1, df2, df3, on: "dt", select: ["x", "y"]
|
129
129
|
# )
|
130
|
-
# (af1 * af2 * af3).fill_null(0).select(Polars.
|
130
|
+
# (af1 * af2 * af3).fill_null(0).select(Polars.sum_horizontal("*").alias("dot"))
|
131
131
|
# # =>
|
132
132
|
# # shape: (3, 1)
|
133
133
|
# # ┌───────┐
|
@@ -136,9 +136,7 @@ module Polars
|
|
136
136
|
# # │ f64 │
|
137
137
|
# # ╞═══════╡
|
138
138
|
# # │ 0.0 │
|
139
|
-
# # ├╌╌╌╌╌╌╌┤
|
140
139
|
# # │ 167.5 │
|
141
|
-
# # ├╌╌╌╌╌╌╌┤
|
142
140
|
# # │ 47.0 │
|
143
141
|
# # └───────┘
|
144
142
|
def align_frames(
|
@@ -729,16 +729,20 @@ module Polars
|
|
729
729
|
a,
|
730
730
|
b,
|
731
731
|
method: "pearson",
|
732
|
-
ddof:
|
732
|
+
ddof: nil,
|
733
733
|
propagate_nans: false
|
734
734
|
)
|
735
|
+
if !ddof.nil?
|
736
|
+
warn "The `ddof` parameter has no effect. Do not use it."
|
737
|
+
end
|
738
|
+
|
735
739
|
a = Utils.parse_into_expression(a)
|
736
740
|
b = Utils.parse_into_expression(b)
|
737
741
|
|
738
742
|
if method == "pearson"
|
739
|
-
Utils.wrap_expr(Plr.pearson_corr(a, b
|
743
|
+
Utils.wrap_expr(Plr.pearson_corr(a, b))
|
740
744
|
elsif method == "spearman"
|
741
|
-
Utils.wrap_expr(Plr.spearman_rank_corr(a, b,
|
745
|
+
Utils.wrap_expr(Plr.spearman_rank_corr(a, b, propagate_nans))
|
742
746
|
else
|
743
747
|
msg = "method must be one of {{'pearson', 'spearman'}}, got #{method}"
|
744
748
|
raise ArgumentError, msg
|
@@ -824,6 +828,29 @@ module Polars
|
|
824
828
|
# @note
|
825
829
|
# If you simply want the first encountered expression as accumulator,
|
826
830
|
# consider using `cumreduce`.
|
831
|
+
#
|
832
|
+
# @example
|
833
|
+
# df = Polars::DataFrame.new(
|
834
|
+
# {
|
835
|
+
# "a" => [1, 2, 3],
|
836
|
+
# "b" => [3, 4, 5],
|
837
|
+
# "c" => [5, 6, 7]
|
838
|
+
# }
|
839
|
+
# )
|
840
|
+
# df.with_columns(
|
841
|
+
# Polars.cum_fold(Polars.lit(1), ->(acc, x) { acc + x }, Polars.all)
|
842
|
+
# )
|
843
|
+
# # =>
|
844
|
+
# # shape: (3, 4)
|
845
|
+
# # ┌─────┬─────┬─────┬───────────┐
|
846
|
+
# # │ a ┆ b ┆ c ┆ cum_fold │
|
847
|
+
# # │ --- ┆ --- ┆ --- ┆ --- │
|
848
|
+
# # │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
|
849
|
+
# # ╞═════╪═════╪═════╪═══════════╡
|
850
|
+
# # │ 1 ┆ 3 ┆ 5 ┆ {2,5,10} │
|
851
|
+
# # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
|
852
|
+
# # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
|
853
|
+
# # └─────┴─────┴─────┴───────────┘
|
827
854
|
def cum_fold(acc, f, exprs, include_init: false)
|
828
855
|
acc = Utils.parse_into_expression(acc, str_as_lit: true)
|
829
856
|
if exprs.is_a?(Expr)
|
@@ -831,7 +858,7 @@ module Polars
|
|
831
858
|
end
|
832
859
|
|
833
860
|
exprs = Utils.parse_into_list_of_expressions(exprs)
|
834
|
-
Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init))
|
861
|
+
Utils.wrap_expr(Plr.cum_fold(acc, f, exprs, include_init)._alias("cum_fold"))
|
835
862
|
end
|
836
863
|
alias_method :cumfold, :cum_fold
|
837
864
|
|
@@ -1024,15 +1051,70 @@ module Polars
|
|
1024
1051
|
# Default is ascending.
|
1025
1052
|
#
|
1026
1053
|
# @return [Expr]
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1054
|
+
#
|
1055
|
+
# @example Pass a single column name to compute the arg sort by that column.
|
1056
|
+
# df = Polars::DataFrame.new(
|
1057
|
+
# {
|
1058
|
+
# "a" => [0, 1, 1, 0],
|
1059
|
+
# "b" => [3, 2, 3, 2],
|
1060
|
+
# "c" => [1, 2, 3, 4]
|
1061
|
+
# }
|
1062
|
+
# )
|
1063
|
+
# df.select(Polars.arg_sort_by("a"))
|
1064
|
+
# # =>
|
1065
|
+
# # shape: (4, 1)
|
1066
|
+
# # ┌─────┐
|
1067
|
+
# # │ a │
|
1068
|
+
# # │ --- │
|
1069
|
+
# # │ u32 │
|
1070
|
+
# # ╞═════╡
|
1071
|
+
# # │ 0 │
|
1072
|
+
# # │ 3 │
|
1073
|
+
# # │ 1 │
|
1074
|
+
# # │ 2 │
|
1075
|
+
# # └─────┘
|
1076
|
+
#
|
1077
|
+
# @example Compute the arg sort by multiple columns by either passing a list of columns, or by specifying each column as a positional argument.
|
1078
|
+
# df.select(Polars.arg_sort_by(["a", "b"], reverse: true))
|
1079
|
+
# # =>
|
1080
|
+
# # shape: (4, 1)
|
1081
|
+
# # ┌─────┐
|
1082
|
+
# # │ a │
|
1083
|
+
# # │ --- │
|
1084
|
+
# # │ u32 │
|
1085
|
+
# # ╞═════╡
|
1086
|
+
# # │ 2 │
|
1087
|
+
# # │ 1 │
|
1088
|
+
# # │ 0 │
|
1089
|
+
# # │ 3 │
|
1090
|
+
# # └─────┘
|
1091
|
+
#
|
1092
|
+
# @example Use gather to apply the arg sort to other columns.
|
1093
|
+
# df.select(Polars.col("c").gather(Polars.arg_sort_by("a")))
|
1094
|
+
# # =>
|
1095
|
+
# # shape: (4, 1)
|
1096
|
+
# # ┌─────┐
|
1097
|
+
# # │ c │
|
1098
|
+
# # │ --- │
|
1099
|
+
# # │ i64 │
|
1100
|
+
# # ╞═════╡
|
1101
|
+
# # │ 1 │
|
1102
|
+
# # │ 4 │
|
1103
|
+
# # │ 2 │
|
1104
|
+
# # │ 3 │
|
1105
|
+
# # └─────┘
|
1106
|
+
def arg_sort_by(
  exprs,
  *more_exprs,
  reverse: false,
  nulls_last: false,
  multithreaded: true,
  maintain_order: false
)
  # Collect every sort key (first argument plus extra positionals) into one list.
  sort_keys = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
  key_count = sort_keys.length

  # NOTE(review): Utils.extend_bool presumably expands a single flag to one
  # value per sort key and validates array lengths - confirm against Utils.
  reverse_flags = Utils.extend_bool(reverse, key_count, "reverse", "exprs")
  nulls_last_flags = Utils.extend_bool(nulls_last, key_count, "nulls_last", "exprs")

  rbexpr = Plr.arg_sort_by(sort_keys, reverse_flags, nulls_last_flags, multithreaded, maintain_order)
  Utils.wrap_expr(rbexpr)
end
|
1037
1119
|
alias_method :argsort_by, :arg_sort_by
|
1038
1120
|
|
@@ -18,7 +18,7 @@ module Polars
|
|
18
18
|
#
|
19
19
|
# @example
|
20
20
|
# Polars.time_range(
|
21
|
-
#
|
21
|
+
# Time.utc(2000, 1, 1, 14, 0),
|
22
22
|
# nil,
|
23
23
|
# "3h15m",
|
24
24
|
# eager: true
|
@@ -48,12 +48,12 @@ module Polars
|
|
48
48
|
end
|
49
49
|
|
50
50
|
if start.nil?
|
51
|
-
#
|
52
|
-
|
51
|
+
# date part is ignored
|
52
|
+
start = ::Time.utc(2000, 1, 1, 0, 0, 0)
|
53
53
|
end
|
54
54
|
if stop.nil?
|
55
|
-
#
|
56
|
-
|
55
|
+
# date part is ignored
|
56
|
+
stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
|
57
57
|
end
|
58
58
|
|
59
59
|
start_rbexpr = Utils.parse_into_expression(start)
|
@@ -87,21 +87,21 @@ module Polars
|
|
87
87
|
# @example
|
88
88
|
# df = Polars::DataFrame.new(
|
89
89
|
# {
|
90
|
-
# "start" => [
|
91
|
-
# "end" =>
|
90
|
+
# "start" => [Time.utc(2000, 1, 1, 9, 0), Time.utc(2000, 1, 1, 10, 0)],
|
91
|
+
# "end" => Time.utc(2000, 1, 1, 11, 0)
|
92
92
|
# }
|
93
93
|
# )
|
94
|
-
# df.
|
94
|
+
# df.select(time_range: Polars.time_ranges("start", "end"))
|
95
95
|
# # =>
|
96
|
-
# # shape: (2,
|
97
|
-
# #
|
98
|
-
# # │
|
99
|
-
# # │ ---
|
100
|
-
# # │
|
101
|
-
# #
|
102
|
-
# # │
|
103
|
-
# # │
|
104
|
-
# #
|
96
|
+
# # shape: (2, 1)
|
97
|
+
# # ┌────────────────────────────────┐
|
98
|
+
# # │ time_range │
|
99
|
+
# # │ --- │
|
100
|
+
# # │ list[time] │
|
101
|
+
# # ╞════════════════════════════════╡
|
102
|
+
# # │ [09:00:00, 10:00:00, 11:00:00] │
|
103
|
+
# # │ [10:00:00, 11:00:00] │
|
104
|
+
# # └────────────────────────────────┘
|
105
105
|
def time_ranges(
|
106
106
|
start = nil,
|
107
107
|
stop = nil,
|
@@ -118,12 +118,12 @@ module Polars
|
|
118
118
|
end
|
119
119
|
|
120
120
|
if start.nil?
|
121
|
-
#
|
122
|
-
|
121
|
+
# date part is ignored
|
122
|
+
start = ::Time.utc(2000, 1, 1, 0, 0, 0)
|
123
123
|
end
|
124
124
|
if stop.nil?
|
125
|
-
#
|
126
|
-
|
125
|
+
# date part is ignored
|
126
|
+
stop = ::Time.utc(2000, 1, 1, 23, 59, 59, 999999)
|
127
127
|
end
|
128
128
|
|
129
129
|
start_rbexpr = Utils.parse_into_expression(start)
|
data/lib/polars/io/csv.rb
CHANGED
@@ -75,9 +75,6 @@ module Polars
|
|
75
75
|
# the DataFrame.
|
76
76
|
# @param row_count_offset [Integer]
|
77
77
|
# Offset to start the row_count column (only used if the name is set).
|
78
|
-
# @param sample_size [Integer]
|
79
|
-
# Set the sample size. This is used to sample statistics to estimate the
|
80
|
-
# allocation needed.
|
81
78
|
# @param eol_char [String]
|
82
79
|
# Single byte end of line character.
|
83
80
|
# @param truncate_ragged_lines [Boolean]
|
@@ -114,7 +111,6 @@ module Polars
|
|
114
111
|
skip_rows_after_header: 0,
|
115
112
|
row_count_name: nil,
|
116
113
|
row_count_offset: 0,
|
117
|
-
sample_size: 1024,
|
118
114
|
eol_char: "\n",
|
119
115
|
truncate_ragged_lines: false
|
120
116
|
)
|
@@ -163,7 +159,6 @@ module Polars
|
|
163
159
|
skip_rows_after_header: skip_rows_after_header,
|
164
160
|
row_count_name: row_count_name,
|
165
161
|
row_count_offset: row_count_offset,
|
166
|
-
sample_size: sample_size,
|
167
162
|
eol_char: eol_char,
|
168
163
|
truncate_ragged_lines: truncate_ragged_lines
|
169
164
|
)
|
@@ -201,7 +196,6 @@ module Polars
|
|
201
196
|
skip_rows_after_header: 0,
|
202
197
|
row_count_name: nil,
|
203
198
|
row_count_offset: 0,
|
204
|
-
sample_size: 1024,
|
205
199
|
eol_char: "\n",
|
206
200
|
raise_if_empty: true,
|
207
201
|
truncate_ragged_lines: false,
|
@@ -305,7 +299,6 @@ module Polars
|
|
305
299
|
parse_dates,
|
306
300
|
skip_rows_after_header,
|
307
301
|
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
308
|
-
sample_size,
|
309
302
|
eol_char,
|
310
303
|
raise_if_empty,
|
311
304
|
truncate_ragged_lines,
|
@@ -392,9 +385,6 @@ module Polars
|
|
392
385
|
# the DataFrame.
|
393
386
|
# @param row_count_offset [Integer]
|
394
387
|
# Offset to start the row_count column (only used if the name is set).
|
395
|
-
# @param sample_size [Integer]
|
396
|
-
# Set the sample size. This is used to sample statistics to estimate the
|
397
|
-
# allocation needed.
|
398
388
|
# @param eol_char [String]
|
399
389
|
# Single byte end of line character.
|
400
390
|
# @param truncate_ragged_lines [Boolean]
|
@@ -431,7 +421,6 @@ module Polars
|
|
431
421
|
skip_rows_after_header: 0,
|
432
422
|
row_count_name: nil,
|
433
423
|
row_count_offset: 0,
|
434
|
-
sample_size: 1024,
|
435
424
|
eol_char: "\n",
|
436
425
|
raise_if_empty: true,
|
437
426
|
truncate_ragged_lines: false,
|
@@ -474,7 +463,6 @@ module Polars
|
|
474
463
|
skip_rows_after_header: skip_rows_after_header,
|
475
464
|
row_count_name: row_count_name,
|
476
465
|
row_count_offset: row_count_offset,
|
477
|
-
sample_size: sample_size,
|
478
466
|
eol_char: eol_char,
|
479
467
|
new_columns: new_columns,
|
480
468
|
raise_if_empty: raise_if_empty,
|
@@ -618,7 +606,7 @@ module Polars
|
|
618
606
|
|
619
607
|
# @private
|
620
608
|
def _scan_csv_impl(
|
621
|
-
|
609
|
+
source,
|
622
610
|
has_header: true,
|
623
611
|
sep: ",",
|
624
612
|
comment_char: nil,
|
@@ -650,9 +638,16 @@ module Polars
|
|
650
638
|
end
|
651
639
|
processed_null_values = Utils._process_null_values(null_values)
|
652
640
|
|
641
|
+
if source.is_a?(::Array)
|
642
|
+
sources = source
|
643
|
+
source = nil
|
644
|
+
else
|
645
|
+
sources = []
|
646
|
+
end
|
647
|
+
|
653
648
|
rblf =
|
654
649
|
RbLazyFrame.new_from_csv(
|
655
|
-
|
650
|
+
source,
|
656
651
|
sep,
|
657
652
|
has_header,
|
658
653
|
ignore_errors,
|
@@ -672,7 +667,8 @@ module Polars
|
|
672
667
|
Utils.parse_row_index_args(row_count_name, row_count_offset),
|
673
668
|
parse_dates,
|
674
669
|
eol_char,
|
675
|
-
truncate_ragged_lines
|
670
|
+
truncate_ragged_lines,
|
671
|
+
sources
|
676
672
|
)
|
677
673
|
Utils.wrap_ldf(rblf)
|
678
674
|
end
|
@@ -681,7 +677,9 @@ module Polars
|
|
681
677
|
|
682
678
|
def _prepare_file_arg(file)
|
683
679
|
if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
|
684
|
-
|
680
|
+
require "uri"
|
681
|
+
|
682
|
+
file = URI(file)
|
685
683
|
end
|
686
684
|
|
687
685
|
if defined?(URI) && file.is_a?(URI)
|
data/lib/polars/io/database.rb
CHANGED
@@ -18,9 +18,9 @@ module Polars
|
|
18
18
|
if query.is_a?(ActiveRecord::Result)
|
19
19
|
query
|
20
20
|
elsif query.is_a?(ActiveRecord::Relation)
|
21
|
-
query.
|
21
|
+
query.connection_pool.with_connection { |c| c.select_all(query.to_sql) }
|
22
22
|
elsif query.is_a?(::String)
|
23
|
-
ActiveRecord::Base.
|
23
|
+
ActiveRecord::Base.connection_pool.with_connection { |c| c.select_all(query) }
|
24
24
|
else
|
25
25
|
raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
|
26
26
|
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
module Polars
  module IO
    # Reads into a DataFrame from a Delta lake table.
    #
    # @param source [Object]
    #   DeltaTable or a Path or URI to the root of the Delta lake table.
    # @param version [Object]
    #   Numerical version or timestamp version of the Delta lake table.
    # @param columns [Array]
    #   Columns to select. Accepts a list of column names.
    # @param rechunk [Boolean]
    #   Make sure that all columns are contiguous in memory by
    #   aggregating the chunks into a single array.
    # @param storage_options [Hash]
    #   Extra options for the storage backends supported by `deltalake-rb`.
    # @param delta_table_options [Hash]
    #   Additional keyword arguments while reading a Delta lake Table.
    #
    # @return [DataFrame]
    def read_delta(
      source,
      version: nil,
      columns: nil,
      rechunk: false,
      storage_options: nil,
      delta_table_options: nil
    )
      dl_tbl =
        _get_delta_lake_table(
          source,
          version: version,
          storage_options: storage_options,
          delta_table_options: delta_table_options
        )

      dl_tbl.to_polars(columns: columns, rechunk: rechunk)
    end

    # Lazily read from a Delta lake table.
    #
    # @param source [Object]
    #   DeltaTable or a Path or URI to the root of the Delta lake table.
    # @param version [Object]
    #   Numerical version or timestamp version of the Delta lake table.
    # @param storage_options [Hash]
    #   Extra options for the storage backends supported by `deltalake-rb`.
    # @param delta_table_options [Hash]
    #   Additional keyword arguments while reading a Delta lake Table.
    #
    # @return [LazyFrame]
    def scan_delta(
      source,
      version: nil,
      storage_options: nil,
      delta_table_options: nil
    )
      dl_tbl =
        _get_delta_lake_table(
          source,
          version: version,
          storage_options: storage_options,
          delta_table_options: delta_table_options
        )

      dl_tbl.to_polars(eager: false)
    end

    private

    # Resolves a table location to the string handed to `DeltaLake::Table`.
    #
    # A location without a URI scheme is treated as a local path and passed
    # through `Utils.normalize_filepath`; anything with a scheme is returned
    # unchanged.
    #
    # @param table_uri [String]
    #   Path or URI to the root of the Delta lake table.
    # @param strict [Boolean]
    #   Currently unused; kept so existing callers remain valid.
    #
    # @return [String]
    def _resolve_delta_lake_uri(table_uri, strict: true)
      require "uri"

      # URI#scheme is nil (not "") when the string carries no scheme, so test
      # for both nil and empty to detect plain filesystem paths.
      scheme = URI(table_uri).scheme
      return Utils.normalize_filepath(table_uri) if scheme.nil? || scheme.empty?

      table_uri
    end

    # Builds (or passes through) the `DeltaLake::Table` for a source.
    #
    # @param table_path [Object]
    #   An existing `DeltaLake::Table` (returned as is) or a path/URI.
    # @param version [Object]
    #   String and Time versions are applied with `load_as_version` after the
    #   table is opened; any other value (including nil) is passed to the
    #   table constructor as `version:`.
    # @param storage_options [Hash]
    #   Forwarded to `DeltaLake::Table.new`.
    # @param delta_table_options [Hash]
    #   Extra keyword arguments splatted into `DeltaLake::Table.new`.
    #
    # @return [DeltaLake::Table]
    # @raise [Error] if the DeltaLake constant is not defined.
    def _get_delta_lake_table(
      table_path,
      version: nil,
      storage_options: nil,
      delta_table_options: nil
    )
      _check_if_delta_available

      return table_path if table_path.is_a?(DeltaLake::Table)

      delta_table_options ||= {}
      resolved_uri = _resolve_delta_lake_uri(table_path)

      if version.is_a?(::String) || version.is_a?(::Time)
        dl_tbl =
          DeltaLake::Table.new(
            resolved_uri,
            storage_options: storage_options,
            **delta_table_options
          )
        dl_tbl.load_as_version(version)
        dl_tbl
      else
        DeltaLake::Table.new(
          resolved_uri,
          version: version,
          storage_options: storage_options,
          **delta_table_options
        )
      end
      # The configured table is returned directly; it must not be replaced by
      # a fresh `DeltaLake::Table.new(table_path)`, which would drop the
      # version, storage options and resolved URI.
    end

    # Raises unless the DeltaLake constant has been loaded.
    #
    # @raise [Error]
    def _check_if_delta_available
      if !defined?(DeltaLake)
        raise Error, "Delta Lake not available"
      end
    end
  end
end
|
data/lib/polars/io/ipc.rb
CHANGED
@@ -233,7 +233,7 @@ module Polars
|
|
233
233
|
|
234
234
|
# @private
|
235
235
|
def _scan_ipc_impl(
|
236
|
-
|
236
|
+
source,
|
237
237
|
n_rows: nil,
|
238
238
|
cache: true,
|
239
239
|
rechunk: true,
|
@@ -245,13 +245,23 @@ module Polars
|
|
245
245
|
try_parse_hive_dates: true,
|
246
246
|
include_file_paths: nil
|
247
247
|
)
|
248
|
-
|
249
|
-
|
248
|
+
sources = []
|
249
|
+
if Utils.pathlike?(source)
|
250
|
+
source = Utils.normalize_filepath(source)
|
251
|
+
elsif source.is_a?(::Array)
|
252
|
+
if Utils.is_path_or_str_sequence(source)
|
253
|
+
sources = source.map { |s| Utils.normalize_filepath(s) }
|
254
|
+
else
|
255
|
+
sources = source
|
256
|
+
end
|
257
|
+
|
258
|
+
source = nil
|
250
259
|
end
|
251
260
|
|
252
261
|
rblf =
|
253
262
|
RbLazyFrame.new_from_ipc(
|
254
|
-
|
263
|
+
source,
|
264
|
+
sources,
|
255
265
|
n_rows,
|
256
266
|
cache,
|
257
267
|
rechunk,
|
data/lib/polars/io/ndjson.rb
CHANGED
@@ -60,13 +60,23 @@ module Polars
|
|
60
60
|
row_count_name: nil,
|
61
61
|
row_count_offset: 0
|
62
62
|
)
|
63
|
+
sources = []
|
63
64
|
if Utils.pathlike?(source)
|
64
65
|
source = Utils.normalize_filepath(source)
|
66
|
+
elsif source.is_a?(::Array)
|
67
|
+
if Utils.is_path_or_str_sequence(source)
|
68
|
+
sources = source.map { |s| Utils.normalize_filepath(s) }
|
69
|
+
else
|
70
|
+
sources = source
|
71
|
+
end
|
72
|
+
|
73
|
+
source = nil
|
65
74
|
end
|
66
75
|
|
67
76
|
rblf =
|
68
77
|
RbLazyFrame.new_from_ndjson(
|
69
78
|
source,
|
79
|
+
sources,
|
70
80
|
infer_schema_length,
|
71
81
|
batch_size,
|
72
82
|
n_rows,
|