polars-df 0.21.1-aarch64-linux → 0.22.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Cargo.lock +55 -48
- data/Cargo.toml +3 -0
- data/LICENSE-THIRD-PARTY.txt +23 -49
- data/README.md +12 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/3.4/polars.so +0 -0
- data/lib/polars/array_expr.rb +1 -1
- data/lib/polars/data_frame.rb +110 -8
- data/lib/polars/data_types.rb +14 -5
- data/lib/polars/date_time_expr.rb +1 -1
- data/lib/polars/expr.rb +39 -30
- data/lib/polars/functions/business.rb +95 -0
- data/lib/polars/functions/lazy.rb +1 -1
- data/lib/polars/io/iceberg.rb +27 -0
- data/lib/polars/io/parquet.rb +7 -4
- data/lib/polars/io/scan_options.rb +4 -1
- data/lib/polars/lazy_frame.rb +92 -8
- data/lib/polars/list_expr.rb +21 -13
- data/lib/polars/list_name_space.rb +33 -21
- data/lib/polars/meta_expr.rb +25 -0
- data/lib/polars/query_opt_flags.rb +50 -0
- data/lib/polars/scan_cast_options.rb +20 -1
- data/lib/polars/schema.rb +1 -1
- data/lib/polars/series.rb +3 -1
- data/lib/polars/string_expr.rb +26 -27
- data/lib/polars/string_name_space.rb +17 -4
- data/lib/polars/utils/serde.rb +17 -0
- data/lib/polars/utils/various.rb +4 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +4 -0
- metadata +6 -2
data/lib/polars/data_frame.rb
CHANGED
@@ -72,6 +72,43 @@ module Polars
|
|
72
72
|
end
|
73
73
|
end
|
74
74
|
|
75
|
+
# Read a serialized DataFrame from a file.
|
76
|
+
#
|
77
|
+
# @param source [Object]
|
78
|
+
# Path to a file or a file-like object (by file-like object, we refer to
|
79
|
+
# objects that have a `read` method, such as a file handler or `StringIO`).
|
80
|
+
#
|
81
|
+
# @return [DataFrame]
|
82
|
+
#
|
83
|
+
# @note
|
84
|
+
# Serialization is not stable across Polars versions: a DataFrame serialized
|
85
|
+
# in one Polars version may not be deserializable in another Polars version.
|
86
|
+
#
|
87
|
+
# @example
|
88
|
+
# df = Polars::DataFrame.new({"a" => [1, 2, 3], "b" => [4.0, 5.0, 6.0]})
|
89
|
+
# bytes = df.serialize
|
90
|
+
# Polars::DataFrame.deserialize(StringIO.new(bytes))
|
91
|
+
# # =>
|
92
|
+
# # shape: (3, 2)
|
93
|
+
# # ┌─────┬─────┐
|
94
|
+
# # │ a ┆ b │
|
95
|
+
# # │ --- ┆ --- │
|
96
|
+
# # │ i64 ┆ f64 │
|
97
|
+
# # ╞═════╪═════╡
|
98
|
+
# # │ 1 ┆ 4.0 │
|
99
|
+
# # │ 2 ┆ 5.0 │
|
100
|
+
# # │ 3 ┆ 6.0 │
|
101
|
+
# # └─────┴─────┘
|
102
|
+
def self.deserialize(source)
|
103
|
+
if Utils.pathlike?(source)
|
104
|
+
source = Utils.normalize_filepath(source)
|
105
|
+
end
|
106
|
+
|
107
|
+
deserializer = RbDataFrame.method(:deserialize_binary)
|
108
|
+
|
109
|
+
_from_rbdf(deserializer.(source))
|
110
|
+
end
|
111
|
+
|
75
112
|
# @private
|
76
113
|
def self._from_rbdf(rb_df)
|
77
114
|
df = DataFrame.allocate
|
@@ -562,8 +599,6 @@ module Polars
|
|
562
599
|
|
563
600
|
# Convert every row to a hash.
|
564
601
|
#
|
565
|
-
# Note that this is slow.
|
566
|
-
#
|
567
602
|
# @return [Array]
|
568
603
|
#
|
569
604
|
# @example
|
@@ -572,12 +607,7 @@ module Polars
|
|
572
607
|
# # =>
|
573
608
|
# # [{"foo"=>1, "bar"=>4}, {"foo"=>2, "bar"=>5}, {"foo"=>3, "bar"=>6}]
|
574
609
|
def to_hashes
|
575
|
-
|
576
|
-
names = columns
|
577
|
-
|
578
|
-
height.times.map do |i|
|
579
|
-
names.zip(rbdf.row_tuple(i)).to_h
|
580
|
-
end
|
610
|
+
rows(named: true)
|
581
611
|
end
|
582
612
|
|
583
613
|
# Convert DataFrame to a 2D Numo array.
|
@@ -634,6 +664,44 @@ module Polars
|
|
634
664
|
Utils.wrap_s(_df.select_at_idx(index))
|
635
665
|
end
|
636
666
|
|
667
|
+
# Serialize this DataFrame to a file or string.
|
668
|
+
#
|
669
|
+
# @param file [Object]
|
670
|
+
# File path or writable file-like object to which the result will be written.
|
671
|
+
# If set to `nil` (default), the output is returned as a string instead.
|
672
|
+
#
|
673
|
+
# @return [Object]
|
674
|
+
#
|
675
|
+
# @note
|
676
|
+
# Serialization is not stable across Polars versions: a DataFrame serialized
|
677
|
+
# in one Polars version may not be deserializable in another Polars version.
|
678
|
+
#
|
679
|
+
# @example
|
680
|
+
# df = Polars::DataFrame.new(
|
681
|
+
# {
|
682
|
+
# "foo" => [1, 2, 3],
|
683
|
+
# "bar" => [6, 7, 8]
|
684
|
+
# }
|
685
|
+
# )
|
686
|
+
# bytes = df.serialize
|
687
|
+
# Polars::DataFrame.deserialize(StringIO.new(bytes))
|
688
|
+
# # =>
|
689
|
+
# # shape: (3, 2)
|
690
|
+
# # ┌─────┬─────┐
|
691
|
+
# # │ foo ┆ bar │
|
692
|
+
# # │ --- ┆ --- │
|
693
|
+
# # │ i64 ┆ i64 │
|
694
|
+
# # ╞═════╪═════╡
|
695
|
+
# # │ 1 ┆ 6 │
|
696
|
+
# # │ 2 ┆ 7 │
|
697
|
+
# # │ 3 ┆ 8 │
|
698
|
+
# # └─────┴─────┘
|
699
|
+
def serialize(file = nil)
|
700
|
+
serializer = _df.method(:serialize_binary)
|
701
|
+
|
702
|
+
Utils.serialize_polars_object(serializer, file)
|
703
|
+
end
|
704
|
+
|
637
705
|
# Serialize to JSON representation.
|
638
706
|
#
|
639
707
|
# @param file [String]
|
@@ -1148,6 +1216,40 @@ module Polars
|
|
1148
1216
|
end
|
1149
1217
|
end
|
1150
1218
|
|
1219
|
+
# Write DataFrame to an Iceberg table.
|
1220
|
+
#
|
1221
|
+
# @note
|
1222
|
+
# This functionality is currently considered **unstable**. It may be
|
1223
|
+
# changed at any point without it being considered a breaking change.
|
1224
|
+
#
|
1225
|
+
# @param target [Object]
|
1226
|
+
# Name of the table or the Table object representing an Iceberg table.
|
1227
|
+
# @param mode ['append', 'overwrite']
|
1228
|
+
# How to handle existing data.
|
1229
|
+
#
|
1230
|
+
# - If 'append', will add new data.
|
1231
|
+
# - If 'overwrite', will replace table with new data.
|
1232
|
+
#
|
1233
|
+
# @return [nil]
|
1234
|
+
def write_iceberg(target, mode:)
|
1235
|
+
require "iceberg"
|
1236
|
+
|
1237
|
+
table =
|
1238
|
+
if target.is_a?(Iceberg::Table)
|
1239
|
+
target
|
1240
|
+
else
|
1241
|
+
raise Todo
|
1242
|
+
end
|
1243
|
+
|
1244
|
+
data = self
|
1245
|
+
|
1246
|
+
if mode == "append"
|
1247
|
+
table.append(data)
|
1248
|
+
else
|
1249
|
+
raise Todo
|
1250
|
+
end
|
1251
|
+
end
|
1252
|
+
|
1151
1253
|
# Write DataFrame as delta table.
|
1152
1254
|
#
|
1153
1255
|
# @param target [Object]
|
data/lib/polars/data_types.rb
CHANGED
@@ -110,12 +110,23 @@ module Polars
|
|
110
110
|
DataTypeExpr._from_rbdatatype_expr(RbDataTypeExpr.from_dtype(self))
|
111
111
|
end
|
112
112
|
|
113
|
-
[:numeric?, :decimal?, :integer?, :signed_integer?, :unsigned_integer?, :float?, :temporal?, :nested
|
113
|
+
[:numeric?, :decimal?, :integer?, :signed_integer?, :unsigned_integer?, :float?, :temporal?, :nested?].each do |v|
|
114
114
|
define_method(v) do
|
115
115
|
self.class.public_send(v)
|
116
116
|
end
|
117
117
|
end
|
118
118
|
|
119
|
+
# Return a `DataTypeExpr` with a static `DataType`.
|
120
|
+
#
|
121
|
+
# @return [DataTypeExpr]
|
122
|
+
#
|
123
|
+
# @example
|
124
|
+
# Polars::Int16.new.to_dtype_expr.collect_dtype({})
|
125
|
+
# # => Polars::Int16
|
126
|
+
def to_dtype_expr
|
127
|
+
DataTypeExpr._from_rbdatatype_expr(RbDataTypeExpr.from_dtype(self))
|
128
|
+
end
|
129
|
+
|
119
130
|
# Returns a string representing the data type.
|
120
131
|
#
|
121
132
|
# @return [String]
|
@@ -317,11 +328,9 @@ module Polars
|
|
317
328
|
class Categories
|
318
329
|
attr_accessor :_categories
|
319
330
|
|
320
|
-
def initialize
|
321
|
-
# TODO fix
|
322
|
-
name = nil
|
331
|
+
def initialize(name = nil)
|
323
332
|
if name.nil? || name == ""
|
324
|
-
|
333
|
+
self._categories = RbCategories.global_categories
|
325
334
|
return
|
326
335
|
end
|
327
336
|
|
@@ -1188,7 +1188,7 @@ module Polars
|
|
1188
1188
|
if Utils::DTYPE_TEMPORAL_UNITS.include?(time_unit)
|
1189
1189
|
timestamp(time_unit)
|
1190
1190
|
elsif time_unit == "s"
|
1191
|
-
|
1191
|
+
timestamp("ms").floordiv(F.lit(1000, dtype: Int64))
|
1192
1192
|
elsif time_unit == "d"
|
1193
1193
|
Utils.wrap_expr(_rbexpr).cast(:date).cast(:i32)
|
1194
1194
|
else
|
data/lib/polars/expr.rb
CHANGED
@@ -146,6 +146,40 @@ module Polars
|
|
146
146
|
wrap_expr(_rbexpr.neg)
|
147
147
|
end
|
148
148
|
|
149
|
+
# Read a serialized expression from a file.
|
150
|
+
#
|
151
|
+
# @param source [Object]
|
152
|
+
# Path to a file or a file-like object (by file-like object, we refer to
|
153
|
+
# objects that have a `read` method, such as a file handler or `StringIO`).
|
154
|
+
#
|
155
|
+
# @return [Expr]
|
156
|
+
#
|
157
|
+
# @note
|
158
|
+
# This function uses marshaling if the logical plan contains Ruby UDFs,
|
159
|
+
# and as such inherits the security implications. Deserializing can execute
|
160
|
+
# arbitrary code, so it should only be attempted on trusted data.
|
161
|
+
#
|
162
|
+
# @note
|
163
|
+
# Serialization is not stable across Polars versions: an expression serialized
|
164
|
+
# in one Polars version may not be deserializable in another Polars version.
|
165
|
+
#
|
166
|
+
# @example
|
167
|
+
# expr = Polars.col("foo").sum.over("bar")
|
168
|
+
# bytes = expr.meta.serialize
|
169
|
+
# Polars::Expr.deserialize(StringIO.new(bytes))
|
170
|
+
# # => col("foo").sum().over([col("bar")])
|
171
|
+
def self.deserialize(source)
|
172
|
+
raise Todo unless RbExpr.respond_to?(:deserialize_binary)
|
173
|
+
|
174
|
+
if Utils.pathlike?(source)
|
175
|
+
source = Utils.normalize_filepath(source)
|
176
|
+
end
|
177
|
+
|
178
|
+
deserializer = RbExpr.method(:deserialize_binary)
|
179
|
+
|
180
|
+
_from_rbexpr(deserializer.(source))
|
181
|
+
end
|
182
|
+
|
149
183
|
# Cast to physical representation of the logical dtype.
|
150
184
|
#
|
151
185
|
# - `:date` -> `:i32`
|
@@ -377,8 +411,6 @@ module Polars
|
|
377
411
|
wrap_expr(_rbexpr._alias(name))
|
378
412
|
end
|
379
413
|
|
380
|
-
# TODO support symbols for exclude
|
381
|
-
|
382
414
|
# Exclude certain columns from a wildcard/regex selection.
|
383
415
|
#
|
384
416
|
# You may also use regexes in the exclude list. They must start with `^` and end
|
@@ -1787,7 +1819,7 @@ module Polars
|
|
1787
1819
|
wrap_expr(_rbexpr.arg_min)
|
1788
1820
|
end
|
1789
1821
|
|
1790
|
-
# Get the index of the first occurrence of a value, or
|
1822
|
+
# Get the index of the first occurrence of a value, or `nil` if it's not found.
|
1791
1823
|
#
|
1792
1824
|
# @param element [Object]
|
1793
1825
|
# Value to find.
|
@@ -7571,7 +7603,8 @@ module Polars
|
|
7571
7603
|
# # │ 1.584963 │
|
7572
7604
|
# # └──────────┘
|
7573
7605
|
def log(base = Math::E)
|
7574
|
-
|
7606
|
+
base_rbexpr = Utils.parse_into_expression(base)
|
7607
|
+
wrap_expr(_rbexpr.log(base_rbexpr))
|
7575
7608
|
end
|
7576
7609
|
|
7577
7610
|
# Compute the natural logarithm of each element plus one.
|
@@ -7743,33 +7776,9 @@ module Polars
|
|
7743
7776
|
# This can be used to reduce memory pressure.
|
7744
7777
|
#
|
7745
7778
|
# @return [Expr]
|
7746
|
-
#
|
7747
|
-
# @example
|
7748
|
-
# Polars::DataFrame.new(
|
7749
|
-
# {
|
7750
|
-
# "a" => [1, 2, 3],
|
7751
|
-
# "b" => [1, 2, 2 << 32],
|
7752
|
-
# "c" => [-1, 2, 1 << 30],
|
7753
|
-
# "d" => [-112, 2, 112],
|
7754
|
-
# "e" => [-112, 2, 129],
|
7755
|
-
# "f" => ["a", "b", "c"],
|
7756
|
-
# "g" => [0.1, 1.32, 0.12],
|
7757
|
-
# "h" => [true, nil, false]
|
7758
|
-
# }
|
7759
|
-
# ).select(Polars.all.shrink_dtype)
|
7760
|
-
# # =>
|
7761
|
-
# # shape: (3, 8)
|
7762
|
-
# # ┌─────┬────────────┬────────────┬──────┬──────┬─────┬──────┬───────┐
|
7763
|
-
# # │ a ┆ b ┆ c ┆ d ┆ e ┆ f ┆ g ┆ h │
|
7764
|
-
# # │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
7765
|
-
# # │ i8 ┆ i64 ┆ i32 ┆ i8 ┆ i16 ┆ str ┆ f32 ┆ bool │
|
7766
|
-
# # ╞═════╪════════════╪════════════╪══════╪══════╪═════╪══════╪═══════╡
|
7767
|
-
# # │ 1 ┆ 1 ┆ -1 ┆ -112 ┆ -112 ┆ a ┆ 0.1 ┆ true │
|
7768
|
-
# # │ 2 ┆ 2 ┆ 2 ┆ 2 ┆ 2 ┆ b ┆ 1.32 ┆ null │
|
7769
|
-
# # │ 3 ┆ 8589934592 ┆ 1073741824 ┆ 112 ┆ 129 ┆ c ┆ 0.12 ┆ false │
|
7770
|
-
# # └─────┴────────────┴────────────┴──────┴──────┴─────┴──────┴───────┘
|
7771
7779
|
def shrink_dtype
|
7772
|
-
|
7780
|
+
warn "`Expr.shrink_dtype` is deprecated and is a no-op; use `Series.shrink_dtype` instead."
|
7781
|
+
self
|
7773
7782
|
end
|
7774
7783
|
|
7775
7784
|
# Bin values into buckets and count their occurrences.
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Polars
|
2
|
+
module Functions
|
3
|
+
# Count the number of business days between `start` and `stop` (not including `stop`).
|
4
|
+
#
|
5
|
+
# @note
|
6
|
+
# This functionality is considered **unstable**. It may be changed
|
7
|
+
# at any point without it being considered a breaking change.
|
8
|
+
#
|
9
|
+
# @param start [Object]
|
10
|
+
# Start dates.
|
11
|
+
# @param stop [Object]
|
12
|
+
# End dates.
|
13
|
+
# @param week_mask [Array]
|
14
|
+
# Which days of the week to count. The default is Monday to Friday.
|
15
|
+
# If you wanted to count only Monday to Thursday, you would pass
|
16
|
+
# `[true, true, true, true, false, false, false]`.
|
17
|
+
# @param holidays [Array]
|
18
|
+
# Holidays to exclude from the count.
|
19
|
+
#
|
20
|
+
# @return [Expr]
|
21
|
+
#
|
22
|
+
# @example
|
23
|
+
# df = Polars::DataFrame.new(
|
24
|
+
# {
|
25
|
+
# "start" => [Date.new(2020, 1, 1), Date.new(2020, 1, 2)],
|
26
|
+
# "end" => [Date.new(2020, 1, 2), Date.new(2020, 1, 10)]
|
27
|
+
# }
|
28
|
+
# )
|
29
|
+
# df.with_columns(
|
30
|
+
# business_day_count: Polars.business_day_count("start", "end")
|
31
|
+
# )
|
32
|
+
# # =>
|
33
|
+
# # shape: (2, 3)
|
34
|
+
# # ┌────────────┬────────────┬────────────────────┐
|
35
|
+
# # │ start ┆ end ┆ business_day_count │
|
36
|
+
# # │ --- ┆ --- ┆ --- │
|
37
|
+
# # │ date ┆ date ┆ i32 │
|
38
|
+
# # ╞════════════╪════════════╪════════════════════╡
|
39
|
+
# # │ 2020-01-01 ┆ 2020-01-02 ┆ 1 │
|
40
|
+
# # │ 2020-01-02 ┆ 2020-01-10 ┆ 6 │
|
41
|
+
# # └────────────┴────────────┴────────────────────┘
|
42
|
+
#
|
43
|
+
# @example You can pass a custom weekend - for example, if you only take Sunday off:
|
44
|
+
# week_mask = [true, true, true, true, true, true, false]
|
45
|
+
# df.with_columns(
|
46
|
+
# business_day_count: Polars.business_day_count(
|
47
|
+
# "start", "end", week_mask: week_mask
|
48
|
+
# )
|
49
|
+
# )
|
50
|
+
# # =>
|
51
|
+
# # shape: (2, 3)
|
52
|
+
# # ┌────────────┬────────────┬────────────────────┐
|
53
|
+
# # │ start ┆ end ┆ business_day_count │
|
54
|
+
# # │ --- ┆ --- ┆ --- │
|
55
|
+
# # │ date ┆ date ┆ i32 │
|
56
|
+
# # ╞════════════╪════════════╪════════════════════╡
|
57
|
+
# # │ 2020-01-01 ┆ 2020-01-02 ┆ 1 │
|
58
|
+
# # │ 2020-01-02 ┆ 2020-01-10 ┆ 7 │
|
59
|
+
# # └────────────┴────────────┴────────────────────┘
|
60
|
+
#
|
61
|
+
# @example You can also pass a list of holidays to exclude from the count:
|
62
|
+
# holidays = [Date.new(2020, 1, 1), Date.new(2020, 1, 2)]
|
63
|
+
# df.with_columns(
|
64
|
+
# business_day_count: Polars.business_day_count("start", "end", holidays: holidays)
|
65
|
+
# )
|
66
|
+
# # =>
|
67
|
+
# # shape: (2, 3)
|
68
|
+
# # ┌────────────┬────────────┬────────────────────┐
|
69
|
+
# # │ start ┆ end ┆ business_day_count │
|
70
|
+
# # │ --- ┆ --- ┆ --- │
|
71
|
+
# # │ date ┆ date ┆ i32 │
|
72
|
+
# # ╞════════════╪════════════╪════════════════════╡
|
73
|
+
# # │ 2020-01-01 ┆ 2020-01-02 ┆ 0 │
|
74
|
+
# # │ 2020-01-02 ┆ 2020-01-10 ┆ 5 │
|
75
|
+
# # └────────────┴────────────┴────────────────────┘
|
76
|
+
def business_day_count(
|
77
|
+
start,
|
78
|
+
stop,
|
79
|
+
week_mask: [true, true, true, true, true, false, false],
|
80
|
+
holidays: []
|
81
|
+
)
|
82
|
+
start_rbexpr = Utils.parse_into_expression(start)
|
83
|
+
end_rbexpr = Utils.parse_into_expression(stop)
|
84
|
+
unix_epoch = ::Date.new(1970, 1, 1)
|
85
|
+
Utils.wrap_expr(
|
86
|
+
Plr.business_day_count(
|
87
|
+
start_rbexpr,
|
88
|
+
end_rbexpr,
|
89
|
+
week_mask,
|
90
|
+
holidays.map { |holiday| holiday - unix_epoch }
|
91
|
+
)
|
92
|
+
)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Polars
|
2
|
+
module IO
|
3
|
+
# Lazily read from an Apache Iceberg table.
|
4
|
+
#
|
5
|
+
# @param source [Object]
|
6
|
+
# An Iceberg Ruby table, or a direct path to the metadata.
|
7
|
+
# @param snapshot_id [Integer]
|
8
|
+
# The snapshot ID to scan from.
|
9
|
+
# @param storage_options [Hash]
|
10
|
+
# Extra options for the storage backends.
|
11
|
+
#
|
12
|
+
# @return [LazyFrame]
|
13
|
+
def scan_iceberg(
|
14
|
+
source,
|
15
|
+
snapshot_id: nil,
|
16
|
+
storage_options: nil
|
17
|
+
)
|
18
|
+
require "iceberg"
|
19
|
+
|
20
|
+
unless source.is_a?(Iceberg::Table)
|
21
|
+
raise Todo
|
22
|
+
end
|
23
|
+
|
24
|
+
source.to_polars(snapshot_id:, storage_options:)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/lib/polars/io/parquet.rb
CHANGED
@@ -117,14 +117,13 @@ module Polars
|
|
117
117
|
# @param source [Object]
|
118
118
|
# Path to a file or a file-like object.
|
119
119
|
#
|
120
|
-
# @return [
|
120
|
+
# @return [Schema]
|
121
121
|
def read_parquet_schema(source)
|
122
122
|
if Utils.pathlike?(source)
|
123
123
|
source = Utils.normalize_filepath(source)
|
124
124
|
end
|
125
125
|
|
126
|
-
|
127
|
-
scan_parquet(source).collect_schema.to_h
|
126
|
+
scan_parquet(source).collect_schema
|
128
127
|
end
|
129
128
|
|
130
129
|
# Get file-level custom metadata of a Parquet file without reading data.
|
@@ -207,6 +206,9 @@ module Polars
|
|
207
206
|
# defined schema are encountered in the data:
|
208
207
|
# * `ignore`: Silently ignores.
|
209
208
|
# * `raise`: Raises an error.
|
209
|
+
# @param cast_options [Object]
|
210
|
+
# Configuration for column type-casting during scans. Useful for datasets
|
211
|
+
# containing files that have differing schemas.
|
210
212
|
#
|
211
213
|
# @return [LazyFrame]
|
212
214
|
def scan_parquet(
|
@@ -230,6 +232,7 @@ module Polars
|
|
230
232
|
include_file_paths: nil,
|
231
233
|
allow_missing_columns: false,
|
232
234
|
extra_columns: "raise",
|
235
|
+
cast_options: nil,
|
233
236
|
_column_mapping: nil,
|
234
237
|
_deletion_files: nil
|
235
238
|
)
|
@@ -268,7 +271,7 @@ module Polars
|
|
268
271
|
ScanOptions.new(
|
269
272
|
row_index: !row_index_name.nil? ? [row_index_name, row_index_offset] : nil,
|
270
273
|
pre_slice: !n_rows.nil? ? [0, n_rows] : nil,
|
271
|
-
|
274
|
+
cast_options: cast_options,
|
272
275
|
extra_columns: extra_columns,
|
273
276
|
missing_columns: missing_columns,
|
274
277
|
include_file_paths: include_file_paths,
|
@@ -3,7 +3,8 @@ module Polars
|
|
3
3
|
class ScanOptions
|
4
4
|
attr_reader :row_index, :pre_slice, :cast_options, :extra_columns, :missing_columns,
|
5
5
|
:include_file_paths, :glob, :hive_partitioning, :hive_schema, :try_parse_hive_dates,
|
6
|
-
:rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
|
6
|
+
:rechunk, :cache, :storage_options, :credential_provider, :retries, :column_mapping,
|
7
|
+
:default_values, :deletion_files
|
7
8
|
|
8
9
|
def initialize(
|
9
10
|
row_index: nil,
|
@@ -22,6 +23,7 @@ module Polars
|
|
22
23
|
credential_provider: nil,
|
23
24
|
retries: 2,
|
24
25
|
column_mapping: nil,
|
26
|
+
default_values: nil,
|
25
27
|
deletion_files: nil
|
26
28
|
)
|
27
29
|
@row_index = row_index
|
@@ -40,6 +42,7 @@ module Polars
|
|
40
42
|
@credential_provider = credential_provider
|
41
43
|
@retries = retries
|
42
44
|
@column_mapping = column_mapping
|
45
|
+
@default_values = default_values
|
43
46
|
@deletion_files = deletion_files
|
44
47
|
end
|
45
48
|
end
|
data/lib/polars/lazy_frame.rb
CHANGED
@@ -27,9 +27,6 @@ module Polars
|
|
27
27
|
ldf
|
28
28
|
end
|
29
29
|
|
30
|
-
# def self.from_json
|
31
|
-
# end
|
32
|
-
|
33
30
|
# Read a logical plan from a JSON file to construct a LazyFrame.
|
34
31
|
#
|
35
32
|
# @param file [String]
|
@@ -41,7 +38,49 @@ module Polars
|
|
41
38
|
file = Utils.normalize_filepath(file)
|
42
39
|
end
|
43
40
|
|
44
|
-
Utils.wrap_ldf(RbLazyFrame.
|
41
|
+
Utils.wrap_ldf(RbLazyFrame.deserialize_json(file))
|
42
|
+
end
|
43
|
+
|
44
|
+
# Read a logical plan from a file to construct a LazyFrame.
|
45
|
+
#
|
46
|
+
# @param source [Object]
|
47
|
+
# Path to a file or a file-like object (by file-like object, we refer to
|
48
|
+
# objects that have a `read` method, such as a file handler or `StringIO`).
|
49
|
+
#
|
50
|
+
# @return [LazyFrame]
|
51
|
+
#
|
52
|
+
# @note
|
53
|
+
# This function uses marshaling if the logical plan contains Ruby UDFs,
|
54
|
+
# and as such inherits the security implications. Deserializing can execute
|
55
|
+
# arbitrary code, so it should only be attempted on trusted data.
|
56
|
+
#
|
57
|
+
# @note
|
58
|
+
# Serialization is not stable across Polars versions: a LazyFrame serialized
|
59
|
+
# in one Polars version may not be deserializable in another Polars version.
|
60
|
+
#
|
61
|
+
# @example
|
62
|
+
# lf = Polars::LazyFrame.new({"a" => [1, 2, 3]}).sum
|
63
|
+
# bytes = lf.serialize
|
64
|
+
# Polars::LazyFrame.deserialize(StringIO.new(bytes)).collect
|
65
|
+
# # =>
|
66
|
+
# # shape: (1, 1)
|
67
|
+
# # ┌─────┐
|
68
|
+
# # │ a │
|
69
|
+
# # │ --- │
|
70
|
+
# # │ i64 │
|
71
|
+
# # ╞═════╡
|
72
|
+
# # │ 6 │
|
73
|
+
# # └─────┘
|
74
|
+
def self.deserialize(source)
|
75
|
+
raise Todo unless RbLazyFrame.respond_to?(:deserialize_binary)
|
76
|
+
|
77
|
+
if Utils.pathlike?(source)
|
78
|
+
source = Utils.normalize_filepath(source)
|
79
|
+
end
|
80
|
+
|
81
|
+
deserializer = RbLazyFrame.method(:deserialize_binary)
|
82
|
+
|
83
|
+
_from_rbldf(deserializer.(source))
|
45
84
|
end
|
46
85
|
|
47
86
|
# Get or set column names.
|
@@ -151,6 +190,38 @@ module Polars
|
|
151
190
|
nil
|
152
191
|
end
|
153
192
|
|
193
|
+
# Serialize the logical plan of this LazyFrame to a file or string.
|
194
|
+
#
|
195
|
+
# @param file [Object]
|
196
|
+
# File path to which the result should be written. If set to `nil`
|
197
|
+
# (default), the output is returned as a string instead.
|
198
|
+
#
|
199
|
+
# @return [Object]
|
200
|
+
#
|
201
|
+
# @note
|
202
|
+
# Serialization is not stable across Polars versions: a LazyFrame serialized
|
203
|
+
# in one Polars version may not be deserializable in another Polars version.
|
204
|
+
#
|
205
|
+
# @example Serialize the logical plan into a binary representation.
|
206
|
+
# lf = Polars::LazyFrame.new({"a" => [1, 2, 3]}).sum
|
207
|
+
# bytes = lf.serialize
|
208
|
+
# Polars::LazyFrame.deserialize(StringIO.new(bytes)).collect
|
209
|
+
# # =>
|
210
|
+
# # shape: (1, 1)
|
211
|
+
# # ┌─────┐
|
212
|
+
# # │ a │
|
213
|
+
# # │ --- │
|
214
|
+
# # │ i64 │
|
215
|
+
# # ╞═════╡
|
216
|
+
# # │ 6 │
|
217
|
+
# # └─────┘
|
218
|
+
def serialize(file = nil)
|
219
|
+
raise Todo unless _ldf.respond_to?(:serialize_binary)
|
220
|
+
|
221
|
+
serializer = _ldf.method(:serialize_binary)
|
222
|
+
Utils.serialize_polars_object(serializer, file)
|
223
|
+
end
|
224
|
+
|
154
225
|
# Offers a structured way to apply a sequence of user-defined functions (UDFs).
|
155
226
|
#
|
156
227
|
# @param func [Object]
|
@@ -774,6 +845,21 @@ module Polars
|
|
774
845
|
# @param maintain_order [Boolean]
|
775
846
|
# Maintain the order in which data is processed.
|
776
847
|
# Setting this to `false` will be slightly faster.
|
848
|
+
# @param storage_options [Hash]
|
849
|
+
# Options that indicate how to connect to a cloud provider.
|
850
|
+
#
|
851
|
+
# The cloud providers currently supported are AWS, GCP, and Azure.
|
852
|
+
# See supported keys here:
|
853
|
+
#
|
854
|
+
# * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
|
855
|
+
# * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
|
856
|
+
# * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
|
857
|
+
# * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
858
|
+
#
|
859
|
+
# If `storage_options` is not provided, Polars will try to infer the
|
860
|
+
# information from environment variables.
|
861
|
+
# @param retries [Integer]
|
862
|
+
# Number of retries if accessing a cloud instance fails.
|
777
863
|
# @param type_coercion [Boolean]
|
778
864
|
# Do type coercion optimization.
|
779
865
|
# @param predicate_pushdown [Boolean]
|
@@ -806,6 +892,8 @@ module Polars
|
|
806
892
|
path,
|
807
893
|
compression: "zstd",
|
808
894
|
maintain_order: true,
|
895
|
+
storage_options: nil,
|
896
|
+
retries: 2,
|
809
897
|
type_coercion: true,
|
810
898
|
predicate_pushdown: true,
|
811
899
|
projection_pushdown: true,
|
@@ -816,10 +904,6 @@ module Polars
|
|
816
904
|
mkdir: false,
|
817
905
|
lazy: false
|
818
906
|
)
|
819
|
-
# TODO support storage options in Rust
|
820
|
-
storage_options = nil
|
821
|
-
retries = 2
|
822
|
-
|
823
907
|
lf = _set_sink_optimizations(
|
824
908
|
type_coercion: type_coercion,
|
825
909
|
predicate_pushdown: predicate_pushdown,
|