polars-df 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +127 -1
- data/Cargo.lock +72 -58
- data/README.md +31 -27
- data/ext/polars/Cargo.toml +15 -6
- data/ext/polars/src/batched_csv.rs +35 -39
- data/ext/polars/src/c_api/allocator.rs +7 -0
- data/ext/polars/src/c_api/mod.rs +1 -0
- data/ext/polars/src/catalog/unity.rs +123 -101
- data/ext/polars/src/conversion/any_value.rs +13 -17
- data/ext/polars/src/conversion/chunked_array.rs +5 -5
- data/ext/polars/src/conversion/datetime.rs +3 -2
- data/ext/polars/src/conversion/mod.rs +50 -45
- data/ext/polars/src/dataframe/export.rs +13 -13
- data/ext/polars/src/dataframe/general.rs +223 -223
- data/ext/polars/src/dataframe/io.rs +27 -141
- data/ext/polars/src/dataframe/mod.rs +13 -5
- data/ext/polars/src/dataframe/serde.rs +1 -1
- data/ext/polars/src/error.rs +44 -7
- data/ext/polars/src/exceptions.rs +45 -12
- data/ext/polars/src/expr/array.rs +12 -0
- data/ext/polars/src/expr/datatype.rs +2 -2
- data/ext/polars/src/expr/datetime.rs +4 -5
- data/ext/polars/src/expr/general.rs +49 -13
- data/ext/polars/src/expr/list.rs +4 -0
- data/ext/polars/src/expr/meta.rs +8 -3
- data/ext/polars/src/expr/mod.rs +22 -6
- data/ext/polars/src/expr/name.rs +19 -8
- data/ext/polars/src/expr/rolling.rs +50 -1
- data/ext/polars/src/expr/string.rs +0 -1
- data/ext/polars/src/expr/struct.rs +7 -2
- data/ext/polars/src/file.rs +136 -103
- data/ext/polars/src/functions/aggregation.rs +9 -8
- data/ext/polars/src/functions/io.rs +81 -10
- data/ext/polars/src/functions/lazy.rs +95 -21
- data/ext/polars/src/functions/mod.rs +2 -0
- data/ext/polars/src/functions/range.rs +19 -3
- data/ext/polars/src/functions/strings.rs +6 -0
- data/ext/polars/src/functions/utils.rs +6 -0
- data/ext/polars/src/interop/arrow/mod.rs +50 -1
- data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
- data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
- data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
- data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
- data/ext/polars/src/lazyframe/exitable.rs +39 -0
- data/ext/polars/src/lazyframe/general.rs +340 -236
- data/ext/polars/src/lazyframe/mod.rs +46 -10
- data/ext/polars/src/lazyframe/optflags.rs +5 -4
- data/ext/polars/src/lazyframe/serde.rs +11 -3
- data/ext/polars/src/lazyframe/sink.rs +10 -5
- data/ext/polars/src/lazygroupby.rs +6 -7
- data/ext/polars/src/lib.rs +141 -76
- data/ext/polars/src/map/dataframe.rs +12 -12
- data/ext/polars/src/map/lazy.rs +7 -5
- data/ext/polars/src/map/mod.rs +15 -8
- data/ext/polars/src/map/series.rs +3 -3
- data/ext/polars/src/on_startup.rs +16 -8
- data/ext/polars/src/prelude.rs +1 -0
- data/ext/polars/src/rb_modules.rs +19 -49
- data/ext/polars/src/series/aggregation.rs +79 -140
- data/ext/polars/src/series/arithmetic.rs +16 -22
- data/ext/polars/src/series/comparison.rs +101 -222
- data/ext/polars/src/series/construction.rs +17 -18
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +254 -289
- data/ext/polars/src/series/import.rs +17 -0
- data/ext/polars/src/series/map.rs +178 -160
- data/ext/polars/src/series/mod.rs +28 -12
- data/ext/polars/src/series/scatter.rs +12 -9
- data/ext/polars/src/sql.rs +16 -9
- data/ext/polars/src/testing/frame.rs +31 -0
- data/ext/polars/src/testing/mod.rs +5 -0
- data/ext/polars/src/testing/series.rs +31 -0
- data/ext/polars/src/timeout.rs +105 -0
- data/ext/polars/src/utils.rs +159 -1
- data/lib/polars/array_expr.rb +81 -12
- data/lib/polars/array_name_space.rb +74 -7
- data/lib/polars/batched_csv_reader.rb +21 -21
- data/lib/polars/binary_name_space.rb +1 -1
- data/lib/polars/cat_expr.rb +7 -7
- data/lib/polars/config.rb +1 -1
- data/lib/polars/convert.rb +189 -34
- data/lib/polars/data_frame.rb +1066 -831
- data/lib/polars/data_frame_plot.rb +173 -0
- data/lib/polars/data_type_group.rb +1 -0
- data/lib/polars/data_types.rb +31 -12
- data/lib/polars/date_time_expr.rb +51 -69
- data/lib/polars/date_time_name_space.rb +80 -112
- data/lib/polars/dynamic_group_by.rb +7 -7
- data/lib/polars/exceptions.rb +50 -10
- data/lib/polars/expr.rb +470 -517
- data/lib/polars/functions/aggregation/horizontal.rb +0 -1
- data/lib/polars/functions/aggregation/vertical.rb +2 -3
- data/lib/polars/functions/as_datatype.rb +290 -8
- data/lib/polars/functions/eager.rb +204 -10
- data/lib/polars/functions/escape_regex.rb +21 -0
- data/lib/polars/functions/lazy.rb +409 -169
- data/lib/polars/functions/lit.rb +17 -1
- data/lib/polars/functions/range/int_range.rb +74 -2
- data/lib/polars/functions/range/linear_space.rb +77 -0
- data/lib/polars/functions/range/time_range.rb +1 -1
- data/lib/polars/functions/repeat.rb +3 -12
- data/lib/polars/functions/whenthen.rb +2 -2
- data/lib/polars/group_by.rb +72 -20
- data/lib/polars/iceberg_dataset.rb +1 -6
- data/lib/polars/in_process_query.rb +37 -0
- data/lib/polars/io/cloud.rb +18 -0
- data/lib/polars/io/csv.rb +265 -126
- data/lib/polars/io/database.rb +0 -1
- data/lib/polars/io/delta.rb +15 -7
- data/lib/polars/io/ipc.rb +24 -17
- data/lib/polars/io/ndjson.rb +161 -24
- data/lib/polars/io/parquet.rb +101 -38
- data/lib/polars/lazy_frame.rb +849 -558
- data/lib/polars/lazy_group_by.rb +327 -2
- data/lib/polars/list_expr.rb +94 -16
- data/lib/polars/list_name_space.rb +88 -24
- data/lib/polars/meta_expr.rb +42 -1
- data/lib/polars/name_expr.rb +41 -4
- data/lib/polars/query_opt_flags.rb +198 -2
- data/lib/polars/rolling_group_by.rb +3 -3
- data/lib/polars/schema.rb +21 -3
- data/lib/polars/selector.rb +37 -2
- data/lib/polars/selectors.rb +45 -9
- data/lib/polars/series.rb +1156 -728
- data/lib/polars/series_plot.rb +72 -0
- data/lib/polars/slice.rb +1 -1
- data/lib/polars/sql_context.rb +11 -4
- data/lib/polars/string_expr.rb +59 -68
- data/lib/polars/string_name_space.rb +51 -87
- data/lib/polars/struct_expr.rb +36 -18
- data/lib/polars/testing.rb +24 -273
- data/lib/polars/utils/constants.rb +2 -0
- data/lib/polars/utils/construction/data_frame.rb +410 -0
- data/lib/polars/utils/construction/series.rb +364 -0
- data/lib/polars/utils/construction/utils.rb +9 -0
- data/lib/polars/utils/deprecation.rb +11 -0
- data/lib/polars/utils/serde.rb +8 -3
- data/lib/polars/utils/unstable.rb +19 -0
- data/lib/polars/utils/various.rb +59 -0
- data/lib/polars/utils.rb +46 -47
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +47 -1
- metadata +25 -6
- data/ext/polars/src/allocator.rs +0 -13
- data/lib/polars/plot.rb +0 -109
|
@@ -110,7 +110,7 @@ module Polars
|
|
|
110
110
|
# # b"\x00\x00\xff"
|
|
111
111
|
# # ]
|
|
112
112
|
#
|
|
113
|
-
# @example Set `strict
|
|
113
|
+
# @example Set `strict: false` to set invalid values to null instead of raising an error.
|
|
114
114
|
# s = Polars::Series.new("colors", ["000000".b, "ffff00".b, "invalid_value".b])
|
|
115
115
|
# s.bin.decode("hex", strict: false)
|
|
116
116
|
# # =>
|
data/lib/polars/cat_expr.rb
CHANGED
|
@@ -44,13 +44,13 @@ module Polars
|
|
|
44
44
|
# `len_chars` (_O(n)_).
|
|
45
45
|
#
|
|
46
46
|
# @example
|
|
47
|
-
#
|
|
48
|
-
#
|
|
49
|
-
#
|
|
50
|
-
#
|
|
51
|
-
#
|
|
52
|
-
#
|
|
53
|
-
#
|
|
47
|
+
# df = Polars::DataFrame.new(
|
|
48
|
+
# {"a" => Polars::Series.new(["Café", "345", "東京", nil], dtype: Polars::Categorical)}
|
|
49
|
+
# )
|
|
50
|
+
# df.with_columns(
|
|
51
|
+
# Polars.col("a").cat.len_bytes.alias("n_bytes"),
|
|
52
|
+
# Polars.col("a").cat.len_chars.alias("n_chars")
|
|
53
|
+
# )
|
|
54
54
|
# # =>
|
|
55
55
|
# # shape: (4, 3)
|
|
56
56
|
# # ┌──────┬─────────┬─────────┐
|
data/lib/polars/config.rb
CHANGED
data/lib/polars/convert.rb
CHANGED
|
@@ -17,9 +17,14 @@ module Polars
|
|
|
17
17
|
# If you supply an array of column names that does not match the names in the
|
|
18
18
|
# underlying data, the names given here will overwrite them. The number
|
|
19
19
|
# of names given in the schema should match the underlying data dimensions.
|
|
20
|
-
# @param
|
|
21
|
-
#
|
|
22
|
-
#
|
|
20
|
+
# @param schema_overrides [Hash]
|
|
21
|
+
# Support type specification or override of one or more columns; note that
|
|
22
|
+
# any dtypes inferred from the schema param will be overridden.
|
|
23
|
+
# @param strict [Boolean]
|
|
24
|
+
# Throw an error if any `data` value does not exactly match the given or inferred
|
|
25
|
+
# data type for that column. If set to `false`, values that do not match the data
|
|
26
|
+
# type are cast to that data type or, if casting is not possible, set to null
|
|
27
|
+
# instead.
|
|
23
28
|
#
|
|
24
29
|
# @return [DataFrame]
|
|
25
30
|
#
|
|
@@ -36,24 +41,47 @@ module Polars
|
|
|
36
41
|
# # │ 1 ┆ 3 │
|
|
37
42
|
# # │ 2 ┆ 4 │
|
|
38
43
|
# # └─────┴─────┘
|
|
39
|
-
def from_hash(data, schema: nil,
|
|
44
|
+
def from_hash(data, schema: nil, schema_overrides: nil, strict: true)
|
|
40
45
|
Utils.wrap_df(
|
|
41
|
-
|
|
46
|
+
Utils.hash_to_rbdf(
|
|
42
47
|
data,
|
|
43
|
-
schema: schema
|
|
48
|
+
schema: schema,
|
|
49
|
+
schema_overrides: schema_overrides,
|
|
50
|
+
strict: strict
|
|
44
51
|
)
|
|
45
52
|
)
|
|
46
53
|
end
|
|
47
54
|
|
|
48
55
|
# Construct a DataFrame from an array of hashes. This operation clones data.
|
|
49
56
|
#
|
|
50
|
-
# @param
|
|
51
|
-
# Array with hashes mapping column name to value
|
|
52
|
-
# @param infer_schema_length [Integer]
|
|
53
|
-
# How many hashes/rows to scan to determine the data types
|
|
54
|
-
# if set to `nil` all rows are scanned. This will be slow.
|
|
57
|
+
# @param data [Array]
|
|
58
|
+
# Array with hashes mapping column name to value
|
|
55
59
|
# @param schema [Object]
|
|
56
|
-
#
|
|
60
|
+
# The DataFrame schema may be declared in several ways:
|
|
61
|
+
#
|
|
62
|
+
# * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
|
|
63
|
+
# * As a list of column names; in this case types are automatically inferred.
|
|
64
|
+
# * As a list of (name,type) pairs; this is equivalent to the hash form.
|
|
65
|
+
#
|
|
66
|
+
# If a list of column names is supplied that does NOT match the names in the
|
|
67
|
+
# underlying data, the names given here will overwrite the actual fields in
|
|
68
|
+
# the order that they appear - however, in this case it is typically clearer
|
|
69
|
+
# to rename after loading the frame.
|
|
70
|
+
#
|
|
71
|
+
# If you want to drop some of the fields found in the input hashes, a
|
|
72
|
+
# *partial* schema can be declared, in which case omitted fields will not be
|
|
73
|
+
# loaded. Similarly, you can extend the loaded frame with empty columns by
|
|
74
|
+
# adding them to the schema.
|
|
75
|
+
# @param schema_overrides [Hash]
|
|
76
|
+
# Support override of inferred types for one or more columns.
|
|
77
|
+
# @param strict [Boolean]
|
|
78
|
+
# Throw an error if any `data` value does not exactly match the given or inferred
|
|
79
|
+
# data type for that column. If set to `false`, values that do not match the data
|
|
80
|
+
# type are cast to that data type or, if casting is not possible, set to null
|
|
81
|
+
# instead.
|
|
82
|
+
# @param infer_schema_length [Integer]
|
|
83
|
+
# The maximum number of rows to scan for schema inference.
|
|
84
|
+
# If set to `nil`, the full data may be scanned *(this is slow)*.
|
|
57
85
|
#
|
|
58
86
|
# @return [DataFrame]
|
|
59
87
|
#
|
|
@@ -72,37 +100,164 @@ module Polars
|
|
|
72
100
|
# # │ 3 ┆ 6 │
|
|
73
101
|
# # └─────┴─────┘
|
|
74
102
|
#
|
|
75
|
-
# @example
|
|
76
|
-
# Polars.from_hashes(data, schema: {"
|
|
103
|
+
# @example Declaring a partial `schema` will drop the omitted columns.
|
|
104
|
+
# Polars.from_hashes(data, schema: {"a" => Polars::Int32})
|
|
105
|
+
# # =>
|
|
106
|
+
# # shape: (3, 1)
|
|
107
|
+
# # ┌─────┐
|
|
108
|
+
# # │ a │
|
|
109
|
+
# # │ --- │
|
|
110
|
+
# # │ i32 │
|
|
111
|
+
# # ╞═════╡
|
|
112
|
+
# # │ 1 │
|
|
113
|
+
# # │ 2 │
|
|
114
|
+
# # │ 3 │
|
|
115
|
+
# # └─────┘
|
|
116
|
+
def from_hashes(
  data,
  schema: nil,
  schema_overrides: nil,
  strict: true,
  infer_schema_length: N_INFER_DEFAULT
)
  # With no rows there is nothing to infer column types from, so at least one
  # of `schema` / `schema_overrides` must carry that information.
  # Both default to nil, hence the safe navigation: calling `any?` on nil
  # would raise NoMethodError instead of the intended NoDataError.
  if !data.any? && !(schema&.any? || schema_overrides&.any?)
    msg = "no data, cannot infer schema"
    raise NoDataError, msg
  end

  # Construction itself is handled by the DataFrame constructor; all options
  # are forwarded unchanged.
  DataFrame.new(
    data,
    schema: schema,
    schema_overrides: schema_overrides,
    strict: strict,
    infer_schema_length: infer_schema_length
  )
end
|
|
136
|
+
|
|
137
|
+
# Construct a DataFrame from an array of arrays. This operation clones data.
|
|
138
|
+
#
|
|
139
|
+
# Note that this is slower than creating from columnar memory.
|
|
140
|
+
#
|
|
141
|
+
# @param data [Array]
|
|
142
|
+
# Two-dimensional data represented as an array of arrays.
|
|
143
|
+
# @param schema [Object]
|
|
144
|
+
# The DataFrame schema may be declared in several ways:
|
|
145
|
+
#
|
|
146
|
+
# * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
|
|
147
|
+
# * As a list of column names; in this case types are automatically inferred.
|
|
148
|
+
# * As a list of (name,type) pairs; this is equivalent to the hash form.
|
|
149
|
+
#
|
|
150
|
+
# If you supply a list of column names that does not match the names in the
|
|
151
|
+
# underlying data, the names given here will overwrite them. The number
|
|
152
|
+
# of names given in the schema should match the underlying data dimensions.
|
|
153
|
+
# @param schema_overrides [Hash]
|
|
154
|
+
# Support type specification or override of one or more columns; note that
|
|
155
|
+
# any dtypes inferred from the schema param will be overridden.
|
|
156
|
+
# @param strict [Boolean]
|
|
157
|
+
# Throw an error if any `data` value does not exactly match the given or inferred
|
|
158
|
+
# data type for that column. If set to `false`, values that do not match the data
|
|
159
|
+
# type are cast to that data type or, if casting is not possible, set to null
|
|
160
|
+
# instead.
|
|
161
|
+
# @param orient ['col', 'row']
|
|
162
|
+
# Whether to interpret two-dimensional data as columns or as rows. If nil,
|
|
163
|
+
# the orientation is inferred by matching the columns and data dimensions. If
|
|
164
|
+
# this does not yield conclusive results, column orientation is used.
|
|
165
|
+
# @param infer_schema_length [Integer]
|
|
166
|
+
# The maximum number of rows to scan for schema inference.
|
|
167
|
+
# If set to `nil`, the full data may be scanned *(this is slow)*.
|
|
168
|
+
#
|
|
169
|
+
# @return [DataFrame]
|
|
170
|
+
#
|
|
171
|
+
# @example
|
|
172
|
+
# data = [[1, 2, 3], [4, 5, 6]]
|
|
173
|
+
# Polars.from_records(data, schema: ["a", "b"])
|
|
77
174
|
# # =>
|
|
78
175
|
# # shape: (3, 2)
|
|
79
176
|
# # ┌─────┬─────┐
|
|
80
|
-
# # │
|
|
177
|
+
# # │ a ┆ b │
|
|
81
178
|
# # │ --- ┆ --- │
|
|
82
|
-
# # │
|
|
179
|
+
# # │ i64 ┆ i64 │
|
|
83
180
|
# # ╞═════╪═════╡
|
|
84
181
|
# # │ 1 ┆ 4 │
|
|
85
182
|
# # │ 2 ┆ 5 │
|
|
86
183
|
# # │ 3 ┆ 6 │
|
|
87
184
|
# # └─────┴─────┘
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
185
|
+
def from_records(
|
|
186
|
+
data,
|
|
187
|
+
schema: nil,
|
|
188
|
+
schema_overrides: nil,
|
|
189
|
+
strict: true,
|
|
190
|
+
orient: nil,
|
|
191
|
+
infer_schema_length: N_INFER_DEFAULT
|
|
192
|
+
)
|
|
193
|
+
if !data.is_a?(::Array)
|
|
194
|
+
msg = (
|
|
195
|
+
"expected data of type Array, got #{data.class.name.inspect}" +
|
|
196
|
+
"\n\nHint: Try passing your data to the DataFrame constructor instead," +
|
|
197
|
+
" e.g. `Polars::DataFrame.new(data)`."
|
|
198
|
+
)
|
|
199
|
+
raise TypeError, msg
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
Utils.wrap_df(
|
|
203
|
+
Utils.sequence_to_rbdf(
|
|
204
|
+
data,
|
|
205
|
+
schema: schema,
|
|
206
|
+
schema_overrides: schema_overrides,
|
|
207
|
+
strict: strict,
|
|
208
|
+
orient: orient,
|
|
209
|
+
infer_schema_length: infer_schema_length
|
|
210
|
+
)
|
|
211
|
+
)
|
|
212
|
+
end
|
|
104
213
|
|
|
105
|
-
#
|
|
106
|
-
#
|
|
214
|
+
# Construct a DataFrame from a Numo NArray. This operation clones data.
|
|
215
|
+
#
|
|
216
|
+
# Note that this is slower than creating from columnar memory.
|
|
217
|
+
#
|
|
218
|
+
# @param data [Numo::NArray]
|
|
219
|
+
# Two-dimensional data represented as a Numo NArray.
|
|
220
|
+
# @param schema [Object]
|
|
221
|
+
# The DataFrame schema may be declared in several ways:
|
|
222
|
+
#
|
|
223
|
+
# * As a hash of \\\\{name:type} pairs; if type is nil, it will be auto-inferred.
|
|
224
|
+
# * As a list of column names; in this case types are automatically inferred.
|
|
225
|
+
# * As a list of (name,type) pairs; this is equivalent to the hash form.
|
|
226
|
+
#
|
|
227
|
+
# If you supply a list of column names that does not match the names in the
|
|
228
|
+
# underlying data, the names given here will overwrite them. The number
|
|
229
|
+
# of names given in the schema should match the underlying data dimensions.
|
|
230
|
+
# @param schema_overrides [Hash]
|
|
231
|
+
# Support type specification or override of one or more columns; note that
|
|
232
|
+
# any dtypes inferred from the schema param will be overridden.
|
|
233
|
+
# @param orient ['col', 'row']
|
|
234
|
+
# Whether to interpret two-dimensional data as columns or as rows. If nil,
|
|
235
|
+
# the orientation is inferred by matching the columns and data dimensions. If
|
|
236
|
+
# this does not yield conclusive results, column orientation is used.
|
|
237
|
+
#
|
|
238
|
+
# @return [DataFrame]
|
|
239
|
+
#
|
|
240
|
+
# @example
|
|
241
|
+
# data = Numo::NArray.cast([[1, 2, 3], [4, 5, 6]])
|
|
242
|
+
# Polars.from_numo(data, schema: ["a", "b"], orient: "col")
|
|
243
|
+
# # =>
|
|
244
|
+
# # shape: (3, 2)
|
|
245
|
+
# # ┌─────┬─────┐
|
|
246
|
+
# # │ a ┆ b │
|
|
247
|
+
# # │ --- ┆ --- │
|
|
248
|
+
# # │ i64 ┆ i64 │
|
|
249
|
+
# # ╞═════╪═════╡
|
|
250
|
+
# # │ 1 ┆ 4 │
|
|
251
|
+
# # │ 2 ┆ 5 │
|
|
252
|
+
# # │ 3 ┆ 6 │
|
|
253
|
+
# # └─────┴─────┘
|
|
254
|
+
def from_numo(
|
|
255
|
+
data,
|
|
256
|
+
schema: nil,
|
|
257
|
+
schema_overrides: nil,
|
|
258
|
+
orient: nil
|
|
259
|
+
)
|
|
260
|
+
raise Todo
|
|
261
|
+
end
|
|
107
262
|
end
|
|
108
263
|
end
|