polars-df 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +9 -0
- data/Cargo.lock +74 -3
- data/Cargo.toml +3 -0
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +18 -1
- data/ext/polars/src/conversion.rs +115 -2
- data/ext/polars/src/dataframe.rs +228 -11
- data/ext/polars/src/error.rs +4 -0
- data/ext/polars/src/lazy/dataframe.rs +5 -5
- data/ext/polars/src/lazy/dsl.rs +157 -2
- data/ext/polars/src/lib.rs +185 -10
- data/ext/polars/src/list_construction.rs +100 -0
- data/ext/polars/src/series.rs +217 -29
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +1 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/cat_name_space.rb +54 -0
- data/lib/polars/data_frame.rb +2384 -140
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/date_time_name_space.rb +1484 -0
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +4374 -53
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +518 -0
- data/lib/polars/io.rb +421 -2
- data/lib/polars/lazy_frame.rb +1267 -69
- data/lib/polars/lazy_functions.rb +412 -24
- data/lib/polars/lazy_group_by.rb +80 -0
- data/lib/polars/list_expr.rb +507 -5
- data/lib/polars/list_name_space.rb +346 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2256 -242
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +847 -10
- data/lib/polars/string_name_space.rb +690 -0
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/struct_name_space.rb +64 -0
- data/lib/polars/utils.rb +71 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +12 -10
- metadata +15 -2
@@ -0,0 +1,22 @@
|
|
1
|
+
module Polars
  # Routes calls that land in `method_missing` to the expression API: a column
  # expression is built for the wrapped series, optionally narrowed to the
  # namespace named by the class-level `_accessor`, and the same-named
  # expression method is evaluated against a frame built from that series.
  #
  # @private
  module ExprDispatch
    private

    # Include hook: adds an `_s` accessor to instances of the including class
    # and an `_accessor` accessor to the including class itself.
    def self.included(base)
      base.attr_accessor :_s
      base.singleton_class.attr_accessor :_accessor
    end

    # Only names that the including class itself defines are dispatched
    # (presumably reached via `super` from a stub method — confirm against
    # the including classes); anything else gets the default behaviour.
    def method_missing(method, ...)
      unless self.class.method_defined?(method)
        return super
      end

      accessor = self.class._accessor

      series = Utils.wrap_s(_s)
      target = Utils.col(series.name)
      # Step into the expression namespace when the class declares one.
      target = target.send(accessor) if accessor

      frame = series.to_frame
      frame.select(target.send(method, ...)).to_series
    end
  end
end
|
data/lib/polars/functions.rb
CHANGED
@@ -1,5 +1,51 @@
|
|
1
1
|
module Polars
|
2
2
|
module Functions
|
3
|
+
# Convert categorical variables into dummy/indicator variables.
|
4
|
+
#
|
5
|
+
# @param df [DataFrame]
|
6
|
+
# DataFrame to convert.
|
7
|
+
# @param columns [Array, nil]
|
8
|
+
# A subset of columns to convert to dummy variables. `nil` means
|
9
|
+
# "all columns".
|
10
|
+
#
|
11
|
+
# @return [DataFrame]
|
12
|
+
def get_dummies(df, columns: nil)
  # Pure delegation: the frame does the conversion itself; only the column
  # selection is forwarded.
  dummy_options = {columns: columns}
  df.to_dummies(**dummy_options)
end
|
15
|
+
|
16
|
+
# Aggregate multiple DataFrames/Series into a single DataFrame/Series.
|
17
|
+
#
|
18
|
+
# @param items [Object]
|
19
|
+
# DataFrames/Series/LazyFrames to concatenate.
|
20
|
+
# @param rechunk [Boolean]
|
21
|
+
# Make sure that all data is in contiguous memory.
|
22
|
+
# @param how ["vertical", "diagonal", "horizontal"]
|
23
|
+
# Lazy only supports the 'vertical' strategy.
|
24
|
+
#
|
25
|
+
# - Vertical: applies multiple `vstack` operations.
|
26
|
+
# - Diagonal: finds a union between the column schemas and fills missing column values with null.
|
27
|
+
# - Horizontal: stacks Series horizontally and fills with nulls if the lengths don't match.
|
28
|
+
# @param parallel [Boolean]
|
29
|
+
# Only relevant for LazyFrames. This determines if the concatenated
|
30
|
+
# lazy computations may be executed in parallel.
|
31
|
+
#
|
32
|
+
# @return [Object]
|
33
|
+
#
|
34
|
+
# @example
|
35
|
+
# df1 = Polars::DataFrame.new({"a" => [1], "b" => [3]})
|
36
|
+
# df2 = Polars::DataFrame.new({"a" => [2], "b" => [4]})
|
37
|
+
# Polars.concat([df1, df2])
|
38
|
+
# # =>
|
39
|
+
# # shape: (2, 2)
|
40
|
+
# # ┌─────┬─────┐
|
41
|
+
# # │ a ┆ b │
|
42
|
+
# # │ --- ┆ --- │
|
43
|
+
# # │ i64 ┆ i64 │
|
44
|
+
# # ╞═════╪═════╡
|
45
|
+
# # │ 1 ┆ 3 │
|
46
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
47
|
+
# # │ 2 ┆ 4 │
|
48
|
+
# # └─────┴─────┘
|
3
49
|
def concat(items, rechunk: true, how: "vertical", parallel: true)
|
4
50
|
if items.empty?
|
5
51
|
raise ArgumentError, "cannot concat empty list"
|
@@ -41,5 +87,178 @@ module Polars
|
|
41
87
|
out
|
42
88
|
end
|
43
89
|
end
|
90
|
+
|
91
|
+
# Create a range of type `Datetime` (or `Date`).
|
92
|
+
#
|
93
|
+
# @param low [Object]
|
94
|
+
# Lower bound of the date range.
|
95
|
+
# @param high [Object]
|
96
|
+
# Upper bound of the date range.
|
97
|
+
# @param interval [Object]
|
98
|
+
# Interval periods. It can be a polars duration string, such as `3d12h4m25s`
|
99
|
+
# representing 3 days, 12 hours, 4 minutes, and 25 seconds.
|
100
|
+
# @param lazy [Boolean]
|
101
|
+
# Return an expression.
|
102
|
+
# @param closed ["both", "left", "right", "none"]
|
103
|
+
# Define whether the temporal window interval is closed or not.
|
104
|
+
# @param name [String]
|
105
|
+
# Name of the output Series.
|
106
|
+
# @param time_unit [nil, "ns", "us", "ms"]
|
107
|
+
# Set the time unit.
|
108
|
+
# @param time_zone [String]
|
109
|
+
# Optional timezone
|
110
|
+
#
|
111
|
+
# @return [Object]
|
112
|
+
#
|
113
|
+
# @note
|
114
|
+
# If both `low` and `high` are passed as date types (not datetime), and the
|
115
|
+
# interval granularity is no finer than 1d, the returned range is also of
|
116
|
+
# type date. All other permutations return a datetime Series.
|
117
|
+
#
|
118
|
+
# @example Using polars duration string to specify the interval
|
119
|
+
# Polars.date_range(Date.new(2022, 1, 1), Date.new(2022, 3, 1), "1mo", name: "drange")
|
120
|
+
# # =>
|
121
|
+
# # shape: (3,)
|
122
|
+
# # Series: 'drange' [date]
|
123
|
+
# # [
|
124
|
+
# # 2022-01-01
|
125
|
+
# # 2022-02-01
|
126
|
+
# # 2022-03-01
|
127
|
+
# # ]
|
128
|
+
#
|
129
|
+
# @example Using a compound duration string to specify the interval:
|
130
|
+
# Polars.date_range(
|
131
|
+
# DateTime.new(1985, 1, 1),
|
132
|
+
# DateTime.new(1985, 1, 10),
|
133
|
+
# "1d12h",
|
134
|
+
# time_unit: "ms"
|
135
|
+
# )
|
136
|
+
# # =>
|
137
|
+
# # shape: (7,)
|
138
|
+
# # Series: '' [datetime[ms]]
|
139
|
+
# # [
|
140
|
+
# # 1985-01-01 00:00:00
|
141
|
+
# # 1985-01-02 12:00:00
|
142
|
+
# # 1985-01-04 00:00:00
|
143
|
+
# # 1985-01-05 12:00:00
|
144
|
+
# # 1985-01-07 00:00:00
|
145
|
+
# # 1985-01-08 12:00:00
|
146
|
+
# # 1985-01-10 00:00:00
|
147
|
+
# # ]
|
148
|
+
def date_range(
  low,
  high,
  interval,
  lazy: false,
  closed: "both",
  name: nil,
  time_unit: nil,
  time_zone: nil
)
  # ActiveSupport durations are not handled yet.
  raise Todo if defined?(ActiveSupport::Duration) && interval.is_a?(ActiveSupport::Duration)

  # Work on the string form with embedded spaces removed ("1d 12h" -> "1d12h").
  interval = interval.to_s.gsub(" ", "")

  # Lazy path: an expression bound (or `lazy: true`) produces an expression
  # instead of a materialised Series.
  if low.is_a?(Expr) || high.is_a?(Expr) || lazy
    lower_expr = Utils.expr_to_lit_or_expr(low, str_to_lit: true)
    upper_expr = Utils.expr_to_lit_or_expr(high, str_to_lit: true)
    lazy_range = _rb_date_range_lazy(lower_expr, upper_expr, interval, closed, name, time_zone)
    return Utils.wrap_expr(lazy_range)
  end

  low, low_is_date = _ensure_datetime(low)
  high, high_is_date = _ensure_datetime(high)

  # An explicit time unit wins; otherwise "ns" only when the interval mentions it.
  tu =
    if time_unit.nil?
      interval.include?("ns") ? "ns" : "us"
    else
      time_unit
    end

  start = Utils._datetime_to_pl_timestamp(low, tu)
  stop = Utils._datetime_to_pl_timestamp(high, tu)
  name = "" if name.nil?

  dt_range = Utils.wrap_s(
    _rb_date_range(start, stop, interval, closed, name, tu, time_zone)
  )

  # Both bounds were dates and the interval does not end in an "h"/"m"/"s"
  # suffix: hand back a date-typed range rather than a datetime one.
  keep_as_date = low_is_date && high_is_date &&
    %w[h m s].none? { |suffix| _interval_granularity(interval).end_with?(suffix) }

  keep_as_date ? dt_range.cast(Date) : dt_range
end
|
201
|
+
|
202
|
+
# def cut
|
203
|
+
# end
|
204
|
+
|
205
|
+
# def align_frames
|
206
|
+
# end
|
207
|
+
|
208
|
+
# Return a new Series of given length and type, filled with ones.
|
209
|
+
#
|
210
|
+
# @param n [Integer]
|
211
|
+
# Number of elements in the `Series`
|
212
|
+
# @param dtype [Symbol]
|
213
|
+
# DataType of the elements, defaults to `:f64`
|
214
|
+
#
|
215
|
+
# @return [Series]
|
216
|
+
#
|
217
|
+
# @note
|
218
|
+
# In the lazy API you should probably not use this, but use `lit(1)`
|
219
|
+
# instead.
|
220
|
+
def ones(n, dtype: nil)
  # Start from a single float 1.0, retype it only when a dtype was requested,
  # then repeat that first element n times.
  seed = Series.new([1.0])
  seed = seed.cast(dtype) if dtype
  seed.new_from_index(0, n)
end
|
227
|
+
|
228
|
+
# Return a new Series of given length and type, filled with zeros.
|
229
|
+
#
|
230
|
+
# @param n [Integer]
|
231
|
+
# Number of elements in the `Series`
|
232
|
+
# @param dtype [Symbol]
|
233
|
+
# DataType of the elements, defaults to `:f64`
|
234
|
+
#
|
235
|
+
# @return [Series]
|
236
|
+
#
|
237
|
+
# @note
|
238
|
+
# In the lazy API you should probably not use this, but use `lit(0)`
|
239
|
+
# instead.
|
240
|
+
def zeros(n, dtype: nil)
  # Start from a single float 0.0, retype it only when a dtype was requested,
  # then repeat that first element n times.
  seed = Series.new([0.0])
  seed = seed.cast(dtype) if dtype
  seed.new_from_index(0, n)
end
|
247
|
+
|
248
|
+
private
|
249
|
+
|
250
|
+
# Normalise a range bound to a DateTime.
#
# @param value [Object]
#   A `DateTime`, a `Time`, or any object responding to `year`, `month`
#   and `day` (e.g. a `Date`).
#
# @return [Array]
#   `[datetime, is_date_type]` — the bound as a `DateTime`, and `true` only
#   when the input carried no time of day (it was rebuilt from year/month/day).
def _ensure_datetime(value)
  # Already a DateTime: pass it through untouched.
  return [value, false] if value.is_a?(DateTime)

  # A Time has a time of day; convert it losslessly instead of truncating it
  # to midnight and mislabelling it as a date.
  return [value.to_datetime, false] if value.is_a?(::Time)

  [DateTime.new(value.year, value.month, value.day), true]
end
|
258
|
+
|
259
|
+
# Extract the unit of the last component of a duration string,
# e.g. "1d12h" -> "h", "1mo" -> "mo", "10ns" -> "ns".
#
# @param interval [String]
#   Duration string such as `3d12h4m25s`.
#
# @return [String]
#   The trailing run of letters, or "" when the string does not end in one.
def _interval_granularity(interval)
  interval.to_s[/[[:alpha:]]+\z/].to_s
end
|
44
263
|
end
|
45
264
|
end
|