polars-df 0.1.3 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Cargo.lock +142 -11
- data/Cargo.toml +5 -0
- data/ext/polars/Cargo.toml +17 -1
- data/ext/polars/src/apply/dataframe.rs +292 -0
- data/ext/polars/src/apply/mod.rs +254 -0
- data/ext/polars/src/apply/series.rs +1173 -0
- data/ext/polars/src/conversion.rs +180 -5
- data/ext/polars/src/dataframe.rs +146 -1
- data/ext/polars/src/error.rs +12 -0
- data/ext/polars/src/lazy/apply.rs +34 -2
- data/ext/polars/src/lazy/dataframe.rs +74 -3
- data/ext/polars/src/lazy/dsl.rs +136 -0
- data/ext/polars/src/lib.rs +199 -1
- data/ext/polars/src/list_construction.rs +100 -0
- data/ext/polars/src/series.rs +331 -0
- data/ext/polars/src/utils.rs +25 -0
- data/lib/polars/cat_name_space.rb +54 -0
- data/lib/polars/convert.rb +100 -0
- data/lib/polars/data_frame.rb +1558 -60
- data/lib/polars/date_time_expr.rb +2 -2
- data/lib/polars/date_time_name_space.rb +1484 -0
- data/lib/polars/dynamic_group_by.rb +49 -0
- data/lib/polars/expr.rb +4072 -107
- data/lib/polars/expr_dispatch.rb +8 -0
- data/lib/polars/functions.rb +192 -3
- data/lib/polars/group_by.rb +44 -3
- data/lib/polars/io.rb +20 -4
- data/lib/polars/lazy_frame.rb +800 -26
- data/lib/polars/lazy_functions.rb +687 -43
- data/lib/polars/lazy_group_by.rb +1 -0
- data/lib/polars/list_expr.rb +502 -5
- data/lib/polars/list_name_space.rb +346 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +934 -62
- data/lib/polars/string_expr.rb +189 -13
- data/lib/polars/string_name_space.rb +690 -0
- data/lib/polars/struct_name_space.rb +64 -0
- data/lib/polars/utils.rb +44 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +14 -1
- metadata +15 -3
data/lib/polars/expr_dispatch.rb
CHANGED
@@ -3,11 +3,19 @@ module Polars
|
|
3
3
|
module ExprDispatch
|
4
4
|
private
|
5
5
|
|
6
|
+
def self.included(base)
|
7
|
+
base.attr_accessor :_s
|
8
|
+
base.singleton_class.attr_accessor :_accessor
|
9
|
+
end
|
10
|
+
|
6
11
|
def method_missing(method, ...)
|
7
12
|
return super unless self.class.method_defined?(method)
|
8
13
|
|
14
|
+
namespace = self.class._accessor
|
15
|
+
|
9
16
|
s = Utils.wrap_s(_s)
|
10
17
|
expr = Utils.col(s.name)
|
18
|
+
expr = expr.send(namespace) if namespace
|
11
19
|
s.to_frame.select(expr.send(method, ...)).to_series
|
12
20
|
end
|
13
21
|
end
|
data/lib/polars/functions.rb
CHANGED
@@ -199,12 +199,201 @@ module Polars
|
|
199
199
|
dt_range
|
200
200
|
end
|
201
201
|
|
202
|
-
#
|
203
|
-
#
|
202
|
+
# Bin values into discrete values.
|
203
|
+
#
|
204
|
+
# @param s [Series]
|
205
|
+
# Series to bin.
|
206
|
+
# @param bins [Array]
|
207
|
+
# Bins to create.
|
208
|
+
# @param labels [Array]
|
209
|
+
# Labels to assign to the bins. If given, the length of labels must be
|
210
|
+
# `bins.length + 1`.
|
211
|
+
# @param break_point_label [String]
|
212
|
+
# Name given to the breakpoint column.
|
213
|
+
# @param category_label [String]
|
214
|
+
# Name given to the category column.
|
215
|
+
#
|
216
|
+
# @return [DataFrame]
|
217
|
+
#
|
218
|
+
# @note
|
219
|
+
# This functionality is experimental and may change without it being considered a
|
220
|
+
# breaking change.
|
221
|
+
#
|
222
|
+
# @example
|
223
|
+
# a = Polars::Series.new("a", 13.times.map { |i| (-30 + i * 5) / 10.0 })
|
224
|
+
# Polars.cut(a, [-1, 1])
|
225
|
+
# # =>
|
226
|
+
# # shape: (12, 3)
|
227
|
+
# # ┌──────┬─────────────┬──────────────┐
|
228
|
+
# # │ a ┆ break_point ┆ category │
|
229
|
+
# # │ --- ┆ --- ┆ --- │
|
230
|
+
# # │ f64 ┆ f64 ┆ cat │
|
231
|
+
# # ╞══════╪═════════════╪══════════════╡
|
232
|
+
# # │ -3.0 ┆ -1.0 ┆ (-inf, -1.0] │
|
233
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
234
|
+
# # │ -2.5 ┆ -1.0 ┆ (-inf, -1.0] │
|
235
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
236
|
+
# # │ -2.0 ┆ -1.0 ┆ (-inf, -1.0] │
|
237
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
238
|
+
# # │ -1.5 ┆ -1.0 ┆ (-inf, -1.0] │
|
239
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
240
|
+
# # │ ... ┆ ... ┆ ... │
|
241
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
242
|
+
# # │ 1.0 ┆ 1.0 ┆ (-1.0, 1.0] │
|
243
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
244
|
+
# # │ 1.5 ┆ inf ┆ (1.0, inf] │
|
245
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
246
|
+
# # │ 2.0 ┆ inf ┆ (1.0, inf] │
|
247
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
248
|
+
# # │ 2.5 ┆ inf ┆ (1.0, inf] │
|
249
|
+
# # └──────┴─────────────┴──────────────┘
|
250
|
+
# def cut(
|
251
|
+
# s,
|
252
|
+
# bins,
|
253
|
+
# labels: nil,
|
254
|
+
# break_point_label: "break_point",
|
255
|
+
# category_label: "category"
|
256
|
+
# )
|
257
|
+
# var_nm = s.name
|
204
258
|
|
205
|
-
#
|
259
|
+
# cuts_df = DataFrame.new(
|
260
|
+
# [
|
261
|
+
# Series.new(
|
262
|
+
# break_point_label, bins, dtype: :f64
|
263
|
+
# ).extend_constant(Float::INFINITY, 1)
|
264
|
+
# ]
|
265
|
+
# )
|
266
|
+
|
267
|
+
# if labels
|
268
|
+
# if labels.length != bins.length + 1
|
269
|
+
# raise ArgumentError, "expected more labels"
|
270
|
+
# end
|
271
|
+
# cuts_df = cuts_df.with_column(Series.new(category_label, labels))
|
272
|
+
# else
|
273
|
+
# cuts_df = cuts_df.with_column(
|
274
|
+
# Polars.format(
|
275
|
+
# "({}, {}]",
|
276
|
+
# Polars.col(break_point_label).shift_and_fill(1, -Float::INFINITY),
|
277
|
+
# Polars.col(break_point_label)
|
278
|
+
# ).alias(category_label)
|
279
|
+
# )
|
280
|
+
# end
|
281
|
+
|
282
|
+
# cuts_df = cuts_df.with_column(Polars.col(category_label).cast(:cat))
|
283
|
+
|
284
|
+
# s.cast(:f64)
|
285
|
+
# .sort
|
286
|
+
# .to_frame
|
287
|
+
# .join_asof(
|
288
|
+
# cuts_df,
|
289
|
+
# left_on: var_nm,
|
290
|
+
# right_on: break_point_label,
|
291
|
+
# strategy: "forward"
|
292
|
+
# )
|
206
293
|
# end
|
207
294
|
|
295
|
+
# Align a sequence of frames using the unique values from one or more columns as a key.
|
296
|
+
#
|
297
|
+
# Frames that do not contain the given key values have rows injected (with nulls
|
298
|
+
# filling the non-key columns), and each resulting frame is sorted by the key.
|
299
|
+
#
|
300
|
+
# The original column order of input frames is not changed unless `select` is
|
301
|
+
# specified (in which case the final column order is determined from that).
|
302
|
+
#
|
303
|
+
# Note that this does not result in a joined frame - you receive the same number
|
304
|
+
# of frames back that you passed in, but each is now aligned by key and has
|
305
|
+
# the same number of rows.
|
306
|
+
#
|
307
|
+
# @param frames [Array]
|
308
|
+
# Sequence of DataFrames or LazyFrames.
|
309
|
+
# @param on [Object]
|
310
|
+
# One or more columns whose unique values will be used to align the frames.
|
311
|
+
# @param select [Object]
|
312
|
+
# Optional post-alignment column select to constrain and/or order
|
313
|
+
# the columns returned from the newly aligned frames.
|
314
|
+
# @param reverse [Object]
|
315
|
+
# Sort the alignment column values in descending order; can be a single
|
316
|
+
# boolean or a list of booleans associated with each column in `on`.
|
317
|
+
#
|
318
|
+
# @return [Array]
|
319
|
+
#
|
320
|
+
# @example
|
321
|
+
# df1 = Polars::DataFrame.new(
|
322
|
+
# {
|
323
|
+
# "dt" => [Date.new(2022, 9, 1), Date.new(2022, 9, 2), Date.new(2022, 9, 3)],
|
324
|
+
# "x" => [3.5, 4.0, 1.0],
|
325
|
+
# "y" => [10.0, 2.5, 1.5]
|
326
|
+
# }
|
327
|
+
# )
|
328
|
+
# df2 = Polars::DataFrame.new(
|
329
|
+
# {
|
330
|
+
# "dt" => [Date.new(2022, 9, 2), Date.new(2022, 9, 3), Date.new(2022, 9, 1)],
|
331
|
+
# "x" => [8.0, 1.0, 3.5],
|
332
|
+
# "y" => [1.5, 12.0, 5.0]
|
333
|
+
# }
|
334
|
+
# )
|
335
|
+
# df3 = Polars::DataFrame.new(
|
336
|
+
# {
|
337
|
+
# "dt" => [Date.new(2022, 9, 3), Date.new(2022, 9, 2)],
|
338
|
+
# "x" => [2.0, 5.0],
|
339
|
+
# "y" => [2.5, 2.0]
|
340
|
+
# }
|
341
|
+
# )
|
342
|
+
# af1, af2, af3 = Polars.align_frames(
|
343
|
+
# df1, df2, df3, on: "dt", select: ["x", "y"]
|
344
|
+
# )
|
345
|
+
# (af1 * af2 * af3).fill_null(0).select(Polars.sum(Polars.col("*")).alias("dot"))
|
346
|
+
# # =>
|
347
|
+
# # shape: (3, 1)
|
348
|
+
# # ┌───────┐
|
349
|
+
# # │ dot │
|
350
|
+
# # │ --- │
|
351
|
+
# # │ f64 │
|
352
|
+
# # ╞═══════╡
|
353
|
+
# # │ 0.0 │
|
354
|
+
# # ├╌╌╌╌╌╌╌┤
|
355
|
+
# # │ 167.5 │
|
356
|
+
# # ├╌╌╌╌╌╌╌┤
|
357
|
+
# # │ 47.0 │
|
358
|
+
# # └───────┘
|
359
|
+
def align_frames(
|
360
|
+
*frames,
|
361
|
+
on:,
|
362
|
+
select: nil,
|
363
|
+
reverse: false
|
364
|
+
)
|
365
|
+
if frames.empty?
|
366
|
+
return []
|
367
|
+
elsif frames.map(&:class).uniq.length != 1
|
368
|
+
raise TypeError, "Input frames must be of a consistent type (all LazyFrame or all DataFrame)"
|
369
|
+
end
|
370
|
+
|
371
|
+
# establish the superset of all "on" column values, sort, and cache
|
372
|
+
eager = frames[0].is_a?(DataFrame)
|
373
|
+
alignment_frame = (
|
374
|
+
concat(frames.map { |df| df.lazy.select(on) })
|
375
|
+
.unique(maintain_order: false)
|
376
|
+
.sort(on, reverse: reverse)
|
377
|
+
)
|
378
|
+
alignment_frame = (
|
379
|
+
eager ? alignment_frame.collect.lazy : alignment_frame.cache
|
380
|
+
)
|
381
|
+
# finally, align all frames
|
382
|
+
aligned_frames =
|
383
|
+
frames.map do |df|
|
384
|
+
alignment_frame.join(
|
385
|
+
df.lazy,
|
386
|
+
on: alignment_frame.columns,
|
387
|
+
how: "left"
|
388
|
+
).select(df.columns)
|
389
|
+
end
|
390
|
+
if !select.nil?
|
391
|
+
aligned_frames = aligned_frames.map { |df| df.select(select) }
|
392
|
+
end
|
393
|
+
|
394
|
+
eager ? aligned_frames.map(&:collect) : aligned_frames
|
395
|
+
end
|
396
|
+
|
208
397
|
# Return a new Series of given length and type, filled with ones.
|
209
398
|
#
|
210
399
|
# @param n [Integer]
|
data/lib/polars/group_by.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
module Polars
|
2
|
+
# Starts a new GroupBy operation.
|
2
3
|
class GroupBy
|
3
4
|
# @private
|
4
5
|
attr_accessor :_df, :_dataframe_class, :by, :maintain_order
|
@@ -11,7 +12,48 @@ module Polars
|
|
11
12
|
self.maintain_order = maintain_order
|
12
13
|
end
|
13
14
|
|
14
|
-
#
|
15
|
+
# Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
|
16
|
+
#
|
17
|
+
# Implementing logic using a Ruby function is almost always _significantly_
|
18
|
+
# slower and more memory intensive than implementing the same logic using
|
19
|
+
# the native expression API because:
|
20
|
+
|
21
|
+
# - The native expression engine runs in Rust; UDFs run in Ruby.
|
22
|
+
# - Use of Ruby UDFs forces the DataFrame to be materialized in memory.
|
23
|
+
# - Polars-native expressions can be parallelised (UDFs cannot).
|
24
|
+
# - Polars-native expressions can be logically optimised (UDFs cannot).
|
25
|
+
#
|
26
|
+
# Wherever possible you should strongly prefer the native expression API
|
27
|
+
# to achieve the best performance.
|
28
|
+
#
|
29
|
+
# @return [DataFrame]
|
30
|
+
#
|
31
|
+
# @example
|
32
|
+
# df = Polars::DataFrame.new(
|
33
|
+
# {
|
34
|
+
# "id" => [0, 1, 2, 3, 4],
|
35
|
+
# "color" => ["red", "green", "green", "red", "red"],
|
36
|
+
# "shape" => ["square", "triangle", "square", "triangle", "square"]
|
37
|
+
# }
|
38
|
+
# )
|
39
|
+
# df.groupby("color").apply { |group_df| group_df.sample(2) }
|
40
|
+
# # =>
|
41
|
+
# # shape: (4, 3)
|
42
|
+
# # ┌─────┬───────┬──────────┐
|
43
|
+
# # │ id ┆ color ┆ shape │
|
44
|
+
# # │ --- ┆ --- ┆ --- │
|
45
|
+
# # │ i64 ┆ str ┆ str │
|
46
|
+
# # ╞═════╪═══════╪══════════╡
|
47
|
+
# # │ 1 ┆ green ┆ triangle │
|
48
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
49
|
+
# # │ 2 ┆ green ┆ square │
|
50
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
51
|
+
# # │ 4 ┆ red ┆ square │
|
52
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
53
|
+
# # │ 3 ┆ red ┆ triangle │
|
54
|
+
# # └─────┴───────┴──────────┘
|
55
|
+
# def apply(&f)
|
56
|
+
# _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
|
15
57
|
# end
|
16
58
|
|
17
59
|
# Use multiple aggregations on columns.
|
@@ -181,8 +223,7 @@ module Polars
|
|
181
223
|
_dataframe_class._from_rbdf(df._df)
|
182
224
|
end
|
183
225
|
|
184
|
-
#
|
185
|
-
# end
|
226
|
+
# pivot is deprecated
|
186
227
|
|
187
228
|
# Aggregate the first values in the group.
|
188
229
|
#
|
data/lib/polars/io.rb
CHANGED
@@ -59,7 +59,7 @@ module Polars
|
|
59
59
|
# Lossy means that invalid utf8 values are replaced with `�`
|
60
60
|
# characters. When using other encodings than `utf8` or
|
61
61
|
# `utf8-lossy`, the input is first decoded in memory with
|
62
|
-
#
|
62
|
+
# Ruby.
|
63
63
|
# @param low_memory [Boolean]
|
64
64
|
# Reduce memory usage at expense of performance.
|
65
65
|
# @param rechunk [Boolean]
|
@@ -183,7 +183,7 @@ module Polars
|
|
183
183
|
# @param has_header [Boolean]
|
184
184
|
# Indicate if the first row of dataset is a header or not.
|
185
185
|
# If set to false, column names will be autogenerated in the
|
186
|
-
# following format:
|
186
|
+
# following format: `column_x`, with `x` being an
|
187
187
|
# enumeration over every column in the dataset starting at 1.
|
188
188
|
# @param sep [String]
|
189
189
|
# Single byte character to use as delimiter in the file.
|
@@ -451,8 +451,24 @@ module Polars
|
|
451
451
|
)
|
452
452
|
end
|
453
453
|
|
454
|
-
#
|
455
|
-
#
|
454
|
+
# Read into a DataFrame from Apache Avro format.
|
455
|
+
#
|
456
|
+
# @param file [Object]
|
457
|
+
# Path to a file or a file-like object.
|
458
|
+
# @param columns [Object]
|
459
|
+
# Columns to select. Accepts a list of column indices (starting at zero) or a list
|
460
|
+
# of column names.
|
461
|
+
# @param n_rows [Integer]
|
462
|
+
# Stop reading from Apache Avro file after reading `n_rows`.
|
463
|
+
#
|
464
|
+
# @return [DataFrame]
|
465
|
+
def read_avro(file, columns: nil, n_rows: nil)
|
466
|
+
if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
|
467
|
+
file = Utils.format_path(file)
|
468
|
+
end
|
469
|
+
|
470
|
+
DataFrame._read_avro(file, n_rows: n_rows, columns: columns)
|
471
|
+
end
|
456
472
|
|
457
473
|
# Read into a DataFrame from Arrow IPC (Feather v2) file.
|
458
474
|
#
|