polars-df 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Cargo.lock +468 -538
- data/Cargo.toml +1 -0
- data/README.md +8 -7
- data/ext/polars/Cargo.toml +17 -10
- data/ext/polars/src/batched_csv.rs +26 -26
- data/ext/polars/src/conversion.rs +121 -93
- data/ext/polars/src/dataframe.rs +116 -71
- data/ext/polars/src/error.rs +0 -5
- data/ext/polars/src/expr/binary.rs +18 -6
- data/ext/polars/src/expr/datetime.rs +10 -12
- data/ext/polars/src/expr/general.rs +68 -284
- data/ext/polars/src/expr/list.rs +17 -9
- data/ext/polars/src/{expr.rs → expr/mod.rs} +4 -2
- data/ext/polars/src/expr/name.rs +44 -0
- data/ext/polars/src/expr/rolling.rs +196 -0
- data/ext/polars/src/expr/string.rs +85 -58
- data/ext/polars/src/file.rs +3 -3
- data/ext/polars/src/functions/aggregation.rs +35 -0
- data/ext/polars/src/functions/eager.rs +7 -31
- data/ext/polars/src/functions/io.rs +10 -10
- data/ext/polars/src/functions/lazy.rs +66 -41
- data/ext/polars/src/functions/meta.rs +30 -0
- data/ext/polars/src/functions/misc.rs +8 -0
- data/ext/polars/src/functions/mod.rs +5 -0
- data/ext/polars/src/functions/random.rs +6 -0
- data/ext/polars/src/functions/range.rs +46 -0
- data/ext/polars/src/functions/string_cache.rs +11 -0
- data/ext/polars/src/functions/whenthen.rs +7 -7
- data/ext/polars/src/lazyframe.rs +47 -42
- data/ext/polars/src/lib.rs +156 -72
- data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
- data/ext/polars/src/{apply → map}/mod.rs +3 -3
- data/ext/polars/src/{apply → map}/series.rs +12 -16
- data/ext/polars/src/object.rs +1 -1
- data/ext/polars/src/rb_modules.rs +22 -7
- data/ext/polars/src/series/construction.rs +4 -4
- data/ext/polars/src/series/export.rs +2 -2
- data/ext/polars/src/series/set_at_idx.rs +33 -17
- data/ext/polars/src/series.rs +7 -27
- data/ext/polars/src/sql.rs +46 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +115 -82
- data/lib/polars/date_time_expr.rb +13 -18
- data/lib/polars/date_time_name_space.rb +5 -25
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +177 -94
- data/lib/polars/functions.rb +29 -37
- data/lib/polars/group_by.rb +38 -55
- data/lib/polars/io.rb +37 -2
- data/lib/polars/lazy_frame.rb +93 -66
- data/lib/polars/lazy_functions.rb +36 -48
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +12 -8
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +26 -13
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +114 -60
- data/lib/polars/string_name_space.rb +19 -4
- data/lib/polars/utils.rb +12 -0
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +3 -0
- metadata +18 -7
- /data/ext/polars/src/{apply → map}/lazy.rs +0 -0
data/lib/polars/functions.rb
CHANGED
@@ -19,8 +19,8 @@ module Polars
|
|
19
19
|
# DataFrames/Series/LazyFrames to concatenate.
|
20
20
|
# @param rechunk [Boolean]
|
21
21
|
# Make sure that all data is in contiguous memory.
|
22
|
-
# @param how ["vertical", "diagonal", "horizontal"]
|
23
|
-
#
|
22
|
+
# @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
|
23
|
+
# LazyFrames do not support the `horizontal` strategy.
|
24
24
|
#
|
25
25
|
# - Vertical: applies multiple `vstack` operations.
|
26
26
|
# - Diagonal: finds a union between the column schemas and fills missing column values with null.
|
@@ -55,18 +55,21 @@ module Polars
|
|
55
55
|
if how == "vertical"
|
56
56
|
out = Utils.wrap_df(_concat_df(items))
|
57
57
|
elsif how == "diagonal"
|
58
|
-
out = Utils.wrap_df(
|
58
|
+
out = Utils.wrap_df(_concat_df_diagonal(items))
|
59
59
|
elsif how == "horizontal"
|
60
|
-
out = Utils.wrap_df(
|
60
|
+
out = Utils.wrap_df(_concat_df_horizontal(items))
|
61
61
|
else
|
62
62
|
raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
|
63
63
|
end
|
64
64
|
elsif first.is_a?(LazyFrame)
|
65
65
|
if how == "vertical"
|
66
|
-
|
67
|
-
|
66
|
+
return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
|
67
|
+
elsif how == "vertical_relaxed"
|
68
|
+
return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
|
69
|
+
elsif how == "diagonal"
|
70
|
+
return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
|
68
71
|
else
|
69
|
-
raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
|
72
|
+
raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
|
70
73
|
end
|
71
74
|
elsif first.is_a?(Series)
|
72
75
|
# TODO
|
@@ -89,9 +92,9 @@ module Polars
|
|
89
92
|
|
90
93
|
# Create a range of type `Datetime` (or `Date`).
|
91
94
|
#
|
92
|
-
# @param
|
95
|
+
# @param start [Object]
|
93
96
|
# Lower bound of the date range.
|
94
|
-
# @param
|
97
|
+
# @param stop [Object]
|
95
98
|
# Upper bound of the date range.
|
96
99
|
# @param interval [Object]
|
97
100
|
# Interval periods. It can be a polars duration string, such as `3d12h4m25s`
|
@@ -145,8 +148,8 @@ module Polars
|
|
145
148
|
# # 1985-01-10 00:00:00
|
146
149
|
# # ]
|
147
150
|
def date_range(
|
148
|
-
|
149
|
-
|
151
|
+
start,
|
152
|
+
stop,
|
150
153
|
interval,
|
151
154
|
lazy: false,
|
152
155
|
closed: "both",
|
@@ -163,39 +166,28 @@ module Polars
|
|
163
166
|
end
|
164
167
|
end
|
165
168
|
|
166
|
-
if
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
169
|
+
if time_unit.nil?
|
170
|
+
if interval.include?("ns")
|
171
|
+
time_unit = "ns"
|
172
|
+
else
|
173
|
+
time_unit = "us"
|
174
|
+
end
|
172
175
|
end
|
173
176
|
|
174
|
-
|
175
|
-
|
177
|
+
start_rbexpr = Utils.parse_as_expression(start)
|
178
|
+
stop_rbexpr = Utils.parse_as_expression(stop)
|
176
179
|
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
tu = "ns"
|
181
|
-
else
|
182
|
-
tu = "us"
|
183
|
-
end
|
180
|
+
result = Utils.wrap_expr(
|
181
|
+
_rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
|
182
|
+
)
|
184
183
|
|
185
|
-
|
186
|
-
stop = Utils._datetime_to_pl_timestamp(high, tu)
|
187
|
-
if name.nil?
|
188
|
-
name = ""
|
189
|
-
end
|
184
|
+
result = result.alias(name.to_s)
|
190
185
|
|
191
|
-
|
192
|
-
|
193
|
-
)
|
194
|
-
if low_is_date && high_is_date && !["h", "m", "s"].any? { |v| _interval_granularity(interval).end_with?(v) }
|
195
|
-
dt_range = dt_range.cast(Date)
|
186
|
+
if !lazy
|
187
|
+
return select(result).to_series
|
196
188
|
end
|
197
189
|
|
198
|
-
|
190
|
+
result
|
199
191
|
end
|
200
192
|
|
201
193
|
# Bin values into discrete values.
|
data/lib/polars/group_by.rb
CHANGED
@@ -2,23 +2,19 @@ module Polars
|
|
2
2
|
# Starts a new GroupBy operation.
|
3
3
|
class GroupBy
|
4
4
|
# @private
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
self._df = df
|
10
|
-
self._dataframe_class = dataframe_class
|
11
|
-
self.by = by
|
12
|
-
self.maintain_order = maintain_order
|
5
|
+
def initialize(df, by, maintain_order: false)
|
6
|
+
@df = df
|
7
|
+
@by = by
|
8
|
+
@maintain_order = maintain_order
|
13
9
|
end
|
14
10
|
|
15
|
-
# Allows iteration over the groups of the
|
11
|
+
# Allows iteration over the groups of the group by operation.
|
16
12
|
#
|
17
13
|
# @return [Object]
|
18
14
|
#
|
19
15
|
# @example
|
20
16
|
# df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
|
21
|
-
# df.
|
17
|
+
# df.group_by("foo", maintain_order: true).each.to_h
|
22
18
|
# # =>
|
23
19
|
# # {"a"=>shape: (2, 2)
|
24
20
|
# # ┌─────┬─────┐
|
@@ -41,10 +37,9 @@ module Polars
|
|
41
37
|
|
42
38
|
temp_col = "__POLARS_GB_GROUP_INDICES"
|
43
39
|
groups_df =
|
44
|
-
|
45
|
-
.lazy
|
40
|
+
@df.lazy
|
46
41
|
.with_row_count(name: temp_col)
|
47
|
-
.
|
42
|
+
.group_by(@by, maintain_order: @maintain_order)
|
48
43
|
.agg(Polars.col(temp_col))
|
49
44
|
.collect(no_optimization: true)
|
50
45
|
|
@@ -52,7 +47,7 @@ module Polars
|
|
52
47
|
|
53
48
|
# When grouping by a single column, group name is a single value
|
54
49
|
# When grouping by multiple columns, group name is a tuple of values
|
55
|
-
if by.is_a?(String) || by.is_a?(Expr)
|
50
|
+
if @by.is_a?(String) || @by.is_a?(Expr)
|
56
51
|
_group_names = group_names.to_series.each
|
57
52
|
else
|
58
53
|
_group_names = group_names.iter_rows
|
@@ -62,10 +57,8 @@ module Polars
|
|
62
57
|
_current_index = 0
|
63
58
|
|
64
59
|
while _current_index < _group_indices.length
|
65
|
-
df = _dataframe_class._from_rbdf(_df)
|
66
|
-
|
67
60
|
group_name = _group_names.next
|
68
|
-
group_data = df[_group_indices[_current_index]]
|
61
|
+
group_data = @df[_group_indices[_current_index]]
|
69
62
|
_current_index += 1
|
70
63
|
|
71
64
|
yield group_name, group_data
|
@@ -96,7 +89,7 @@ module Polars
|
|
96
89
|
# "shape" => ["square", "triangle", "square", "triangle", "square"]
|
97
90
|
# }
|
98
91
|
# )
|
99
|
-
# df.
|
92
|
+
# df.group_by("color").apply { |group_df| group_df.sample(2) }
|
100
93
|
# # =>
|
101
94
|
# # shape: (4, 3)
|
102
95
|
# # ┌─────┬───────┬──────────┐
|
@@ -110,7 +103,7 @@ module Polars
|
|
110
103
|
# # │ 3 ┆ red ┆ triangle │
|
111
104
|
# # └─────┴───────┴──────────┘
|
112
105
|
# def apply(&f)
|
113
|
-
# _dataframe_class._from_rbdf(_df.
|
106
|
+
# _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
|
114
107
|
# end
|
115
108
|
|
116
109
|
# Use multiple aggregations on columns.
|
@@ -126,7 +119,7 @@ module Polars
|
|
126
119
|
# df = Polars::DataFrame.new(
|
127
120
|
# {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
|
128
121
|
# )
|
129
|
-
# df.
|
122
|
+
# df.group_by("foo", maintain_order: true).agg(
|
130
123
|
# [
|
131
124
|
# Polars.sum("bar").suffix("_sum"),
|
132
125
|
# Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
|
@@ -143,12 +136,10 @@ module Polars
|
|
143
136
|
# # │ two ┆ 6 ┆ 5 │
|
144
137
|
# # └─────┴─────────┴──────────────┘
|
145
138
|
def agg(aggs)
|
146
|
-
df
|
147
|
-
.
|
148
|
-
.groupby(by, maintain_order: maintain_order)
|
139
|
+
@df.lazy
|
140
|
+
.group_by(@by, maintain_order: @maintain_order)
|
149
141
|
.agg(aggs)
|
150
|
-
.collect(no_optimization: true
|
151
|
-
_dataframe_class._from_rbdf(df._df)
|
142
|
+
.collect(no_optimization: true)
|
152
143
|
end
|
153
144
|
|
154
145
|
# Get the first `n` rows of each group.
|
@@ -181,7 +172,7 @@ module Polars
|
|
181
172
|
# # └─────────┴─────┘
|
182
173
|
#
|
183
174
|
# @example
|
184
|
-
# df.
|
175
|
+
# df.group_by("letters").head(2).sort("letters")
|
185
176
|
# # =>
|
186
177
|
# # shape: (5, 2)
|
187
178
|
# # ┌─────────┬─────┐
|
@@ -196,14 +187,10 @@ module Polars
|
|
196
187
|
# # │ c ┆ 2 │
|
197
188
|
# # └─────────┴─────┘
|
198
189
|
def head(n = 5)
|
199
|
-
df
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
.head(n)
|
204
|
-
.collect(no_optimization: true, string_cache: false)
|
205
|
-
)
|
206
|
-
_dataframe_class._from_rbdf(df._df)
|
190
|
+
@df.lazy
|
191
|
+
.group_by(@by, maintain_order: @maintain_order)
|
192
|
+
.head(n)
|
193
|
+
.collect(no_optimization: true)
|
207
194
|
end
|
208
195
|
|
209
196
|
# Get the last `n` rows of each group.
|
@@ -236,7 +223,7 @@ module Polars
|
|
236
223
|
# # └─────────┴─────┘
|
237
224
|
#
|
238
225
|
# @example
|
239
|
-
# df.
|
226
|
+
# df.group_by("letters").tail(2).sort("letters")
|
240
227
|
# # =>
|
241
228
|
# # shape: (5, 2)
|
242
229
|
# # ┌─────────┬─────┐
|
@@ -251,14 +238,10 @@ module Polars
|
|
251
238
|
# # │ c ┆ 4 │
|
252
239
|
# # └─────────┴─────┘
|
253
240
|
def tail(n = 5)
|
254
|
-
df
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
.tail(n)
|
259
|
-
.collect(no_optimization: true, string_cache: false)
|
260
|
-
)
|
261
|
-
_dataframe_class._from_rbdf(df._df)
|
241
|
+
@df.lazy
|
242
|
+
.group_by(@by, maintain_order: @maintain_order)
|
243
|
+
.tail(n)
|
244
|
+
.collect(no_optimization: true)
|
262
245
|
end
|
263
246
|
|
264
247
|
# Aggregate the first values in the group.
|
@@ -274,7 +257,7 @@ module Polars
|
|
274
257
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
275
258
|
# }
|
276
259
|
# )
|
277
|
-
# df.
|
260
|
+
# df.group_by("d", maintain_order: true).first
|
278
261
|
# # =>
|
279
262
|
# # shape: (3, 4)
|
280
263
|
# # ┌────────┬─────┬──────┬───────┐
|
@@ -303,7 +286,7 @@ module Polars
|
|
303
286
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
304
287
|
# }
|
305
288
|
# )
|
306
|
-
# df.
|
289
|
+
# df.group_by("d", maintain_order: true).last
|
307
290
|
# # =>
|
308
291
|
# # shape: (3, 4)
|
309
292
|
# # ┌────────┬─────┬──────┬───────┐
|
@@ -332,7 +315,7 @@ module Polars
|
|
332
315
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
333
316
|
# }
|
334
317
|
# )
|
335
|
-
# df.
|
318
|
+
# df.group_by("d", maintain_order: true).sum
|
336
319
|
# # =>
|
337
320
|
# # shape: (3, 4)
|
338
321
|
# # ┌────────┬─────┬──────┬─────┐
|
@@ -361,7 +344,7 @@ module Polars
|
|
361
344
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
|
362
345
|
# }
|
363
346
|
# )
|
364
|
-
# df.
|
347
|
+
# df.group_by("d", maintain_order: true).min
|
365
348
|
# # =>
|
366
349
|
# # shape: (3, 4)
|
367
350
|
# # ┌────────┬─────┬──────┬───────┐
|
@@ -390,7 +373,7 @@ module Polars
|
|
390
373
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
391
374
|
# }
|
392
375
|
# )
|
393
|
-
# df.
|
376
|
+
# df.group_by("d", maintain_order: true).max
|
394
377
|
# # =>
|
395
378
|
# # shape: (3, 4)
|
396
379
|
# # ┌────────┬─────┬──────┬──────┐
|
@@ -419,7 +402,7 @@ module Polars
|
|
419
402
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
420
403
|
# }
|
421
404
|
# )
|
422
|
-
# df.
|
405
|
+
# df.group_by("d", maintain_order: true).count
|
423
406
|
# # =>
|
424
407
|
# # shape: (3, 2)
|
425
408
|
# # ┌────────┬───────┐
|
@@ -448,7 +431,7 @@ module Polars
|
|
448
431
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
449
432
|
# }
|
450
433
|
# )
|
451
|
-
# df.
|
434
|
+
# df.group_by("d", maintain_order: true).mean
|
452
435
|
# # =>
|
453
436
|
# # shape: (3, 4)
|
454
437
|
# # ┌────────┬─────┬──────────┬──────────┐
|
@@ -476,7 +459,7 @@ module Polars
|
|
476
459
|
# "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
|
477
460
|
# }
|
478
461
|
# )
|
479
|
-
# df.
|
462
|
+
# df.group_by("d", maintain_order: true).n_unique
|
480
463
|
# # =>
|
481
464
|
# # shape: (2, 3)
|
482
465
|
# # ┌────────┬─────┬─────┐
|
@@ -508,7 +491,7 @@ module Polars
|
|
508
491
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
509
492
|
# }
|
510
493
|
# )
|
511
|
-
# df.
|
494
|
+
# df.group_by("d", maintain_order: true).quantile(1)
|
512
495
|
# # =>
|
513
496
|
# # shape: (3, 3)
|
514
497
|
# # ┌────────┬─────┬──────┐
|
@@ -536,7 +519,7 @@ module Polars
|
|
536
519
|
# "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
|
537
520
|
# }
|
538
521
|
# )
|
539
|
-
# df.
|
522
|
+
# df.group_by("d", maintain_order: true).median
|
540
523
|
# # =>
|
541
524
|
# # shape: (2, 3)
|
542
525
|
# # ┌────────┬─────┬──────┐
|
@@ -555,11 +538,11 @@ module Polars
|
|
555
538
|
#
|
556
539
|
# @return [Vega::LiteChart]
|
557
540
|
def plot(*args, **options)
|
558
|
-
raise ArgumentError, "Multiple groups not supported" if by.is_a?(::Array) && by.size > 1
|
541
|
+
raise ArgumentError, "Multiple groups not supported" if @by.is_a?(::Array) && @by.size > 1
|
559
542
|
# same message as Ruby
|
560
543
|
raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
|
561
544
|
|
562
|
-
|
545
|
+
@df.plot(*args, **options, group: @by)
|
563
546
|
end
|
564
547
|
end
|
565
548
|
end
|
data/lib/polars/io.rb
CHANGED
@@ -621,11 +621,46 @@ module Polars
|
|
621
621
|
else
|
622
622
|
raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
|
623
623
|
end
|
624
|
+
|
624
625
|
data = {}
|
626
|
+
schema_overrides = {}
|
627
|
+
|
625
628
|
result.columns.each_with_index do |k, i|
|
626
|
-
|
629
|
+
column_type = result.column_types[i]
|
630
|
+
|
631
|
+
data[k] =
|
632
|
+
if column_type
|
633
|
+
result.rows.map { |r| column_type.deserialize(r[i]) }
|
634
|
+
else
|
635
|
+
result.rows.map { |r| r[i] }
|
636
|
+
end
|
637
|
+
|
638
|
+
polars_type =
|
639
|
+
case column_type&.type
|
640
|
+
when :binary
|
641
|
+
Binary
|
642
|
+
when :boolean
|
643
|
+
Boolean
|
644
|
+
when :date
|
645
|
+
Date
|
646
|
+
when :datetime, :timestamp
|
647
|
+
Datetime
|
648
|
+
when :decimal
|
649
|
+
Decimal
|
650
|
+
when :float
|
651
|
+
Float64
|
652
|
+
when :integer
|
653
|
+
Int64
|
654
|
+
when :string, :text
|
655
|
+
Utf8
|
656
|
+
when :time
|
657
|
+
Time
|
658
|
+
end
|
659
|
+
|
660
|
+
schema_overrides[k] = polars_type if polars_type
|
627
661
|
end
|
628
|
-
|
662
|
+
|
663
|
+
DataFrame.new(data, schema_overrides: schema_overrides)
|
629
664
|
end
|
630
665
|
alias_method :read_sql, :read_database
|
631
666
|
|