polars-df 0.5.0-x86_64-linux → 0.7.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +26 -0
- data/Cargo.lock +595 -709
- data/Cargo.toml +1 -0
- data/LICENSE-THIRD-PARTY.txt +3854 -4496
- data/README.md +11 -9
- data/lib/polars/3.0/polars.so +0 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/array_expr.rb +84 -0
- data/lib/polars/array_name_space.rb +77 -0
- data/lib/polars/batched_csv_reader.rb +1 -1
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +206 -131
- data/lib/polars/data_types.rb +163 -29
- data/lib/polars/date_time_expr.rb +13 -18
- data/lib/polars/date_time_name_space.rb +22 -28
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +241 -151
- data/lib/polars/functions.rb +29 -38
- data/lib/polars/group_by.rb +38 -76
- data/lib/polars/io.rb +37 -2
- data/lib/polars/lazy_frame.rb +174 -95
- data/lib/polars/lazy_functions.rb +87 -63
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +40 -36
- data/lib/polars/list_name_space.rb +15 -15
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +6 -4
- data/lib/polars/series.rb +95 -28
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +249 -69
- data/lib/polars/string_name_space.rb +155 -25
- data/lib/polars/utils.rb +119 -57
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +6 -0
- metadata +7 -2
data/lib/polars/functions.rb
CHANGED
@@ -19,8 +19,8 @@ module Polars
|
|
19
19
|
# DataFrames/Series/LazyFrames to concatenate.
|
20
20
|
# @param rechunk [Boolean]
|
21
21
|
# Make sure that all data is in contiguous memory.
|
22
|
-
# @param how ["vertical", "diagonal", "horizontal"]
|
23
|
-
#
|
22
|
+
# @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
|
23
|
+
# LazyFrames do not support the `horizontal` strategy.
|
24
24
|
#
|
25
25
|
# - Vertical: applies multiple `vstack` operations.
|
26
26
|
# - Diagonal: finds a union between the column schemas and fills missing column values with null.
|
@@ -43,7 +43,6 @@ module Polars
|
|
43
43
|
# # │ i64 ┆ i64 │
|
44
44
|
# # ╞═════╪═════╡
|
45
45
|
# # │ 1 ┆ 3 │
|
46
|
-
# # ├╌╌╌╌╌┼╌╌╌╌╌┤
|
47
46
|
# # │ 2 ┆ 4 │
|
48
47
|
# # └─────┴─────┘
|
49
48
|
def concat(items, rechunk: true, how: "vertical", parallel: true)
|
@@ -56,18 +55,21 @@ module Polars
|
|
56
55
|
if how == "vertical"
|
57
56
|
out = Utils.wrap_df(_concat_df(items))
|
58
57
|
elsif how == "diagonal"
|
59
|
-
out = Utils.wrap_df(
|
58
|
+
out = Utils.wrap_df(_concat_df_diagonal(items))
|
60
59
|
elsif how == "horizontal"
|
61
|
-
out = Utils.wrap_df(
|
60
|
+
out = Utils.wrap_df(_concat_df_horizontal(items))
|
62
61
|
else
|
63
62
|
raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
|
64
63
|
end
|
65
64
|
elsif first.is_a?(LazyFrame)
|
66
65
|
if how == "vertical"
|
67
|
-
|
68
|
-
|
66
|
+
return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
|
67
|
+
elsif how == "vertical_relaxed"
|
68
|
+
return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
|
69
|
+
elsif how == "diagonal"
|
70
|
+
return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
|
69
71
|
else
|
70
|
-
raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
|
72
|
+
raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
|
71
73
|
end
|
72
74
|
elsif first.is_a?(Series)
|
73
75
|
# TODO
|
@@ -90,9 +92,9 @@ module Polars
|
|
90
92
|
|
91
93
|
# Create a range of type `Datetime` (or `Date`).
|
92
94
|
#
|
93
|
-
# @param
|
95
|
+
# @param start [Object]
|
94
96
|
# Lower bound of the date range.
|
95
|
-
# @param
|
97
|
+
# @param stop [Object]
|
96
98
|
# Upper bound of the date range.
|
97
99
|
# @param interval [Object]
|
98
100
|
# Interval periods. It can be a polars duration string, such as `3d12h4m25s`
|
@@ -146,8 +148,8 @@ module Polars
|
|
146
148
|
# # 1985-01-10 00:00:00
|
147
149
|
# # ]
|
148
150
|
def date_range(
|
149
|
-
|
150
|
-
|
151
|
+
start,
|
152
|
+
stop,
|
151
153
|
interval,
|
152
154
|
lazy: false,
|
153
155
|
closed: "both",
|
@@ -164,39 +166,28 @@ module Polars
|
|
164
166
|
end
|
165
167
|
end
|
166
168
|
|
167
|
-
if
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
169
|
+
if time_unit.nil?
|
170
|
+
if interval.include?("ns")
|
171
|
+
time_unit = "ns"
|
172
|
+
else
|
173
|
+
time_unit = "us"
|
174
|
+
end
|
173
175
|
end
|
174
176
|
|
175
|
-
|
176
|
-
|
177
|
+
start_rbexpr = Utils.parse_as_expression(start)
|
178
|
+
stop_rbexpr = Utils.parse_as_expression(stop)
|
177
179
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
tu = "ns"
|
182
|
-
else
|
183
|
-
tu = "us"
|
184
|
-
end
|
180
|
+
result = Utils.wrap_expr(
|
181
|
+
_rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
|
182
|
+
)
|
185
183
|
|
186
|
-
|
187
|
-
stop = Utils._datetime_to_pl_timestamp(high, tu)
|
188
|
-
if name.nil?
|
189
|
-
name = ""
|
190
|
-
end
|
184
|
+
result = result.alias(name.to_s)
|
191
185
|
|
192
|
-
|
193
|
-
|
194
|
-
)
|
195
|
-
if low_is_date && high_is_date && !["h", "m", "s"].any? { |v| _interval_granularity(interval).end_with?(v) }
|
196
|
-
dt_range = dt_range.cast(Date)
|
186
|
+
if !lazy
|
187
|
+
return select(result).to_series
|
197
188
|
end
|
198
189
|
|
199
|
-
|
190
|
+
result
|
200
191
|
end
|
201
192
|
|
202
193
|
# Bin values into discrete values.
|
data/lib/polars/group_by.rb
CHANGED
@@ -2,23 +2,19 @@ module Polars
|
|
2
2
|
# Starts a new GroupBy operation.
|
3
3
|
class GroupBy
|
4
4
|
# @private
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
self._df = df
|
10
|
-
self._dataframe_class = dataframe_class
|
11
|
-
self.by = by
|
12
|
-
self.maintain_order = maintain_order
|
5
|
+
def initialize(df, by, maintain_order: false)
|
6
|
+
@df = df
|
7
|
+
@by = by
|
8
|
+
@maintain_order = maintain_order
|
13
9
|
end
|
14
10
|
|
15
|
-
# Allows iteration over the groups of the
|
11
|
+
# Allows iteration over the groups of the group by operation.
|
16
12
|
#
|
17
13
|
# @return [Object]
|
18
14
|
#
|
19
15
|
# @example
|
20
16
|
# df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
|
21
|
-
# df.
|
17
|
+
# df.group_by("foo", maintain_order: true).each.to_h
|
22
18
|
# # =>
|
23
19
|
# # {"a"=>shape: (2, 2)
|
24
20
|
# # ┌─────┬─────┐
|
@@ -41,10 +37,9 @@ module Polars
|
|
41
37
|
|
42
38
|
temp_col = "__POLARS_GB_GROUP_INDICES"
|
43
39
|
groups_df =
|
44
|
-
|
45
|
-
.lazy
|
40
|
+
@df.lazy
|
46
41
|
.with_row_count(name: temp_col)
|
47
|
-
.
|
42
|
+
.group_by(@by, maintain_order: @maintain_order)
|
48
43
|
.agg(Polars.col(temp_col))
|
49
44
|
.collect(no_optimization: true)
|
50
45
|
|
@@ -52,7 +47,7 @@ module Polars
|
|
52
47
|
|
53
48
|
# When grouping by a single column, group name is a single value
|
54
49
|
# When grouping by multiple columns, group name is a tuple of values
|
55
|
-
if by.is_a?(String) || by.is_a?(Expr)
|
50
|
+
if @by.is_a?(String) || @by.is_a?(Expr)
|
56
51
|
_group_names = group_names.to_series.each
|
57
52
|
else
|
58
53
|
_group_names = group_names.iter_rows
|
@@ -62,10 +57,8 @@ module Polars
|
|
62
57
|
_current_index = 0
|
63
58
|
|
64
59
|
while _current_index < _group_indices.length
|
65
|
-
df = _dataframe_class._from_rbdf(_df)
|
66
|
-
|
67
60
|
group_name = _group_names.next
|
68
|
-
group_data = df[_group_indices[_current_index]]
|
61
|
+
group_data = @df[_group_indices[_current_index]]
|
69
62
|
_current_index += 1
|
70
63
|
|
71
64
|
yield group_name, group_data
|
@@ -96,7 +89,7 @@ module Polars
|
|
96
89
|
# "shape" => ["square", "triangle", "square", "triangle", "square"]
|
97
90
|
# }
|
98
91
|
# )
|
99
|
-
# df.
|
92
|
+
# df.group_by("color").apply { |group_df| group_df.sample(2) }
|
100
93
|
# # =>
|
101
94
|
# # shape: (4, 3)
|
102
95
|
# # ┌─────┬───────┬──────────┐
|
@@ -110,7 +103,7 @@ module Polars
|
|
110
103
|
# # │ 3 ┆ red ┆ triangle │
|
111
104
|
# # └─────┴───────┴──────────┘
|
112
105
|
# def apply(&f)
|
113
|
-
# _dataframe_class._from_rbdf(_df.
|
106
|
+
# _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
|
114
107
|
# end
|
115
108
|
|
116
109
|
# Use multiple aggregations on columns.
|
@@ -126,7 +119,7 @@ module Polars
|
|
126
119
|
# df = Polars::DataFrame.new(
|
127
120
|
# {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
|
128
121
|
# )
|
129
|
-
# df.
|
122
|
+
# df.group_by("foo", maintain_order: true).agg(
|
130
123
|
# [
|
131
124
|
# Polars.sum("bar").suffix("_sum"),
|
132
125
|
# Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
|
@@ -143,12 +136,10 @@ module Polars
|
|
143
136
|
# # │ two ┆ 6 ┆ 5 │
|
144
137
|
# # └─────┴─────────┴──────────────┘
|
145
138
|
def agg(aggs)
|
146
|
-
df
|
147
|
-
.
|
148
|
-
.groupby(by, maintain_order: maintain_order)
|
139
|
+
@df.lazy
|
140
|
+
.group_by(@by, maintain_order: @maintain_order)
|
149
141
|
.agg(aggs)
|
150
|
-
.collect(no_optimization: true
|
151
|
-
_dataframe_class._from_rbdf(df._df)
|
142
|
+
.collect(no_optimization: true)
|
152
143
|
end
|
153
144
|
|
154
145
|
# Get the first `n` rows of each group.
|
@@ -181,7 +172,7 @@ module Polars
|
|
181
172
|
# # └─────────┴─────┘
|
182
173
|
#
|
183
174
|
# @example
|
184
|
-
# df.
|
175
|
+
# df.group_by("letters").head(2).sort("letters")
|
185
176
|
# # =>
|
186
177
|
# # shape: (5, 2)
|
187
178
|
# # ┌─────────┬─────┐
|
@@ -196,14 +187,10 @@ module Polars
|
|
196
187
|
# # │ c ┆ 2 │
|
197
188
|
# # └─────────┴─────┘
|
198
189
|
def head(n = 5)
|
199
|
-
df
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
.head(n)
|
204
|
-
.collect(no_optimization: true, string_cache: false)
|
205
|
-
)
|
206
|
-
_dataframe_class._from_rbdf(df._df)
|
190
|
+
@df.lazy
|
191
|
+
.group_by(@by, maintain_order: @maintain_order)
|
192
|
+
.head(n)
|
193
|
+
.collect(no_optimization: true)
|
207
194
|
end
|
208
195
|
|
209
196
|
# Get the last `n` rows of each group.
|
@@ -236,7 +223,7 @@ module Polars
|
|
236
223
|
# # └─────────┴─────┘
|
237
224
|
#
|
238
225
|
# @example
|
239
|
-
# df.
|
226
|
+
# df.group_by("letters").tail(2).sort("letters")
|
240
227
|
# # =>
|
241
228
|
# # shape: (5, 2)
|
242
229
|
# # ┌─────────┬─────┐
|
@@ -251,14 +238,10 @@ module Polars
|
|
251
238
|
# # │ c ┆ 4 │
|
252
239
|
# # └─────────┴─────┘
|
253
240
|
def tail(n = 5)
|
254
|
-
df
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
.tail(n)
|
259
|
-
.collect(no_optimization: true, string_cache: false)
|
260
|
-
)
|
261
|
-
_dataframe_class._from_rbdf(df._df)
|
241
|
+
@df.lazy
|
242
|
+
.group_by(@by, maintain_order: @maintain_order)
|
243
|
+
.tail(n)
|
244
|
+
.collect(no_optimization: true)
|
262
245
|
end
|
263
246
|
|
264
247
|
# Aggregate the first values in the group.
|
@@ -274,7 +257,7 @@ module Polars
|
|
274
257
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
275
258
|
# }
|
276
259
|
# )
|
277
|
-
# df.
|
260
|
+
# df.group_by("d", maintain_order: true).first
|
278
261
|
# # =>
|
279
262
|
# # shape: (3, 4)
|
280
263
|
# # ┌────────┬─────┬──────┬───────┐
|
@@ -303,7 +286,7 @@ module Polars
|
|
303
286
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
304
287
|
# }
|
305
288
|
# )
|
306
|
-
# df.
|
289
|
+
# df.group_by("d", maintain_order: true).last
|
307
290
|
# # =>
|
308
291
|
# # shape: (3, 4)
|
309
292
|
# # ┌────────┬─────┬──────┬───────┐
|
@@ -332,7 +315,7 @@ module Polars
|
|
332
315
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
333
316
|
# }
|
334
317
|
# )
|
335
|
-
# df.
|
318
|
+
# df.group_by("d", maintain_order: true).sum
|
336
319
|
# # =>
|
337
320
|
# # shape: (3, 4)
|
338
321
|
# # ┌────────┬─────┬──────┬─────┐
|
@@ -361,7 +344,7 @@ module Polars
|
|
361
344
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
|
362
345
|
# }
|
363
346
|
# )
|
364
|
-
# df.
|
347
|
+
# df.group_by("d", maintain_order: true).min
|
365
348
|
# # =>
|
366
349
|
# # shape: (3, 4)
|
367
350
|
# # ┌────────┬─────┬──────┬───────┐
|
@@ -390,7 +373,7 @@ module Polars
|
|
390
373
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
391
374
|
# }
|
392
375
|
# )
|
393
|
-
# df.
|
376
|
+
# df.group_by("d", maintain_order: true).max
|
394
377
|
# # =>
|
395
378
|
# # shape: (3, 4)
|
396
379
|
# # ┌────────┬─────┬──────┬──────┐
|
@@ -419,7 +402,7 @@ module Polars
|
|
419
402
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
420
403
|
# }
|
421
404
|
# )
|
422
|
-
# df.
|
405
|
+
# df.group_by("d", maintain_order: true).count
|
423
406
|
# # =>
|
424
407
|
# # shape: (3, 2)
|
425
408
|
# # ┌────────┬───────┐
|
@@ -448,7 +431,7 @@ module Polars
|
|
448
431
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
449
432
|
# }
|
450
433
|
# )
|
451
|
-
# df.
|
434
|
+
# df.group_by("d", maintain_order: true).mean
|
452
435
|
# # =>
|
453
436
|
# # shape: (3, 4)
|
454
437
|
# # ┌────────┬─────┬──────────┬──────────┐
|
@@ -476,7 +459,7 @@ module Polars
|
|
476
459
|
# "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
|
477
460
|
# }
|
478
461
|
# )
|
479
|
-
# df.
|
462
|
+
# df.group_by("d", maintain_order: true).n_unique
|
480
463
|
# # =>
|
481
464
|
# # shape: (2, 3)
|
482
465
|
# # ┌────────┬─────┬─────┐
|
@@ -508,7 +491,7 @@ module Polars
|
|
508
491
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
509
492
|
# }
|
510
493
|
# )
|
511
|
-
# df.
|
494
|
+
# df.group_by("d", maintain_order: true).quantile(1)
|
512
495
|
# # =>
|
513
496
|
# # shape: (3, 3)
|
514
497
|
# # ┌────────┬─────┬──────┐
|
@@ -536,7 +519,7 @@ module Polars
|
|
536
519
|
# "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
|
537
520
|
# }
|
538
521
|
# )
|
539
|
-
# df.
|
522
|
+
# df.group_by("d", maintain_order: true).median
|
540
523
|
# # =>
|
541
524
|
# # shape: (2, 3)
|
542
525
|
# # ┌────────┬─────┬──────┐
|
@@ -551,36 +534,15 @@ module Polars
|
|
551
534
|
agg(Polars.all.median)
|
552
535
|
end
|
553
536
|
|
554
|
-
# Aggregate the groups into Series.
|
555
|
-
#
|
556
|
-
# @return [DataFrame]
|
557
|
-
#
|
558
|
-
# @example
|
559
|
-
# df = Polars::DataFrame.new({"a" => ["one", "two", "one", "two"], "b" => [1, 2, 3, 4]})
|
560
|
-
# df.groupby("a", maintain_order: true).agg_list
|
561
|
-
# # =>
|
562
|
-
# # shape: (2, 2)
|
563
|
-
# # ┌─────┬─────────────────┐
|
564
|
-
# # │ a ┆ b │
|
565
|
-
# # │ --- ┆ --- │
|
566
|
-
# # │ str ┆ list[list[i64]] │
|
567
|
-
# # ╞═════╪═════════════════╡
|
568
|
-
# # │ one ┆ [[1, 3]] │
|
569
|
-
# # │ two ┆ [[2, 4]] │
|
570
|
-
# # └─────┴─────────────────┘
|
571
|
-
def agg_list
|
572
|
-
agg(Polars.all.list)
|
573
|
-
end
|
574
|
-
|
575
537
|
# Plot data.
|
576
538
|
#
|
577
539
|
# @return [Vega::LiteChart]
|
578
540
|
def plot(*args, **options)
|
579
|
-
raise ArgumentError, "Multiple groups not supported" if by.is_a?(Array) && by.size > 1
|
541
|
+
raise ArgumentError, "Multiple groups not supported" if @by.is_a?(::Array) && @by.size > 1
|
580
542
|
# same message as Ruby
|
581
543
|
raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
|
582
544
|
|
583
|
-
|
545
|
+
@df.plot(*args, **options, group: @by)
|
584
546
|
end
|
585
547
|
end
|
586
548
|
end
|
data/lib/polars/io.rb
CHANGED
@@ -621,11 +621,46 @@ module Polars
|
|
621
621
|
else
|
622
622
|
raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
|
623
623
|
end
|
624
|
+
|
624
625
|
data = {}
|
626
|
+
schema_overrides = {}
|
627
|
+
|
625
628
|
result.columns.each_with_index do |k, i|
|
626
|
-
|
629
|
+
column_type = result.column_types[i]
|
630
|
+
|
631
|
+
data[k] =
|
632
|
+
if column_type
|
633
|
+
result.rows.map { |r| column_type.deserialize(r[i]) }
|
634
|
+
else
|
635
|
+
result.rows.map { |r| r[i] }
|
636
|
+
end
|
637
|
+
|
638
|
+
polars_type =
|
639
|
+
case column_type&.type
|
640
|
+
when :binary
|
641
|
+
Binary
|
642
|
+
when :boolean
|
643
|
+
Boolean
|
644
|
+
when :date
|
645
|
+
Date
|
646
|
+
when :datetime, :timestamp
|
647
|
+
Datetime
|
648
|
+
when :decimal
|
649
|
+
Decimal
|
650
|
+
when :float
|
651
|
+
Float64
|
652
|
+
when :integer
|
653
|
+
Int64
|
654
|
+
when :string, :text
|
655
|
+
Utf8
|
656
|
+
when :time
|
657
|
+
Time
|
658
|
+
end
|
659
|
+
|
660
|
+
schema_overrides[k] = polars_type if polars_type
|
627
661
|
end
|
628
|
-
|
662
|
+
|
663
|
+
DataFrame.new(data, schema_overrides: schema_overrides)
|
629
664
|
end
|
630
665
|
alias_method :read_sql, :read_database
|
631
666
|
|