polars-df 0.6.0-x86_64-darwin → 0.8.0-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -0
- data/Cargo.lock +597 -599
- data/Cargo.toml +1 -0
- data/LICENSE-THIRD-PARTY.txt +5523 -6947
- data/README.md +8 -7
- data/lib/polars/3.1/polars.bundle +0 -0
- data/lib/polars/3.2/polars.bundle +0 -0
- data/lib/polars/{3.0 → 3.3}/polars.bundle +0 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/data_frame.rb +182 -145
- data/lib/polars/data_types.rb +4 -1
- data/lib/polars/date_time_expr.rb +23 -28
- data/lib/polars/date_time_name_space.rb +17 -37
- data/lib/polars/dynamic_group_by.rb +2 -2
- data/lib/polars/expr.rb +398 -110
- data/lib/polars/functions.rb +29 -37
- data/lib/polars/group_by.rb +38 -55
- data/lib/polars/io.rb +40 -5
- data/lib/polars/lazy_frame.rb +116 -89
- data/lib/polars/lazy_functions.rb +40 -68
- data/lib/polars/lazy_group_by.rb +7 -8
- data/lib/polars/list_expr.rb +12 -8
- data/lib/polars/list_name_space.rb +2 -2
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/rolling_group_by.rb +2 -2
- data/lib/polars/series.rb +315 -43
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_expr.rb +114 -60
- data/lib/polars/string_name_space.rb +19 -4
- data/lib/polars/struct_expr.rb +1 -1
- data/lib/polars/struct_name_space.rb +1 -1
- data/lib/polars/utils.rb +25 -13
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +3 -0
- metadata +8 -5
data/lib/polars/functions.rb
CHANGED
@@ -19,8 +19,8 @@ module Polars
|
|
19
19
|
# DataFrames/Series/LazyFrames to concatenate.
|
20
20
|
# @param rechunk [Boolean]
|
21
21
|
# Make sure that all data is in contiguous memory.
|
22
|
-
# @param how ["vertical", "diagonal", "horizontal"]
|
23
|
-
#
|
22
|
+
# @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
|
23
|
+
# LazyFrames do not support the `horizontal` strategy.
|
24
24
|
#
|
25
25
|
# - Vertical: applies multiple `vstack` operations.
|
26
26
|
# - Diagonal: finds a union between the column schemas and fills missing column values with null.
|
@@ -55,18 +55,21 @@ module Polars
|
|
55
55
|
if how == "vertical"
|
56
56
|
out = Utils.wrap_df(_concat_df(items))
|
57
57
|
elsif how == "diagonal"
|
58
|
-
out = Utils.wrap_df(
|
58
|
+
out = Utils.wrap_df(_concat_df_diagonal(items))
|
59
59
|
elsif how == "horizontal"
|
60
|
-
out = Utils.wrap_df(
|
60
|
+
out = Utils.wrap_df(_concat_df_horizontal(items))
|
61
61
|
else
|
62
62
|
raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
|
63
63
|
end
|
64
64
|
elsif first.is_a?(LazyFrame)
|
65
65
|
if how == "vertical"
|
66
|
-
|
67
|
-
|
66
|
+
return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
|
67
|
+
elsif how == "vertical_relaxed"
|
68
|
+
return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
|
69
|
+
elsif how == "diagonal"
|
70
|
+
return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
|
68
71
|
else
|
69
|
-
raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
|
72
|
+
raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
|
70
73
|
end
|
71
74
|
elsif first.is_a?(Series)
|
72
75
|
# TODO
|
@@ -89,9 +92,9 @@ module Polars
|
|
89
92
|
|
90
93
|
# Create a range of type `Datetime` (or `Date`).
|
91
94
|
#
|
92
|
-
# @param
|
95
|
+
# @param start [Object]
|
93
96
|
# Lower bound of the date range.
|
94
|
-
# @param
|
97
|
+
# @param stop [Object]
|
95
98
|
# Upper bound of the date range.
|
96
99
|
# @param interval [Object]
|
97
100
|
# Interval periods. It can be a polars duration string, such as `3d12h4m25s`
|
@@ -145,8 +148,8 @@ module Polars
|
|
145
148
|
# # 1985-01-10 00:00:00
|
146
149
|
# # ]
|
147
150
|
def date_range(
|
148
|
-
|
149
|
-
|
151
|
+
start,
|
152
|
+
stop,
|
150
153
|
interval,
|
151
154
|
lazy: false,
|
152
155
|
closed: "both",
|
@@ -163,39 +166,28 @@ module Polars
|
|
163
166
|
end
|
164
167
|
end
|
165
168
|
|
166
|
-
if
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
169
|
+
if time_unit.nil?
|
170
|
+
if interval.include?("ns")
|
171
|
+
time_unit = "ns"
|
172
|
+
else
|
173
|
+
time_unit = "us"
|
174
|
+
end
|
172
175
|
end
|
173
176
|
|
174
|
-
|
175
|
-
|
177
|
+
start_rbexpr = Utils.parse_as_expression(start)
|
178
|
+
stop_rbexpr = Utils.parse_as_expression(stop)
|
176
179
|
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
tu = "ns"
|
181
|
-
else
|
182
|
-
tu = "us"
|
183
|
-
end
|
180
|
+
result = Utils.wrap_expr(
|
181
|
+
_rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
|
182
|
+
)
|
184
183
|
|
185
|
-
|
186
|
-
stop = Utils._datetime_to_pl_timestamp(high, tu)
|
187
|
-
if name.nil?
|
188
|
-
name = ""
|
189
|
-
end
|
184
|
+
result = result.alias(name.to_s)
|
190
185
|
|
191
|
-
|
192
|
-
|
193
|
-
)
|
194
|
-
if low_is_date && high_is_date && !["h", "m", "s"].any? { |v| _interval_granularity(interval).end_with?(v) }
|
195
|
-
dt_range = dt_range.cast(Date)
|
186
|
+
if !lazy
|
187
|
+
return select(result).to_series
|
196
188
|
end
|
197
189
|
|
198
|
-
|
190
|
+
result
|
199
191
|
end
|
200
192
|
|
201
193
|
# Bin values into discrete values.
|
data/lib/polars/group_by.rb
CHANGED
@@ -2,23 +2,19 @@ module Polars
|
|
2
2
|
# Starts a new GroupBy operation.
|
3
3
|
class GroupBy
|
4
4
|
# @private
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
self._df = df
|
10
|
-
self._dataframe_class = dataframe_class
|
11
|
-
self.by = by
|
12
|
-
self.maintain_order = maintain_order
|
5
|
+
def initialize(df, by, maintain_order: false)
|
6
|
+
@df = df
|
7
|
+
@by = by
|
8
|
+
@maintain_order = maintain_order
|
13
9
|
end
|
14
10
|
|
15
|
-
# Allows iteration over the groups of the
|
11
|
+
# Allows iteration over the groups of the group by operation.
|
16
12
|
#
|
17
13
|
# @return [Object]
|
18
14
|
#
|
19
15
|
# @example
|
20
16
|
# df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
|
21
|
-
# df.
|
17
|
+
# df.group_by("foo", maintain_order: true).each.to_h
|
22
18
|
# # =>
|
23
19
|
# # {"a"=>shape: (2, 2)
|
24
20
|
# # ┌─────┬─────┐
|
@@ -41,10 +37,9 @@ module Polars
|
|
41
37
|
|
42
38
|
temp_col = "__POLARS_GB_GROUP_INDICES"
|
43
39
|
groups_df =
|
44
|
-
|
45
|
-
.lazy
|
40
|
+
@df.lazy
|
46
41
|
.with_row_count(name: temp_col)
|
47
|
-
.
|
42
|
+
.group_by(@by, maintain_order: @maintain_order)
|
48
43
|
.agg(Polars.col(temp_col))
|
49
44
|
.collect(no_optimization: true)
|
50
45
|
|
@@ -52,7 +47,7 @@ module Polars
|
|
52
47
|
|
53
48
|
# When grouping by a single column, group name is a single value
|
54
49
|
# When grouping by multiple columns, group name is a tuple of values
|
55
|
-
if by.is_a?(String) || by.is_a?(Expr)
|
50
|
+
if @by.is_a?(::String) || @by.is_a?(Expr)
|
56
51
|
_group_names = group_names.to_series.each
|
57
52
|
else
|
58
53
|
_group_names = group_names.iter_rows
|
@@ -62,10 +57,8 @@ module Polars
|
|
62
57
|
_current_index = 0
|
63
58
|
|
64
59
|
while _current_index < _group_indices.length
|
65
|
-
df = _dataframe_class._from_rbdf(_df)
|
66
|
-
|
67
60
|
group_name = _group_names.next
|
68
|
-
group_data = df[_group_indices[_current_index]]
|
61
|
+
group_data = @df[_group_indices[_current_index]]
|
69
62
|
_current_index += 1
|
70
63
|
|
71
64
|
yield group_name, group_data
|
@@ -96,7 +89,7 @@ module Polars
|
|
96
89
|
# "shape" => ["square", "triangle", "square", "triangle", "square"]
|
97
90
|
# }
|
98
91
|
# )
|
99
|
-
# df.
|
92
|
+
# df.group_by("color").apply { |group_df| group_df.sample(2) }
|
100
93
|
# # =>
|
101
94
|
# # shape: (4, 3)
|
102
95
|
# # ┌─────┬───────┬──────────┐
|
@@ -110,7 +103,7 @@ module Polars
|
|
110
103
|
# # │ 3 ┆ red ┆ triangle │
|
111
104
|
# # └─────┴───────┴──────────┘
|
112
105
|
# def apply(&f)
|
113
|
-
# _dataframe_class._from_rbdf(_df.
|
106
|
+
# _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
|
114
107
|
# end
|
115
108
|
|
116
109
|
# Use multiple aggregations on columns.
|
@@ -126,7 +119,7 @@ module Polars
|
|
126
119
|
# df = Polars::DataFrame.new(
|
127
120
|
# {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
|
128
121
|
# )
|
129
|
-
# df.
|
122
|
+
# df.group_by("foo", maintain_order: true).agg(
|
130
123
|
# [
|
131
124
|
# Polars.sum("bar").suffix("_sum"),
|
132
125
|
# Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
|
@@ -143,12 +136,10 @@ module Polars
|
|
143
136
|
# # │ two ┆ 6 ┆ 5 │
|
144
137
|
# # └─────┴─────────┴──────────────┘
|
145
138
|
def agg(aggs)
|
146
|
-
df
|
147
|
-
.
|
148
|
-
.groupby(by, maintain_order: maintain_order)
|
139
|
+
@df.lazy
|
140
|
+
.group_by(@by, maintain_order: @maintain_order)
|
149
141
|
.agg(aggs)
|
150
|
-
.collect(no_optimization: true
|
151
|
-
_dataframe_class._from_rbdf(df._df)
|
142
|
+
.collect(no_optimization: true)
|
152
143
|
end
|
153
144
|
|
154
145
|
# Get the first `n` rows of each group.
|
@@ -181,7 +172,7 @@ module Polars
|
|
181
172
|
# # └─────────┴─────┘
|
182
173
|
#
|
183
174
|
# @example
|
184
|
-
# df.
|
175
|
+
# df.group_by("letters").head(2).sort("letters")
|
185
176
|
# # =>
|
186
177
|
# # shape: (5, 2)
|
187
178
|
# # ┌─────────┬─────┐
|
@@ -196,14 +187,10 @@ module Polars
|
|
196
187
|
# # │ c ┆ 2 │
|
197
188
|
# # └─────────┴─────┘
|
198
189
|
def head(n = 5)
|
199
|
-
df
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
.head(n)
|
204
|
-
.collect(no_optimization: true, string_cache: false)
|
205
|
-
)
|
206
|
-
_dataframe_class._from_rbdf(df._df)
|
190
|
+
@df.lazy
|
191
|
+
.group_by(@by, maintain_order: @maintain_order)
|
192
|
+
.head(n)
|
193
|
+
.collect(no_optimization: true)
|
207
194
|
end
|
208
195
|
|
209
196
|
# Get the last `n` rows of each group.
|
@@ -236,7 +223,7 @@ module Polars
|
|
236
223
|
# # └─────────┴─────┘
|
237
224
|
#
|
238
225
|
# @example
|
239
|
-
# df.
|
226
|
+
# df.group_by("letters").tail(2).sort("letters")
|
240
227
|
# # =>
|
241
228
|
# # shape: (5, 2)
|
242
229
|
# # ┌─────────┬─────┐
|
@@ -251,14 +238,10 @@ module Polars
|
|
251
238
|
# # │ c ┆ 4 │
|
252
239
|
# # └─────────┴─────┘
|
253
240
|
def tail(n = 5)
|
254
|
-
df
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
.tail(n)
|
259
|
-
.collect(no_optimization: true, string_cache: false)
|
260
|
-
)
|
261
|
-
_dataframe_class._from_rbdf(df._df)
|
241
|
+
@df.lazy
|
242
|
+
.group_by(@by, maintain_order: @maintain_order)
|
243
|
+
.tail(n)
|
244
|
+
.collect(no_optimization: true)
|
262
245
|
end
|
263
246
|
|
264
247
|
# Aggregate the first values in the group.
|
@@ -274,7 +257,7 @@ module Polars
|
|
274
257
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
275
258
|
# }
|
276
259
|
# )
|
277
|
-
# df.
|
260
|
+
# df.group_by("d", maintain_order: true).first
|
278
261
|
# # =>
|
279
262
|
# # shape: (3, 4)
|
280
263
|
# # ┌────────┬─────┬──────┬───────┐
|
@@ -303,7 +286,7 @@ module Polars
|
|
303
286
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
304
287
|
# }
|
305
288
|
# )
|
306
|
-
# df.
|
289
|
+
# df.group_by("d", maintain_order: true).last
|
307
290
|
# # =>
|
308
291
|
# # shape: (3, 4)
|
309
292
|
# # ┌────────┬─────┬──────┬───────┐
|
@@ -332,7 +315,7 @@ module Polars
|
|
332
315
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
333
316
|
# }
|
334
317
|
# )
|
335
|
-
# df.
|
318
|
+
# df.group_by("d", maintain_order: true).sum
|
336
319
|
# # =>
|
337
320
|
# # shape: (3, 4)
|
338
321
|
# # ┌────────┬─────┬──────┬─────┐
|
@@ -361,7 +344,7 @@ module Polars
|
|
361
344
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
|
362
345
|
# }
|
363
346
|
# )
|
364
|
-
# df.
|
347
|
+
# df.group_by("d", maintain_order: true).min
|
365
348
|
# # =>
|
366
349
|
# # shape: (3, 4)
|
367
350
|
# # ┌────────┬─────┬──────┬───────┐
|
@@ -390,7 +373,7 @@ module Polars
|
|
390
373
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
391
374
|
# }
|
392
375
|
# )
|
393
|
-
# df.
|
376
|
+
# df.group_by("d", maintain_order: true).max
|
394
377
|
# # =>
|
395
378
|
# # shape: (3, 4)
|
396
379
|
# # ┌────────┬─────┬──────┬──────┐
|
@@ -419,7 +402,7 @@ module Polars
|
|
419
402
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
420
403
|
# }
|
421
404
|
# )
|
422
|
-
# df.
|
405
|
+
# df.group_by("d", maintain_order: true).count
|
423
406
|
# # =>
|
424
407
|
# # shape: (3, 2)
|
425
408
|
# # ┌────────┬───────┐
|
@@ -448,7 +431,7 @@ module Polars
|
|
448
431
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
449
432
|
# }
|
450
433
|
# )
|
451
|
-
# df.
|
434
|
+
# df.group_by("d", maintain_order: true).mean
|
452
435
|
# # =>
|
453
436
|
# # shape: (3, 4)
|
454
437
|
# # ┌────────┬─────┬──────────┬──────────┐
|
@@ -476,7 +459,7 @@ module Polars
|
|
476
459
|
# "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
|
477
460
|
# }
|
478
461
|
# )
|
479
|
-
# df.
|
462
|
+
# df.group_by("d", maintain_order: true).n_unique
|
480
463
|
# # =>
|
481
464
|
# # shape: (2, 3)
|
482
465
|
# # ┌────────┬─────┬─────┐
|
@@ -508,7 +491,7 @@ module Polars
|
|
508
491
|
# "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
|
509
492
|
# }
|
510
493
|
# )
|
511
|
-
# df.
|
494
|
+
# df.group_by("d", maintain_order: true).quantile(1)
|
512
495
|
# # =>
|
513
496
|
# # shape: (3, 3)
|
514
497
|
# # ┌────────┬─────┬──────┐
|
@@ -536,7 +519,7 @@ module Polars
|
|
536
519
|
# "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
|
537
520
|
# }
|
538
521
|
# )
|
539
|
-
# df.
|
522
|
+
# df.group_by("d", maintain_order: true).median
|
540
523
|
# # =>
|
541
524
|
# # shape: (2, 3)
|
542
525
|
# # ┌────────┬─────┬──────┐
|
@@ -555,11 +538,11 @@ module Polars
|
|
555
538
|
#
|
556
539
|
# @return [Vega::LiteChart]
|
557
540
|
def plot(*args, **options)
|
558
|
-
raise ArgumentError, "Multiple groups not supported" if by.is_a?(::Array) && by.size > 1
|
541
|
+
raise ArgumentError, "Multiple groups not supported" if @by.is_a?(::Array) && @by.size > 1
|
559
542
|
# same message as Ruby
|
560
543
|
raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
|
561
544
|
|
562
|
-
|
545
|
+
@df.plot(*args, **options, group: @by)
|
563
546
|
end
|
564
547
|
end
|
565
548
|
end
|
data/lib/polars/io.rb
CHANGED
@@ -616,16 +616,51 @@ module Polars
|
|
616
616
|
query
|
617
617
|
elsif query.is_a?(ActiveRecord::Relation)
|
618
618
|
query.connection.select_all(query.to_sql)
|
619
|
-
elsif query.is_a?(String)
|
619
|
+
elsif query.is_a?(::String)
|
620
620
|
ActiveRecord::Base.connection.select_all(query)
|
621
621
|
else
|
622
622
|
raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
|
623
623
|
end
|
624
|
+
|
624
625
|
data = {}
|
626
|
+
schema_overrides = {}
|
627
|
+
|
625
628
|
result.columns.each_with_index do |k, i|
|
626
|
-
|
629
|
+
column_type = result.column_types[i]
|
630
|
+
|
631
|
+
data[k] =
|
632
|
+
if column_type
|
633
|
+
result.rows.map { |r| column_type.deserialize(r[i]) }
|
634
|
+
else
|
635
|
+
result.rows.map { |r| r[i] }
|
636
|
+
end
|
637
|
+
|
638
|
+
polars_type =
|
639
|
+
case column_type&.type
|
640
|
+
when :binary
|
641
|
+
Binary
|
642
|
+
when :boolean
|
643
|
+
Boolean
|
644
|
+
when :date
|
645
|
+
Date
|
646
|
+
when :datetime, :timestamp
|
647
|
+
Datetime
|
648
|
+
when :decimal
|
649
|
+
Decimal
|
650
|
+
when :float
|
651
|
+
Float64
|
652
|
+
when :integer
|
653
|
+
Int64
|
654
|
+
when :string, :text
|
655
|
+
String
|
656
|
+
when :time
|
657
|
+
Time
|
658
|
+
end
|
659
|
+
|
660
|
+
schema_overrides[k] = polars_type if polars_type
|
627
661
|
end
|
628
|
-
|
662
|
+
|
663
|
+
DataFrame.new(data, schema_overrides: schema_overrides)
|
629
664
|
end
|
630
665
|
alias_method :read_sql, :read_database
|
631
666
|
|
@@ -821,7 +856,7 @@ module Polars
|
|
821
856
|
private
|
822
857
|
|
823
858
|
def _prepare_file_arg(file)
|
824
|
-
if file.is_a?(String) && file =~ /\Ahttps?:\/\//
|
859
|
+
if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
|
825
860
|
raise ArgumentError, "use URI(...) for remote files"
|
826
861
|
end
|
827
862
|
|
@@ -835,7 +870,7 @@ module Polars
|
|
835
870
|
end
|
836
871
|
|
837
872
|
def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
|
838
|
-
if arg.is_a?(String)
|
873
|
+
if arg.is_a?(::String)
|
839
874
|
arg_byte_length = arg.bytesize
|
840
875
|
if can_be_empty
|
841
876
|
if arg_byte_length > 1
|