polars-df 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/Cargo.lock +468 -538
  4. data/Cargo.toml +1 -0
  5. data/README.md +8 -7
  6. data/ext/polars/Cargo.toml +17 -10
  7. data/ext/polars/src/batched_csv.rs +26 -26
  8. data/ext/polars/src/conversion.rs +121 -93
  9. data/ext/polars/src/dataframe.rs +116 -71
  10. data/ext/polars/src/error.rs +0 -5
  11. data/ext/polars/src/expr/binary.rs +18 -6
  12. data/ext/polars/src/expr/datetime.rs +10 -12
  13. data/ext/polars/src/expr/general.rs +68 -284
  14. data/ext/polars/src/expr/list.rs +17 -9
  15. data/ext/polars/src/{expr.rs → expr/mod.rs} +4 -2
  16. data/ext/polars/src/expr/name.rs +44 -0
  17. data/ext/polars/src/expr/rolling.rs +196 -0
  18. data/ext/polars/src/expr/string.rs +85 -58
  19. data/ext/polars/src/file.rs +3 -3
  20. data/ext/polars/src/functions/aggregation.rs +35 -0
  21. data/ext/polars/src/functions/eager.rs +7 -31
  22. data/ext/polars/src/functions/io.rs +10 -10
  23. data/ext/polars/src/functions/lazy.rs +66 -41
  24. data/ext/polars/src/functions/meta.rs +30 -0
  25. data/ext/polars/src/functions/misc.rs +8 -0
  26. data/ext/polars/src/functions/mod.rs +5 -0
  27. data/ext/polars/src/functions/random.rs +6 -0
  28. data/ext/polars/src/functions/range.rs +46 -0
  29. data/ext/polars/src/functions/string_cache.rs +11 -0
  30. data/ext/polars/src/functions/whenthen.rs +7 -7
  31. data/ext/polars/src/lazyframe.rs +47 -42
  32. data/ext/polars/src/lib.rs +156 -72
  33. data/ext/polars/src/{apply → map}/dataframe.rs +28 -33
  34. data/ext/polars/src/{apply → map}/mod.rs +3 -3
  35. data/ext/polars/src/{apply → map}/series.rs +12 -16
  36. data/ext/polars/src/object.rs +1 -1
  37. data/ext/polars/src/rb_modules.rs +22 -7
  38. data/ext/polars/src/series/construction.rs +4 -4
  39. data/ext/polars/src/series/export.rs +2 -2
  40. data/ext/polars/src/series/set_at_idx.rs +33 -17
  41. data/ext/polars/src/series.rs +7 -27
  42. data/ext/polars/src/sql.rs +46 -0
  43. data/lib/polars/config.rb +530 -0
  44. data/lib/polars/data_frame.rb +115 -82
  45. data/lib/polars/date_time_expr.rb +13 -18
  46. data/lib/polars/date_time_name_space.rb +5 -25
  47. data/lib/polars/dynamic_group_by.rb +2 -2
  48. data/lib/polars/expr.rb +177 -94
  49. data/lib/polars/functions.rb +29 -37
  50. data/lib/polars/group_by.rb +38 -55
  51. data/lib/polars/io.rb +37 -2
  52. data/lib/polars/lazy_frame.rb +93 -66
  53. data/lib/polars/lazy_functions.rb +36 -48
  54. data/lib/polars/lazy_group_by.rb +7 -8
  55. data/lib/polars/list_expr.rb +12 -8
  56. data/lib/polars/list_name_space.rb +2 -2
  57. data/lib/polars/name_expr.rb +198 -0
  58. data/lib/polars/rolling_group_by.rb +2 -2
  59. data/lib/polars/series.rb +26 -13
  60. data/lib/polars/sql_context.rb +194 -0
  61. data/lib/polars/string_expr.rb +114 -60
  62. data/lib/polars/string_name_space.rb +19 -4
  63. data/lib/polars/utils.rb +12 -0
  64. data/lib/polars/version.rb +1 -1
  65. data/lib/polars.rb +3 -0
  66. metadata +18 -7
  67. data/ext/polars/src/{apply → map}/lazy.rs +0 -0
data/lib/polars/functions.rb CHANGED
@@ -19,8 +19,8 @@ module Polars
19
19
  # DataFrames/Series/LazyFrames to concatenate.
20
20
  # @param rechunk [Boolean]
21
21
  # Make sure that all data is in contiguous memory.
22
- # @param how ["vertical", "diagonal", "horizontal"]
23
- # Lazy only supports the 'vertical' strategy.
22
+ # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
23
+ # LazyFrames do not support the `horizontal` strategy.
24
24
  #
25
25
  # - Vertical: applies multiple `vstack` operations.
26
26
  # - Diagonal: finds a union between the column schemas and fills missing column values with null.
@@ -55,18 +55,21 @@ module Polars
55
55
  if how == "vertical"
56
56
  out = Utils.wrap_df(_concat_df(items))
57
57
  elsif how == "diagonal"
58
- out = Utils.wrap_df(_diag_concat_df(items))
58
+ out = Utils.wrap_df(_concat_df_diagonal(items))
59
59
  elsif how == "horizontal"
60
- out = Utils.wrap_df(_hor_concat_df(items))
60
+ out = Utils.wrap_df(_concat_df_horizontal(items))
61
61
  else
62
62
  raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
63
63
  end
64
64
  elsif first.is_a?(LazyFrame)
65
65
  if how == "vertical"
66
- # TODO
67
- return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel))
66
+ return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
67
+ elsif how == "vertical_relaxed"
68
+ return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
69
+ elsif how == "diagonal"
70
+ return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
68
71
  else
69
- raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
72
+ raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
70
73
  end
71
74
  elsif first.is_a?(Series)
72
75
  # TODO
@@ -89,9 +92,9 @@ module Polars
89
92
 
90
93
  # Create a range of type `Datetime` (or `Date`).
91
94
  #
92
- # @param low [Object]
95
+ # @param start [Object]
93
96
  # Lower bound of the date range.
94
- # @param high [Object]
97
+ # @param stop [Object]
95
98
  # Upper bound of the date range.
96
99
  # @param interval [Object]
97
100
  # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
@@ -145,8 +148,8 @@ module Polars
145
148
  # # 1985-01-10 00:00:00
146
149
  # # ]
147
150
  def date_range(
148
- low,
149
- high,
151
+ start,
152
+ stop,
150
153
  interval,
151
154
  lazy: false,
152
155
  closed: "both",
@@ -163,39 +166,28 @@ module Polars
163
166
  end
164
167
  end
165
168
 
166
- if low.is_a?(Expr) || high.is_a?(Expr) || lazy
167
- low = Utils.expr_to_lit_or_expr(low, str_to_lit: true)
168
- high = Utils.expr_to_lit_or_expr(high, str_to_lit: true)
169
- return Utils.wrap_expr(
170
- _rb_date_range_lazy(low, high, interval, closed, name, time_zone)
171
- )
169
+ if time_unit.nil?
170
+ if interval.include?("ns")
171
+ time_unit = "ns"
172
+ else
173
+ time_unit = "us"
174
+ end
172
175
  end
173
176
 
174
- low, low_is_date = _ensure_datetime(low)
175
- high, high_is_date = _ensure_datetime(high)
177
+ start_rbexpr = Utils.parse_as_expression(start)
178
+ stop_rbexpr = Utils.parse_as_expression(stop)
176
179
 
177
- if !time_unit.nil?
178
- tu = time_unit
179
- elsif interval.include?("ns")
180
- tu = "ns"
181
- else
182
- tu = "us"
183
- end
180
+ result = Utils.wrap_expr(
181
+ _rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
182
+ )
184
183
 
185
- start = Utils._datetime_to_pl_timestamp(low, tu)
186
- stop = Utils._datetime_to_pl_timestamp(high, tu)
187
- if name.nil?
188
- name = ""
189
- end
184
+ result = result.alias(name.to_s)
190
185
 
191
- dt_range = Utils.wrap_s(
192
- _rb_date_range(start, stop, interval, closed, name, tu, time_zone)
193
- )
194
- if low_is_date && high_is_date && !["h", "m", "s"].any? { |v| _interval_granularity(interval).end_with?(v) }
195
- dt_range = dt_range.cast(Date)
186
+ if !lazy
187
+ return select(result).to_series
196
188
  end
197
189
 
198
- dt_range
190
+ result
199
191
  end
200
192
 
201
193
  # Bin values into discrete values.
data/lib/polars/group_by.rb CHANGED
@@ -2,23 +2,19 @@ module Polars
2
2
  # Starts a new GroupBy operation.
3
3
  class GroupBy
4
4
  # @private
5
- attr_accessor :_df, :_dataframe_class, :by, :maintain_order
6
-
7
- # @private
8
- def initialize(df, by, dataframe_class, maintain_order: false)
9
- self._df = df
10
- self._dataframe_class = dataframe_class
11
- self.by = by
12
- self.maintain_order = maintain_order
5
+ def initialize(df, by, maintain_order: false)
6
+ @df = df
7
+ @by = by
8
+ @maintain_order = maintain_order
13
9
  end
14
10
 
15
- # Allows iteration over the groups of the groupby operation.
11
+ # Allows iteration over the groups of the group by operation.
16
12
  #
17
13
  # @return [Object]
18
14
  #
19
15
  # @example
20
16
  # df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
21
- # df.groupby("foo", maintain_order: true).each.to_h
17
+ # df.group_by("foo", maintain_order: true).each.to_h
22
18
  # # =>
23
19
  # # {"a"=>shape: (2, 2)
24
20
  # # ┌─────┬─────┐
@@ -41,10 +37,9 @@ module Polars
41
37
 
42
38
  temp_col = "__POLARS_GB_GROUP_INDICES"
43
39
  groups_df =
44
- Utils.wrap_df(_df)
45
- .lazy
40
+ @df.lazy
46
41
  .with_row_count(name: temp_col)
47
- .groupby(by, maintain_order: maintain_order)
42
+ .group_by(@by, maintain_order: @maintain_order)
48
43
  .agg(Polars.col(temp_col))
49
44
  .collect(no_optimization: true)
50
45
 
@@ -52,7 +47,7 @@ module Polars
52
47
 
53
48
  # When grouping by a single column, group name is a single value
54
49
  # When grouping by multiple columns, group name is a tuple of values
55
- if by.is_a?(String) || by.is_a?(Expr)
50
+ if @by.is_a?(String) || @by.is_a?(Expr)
56
51
  _group_names = group_names.to_series.each
57
52
  else
58
53
  _group_names = group_names.iter_rows
@@ -62,10 +57,8 @@ module Polars
62
57
  _current_index = 0
63
58
 
64
59
  while _current_index < _group_indices.length
65
- df = _dataframe_class._from_rbdf(_df)
66
-
67
60
  group_name = _group_names.next
68
- group_data = df[_group_indices[_current_index]]
61
+ group_data = @df[_group_indices[_current_index]]
69
62
  _current_index += 1
70
63
 
71
64
  yield group_name, group_data
@@ -96,7 +89,7 @@ module Polars
96
89
  # "shape" => ["square", "triangle", "square", "triangle", "square"]
97
90
  # }
98
91
  # )
99
- # df.groupby("color").apply { |group_df| group_df.sample(2) }
92
+ # df.group_by("color").apply { |group_df| group_df.sample(2) }
100
93
  # # =>
101
94
  # # shape: (4, 3)
102
95
  # # ┌─────┬───────┬──────────┐
@@ -110,7 +103,7 @@ module Polars
110
103
  # # │ 3 ┆ red ┆ triangle │
111
104
  # # └─────┴───────┴──────────┘
112
105
  # def apply(&f)
113
- # _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
106
+ # _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
114
107
  # end
115
108
 
116
109
  # Use multiple aggregations on columns.
@@ -126,7 +119,7 @@ module Polars
126
119
  # df = Polars::DataFrame.new(
127
120
  # {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
128
121
  # )
129
- # df.groupby("foo", maintain_order: true).agg(
122
+ # df.group_by("foo", maintain_order: true).agg(
130
123
  # [
131
124
  # Polars.sum("bar").suffix("_sum"),
132
125
  # Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
@@ -143,12 +136,10 @@ module Polars
143
136
  # # │ two ┆ 6 ┆ 5 │
144
137
  # # └─────┴─────────┴──────────────┘
145
138
  def agg(aggs)
146
- df = Utils.wrap_df(_df)
147
- .lazy
148
- .groupby(by, maintain_order: maintain_order)
139
+ @df.lazy
140
+ .group_by(@by, maintain_order: @maintain_order)
149
141
  .agg(aggs)
150
- .collect(no_optimization: true, string_cache: false)
151
- _dataframe_class._from_rbdf(df._df)
142
+ .collect(no_optimization: true)
152
143
  end
153
144
 
154
145
  # Get the first `n` rows of each group.
@@ -181,7 +172,7 @@ module Polars
181
172
  # # └─────────┴─────┘
182
173
  #
183
174
  # @example
184
- # df.groupby("letters").head(2).sort("letters")
175
+ # df.group_by("letters").head(2).sort("letters")
185
176
  # # =>
186
177
  # # shape: (5, 2)
187
178
  # # ┌─────────┬─────┐
@@ -196,14 +187,10 @@ module Polars
196
187
  # # │ c ┆ 2 │
197
188
  # # └─────────┴─────┘
198
189
  def head(n = 5)
199
- df = (
200
- Utils.wrap_df(_df)
201
- .lazy
202
- .groupby(by, maintain_order: maintain_order)
203
- .head(n)
204
- .collect(no_optimization: true, string_cache: false)
205
- )
206
- _dataframe_class._from_rbdf(df._df)
190
+ @df.lazy
191
+ .group_by(@by, maintain_order: @maintain_order)
192
+ .head(n)
193
+ .collect(no_optimization: true)
207
194
  end
208
195
 
209
196
  # Get the last `n` rows of each group.
@@ -236,7 +223,7 @@ module Polars
236
223
  # # └─────────┴─────┘
237
224
  #
238
225
  # @example
239
- # df.groupby("letters").tail(2).sort("letters")
226
+ # df.group_by("letters").tail(2).sort("letters")
240
227
  # # =>
241
228
  # # shape: (5, 2)
242
229
  # # ┌─────────┬─────┐
@@ -251,14 +238,10 @@ module Polars
251
238
  # # │ c ┆ 4 │
252
239
  # # └─────────┴─────┘
253
240
  def tail(n = 5)
254
- df = (
255
- Utils.wrap_df(_df)
256
- .lazy
257
- .groupby(by, maintain_order: maintain_order)
258
- .tail(n)
259
- .collect(no_optimization: true, string_cache: false)
260
- )
261
- _dataframe_class._from_rbdf(df._df)
241
+ @df.lazy
242
+ .group_by(@by, maintain_order: @maintain_order)
243
+ .tail(n)
244
+ .collect(no_optimization: true)
262
245
  end
263
246
 
264
247
  # Aggregate the first values in the group.
@@ -274,7 +257,7 @@ module Polars
274
257
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
275
258
  # }
276
259
  # )
277
- # df.groupby("d", maintain_order: true).first
260
+ # df.group_by("d", maintain_order: true).first
278
261
  # # =>
279
262
  # # shape: (3, 4)
280
263
  # # ┌────────┬─────┬──────┬───────┐
@@ -303,7 +286,7 @@ module Polars
303
286
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
304
287
  # }
305
288
  # )
306
- # df.groupby("d", maintain_order: true).last
289
+ # df.group_by("d", maintain_order: true).last
307
290
  # # =>
308
291
  # # shape: (3, 4)
309
292
  # # ┌────────┬─────┬──────┬───────┐
@@ -332,7 +315,7 @@ module Polars
332
315
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
333
316
  # }
334
317
  # )
335
- # df.groupby("d", maintain_order: true).sum
318
+ # df.group_by("d", maintain_order: true).sum
336
319
  # # =>
337
320
  # # shape: (3, 4)
338
321
  # # ┌────────┬─────┬──────┬─────┐
@@ -361,7 +344,7 @@ module Polars
361
344
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
362
345
  # }
363
346
  # )
364
- # df.groupby("d", maintain_order: true).min
347
+ # df.group_by("d", maintain_order: true).min
365
348
  # # =>
366
349
  # # shape: (3, 4)
367
350
  # # ┌────────┬─────┬──────┬───────┐
@@ -390,7 +373,7 @@ module Polars
390
373
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
391
374
  # }
392
375
  # )
393
- # df.groupby("d", maintain_order: true).max
376
+ # df.group_by("d", maintain_order: true).max
394
377
  # # =>
395
378
  # # shape: (3, 4)
396
379
  # # ┌────────┬─────┬──────┬──────┐
@@ -419,7 +402,7 @@ module Polars
419
402
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
420
403
  # }
421
404
  # )
422
- # df.groupby("d", maintain_order: true).count
405
+ # df.group_by("d", maintain_order: true).count
423
406
  # # =>
424
407
  # # shape: (3, 2)
425
408
  # # ┌────────┬───────┐
@@ -448,7 +431,7 @@ module Polars
448
431
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
449
432
  # }
450
433
  # )
451
- # df.groupby("d", maintain_order: true).mean
434
+ # df.group_by("d", maintain_order: true).mean
452
435
  # # =>
453
436
  # # shape: (3, 4)
454
437
  # # ┌────────┬─────┬──────────┬──────────┐
@@ -476,7 +459,7 @@ module Polars
476
459
  # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
477
460
  # }
478
461
  # )
479
- # df.groupby("d", maintain_order: true).n_unique
462
+ # df.group_by("d", maintain_order: true).n_unique
480
463
  # # =>
481
464
  # # shape: (2, 3)
482
465
  # # ┌────────┬─────┬─────┐
@@ -508,7 +491,7 @@ module Polars
508
491
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
509
492
  # }
510
493
  # )
511
- # df.groupby("d", maintain_order: true).quantile(1)
494
+ # df.group_by("d", maintain_order: true).quantile(1)
512
495
  # # =>
513
496
  # # shape: (3, 3)
514
497
  # # ┌────────┬─────┬──────┐
@@ -536,7 +519,7 @@ module Polars
536
519
  # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
537
520
  # }
538
521
  # )
539
- # df.groupby("d", maintain_order: true).median
522
+ # df.group_by("d", maintain_order: true).median
540
523
  # # =>
541
524
  # # shape: (2, 3)
542
525
  # # ┌────────┬─────┬──────┐
@@ -555,11 +538,11 @@ module Polars
555
538
  #
556
539
  # @return [Vega::LiteChart]
557
540
  def plot(*args, **options)
558
- raise ArgumentError, "Multiple groups not supported" if by.is_a?(::Array) && by.size > 1
541
+ raise ArgumentError, "Multiple groups not supported" if @by.is_a?(::Array) && @by.size > 1
559
542
  # same message as Ruby
560
543
  raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
561
544
 
562
- Utils.wrap_df(_df).plot(*args, **options, group: by)
545
+ @df.plot(*args, **options, group: @by)
563
546
  end
564
547
  end
565
548
  end
data/lib/polars/io.rb CHANGED
@@ -621,11 +621,46 @@ module Polars
621
621
  else
622
622
  raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
623
623
  end
624
+
624
625
  data = {}
626
+ schema_overrides = {}
627
+
625
628
  result.columns.each_with_index do |k, i|
626
- data[k] = result.rows.map { |r| r[i] }
629
+ column_type = result.column_types[i]
630
+
631
+ data[k] =
632
+ if column_type
633
+ result.rows.map { |r| column_type.deserialize(r[i]) }
634
+ else
635
+ result.rows.map { |r| r[i] }
636
+ end
637
+
638
+ polars_type =
639
+ case column_type&.type
640
+ when :binary
641
+ Binary
642
+ when :boolean
643
+ Boolean
644
+ when :date
645
+ Date
646
+ when :datetime, :timestamp
647
+ Datetime
648
+ when :decimal
649
+ Decimal
650
+ when :float
651
+ Float64
652
+ when :integer
653
+ Int64
654
+ when :string, :text
655
+ Utf8
656
+ when :time
657
+ Time
658
+ end
659
+
660
+ schema_overrides[k] = polars_type if polars_type
627
661
  end
628
- DataFrame.new(data)
662
+
663
+ DataFrame.new(data, schema_overrides: schema_overrides)
629
664
  end
630
665
  alias_method :read_sql, :read_database
631
666