polars-df 0.5.0-arm64-darwin → 0.7.0-arm64-darwin

This diff shows the changes between two publicly available versions of a package, as released to one of the supported public registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
@@ -19,8 +19,8 @@ module Polars
19
19
  # DataFrames/Series/LazyFrames to concatenate.
20
20
  # @param rechunk [Boolean]
21
21
  # Make sure that all data is in contiguous memory.
22
- # @param how ["vertical", "diagonal", "horizontal"]
23
- # Lazy only supports the 'vertical' strategy.
22
+ # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
23
+ # LazyFrames do not support the `horizontal` strategy.
24
24
  #
25
25
  # - Vertical: applies multiple `vstack` operations.
26
26
  # - Diagonal: finds a union between the column schemas and fills missing column values with null.
@@ -43,7 +43,6 @@ module Polars
43
43
  # # │ i64 ┆ i64 │
44
44
  # # ╞═════╪═════╡
45
45
  # # │ 1 ┆ 3 │
46
- # # ├╌╌╌╌╌┼╌╌╌╌╌┤
47
46
  # # │ 2 ┆ 4 │
48
47
  # # └─────┴─────┘
49
48
  def concat(items, rechunk: true, how: "vertical", parallel: true)
@@ -56,18 +55,21 @@ module Polars
56
55
  if how == "vertical"
57
56
  out = Utils.wrap_df(_concat_df(items))
58
57
  elsif how == "diagonal"
59
- out = Utils.wrap_df(_diag_concat_df(items))
58
+ out = Utils.wrap_df(_concat_df_diagonal(items))
60
59
  elsif how == "horizontal"
61
- out = Utils.wrap_df(_hor_concat_df(items))
60
+ out = Utils.wrap_df(_concat_df_horizontal(items))
62
61
  else
63
62
  raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
64
63
  end
65
64
  elsif first.is_a?(LazyFrame)
66
65
  if how == "vertical"
67
- # TODO
68
- return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel))
66
+ return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
67
+ elsif how == "vertical_relaxed"
68
+ return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
69
+ elsif how == "diagonal"
70
+ return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
69
71
  else
70
- raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
72
+ raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
71
73
  end
72
74
  elsif first.is_a?(Series)
73
75
  # TODO
@@ -90,9 +92,9 @@ module Polars
90
92
 
91
93
  # Create a range of type `Datetime` (or `Date`).
92
94
  #
93
- # @param low [Object]
95
+ # @param start [Object]
94
96
  # Lower bound of the date range.
95
- # @param high [Object]
97
+ # @param stop [Object]
96
98
  # Upper bound of the date range.
97
99
  # @param interval [Object]
98
100
  # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
@@ -146,8 +148,8 @@ module Polars
146
148
  # # 1985-01-10 00:00:00
147
149
  # # ]
148
150
  def date_range(
149
- low,
150
- high,
151
+ start,
152
+ stop,
151
153
  interval,
152
154
  lazy: false,
153
155
  closed: "both",
@@ -164,39 +166,28 @@ module Polars
164
166
  end
165
167
  end
166
168
 
167
- if low.is_a?(Expr) || high.is_a?(Expr) || lazy
168
- low = Utils.expr_to_lit_or_expr(low, str_to_lit: true)
169
- high = Utils.expr_to_lit_or_expr(high, str_to_lit: true)
170
- return Utils.wrap_expr(
171
- _rb_date_range_lazy(low, high, interval, closed, name, time_zone)
172
- )
169
+ if time_unit.nil?
170
+ if interval.include?("ns")
171
+ time_unit = "ns"
172
+ else
173
+ time_unit = "us"
174
+ end
173
175
  end
174
176
 
175
- low, low_is_date = _ensure_datetime(low)
176
- high, high_is_date = _ensure_datetime(high)
177
+ start_rbexpr = Utils.parse_as_expression(start)
178
+ stop_rbexpr = Utils.parse_as_expression(stop)
177
179
 
178
- if !time_unit.nil?
179
- tu = time_unit
180
- elsif interval.include?("ns")
181
- tu = "ns"
182
- else
183
- tu = "us"
184
- end
180
+ result = Utils.wrap_expr(
181
+ _rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
182
+ )
185
183
 
186
- start = Utils._datetime_to_pl_timestamp(low, tu)
187
- stop = Utils._datetime_to_pl_timestamp(high, tu)
188
- if name.nil?
189
- name = ""
190
- end
184
+ result = result.alias(name.to_s)
191
185
 
192
- dt_range = Utils.wrap_s(
193
- _rb_date_range(start, stop, interval, closed, name, tu, time_zone)
194
- )
195
- if low_is_date && high_is_date && !["h", "m", "s"].any? { |v| _interval_granularity(interval).end_with?(v) }
196
- dt_range = dt_range.cast(Date)
186
+ if !lazy
187
+ return select(result).to_series
197
188
  end
198
189
 
199
- dt_range
190
+ result
200
191
  end
201
192
 
202
193
  # Bin values into discrete values.
@@ -2,23 +2,19 @@ module Polars
2
2
  # Starts a new GroupBy operation.
3
3
  class GroupBy
4
4
  # @private
5
- attr_accessor :_df, :_dataframe_class, :by, :maintain_order
6
-
7
- # @private
8
- def initialize(df, by, dataframe_class, maintain_order: false)
9
- self._df = df
10
- self._dataframe_class = dataframe_class
11
- self.by = by
12
- self.maintain_order = maintain_order
5
+ def initialize(df, by, maintain_order: false)
6
+ @df = df
7
+ @by = by
8
+ @maintain_order = maintain_order
13
9
  end
14
10
 
15
- # Allows iteration over the groups of the groupby operation.
11
+ # Allows iteration over the groups of the group by operation.
16
12
  #
17
13
  # @return [Object]
18
14
  #
19
15
  # @example
20
16
  # df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
21
- # df.groupby("foo", maintain_order: true).each.to_h
17
+ # df.group_by("foo", maintain_order: true).each.to_h
22
18
  # # =>
23
19
  # # {"a"=>shape: (2, 2)
24
20
  # # ┌─────┬─────┐
@@ -41,10 +37,9 @@ module Polars
41
37
 
42
38
  temp_col = "__POLARS_GB_GROUP_INDICES"
43
39
  groups_df =
44
- Utils.wrap_df(_df)
45
- .lazy
40
+ @df.lazy
46
41
  .with_row_count(name: temp_col)
47
- .groupby(by, maintain_order: maintain_order)
42
+ .group_by(@by, maintain_order: @maintain_order)
48
43
  .agg(Polars.col(temp_col))
49
44
  .collect(no_optimization: true)
50
45
 
@@ -52,7 +47,7 @@ module Polars
52
47
 
53
48
  # When grouping by a single column, group name is a single value
54
49
  # When grouping by multiple columns, group name is a tuple of values
55
- if by.is_a?(String) || by.is_a?(Expr)
50
+ if @by.is_a?(String) || @by.is_a?(Expr)
56
51
  _group_names = group_names.to_series.each
57
52
  else
58
53
  _group_names = group_names.iter_rows
@@ -62,10 +57,8 @@ module Polars
62
57
  _current_index = 0
63
58
 
64
59
  while _current_index < _group_indices.length
65
- df = _dataframe_class._from_rbdf(_df)
66
-
67
60
  group_name = _group_names.next
68
- group_data = df[_group_indices[_current_index]]
61
+ group_data = @df[_group_indices[_current_index]]
69
62
  _current_index += 1
70
63
 
71
64
  yield group_name, group_data
@@ -96,7 +89,7 @@ module Polars
96
89
  # "shape" => ["square", "triangle", "square", "triangle", "square"]
97
90
  # }
98
91
  # )
99
- # df.groupby("color").apply { |group_df| group_df.sample(2) }
92
+ # df.group_by("color").apply { |group_df| group_df.sample(2) }
100
93
  # # =>
101
94
  # # shape: (4, 3)
102
95
  # # ┌─────┬───────┬──────────┐
@@ -110,7 +103,7 @@ module Polars
110
103
  # # │ 3 ┆ red ┆ triangle │
111
104
  # # └─────┴───────┴──────────┘
112
105
  # def apply(&f)
113
- # _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
106
+ # _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
114
107
  # end
115
108
 
116
109
  # Use multiple aggregations on columns.
@@ -126,7 +119,7 @@ module Polars
126
119
  # df = Polars::DataFrame.new(
127
120
  # {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
128
121
  # )
129
- # df.groupby("foo", maintain_order: true).agg(
122
+ # df.group_by("foo", maintain_order: true).agg(
130
123
  # [
131
124
  # Polars.sum("bar").suffix("_sum"),
132
125
  # Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
@@ -143,12 +136,10 @@ module Polars
143
136
  # # │ two ┆ 6 ┆ 5 │
144
137
  # # └─────┴─────────┴──────────────┘
145
138
  def agg(aggs)
146
- df = Utils.wrap_df(_df)
147
- .lazy
148
- .groupby(by, maintain_order: maintain_order)
139
+ @df.lazy
140
+ .group_by(@by, maintain_order: @maintain_order)
149
141
  .agg(aggs)
150
- .collect(no_optimization: true, string_cache: false)
151
- _dataframe_class._from_rbdf(df._df)
142
+ .collect(no_optimization: true)
152
143
  end
153
144
 
154
145
  # Get the first `n` rows of each group.
@@ -181,7 +172,7 @@ module Polars
181
172
  # # └─────────┴─────┘
182
173
  #
183
174
  # @example
184
- # df.groupby("letters").head(2).sort("letters")
175
+ # df.group_by("letters").head(2).sort("letters")
185
176
  # # =>
186
177
  # # shape: (5, 2)
187
178
  # # ┌─────────┬─────┐
@@ -196,14 +187,10 @@ module Polars
196
187
  # # │ c ┆ 2 │
197
188
  # # └─────────┴─────┘
198
189
  def head(n = 5)
199
- df = (
200
- Utils.wrap_df(_df)
201
- .lazy
202
- .groupby(by, maintain_order: maintain_order)
203
- .head(n)
204
- .collect(no_optimization: true, string_cache: false)
205
- )
206
- _dataframe_class._from_rbdf(df._df)
190
+ @df.lazy
191
+ .group_by(@by, maintain_order: @maintain_order)
192
+ .head(n)
193
+ .collect(no_optimization: true)
207
194
  end
208
195
 
209
196
  # Get the last `n` rows of each group.
@@ -236,7 +223,7 @@ module Polars
236
223
  # # └─────────┴─────┘
237
224
  #
238
225
  # @example
239
- # df.groupby("letters").tail(2).sort("letters")
226
+ # df.group_by("letters").tail(2).sort("letters")
240
227
  # # =>
241
228
  # # shape: (5, 2)
242
229
  # # ┌─────────┬─────┐
@@ -251,14 +238,10 @@ module Polars
251
238
  # # │ c ┆ 4 │
252
239
  # # └─────────┴─────┘
253
240
  def tail(n = 5)
254
- df = (
255
- Utils.wrap_df(_df)
256
- .lazy
257
- .groupby(by, maintain_order: maintain_order)
258
- .tail(n)
259
- .collect(no_optimization: true, string_cache: false)
260
- )
261
- _dataframe_class._from_rbdf(df._df)
241
+ @df.lazy
242
+ .group_by(@by, maintain_order: @maintain_order)
243
+ .tail(n)
244
+ .collect(no_optimization: true)
262
245
  end
263
246
 
264
247
  # Aggregate the first values in the group.
@@ -274,7 +257,7 @@ module Polars
274
257
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
275
258
  # }
276
259
  # )
277
- # df.groupby("d", maintain_order: true).first
260
+ # df.group_by("d", maintain_order: true).first
278
261
  # # =>
279
262
  # # shape: (3, 4)
280
263
  # # ┌────────┬─────┬──────┬───────┐
@@ -303,7 +286,7 @@ module Polars
303
286
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
304
287
  # }
305
288
  # )
306
- # df.groupby("d", maintain_order: true).last
289
+ # df.group_by("d", maintain_order: true).last
307
290
  # # =>
308
291
  # # shape: (3, 4)
309
292
  # # ┌────────┬─────┬──────┬───────┐
@@ -332,7 +315,7 @@ module Polars
332
315
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
333
316
  # }
334
317
  # )
335
- # df.groupby("d", maintain_order: true).sum
318
+ # df.group_by("d", maintain_order: true).sum
336
319
  # # =>
337
320
  # # shape: (3, 4)
338
321
  # # ┌────────┬─────┬──────┬─────┐
@@ -361,7 +344,7 @@ module Polars
361
344
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
362
345
  # }
363
346
  # )
364
- # df.groupby("d", maintain_order: true).min
347
+ # df.group_by("d", maintain_order: true).min
365
348
  # # =>
366
349
  # # shape: (3, 4)
367
350
  # # ┌────────┬─────┬──────┬───────┐
@@ -390,7 +373,7 @@ module Polars
390
373
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
391
374
  # }
392
375
  # )
393
- # df.groupby("d", maintain_order: true).max
376
+ # df.group_by("d", maintain_order: true).max
394
377
  # # =>
395
378
  # # shape: (3, 4)
396
379
  # # ┌────────┬─────┬──────┬──────┐
@@ -419,7 +402,7 @@ module Polars
419
402
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
420
403
  # }
421
404
  # )
422
- # df.groupby("d", maintain_order: true).count
405
+ # df.group_by("d", maintain_order: true).count
423
406
  # # =>
424
407
  # # shape: (3, 2)
425
408
  # # ┌────────┬───────┐
@@ -448,7 +431,7 @@ module Polars
448
431
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
449
432
  # }
450
433
  # )
451
- # df.groupby("d", maintain_order: true).mean
434
+ # df.group_by("d", maintain_order: true).mean
452
435
  # # =>
453
436
  # # shape: (3, 4)
454
437
  # # ┌────────┬─────┬──────────┬──────────┐
@@ -476,7 +459,7 @@ module Polars
476
459
  # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
477
460
  # }
478
461
  # )
479
- # df.groupby("d", maintain_order: true).n_unique
462
+ # df.group_by("d", maintain_order: true).n_unique
480
463
  # # =>
481
464
  # # shape: (2, 3)
482
465
  # # ┌────────┬─────┬─────┐
@@ -508,7 +491,7 @@ module Polars
508
491
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
509
492
  # }
510
493
  # )
511
- # df.groupby("d", maintain_order: true).quantile(1)
494
+ # df.group_by("d", maintain_order: true).quantile(1)
512
495
  # # =>
513
496
  # # shape: (3, 3)
514
497
  # # ┌────────┬─────┬──────┐
@@ -536,7 +519,7 @@ module Polars
536
519
  # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
537
520
  # }
538
521
  # )
539
- # df.groupby("d", maintain_order: true).median
522
+ # df.group_by("d", maintain_order: true).median
540
523
  # # =>
541
524
  # # shape: (2, 3)
542
525
  # # ┌────────┬─────┬──────┐
@@ -551,36 +534,15 @@ module Polars
551
534
  agg(Polars.all.median)
552
535
  end
553
536
 
554
- # Aggregate the groups into Series.
555
- #
556
- # @return [DataFrame]
557
- #
558
- # @example
559
- # df = Polars::DataFrame.new({"a" => ["one", "two", "one", "two"], "b" => [1, 2, 3, 4]})
560
- # df.groupby("a", maintain_order: true).agg_list
561
- # # =>
562
- # # shape: (2, 2)
563
- # # ┌─────┬─────────────────┐
564
- # # │ a ┆ b │
565
- # # │ --- ┆ --- │
566
- # # │ str ┆ list[list[i64]] │
567
- # # ╞═════╪═════════════════╡
568
- # # │ one ┆ [[1, 3]] │
569
- # # │ two ┆ [[2, 4]] │
570
- # # └─────┴─────────────────┘
571
- def agg_list
572
- agg(Polars.all.list)
573
- end
574
-
575
537
  # Plot data.
576
538
  #
577
539
  # @return [Vega::LiteChart]
578
540
  def plot(*args, **options)
579
- raise ArgumentError, "Multiple groups not supported" if by.is_a?(Array) && by.size > 1
541
+ raise ArgumentError, "Multiple groups not supported" if @by.is_a?(::Array) && @by.size > 1
580
542
  # same message as Ruby
581
543
  raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
582
544
 
583
- Utils.wrap_df(_df).plot(*args, **options, group: by)
545
+ @df.plot(*args, **options, group: @by)
584
546
  end
585
547
  end
586
548
  end
data/lib/polars/io.rb CHANGED
@@ -621,11 +621,46 @@ module Polars
621
621
  else
622
622
  raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
623
623
  end
624
+
624
625
  data = {}
626
+ schema_overrides = {}
627
+
625
628
  result.columns.each_with_index do |k, i|
626
- data[k] = result.rows.map { |r| r[i] }
629
+ column_type = result.column_types[i]
630
+
631
+ data[k] =
632
+ if column_type
633
+ result.rows.map { |r| column_type.deserialize(r[i]) }
634
+ else
635
+ result.rows.map { |r| r[i] }
636
+ end
637
+
638
+ polars_type =
639
+ case column_type&.type
640
+ when :binary
641
+ Binary
642
+ when :boolean
643
+ Boolean
644
+ when :date
645
+ Date
646
+ when :datetime, :timestamp
647
+ Datetime
648
+ when :decimal
649
+ Decimal
650
+ when :float
651
+ Float64
652
+ when :integer
653
+ Int64
654
+ when :string, :text
655
+ Utf8
656
+ when :time
657
+ Time
658
+ end
659
+
660
+ schema_overrides[k] = polars_type if polars_type
627
661
  end
628
- DataFrame.new(data)
662
+
663
+ DataFrame.new(data, schema_overrides: schema_overrides)
629
664
  end
630
665
  alias_method :read_sql, :read_database
631
666