polars-df 0.6.0-x86_64-darwin → 0.8.0-x86_64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,8 +19,8 @@ module Polars
19
19
  # DataFrames/Series/LazyFrames to concatenate.
20
20
  # @param rechunk [Boolean]
21
21
  # Make sure that all data is in contiguous memory.
22
- # @param how ["vertical", "diagonal", "horizontal"]
23
- # Lazy only supports the 'vertical' strategy.
22
+ # @param how ["vertical", "vertical_relaxed", "diagonal", "horizontal"]
23
+ # LazyFrames do not support the `horizontal` strategy.
24
24
  #
25
25
  # - Vertical: applies multiple `vstack` operations.
26
26
  # - Diagonal: finds a union between the column schemas and fills missing column values with null.
@@ -55,18 +55,21 @@ module Polars
55
55
  if how == "vertical"
56
56
  out = Utils.wrap_df(_concat_df(items))
57
57
  elsif how == "diagonal"
58
- out = Utils.wrap_df(_diag_concat_df(items))
58
+ out = Utils.wrap_df(_concat_df_diagonal(items))
59
59
  elsif how == "horizontal"
60
- out = Utils.wrap_df(_hor_concat_df(items))
60
+ out = Utils.wrap_df(_concat_df_horizontal(items))
61
61
  else
62
62
  raise ArgumentError, "how must be one of {{'vertical', 'diagonal', 'horizontal'}}, got #{how}"
63
63
  end
64
64
  elsif first.is_a?(LazyFrame)
65
65
  if how == "vertical"
66
- # TODO
67
- return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel))
66
+ return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, false))
67
+ elsif how == "vertical_relaxed"
68
+ return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel, true))
69
+ elsif how == "diagonal"
70
+ return Utils.wrap_ldf(_concat_lf_diagonal(items, rechunk, parallel, false))
68
71
  else
69
- raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
72
+ raise ArgumentError, "Lazy only allows 'vertical', 'vertical_relaxed', and 'diagonal' concat strategy."
70
73
  end
71
74
  elsif first.is_a?(Series)
72
75
  # TODO
@@ -89,9 +92,9 @@ module Polars
89
92
 
90
93
  # Create a range of type `Datetime` (or `Date`).
91
94
  #
92
- # @param low [Object]
95
+ # @param start [Object]
93
96
  # Lower bound of the date range.
94
- # @param high [Object]
97
+ # @param stop [Object]
95
98
  # Upper bound of the date range.
96
99
  # @param interval [Object]
97
100
  # Interval periods. It can be a polars duration string, such as `3d12h4m25s`
@@ -145,8 +148,8 @@ module Polars
145
148
  # # 1985-01-10 00:00:00
146
149
  # # ]
147
150
  def date_range(
148
- low,
149
- high,
151
+ start,
152
+ stop,
150
153
  interval,
151
154
  lazy: false,
152
155
  closed: "both",
@@ -163,39 +166,28 @@ module Polars
163
166
  end
164
167
  end
165
168
 
166
- if low.is_a?(Expr) || high.is_a?(Expr) || lazy
167
- low = Utils.expr_to_lit_or_expr(low, str_to_lit: true)
168
- high = Utils.expr_to_lit_or_expr(high, str_to_lit: true)
169
- return Utils.wrap_expr(
170
- _rb_date_range_lazy(low, high, interval, closed, name, time_zone)
171
- )
169
+ if time_unit.nil?
170
+ if interval.include?("ns")
171
+ time_unit = "ns"
172
+ else
173
+ time_unit = "us"
174
+ end
172
175
  end
173
176
 
174
- low, low_is_date = _ensure_datetime(low)
175
- high, high_is_date = _ensure_datetime(high)
177
+ start_rbexpr = Utils.parse_as_expression(start)
178
+ stop_rbexpr = Utils.parse_as_expression(stop)
176
179
 
177
- if !time_unit.nil?
178
- tu = time_unit
179
- elsif interval.include?("ns")
180
- tu = "ns"
181
- else
182
- tu = "us"
183
- end
180
+ result = Utils.wrap_expr(
181
+ _rb_date_range(start_rbexpr, stop_rbexpr, interval, closed, time_unit, time_zone)
182
+ )
184
183
 
185
- start = Utils._datetime_to_pl_timestamp(low, tu)
186
- stop = Utils._datetime_to_pl_timestamp(high, tu)
187
- if name.nil?
188
- name = ""
189
- end
184
+ result = result.alias(name.to_s)
190
185
 
191
- dt_range = Utils.wrap_s(
192
- _rb_date_range(start, stop, interval, closed, name, tu, time_zone)
193
- )
194
- if low_is_date && high_is_date && !["h", "m", "s"].any? { |v| _interval_granularity(interval).end_with?(v) }
195
- dt_range = dt_range.cast(Date)
186
+ if !lazy
187
+ return select(result).to_series
196
188
  end
197
189
 
198
- dt_range
190
+ result
199
191
  end
200
192
 
201
193
  # Bin values into discrete values.
@@ -2,23 +2,19 @@ module Polars
2
2
  # Starts a new GroupBy operation.
3
3
  class GroupBy
4
4
  # @private
5
- attr_accessor :_df, :_dataframe_class, :by, :maintain_order
6
-
7
- # @private
8
- def initialize(df, by, dataframe_class, maintain_order: false)
9
- self._df = df
10
- self._dataframe_class = dataframe_class
11
- self.by = by
12
- self.maintain_order = maintain_order
5
+ def initialize(df, by, maintain_order: false)
6
+ @df = df
7
+ @by = by
8
+ @maintain_order = maintain_order
13
9
  end
14
10
 
15
- # Allows iteration over the groups of the groupby operation.
11
+ # Allows iteration over the groups of the group by operation.
16
12
  #
17
13
  # @return [Object]
18
14
  #
19
15
  # @example
20
16
  # df = Polars::DataFrame.new({"foo" => ["a", "a", "b"], "bar" => [1, 2, 3]})
21
- # df.groupby("foo", maintain_order: true).each.to_h
17
+ # df.group_by("foo", maintain_order: true).each.to_h
22
18
  # # =>
23
19
  # # {"a"=>shape: (2, 2)
24
20
  # # ┌─────┬─────┐
@@ -41,10 +37,9 @@ module Polars
41
37
 
42
38
  temp_col = "__POLARS_GB_GROUP_INDICES"
43
39
  groups_df =
44
- Utils.wrap_df(_df)
45
- .lazy
40
+ @df.lazy
46
41
  .with_row_count(name: temp_col)
47
- .groupby(by, maintain_order: maintain_order)
42
+ .group_by(@by, maintain_order: @maintain_order)
48
43
  .agg(Polars.col(temp_col))
49
44
  .collect(no_optimization: true)
50
45
 
@@ -52,7 +47,7 @@ module Polars
52
47
 
53
48
  # When grouping by a single column, group name is a single value
54
49
  # When grouping by multiple columns, group name is a tuple of values
55
- if by.is_a?(String) || by.is_a?(Expr)
50
+ if @by.is_a?(::String) || @by.is_a?(Expr)
56
51
  _group_names = group_names.to_series.each
57
52
  else
58
53
  _group_names = group_names.iter_rows
@@ -62,10 +57,8 @@ module Polars
62
57
  _current_index = 0
63
58
 
64
59
  while _current_index < _group_indices.length
65
- df = _dataframe_class._from_rbdf(_df)
66
-
67
60
  group_name = _group_names.next
68
- group_data = df[_group_indices[_current_index]]
61
+ group_data = @df[_group_indices[_current_index]]
69
62
  _current_index += 1
70
63
 
71
64
  yield group_name, group_data
@@ -96,7 +89,7 @@ module Polars
96
89
  # "shape" => ["square", "triangle", "square", "triangle", "square"]
97
90
  # }
98
91
  # )
99
- # df.groupby("color").apply { |group_df| group_df.sample(2) }
92
+ # df.group_by("color").apply { |group_df| group_df.sample(2) }
100
93
  # # =>
101
94
  # # shape: (4, 3)
102
95
  # # ┌─────┬───────┬──────────┐
@@ -110,7 +103,7 @@ module Polars
110
103
  # # │ 3 ┆ red ┆ triangle │
111
104
  # # └─────┴───────┴──────────┘
112
105
  # def apply(&f)
113
- # _dataframe_class._from_rbdf(_df.groupby_apply(by, f))
106
+ # _dataframe_class._from_rbdf(_df.group_by_apply(by, f))
114
107
  # end
115
108
 
116
109
  # Use multiple aggregations on columns.
@@ -126,7 +119,7 @@ module Polars
126
119
  # df = Polars::DataFrame.new(
127
120
  # {"foo" => ["one", "two", "two", "one", "two"], "bar" => [5, 3, 2, 4, 1]}
128
121
  # )
129
- # df.groupby("foo", maintain_order: true).agg(
122
+ # df.group_by("foo", maintain_order: true).agg(
130
123
  # [
131
124
  # Polars.sum("bar").suffix("_sum"),
132
125
  # Polars.col("bar").sort.tail(2).sum.suffix("_tail_sum")
@@ -143,12 +136,10 @@ module Polars
143
136
  # # │ two ┆ 6 ┆ 5 │
144
137
  # # └─────┴─────────┴──────────────┘
145
138
  def agg(aggs)
146
- df = Utils.wrap_df(_df)
147
- .lazy
148
- .groupby(by, maintain_order: maintain_order)
139
+ @df.lazy
140
+ .group_by(@by, maintain_order: @maintain_order)
149
141
  .agg(aggs)
150
- .collect(no_optimization: true, string_cache: false)
151
- _dataframe_class._from_rbdf(df._df)
142
+ .collect(no_optimization: true)
152
143
  end
153
144
 
154
145
  # Get the first `n` rows of each group.
@@ -181,7 +172,7 @@ module Polars
181
172
  # # └─────────┴─────┘
182
173
  #
183
174
  # @example
184
- # df.groupby("letters").head(2).sort("letters")
175
+ # df.group_by("letters").head(2).sort("letters")
185
176
  # # =>
186
177
  # # shape: (5, 2)
187
178
  # # ┌─────────┬─────┐
@@ -196,14 +187,10 @@ module Polars
196
187
  # # │ c ┆ 2 │
197
188
  # # └─────────┴─────┘
198
189
  def head(n = 5)
199
- df = (
200
- Utils.wrap_df(_df)
201
- .lazy
202
- .groupby(by, maintain_order: maintain_order)
203
- .head(n)
204
- .collect(no_optimization: true, string_cache: false)
205
- )
206
- _dataframe_class._from_rbdf(df._df)
190
+ @df.lazy
191
+ .group_by(@by, maintain_order: @maintain_order)
192
+ .head(n)
193
+ .collect(no_optimization: true)
207
194
  end
208
195
 
209
196
  # Get the last `n` rows of each group.
@@ -236,7 +223,7 @@ module Polars
236
223
  # # └─────────┴─────┘
237
224
  #
238
225
  # @example
239
- # df.groupby("letters").tail(2).sort("letters")
226
+ # df.group_by("letters").tail(2).sort("letters")
240
227
  # # =>
241
228
  # # shape: (5, 2)
242
229
  # # ┌─────────┬─────┐
@@ -251,14 +238,10 @@ module Polars
251
238
  # # │ c ┆ 4 │
252
239
  # # └─────────┴─────┘
253
240
  def tail(n = 5)
254
- df = (
255
- Utils.wrap_df(_df)
256
- .lazy
257
- .groupby(by, maintain_order: maintain_order)
258
- .tail(n)
259
- .collect(no_optimization: true, string_cache: false)
260
- )
261
- _dataframe_class._from_rbdf(df._df)
241
+ @df.lazy
242
+ .group_by(@by, maintain_order: @maintain_order)
243
+ .tail(n)
244
+ .collect(no_optimization: true)
262
245
  end
263
246
 
264
247
  # Aggregate the first values in the group.
@@ -274,7 +257,7 @@ module Polars
274
257
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
275
258
  # }
276
259
  # )
277
- # df.groupby("d", maintain_order: true).first
260
+ # df.group_by("d", maintain_order: true).first
278
261
  # # =>
279
262
  # # shape: (3, 4)
280
263
  # # ┌────────┬─────┬──────┬───────┐
@@ -303,7 +286,7 @@ module Polars
303
286
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
304
287
  # }
305
288
  # )
306
- # df.groupby("d", maintain_order: true).last
289
+ # df.group_by("d", maintain_order: true).last
307
290
  # # =>
308
291
  # # shape: (3, 4)
309
292
  # # ┌────────┬─────┬──────┬───────┐
@@ -332,7 +315,7 @@ module Polars
332
315
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
333
316
  # }
334
317
  # )
335
- # df.groupby("d", maintain_order: true).sum
318
+ # df.group_by("d", maintain_order: true).sum
336
319
  # # =>
337
320
  # # shape: (3, 4)
338
321
  # # ┌────────┬─────┬──────┬─────┐
@@ -361,7 +344,7 @@ module Polars
361
344
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"],
362
345
  # }
363
346
  # )
364
- # df.groupby("d", maintain_order: true).min
347
+ # df.group_by("d", maintain_order: true).min
365
348
  # # =>
366
349
  # # shape: (3, 4)
367
350
  # # ┌────────┬─────┬──────┬───────┐
@@ -390,7 +373,7 @@ module Polars
390
373
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
391
374
  # }
392
375
  # )
393
- # df.groupby("d", maintain_order: true).max
376
+ # df.group_by("d", maintain_order: true).max
394
377
  # # =>
395
378
  # # shape: (3, 4)
396
379
  # # ┌────────┬─────┬──────┬──────┐
@@ -419,7 +402,7 @@ module Polars
419
402
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
420
403
  # }
421
404
  # )
422
- # df.groupby("d", maintain_order: true).count
405
+ # df.group_by("d", maintain_order: true).count
423
406
  # # =>
424
407
  # # shape: (3, 2)
425
408
  # # ┌────────┬───────┐
@@ -448,7 +431,7 @@ module Polars
448
431
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
449
432
  # }
450
433
  # )
451
- # df.groupby("d", maintain_order: true).mean
434
+ # df.group_by("d", maintain_order: true).mean
452
435
  # # =>
453
436
  # # shape: (3, 4)
454
437
  # # ┌────────┬─────┬──────────┬──────────┐
@@ -476,7 +459,7 @@ module Polars
476
459
  # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
477
460
  # }
478
461
  # )
479
- # df.groupby("d", maintain_order: true).n_unique
462
+ # df.group_by("d", maintain_order: true).n_unique
480
463
  # # =>
481
464
  # # shape: (2, 3)
482
465
  # # ┌────────┬─────┬─────┐
@@ -508,7 +491,7 @@ module Polars
508
491
  # "d" => ["Apple", "Orange", "Apple", "Apple", "Banana", "Banana"]
509
492
  # }
510
493
  # )
511
- # df.groupby("d", maintain_order: true).quantile(1)
494
+ # df.group_by("d", maintain_order: true).quantile(1)
512
495
  # # =>
513
496
  # # shape: (3, 3)
514
497
  # # ┌────────┬─────┬──────┐
@@ -536,7 +519,7 @@ module Polars
536
519
  # "d" => ["Apple", "Banana", "Apple", "Apple", "Banana", "Banana"]
537
520
  # }
538
521
  # )
539
- # df.groupby("d", maintain_order: true).median
522
+ # df.group_by("d", maintain_order: true).median
540
523
  # # =>
541
524
  # # shape: (2, 3)
542
525
  # # ┌────────┬─────┬──────┐
@@ -555,11 +538,11 @@ module Polars
555
538
  #
556
539
  # @return [Vega::LiteChart]
557
540
  def plot(*args, **options)
558
- raise ArgumentError, "Multiple groups not supported" if by.is_a?(::Array) && by.size > 1
541
+ raise ArgumentError, "Multiple groups not supported" if @by.is_a?(::Array) && @by.size > 1
559
542
  # same message as Ruby
560
543
  raise ArgumentError, "unknown keyword: :group" if options.key?(:group)
561
544
 
562
- Utils.wrap_df(_df).plot(*args, **options, group: by)
545
+ @df.plot(*args, **options, group: @by)
563
546
  end
564
547
  end
565
548
  end
data/lib/polars/io.rb CHANGED
@@ -616,16 +616,51 @@ module Polars
616
616
  query
617
617
  elsif query.is_a?(ActiveRecord::Relation)
618
618
  query.connection.select_all(query.to_sql)
619
- elsif query.is_a?(String)
619
+ elsif query.is_a?(::String)
620
620
  ActiveRecord::Base.connection.select_all(query)
621
621
  else
622
622
  raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
623
623
  end
624
+
624
625
  data = {}
626
+ schema_overrides = {}
627
+
625
628
  result.columns.each_with_index do |k, i|
626
- data[k] = result.rows.map { |r| r[i] }
629
+ column_type = result.column_types[i]
630
+
631
+ data[k] =
632
+ if column_type
633
+ result.rows.map { |r| column_type.deserialize(r[i]) }
634
+ else
635
+ result.rows.map { |r| r[i] }
636
+ end
637
+
638
+ polars_type =
639
+ case column_type&.type
640
+ when :binary
641
+ Binary
642
+ when :boolean
643
+ Boolean
644
+ when :date
645
+ Date
646
+ when :datetime, :timestamp
647
+ Datetime
648
+ when :decimal
649
+ Decimal
650
+ when :float
651
+ Float64
652
+ when :integer
653
+ Int64
654
+ when :string, :text
655
+ String
656
+ when :time
657
+ Time
658
+ end
659
+
660
+ schema_overrides[k] = polars_type if polars_type
627
661
  end
628
- DataFrame.new(data)
662
+
663
+ DataFrame.new(data, schema_overrides: schema_overrides)
629
664
  end
630
665
  alias_method :read_sql, :read_database
631
666
 
@@ -821,7 +856,7 @@ module Polars
821
856
  private
822
857
 
823
858
  def _prepare_file_arg(file)
824
- if file.is_a?(String) && file =~ /\Ahttps?:\/\//
859
+ if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
825
860
  raise ArgumentError, "use URI(...) for remote files"
826
861
  end
827
862
 
@@ -835,7 +870,7 @@ module Polars
835
870
  end
836
871
 
837
872
  def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
838
- if arg.is_a?(String)
873
+ if arg.is_a?(::String)
839
874
  arg_byte_length = arg.bytesize
840
875
  if can_be_empty
841
876
  if arg_byte_length > 1