polars-df 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,139 @@
module Polars
  # Lazily evaluated query plan. Every method forwards to the native lazy frame
  # held in +_ldf+ and wraps whatever the native call returns.
  class LazyFrame
    # Native lazy frame object that all operations delegate to.
    attr_accessor :_ldf

    # Builds a LazyFrame around an existing native lazy frame, bypassing +initialize+.
    def self._from_rbldf(rb_ldf)
      ldf = LazyFrame.allocate
      ldf._ldf = rb_ldf
      ldf
    end

    # Applies the optimization toggles, collects the native plan and wraps the
    # result with Utils.wrap_df.
    #
    # +no_optimization+ switches off predicate, projection and slice pushdown as
    # well as common subplan elimination. +allow_streaming+ also switches off
    # common subplan elimination. +string_cache+ is accepted but not used here.
    def collect(
      type_coercion: true,
      predicate_pushdown: true,
      projection_pushdown: true,
      simplify_expression: true,
      string_cache: false,
      no_optimization: false,
      slice_pushdown: true,
      common_subplan_elimination: true,
      allow_streaming: false
    )
      if no_optimization
        predicate_pushdown = false
        projection_pushdown = false
        slice_pushdown = false
        common_subplan_elimination = false
      end

      common_subplan_elimination = false if allow_streaming

      ldf = _ldf.optimization_toggle(
        type_coercion,
        predicate_pushdown,
        projection_pushdown,
        simplify_expression,
        slice_pushdown,
        common_subplan_elimination,
        allow_streaming
      )
      Utils.wrap_df(ldf.collect)
    end

    # Returns a new lazy frame filtered by +predicate+. A String predicate is
    # treated as a column name rather than a literal.
    def filter(predicate)
      self.class._from_rbldf(
        _ldf.filter(
          Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
        )
      )
    end

    # Returns a new lazy frame selecting +exprs+ (a single selection or a list).
    def select(exprs)
      exprs = Utils.selection_to_rbexpr_list(exprs)
      self.class._from_rbldf(_ldf.select(exprs))
    end

    # Starts a group-by on +by+ and returns a LazyGroupBy that builds frames of
    # this same class.
    def groupby(by, maintain_order: false)
      rbexprs_by = Utils.selection_to_rbexpr_list(by)
      lgb = _ldf.groupby(rbexprs_by, maintain_order)
      LazyGroupBy.new(lgb, self.class)
    end

    # Joins this lazy frame with +other+.
    #
    # Keys come from +on+ (used for both sides) or from +left_on+ together with
    # +right_on+. A "cross" join passes empty key lists.
    #
    # Raises ArgumentError when +other+ is not a LazyFrame, or when a non-cross
    # join has neither +on+ nor both +left_on+ and +right_on+.
    def join(
      other,
      left_on: nil,
      right_on: nil,
      on: nil,
      how: "inner",
      suffix: "_right",
      allow_parallel: true,
      force_parallel: false
    )
      if !other.is_a?(LazyFrame)
        raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
      end

      if how == "cross"
        return self.class._from_rbldf(
          _ldf.join(
            other._ldf, [], [], allow_parallel, force_parallel, how, suffix
          )
        )
      end

      if !on.nil?
        rbexprs = Utils.selection_to_rbexpr_list(on)
        rbexprs_left = rbexprs
        rbexprs_right = rbexprs
      elsif !left_on.nil? && !right_on.nil?
        rbexprs_left = Utils.selection_to_rbexpr_list(left_on)
        rbexprs_right = Utils.selection_to_rbexpr_list(right_on)
      else
        raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
      end

      self.class._from_rbldf(
        _ldf.join(
          other._ldf,
          rbexprs_left,
          rbexprs_right,
          allow_parallel,
          force_parallel,
          how,
          suffix
        )
      )
    end

    # Returns a new lazy frame with the given columns added.
    #
    # +exprs+ may be nil (no columns), a single Expr, or anything responding to
    # +to_a+ whose elements are Expr or Series objects. A Series element is
    # turned into a literal expression.
    #
    # Raises ArgumentError for any other element type.
    def with_columns(exprs)
      exprs =
        if exprs.nil?
          []
        elsif exprs.is_a?(Expr)
          [exprs]
        else
          exprs.to_a
        end

      rbexprs = exprs.map do |e|
        case e
        when Expr
          e._rbexpr
        when Series
          # Must yield one more list element: the list itself is what gets
          # handed to the native call below.
          Utils.lit(e)._rbexpr
        else
          raise ArgumentError, "Expected an expression, got #{e}"
        end
      end

      self.class._from_rbldf(_ldf.with_columns(rbexprs))
    end

    # Single-column convenience wrapper around #with_columns.
    def with_column(column)
      with_columns([column])
    end
  end
end
@@ -0,0 +1,121 @@
module Polars
  # Expression constructors and aggregation shortcuts. Methods that accept a
  # Series compute eagerly on it; otherwise they build an expression.
  module LazyFunctions
    # Expression referring to the column +name+ (String or Symbol).
    def col(name)
      name = name.to_s if name.is_a?(Symbol)
      Utils.wrap_expr(RbExpr.col(name))
    end

    # Standard deviation of a Series, or of the named column as an expression.
    def std(column, ddof: 1)
      if column.is_a?(Series)
        column.std(ddof: ddof)
      else
        col(column).std(ddof: ddof)
      end
    end

    # Variance of a Series, or of the named column as an expression.
    def var(column, ddof: 1)
      if column.is_a?(Series)
        column.var(ddof: ddof)
      else
        col(column).var(ddof: ddof)
      end
    end

    # Maximum of a Series, of a named column, or across a list of selections.
    def max(column)
      if column.is_a?(Series)
        column.max
      elsif column.is_a?(String) || column.is_a?(Symbol)
        col(column).max
      else
        exprs = Utils.selection_to_rbexpr_list(column)
        # TODO: _max_exprs is not defined in this module.
        Utils.wrap_expr(_max_exprs(exprs))
      end
    end

    # Minimum of a Series, of a named column, or across a list of selections.
    def min(column)
      if column.is_a?(Series)
        column.min
      elsif column.is_a?(String) || column.is_a?(Symbol)
        col(column).min
      else
        exprs = Utils.selection_to_rbexpr_list(column)
        # TODO: _min_exprs is not defined in this module.
        Utils.wrap_expr(_min_exprs(exprs))
      end
    end

    # Sum of a Series, of a named column, or across an Array of selections.
    # Any other argument raises.
    def sum(column)
      if column.is_a?(Series)
        column.sum
      elsif column.is_a?(String) || column.is_a?(Symbol)
        # col already converts Symbols, as in max and min.
        col(column).sum
      elsif column.is_a?(Array)
        exprs = Utils.selection_to_rbexpr_list(column)
        # TODO: _sum_exprs is not defined in this module.
        Utils.wrap_expr(_sum_exprs(exprs))
      else
        raise "todo"
      end
    end

    # Mean of a Series, or of the named column as an expression.
    def mean(column)
      if column.is_a?(Series)
        column.mean
      else
        col(column).mean
      end
    end

    # Alias for #mean.
    def avg(column)
      mean(column)
    end

    # Median of a Series, or of the named column as an expression.
    def median(column)
      if column.is_a?(Series)
        column.median
      else
        col(column).median
      end
    end

    # Expression holding the literal +value+.
    def lit(value)
      Utils.wrap_expr(RbExpr.lit(value))
    end

    # Range expression from +low+ to +high+ in increments of +step+.
    #
    # String bounds are treated as column names. The range is cast to +dtype+
    # unless it is nil or names i64, given as either "i64" or :i64. With
    # +eager+ the expression is evaluated at once and returned as a Series
    # named "arange".
    def arange(low, high, step: 1, eager: false, dtype: nil)
      low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
      high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
      range_expr = Utils.wrap_expr(RbExpr.arange(low._rbexpr, high._rbexpr, step))

      # Compare by name so the Symbol and String spellings are treated alike.
      unless dtype.nil? || dtype.to_s == "i64"
        range_expr = range_expr.cast(dtype)
      end

      return range_expr unless eager

      DataFrame.new
        .select(range_expr)
        .to_series
        .rename("arange", in_place: true)
    end

    # With no argument: expression selecting every column ("*").
    # With a column name: that column's +all+ expression.
    # Any other argument raises.
    def all(name = nil)
      if name.nil?
        col("*")
      elsif name.is_a?(String) || name.is_a?(Symbol)
        col(name).all
      else
        raise "todo"
      end
    end

    # Starts a when/then chain. A String +expr+ becomes a literal here, not a
    # column reference.
    def when(expr)
      expr = Utils.expr_to_lit_or_expr(expr)
      pw = RbExpr.when(expr._rbexpr)
      When.new(pw)
    end
  end
end
@@ -0,0 +1,13 @@
module Polars
  # Pending group-by on a lazy frame: holds the native group-by object and the
  # frame class used to wrap the aggregated result.
  class LazyGroupBy
    def initialize(lgb, lazyframe_class)
      @lgb, @lazyframe_class = lgb, lazyframe_class
    end

    # Applies the aggregation selections +aggs+ and returns the resulting lazy frame.
    def agg(aggs)
      agg_rbexprs = Utils.selection_to_rbexpr_list(aggs)
      aggregated = @lgb.agg(agg_rbexprs)
      @lazyframe_class._from_rbldf(aggregated)
    end
  end
end
@@ -0,0 +1,261 @@
module Polars
  # One-dimensional named column backed by a native series held in +_s+.
  class Series
    # Native series object that all operations delegate to.
    attr_accessor :_s

    # Builds a series from +values+ (nil, a Range or an Array).
    #
    # +name+ must be a String. As a shortcut, a non-String first argument with
    # no second argument is taken as the values and the name defaults to "".
    # +nan_to_null+ is accepted but not used here.
    #
    # Raises ArgumentError when both arguments are given and the name is not a
    # String, or when +values+ has an unsupported type.
    def initialize(name = nil, values = nil, dtype: nil, strict: true, nan_to_null: false, dtype_if_empty: nil)
      # Handle case where values are passed as the first argument
      if !name.nil? && !name.is_a?(String)
        if values.nil?
          values = name
          name = nil
        else
          raise ArgumentError, "Series name must be a string."
        end
      end

      name = "" if name.nil?

      if values.nil?
        self._s = sequence_to_rbseries(name, [], dtype: dtype, dtype_if_empty: dtype_if_empty)
      elsif values.is_a?(Range)
        # The upper bound is bumped by one for inclusive ranges (1..3).
        self._s =
          Polars.arange(
            values.first,
            values.last + (values.exclude_end? ? 0 : 1),
            step: 1,
            eager: true,
            dtype: dtype
          )
            .rename(name, in_place: true)
            ._s
      elsif values.is_a?(Array)
        self._s = sequence_to_rbseries(name, values, dtype: dtype, strict: strict, dtype_if_empty: dtype_if_empty)
      else
        raise ArgumentError, "Series constructor called with unsupported type; got #{values.class.name}"
      end
    end

    # Wraps an existing native series, bypassing +initialize+.
    def self._from_rbseries(s)
      series = Series.allocate
      series._s = s
      series
    end

    # Data type reported by the native series, as a Symbol.
    def dtype
      _s.dtype.to_sym
    end

    # Name of the series.
    def name
      _s.name
    end

    # One-element array holding the length.
    def shape
      [_s.len]
    end

    # String rendering produced by the native series; also used for +inspect+.
    def to_s
      _s.to_s
    end
    alias_method :inspect, :to_s

    # The binary operators below all expect +other+ to be a Series.

    def &(other)
      Utils.wrap_s(_s.bitand(other._s))
    end

    def |(other)
      Utils.wrap_s(_s.bitor(other._s))
    end

    def ^(other)
      Utils.wrap_s(_s.bitxor(other._s))
    end

    def +(other)
      Utils.wrap_s(_s.add(other._s))
    end

    def -(other)
      Utils.wrap_s(_s.sub(other._s))
    end

    def *(other)
      Utils.wrap_s(_s.mul(other._s))
    end

    def /(other)
      Utils.wrap_s(_s.div(other._s))
    end

    # Single-column DataFrame containing this series.
    def to_frame
      Utils.wrap_df(RbDataFrame.new([_s]))
    end

    def sum
      _s.sum
    end

    def mean
      _s.mean
    end

    def min
      _s.min
    end

    def max
      _s.max
    end

    # Returns a renamed copy and leaves the receiver's name untouched.
    def alias(name)
      # Object#dup is shallow: the duplicate would point at the same native
      # series, and the in-place rename would rename the receiver as well.
      # A full-length slice gives the copy its own native series object
      # (presumably with an independent name - confirm against the extension).
      s = Utils.wrap_s(_s.slice(0, _s.len))
      s._s.rename(name)
      s
    end

    # Renames the series: in place (returning self) or as a renamed copy.
    def rename(name, in_place: false)
      if in_place
        _s.rename(name)
        self
      else
        self.alias(name)
      end
    end

    def chunk_lengths
      _s.chunk_lengths
    end

    def n_chunks
      _s.n_chunks
    end

    def cumsum(reverse: false)
      Utils.wrap_s(_s.cumsum(reverse))
    end

    def cummin(reverse: false)
      Utils.wrap_s(_s.cummin(reverse))
    end

    def cummax(reverse: false)
      Utils.wrap_s(_s.cummax(reverse))
    end

    # Applies the +limit(n)+ expression to this series via a one-column frame.
    def limit(n = 10)
      to_frame.select(Utils.col(name).limit(n)).to_series
    end

    # Slice starting at +offset+; +length+ defaults to the full series length.
    def slice(offset, length = nil)
      length = len if length.nil?
      Utils.wrap_s(_s.slice(offset, length))
    end

    # Appends +other+ to this series in place and returns self.
    def append(other)
      _s.append(other._s)
      self
    end

    # Filters by +predicate+, which must be a Series.
    def filter(predicate)
      Utils.wrap_s(_s.filter(predicate._s))
    end

    # Applies the +head(n)+ expression to this series via a one-column frame.
    def head(n = 10)
      to_frame.select(Utils.col(name).head(n)).to_series
    end

    # Applies the +tail(n)+ expression to this series via a one-column frame.
    def tail(n = 10)
      to_frame.select(Utils.col(name).tail(n)).to_series
    end

    # Sorted series; with +in_place+ the receiver adopts the sorted native
    # series and is returned.
    def sort(reverse: false, in_place: false)
      if in_place
        self._s = _s.sort(reverse)
        self
      else
        Utils.wrap_s(_s.sort(reverse))
      end
    end

    def to_a
      _s.to_a
    end

    def len
      _s.len
    end

    # Rechunks through the native series; returns self when +in_place+,
    # otherwise a new Series wrapping the native result.
    def rechunk(in_place: false)
      opt_s = _s.rechunk(in_place)
      in_place ? self : Utils.wrap_s(opt_s)
    end

    private

    # Picks a native constructor for +values+ and builds the native series.
    #
    # An explicit +dtype+ always wins. Without one, an empty input uses
    # +dtype_if_empty+ (falling back to "f32") and a non-empty input is typed
    # from its first non-nil element.
    def sequence_to_rbseries(name, values, dtype: nil, strict: true, dtype_if_empty: nil)
      values = [] if values.nil?

      if values.empty? && dtype.nil?
        # Use the caller's guess for an empty sequence (e.g. comparisons
        # between self and other), otherwise default to Float32.
        dtype = dtype_if_empty || "f32"
      end

      if !dtype.nil? && is_polars_dtype(dtype)
        return polars_type_to_constructor(dtype).call(name, values, strict)
      end

      # _get_first_non_none
      value = values.find { |v| !v.nil? }
      rb_type_to_constructor(value.class).call(name, values, strict)
    end

    # Native constructors keyed by dtype name.
    POLARS_TYPE_TO_CONSTRUCTOR = {
      f32: RbSeries.method(:new_opt_f32),
      f64: RbSeries.method(:new_opt_f64),
      i8: RbSeries.method(:new_opt_i8),
      i16: RbSeries.method(:new_opt_i16),
      i32: RbSeries.method(:new_opt_i32),
      i64: RbSeries.method(:new_opt_i64),
      u8: RbSeries.method(:new_opt_u8),
      u16: RbSeries.method(:new_opt_u16),
      u32: RbSeries.method(:new_opt_u32),
      u64: RbSeries.method(:new_opt_u64),
      bool: RbSeries.method(:new_opt_bool),
      str: RbSeries.method(:new_str)
    }.freeze

    # Looks up the constructor for +dtype+ (String or Symbol).
    # Raises ArgumentError for an unknown dtype.
    def polars_type_to_constructor(dtype)
      POLARS_TYPE_TO_CONSTRUCTOR.fetch(dtype.to_sym)
    rescue KeyError
      raise ArgumentError, "Cannot construct RbSeries for type #{dtype}."
    end

    # Native constructors keyed by the Ruby class of the first non-nil value.
    RB_TYPE_TO_CONSTRUCTOR = {
      Float => RbSeries.method(:new_opt_f64),
      Integer => RbSeries.method(:new_opt_i64),
      String => RbSeries.method(:new_str),
      TrueClass => RbSeries.method(:new_opt_bool),
      FalseClass => RbSeries.method(:new_opt_bool)
    }.freeze

    # Looks up the constructor for a Ruby class.
    # Raises ArgumentError when the class has no mapping (including NilClass,
    # i.e. when every value is nil).
    def rb_type_to_constructor(dtype)
      RB_TYPE_TO_CONSTRUCTOR.fetch(dtype)
    rescue KeyError
      # RbSeries.method(:new_object)
      raise ArgumentError, "Cannot determine type"
    end

    # Placeholder check: currently accepts every value.
    def is_polars_dtype(data_type)
      true
    end
  end
end
@@ -0,0 +1,17 @@
module Polars
  # String operations for an expression; shares the native expression of the
  # Expr it was built from.
  class StringExpr
    attr_accessor :_rbexpr

    def initialize(expr)
      @_rbexpr = expr._rbexpr
    end

    # Expression built from the native +str_lengths+ call.
    def lengths
      native = _rbexpr.str_lengths
      Utils.wrap_expr(native)
    end

    # Expression built from the native +str_contains+ call with +pattern+ and
    # the +literal+ flag.
    def contains(pattern, literal: false)
      native = _rbexpr.str_contains(pattern, literal)
      Utils.wrap_expr(native)
    end
  end
end
@@ -0,0 +1,47 @@
module Polars
  # Internal helpers for wrapping native objects and coercing user input into
  # expressions.
  module Utils
    # Wraps a native series in a Series.
    def self.wrap_s(s)
      Series._from_rbseries(s)
    end

    # Wraps a native data frame in a DataFrame.
    def self.wrap_df(df)
      DataFrame._from_rbdf(df)
    end

    # Wraps a native expression in an Expr.
    def self.wrap_expr(rbexpr)
      Expr._from_rbexpr(rbexpr)
    end

    # Shortcut for Polars.col.
    def self.col(name)
      Polars.col(name)
    end

    # Converts a selection into a list of native expressions.
    #
    # A single String, Symbol, Expr or Series counts as a one-element list.
    # Inside the list, Strings and Symbols are column names; other supported
    # values become literals (see .expr_to_lit_or_expr).
    def self.selection_to_rbexpr_list(exprs)
      if exprs.is_a?(String) || exprs.is_a?(Symbol) || exprs.is_a?(Expr) || exprs.is_a?(Series)
        exprs = [exprs]
      end

      exprs.map { |e| expr_to_lit_or_expr(e, str_to_lit: false)._rbexpr }
    end

    # Coerces +expr+ into an Expr.
    #
    # With +str_to_lit+ false, a String or Symbol is a column name (Polars.col
    # accepts both). Otherwise Integer, Float, String, Series and nil become
    # literals, and an Expr is returned unchanged.
    #
    # Raises ArgumentError for anything else, including a Symbol when
    # +str_to_lit+ is true.
    def self.expr_to_lit_or_expr(expr, str_to_lit: true)
      if (expr.is_a?(String) || expr.is_a?(Symbol)) && !str_to_lit
        col(expr)
      elsif expr.is_a?(Integer) || expr.is_a?(Float) || expr.is_a?(String) || expr.is_a?(Series) || expr.nil?
        lit(expr)
      elsif expr.is_a?(Expr)
        expr
      else
        raise ArgumentError, "did not expect value #{expr} of type #{expr.class.name}, maybe disambiguate with Polars.lit or Polars.col"
      end
    end

    # Shortcut for Polars.lit.
    def self.lit(value)
      Polars.lit(value)
    end

    # Absolute form of +path+ as computed by File.expand_path.
    def self.format_path(path)
      File.expand_path(path)
    end
  end
end
@@ -0,0 +1,3 @@
module Polars
  # Version of this gem. Frozen so the shared constant cannot be mutated.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,15 @@
module Polars
  # First stage of a when/then/otherwise chain; wraps the native "when" object.
  class When
    attr_accessor :_rbwhen

    def initialize(rbwhen)
      @_rbwhen = rbwhen
    end

    # Supplies the value used when the condition holds and returns the next
    # stage of the chain. A String +expr+ becomes a literal.
    def then(expr)
      value = Utils.expr_to_lit_or_expr(expr)
      WhenThen.new(_rbwhen._then(value._rbexpr))
    end
  end
end
@@ -0,0 +1,18 @@
module Polars
  # Second stage of a when/then/otherwise chain; wraps the native "when-then"
  # object.
  class WhenThen
    attr_accessor :_rbwhenthen

    def initialize(rbwhenthen)
      self._rbwhenthen = rbwhenthen
    end

    # Adds a further condition to the chain.
    #
    # +predicate+ is coerced the same way as in the other chain methods, so
    # plain values are accepted as well as expressions.
    #
    # NOTE(review): WhenThenThen is not among the files required by
    # lib/polars.rb - confirm it is defined before relying on this method.
    def when(predicate)
      predicate = Utils.expr_to_lit_or_expr(predicate)
      WhenThenThen.new(_rbwhenthen.when(predicate._rbexpr))
    end

    # Supplies the fallback value and returns the finished expression.
    # A String +expr+ becomes a literal.
    def otherwise(expr)
      expr = Utils.expr_to_lit_or_expr(expr)
      Utils.wrap_expr(_rbwhenthen.otherwise(expr._rbexpr))
    end
  end
end
data/lib/polars-df.rb ADDED
@@ -0,0 +1 @@
1
+ require "polars"
data/lib/polars.rb ADDED
@@ -0,0 +1,25 @@
1
+ # ext
2
+ require "polars/polars"
3
+
4
+ # modules
5
+ require "polars/data_frame"
6
+ require "polars/expr"
7
+ require "polars/functions"
8
+ require "polars/lazy_frame"
9
+ require "polars/lazy_functions"
10
+ require "polars/lazy_group_by"
11
+ require "polars/io"
12
+ require "polars/series"
13
+ require "polars/string_expr"
14
+ require "polars/utils"
15
+ require "polars/version"
16
+ require "polars/when"
17
+ require "polars/when_then"
18
+
module Polars
  # Error class declared for this library.
  Error = Class.new(StandardError)

  # Make the function mixins callable on the module itself (e.g. Polars.col).
  # The order matches the original statements, so method precedence is unchanged.
  [Functions, IO, LazyFunctions].each { |mixin| extend mixin }
end