polars-df 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,139 @@
1
module Polars
  # A lazily evaluated query. Every operation returns a new LazyFrame wrapping
  # the native lazy frame handle; `collect` runs the query.
  class LazyFrame
    # The wrapped native lazy frame object.
    attr_accessor :_ldf

    # Wraps a native lazy frame handle without running `initialize`.
    #
    # @param rb_ldf [Object] native lazy frame handle
    # @return [LazyFrame]
    def self._from_rbldf(rb_ldf)
      ldf = LazyFrame.allocate
      ldf._ldf = rb_ldf
      ldf
    end

    # Applies the optimization toggles to the native frame and collects it.
    #
    # `no_optimization` switches off predicate, projection and slice pushdown
    # as well as common subplan elimination. `allow_streaming` also switches
    # off common subplan elimination. `string_cache` is accepted but is not
    # read by this method.
    #
    # @return [DataFrame] the collected result, wrapped by `Utils.wrap_df`
    def collect(
      type_coercion: true,
      predicate_pushdown: true,
      projection_pushdown: true,
      simplify_expression: true,
      string_cache: false,
      no_optimization: false,
      slice_pushdown: true,
      common_subplan_elimination: true,
      allow_streaming: false
    )
      if no_optimization
        predicate_pushdown = false
        projection_pushdown = false
        slice_pushdown = false
        common_subplan_elimination = false
      end

      common_subplan_elimination = false if allow_streaming

      ldf = _ldf.optimization_toggle(
        type_coercion,
        predicate_pushdown,
        projection_pushdown,
        simplify_expression,
        slice_pushdown,
        common_subplan_elimination,
        allow_streaming
      )
      Utils.wrap_df(ldf.collect)
    end

    # Keeps the rows matching `predicate`. A String predicate is read as a
    # column name, not as a literal.
    #
    # @param predicate [Expr, String, Object]
    # @return [LazyFrame]
    def filter(predicate)
      rbexpr = Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
      self.class._from_rbldf(_ldf.filter(rbexpr))
    end

    # Selects the given column(s) / expression(s).
    #
    # @param exprs [String, Expr, Series, Array]
    # @return [LazyFrame]
    def select(exprs)
      exprs = Utils.selection_to_rbexpr_list(exprs)
      self.class._from_rbldf(_ldf.select(exprs))
    end

    # Starts a group-by on the given column(s) / expression(s).
    #
    # @param by [String, Expr, Series, Array]
    # @param maintain_order [Boolean] forwarded to the native group-by
    # @return [LazyGroupBy]
    def groupby(by, maintain_order: false)
      rbexprs_by = Utils.selection_to_rbexpr_list(by)
      lgb = _ldf.groupby(rbexprs_by, maintain_order)
      LazyGroupBy.new(lgb, self.class)
    end

    # Joins this frame with another LazyFrame.
    #
    # For `how: "cross"` no join keys are used. Otherwise either `on`, or both
    # `left_on` and `right_on`, must be given; `on` takes precedence.
    #
    # @param other [LazyFrame]
    # @return [LazyFrame]
    # @raise [ArgumentError] if `other` is not a LazyFrame or the join keys
    #   are missing
    def join(
      other,
      left_on: nil,
      right_on: nil,
      on: nil,
      how: "inner",
      suffix: "_right",
      allow_parallel: true,
      force_parallel: false
    )
      if !other.is_a?(LazyFrame)
        raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
      end

      if how == "cross"
        # A cross join pairs every row with every row, so it takes no keys.
        rbexprs_left = []
        rbexprs_right = []
      elsif !on.nil?
        rbexprs = Utils.selection_to_rbexpr_list(on)
        rbexprs_left = rbexprs
        rbexprs_right = rbexprs
      elsif !left_on.nil? && !right_on.nil?
        rbexprs_left = Utils.selection_to_rbexpr_list(left_on)
        rbexprs_right = Utils.selection_to_rbexpr_list(right_on)
      else
        raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
      end

      self.class._from_rbldf(
        _ldf.join(
          other._ldf,
          rbexprs_left,
          rbexprs_right,
          allow_parallel,
          force_parallel,
          how,
          suffix
        )
      )
    end

    # Adds or replaces columns.
    #
    # @param exprs [Expr, Array<Expr, Series>, nil] a single expression, a
    #   list of expressions and/or Series, or nil for no columns
    # @return [LazyFrame]
    # @raise [ArgumentError] if an element is neither an Expr nor a Series
    def with_columns(exprs)
      exprs =
        if exprs.nil?
          []
        elsif exprs.is_a?(Expr)
          [exprs]
        else
          exprs.to_a
        end

      rbexprs =
        exprs.map do |e|
          case e
          when Expr
            e._rbexpr
          when Series
            # A Series becomes a literal expression. It must be collected
            # alongside the other entries, not replace the list.
            Utils.lit(e)._rbexpr
          else
            raise ArgumentError, "Expected an expression, got #{e}"
          end
        end

      self.class._from_rbldf(_ldf.with_columns(rbexprs))
    end

    # Adds or replaces a single column; shorthand for `with_columns([column])`.
    #
    # @param column [Expr, Series]
    # @return [LazyFrame]
    def with_column(column)
      with_columns([column])
    end
  end
end
@@ -0,0 +1,121 @@
1
module Polars
  # Expression-building helpers. The aggregate helpers (std, var, max, ...)
  # call the Series method directly when given a Series and otherwise build
  # an expression on the named column.
  module LazyFunctions
    # Builds a column expression. Symbols are converted to strings.
    def col(name)
      column_name = name.is_a?(Symbol) ? name.to_s : name
      Utils.wrap_expr(RbExpr.col(column_name))
    end

    # Standard deviation of a Series or of the named column.
    def std(column, ddof: 1)
      return column.std(ddof: ddof) if column.is_a?(Series)

      col(column).std(ddof: ddof)
    end

    # Variance of a Series or of the named column.
    def var(column, ddof: 1)
      return column.var(ddof: ddof) if column.is_a?(Series)

      col(column).var(ddof: ddof)
    end

    # Maximum of a Series, of the named column, or across several expressions.
    def max(column)
      return column.max if column.is_a?(Series)
      return col(column).max if column.is_a?(String) || column.is_a?(Symbol)

      rbexprs = Utils.selection_to_rbexpr_list(column)
      # TODO: `_max_exprs` is not defined in this module
      Utils.wrap_expr(_max_exprs(rbexprs))
    end

    # Minimum of a Series, of the named column, or across several expressions.
    def min(column)
      return column.min if column.is_a?(Series)
      return col(column).min if column.is_a?(String) || column.is_a?(Symbol)

      rbexprs = Utils.selection_to_rbexpr_list(column)
      # TODO: `_min_exprs` is not defined in this module
      Utils.wrap_expr(_min_exprs(rbexprs))
    end

    # Sum of a Series, of the named column, or across an Array of expressions.
    # Any other argument type raises.
    def sum(column)
      return column.sum if column.is_a?(Series)
      return col(column.to_s).sum if column.is_a?(String) || column.is_a?(Symbol)
      raise "todo" unless column.is_a?(Array)

      rbexprs = Utils.selection_to_rbexpr_list(column)
      # TODO: `_sum_exprs` is not defined in this module
      Utils.wrap_expr(_sum_exprs(rbexprs))
    end

    # Mean of a Series or of the named column.
    def mean(column)
      column.is_a?(Series) ? column.mean : col(column).mean
    end

    # Alias for `mean`.
    def avg(column)
      mean(column)
    end

    # Median of a Series or of the named column.
    def median(column)
      column.is_a?(Series) ? column.median : col(column).median
    end

    # Builds a literal expression from `value`.
    def lit(value)
      Utils.wrap_expr(RbExpr.lit(value))
    end

    # Builds a range expression from `low` to `high`. String bounds are read
    # as column names. The result is cast when `dtype` is given and is not
    # "i64". With `eager: true` the expression is evaluated through an empty
    # DataFrame and returned as a Series named "arange".
    def arange(low, high, step: 1, eager: false, dtype: nil)
      low_expr = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
      high_expr = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
      range_expr = Utils.wrap_expr(RbExpr.arange(low_expr._rbexpr, high_expr._rbexpr, step))

      range_expr = range_expr.cast(dtype) unless dtype.nil? || dtype == "i64"

      return range_expr unless eager

      DataFrame.new
        .select(range_expr)
        .to_series
        .rename("arange", in_place: true)
    end

    # Without an argument, selects every column (`col("*")`). With a column
    # name, calls `all` on that column's expression.
    def all(name = nil)
      return col("*") if name.nil?
      return col(name).all if name.is_a?(String) || name.is_a?(Symbol)

      raise "todo"
    end

    # Starts a when/then/otherwise chain.
    def when(expr)
      condition = Utils.expr_to_lit_or_expr(expr)
      When.new(RbExpr.when(condition._rbexpr))
    end
  end
end
@@ -0,0 +1,13 @@
1
module Polars
  # A pending lazy group-by. Holds the native group-by handle and the class
  # used to wrap the aggregated result.
  class LazyGroupBy
    # @param lgb [Object] native lazy group-by handle
    # @param lazyframe_class [Class] class responding to `_from_rbldf`
    def initialize(lgb, lazyframe_class)
      @lgb, @lazyframe_class = lgb, lazyframe_class
    end

    # Applies the aggregation(s) and wraps the result with the stored class.
    #
    # @param aggs [String, Expr, Series, Array] aggregation expression(s)
    def agg(aggs)
      aggregated = @lgb.agg(Utils.selection_to_rbexpr_list(aggs))
      @lazyframe_class._from_rbldf(aggregated)
    end
  end
end
@@ -0,0 +1,261 @@
1
module Polars
  # A named, one-dimensional column of values backed by a native series handle.
  class Series
    # The wrapped native series object.
    attr_accessor :_s

    # Builds a series from an Array or a Range.
    #
    # The name may be omitted: when the first argument is not a String and no
    # second argument is given, the first argument is taken as the values and
    # the name defaults to "". `nan_to_null` is accepted but is not read here.
    #
    # @param name [String, Array, Range, nil]
    # @param values [Array, Range, nil] nil builds an empty series
    # @raise [ArgumentError] if a non-String name is given together with
    #   values, or if the values are of an unsupported type
    def initialize(name = nil, values = nil, dtype: nil, strict: true, nan_to_null: false, dtype_if_empty: nil)
      # Handle case where values are passed as the first argument
      if !name.nil? && !name.is_a?(String)
        raise ArgumentError, "Series name must be a string." unless values.nil?

        values = name
        name = nil
      end

      name = "" if name.nil?

      self._s =
        if values.nil?
          sequence_to_rbseries(name, [], dtype: dtype, dtype_if_empty: dtype_if_empty)
        elsif values.is_a?(Range)
          # The upper bound passed to arange is one past an inclusive range's
          # last element, and the range's end itself when the end is excluded.
          Polars.arange(
            values.first,
            values.last + (values.exclude_end? ? 0 : 1),
            step: 1,
            eager: true,
            dtype: dtype
          )
            .rename(name, in_place: true)
            ._s
        elsif values.is_a?(Array)
          sequence_to_rbseries(name, values, dtype: dtype, strict: strict, dtype_if_empty: dtype_if_empty)
        else
          raise ArgumentError, "Series constructor called with unsupported type; got #{values.class.name}"
        end
    end

    # Wraps a native series handle without running `initialize`.
    def self._from_rbseries(s)
      series = Series.allocate
      series._s = s
      series
    end

    # Data type of the series, as a Symbol.
    def dtype
      _s.dtype.to_sym
    end

    # Name of the series.
    def name
      _s.name
    end

    # One-element Array holding the series length.
    def shape
      [_s.len]
    end

    # String form produced by the native series; also used for `inspect`.
    def to_s
      _s.to_s
    end
    alias_method :inspect, :to_s

    # Binary operators. Each expects `other` to respond to `_s` (a Series)
    # and returns a new Series.

    def &(other)
      Utils.wrap_s(_s.bitand(other._s))
    end

    def |(other)
      Utils.wrap_s(_s.bitor(other._s))
    end

    def ^(other)
      Utils.wrap_s(_s.bitxor(other._s))
    end

    def +(other)
      Utils.wrap_s(_s.add(other._s))
    end

    def -(other)
      Utils.wrap_s(_s.sub(other._s))
    end

    def *(other)
      Utils.wrap_s(_s.mul(other._s))
    end

    def /(other)
      Utils.wrap_s(_s.div(other._s))
    end

    # Wraps this series in a single-column DataFrame.
    def to_frame
      Utils.wrap_df(RbDataFrame.new([_s]))
    end

    def sum
      _s.sum
    end

    def mean
      _s.mean
    end

    def min
      _s.min
    end

    def max
      _s.max
    end

    # Returns a duplicate of this series carrying the new name.
    #
    # NOTE(review): `dup` is shallow and this class defines no
    # `initialize_copy`, so the duplicate shares the same `_s` object and the
    # rename below appears to affect the receiver too - confirm against the
    # native `rename` semantics.
    def alias(name)
      s = dup
      s._s.rename(name)
      s
    end

    # Renames the series, in place when `in_place` is true, otherwise via
    # `alias`.
    def rename(name, in_place: false)
      return self.alias(name) unless in_place

      _s.rename(name)
      self
    end

    def chunk_lengths
      _s.chunk_lengths
    end

    def n_chunks
      _s.n_chunks
    end

    def cumsum(reverse: false)
      Utils.wrap_s(_s.cumsum(reverse))
    end

    def cummin(reverse: false)
      Utils.wrap_s(_s.cummin(reverse))
    end

    def cummax(reverse: false)
      Utils.wrap_s(_s.cummax(reverse))
    end

    # Applies `limit(n)` to this column via a single-column frame.
    def limit(n = 10)
      to_frame.select(Utils.col(name).limit(n)).to_series
    end

    # Returns a slice starting at `offset`. `length` defaults to the length
    # of the whole series.
    def slice(offset, length = nil)
      length = len if length.nil?
      Utils.wrap_s(_s.slice(offset, length))
    end

    # Appends `other` to the native series and returns self.
    def append(other)
      _s.append(other._s)
      self
    end

    # Filters by a predicate Series.
    def filter(predicate)
      Utils.wrap_s(_s.filter(predicate._s))
    end

    # Applies `head(n)` to this column via a single-column frame.
    def head(n = 10)
      to_frame.select(Utils.col(name).head(n)).to_series
    end

    # Applies `tail(n)` to this column via a single-column frame.
    def tail(n = 10)
      to_frame.select(Utils.col(name).tail(n)).to_series
    end

    # Sorts the series. With `in_place` the native handle is replaced and
    # self is returned; otherwise a new Series is returned.
    def sort(reverse: false, in_place: false)
      sorted = _s.sort(reverse)
      return Utils.wrap_s(sorted) unless in_place

      self._s = sorted
      self
    end

    def to_a
      _s.to_a
    end

    def len
      _s.len
    end

    # Rechunks the native series. Returns self when `in_place`, otherwise a
    # new Series wrapping the native result.
    def rechunk(in_place: false)
      opt_s = _s.rechunk(in_place)
      in_place ? self : Utils.wrap_s(opt_s)
    end

    private

    # Builds the native series for an Array of values.
    #
    # With no dtype and no values, the dtype falls back to `dtype_if_empty`
    # and then to "f32". With a dtype, the matching typed constructor is
    # used. Otherwise the constructor is chosen from the class of the first
    # non-nil value.
    #
    # @raise [ArgumentError] if no constructor matches
    def sequence_to_rbseries(name, values, dtype: nil, strict: true, dtype_if_empty: nil)
      # Treat nil like an empty sequence so the lookups below cannot fail on it.
      values = [] if values.nil?

      if values.empty? && dtype.nil?
        # Use the caller's hint when the dtype for an empty sequence could be
        # guessed (e.g. comparisons between self and other); otherwise
        # default to the Float32 type.
        dtype = dtype_if_empty || "f32"
      end

      if !dtype.nil? && is_polars_dtype(dtype)
        return polars_type_to_constructor(dtype).call(name, values, strict)
      end

      # _get_first_non_none
      value = values.find { |v| !v.nil? }
      rb_type_to_constructor(value.class).call(name, values, strict)
    end

    POLARS_TYPE_TO_CONSTRUCTOR = {
      f32: RbSeries.method(:new_opt_f32),
      f64: RbSeries.method(:new_opt_f64),
      i8: RbSeries.method(:new_opt_i8),
      i16: RbSeries.method(:new_opt_i16),
      i32: RbSeries.method(:new_opt_i32),
      i64: RbSeries.method(:new_opt_i64),
      u8: RbSeries.method(:new_opt_u8),
      u16: RbSeries.method(:new_opt_u16),
      u32: RbSeries.method(:new_opt_u32),
      u64: RbSeries.method(:new_opt_u64),
      bool: RbSeries.method(:new_opt_bool),
      str: RbSeries.method(:new_str)
    }.freeze

    # Looks up the native constructor for a dtype name.
    def polars_type_to_constructor(dtype)
      POLARS_TYPE_TO_CONSTRUCTOR.fetch(dtype.to_sym)
    rescue KeyError
      raise ArgumentError, "Cannot construct RbSeries for type #{dtype}."
    end

    RB_TYPE_TO_CONSTRUCTOR = {
      Float => RbSeries.method(:new_opt_f64),
      Integer => RbSeries.method(:new_opt_i64),
      String => RbSeries.method(:new_str),
      TrueClass => RbSeries.method(:new_opt_bool),
      FalseClass => RbSeries.method(:new_opt_bool)
    }.freeze

    # Looks up the native constructor for a Ruby class.
    def rb_type_to_constructor(dtype)
      RB_TYPE_TO_CONSTRUCTOR.fetch(dtype)
    rescue KeyError
      # RbSeries.method(:new_object)
      raise ArgumentError, "Cannot determine type"
    end

    # Currently accepts every value as a dtype.
    def is_polars_dtype(data_type)
      true
    end
  end
end
@@ -0,0 +1,17 @@
1
module Polars
  # String-specific operations on an expression.
  class StringExpr
    # The native expression taken from the wrapped Expr.
    attr_accessor :_rbexpr

    # @param expr [Expr] expression whose native handle is reused
    def initialize(expr)
      native = expr._rbexpr
      self._rbexpr = native
    end

    # Expression built from the native `str_lengths` call.
    def lengths
      native_lengths = _rbexpr.str_lengths
      Utils.wrap_expr(native_lengths)
    end

    # Expression built from the native `str_contains` call.
    #
    # @param pattern [String]
    # @param literal [Boolean] forwarded to the native call
    def contains(pattern, literal: false)
      native_match = _rbexpr.str_contains(pattern, literal)
      Utils.wrap_expr(native_match)
    end
  end
end
@@ -0,0 +1,47 @@
1
module Polars
  # Internal helpers for wrapping native objects and normalizing user input
  # into expressions.
  module Utils
    # Wraps a native series in a Series.
    def self.wrap_s(s)
      Series._from_rbseries(s)
    end

    # Wraps a native data frame in a DataFrame.
    def self.wrap_df(df)
      DataFrame._from_rbdf(df)
    end

    # Wraps a native expression in an Expr.
    def self.wrap_expr(rbexpr)
      Expr._from_rbexpr(rbexpr)
    end

    # Shorthand for `Polars.col`.
    def self.col(name)
      Polars.col(name)
    end

    # Normalizes a selection into a list of native expressions.
    #
    # A single String, Symbol, Expr or Series is treated as a one-element
    # list. Strings and Symbols are read as column names.
    #
    # @param exprs [String, Symbol, Expr, Series, Array]
    # @return [Array] native expressions
    def self.selection_to_rbexpr_list(exprs)
      if exprs.is_a?(String) || exprs.is_a?(Symbol) || exprs.is_a?(Expr) || exprs.is_a?(Series)
        exprs = [exprs]
      end

      exprs.map { |e| expr_to_lit_or_expr(e, str_to_lit: false)._rbexpr }
    end

    # Converts a value into an Expr.
    #
    # With `str_to_lit: false`, Strings and Symbols become column
    # expressions. Otherwise Integers, Floats, Strings, Series and nil become
    # literals, and an Expr is returned unchanged.
    #
    # @raise [ArgumentError] for any other value, including a Symbol when
    #   `str_to_lit` is true
    def self.expr_to_lit_or_expr(expr, str_to_lit: true)
      if (expr.is_a?(String) || expr.is_a?(Symbol)) && !str_to_lit
        # Symbols name columns just like strings do in `Polars.col`.
        col(expr)
      elsif expr.is_a?(Integer) || expr.is_a?(Float) || expr.is_a?(String) || expr.is_a?(Series) || expr.nil?
        lit(expr)
      elsif expr.is_a?(Expr)
        expr
      else
        raise ArgumentError, "did not expect value #{expr} of type #{expr.class.name}, maybe disambiguate with Polars.lit or Polars.col"
      end
    end

    # Shorthand for `Polars.lit`.
    def self.lit(value)
      Polars.lit(value)
    end

    # Expands `path` to an absolute path.
    def self.format_path(path)
      File.expand_path(path)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Polars
  # Version string of this library.
  VERSION = "0.1.0"
end
@@ -0,0 +1,15 @@
1
module Polars
  # First step of a when/then/otherwise chain; wraps the native `when` handle.
  class When
    # The wrapped native `when` object.
    attr_accessor :_rbwhen

    # @param rbwhen [Object] native `when` handle
    def initialize(rbwhen)
      self._rbwhen = rbwhen
    end

    # Attaches the value used when the condition holds.
    #
    # @param expr [Object] value converted with `Utils.expr_to_lit_or_expr`
    # @return [WhenThen]
    def then(expr)
      then_expr = Utils.expr_to_lit_or_expr(expr)
      WhenThen.new(_rbwhen._then(then_expr._rbexpr))
    end
  end
end
@@ -0,0 +1,18 @@
1
module Polars
  # Second step of a when/then/otherwise chain; wraps the native when-then
  # handle.
  class WhenThen
    # The wrapped native when-then object.
    attr_accessor :_rbwhenthen

    # @param rbwhenthen [Object] native when-then handle
    def initialize(rbwhenthen)
      self._rbwhenthen = rbwhenthen
    end

    # Adds a further condition to the chain. The predicate must already
    # respond to `_rbexpr`; it is not converted.
    #
    # NOTE(review): `WhenThenThen` is not defined in this file - confirm that
    # it is defined and loaded elsewhere before relying on this method.
    def when(predicate)
      chained = _rbwhenthen.when(predicate._rbexpr)
      WhenThenThen.new(chained)
    end

    # Finishes the chain with the value used when no condition holds.
    #
    # @param expr [Object] value converted with `Utils.expr_to_lit_or_expr`
    # @return [Expr] result of `Utils.wrap_expr`
    def otherwise(expr)
      fallback = Utils.expr_to_lit_or_expr(expr)
      Utils.wrap_expr(_rbwhenthen.otherwise(fallback._rbexpr))
    end
  end
end
data/lib/polars-df.rb ADDED
@@ -0,0 +1 @@
1
+ require "polars"
data/lib/polars.rb ADDED
@@ -0,0 +1,25 @@
1
+ # ext
2
+ require "polars/polars"
3
+
4
+ # modules
5
+ require "polars/data_frame"
6
+ require "polars/expr"
7
+ require "polars/functions"
8
+ require "polars/lazy_frame"
9
+ require "polars/lazy_functions"
10
+ require "polars/lazy_group_by"
11
+ require "polars/io"
12
+ require "polars/series"
13
+ require "polars/string_expr"
14
+ require "polars/utils"
15
+ require "polars/version"
16
+ require "polars/when"
17
+ require "polars/when_then"
18
+
19
module Polars
  # Error class for this library.
  class Error < StandardError
  end

  # Expose the function modules as module-level methods (e.g. `Polars.col`).
  # They are extended one at a time, in this order.
  [Functions, IO, LazyFunctions].each { |mod| extend mod }
end