polars-df 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,315 @@
1
module Polars
  # Eager table wrapper around a native `RbDataFrame` handle. Nearly every
  # method either forwards to that handle or goes through the lazy API
  # (`lazy ... collect`).
  class DataFrame
    # Underlying native frame object that all methods delegate to.
    attr_accessor :_df

    # Builds a frame from Ruby data.
    #
    # @param data [nil, Hash, Array, Series, ActiveRecord::Relation, ActiveRecord::Result]
    #   * nil    - frame built from an empty hash
    #   * Hash   - column name => values; Symbol keys are converted to Strings
    #   * Array  - every element must respond to `_s` (presumably Series - confirm)
    #   * Series - single-column frame
    #   * ActiveRecord relation/result - converted to a column hash first
    # @raise [ArgumentError] when `data` is none of the supported types
    def initialize(data = nil)
      if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
        # A Relation is executed once through its own connection; a Result is used as-is.
        result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
        data = {}
        # Pivot the row-oriented result into column name => column values.
        result.columns.each_with_index do |column, position|
          data[column] = result.rows.map { |row| row[position] }
        end
      end

      self._df =
        case data
        when nil
          hash_to_rbdf({})
        when Hash
          hash_to_rbdf(data.transform_keys { |key| key.is_a?(Symbol) ? key.to_s : key })
        when Array
          sequence_to_rbdf(data)
        when Series
          series_to_rbdf(data)
        else
          raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
        end
    end

    # Wraps an existing native frame without running `initialize`.
    #
    # @param rb_df [Object] native frame handle
    # @return [DataFrame]
    def self._from_rbdf(rb_df)
      df = DataFrame.allocate
      df._df = rb_df
      df
    end

    # Runs String and Pathname arguments through `Utils.format_path`; any other
    # object (e.g. an IO) is returned untouched.
    #
    # @param file [String, Pathname, Object]
    # @return [Object] formatted path, or `file` itself
    def self._normalize_path(file)
      path_like = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
      path_like ? Utils.format_path(file) : file
    end

    # Reads CSV through the native reader.
    #
    # @param file [String, Pathname, Object] path or IO-like source
    # @param has_header [Boolean] forwarded to the native reader
    # @return [DataFrame]
    def self._read_csv(file, has_header: true)
      _from_rbdf(RbDataFrame.read_csv(_normalize_path(file), has_header))
    end

    # Reads Parquet through the native reader.
    #
    # @param file [String, Pathname, Object] path or IO-like source
    # @return [DataFrame]
    def self._read_parquet(file)
      _from_rbdf(RbDataFrame.read_parquet(_normalize_path(file)))
    end

    # Reads JSON through the native reader.
    #
    # @param file [String, Pathname, Object] path or IO-like source
    # @return [DataFrame]
    def self._read_json(file)
      _from_rbdf(RbDataFrame.read_json(_normalize_path(file)))
    end

    # Reads newline-delimited JSON through the native reader.
    #
    # @param file [String, Pathname, Object] path or IO-like source
    # @return [DataFrame]
    def self._read_ndjson(file)
      _from_rbdf(RbDataFrame.read_ndjson(_normalize_path(file)))
    end

    # @return [Object] shape as reported by the native frame
    def shape
      _df.shape
    end

    # @return [Object] height as reported by the native frame
    def height
      _df.height
    end

    # @return [Object] width as reported by the native frame
    def width
      _df.width
    end

    # @return [Object] column names as reported by the native frame
    def columns
      _df.columns
    end

    # @return [Array<Symbol>] native dtype names converted with `to_sym`
    def dtypes
      _df.dtypes.map(&:to_sym)
    end

    # @return [Object] the native frame's `to_s` rendering
    def to_s
      _df.to_s
    end
    alias_method :inspect, :to_s

    # @param name [Object] compared against `columns` as-is (no Symbol/String coercion)
    # @return [Boolean]
    def include?(name)
      columns.include?(name)
    end

    # Fetches one column and wraps it with `Utils.wrap_s`.
    #
    # @param name [Object] column name forwarded to the native frame
    def [](name)
      Utils.wrap_s(_df.column(name))
    end

    # Selects the column at `index`; negative values count back from the last column.
    #
    # @param index [Integer]
    def to_series(index = 0)
      index += columns.length if index < 0
      Utils.wrap_s(_df.select_at_idx(index))
    end

    # Writes the frame as JSON.
    #
    # @param file [String, Pathname, Object] destination path or IO-like object
    # @param pretty [Boolean] forwarded to the native writer
    # @param row_oriented [Boolean] forwarded to the native writer
    # @return [nil]
    def write_json(
      file,
      pretty: false,
      row_oriented: false
    )
      _df.write_json(normalize_path(file), pretty, row_oriented)
      nil
    end

    # Writes the frame as newline-delimited JSON.
    #
    # @param file [String, Pathname, Object] destination path or IO-like object
    # @return [nil]
    def write_ndjson(file)
      _df.write_ndjson(normalize_path(file))
      nil
    end

    # Writes the frame as CSV.
    #
    # @param file [nil, String, Pathname, Object] destination; nil returns the CSV text instead
    # @param sep [String] separator, exactly one byte
    # @param quote [String] quote character, exactly one byte
    # @param null_value [String, nil] an empty string is treated as nil
    # @return [String, nil] UTF-8 CSV text when `file` is nil, otherwise nil
    # @raise [ArgumentError] when `sep` or `quote` is not exactly one byte
    #
    # The remaining keyword arguments are forwarded to the native writer unchanged.
    def write_csv(
      file = nil,
      has_header: true,
      sep: ",",
      quote: '"',
      batch_size: 1024,
      datetime_format: nil,
      date_format: nil,
      time_format: nil,
      float_precision: nil,
      null_value: nil
    )
      # Only the `ord` of each delimiter is handed to the native writer, so an
      # empty or multi-byte string has to be rejected here with a clear message.
      if sep.bytesize != 1
        raise ArgumentError, "only single byte separator is allowed"
      elsif quote.bytesize != 1
        raise ArgumentError, "only single byte quote char is allowed"
      end
      null_value = nil if null_value == ""

      native_options = [
        has_header,
        sep.ord,
        quote.ord,
        batch_size,
        datetime_format,
        date_format,
        time_format,
        float_precision,
        null_value
      ]

      if file.nil?
        # Collect the output in a binary buffer, then relabel the result as UTF-8.
        buffer = StringIO.new
        buffer.set_encoding(Encoding::BINARY)
        _df.write_csv(buffer, *native_options)
        buffer.rewind
        return buffer.read.force_encoding(Encoding::UTF_8)
      end

      _df.write_csv(normalize_path(file), *native_options)
      nil
    end

    # Writes the frame as Parquet.
    #
    # @param file [String, Pathname, Object] destination path or IO-like object
    # @param compression [String, nil] nil is translated to "uncompressed"
    # @return [nil] same as the other writers
    #
    # The remaining keyword arguments are forwarded to the native writer unchanged.
    def write_parquet(
      file,
      compression: "zstd",
      compression_level: nil,
      statistics: false,
      row_group_size: nil
    )
      compression = "uncompressed" if compression.nil?

      _df.write_parquet(
        normalize_path(file), compression, compression_level, statistics, row_group_size
      )
      nil
    end

    # Filters through the lazy API and collects the result.
    def filter(predicate)
      lazy.filter(predicate).collect
    end

    # Sorts via the native frame; all arguments are forwarded as given.
    #
    # @return [DataFrame]
    def sort(by, reverse: false, nulls_last: false)
      _from_rbdf(_df.sort(by, reverse, nulls_last))
    end

    # Compares this frame's native handle with `other._df`.
    #
    # @param other [#_df]
    # @param null_equal [Boolean] forwarded to the native comparison
    def frame_equal(other, null_equal: true)
      _df.frame_equal(other._df, null_equal)
    end

    # Same as `head`.
    def limit(n = 5)
      head(n)
    end

    # @return [DataFrame] wraps the native `head(n)` result
    def head(n = 5)
      _from_rbdf(_df.head(n))
    end

    # @return [DataFrame] wraps the native `tail(n)` result
    def tail(n = 5)
      _from_rbdf(_df.tail(n))
    end

    # Starts a group-by on the lazy frame; note the result is NOT collected here.
    def groupby(by, maintain_order: false)
      lazy.groupby(by, maintain_order: maintain_order)
    end

    # Joins with `other` through the lazy API and collects without optimization.
    #
    # @param other [#lazy]
    def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
      lazy
        .join(
          other.lazy,
          left_on: left_on,
          right_on: right_on,
          on: on,
          how: how,
          suffix: suffix
        )
        .collect(no_optimization: true)
    end

    # Adds one column through the lazy API and collects the result.
    def with_column(column)
      lazy
        .with_column(column)
        .collect(no_optimization: true, string_cache: false)
    end

    # @return [LazyFrame] lazy view created from the native frame
    def lazy
      wrap_ldf(_df.lazy)
    end

    # Evaluates `exprs` through the lazy API and re-wraps the collected frame.
    #
    # @return [DataFrame]
    def select(exprs)
      _from_rbdf(
        lazy
          .select(exprs)
          .collect(no_optimization: true, string_cache: false)
          ._df
      )
    end

    # Mean along an axis.
    #
    # @param axis [Integer] 0 wraps the native `mean` as a DataFrame; 1 wraps
    #   the native `hmean(null_strategy)` with `Utils.wrap_s`
    # @param null_strategy [String] only used when `axis` is 1
    # @raise [ArgumentError] for any other axis
    def mean(axis: 0, null_strategy: "ignore")
      case axis
      when 0
        _from_rbdf(_df.mean)
      when 1
        Utils.wrap_s(_df.hmean(null_strategy))
      else
        raise ArgumentError, "Axis should be 0 or 1."
      end
    end

    # Adds several columns through the lazy API and collects the result.
    #
    # @param exprs [Array, Object, nil] a single non-nil value is wrapped in an Array
    def with_columns(exprs)
      exprs = [exprs] unless exprs.nil? || exprs.is_a?(Array)
      lazy
        .with_columns(exprs)
        .collect(no_optimization: true, string_cache: false)
    end

    # @return [DataFrame] wraps the native `rechunk` result
    def rechunk
      _from_rbdf(_df.rechunk)
    end

    # @return [DataFrame] wraps the native `null_count` result
    def null_count
      _from_rbdf(_df.null_count)
    end

    private

    # Instance-side shortcut for the class-level path normaliser.
    def normalize_path(file)
      self.class._normalize_path(file)
    end

    # Builds a native frame from a column hash.
    def hash_to_rbdf(data)
      RbDataFrame.read_hash(data)
    end

    # Builds a native frame from the `_s` handle of every element.
    def sequence_to_rbdf(data)
      RbDataFrame.new(data.map(&:_s))
    end

    # Builds a single-column native frame from one object's `_s` handle.
    def series_to_rbdf(data)
      RbDataFrame.new([data._s])
    end

    # Wraps a native lazy frame.
    def wrap_ldf(ldf)
      LazyFrame._from_rbldf(ldf)
    end

    # Instance-side shortcut for the class-level constructor.
    def _from_rbdf(rb_df)
      self.class._from_rbdf(rb_df)
    end
  end
end
@@ -0,0 +1,233 @@
1
module Polars
  # Ruby-side wrapper around a native expression handle. Every builder method
  # calls the matching native method and wraps the result in a new Expr.
  class Expr
    # Underlying native expression object.
    attr_accessor :_rbexpr

    # Wraps an existing native expression without running `initialize`.
    #
    # @param rbexpr [Object] native expression handle
    # @return [Expr]
    def self._from_rbexpr(rbexpr)
      expr = Expr.allocate
      expr._rbexpr = rbexpr
      expr
    end

    # @return [Object] the native handle's `to_str` rendering
    def to_s
      _rbexpr.to_str
    end
    alias_method :inspect, :to_s

    # Binary operators. A non-Expr operand is first turned into an expression
    # with `Utils.lit`. Note that `==` and `!=` build expressions too - they do
    # not return booleans.

    def ^(other)
      wrap_expr(_rbexpr._xor(_to_rbexpr(other)))
    end

    def &(other)
      wrap_expr(_rbexpr._and(_to_rbexpr(other)))
    end

    def |(other)
      wrap_expr(_rbexpr._or(_to_rbexpr(other)))
    end

    def *(other)
      wrap_expr(_rbexpr * _to_rbexpr(other))
    end

    def >=(other)
      wrap_expr(_rbexpr.gt_eq(_to_rbexpr(other)))
    end

    def <=(other)
      wrap_expr(_rbexpr.lt_eq(_to_rbexpr(other)))
    end

    def ==(other)
      wrap_expr(_rbexpr.eq(_to_rbexpr(other)))
    end

    def !=(other)
      wrap_expr(_rbexpr.neq(_to_rbexpr(other)))
    end

    def <(other)
      wrap_expr(_rbexpr.lt(_to_rbexpr(other)))
    end

    def >(other)
      wrap_expr(_rbexpr.gt(_to_rbexpr(other)))
    end

    # Wraps the native `_alias(name)` result.
    def alias(name)
      wrap_expr(_rbexpr._alias(name))
    end

    # Wraps the native `suffix(suffix)` result.
    def suffix(suffix)
      wrap_expr(_rbexpr.suffix(suffix))
    end

    # Wraps the native `is_not` result.
    def is_not
      wrap_expr(_rbexpr.is_not)
    end

    # Wraps the native `is_null` result.
    def is_null
      wrap_expr(_rbexpr.is_null)
    end

    # Wraps the native `is_not_null` result.
    def is_not_null
      wrap_expr(_rbexpr.is_not_null)
    end

    # Wraps the native `count` result.
    def count
      wrap_expr(_rbexpr.count)
    end

    # Same as `count`.
    def len
      count
    end

    # Wraps the native `sort_with(reverse, nulls_last)` result.
    def sort(reverse: false, nulls_last: false)
      wrap_expr(_rbexpr.sort_with(reverse, nulls_last))
    end

    # Sorts by other expressions.
    #
    # @param by [Object, Array] a single value is wrapped in an Array, then
    #   converted with `Utils.selection_to_rbexpr_list`
    # @param reverse [Boolean, Array<Boolean>] a single flag is wrapped in an Array
    def sort_by(by, reverse: false)
      by = [by] unless by.is_a?(Array)
      reverse = [reverse] unless reverse.is_a?(Array)
      by = Utils.selection_to_rbexpr_list(by)

      wrap_expr(_rbexpr.sort_by(by, reverse))
    end

    # Replaces nulls either with a value or according to a strategy.
    #
    # @param value [Object, nil] fill value; converted with `Utils.expr_to_lit_or_expr`
    # @param strategy [String, nil] forwarded to the native `fill_null_with_strategy`
    # @param limit [Integer, nil] only accepted with the "forward"/"backward" strategies
    # @raise [ArgumentError] when both or neither of `value`/`strategy` are given,
    #   or when `limit` is combined with any other strategy
    def fill_null(value = nil, strategy: nil, limit: nil)
      # `nil?` rather than truthiness: `false` is a legitimate fill value.
      if !value.nil? && !strategy.nil?
        raise ArgumentError, "cannot specify both 'value' and 'strategy'."
      elsif value.nil? && strategy.nil?
        raise ArgumentError, "must specify either a fill 'value' or 'strategy'"
      elsif !limit.nil? && !["forward", "backward"].include?(strategy)
        # The previous check was inverted: it rejected exactly the two
        # strategies that support a limit and accepted it everywhere else.
        raise ArgumentError, "can only specify 'limit' when strategy is set to 'backward' or 'forward'"
      end

      if value.nil?
        wrap_expr(_rbexpr.fill_null_with_strategy(strategy, limit))
      else
        value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
        wrap_expr(_rbexpr.fill_null(value._rbexpr))
      end
    end

    # Wraps the native `fill_nan` result; `fill_value` is converted with
    # `Utils.expr_to_lit_or_expr` first.
    def fill_nan(fill_value)
      fill_value = Utils.expr_to_lit_or_expr(fill_value, str_to_lit: true)
      wrap_expr(_rbexpr.fill_nan(fill_value._rbexpr))
    end

    # Wraps the native `reverse` result.
    def reverse
      wrap_expr(_rbexpr.reverse)
    end

    # Wraps the native `std(ddof)` result.
    def std(ddof: 1)
      wrap_expr(_rbexpr.std(ddof))
    end

    # Wraps the native `var(ddof)` result.
    def var(ddof: 1)
      wrap_expr(_rbexpr.var(ddof))
    end

    # Wraps the native `max` result.
    def max
      wrap_expr(_rbexpr.max)
    end

    # Wraps the native `min` result.
    def min
      wrap_expr(_rbexpr.min)
    end

    # Wraps the native `nan_max` result.
    def nan_max
      wrap_expr(_rbexpr.nan_max)
    end

    # Wraps the native `nan_min` result.
    def nan_min
      wrap_expr(_rbexpr.nan_min)
    end

    # Wraps the native `sum` result.
    def sum
      wrap_expr(_rbexpr.sum)
    end

    # Wraps the native `mean` result.
    def mean
      wrap_expr(_rbexpr.mean)
    end

    # Wraps the native `median` result.
    def median
      wrap_expr(_rbexpr.median)
    end

    # Wraps the native `product` result.
    def product
      wrap_expr(_rbexpr.product)
    end

    # Wraps the native `n_unique` result.
    def n_unique
      wrap_expr(_rbexpr.n_unique)
    end

    # Wraps the native `unique_stable` result when `maintain_order` is truthy,
    # otherwise the native `unique` result.
    def unique(maintain_order: false)
      wrap_expr(maintain_order ? _rbexpr.unique_stable : _rbexpr.unique)
    end

    # Wraps the native `first` result.
    def first
      wrap_expr(_rbexpr.first)
    end

    # Wraps the native `last` result.
    def last
      wrap_expr(_rbexpr.last)
    end

    # Wraps the native `over` result; `expr` is converted with
    # `Utils.selection_to_rbexpr_list` first.
    def over(expr)
      rbexprs = Utils.selection_to_rbexpr_list(expr)
      wrap_expr(_rbexpr.over(rbexprs))
    end

    # Wraps the native `filter` result.
    #
    # @param predicate [#_rbexpr] must already be an expression (no literal coercion)
    def filter(predicate)
      wrap_expr(_rbexpr.filter(predicate._rbexpr))
    end

    # Wraps the native `head(n)` result.
    def head(n = 10)
      wrap_expr(_rbexpr.head(n))
    end

    # Wraps the native `tail(n)` result.
    def tail(n = 10)
      wrap_expr(_rbexpr.tail(n))
    end

    # Same as `head`.
    def limit(n = 10)
      head(n)
    end

    # Wraps the native `interpolate` result.
    def interpolate
      wrap_expr(_rbexpr.interpolate)
    end

    # Wraps the native `list` result.
    def list
      wrap_expr(_rbexpr.list)
    end

    # @return [StringExpr] string-specific helper built around this expression
    def str
      StringExpr.new(self)
    end

    private

    # Wraps a native expression via `Utils.wrap_expr`.
    def wrap_expr(expr)
      Utils.wrap_expr(expr)
    end

    # Native handle for an operand, coercing non-Expr values to literals.
    def _to_rbexpr(other)
      _to_expr(other)._rbexpr
    end

    # Returns `other` unchanged when it is an Expr, otherwise `Utils.lit(other)`.
    def _to_expr(other)
      other.is_a?(Expr) ? other : Utils.lit(other)
    end
  end
end
@@ -0,0 +1,45 @@
1
module Polars
  module Functions
    # Combines several objects of the same kind into one.
    #
    # The type of the FIRST item decides the code path; later items are not checked here.
    #
    # @param items [Array] DataFrame, LazyFrame, Series or Expr objects
    # @param rechunk [Boolean] when true, `rechunk` is called on the combined
    #   result (for LazyFrame input the flag is passed to `_concat_lf` instead)
    # @param how [String] "vertical", "diagonal" or "horizontal" for DataFrame
    #   input; only "vertical" for LazyFrame input; not consulted otherwise
    # @param parallel [Boolean] only forwarded for LazyFrame input
    # @return [Object] the combined object
    # @raise [ArgumentError] for an empty `items`, an unsupported `how`, or an
    #   unsupported item type
    def concat(items, rechunk: true, how: "vertical", parallel: true)
      raise ArgumentError, "cannot concat empty list" if items.empty?

      first = items[0]
      out =
        case first
        when DataFrame
          case how
          when "vertical"
            Utils.wrap_df(_concat_df(items))
          when "diagonal"
            Utils.wrap_df(_diag_concat_df(items))
          when "horizontal"
            Utils.wrap_df(_hor_concat_df(items))
          else
            # Single braces: the doubled `{{ }}` was a Python f-string escape
            # that Ruby printed literally.
            raise ArgumentError, "how must be one of {'vertical', 'diagonal', 'horizontal'}, got #{how}"
          end
        when LazyFrame
          unless how == "vertical"
            raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
          end

          # TODO
          # Lazy results return immediately; rechunking is delegated to `_concat_lf`.
          return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel))
        when Series
          # TODO
          Utils.wrap_s(_concat_series(items))
        when Expr
          # NOTE(review): relies on Expr#append (and Expr#rechunk below when
          # `rechunk` is true); confirm both exist on Expr before using this path.
          items[1..-1].reduce(first) { |combined, expr| combined.append(expr) }
        else
          raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
        end

      rechunk ? out.rechunk : out
    end
  end
end
data/lib/polars/io.rb ADDED
@@ -0,0 +1,39 @@
1
module Polars
  module IO
    # Reads a CSV source into a DataFrame.
    #
    # @param file [String, Pathname, URI, Object] local path, URI object or IO-like source
    # @param has_header [Boolean] forwarded to `DataFrame._read_csv`
    # @raise [ArgumentError] when `file` is an http(s) URL given as a String
    def read_csv(file, has_header: true)
      _prepare_file_arg(file) { |data| DataFrame._read_csv(data, has_header: has_header) }
    end

    # Reads a Parquet source into a DataFrame.
    #
    # @param file [String, Pathname, URI, Object] local path, URI object or IO-like source
    # @raise [ArgumentError] when `file` is an http(s) URL given as a String
    def read_parquet(file)
      _prepare_file_arg(file) { |data| DataFrame._read_parquet(data) }
    end

    # Reads a JSON source; `file` is handed to `DataFrame._read_json` as-is
    # (no URI handling on this path).
    def read_json(file)
      DataFrame._read_json(file)
    end

    # Reads a newline-delimited JSON source; `file` is handed to
    # `DataFrame._read_ndjson` as-is (no URI handling on this path).
    def read_ndjson(file)
      DataFrame._read_ndjson(file)
    end

    private

    # Resolves `file` into something a reader can consume and yields it.
    #
    # * http(s) URLs passed as plain Strings are rejected, so remote access is
    #   always explicit via `URI(...)`.
    # * URI objects are opened with open-uri; the handle is closed as soon as
    #   the block returns (previously it was left open).
    # * Anything else is yielded unchanged.
    #
    # @yieldparam source [Object] the object the reader should consume
    # @return [Object] the block's return value
    # @raise [ArgumentError] for an http(s) URL given as a String
    def _prepare_file_arg(file)
      if file.is_a?(String) && file.match?(%r{\Ahttps?://})
        raise ArgumentError, "use URI(...) for remote files"
      end

      if defined?(URI) && file.is_a?(URI)
        # Loaded lazily so open-uri is only pulled in when a remote file is read.
        require "open-uri"

        # Block form: open-uri closes the handle once the block has finished.
        return URI.open(file) { |remote| yield remote }
      end

      yield file
    end
  end
end