polars-df 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,315 @@
1
module Polars
  # Ruby-facing wrapper around a native frame object stored in `_df`.
  # Almost every method here delegates to that object, either directly or
  # through a LazyFrame obtained from `lazy`.
  class DataFrame
    # Native frame handle that all methods delegate to.
    attr_accessor :_df

    # Builds a frame from in-memory data.
    #
    # @param data [nil, Hash, Array, Series, ActiveRecord::Relation, ActiveRecord::Result]
    #   nil yields a frame built from an empty hash; a Hash has its Symbol keys
    #   converted to Strings; an Array must contain objects responding to `_s`.
    #   ActiveRecord inputs are only recognised when ActiveRecord is loaded.
    # @raise [ArgumentError] when `data` is none of the supported types
    def initialize(data = nil)
      if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
        result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
        # Pivot the row-oriented result into a column name => values hash.
        data = {}
        result.columns.each_with_index do |k, i|
          data[k] = result.rows.map { |r| r[i] }
        end
      end

      self._df =
        case data
        when nil
          hash_to_rbdf({})
        when Hash
          hash_to_rbdf(data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v })
        when Array
          sequence_to_rbdf(data)
        when Series
          series_to_rbdf(data)
        else
          raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
        end
    end

    # Wraps an existing native frame without running `initialize`.
    #
    # @param rb_df [Object] native frame handle
    # @return [DataFrame]
    def self._from_rbdf(rb_df)
      df = DataFrame.allocate
      df._df = rb_df
      df
    end

    # Passes String and Pathname arguments through `Utils.format_path`;
    # anything else (for example an IO object) is returned untouched.
    #
    # @param file [String, Pathname, Object]
    # @return [Object] the formatted path, or `file` itself
    def self._format_file_arg(file)
      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
        Utils.format_path(file)
      else
        file
      end
    end

    # @param file [String, Pathname, Object] path or readable object
    # @param has_header [Boolean] forwarded to the native reader
    # @return [DataFrame]
    def self._read_csv(file, has_header: true)
      _from_rbdf(RbDataFrame.read_csv(_format_file_arg(file), has_header))
    end

    # @param file [String, Pathname, Object] path or readable object
    # @return [DataFrame]
    def self._read_parquet(file)
      _from_rbdf(RbDataFrame.read_parquet(_format_file_arg(file)))
    end

    # @param file [String, Pathname, Object] path or readable object
    # @return [DataFrame]
    def self._read_json(file)
      _from_rbdf(RbDataFrame.read_json(_format_file_arg(file)))
    end

    # @param file [String, Pathname, Object] path or readable object
    # @return [DataFrame]
    def self._read_ndjson(file)
      _from_rbdf(RbDataFrame.read_ndjson(_format_file_arg(file)))
    end

    # @return [Object] whatever the native `shape` returns
    def shape
      _df.shape
    end

    # @return [Object] whatever the native `height` returns
    def height
      _df.height
    end

    # @return [Object] whatever the native `width` returns
    def width
      _df.width
    end

    # @return [Array] column names as reported by the native frame
    def columns
      _df.columns
    end

    # @return [Array<Symbol>] native dtype names converted with `to_sym`
    def dtypes
      _df.dtypes.map(&:to_sym)
    end

    # @return [String] the native frame's string rendering
    def to_s
      _df.to_s
    end
    alias_method :inspect, :to_s

    # @param name [Object] column name, compared as-is against `columns`
    # @return [Boolean]
    def include?(name)
      columns.include?(name)
    end

    # Fetches a single column.
    #
    # @param name [Object] forwarded to the native `column` lookup
    # @return [Object] the column wrapped by `Utils.wrap_s`
    def [](name)
      Utils.wrap_s(_df.column(name))
    end

    # Selects a column by position.
    #
    # @param index [Integer] a negative value counts back from the last column
    # @return [Object] the column wrapped by `Utils.wrap_s`
    def to_series(index = 0)
      index += columns.length if index < 0
      Utils.wrap_s(_df.select_at_idx(index))
    end

    # @param file [String, Pathname, Object] destination path or writable object
    # @param pretty [Boolean] forwarded to the native writer
    # @param row_oriented [Boolean] forwarded to the native writer
    # @return [nil]
    def write_json(
      file,
      pretty: false,
      row_oriented: false
    )
      _df.write_json(self.class._format_file_arg(file), pretty, row_oriented)
      nil
    end

    # @param file [String, Pathname, Object] destination path or writable object
    # @return [nil]
    def write_ndjson(file)
      _df.write_ndjson(self.class._format_file_arg(file))
      nil
    end

    # Writes the frame as CSV.
    #
    # @param file [nil, String, Pathname, Object] destination; when nil the CSV
    #   is written to an in-memory buffer and returned
    # @param sep [String] field separator, exactly one byte
    # @param quote [String] quote character, exactly one byte
    # @param null_value [String, nil] an empty string is treated as nil
    # @return [String, nil] UTF-8 tagged CSV text when `file` is nil, else nil
    # @raise [ArgumentError] when `sep` or `quote` is not exactly one byte
    def write_csv(
      file = nil,
      has_header: true,
      sep: ",",
      quote: '"',
      batch_size: 1024,
      datetime_format: nil,
      date_format: nil,
      time_format: nil,
      float_precision: nil,
      null_value: nil
    )
      # Check bytes rather than characters: the delimiter is handed over as a
      # single ordinal, so "" and multi-byte characters must be rejected here.
      if sep.bytesize != 1
        raise ArgumentError, "only single byte separator is allowed"
      elsif quote.bytesize != 1
        raise ArgumentError, "only single byte quote char is allowed"
      end
      null_value = nil if null_value == ""

      # Positional arguments shared by both destinations below.
      writer_args = [
        has_header,
        sep.ord,
        quote.ord,
        batch_size,
        datetime_format,
        date_format,
        time_format,
        float_precision,
        null_value
      ]

      if file.nil?
        buffer = StringIO.new
        buffer.set_encoding(Encoding::BINARY)
        _df.write_csv(buffer, *writer_args)
        buffer.rewind
        return buffer.read.force_encoding(Encoding::UTF_8)
      end

      _df.write_csv(self.class._format_file_arg(file), *writer_args)
      nil
    end

    # @param file [String, Pathname, Object] destination path or writable object
    # @param compression [String, nil] nil is sent as "uncompressed"
    # @return [Object] whatever the native `write_parquet` returns
    def write_parquet(
      file,
      compression: "zstd",
      compression_level: nil,
      statistics: false,
      row_group_size: nil
    )
      compression = "uncompressed" if compression.nil?

      _df.write_parquet(
        self.class._format_file_arg(file), compression, compression_level, statistics, row_group_size
      )
    end

    # @param predicate [Object] forwarded to `LazyFrame#filter`
    # @return [Object] result of collecting the lazy query
    def filter(predicate)
      lazy.filter(predicate).collect
    end

    # @param by [Object] forwarded to the native `sort`
    # @return [DataFrame]
    def sort(by, reverse: false, nulls_last: false)
      _from_rbdf(_df.sort(by, reverse, nulls_last))
    end

    # @param other [DataFrame] frame whose `_df` is compared with this one
    # @return [Object] whatever the native `frame_equal` returns
    def frame_equal(other, null_equal: true)
      _df.frame_equal(other._df, null_equal)
    end

    # Same as `head`.
    #
    # @return [DataFrame]
    def limit(n = 5)
      head(n)
    end

    # @param n [Integer] forwarded to the native `head`
    # @return [DataFrame]
    def head(n = 5)
      _from_rbdf(_df.head(n))
    end

    # @param n [Integer] forwarded to the native `tail`
    # @return [DataFrame]
    def tail(n = 5)
      _from_rbdf(_df.tail(n))
    end

    # @return [Object] result of `LazyFrame#groupby` (not collected here)
    def groupby(by, maintain_order: false)
      lazy.groupby(by, maintain_order: maintain_order)
    end

    # Joins with another frame through the lazy API and collects the result.
    #
    # @param other [#lazy] right-hand frame
    # @return [Object] result of collecting the lazy join
    def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
      joined = lazy.join(
        other.lazy,
        left_on: left_on,
        right_on: right_on,
        on: on,
        how: how,
        suffix: suffix
      )
      joined.collect(no_optimization: true)
    end

    # @param column [Object] forwarded to `LazyFrame#with_column`
    # @return [Object] result of collecting the lazy query
    def with_column(column)
      lazy
        .with_column(column)
        .collect(no_optimization: true, string_cache: false)
    end

    # @return [Object] the native lazy frame wrapped by `LazyFrame._from_rbldf`
    def lazy
      wrap_ldf(_df.lazy)
    end

    # @param exprs [Object] forwarded to `LazyFrame#select`
    # @return [DataFrame]
    def select(exprs)
      collected = lazy.select(exprs).collect(no_optimization: true, string_cache: false)
      _from_rbdf(collected._df)
    end

    # @param axis [Integer] 0 uses the native `mean`, 1 uses the native `hmean`
    # @param null_strategy [String] only forwarded when `axis` is 1
    # @return [DataFrame, Object] a DataFrame for axis 0, a wrapped series for axis 1
    # @raise [ArgumentError] when `axis` is neither 0 nor 1
    def mean(axis: 0, null_strategy: "ignore")
      case axis
      when 0
        _from_rbdf(_df.mean)
      when 1
        Utils.wrap_s(_df.hmean(null_strategy))
      else
        raise ArgumentError, "Axis should be 0 or 1."
      end
    end

    # @param exprs [Array, Object, nil] a single non-nil value is wrapped in an Array
    # @return [Object] result of collecting the lazy query
    def with_columns(exprs)
      exprs = [exprs] unless exprs.nil? || exprs.is_a?(Array)
      lazy
        .with_columns(exprs)
        .collect(no_optimization: true, string_cache: false)
    end

    # @return [DataFrame]
    def rechunk
      _from_rbdf(_df.rechunk)
    end

    # @return [DataFrame]
    def null_count
      _from_rbdf(_df.null_count)
    end

    private

    def hash_to_rbdf(data)
      RbDataFrame.read_hash(data)
    end

    def sequence_to_rbdf(data)
      RbDataFrame.new(data.map(&:_s))
    end

    def series_to_rbdf(data)
      RbDataFrame.new([data._s])
    end

    def wrap_ldf(ldf)
      LazyFrame._from_rbldf(ldf)
    end

    # Instance-level shortcut to the class-level wrapper.
    def _from_rbdf(rb_df)
      self.class._from_rbdf(rb_df)
    end
  end
end
@@ -0,0 +1,233 @@
1
module Polars
  # Ruby-facing wrapper around a native expression object stored in `_rbexpr`.
  # Each method builds a new expression by calling the matching native method
  # and wrapping the result with `Utils.wrap_expr`.
  class Expr
    # Native expression handle that all methods delegate to.
    attr_accessor :_rbexpr

    # Wraps an existing native expression without running `initialize`.
    #
    # @param rbexpr [Object] native expression handle
    # @return [Expr]
    def self._from_rbexpr(rbexpr)
      expr = Expr.allocate
      expr._rbexpr = rbexpr
      expr
    end

    # @return [String] the native expression's `to_str` rendering
    def to_s
      _rbexpr.to_str
    end
    alias_method :inspect, :to_s

    # The binary operators below accept either an Expr or a plain value;
    # plain values are turned into expressions with `Utils.lit`.

    def ^(other)
      wrap_expr(_rbexpr._xor(_to_rbexpr(other)))
    end

    def &(other)
      wrap_expr(_rbexpr._and(_to_rbexpr(other)))
    end

    def |(other)
      wrap_expr(_rbexpr._or(_to_rbexpr(other)))
    end

    def *(other)
      wrap_expr(_rbexpr * _to_rbexpr(other))
    end

    def >=(other)
      wrap_expr(_rbexpr.gt_eq(_to_rbexpr(other)))
    end

    def <=(other)
      wrap_expr(_rbexpr.lt_eq(_to_rbexpr(other)))
    end

    # NOTE: builds an expression instead of returning a Boolean, so Expr
    # instances cannot be compared for identity with `==`.
    def ==(other)
      wrap_expr(_rbexpr.eq(_to_rbexpr(other)))
    end

    def !=(other)
      wrap_expr(_rbexpr.neq(_to_rbexpr(other)))
    end

    def <(other)
      wrap_expr(_rbexpr.lt(_to_rbexpr(other)))
    end

    def >(other)
      wrap_expr(_rbexpr.gt(_to_rbexpr(other)))
    end

    # @param name [Object] forwarded to the native `_alias`
    # @return [Object] wrapped expression
    def alias(name)
      wrap_expr(_rbexpr._alias(name))
    end

    # @param suffix [Object] forwarded to the native `suffix`
    # @return [Object] wrapped expression
    def suffix(suffix)
      wrap_expr(_rbexpr.suffix(suffix))
    end

    def is_not
      wrap_expr(_rbexpr.is_not)
    end

    def is_null
      wrap_expr(_rbexpr.is_null)
    end

    def is_not_null
      wrap_expr(_rbexpr.is_not_null)
    end

    def count
      wrap_expr(_rbexpr.count)
    end

    # Same as `count`.
    def len
      count
    end

    # @param reverse [Boolean] forwarded to the native `sort_with`
    # @param nulls_last [Boolean] forwarded to the native `sort_with`
    # @return [Object] wrapped expression
    def sort(reverse: false, nulls_last: false)
      wrap_expr(_rbexpr.sort_with(reverse, nulls_last))
    end

    # @param by [Array, Object] a single value is wrapped in an Array before
    #   being converted by `Utils.selection_to_rbexpr_list`
    # @param reverse [Array, Boolean] a single value is wrapped in an Array
    # @return [Object] wrapped expression
    def sort_by(by, reverse: false)
      by = [by] unless by.is_a?(Array)
      reverse = [reverse] unless reverse.is_a?(Array)
      by = Utils.selection_to_rbexpr_list(by)

      wrap_expr(_rbexpr.sort_by(by, reverse))
    end

    # Replaces nulls either with a value or according to a strategy.
    #
    # @param value [Object, nil] fill value; converted with
    #   `Utils.expr_to_lit_or_expr`
    # @param strategy [String, nil] forwarded to the native
    #   `fill_null_with_strategy` when no value is given
    # @param limit [Integer, nil] only accepted together with the "forward"
    #   or "backward" strategy
    # @return [Object] wrapped expression
    # @raise [ArgumentError] when both or neither of `value` and `strategy`
    #   are given, or when `limit` is given with any other strategy
    def fill_null(value = nil, strategy: nil, limit: nil)
      if !value.nil? && !strategy.nil?
        raise ArgumentError, "cannot specify both 'value' and 'strategy'."
      elsif value.nil? && strategy.nil?
        raise ArgumentError, "must specify either a fill 'value' or 'strategy'"
      elsif !limit.nil? && !["forward", "backward"].include?(strategy)
        # The membership test must be negated: a limit is valid *only* for
        # the forward/backward strategies.
        raise ArgumentError, "can only specify 'limit' when strategy is set to 'backward' or 'forward'"
      end

      if !value.nil?
        value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
        wrap_expr(_rbexpr.fill_null(value._rbexpr))
      else
        wrap_expr(_rbexpr.fill_null_with_strategy(strategy, limit))
      end
    end

    # @param fill_value [Object] converted with `Utils.expr_to_lit_or_expr`
    # @return [Object] wrapped expression
    def fill_nan(fill_value)
      fill_value = Utils.expr_to_lit_or_expr(fill_value, str_to_lit: true)
      wrap_expr(_rbexpr.fill_nan(fill_value._rbexpr))
    end

    def reverse
      wrap_expr(_rbexpr.reverse)
    end

    # @param ddof [Integer] forwarded to the native `std`
    def std(ddof: 1)
      wrap_expr(_rbexpr.std(ddof))
    end

    # @param ddof [Integer] forwarded to the native `var`
    def var(ddof: 1)
      wrap_expr(_rbexpr.var(ddof))
    end

    def max
      wrap_expr(_rbexpr.max)
    end

    def min
      wrap_expr(_rbexpr.min)
    end

    def nan_max
      wrap_expr(_rbexpr.nan_max)
    end

    def nan_min
      wrap_expr(_rbexpr.nan_min)
    end

    def sum
      wrap_expr(_rbexpr.sum)
    end

    def mean
      wrap_expr(_rbexpr.mean)
    end

    def median
      wrap_expr(_rbexpr.median)
    end

    def product
      wrap_expr(_rbexpr.product)
    end

    def n_unique
      wrap_expr(_rbexpr.n_unique)
    end

    # @param maintain_order [Boolean] selects the native `unique_stable`
    #   instead of `unique`
    # @return [Object] wrapped expression
    def unique(maintain_order: false)
      wrap_expr(maintain_order ? _rbexpr.unique_stable : _rbexpr.unique)
    end

    def first
      wrap_expr(_rbexpr.first)
    end

    def last
      wrap_expr(_rbexpr.last)
    end

    # @param expr [Object] converted by `Utils.selection_to_rbexpr_list`
    # @return [Object] wrapped expression
    def over(expr)
      rbexprs = Utils.selection_to_rbexpr_list(expr)
      wrap_expr(_rbexpr.over(rbexprs))
    end

    # @param predicate [#_rbexpr] must already be an expression; no literal
    #   conversion is applied here
    # @return [Object] wrapped expression
    def filter(predicate)
      wrap_expr(_rbexpr.filter(predicate._rbexpr))
    end

    # @param n [Integer] forwarded to the native `head`
    def head(n = 10)
      wrap_expr(_rbexpr.head(n))
    end

    # @param n [Integer] forwarded to the native `tail`
    def tail(n = 10)
      wrap_expr(_rbexpr.tail(n))
    end

    # Same as `head`.
    def limit(n = 10)
      head(n)
    end

    def interpolate
      wrap_expr(_rbexpr.interpolate)
    end

    def list
      wrap_expr(_rbexpr.list)
    end

    # @return [StringExpr] accessor object constructed from this expression
    def str
      StringExpr.new(self)
    end

    private

    def wrap_expr(expr)
      Utils.wrap_expr(expr)
    end

    # Native handle for `other`, converting plain values to literals first.
    def _to_rbexpr(other)
      _to_expr(other)._rbexpr
    end

    def _to_expr(other)
      other.is_a?(Expr) ? other : Utils.lit(other)
    end
  end
end
@@ -0,0 +1,45 @@
1
module Polars
  # Mixin holding frame-level helper functions.
  module Functions
    # Combines several objects of the same kind into one.
    #
    # The type of the first item selects the strategy. The `_concat_df`,
    # `_diag_concat_df`, `_hor_concat_df`, `_concat_lf` and `_concat_series`
    # helpers are not defined in this module.
    # NOTE(review): presumably they are supplied by the object this module is
    # mixed into — confirm against the native extension.
    #
    # @param items [Array<DataFrame>, Array<LazyFrame>, Array<Series>, Array<Expr>]
    # @param rechunk [Boolean] when true, `rechunk` is called on the result
    #   (LazyFrame input instead passes the flag to `_concat_lf`)
    # @param how [String] "vertical", "diagonal" or "horizontal" for
    #   DataFrames; only "vertical" for LazyFrames; ignored otherwise
    # @param parallel [Boolean] only forwarded for LazyFrame input
    # @return [Object] the combined object
    # @raise [ArgumentError] for an empty list, an unsupported `how`, or an
    #   unsupported item type
    def concat(items, rechunk: true, how: "vertical", parallel: true)
      raise ArgumentError, "cannot concat empty list" if items.empty?

      first = items[0]
      if first.is_a?(DataFrame)
        out =
          case how
          when "vertical"
            Utils.wrap_df(_concat_df(items))
          when "diagonal"
            Utils.wrap_df(_diag_concat_df(items))
          when "horizontal"
            Utils.wrap_df(_hor_concat_df(items))
          else
            raise ArgumentError, "how must be one of {'vertical', 'diagonal', 'horizontal'}, got #{how}"
          end
      elsif first.is_a?(LazyFrame)
        unless how == "vertical"
          raise ArgumentError, "Lazy only allows 'vertical' concat strategy."
        end

        # TODO
        # Returned directly: rechunking is delegated to `_concat_lf`.
        return Utils.wrap_ldf(_concat_lf(items, rechunk, parallel))
      elsif first.is_a?(Series)
        # TODO
        out = Utils.wrap_s(_concat_series(items))
      elsif first.is_a?(Expr)
        # NOTE(review): relies on Expr#append (and Expr#rechunk below) being
        # available — verify, neither is visible alongside this module.
        out = first
        items.drop(1).each do |e|
          out = out.append(e)
        end
      else
        raise ArgumentError, "did not expect type: #{first.class.name} in 'Polars.concat'."
      end

      rechunk ? out.rechunk : out
    end
  end
end
data/lib/polars/io.rb ADDED
@@ -0,0 +1,39 @@
1
module Polars
  # Mixin with the public file readers. Each reader hands its input to the
  # matching `DataFrame._read_*` class method.
  module IO
    # @param file [String, Pathname, URI, Object] local path, URI object, or
    #   readable object
    # @param has_header [Boolean] forwarded to `DataFrame._read_csv`
    # @return [DataFrame]
    # @raise [ArgumentError] when `file` is an http(s) URL given as a String
    def read_csv(file, has_header: true)
      _prepare_file_arg(file) do |data|
        DataFrame._read_csv(data, has_header: has_header)
      end
    end

    # @param file [String, Pathname, URI, Object] local path, URI object, or
    #   readable object
    # @return [DataFrame]
    # @raise [ArgumentError] when `file` is an http(s) URL given as a String
    def read_parquet(file)
      _prepare_file_arg(file) do |data|
        DataFrame._read_parquet(data)
      end
    end

    # @param file [Object] passed straight to `DataFrame._read_json`
    # @return [DataFrame]
    def read_json(file)
      DataFrame._read_json(file)
    end

    # @param file [Object] passed straight to `DataFrame._read_ndjson`
    # @return [DataFrame]
    def read_ndjson(file)
      DataFrame._read_ndjson(file)
    end

    private

    # Yields something the readers can consume and returns the block's value.
    # URI objects are opened through open-uri; everything else is yielded
    # unchanged.
    #
    # @param file [String, URI, Object]
    # @yieldparam data [Object] the opened handle or the original argument
    # @raise [ArgumentError] when `file` is an http(s) URL given as a String
    def _prepare_file_arg(file)
      if file.is_a?(String) && file.match?(%r{\Ahttps?://})
        raise ArgumentError, "use URI(...) for remote files"
      end

      if defined?(URI) && file.is_a?(URI)
        # Loaded lazily so open-uri is only pulled in for remote reads.
        require "open-uri"

        # Block form so the handle is released once the reader has finished
        # instead of staying open until garbage collection.
        return URI.open(file) { |io| yield io }
      end

      yield file
    end
  end
end