polars-df 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,13 +1,203 @@
1
1
  module Polars
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
2
3
  class LazyFrame
4
+ # @private
3
5
  attr_accessor :_ldf
4
6
 
7
+ # @private
5
8
  def self._from_rbldf(rb_ldf)
6
9
  ldf = LazyFrame.allocate
7
10
  ldf._ldf = rb_ldf
8
11
  ldf
9
12
  end
10
13
 
14
+ # @private
15
+ def self._scan_csv(
16
+ file,
17
+ has_header: true,
18
+ sep: ",",
19
+ comment_char: nil,
20
+ quote_char: '"',
21
+ skip_rows: 0,
22
+ dtypes: nil,
23
+ null_values: nil,
24
+ ignore_errors: false,
25
+ cache: true,
26
+ with_column_names: nil,
27
+ infer_schema_length: 100,
28
+ n_rows: nil,
29
+ encoding: "utf8",
30
+ low_memory: false,
31
+ rechunk: true,
32
+ skip_rows_after_header: 0,
33
+ row_count_name: nil,
34
+ row_count_offset: 0,
35
+ parse_dates: false,
36
+ eol_char: "\n"
37
+ )
38
+ dtype_list = nil
39
+ if !dtypes.nil?
40
+ dtype_list = []
41
+ dtypes.each do |k, v|
42
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
+ end
44
+ end
45
+ processed_null_values = Utils._process_null_values(null_values)
46
+
47
+ _from_rbldf(
48
+ RbLazyFrame.new_from_csv(
49
+ file,
50
+ sep,
51
+ has_header,
52
+ ignore_errors,
53
+ skip_rows,
54
+ n_rows,
55
+ cache,
56
+ dtype_list,
57
+ low_memory,
58
+ comment_char,
59
+ quote_char,
60
+ processed_null_values,
61
+ infer_schema_length,
62
+ with_column_names,
63
+ rechunk,
64
+ skip_rows_after_header,
65
+ encoding,
66
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
67
+ parse_dates,
68
+ eol_char
69
+ )
70
+ )
71
+ end
72
+
73
+ # @private
74
+ def self._scan_parquet(
75
+ file,
76
+ n_rows: nil,
77
+ cache: true,
78
+ parallel: "auto",
79
+ rechunk: true,
80
+ row_count_name: nil,
81
+ row_count_offset: 0,
82
+ storage_options: nil,
83
+ low_memory: false
84
+ )
85
+ _from_rbldf(
86
+ RbLazyFrame.new_from_parquet(
87
+ file,
88
+ n_rows,
89
+ cache,
90
+ parallel,
91
+ rechunk,
92
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
93
+ low_memory
94
+ )
95
+ )
96
+ end
97
+
98
+ # @private
99
+ def self._scan_ipc(
100
+ file,
101
+ n_rows: nil,
102
+ cache: true,
103
+ rechunk: true,
104
+ row_count_name: nil,
105
+ row_count_offset: 0,
106
+ storage_options: nil,
107
+ memory_map: true
108
+ )
109
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
110
+ file = Utils.format_path(file)
111
+ end
112
+
113
+ _from_rbldf(
114
+ RbLazyFrame.new_from_ipc(
115
+ file,
116
+ n_rows,
117
+ cache,
118
+ rechunk,
119
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
120
+ memory_map
121
+ )
122
+ )
123
+ end
124
+
125
+ # @private
126
+ def self._scan_ndjson(
127
+ file,
128
+ infer_schema_length: nil,
129
+ batch_size: nil,
130
+ n_rows: nil,
131
+ low_memory: false,
132
+ rechunk: true,
133
+ row_count_name: nil,
134
+ row_count_offset: 0
135
+ )
136
+ _from_rbldf(
137
+ RbLazyFrame.new_from_ndjson(
138
+ file,
139
+ infer_schema_length,
140
+ batch_size,
141
+ n_rows,
142
+ low_memory,
143
+ rechunk,
144
+ Utils._prepare_row_count_args(row_count_name, row_count_offset)
145
+ )
146
+ )
147
+ end
148
+
149
+ # def self.from_json
150
+ # end
151
+
152
+ # def self.read_json
153
+ # end
154
+
155
+ # def columns
156
+ # end
157
+
158
+ # def dtypes
159
+ # end
160
+
161
+ # def schema
162
+ # end
163
+
164
+ # def width
165
+ # end
166
+
167
+ # def include?(key)
168
+ # end
169
+
170
+ # clone handled by initialize_copy
171
+
172
+ # def [](item)
173
+ # end
174
+
175
+ # def to_s
176
+ # end
177
+ # alias_method :inspect, :to_s
178
+
179
+ # def write_json
180
+ # end
181
+
182
+ # def pipe
183
+ # end
184
+
185
+ # def describe_plan
186
+ # end
187
+
188
+ # def describe_optimized_plan
189
+ # end
190
+
191
+ # def show_graph
192
+ # end
193
+
194
+ # def sort
195
+ # end
196
+
197
+ # def profile
198
+ # end
199
+
200
+ #
11
201
  def collect(
12
202
  type_coercion: true,
13
203
  predicate_pushdown: true,
@@ -42,8 +232,23 @@ module Polars
42
232
  Utils.wrap_df(ldf.collect)
43
233
  end
44
234
 
235
+ # def fetch
236
+ # end
237
+
238
+ #
239
+ def lazy
240
+ self
241
+ end
242
+
243
+ # def cache
244
+ # end
245
+
246
+ # def cleared
247
+ # end
248
+
249
+ #
45
250
  def filter(predicate)
46
- self.class._from_rbldf(
251
+ _from_rbldf(
47
252
  _ldf.filter(
48
253
  Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
49
254
  )
@@ -52,7 +257,7 @@ module Polars
52
257
 
53
258
  def select(exprs)
54
259
  exprs = Utils.selection_to_rbexpr_list(exprs)
55
- self.class._from_rbldf(_ldf.select(exprs))
260
+ _from_rbldf(_ldf.select(exprs))
56
261
  end
57
262
 
58
263
  def groupby(by, maintain_order: false)
@@ -61,6 +266,16 @@ module Polars
61
266
  LazyGroupBy.new(lgb, self.class)
62
267
  end
63
268
 
269
+ # def groupby_rolling
270
+ # end
271
+
272
+ # def groupby_dynamic
273
+ # end
274
+
275
+ # def join_asof
276
+ # end
277
+
278
+ #
64
279
  def join(
65
280
  other,
66
281
  left_on: nil,
@@ -76,7 +291,7 @@ module Polars
76
291
  end
77
292
 
78
293
  if how == "cross"
79
- return self.class._from_rbldf(
294
+ return _from_rbldf(
80
295
  _ldf.join(
81
296
  other._ldf, [], [], allow_parallel, force_parallel, how, suffix
82
297
  )
@@ -94,7 +309,7 @@ module Polars
94
309
  raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
95
310
  end
96
311
 
97
- self.class._from_rbldf(
312
+ _from_rbldf(
98
313
  self._ldf.join(
99
314
  other._ldf,
100
315
  rbexprs_left,
@@ -123,17 +338,134 @@ module Polars
123
338
  when Expr
124
339
  rbexprs << e._rbexpr
125
340
  when Series
126
- rbexprs = Internal.lit(e)._rbexpr
341
+ rbexprs = Utils.lit(e)._rbexpr
127
342
  else
128
343
  raise ArgumentError, "Expected an expression, got #{e}"
129
344
  end
130
345
  end
131
346
 
132
- self.class._from_rbldf(_ldf.with_columns(rbexprs))
347
+ _from_rbldf(_ldf.with_columns(rbexprs))
133
348
  end
134
349
 
350
+ # def with_context
351
+ # end
352
+
353
+ #
135
354
  def with_column(column)
136
355
  with_columns([column])
137
356
  end
357
+
358
+ # def drop
359
+ # end
360
+
361
+ #
362
+ def rename(mapping)
363
+ existing = mapping.keys
364
+ _new = mapping.values
365
+ _from_rbldf(_ldf.rename(existing, _new))
366
+ end
367
+
368
+ # def reverse
369
+ # end
370
+
371
+ # def shift
372
+ # end
373
+
374
+ # def shift_and_fill
375
+ # end
376
+
377
+ # def slice
378
+ # end
379
+
380
+ # def limit
381
+ # end
382
+
383
+ # def head
384
+ # end
385
+
386
+ # def tail
387
+ # end
388
+
389
+ # def last
390
+ # end
391
+
392
+ # def first
393
+ # end
394
+
395
+ # def with_row_count
396
+ # end
397
+
398
+ # def take_every
399
+ # end
400
+
401
+ # def fill_null
402
+ # end
403
+
404
+ #
405
+ def fill_nan(fill_value)
406
+ if !fill_value.is_a?(Expr)
407
+ fill_value = Utils.lit(fill_value)
408
+ end
409
+ _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
410
+ end
411
+
412
+ # def std
413
+ # end
414
+
415
+ # def var
416
+ # end
417
+
418
+ # def max
419
+ # end
420
+
421
+ # def min
422
+ # end
423
+
424
+ # def sum
425
+ # end
426
+
427
+ # def mean
428
+ # end
429
+
430
+ # def median
431
+ # end
432
+
433
+ # def quantile
434
+ # end
435
+
436
+ #
437
+ def explode(columns)
438
+ columns = Utils.selection_to_rbexpr_list(columns)
439
+ _from_rbldf(_ldf.explode(columns))
440
+ end
441
+
442
+ # def unique
443
+ # end
444
+
445
+ # def drop_nulls
446
+ # end
447
+
448
+ # def melt
449
+ # end
450
+
451
+ # def map
452
+ # end
453
+
454
+ # def interpolate
455
+ # end
456
+
457
+ # def unnest
458
+ # end
459
+
460
+ private
461
+
462
+ def initialize_copy(other)
463
+ super
464
+ self._ldf = _ldf._clone
465
+ end
466
+
467
+ def _from_rbldf(rb_ldf)
468
+ self.class._from_rbldf(rb_ldf)
469
+ end
138
470
  end
139
471
  end
@@ -1,10 +1,45 @@
1
1
  module Polars
2
2
  module LazyFunctions
3
3
  def col(name)
4
- name = name.to_s if name.is_a?(Symbol)
5
- Utils.wrap_expr(RbExpr.col(name))
4
+ if name.is_a?(Series)
5
+ name = name.to_a
6
+ end
7
+
8
+ if name.is_a?(Array)
9
+ if name.length == 0 || name[0].is_a?(String) || name[0].is_a?(Symbol)
10
+ name = name.map { |v| v.is_a?(Symbol) ? v.to_s : v }
11
+ Utils.wrap_expr(RbExpr.cols(name))
12
+ elsif Utils.is_polars_dtype(name[0])
13
+ raise Todo
14
+ # Utils.wrap_expr(_dtype_cols(name))
15
+ else
16
+ raise ArgumentError, "Expected list values to be all `str` or all `DataType`"
17
+ end
18
+ else
19
+ name = name.to_s if name.is_a?(Symbol)
20
+ Utils.wrap_expr(RbExpr.col(name))
21
+ end
6
22
  end
7
23
 
24
+ def element
25
+ col("")
26
+ end
27
+
28
+ def count(column = nil)
29
+ if column.nil?
30
+ return Utils.wrap_expr(RbExpr.count)
31
+ end
32
+
33
+ if column.is_a?(Series)
34
+ column.len
35
+ else
36
+ col(column).count
37
+ end
38
+ end
39
+
40
+ # def to_list
41
+ # end
42
+
8
43
  def std(column, ddof: 1)
9
44
  if column.is_a?(Series)
10
45
  column.std(ddof: ddof)
@@ -55,7 +90,7 @@ module Polars
55
90
  # TODO
56
91
  Utils.wrap_expr(_sum_exprs(exprs))
57
92
  else
58
- raise "todo"
93
+ raise Todo
59
94
  end
60
95
  end
61
96
 
@@ -79,10 +114,97 @@ module Polars
79
114
  end
80
115
  end
81
116
 
117
+ # def n_unique
118
+ # end
119
+
120
+ def first(column = nil)
121
+ if column.nil?
122
+ return Utils.wrap_expr(RbExpr.first)
123
+ end
124
+
125
+ if column.is_a?(Series)
126
+ if column.len > 0
127
+ column[0]
128
+ else
129
+ raise IndexError, "The series is empty, so no first value can be returned."
130
+ end
131
+ else
132
+ col(column).first
133
+ end
134
+ end
135
+
136
+ # def last
137
+ # end
138
+
139
+ # def head
140
+ # end
141
+
142
+ # def tail
143
+ # end
144
+
82
145
  def lit(value)
83
146
  Utils.wrap_expr(RbExpr.lit(value))
84
147
  end
85
148
 
149
+ # def cumsum
150
+ # end
151
+
152
+ # def spearman_rank_corr
153
+ # end
154
+
155
+ # def pearson_corr
156
+ # end
157
+
158
+ # def cov
159
+ # end
160
+
161
+ # def map
162
+ # end
163
+
164
+ # def apply
165
+ # end
166
+
167
+ def fold(acc, f, exprs)
168
+ acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
169
+ if exprs.is_a?(Expr)
170
+ exprs = [exprs]
171
+ end
172
+
173
+ exprs = Utils.selection_to_rbexpr_list(exprs)
174
+ Utils.wrap_expr(RbExpr.fold(acc._rbexpr, f, exprs))
175
+ end
176
+
177
+ # def reduce
178
+ # end
179
+
180
+ # def cumfold
181
+ # end
182
+
183
+ # def cumreduce
184
+ # end
185
+
186
+ # def any
187
+ # end
188
+
189
+ # def exclude
190
+ # end
191
+
192
+ def all(name = nil)
193
+ if name.nil?
194
+ col("*")
195
+ elsif name.is_a?(String) || name.is_a?(Symbol)
196
+ col(name).all
197
+ else
198
+ raise Todo
199
+ end
200
+ end
201
+
202
+ # def groups
203
+ # end
204
+
205
+ # def quantile
206
+ # end
207
+
86
208
  def arange(low, high, step: 1, eager: false, dtype: nil)
87
209
  low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
88
210
  high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
@@ -102,16 +224,41 @@ module Polars
102
224
  end
103
225
  end
104
226
 
105
- def all(name = nil)
106
- if name.nil?
107
- col("*")
108
- elsif name.is_a?(String) || name.is_a?(Symbol)
109
- col(name).all
110
- else
111
- raise "todo"
112
- end
227
+ # def argsort_by
228
+ # end
229
+
230
+ # def duration
231
+ # end
232
+
233
+ # def format
234
+ # end
235
+
236
+ def concat_list(exprs)
237
+ exprs = Utils.selection_to_rbexpr_list(exprs)
238
+ Utils.wrap_expr(RbExpr.concat_lst(exprs))
113
239
  end
114
240
 
241
+ # def collect_all
242
+ # end
243
+
244
+ # def select
245
+ # end
246
+
247
+ # def struct
248
+ # end
249
+
250
+ # def repeat
251
+ # end
252
+
253
+ # def arg_where
254
+ # end
255
+
256
+ # def coalesce
257
+ # end
258
+
259
+ # def from_epoch
260
+ # end
261
+
115
262
  def when(expr)
116
263
  expr = Utils.expr_to_lit_or_expr(expr)
117
264
  pw = RbExpr.when(expr._rbexpr)
@@ -0,0 +1,108 @@
1
+ module Polars
2
+ class ListExpr
3
+ attr_accessor :_rbexpr
4
+
5
+ def initialize(expr)
6
+ self._rbexpr = expr._rbexpr
7
+ end
8
+
9
+ def lengths
10
+ Utils.wrap_expr(_rbexpr.arr_lengths)
11
+ end
12
+
13
+ def sum
14
+ Utils.wrap_expr(_rbexpr.lst_sum)
15
+ end
16
+
17
+ def max
18
+ Utils.wrap_expr(_rbexpr.lst_max)
19
+ end
20
+
21
+ def min
22
+ Utils.wrap_expr(_rbexpr.lst_min)
23
+ end
24
+
25
+ def mean
26
+ Utils.wrap_expr(_rbexpr.lst_mean)
27
+ end
28
+
29
+ def sort(reverse: false)
30
+ Utils.wrap_expr(_rbexpr.lst_sort(reverse))
31
+ end
32
+
33
+ def reverse
34
+ Utils.wrap_expr(_rbexpr.lst_reverse)
35
+ end
36
+
37
+ def unique
38
+ Utils.wrap_expr(_rbexpr.lst_unique)
39
+ end
40
+
41
+ # def concat
42
+ # end
43
+
44
+ def get(index)
45
+ index = Utils.expr_to_lit_or_expr(index, str_to_lit: false)._rbexpr
46
+ Utils.wrap_expr(_rbexpr.lst_get(index))
47
+ end
48
+
49
+ def [](item)
50
+ get(item)
51
+ end
52
+
53
+ def first
54
+ get(0)
55
+ end
56
+
57
+ def last
58
+ get(-1)
59
+ end
60
+
61
+ def contains(item)
62
+ Utils.wrap_expr(_rbexpr.arr_contains(Utils.expr_to_lit_or_expr(item)._rbexpr))
63
+ end
64
+
65
+ def join(separator)
66
+ Utils.wrap_expr(_rbexpr.lst_join(separator))
67
+ end
68
+
69
+ def arg_min
70
+ Utils.wrap_expr(_rbexpr.lst_arg_min)
71
+ end
72
+
73
+ def arg_max
74
+ Utils.wrap_expr(_rbexpr.lst_arg_max)
75
+ end
76
+
77
+ def diff(n: 1, null_behavior: "ignore")
78
+ Utils.wrap_expr(_rbexpr.lst_diff(n, null_behavior))
79
+ end
80
+
81
+ def shift(periods = 1)
82
+ Utils.wrap_expr(_rbexpr.lst_shift(periods))
83
+ end
84
+
85
+ def slice(offset, length = nil)
86
+ offset = Utils.expr_to_lit_or_expr(offset, str_to_lit: false)._rbexpr
87
+ length = Utils.expr_to_lit_or_expr(length, str_to_lit: false)._rbexpr
88
+ Utils.wrap_expr(_rbexpr.lst_slice(offset, length))
89
+ end
90
+
91
+ def head(n = 5)
92
+ slice(0, n)
93
+ end
94
+
95
+ def tail(n = 5)
96
+ offset = -Utils.expr_to_lit_or_expr(n, str_to_lit: false)
97
+ slice(offset, n)
98
+ end
99
+
100
+ # def to_struct(n_field_strategy: "first_non_null", name_generator: nil)
101
+ # Utils.wrap_expr(_rbexpr.lst_to_struct(n_field_strategy, name_generator))
102
+ # end
103
+
104
+ def eval(expr, parallel: false)
105
+ Utils.wrap_expr(_rbexpr.lst_eval(expr._rbexpr, parallel))
106
+ end
107
+ end
108
+ end