polars-df 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,203 @@
1
1
  module Polars
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
2
3
  class LazyFrame
4
+ # @private
3
5
  attr_accessor :_ldf
4
6
 
7
+ # @private
5
8
  def self._from_rbldf(rb_ldf)
6
9
  ldf = LazyFrame.allocate
7
10
  ldf._ldf = rb_ldf
8
11
  ldf
9
12
  end
10
13
 
14
+ # @private
15
+ def self._scan_csv(
16
+ file,
17
+ has_header: true,
18
+ sep: ",",
19
+ comment_char: nil,
20
+ quote_char: '"',
21
+ skip_rows: 0,
22
+ dtypes: nil,
23
+ null_values: nil,
24
+ ignore_errors: false,
25
+ cache: true,
26
+ with_column_names: nil,
27
+ infer_schema_length: 100,
28
+ n_rows: nil,
29
+ encoding: "utf8",
30
+ low_memory: false,
31
+ rechunk: true,
32
+ skip_rows_after_header: 0,
33
+ row_count_name: nil,
34
+ row_count_offset: 0,
35
+ parse_dates: false,
36
+ eol_char: "\n"
37
+ )
38
+ dtype_list = nil
39
+ if !dtypes.nil?
40
+ dtype_list = []
41
+ dtypes.each do |k, v|
42
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
+ end
44
+ end
45
+ processed_null_values = Utils._process_null_values(null_values)
46
+
47
+ _from_rbldf(
48
+ RbLazyFrame.new_from_csv(
49
+ file,
50
+ sep,
51
+ has_header,
52
+ ignore_errors,
53
+ skip_rows,
54
+ n_rows,
55
+ cache,
56
+ dtype_list,
57
+ low_memory,
58
+ comment_char,
59
+ quote_char,
60
+ processed_null_values,
61
+ infer_schema_length,
62
+ with_column_names,
63
+ rechunk,
64
+ skip_rows_after_header,
65
+ encoding,
66
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
67
+ parse_dates,
68
+ eol_char
69
+ )
70
+ )
71
+ end
72
+
73
+ # @private
74
+ def self._scan_parquet(
75
+ file,
76
+ n_rows: nil,
77
+ cache: true,
78
+ parallel: "auto",
79
+ rechunk: true,
80
+ row_count_name: nil,
81
+ row_count_offset: 0,
82
+ storage_options: nil,
83
+ low_memory: false
84
+ )
85
+ _from_rbldf(
86
+ RbLazyFrame.new_from_parquet(
87
+ file,
88
+ n_rows,
89
+ cache,
90
+ parallel,
91
+ rechunk,
92
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
93
+ low_memory
94
+ )
95
+ )
96
+ end
97
+
98
+ # @private
99
+ def self._scan_ipc(
100
+ file,
101
+ n_rows: nil,
102
+ cache: true,
103
+ rechunk: true,
104
+ row_count_name: nil,
105
+ row_count_offset: 0,
106
+ storage_options: nil,
107
+ memory_map: true
108
+ )
109
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
110
+ file = Utils.format_path(file)
111
+ end
112
+
113
+ _from_rbldf(
114
+ RbLazyFrame.new_from_ipc(
115
+ file,
116
+ n_rows,
117
+ cache,
118
+ rechunk,
119
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
120
+ memory_map
121
+ )
122
+ )
123
+ end
124
+
125
+ # @private
126
+ def self._scan_ndjson(
127
+ file,
128
+ infer_schema_length: nil,
129
+ batch_size: nil,
130
+ n_rows: nil,
131
+ low_memory: false,
132
+ rechunk: true,
133
+ row_count_name: nil,
134
+ row_count_offset: 0
135
+ )
136
+ _from_rbldf(
137
+ RbLazyFrame.new_from_ndjson(
138
+ file,
139
+ infer_schema_length,
140
+ batch_size,
141
+ n_rows,
142
+ low_memory,
143
+ rechunk,
144
+ Utils._prepare_row_count_args(row_count_name, row_count_offset)
145
+ )
146
+ )
147
+ end
148
+
149
+ # def self.from_json
150
+ # end
151
+
152
+ # def self.read_json
153
+ # end
154
+
155
+ # def columns
156
+ # end
157
+
158
+ # def dtypes
159
+ # end
160
+
161
+ # def schema
162
+ # end
163
+
164
+ # def width
165
+ # end
166
+
167
+ # def include?(key)
168
+ # end
169
+
170
+ # clone handled by initialize_copy
171
+
172
+ # def [](item)
173
+ # end
174
+
175
+ # def to_s
176
+ # end
177
+ # alias_method :inspect, :to_s
178
+
179
+ # def write_json
180
+ # end
181
+
182
+ # def pipe
183
+ # end
184
+
185
+ # def describe_plan
186
+ # end
187
+
188
+ # def describe_optimized_plan
189
+ # end
190
+
191
+ # def show_graph
192
+ # end
193
+
194
+ # def sort
195
+ # end
196
+
197
+ # def profile
198
+ # end
199
+
200
+ #
11
201
  def collect(
12
202
  type_coercion: true,
13
203
  predicate_pushdown: true,
@@ -42,8 +232,23 @@ module Polars
42
232
  Utils.wrap_df(ldf.collect)
43
233
  end
44
234
 
235
+ # def fetch
236
+ # end
237
+
238
+ # Return this LazyFrame itself; the receiver is already lazy, so no conversion is performed.
239
+ def lazy
240
+ self
241
+ end
242
+
243
+ # def cache
244
+ # end
245
+
246
+ # def cleared
247
+ # end
248
+
249
+ #
45
250
  def filter(predicate)
46
- self.class._from_rbldf(
251
+ _from_rbldf(
47
252
  _ldf.filter(
48
253
  Utils.expr_to_lit_or_expr(predicate, str_to_lit: false)._rbexpr
49
254
  )
@@ -52,7 +257,7 @@ module Polars
52
257
 
53
258
  def select(exprs)
54
259
  exprs = Utils.selection_to_rbexpr_list(exprs)
55
- self.class._from_rbldf(_ldf.select(exprs))
260
+ _from_rbldf(_ldf.select(exprs))
56
261
  end
57
262
 
58
263
  def groupby(by, maintain_order: false)
@@ -61,6 +266,16 @@ module Polars
61
266
  LazyGroupBy.new(lgb, self.class)
62
267
  end
63
268
 
269
+ # def groupby_rolling
270
+ # end
271
+
272
+ # def groupby_dynamic
273
+ # end
274
+
275
+ # def join_asof
276
+ # end
277
+
278
+ #
64
279
  def join(
65
280
  other,
66
281
  left_on: nil,
@@ -76,7 +291,7 @@ module Polars
76
291
  end
77
292
 
78
293
  if how == "cross"
79
- return self.class._from_rbldf(
294
+ return _from_rbldf(
80
295
  _ldf.join(
81
296
  other._ldf, [], [], allow_parallel, force_parallel, how, suffix
82
297
  )
@@ -94,7 +309,7 @@ module Polars
94
309
  raise ArgumentError, "must specify `on` OR `left_on` and `right_on`"
95
310
  end
96
311
 
97
- self.class._from_rbldf(
312
+ _from_rbldf(
98
313
  self._ldf.join(
99
314
  other._ldf,
100
315
  rbexprs_left,
@@ -123,17 +338,134 @@ module Polars
123
338
  when Expr
124
339
  rbexprs << e._rbexpr
125
340
  when Series
126
- rbexprs = Internal.lit(e)._rbexpr
341
+ rbexprs = Utils.lit(e)._rbexpr
127
342
  else
128
343
  raise ArgumentError, "Expected an expression, got #{e}"
129
344
  end
130
345
  end
131
346
 
132
- self.class._from_rbldf(_ldf.with_columns(rbexprs))
347
+ _from_rbldf(_ldf.with_columns(rbexprs))
133
348
  end
134
349
 
350
+ # def with_context
351
+ # end
352
+
353
+ # Apply a single column expression; shorthand for `with_columns([column])`.
135
354
  def with_column(column)
136
355
  with_columns([column])
137
356
  end
357
+
358
+ # def drop
359
+ # end
360
+
361
+ # Rename via a Hash: `mapping` keys are the existing names, `mapping` values the new names; returns a new LazyFrame.
362
+ def rename(mapping)
363
+ existing = mapping.keys
364
+ _new = mapping.values
365
+ _from_rbldf(_ldf.rename(existing, _new))
366
+ end
367
+
368
+ # def reverse
369
+ # end
370
+
371
+ # def shift
372
+ # end
373
+
374
+ # def shift_and_fill
375
+ # end
376
+
377
+ # def slice
378
+ # end
379
+
380
+ # def limit
381
+ # end
382
+
383
+ # def head
384
+ # end
385
+
386
+ # def tail
387
+ # end
388
+
389
+ # def last
390
+ # end
391
+
392
+ # def first
393
+ # end
394
+
395
+ # def with_row_count
396
+ # end
397
+
398
+ # def take_every
399
+ # end
400
+
401
+ # def fill_null
402
+ # end
403
+
404
+ # Fill NaN values with `fill_value`; a non-Expr `fill_value` is first wrapped with `Utils.lit`. Returns a new LazyFrame.
405
+ def fill_nan(fill_value)
406
+ if !fill_value.is_a?(Expr)
407
+ fill_value = Utils.lit(fill_value)
408
+ end
409
+ _from_rbldf(_ldf.fill_nan(fill_value._rbexpr))
410
+ end
411
+
412
+ # def std
413
+ # end
414
+
415
+ # def var
416
+ # end
417
+
418
+ # def max
419
+ # end
420
+
421
+ # def min
422
+ # end
423
+
424
+ # def sum
425
+ # end
426
+
427
+ # def mean
428
+ # end
429
+
430
+ # def median
431
+ # end
432
+
433
+ # def quantile
434
+ # end
435
+
436
+ # Explode the given `columns` (converted with `Utils.selection_to_rbexpr_list`); returns a new LazyFrame.
437
+ def explode(columns)
438
+ columns = Utils.selection_to_rbexpr_list(columns)
439
+ _from_rbldf(_ldf.explode(columns))
440
+ end
441
+
442
+ # def unique
443
+ # end
444
+
445
+ # def drop_nulls
446
+ # end
447
+
448
+ # def melt
449
+ # end
450
+
451
+ # def map
452
+ # end
453
+
454
+ # def interpolate
455
+ # end
456
+
457
+ # def unnest
458
+ # end
459
+
460
+ private
461
+
462
+ def initialize_copy(other)
463
+ super
464
+ self._ldf = _ldf._clone
465
+ end
466
+
467
+ def _from_rbldf(rb_ldf)
468
+ self.class._from_rbldf(rb_ldf)
469
+ end
138
470
  end
139
471
  end
@@ -1,10 +1,45 @@
1
1
  module Polars
2
2
  module LazyFunctions
3
3
  def col(name)
4
- name = name.to_s if name.is_a?(Symbol)
5
- Utils.wrap_expr(RbExpr.col(name))
4
+ if name.is_a?(Series)
5
+ name = name.to_a
6
+ end
7
+
8
+ if name.is_a?(Array)
9
+ if name.length == 0 || name[0].is_a?(String) || name[0].is_a?(Symbol)
10
+ name = name.map { |v| v.is_a?(Symbol) ? v.to_s : v }
11
+ Utils.wrap_expr(RbExpr.cols(name))
12
+ elsif Utils.is_polars_dtype(name[0])
13
+ raise Todo
14
+ # Utils.wrap_expr(_dtype_cols(name))
15
+ else
16
+ raise ArgumentError, "Expected list values to be all `str` or all `DataType`"
17
+ end
18
+ else
19
+ name = name.to_s if name.is_a?(Symbol)
20
+ Utils.wrap_expr(RbExpr.col(name))
21
+ end
6
22
  end
7
23
 
24
+ def element
25
+ col("")
26
+ end
27
+
28
+ def count(column = nil)
29
+ if column.nil?
30
+ return Utils.wrap_expr(RbExpr.count)
31
+ end
32
+
33
+ if column.is_a?(Series)
34
+ column.len
35
+ else
36
+ col(column).count
37
+ end
38
+ end
39
+
40
+ # def to_list
41
+ # end
42
+
8
43
  def std(column, ddof: 1)
9
44
  if column.is_a?(Series)
10
45
  column.std(ddof: ddof)
@@ -55,7 +90,7 @@ module Polars
55
90
  # TODO
56
91
  Utils.wrap_expr(_sum_exprs(exprs))
57
92
  else
58
- raise "todo"
93
+ raise Todo
59
94
  end
60
95
  end
61
96
 
@@ -79,10 +114,97 @@ module Polars
79
114
  end
80
115
  end
81
116
 
117
+ # def n_unique
118
+ # end
119
+
120
+ def first(column = nil)
121
+ if column.nil?
122
+ return Utils.wrap_expr(RbExpr.first)
123
+ end
124
+
125
+ if column.is_a?(Series)
126
+ if column.len > 0
127
+ column[0]
128
+ else
129
+ raise IndexError, "The series is empty, so no first value can be returned."
130
+ end
131
+ else
132
+ col(column).first
133
+ end
134
+ end
135
+
136
+ # def last
137
+ # end
138
+
139
+ # def head
140
+ # end
141
+
142
+ # def tail
143
+ # end
144
+
82
145
  def lit(value)
83
146
  Utils.wrap_expr(RbExpr.lit(value))
84
147
  end
85
148
 
149
+ # def cumsum
150
+ # end
151
+
152
+ # def spearman_rank_corr
153
+ # end
154
+
155
+ # def pearson_corr
156
+ # end
157
+
158
+ # def cov
159
+ # end
160
+
161
+ # def map
162
+ # end
163
+
164
+ # def apply
165
+ # end
166
+
167
+ def fold(acc, f, exprs)
168
+ acc = Utils.expr_to_lit_or_expr(acc, str_to_lit: true)
169
+ if exprs.is_a?(Expr)
170
+ exprs = [exprs]
171
+ end
172
+
173
+ exprs = Utils.selection_to_rbexpr_list(exprs)
174
+ Utils.wrap_expr(RbExpr.fold(acc._rbexpr, f, exprs))
175
+ end
176
+
177
+ # def reduce
178
+ # end
179
+
180
+ # def cumfold
181
+ # end
182
+
183
+ # def cumreduce
184
+ # end
185
+
186
+ # def any
187
+ # end
188
+
189
+ # def exclude
190
+ # end
191
+
192
+ def all(name = nil)
193
+ if name.nil?
194
+ col("*")
195
+ elsif name.is_a?(String) || name.is_a?(Symbol)
196
+ col(name).all
197
+ else
198
+ raise Todo
199
+ end
200
+ end
201
+
202
+ # def groups
203
+ # end
204
+
205
+ # def quantile
206
+ # end
207
+
86
208
  def arange(low, high, step: 1, eager: false, dtype: nil)
87
209
  low = Utils.expr_to_lit_or_expr(low, str_to_lit: false)
88
210
  high = Utils.expr_to_lit_or_expr(high, str_to_lit: false)
@@ -102,16 +224,41 @@ module Polars
102
224
  end
103
225
  end
104
226
 
105
- def all(name = nil)
106
- if name.nil?
107
- col("*")
108
- elsif name.is_a?(String) || name.is_a?(Symbol)
109
- col(name).all
110
- else
111
- raise "todo"
112
- end
227
+ # def argsort_by
228
+ # end
229
+
230
+ # def duration
231
+ # end
232
+
233
+ # def format
234
+ # end
235
+
236
+ def concat_list(exprs)
237
+ exprs = Utils.selection_to_rbexpr_list(exprs)
238
+ Utils.wrap_expr(RbExpr.concat_lst(exprs))
113
239
  end
114
240
 
241
+ # def collect_all
242
+ # end
243
+
244
+ # def select
245
+ # end
246
+
247
+ # def struct
248
+ # end
249
+
250
+ # def repeat
251
+ # end
252
+
253
+ # def arg_where
254
+ # end
255
+
256
+ # def coalesce
257
+ # end
258
+
259
+ # def from_epoch
260
+ # end
261
+
115
262
  def when(expr)
116
263
  expr = Utils.expr_to_lit_or_expr(expr)
117
264
  pw = RbExpr.when(expr._rbexpr)
@@ -0,0 +1,108 @@
1
+ module Polars
2
+ class ListExpr
3
+ attr_accessor :_rbexpr
4
+
5
+ def initialize(expr)
6
+ self._rbexpr = expr._rbexpr
7
+ end
8
+
9
+ def lengths
10
+ Utils.wrap_expr(_rbexpr.arr_lengths)
11
+ end
12
+
13
+ def sum
14
+ Utils.wrap_expr(_rbexpr.lst_sum)
15
+ end
16
+
17
+ def max
18
+ Utils.wrap_expr(_rbexpr.lst_max)
19
+ end
20
+
21
+ def min
22
+ Utils.wrap_expr(_rbexpr.lst_min)
23
+ end
24
+
25
+ def mean
26
+ Utils.wrap_expr(_rbexpr.lst_mean)
27
+ end
28
+
29
+ def sort(reverse: false)
30
+ Utils.wrap_expr(_rbexpr.lst_sort(reverse))
31
+ end
32
+
33
+ def reverse
34
+ Utils.wrap_expr(_rbexpr.lst_reverse)
35
+ end
36
+
37
+ def unique
38
+ Utils.wrap_expr(_rbexpr.lst_unique)
39
+ end
40
+
41
+ # def concat
42
+ # end
43
+
44
+ def get(index)
45
+ index = Utils.expr_to_lit_or_expr(index, str_to_lit: false)._rbexpr
46
+ Utils.wrap_expr(_rbexpr.lst_get(index))
47
+ end
48
+
49
+ def [](item)
50
+ get(item)
51
+ end
52
+
53
+ def first
54
+ get(0)
55
+ end
56
+
57
+ def last
58
+ get(-1)
59
+ end
60
+
61
+ def contains(item)
62
+ Utils.wrap_expr(_rbexpr.arr_contains(Utils.expr_to_lit_or_expr(item)._rbexpr))
63
+ end
64
+
65
+ def join(separator)
66
+ Utils.wrap_expr(_rbexpr.lst_join(separator))
67
+ end
68
+
69
+ def arg_min
70
+ Utils.wrap_expr(_rbexpr.lst_arg_min)
71
+ end
72
+
73
+ def arg_max
74
+ Utils.wrap_expr(_rbexpr.lst_arg_max)
75
+ end
76
+
77
+ def diff(n: 1, null_behavior: "ignore")
78
+ Utils.wrap_expr(_rbexpr.lst_diff(n, null_behavior))
79
+ end
80
+
81
+ def shift(periods = 1)
82
+ Utils.wrap_expr(_rbexpr.lst_shift(periods))
83
+ end
84
+
85
+ def slice(offset, length = nil)
86
+ offset = Utils.expr_to_lit_or_expr(offset, str_to_lit: false)._rbexpr
87
+ length = Utils.expr_to_lit_or_expr(length, str_to_lit: false)._rbexpr
88
+ Utils.wrap_expr(_rbexpr.lst_slice(offset, length))
89
+ end
90
+
91
+ def head(n = 5)
92
+ slice(0, n)
93
+ end
94
+
95
+ def tail(n = 5)
96
+ offset = -Utils.expr_to_lit_or_expr(n, str_to_lit: false)
97
+ slice(offset, n)
98
+ end
99
+
100
+ # def to_struct(n_field_strategy: "first_non_null", name_generator: nil)
101
+ # Utils.wrap_expr(_rbexpr.lst_to_struct(n_field_strategy, name_generator))
102
+ # end
103
+
104
+ def eval(expr, parallel: false)
105
+ Utils.wrap_expr(_rbexpr.lst_eval(expr._rbexpr, parallel))
106
+ end
107
+ end
108
+ end