polars-df 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/polars/expr.rb CHANGED
@@ -1,7 +1,10 @@
1
1
  module Polars
2
+ # Expressions that can be used in various contexts.
2
3
  class Expr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def self._from_rbexpr(rbexpr)
6
9
  expr = Expr.allocate
7
10
  expr._rbexpr = rbexpr
@@ -80,6 +83,7 @@ module Polars
80
83
  # def to_physical
81
84
  # end
82
85
 
86
+ #
83
87
  def any
84
88
  wrap_expr(_rbexpr.any)
85
89
  end
@@ -104,7 +108,9 @@ module Polars
104
108
  wrap_expr(_rbexpr._alias(name))
105
109
  end
106
110
 
107
- # TODO support symbols
111
+ # TODO support symbols for exclude
112
+
113
+ #
108
114
  def exclude(columns)
109
115
  if columns.is_a?(String)
110
116
  columns = [columns]
@@ -140,6 +146,7 @@ module Polars
140
146
  # def map_alias
141
147
  # end
142
148
 
149
+ #
143
150
  def is_not
144
151
  wrap_expr(_rbexpr.is_not)
145
152
  end
@@ -293,7 +300,8 @@ module Polars
293
300
  # def take
294
301
  # end
295
302
 
296
- def shift(periods)
303
+ #
304
+ def shift(periods = 1)
297
305
  wrap_expr(_rbexpr.shift(periods))
298
306
  end
299
307
 
@@ -439,6 +447,7 @@ module Polars
439
447
  # def apply
440
448
  # end
441
449
 
450
+ #
442
451
  def flatten
443
452
  wrap_expr(_rbexpr.explode)
444
453
  end
@@ -471,6 +480,7 @@ module Polars
471
480
  # def is_in
472
481
  # end
473
482
 
483
+ #
474
484
  def repeat_by(by)
475
485
  by = Utils.expr_to_lit_or_expr(by, false)
476
486
  wrap_expr(_rbexpr.repeat_by(by._rbexpr))
@@ -482,6 +492,7 @@ module Polars
482
492
  # def _hash
483
493
  # end
484
494
 
495
+ #
485
496
  def reinterpret(signed: false)
486
497
  wrap_expr(_rbexpr.reinterpret(signed))
487
498
  end
@@ -489,6 +500,7 @@ module Polars
489
500
  # def _inspect
490
501
  # end
491
502
 
503
+ #
492
504
  def interpolate
493
505
  wrap_expr(_rbexpr.interpolate)
494
506
  end
@@ -520,6 +532,7 @@ module Polars
520
532
  # def rolling_apply
521
533
  # end
522
534
 
535
+ #
523
536
  def rolling_skew(window_size, bias: true)
524
537
  wrap_expr(_rbexpr.rolling_skew(window_size, bias))
525
538
  end
@@ -650,6 +663,7 @@ module Polars
650
663
  # def extend_constant
651
664
  # end
652
665
 
666
+ #
653
667
  def value_counts(multithreaded: false, sort: false)
654
668
  wrap_expr(_rbexpr.value_counts(multithreaded, sort))
655
669
  end
@@ -672,6 +686,7 @@ module Polars
672
686
  # def set_sorted
673
687
  # end
674
688
 
689
+ #
675
690
  def list
676
691
  wrap_expr(_rbexpr.list)
677
692
  end
data/lib/polars/io.rb CHANGED
@@ -1,8 +1,245 @@
1
1
  module Polars
2
2
  module IO
3
- def read_csv(file, has_header: true)
3
+ def read_csv(
4
+ file,
5
+ has_header: true,
6
+ columns: nil,
7
+ new_columns: nil,
8
+ sep: ",",
9
+ comment_char: nil,
10
+ quote_char: '"',
11
+ skip_rows: 0,
12
+ dtypes: nil,
13
+ null_values: nil,
14
+ ignore_errors: false,
15
+ parse_dates: false,
16
+ n_threads: nil,
17
+ infer_schema_length: 100,
18
+ batch_size: 8192,
19
+ n_rows: nil,
20
+ encoding: "utf8",
21
+ low_memory: false,
22
+ rechunk: true,
23
+ storage_options: nil,
24
+ skip_rows_after_header: 0,
25
+ row_count_name: nil,
26
+ row_count_offset: 0,
27
+ sample_size: 1024,
28
+ eol_char: "\n"
29
+ )
30
+ _check_arg_is_1byte("sep", sep, false)
31
+ _check_arg_is_1byte("comment_char", comment_char, false)
32
+ _check_arg_is_1byte("quote_char", quote_char, true)
33
+ _check_arg_is_1byte("eol_char", eol_char, false)
34
+
35
+ projection, columns = Utils.handle_projection_columns(columns)
36
+
37
+ storage_options ||= {}
38
+
39
+ if columns && !has_header
40
+ columns.each do |column|
41
+ if !column.start_with?("column_")
42
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
43
+ end
44
+ end
45
+ end
46
+
47
+ if projection || new_columns
48
+ raise Todo
49
+ end
50
+
51
+ df = nil
4
52
  _prepare_file_arg(file) do |data|
5
- DataFrame._read_csv(data, has_header: has_header)
53
+ df = DataFrame._read_csv(
54
+ data,
55
+ has_header: has_header,
56
+ columns: columns || projection,
57
+ sep: sep,
58
+ comment_char: comment_char,
59
+ quote_char: quote_char,
60
+ skip_rows: skip_rows,
61
+ dtypes: dtypes,
62
+ null_values: null_values,
63
+ ignore_errors: ignore_errors,
64
+ parse_dates: parse_dates,
65
+ n_threads: n_threads,
66
+ infer_schema_length: infer_schema_length,
67
+ batch_size: batch_size,
68
+ n_rows: n_rows,
69
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
70
+ low_memory: low_memory,
71
+ rechunk: rechunk,
72
+ skip_rows_after_header: skip_rows_after_header,
73
+ row_count_name: row_count_name,
74
+ row_count_offset: row_count_offset,
75
+ sample_size: sample_size,
76
+ eol_char: eol_char
77
+ )
78
+ end
79
+
80
+ if new_columns
81
+ Utils._update_columns(df, new_columns)
82
+ else
83
+ df
84
+ end
85
+ end
86
+
87
+ def scan_csv(
88
+ file,
89
+ has_header: true,
90
+ sep: ",",
91
+ comment_char: nil,
92
+ quote_char: '"',
93
+ skip_rows: 0,
94
+ dtypes: nil,
95
+ null_values: nil,
96
+ ignore_errors: false,
97
+ cache: true,
98
+ with_column_names: nil,
99
+ infer_schema_length: 100,
100
+ n_rows: nil,
101
+ encoding: "utf8",
102
+ low_memory: false,
103
+ rechunk: true,
104
+ skip_rows_after_header: 0,
105
+ row_count_name: nil,
106
+ row_count_offset: 0,
107
+ parse_dates: false,
108
+ eol_char: "\n"
109
+ )
110
+ _check_arg_is_1byte("sep", sep, false)
111
+ _check_arg_is_1byte("comment_char", comment_char, false)
112
+ _check_arg_is_1byte("quote_char", quote_char, true)
113
+
114
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
115
+ file = Utils.format_path(file)
116
+ end
117
+
118
+ LazyFrame._scan_csv(
119
+ file,
120
+ has_header: has_header,
121
+ sep: sep,
122
+ comment_char: comment_char,
123
+ quote_char: quote_char,
124
+ skip_rows: skip_rows,
125
+ dtypes: dtypes,
126
+ null_values: null_values,
127
+ ignore_errors: ignore_errors,
128
+ cache: cache,
129
+ with_column_names: with_column_names,
130
+ infer_schema_length: infer_schema_length,
131
+ n_rows: n_rows,
132
+ low_memory: low_memory,
133
+ rechunk: rechunk,
134
+ skip_rows_after_header: skip_rows_after_header,
135
+ encoding: encoding,
136
+ row_count_name: row_count_name,
137
+ row_count_offset: row_count_offset,
138
+ parse_dates: parse_dates,
139
+ eol_char: eol_char,
140
+ )
141
+ end
142
+
143
+ def scan_ipc(
144
+ file,
145
+ n_rows: nil,
146
+ cache: true,
147
+ rechunk: true,
148
+ row_count_name: nil,
149
+ row_count_offset: 0,
150
+ storage_options: nil,
151
+ memory_map: true
152
+ )
153
+ LazyFrame._scan_ipc(
154
+ file,
155
+ n_rows: n_rows,
156
+ cache: cache,
157
+ rechunk: rechunk,
158
+ row_count_name: row_count_name,
159
+ row_count_offset: row_count_offset,
160
+ storage_options: storage_options,
161
+ memory_map: memory_map
162
+ )
163
+ end
164
+
165
+ def scan_parquet(
166
+ file,
167
+ n_rows: nil,
168
+ cache: true,
169
+ parallel: "auto",
170
+ rechunk: true,
171
+ row_count_name: nil,
172
+ row_count_offset: 0,
173
+ storage_options: nil,
174
+ low_memory: false
175
+ )
176
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
177
+ file = Utils.format_path(file)
178
+ end
179
+
180
+ LazyFrame._scan_parquet(
181
+ file,
182
+ n_rows: n_rows,
183
+ cache: cache,
184
+ parallel: parallel,
185
+ rechunk: rechunk,
186
+ row_count_name: row_count_name,
187
+ row_count_offset: row_count_offset,
188
+ storage_options: storage_options,
189
+ low_memory: low_memory
190
+ )
191
+ end
192
+
193
+ def scan_ndjson(
194
+ file,
195
+ infer_schema_length: 100,
196
+ batch_size: 1024,
197
+ n_rows: nil,
198
+ low_memory: false,
199
+ rechunk: true,
200
+ row_count_name: nil,
201
+ row_count_offset: 0
202
+ )
203
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
204
+ file = Utils.format_path(file)
205
+ end
206
+
207
+ LazyFrame._scan_ndjson(
208
+ file,
209
+ infer_schema_length: infer_schema_length,
210
+ batch_size: batch_size,
211
+ n_rows: n_rows,
212
+ low_memory: low_memory,
213
+ rechunk: rechunk,
214
+ row_count_name: row_count_name,
215
+ row_count_offset: row_count_offset,
216
+ )
217
+ end
218
+
219
+ # def read_avro
220
+ # end
221
+
222
+ def read_ipc(
223
+ file,
224
+ columns: nil,
225
+ n_rows: nil,
226
+ memory_map: true,
227
+ storage_options: nil,
228
+ row_count_name: nil,
229
+ row_count_offset: 0,
230
+ rechunk: true
231
+ )
232
+ storage_options ||= {}
233
+ _prepare_file_arg(file, **storage_options) do |data|
234
+ DataFrame._read_ipc(
235
+ data,
236
+ columns: columns,
237
+ n_rows: n_rows,
238
+ row_count_name: row_count_name,
239
+ row_count_offset: row_count_offset,
240
+ rechunk: rechunk,
241
+ memory_map: memory_map
242
+ )
6
243
  end
7
244
  end
8
245
 
@@ -20,6 +257,96 @@ module Polars
20
257
  DataFrame._read_ndjson(file)
21
258
  end
22
259
 
260
+ # def read_sql
261
+ # end
262
+
263
+ # def read_excel
264
+ # end
265
+
266
+ def read_csv_batched(
267
+ file,
268
+ has_header: true,
269
+ columns: nil,
270
+ new_columns: nil,
271
+ sep: ",",
272
+ comment_char: nil,
273
+ quote_char: '"',
274
+ skip_rows: 0,
275
+ dtypes: nil,
276
+ null_values: nil,
277
+ ignore_errors: false,
278
+ parse_dates: false,
279
+ n_threads: nil,
280
+ infer_schema_length: 100,
281
+ batch_size: 50_000,
282
+ n_rows: nil,
283
+ encoding: "utf8",
284
+ low_memory: false,
285
+ rechunk: true,
286
+ skip_rows_after_header: 0,
287
+ row_count_name: nil,
288
+ row_count_offset: 0,
289
+ sample_size: 1024,
290
+ eol_char: "\n"
291
+ )
292
+ projection, columns = Utils.handle_projection_columns(columns)
293
+
294
+ if columns && !has_header
295
+ columns.each do |column|
296
+ if !column.start_with?("column_")
297
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
298
+ end
299
+ end
300
+ end
301
+
302
+ if projection || new_columns
303
+ raise Todo
304
+ end
305
+
306
+ BatchedCsvReader.new(
307
+ file,
308
+ has_header: has_header,
309
+ columns: columns || projection,
310
+ sep: sep,
311
+ comment_char: comment_char,
312
+ quote_char: quote_char,
313
+ skip_rows: skip_rows,
314
+ dtypes: dtypes,
315
+ null_values: null_values,
316
+ ignore_errors: ignore_errors,
317
+ parse_dates: parse_dates,
318
+ n_threads: n_threads,
319
+ infer_schema_length: infer_schema_length,
320
+ batch_size: batch_size,
321
+ n_rows: n_rows,
322
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
323
+ low_memory: low_memory,
324
+ rechunk: rechunk,
325
+ skip_rows_after_header: skip_rows_after_header,
326
+ row_count_name: row_count_name,
327
+ row_count_offset: row_count_offset,
328
+ sample_size: sample_size,
329
+ eol_char: eol_char,
330
+ new_columns: new_columns
331
+ )
332
+ end
333
+
334
+ def read_ipc_schema(file)
335
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
336
+ file = Utils.format_path(file)
337
+ end
338
+
339
+ _ipc_schema(file)
340
+ end
341
+
342
+ def read_parquet_schema(file)
343
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
344
+ file = Utils.format_path(file)
345
+ end
346
+
347
+ _parquet_schema(file)
348
+ end
349
+
23
350
  private
24
351
 
25
352
  def _prepare_file_arg(file)
@@ -35,5 +362,18 @@ module Polars
35
362
 
36
363
  yield file
37
364
  end
365
+
366
+ def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
367
+ if arg.is_a?(String)
368
+ arg_byte_length = arg.bytesize
369
+ if can_be_empty
370
+ if arg_byte_length > 1
371
+ raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
372
+ end
373
+ elsif arg_byte_length != 1
374
+ raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
375
+ end
376
+ end
377
+ end
38
378
  end
39
379
  end
@@ -1,13 +1,157 @@
1
1
  module Polars
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
2
3
  class LazyFrame
4
+ # @private
3
5
  attr_accessor :_ldf
4
6
 
7
+ # @private
5
8
  def self._from_rbldf(rb_ldf)
6
9
  ldf = LazyFrame.allocate
7
10
  ldf._ldf = rb_ldf
8
11
  ldf
9
12
  end
10
13
 
14
+ # @private
15
+ def self._scan_csv(
16
+ file,
17
+ has_header: true,
18
+ sep: ",",
19
+ comment_char: nil,
20
+ quote_char: '"',
21
+ skip_rows: 0,
22
+ dtypes: nil,
23
+ null_values: nil,
24
+ ignore_errors: false,
25
+ cache: true,
26
+ with_column_names: nil,
27
+ infer_schema_length: 100,
28
+ n_rows: nil,
29
+ encoding: "utf8",
30
+ low_memory: false,
31
+ rechunk: true,
32
+ skip_rows_after_header: 0,
33
+ row_count_name: nil,
34
+ row_count_offset: 0,
35
+ parse_dates: false,
36
+ eol_char: "\n"
37
+ )
38
+ dtype_list = nil
39
+ if !dtypes.nil?
40
+ dtype_list = []
41
+ dtypes.each do |k, v|
42
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
+ end
44
+ end
45
+ processed_null_values = Utils._process_null_values(null_values)
46
+
47
+ _from_rbldf(
48
+ RbLazyFrame.new_from_csv(
49
+ file,
50
+ sep,
51
+ has_header,
52
+ ignore_errors,
53
+ skip_rows,
54
+ n_rows,
55
+ cache,
56
+ dtype_list,
57
+ low_memory,
58
+ comment_char,
59
+ quote_char,
60
+ processed_null_values,
61
+ infer_schema_length,
62
+ with_column_names,
63
+ rechunk,
64
+ skip_rows_after_header,
65
+ encoding,
66
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
67
+ parse_dates,
68
+ eol_char
69
+ )
70
+ )
71
+ end
72
+
73
+ # @private
74
+ def self._scan_parquet(
75
+ file,
76
+ n_rows: nil,
77
+ cache: true,
78
+ parallel: "auto",
79
+ rechunk: true,
80
+ row_count_name: nil,
81
+ row_count_offset: 0,
82
+ storage_options: nil,
83
+ low_memory: false
84
+ )
85
+ _from_rbldf(
86
+ RbLazyFrame.new_from_parquet(
87
+ file,
88
+ n_rows,
89
+ cache,
90
+ parallel,
91
+ rechunk,
92
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
93
+ low_memory
94
+ )
95
+ )
96
+ end
97
+
98
+ # @private
99
+ def self._scan_ipc(
100
+ file,
101
+ n_rows: nil,
102
+ cache: true,
103
+ rechunk: true,
104
+ row_count_name: nil,
105
+ row_count_offset: 0,
106
+ storage_options: nil,
107
+ memory_map: true
108
+ )
109
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
110
+ file = Utils.format_path(file)
111
+ end
112
+
113
+ _from_rbldf(
114
+ RbLazyFrame.new_from_ipc(
115
+ file,
116
+ n_rows,
117
+ cache,
118
+ rechunk,
119
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
120
+ memory_map
121
+ )
122
+ )
123
+ end
124
+
125
+ # @private
126
+ def self._scan_ndjson(
127
+ file,
128
+ infer_schema_length: nil,
129
+ batch_size: nil,
130
+ n_rows: nil,
131
+ low_memory: false,
132
+ rechunk: true,
133
+ row_count_name: nil,
134
+ row_count_offset: 0
135
+ )
136
+ _from_rbldf(
137
+ RbLazyFrame.new_from_ndjson(
138
+ file,
139
+ infer_schema_length,
140
+ batch_size,
141
+ n_rows,
142
+ low_memory,
143
+ rechunk,
144
+ Utils._prepare_row_count_args(row_count_name, row_count_offset)
145
+ )
146
+ )
147
+ end
148
+
149
+ # def self.from_json
150
+ # end
151
+
152
+ # def self.read_json
153
+ # end
154
+
11
155
  # def columns
12
156
  # end
13
157
 
@@ -53,6 +197,7 @@ module Polars
53
197
  # def profile
54
198
  # end
55
199
 
200
+ #
56
201
  def collect(
57
202
  type_coercion: true,
58
203
  predicate_pushdown: true,
@@ -90,6 +235,7 @@ module Polars
90
235
  # def fetch
91
236
  # end
92
237
 
238
+ #
93
239
  def lazy
94
240
  self
95
241
  end
@@ -100,6 +246,7 @@ module Polars
100
246
  # def cleared
101
247
  # end
102
248
 
249
+ #
103
250
  def filter(predicate)
104
251
  _from_rbldf(
105
252
  _ldf.filter(
@@ -128,6 +275,7 @@ module Polars
128
275
  # def join_asof
129
276
  # end
130
277
 
278
+ #
131
279
  def join(
132
280
  other,
133
281
  left_on: nil,
@@ -202,6 +350,7 @@ module Polars
202
350
  # def with_context
203
351
  # end
204
352
 
353
+ #
205
354
  def with_column(column)
206
355
  with_columns([column])
207
356
  end
@@ -209,6 +358,7 @@ module Polars
209
358
  # def drop
210
359
  # end
211
360
 
361
+ #
212
362
  def rename(mapping)
213
363
  existing = mapping.keys
214
364
  _new = mapping.values
@@ -251,6 +401,7 @@ module Polars
251
401
  # def fill_null
252
402
  # end
253
403
 
404
+ #
254
405
  def fill_nan(fill_value)
255
406
  if !fill_value.is_a?(Expr)
256
407
  fill_value = Utils.lit(fill_value)
@@ -282,8 +433,11 @@ module Polars
282
433
  # def quantile
283
434
  # end
284
435
 
285
- # def explode
286
- # end
436
+ #
437
+ def explode(columns)
438
+ columns = Utils.selection_to_rbexpr_list(columns)
439
+ _from_rbldf(_ldf.explode(columns))
440
+ end
287
441
 
288
442
  # def unique
289
443
  # end