polars-df 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/polars/expr.rb CHANGED
@@ -1,7 +1,10 @@
1
1
  module Polars
2
+ # Expressions that can be used in various contexts.
2
3
  class Expr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def self._from_rbexpr(rbexpr)
6
9
  expr = Expr.allocate
7
10
  expr._rbexpr = rbexpr
@@ -80,6 +83,7 @@ module Polars
80
83
  # def to_physical
81
84
  # end
82
85
 
86
+ #
83
87
  def any
84
88
  wrap_expr(_rbexpr.any)
85
89
  end
@@ -104,7 +108,9 @@ module Polars
104
108
  wrap_expr(_rbexpr._alias(name))
105
109
  end
106
110
 
107
- # TODO support symbols
111
+ # TODO support symbols for exclude
112
+
113
+ #
108
114
  def exclude(columns)
109
115
  if columns.is_a?(String)
110
116
  columns = [columns]
@@ -140,6 +146,7 @@ module Polars
140
146
  # def map_alias
141
147
  # end
142
148
 
149
+ #
143
150
  def is_not
144
151
  wrap_expr(_rbexpr.is_not)
145
152
  end
@@ -293,7 +300,8 @@ module Polars
293
300
  # def take
294
301
  # end
295
302
 
296
- def shift(periods)
303
+ #
304
+ def shift(periods = 1)
297
305
  wrap_expr(_rbexpr.shift(periods))
298
306
  end
299
307
 
@@ -439,6 +447,7 @@ module Polars
439
447
  # def apply
440
448
  # end
441
449
 
450
+ #
442
451
  def flatten
443
452
  wrap_expr(_rbexpr.explode)
444
453
  end
@@ -471,6 +480,7 @@ module Polars
471
480
  # def is_in
472
481
  # end
473
482
 
483
+ #
474
484
  def repeat_by(by)
475
485
  by = Utils.expr_to_lit_or_expr(by, false)
476
486
  wrap_expr(_rbexpr.repeat_by(by._rbexpr))
@@ -482,6 +492,7 @@ module Polars
482
492
  # def _hash
483
493
  # end
484
494
 
495
+ #
485
496
  def reinterpret(signed: false)
486
497
  wrap_expr(_rbexpr.reinterpret(signed))
487
498
  end
@@ -489,6 +500,7 @@ module Polars
489
500
  # def _inspect
490
501
  # end
491
502
 
503
+ #
492
504
  def interpolate
493
505
  wrap_expr(_rbexpr.interpolate)
494
506
  end
@@ -520,6 +532,7 @@ module Polars
520
532
  # def rolling_apply
521
533
  # end
522
534
 
535
+ #
523
536
  def rolling_skew(window_size, bias: true)
524
537
  wrap_expr(_rbexpr.rolling_skew(window_size, bias))
525
538
  end
@@ -650,6 +663,7 @@ module Polars
650
663
  # def extend_constant
651
664
  # end
652
665
 
666
+ #
653
667
  def value_counts(multithreaded: false, sort: false)
654
668
  wrap_expr(_rbexpr.value_counts(multithreaded, sort))
655
669
  end
@@ -672,6 +686,7 @@ module Polars
672
686
  # def set_sorted
673
687
  # end
674
688
 
689
+ #
675
690
  def list
676
691
  wrap_expr(_rbexpr.list)
677
692
  end
data/lib/polars/io.rb CHANGED
@@ -1,8 +1,245 @@
1
1
  module Polars
2
2
  module IO
3
- def read_csv(file, has_header: true)
3
+ def read_csv(
4
+ file,
5
+ has_header: true,
6
+ columns: nil,
7
+ new_columns: nil,
8
+ sep: ",",
9
+ comment_char: nil,
10
+ quote_char: '"',
11
+ skip_rows: 0,
12
+ dtypes: nil,
13
+ null_values: nil,
14
+ ignore_errors: false,
15
+ parse_dates: false,
16
+ n_threads: nil,
17
+ infer_schema_length: 100,
18
+ batch_size: 8192,
19
+ n_rows: nil,
20
+ encoding: "utf8",
21
+ low_memory: false,
22
+ rechunk: true,
23
+ storage_options: nil,
24
+ skip_rows_after_header: 0,
25
+ row_count_name: nil,
26
+ row_count_offset: 0,
27
+ sample_size: 1024,
28
+ eol_char: "\n"
29
+ )
30
+ _check_arg_is_1byte("sep", sep, false)
31
+ _check_arg_is_1byte("comment_char", comment_char, false)
32
+ _check_arg_is_1byte("quote_char", quote_char, true)
33
+ _check_arg_is_1byte("eol_char", eol_char, false)
34
+
35
+ projection, columns = Utils.handle_projection_columns(columns)
36
+
37
+ storage_options ||= {}
38
+
39
+ if columns && !has_header
40
+ columns.each do |column|
41
+ if !column.start_with?("column_")
42
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
43
+ end
44
+ end
45
+ end
46
+
47
+ if projection || new_columns
48
+ raise Todo
49
+ end
50
+
51
+ df = nil
4
52
  _prepare_file_arg(file) do |data|
5
- DataFrame._read_csv(data, has_header: has_header)
53
+ df = DataFrame._read_csv(
54
+ data,
55
+ has_header: has_header,
56
+ columns: columns || projection,
57
+ sep: sep,
58
+ comment_char: comment_char,
59
+ quote_char: quote_char,
60
+ skip_rows: skip_rows,
61
+ dtypes: dtypes,
62
+ null_values: null_values,
63
+ ignore_errors: ignore_errors,
64
+ parse_dates: parse_dates,
65
+ n_threads: n_threads,
66
+ infer_schema_length: infer_schema_length,
67
+ batch_size: batch_size,
68
+ n_rows: n_rows,
69
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
70
+ low_memory: low_memory,
71
+ rechunk: rechunk,
72
+ skip_rows_after_header: skip_rows_after_header,
73
+ row_count_name: row_count_name,
74
+ row_count_offset: row_count_offset,
75
+ sample_size: sample_size,
76
+ eol_char: eol_char
77
+ )
78
+ end
79
+
80
+ if new_columns
81
+ Utils._update_columns(df, new_columns)
82
+ else
83
+ df
84
+ end
85
+ end
86
+
87
+ def scan_csv(
88
+ file,
89
+ has_header: true,
90
+ sep: ",",
91
+ comment_char: nil,
92
+ quote_char: '"',
93
+ skip_rows: 0,
94
+ dtypes: nil,
95
+ null_values: nil,
96
+ ignore_errors: false,
97
+ cache: true,
98
+ with_column_names: nil,
99
+ infer_schema_length: 100,
100
+ n_rows: nil,
101
+ encoding: "utf8",
102
+ low_memory: false,
103
+ rechunk: true,
104
+ skip_rows_after_header: 0,
105
+ row_count_name: nil,
106
+ row_count_offset: 0,
107
+ parse_dates: false,
108
+ eol_char: "\n"
109
+ )
110
+ _check_arg_is_1byte("sep", sep, false)
111
+ _check_arg_is_1byte("comment_char", comment_char, false)
112
+ _check_arg_is_1byte("quote_char", quote_char, true)
113
+
114
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
115
+ file = Utils.format_path(file)
116
+ end
117
+
118
+ LazyFrame._scan_csv(
119
+ file,
120
+ has_header: has_header,
121
+ sep: sep,
122
+ comment_char: comment_char,
123
+ quote_char: quote_char,
124
+ skip_rows: skip_rows,
125
+ dtypes: dtypes,
126
+ null_values: null_values,
127
+ ignore_errors: ignore_errors,
128
+ cache: cache,
129
+ with_column_names: with_column_names,
130
+ infer_schema_length: infer_schema_length,
131
+ n_rows: n_rows,
132
+ low_memory: low_memory,
133
+ rechunk: rechunk,
134
+ skip_rows_after_header: skip_rows_after_header,
135
+ encoding: encoding,
136
+ row_count_name: row_count_name,
137
+ row_count_offset: row_count_offset,
138
+ parse_dates: parse_dates,
139
+ eol_char: eol_char,
140
+ )
141
+ end
142
+
143
+ def scan_ipc(
144
+ file,
145
+ n_rows: nil,
146
+ cache: true,
147
+ rechunk: true,
148
+ row_count_name: nil,
149
+ row_count_offset: 0,
150
+ storage_options: nil,
151
+ memory_map: true
152
+ )
153
+ LazyFrame._scan_ipc(
154
+ file,
155
+ n_rows: n_rows,
156
+ cache: cache,
157
+ rechunk: rechunk,
158
+ row_count_name: row_count_name,
159
+ row_count_offset: row_count_offset,
160
+ storage_options: storage_options,
161
+ memory_map: memory_map
162
+ )
163
+ end
164
+
165
+ def scan_parquet(
166
+ file,
167
+ n_rows: nil,
168
+ cache: true,
169
+ parallel: "auto",
170
+ rechunk: true,
171
+ row_count_name: nil,
172
+ row_count_offset: 0,
173
+ storage_options: nil,
174
+ low_memory: false
175
+ )
176
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
177
+ file = Utils.format_path(file)
178
+ end
179
+
180
+ LazyFrame._scan_parquet(
181
+ file,
182
+ n_rows: n_rows,
183
+ cache: cache,
184
+ parallel: parallel,
185
+ rechunk: rechunk,
186
+ row_count_name: row_count_name,
187
+ row_count_offset: row_count_offset,
188
+ storage_options: storage_options,
189
+ low_memory: low_memory
190
+ )
191
+ end
192
+
193
+ def scan_ndjson(
194
+ file,
195
+ infer_schema_length: 100,
196
+ batch_size: 1024,
197
+ n_rows: nil,
198
+ low_memory: false,
199
+ rechunk: true,
200
+ row_count_name: nil,
201
+ row_count_offset: 0
202
+ )
203
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
204
+ file = Utils.format_path(file)
205
+ end
206
+
207
+ LazyFrame._scan_ndjson(
208
+ file,
209
+ infer_schema_length: infer_schema_length,
210
+ batch_size: batch_size,
211
+ n_rows: n_rows,
212
+ low_memory: low_memory,
213
+ rechunk: rechunk,
214
+ row_count_name: row_count_name,
215
+ row_count_offset: row_count_offset
216
+ )
217
+ end
218
+
219
+ # def read_avro
220
+ # end
221
+
222
+ def read_ipc(
223
+ file,
224
+ columns: nil,
225
+ n_rows: nil,
226
+ memory_map: true,
227
+ storage_options: nil,
228
+ row_count_name: nil,
229
+ row_count_offset: 0,
230
+ rechunk: true
231
+ )
232
+ storage_options ||= {}
233
+ _prepare_file_arg(file, **storage_options) do |data|
234
+ DataFrame._read_ipc(
235
+ data,
236
+ columns: columns,
237
+ n_rows: n_rows,
238
+ row_count_name: row_count_name,
239
+ row_count_offset: row_count_offset,
240
+ rechunk: rechunk,
241
+ memory_map: memory_map
242
+ )
6
243
  end
7
244
  end
8
245
 
@@ -20,6 +257,96 @@ module Polars
20
257
  DataFrame._read_ndjson(file)
21
258
  end
22
259
 
260
+ # def read_sql
261
+ # end
262
+
263
+ # def read_excel
264
+ # end
265
+
266
+ def read_csv_batched(
267
+ file,
268
+ has_header: true,
269
+ columns: nil,
270
+ new_columns: nil,
271
+ sep: ",",
272
+ comment_char: nil,
273
+ quote_char: '"',
274
+ skip_rows: 0,
275
+ dtypes: nil,
276
+ null_values: nil,
277
+ ignore_errors: false,
278
+ parse_dates: false,
279
+ n_threads: nil,
280
+ infer_schema_length: 100,
281
+ batch_size: 50_000,
282
+ n_rows: nil,
283
+ encoding: "utf8",
284
+ low_memory: false,
285
+ rechunk: true,
286
+ skip_rows_after_header: 0,
287
+ row_count_name: nil,
288
+ row_count_offset: 0,
289
+ sample_size: 1024,
290
+ eol_char: "\n"
291
+ )
292
+ projection, columns = Utils.handle_projection_columns(columns)
293
+
294
+ if columns && !has_header
295
+ columns.each do |column|
296
+ if !column.start_with?("column_")
297
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
298
+ end
299
+ end
300
+ end
301
+
302
+ if projection || new_columns
303
+ raise Todo
304
+ end
305
+
306
+ BatchedCsvReader.new(
307
+ file,
308
+ has_header: has_header,
309
+ columns: columns || projection,
310
+ sep: sep,
311
+ comment_char: comment_char,
312
+ quote_char: quote_char,
313
+ skip_rows: skip_rows,
314
+ dtypes: dtypes,
315
+ null_values: null_values,
316
+ ignore_errors: ignore_errors,
317
+ parse_dates: parse_dates,
318
+ n_threads: n_threads,
319
+ infer_schema_length: infer_schema_length,
320
+ batch_size: batch_size,
321
+ n_rows: n_rows,
322
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
323
+ low_memory: low_memory,
324
+ rechunk: rechunk,
325
+ skip_rows_after_header: skip_rows_after_header,
326
+ row_count_name: row_count_name,
327
+ row_count_offset: row_count_offset,
328
+ sample_size: sample_size,
329
+ eol_char: eol_char,
330
+ new_columns: new_columns
331
+ )
332
+ end
333
+
334
+ def read_ipc_schema(file)
335
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
336
+ file = Utils.format_path(file)
337
+ end
338
+
339
+ _ipc_schema(file)
340
+ end
341
+
342
+ def read_parquet_schema(file)
343
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
344
+ file = Utils.format_path(file)
345
+ end
346
+
347
+ _parquet_schema(file)
348
+ end
349
+
23
350
  private
24
351
 
25
352
  def _prepare_file_arg(file)
@@ -35,5 +362,18 @@ module Polars
35
362
 
36
363
  yield file
37
364
  end
365
+
366
+ def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
367
+ if arg.is_a?(String)
368
+ arg_byte_length = arg.bytesize
369
+ if can_be_empty
370
+ if arg_byte_length > 1
371
+ raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
372
+ end
373
+ elsif arg_byte_length != 1
374
+ raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
375
+ end
376
+ end
377
+ end
38
378
  end
39
379
  end
@@ -1,13 +1,157 @@
1
1
  module Polars
2
+ # Representation of a Lazy computation graph/query against a DataFrame.
2
3
  class LazyFrame
4
+ # @private
3
5
  attr_accessor :_ldf
4
6
 
7
+ # @private
5
8
  def self._from_rbldf(rb_ldf)
6
9
  ldf = LazyFrame.allocate
7
10
  ldf._ldf = rb_ldf
8
11
  ldf
9
12
  end
10
13
 
14
+ # @private
15
+ def self._scan_csv(
16
+ file,
17
+ has_header: true,
18
+ sep: ",",
19
+ comment_char: nil,
20
+ quote_char: '"',
21
+ skip_rows: 0,
22
+ dtypes: nil,
23
+ null_values: nil,
24
+ ignore_errors: false,
25
+ cache: true,
26
+ with_column_names: nil,
27
+ infer_schema_length: 100,
28
+ n_rows: nil,
29
+ encoding: "utf8",
30
+ low_memory: false,
31
+ rechunk: true,
32
+ skip_rows_after_header: 0,
33
+ row_count_name: nil,
34
+ row_count_offset: 0,
35
+ parse_dates: false,
36
+ eol_char: "\n"
37
+ )
38
+ dtype_list = nil
39
+ if !dtypes.nil?
40
+ dtype_list = []
41
+ dtypes.each do |k, v|
42
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
43
+ end
44
+ end
45
+ processed_null_values = Utils._process_null_values(null_values)
46
+
47
+ _from_rbldf(
48
+ RbLazyFrame.new_from_csv(
49
+ file,
50
+ sep,
51
+ has_header,
52
+ ignore_errors,
53
+ skip_rows,
54
+ n_rows,
55
+ cache,
56
+ dtype_list,
57
+ low_memory,
58
+ comment_char,
59
+ quote_char,
60
+ processed_null_values,
61
+ infer_schema_length,
62
+ with_column_names,
63
+ rechunk,
64
+ skip_rows_after_header,
65
+ encoding,
66
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
67
+ parse_dates,
68
+ eol_char
69
+ )
70
+ )
71
+ end
72
+
73
+ # @private
74
+ def self._scan_parquet(
75
+ file,
76
+ n_rows: nil,
77
+ cache: true,
78
+ parallel: "auto",
79
+ rechunk: true,
80
+ row_count_name: nil,
81
+ row_count_offset: 0,
82
+ storage_options: nil,
83
+ low_memory: false
84
+ )
85
+ _from_rbldf(
86
+ RbLazyFrame.new_from_parquet(
87
+ file,
88
+ n_rows,
89
+ cache,
90
+ parallel,
91
+ rechunk,
92
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
93
+ low_memory
94
+ )
95
+ )
96
+ end
97
+
98
+ # @private
99
+ def self._scan_ipc(
100
+ file,
101
+ n_rows: nil,
102
+ cache: true,
103
+ rechunk: true,
104
+ row_count_name: nil,
105
+ row_count_offset: 0,
106
+ storage_options: nil,
107
+ memory_map: true
108
+ )
109
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
110
+ file = Utils.format_path(file)
111
+ end
112
+
113
+ _from_rbldf(
114
+ RbLazyFrame.new_from_ipc(
115
+ file,
116
+ n_rows,
117
+ cache,
118
+ rechunk,
119
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
120
+ memory_map
121
+ )
122
+ )
123
+ end
124
+
125
+ # @private
126
+ def self._scan_ndjson(
127
+ file,
128
+ infer_schema_length: nil,
129
+ batch_size: nil,
130
+ n_rows: nil,
131
+ low_memory: false,
132
+ rechunk: true,
133
+ row_count_name: nil,
134
+ row_count_offset: 0
135
+ )
136
+ _from_rbldf(
137
+ RbLazyFrame.new_from_ndjson(
138
+ file,
139
+ infer_schema_length,
140
+ batch_size,
141
+ n_rows,
142
+ low_memory,
143
+ rechunk,
144
+ Utils._prepare_row_count_args(row_count_name, row_count_offset)
145
+ )
146
+ )
147
+ end
148
+
149
+ # def self.from_json
150
+ # end
151
+
152
+ # def self.read_json
153
+ # end
154
+
11
155
  # def columns
12
156
  # end
13
157
 
@@ -53,6 +197,7 @@ module Polars
53
197
  # def profile
54
198
  # end
55
199
 
200
+ #
56
201
  def collect(
57
202
  type_coercion: true,
58
203
  predicate_pushdown: true,
@@ -90,6 +235,7 @@ module Polars
90
235
  # def fetch
91
236
  # end
92
237
 
238
+ #
93
239
  def lazy
94
240
  self
95
241
  end
@@ -100,6 +246,7 @@ module Polars
100
246
  # def cleared
101
247
  # end
102
248
 
249
+ #
103
250
  def filter(predicate)
104
251
  _from_rbldf(
105
252
  _ldf.filter(
@@ -128,6 +275,7 @@ module Polars
128
275
  # def join_asof
129
276
  # end
130
277
 
278
+ #
131
279
  def join(
132
280
  other,
133
281
  left_on: nil,
@@ -202,6 +350,7 @@ module Polars
202
350
  # def with_context
203
351
  # end
204
352
 
353
+ #
205
354
  def with_column(column)
206
355
  with_columns([column])
207
356
  end
@@ -209,6 +358,7 @@ module Polars
209
358
  # def drop
210
359
  # end
211
360
 
361
+ #
212
362
  def rename(mapping)
213
363
  existing = mapping.keys
214
364
  _new = mapping.values
@@ -251,6 +401,7 @@ module Polars
251
401
  # def fill_null
252
402
  # end
253
403
 
404
+ #
254
405
  def fill_nan(fill_value)
255
406
  if !fill_value.is_a?(Expr)
256
407
  fill_value = Utils.lit(fill_value)
@@ -282,8 +433,11 @@ module Polars
282
433
  # def quantile
283
434
  # end
284
435
 
285
- # def explode
286
- # end
436
+ #
437
+ def explode(columns)
438
+ columns = Utils.selection_to_rbexpr_list(columns)
439
+ _from_rbldf(_ldf.explode(columns))
440
+ end
287
441
 
288
442
  # def unique
289
443
  # end