polars-df 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/polars/io.rb CHANGED
@@ -1,8 +1,245 @@
 module Polars
   module IO
-    def read_csv(file, has_header: true)
+    def read_csv(
+      file,
+      has_header: true,
+      columns: nil,
+      new_columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 8192,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      storage_options: nil,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n"
+    )
+      _check_arg_is_1byte("sep", sep, false)
+      _check_arg_is_1byte("comment_char", comment_char, false)
+      _check_arg_is_1byte("quote_char", quote_char, true)
+      _check_arg_is_1byte("eol_char", eol_char, false)
+
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      storage_options ||= {}
+
+      if columns && !has_header
+        columns.each do |column|
+          if !column.start_with?("column_")
+            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+          end
+        end
+      end
+
+      if projection || new_columns
+        raise Todo
+      end
+
+      df = nil
       _prepare_file_arg(file) do |data|
-        DataFrame._read_csv(data, has_header: has_header)
+        df = DataFrame._read_csv(
+          data,
+          has_header: has_header,
+          columns: columns || projection,
+          sep: sep,
+          comment_char: comment_char,
+          quote_char: quote_char,
+          skip_rows: skip_rows,
+          dtypes: dtypes,
+          null_values: null_values,
+          ignore_errors: ignore_errors,
+          parse_dates: parse_dates,
+          n_threads: n_threads,
+          infer_schema_length: infer_schema_length,
+          batch_size: batch_size,
+          n_rows: n_rows,
+          encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+          low_memory: low_memory,
+          rechunk: rechunk,
+          skip_rows_after_header: skip_rows_after_header,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          sample_size: sample_size,
+          eol_char: eol_char
+        )
+      end
+
+      if new_columns
+        Utils._update_columns(df, new_columns)
+      else
+        df
+      end
+    end
+
+    def scan_csv(
+      file,
+      has_header: true,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      cache: true,
+      with_column_names: nil,
+      infer_schema_length: 100,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      parse_dates: false,
+      eol_char: "\n"
+    )
+      _check_arg_is_1byte("sep", sep, false)
+      _check_arg_is_1byte("comment_char", comment_char, false)
+      _check_arg_is_1byte("quote_char", quote_char, true)
+
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_csv(
+        file,
+        has_header: has_header,
+        sep: sep,
+        comment_char: comment_char,
+        quote_char: quote_char,
+        skip_rows: skip_rows,
+        dtypes: dtypes,
+        null_values: null_values,
+        ignore_errors: ignore_errors,
+        cache: cache,
+        with_column_names: with_column_names,
+        infer_schema_length: infer_schema_length,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        skip_rows_after_header: skip_rows_after_header,
+        encoding: encoding,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        parse_dates: parse_dates,
+        eol_char: eol_char
+      )
+    end
+
+    def scan_ipc(
+      file,
+      n_rows: nil,
+      cache: true,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      memory_map: true
+    )
+      LazyFrame._scan_ipc(
+        file,
+        n_rows: n_rows,
+        cache: cache,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        memory_map: memory_map
+      )
+    end
+
+    def scan_parquet(
+      file,
+      n_rows: nil,
+      cache: true,
+      parallel: "auto",
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      low_memory: false
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_parquet(
+        file,
+        n_rows: n_rows,
+        cache: cache,
+        parallel: parallel,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        low_memory: low_memory
+      )
+    end
+
+    def scan_ndjson(
+      file,
+      infer_schema_length: 100,
+      batch_size: 1024,
+      n_rows: nil,
+      low_memory: false,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_ndjson(
+        file,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset
+      )
+    end
+
+    # def read_avro
+    # end
+
+    def read_ipc(
+      file,
+      columns: nil,
+      n_rows: nil,
+      memory_map: true,
+      storage_options: nil,
+      row_count_name: nil,
+      row_count_offset: 0,
+      rechunk: true
+    )
+      storage_options ||= {}
+      _prepare_file_arg(file, **storage_options) do |data|
+        DataFrame._read_ipc(
+          data,
+          columns: columns,
+          n_rows: n_rows,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          rechunk: rechunk,
+          memory_map: memory_map
+        )
       end
     end
 
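Version 0.1.2 replaces the two-argument read_csv with a fully keyworded signature and adds lazy scan_csv, scan_ipc, scan_parquet, and scan_ndjson entry points that build a LazyFrame instead of materializing a DataFrame. A minimal usage sketch of the new surface (the file names are hypothetical, and only options visible in this diff are used):

    # Eager read with the new parsing options.
    df = Polars.read_csv(
      "example.csv",            # hypothetical file
      sep: ";",
      null_values: ["NA", ""],
      n_rows: 1_000,
      parse_dates: true
    )

    # Lazy scan: returns a LazyFrame (built via LazyFrame._scan_csv above)
    # rather than reading the file eagerly.
    lf = Polars.scan_csv("example.csv", skip_rows_after_header: 1)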
@@ -20,6 +257,96 @@ module Polars
       DataFrame._read_ndjson(file)
     end
 
+    # def read_sql
+    # end
+
+    # def read_excel
+    # end
+
+    def read_csv_batched(
+      file,
+      has_header: true,
+      columns: nil,
+      new_columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 50_000,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n"
+    )
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      if columns && !has_header
+        columns.each do |column|
+          if !column.start_with?("column_")
+            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+          end
+        end
+      end
+
+      if projection || new_columns
+        raise Todo
+      end
+
+      BatchedCsvReader.new(
+        file,
+        has_header: has_header,
+        columns: columns || projection,
+        sep: sep,
+        comment_char: comment_char,
+        quote_char: quote_char,
+        skip_rows: skip_rows,
+        dtypes: dtypes,
+        null_values: null_values,
+        ignore_errors: ignore_errors,
+        parse_dates: parse_dates,
+        n_threads: n_threads,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+        low_memory: low_memory,
+        rechunk: rechunk,
+        skip_rows_after_header: skip_rows_after_header,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        sample_size: sample_size,
+        eol_char: eol_char,
+        new_columns: new_columns
+      )
+    end
+
+    def read_ipc_schema(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      _ipc_schema(file)
+    end
+
+    def read_parquet_schema(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      _parquet_schema(file)
+    end
+
     private
 
     def _prepare_file_arg(file)
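Unlike read_csv, the new read_csv_batched does not return a DataFrame: it constructs a BatchedCsvReader so large files can be consumed in chunks. A sketch of how that reader might be driven, assuming it exposes a next_batches method as the equivalent batched reader does in upstream Polars (that method is not part of this diff):

    reader = Polars.read_csv_batched("big.csv", batch_size: 50_000)  # hypothetical file
    # next_batches is an assumption borrowed from upstream Polars, not shown here.
    while (batches = reader.next_batches(5))
      batches.each do |df|
        # process each DataFrame chunk
      end
    end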
@@ -35,5 +362,18 @@ module Polars
 
       yield file
     end
+
+    def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
+      if arg.is_a?(String)
+        arg_byte_length = arg.bytesize
+        if can_be_empty
+          if arg_byte_length > 1
+            raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
+          end
+        elsif arg_byte_length != 1
+          raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
+        end
+      end
+    end
   end
 end
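The new private _check_arg_is_1byte helper enforces the single-byte contract on the delimiter-style options: read_csv validates sep, comment_char, and eol_char with can_be_empty = false, and quote_char with true, so only quote_char may be empty. Note that it measures bytes, not characters, so multi-byte UTF-8 characters are rejected, and since the checks run before the file is opened, bad arguments fail fast:

    Polars.read_csv("example.csv", sep: "||")
    # => ArgumentError: sep should be a single byte character, but is 2 bytes long.

    Polars.read_csv("example.csv", sep: "€")   # the euro sign is 3 bytes in UTF-8
    # => ArgumentError: sep should be a single byte character, but is 3 bytes long.

    # Passes the byte-length check because can_be_empty is true for quote_char
    # (downstream handling of an empty quote_char is not shown in this diff).
    Polars.read_csv("example.csv", quote_char: "")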