polars-df 0.1.0 → 0.1.2

data/lib/polars/io.rb CHANGED
@@ -1,8 +1,245 @@
 module Polars
   module IO
-    def read_csv(file, has_header: true)
+    def read_csv(
+      file,
+      has_header: true,
+      columns: nil,
+      new_columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 8192,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      storage_options: nil,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n"
+    )
+      _check_arg_is_1byte("sep", sep, false)
+      _check_arg_is_1byte("comment_char", comment_char, false)
+      _check_arg_is_1byte("quote_char", quote_char, true)
+      _check_arg_is_1byte("eol_char", eol_char, false)
+
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      storage_options ||= {}
+
+      if columns && !has_header
+        columns.each do |column|
+          if !column.start_with?("column_")
+            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+          end
+        end
+      end
+
+      if projection || new_columns
+        raise Todo
+      end
+
+      df = nil
       _prepare_file_arg(file) do |data|
-        DataFrame._read_csv(data, has_header: has_header)
+        df = DataFrame._read_csv(
+          data,
+          has_header: has_header,
+          columns: columns || projection,
+          sep: sep,
+          comment_char: comment_char,
+          quote_char: quote_char,
+          skip_rows: skip_rows,
+          dtypes: dtypes,
+          null_values: null_values,
+          ignore_errors: ignore_errors,
+          parse_dates: parse_dates,
+          n_threads: n_threads,
+          infer_schema_length: infer_schema_length,
+          batch_size: batch_size,
+          n_rows: n_rows,
+          encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+          low_memory: low_memory,
+          rechunk: rechunk,
+          skip_rows_after_header: skip_rows_after_header,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          sample_size: sample_size,
+          eol_char: eol_char
+        )
+      end
+
+      if new_columns
+        Utils._update_columns(df, new_columns)
+      else
+        df
+      end
+    end
+
+    def scan_csv(
+      file,
+      has_header: true,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      cache: true,
+      with_column_names: nil,
+      infer_schema_length: 100,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      parse_dates: false,
+      eol_char: "\n"
+    )
+      _check_arg_is_1byte("sep", sep, false)
+      _check_arg_is_1byte("comment_char", comment_char, false)
+      _check_arg_is_1byte("quote_char", quote_char, true)
+
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_csv(
+        file,
+        has_header: has_header,
+        sep: sep,
+        comment_char: comment_char,
+        quote_char: quote_char,
+        skip_rows: skip_rows,
+        dtypes: dtypes,
+        null_values: null_values,
+        ignore_errors: ignore_errors,
+        cache: cache,
+        with_column_names: with_column_names,
+        infer_schema_length: infer_schema_length,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        skip_rows_after_header: skip_rows_after_header,
+        encoding: encoding,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        parse_dates: parse_dates,
+        eol_char: eol_char,
+      )
+    end
+
+    def scan_ipc(
+      file,
+      n_rows: nil,
+      cache: true,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      memory_map: true
+    )
+      LazyFrame._scan_ipc(
+        file,
+        n_rows: n_rows,
+        cache: cache,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        memory_map: memory_map
+      )
+    end
+
+    def scan_parquet(
+      file,
+      n_rows: nil,
+      cache: true,
+      parallel: "auto",
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      low_memory: false
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_parquet(
+        file,
+        n_rows: n_rows,
+        cache: cache,
+        parallel: parallel,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        low_memory: low_memory
+      )
+    end
+
+    def scan_ndjson(
+      file,
+      infer_schema_length: 100,
+      batch_size: 1024,
+      n_rows: nil,
+      low_memory: false,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_ndjson(
+        file,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+      )
+    end
+
+    # def read_avro
+    # end
+
+    def read_ipc(
+      file,
+      columns: nil,
+      n_rows: nil,
+      memory_map: true,
+      storage_options: nil,
+      row_count_name: nil,
+      row_count_offset: 0,
+      rechunk: true
+    )
+      storage_options ||= {}
+      _prepare_file_arg(file, **storage_options) do |data|
+        DataFrame._read_ipc(
+          data,
+          columns: columns,
+          n_rows: n_rows,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          rechunk: rechunk,
+          memory_map: memory_map
+        )
       end
     end
 
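For orientation, a minimal usage sketch of the eager and lazy CSV readers changed in this hunk. The file name "data.csv", the separator, and the option values are hypothetical; scan_csv returns a Polars::LazyFrame, which is assumed to expose the usual collect:

  # Eager read with some of the new keyword options.
  df = Polars.read_csv(
    "data.csv",     # hypothetical input file
    sep: ";",       # must be a single byte (see _check_arg_is_1byte in the last hunk)
    skip_rows: 1,   # skip a junk line before the header
    n_rows: 1_000   # stop after the first 1,000 rows
  )

  # Lazy scan: no data is read until the plan is collected.
  lf = Polars.scan_csv("data.csv", infer_schema_length: 200)
  df = lf.collect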
@@ -20,6 +257,96 @@ module Polars
       DataFrame._read_ndjson(file)
     end
 
+    # def read_sql
+    # end
+
+    # def read_excel
+    # end
+
+    def read_csv_batched(
+      file,
+      has_header: true,
+      columns: nil,
+      new_columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 50_000,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n"
+    )
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      if columns && !has_header
+        columns.each do |column|
+          if !column.start_with?("column_")
+            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+          end
+        end
+      end
+
+      if projection || new_columns
+        raise Todo
+      end
+
+      BatchedCsvReader.new(
+        file,
+        has_header: has_header,
+        columns: columns || projection,
+        sep: sep,
+        comment_char: comment_char,
+        quote_char: quote_char,
+        skip_rows: skip_rows,
+        dtypes: dtypes,
+        null_values: null_values,
+        ignore_errors: ignore_errors,
+        parse_dates: parse_dates,
+        n_threads: n_threads,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+        low_memory: low_memory,
+        rechunk: rechunk,
+        skip_rows_after_header: skip_rows_after_header,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        sample_size: sample_size,
+        eol_char: eol_char,
+        new_columns: new_columns
+      )
+    end
+
+    def read_ipc_schema(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      _ipc_schema(file)
+    end
+
+    def read_parquet_schema(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      _parquet_schema(file)
+    end
+
     private
 
     def _prepare_file_arg(file)
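A sketch of the batched reader constructed above. Only the constructor appears in this diff, so the consumption loop assumes BatchedCsvReader mirrors its Python counterpart's next_batches(n); "big.csv" is hypothetical:

  # Stream a large CSV in ~50,000-row chunks instead of loading it at once.
  reader = Polars.read_csv_batched("big.csv", batch_size: 50_000)
  while (batches = reader.next_batches(5))
    batches.each do |df|
      # process each DataFrame chunk here
    end
  end

The new schema helpers take a path and return the file's schema without materializing the data, e.g. Polars.read_parquet_schema("file.parquet") (hypothetical path).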
@@ -35,5 +362,18 @@ module Polars
 
       yield file
     end
+
+    def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
+      if arg.is_a?(String)
+        arg_byte_length = arg.bytesize
+        if can_be_empty
+          if arg_byte_length > 1
+            raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
+          end
+        elsif arg_byte_length != 1
+          raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
+        end
+      end
+    end
   end
 end
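The new _check_arg_is_1byte helper validates by byte length rather than character length, so a multi-byte UTF-8 separator is rejected even though it is a single character. Because the check runs before any I/O, the sketch below raises regardless of whether the (hypothetical) file exists:

  "→".bytesize  # => 3
  Polars.read_csv("data.csv", sep: "→")
  # => ArgumentError: sep should be a single byte character, but is 3 bytes long.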