polars-df 0.1.1 → 0.1.3

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +8 -0
  4. data/Cargo.lock +2 -1
  5. data/README.md +1 -1
  6. data/ext/polars/Cargo.toml +7 -1
  7. data/ext/polars/src/batched_csv.rs +120 -0
  8. data/ext/polars/src/conversion.rs +139 -6
  9. data/ext/polars/src/dataframe.rs +360 -15
  10. data/ext/polars/src/error.rs +9 -0
  11. data/ext/polars/src/file.rs +8 -7
  12. data/ext/polars/src/lazy/apply.rs +7 -0
  13. data/ext/polars/src/lazy/dataframe.rs +135 -3
  14. data/ext/polars/src/lazy/dsl.rs +97 -2
  15. data/ext/polars/src/lazy/meta.rs +1 -1
  16. data/ext/polars/src/lazy/mod.rs +1 -0
  17. data/ext/polars/src/lib.rs +227 -12
  18. data/ext/polars/src/series.rs +190 -38
  19. data/ext/polars/src/set.rs +91 -0
  20. data/ext/polars/src/utils.rs +19 -0
  21. data/lib/polars/batched_csv_reader.rb +96 -0
  22. data/lib/polars/cat_expr.rb +39 -0
  23. data/lib/polars/data_frame.rb +2813 -100
  24. data/lib/polars/date_time_expr.rb +1282 -7
  25. data/lib/polars/exceptions.rb +20 -0
  26. data/lib/polars/expr.rb +631 -11
  27. data/lib/polars/expr_dispatch.rb +14 -0
  28. data/lib/polars/functions.rb +219 -0
  29. data/lib/polars/group_by.rb +517 -0
  30. data/lib/polars/io.rb +763 -4
  31. data/lib/polars/lazy_frame.rb +1415 -67
  32. data/lib/polars/lazy_functions.rb +430 -9
  33. data/lib/polars/lazy_group_by.rb +79 -0
  34. data/lib/polars/list_expr.rb +5 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2244 -192
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +663 -2
  39. data/lib/polars/struct_expr.rb +73 -0
  40. data/lib/polars/utils.rb +76 -3
  41. data/lib/polars/version.rb +2 -1
  42. data/lib/polars/when.rb +1 -0
  43. data/lib/polars/when_then.rb +1 -0
  44. data/lib/polars.rb +8 -2
  45. metadata +12 -2
data/lib/polars/io.rb CHANGED
@@ -1,25 +1,771 @@
  module Polars
  module IO
- def read_csv(file, has_header: true)
+ # Read a CSV file into a DataFrame.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting
+ # at zero) or a list of column names.
+ # @param new_columns [Object]
+ # Rename columns right after parsing the CSV file. If the given
+ # list is shorter than the width of the DataFrame the remaining
+ # columns will have their original name.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ # Number of threads to use in csv parsing.
+ # Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:utf8`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ # Number of lines to read into the buffer at once.
+ # Modify this to change performance.
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # During multi-threaded parsing, an upper bound of `n_rows`
+ # rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters. When using other encodings than `utf8` or
+ # `utf8-lossy`, the input is first decoded in memory with
+ # Ruby.
+ # @param low_memory [Boolean]
+ # Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ # Make sure that all columns are contiguous in memory by
+ # aggregating the chunks into a single array.
+ # @param storage_options [Hash]
+ # Extra options that make sense for a
+ # particular storage connection.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ # Set the sample size. This is used to sample statistics to estimate the
+ # allocation needed.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ # This operation defaults to a `rechunk` operation at the end, meaning that
+ # all data will be stored contiguously in memory.
+ # Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
+ # an expensive operation.
+ def read_csv(
+ file,
+ has_header: true,
+ columns: nil,
+ new_columns: nil,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ parse_dates: false,
+ n_threads: nil,
+ infer_schema_length: 100,
+ batch_size: 8192,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ storage_options: nil,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ sample_size: 1024,
+ eol_char: "\n"
+ )
+ _check_arg_is_1byte("sep", sep, false)
+ _check_arg_is_1byte("comment_char", comment_char, false)
+ _check_arg_is_1byte("quote_char", quote_char, true)
+ _check_arg_is_1byte("eol_char", eol_char, false)
+
+ projection, columns = Utils.handle_projection_columns(columns)
+
+ storage_options ||= {}
+
+ if columns && !has_header
+ columns.each do |column|
+ if !column.start_with?("column_")
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+ end
+ end
+ end
+
+ if projection || new_columns
+ raise Todo
+ end
+
+ df = nil
  _prepare_file_arg(file) do |data|
- DataFrame._read_csv(data, has_header: has_header)
+ df = DataFrame._read_csv(
+ data,
+ has_header: has_header,
+ columns: columns || projection,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ parse_dates: parse_dates,
+ n_threads: n_threads,
+ infer_schema_length: infer_schema_length,
+ batch_size: batch_size,
+ n_rows: n_rows,
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ sample_size: sample_size,
+ eol_char: eol_char
+ )
+ end
+
+ if new_columns
+ Utils._update_columns(df, new_columns)
+ else
+ df
+ end
+ end
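A minimal usage sketch of the expanded reader; the file name and option values below are illustrative, not part of the gem:

  df = Polars.read_csv(
    "items.csv",
    sep: ";",
    null_values: "NA",
    n_rows: 1_000,
    parse_dates: true
  )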
+
+ # Lazily read from a CSV file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and
+ # projections to the scan level, thereby potentially reducing
+ # memory overhead.
+ #
+ # @param file [Object]
+ # Path to a file.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines. The header will be parsed at this
+ # offset.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param with_column_names [Object]
+ # Apply a function over the column names.
+ # This can be used to update a schema just in time, thus before
+ # scanning.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters.
+ # @param low_memory [Boolean]
+ # Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ #
+ # @return [LazyFrame]
+ def scan_csv(
+ file,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n"
+ )
+ _check_arg_is_1byte("sep", sep, false)
+ _check_arg_is_1byte("comment_char", comment_char, false)
+ _check_arg_is_1byte("quote_char", quote_char, true)
+
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+
+ LazyFrame._scan_csv(
+ file,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ cache: cache,
+ with_column_names: with_column_names,
+ infer_schema_length: infer_schema_length,
+ n_rows: n_rows,
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ encoding: encoding,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ parse_dates: parse_dates,
+ eol_char: eol_char,
+ )
+ end
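An illustrative sketch of the lazy CSV scan; it assumes the LazyFrame `filter`/`collect` methods and the `Polars.col` helper added elsewhere in this release behave as in Polars generally, and the file name is a placeholder:

  lf = Polars.scan_csv("events.csv")
  df = lf.filter(Polars.col("id") > 100).collect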
+
+ # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ # Path to an IPC file.
+ # @param n_rows [Integer]
+ # Stop reading from IPC file after reading `n_rows`.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param memory_map [Boolean]
+ # Try to memory map the file. This can greatly improve performance on repeated
+ # queries as the OS may cache pages.
+ # Only uncompressed IPC files can be memory mapped.
+ #
+ # @return [LazyFrame]
+ def scan_ipc(
+ file,
+ n_rows: nil,
+ cache: true,
+ rechunk: true,
+ row_count_name: nil,
+ row_count_offset: 0,
+ storage_options: nil,
+ memory_map: true
+ )
+ LazyFrame._scan_ipc(
+ file,
+ n_rows: n_rows,
+ cache: cache,
+ rechunk: rechunk,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ storage_options: storage_options,
+ memory_map: memory_map
+ )
+ end
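A minimal sketch of the lazy IPC scan (file name is a placeholder; `collect` materializes the LazyFrame):

  lf = Polars.scan_ipc("data.arrow", memory_map: true)
  df = lf.collect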
+
+ # Lazily read from a parquet file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ # Path to a file.
+ # @param n_rows [Integer]
+ # Stop reading from parquet file after reading `n_rows`.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ # This determines the direction of parallelism. 'auto' will try to determine the
+ # optimal direction.
+ # @param rechunk [Boolean]
+ # In case of reading multiple files via a glob pattern rechunk the final DataFrame
+ # into contiguous memory chunks.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param low_memory [Boolean]
+ # Reduce memory pressure at the expense of performance.
+ #
+ # @return [LazyFrame]
+ def scan_parquet(
+ file,
+ n_rows: nil,
+ cache: true,
+ parallel: "auto",
+ rechunk: true,
+ row_count_name: nil,
+ row_count_offset: 0,
+ storage_options: nil,
+ low_memory: false
+ )
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+
+ LazyFrame._scan_parquet(
+ file,
+ n_rows: n_rows,
+ cache: cache,
+ parallel: parallel,
+ rechunk: rechunk,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ storage_options: storage_options,
+ low_memory: low_memory
+ )
+ end
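A minimal sketch of the lazy parquet scan (file name and options are placeholders):

  lf = Polars.scan_parquet("data.parquet", parallel: "row_groups", n_rows: 10_000)
  df = lf.collect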
+
+ # Lazily read from a newline delimited JSON file.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param file [String]
+ # Path to a file.
+ # @param infer_schema_length [Integer]
+ # Infer the schema length from the first `infer_schema_length` rows.
+ # @param batch_size [Integer]
+ # Number of rows to read in each batch.
+ # @param n_rows [Integer]
+ # Stop reading from JSON file after reading `n_rows`.
+ # @param low_memory [Boolean]
+ # Reduce memory pressure at the expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ #
+ # @return [LazyFrame]
+ def scan_ndjson(
+ file,
+ infer_schema_length: 100,
+ batch_size: 1024,
+ n_rows: nil,
+ low_memory: false,
+ rechunk: true,
+ row_count_name: nil,
+ row_count_offset: 0
+ )
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+
+ LazyFrame._scan_ndjson(
+ file,
+ infer_schema_length: infer_schema_length,
+ batch_size: batch_size,
+ n_rows: n_rows,
+ low_memory: low_memory,
+ rechunk: rechunk,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ )
+ end
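A minimal sketch of the lazy newline-delimited JSON scan (file name is a placeholder):

  df = Polars.scan_ndjson("logs.ndjson", batch_size: 4096).collect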
+
+ # def read_avro
+ # end
+
+ # Read into a DataFrame from an Arrow IPC (Feather v2) file.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
+ # of column names.
+ # @param n_rows [Integer]
+ # Stop reading from IPC file after reading `n_rows`.
+ # @param memory_map [Boolean]
+ # Try to memory map the file. This can greatly improve performance on repeated
+ # queries as the OS may cache pages.
+ # Only uncompressed IPC files can be memory mapped.
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param rechunk [Boolean]
+ # Make sure that all data is contiguous.
+ #
+ # @return [DataFrame]
+ def read_ipc(
+ file,
+ columns: nil,
+ n_rows: nil,
+ memory_map: true,
+ storage_options: nil,
+ row_count_name: nil,
+ row_count_offset: 0,
+ rechunk: true
+ )
+ storage_options ||= {}
+ _prepare_file_arg(file, **storage_options) do |data|
+ DataFrame._read_ipc(
+ data,
+ columns: columns,
+ n_rows: n_rows,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ rechunk: rechunk,
+ memory_map: memory_map
+ )
  end
  end
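A minimal sketch of the eager IPC reader (file and column names are placeholders):

  df = Polars.read_ipc("cache.arrow", columns: ["id", "name"], memory_map: false)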

- def read_parquet(file)
+ # Read into a DataFrame from a parquet file.
+ #
+ # @param file [Object]
+ # Path to a file, or a file-like object.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
+ # of column names.
+ # @param n_rows [Integer]
+ # Stop reading from parquet file after reading `n_rows`.
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ # This determines the direction of parallelism. 'auto' will try to determine the
+ # optimal direction.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param low_memory [Boolean]
+ # Reduce memory pressure at the expense of performance.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ # This operation defaults to a `rechunk` operation at the end, meaning that
+ # all data will be stored contiguously in memory.
+ # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
+ # an expensive operation.
+ def read_parquet(
+ file,
+ columns: nil,
+ n_rows: nil,
+ storage_options: nil,
+ parallel: "auto",
+ row_count_name: nil,
+ row_count_offset: 0,
+ low_memory: false
+ )
  _prepare_file_arg(file) do |data|
- DataFrame._read_parquet(data)
+ DataFrame._read_parquet(
+ data,
+ columns: columns,
+ n_rows: n_rows,
+ parallel: parallel,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ low_memory: low_memory
+ )
  end
  end
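A minimal sketch of the eager parquet reader with a column projection (file name and indices are placeholders):

  df = Polars.read_parquet("metrics.parquet", columns: [0, 1], n_rows: 500)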

+ # Read into a DataFrame from a JSON file.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
  def read_json(file)
  DataFrame._read_json(file)
  end

+ # Read into a DataFrame from a newline delimited JSON file.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
  def read_ndjson(file)
  DataFrame._read_ndjson(file)
  end
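Both JSON readers take a path or file-like object; a minimal sketch with placeholder file names:

  df1 = Polars.read_json("records.json")
  df2 = Polars.read_ndjson("records.ndjson")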

+ # def read_sql
+ # end
+
+ # def read_excel
+ # end
+
+ # Read a CSV file in batches.
+ #
+ # Upon creation of the `BatchedCsvReader`,
+ # Polars will gather statistics and determine the
+ # file chunks. After that, work will only be done
+ # if `next_batches` is called.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ # Indicate if the first row of dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting
+ # at zero) or a list of column names.
+ # @param new_columns [Object]
+ # Rename columns right after parsing the CSV file. If the given
+ # list is shorter than the width of the DataFrame the remaining
+ # columns will have their original name.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting, default = `"`.
+ # Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ # Number of threads to use in csv parsing.
+ # Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ # Number of lines to read into the buffer at once.
+ # Modify this to change performance.
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # During multi-threaded parsing, an upper bound of `n_rows`
+ # rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters. When using other encodings than `utf8` or
+ # `utf8-lossy`, the input is first decoded in memory with
+ # Ruby. Defaults to `utf8`.
+ # @param low_memory [Boolean]
+ # Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ # Make sure that all columns are contiguous in memory by
+ # aggregating the chunks into a single array.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ # Set the sample size. This is used to sample statistics to estimate the
+ # allocation needed.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ #
+ # @return [BatchedCsvReader]
+ #
+ # @example
+ # reader = Polars.read_csv_batched(
+ # "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
+ # )
+ # reader.next_batches(5)
+ def read_csv_batched(
+ file,
+ has_header: true,
+ columns: nil,
+ new_columns: nil,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ parse_dates: false,
+ n_threads: nil,
+ infer_schema_length: 100,
+ batch_size: 50_000,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ sample_size: 1024,
+ eol_char: "\n"
+ )
+ projection, columns = Utils.handle_projection_columns(columns)
+
+ if columns && !has_header
+ columns.each do |column|
+ if !column.start_with?("column_")
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+ end
+ end
+ end
+
+ if projection || new_columns
+ raise Todo
+ end
+
+ BatchedCsvReader.new(
+ file,
+ has_header: has_header,
+ columns: columns || projection,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ parse_dates: parse_dates,
+ n_threads: n_threads,
+ infer_schema_length: infer_schema_length,
+ batch_size: batch_size,
+ n_rows: n_rows,
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ sample_size: sample_size,
+ eol_char: eol_char,
+ new_columns: new_columns
+ )
+ end
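Mirroring the @example above with a placeholder file name; `next_batches(5)` pulls up to five DataFrame batches at a time:

  reader = Polars.read_csv_batched("large.csv", sep: "|", parse_dates: true)
  batches = reader.next_batches(5)  # => array of DataFrame batches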
+
+ # Get a schema of the IPC file without reading data.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [Hash]
+ def read_ipc_schema(file)
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+
+ _ipc_schema(file)
+ end
+
+ # Get a schema of the Parquet file without reading data.
+ #
+ # @param file [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [Hash]
+ def read_parquet_schema(file)
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+ file = Utils.format_path(file)
+ end
+
+ _parquet_schema(file)
+ end
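Both schema readers return a Hash mapping column names to dtypes without loading the data; the file name and the exact dtype representation shown are assumptions for illustration:

  Polars.read_parquet_schema("metrics.parquet")
  # e.g. {"id" => :i64, "name" => :str}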
+
  private

  def _prepare_file_arg(file)
@@ -35,5 +781,18 @@ module Polars

  yield file
  end
+
+ def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
+ if arg.is_a?(String)
+ arg_byte_length = arg.bytesize
+ if can_be_empty
+ if arg_byte_length > 1
+ raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
+ end
+ elsif arg_byte_length != 1
+ raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
+ end
+ end
+ end
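The new helper is what makes the delimiter options above fail fast; for example, a multi-byte separator is rejected before the file is touched (file name is a placeholder):

  Polars.read_csv("items.csv", sep: "||")
  # => ArgumentError: sep should be a single byte character, but is 2 bytes long.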
  end
  end