polars-df 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +8 -0
  4. data/Cargo.lock +2 -1
  5. data/README.md +1 -1
  6. data/ext/polars/Cargo.toml +7 -1
  7. data/ext/polars/src/batched_csv.rs +120 -0
  8. data/ext/polars/src/conversion.rs +139 -6
  9. data/ext/polars/src/dataframe.rs +360 -15
  10. data/ext/polars/src/error.rs +9 -0
  11. data/ext/polars/src/file.rs +8 -7
  12. data/ext/polars/src/lazy/apply.rs +7 -0
  13. data/ext/polars/src/lazy/dataframe.rs +135 -3
  14. data/ext/polars/src/lazy/dsl.rs +97 -2
  15. data/ext/polars/src/lazy/meta.rs +1 -1
  16. data/ext/polars/src/lazy/mod.rs +1 -0
  17. data/ext/polars/src/lib.rs +227 -12
  18. data/ext/polars/src/series.rs +190 -38
  19. data/ext/polars/src/set.rs +91 -0
  20. data/ext/polars/src/utils.rs +19 -0
  21. data/lib/polars/batched_csv_reader.rb +96 -0
  22. data/lib/polars/cat_expr.rb +39 -0
  23. data/lib/polars/data_frame.rb +2813 -100
  24. data/lib/polars/date_time_expr.rb +1282 -7
  25. data/lib/polars/exceptions.rb +20 -0
  26. data/lib/polars/expr.rb +631 -11
  27. data/lib/polars/expr_dispatch.rb +14 -0
  28. data/lib/polars/functions.rb +219 -0
  29. data/lib/polars/group_by.rb +517 -0
  30. data/lib/polars/io.rb +763 -4
  31. data/lib/polars/lazy_frame.rb +1415 -67
  32. data/lib/polars/lazy_functions.rb +430 -9
  33. data/lib/polars/lazy_group_by.rb +79 -0
  34. data/lib/polars/list_expr.rb +5 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2244 -192
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +663 -2
  39. data/lib/polars/struct_expr.rb +73 -0
  40. data/lib/polars/utils.rb +76 -3
  41. data/lib/polars/version.rb +2 -1
  42. data/lib/polars/when.rb +1 -0
  43. data/lib/polars/when_then.rb +1 -0
  44. data/lib/polars.rb +8 -2
  45. metadata +12 -2
data/lib/polars/io.rb CHANGED
@@ -1,25 +1,771 @@
1
1
  module Polars
2
2
  module IO
3
- def read_csv(file, has_header: true)
3
+ # Read a CSV file into a DataFrame.
4
+ #
5
+ # @param file [Object]
6
+ # Path to a file or a file-like object.
7
+ # @param has_header [Boolean]
8
+ # Indicate if the first row of the dataset is a header or not.
9
+ # If set to false, column names will be autogenerated in the
10
+ # following format: `column_x`, with `x` being an
11
+ # enumeration over every column in the dataset starting at 1.
12
+ # @param columns [Object]
13
+ # Columns to select. Accepts a list of column indices (starting
14
+ # at zero) or a list of column names.
15
+ # @param new_columns [Object]
16
+ # Rename columns right after parsing the CSV file. If the given
17
+ # list is shorter than the width of the DataFrame the remaining
18
+ # columns will have their original name.
19
+ # @param sep [String]
20
+ # Single byte character to use as delimiter in the file.
21
+ # @param comment_char [String]
22
+ # Single byte character that indicates the start of a comment line,
23
+ # for instance `#`.
24
+ # @param quote_char [String]
25
+ # Single byte character used for csv quoting.
26
+ # Set to nil to turn off special handling and escaping of quotes.
27
+ # @param skip_rows [Integer]
28
+ # Start reading after `skip_rows` lines.
29
+ # @param dtypes [Object]
30
+ # Overwrite dtypes during inference.
31
+ # @param null_values [Object]
32
+ # Values to interpret as null values. You can provide a:
33
+ #
34
+ # - `String`: All values equal to this string will be null.
35
+ # - `Array`: All values equal to any string in this array will be null.
36
+ # - `Hash`: A hash that maps column name to a null value string.
37
+ # @param ignore_errors [Boolean]
38
+ # Try to keep reading lines if some lines yield errors.
39
+ # First try `infer_schema_length: 0` to read all columns as
40
+ # `:str` to check which values might cause an issue.
41
+ # @param parse_dates [Boolean]
42
+ # Try to automatically parse dates. If this does not succeed,
43
+ # the column remains of data type `:str`.
44
+ # @param n_threads [Integer]
45
+ # Number of threads to use in csv parsing.
46
+ # Defaults to the number of physical CPUs of your system.
47
+ # @param infer_schema_length [Integer]
48
+ # Maximum number of lines to read to infer schema.
49
+ # If set to 0, all columns will be read as `:utf8`.
50
+ # If set to `nil`, a full table scan will be done (slow).
51
+ # @param batch_size [Integer]
52
+ # Number of lines to read into the buffer at once.
53
+ # Modify this to change performance.
54
+ # @param n_rows [Integer]
55
+ # Stop reading from CSV file after reading `n_rows`.
56
+ # During multi-threaded parsing, an upper bound of `n_rows`
57
+ # rows cannot be guaranteed.
58
+ # @param encoding ["utf8", "utf8-lossy"]
59
+ # Lossy means that invalid utf8 values are replaced with `�`
60
+ # characters. When using other encodings than `utf8` or
61
+ # `utf8-lossy`, the input is first decoded in memory with
62
+ # Ruby.
63
+ # @param low_memory [Boolean]
64
+ # Reduce memory usage at the expense of performance.
65
+ # @param rechunk [Boolean]
66
+ # Make sure that all columns are contiguous in memory by
67
+ # aggregating the chunks into a single array.
68
+ # @param storage_options [Hash]
69
+ # Extra options that make sense for a
70
+ # particular storage connection.
71
+ # @param skip_rows_after_header [Integer]
72
+ # Skip this number of rows when the header is parsed.
73
+ # @param row_count_name [String]
74
+ # If not nil, this will insert a row count column with the given name into
75
+ # the DataFrame.
76
+ # @param row_count_offset [Integer]
77
+ # Offset to start the row_count column (only used if the name is set).
78
+ # @param sample_size [Integer]
79
+ # Set the sample size. This is used to sample statistics to estimate the
80
+ # allocation needed.
81
+ # @param eol_char [String]
82
+ # Single byte end of line character.
83
+ #
84
+ # @return [DataFrame]
85
+ #
86
+ # @note
87
+ # This operation defaults to a `rechunk` operation at the end, meaning that
88
+ # all data will be stored contiguously in memory.
89
+ # Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
90
+ # an expensive operation.
91
+ def read_csv(
92
+ file,
93
+ has_header: true,
94
+ columns: nil,
95
+ new_columns: nil,
96
+ sep: ",",
97
+ comment_char: nil,
98
+ quote_char: '"',
99
+ skip_rows: 0,
100
+ dtypes: nil,
101
+ null_values: nil,
102
+ ignore_errors: false,
103
+ parse_dates: false,
104
+ n_threads: nil,
105
+ infer_schema_length: 100,
106
+ batch_size: 8192,
107
+ n_rows: nil,
108
+ encoding: "utf8",
109
+ low_memory: false,
110
+ rechunk: true,
111
+ storage_options: nil,
112
+ skip_rows_after_header: 0,
113
+ row_count_name: nil,
114
+ row_count_offset: 0,
115
+ sample_size: 1024,
116
+ eol_char: "\n"
117
+ )
118
+ _check_arg_is_1byte("sep", sep, false)
119
+ _check_arg_is_1byte("comment_char", comment_char, false)
120
+ _check_arg_is_1byte("quote_char", quote_char, true)
121
+ _check_arg_is_1byte("eol_char", eol_char, false)
122
+
123
+ projection, columns = Utils.handle_projection_columns(columns)
124
+
125
+ storage_options ||= {}
126
+
127
+ if columns && !has_header
128
+ columns.each do |column|
129
+ if !column.start_with?("column_")
130
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
131
+ end
132
+ end
133
+ end
134
+
135
+ if projection || new_columns
136
+ raise Todo
137
+ end
138
+
139
+ df = nil
4
140
  _prepare_file_arg(file) do |data|
5
- DataFrame._read_csv(data, has_header: has_header)
141
+ df = DataFrame._read_csv(
142
+ data,
143
+ has_header: has_header,
144
+ columns: columns || projection,
145
+ sep: sep,
146
+ comment_char: comment_char,
147
+ quote_char: quote_char,
148
+ skip_rows: skip_rows,
149
+ dtypes: dtypes,
150
+ null_values: null_values,
151
+ ignore_errors: ignore_errors,
152
+ parse_dates: parse_dates,
153
+ n_threads: n_threads,
154
+ infer_schema_length: infer_schema_length,
155
+ batch_size: batch_size,
156
+ n_rows: n_rows,
157
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
158
+ low_memory: low_memory,
159
+ rechunk: rechunk,
160
+ skip_rows_after_header: skip_rows_after_header,
161
+ row_count_name: row_count_name,
162
+ row_count_offset: row_count_offset,
163
+ sample_size: sample_size,
164
+ eol_char: eol_char
165
+ )
166
+ end
167
+
168
+ if new_columns
169
+ Utils._update_columns(df, new_columns)
170
+ else
171
+ df
172
+ end
173
+ end
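For orientation on the expanded signature above, a minimal sketch of an eager CSV read; the path, delimiter, and null markers are hypothetical:

    # hypothetical file; read without a header row and treat "" and "NA" as nulls
    df = Polars.read_csv(
      "data/example.csv",
      has_header: false,
      sep: ";",
      null_values: ["", "NA"],
      n_rows: 1_000
    )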
174
+
175
+ # Lazily read from a CSV file or multiple files via glob patterns.
176
+ #
177
+ # This allows the query optimizer to push down predicates and
178
+ # projections to the scan level, thereby potentially reducing
179
+ # memory overhead.
180
+ #
181
+ # @param file [Object]
182
+ # Path to a file.
183
+ # @param has_header [Boolean]
184
+ # Indicate if the first row of the dataset is a header or not.
185
+ # If set to false, column names will be autogenerated in the
186
+ # following format: `column_x`, with `x` being an
187
+ # enumeration over every column in the dataset starting at 1.
188
+ # @param sep [String]
189
+ # Single byte character to use as delimiter in the file.
190
+ # @param comment_char [String]
191
+ # Single byte character that indicates the start of a comment line,
192
+ # for instance `#`.
193
+ # @param quote_char [String]
194
+ # Single byte character used for csv quoting.
195
+ # Set to nil to turn off special handling and escaping of quotes.
196
+ # @param skip_rows [Integer]
197
+ # Start reading after `skip_rows` lines. The header will be parsed at this
198
+ # offset.
199
+ # @param dtypes [Object]
200
+ # Overwrite dtypes during inference.
201
+ # @param null_values [Object]
202
+ # Values to interpret as null values. You can provide a:
203
+ #
204
+ # - `String`: All values equal to this string will be null.
205
+ # - `Array`: All values equal to any string in this array will be null.
206
+ # - `Hash`: A hash that maps column name to a null value string.
207
+ # @param ignore_errors [Boolean]
208
+ # Try to keep reading lines if some lines yield errors.
209
+ # First try `infer_schema_length: 0` to read all columns as
210
+ # `:str` to check which values might cause an issue.
211
+ # @param cache [Boolean]
212
+ # Cache the result after reading.
213
+ # @param with_column_names [Object]
214
+ # Apply a function over the column names.
215
+ # This can be used to update a schema just in time, thus before
216
+ # scanning.
217
+ # @param infer_schema_length [Integer]
218
+ # Maximum number of lines to read to infer schema.
219
+ # If set to 0, all columns will be read as `:str`.
220
+ # If set to `nil`, a full table scan will be done (slow).
221
+ # @param n_rows [Integer]
222
+ # Stop reading from CSV file after reading `n_rows`.
223
+ # @param encoding ["utf8", "utf8-lossy"]
224
+ # Lossy means that invalid utf8 values are replaced with `�`
225
+ # characters.
226
+ # @param low_memory [Boolean]
227
+ # Reduce memory usage at the expense of performance.
228
+ # @param rechunk [Boolean]
229
+ # Reallocate to contiguous memory when all chunks/files are parsed.
230
+ # @param skip_rows_after_header [Integer]
231
+ # Skip this number of rows when the header is parsed.
232
+ # @param row_count_name [String]
233
+ # If not nil, this will insert a row count column with the given name into
234
+ # the DataFrame.
235
+ # @param row_count_offset [Integer]
236
+ # Offset to start the row_count column (only used if the name is set).
237
+ # @param parse_dates [Boolean]
238
+ # Try to automatically parse dates. If this does not succeed,
239
+ # the column remains of data type `:str`.
240
+ # @param eol_char [String]
241
+ # Single byte end of line character.
242
+ #
243
+ # @return [LazyFrame]
244
+ def scan_csv(
245
+ file,
246
+ has_header: true,
247
+ sep: ",",
248
+ comment_char: nil,
249
+ quote_char: '"',
250
+ skip_rows: 0,
251
+ dtypes: nil,
252
+ null_values: nil,
253
+ ignore_errors: false,
254
+ cache: true,
255
+ with_column_names: nil,
256
+ infer_schema_length: 100,
257
+ n_rows: nil,
258
+ encoding: "utf8",
259
+ low_memory: false,
260
+ rechunk: true,
261
+ skip_rows_after_header: 0,
262
+ row_count_name: nil,
263
+ row_count_offset: 0,
264
+ parse_dates: false,
265
+ eol_char: "\n"
266
+ )
267
+ _check_arg_is_1byte("sep", sep, false)
268
+ _check_arg_is_1byte("comment_char", comment_char, false)
269
+ _check_arg_is_1byte("quote_char", quote_char, true)
270
+
271
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
272
+ file = Utils.format_path(file)
273
+ end
274
+
275
+ LazyFrame._scan_csv(
276
+ file,
277
+ has_header: has_header,
278
+ sep: sep,
279
+ comment_char: comment_char,
280
+ quote_char: quote_char,
281
+ skip_rows: skip_rows,
282
+ dtypes: dtypes,
283
+ null_values: null_values,
284
+ ignore_errors: ignore_errors,
285
+ cache: cache,
286
+ with_column_names: with_column_names,
287
+ infer_schema_length: infer_schema_length,
288
+ n_rows: n_rows,
289
+ low_memory: low_memory,
290
+ rechunk: rechunk,
291
+ skip_rows_after_header: skip_rows_after_header,
292
+ encoding: encoding,
293
+ row_count_name: row_count_name,
294
+ row_count_offset: row_count_offset,
295
+ parse_dates: parse_dates,
296
+ eol_char: eol_char,
297
+ )
298
+ end
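For contrast with the eager reader, a sketch of the lazy scan; the path and column names are hypothetical, and it assumes `Polars.col` and the LazyFrame `filter`/`select`/`collect` chain from the expanded lazy API in this release:

    # nothing is read until collect; predicates and projections are pushed down to the scan
    lazy = Polars.scan_csv("data/example.csv", parse_dates: true)
    df = lazy.filter(Polars.col("value") > 0).select(["id", "value"]).collect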
299
+
300
+ # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
301
+ #
302
+ # This allows the query optimizer to push down predicates and projections to the scan
303
+ # level, thereby potentially reducing memory overhead.
304
+ #
305
+ # @param file [String]
306
+ # Path to an IPC file.
307
+ # @param n_rows [Integer]
308
+ # Stop reading from IPC file after reading `n_rows`.
309
+ # @param cache [Boolean]
310
+ # Cache the result after reading.
311
+ # @param rechunk [Boolean]
312
+ # Reallocate to contiguous memory when all chunks/files are parsed.
313
+ # @param row_count_name [String]
314
+ # If not nil, this will insert a row count column with the given name into the
315
+ # DataFrame.
316
+ # @param row_count_offset [Integer]
317
+ # Offset to start the row_count column (only used if the name is set).
318
+ # @param storage_options [Hash]
319
+ # Extra options that make sense for a particular storage connection.
320
+ # @param memory_map [Boolean]
321
+ # Try to memory map the file. This can greatly improve performance on repeated
322
+ # queries as the OS may cache pages.
323
+ # Only uncompressed IPC files can be memory mapped.
324
+ #
325
+ # @return [LazyFrame]
326
+ def scan_ipc(
327
+ file,
328
+ n_rows: nil,
329
+ cache: true,
330
+ rechunk: true,
331
+ row_count_name: nil,
332
+ row_count_offset: 0,
333
+ storage_options: nil,
334
+ memory_map: true
335
+ )
336
+ LazyFrame._scan_ipc(
337
+ file,
338
+ n_rows: n_rows,
339
+ cache: cache,
340
+ rechunk: rechunk,
341
+ row_count_name: row_count_name,
342
+ row_count_offset: row_count_offset,
343
+ storage_options: storage_options,
344
+ memory_map: memory_map
345
+ )
346
+ end
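A minimal sketch of the IPC scan, assuming an uncompressed file so the memory map can be used; the path and row count column name are hypothetical:

    lazy = Polars.scan_ipc("data/example.arrow", memory_map: true, row_count_name: "row_nr")
    df = lazy.collect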
347
+
348
+ # Lazily read from a parquet file or multiple files via glob patterns.
349
+ #
350
+ # This allows the query optimizer to push down predicates and projections to the scan
351
+ # level, thereby potentially reducing memory overhead.
352
+ #
353
+ # @param file [String]
354
+ # Path to a file.
355
+ # @param n_rows [Integer]
356
+ # Stop reading from parquet file after reading `n_rows`.
357
+ # @param cache [Boolean]
358
+ # Cache the result after reading.
359
+ # @param parallel ["auto", "columns", "row_groups", "none"]
360
+ # This determines the direction of parallelism. 'auto' will try to determine the
361
+ # optimal direction.
362
+ # @param rechunk [Boolean]
363
+ # In case of reading multiple files via a glob pattern rechunk the final DataFrame
364
+ # into contiguous memory chunks.
365
+ # @param row_count_name [String]
366
+ # If not nil, this will insert a row count column with the given name into the
367
+ # DataFrame.
368
+ # @param row_count_offset [Integer]
369
+ # Offset to start the row_count column (only used if the name is set).
370
+ # @param storage_options [Hash]
371
+ # Extra options that make sense for a particular storage connection.
372
+ # @param low_memory [Boolean]
373
+ # Reduce memory pressure at the expense of performance.
374
+ #
375
+ # @return [LazyFrame]
376
+ def scan_parquet(
377
+ file,
378
+ n_rows: nil,
379
+ cache: true,
380
+ parallel: "auto",
381
+ rechunk: true,
382
+ row_count_name: nil,
383
+ row_count_offset: 0,
384
+ storage_options: nil,
385
+ low_memory: false
386
+ )
387
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
388
+ file = Utils.format_path(file)
389
+ end
390
+
391
+ LazyFrame._scan_parquet(
392
+ file,
393
+ n_rows: n_rows,
394
+ cache: cache,
395
+ parallel: parallel,
396
+ rechunk: rechunk,
397
+ row_count_name: row_count_name,
398
+ row_count_offset: row_count_offset,
399
+ storage_options: storage_options,
400
+ low_memory: low_memory
401
+ )
402
+ end
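A sketch of the Parquet scan with an explicit parallelism mode; the path and column name are hypothetical:

    # "columns" parallelism, plus a row count column added at scan time
    lazy = Polars.scan_parquet("data/example.parquet", parallel: "columns", row_count_name: "row_nr")
    df = lazy.collect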
403
+
404
+ # Lazily read from a newline delimited JSON file.
405
+ #
406
+ # This allows the query optimizer to push down predicates and projections to the scan
407
+ # level, thereby potentially reducing memory overhead.
408
+ #
409
+ # @param file [String]
410
+ # Path to a file.
411
+ # @param infer_schema_length [Integer]
412
+ # Infer the schema length from the first `infer_schema_length` rows.
413
+ # @param batch_size [Integer]
414
+ # Number of rows to read in each batch.
415
+ # @param n_rows [Integer]
416
+ # Stop reading from JSON file after reading `n_rows`.
417
+ # @param low_memory [Boolean]
418
+ # Reduce memory pressure at the expense of performance.
419
+ # @param rechunk [Boolean]
420
+ # Reallocate to contiguous memory when all chunks/files are parsed.
421
+ # @param row_count_name [String]
422
+ # If not nil, this will insert a row count column with the given name into the
423
+ # DataFrame.
424
+ # @param row_count_offset [Integer]
425
+ # Offset to start the row_count column (only used if the name is set).
426
+ #
427
+ # @return [LazyFrame]
428
+ def scan_ndjson(
429
+ file,
430
+ infer_schema_length: 100,
431
+ batch_size: 1024,
432
+ n_rows: nil,
433
+ low_memory: false,
434
+ rechunk: true,
435
+ row_count_name: nil,
436
+ row_count_offset: 0
437
+ )
438
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
439
+ file = Utils.format_path(file)
440
+ end
441
+
442
+ LazyFrame._scan_ndjson(
443
+ file,
444
+ infer_schema_length: infer_schema_length,
445
+ batch_size: batch_size,
446
+ n_rows: n_rows,
447
+ low_memory: low_memory,
448
+ rechunk: rechunk,
449
+ row_count_name: row_count_name,
450
+ row_count_offset: row_count_offset,
451
+ )
452
+ end
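A sketch of the newline-delimited JSON scan; the path and limits are hypothetical:

    lazy = Polars.scan_ndjson("data/example.ndjson", batch_size: 4096, n_rows: 10_000)
    df = lazy.collect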
453
+
454
+ # def read_avro
455
+ # end
456
+
457
+ # Read into a DataFrame from an Arrow IPC (Feather v2) file.
458
+ #
459
+ # @param file [Object]
460
+ # Path to a file or a file-like object.
461
+ # @param columns [Object]
462
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
463
+ # of column names.
464
+ # @param n_rows [Integer]
465
+ # Stop reading from IPC file after reading `n_rows`.
466
+ # @param memory_map [Boolean]
467
+ # Try to memory map the file. This can greatly improve performance on repeated
468
+ # queries as the OS may cache pages.
469
+ # Only uncompressed IPC files can be memory mapped.
470
+ # @param storage_options [Hash]
471
+ # Extra options that make sense for a particular storage connection.
472
+ # @param row_count_name [String]
473
+ # If not nil, this will insert a row count column with the given name into the
474
+ # DataFrame.
475
+ # @param row_count_offset [Integer]
476
+ # Offset to start the row_count column (only used if the name is set).
477
+ # @param rechunk [Boolean]
478
+ # Make sure that all data is contiguous.
479
+ #
480
+ # @return [DataFrame]
481
+ def read_ipc(
482
+ file,
483
+ columns: nil,
484
+ n_rows: nil,
485
+ memory_map: true,
486
+ storage_options: nil,
487
+ row_count_name: nil,
488
+ row_count_offset: 0,
489
+ rechunk: true
490
+ )
491
+ storage_options ||= {}
492
+ _prepare_file_arg(file, **storage_options) do |data|
493
+ DataFrame._read_ipc(
494
+ data,
495
+ columns: columns,
496
+ n_rows: n_rows,
497
+ row_count_name: row_count_name,
498
+ row_count_offset: row_count_offset,
499
+ rechunk: rechunk,
500
+ memory_map: memory_map
501
+ )
6
502
  end
7
503
  end
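A sketch of the eager IPC read restricted to a column subset; the path and column names are hypothetical:

    df = Polars.read_ipc("data/example.arrow", columns: ["id", "value"], memory_map: true)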
8
504
 
9
- def read_parquet(file)
505
+ # Read into a DataFrame from a parquet file.
506
+ #
507
+ # @param file [Object]
508
+ # Path to a file, or a file-like object.
509
+ # @param columns [Object]
510
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
511
+ # of column names.
512
+ # @param n_rows [Integer]
513
+ # Stop reading from parquet file after reading `n_rows`.
514
+ # @param storage_options [Hash]
515
+ # Extra options that make sense for a particular storage connection.
516
+ # @param parallel ["auto", "columns", "row_groups", "none"]
517
+ # This determines the direction of parallelism. 'auto' will try to determine the
518
+ # optimal direction.
519
+ # @param row_count_name [String]
520
+ # If not nil, this will insert a row count column with the given name into the
521
+ # DataFrame.
522
+ # @param row_count_offset [Integer]
523
+ # Offset to start the row_count column (only used if the name is set).
524
+ # @param low_memory [Boolean]
525
+ # Reduce memory pressure at the expense of performance.
526
+ #
527
+ # @return [DataFrame]
528
+ #
529
+ # @note
530
+ # This operation defaults to a `rechunk` operation at the end, meaning that
531
+ # all data will be stored contiguously in memory.
532
+ # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
533
+ # an expensive operation.
534
+ def read_parquet(
535
+ file,
536
+ columns: nil,
537
+ n_rows: nil,
538
+ storage_options: nil,
539
+ parallel: "auto",
540
+ row_count_name: nil,
541
+ row_count_offset: 0,
542
+ low_memory: false
543
+ )
10
544
  _prepare_file_arg(file) do |data|
11
- DataFrame._read_parquet(data)
545
+ DataFrame._read_parquet(
546
+ data,
547
+ columns: columns,
548
+ n_rows: n_rows,
549
+ parallel: parallel,
550
+ row_count_name: row_count_name,
551
+ row_count_offset: row_count_offset,
552
+ low_memory: low_memory
553
+ )
12
554
  end
13
555
  end
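A sketch of the eager Parquet read with a row limit and a row count column; the path is hypothetical:

    df = Polars.read_parquet("data/example.parquet", n_rows: 500, row_count_name: "row_nr")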
14
556
 
557
+ # Read into a DataFrame from a JSON file.
558
+ #
559
+ # @param file [Object]
560
+ # Path to a file or a file-like object.
561
+ #
562
+ # @return [DataFrame]
15
563
  def read_json(file)
16
564
  DataFrame._read_json(file)
17
565
  end
18
566
 
567
+ # Read into a DataFrame from a newline delimited JSON file.
568
+ #
569
+ # @param file [Object]
570
+ # Path to a file or a file-like object.
571
+ #
572
+ # @return [DataFrame]
19
573
  def read_ndjson(file)
20
574
  DataFrame._read_ndjson(file)
21
575
  end
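Both JSON readers above take only the file argument; a sketch with hypothetical paths:

    df_json = Polars.read_json("data/example.json")       # a single JSON document
    df_ndjson = Polars.read_ndjson("data/example.ndjson") # one object per line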
22
576
 
577
+ # def read_sql
578
+ # end
579
+
580
+ # def read_excel
581
+ # end
582
+
583
+ # Read a CSV file in batches.
584
+ #
585
+ # Upon creation of the `BatchedCsvReader`,
586
+ # Polars will gather statistics and determine the
587
+ # file chunks. After that, work will only be done
588
+ # if `next_batches` is called.
589
+ #
590
+ # @param file [Object]
591
+ # Path to a file or a file-like object.
592
+ # @param has_header [Boolean]
593
+ # Indicate if the first row of the dataset is a header or not.
594
+ # If set to false, column names will be autogenerated in the
595
+ # following format: `column_x`, with `x` being an
596
+ # enumeration over every column in the dataset starting at 1.
597
+ # @param columns [Object]
598
+ # Columns to select. Accepts a list of column indices (starting
599
+ # at zero) or a list of column names.
600
+ # @param new_columns [Object]
601
+ # Rename columns right after parsing the CSV file. If the given
602
+ # list is shorter than the width of the DataFrame the remaining
603
+ # columns will have their original name.
604
+ # @param sep [String]
605
+ # Single byte character to use as delimiter in the file.
606
+ # @param comment_char [String]
607
+ # Single byte character that indicates the start of a comment line,
608
+ # for instance `#`.
609
+ # @param quote_char [String]
610
+ # Single byte character used for csv quoting, default = `"`.
611
+ # Set to nil to turn off special handling and escaping of quotes.
612
+ # @param skip_rows [Integer]
613
+ # Start reading after `skip_rows` lines.
614
+ # @param dtypes [Object]
615
+ # Overwrite dtypes during inference.
616
+ # @param null_values [Object]
617
+ # Values to interpret as null values. You can provide a:
618
+ #
619
+ # - `String`: All values equal to this string will be null.
620
+ # - `Array`: All values equal to any string in this array will be null.
621
+ # - `Hash`: A hash that maps column name to a null value string.
622
+ # @param ignore_errors [Boolean]
623
+ # Try to keep reading lines if some lines yield errors.
624
+ # First try `infer_schema_length: 0` to read all columns as
625
+ # `:str` to check which values might cause an issue.
626
+ # @param parse_dates [Boolean]
627
+ # Try to automatically parse dates. If this does not succeed,
628
+ # the column remains of data type `:str`.
629
+ # @param n_threads [Integer]
630
+ # Number of threads to use in csv parsing.
631
+ # Defaults to the number of physical CPUs of your system.
632
+ # @param infer_schema_length [Integer]
633
+ # Maximum number of lines to read to infer schema.
634
+ # If set to 0, all columns will be read as `:str`.
635
+ # If set to `nil`, a full table scan will be done (slow).
636
+ # @param batch_size [Integer]
637
+ # Number of lines to read into the buffer at once.
638
+ # Modify this to change performance.
639
+ # @param n_rows [Integer]
640
+ # Stop reading from CSV file after reading `n_rows`.
641
+ # During multi-threaded parsing, an upper bound of `n_rows`
642
+ # rows cannot be guaranteed.
643
+ # @param encoding ["utf8", "utf8-lossy"]
644
+ # Lossy means that invalid utf8 values are replaced with `�`
645
+ # characters. When using other encodings than `utf8` or
646
+ # `utf8-lossy`, the input is first decoded in memory with
647
+ # Ruby. Defaults to `utf8`.
648
+ # @param low_memory [Boolean]
649
+ # Reduce memory usage at the expense of performance.
650
+ # @param rechunk [Boolean]
651
+ # Make sure that all columns are contiguous in memory by
652
+ # aggregating the chunks into a single array.
653
+ # @param skip_rows_after_header [Integer]
654
+ # Skip this number of rows when the header is parsed.
655
+ # @param row_count_name [String]
656
+ # If not nil, this will insert a row count column with the given name into
657
+ # the DataFrame.
658
+ # @param row_count_offset [Integer]
659
+ # Offset to start the row_count column (only used if the name is set).
660
+ # @param sample_size [Integer]
661
+ # Set the sample size. This is used to sample statistics to estimate the
662
+ # allocation needed.
663
+ # @param eol_char [String]
664
+ # Single byte end of line character.
665
+ #
666
+ # @return [BatchedCsvReader]
667
+ #
668
+ # @example
669
+ # reader = Polars.read_csv_batched(
670
+ # "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
671
+ # )
672
+ # reader.next_batches(5)
673
+ def read_csv_batched(
674
+ file,
675
+ has_header: true,
676
+ columns: nil,
677
+ new_columns: nil,
678
+ sep: ",",
679
+ comment_char: nil,
680
+ quote_char: '"',
681
+ skip_rows: 0,
682
+ dtypes: nil,
683
+ null_values: nil,
684
+ ignore_errors: false,
685
+ parse_dates: false,
686
+ n_threads: nil,
687
+ infer_schema_length: 100,
688
+ batch_size: 50_000,
689
+ n_rows: nil,
690
+ encoding: "utf8",
691
+ low_memory: false,
692
+ rechunk: true,
693
+ skip_rows_after_header: 0,
694
+ row_count_name: nil,
695
+ row_count_offset: 0,
696
+ sample_size: 1024,
697
+ eol_char: "\n"
698
+ )
699
+ projection, columns = Utils.handle_projection_columns(columns)
700
+
701
+ if columns && !has_header
702
+ columns.each do |column|
703
+ if !column.start_with?("column_")
704
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
705
+ end
706
+ end
707
+ end
708
+
709
+ if projection || new_columns
710
+ raise Todo
711
+ end
712
+
713
+ BatchedCsvReader.new(
714
+ file,
715
+ has_header: has_header,
716
+ columns: columns || projection,
717
+ sep: sep,
718
+ comment_char: comment_char,
719
+ quote_char: quote_char,
720
+ skip_rows: skip_rows,
721
+ dtypes: dtypes,
722
+ null_values: null_values,
723
+ ignore_errors: ignore_errors,
724
+ parse_dates: parse_dates,
725
+ n_threads: n_threads,
726
+ infer_schema_length: infer_schema_length,
727
+ batch_size: batch_size,
728
+ n_rows: n_rows,
729
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
730
+ low_memory: low_memory,
731
+ rechunk: rechunk,
732
+ skip_rows_after_header: skip_rows_after_header,
733
+ row_count_name: row_count_name,
734
+ row_count_offset: row_count_offset,
735
+ sample_size: sample_size,
736
+ eol_char: eol_char,
737
+ new_columns: new_columns
738
+ )
739
+ end
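Building on the @example above, a sketch of draining the batched reader; the path is hypothetical, and it assumes `next_batches` returns nil once the file is exhausted:

    reader = Polars.read_csv_batched("data/example.csv", batch_size: 100_000)
    while (batches = reader.next_batches(5))
      batches.each { |df| puts df.shape.inspect }
    end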
740
+
741
+ # Get a schema of the IPC file without reading data.
742
+ #
743
+ # @param file [Object]
744
+ # Path to a file or a file-like object.
745
+ #
746
+ # @return [Hash]
747
+ def read_ipc_schema(file)
748
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
749
+ file = Utils.format_path(file)
750
+ end
751
+
752
+ _ipc_schema(file)
753
+ end
754
+
755
+ # Get a schema of the Parquet file without reading data.
756
+ #
757
+ # @param file [Object]
758
+ # Path to a file or a file-like object.
759
+ #
760
+ # @return [Hash]
761
+ def read_parquet_schema(file)
762
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
763
+ file = Utils.format_path(file)
764
+ end
765
+
766
+ _parquet_schema(file)
767
+ end
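A sketch of the two schema helpers, which return a hash of column names to dtypes without loading any rows; the paths and the exact dtype spelling are illustrative:

    Polars.read_ipc_schema("data/example.arrow")
    Polars.read_parquet_schema("data/example.parquet") # e.g. {"id" => :i64, "value" => :f64}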
768
+
23
769
  private
24
770
 
25
771
  def _prepare_file_arg(file)
@@ -35,5 +781,18 @@ module Polars
35
781
 
36
782
  yield file
37
783
  end
784
+
785
+ def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
786
+ if arg.is_a?(String)
787
+ arg_byte_length = arg.bytesize
788
+ if can_be_empty
789
+ if arg_byte_length > 1
790
+ raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
791
+ end
792
+ elsif arg_byte_length != 1
793
+ raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
794
+ end
795
+ end
796
+ end
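To illustrate the guard's logic (it is a private helper, so the calls below are shown only to demonstrate the byte-length behavior): a one-byte string passes, a multi-byte string raises, an empty string passes only when `can_be_empty` is true, and non-String values such as nil are skipped entirely:

    _check_arg_is_1byte("sep", ",", false)          # passes: "," is 1 byte
    _check_arg_is_1byte("sep", "€", false)          # raises ArgumentError: "€" is 3 bytes in UTF-8
    _check_arg_is_1byte("quote_char", "", true)     # passes: empty allowed when can_be_empty
    _check_arg_is_1byte("comment_char", nil, false) # no-op: nil is not a String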
38
797
  end
39
798
  end