polars-df 0.2.0-x86_64-linux

Files changed (46)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +33 -0
  4. data/Cargo.lock +2230 -0
  5. data/Cargo.toml +10 -0
  6. data/LICENSE-THIRD-PARTY.txt +38828 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +91 -0
  9. data/lib/polars/3.0/polars.so +0 -0
  10. data/lib/polars/3.1/polars.so +0 -0
  11. data/lib/polars/3.2/polars.so +0 -0
  12. data/lib/polars/batched_csv_reader.rb +96 -0
  13. data/lib/polars/cat_expr.rb +52 -0
  14. data/lib/polars/cat_name_space.rb +54 -0
  15. data/lib/polars/convert.rb +100 -0
  16. data/lib/polars/data_frame.rb +4833 -0
  17. data/lib/polars/data_types.rb +122 -0
  18. data/lib/polars/date_time_expr.rb +1418 -0
  19. data/lib/polars/date_time_name_space.rb +1484 -0
  20. data/lib/polars/dynamic_group_by.rb +52 -0
  21. data/lib/polars/exceptions.rb +20 -0
  22. data/lib/polars/expr.rb +5307 -0
  23. data/lib/polars/expr_dispatch.rb +22 -0
  24. data/lib/polars/functions.rb +453 -0
  25. data/lib/polars/group_by.rb +558 -0
  26. data/lib/polars/io.rb +814 -0
  27. data/lib/polars/lazy_frame.rb +2442 -0
  28. data/lib/polars/lazy_functions.rb +1195 -0
  29. data/lib/polars/lazy_group_by.rb +93 -0
  30. data/lib/polars/list_expr.rb +610 -0
  31. data/lib/polars/list_name_space.rb +346 -0
  32. data/lib/polars/meta_expr.rb +54 -0
  33. data/lib/polars/rolling_group_by.rb +35 -0
  34. data/lib/polars/series.rb +3730 -0
  35. data/lib/polars/slice.rb +104 -0
  36. data/lib/polars/string_expr.rb +972 -0
  37. data/lib/polars/string_name_space.rb +690 -0
  38. data/lib/polars/struct_expr.rb +100 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +192 -0
  41. data/lib/polars/version.rb +4 -0
  42. data/lib/polars/when.rb +16 -0
  43. data/lib/polars/when_then.rb +19 -0
  44. data/lib/polars-df.rb +1 -0
  45. data/lib/polars.rb +50 -0
  46. metadata +89 -0
data/lib/polars/io.rb ADDED

module Polars
  module IO
    # Read a CSV file into a DataFrame.
    #
    # @param file [Object]
    #   Path to a file or a file-like object.
    # @param has_header [Boolean]
    #   Indicate if the first row of dataset is a header or not.
    #   If set to false, column names will be autogenerated in the
    #   following format: `column_x`, with `x` being an
    #   enumeration over every column in the dataset starting at 1.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting
    #   at zero) or a list of column names.
    # @param new_columns [Object]
    #   Rename columns right after parsing the CSV file. If the given
    #   list is shorter than the width of the DataFrame the remaining
    #   columns will have their original name.
    # @param sep [String]
    #   Single byte character to use as delimiter in the file.
    # @param comment_char [String]
    #   Single byte character that indicates the start of a comment line,
    #   for instance `#`.
    # @param quote_char [String]
    #   Single byte character used for CSV quoting.
    #   Set to nil to turn off special handling and escaping of quotes.
    # @param skip_rows [Integer]
    #   Start reading after `skip_rows` lines.
    # @param dtypes [Object]
    #   Overwrite dtypes during inference.
    # @param null_values [Object]
    #   Values to interpret as null values. You can provide a:
    #
    #   - `String`: All values equal to this string will be null.
    #   - `Array`: All values equal to any string in this array will be null.
    #   - `Hash`: A hash that maps column name to a null value string.
    # @param ignore_errors [Boolean]
    #   Try to keep reading lines if some lines yield errors.
    #   First try `infer_schema_length: 0` to read all columns as
    #   `:str` to check which values might cause an issue.
    # @param parse_dates [Boolean]
    #   Try to automatically parse dates. If this does not succeed,
    #   the column remains of data type `:str`.
    # @param n_threads [Integer]
    #   Number of threads to use in CSV parsing.
    #   Defaults to the number of physical CPUs of your system.
    # @param infer_schema_length [Integer]
    #   Maximum number of lines to read to infer schema.
    #   If set to 0, all columns will be read as `:utf8`.
    #   If set to `nil`, a full table scan will be done (slow).
    # @param batch_size [Integer]
    #   Number of lines to read into the buffer at once.
    #   Modify this to change performance.
    # @param n_rows [Integer]
    #   Stop reading from the CSV file after reading `n_rows`.
    #   During multi-threaded parsing, an upper bound of `n_rows`
    #   rows cannot be guaranteed.
    # @param encoding ["utf8", "utf8-lossy"]
    #   Lossy means that invalid utf8 values are replaced with `�`
    #   characters. When using encodings other than `utf8` or
    #   `utf8-lossy`, the input is first decoded in memory with
    #   Ruby.
    # @param low_memory [Boolean]
    #   Reduce memory usage at the expense of performance.
    # @param rechunk [Boolean]
    #   Make sure that all columns are contiguous in memory by
    #   aggregating the chunks into a single array.
    # @param storage_options [Hash]
    #   Extra options that make sense for a particular storage connection.
    # @param skip_rows_after_header [Integer]
    #   Skip this number of rows when the header is parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into
    #   the DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param sample_size [Integer]
    #   Set the sample size. This is used to sample statistics to estimate the
    #   allocation needed.
    # @param eol_char [String]
    #   Single byte end of line character.
    #
    # @return [DataFrame]
    #
    # @note
    #   This operation defaults to a `rechunk` operation at the end, meaning that
    #   all data will be stored contiguously in memory.
    #   Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
    #   an expensive operation.
    def read_csv(
      file,
      has_header: true,
      columns: nil,
      new_columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 8192,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      storage_options: nil,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n"
    )
      _check_arg_is_1byte("sep", sep, false)
      _check_arg_is_1byte("comment_char", comment_char, false)
      _check_arg_is_1byte("quote_char", quote_char, true)
      _check_arg_is_1byte("eol_char", eol_char, false)

      projection, columns = Utils.handle_projection_columns(columns)

      storage_options ||= {}

      if columns && !has_header
        columns.each do |column|
          if !column.start_with?("column_")
            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
          end
        end
      end

      if projection || new_columns
        raise Todo
      end

      df = nil
      _prepare_file_arg(file) do |data|
        df = DataFrame._read_csv(
          data,
          has_header: has_header,
          columns: columns || projection,
          sep: sep,
          comment_char: comment_char,
          quote_char: quote_char,
          skip_rows: skip_rows,
          dtypes: dtypes,
          null_values: null_values,
          ignore_errors: ignore_errors,
          parse_dates: parse_dates,
          n_threads: n_threads,
          infer_schema_length: infer_schema_length,
          batch_size: batch_size,
          n_rows: n_rows,
          encoding: encoding == "utf8-lossy" ? encoding : "utf8",
          low_memory: low_memory,
          rechunk: rechunk,
          skip_rows_after_header: skip_rows_after_header,
          row_count_name: row_count_name,
          row_count_offset: row_count_offset,
          sample_size: sample_size,
          eol_char: eol_char
        )
      end

      if new_columns
        Utils._update_columns(df, new_columns)
      else
        df
      end
    end
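
    # A minimal usage sketch (editorial illustration; the file name, separator
    # and column name below are assumed, not taken from the gem):
    #
    #   df = Polars.read_csv("orders.csv", sep: ";", parse_dates: true, n_rows: 10_000)
    #   df = Polars.read_csv("orders.csv", null_values: {"price" => "N/A"})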

    # Lazily read from a CSV file or multiple files via glob patterns.
    #
    # This allows the query optimizer to push down predicates and
    # projections to the scan level, thereby potentially reducing
    # memory overhead.
    #
    # @param file [Object]
    #   Path to a file.
    # @param has_header [Boolean]
    #   Indicate if the first row of dataset is a header or not.
    #   If set to false, column names will be autogenerated in the
    #   following format: `column_x`, with `x` being an
    #   enumeration over every column in the dataset starting at 1.
    # @param sep [String]
    #   Single byte character to use as delimiter in the file.
    # @param comment_char [String]
    #   Single byte character that indicates the start of a comment line,
    #   for instance `#`.
    # @param quote_char [String]
    #   Single byte character used for CSV quoting.
    #   Set to nil to turn off special handling and escaping of quotes.
    # @param skip_rows [Integer]
    #   Start reading after `skip_rows` lines. The header will be parsed at this
    #   offset.
    # @param dtypes [Object]
    #   Overwrite dtypes during inference.
    # @param null_values [Object]
    #   Values to interpret as null values. You can provide a:
    #
    #   - `String`: All values equal to this string will be null.
    #   - `Array`: All values equal to any string in this array will be null.
    #   - `Hash`: A hash that maps column name to a null value string.
    # @param ignore_errors [Boolean]
    #   Try to keep reading lines if some lines yield errors.
    #   First try `infer_schema_length: 0` to read all columns as
    #   `:str` to check which values might cause an issue.
    # @param cache [Boolean]
    #   Cache the result after reading.
    # @param with_column_names [Object]
    #   Apply a function over the column names.
    #   This can be used to update a schema just in time, thus before
    #   scanning.
    # @param infer_schema_length [Integer]
    #   Maximum number of lines to read to infer schema.
    #   If set to 0, all columns will be read as `:str`.
    #   If set to `nil`, a full table scan will be done (slow).
    # @param n_rows [Integer]
    #   Stop reading from the CSV file after reading `n_rows`.
    # @param encoding ["utf8", "utf8-lossy"]
    #   Lossy means that invalid utf8 values are replaced with `�`
    #   characters.
    # @param low_memory [Boolean]
    #   Reduce memory usage at the expense of performance.
    # @param rechunk [Boolean]
    #   Reallocate to contiguous memory when all chunks/files are parsed.
    # @param skip_rows_after_header [Integer]
    #   Skip this number of rows when the header is parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into
    #   the DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param parse_dates [Boolean]
    #   Try to automatically parse dates. If this does not succeed,
    #   the column remains of data type `:str`.
    # @param eol_char [String]
    #   Single byte end of line character.
    #
    # @return [LazyFrame]
    def scan_csv(
      file,
      has_header: true,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      cache: true,
      with_column_names: nil,
      infer_schema_length: 100,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      parse_dates: false,
      eol_char: "\n"
    )
      _check_arg_is_1byte("sep", sep, false)
      _check_arg_is_1byte("comment_char", comment_char, false)
      _check_arg_is_1byte("quote_char", quote_char, true)

      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
        file = Utils.format_path(file)
      end

      LazyFrame._scan_csv(
        file,
        has_header: has_header,
        sep: sep,
        comment_char: comment_char,
        quote_char: quote_char,
        skip_rows: skip_rows,
        dtypes: dtypes,
        null_values: null_values,
        ignore_errors: ignore_errors,
        cache: cache,
        with_column_names: with_column_names,
        infer_schema_length: infer_schema_length,
        n_rows: n_rows,
        low_memory: low_memory,
        rechunk: rechunk,
        skip_rows_after_header: skip_rows_after_header,
        encoding: encoding,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        parse_dates: parse_dates,
        eol_char: eol_char
      )
    end
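
    # Sketch of the lazy path (the file name and the filter are assumed; `col`,
    # `filter` and `collect` come from the lazy API in lazy_functions.rb and
    # lazy_frame.rb): the predicate can be pushed down into the scan instead of
    # reading the whole file first.
    #
    #   lf = Polars.scan_csv("orders.csv")
    #   df = lf.filter(Polars.col("qty") > 0).collect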

    # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
    #
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
    # @param file [String]
    #   Path to an IPC file.
    # @param n_rows [Integer]
    #   Stop reading from the IPC file after reading `n_rows`.
    # @param cache [Boolean]
    #   Cache the result after reading.
    # @param rechunk [Boolean]
    #   Reallocate to contiguous memory when all chunks/files are parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param storage_options [Hash]
    #   Extra options that make sense for a particular storage connection.
    # @param memory_map [Boolean]
    #   Try to memory map the file. This can greatly improve performance on repeated
    #   queries as the OS may cache pages.
    #   Only uncompressed IPC files can be memory mapped.
    #
    # @return [LazyFrame]
    def scan_ipc(
      file,
      n_rows: nil,
      cache: true,
      rechunk: true,
      row_count_name: nil,
      row_count_offset: 0,
      storage_options: nil,
      memory_map: true
    )
      LazyFrame._scan_ipc(
        file,
        n_rows: n_rows,
        cache: cache,
        rechunk: rechunk,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        storage_options: storage_options,
        memory_map: memory_map
      )
    end
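
    # Sketch (path assumed): scan lazily and cap the rows via the documented
    # `n_rows` option before collecting.
    #
    #   Polars.scan_ipc("data.arrow", n_rows: 5, memory_map: true).collect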

    # Lazily read from a parquet file or multiple files via glob patterns.
    #
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
    # @param file [String]
    #   Path to a file.
    # @param n_rows [Integer]
    #   Stop reading from the parquet file after reading `n_rows`.
    # @param cache [Boolean]
    #   Cache the result after reading.
    # @param parallel ["auto", "columns", "row_groups", "none"]
    #   This determines the direction of parallelism. 'auto' will try to determine the
    #   optimal direction.
    # @param rechunk [Boolean]
    #   In case of reading multiple files via a glob pattern, rechunk the final DataFrame
    #   into contiguous memory chunks.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param storage_options [Hash]
    #   Extra options that make sense for a particular storage connection.
    # @param low_memory [Boolean]
    #   Reduce memory pressure at the expense of performance.
    #
    # @return [LazyFrame]
    def scan_parquet(
      file,
      n_rows: nil,
      cache: true,
      parallel: "auto",
      rechunk: true,
      row_count_name: nil,
      row_count_offset: 0,
      storage_options: nil,
      low_memory: false
    )
      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
        file = Utils.format_path(file)
      end

      LazyFrame._scan_parquet(
        file,
        n_rows: n_rows,
        cache: cache,
        parallel: parallel,
        rechunk: rechunk,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        storage_options: storage_options,
        low_memory: low_memory
      )
    end
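
    # Sketch (glob pattern assumed): multiple parquet files can be scanned at
    # once and rechunked into one contiguous DataFrame on collect.
    #
    #   lf = Polars.scan_parquet("data/part-*.parquet", rechunk: true)
    #   df = lf.collect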

    # Lazily read from a newline delimited JSON file.
    #
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
    # @param file [String]
    #   Path to a file.
    # @param infer_schema_length [Integer]
    #   Infer the schema length from the first `infer_schema_length` rows.
    # @param batch_size [Integer]
    #   Number of rows to read in each batch.
    # @param n_rows [Integer]
    #   Stop reading from the JSON file after reading `n_rows`.
    # @param low_memory [Boolean]
    #   Reduce memory pressure at the expense of performance.
    # @param rechunk [Boolean]
    #   Reallocate to contiguous memory when all chunks/files are parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    #
    # @return [LazyFrame]
    def scan_ndjson(
      file,
      infer_schema_length: 100,
      batch_size: 1024,
      n_rows: nil,
      low_memory: false,
      rechunk: true,
      row_count_name: nil,
      row_count_offset: 0
    )
      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
        file = Utils.format_path(file)
      end

      LazyFrame._scan_ndjson(
        file,
        infer_schema_length: infer_schema_length,
        batch_size: batch_size,
        n_rows: n_rows,
        low_memory: low_memory,
        rechunk: rechunk,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset
      )
    end
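
    # Sketch (path assumed): newline delimited JSON is scanned lazily; a row
    # count column can be added via the documented options.
    #
    #   Polars.scan_ndjson("events.ndjson", row_count_name: "row_nr").collect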

    # Read into a DataFrame from Apache Avro format.
    #
    # @param file [Object]
    #   Path to a file or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
    #   of column names.
    # @param n_rows [Integer]
    #   Stop reading from the Apache Avro file after reading `n_rows`.
    #
    # @return [DataFrame]
    def read_avro(file, columns: nil, n_rows: nil)
      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
        file = Utils.format_path(file)
      end

      DataFrame._read_avro(file, n_rows: n_rows, columns: columns)
    end
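
    # Sketch (path and column names assumed):
    #
    #   df = Polars.read_avro("data.avro", columns: ["id", "name"], n_rows: 100)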

    # Read into a DataFrame from an Arrow IPC (Feather v2) file.
    #
    # @param file [Object]
    #   Path to a file or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
    #   of column names.
    # @param n_rows [Integer]
    #   Stop reading from the IPC file after reading `n_rows`.
    # @param memory_map [Boolean]
    #   Try to memory map the file. This can greatly improve performance on repeated
    #   queries as the OS may cache pages.
    #   Only uncompressed IPC files can be memory mapped.
    # @param storage_options [Hash]
    #   Extra options that make sense for a particular storage connection.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param rechunk [Boolean]
    #   Make sure that all data is contiguous.
    #
    # @return [DataFrame]
    def read_ipc(
      file,
      columns: nil,
      n_rows: nil,
      memory_map: true,
      storage_options: nil,
      row_count_name: nil,
      row_count_offset: 0,
      rechunk: true
    )
      storage_options ||= {}
      _prepare_file_arg(file, **storage_options) do |data|
        DataFrame._read_ipc(
          data,
          columns: columns,
          n_rows: n_rows,
          row_count_name: row_count_name,
          row_count_offset: row_count_offset,
          rechunk: rechunk,
          memory_map: memory_map
        )
      end
    end
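
    # Sketch (path assumed): memory mapping only applies to uncompressed IPC
    # files, as noted above.
    #
    #   df = Polars.read_ipc("data.arrow", memory_map: true, row_count_name: "row_nr")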

    # Read into a DataFrame from a parquet file.
    #
    # @param file [Object]
    #   Path to a file, or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
    #   of column names.
    # @param n_rows [Integer]
    #   Stop reading from the parquet file after reading `n_rows`.
    # @param storage_options [Hash]
    #   Extra options that make sense for a particular storage connection.
    # @param parallel ["auto", "columns", "row_groups", "none"]
    #   This determines the direction of parallelism. 'auto' will try to determine the
    #   optimal direction.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param low_memory [Boolean]
    #   Reduce memory pressure at the expense of performance.
    #
    # @return [DataFrame]
    #
    # @note
    #   This operation defaults to a `rechunk` operation at the end, meaning that
    #   all data will be stored contiguously in memory.
    #   Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
    #   an expensive operation.
    def read_parquet(
      file,
      columns: nil,
      n_rows: nil,
      storage_options: nil,
      parallel: "auto",
      row_count_name: nil,
      row_count_offset: 0,
      low_memory: false
    )
      _prepare_file_arg(file) do |data|
        DataFrame._read_parquet(
          data,
          columns: columns,
          n_rows: n_rows,
          parallel: parallel,
          row_count_name: row_count_name,
          row_count_offset: row_count_offset,
          low_memory: low_memory
        )
      end
    end
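
    # Sketch (path assumed): column projection and the parallelism strategy are
    # both documented parameters.
    #
    #   df = Polars.read_parquet("data.parquet", columns: [0, 1], parallel: "columns")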

    # Read into a DataFrame from a JSON file.
    #
    # @param file [Object]
    #   Path to a file or a file-like object.
    #
    # @return [DataFrame]
    def read_json(file)
      DataFrame._read_json(file)
    end

    # Read into a DataFrame from a newline delimited JSON file.
    #
    # @param file [Object]
    #   Path to a file or a file-like object.
    #
    # @return [DataFrame]
    def read_ndjson(file)
      DataFrame._read_ndjson(file)
    end
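
    # Sketch (paths assumed):
    #
    #   Polars.read_json("data.json")
    #   Polars.read_ndjson("data.ndjson")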

    # def read_sql
    # end

    # def read_excel
    # end

    # Read a CSV file in batches.
    #
    # Upon creation of the `BatchedCsvReader`,
    # polars will gather statistics and determine the
    # file chunks. After that, work will only be done
    # if `next_batches` is called.
    #
    # @param file [Object]
    #   Path to a file or a file-like object.
    # @param has_header [Boolean]
    #   Indicate if the first row of dataset is a header or not.
    #   If set to false, column names will be autogenerated in the
    #   following format: `column_x`, with `x` being an
    #   enumeration over every column in the dataset starting at 1.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting
    #   at zero) or a list of column names.
    # @param new_columns [Object]
    #   Rename columns right after parsing the CSV file. If the given
    #   list is shorter than the width of the DataFrame the remaining
    #   columns will have their original name.
    # @param sep [String]
    #   Single byte character to use as delimiter in the file.
    # @param comment_char [String]
    #   Single byte character that indicates the start of a comment line,
    #   for instance `#`.
    # @param quote_char [String]
    #   Single byte character used for CSV quoting, default = `"`.
    #   Set to nil to turn off special handling and escaping of quotes.
    # @param skip_rows [Integer]
    #   Start reading after `skip_rows` lines.
    # @param dtypes [Object]
    #   Overwrite dtypes during inference.
    # @param null_values [Object]
    #   Values to interpret as null values. You can provide a:
    #
    #   - `String`: All values equal to this string will be null.
    #   - `Array`: All values equal to any string in this array will be null.
    #   - `Hash`: A hash that maps column name to a null value string.
    # @param ignore_errors [Boolean]
    #   Try to keep reading lines if some lines yield errors.
    #   First try `infer_schema_length: 0` to read all columns as
    #   `:str` to check which values might cause an issue.
    # @param parse_dates [Boolean]
    #   Try to automatically parse dates. If this does not succeed,
    #   the column remains of data type `:str`.
    # @param n_threads [Integer]
    #   Number of threads to use in CSV parsing.
    #   Defaults to the number of physical CPUs of your system.
    # @param infer_schema_length [Integer]
    #   Maximum number of lines to read to infer schema.
    #   If set to 0, all columns will be read as `:str`.
    #   If set to `nil`, a full table scan will be done (slow).
    # @param batch_size [Integer]
    #   Number of lines to read into the buffer at once.
    #   Modify this to change performance.
    # @param n_rows [Integer]
    #   Stop reading from the CSV file after reading `n_rows`.
    #   During multi-threaded parsing, an upper bound of `n_rows`
    #   rows cannot be guaranteed.
    # @param encoding ["utf8", "utf8-lossy"]
    #   Lossy means that invalid utf8 values are replaced with `�`
    #   characters. When using encodings other than `utf8` or
    #   `utf8-lossy`, the input is first decoded in memory with
    #   Ruby. Defaults to `utf8`.
    # @param low_memory [Boolean]
    #   Reduce memory usage at the expense of performance.
    # @param rechunk [Boolean]
    #   Make sure that all columns are contiguous in memory by
    #   aggregating the chunks into a single array.
    # @param skip_rows_after_header [Integer]
    #   Skip this number of rows when the header is parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into
    #   the DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param sample_size [Integer]
    #   Set the sample size. This is used to sample statistics to estimate the
    #   allocation needed.
    # @param eol_char [String]
    #   Single byte end of line character.
    #
    # @return [BatchedCsvReader]
    #
    # @example
    #   reader = Polars.read_csv_batched(
    #     "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
    #   )
    #   reader.next_batches(5)
    def read_csv_batched(
      file,
      has_header: true,
      columns: nil,
      new_columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n"
    )
      projection, columns = Utils.handle_projection_columns(columns)

      if columns && !has_header
        columns.each do |column|
          if !column.start_with?("column_")
            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
          end
        end
      end

      if projection || new_columns
        raise Todo
      end

      BatchedCsvReader.new(
        file,
        has_header: has_header,
        columns: columns || projection,
        sep: sep,
        comment_char: comment_char,
        quote_char: quote_char,
        skip_rows: skip_rows,
        dtypes: dtypes,
        null_values: null_values,
        ignore_errors: ignore_errors,
        parse_dates: parse_dates,
        n_threads: n_threads,
        infer_schema_length: infer_schema_length,
        batch_size: batch_size,
        n_rows: n_rows,
        encoding: encoding == "utf8-lossy" ? encoding : "utf8",
        low_memory: low_memory,
        rechunk: rechunk,
        skip_rows_after_header: skip_rows_after_header,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        sample_size: sample_size,
        eol_char: eol_char,
        new_columns: new_columns
      )
    end
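
    # Sketch of draining the reader (path assumed; this presumes `next_batches`
    # returns nil once the file is exhausted, and `process` is a placeholder):
    #
    #   reader = Polars.read_csv_batched("big.csv", batch_size: 100_000)
    #   while (batches = reader.next_batches(10))
    #     batches.each { |df| process(df) }
    #   end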

    # Get a schema of the IPC file without reading data.
    #
    # @param file [Object]
    #   Path to a file or a file-like object.
    #
    # @return [Hash]
    def read_ipc_schema(file)
      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
        file = Utils.format_path(file)
      end

      _ipc_schema(file)
    end

    # Get a schema of the Parquet file without reading data.
    #
    # @param file [Object]
    #   Path to a file or a file-like object.
    #
    # @return [Hash]
    def read_parquet_schema(file)
      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
        file = Utils.format_path(file)
      end

      _parquet_schema(file)
    end
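
    # Sketch (paths assumed): both helpers return a Hash mapping column names to
    # dtypes without loading the data.
    #
    #   Polars.read_ipc_schema("data.arrow")
    #   Polars.read_parquet_schema("data.parquet")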

    private

    # Resolve the file argument before yielding it to a reader: reject bare
    # http(s) strings and open URI objects via open-uri. Keyword options are
    # accepted for interface compatibility but currently unused.
    def _prepare_file_arg(file, **_storage_options)
      if file.is_a?(String) && file =~ /\Ahttps?:\/\//
        raise ArgumentError, "use URI(...) for remote files"
      end

      if defined?(URI) && file.is_a?(URI)
        require "open-uri"

        file = URI.open(file)
      end

      yield file
    end

    # Validate that a delimiter/comment/quote/eol argument is a single byte
    # (or empty, when allowed).
    def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
      if arg.is_a?(String)
        arg_byte_length = arg.bytesize
        if can_be_empty
          if arg_byte_length > 1
            raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
          end
        elsif arg_byte_length != 1
          raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
        end
      end
    end
  end
end