polars-df 0.10.0-x86_64-linux-musl

Files changed (67)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +175 -0
  4. data/Cargo.lock +2536 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +38726 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +98 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +72 -0
  18. data/lib/polars/cat_name_space.rb +125 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +93 -0
  21. data/lib/polars/data_frame.rb +5418 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1444 -0
  24. data/lib/polars/date_time_name_space.rb +1484 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +31 -0
  27. data/lib/polars/expr.rb +6105 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +248 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1280 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +103 -0
  39. data/lib/polars/functions/range/int_range.rb +51 -0
  40. data/lib/polars/functions/repeat.rb +144 -0
  41. data/lib/polars/functions/whenthen.rb +96 -0
  42. data/lib/polars/functions.rb +57 -0
  43. data/lib/polars/group_by.rb +548 -0
  44. data/lib/polars/io.rb +890 -0
  45. data/lib/polars/lazy_frame.rb +2833 -0
  46. data/lib/polars/lazy_group_by.rb +84 -0
  47. data/lib/polars/list_expr.rb +791 -0
  48. data/lib/polars/list_name_space.rb +445 -0
  49. data/lib/polars/meta_expr.rb +222 -0
  50. data/lib/polars/name_expr.rb +198 -0
  51. data/lib/polars/plot.rb +109 -0
  52. data/lib/polars/rolling_group_by.rb +37 -0
  53. data/lib/polars/series.rb +4527 -0
  54. data/lib/polars/slice.rb +104 -0
  55. data/lib/polars/sql_context.rb +194 -0
  56. data/lib/polars/string_cache.rb +75 -0
  57. data/lib/polars/string_expr.rb +1519 -0
  58. data/lib/polars/string_name_space.rb +810 -0
  59. data/lib/polars/struct_expr.rb +98 -0
  60. data/lib/polars/struct_name_space.rb +96 -0
  61. data/lib/polars/testing.rb +507 -0
  62. data/lib/polars/utils.rb +422 -0
  63. data/lib/polars/version.rb +4 -0
  64. data/lib/polars/whenthen.rb +83 -0
  65. data/lib/polars-df.rb +1 -0
  66. data/lib/polars.rb +72 -0
  67. metadata +125 -0
data/lib/polars/io.rb ADDED
@@ -0,0 +1,890 @@
+ module Polars
+ module IO
+ # Read a CSV file into a DataFrame.
+ #
+ # @param source [Object]
+ # Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ # Indicate if the first row of the dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting
+ # at zero) or a list of column names.
+ # @param new_columns [Object]
+ # Rename columns right after parsing the CSV file. If the given
+ # list is shorter than the width of the DataFrame the remaining
+ # columns will have their original name.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ # Number of threads to use in csv parsing.
+ # Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:utf8`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ # Number of lines to read into the buffer at once.
+ # Modify this to change performance.
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # During multi-threaded parsing, an upper bound of `n_rows`
+ # rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters. When using other encodings than `utf8` or
+ # `utf8-lossy`, the input is first decoded in memory with
+ # Ruby.
+ # @param low_memory [Boolean]
+ # Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ # Make sure that all columns are contiguous in memory by
+ # aggregating the chunks into a single array.
+ # @param storage_options [Hash]
+ # Extra options that make sense for a
+ # particular storage connection.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ # Set the sample size. This is used to sample statistics to estimate the
+ # allocation needed.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ # This operation defaults to a `rechunk` operation at the end, meaning that
+ # all data will be stored contiguously in memory.
+ # Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
+ # an expensive operation.
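+ #
+ # @example
+ #   # Illustrative sketch only: "data.csv" is a placeholder path, and the
+ #   # keyword arguments are the documented options above.
+ #   df = Polars.read_csv("data.csv", sep: ",", parse_dates: true)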
+ def read_csv(
+ source,
+ has_header: true,
+ columns: nil,
+ new_columns: nil,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ parse_dates: false,
+ n_threads: nil,
+ infer_schema_length: 100,
+ batch_size: 8192,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ storage_options: nil,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ sample_size: 1024,
+ eol_char: "\n",
+ truncate_ragged_lines: false
+ )
+ Utils._check_arg_is_1byte("sep", sep, false)
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
+ Utils._check_arg_is_1byte("eol_char", eol_char, false)
+
+ projection, columns = Utils.handle_projection_columns(columns)
+
+ storage_options ||= {}
+
+ if columns && !has_header
+ columns.each do |column|
+ if !column.start_with?("column_")
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+ end
+ end
+ end
+
+ if projection || new_columns
+ raise Todo
+ end
+
+ df = nil
+ _prepare_file_arg(source) do |data|
+ df = DataFrame._read_csv(
+ data,
+ has_header: has_header,
+ columns: columns || projection,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ parse_dates: parse_dates,
+ n_threads: n_threads,
+ infer_schema_length: infer_schema_length,
+ batch_size: batch_size,
+ n_rows: n_rows,
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ sample_size: sample_size,
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines
+ )
+ end
+
+ if new_columns
+ Utils._update_columns(df, new_columns)
+ else
+ df
+ end
+ end
+
+ # Lazily read from a CSV file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and
+ # projections to the scan level, thereby potentially reducing
+ # memory overhead.
+ #
+ # @param source [Object]
+ # Path to a file.
+ # @param has_header [Boolean]
+ # Indicate if the first row of the dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting.
+ # Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines. The header will be parsed at this
+ # offset.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param with_column_names [Object]
+ # Apply a function over the column names.
+ # This can be used to update a schema just in time, thus before
+ # scanning.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters.
+ # @param low_memory [Boolean]
+ # Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
+ #
+ # @return [LazyFrame]
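+ #
+ # @example
+ #   # Illustrative sketch only; "data.csv" and column "a" are placeholders.
+ #   lf = Polars.scan_csv("data.csv")
+ #   df = lf.select(Polars.col("a")).collect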
+ def scan_csv(
+ source,
+ has_header: true,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ cache: true,
+ with_column_names: nil,
+ infer_schema_length: 100,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ parse_dates: false,
+ eol_char: "\n",
+ truncate_ragged_lines: false
+ )
+ Utils._check_arg_is_1byte("sep", sep, false)
+ Utils._check_arg_is_1byte("comment_char", comment_char, false)
+ Utils._check_arg_is_1byte("quote_char", quote_char, true)
+
+ if Utils.pathlike?(source)
+ source = Utils.normalise_filepath(source)
+ end
+
+ LazyFrame._scan_csv(
+ source,
+ has_header: has_header,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ cache: cache,
+ with_column_names: with_column_names,
+ infer_schema_length: infer_schema_length,
+ n_rows: n_rows,
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ encoding: encoding,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ parse_dates: parse_dates,
+ eol_char: eol_char,
+ truncate_ragged_lines: truncate_ragged_lines
+ )
+ end
+
+ # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param source [String]
+ # Path to an IPC file.
+ # @param n_rows [Integer]
+ # Stop reading from IPC file after reading `n_rows`.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param memory_map [Boolean]
+ # Try to memory map the file. This can greatly improve performance on repeated
+ # queries as the OS may cache pages.
+ # Only uncompressed IPC files can be memory mapped.
+ #
+ # @return [LazyFrame]
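+ #
+ # @example
+ #   # Illustrative sketch only; "data.arrow" is a placeholder path.
+ #   df = Polars.scan_ipc("data.arrow", n_rows: 100).collect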
+ def scan_ipc(
+ source,
+ n_rows: nil,
+ cache: true,
+ rechunk: true,
+ row_count_name: nil,
+ row_count_offset: 0,
+ storage_options: nil,
+ memory_map: true
+ )
+ LazyFrame._scan_ipc(
+ source,
+ n_rows: n_rows,
+ cache: cache,
+ rechunk: rechunk,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ storage_options: storage_options,
+ memory_map: memory_map
+ )
+ end
+
+ # Lazily read from a parquet file or multiple files via glob patterns.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param source [String]
+ # Path to a file.
+ # @param n_rows [Integer]
+ # Stop reading from parquet file after reading `n_rows`.
+ # @param cache [Boolean]
+ # Cache the result after reading.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ # This determines the direction of parallelism. 'auto' will try to determine the
+ # optimal direction.
+ # @param rechunk [Boolean]
+ # In case of reading multiple files via a glob pattern, rechunk the final DataFrame
+ # into contiguous memory chunks.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param low_memory [Boolean]
+ # Reduce memory pressure at the expense of performance.
+ #
+ # @return [LazyFrame]
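+ #
+ # @example
+ #   # Illustrative sketch only; the path and column name are placeholders.
+ #   lf = Polars.scan_parquet("data.parquet")
+ #   df = lf.filter(Polars.col("a") > 1).collect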
+ def scan_parquet(
+ source,
+ n_rows: nil,
+ cache: true,
+ parallel: "auto",
+ rechunk: true,
+ row_count_name: nil,
+ row_count_offset: 0,
+ storage_options: nil,
+ low_memory: false
+ )
+ if Utils.pathlike?(source)
+ source = Utils.normalise_filepath(source)
+ end
+
+ LazyFrame._scan_parquet(
+ source,
+ n_rows: n_rows,
+ cache: cache,
+ parallel: parallel,
+ rechunk: rechunk,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ storage_options: storage_options,
+ low_memory: low_memory
+ )
+ end
+
+ # Lazily read from a newline delimited JSON file.
+ #
+ # This allows the query optimizer to push down predicates and projections to the scan
+ # level, thereby potentially reducing memory overhead.
+ #
+ # @param source [String]
+ # Path to a file.
+ # @param infer_schema_length [Integer]
+ # Infer the schema length from the first `infer_schema_length` rows.
+ # @param batch_size [Integer]
+ # Number of rows to read in each batch.
+ # @param n_rows [Integer]
+ # Stop reading from JSON file after reading `n_rows`.
+ # @param low_memory [Boolean]
+ # Reduce memory pressure at the expense of performance.
+ # @param rechunk [Boolean]
+ # Reallocate to contiguous memory when all chunks/files are parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ #
+ # @return [LazyFrame]
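+ #
+ # @example
+ #   # Illustrative sketch only; "data.ndjson" is a placeholder path.
+ #   df = Polars.scan_ndjson("data.ndjson").collect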
+ def scan_ndjson(
+ source,
+ infer_schema_length: 100,
+ batch_size: 1024,
+ n_rows: nil,
+ low_memory: false,
+ rechunk: true,
+ row_count_name: nil,
+ row_count_offset: 0
+ )
+ if Utils.pathlike?(source)
+ source = Utils.normalise_filepath(source)
+ end
+
+ LazyFrame._scan_ndjson(
+ source,
+ infer_schema_length: infer_schema_length,
+ batch_size: batch_size,
+ n_rows: n_rows,
+ low_memory: low_memory,
+ rechunk: rechunk,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset
+ )
+ end
+
+ # Read into a DataFrame from Apache Avro format.
+ #
+ # @param source [Object]
+ # Path to a file or a file-like object.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
+ # of column names.
+ # @param n_rows [Integer]
+ # Stop reading from Apache Avro file after reading `n_rows`.
+ #
+ # @return [DataFrame]
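+ #
+ # @example
+ #   # Illustrative sketch only; the path and column names are placeholders.
+ #   df = Polars.read_avro("data.avro", columns: ["a", "b"])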
+ def read_avro(source, columns: nil, n_rows: nil)
+ if Utils.pathlike?(source)
+ source = Utils.normalise_filepath(source)
+ end
+
+ DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
+ end
+
+ # Read into a DataFrame from an Arrow IPC (Feather v2) file.
+ #
+ # @param source [Object]
+ # Path to a file or a file-like object.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
+ # of column names.
+ # @param n_rows [Integer]
+ # Stop reading from IPC file after reading `n_rows`.
+ # @param memory_map [Boolean]
+ # Try to memory map the file. This can greatly improve performance on repeated
+ # queries as the OS may cache pages.
+ # Only uncompressed IPC files can be memory mapped.
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param rechunk [Boolean]
+ # Make sure that all data is contiguous.
+ #
+ # @return [DataFrame]
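+ #
+ # @example
+ #   # Illustrative sketch only; "data.arrow" is a placeholder path.
+ #   df = Polars.read_ipc("data.arrow", n_rows: 100)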
+ def read_ipc(
+ source,
+ columns: nil,
+ n_rows: nil,
+ memory_map: true,
+ storage_options: nil,
+ row_count_name: nil,
+ row_count_offset: 0,
+ rechunk: true
+ )
+ storage_options ||= {}
+ _prepare_file_arg(source, **storage_options) do |data|
+ DataFrame._read_ipc(
+ data,
+ columns: columns,
+ n_rows: n_rows,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ rechunk: rechunk,
+ memory_map: memory_map
+ )
+ end
+ end
+
+ # Read into a DataFrame from a parquet file.
+ #
+ # @param source [String, Pathname, StringIO]
+ # Path to a file or a file-like object.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting at zero) or a list
+ # of column names.
+ # @param n_rows [Integer]
+ # Stop reading from parquet file after reading `n_rows`.
+ # @param storage_options [Hash]
+ # Extra options that make sense for a particular storage connection.
+ # @param parallel ["auto", "columns", "row_groups", "none"]
+ # This determines the direction of parallelism. 'auto' will try to determine the
+ # optimal direction.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into the
+ # DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param low_memory [Boolean]
+ # Reduce memory pressure at the expense of performance.
+ # @param use_statistics [Boolean]
+ # Use statistics in the parquet to determine if pages
+ # can be skipped from reading.
+ # @param rechunk [Boolean]
+ # Make sure that all columns are contiguous in memory by
+ # aggregating the chunks into a single array.
+ #
+ # @return [DataFrame]
+ #
+ # @note
+ # This operation defaults to a `rechunk` operation at the end, meaning that
+ # all data will be stored contiguously in memory.
+ # Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
+ # an expensive operation.
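+ #
+ # @example
+ #   # Illustrative sketch only; the path and column name are placeholders.
+ #   df = Polars.read_parquet("data.parquet", columns: ["a"], n_rows: 1000)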
+ def read_parquet(
+ source,
+ columns: nil,
+ n_rows: nil,
+ storage_options: nil,
+ parallel: "auto",
+ row_count_name: nil,
+ row_count_offset: 0,
+ low_memory: false,
+ use_statistics: true,
+ rechunk: true
+ )
+ _prepare_file_arg(source) do |data|
+ DataFrame._read_parquet(
+ data,
+ columns: columns,
+ n_rows: n_rows,
+ parallel: parallel,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ low_memory: low_memory,
+ use_statistics: use_statistics,
+ rechunk: rechunk
+ )
+ end
+ end
+
+ # Read into a DataFrame from a JSON file.
+ #
+ # @param source [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
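+ #
+ # @example
+ #   # Illustrative sketch only; "data.json" is a placeholder path.
+ #   df = Polars.read_json("data.json")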
+ def read_json(source)
+ DataFrame._read_json(source)
+ end
+
+ # Read into a DataFrame from a newline delimited JSON file.
+ #
+ # @param source [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [DataFrame]
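+ #
+ # @example
+ #   # Illustrative sketch only; "data.ndjson" is a placeholder path.
+ #   df = Polars.read_ndjson("data.ndjson")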
+ def read_ndjson(source)
+ DataFrame._read_ndjson(source)
+ end
+
+ # Read a SQL query into a DataFrame.
+ #
+ # @param query [Object]
+ # An ActiveRecord::Relation, an ActiveRecord::Result, or a SQL string.
+ # @param schema_overrides [Hash]
+ # A hash mapping column names to dtypes, used to override the schema
+ # inferred from the query.
+ #
+ # @return [DataFrame]
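+ #
+ # @example
+ #   # Illustrative sketch only: assumes Active Record is loaded and that a
+ #   # "users" table and a User model exist; the schema override is a placeholder.
+ #   df = Polars.read_database("SELECT * FROM users")
+ #   df = Polars.read_database(User.all, schema_overrides: {"id" => Polars::Int64})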
+ def read_database(query, schema_overrides: nil)
+ if !defined?(ActiveRecord)
+ raise Error, "Active Record not available"
+ end
+
+ result =
+ if query.is_a?(ActiveRecord::Result)
+ query
+ elsif query.is_a?(ActiveRecord::Relation)
+ query.connection.select_all(query.to_sql)
+ elsif query.is_a?(::String)
+ ActiveRecord::Base.connection.select_all(query)
+ else
+ raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
+ end
+
+ data = {}
+ schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)
+
+ result.columns.each_with_index do |k, i|
+ column_type = result.column_types[i]
+
+ data[k] =
+ if column_type
+ result.rows.map { |r| column_type.deserialize(r[i]) }
+ else
+ result.rows.map { |r| r[i] }
+ end
+
+ polars_type =
+ case column_type&.type
+ when :binary
+ Binary
+ when :boolean
+ Boolean
+ when :date
+ Date
+ when :datetime, :timestamp
+ Datetime
+ when :decimal
+ Decimal
+ when :float
+ Float64
+ when :integer
+ Int64
+ when :string, :text
+ String
+ when :time
+ Time
+ # TODO fix issue with null
+ # when :json, :jsonb
+ # Struct
+ end
+
+ schema_overrides[k] ||= polars_type if polars_type
+ end
+
+ DataFrame.new(data, schema_overrides: schema_overrides)
+ end
+ alias_method :read_sql, :read_database
+
+ # def read_excel
+ # end
+
+ # Read a CSV file in batches.
+ #
+ # Upon creation of the `BatchedCsvReader`, Polars will gather statistics
+ # and determine the file chunks. After that, work will only be done
+ # if `next_batches` is called.
+ #
+ # @param source [Object]
+ # Path to a file or a file-like object.
+ # @param has_header [Boolean]
+ # Indicate if the first row of the dataset is a header or not.
+ # If set to false, column names will be autogenerated in the
+ # following format: `column_x`, with `x` being an
+ # enumeration over every column in the dataset starting at 1.
+ # @param columns [Object]
+ # Columns to select. Accepts a list of column indices (starting
+ # at zero) or a list of column names.
+ # @param new_columns [Object]
+ # Rename columns right after parsing the CSV file. If the given
+ # list is shorter than the width of the DataFrame the remaining
+ # columns will have their original name.
+ # @param sep [String]
+ # Single byte character to use as delimiter in the file.
+ # @param comment_char [String]
+ # Single byte character that indicates the start of a comment line,
+ # for instance `#`.
+ # @param quote_char [String]
+ # Single byte character used for csv quoting, default = `"`.
+ # Set to nil to turn off special handling and escaping of quotes.
+ # @param skip_rows [Integer]
+ # Start reading after `skip_rows` lines.
+ # @param dtypes [Object]
+ # Overwrite dtypes during inference.
+ # @param null_values [Object]
+ # Values to interpret as null values. You can provide a:
+ #
+ # - `String`: All values equal to this string will be null.
+ # - `Array`: All values equal to any string in this array will be null.
+ # - `Hash`: A hash that maps column name to a null value string.
+ # @param ignore_errors [Boolean]
+ # Try to keep reading lines if some lines yield errors.
+ # First try `infer_schema_length: 0` to read all columns as
+ # `:str` to check which values might cause an issue.
+ # @param parse_dates [Boolean]
+ # Try to automatically parse dates. If this does not succeed,
+ # the column remains of data type `:str`.
+ # @param n_threads [Integer]
+ # Number of threads to use in csv parsing.
+ # Defaults to the number of physical CPUs of your system.
+ # @param infer_schema_length [Integer]
+ # Maximum number of lines to read to infer schema.
+ # If set to 0, all columns will be read as `:str`.
+ # If set to `nil`, a full table scan will be done (slow).
+ # @param batch_size [Integer]
+ # Number of lines to read into the buffer at once.
+ # Modify this to change performance.
+ # @param n_rows [Integer]
+ # Stop reading from CSV file after reading `n_rows`.
+ # During multi-threaded parsing, an upper bound of `n_rows`
+ # rows cannot be guaranteed.
+ # @param encoding ["utf8", "utf8-lossy"]
+ # Lossy means that invalid utf8 values are replaced with `�`
+ # characters. When using other encodings than `utf8` or
+ # `utf8-lossy`, the input is first decoded in memory with
+ # Ruby. Defaults to `utf8`.
+ # @param low_memory [Boolean]
+ # Reduce memory usage at the expense of performance.
+ # @param rechunk [Boolean]
+ # Make sure that all columns are contiguous in memory by
+ # aggregating the chunks into a single array.
+ # @param skip_rows_after_header [Integer]
+ # Skip this number of rows when the header is parsed.
+ # @param row_count_name [String]
+ # If not nil, this will insert a row count column with the given name into
+ # the DataFrame.
+ # @param row_count_offset [Integer]
+ # Offset to start the row_count column (only used if the name is set).
+ # @param sample_size [Integer]
+ # Set the sample size. This is used to sample statistics to estimate the
+ # allocation needed.
+ # @param eol_char [String]
+ # Single byte end of line character.
+ # @param truncate_ragged_lines [Boolean]
+ # Truncate lines that are longer than the schema.
+ #
+ # @return [BatchedCsvReader]
+ #
+ # @example
+ # reader = Polars.read_csv_batched(
+ # "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
+ # )
+ # reader.next_batches(5)
+ def read_csv_batched(
+ source,
+ has_header: true,
+ columns: nil,
+ new_columns: nil,
+ sep: ",",
+ comment_char: nil,
+ quote_char: '"',
+ skip_rows: 0,
+ dtypes: nil,
+ null_values: nil,
+ ignore_errors: false,
+ parse_dates: false,
+ n_threads: nil,
+ infer_schema_length: 100,
+ batch_size: 50_000,
+ n_rows: nil,
+ encoding: "utf8",
+ low_memory: false,
+ rechunk: true,
+ skip_rows_after_header: 0,
+ row_count_name: nil,
+ row_count_offset: 0,
+ sample_size: 1024,
+ eol_char: "\n",
+ truncate_ragged_lines: false
+ )
+ projection, columns = Utils.handle_projection_columns(columns)
+
+ if columns && !has_header
+ columns.each do |column|
+ if !column.start_with?("column_")
+ raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+ end
+ end
+ end
+
+ if projection || new_columns
+ raise Todo
+ end
+
+ BatchedCsvReader.new(
+ source,
+ has_header: has_header,
+ columns: columns || projection,
+ sep: sep,
+ comment_char: comment_char,
+ quote_char: quote_char,
+ skip_rows: skip_rows,
+ dtypes: dtypes,
+ null_values: null_values,
+ ignore_errors: ignore_errors,
+ parse_dates: parse_dates,
+ n_threads: n_threads,
+ infer_schema_length: infer_schema_length,
+ batch_size: batch_size,
+ n_rows: n_rows,
+ encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+ low_memory: low_memory,
+ rechunk: rechunk,
+ skip_rows_after_header: skip_rows_after_header,
+ row_count_name: row_count_name,
+ row_count_offset: row_count_offset,
+ sample_size: sample_size,
+ eol_char: eol_char,
+ new_columns: new_columns,
+ truncate_ragged_lines: truncate_ragged_lines
+ )
+ end
+
+ # Get a schema of the IPC file without reading data.
+ #
+ # @param source [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [Hash]
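+ #
+ # @example
+ #   # Illustrative sketch only; "data.arrow" is a placeholder path. Returns
+ #   # a hash mapping column names to dtypes without reading the data.
+ #   schema = Polars.read_ipc_schema("data.arrow")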
+ def read_ipc_schema(source)
+ if Utils.pathlike?(source)
+ source = Utils.normalise_filepath(source)
+ end
+
+ Plr.ipc_schema(source)
+ end
+
+ # Get a schema of the Parquet file without reading data.
+ #
+ # @param source [Object]
+ # Path to a file or a file-like object.
+ #
+ # @return [Hash]
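+ #
+ # @example
+ #   # Illustrative sketch only; "data.parquet" is a placeholder path. Returns
+ #   # a hash mapping column names to dtypes without reading the data.
+ #   schema = Polars.read_parquet_schema("data.parquet")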
+ def read_parquet_schema(source)
+ if Utils.pathlike?(source)
+ source = Utils.normalise_filepath(source)
+ end
+
+ Plr.parquet_schema(source)
+ end
+
+ private
+
+ def _prepare_file_arg(file)
+ if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
+ raise ArgumentError, "use URI(...) for remote files"
+ end
+
+ if defined?(URI) && file.is_a?(URI)
+ require "open-uri"
+
+ file = file.open
+ end
+
+ yield file
+ end
+ end
+ end