polars-df 0.10.0-x86_64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +175 -0
  4. data/Cargo.lock +2536 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +38726 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +98 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +72 -0
  18. data/lib/polars/cat_name_space.rb +125 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +93 -0
  21. data/lib/polars/data_frame.rb +5418 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1444 -0
  24. data/lib/polars/date_time_name_space.rb +1484 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +31 -0
  27. data/lib/polars/expr.rb +6105 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +248 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1280 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +103 -0
  39. data/lib/polars/functions/range/int_range.rb +51 -0
  40. data/lib/polars/functions/repeat.rb +144 -0
  41. data/lib/polars/functions/whenthen.rb +96 -0
  42. data/lib/polars/functions.rb +57 -0
  43. data/lib/polars/group_by.rb +548 -0
  44. data/lib/polars/io.rb +890 -0
  45. data/lib/polars/lazy_frame.rb +2833 -0
  46. data/lib/polars/lazy_group_by.rb +84 -0
  47. data/lib/polars/list_expr.rb +791 -0
  48. data/lib/polars/list_name_space.rb +445 -0
  49. data/lib/polars/meta_expr.rb +222 -0
  50. data/lib/polars/name_expr.rb +198 -0
  51. data/lib/polars/plot.rb +109 -0
  52. data/lib/polars/rolling_group_by.rb +37 -0
  53. data/lib/polars/series.rb +4527 -0
  54. data/lib/polars/slice.rb +104 -0
  55. data/lib/polars/sql_context.rb +194 -0
  56. data/lib/polars/string_cache.rb +75 -0
  57. data/lib/polars/string_expr.rb +1519 -0
  58. data/lib/polars/string_name_space.rb +810 -0
  59. data/lib/polars/struct_expr.rb +98 -0
  60. data/lib/polars/struct_name_space.rb +96 -0
  61. data/lib/polars/testing.rb +507 -0
  62. data/lib/polars/utils.rb +422 -0
  63. data/lib/polars/version.rb +4 -0
  64. data/lib/polars/whenthen.rb +83 -0
  65. data/lib/polars-df.rb +1 -0
  66. data/lib/polars.rb +72 -0
  67. metadata +125 -0
data/lib/polars/io.rb ADDED
@@ -0,0 +1,890 @@
module Polars
  module IO
    # Read a CSV file into a DataFrame.
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param has_header [Boolean]
    #   Indicate if the first row of the dataset is a header or not.
    #   If set to false, column names will be autogenerated in the
    #   following format: `column_x`, with `x` being an
    #   enumeration over every column in the dataset starting at 1.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting
    #   at zero) or a list of column names.
    # @param new_columns [Object]
    #   Rename columns right after parsing the CSV file. If the given
    #   list is shorter than the width of the DataFrame the remaining
    #   columns will have their original name.
    # @param sep [String]
    #   Single byte character to use as delimiter in the file.
    # @param comment_char [String]
    #   Single byte character that indicates the start of a comment line,
    #   for instance `#`.
    # @param quote_char [String]
    #   Single byte character used for CSV quoting.
    #   Set to nil to turn off special handling and escaping of quotes.
    # @param skip_rows [Integer]
    #   Start reading after `skip_rows` lines.
    # @param dtypes [Object]
    #   Overwrite dtypes during inference.
    # @param null_values [Object]
    #   Values to interpret as null values. You can provide a:
    #
    #   - `String`: All values equal to this string will be null.
    #   - `Array`: All values equal to any string in this array will be null.
    #   - `Hash`: A hash that maps column name to a null value string.
    # @param ignore_errors [Boolean]
    #   Try to keep reading lines if some lines yield errors.
    #   First try `infer_schema_length: 0` to read all columns as
    #   `:str` to check which values might cause an issue.
    # @param parse_dates [Boolean]
    #   Try to automatically parse dates. If this does not succeed,
    #   the column remains of data type `:str`.
    # @param n_threads [Integer]
    #   Number of threads to use in CSV parsing.
    #   Defaults to the number of physical CPUs of your system.
    # @param infer_schema_length [Integer]
    #   Maximum number of lines to read to infer schema.
    #   If set to 0, all columns will be read as `:utf8`.
    #   If set to `nil`, a full table scan will be done (slow).
    # @param batch_size [Integer]
    #   Number of lines to read into the buffer at once.
    #   Modify this to change performance.
    # @param n_rows [Integer]
    #   Stop reading from the CSV file after reading `n_rows`.
    #   During multi-threaded parsing, an upper bound of `n_rows`
    #   rows cannot be guaranteed.
    # @param encoding ["utf8", "utf8-lossy"]
    #   Lossy means that invalid utf8 values are replaced with `�`
    #   characters. When using other encodings than `utf8` or
    #   `utf8-lossy`, the input is first decoded in memory with
    #   Ruby.
    # @param low_memory [Boolean]
    #   Reduce memory usage at the expense of performance.
    # @param rechunk [Boolean]
    #   Make sure that all columns are contiguous in memory by
    #   aggregating the chunks into a single array.
    # @param storage_options [Hash]
    #   Extra options that make sense for a
    #   particular storage connection.
    # @param skip_rows_after_header [Integer]
    #   Skip this number of rows when the header is parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into
    #   the DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param sample_size [Integer]
    #   Set the sample size. This is used to sample statistics to estimate the
    #   allocation needed.
    # @param eol_char [String]
    #   Single byte end of line character.
    # @param truncate_ragged_lines [Boolean]
    #   Truncate lines that are longer than the schema.
    #
    # @return [DataFrame]
    #
    # @note
    #   This operation defaults to a `rechunk` operation at the end, meaning that
    #   all data will be stored contiguously in memory.
    #   Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
    #   an expensive operation.
    def read_csv(
      source,
      has_header: true,
      columns: nil,
      new_columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 8192,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      storage_options: nil,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      truncate_ragged_lines: false
    )
      Utils._check_arg_is_1byte("sep", sep, false)
      Utils._check_arg_is_1byte("comment_char", comment_char, false)
      Utils._check_arg_is_1byte("quote_char", quote_char, true)
      Utils._check_arg_is_1byte("eol_char", eol_char, false)

      projection, columns = Utils.handle_projection_columns(columns)

      storage_options ||= {}

      if columns && !has_header
        columns.each do |column|
          if !column.start_with?("column_")
            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
          end
        end
      end

      if projection || new_columns
        raise Todo
      end

      df = nil
      _prepare_file_arg(source) do |data|
        df = DataFrame._read_csv(
          data,
          has_header: has_header,
          columns: columns || projection,
          sep: sep,
          comment_char: comment_char,
          quote_char: quote_char,
          skip_rows: skip_rows,
          dtypes: dtypes,
          null_values: null_values,
          ignore_errors: ignore_errors,
          parse_dates: parse_dates,
          n_threads: n_threads,
          infer_schema_length: infer_schema_length,
          batch_size: batch_size,
          n_rows: n_rows,
          encoding: encoding == "utf8-lossy" ? encoding : "utf8",
          low_memory: low_memory,
          rechunk: rechunk,
          skip_rows_after_header: skip_rows_after_header,
          row_count_name: row_count_name,
          row_count_offset: row_count_offset,
          sample_size: sample_size,
          eol_char: eol_char,
          truncate_ragged_lines: truncate_ragged_lines
        )
      end

      if new_columns
        Utils._update_columns(df, new_columns)
      else
        df
      end
    end
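# Illustrative usage sketch (not part of the gem source): reading a CSV with a
# custom separator, explicit dtypes, and null handling. The file path, column
# name, and values below are hypothetical.
require "polars-df"

df = Polars.read_csv(
  "data/events.csv",
  sep: ";",
  dtypes: {"user_id" => Polars::Int64},
  null_values: ["NA", ""],
  parse_dates: true
)
puts df.shape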
    # Lazily read from a CSV file or multiple files via glob patterns.
    #
    # This allows the query optimizer to push down predicates and
    # projections to the scan level, thereby potentially reducing
    # memory overhead.
    #
    # @param source [Object]
    #   Path to a file.
    # @param has_header [Boolean]
    #   Indicate if the first row of the dataset is a header or not.
    #   If set to false, column names will be autogenerated in the
    #   following format: `column_x`, with `x` being an
    #   enumeration over every column in the dataset starting at 1.
    # @param sep [String]
    #   Single byte character to use as delimiter in the file.
    # @param comment_char [String]
    #   Single byte character that indicates the start of a comment line,
    #   for instance `#`.
    # @param quote_char [String]
    #   Single byte character used for CSV quoting.
    #   Set to nil to turn off special handling and escaping of quotes.
    # @param skip_rows [Integer]
    #   Start reading after `skip_rows` lines. The header will be parsed at this
    #   offset.
    # @param dtypes [Object]
    #   Overwrite dtypes during inference.
    # @param null_values [Object]
    #   Values to interpret as null values. You can provide a:
    #
    #   - `String`: All values equal to this string will be null.
    #   - `Array`: All values equal to any string in this array will be null.
    #   - `Hash`: A hash that maps column name to a null value string.
    # @param ignore_errors [Boolean]
    #   Try to keep reading lines if some lines yield errors.
    #   First try `infer_schema_length: 0` to read all columns as
    #   `:str` to check which values might cause an issue.
    # @param cache [Boolean]
    #   Cache the result after reading.
    # @param with_column_names [Object]
    #   Apply a function over the column names.
    #   This can be used to update a schema just in time, thus before
    #   scanning.
    # @param infer_schema_length [Integer]
    #   Maximum number of lines to read to infer schema.
    #   If set to 0, all columns will be read as `:str`.
    #   If set to `nil`, a full table scan will be done (slow).
    # @param n_rows [Integer]
    #   Stop reading from the CSV file after reading `n_rows`.
    # @param encoding ["utf8", "utf8-lossy"]
    #   Lossy means that invalid utf8 values are replaced with `�`
    #   characters.
    # @param low_memory [Boolean]
    #   Reduce memory usage at the expense of performance.
    # @param rechunk [Boolean]
    #   Reallocate to contiguous memory when all chunks/files are parsed.
    # @param skip_rows_after_header [Integer]
    #   Skip this number of rows when the header is parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into
    #   the DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param parse_dates [Boolean]
    #   Try to automatically parse dates. If this does not succeed,
    #   the column remains of data type `:str`.
    # @param eol_char [String]
    #   Single byte end of line character.
    # @param truncate_ragged_lines [Boolean]
    #   Truncate lines that are longer than the schema.
    #
    # @return [LazyFrame]
    def scan_csv(
      source,
      has_header: true,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      cache: true,
      with_column_names: nil,
      infer_schema_length: 100,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      parse_dates: false,
      eol_char: "\n",
      truncate_ragged_lines: false
    )
      Utils._check_arg_is_1byte("sep", sep, false)
      Utils._check_arg_is_1byte("comment_char", comment_char, false)
      Utils._check_arg_is_1byte("quote_char", quote_char, true)

      if Utils.pathlike?(source)
        source = Utils.normalise_filepath(source)
      end

      LazyFrame._scan_csv(
        source,
        has_header: has_header,
        sep: sep,
        comment_char: comment_char,
        quote_char: quote_char,
        skip_rows: skip_rows,
        dtypes: dtypes,
        null_values: null_values,
        ignore_errors: ignore_errors,
        cache: cache,
        with_column_names: with_column_names,
        infer_schema_length: infer_schema_length,
        n_rows: n_rows,
        low_memory: low_memory,
        rechunk: rechunk,
        skip_rows_after_header: skip_rows_after_header,
        encoding: encoding,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        parse_dates: parse_dates,
        eol_char: eol_char,
        truncate_ragged_lines: truncate_ragged_lines
      )
    end
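# Illustrative usage sketch (not part of the gem source): scan_csv returns a
# LazyFrame, so filters and projections can be pushed down before collect.
# The path and column names are hypothetical.
lf = Polars.scan_csv("data/events.csv", parse_dates: true)
df = lf
  .filter(Polars.col("amount") > 100)
  .select(["user_id", "amount"])
  .collect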
    # Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
    #
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
    # @param source [String]
    #   Path to an IPC file.
    # @param n_rows [Integer]
    #   Stop reading from the IPC file after reading `n_rows`.
    # @param cache [Boolean]
    #   Cache the result after reading.
    # @param rechunk [Boolean]
    #   Reallocate to contiguous memory when all chunks/files are parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param storage_options [Hash]
    #   Extra options that make sense for a particular storage connection.
    # @param memory_map [Boolean]
    #   Try to memory map the file. This can greatly improve performance on repeated
    #   queries as the OS may cache pages.
    #   Only uncompressed IPC files can be memory mapped.
    #
    # @return [LazyFrame]
    def scan_ipc(
      source,
      n_rows: nil,
      cache: true,
      rechunk: true,
      row_count_name: nil,
      row_count_offset: 0,
      storage_options: nil,
      memory_map: true
    )
      LazyFrame._scan_ipc(
        source,
        n_rows: n_rows,
        cache: cache,
        rechunk: rechunk,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        storage_options: storage_options,
        memory_map: memory_map
      )
    end
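# Illustrative usage sketch (not part of the gem source): lazily scanning an
# IPC/Feather file with an added row count column. The path is hypothetical.
lf = Polars.scan_ipc("data/events.arrow", row_count_name: "row_nr")
df = lf.collect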
    # Lazily read from a parquet file or multiple files via glob patterns.
    #
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
    # @param source [String]
    #   Path to a file.
    # @param n_rows [Integer]
    #   Stop reading from the parquet file after reading `n_rows`.
    # @param cache [Boolean]
    #   Cache the result after reading.
    # @param parallel ["auto", "columns", "row_groups", "none"]
    #   This determines the direction of parallelism. 'auto' will try to determine the
    #   optimal direction.
    # @param rechunk [Boolean]
    #   In case of reading multiple files via a glob pattern, rechunk the final DataFrame
    #   into contiguous memory chunks.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param storage_options [Hash]
    #   Extra options that make sense for a particular storage connection.
    # @param low_memory [Boolean]
    #   Reduce memory pressure at the expense of performance.
    #
    # @return [LazyFrame]
    def scan_parquet(
      source,
      n_rows: nil,
      cache: true,
      parallel: "auto",
      rechunk: true,
      row_count_name: nil,
      row_count_offset: 0,
      storage_options: nil,
      low_memory: false
    )
      if Utils.pathlike?(source)
        source = Utils.normalise_filepath(source)
      end

      LazyFrame._scan_parquet(
        source,
        n_rows: n_rows,
        cache: cache,
        parallel: parallel,
        rechunk: rechunk,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        storage_options: storage_options,
        low_memory: low_memory
      )
    end
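# Illustrative usage sketch (not part of the gem source): with a lazy parquet
# scan over a glob pattern, only the referenced columns and matching row groups
# need to be read. The pattern and column names are hypothetical.
lf = Polars.scan_parquet("data/part-*.parquet")
df = lf
  .filter(Polars.col("status") == "ok")
  .select([Polars.col("country"), Polars.col("amount")])
  .collect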
    # Lazily read from a newline delimited JSON file.
    #
    # This allows the query optimizer to push down predicates and projections to the scan
    # level, thereby potentially reducing memory overhead.
    #
    # @param source [String]
    #   Path to a file.
    # @param infer_schema_length [Integer]
    #   Infer the schema length from the first `infer_schema_length` rows.
    # @param batch_size [Integer]
    #   Number of rows to read in each batch.
    # @param n_rows [Integer]
    #   Stop reading from the JSON file after reading `n_rows`.
    # @param low_memory [Boolean]
    #   Reduce memory pressure at the expense of performance.
    # @param rechunk [Boolean]
    #   Reallocate to contiguous memory when all chunks/files are parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    #
    # @return [LazyFrame]
    def scan_ndjson(
      source,
      infer_schema_length: 100,
      batch_size: 1024,
      n_rows: nil,
      low_memory: false,
      rechunk: true,
      row_count_name: nil,
      row_count_offset: 0
    )
      if Utils.pathlike?(source)
        source = Utils.normalise_filepath(source)
      end

      LazyFrame._scan_ndjson(
        source,
        infer_schema_length: infer_schema_length,
        batch_size: batch_size,
        n_rows: n_rows,
        low_memory: low_memory,
        rechunk: rechunk,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset
      )
    end
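# Illustrative usage sketch (not part of the gem source): scanning newline
# delimited JSON lazily. The path and column name are hypothetical.
lf = Polars.scan_ndjson("logs/requests.ndjson", infer_schema_length: 500)
errors = lf.filter(Polars.col("status") >= 500).collect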
    # Read into a DataFrame from Apache Avro format.
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
    #   of column names.
    # @param n_rows [Integer]
    #   Stop reading from the Apache Avro file after reading `n_rows`.
    #
    # @return [DataFrame]
    def read_avro(source, columns: nil, n_rows: nil)
      if Utils.pathlike?(source)
        source = Utils.normalise_filepath(source)
      end

      DataFrame._read_avro(source, n_rows: n_rows, columns: columns)
    end
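# Illustrative usage sketch (not part of the gem source): reading only selected
# columns from an Avro file. The path and column names are hypothetical.
df = Polars.read_avro("data/events.avro", columns: ["id", "ts"], n_rows: 1_000)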
    # Read into a DataFrame from an Arrow IPC (Feather v2) file.
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
    #   of column names.
    # @param n_rows [Integer]
    #   Stop reading from the IPC file after reading `n_rows`.
    # @param memory_map [Boolean]
    #   Try to memory map the file. This can greatly improve performance on repeated
    #   queries as the OS may cache pages.
    #   Only uncompressed IPC files can be memory mapped.
    # @param storage_options [Hash]
    #   Extra options that make sense for a particular storage connection.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param rechunk [Boolean]
    #   Make sure that all data is contiguous.
    #
    # @return [DataFrame]
    def read_ipc(
      source,
      columns: nil,
      n_rows: nil,
      memory_map: true,
      storage_options: nil,
      row_count_name: nil,
      row_count_offset: 0,
      rechunk: true
    )
      storage_options ||= {}
      _prepare_file_arg(source, **storage_options) do |data|
        DataFrame._read_ipc(
          data,
          columns: columns,
          n_rows: n_rows,
          row_count_name: row_count_name,
          row_count_offset: row_count_offset,
          rechunk: rechunk,
          memory_map: memory_map
        )
      end
    end
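# Illustrative usage sketch (not part of the gem source): eager IPC read with a
# column projection. The path and column names are hypothetical.
df = Polars.read_ipc("data/events.arrow", columns: ["id", "ts"], rechunk: false)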
    # Read into a DataFrame from a parquet file.
    #
    # @param source [String, Pathname, StringIO]
    #   Path to a file or a file-like object.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting at zero) or a list
    #   of column names.
    # @param n_rows [Integer]
    #   Stop reading from the parquet file after reading `n_rows`.
    # @param storage_options [Hash]
    #   Extra options that make sense for a particular storage connection.
    # @param parallel ["auto", "columns", "row_groups", "none"]
    #   This determines the direction of parallelism. 'auto' will try to determine the
    #   optimal direction.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into the
    #   DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param low_memory [Boolean]
    #   Reduce memory pressure at the expense of performance.
    # @param use_statistics [Boolean]
    #   Use statistics in the parquet to determine if pages
    #   can be skipped from reading.
    # @param rechunk [Boolean]
    #   Make sure that all columns are contiguous in memory by
    #   aggregating the chunks into a single array.
    #
    # @return [DataFrame]
    #
    # @note
    #   This operation defaults to a `rechunk` operation at the end, meaning that
    #   all data will be stored contiguously in memory.
    #   Set `rechunk: false` if you are benchmarking the parquet-reader. A `rechunk` is
    #   an expensive operation.
    def read_parquet(
      source,
      columns: nil,
      n_rows: nil,
      storage_options: nil,
      parallel: "auto",
      row_count_name: nil,
      row_count_offset: 0,
      low_memory: false,
      use_statistics: true,
      rechunk: true
    )
      _prepare_file_arg(source) do |data|
        DataFrame._read_parquet(
          data,
          columns: columns,
          n_rows: n_rows,
          parallel: parallel,
          row_count_name: row_count_name,
          row_count_offset: row_count_offset,
          low_memory: low_memory,
          use_statistics: use_statistics,
          rechunk: rechunk
        )
      end
    end
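# Illustrative usage sketch (not part of the gem source): eager parquet read with
# a projection and a row count column. The path and column names are hypothetical.
df = Polars.read_parquet(
  "data/events.parquet",
  columns: ["user_id", "amount"],
  row_count_name: "row_nr"
)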
    # Read into a DataFrame from a JSON file.
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [DataFrame]
    def read_json(source)
      DataFrame._read_json(source)
    end

    # Read into a DataFrame from a newline delimited JSON file.
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [DataFrame]
    def read_ndjson(source)
      DataFrame._read_ndjson(source)
    end
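# Illustrative usage sketch (not part of the gem source): per the docs both
# readers accept a path or a file-like object such as StringIO. The inline data
# is hypothetical.
require "stringio"

ndjson = <<~JSON
  {"a": 1}
  {"a": 2}
JSON
df = Polars.read_ndjson(StringIO.new(ndjson))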
    # Read a SQL query into a DataFrame.
    #
    # @param query [Object]
    #   ActiveRecord::Relation, ActiveRecord::Result, or SQL string.
    # @param schema_overrides [Hash]
    #   A hash mapping column names to dtypes, used to override the schema
    #   inferred from the query.
    #
    # @return [DataFrame]
    def read_database(query, schema_overrides: nil)
      if !defined?(ActiveRecord)
        raise Error, "Active Record not available"
      end

      result =
        if query.is_a?(ActiveRecord::Result)
          query
        elsif query.is_a?(ActiveRecord::Relation)
          query.connection.select_all(query.to_sql)
        elsif query.is_a?(::String)
          ActiveRecord::Base.connection.select_all(query)
        else
          raise ArgumentError, "Expected ActiveRecord::Relation, ActiveRecord::Result, or String"
        end

      data = {}
      schema_overrides = (schema_overrides || {}).transform_keys(&:to_s)

      result.columns.each_with_index do |k, i|
        column_type = result.column_types[i]

        data[k] =
          if column_type
            result.rows.map { |r| column_type.deserialize(r[i]) }
          else
            result.rows.map { |r| r[i] }
          end

        polars_type =
          case column_type&.type
          when :binary
            Binary
          when :boolean
            Boolean
          when :date
            Date
          when :datetime, :timestamp
            Datetime
          when :decimal
            Decimal
          when :float
            Float64
          when :integer
            Int64
          when :string, :text
            String
          when :time
            Time
          # TODO fix issue with null
          # when :json, :jsonb
          #   Struct
          end

        schema_overrides[k] ||= polars_type if polars_type
      end

      DataFrame.new(data, schema_overrides: schema_overrides)
    end
    alias_method :read_sql, :read_database
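# Illustrative usage sketch (not part of the gem source): assumes an Active
# Record model named User and an established database connection; the table,
# column names, and types are hypothetical.
df = Polars.read_database(User.where(active: true))
df2 = Polars.read_database(
  "SELECT id, age FROM users",
  schema_overrides: {"age" => Polars::Int32}
)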
    # def read_excel
    # end
    # Read a CSV file in batches.
    #
    # Upon creation of the `BatchedCsvReader`,
    # polars will gather statistics and determine the
    # file chunks. After that, work will only be done
    # if `next_batches` is called.
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
    # @param has_header [Boolean]
    #   Indicate if the first row of the dataset is a header or not.
    #   If set to false, column names will be autogenerated in the
    #   following format: `column_x`, with `x` being an
    #   enumeration over every column in the dataset starting at 1.
    # @param columns [Object]
    #   Columns to select. Accepts a list of column indices (starting
    #   at zero) or a list of column names.
    # @param new_columns [Object]
    #   Rename columns right after parsing the CSV file. If the given
    #   list is shorter than the width of the DataFrame the remaining
    #   columns will have their original name.
    # @param sep [String]
    #   Single byte character to use as delimiter in the file.
    # @param comment_char [String]
    #   Single byte character that indicates the start of a comment line,
    #   for instance `#`.
    # @param quote_char [String]
    #   Single byte character used for CSV quoting, default = `"`.
    #   Set to nil to turn off special handling and escaping of quotes.
    # @param skip_rows [Integer]
    #   Start reading after `skip_rows` lines.
    # @param dtypes [Object]
    #   Overwrite dtypes during inference.
    # @param null_values [Object]
    #   Values to interpret as null values. You can provide a:
    #
    #   - `String`: All values equal to this string will be null.
    #   - `Array`: All values equal to any string in this array will be null.
    #   - `Hash`: A hash that maps column name to a null value string.
    # @param ignore_errors [Boolean]
    #   Try to keep reading lines if some lines yield errors.
    #   First try `infer_schema_length: 0` to read all columns as
    #   `:str` to check which values might cause an issue.
    # @param parse_dates [Boolean]
    #   Try to automatically parse dates. If this does not succeed,
    #   the column remains of data type `:str`.
    # @param n_threads [Integer]
    #   Number of threads to use in CSV parsing.
    #   Defaults to the number of physical CPUs of your system.
    # @param infer_schema_length [Integer]
    #   Maximum number of lines to read to infer schema.
    #   If set to 0, all columns will be read as `:str`.
    #   If set to `nil`, a full table scan will be done (slow).
    # @param batch_size [Integer]
    #   Number of lines to read into the buffer at once.
    #   Modify this to change performance.
    # @param n_rows [Integer]
    #   Stop reading from the CSV file after reading `n_rows`.
    #   During multi-threaded parsing, an upper bound of `n_rows`
    #   rows cannot be guaranteed.
    # @param encoding ["utf8", "utf8-lossy"]
    #   Lossy means that invalid utf8 values are replaced with `�`
    #   characters. When using other encodings than `utf8` or
    #   `utf8-lossy`, the input is first decoded in memory with
    #   Ruby. Defaults to `utf8`.
    # @param low_memory [Boolean]
    #   Reduce memory usage at the expense of performance.
    # @param rechunk [Boolean]
    #   Make sure that all columns are contiguous in memory by
    #   aggregating the chunks into a single array.
    # @param skip_rows_after_header [Integer]
    #   Skip this number of rows when the header is parsed.
    # @param row_count_name [String]
    #   If not nil, this will insert a row count column with the given name into
    #   the DataFrame.
    # @param row_count_offset [Integer]
    #   Offset to start the row_count column (only used if the name is set).
    # @param sample_size [Integer]
    #   Set the sample size. This is used to sample statistics to estimate the
    #   allocation needed.
    # @param eol_char [String]
    #   Single byte end of line character.
    # @param truncate_ragged_lines [Boolean]
    #   Truncate lines that are longer than the schema.
    #
    # @return [BatchedCsvReader]
    #
    # @example
    #   reader = Polars.read_csv_batched(
    #     "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
    #   )
    #   reader.next_batches(5)
    def read_csv_batched(
      source,
      has_header: true,
      columns: nil,
      new_columns: nil,
      sep: ",",
      comment_char: nil,
      quote_char: '"',
      skip_rows: 0,
      dtypes: nil,
      null_values: nil,
      ignore_errors: false,
      parse_dates: false,
      n_threads: nil,
      infer_schema_length: 100,
      batch_size: 50_000,
      n_rows: nil,
      encoding: "utf8",
      low_memory: false,
      rechunk: true,
      skip_rows_after_header: 0,
      row_count_name: nil,
      row_count_offset: 0,
      sample_size: 1024,
      eol_char: "\n",
      truncate_ragged_lines: false
    )
      projection, columns = Utils.handle_projection_columns(columns)

      if columns && !has_header
        columns.each do |column|
          if !column.start_with?("column_")
            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
          end
        end
      end

      if projection || new_columns
        raise Todo
      end

      BatchedCsvReader.new(
        source,
        has_header: has_header,
        columns: columns || projection,
        sep: sep,
        comment_char: comment_char,
        quote_char: quote_char,
        skip_rows: skip_rows,
        dtypes: dtypes,
        null_values: null_values,
        ignore_errors: ignore_errors,
        parse_dates: parse_dates,
        n_threads: n_threads,
        infer_schema_length: infer_schema_length,
        batch_size: batch_size,
        n_rows: n_rows,
        encoding: encoding == "utf8-lossy" ? encoding : "utf8",
        low_memory: low_memory,
        rechunk: rechunk,
        skip_rows_after_header: skip_rows_after_header,
        row_count_name: row_count_name,
        row_count_offset: row_count_offset,
        sample_size: sample_size,
        eol_char: eol_char,
        new_columns: new_columns,
        truncate_ragged_lines: truncate_ragged_lines
      )
    end
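# Illustrative usage sketch (not part of the gem source): processing a large CSV
# in batches keeps memory bounded. The path is hypothetical, and next_batches is
# assumed here to return nil once the file is exhausted, mirroring the Python
# polars API.
reader = Polars.read_csv_batched("data/big.csv", batch_size: 100_000)
while (batches = reader.next_batches(10))
  batches.each { |df| puts df.shape }
end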
    # Get the schema of an IPC file without reading data.
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [Hash]
    def read_ipc_schema(source)
      if Utils.pathlike?(source)
        source = Utils.normalise_filepath(source)
      end

      Plr.ipc_schema(source)
    end
    # Get the schema of a Parquet file without reading data.
    #
    # @param source [Object]
    #   Path to a file or a file-like object.
    #
    # @return [Hash]
    def read_parquet_schema(source)
      if Utils.pathlike?(source)
        source = Utils.normalise_filepath(source)
      end

      Plr.parquet_schema(source)
    end
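# Illustrative usage sketch (not part of the gem source): the schema helpers
# return a hash of column names to dtypes without loading any rows. The paths
# are hypothetical.
puts Polars.read_parquet_schema("data/events.parquet")
puts Polars.read_ipc_schema("data/events.arrow")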
    private

    def _prepare_file_arg(file)
      if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
        raise ArgumentError, "use URI(...) for remote files"
      end

      if defined?(URI) && file.is_a?(URI)
        require "open-uri"

        file = file.open
      end

      yield file
    end
  end
end