polars-df 0.13.0-x64-mingw-ucrt

Files changed (80)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +208 -0
  4. data/Cargo.lock +2556 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +39278 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +104 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +36 -0
  18. data/lib/polars/cat_name_space.rb +88 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +98 -0
  21. data/lib/polars/data_frame.rb +5191 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1397 -0
  24. data/lib/polars/date_time_name_space.rb +1287 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +38 -0
  27. data/lib/polars/expr.rb +7256 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +271 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1329 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +136 -0
  39. data/lib/polars/functions/range/datetime_range.rb +149 -0
  40. data/lib/polars/functions/range/int_range.rb +51 -0
  41. data/lib/polars/functions/range/time_range.rb +141 -0
  42. data/lib/polars/functions/repeat.rb +144 -0
  43. data/lib/polars/functions/whenthen.rb +96 -0
  44. data/lib/polars/functions.rb +57 -0
  45. data/lib/polars/group_by.rb +613 -0
  46. data/lib/polars/io/avro.rb +24 -0
  47. data/lib/polars/io/csv.rb +696 -0
  48. data/lib/polars/io/database.rb +73 -0
  49. data/lib/polars/io/ipc.rb +275 -0
  50. data/lib/polars/io/json.rb +29 -0
  51. data/lib/polars/io/ndjson.rb +80 -0
  52. data/lib/polars/io/parquet.rb +233 -0
  53. data/lib/polars/lazy_frame.rb +2708 -0
  54. data/lib/polars/lazy_group_by.rb +181 -0
  55. data/lib/polars/list_expr.rb +791 -0
  56. data/lib/polars/list_name_space.rb +449 -0
  57. data/lib/polars/meta_expr.rb +222 -0
  58. data/lib/polars/name_expr.rb +198 -0
  59. data/lib/polars/plot.rb +109 -0
  60. data/lib/polars/rolling_group_by.rb +35 -0
  61. data/lib/polars/series.rb +4444 -0
  62. data/lib/polars/slice.rb +104 -0
  63. data/lib/polars/sql_context.rb +194 -0
  64. data/lib/polars/string_cache.rb +75 -0
  65. data/lib/polars/string_expr.rb +1495 -0
  66. data/lib/polars/string_name_space.rb +811 -0
  67. data/lib/polars/struct_expr.rb +98 -0
  68. data/lib/polars/struct_name_space.rb +96 -0
  69. data/lib/polars/testing.rb +507 -0
  70. data/lib/polars/utils/constants.rb +9 -0
  71. data/lib/polars/utils/convert.rb +97 -0
  72. data/lib/polars/utils/parse.rb +89 -0
  73. data/lib/polars/utils/various.rb +76 -0
  74. data/lib/polars/utils/wrap.rb +19 -0
  75. data/lib/polars/utils.rb +130 -0
  76. data/lib/polars/version.rb +4 -0
  77. data/lib/polars/whenthen.rb +83 -0
  78. data/lib/polars-df.rb +1 -0
  79. data/lib/polars.rb +91 -0
  80. metadata +138 -0
data/lib/polars/io/csv.rb
@@ -0,0 +1,696 @@
+ module Polars
+   module IO
+     # Read a CSV file into a DataFrame.
+     #
+     # @param source [Object]
+     #   Path to a file or a file-like object.
+     # @param has_header [Boolean]
+     #   Indicate if the first row of dataset is a header or not.
+     #   If set to false, column names will be autogenerated in the
+     #   following format: `column_x`, with `x` being an
+     #   enumeration over every column in the dataset starting at 1.
+     # @param columns [Object]
+     #   Columns to select. Accepts a list of column indices (starting
+     #   at zero) or a list of column names.
+     # @param new_columns [Object]
+     #   Rename columns right after parsing the CSV file. If the given
+     #   list is shorter than the width of the DataFrame the remaining
+     #   columns will have their original name.
+     # @param sep [String]
+     #   Single byte character to use as delimiter in the file.
+     # @param comment_char [String]
+     #   Single byte character that indicates the start of a comment line,
+     #   for instance `#`.
+     # @param quote_char [String]
+     #   Single byte character used for CSV quoting.
+     #   Set to nil to turn off special handling and escaping of quotes.
+     # @param skip_rows [Integer]
+     #   Start reading after `skip_rows` lines.
+     # @param dtypes [Object]
+     #   Overwrite dtypes during inference.
+     # @param null_values [Object]
+     #   Values to interpret as null values. You can provide a:
+     #
+     #   - `String`: All values equal to this string will be null.
+     #   - `Array`: All values equal to any string in this array will be null.
+     #   - `Hash`: A hash that maps column name to a null value string.
+     # @param ignore_errors [Boolean]
+     #   Try to keep reading lines if some lines yield errors.
+     #   First try `infer_schema_length: 0` to read all columns as
+     #   `:str` to check which values might cause an issue.
+     # @param parse_dates [Boolean]
+     #   Try to automatically parse dates. If this does not succeed,
+     #   the column remains of data type `:str`.
+     # @param n_threads [Integer]
+     #   Number of threads to use in CSV parsing.
+     #   Defaults to the number of physical CPUs of your system.
+     # @param infer_schema_length [Integer]
+     #   Maximum number of lines to read to infer schema.
+     #   If set to 0, all columns will be read as `:str`.
+     #   If set to `nil`, a full table scan will be done (slow).
+     # @param batch_size [Integer]
+     #   Number of lines to read into the buffer at once.
+     #   Modify this to change performance.
+     # @param n_rows [Integer]
+     #   Stop reading from CSV file after reading `n_rows`.
+     #   During multi-threaded parsing, an upper bound of `n_rows`
+     #   rows cannot be guaranteed.
+     # @param encoding ["utf8", "utf8-lossy"]
+     #   Lossy means that invalid utf8 values are replaced with `�`
+     #   characters. When using other encodings than `utf8` or
+     #   `utf8-lossy`, the input is first decoded in memory with
+     #   Ruby.
+     # @param low_memory [Boolean]
+     #   Reduce memory usage at the expense of performance.
+     # @param rechunk [Boolean]
+     #   Make sure that all columns are contiguous in memory by
+     #   aggregating the chunks into a single array.
+     # @param storage_options [Hash]
+     #   Extra options that make sense for a
+     #   particular storage connection.
+     # @param skip_rows_after_header [Integer]
+     #   Skip this number of rows when the header is parsed.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into
+     #   the DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     # @param sample_size [Integer]
+     #   Set the sample size. This is used to sample statistics to estimate the
+     #   allocation needed.
+     # @param eol_char [String]
+     #   Single byte end of line character.
+     # @param truncate_ragged_lines [Boolean]
+     #   Truncate lines that are longer than the schema.
+     #
+     # @return [DataFrame]
+     #
+     # @note
+     #   This operation defaults to a `rechunk` operation at the end, meaning that
+     #   all data will be stored contiguously in memory.
+     #   Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
+     #   an expensive operation.
+     def read_csv(
+       source,
+       has_header: true,
+       columns: nil,
+       new_columns: nil,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       null_values: nil,
+       ignore_errors: false,
+       parse_dates: false,
+       n_threads: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       batch_size: 8192,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       storage_options: nil,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       sample_size: 1024,
+       eol_char: "\n",
+       truncate_ragged_lines: false
+     )
+       Utils._check_arg_is_1byte("sep", sep, false)
+       Utils._check_arg_is_1byte("comment_char", comment_char, false)
+       Utils._check_arg_is_1byte("quote_char", quote_char, true)
+       Utils._check_arg_is_1byte("eol_char", eol_char, false)
+
+       projection, columns = Utils.handle_projection_columns(columns)
+
+       storage_options ||= {}
+
+       if columns && !has_header
+         columns.each do |column|
+           if !column.start_with?("column_")
+             raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+           end
+         end
+       end
+
+       if projection || new_columns
+         raise Todo
+       end
+
+       df = nil
+       _prepare_file_arg(source) do |data|
+         df = _read_csv_impl(
+           data,
+           has_header: has_header,
+           columns: columns || projection,
+           sep: sep,
+           comment_char: comment_char,
+           quote_char: quote_char,
+           skip_rows: skip_rows,
+           dtypes: dtypes,
+           null_values: null_values,
+           ignore_errors: ignore_errors,
+           parse_dates: parse_dates,
+           n_threads: n_threads,
+           infer_schema_length: infer_schema_length,
+           batch_size: batch_size,
+           n_rows: n_rows,
+           encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+           low_memory: low_memory,
+           rechunk: rechunk,
+           skip_rows_after_header: skip_rows_after_header,
+           row_count_name: row_count_name,
+           row_count_offset: row_count_offset,
+           sample_size: sample_size,
+           eol_char: eol_char,
+           truncate_ragged_lines: truncate_ragged_lines
+         )
+       end
+
+       if new_columns
+         Utils._update_columns(df, new_columns)
+       else
+         df
+       end
+     end
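For orientation, a minimal usage sketch against the method above. The file path, column name, and null markers are hypothetical, and the `Polars::Int64` dtype override assumes the usual mapping of dtype classes accepted by `dtypes:`.

```ruby
require "polars-df"

# Read a local CSV, overriding inference for one column and treating
# empty strings and "NA" as null.
df = Polars.read_csv(
  "users.csv",
  sep: ",",
  null_values: ["", "NA"],
  dtypes: {"id" => Polars::Int64},
  parse_dates: true
)

# If some rows fail to parse, the docs above suggest retrying with
# infer_schema_length: 0 so every column is read as a string first:
# df = Polars.read_csv("users.csv", infer_schema_length: 0)
```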
+
+     # @private
+     def _read_csv_impl(
+       file,
+       has_header: true,
+       columns: nil,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       schema: nil,
+       null_values: nil,
+       missing_utf8_is_empty_string: false,
+       ignore_errors: false,
+       parse_dates: false,
+       n_threads: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       batch_size: 8192,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       sample_size: 1024,
+       eol_char: "\n",
+       raise_if_empty: true,
+       truncate_ragged_lines: false,
+       decimal_comma: false,
+       glob: true
+     )
+       if Utils.pathlike?(file)
+         path = Utils.normalize_filepath(file)
+       else
+         path = nil
+         # if defined?(StringIO) && file.is_a?(StringIO)
+         #   file = file.string
+         # end
+       end
+
+       dtype_list = nil
+       dtype_slice = nil
+       if !dtypes.nil?
+         if dtypes.is_a?(Hash)
+           dtype_list = []
+           dtypes.each do |k, v|
+             dtype_list << [k, Utils.rb_type_to_dtype(v)]
+           end
+         elsif dtypes.is_a?(::Array)
+           dtype_slice = dtypes
+         else
+           raise ArgumentError, "dtypes arg should be an Array or Hash"
+         end
+       end
+
+       processed_null_values = Utils._process_null_values(null_values)
+
+       if columns.is_a?(::String)
+         columns = [columns]
+       end
+       if file.is_a?(::String) && file.include?("*")
+         dtypes_dict = nil
+         if !dtype_list.nil?
+           dtypes_dict = dtype_list.to_h
+         end
+         if !dtype_slice.nil?
+           raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; use a Hash that maps column names to dtypes"
+         end
+         scan = scan_csv(
+           file,
+           has_header: has_header,
+           sep: sep,
+           comment_char: comment_char,
+           quote_char: quote_char,
+           skip_rows: skip_rows,
+           dtypes: dtypes_dict,
+           null_values: null_values,
+           missing_utf8_is_empty_string: missing_utf8_is_empty_string,
+           ignore_errors: ignore_errors,
+           infer_schema_length: infer_schema_length,
+           n_rows: n_rows,
+           low_memory: low_memory,
+           rechunk: rechunk,
+           skip_rows_after_header: skip_rows_after_header,
+           row_count_name: row_count_name,
+           row_count_offset: row_count_offset,
+           eol_char: eol_char,
+           truncate_ragged_lines: truncate_ragged_lines,
+           decimal_comma: decimal_comma,
+           glob: glob
+         )
+         if columns.nil?
+           return scan.collect
+         elsif is_str_sequence(columns, allow_str: false)
+           return scan.select(columns).collect
+         else
+           raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; use an Array of column names"
+         end
+       end
+
+       projection, columns = Utils.handle_projection_columns(columns)
+
+       rbdf =
+         RbDataFrame.read_csv(
+           file,
+           infer_schema_length,
+           batch_size,
+           has_header,
+           ignore_errors,
+           n_rows,
+           skip_rows,
+           projection,
+           sep,
+           rechunk,
+           columns,
+           encoding,
+           n_threads,
+           path,
+           dtype_list,
+           dtype_slice,
+           low_memory,
+           comment_char,
+           quote_char,
+           processed_null_values,
+           missing_utf8_is_empty_string,
+           parse_dates,
+           skip_rows_after_header,
+           Utils.parse_row_index_args(row_count_name, row_count_offset),
+           sample_size,
+           eol_char,
+           raise_if_empty,
+           truncate_ragged_lines,
+           decimal_comma,
+           schema
+         )
+       Utils.wrap_df(rbdf)
+     end
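Note the glob branch above: when the source string contains `*`, `_read_csv_impl` delegates to `scan_csv(...).collect`, which is why unnamed dtypes and integer projections are rejected on that path. A hedged sketch of what that implies for callers (file pattern and column names are hypothetical):

```ruby
# Because the path contains "*", this goes through the lazy scan under
# the hood, so dtypes must be a Hash and columns must be names, not indices.
df = Polars.read_csv(
  "data/part-*.csv",
  dtypes: {"amount" => Polars::Float64},
  columns: ["id", "amount"]
)
```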
+
+     # Read a CSV file in batches.
+     #
+     # Upon creation of the `BatchedCsvReader`,
+     # polars will gather statistics and determine the
+     # file chunks. After that, work will only be done
+     # if `next_batches` is called.
+     #
+     # @param source [Object]
+     #   Path to a file or a file-like object.
+     # @param has_header [Boolean]
+     #   Indicate if the first row of dataset is a header or not.
+     #   If set to false, column names will be autogenerated in the
+     #   following format: `column_x`, with `x` being an
+     #   enumeration over every column in the dataset starting at 1.
+     # @param columns [Object]
+     #   Columns to select. Accepts a list of column indices (starting
+     #   at zero) or a list of column names.
+     # @param new_columns [Object]
+     #   Rename columns right after parsing the CSV file. If the given
+     #   list is shorter than the width of the DataFrame the remaining
+     #   columns will have their original name.
+     # @param sep [String]
+     #   Single byte character to use as delimiter in the file.
+     # @param comment_char [String]
+     #   Single byte character that indicates the start of a comment line,
+     #   for instance `#`.
+     # @param quote_char [String]
+     #   Single byte character used for CSV quoting, default = `"`.
+     #   Set to nil to turn off special handling and escaping of quotes.
+     # @param skip_rows [Integer]
+     #   Start reading after `skip_rows` lines.
+     # @param dtypes [Object]
+     #   Overwrite dtypes during inference.
+     # @param null_values [Object]
+     #   Values to interpret as null values. You can provide a:
+     #
+     #   - `String`: All values equal to this string will be null.
+     #   - `Array`: All values equal to any string in this array will be null.
+     #   - `Hash`: A hash that maps column name to a null value string.
+     # @param ignore_errors [Boolean]
+     #   Try to keep reading lines if some lines yield errors.
+     #   First try `infer_schema_length: 0` to read all columns as
+     #   `:str` to check which values might cause an issue.
+     # @param parse_dates [Boolean]
+     #   Try to automatically parse dates. If this does not succeed,
+     #   the column remains of data type `:str`.
+     # @param n_threads [Integer]
+     #   Number of threads to use in CSV parsing.
+     #   Defaults to the number of physical CPUs of your system.
+     # @param infer_schema_length [Integer]
+     #   Maximum number of lines to read to infer schema.
+     #   If set to 0, all columns will be read as `:str`.
+     #   If set to `nil`, a full table scan will be done (slow).
+     # @param batch_size [Integer]
+     #   Number of lines to read into the buffer at once.
+     #   Modify this to change performance.
+     # @param n_rows [Integer]
+     #   Stop reading from CSV file after reading `n_rows`.
+     #   During multi-threaded parsing, an upper bound of `n_rows`
+     #   rows cannot be guaranteed.
+     # @param encoding ["utf8", "utf8-lossy"]
+     #   Lossy means that invalid utf8 values are replaced with `�`
+     #   characters. When using other encodings than `utf8` or
+     #   `utf8-lossy`, the input is first decoded in memory with
+     #   Ruby. Defaults to `utf8`.
+     # @param low_memory [Boolean]
+     #   Reduce memory usage at the expense of performance.
+     # @param rechunk [Boolean]
+     #   Make sure that all columns are contiguous in memory by
+     #   aggregating the chunks into a single array.
+     # @param skip_rows_after_header [Integer]
+     #   Skip this number of rows when the header is parsed.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into
+     #   the DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     # @param sample_size [Integer]
+     #   Set the sample size. This is used to sample statistics to estimate the
+     #   allocation needed.
+     # @param eol_char [String]
+     #   Single byte end of line character.
+     # @param truncate_ragged_lines [Boolean]
+     #   Truncate lines that are longer than the schema.
+     #
+     # @return [BatchedCsvReader]
+     #
+     # @example
+     #   reader = Polars.read_csv_batched(
+     #     "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
+     #   )
+     #   reader.next_batches(5)
+     def read_csv_batched(
+       source,
+       has_header: true,
+       columns: nil,
+       new_columns: nil,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       null_values: nil,
+       missing_utf8_is_empty_string: false,
+       ignore_errors: false,
+       parse_dates: false,
+       n_threads: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       batch_size: 50_000,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       sample_size: 1024,
+       eol_char: "\n",
+       raise_if_empty: true,
+       truncate_ragged_lines: false,
+       decimal_comma: false
+     )
+       projection, columns = Utils.handle_projection_columns(columns)
+
+       if columns && !has_header
+         columns.each do |column|
+           if !column.start_with?("column_")
+             raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+           end
+         end
+       end
+
+       if projection || new_columns
+         raise Todo
+       end
+
+       BatchedCsvReader.new(
+         source,
+         has_header: has_header,
+         columns: columns || projection,
+         sep: sep,
+         comment_char: comment_char,
+         quote_char: quote_char,
+         skip_rows: skip_rows,
+         dtypes: dtypes,
+         null_values: null_values,
+         missing_utf8_is_empty_string: missing_utf8_is_empty_string,
+         ignore_errors: ignore_errors,
+         parse_dates: parse_dates,
+         n_threads: n_threads,
+         infer_schema_length: infer_schema_length,
+         batch_size: batch_size,
+         n_rows: n_rows,
+         encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+         low_memory: low_memory,
+         rechunk: rechunk,
+         skip_rows_after_header: skip_rows_after_header,
+         row_count_name: row_count_name,
+         row_count_offset: row_count_offset,
+         sample_size: sample_size,
+         eol_char: eol_char,
+         new_columns: new_columns,
+         raise_if_empty: raise_if_empty,
+         truncate_ragged_lines: truncate_ragged_lines,
+         decimal_comma: decimal_comma
+       )
+     end
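A sketch of a full batched-read loop building on the `@example` above. The file name is hypothetical, and the loop assumes `next_batches` returns `nil` once the file is exhausted, mirroring the Python API:

```ruby
reader = Polars.read_csv_batched("events.csv", batch_size: 50_000)

loop do
  batches = reader.next_batches(5)   # Array of DataFrames, or nil when done
  break if batches.nil?
  batches.each do |df|
    # Process each chunk without materializing the whole file in memory.
    puts df.shape
  end
end
```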
+
+     # Lazily read from a CSV file or multiple files via glob patterns.
+     #
+     # This allows the query optimizer to push down predicates and
+     # projections to the scan level, thereby potentially reducing
+     # memory overhead.
+     #
+     # @param source [Object]
+     #   Path to a file.
+     # @param has_header [Boolean]
+     #   Indicate if the first row of dataset is a header or not.
+     #   If set to false, column names will be autogenerated in the
+     #   following format: `column_x`, with `x` being an
+     #   enumeration over every column in the dataset starting at 1.
+     # @param sep [String]
+     #   Single byte character to use as delimiter in the file.
+     # @param comment_char [String]
+     #   Single byte character that indicates the start of a comment line,
+     #   for instance `#`.
+     # @param quote_char [String]
+     #   Single byte character used for CSV quoting.
+     #   Set to nil to turn off special handling and escaping of quotes.
+     # @param skip_rows [Integer]
+     #   Start reading after `skip_rows` lines. The header will be parsed at this
+     #   offset.
+     # @param dtypes [Object]
+     #   Overwrite dtypes during inference.
+     # @param null_values [Object]
+     #   Values to interpret as null values. You can provide a:
+     #
+     #   - `String`: All values equal to this string will be null.
+     #   - `Array`: All values equal to any string in this array will be null.
+     #   - `Hash`: A hash that maps column name to a null value string.
+     # @param ignore_errors [Boolean]
+     #   Try to keep reading lines if some lines yield errors.
+     #   First try `infer_schema_length: 0` to read all columns as
+     #   `:str` to check which values might cause an issue.
+     # @param cache [Boolean]
+     #   Cache the result after reading.
+     # @param with_column_names [Object]
+     #   Apply a function over the column names.
+     #   This can be used to update a schema just in time, before
+     #   scanning.
+     # @param infer_schema_length [Integer]
+     #   Maximum number of lines to read to infer schema.
+     #   If set to 0, all columns will be read as `:str`.
+     #   If set to `nil`, a full table scan will be done (slow).
+     # @param n_rows [Integer]
+     #   Stop reading from CSV file after reading `n_rows`.
+     # @param encoding ["utf8", "utf8-lossy"]
+     #   Lossy means that invalid utf8 values are replaced with `�`
+     #   characters.
+     # @param low_memory [Boolean]
+     #   Reduce memory usage at the expense of performance.
+     # @param rechunk [Boolean]
+     #   Reallocate to contiguous memory when all chunks/files are parsed.
+     # @param skip_rows_after_header [Integer]
+     #   Skip this number of rows when the header is parsed.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into
+     #   the DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     # @param parse_dates [Boolean]
+     #   Try to automatically parse dates. If this does not succeed,
+     #   the column remains of data type `:str`.
+     # @param eol_char [String]
+     #   Single byte end of line character.
+     # @param truncate_ragged_lines [Boolean]
+     #   Truncate lines that are longer than the schema.
+     #
+     # @return [LazyFrame]
+     def scan_csv(
+       source,
+       has_header: true,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       null_values: nil,
+       missing_utf8_is_empty_string: false,
+       ignore_errors: false,
+       cache: true,
+       with_column_names: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       parse_dates: false,
+       eol_char: "\n",
+       raise_if_empty: true,
+       truncate_ragged_lines: false,
+       decimal_comma: false,
+       glob: true
+     )
+       Utils._check_arg_is_1byte("sep", sep, false)
+       Utils._check_arg_is_1byte("comment_char", comment_char, false)
+       Utils._check_arg_is_1byte("quote_char", quote_char, true)
+
+       if Utils.pathlike?(source)
+         source = Utils.normalize_filepath(source)
+       end
+
+       _scan_csv_impl(
+         source,
+         has_header: has_header,
+         sep: sep,
+         comment_char: comment_char,
+         quote_char: quote_char,
+         skip_rows: skip_rows,
+         dtypes: dtypes,
+         null_values: null_values,
+         ignore_errors: ignore_errors,
+         cache: cache,
+         with_column_names: with_column_names,
+         infer_schema_length: infer_schema_length,
+         n_rows: n_rows,
+         low_memory: low_memory,
+         rechunk: rechunk,
+         skip_rows_after_header: skip_rows_after_header,
+         encoding: encoding,
+         row_count_name: row_count_name,
+         row_count_offset: row_count_offset,
+         parse_dates: parse_dates,
+         eol_char: eol_char,
+         truncate_ragged_lines: truncate_ragged_lines
+       )
+     end
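A hedged sketch of the lazy path described above: the filter and select are pushed down to the scan, so only matching rows and columns are materialized. The file name and column names are hypothetical, and the example assumes `with_column_names` accepts a Proc, mirroring the Python callable:

```ruby
lf = Polars.scan_csv(
  "sales.csv",
  with_column_names: ->(cols) { cols.map(&:downcase) }  # normalize headers before scanning
)

df = lf
  .filter(Polars.col("amount") > 100)
  .select(["region", "amount"])
  .collect
```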
+
+     # @private
+     def _scan_csv_impl(
+       file,
+       has_header: true,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       null_values: nil,
+       ignore_errors: false,
+       cache: true,
+       with_column_names: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       parse_dates: false,
+       eol_char: "\n",
+       truncate_ragged_lines: true
+     )
+       dtype_list = nil
+       if !dtypes.nil?
+         dtype_list = []
+         dtypes.each do |k, v|
+           dtype_list << [k, Utils.rb_type_to_dtype(v)]
+         end
+       end
+       processed_null_values = Utils._process_null_values(null_values)
+
+       rblf =
+         RbLazyFrame.new_from_csv(
+           file,
+           sep,
+           has_header,
+           ignore_errors,
+           skip_rows,
+           n_rows,
+           cache,
+           dtype_list,
+           low_memory,
+           comment_char,
+           quote_char,
+           processed_null_values,
+           infer_schema_length,
+           with_column_names,
+           rechunk,
+           skip_rows_after_header,
+           encoding,
+           Utils.parse_row_index_args(row_count_name, row_count_offset),
+           parse_dates,
+           eol_char,
+           truncate_ragged_lines
+         )
+       Utils.wrap_ldf(rblf)
+     end
+
+     private
+
+     def _prepare_file_arg(file)
+       if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
+         raise ArgumentError, "use URI(...) for remote files"
+       end
+
+       if defined?(URI) && file.is_a?(URI)
+         require "open-uri"
+
+         file = file.open
+       end
+
+       yield file
+     end
+   end
+ end
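Finally, a small sketch of the remote-file path enforced by `_prepare_file_arg`: passing a plain `"https://..."` String raises, so the URL must be wrapped in `URI(...)`, after which the method requires "open-uri" and opens the resource itself. The URL is hypothetical:

```ruby
# Raises ArgumentError: use URI(...) for remote files
# Polars.read_csv("https://example.com/data.csv")

df = Polars.read_csv(URI("https://example.com/data.csv"))
```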