polars-df 0.13.0-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +208 -0
  4. data/Cargo.lock +2556 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +39278 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +104 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +36 -0
  18. data/lib/polars/cat_name_space.rb +88 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +98 -0
  21. data/lib/polars/data_frame.rb +5191 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1397 -0
  24. data/lib/polars/date_time_name_space.rb +1287 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +38 -0
  27. data/lib/polars/expr.rb +7256 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +271 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1329 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +136 -0
  39. data/lib/polars/functions/range/datetime_range.rb +149 -0
  40. data/lib/polars/functions/range/int_range.rb +51 -0
  41. data/lib/polars/functions/range/time_range.rb +141 -0
  42. data/lib/polars/functions/repeat.rb +144 -0
  43. data/lib/polars/functions/whenthen.rb +96 -0
  44. data/lib/polars/functions.rb +57 -0
  45. data/lib/polars/group_by.rb +613 -0
  46. data/lib/polars/io/avro.rb +24 -0
  47. data/lib/polars/io/csv.rb +696 -0
  48. data/lib/polars/io/database.rb +73 -0
  49. data/lib/polars/io/ipc.rb +275 -0
  50. data/lib/polars/io/json.rb +29 -0
  51. data/lib/polars/io/ndjson.rb +80 -0
  52. data/lib/polars/io/parquet.rb +233 -0
  53. data/lib/polars/lazy_frame.rb +2708 -0
  54. data/lib/polars/lazy_group_by.rb +181 -0
  55. data/lib/polars/list_expr.rb +791 -0
  56. data/lib/polars/list_name_space.rb +449 -0
  57. data/lib/polars/meta_expr.rb +222 -0
  58. data/lib/polars/name_expr.rb +198 -0
  59. data/lib/polars/plot.rb +109 -0
  60. data/lib/polars/rolling_group_by.rb +35 -0
  61. data/lib/polars/series.rb +4444 -0
  62. data/lib/polars/slice.rb +104 -0
  63. data/lib/polars/sql_context.rb +194 -0
  64. data/lib/polars/string_cache.rb +75 -0
  65. data/lib/polars/string_expr.rb +1495 -0
  66. data/lib/polars/string_name_space.rb +811 -0
  67. data/lib/polars/struct_expr.rb +98 -0
  68. data/lib/polars/struct_name_space.rb +96 -0
  69. data/lib/polars/testing.rb +507 -0
  70. data/lib/polars/utils/constants.rb +9 -0
  71. data/lib/polars/utils/convert.rb +97 -0
  72. data/lib/polars/utils/parse.rb +89 -0
  73. data/lib/polars/utils/various.rb +76 -0
  74. data/lib/polars/utils/wrap.rb +19 -0
  75. data/lib/polars/utils.rb +130 -0
  76. data/lib/polars/version.rb +4 -0
  77. data/lib/polars/whenthen.rb +83 -0
  78. data/lib/polars-df.rb +1 -0
  79. data/lib/polars.rb +91 -0
  80. metadata +138 -0
data/lib/polars/io/csv.rb
@@ -0,0 +1,696 @@
+ module Polars
+   module IO
+     # Read a CSV file into a DataFrame.
+     #
+     # @param source [Object]
+     #   Path to a file or a file-like object.
+     # @param has_header [Boolean]
+     #   Indicate if the first row of the dataset is a header or not.
+     #   If set to false, column names will be autogenerated in the
+     #   following format: `column_x`, with `x` being an
+     #   enumeration over every column in the dataset starting at 1.
+     # @param columns [Object]
+     #   Columns to select. Accepts a list of column indices (starting
+     #   at zero) or a list of column names.
+     # @param new_columns [Object]
+     #   Rename columns right after parsing the CSV file. If the given
+     #   list is shorter than the width of the DataFrame, the remaining
+     #   columns will have their original name.
+     # @param sep [String]
+     #   Single byte character to use as delimiter in the file.
+     # @param comment_char [String]
+     #   Single byte character that indicates the start of a comment line,
+     #   for instance `#`.
+     # @param quote_char [String]
+     #   Single byte character used for CSV quoting.
+     #   Set to nil to turn off special handling and escaping of quotes.
+     # @param skip_rows [Integer]
+     #   Start reading after `skip_rows` lines.
+     # @param dtypes [Object]
+     #   Overwrite dtypes during inference.
+     # @param null_values [Object]
+     #   Values to interpret as null values. You can provide a:
+     #
+     #   - `String`: All values equal to this string will be null.
+     #   - `Array`: All values equal to any string in this array will be null.
+     #   - `Hash`: A hash that maps column name to a null value string.
+     # @param ignore_errors [Boolean]
+     #   Try to keep reading lines if some lines yield errors.
+     #   First try `infer_schema_length: 0` to read all columns as
+     #   `:str` to check which values might cause an issue.
+     # @param parse_dates [Boolean]
+     #   Try to automatically parse dates. If this does not succeed,
+     #   the column remains of data type `:str`.
+     # @param n_threads [Integer]
+     #   Number of threads to use in CSV parsing.
+     #   Defaults to the number of physical CPUs of your system.
+     # @param infer_schema_length [Integer]
+     #   Maximum number of lines to read to infer schema.
+     #   If set to 0, all columns will be read as `:utf8`.
+     #   If set to `nil`, a full table scan will be done (slow).
+     # @param batch_size [Integer]
+     #   Number of lines to read into the buffer at once.
+     #   Modify this to change performance.
+     # @param n_rows [Integer]
+     #   Stop reading from the CSV file after reading `n_rows`.
+     #   During multi-threaded parsing, an upper bound of `n_rows`
+     #   rows cannot be guaranteed.
+     # @param encoding ["utf8", "utf8-lossy"]
+     #   Lossy means that invalid UTF-8 values are replaced with `�`
+     #   characters. When using encodings other than `utf8` or
+     #   `utf8-lossy`, the input is first decoded in memory with
+     #   Ruby.
+     # @param low_memory [Boolean]
+     #   Reduce memory usage at the expense of performance.
+     # @param rechunk [Boolean]
+     #   Make sure that all columns are contiguous in memory by
+     #   aggregating the chunks into a single array.
+     # @param storage_options [Hash]
+     #   Extra options that make sense for a
+     #   particular storage connection.
+     # @param skip_rows_after_header [Integer]
+     #   Skip this number of rows when the header is parsed.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into
+     #   the DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     # @param sample_size [Integer]
+     #   Set the sample size. This is used to sample statistics to estimate the
+     #   allocation needed.
+     # @param eol_char [String]
+     #   Single byte end of line character.
+     # @param truncate_ragged_lines [Boolean]
+     #   Truncate lines that are longer than the schema.
+     #
+     # @return [DataFrame]
+     #
+     # @note
+     #   This operation defaults to a `rechunk` operation at the end, meaning that
+     #   all data will be stored contiguously in memory.
+     #   Set `rechunk: false` if you are benchmarking the CSV reader. A `rechunk` is
+     #   an expensive operation.
+     def read_csv(
+       source,
+       has_header: true,
+       columns: nil,
+       new_columns: nil,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       null_values: nil,
+       ignore_errors: false,
+       parse_dates: false,
+       n_threads: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       batch_size: 8192,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       storage_options: nil,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       sample_size: 1024,
+       eol_char: "\n",
+       truncate_ragged_lines: false
+     )
+       Utils._check_arg_is_1byte("sep", sep, false)
+       Utils._check_arg_is_1byte("comment_char", comment_char, false)
+       Utils._check_arg_is_1byte("quote_char", quote_char, true)
+       Utils._check_arg_is_1byte("eol_char", eol_char, false)
+
+       projection, columns = Utils.handle_projection_columns(columns)
+
+       storage_options ||= {}
+
+       if columns && !has_header
+         columns.each do |column|
+           if !column.start_with?("column_")
+             raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+           end
+         end
+       end
+
+       if projection || new_columns
+         raise Todo
+       end
+
+       df = nil
+       _prepare_file_arg(source) do |data|
+         df = _read_csv_impl(
+           data,
+           has_header: has_header,
+           columns: columns || projection,
+           sep: sep,
+           comment_char: comment_char,
+           quote_char: quote_char,
+           skip_rows: skip_rows,
+           dtypes: dtypes,
+           null_values: null_values,
+           ignore_errors: ignore_errors,
+           parse_dates: parse_dates,
+           n_threads: n_threads,
+           infer_schema_length: infer_schema_length,
+           batch_size: batch_size,
+           n_rows: n_rows,
+           encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+           low_memory: low_memory,
+           rechunk: rechunk,
+           skip_rows_after_header: skip_rows_after_header,
+           row_count_name: row_count_name,
+           row_count_offset: row_count_offset,
+           sample_size: sample_size,
+           eol_char: eol_char,
+           truncate_ragged_lines: truncate_ragged_lines
+         )
+       end
+
+       if new_columns
+         Utils._update_columns(df, new_columns)
+       else
+         df
+       end
+     end
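For orientation, a minimal usage sketch of the eager reader above (the `data.csv` path and its column names are hypothetical, not part of this release):

  require "polars-df"

  # Load a CSV eagerly, pinning one dtype, treating "NA" as null,
  # and letting date columns be parsed automatically.
  df = Polars.read_csv(
    "data.csv",
    dtypes: {"id" => Polars::Int64},
    null_values: "NA",
    parse_dates: true
  )
  puts df.shape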
+
+     # @private
+     def _read_csv_impl(
+       file,
+       has_header: true,
+       columns: nil,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       schema: nil,
+       null_values: nil,
+       missing_utf8_is_empty_string: false,
+       ignore_errors: false,
+       parse_dates: false,
+       n_threads: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       batch_size: 8192,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       sample_size: 1024,
+       eol_char: "\n",
+       raise_if_empty: true,
+       truncate_ragged_lines: false,
+       decimal_comma: false,
+       glob: true
+     )
+       if Utils.pathlike?(file)
+         path = Utils.normalize_filepath(file)
+       else
+         path = nil
+         # if defined?(StringIO) && file.is_a?(StringIO)
+         #   file = file.string
+         # end
+       end
+
+       dtype_list = nil
+       dtype_slice = nil
+       if !dtypes.nil?
+         if dtypes.is_a?(Hash)
+           dtype_list = []
+           dtypes.each do |k, v|
+             dtype_list << [k, Utils.rb_type_to_dtype(v)]
+           end
+         elsif dtypes.is_a?(::Array)
+           dtype_slice = dtypes
+         else
+           raise ArgumentError, "dtype arg should be list or dict"
+         end
+       end
+
+       processed_null_values = Utils._process_null_values(null_values)
+
+       if columns.is_a?(::String)
+         columns = [columns]
+       end
+       if file.is_a?(::String) && file.include?("*")
+         dtypes_dict = nil
+         if !dtype_list.nil?
+           dtypes_dict = dtype_list.to_h
+         end
+         if !dtype_slice.nil?
+           raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]]"
+         end
+         scan = scan_csv(
+           file,
+           has_header: has_header,
+           sep: sep,
+           comment_char: comment_char,
+           quote_char: quote_char,
+           skip_rows: skip_rows,
+           dtypes: dtypes_dict,
+           null_values: null_values,
+           missing_utf8_is_empty_string: missing_utf8_is_empty_string,
+           ignore_errors: ignore_errors,
+           infer_schema_length: infer_schema_length,
+           n_rows: n_rows,
+           low_memory: low_memory,
+           rechunk: rechunk,
+           skip_rows_after_header: skip_rows_after_header,
+           row_count_name: row_count_name,
+           row_count_offset: row_count_offset,
+           eol_char: eol_char,
+           truncate_ragged_lines: truncate_ragged_lines,
+           decimal_comma: decimal_comma,
+           glob: glob
+         )
+         if columns.nil?
+           return scan.collect
+         elsif is_str_sequence(columns, allow_str: false)
+           return scan.select(columns).collect
+         else
+           raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
+         end
+       end
+
+       projection, columns = Utils.handle_projection_columns(columns)
+
+       rbdf =
+         RbDataFrame.read_csv(
+           file,
+           infer_schema_length,
+           batch_size,
+           has_header,
+           ignore_errors,
+           n_rows,
+           skip_rows,
+           projection,
+           sep,
+           rechunk,
+           columns,
+           encoding,
+           n_threads,
+           path,
+           dtype_list,
+           dtype_slice,
+           low_memory,
+           comment_char,
+           quote_char,
+           processed_null_values,
+           missing_utf8_is_empty_string,
+           parse_dates,
+           skip_rows_after_header,
+           Utils.parse_row_index_args(row_count_name, row_count_offset),
+           sample_size,
+           eol_char,
+           raise_if_empty,
+           truncate_ragged_lines,
+           decimal_comma,
+           schema
+         )
+       Utils.wrap_df(rbdf)
+     end
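As the branching above shows, `dtypes` may arrive as a Hash (named) or an Array (positional); a sketch of both forms, with hypothetical file and column names:

  # Named: a Hash mapping column name to dtype. This is also the only
  # form allowed with glob sources, which dispatch to scan_csv.
  Polars.read_csv("data.csv", dtypes: {"a" => Polars::Int64})

  # Positional: an Array applied to columns left to right.
  Polars.read_csv("data.csv", dtypes: [Polars::Int64, Polars::Float64])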
+
+     # Read a CSV file in batches.
+     #
+     # Upon creation of the `BatchedCsvReader`,
+     # Polars will gather statistics and determine the
+     # file chunks. After that, work will only be done
+     # if `next_batches` is called.
+     #
+     # @param source [Object]
+     #   Path to a file or a file-like object.
+     # @param has_header [Boolean]
+     #   Indicate if the first row of the dataset is a header or not.
+     #   If set to false, column names will be autogenerated in the
+     #   following format: `column_x`, with `x` being an
+     #   enumeration over every column in the dataset starting at 1.
+     # @param columns [Object]
+     #   Columns to select. Accepts a list of column indices (starting
+     #   at zero) or a list of column names.
+     # @param new_columns [Object]
+     #   Rename columns right after parsing the CSV file. If the given
+     #   list is shorter than the width of the DataFrame, the remaining
+     #   columns will have their original name.
+     # @param sep [String]
+     #   Single byte character to use as delimiter in the file.
+     # @param comment_char [String]
+     #   Single byte character that indicates the start of a comment line,
+     #   for instance `#`.
+     # @param quote_char [String]
+     #   Single byte character used for CSV quoting, default = `"`.
+     #   Set to nil to turn off special handling and escaping of quotes.
+     # @param skip_rows [Integer]
+     #   Start reading after `skip_rows` lines.
+     # @param dtypes [Object]
+     #   Overwrite dtypes during inference.
+     # @param null_values [Object]
+     #   Values to interpret as null values. You can provide a:
+     #
+     #   - `String`: All values equal to this string will be null.
+     #   - `Array`: All values equal to any string in this array will be null.
+     #   - `Hash`: A hash that maps column name to a null value string.
+     # @param ignore_errors [Boolean]
+     #   Try to keep reading lines if some lines yield errors.
+     #   First try `infer_schema_length: 0` to read all columns as
+     #   `:str` to check which values might cause an issue.
+     # @param parse_dates [Boolean]
+     #   Try to automatically parse dates. If this does not succeed,
+     #   the column remains of data type `:str`.
+     # @param n_threads [Integer]
+     #   Number of threads to use in CSV parsing.
+     #   Defaults to the number of physical CPUs of your system.
+     # @param infer_schema_length [Integer]
+     #   Maximum number of lines to read to infer schema.
+     #   If set to 0, all columns will be read as `:str`.
+     #   If set to `nil`, a full table scan will be done (slow).
+     # @param batch_size [Integer]
+     #   Number of lines to read into the buffer at once.
+     #   Modify this to change performance.
+     # @param n_rows [Integer]
+     #   Stop reading from the CSV file after reading `n_rows`.
+     #   During multi-threaded parsing, an upper bound of `n_rows`
+     #   rows cannot be guaranteed.
+     # @param encoding ["utf8", "utf8-lossy"]
+     #   Lossy means that invalid UTF-8 values are replaced with `�`
+     #   characters. When using encodings other than `utf8` or
+     #   `utf8-lossy`, the input is first decoded in memory with
+     #   Ruby. Defaults to `utf8`.
+     # @param low_memory [Boolean]
+     #   Reduce memory usage at the expense of performance.
+     # @param rechunk [Boolean]
+     #   Make sure that all columns are contiguous in memory by
+     #   aggregating the chunks into a single array.
+     # @param skip_rows_after_header [Integer]
+     #   Skip this number of rows when the header is parsed.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into
+     #   the DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     # @param sample_size [Integer]
+     #   Set the sample size. This is used to sample statistics to estimate the
+     #   allocation needed.
+     # @param eol_char [String]
+     #   Single byte end of line character.
+     # @param truncate_ragged_lines [Boolean]
+     #   Truncate lines that are longer than the schema.
+     #
+     # @return [BatchedCsvReader]
+     #
+     # @example
+     #   reader = Polars.read_csv_batched(
+     #     "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
+     #   )
+     #   reader.next_batches(5)
+     def read_csv_batched(
+       source,
+       has_header: true,
+       columns: nil,
+       new_columns: nil,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       null_values: nil,
+       missing_utf8_is_empty_string: false,
+       ignore_errors: false,
+       parse_dates: false,
+       n_threads: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       batch_size: 50_000,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       sample_size: 1024,
+       eol_char: "\n",
+       raise_if_empty: true,
+       truncate_ragged_lines: false,
+       decimal_comma: false
+     )
+       projection, columns = Utils.handle_projection_columns(columns)
+
+       if columns && !has_header
+         columns.each do |column|
+           if !column.start_with?("column_")
+             raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+           end
+         end
+       end
+
+       if projection || new_columns
+         raise Todo
+       end
+
+       BatchedCsvReader.new(
+         source,
+         has_header: has_header,
+         columns: columns || projection,
+         sep: sep,
+         comment_char: comment_char,
+         quote_char: quote_char,
+         skip_rows: skip_rows,
+         dtypes: dtypes,
+         null_values: null_values,
+         missing_utf8_is_empty_string: missing_utf8_is_empty_string,
+         ignore_errors: ignore_errors,
+         parse_dates: parse_dates,
+         n_threads: n_threads,
+         infer_schema_length: infer_schema_length,
+         batch_size: batch_size,
+         n_rows: n_rows,
+         encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+         low_memory: low_memory,
+         rechunk: rechunk,
+         skip_rows_after_header: skip_rows_after_header,
+         row_count_name: row_count_name,
+         row_count_offset: row_count_offset,
+         sample_size: sample_size,
+         eol_char: eol_char,
+         new_columns: new_columns,
+         raise_if_empty: raise_if_empty,
+         truncate_ragged_lines: truncate_ragged_lines,
+         decimal_comma: decimal_comma
+       )
+     end
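A hedged sketch of draining the returned reader; this assumes `next_batches` returns nil once the file is exhausted, mirroring the Python API (the file path is hypothetical):

  reader = Polars.read_csv_batched("lineitem.tbl", sep: "|")
  # Pull up to 5 DataFrame chunks at a time until the reader runs dry.
  while (batches = reader.next_batches(5))
    batches.each { |df| puts df.shape }
  end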
+
+     # Lazily read from a CSV file or multiple files via glob patterns.
+     #
+     # This allows the query optimizer to push down predicates and
+     # projections to the scan level, thereby potentially reducing
+     # memory overhead.
+     #
+     # @param source [Object]
+     #   Path to a file.
+     # @param has_header [Boolean]
+     #   Indicate if the first row of the dataset is a header or not.
+     #   If set to false, column names will be autogenerated in the
+     #   following format: `column_x`, with `x` being an
+     #   enumeration over every column in the dataset starting at 1.
+     # @param sep [String]
+     #   Single byte character to use as delimiter in the file.
+     # @param comment_char [String]
+     #   Single byte character that indicates the start of a comment line,
+     #   for instance `#`.
+     # @param quote_char [String]
+     #   Single byte character used for CSV quoting.
+     #   Set to nil to turn off special handling and escaping of quotes.
+     # @param skip_rows [Integer]
+     #   Start reading after `skip_rows` lines. The header will be parsed at this
+     #   offset.
+     # @param dtypes [Object]
+     #   Overwrite dtypes during inference.
+     # @param null_values [Object]
+     #   Values to interpret as null values. You can provide a:
+     #
+     #   - `String`: All values equal to this string will be null.
+     #   - `Array`: All values equal to any string in this array will be null.
+     #   - `Hash`: A hash that maps column name to a null value string.
+     # @param ignore_errors [Boolean]
+     #   Try to keep reading lines if some lines yield errors.
+     #   First try `infer_schema_length: 0` to read all columns as
+     #   `:str` to check which values might cause an issue.
+     # @param cache [Boolean]
+     #   Cache the result after reading.
+     # @param with_column_names [Object]
+     #   Apply a function over the column names.
+     #   This can be used to update a schema just in time, thus before
+     #   scanning.
+     # @param infer_schema_length [Integer]
+     #   Maximum number of lines to read to infer schema.
+     #   If set to 0, all columns will be read as `:str`.
+     #   If set to `nil`, a full table scan will be done (slow).
+     # @param n_rows [Integer]
+     #   Stop reading from the CSV file after reading `n_rows`.
+     # @param encoding ["utf8", "utf8-lossy"]
+     #   Lossy means that invalid UTF-8 values are replaced with `�`
+     #   characters.
+     # @param low_memory [Boolean]
+     #   Reduce memory usage at the expense of performance.
+     # @param rechunk [Boolean]
+     #   Reallocate to contiguous memory when all chunks/files are parsed.
+     # @param skip_rows_after_header [Integer]
+     #   Skip this number of rows when the header is parsed.
+     # @param row_count_name [String]
+     #   If not nil, this will insert a row count column with the given name into
+     #   the DataFrame.
+     # @param row_count_offset [Integer]
+     #   Offset to start the row_count column (only used if the name is set).
+     # @param parse_dates [Boolean]
+     #   Try to automatically parse dates. If this does not succeed,
+     #   the column remains of data type `:str`.
+     # @param eol_char [String]
+     #   Single byte end of line character.
+     # @param truncate_ragged_lines [Boolean]
+     #   Truncate lines that are longer than the schema.
+     #
+     # @return [LazyFrame]
+     def scan_csv(
+       source,
+       has_header: true,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       null_values: nil,
+       missing_utf8_is_empty_string: false,
+       ignore_errors: false,
+       cache: true,
+       with_column_names: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       parse_dates: false,
+       eol_char: "\n",
+       raise_if_empty: true,
+       truncate_ragged_lines: false,
+       decimal_comma: false,
+       glob: true
+     )
+       Utils._check_arg_is_1byte("sep", sep, false)
+       Utils._check_arg_is_1byte("comment_char", comment_char, false)
+       Utils._check_arg_is_1byte("quote_char", quote_char, true)
+
+       if Utils.pathlike?(source)
+         source = Utils.normalize_filepath(source)
+       end
+
+       _scan_csv_impl(
+         source,
+         has_header: has_header,
+         sep: sep,
+         comment_char: comment_char,
+         quote_char: quote_char,
+         skip_rows: skip_rows,
+         dtypes: dtypes,
+         null_values: null_values,
+         ignore_errors: ignore_errors,
+         cache: cache,
+         with_column_names: with_column_names,
+         infer_schema_length: infer_schema_length,
+         n_rows: n_rows,
+         low_memory: low_memory,
+         rechunk: rechunk,
+         skip_rows_after_header: skip_rows_after_header,
+         encoding: encoding,
+         row_count_name: row_count_name,
+         row_count_offset: row_count_offset,
+         parse_dates: parse_dates,
+         eol_char: eol_char,
+         truncate_ragged_lines: truncate_ragged_lines
+       )
+     end
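A sketch of the pushdown benefit described above: the filter and the two-column projection are pushed down to the scan, so non-matching rows and unused columns are never materialized (the glob path and column names are hypothetical):

  lf = Polars.scan_csv("logs/*.csv")
  df =
    lf
      .filter(Polars.col("status") == "ok")
      .select([Polars.col("ts"), Polars.col("status")])
      .collect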
+
+     # @private
+     def _scan_csv_impl(
+       file,
+       has_header: true,
+       sep: ",",
+       comment_char: nil,
+       quote_char: '"',
+       skip_rows: 0,
+       dtypes: nil,
+       null_values: nil,
+       ignore_errors: false,
+       cache: true,
+       with_column_names: nil,
+       infer_schema_length: N_INFER_DEFAULT,
+       n_rows: nil,
+       encoding: "utf8",
+       low_memory: false,
+       rechunk: true,
+       skip_rows_after_header: 0,
+       row_count_name: nil,
+       row_count_offset: 0,
+       parse_dates: false,
+       eol_char: "\n",
+       truncate_ragged_lines: true
+     )
+       dtype_list = nil
+       if !dtypes.nil?
+         dtype_list = []
+         dtypes.each do |k, v|
+           dtype_list << [k, Utils.rb_type_to_dtype(v)]
+         end
+       end
+       processed_null_values = Utils._process_null_values(null_values)
+
+       rblf =
+         RbLazyFrame.new_from_csv(
+           file,
+           sep,
+           has_header,
+           ignore_errors,
+           skip_rows,
+           n_rows,
+           cache,
+           dtype_list,
+           low_memory,
+           comment_char,
+           quote_char,
+           processed_null_values,
+           infer_schema_length,
+           with_column_names,
+           rechunk,
+           skip_rows_after_header,
+           encoding,
+           Utils.parse_row_index_args(row_count_name, row_count_offset),
+           parse_dates,
+           eol_char,
+           truncate_ragged_lines
+         )
+       Utils.wrap_ldf(rblf)
+     end
+
+     private
+
+     def _prepare_file_arg(file)
+       if file.is_a?(::String) && file =~ /\Ahttps?:\/\//
+         raise ArgumentError, "use URI(...) for remote files"
+       end
+
+       if defined?(URI) && file.is_a?(URI)
+         require "open-uri"
+
+         file = file.open
+       end
+
+       yield file
+     end
+   end
+ end
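Since `_prepare_file_arg` rejects bare `http(s)` strings and opens `URI` objects via open-uri, a remote read looks like the following sketch (the URL is hypothetical):

  require "uri"

  # A plain "https://..." String raises ArgumentError here;
  # a URI object is opened with open-uri and then parsed.
  df = Polars.read_csv(URI("https://example.com/data.csv"))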