polars-df 0.13.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +208 -0
- data/Cargo.lock +2556 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +39278 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +104 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +36 -0
- data/lib/polars/cat_name_space.rb +88 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +98 -0
- data/lib/polars/data_frame.rb +5191 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1397 -0
- data/lib/polars/date_time_name_space.rb +1287 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +38 -0
- data/lib/polars/expr.rb +7256 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +271 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1329 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +136 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +613 -0
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/io/csv.rb +696 -0
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +275 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +233 -0
- data/lib/polars/lazy_frame.rb +2708 -0
- data/lib/polars/lazy_group_by.rb +181 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +449 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +4444 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1495 -0
- data/lib/polars/string_name_space.rb +811 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +130 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +91 -0
- metadata +138 -0
@@ -0,0 +1,696 @@
|
|
1
|
+
module Polars
|
2
|
+
module IO
|
3
|
+
# Read a CSV file into a DataFrame.
|
4
|
+
#
|
5
|
+
# @param source [Object]
|
6
|
+
# Path to a file or a file-like object.
|
7
|
+
# @param has_header [Boolean]
|
8
|
+
# Indicate if the first row of dataset is a header or not.
|
9
|
+
# If set to false, column names will be autogenerated in the
|
10
|
+
# following format: `column_x`, with `x` being an
|
11
|
+
# enumeration over every column in the dataset starting at 1.
|
12
|
+
# @param columns [Object]
|
13
|
+
# Columns to select. Accepts a list of column indices (starting
|
14
|
+
# at zero) or a list of column names.
|
15
|
+
# @param new_columns [Object]
|
16
|
+
# Rename columns right after parsing the CSV file. If the given
|
17
|
+
# list is shorter than the width of the DataFrame the remaining
|
18
|
+
# columns will have their original name.
|
19
|
+
# @param sep [String]
|
20
|
+
# Single byte character to use as delimiter in the file.
|
21
|
+
# @param comment_char [String]
|
22
|
+
# Single byte character that indicates the start of a comment line,
|
23
|
+
# for instance `#`.
|
24
|
+
# @param quote_char [String]
|
25
|
+
# Single byte character used for csv quoting.
|
26
|
+
# Set to nil to turn off special handling and escaping of quotes.
|
27
|
+
# @param skip_rows [Integer]
|
28
|
+
# Start reading after `skip_rows` lines.
|
29
|
+
# @param dtypes [Object]
|
30
|
+
# Overwrite dtypes during inference.
|
31
|
+
# @param null_values [Object]
|
32
|
+
# Values to interpret as null values. You can provide a:
|
33
|
+
#
|
34
|
+
# - `String`: All values equal to this string will be null.
|
35
|
+
# - `Array`: All values equal to any string in this array will be null.
|
36
|
+
# - `Hash`: A hash that maps column name to a null value string.
|
37
|
+
# @param ignore_errors [Boolean]
|
38
|
+
# Try to keep reading lines if some lines yield errors.
|
39
|
+
# First try `infer_schema_length: 0` to read all columns as
|
40
|
+
# `:str` to check which values might cause an issue.
|
41
|
+
# @param parse_dates [Boolean]
|
42
|
+
# Try to automatically parse dates. If this does not succeed,
|
43
|
+
# the column remains of data type `:str`.
|
44
|
+
# @param n_threads [Integer]
|
45
|
+
# Number of threads to use in csv parsing.
|
46
|
+
# Defaults to the number of physical cpu's of your system.
|
47
|
+
# @param infer_schema_length [Integer]
|
48
|
+
# Maximum number of lines to read to infer schema.
|
49
|
+
# If set to 0, all columns will be read as `:str`.
|
50
|
+
# If set to `nil`, a full table scan will be done (slow).
|
51
|
+
# @param batch_size [Integer]
|
52
|
+
# Number of lines to read into the buffer at once.
|
53
|
+
# Modify this to change performance.
|
54
|
+
# @param n_rows [Integer]
|
55
|
+
# Stop reading from CSV file after reading `n_rows`.
|
56
|
+
# During multi-threaded parsing, an upper bound of `n_rows`
|
57
|
+
# rows cannot be guaranteed.
|
58
|
+
# @param encoding ["utf8", "utf8-lossy"]
|
59
|
+
# Lossy means that invalid utf8 values are replaced with `�`
|
60
|
+
# characters. When using other encodings than `utf8` or
|
61
|
+
# `utf8-lossy`, the input is first decoded in memory with
|
62
|
+
# Ruby.
|
63
|
+
# @param low_memory [Boolean]
|
64
|
+
# Reduce memory usage at expense of performance.
|
65
|
+
# @param rechunk [Boolean]
|
66
|
+
# Make sure that all columns are contiguous in memory by
|
67
|
+
# aggregating the chunks into a single array.
|
68
|
+
# @param storage_options [Hash]
|
69
|
+
# Extra options that make sense for a
|
70
|
+
# particular storage connection.
|
71
|
+
# @param skip_rows_after_header [Integer]
|
72
|
+
# Skip this number of rows when the header is parsed.
|
73
|
+
# @param row_count_name [String]
|
74
|
+
# If not nil, this will insert a row count column with the given name into
|
75
|
+
# the DataFrame.
|
76
|
+
# @param row_count_offset [Integer]
|
77
|
+
# Offset to start the row_count column (only used if the name is set).
|
78
|
+
# @param sample_size [Integer]
|
79
|
+
# Set the sample size. This is used to sample statistics to estimate the
|
80
|
+
# allocation needed.
|
81
|
+
# @param eol_char [String]
|
82
|
+
# Single byte end of line character.
|
83
|
+
# @param truncate_ragged_lines [Boolean]
|
84
|
+
# Truncate lines that are longer than the schema.
|
85
|
+
#
|
86
|
+
# @return [DataFrame]
|
87
|
+
#
|
88
|
+
# @note
|
89
|
+
# This operation defaults to a `rechunk` operation at the end, meaning that
|
90
|
+
# all data will be stored contiguously in memory.
|
91
|
+
# Set `rechunk: false` if you are benchmarking the csv-reader. A `rechunk` is
|
92
|
+
# an expensive operation.
|
93
|
+
def read_csv(
  source,
  has_header: true,
  columns: nil,
  new_columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
  infer_schema_length: N_INFER_DEFAULT,
  batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  storage_options: nil,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n",
  truncate_ragged_lines: false
)
  # Validate the single-character options up front, in a fixed order.
  # Only quote_char is checked with the third argument set to true.
  [
    ["sep", sep, false],
    ["comment_char", comment_char, false],
    ["quote_char", quote_char, true],
    ["eol_char", eol_char, false]
  ].each do |label, value, flag|
    Utils._check_arg_is_1byte(label, value, flag)
  end

  projection, columns = Utils.handle_projection_columns(columns)

  # NOTE(review): storage_options is part of the signature but is never
  # read by this method.

  # Without a header row, the only names that can be selected are the
  # autogenerated `column_x` ones.
  if columns && !has_header && columns.any? { |name| !name.start_with?("column_") }
    raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
  end

  # A non-nil projection or a rename request is not supported yet.
  raise Todo if projection || new_columns

  # Anything other than "utf8-lossy" is handed to the reader as "utf8".
  reader_encoding = encoding == "utf8-lossy" ? encoding : "utf8"

  df = nil
  _prepare_file_arg(source) do |data|
    df = _read_csv_impl(
      data,
      has_header: has_header,
      columns: columns || projection,
      sep: sep,
      comment_char: comment_char,
      quote_char: quote_char,
      skip_rows: skip_rows,
      dtypes: dtypes,
      null_values: null_values,
      ignore_errors: ignore_errors,
      parse_dates: parse_dates,
      n_threads: n_threads,
      infer_schema_length: infer_schema_length,
      batch_size: batch_size,
      n_rows: n_rows,
      encoding: reader_encoding,
      low_memory: low_memory,
      rechunk: rechunk,
      skip_rows_after_header: skip_rows_after_header,
      row_count_name: row_count_name,
      row_count_offset: row_count_offset,
      sample_size: sample_size,
      eol_char: eol_char,
      truncate_ragged_lines: truncate_ragged_lines
    )
  end

  # NOTE(review): new_columns is rejected above, so the rename branch is
  # currently unreachable; it is kept for when renaming is implemented.
  new_columns ? Utils._update_columns(df, new_columns) : df
end
|
178
|
+
|
179
|
+
# @private
|
180
|
+
def _read_csv_impl(
  file,
  has_header: true,
  columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  schema: nil,
  null_values: nil,
  missing_utf8_is_empty_string: false,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
  infer_schema_length: N_INFER_DEFAULT,
  batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n",
  raise_if_empty: true,
  truncate_ragged_lines: false,
  decimal_comma: false,
  glob: true
)
  # Eager CSV reader behind `read_csv`.
  #
  # A String source containing "*" is delegated to `scan_csv` and collected;
  # every other source goes straight to the native `RbDataFrame.read_csv`.
  #
  # Raises ArgumentError when `dtypes` is neither a Hash nor an Array, and,
  # for glob sources, when `dtypes` is an Array or `columns` is not an
  # Array of Strings.

  # Sources that Utils.pathlike? accepts get a normalized path; anything
  # else is passed to the native reader with a nil path.
  path = Utils.pathlike?(file) ? Utils.normalize_filepath(file) : nil

  # `dtypes` is either keyed by column name (Hash -> dtype_list of pairs)
  # or positional (Array -> dtype_slice, passed through untouched).
  dtype_list = nil
  dtype_slice = nil
  case dtypes
  when nil
    # no overrides requested
  when Hash
    dtype_list = dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
  when ::Array
    dtype_slice = dtypes
  else
    raise ArgumentError, "dtype arg should be list or dict"
  end

  processed_null_values = Utils._process_null_values(null_values)

  # A single column name is treated as a one-element selection.
  columns = [columns] if columns.is_a?(::String)

  if file.is_a?(::String) && file.include?("*")
    unless dtype_slice.nil?
      raise ArgumentError, "cannot use glob patterns and unnamed dtypes as `dtypes` argument; Use dtypes: Mapping[str, Type[DataType]"
    end

    # parse_dates, encoding and raise_if_empty are forwarded so that a glob
    # source honours the same options as a single-file read.
    scan = scan_csv(
      file,
      has_header: has_header,
      sep: sep,
      comment_char: comment_char,
      quote_char: quote_char,
      skip_rows: skip_rows,
      dtypes: dtype_list&.to_h,
      null_values: null_values,
      missing_utf8_is_empty_string: missing_utf8_is_empty_string,
      ignore_errors: ignore_errors,
      infer_schema_length: infer_schema_length,
      n_rows: n_rows,
      encoding: encoding,
      low_memory: low_memory,
      rechunk: rechunk,
      skip_rows_after_header: skip_rows_after_header,
      row_count_name: row_count_name,
      row_count_offset: row_count_offset,
      parse_dates: parse_dates,
      eol_char: eol_char,
      raise_if_empty: raise_if_empty,
      truncate_ragged_lines: truncate_ragged_lines,
      decimal_comma: decimal_comma,
      glob: glob
    )

    return scan.collect if columns.nil?

    # Only name-based selection is supported on the lazy path: `columns`
    # must be an Array of Strings (a lone String was wrapped above).
    unless columns.is_a?(::Array) && columns.all? { |name| name.is_a?(::String) }
      raise ArgumentError, "cannot use glob patterns and integer based projection as `columns` argument; Use columns: List[str]"
    end

    return scan.select(columns).collect
  end

  projection, columns = Utils.handle_projection_columns(columns)

  # The native reader takes positional arguments; the order below must not
  # be changed.
  rbdf =
    RbDataFrame.read_csv(
      file,
      infer_schema_length,
      batch_size,
      has_header,
      ignore_errors,
      n_rows,
      skip_rows,
      projection,
      sep,
      rechunk,
      columns,
      encoding,
      n_threads,
      path,
      dtype_list,
      dtype_slice,
      low_memory,
      comment_char,
      quote_char,
      processed_null_values,
      missing_utf8_is_empty_string,
      parse_dates,
      skip_rows_after_header,
      Utils.parse_row_index_args(row_count_name, row_count_offset),
      sample_size,
      eol_char,
      raise_if_empty,
      truncate_ragged_lines,
      decimal_comma,
      schema
    )
  Utils.wrap_df(rbdf)
end
|
317
|
+
|
318
|
+
# Read a CSV file in batches.
|
319
|
+
#
|
320
|
+
# Upon creation of the `BatchedCsvReader`,
|
321
|
+
# polars will gather statistics and determine the
|
322
|
+
# file chunks. After that work will only be done
|
323
|
+
# if `next_batches` is called.
|
324
|
+
#
|
325
|
+
# @param source [Object]
|
326
|
+
# Path to a file or a file-like object.
|
327
|
+
# @param has_header [Boolean]
|
328
|
+
# Indicate if the first row of dataset is a header or not.
|
329
|
+
# If set to false, column names will be autogenerated in the
|
330
|
+
# following format: `column_x`, with `x` being an
|
331
|
+
# enumeration over every column in the dataset starting at 1.
|
332
|
+
# @param columns [Object]
|
333
|
+
# Columns to select. Accepts a list of column indices (starting
|
334
|
+
# at zero) or a list of column names.
|
335
|
+
# @param new_columns [Object]
|
336
|
+
# Rename columns right after parsing the CSV file. If the given
|
337
|
+
# list is shorter than the width of the DataFrame the remaining
|
338
|
+
# columns will have their original name.
|
339
|
+
# @param sep [String]
|
340
|
+
# Single byte character to use as delimiter in the file.
|
341
|
+
# @param comment_char [String]
|
342
|
+
# Single byte character that indicates the start of a comment line,
|
343
|
+
# for instance `#`.
|
344
|
+
# @param quote_char [String]
|
345
|
+
# Single byte character used for csv quoting, default = `"`.
|
346
|
+
# Set to nil to turn off special handling and escaping of quotes.
|
347
|
+
# @param skip_rows [Integer]
|
348
|
+
# Start reading after `skip_rows` lines.
|
349
|
+
# @param dtypes [Object]
|
350
|
+
# Overwrite dtypes during inference.
|
351
|
+
# @param null_values [Object]
|
352
|
+
# Values to interpret as null values. You can provide a:
|
353
|
+
#
|
354
|
+
# - `String`: All values equal to this string will be null.
|
355
|
+
# - `Array`: All values equal to any string in this array will be null.
|
356
|
+
# - `Hash`: A hash that maps column name to a null value string.
|
357
|
+
# @param ignore_errors [Boolean]
|
358
|
+
# Try to keep reading lines if some lines yield errors.
|
359
|
+
# First try `infer_schema_length: 0` to read all columns as
|
360
|
+
# `:str` to check which values might cause an issue.
|
361
|
+
# @param parse_dates [Boolean]
|
362
|
+
# Try to automatically parse dates. If this does not succeed,
|
363
|
+
# the column remains of data type `:str`.
|
364
|
+
# @param n_threads [Integer]
|
365
|
+
# Number of threads to use in csv parsing.
|
366
|
+
# Defaults to the number of physical cpu's of your system.
|
367
|
+
# @param infer_schema_length [Integer]
|
368
|
+
# Maximum number of lines to read to infer schema.
|
369
|
+
# If set to 0, all columns will be read as `:str`.
|
370
|
+
# If set to `nil`, a full table scan will be done (slow).
|
371
|
+
# @param batch_size [Integer]
|
372
|
+
# Number of lines to read into the buffer at once.
|
373
|
+
# Modify this to change performance.
|
374
|
+
# @param n_rows [Integer]
|
375
|
+
# Stop reading from CSV file after reading `n_rows`.
|
376
|
+
# During multi-threaded parsing, an upper bound of `n_rows`
|
377
|
+
# rows cannot be guaranteed.
|
378
|
+
# @param encoding ["utf8", "utf8-lossy"]
|
379
|
+
# Lossy means that invalid utf8 values are replaced with `�`
|
380
|
+
# characters. When using other encodings than `utf8` or
|
381
|
+
# `utf8-lossy`, the input is first decoded in memory with
|
382
|
+
# Ruby. Defaults to `utf8`.
|
383
|
+
# @param low_memory [Boolean]
|
384
|
+
# Reduce memory usage at expense of performance.
|
385
|
+
# @param rechunk [Boolean]
|
386
|
+
# Make sure that all columns are contiguous in memory by
|
387
|
+
# aggregating the chunks into a single array.
|
388
|
+
# @param skip_rows_after_header [Integer]
|
389
|
+
# Skip this number of rows when the header is parsed.
|
390
|
+
# @param row_count_name [String]
|
391
|
+
# If not nil, this will insert a row count column with the given name into
|
392
|
+
# the DataFrame.
|
393
|
+
# @param row_count_offset [Integer]
|
394
|
+
# Offset to start the row_count column (only used if the name is set).
|
395
|
+
# @param sample_size [Integer]
|
396
|
+
# Set the sample size. This is used to sample statistics to estimate the
|
397
|
+
# allocation needed.
|
398
|
+
# @param eol_char [String]
|
399
|
+
# Single byte end of line character.
|
400
|
+
# @param truncate_ragged_lines [Boolean]
|
401
|
+
# Truncate lines that are longer than the schema.
|
402
|
+
#
|
403
|
+
# @return [BatchedCsvReader]
|
404
|
+
#
|
405
|
+
# @example
|
406
|
+
# reader = Polars.read_csv_batched(
|
407
|
+
# "./tpch/tables_scale_100/lineitem.tbl", sep: "|", parse_dates: true
|
408
|
+
# )
|
409
|
+
# reader.next_batches(5)
|
410
|
+
def read_csv_batched(
  source,
  has_header: true,
  columns: nil,
  new_columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  missing_utf8_is_empty_string: false,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
  infer_schema_length: N_INFER_DEFAULT,
  batch_size: 50_000,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n",
  raise_if_empty: true,
  truncate_ragged_lines: false,
  decimal_comma: false
)
  projection, columns = Utils.handle_projection_columns(columns)

  # With autogenerated headers, every selected name has to be a `column_x` name.
  headerless_selection = columns && !has_header
  if headerless_selection && columns.any? { |name| !name.start_with?("column_") }
    raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
  end

  # A non-nil projection or a rename request is not supported yet.
  raise Todo if projection || new_columns

  # Anything other than "utf8-lossy" is handed to the reader as "utf8".
  reader_encoding = encoding == "utf8-lossy" ? encoding : "utf8"

  BatchedCsvReader.new(
    source,
    has_header: has_header,
    columns: columns || projection,
    sep: sep,
    comment_char: comment_char,
    quote_char: quote_char,
    skip_rows: skip_rows,
    dtypes: dtypes,
    null_values: null_values,
    missing_utf8_is_empty_string: missing_utf8_is_empty_string,
    ignore_errors: ignore_errors,
    parse_dates: parse_dates,
    n_threads: n_threads,
    infer_schema_length: infer_schema_length,
    batch_size: batch_size,
    n_rows: n_rows,
    encoding: reader_encoding,
    low_memory: low_memory,
    rechunk: rechunk,
    skip_rows_after_header: skip_rows_after_header,
    row_count_name: row_count_name,
    row_count_offset: row_count_offset,
    sample_size: sample_size,
    eol_char: eol_char,
    new_columns: new_columns,
    raise_if_empty: raise_if_empty,
    truncate_ragged_lines: truncate_ragged_lines,
    decimal_comma: decimal_comma
  )
end
|
485
|
+
|
486
|
+
# Lazily read from a CSV file or multiple files via glob patterns.
|
487
|
+
#
|
488
|
+
# This allows the query optimizer to push down predicates and
|
489
|
+
# projections to the scan level, thereby potentially reducing
|
490
|
+
# memory overhead.
|
491
|
+
#
|
492
|
+
# @param source [Object]
|
493
|
+
# Path to a file.
|
494
|
+
# @param has_header [Boolean]
|
495
|
+
# Indicate if the first row of dataset is a header or not.
|
496
|
+
# If set to false, column names will be autogenerated in the
|
497
|
+
# following format: `column_x`, with `x` being an
|
498
|
+
# enumeration over every column in the dataset starting at 1.
|
499
|
+
# @param sep [String]
|
500
|
+
# Single byte character to use as delimiter in the file.
|
501
|
+
# @param comment_char [String]
|
502
|
+
# Single byte character that indicates the start of a comment line,
|
503
|
+
# for instance `#`.
|
504
|
+
# @param quote_char [String]
|
505
|
+
# Single byte character used for csv quoting.
|
506
|
+
# Set to nil to turn off special handling and escaping of quotes.
|
507
|
+
# @param skip_rows [Integer]
|
508
|
+
# Start reading after `skip_rows` lines. The header will be parsed at this
|
509
|
+
# offset.
|
510
|
+
# @param dtypes [Object]
|
511
|
+
# Overwrite dtypes during inference.
|
512
|
+
# @param null_values [Object]
|
513
|
+
# Values to interpret as null values. You can provide a:
|
514
|
+
#
|
515
|
+
# - `String`: All values equal to this string will be null.
|
516
|
+
# - `Array`: All values equal to any string in this array will be null.
|
517
|
+
# - `Hash`: A hash that maps column name to a null value string.
|
518
|
+
# @param ignore_errors [Boolean]
|
519
|
+
# Try to keep reading lines if some lines yield errors.
|
520
|
+
# First try `infer_schema_length: 0` to read all columns as
|
521
|
+
# `:str` to check which values might cause an issue.
|
522
|
+
# @param cache [Boolean]
|
523
|
+
# Cache the result after reading.
|
524
|
+
# @param with_column_names [Object]
|
525
|
+
# Apply a function over the column names.
|
526
|
+
# This can be used to update a schema just in time, thus before
|
527
|
+
# scanning.
|
528
|
+
# @param infer_schema_length [Integer]
|
529
|
+
# Maximum number of lines to read to infer schema.
|
530
|
+
# If set to 0, all columns will be read as `:str`.
|
531
|
+
# If set to `nil`, a full table scan will be done (slow).
|
532
|
+
# @param n_rows [Integer]
|
533
|
+
# Stop reading from CSV file after reading `n_rows`.
|
534
|
+
# @param encoding ["utf8", "utf8-lossy"]
|
535
|
+
# Lossy means that invalid utf8 values are replaced with `�`
|
536
|
+
# characters.
|
537
|
+
# @param low_memory [Boolean]
|
538
|
+
# Reduce memory usage at the expense of performance.
|
539
|
+
# @param rechunk [Boolean]
|
540
|
+
# Reallocate to contiguous memory when all chunks/ files are parsed.
|
541
|
+
# @param skip_rows_after_header [Integer]
|
542
|
+
# Skip this number of rows when the header is parsed.
|
543
|
+
# @param row_count_name [String]
|
544
|
+
# If not nil, this will insert a row count column with the given name into
|
545
|
+
# the DataFrame.
|
546
|
+
# @param row_count_offset [Integer]
|
547
|
+
# Offset to start the row_count column (only used if the name is set).
|
548
|
+
# @param parse_dates [Boolean]
|
549
|
+
# Try to automatically parse dates. If this does not succeed,
|
550
|
+
# the column remains of data type `:str`.
|
551
|
+
# @param eol_char [String]
|
552
|
+
# Single byte end of line character.
|
553
|
+
# @param truncate_ragged_lines [Boolean]
|
554
|
+
# Truncate lines that are longer than the schema.
|
555
|
+
#
|
556
|
+
# @return [LazyFrame]
|
557
|
+
def scan_csv(
  source,
  has_header: true,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  missing_utf8_is_empty_string: false,
  ignore_errors: false,
  cache: true,
  with_column_names: nil,
  infer_schema_length: N_INFER_DEFAULT,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  parse_dates: false,
  eol_char: "\n",
  raise_if_empty: true,
  truncate_ragged_lines: false,
  decimal_comma: false,
  glob: true
)
  # Validate the single-character options before touching the source.
  # eol_char is checked here as well, so the lazy entry point rejects the
  # same malformed values that `read_csv` rejects.
  Utils._check_arg_is_1byte("sep", sep, false)
  Utils._check_arg_is_1byte("comment_char", comment_char, false)
  Utils._check_arg_is_1byte("quote_char", quote_char, true)
  Utils._check_arg_is_1byte("eol_char", eol_char, false)

  source = Utils.normalize_filepath(source) if Utils.pathlike?(source)

  # NOTE(review): missing_utf8_is_empty_string, raise_if_empty, decimal_comma
  # and glob are accepted by this signature but are not forwarded to
  # _scan_csv_impl, so they currently have no effect on the scan.
  _scan_csv_impl(
    source,
    has_header: has_header,
    sep: sep,
    comment_char: comment_char,
    quote_char: quote_char,
    skip_rows: skip_rows,
    dtypes: dtypes,
    null_values: null_values,
    ignore_errors: ignore_errors,
    cache: cache,
    with_column_names: with_column_names,
    infer_schema_length: infer_schema_length,
    n_rows: n_rows,
    low_memory: low_memory,
    rechunk: rechunk,
    skip_rows_after_header: skip_rows_after_header,
    encoding: encoding,
    row_count_name: row_count_name,
    row_count_offset: row_count_offset,
    parse_dates: parse_dates,
    eol_char: eol_char,
    truncate_ragged_lines: truncate_ragged_lines
  )
end
|
618
|
+
|
619
|
+
# @private
|
620
|
+
def _scan_csv_impl(
  file,
  has_header: true,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  ignore_errors: false,
  cache: true,
  with_column_names: nil,
  infer_schema_length: N_INFER_DEFAULT,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  parse_dates: false,
  eol_char: "\n",
  truncate_ragged_lines: true
)
  # Lazy CSV scan behind `scan_csv`: converts the Ruby-side options into
  # the positional arguments of the native constructor and wraps the result.

  # Turn each (name, type) entry into a [name, dtype] pair; nil stays nil.
  dtype_list = dtypes&.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
  processed_null_values = Utils._process_null_values(null_values)
  row_index = Utils.parse_row_index_args(row_count_name, row_count_offset)

  # The native constructor is positional; the order below must not change.
  native_args = [
    file,
    sep,
    has_header,
    ignore_errors,
    skip_rows,
    n_rows,
    cache,
    dtype_list,
    low_memory,
    comment_char,
    quote_char,
    processed_null_values,
    infer_schema_length,
    with_column_names,
    rechunk,
    skip_rows_after_header,
    encoding,
    row_index,
    parse_dates,
    eol_char,
    truncate_ragged_lines
  ]

  Utils.wrap_ldf(RbLazyFrame.new_from_csv(*native_args))
end
|
679
|
+
|
680
|
+
private
|
681
|
+
|
682
|
+
def _prepare_file_arg(file)
  # Yields a readable source for `file` and returns the block's result.
  #
  # - A String starting with http:// or https:// is rejected with
  #   ArgumentError; remote sources have to be passed as URI objects.
  # - A URI is opened through open-uri and the opened handle is yielded.
  #   The block form of `open` is used so the handle is closed as soon as
  #   the block returns (or raises) instead of being left open.
  # - Anything else is yielded unchanged.
  if file.is_a?(::String) && file.match?(%r{\Ahttps?://})
    raise ArgumentError, "use URI(...) for remote files"
  end

  if defined?(URI) && file.is_a?(URI)
    # Loaded lazily so open-uri is only required when a URI is actually used.
    require "open-uri"

    return file.open { |io| yield io }
  end

  yield file
end
|
695
|
+
end
|
696
|
+
end
|