polars-df 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +127 -1
- data/Cargo.lock +72 -58
- data/README.md +31 -27
- data/ext/polars/Cargo.toml +15 -6
- data/ext/polars/src/batched_csv.rs +35 -39
- data/ext/polars/src/c_api/allocator.rs +7 -0
- data/ext/polars/src/c_api/mod.rs +1 -0
- data/ext/polars/src/catalog/unity.rs +123 -101
- data/ext/polars/src/conversion/any_value.rs +13 -17
- data/ext/polars/src/conversion/chunked_array.rs +5 -5
- data/ext/polars/src/conversion/datetime.rs +3 -2
- data/ext/polars/src/conversion/mod.rs +50 -45
- data/ext/polars/src/dataframe/export.rs +13 -13
- data/ext/polars/src/dataframe/general.rs +223 -223
- data/ext/polars/src/dataframe/io.rs +27 -141
- data/ext/polars/src/dataframe/mod.rs +13 -5
- data/ext/polars/src/dataframe/serde.rs +1 -1
- data/ext/polars/src/error.rs +44 -7
- data/ext/polars/src/exceptions.rs +45 -12
- data/ext/polars/src/expr/array.rs +12 -0
- data/ext/polars/src/expr/datatype.rs +2 -2
- data/ext/polars/src/expr/datetime.rs +4 -5
- data/ext/polars/src/expr/general.rs +49 -13
- data/ext/polars/src/expr/list.rs +4 -0
- data/ext/polars/src/expr/meta.rs +8 -3
- data/ext/polars/src/expr/mod.rs +22 -6
- data/ext/polars/src/expr/name.rs +19 -8
- data/ext/polars/src/expr/rolling.rs +50 -1
- data/ext/polars/src/expr/string.rs +0 -1
- data/ext/polars/src/expr/struct.rs +7 -2
- data/ext/polars/src/file.rs +136 -103
- data/ext/polars/src/functions/aggregation.rs +9 -8
- data/ext/polars/src/functions/io.rs +81 -10
- data/ext/polars/src/functions/lazy.rs +95 -21
- data/ext/polars/src/functions/mod.rs +2 -0
- data/ext/polars/src/functions/range.rs +19 -3
- data/ext/polars/src/functions/strings.rs +6 -0
- data/ext/polars/src/functions/utils.rs +6 -0
- data/ext/polars/src/interop/arrow/mod.rs +50 -1
- data/ext/polars/src/interop/arrow/{to_ruby.rs → to_rb.rs} +30 -0
- data/ext/polars/src/interop/arrow/to_rust.rs +43 -0
- data/ext/polars/src/interop/numo/to_numo_df.rs +1 -1
- data/ext/polars/src/interop/numo/to_numo_series.rs +1 -1
- data/ext/polars/src/lazyframe/exitable.rs +39 -0
- data/ext/polars/src/lazyframe/general.rs +340 -236
- data/ext/polars/src/lazyframe/mod.rs +46 -10
- data/ext/polars/src/lazyframe/optflags.rs +5 -4
- data/ext/polars/src/lazyframe/serde.rs +11 -3
- data/ext/polars/src/lazyframe/sink.rs +10 -5
- data/ext/polars/src/lazygroupby.rs +6 -7
- data/ext/polars/src/lib.rs +141 -76
- data/ext/polars/src/map/dataframe.rs +12 -12
- data/ext/polars/src/map/lazy.rs +7 -5
- data/ext/polars/src/map/mod.rs +15 -8
- data/ext/polars/src/map/series.rs +3 -3
- data/ext/polars/src/on_startup.rs +16 -8
- data/ext/polars/src/prelude.rs +1 -0
- data/ext/polars/src/rb_modules.rs +19 -49
- data/ext/polars/src/series/aggregation.rs +79 -140
- data/ext/polars/src/series/arithmetic.rs +16 -22
- data/ext/polars/src/series/comparison.rs +101 -222
- data/ext/polars/src/series/construction.rs +17 -18
- data/ext/polars/src/series/export.rs +1 -1
- data/ext/polars/src/series/general.rs +254 -289
- data/ext/polars/src/series/import.rs +17 -0
- data/ext/polars/src/series/map.rs +178 -160
- data/ext/polars/src/series/mod.rs +28 -12
- data/ext/polars/src/series/scatter.rs +12 -9
- data/ext/polars/src/sql.rs +16 -9
- data/ext/polars/src/testing/frame.rs +31 -0
- data/ext/polars/src/testing/mod.rs +5 -0
- data/ext/polars/src/testing/series.rs +31 -0
- data/ext/polars/src/timeout.rs +105 -0
- data/ext/polars/src/utils.rs +159 -1
- data/lib/polars/array_expr.rb +81 -12
- data/lib/polars/array_name_space.rb +74 -7
- data/lib/polars/batched_csv_reader.rb +21 -21
- data/lib/polars/binary_name_space.rb +1 -1
- data/lib/polars/cat_expr.rb +7 -7
- data/lib/polars/config.rb +1 -1
- data/lib/polars/convert.rb +189 -34
- data/lib/polars/data_frame.rb +1066 -831
- data/lib/polars/data_frame_plot.rb +173 -0
- data/lib/polars/data_type_group.rb +1 -0
- data/lib/polars/data_types.rb +31 -12
- data/lib/polars/date_time_expr.rb +51 -69
- data/lib/polars/date_time_name_space.rb +80 -112
- data/lib/polars/dynamic_group_by.rb +7 -7
- data/lib/polars/exceptions.rb +50 -10
- data/lib/polars/expr.rb +470 -517
- data/lib/polars/functions/aggregation/horizontal.rb +0 -1
- data/lib/polars/functions/aggregation/vertical.rb +2 -3
- data/lib/polars/functions/as_datatype.rb +290 -8
- data/lib/polars/functions/eager.rb +204 -10
- data/lib/polars/functions/escape_regex.rb +21 -0
- data/lib/polars/functions/lazy.rb +409 -169
- data/lib/polars/functions/lit.rb +17 -1
- data/lib/polars/functions/range/int_range.rb +74 -2
- data/lib/polars/functions/range/linear_space.rb +77 -0
- data/lib/polars/functions/range/time_range.rb +1 -1
- data/lib/polars/functions/repeat.rb +3 -12
- data/lib/polars/functions/whenthen.rb +2 -2
- data/lib/polars/group_by.rb +72 -20
- data/lib/polars/iceberg_dataset.rb +1 -6
- data/lib/polars/in_process_query.rb +37 -0
- data/lib/polars/io/cloud.rb +18 -0
- data/lib/polars/io/csv.rb +265 -126
- data/lib/polars/io/database.rb +0 -1
- data/lib/polars/io/delta.rb +15 -7
- data/lib/polars/io/ipc.rb +24 -17
- data/lib/polars/io/ndjson.rb +161 -24
- data/lib/polars/io/parquet.rb +101 -38
- data/lib/polars/lazy_frame.rb +849 -558
- data/lib/polars/lazy_group_by.rb +327 -2
- data/lib/polars/list_expr.rb +94 -16
- data/lib/polars/list_name_space.rb +88 -24
- data/lib/polars/meta_expr.rb +42 -1
- data/lib/polars/name_expr.rb +41 -4
- data/lib/polars/query_opt_flags.rb +198 -2
- data/lib/polars/rolling_group_by.rb +3 -3
- data/lib/polars/schema.rb +21 -3
- data/lib/polars/selector.rb +37 -2
- data/lib/polars/selectors.rb +45 -9
- data/lib/polars/series.rb +1156 -728
- data/lib/polars/series_plot.rb +72 -0
- data/lib/polars/slice.rb +1 -1
- data/lib/polars/sql_context.rb +11 -4
- data/lib/polars/string_expr.rb +59 -68
- data/lib/polars/string_name_space.rb +51 -87
- data/lib/polars/struct_expr.rb +36 -18
- data/lib/polars/testing.rb +24 -273
- data/lib/polars/utils/constants.rb +2 -0
- data/lib/polars/utils/construction/data_frame.rb +410 -0
- data/lib/polars/utils/construction/series.rb +364 -0
- data/lib/polars/utils/construction/utils.rb +9 -0
- data/lib/polars/utils/deprecation.rb +11 -0
- data/lib/polars/utils/serde.rb +8 -3
- data/lib/polars/utils/unstable.rb +19 -0
- data/lib/polars/utils/various.rb +59 -0
- data/lib/polars/utils.rb +46 -47
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +47 -1
- metadata +25 -6
- data/ext/polars/src/allocator.rs +0 -13
- data/lib/polars/plot.rb +0 -109
data/lib/polars/io/csv.rb
CHANGED
|
@@ -16,38 +16,55 @@ module Polars
|
|
|
16
16
|
# Rename columns right after parsing the CSV file. If the given
|
|
17
17
|
# list is shorter than the width of the DataFrame the remaining
|
|
18
18
|
# columns will have their original name.
|
|
19
|
-
# @param
|
|
20
|
-
# Single byte character to use as
|
|
21
|
-
# @param
|
|
22
|
-
#
|
|
23
|
-
#
|
|
19
|
+
# @param separator [String]
|
|
20
|
+
# Single byte character to use as separator in the file.
|
|
21
|
+
# @param comment_prefix [String]
|
|
22
|
+
# A string used to indicate the start of a comment line. Comment lines are skipped
|
|
23
|
+
# during parsing. Common examples of comment prefixes are `#` and `//`.
|
|
24
24
|
# @param quote_char [String]
|
|
25
25
|
# Single byte character used for csv quoting.
|
|
26
26
|
# Set to nil to turn off special handling and escaping of quotes.
|
|
27
27
|
# @param skip_rows [Integer]
|
|
28
28
|
# Start reading after `skip_rows` lines.
|
|
29
|
-
# @param
|
|
30
|
-
#
|
|
29
|
+
# @param skip_lines [Integer]
|
|
30
|
+
# Start reading after `skip_lines` lines. The header will be parsed at this
|
|
31
|
+
# offset. Note that CSV escaping will not be respected when skipping lines.
|
|
32
|
+
# If you want to skip valid CSV rows, use `skip_rows`.
|
|
33
|
+
# @param schema [Object]
|
|
34
|
+
# Provide the schema. This means that polars doesn't do schema inference.
|
|
35
|
+
# This argument expects the complete schema, whereas `schema_overrides` can be
|
|
36
|
+
# used to partially overwrite a schema. Note that the order of the columns in
|
|
37
|
+
# the provided `schema` must match the order of the columns in the CSV being read.
|
|
38
|
+
# @param schema_overrides [Object]
|
|
39
|
+
# Overwrite dtypes for specific or all columns during schema inference.
|
|
31
40
|
# @param null_values [Object]
|
|
32
41
|
# Values to interpret as null values. You can provide a:
|
|
33
42
|
#
|
|
34
43
|
# - `String`: All values equal to this string will be null.
|
|
35
44
|
# - `Array`: All values equal to any string in this array will be null.
|
|
36
45
|
# - `Hash`: A hash that maps column name to a null value string.
|
|
46
|
+
# @param missing_utf8_is_empty_string [Boolean]
|
|
47
|
+
# By default a missing value is considered to be null; if you would prefer missing
|
|
48
|
+
# utf8 values to be treated as the empty string you can set this param true.
|
|
37
49
|
# @param ignore_errors [Boolean]
|
|
38
50
|
# Try to keep reading lines if some lines yield errors.
|
|
39
51
|
# First try `infer_schema_length: 0` to read all columns as
|
|
40
52
|
# `:str` to check which values might cause an issue.
|
|
41
|
-
# @param
|
|
53
|
+
# @param try_parse_dates [Boolean]
|
|
42
54
|
# Try to automatically parse dates. If this does not succeed,
|
|
43
55
|
# the column remains of data type `:str`.
|
|
44
56
|
# @param n_threads [Integer]
|
|
45
57
|
# Number of threads to use in csv parsing.
|
|
46
58
|
# Defaults to the number of physical CPUs of your system.
|
|
59
|
+
# @param infer_schema [Boolean]
|
|
60
|
+
# When `true`, the schema is inferred from the data using the first
|
|
61
|
+
# `infer_schema_length` rows.
|
|
62
|
+
# When `false`, the schema is not inferred and will be `Polars::String` if not
|
|
63
|
+
# specified in `schema` or `schema_overrides`.
|
|
47
64
|
# @param infer_schema_length [Integer]
|
|
48
|
-
#
|
|
49
|
-
# If set to
|
|
50
|
-
#
|
|
65
|
+
# The maximum number of rows to scan for schema inference.
|
|
66
|
+
# If set to `nil`, the full data may be scanned *(this is slow)*.
|
|
67
|
+
# Set `infer_schema: false` to read all columns as `Polars::String`.
|
|
51
68
|
# @param batch_size [Integer]
|
|
52
69
|
# Number of lines to read into the buffer at once.
|
|
53
70
|
# Modify this to change performance.
|
|
@@ -70,15 +87,22 @@ module Polars
|
|
|
70
87
|
# particular storage connection.
|
|
71
88
|
# @param skip_rows_after_header [Integer]
|
|
72
89
|
# Skip this number of rows when the header is parsed.
|
|
73
|
-
# @param
|
|
90
|
+
# @param row_index_name [String]
|
|
74
91
|
# If not nil, this will insert a row index column with the given name into
|
|
75
92
|
# the DataFrame.
|
|
76
|
-
# @param
|
|
93
|
+
# @param row_index_offset [Integer]
|
|
77
94
|
# Offset to start the row index column (only used if `row_index_name` is set).
|
|
78
95
|
# @param eol_char [String]
|
|
79
96
|
# Single byte end of line character.
|
|
97
|
+
# @param raise_if_empty [Boolean]
|
|
98
|
+
# When there is no data in the source, `NoDataError` is raised. If this parameter
|
|
99
|
+
# is set to false, an empty DataFrame (with no columns) is returned instead.
|
|
80
100
|
# @param truncate_ragged_lines [Boolean]
|
|
81
101
|
# Truncate lines that are longer than the schema.
|
|
102
|
+
# @param decimal_comma [Boolean]
|
|
103
|
+
# Parse floats using a comma as the decimal separator instead of a period.
|
|
104
|
+
# @param glob [Boolean]
|
|
105
|
+
# Expand path given via globbing rules.
|
|
82
106
|
#
|
|
83
107
|
# @return [DataFrame]
|
|
84
108
|
#
|
|
@@ -92,30 +116,36 @@ module Polars
|
|
|
92
116
|
has_header: true,
|
|
93
117
|
columns: nil,
|
|
94
118
|
new_columns: nil,
|
|
95
|
-
|
|
96
|
-
|
|
119
|
+
separator: ",",
|
|
120
|
+
comment_prefix: nil,
|
|
97
121
|
quote_char: '"',
|
|
98
122
|
skip_rows: 0,
|
|
99
|
-
|
|
123
|
+
skip_lines: 0,
|
|
124
|
+
schema: nil,
|
|
125
|
+
schema_overrides: nil,
|
|
100
126
|
null_values: nil,
|
|
127
|
+
missing_utf8_is_empty_string: false,
|
|
101
128
|
ignore_errors: false,
|
|
102
|
-
|
|
129
|
+
try_parse_dates: false,
|
|
103
130
|
n_threads: nil,
|
|
131
|
+
infer_schema: true,
|
|
104
132
|
infer_schema_length: N_INFER_DEFAULT,
|
|
105
133
|
batch_size: 8192,
|
|
106
134
|
n_rows: nil,
|
|
107
135
|
encoding: "utf8",
|
|
108
136
|
low_memory: false,
|
|
109
|
-
rechunk:
|
|
137
|
+
rechunk: false,
|
|
110
138
|
storage_options: nil,
|
|
111
139
|
skip_rows_after_header: 0,
|
|
112
|
-
|
|
113
|
-
|
|
140
|
+
row_index_name: nil,
|
|
141
|
+
row_index_offset: 0,
|
|
114
142
|
eol_char: "\n",
|
|
115
|
-
|
|
143
|
+
raise_if_empty: true,
|
|
144
|
+
truncate_ragged_lines: false,
|
|
145
|
+
decimal_comma: false,
|
|
146
|
+
glob: true
|
|
116
147
|
)
|
|
117
|
-
Utils._check_arg_is_1byte("
|
|
118
|
-
Utils._check_arg_is_1byte("comment_char", comment_char, false)
|
|
148
|
+
Utils._check_arg_is_1byte("separator", separator, false)
|
|
119
149
|
Utils._check_arg_is_1byte("quote_char", quote_char, true)
|
|
120
150
|
Utils._check_arg_is_1byte("eol_char", eol_char, false)
|
|
121
151
|
|
|
@@ -131,8 +161,8 @@ module Polars
|
|
|
131
161
|
end
|
|
132
162
|
end
|
|
133
163
|
|
|
134
|
-
if
|
|
135
|
-
|
|
164
|
+
if !infer_schema
|
|
165
|
+
infer_schema_length = 0
|
|
136
166
|
end
|
|
137
167
|
|
|
138
168
|
df = nil
|
|
@@ -141,14 +171,17 @@ module Polars
|
|
|
141
171
|
data,
|
|
142
172
|
has_header: has_header,
|
|
143
173
|
columns: columns || projection,
|
|
144
|
-
|
|
145
|
-
|
|
174
|
+
separator: separator,
|
|
175
|
+
comment_prefix: comment_prefix,
|
|
146
176
|
quote_char: quote_char,
|
|
147
177
|
skip_rows: skip_rows,
|
|
148
|
-
|
|
178
|
+
skip_lines: skip_lines,
|
|
179
|
+
schema_overrides: schema_overrides,
|
|
180
|
+
schema: schema,
|
|
149
181
|
null_values: null_values,
|
|
182
|
+
missing_utf8_is_empty_string: missing_utf8_is_empty_string,
|
|
150
183
|
ignore_errors: ignore_errors,
|
|
151
|
-
|
|
184
|
+
try_parse_dates: try_parse_dates,
|
|
152
185
|
n_threads: n_threads,
|
|
153
186
|
infer_schema_length: infer_schema_length,
|
|
154
187
|
batch_size: batch_size,
|
|
@@ -157,10 +190,13 @@ module Polars
|
|
|
157
190
|
low_memory: low_memory,
|
|
158
191
|
rechunk: rechunk,
|
|
159
192
|
skip_rows_after_header: skip_rows_after_header,
|
|
160
|
-
|
|
161
|
-
|
|
193
|
+
row_index_name: row_index_name,
|
|
194
|
+
row_index_offset: row_index_offset,
|
|
162
195
|
eol_char: eol_char,
|
|
163
|
-
|
|
196
|
+
raise_if_empty: raise_if_empty,
|
|
197
|
+
truncate_ragged_lines: truncate_ragged_lines,
|
|
198
|
+
decimal_comma: decimal_comma,
|
|
199
|
+
glob: glob
|
|
164
200
|
)
|
|
165
201
|
end
|
|
166
202
|
|
|
@@ -176,26 +212,27 @@ module Polars
|
|
|
176
212
|
file,
|
|
177
213
|
has_header: true,
|
|
178
214
|
columns: nil,
|
|
179
|
-
|
|
180
|
-
|
|
215
|
+
separator: ",",
|
|
216
|
+
comment_prefix: nil,
|
|
181
217
|
quote_char: '"',
|
|
182
218
|
skip_rows: 0,
|
|
183
|
-
|
|
219
|
+
skip_lines: 0,
|
|
184
220
|
schema: nil,
|
|
221
|
+
schema_overrides: nil,
|
|
185
222
|
null_values: nil,
|
|
186
223
|
missing_utf8_is_empty_string: false,
|
|
187
224
|
ignore_errors: false,
|
|
188
|
-
|
|
225
|
+
try_parse_dates: false,
|
|
189
226
|
n_threads: nil,
|
|
190
227
|
infer_schema_length: N_INFER_DEFAULT,
|
|
191
228
|
batch_size: 8192,
|
|
192
229
|
n_rows: nil,
|
|
193
230
|
encoding: "utf8",
|
|
194
231
|
low_memory: false,
|
|
195
|
-
rechunk:
|
|
232
|
+
rechunk: false,
|
|
196
233
|
skip_rows_after_header: 0,
|
|
197
|
-
|
|
198
|
-
|
|
234
|
+
row_index_name: nil,
|
|
235
|
+
row_index_offset: 0,
|
|
199
236
|
eol_char: "\n",
|
|
200
237
|
raise_if_empty: true,
|
|
201
238
|
truncate_ragged_lines: false,
|
|
@@ -213,16 +250,16 @@ module Polars
|
|
|
213
250
|
|
|
214
251
|
dtype_list = nil
|
|
215
252
|
dtype_slice = nil
|
|
216
|
-
if !
|
|
217
|
-
if
|
|
253
|
+
if !schema_overrides.nil?
|
|
254
|
+
if schema_overrides.is_a?(Hash)
|
|
218
255
|
dtype_list = []
|
|
219
|
-
|
|
220
|
-
dtype_list << [k, Utils.
|
|
256
|
+
schema_overrides.each do |k, v|
|
|
257
|
+
dtype_list << [k, Utils.parse_into_dtype(v)]
|
|
221
258
|
end
|
|
222
|
-
elsif
|
|
223
|
-
dtype_slice =
|
|
259
|
+
elsif schema_overrides.is_a?(::Array)
|
|
260
|
+
dtype_slice = schema_overrides
|
|
224
261
|
else
|
|
225
|
-
raise
|
|
262
|
+
raise TypeError, "dtype arg should be array or hash"
|
|
226
263
|
end
|
|
227
264
|
end
|
|
228
265
|
|
|
@@ -242,11 +279,13 @@ module Polars
|
|
|
242
279
|
scan = scan_csv(
|
|
243
280
|
file,
|
|
244
281
|
has_header: has_header,
|
|
245
|
-
|
|
246
|
-
|
|
282
|
+
separator: separator,
|
|
283
|
+
comment_prefix: comment_prefix,
|
|
247
284
|
quote_char: quote_char,
|
|
248
285
|
skip_rows: skip_rows,
|
|
249
|
-
|
|
286
|
+
skip_lines: skip_lines,
|
|
287
|
+
schema: schema,
|
|
288
|
+
schema_overrides: dtypes_dict,
|
|
250
289
|
null_values: null_values,
|
|
251
290
|
missing_utf8_is_empty_string: missing_utf8_is_empty_string,
|
|
252
291
|
ignore_errors: ignore_errors,
|
|
@@ -255,9 +294,10 @@ module Polars
|
|
|
255
294
|
low_memory: low_memory,
|
|
256
295
|
rechunk: rechunk,
|
|
257
296
|
skip_rows_after_header: skip_rows_after_header,
|
|
258
|
-
|
|
259
|
-
|
|
297
|
+
row_index_name: row_index_name,
|
|
298
|
+
row_index_offset: row_index_offset,
|
|
260
299
|
eol_char: eol_char,
|
|
300
|
+
raise_if_empty: raise_if_empty,
|
|
261
301
|
truncate_ragged_lines: truncate_ragged_lines,
|
|
262
302
|
decimal_comma: decimal_comma,
|
|
263
303
|
glob: glob
|
|
@@ -282,8 +322,9 @@ module Polars
|
|
|
282
322
|
ignore_errors,
|
|
283
323
|
n_rows,
|
|
284
324
|
skip_rows,
|
|
325
|
+
skip_lines,
|
|
285
326
|
projection,
|
|
286
|
-
|
|
327
|
+
separator,
|
|
287
328
|
rechunk,
|
|
288
329
|
columns,
|
|
289
330
|
encoding,
|
|
@@ -292,13 +333,13 @@ module Polars
|
|
|
292
333
|
dtype_list,
|
|
293
334
|
dtype_slice,
|
|
294
335
|
low_memory,
|
|
295
|
-
|
|
336
|
+
comment_prefix,
|
|
296
337
|
quote_char,
|
|
297
338
|
processed_null_values,
|
|
298
339
|
missing_utf8_is_empty_string,
|
|
299
|
-
|
|
340
|
+
try_parse_dates,
|
|
300
341
|
skip_rows_after_header,
|
|
301
|
-
Utils.parse_row_index_args(
|
|
342
|
+
Utils.parse_row_index_args(row_index_name, row_index_offset),
|
|
302
343
|
eol_char,
|
|
303
344
|
raise_if_empty,
|
|
304
345
|
truncate_ragged_lines,
|
|
@@ -319,7 +360,7 @@ module Polars
|
|
|
319
360
|
# Path to a file or a file-like object.
|
|
320
361
|
# @param has_header [Boolean]
|
|
321
362
|
# Indicate if the first row of dataset is a header or not.
|
|
322
|
-
# If set to
|
|
363
|
+
# If set to false, column names will be autogenerated in the
|
|
323
364
|
# following format: `column_x`, with `x` being an
|
|
324
365
|
# enumeration over every column in the dataset starting at 1.
|
|
325
366
|
# @param columns [Object]
|
|
@@ -329,17 +370,21 @@ module Polars
|
|
|
329
370
|
# Rename columns right after parsing the CSV file. If the given
|
|
330
371
|
# list is shorter than the width of the DataFrame the remaining
|
|
331
372
|
# columns will have their original name.
|
|
332
|
-
# @param
|
|
333
|
-
# Single byte character to use as
|
|
334
|
-
# @param
|
|
335
|
-
#
|
|
336
|
-
#
|
|
373
|
+
# @param separator [String]
|
|
374
|
+
# Single byte character to use as separator in the file.
|
|
375
|
+
# @param comment_prefix [String]
|
|
376
|
+
# A string used to indicate the start of a comment line. Comment lines are skipped
|
|
377
|
+
# during parsing. Common examples of comment prefixes are `#` and `//`.
|
|
337
378
|
# @param quote_char [String]
|
|
338
379
|
# Single byte character used for csv quoting, default = `"`.
|
|
339
380
|
# Set to nil to turn off special handling and escaping of quotes.
|
|
340
381
|
# @param skip_rows [Integer]
|
|
341
382
|
# Start reading after `skip_rows` lines.
|
|
342
|
-
# @param
|
|
383
|
+
# @param skip_lines [Integer]
|
|
384
|
+
# Start reading after `skip_lines` lines. The header will be parsed at this
|
|
385
|
+
# offset. Note that CSV escaping will not be respected when skipping lines.
|
|
386
|
+
# If you want to skip valid CSV rows, use `skip_rows`.
|
|
387
|
+
# @param schema_overrides [Object]
|
|
343
388
|
# Overwrite dtypes during inference.
|
|
344
389
|
# @param null_values [Object]
|
|
345
390
|
# Values to interpret as null values. You can provide a:
|
|
@@ -354,7 +399,7 @@ module Polars
|
|
|
354
399
|
# Try to keep reading lines if some lines yield errors.
|
|
355
400
|
# First try `infer_schema_length: 0` to read all columns as
|
|
356
401
|
# `:str` to check which values might cause an issue.
|
|
357
|
-
# @param
|
|
402
|
+
# @param try_parse_dates [Boolean]
|
|
358
403
|
# Try to automatically parse dates. If this does not succeed,
|
|
359
404
|
# the column remains of data type `:str`.
|
|
360
405
|
# @param n_threads [Integer]
|
|
@@ -383,10 +428,10 @@ module Polars
|
|
|
383
428
|
# aggregating the chunks into a single array.
|
|
384
429
|
# @param skip_rows_after_header [Integer]
|
|
385
430
|
# Skip this number of rows when the header is parsed.
|
|
386
|
-
# @param
|
|
431
|
+
# @param row_index_name [String]
|
|
387
432
|
# If not nil, this will insert a row index column with the given name into
|
|
388
433
|
# the DataFrame.
|
|
389
|
-
# @param
|
|
434
|
+
# @param row_index_offset [Integer]
|
|
390
435
|
# Offset to start the row index column (only used if `row_index_name` is set).
|
|
391
436
|
# @param eol_char [String]
|
|
392
437
|
# Single byte end of line character.
|
|
@@ -402,7 +447,7 @@ module Polars
|
|
|
402
447
|
#
|
|
403
448
|
# @example
|
|
404
449
|
# reader = Polars.read_csv_batched(
|
|
405
|
-
# "./tpch/tables_scale_100/lineitem.tbl",
|
|
450
|
+
# "./tpch/tables_scale_100/lineitem.tbl", separator: "|", try_parse_dates: true
|
|
406
451
|
# )
|
|
407
452
|
# reader.next_batches(5)
|
|
408
453
|
def read_csv_batched(
|
|
@@ -410,25 +455,26 @@ module Polars
|
|
|
410
455
|
has_header: true,
|
|
411
456
|
columns: nil,
|
|
412
457
|
new_columns: nil,
|
|
413
|
-
|
|
414
|
-
|
|
458
|
+
separator: ",",
|
|
459
|
+
comment_prefix: nil,
|
|
415
460
|
quote_char: '"',
|
|
416
461
|
skip_rows: 0,
|
|
417
|
-
|
|
462
|
+
skip_lines: 0,
|
|
463
|
+
schema_overrides: nil,
|
|
418
464
|
null_values: nil,
|
|
419
465
|
missing_utf8_is_empty_string: false,
|
|
420
466
|
ignore_errors: false,
|
|
421
|
-
|
|
467
|
+
try_parse_dates: false,
|
|
422
468
|
n_threads: nil,
|
|
423
469
|
infer_schema_length: N_INFER_DEFAULT,
|
|
424
470
|
batch_size: 50_000,
|
|
425
471
|
n_rows: nil,
|
|
426
472
|
encoding: "utf8",
|
|
427
473
|
low_memory: false,
|
|
428
|
-
rechunk:
|
|
474
|
+
rechunk: false,
|
|
429
475
|
skip_rows_after_header: 0,
|
|
430
|
-
|
|
431
|
-
|
|
476
|
+
row_index_name: nil,
|
|
477
|
+
row_index_offset: 0,
|
|
432
478
|
eol_char: "\n",
|
|
433
479
|
raise_if_empty: true,
|
|
434
480
|
truncate_ragged_lines: false,
|
|
@@ -444,23 +490,20 @@ module Polars
|
|
|
444
490
|
end
|
|
445
491
|
end
|
|
446
492
|
|
|
447
|
-
if projection || new_columns
|
|
448
|
-
raise Todo
|
|
449
|
-
end
|
|
450
|
-
|
|
451
493
|
BatchedCsvReader.new(
|
|
452
494
|
source,
|
|
453
495
|
has_header: has_header,
|
|
454
496
|
columns: columns || projection,
|
|
455
|
-
|
|
456
|
-
|
|
497
|
+
separator: separator,
|
|
498
|
+
comment_prefix: comment_prefix,
|
|
457
499
|
quote_char: quote_char,
|
|
458
500
|
skip_rows: skip_rows,
|
|
459
|
-
|
|
501
|
+
skip_lines: skip_lines,
|
|
502
|
+
schema_overrides: schema_overrides,
|
|
460
503
|
null_values: null_values,
|
|
461
504
|
missing_utf8_is_empty_string: missing_utf8_is_empty_string,
|
|
462
505
|
ignore_errors: ignore_errors,
|
|
463
|
-
|
|
506
|
+
try_parse_dates: try_parse_dates,
|
|
464
507
|
n_threads: n_threads,
|
|
465
508
|
infer_schema_length: infer_schema_length,
|
|
466
509
|
batch_size: batch_size,
|
|
@@ -469,8 +512,8 @@ module Polars
|
|
|
469
512
|
low_memory: low_memory,
|
|
470
513
|
rechunk: rechunk,
|
|
471
514
|
skip_rows_after_header: skip_rows_after_header,
|
|
472
|
-
|
|
473
|
-
|
|
515
|
+
row_index_name: row_index_name,
|
|
516
|
+
row_index_offset: row_index_offset,
|
|
474
517
|
eol_char: eol_char,
|
|
475
518
|
new_columns: new_columns,
|
|
476
519
|
raise_if_empty: raise_if_empty,
|
|
@@ -492,19 +535,28 @@ module Polars
|
|
|
492
535
|
# If set to false, column names will be autogenerated in the
|
|
493
536
|
# following format: `column_x`, with `x` being an
|
|
494
537
|
# enumeration over every column in the dataset starting at 1.
|
|
495
|
-
# @param
|
|
496
|
-
# Single byte character to use as
|
|
497
|
-
# @param
|
|
498
|
-
#
|
|
499
|
-
#
|
|
538
|
+
# @param separator [String]
|
|
539
|
+
# Single byte character to use as separator in the file.
|
|
540
|
+
# @param comment_prefix [String]
|
|
541
|
+
# A string used to indicate the start of a comment line. Comment lines are skipped
|
|
542
|
+
# during parsing. Common examples of comment prefixes are `#` and `//`.
|
|
500
543
|
# @param quote_char [String]
|
|
501
544
|
# Single byte character used for csv quoting.
|
|
502
545
|
# Set to nil to turn off special handling and escaping of quotes.
|
|
503
546
|
# @param skip_rows [Integer]
|
|
504
547
|
# Start reading after `skip_rows` lines. The header will be parsed at this
|
|
505
548
|
# offset.
|
|
506
|
-
# @param
|
|
507
|
-
#
|
|
549
|
+
# @param skip_lines [Integer]
|
|
550
|
+
# Start reading after `skip_lines` lines. The header will be parsed at this
|
|
551
|
+
# offset. Note that CSV escaping will not be respected when skipping lines.
|
|
552
|
+
# If you want to skip valid CSV rows, use `skip_rows`.
|
|
553
|
+
# @param schema [Object]
|
|
554
|
+
# Provide the schema. This means that polars doesn't do schema inference.
|
|
555
|
+
# This argument expects the complete schema, whereas `schema_overrides` can be
|
|
556
|
+
# used to partially overwrite a schema. Note that the order of the columns in
|
|
557
|
+
# the provided `schema` must match the order of the columns in the CSV being read.
|
|
558
|
+
# @param schema_overrides [Object]
|
|
559
|
+
# Overwrite dtypes for specific or all columns during schema inference.
|
|
508
560
|
# @param null_values [Object]
|
|
509
561
|
# Values to interpret as null values. You can provide a:
|
|
510
562
|
#
|
|
@@ -524,6 +576,11 @@ module Polars
|
|
|
524
576
|
# Apply a function over the column names.
|
|
525
577
|
# This can be used to update a schema just in time, thus before
|
|
526
578
|
# scanning.
|
|
579
|
+
# @param infer_schema [Boolean]
|
|
580
|
+
# When `true`, the schema is inferred from the data using the first
|
|
581
|
+
# `infer_schema_length` rows.
|
|
582
|
+
# When `false`, the schema is not inferred and will be `Polars::String` if not
|
|
583
|
+
# specified in `schema` or `schema_overrides`.
|
|
527
584
|
# @param infer_schema_length [Integer]
|
|
528
585
|
# Maximum number of lines to read to infer schema.
|
|
529
586
|
# If set to 0, all columns will be read as `:str`.
|
|
@@ -539,16 +596,20 @@ module Polars
|
|
|
539
596
|
# Reallocate to contiguous memory when all chunks/ files are parsed.
|
|
540
597
|
# @param skip_rows_after_header [Integer]
|
|
541
598
|
# Skip this number of rows when the header is parsed.
|
|
542
|
-
# @param
|
|
599
|
+
# @param row_index_name [String]
|
|
543
600
|
# If not nil, this will insert a row index column with the given name into
|
|
544
601
|
# the DataFrame.
|
|
545
|
-
# @param
|
|
602
|
+
# @param row_index_offset [Integer]
|
|
546
603
|
# Offset to start the row index column (only used if `row_index_name` is set).
|
|
547
|
-
# @param
|
|
604
|
+
# @param try_parse_dates [Boolean]
|
|
548
605
|
# Try to automatically parse dates. If this does not succeed,
|
|
549
606
|
# the column remains of data type `:str`.
|
|
550
607
|
# @param eol_char [String]
|
|
551
608
|
# Single byte end of line character.
|
|
609
|
+
# @param new_columns [Array]
|
|
610
|
+
# Provide an explicit list of string column names to use (for example, when
|
|
611
|
+
# scanning a headerless CSV file). If the given list is shorter than the width of
|
|
612
|
+
# the DataFrame the remaining columns will have their original name.
|
|
552
613
|
# @param raise_if_empty [Boolean]
|
|
553
614
|
# When there is no data in the source, `NoDataError` is raised. If this parameter
|
|
554
615
|
# is set to false, an empty LazyFrame (with no columns) is returned instead.
|
|
@@ -558,52 +619,100 @@ module Polars
|
|
|
558
619
|
# Parse floats using a comma as the decimal separator instead of a period.
|
|
559
620
|
# @param glob [Boolean]
|
|
560
621
|
# Expand path given via globbing rules.
|
|
622
|
+
# @param storage_options [Hash]
|
|
623
|
+
# Options that indicate how to connect to a cloud provider.
|
|
624
|
+
#
|
|
625
|
+
# The cloud providers currently supported are AWS, GCP, and Azure.
|
|
626
|
+
# See supported keys here:
|
|
627
|
+
#
|
|
628
|
+
# * [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
|
|
629
|
+
# * [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
|
|
630
|
+
# * [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
|
|
631
|
+
# * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
|
|
632
|
+
# `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
|
633
|
+
#
|
|
634
|
+
# If `storage_options` is not provided, Polars will try to infer the information
|
|
635
|
+
# from environment variables.
|
|
636
|
+
# @param credential_provider [Object]
|
|
637
|
+
# Provide a function that can be called to provide cloud storage
|
|
638
|
+
# credentials. The function is expected to return a hash of
|
|
639
|
+
# credential keys along with an optional credential expiry time.
|
|
640
|
+
# @param retries [Integer]
|
|
641
|
+
# Number of retries if accessing a cloud instance fails.
|
|
642
|
+
# @param file_cache_ttl [Integer]
|
|
643
|
+
# Amount of time to keep downloaded cloud files since their last access time,
|
|
644
|
+
# in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
|
|
645
|
+
# (which defaults to 1 hour) if not given.
|
|
646
|
+
# @param include_file_paths [String]
|
|
647
|
+
# Include the path of the source file(s) as a column with this name.
|
|
561
648
|
#
|
|
562
649
|
# @return [LazyFrame]
|
|
563
650
|
def scan_csv(
|
|
564
651
|
source,
|
|
565
652
|
has_header: true,
|
|
566
|
-
|
|
567
|
-
|
|
653
|
+
separator: ",",
|
|
654
|
+
comment_prefix: nil,
|
|
568
655
|
quote_char: '"',
|
|
569
656
|
skip_rows: 0,
|
|
570
|
-
|
|
657
|
+
skip_lines: 0,
|
|
658
|
+
schema: nil,
|
|
659
|
+
schema_overrides: nil,
|
|
571
660
|
null_values: nil,
|
|
572
661
|
missing_utf8_is_empty_string: false,
|
|
573
662
|
ignore_errors: false,
|
|
574
663
|
cache: true,
|
|
575
664
|
with_column_names: nil,
|
|
665
|
+
infer_schema: true,
|
|
576
666
|
infer_schema_length: N_INFER_DEFAULT,
|
|
577
667
|
n_rows: nil,
|
|
578
668
|
encoding: "utf8",
|
|
579
669
|
low_memory: false,
|
|
580
|
-
rechunk:
|
|
670
|
+
rechunk: false,
|
|
581
671
|
skip_rows_after_header: 0,
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
672
|
+
row_index_name: nil,
|
|
673
|
+
row_index_offset: 0,
|
|
674
|
+
try_parse_dates: false,
|
|
585
675
|
eol_char: "\n",
|
|
676
|
+
new_columns: nil,
|
|
586
677
|
raise_if_empty: true,
|
|
587
678
|
truncate_ragged_lines: false,
|
|
588
679
|
decimal_comma: false,
|
|
589
|
-
glob: true
|
|
680
|
+
glob: true,
|
|
681
|
+
storage_options: nil,
|
|
682
|
+
credential_provider: "auto",
|
|
683
|
+
retries: 2,
|
|
684
|
+
file_cache_ttl: nil,
|
|
685
|
+
include_file_paths: nil
|
|
590
686
|
)
|
|
591
|
-
|
|
592
|
-
|
|
687
|
+
if new_columns
|
|
688
|
+
raise Todo
|
|
689
|
+
end
|
|
690
|
+
|
|
691
|
+
Utils._check_arg_is_1byte("separator", separator, false)
|
|
593
692
|
Utils._check_arg_is_1byte("quote_char", quote_char, true)
|
|
594
693
|
|
|
595
694
|
if Utils.pathlike?(source)
|
|
596
695
|
source = Utils.normalize_filepath(source)
|
|
597
696
|
end
|
|
598
697
|
|
|
698
|
+
if !infer_schema
|
|
699
|
+
infer_schema_length = 0
|
|
700
|
+
end
|
|
701
|
+
|
|
702
|
+
credential_provider_builder = _init_credential_provider_builder(
|
|
703
|
+
credential_provider, source, storage_options, "scan_csv"
|
|
704
|
+
)
|
|
705
|
+
|
|
599
706
|
_scan_csv_impl(
|
|
600
707
|
source,
|
|
601
708
|
has_header: has_header,
|
|
602
|
-
|
|
603
|
-
|
|
709
|
+
separator: separator,
|
|
710
|
+
comment_prefix: comment_prefix,
|
|
604
711
|
quote_char: quote_char,
|
|
605
712
|
skip_rows: skip_rows,
|
|
606
|
-
|
|
713
|
+
skip_lines: skip_lines,
|
|
714
|
+
schema_overrides: schema_overrides,
|
|
715
|
+
schema: schema,
|
|
607
716
|
null_values: null_values,
|
|
608
717
|
ignore_errors: ignore_errors,
|
|
609
718
|
cache: cache,
|
|
@@ -614,11 +723,19 @@ module Polars
|
|
|
614
723
|
rechunk: rechunk,
|
|
615
724
|
skip_rows_after_header: skip_rows_after_header,
|
|
616
725
|
encoding: encoding,
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
726
|
+
row_index_name: row_index_name,
|
|
727
|
+
row_index_offset: row_index_offset,
|
|
728
|
+
try_parse_dates: try_parse_dates,
|
|
620
729
|
eol_char: eol_char,
|
|
621
|
-
|
|
730
|
+
raise_if_empty: raise_if_empty,
|
|
731
|
+
truncate_ragged_lines: truncate_ragged_lines,
|
|
732
|
+
decimal_comma: decimal_comma,
|
|
733
|
+
glob: glob,
|
|
734
|
+
retries: retries,
|
|
735
|
+
storage_options: storage_options,
|
|
736
|
+
credential_provider: credential_provider_builder,
|
|
737
|
+
file_cache_ttl: file_cache_ttl,
|
|
738
|
+
include_file_paths: include_file_paths
|
|
622
739
|
)
|
|
623
740
|
end
|
|
624
741
|
|
|
@@ -626,12 +743,15 @@ module Polars
|
|
|
626
743
|
def _scan_csv_impl(
|
|
627
744
|
source,
|
|
628
745
|
has_header: true,
|
|
629
|
-
|
|
630
|
-
|
|
746
|
+
separator: ",",
|
|
747
|
+
comment_prefix: nil,
|
|
631
748
|
quote_char: '"',
|
|
632
749
|
skip_rows: 0,
|
|
633
|
-
|
|
750
|
+
skip_lines: 0,
|
|
751
|
+
schema: nil,
|
|
752
|
+
schema_overrides: nil,
|
|
634
753
|
null_values: nil,
|
|
754
|
+
missing_utf8_is_empty_string: false,
|
|
635
755
|
ignore_errors: false,
|
|
636
756
|
cache: true,
|
|
637
757
|
with_column_names: nil,
|
|
@@ -639,19 +759,27 @@ module Polars
|
|
|
639
759
|
n_rows: nil,
|
|
640
760
|
encoding: "utf8",
|
|
641
761
|
low_memory: false,
|
|
642
|
-
rechunk:
|
|
762
|
+
rechunk: false,
|
|
643
763
|
skip_rows_after_header: 0,
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
764
|
+
row_index_name: nil,
|
|
765
|
+
row_index_offset: 0,
|
|
766
|
+
try_parse_dates: false,
|
|
647
767
|
eol_char: "\n",
|
|
648
|
-
|
|
768
|
+
raise_if_empty: true,
|
|
769
|
+
truncate_ragged_lines: true,
|
|
770
|
+
decimal_comma: false,
|
|
771
|
+
glob: true,
|
|
772
|
+
storage_options: nil,
|
|
773
|
+
credential_provider: nil,
|
|
774
|
+
retries: 2,
|
|
775
|
+
file_cache_ttl: nil,
|
|
776
|
+
include_file_paths: nil
|
|
649
777
|
)
|
|
650
778
|
dtype_list = nil
|
|
651
|
-
if !
|
|
779
|
+
if !schema_overrides.nil?
|
|
652
780
|
dtype_list = []
|
|
653
|
-
|
|
654
|
-
dtype_list << [k, Utils.
|
|
781
|
+
schema_overrides.each do |k, v|
|
|
782
|
+
dtype_list << [k, Utils.parse_into_dtype(v)]
|
|
655
783
|
end
|
|
656
784
|
end
|
|
657
785
|
processed_null_values = Utils._process_null_values(null_values)
|
|
@@ -666,27 +794,38 @@ module Polars
|
|
|
666
794
|
rblf =
|
|
667
795
|
RbLazyFrame.new_from_csv(
|
|
668
796
|
source,
|
|
669
|
-
|
|
797
|
+
sources,
|
|
798
|
+
separator,
|
|
670
799
|
has_header,
|
|
671
800
|
ignore_errors,
|
|
672
801
|
skip_rows,
|
|
802
|
+
skip_lines,
|
|
673
803
|
n_rows,
|
|
674
804
|
cache,
|
|
675
805
|
dtype_list,
|
|
676
806
|
low_memory,
|
|
677
|
-
|
|
807
|
+
comment_prefix,
|
|
678
808
|
quote_char,
|
|
679
809
|
processed_null_values,
|
|
810
|
+
missing_utf8_is_empty_string,
|
|
680
811
|
infer_schema_length,
|
|
681
812
|
with_column_names,
|
|
682
813
|
rechunk,
|
|
683
814
|
skip_rows_after_header,
|
|
684
815
|
encoding,
|
|
685
|
-
Utils.parse_row_index_args(
|
|
686
|
-
|
|
816
|
+
Utils.parse_row_index_args(row_index_name, row_index_offset),
|
|
817
|
+
try_parse_dates,
|
|
687
818
|
eol_char,
|
|
819
|
+
raise_if_empty,
|
|
688
820
|
truncate_ragged_lines,
|
|
689
|
-
|
|
821
|
+
decimal_comma,
|
|
822
|
+
glob,
|
|
823
|
+
schema,
|
|
824
|
+
storage_options,
|
|
825
|
+
credential_provider,
|
|
826
|
+
retries,
|
|
827
|
+
file_cache_ttl,
|
|
828
|
+
include_file_paths
|
|
690
829
|
)
|
|
691
830
|
Utils.wrap_ldf(rblf)
|
|
692
831
|
end
|