polars-runtime-compat 1.34.0b3__cp39-abi3-win_amd64.whl → 1.34.0b5__cp39-abi3-win_amd64.whl
This diff compares the content of two publicly available package versions as released to a supported registry. It is provided for informational purposes only.
Potentially problematic release.
This version of polars-runtime-compat might be problematic.
- _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
- polars_runtime_compat-1.34.0b5.dist-info/METADATA +35 -0
- polars_runtime_compat-1.34.0b5.dist-info/RECORD +6 -0
- polars/__init__.py +0 -528
- polars/_cpu_check.py +0 -265
- polars/_dependencies.py +0 -355
- polars/_plr.py +0 -99
- polars/_plr.pyi +0 -2496
- polars/_reexport.py +0 -23
- polars/_typing.py +0 -478
- polars/_utils/__init__.py +0 -37
- polars/_utils/async_.py +0 -102
- polars/_utils/cache.py +0 -176
- polars/_utils/cloud.py +0 -40
- polars/_utils/constants.py +0 -29
- polars/_utils/construction/__init__.py +0 -46
- polars/_utils/construction/dataframe.py +0 -1397
- polars/_utils/construction/other.py +0 -72
- polars/_utils/construction/series.py +0 -560
- polars/_utils/construction/utils.py +0 -118
- polars/_utils/convert.py +0 -224
- polars/_utils/deprecation.py +0 -406
- polars/_utils/getitem.py +0 -457
- polars/_utils/logging.py +0 -11
- polars/_utils/nest_asyncio.py +0 -264
- polars/_utils/parquet.py +0 -15
- polars/_utils/parse/__init__.py +0 -12
- polars/_utils/parse/expr.py +0 -242
- polars/_utils/polars_version.py +0 -19
- polars/_utils/pycapsule.py +0 -53
- polars/_utils/scan.py +0 -27
- polars/_utils/serde.py +0 -63
- polars/_utils/slice.py +0 -215
- polars/_utils/udfs.py +0 -1251
- polars/_utils/unstable.py +0 -63
- polars/_utils/various.py +0 -782
- polars/_utils/wrap.py +0 -25
- polars/api.py +0 -370
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +0 -19
- polars/catalog/unity/client.py +0 -733
- polars/catalog/unity/models.py +0 -152
- polars/config.py +0 -1571
- polars/convert/__init__.py +0 -25
- polars/convert/general.py +0 -1046
- polars/convert/normalize.py +0 -261
- polars/dataframe/__init__.py +0 -5
- polars/dataframe/_html.py +0 -186
- polars/dataframe/frame.py +0 -12582
- polars/dataframe/group_by.py +0 -1067
- polars/dataframe/plotting.py +0 -257
- polars/datatype_expr/__init__.py +0 -5
- polars/datatype_expr/array.py +0 -56
- polars/datatype_expr/datatype_expr.py +0 -304
- polars/datatype_expr/list.py +0 -18
- polars/datatype_expr/struct.py +0 -69
- polars/datatypes/__init__.py +0 -122
- polars/datatypes/_parse.py +0 -195
- polars/datatypes/_utils.py +0 -48
- polars/datatypes/classes.py +0 -1213
- polars/datatypes/constants.py +0 -11
- polars/datatypes/constructor.py +0 -172
- polars/datatypes/convert.py +0 -366
- polars/datatypes/group.py +0 -130
- polars/exceptions.py +0 -230
- polars/expr/__init__.py +0 -7
- polars/expr/array.py +0 -964
- polars/expr/binary.py +0 -346
- polars/expr/categorical.py +0 -306
- polars/expr/datetime.py +0 -2620
- polars/expr/expr.py +0 -11272
- polars/expr/list.py +0 -1408
- polars/expr/meta.py +0 -444
- polars/expr/name.py +0 -321
- polars/expr/string.py +0 -3045
- polars/expr/struct.py +0 -357
- polars/expr/whenthen.py +0 -185
- polars/functions/__init__.py +0 -193
- polars/functions/aggregation/__init__.py +0 -33
- polars/functions/aggregation/horizontal.py +0 -298
- polars/functions/aggregation/vertical.py +0 -341
- polars/functions/as_datatype.py +0 -848
- polars/functions/business.py +0 -138
- polars/functions/col.py +0 -384
- polars/functions/datatype.py +0 -121
- polars/functions/eager.py +0 -524
- polars/functions/escape_regex.py +0 -29
- polars/functions/lazy.py +0 -2751
- polars/functions/len.py +0 -68
- polars/functions/lit.py +0 -210
- polars/functions/random.py +0 -22
- polars/functions/range/__init__.py +0 -19
- polars/functions/range/_utils.py +0 -15
- polars/functions/range/date_range.py +0 -303
- polars/functions/range/datetime_range.py +0 -370
- polars/functions/range/int_range.py +0 -348
- polars/functions/range/linear_space.py +0 -311
- polars/functions/range/time_range.py +0 -287
- polars/functions/repeat.py +0 -301
- polars/functions/whenthen.py +0 -353
- polars/interchange/__init__.py +0 -10
- polars/interchange/buffer.py +0 -77
- polars/interchange/column.py +0 -190
- polars/interchange/dataframe.py +0 -230
- polars/interchange/from_dataframe.py +0 -328
- polars/interchange/protocol.py +0 -303
- polars/interchange/utils.py +0 -170
- polars/io/__init__.py +0 -64
- polars/io/_utils.py +0 -317
- polars/io/avro.py +0 -49
- polars/io/clipboard.py +0 -36
- polars/io/cloud/__init__.py +0 -17
- polars/io/cloud/_utils.py +0 -80
- polars/io/cloud/credential_provider/__init__.py +0 -17
- polars/io/cloud/credential_provider/_builder.py +0 -520
- polars/io/cloud/credential_provider/_providers.py +0 -618
- polars/io/csv/__init__.py +0 -9
- polars/io/csv/_utils.py +0 -38
- polars/io/csv/batched_reader.py +0 -142
- polars/io/csv/functions.py +0 -1495
- polars/io/database/__init__.py +0 -6
- polars/io/database/_arrow_registry.py +0 -70
- polars/io/database/_cursor_proxies.py +0 -147
- polars/io/database/_executor.py +0 -578
- polars/io/database/_inference.py +0 -314
- polars/io/database/_utils.py +0 -144
- polars/io/database/functions.py +0 -516
- polars/io/delta.py +0 -499
- polars/io/iceberg/__init__.py +0 -3
- polars/io/iceberg/_utils.py +0 -697
- polars/io/iceberg/dataset.py +0 -556
- polars/io/iceberg/functions.py +0 -151
- polars/io/ipc/__init__.py +0 -8
- polars/io/ipc/functions.py +0 -514
- polars/io/json/__init__.py +0 -3
- polars/io/json/read.py +0 -101
- polars/io/ndjson.py +0 -332
- polars/io/parquet/__init__.py +0 -17
- polars/io/parquet/field_overwrites.py +0 -140
- polars/io/parquet/functions.py +0 -722
- polars/io/partition.py +0 -491
- polars/io/plugins.py +0 -187
- polars/io/pyarrow_dataset/__init__.py +0 -5
- polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
- polars/io/pyarrow_dataset/functions.py +0 -79
- polars/io/scan_options/__init__.py +0 -5
- polars/io/scan_options/_options.py +0 -59
- polars/io/scan_options/cast_options.py +0 -126
- polars/io/spreadsheet/__init__.py +0 -6
- polars/io/spreadsheet/_utils.py +0 -52
- polars/io/spreadsheet/_write_utils.py +0 -647
- polars/io/spreadsheet/functions.py +0 -1323
- polars/lazyframe/__init__.py +0 -9
- polars/lazyframe/engine_config.py +0 -61
- polars/lazyframe/frame.py +0 -8564
- polars/lazyframe/group_by.py +0 -669
- polars/lazyframe/in_process.py +0 -42
- polars/lazyframe/opt_flags.py +0 -333
- polars/meta/__init__.py +0 -14
- polars/meta/build.py +0 -33
- polars/meta/index_type.py +0 -27
- polars/meta/thread_pool.py +0 -50
- polars/meta/versions.py +0 -120
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +0 -213
- polars/ml/utilities.py +0 -30
- polars/plugins.py +0 -155
- polars/py.typed +0 -0
- polars/pyproject.toml +0 -103
- polars/schema.py +0 -265
- polars/selectors.py +0 -3117
- polars/series/__init__.py +0 -5
- polars/series/array.py +0 -776
- polars/series/binary.py +0 -254
- polars/series/categorical.py +0 -246
- polars/series/datetime.py +0 -2275
- polars/series/list.py +0 -1087
- polars/series/plotting.py +0 -191
- polars/series/series.py +0 -9197
- polars/series/string.py +0 -2367
- polars/series/struct.py +0 -154
- polars/series/utils.py +0 -191
- polars/sql/__init__.py +0 -7
- polars/sql/context.py +0 -677
- polars/sql/functions.py +0 -139
- polars/string_cache.py +0 -185
- polars/testing/__init__.py +0 -13
- polars/testing/asserts/__init__.py +0 -9
- polars/testing/asserts/frame.py +0 -231
- polars/testing/asserts/series.py +0 -219
- polars/testing/asserts/utils.py +0 -12
- polars/testing/parametric/__init__.py +0 -33
- polars/testing/parametric/profiles.py +0 -107
- polars/testing/parametric/strategies/__init__.py +0 -22
- polars/testing/parametric/strategies/_utils.py +0 -14
- polars/testing/parametric/strategies/core.py +0 -615
- polars/testing/parametric/strategies/data.py +0 -452
- polars/testing/parametric/strategies/dtype.py +0 -436
- polars/testing/parametric/strategies/legacy.py +0 -169
- polars/type_aliases.py +0 -24
- polars_runtime_compat-1.34.0b3.dist-info/METADATA +0 -190
- polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/WHEEL +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/licenses/LICENSE +0 -0
polars/io/csv/functions.py
DELETED
@@ -1,1495 +0,0 @@
from __future__ import annotations

import contextlib
import os
from collections.abc import Sequence
from io import BytesIO, StringIO
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Callable, Literal

import polars._reexport as pl
import polars.functions as F
from polars._utils.deprecation import deprecate_renamed_parameter
from polars._utils.various import (
    _process_null_values,
    is_path_or_str_sequence,
    is_str_sequence,
    normalize_filepath,
    qualified_type_name,
)
from polars._utils.wrap import wrap_df, wrap_ldf
from polars.datatypes import N_INFER_DEFAULT, String, parse_into_dtype
from polars.io._utils import (
    is_glob_pattern,
    parse_columns_arg,
    parse_row_index_args,
    prepare_file_arg,
)
from polars.io.cloud.credential_provider._builder import (
    _init_credential_provider_builder,
)
from polars.io.csv._utils import _check_arg_is_1byte, _update_columns
from polars.io.csv.batched_reader import BatchedCsvReader

with contextlib.suppress(ImportError):  # Module not available when building docs
    from polars._plr import PyDataFrame, PyLazyFrame

if TYPE_CHECKING:
    from collections.abc import Mapping

    from polars import DataFrame, LazyFrame
    from polars._typing import CsvEncoding, PolarsDataType, SchemaDict
    from polars.io.cloud import CredentialProviderFunction
    from polars.io.cloud.credential_provider._builder import CredentialProviderBuilder


@deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31")
@deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
@deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
def read_csv(
    source: str | Path | IO[str] | IO[bytes] | bytes,
    *,
    has_header: bool = True,
    columns: Sequence[int] | Sequence[str] | None = None,
    new_columns: Sequence[str] | None = None,
    separator: str = ",",
    comment_prefix: str | None = None,
    quote_char: str | None = '"',
    skip_rows: int = 0,
    skip_lines: int = 0,
    schema: SchemaDict | None = None,
    schema_overrides: (
        Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None
    ) = None,
    null_values: str | Sequence[str] | dict[str, str] | None = None,
    missing_utf8_is_empty_string: bool = False,
    ignore_errors: bool = False,
    try_parse_dates: bool = False,
    n_threads: int | None = None,
    infer_schema: bool = True,
    infer_schema_length: int | None = N_INFER_DEFAULT,
    batch_size: int = 8192,
    n_rows: int | None = None,
    encoding: CsvEncoding | str = "utf8",
    low_memory: bool = False,
    rechunk: bool = False,
    use_pyarrow: bool = False,
    storage_options: dict[str, Any] | None = None,
    skip_rows_after_header: int = 0,
    row_index_name: str | None = None,
    row_index_offset: int = 0,
    sample_size: int = 1024,
    eol_char: str = "\n",
    raise_if_empty: bool = True,
    truncate_ragged_lines: bool = False,
    decimal_comma: bool = False,
    glob: bool = True,
) -> DataFrame:
    r"""
    Read a CSV file into a DataFrame.

    Polars expects CSV data to strictly conform to RFC 4180, unless documented
    otherwise. Malformed data, though common, may lead to undefined behavior.

    .. versionchanged:: 0.20.31
        The `dtypes` parameter was renamed `schema_overrides`.
    .. versionchanged:: 0.20.4
        * The `row_count_name` parameter was renamed `row_index_name`.
        * The `row_count_offset` parameter was renamed `row_index_offset`.

    Parameters
    ----------
    source
        Path to a file or a file-like object (by "file-like object" we refer to objects
        that have a `read()` method, such as a file handler like the builtin `open`
        function, or a `BytesIO` instance). If `fsspec` is installed, it will be used
        to open remote files. For file-like objects, the stream position may not be
        updated accordingly after reading.
    has_header
        Indicate if the first row of the dataset is a header or not. If set to False,
        column names will be autogenerated in the following format: `column_x`, with
        `x` being an enumeration over every column in the dataset, starting at 1.
    columns
        Columns to select. Accepts a list of column indices (starting
        at zero) or a list of column names.
    new_columns
        Rename columns right after parsing the CSV file. If the given
        list is shorter than the width of the DataFrame the remaining
        columns will have their original name.
    separator
        Single byte character to use as separator in the file.
    comment_prefix
        A string used to indicate the start of a comment line. Comment lines are skipped
        during parsing. Common examples of comment prefixes are `#` and `//`.
    quote_char
        Single byte character used for csv quoting, default = `"`.
        Set to None to turn off special handling and escaping of quotes.
    skip_rows
        Start reading after ``skip_rows`` rows. The header will be parsed at this
        offset. Note that we respect CSV escaping/comments when skipping rows.
        If you want to skip by newline char only, use `skip_lines`.
    skip_lines
        Start reading after `skip_lines` lines. The header will be parsed at this
        offset. Note that CSV escaping will not be respected when skipping lines.
        If you want to skip valid CSV rows, use ``skip_rows``.
    schema
        Provide the schema. This means that polars doesn't do schema inference.
        This argument expects the complete schema, whereas `schema_overrides` can be
        used to partially overwrite a schema. Note that the order of the columns in
        the provided `schema` must match the order of the columns in the CSV being read.
    schema_overrides
        Overwrite dtypes for specific or all columns during schema inference.
    null_values
        Values to interpret as null values. You can provide a:

        - `str`: All values equal to this string will be null.
        - `List[str]`: All values equal to any string in this list will be null.
        - `Dict[str, str]`: A dictionary that maps column name to a
          null value string.

    missing_utf8_is_empty_string
        By default a missing value is considered to be null; if you would prefer missing
        utf8 values to be treated as the empty string you can set this param True.
    ignore_errors
        Try to keep reading lines if some lines yield errors.
        Before using this option, try to increase the number of lines used for schema
        inference with e.g `infer_schema_length=10000` or override automatic dtype
        inference for specific columns with the `schema_overrides` option or use
        `infer_schema=False` to read all columns as `pl.String` to check which
        values might cause an issue.
    try_parse_dates
        Try to automatically parse dates. Most ISO8601-like formats can
        be inferred, as well as a handful of others. If this does not succeed,
        the column remains of data type `pl.String`.
        If `use_pyarrow=True`, dates will always be parsed.
    n_threads
        Number of threads to use in csv parsing.
        Defaults to the number of physical cpu's of your system.
    infer_schema
        When `True`, the schema is inferred from the data using the first
        `infer_schema_length` rows.
        When `False`, the schema is not inferred and will be `pl.String` if not
        specified in `schema` or `schema_overrides`.
    infer_schema_length
        The maximum number of rows to scan for schema inference.
        If set to `None`, the full data may be scanned *(this is slow)*.
        Set `infer_schema=False` to read all columns as `pl.String`.
    batch_size
        Number of lines to read into the buffer at once.
        Modify this to change performance.
    n_rows
        Stop reading from CSV file after reading `n_rows`.
        During multi-threaded parsing, an upper bound of `n_rows`
        rows cannot be guaranteed.
    encoding : {'utf8', 'utf8-lossy', 'windows-1252', 'windows-1252-lossy', ...}
        Lossy means that invalid utf8 values are replaced with `�`
        characters. When using other encodings than `utf8` or
        `utf8-lossy`, the input is first decoded in memory with
        python. Defaults to `utf8`.
    low_memory
        Reduce memory pressure at the expense of performance.
    rechunk
        Make sure that all columns are contiguous in memory by
        aggregating the chunks into a single array.
    use_pyarrow
        Try to use pyarrow's native CSV parser. This will always
        parse dates, even if `try_parse_dates=False`.
        This is not always possible. The set of arguments given to
        this function determines if it is possible to use pyarrow's
        native parser. Note that pyarrow and polars may have a
        different strategy regarding type inference.
    storage_options
        Extra options that make sense for `fsspec.open()` or a
        particular storage connection.
        e.g. host, port, username, password, etc.
    skip_rows_after_header
        Skip this number of rows when the header is parsed.
    row_index_name
        Insert a row index column with the given name into the DataFrame as the first
        column. If set to `None` (default), no row index column is created.
    row_index_offset
        Start the row index at this offset. Cannot be negative.
        Only used if `row_index_name` is set.
    sample_size
        Set the sample size. This is used to sample statistics to estimate the
        allocation needed.

        .. deprecated:: 1.10.0
            This parameter is now a no-op.
    eol_char
        Single byte end of line character (default: `\n`). When encountering a file
        with windows line endings (`\r\n`), one can go with the default `\n`. The extra
        `\r` will be removed when processed.
    raise_if_empty
        When there is no data in the source, `NoDataError` is raised. If this parameter
        is set to False, an empty DataFrame (with no columns) is returned instead.
    truncate_ragged_lines
        Truncate lines that are longer than the schema.
    decimal_comma
        Parse floats using a comma as the decimal separator instead of a period.
    glob
        Expand path given via globbing rules.

    Returns
    -------
    DataFrame

    See Also
    --------
    scan_csv : Lazily read from a CSV file or multiple files via glob patterns.

    Warnings
    --------
    Calling `read_csv().lazy()` is an antipattern as this forces Polars to materialize
    a full csv file and therefore cannot push any optimizations into the reader.
    Therefore always prefer `scan_csv` if you want to work with `LazyFrame` s.

    Notes
    -----
    If the schema is inferred incorrectly (e.g. as `pl.Int64` instead of `pl.Float64`),
    try to increase the number of lines used to infer the schema with
    `infer_schema_length` or override the inferred dtype for those columns with
    `schema_overrides`.

    Examples
    --------
    >>> pl.read_csv("data.csv", separator="|")  # doctest: +SKIP

    Demonstrate use against a BytesIO object, parsing string dates.

    >>> from io import BytesIO
    >>> data = BytesIO(
    ...     b"ID,Name,Birthday\n"
    ...     b"1,Alice,1995-07-12\n"
    ...     b"2,Bob,1990-09-20\n"
    ...     b"3,Charlie,2002-03-08\n"
    ... )
    >>> pl.read_csv(data, try_parse_dates=True)
    shape: (3, 3)
    ┌─────┬─────────┬────────────┐
    │ ID  ┆ Name    ┆ Birthday   │
    │ --- ┆ ---     ┆ ---        │
    │ i64 ┆ str     ┆ date       │
    ╞═════╪═════════╪════════════╡
    │ 1   ┆ Alice   ┆ 1995-07-12 │
    │ 2   ┆ Bob     ┆ 1990-09-20 │
    │ 3   ┆ Charlie ┆ 2002-03-08 │
    └─────┴─────────┴────────────┘
    """
    _check_arg_is_1byte("separator", separator, can_be_empty=False)
    _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
    _check_arg_is_1byte("eol_char", eol_char, can_be_empty=False)

    projection, columns = parse_columns_arg(columns)
    storage_options = storage_options or {}

    if columns and not has_header:
        for column in columns:
            if not column.startswith("column_"):
                msg = (
                    "specified column names do not start with 'column_',"
                    " but autogenerated header names were requested"
                )
                raise ValueError(msg)

    if schema_overrides is not None and not isinstance(
        schema_overrides, (dict, Sequence)
    ):
        msg = "`schema_overrides` should be of type list or dict"
        raise TypeError(msg)

    if (
        use_pyarrow
        and schema_overrides is None
        and n_rows is None
        and n_threads is None
        and not low_memory
        and null_values is None
    ):
        include_columns: Sequence[str] | None = None
        if columns:
            if not has_header:
                # Convert 'column_1', 'column_2', ... column names to 'f0', 'f1', ...
                # column names for pyarrow, if CSV file does not contain a header.
                include_columns = [f"f{int(column[7:]) - 1}" for column in columns]
            else:
                include_columns = columns

        if not columns and projection:
            # Convert column indices from projection to 'f0', 'f1', ... column names
            # for pyarrow.
            include_columns = [f"f{column_idx}" for column_idx in projection]

        with prepare_file_arg(
            source,
            encoding=None,
            use_pyarrow=True,
            raise_if_empty=raise_if_empty,
            storage_options=storage_options,
        ) as data:
            import pyarrow as pa
            import pyarrow.csv

            try:
                tbl = pa.csv.read_csv(
                    data,
                    pa.csv.ReadOptions(
                        skip_rows=skip_rows,
                        skip_rows_after_names=skip_rows_after_header,
                        autogenerate_column_names=not has_header,
                        encoding=encoding,
                    ),
                    pa.csv.ParseOptions(
                        delimiter=separator,
                        quote_char=quote_char if quote_char else False,
                        double_quote=quote_char is not None and quote_char == '"',
                    ),
                    pa.csv.ConvertOptions(
                        column_types=None,
                        include_columns=include_columns,
                        include_missing_columns=ignore_errors,
                    ),
                )
            except pa.ArrowInvalid as err:
                if raise_if_empty or "Empty CSV" not in str(err):
                    raise
                return pl.DataFrame()

        if not has_header:
            # Rename 'f0', 'f1', ... columns names autogenerated by pyarrow
            # to 'column_1', 'column_2', ...
            tbl = tbl.rename_columns(
                [f"column_{int(column[1:]) + 1}" for column in tbl.column_names]
            )

        df = pl.DataFrame._from_arrow(tbl, rechunk=rechunk)
        if new_columns:
            return _update_columns(df, new_columns)
        return df

    if projection and schema_overrides and isinstance(schema_overrides, list):
        if len(projection) < len(schema_overrides):
            msg = "more schema overrides are specified than there are selected columns"
            raise ValueError(msg)

        # Fix list of dtypes when used together with projection as polars CSV reader
        # wants a list of dtypes for the x first columns before it does the projection.
        dtypes_list: list[PolarsDataType] = [String] * (max(projection) + 1)

        for idx, column_idx in enumerate(projection):
            if idx < len(schema_overrides):
                dtypes_list[column_idx] = schema_overrides[idx]

        schema_overrides = dtypes_list

    if columns and schema_overrides and isinstance(schema_overrides, list):
        if len(columns) < len(schema_overrides):
            msg = "more dtypes overrides are specified than there are selected columns"
            raise ValueError(msg)

        # Map list of dtypes when used together with selected columns as a dtypes dict
        # so the dtypes are applied to the correct column instead of the first x
        # columns.
        schema_overrides = dict(zip(columns, schema_overrides))

    if new_columns and schema_overrides and isinstance(schema_overrides, dict):
        current_columns = None

        # As new column names are not available yet while parsing the CSV file, rename
        # column names in dtypes to old names (if possible) so they can be used during
        # CSV parsing.
        if columns:
            if len(columns) < len(new_columns):
                msg = (
                    "more new column names are specified than there are selected"
                    " columns"
                )
                raise ValueError(msg)

            # Get column names of requested columns.
            current_columns = columns[0 : len(new_columns)]
        elif not has_header:
            # When there are no header, column names are autogenerated (and known).

            if projection:
                if columns and len(columns) < len(new_columns):
                    msg = (
                        "more new column names are specified than there are selected"
                        " columns"
                    )
                    raise ValueError(msg)
                # Convert column indices from projection to 'column_1', 'column_2', ...
                # column names.
                current_columns = [
                    f"column_{column_idx + 1}" for column_idx in projection
                ]
            else:
                # Generate autogenerated 'column_1', 'column_2', ... column names for
                # new column names.
                current_columns = [
                    f"column_{column_idx}"
                    for column_idx in range(1, len(new_columns) + 1)
                ]
        else:
            # When a header is present, column names are not known yet.

            if len(schema_overrides) <= len(new_columns):
                # If dtypes dictionary contains less or same amount of values than new
                # column names a list of dtypes can be created if all listed column
                # names in dtypes dictionary appear in the first consecutive new column
                # names.
                dtype_list = [
                    schema_overrides[new_column_name]
                    for new_column_name in new_columns[0 : len(schema_overrides)]
                    if new_column_name in schema_overrides
                ]

                if len(dtype_list) == len(schema_overrides):
                    schema_overrides = dtype_list

        if current_columns and isinstance(schema_overrides, dict):
            new_to_current = dict(zip(new_columns, current_columns))
            # Change new column names to current column names in dtype.
            schema_overrides = {
                new_to_current.get(column_name, column_name): column_dtype
                for column_name, column_dtype in schema_overrides.items()
            }

    if not infer_schema:
        infer_schema_length = 0

    # TODO: scan_csv doesn't support a "dtype slice" (i.e. list[DataType])
    schema_overrides_is_list = isinstance(schema_overrides, Sequence)
    encoding_supported_in_lazy = encoding in {"utf8", "utf8-lossy"}

    new_streaming = (
        os.getenv("POLARS_FORCE_NEW_STREAMING") == "1"
        or os.getenv("POLARS_AUTO_NEW_STREAMING") == "1"
    )

    if new_streaming or (
        # Check that it is not a BytesIO object
        isinstance(v := source, (str, Path))
        and (
            # HuggingFace only for now ⊂( ◜◒◝ )⊃
            str(v).startswith("hf://")
            # Also dispatch on FORCE_ASYNC, so that this codepath gets run
            # through by our test suite during CI.
            or (
                os.getenv("POLARS_FORCE_ASYNC") == "1"
                and not schema_overrides_is_list
                and encoding_supported_in_lazy
            )
            # TODO: We can't dispatch this for all paths due to a few reasons:
            # * `scan_csv` does not support compressed files
            # * The `storage_options` configuration keys are different between
            #   fsspec and object_store (would require a breaking change)
        )
    ):
        if isinstance(source, (str, Path)):
            source = normalize_filepath(source, check_not_directory=False)
        elif is_path_or_str_sequence(source, allow_str=False):
            source = [  # type: ignore[assignment]
                normalize_filepath(source, check_not_directory=False)
                for source in source
            ]

        if not new_streaming:
            if schema_overrides_is_list:
                msg = "passing a list to `schema_overrides` is unsupported for hf:// paths"
                raise ValueError(msg)
            if not encoding_supported_in_lazy:
                msg = f"unsupported encoding {encoding} for hf:// paths"
                raise ValueError(msg)

        lf = _scan_csv_impl(
            source,
            has_header=has_header,
            separator=separator,
            comment_prefix=comment_prefix,
            quote_char=quote_char,
            skip_rows=skip_rows,
            skip_lines=skip_lines,
            schema_overrides=schema_overrides,  # type: ignore[arg-type]
            schema=schema,
            null_values=null_values,
            missing_utf8_is_empty_string=missing_utf8_is_empty_string,
            ignore_errors=ignore_errors,
            try_parse_dates=try_parse_dates,
            infer_schema_length=infer_schema_length,
            n_rows=n_rows,
            encoding=encoding,  # type: ignore[arg-type]
            low_memory=low_memory,
            rechunk=rechunk,
            skip_rows_after_header=skip_rows_after_header,
            row_index_name=row_index_name,
            row_index_offset=row_index_offset,
            eol_char=eol_char,
            raise_if_empty=raise_if_empty,
            truncate_ragged_lines=truncate_ragged_lines,
            decimal_comma=decimal_comma,
            glob=glob,
        )

        if columns:
            lf = lf.select(columns)
        elif projection:
            lf = lf.select(F.nth(projection))

        df = lf.collect()

    else:
        with prepare_file_arg(
            source,
            encoding=encoding,
            use_pyarrow=False,
            raise_if_empty=raise_if_empty,
            storage_options=storage_options,
        ) as data:
            df = _read_csv_impl(
                data,
                has_header=has_header,
                columns=columns if columns else projection,
                separator=separator,
                comment_prefix=comment_prefix,
                quote_char=quote_char,
                skip_rows=skip_rows,
                skip_lines=skip_lines,
                schema_overrides=schema_overrides,
                schema=schema,
                null_values=null_values,
                missing_utf8_is_empty_string=missing_utf8_is_empty_string,
                ignore_errors=ignore_errors,
                try_parse_dates=try_parse_dates,
                n_threads=n_threads,
                infer_schema_length=infer_schema_length,
                batch_size=batch_size,
                n_rows=n_rows,
                encoding=encoding if encoding == "utf8-lossy" else "utf8",
                low_memory=low_memory,
                rechunk=rechunk,
                skip_rows_after_header=skip_rows_after_header,
                row_index_name=row_index_name,
                row_index_offset=row_index_offset,
                eol_char=eol_char,
                raise_if_empty=raise_if_empty,
                truncate_ragged_lines=truncate_ragged_lines,
                decimal_comma=decimal_comma,
                glob=glob,
            )

    if new_columns:
        return _update_columns(df, new_columns)
    return df


def _read_csv_impl(
    source: str | Path | IO[bytes] | bytes,
    *,
    has_header: bool = True,
    columns: Sequence[int] | Sequence[str] | None = None,
    separator: str = ",",
    comment_prefix: str | None = None,
    quote_char: str | None = '"',
    skip_rows: int = 0,
    skip_lines: int = 0,
    schema: None | SchemaDict = None,
    schema_overrides: None | (SchemaDict | Sequence[PolarsDataType]) = None,
    null_values: str | Sequence[str] | dict[str, str] | None = None,
    missing_utf8_is_empty_string: bool = False,
    ignore_errors: bool = False,
    try_parse_dates: bool = False,
    n_threads: int | None = None,
    infer_schema_length: int | None = N_INFER_DEFAULT,
    batch_size: int = 8192,
    n_rows: int | None = None,
    encoding: CsvEncoding = "utf8",
    low_memory: bool = False,
    rechunk: bool = False,
    skip_rows_after_header: int = 0,
    row_index_name: str | None = None,
    row_index_offset: int = 0,
    sample_size: int = 1024,
    eol_char: str = "\n",
    raise_if_empty: bool = True,
    truncate_ragged_lines: bool = False,
    decimal_comma: bool = False,
    glob: bool = True,
) -> DataFrame:
    path: str | None
    if isinstance(source, (str, Path)):
        path = normalize_filepath(source, check_not_directory=False)
    else:
        path = None
        if isinstance(source, BytesIO):
            source = source.getvalue()
        if isinstance(source, StringIO):
            source = source.getvalue().encode()

    dtype_list: Sequence[tuple[str, PolarsDataType]] | None = None
    dtype_slice: Sequence[PolarsDataType] | None = None
    if schema_overrides is not None:
        if isinstance(schema_overrides, dict):
            dtype_list = []
            for k, v in schema_overrides.items():
                dtype_list.append((k, parse_into_dtype(v)))
        elif isinstance(schema_overrides, Sequence):
            dtype_slice = schema_overrides
        else:
            msg = f"`schema_overrides` should be of type list or dict, got {qualified_type_name(schema_overrides)!r}"
            raise TypeError(msg)

    processed_null_values = _process_null_values(null_values)

    if isinstance(columns, str):
        columns = [columns]
    if isinstance(source, str) and is_glob_pattern(source):
        dtypes_dict = None
        if dtype_list is not None:
            dtypes_dict = dict(dtype_list)
        if dtype_slice is not None:
            msg = (
                "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument"
                "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]"
            )
            raise ValueError(msg)
        from polars import scan_csv

        scan = scan_csv(
            source,
            has_header=has_header,
            separator=separator,
            comment_prefix=comment_prefix,
            quote_char=quote_char,
            skip_rows=skip_rows,
            skip_lines=skip_lines,
            schema=schema,
            schema_overrides=dtypes_dict,
            null_values=null_values,
            missing_utf8_is_empty_string=missing_utf8_is_empty_string,
            ignore_errors=ignore_errors,
            infer_schema_length=infer_schema_length,
            n_rows=n_rows,
            low_memory=low_memory,
            rechunk=rechunk,
            skip_rows_after_header=skip_rows_after_header,
            row_index_name=row_index_name,
            row_index_offset=row_index_offset,
            eol_char=eol_char,
            raise_if_empty=raise_if_empty,
            truncate_ragged_lines=truncate_ragged_lines,
            decimal_comma=decimal_comma,
            glob=glob,
        )
        if columns is None:
            return scan.collect()
        elif is_str_sequence(columns, allow_str=False):
            return scan.select(columns).collect()
        else:
            msg = (
                "cannot use glob patterns and integer based projection as `columns` argument"
                "\n\nUse columns: List[str]"
            )
            raise ValueError(msg)

    projection, columns = parse_columns_arg(columns)

    pydf = PyDataFrame.read_csv(
        source,
        infer_schema_length,
        batch_size,
        has_header,
        ignore_errors,
        n_rows,
        skip_rows,
        skip_lines,
        projection,
        separator,
        rechunk,
        columns,
        encoding,
        n_threads,
        path,
        dtype_list,
        dtype_slice,
        low_memory,
        comment_prefix,
        quote_char,
        processed_null_values,
        missing_utf8_is_empty_string,
        try_parse_dates,
        skip_rows_after_header,
        parse_row_index_args(row_index_name, row_index_offset),
        eol_char=eol_char,
        raise_if_empty=raise_if_empty,
        truncate_ragged_lines=truncate_ragged_lines,
        decimal_comma=decimal_comma,
        schema=schema,
    )
    return wrap_df(pydf)


@deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31")
@deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
@deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
def read_csv_batched(
    source: str | Path,
    *,
    has_header: bool = True,
    columns: Sequence[int] | Sequence[str] | None = None,
    new_columns: Sequence[str] | None = None,
    separator: str = ",",
    comment_prefix: str | None = None,
    quote_char: str | None = '"',
    skip_rows: int = 0,
    skip_lines: int = 0,
    schema_overrides: (
        Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None
    ) = None,
    null_values: str | Sequence[str] | dict[str, str] | None = None,
    missing_utf8_is_empty_string: bool = False,
    ignore_errors: bool = False,
    try_parse_dates: bool = False,
    n_threads: int | None = None,
    infer_schema_length: int | None = N_INFER_DEFAULT,
    batch_size: int = 50_000,
    n_rows: int | None = None,
    encoding: CsvEncoding | str = "utf8",
    low_memory: bool = False,
    rechunk: bool = False,
    skip_rows_after_header: int = 0,
    row_index_name: str | None = None,
    row_index_offset: int = 0,
    sample_size: int = 1024,
    eol_char: str = "\n",
    raise_if_empty: bool = True,
    truncate_ragged_lines: bool = False,
    decimal_comma: bool = False,
) -> BatchedCsvReader:
    r"""
    Read a CSV file in batches.

    Upon creation of the `BatchedCsvReader`, Polars will gather statistics and
    determine the file chunks. After that, work will only be done if `next_batches`
    is called, which will return a list of `n` frames of the given batch size.

    .. versionchanged:: 0.20.31
        The `dtypes` parameter was renamed `schema_overrides`.
    .. versionchanged:: 0.20.4
        * The `row_count_name` parameter was renamed `row_index_name`.
        * The `row_count_offset` parameter was renamed `row_index_offset`.

    Parameters
    ----------
    source
        Path to a file or a file-like object (by "file-like object" we refer to objects
        that have a `read()` method, such as a file handler like the builtin `open`
        function, or a `BytesIO` instance). If `fsspec` is installed, it will be used
        to open remote files. For file-like objects, the stream position may not be
        updated accordingly after reading.
    has_header
        Indicate if the first row of the dataset is a header or not. If set to False,
        column names will be autogenerated in the following format: `column_x`, with
        `x` being an enumeration over every column in the dataset, starting at 1.
    columns
        Columns to select. Accepts a list of column indices (starting
        at zero) or a list of column names.
    new_columns
        Rename columns right after parsing the CSV file. If the given
        list is shorter than the width of the DataFrame the remaining
        columns will have their original name.
    separator
        Single byte character to use as separator in the file.
    comment_prefix
        A string used to indicate the start of a comment line. Comment lines are skipped
        during parsing. Common examples of comment prefixes are `#` and `//`.
    quote_char
        Single byte character used for csv quoting, default = `"`.
        Set to None to turn off special handling and escaping of quotes.
    skip_rows
        Start reading after ``skip_rows`` rows. The header will be parsed at this
        offset. Note that we respect CSV escaping/comments when skipping rows.
        If you want to skip by newline char only, use `skip_lines`.
    skip_lines
        Start reading after `skip_lines` lines. The header will be parsed at this
        offset. Note that CSV escaping will not be respected when skipping lines.
        If you want to skip valid CSV rows, use ``skip_rows``.
    schema_overrides
        Overwrite dtypes during inference.
    null_values
        Values to interpret as null values. You can provide a:

        - `str`: All values equal to this string will be null.
        - `List[str]`: All values equal to any string in this list will be null.
        - `Dict[str, str]`: A dictionary that maps column name to a
          null value string.

    missing_utf8_is_empty_string
        By default a missing value is considered to be null; if you would prefer missing
        utf8 values to be treated as the empty string you can set this param True.
    ignore_errors
        Try to keep reading lines if some lines yield errors.
        First try `infer_schema_length=0` to read all columns as
        `pl.String` to check which values might cause an issue.
    try_parse_dates
        Try to automatically parse dates. Most ISO8601-like formats can
        be inferred, as well as a handful of others. If this does not succeed,
        the column remains of data type `pl.String`.
    n_threads
        Number of threads to use in csv parsing.
        Defaults to the number of physical cpu's of your system.
    infer_schema_length
        The maximum number of rows to scan for schema inference.
        If set to `0`, all columns will be read as `pl.String`.
        If set to `None`, the full data may be scanned *(this is slow)*.
    batch_size
        Number of lines to read into the buffer at once.

        Modify this to change performance.
    n_rows
        Stop reading from CSV file after reading `n_rows`.
        During multi-threaded parsing, an upper bound of `n_rows`
        rows cannot be guaranteed.
    encoding : {'utf8', 'utf8-lossy', ...}
        Lossy means that invalid utf8 values are replaced with `�`
        characters. When using other encodings than `utf8` or
        `utf8-lossy`, the input is first decoded in memory with
        python. Defaults to `utf8`.
    low_memory
        Reduce memory pressure at the expense of performance.
    rechunk
        Make sure that all columns are contiguous in memory by
        aggregating the chunks into a single array.
    skip_rows_after_header
        Skip this number of rows when the header is parsed.
    row_index_name
        Insert a row index column with the given name into the DataFrame as the first
        column. If set to `None` (default), no row index column is created.
    row_index_offset
        Start the row index at this offset. Cannot be negative.
        Only used if `row_index_name` is set.
    sample_size
        Set the sample size. This is used to sample statistics to estimate the
        allocation needed.

        .. deprecated:: 1.10.0
            Is a no-op.
    eol_char
        Single byte end of line character (default: `\n`). When encountering a file
        with windows line endings (`\r\n`), one can go with the default `\n`. The extra
        `\r` will be removed when processed.
    raise_if_empty
        When there is no data in the source, `NoDataError` is raised. If this parameter
        is set to False, `None` will be returned from `next_batches(n)` instead.
    truncate_ragged_lines
        Truncate lines that are longer than the schema.
    decimal_comma
        Parse floats using a comma as the decimal separator instead of a period.

    Returns
    -------
    BatchedCsvReader

    See Also
    --------
    scan_csv : Lazily read from a CSV file or multiple files via glob patterns.

    Examples
    --------
    >>> reader = pl.read_csv_batched(
    ...     "./pdsh/tables_scale_100/lineitem.tbl",
    ...     separator="|",
    ...     try_parse_dates=True,
    ... )  # doctest: +SKIP
    >>> batches = reader.next_batches(5)  # doctest: +SKIP
    >>> for df in batches:  # doctest: +SKIP
    ...     print(df)

    Read big CSV file in batches and write a CSV file for each "group" of interest.

    >>> seen_groups = set()
    >>> reader = pl.read_csv_batched("big_file.csv")  # doctest: +SKIP
    >>> batches = reader.next_batches(100)  # doctest: +SKIP

    >>> while batches:  # doctest: +SKIP
    ...     df_current_batches = pl.concat(batches)
    ...     partition_dfs = df_current_batches.partition_by("group", as_dict=True)
    ...
    ...     for group, df in partition_dfs.items():
    ...         if group in seen_groups:
    ...             with open(f"./data/{group}.csv", "a") as fh:
    ...                 fh.write(df.write_csv(file=None, include_header=False))
    ...         else:
    ...             df.write_csv(file=f"./data/{group}.csv", include_header=True)
    ...         seen_groups.add(group)
    ...
    ...     batches = reader.next_batches(100)
    """
    projection, columns = parse_columns_arg(columns)

    if columns and not has_header:
        for column in columns:
            if not column.startswith("column_"):
                msg = (
                    "specified column names do not start with 'column_',"
                    " but autogenerated header names were requested"
                )
                raise ValueError(msg)

    if projection and schema_overrides and isinstance(schema_overrides, list):
        if len(projection) < len(schema_overrides):
            msg = "more schema overrides are specified than there are selected columns"
            raise ValueError(msg)

        # Fix list of dtypes when used together with projection as polars CSV reader
        # wants a list of dtypes for the x first columns before it does the projection.
        dtypes_list: list[PolarsDataType] = [String] * (max(projection) + 1)

        for idx, column_idx in enumerate(projection):
            if idx < len(schema_overrides):
                dtypes_list[column_idx] = schema_overrides[idx]

        schema_overrides = dtypes_list

    if columns and schema_overrides and isinstance(schema_overrides, list):
        if len(columns) < len(schema_overrides):
            msg = "more schema overrides are specified than there are selected columns"
            raise ValueError(msg)

        # Map list of dtypes when used together with selected columns as a dtypes dict
        # so the dtypes are applied to the correct column instead of the first x
        # columns.
        schema_overrides = dict(zip(columns, schema_overrides))

    if new_columns and schema_overrides and isinstance(schema_overrides, dict):
        current_columns = None

        # As new column names are not available yet while parsing the CSV file, rename
        # column names in dtypes to old names (if possible) so they can be used during
        # CSV parsing.
        if columns:
            if len(columns) < len(new_columns):
                msg = "more new column names are specified than there are selected columns"
                raise ValueError(msg)

            # Get column names of requested columns.
            current_columns = columns[0 : len(new_columns)]
        elif not has_header:
            # When there are no header, column names are autogenerated (and known).

            if projection:
                if columns and len(columns) < len(new_columns):
                    msg = "more new column names are specified than there are selected columns"
                    raise ValueError(msg)
                # Convert column indices from projection to 'column_1', 'column_2', ...
                # column names.
                current_columns = [
                    f"column_{column_idx + 1}" for column_idx in projection
                ]
            else:
                # Generate autogenerated 'column_1', 'column_2', ... column names for
                # new column names.
                current_columns = [
                    f"column_{column_idx}"
                    for column_idx in range(1, len(new_columns) + 1)
                ]
        else:
            # When a header is present, column names are not known yet.

            if len(schema_overrides) <= len(new_columns):
                # If dtypes dictionary contains less or same amount of values than new
                # column names a list of dtypes can be created if all listed column
                # names in dtypes dictionary appear in the first consecutive new column
                # names.
                dtype_list = [
                    schema_overrides[new_column_name]
                    for new_column_name in new_columns[0 : len(schema_overrides)]
                    if new_column_name in schema_overrides
                ]

                if len(dtype_list) == len(schema_overrides):
                    schema_overrides = dtype_list

        if current_columns and isinstance(schema_overrides, dict):
            new_to_current = dict(zip(new_columns, current_columns))
            # Change new column names to current column names in dtype.
            schema_overrides = {
                new_to_current.get(column_name, column_name): column_dtype
                for column_name, column_dtype in schema_overrides.items()
            }

    return BatchedCsvReader(
        source,
        has_header=has_header,
        columns=columns if columns else projection,
        separator=separator,
        comment_prefix=comment_prefix,
        quote_char=quote_char,
        skip_rows=skip_rows,
        skip_lines=skip_lines,
        schema_overrides=schema_overrides,
        null_values=null_values,
        missing_utf8_is_empty_string=missing_utf8_is_empty_string,
        ignore_errors=ignore_errors,
        try_parse_dates=try_parse_dates,
        n_threads=n_threads,
        infer_schema_length=infer_schema_length,
        batch_size=batch_size,
        n_rows=n_rows,
        encoding=encoding if encoding == "utf8-lossy" else "utf8",
        low_memory=low_memory,
        rechunk=rechunk,
        skip_rows_after_header=skip_rows_after_header,
        row_index_name=row_index_name,
        row_index_offset=row_index_offset,
        eol_char=eol_char,
        new_columns=new_columns,
        raise_if_empty=raise_if_empty,
        truncate_ragged_lines=truncate_ragged_lines,
        decimal_comma=decimal_comma,
    )


@deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31")
@deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
@deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
def scan_csv(
    source: (
        str
        | Path
        | IO[str]
        | IO[bytes]
        | bytes
        | list[str]
        | list[Path]
        | list[IO[str]]
        | list[IO[bytes]]
        | list[bytes]
    ),
    *,
    has_header: bool = True,
    separator: str = ",",
    comment_prefix: str | None = None,
    quote_char: str | None = '"',
    skip_rows: int = 0,
    skip_lines: int = 0,
    schema: SchemaDict | None = None,
    schema_overrides: SchemaDict | Sequence[PolarsDataType] | None = None,
    null_values: str | Sequence[str] | dict[str, str] | None = None,
    missing_utf8_is_empty_string: bool = False,
    ignore_errors: bool = False,
    cache: bool = True,
    with_column_names: Callable[[list[str]], list[str]] | None = None,
    infer_schema: bool = True,
    infer_schema_length: int | None = N_INFER_DEFAULT,
    n_rows: int | None = None,
    encoding: CsvEncoding = "utf8",
    low_memory: bool = False,
    rechunk: bool = False,
    skip_rows_after_header: int = 0,
    row_index_name: str | None = None,
    row_index_offset: int = 0,
    try_parse_dates: bool = False,
    eol_char: str = "\n",
    new_columns: Sequence[str] | None = None,
    raise_if_empty: bool = True,
    truncate_ragged_lines: bool = False,
    decimal_comma: bool = False,
    glob: bool = True,
    storage_options: dict[str, Any] | None = None,
    credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
    retries: int = 2,
    file_cache_ttl: int | None = None,
    include_file_paths: str | None = None,
) -> LazyFrame:
    r"""
    Lazily read from a CSV file or multiple files via glob patterns.

    This allows the query optimizer to push down predicates and
    projections to the scan level, thereby potentially reducing
    memory overhead.

    .. versionchanged:: 0.20.31
        The `dtypes` parameter was renamed `schema_overrides`.
    .. versionchanged:: 0.20.4
        * The `row_count_name` parameter was renamed `row_index_name`.
        * The `row_count_offset` parameter was renamed `row_index_offset`.

    Parameters
    ----------
    source
        Path(s) to a file or directory
        When needing to authenticate for scanning cloud locations, see the
        `storage_options` parameter.
    has_header
        Indicate if the first row of the dataset is a header or not. If set to False,
        column names will be autogenerated in the following format: `column_x`, with
        `x` being an enumeration over every column in the dataset, starting at 1.
    separator
        Single byte character to use as separator in the file.
    comment_prefix
        A string used to indicate the start of a comment line. Comment lines are skipped
        during parsing. Common examples of comment prefixes are `#` and `//`.
    quote_char
        Single byte character used for csv quoting, default = `"`.
        Set to None to turn off special handling and escaping of quotes.
    skip_rows
        Start reading after ``skip_rows`` rows. The header will be parsed at this
        offset. Note that we respect CSV escaping/comments when skipping rows.
        If you want to skip by newline char only, use `skip_lines`.
    skip_lines
        Start reading after `skip_lines` lines. The header will be parsed at this
        offset. Note that CSV escaping will not be respected when skipping lines.
        If you want to skip valid CSV rows, use ``skip_rows``.
    schema
        Provide the schema. This means that polars doesn't do schema inference.
|
|
1146
|
-
This argument expects the complete schema, whereas `schema_overrides` can be
|
|
1147
|
-
used to partially overwrite a schema. Note that the order of the columns in
|
|
1148
|
-
the provided `schema` must match the order of the columns in the CSV being read.
|
|
1149
|
-
schema_overrides
|
|
1150
|
-
Overwrite dtypes during inference; should be a {colname:dtype,} dict or,
|
|
1151
|
-
if providing a list of strings to `new_columns`, a list of dtypes of
|
|
1152
|
-
the same length.
|
|
1153
|
-
null_values
|
|
1154
|
-
Values to interpret as null values. You can provide a:
|
|
1155
|
-
|
|
1156
|
-
- `str`: All values equal to this string will be null.
|
|
1157
|
-
- `List[str]`: All values equal to any string in this list will be null.
|
|
1158
|
-
- `Dict[str, str]`: A dictionary that maps column name to a
|
|
1159
|
-
null value string.
|
|
1160
|
-
|
|
1161
|
-
missing_utf8_is_empty_string
|
|
1162
|
-
By default a missing value is considered to be null; if you would prefer missing
|
|
1163
|
-
utf8 values to be treated as the empty string you can set this param True.
|
|
1164
|
-
ignore_errors
|
|
1165
|
-
Try to keep reading lines if some lines yield errors.
|
|
1166
|
-
First try `infer_schema=False` to read all columns as
|
|
1167
|
-
`pl.String` to check which values might cause an issue.
|
|
1168
|
-
cache
|
|
1169
|
-
Cache the result after reading.
|
|
1170
|
-
with_column_names
|
|
1171
|
-
Apply a function over the column names just in time (when they are determined);
|
|
1172
|
-
this function will receive (and should return) a list of column names.
|
|
1173
|
-
infer_schema
|
|
1174
|
-
When `True`, the schema is inferred from the data using the first
|
|
1175
|
-
`infer_schema_length` rows.
|
|
1176
|
-
When `False`, the schema is not inferred and will be `pl.String` if not
|
|
1177
|
-
specified in `schema` or `schema_overrides`.
|
|
1178
|
-
infer_schema_length
|
|
1179
|
-
The maximum number of rows to scan for schema inference.
|
|
1180
|
-
If set to `None`, the full data may be scanned *(this is slow)*.
|
|
1181
|
-
Set `infer_schema=False` to read all columns as `pl.String`.
|
|
1182
|
-
n_rows
|
|
1183
|
-
Stop reading from CSV file after reading `n_rows`.
|
|
1184
|
-
encoding : {'utf8', 'utf8-lossy'}
|
|
1185
|
-
Lossy means that invalid utf8 values are replaced with `�`
|
|
1186
|
-
characters. Defaults to "utf8".
|
|
1187
|
-
low_memory
|
|
1188
|
-
Reduce memory pressure at the expense of performance.
|
|
1189
|
-
rechunk
|
|
1190
|
-
Reallocate to contiguous memory when all chunks/ files are parsed.
|
|
1191
|
-
skip_rows_after_header
|
|
1192
|
-
Skip this number of rows when the header is parsed.
|
|
1193
|
-
row_index_name
|
|
1194
|
-
If not None, this will insert a row index column with the given name into
|
|
1195
|
-
the DataFrame.
|
|
1196
|
-
row_index_offset
|
|
1197
|
-
Offset to start the row index column (only used if the name is set).
|
|
1198
|
-
try_parse_dates
|
|
1199
|
-
Try to automatically parse dates. Most ISO8601-like formats
|
|
1200
|
-
can be inferred, as well as a handful of others. If this does not succeed,
|
|
1201
|
-
the column remains of data type `pl.String`.
|
|
1202
|
-
eol_char
|
|
1203
|
-
Single byte end of line character (default: `\n`). When encountering a file
|
|
1204
|
-
with windows line endings (`\r\n`), one can go with the default `\n`. The extra
|
|
1205
|
-
`\r` will be removed when processed.
|
|
1206
|
-
new_columns
|
|
1207
|
-
Provide an explicit list of string column names to use (for example, when
|
|
1208
|
-
scanning a headerless CSV file). If the given list is shorter than the width of
|
|
1209
|
-
the DataFrame the remaining columns will have their original name.
|
|
1210
|
-
raise_if_empty
|
|
1211
|
-
When there is no data in the source, `NoDataError` is raised. If this parameter
|
|
1212
|
-
is set to False, an empty LazyFrame (with no columns) is returned instead.
|
|
1213
|
-
truncate_ragged_lines
|
|
1214
|
-
Truncate lines that are longer than the schema.
|
|
1215
|
-
decimal_comma
|
|
1216
|
-
Parse floats using a comma as the decimal separator instead of a period.
|
|
1217
|
-
glob
|
|
1218
|
-
Expand path given via globbing rules.
|
|
1219
|
-
storage_options
|
|
1220
|
-
Options that indicate how to connect to a cloud provider.
|
|
1221
|
-
|
|
1222
|
-
The cloud providers currently supported are AWS, GCP, and Azure.
|
|
1223
|
-
See supported keys here:
|
|
1224
|
-
|
|
1225
|
-
* `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
|
|
1226
|
-
* `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
|
|
1227
|
-
* `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
|
|
1228
|
-
* Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
|
|
1229
|
-
`{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
|
|
1230
|
-
|
|
1231
|
-
If `storage_options` is not provided, Polars will try to infer the information
|
|
1232
|
-
from environment variables.
|
|
1233
|
-
credential_provider
|
|
1234
|
-
Provide a function that can be called to provide cloud storage
|
|
1235
|
-
credentials. The function is expected to return a dictionary of
|
|
1236
|
-
credential keys along with an optional credential expiry time.
|
|
1237
|
-
|
|
1238
|
-
.. warning::
|
|
1239
|
-
This functionality is considered **unstable**. It may be changed
|
|
1240
|
-
at any point without it being considered a breaking change.
|
|
1241
|
-
retries
|
|
1242
|
-
Number of retries if accessing a cloud instance fails.
|
|
1243
|
-
file_cache_ttl
|
|
1244
|
-
Amount of time to keep downloaded cloud files since their last access time,
|
|
1245
|
-
in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
|
|
1246
|
-
(which defaults to 1 hour) if not given.
|
|
1247
|
-
include_file_paths
|
|
1248
|
-
Include the path of the source file(s) as a column with this name.
|
|
1249
|
-
|
|
1250
|
-
Returns
|
|
1251
|
-
-------
|
|
1252
|
-
LazyFrame
|
|
1253
|
-
|
|
1254
|
-
See Also
|
|
1255
|
-
--------
|
|
1256
|
-
read_csv : Read a CSV file into a DataFrame.
|
|
1257
|
-
|
|
1258
|
-
Examples
|
|
1259
|
-
--------
|
|
1260
|
-
>>> import pathlib
|
|
1261
|
-
>>>
|
|
1262
|
-
>>> (
|
|
1263
|
-
... pl.scan_csv("my_long_file.csv") # lazy, doesn't do a thing
|
|
1264
|
-
... .select(
|
|
1265
|
-
... ["a", "c"]
|
|
1266
|
-
... ) # select only 2 columns (other columns will not be read)
|
|
1267
|
-
... .filter(
|
|
1268
|
-
... pl.col("a") > 10
|
|
1269
|
-
... ) # the filter is pushed down the scan, so less data is read into memory
|
|
1270
|
-
... .head(100) # constrain number of returned results to 100
|
|
1271
|
-
... ) # doctest: +SKIP
|
|
1272
|
-
|
|
1273
|
-
We can use `with_column_names` to modify the header before scanning:
|
|
1274
|
-
|
|
1275
|
-
>>> df = pl.DataFrame(
|
|
1276
|
-
... {"BrEeZaH": [1, 2, 3, 4], "LaNgUaGe": ["is", "hard", "to", "read"]}
|
|
1277
|
-
... )
|
|
1278
|
-
>>> path: pathlib.Path = dirpath / "mydf.csv"
|
|
1279
|
-
>>> df.write_csv(path)
|
|
1280
|
-
>>> pl.scan_csv(
|
|
1281
|
-
... path, with_column_names=lambda cols: [col.lower() for col in cols]
|
|
1282
|
-
... ).collect()
|
|
1283
|
-
shape: (4, 2)
|
|
1284
|
-
┌─────────┬──────────┐
|
|
1285
|
-
│ breezah ┆ language │
|
|
1286
|
-
│ --- ┆ --- │
|
|
1287
|
-
│ i64 ┆ str │
|
|
1288
|
-
╞═════════╪══════════╡
|
|
1289
|
-
│ 1 ┆ is │
|
|
1290
|
-
│ 2 ┆ hard │
|
|
1291
|
-
│ 3 ┆ to │
|
|
1292
|
-
│ 4 ┆ read │
|
|
1293
|
-
└─────────┴──────────┘
|
|
1294
|
-
|
|
1295
|
-
You can also simply replace column names (or provide them if the file has none)
|
|
1296
|
-
by passing a list of new column names to the `new_columns` parameter:
|
|
1297
|
-
|
|
1298
|
-
>>> df.write_csv(path)
|
|
1299
|
-
>>> pl.scan_csv(
|
|
1300
|
-
... path,
|
|
1301
|
-
... new_columns=["idx", "txt"],
|
|
1302
|
-
... schema_overrides=[pl.UInt16, pl.String],
|
|
1303
|
-
... ).collect()
|
|
1304
|
-
shape: (4, 2)
|
|
1305
|
-
┌─────┬──────┐
|
|
1306
|
-
│ idx ┆ txt │
|
|
1307
|
-
│ --- ┆ --- │
|
|
1308
|
-
│ u16 ┆ str │
|
|
1309
|
-
╞═════╪══════╡
|
|
1310
|
-
│ 1 ┆ is │
|
|
1311
|
-
│ 2 ┆ hard │
|
|
1312
|
-
│ 3 ┆ to │
|
|
1313
|
-
│ 4 ┆ read │
|
|
1314
|
-
└─────┴──────┘
|
|
1315
|
-
"""
|
|
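The cloud-related parameters documented above (`storage_options`, `credential_provider`) are the docstring's authentication story. A hedged sketch of a cloud scan (the bucket name and option value are illustrative; valid keys are those in the object_store pages linked above):

```python
import polars as pl

# Scan CSVs directly from S3. Option keys follow object_store's
# AmazonS3ConfigKey naming; the values here are placeholders.
lf = pl.scan_csv(
    "s3://my-bucket/data/*.csv",
    storage_options={"aws_region": "us-east-1"},
)
df = lf.head(10).collect()
```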
1316 | -     if schema_overrides is not None and not isinstance(
1317 | -         schema_overrides, (dict, Sequence)
1318 | -     ):
1319 | -         msg = "`schema_overrides` should be of type list or dict"
1320 | -         raise TypeError(msg)
1321 | -
1322 | -     if not new_columns and isinstance(schema_overrides, Sequence):
1323 | -         msg = f"expected 'schema_overrides' dict, found {qualified_type_name(schema_overrides)!r}"
1324 | -         raise TypeError(msg)
1325 | -     elif new_columns:
1326 | -         if with_column_names:
1327 | -             msg = "cannot set both `with_column_names` and `new_columns`; mutually exclusive"
1328 | -             raise ValueError(msg)
1329 | -         if schema_overrides and isinstance(schema_overrides, Sequence):
1330 | -             schema_overrides = dict(zip(new_columns, schema_overrides))
1331 | -
1332 | -         # wrap new column names as a callable
1333 | -         def with_column_names(cols: list[str]) -> list[str]:
1334 | -             if len(cols) > len(new_columns):
1335 | -                 return new_columns + cols[len(new_columns) :]  # type: ignore[operator]
1336 | -             else:
1337 | -                 return new_columns  # type: ignore[return-value]
1338 | -
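The `dict(zip(...))` step just above is what lets a positional list of dtypes stand in for a `{name: dtype}` mapping when `new_columns` is supplied. Illustrated standalone (a sketch, not part of the diff):

```python
import polars as pl

new_columns = ["idx", "txt"]
schema_overrides = [pl.UInt16, pl.String]

# Positional pairing: the first dtype applies to the first new column, etc.
assert dict(zip(new_columns, schema_overrides)) == {
    "idx": pl.UInt16,
    "txt": pl.String,
}
```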
1339 | -     _check_arg_is_1byte("separator", separator, can_be_empty=False)
1340 | -     _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
1341 | -
1342 | -     if isinstance(source, (str, Path)):
1343 | -         source = normalize_filepath(source, check_not_directory=False)
1344 | -     elif is_path_or_str_sequence(source, allow_str=False):
1345 | -         source = [
1346 | -             normalize_filepath(source, check_not_directory=False) for source in source
1347 | -         ]
1348 | -
1349 | -     if not infer_schema:
1350 | -         infer_schema_length = 0
1351 | -
1352 | -     credential_provider_builder = _init_credential_provider_builder(
1353 | -         credential_provider, source, storage_options, "scan_csv"
1354 | -     )
1355 | -     del credential_provider
1356 | -
1357 | -     return _scan_csv_impl(
1358 | -         source,
1359 | -         has_header=has_header,
1360 | -         separator=separator,
1361 | -         comment_prefix=comment_prefix,
1362 | -         quote_char=quote_char,
1363 | -         skip_rows=skip_rows,
1364 | -         skip_lines=skip_lines,
1365 | -         schema_overrides=schema_overrides,  # type: ignore[arg-type]
1366 | -         schema=schema,
1367 | -         null_values=null_values,
1368 | -         missing_utf8_is_empty_string=missing_utf8_is_empty_string,
1369 | -         ignore_errors=ignore_errors,
1370 | -         cache=cache,
1371 | -         with_column_names=with_column_names,
1372 | -         infer_schema_length=infer_schema_length,
1373 | -         n_rows=n_rows,
1374 | -         low_memory=low_memory,
1375 | -         rechunk=rechunk,
1376 | -         skip_rows_after_header=skip_rows_after_header,
1377 | -         encoding=encoding,
1378 | -         row_index_name=row_index_name,
1379 | -         row_index_offset=row_index_offset,
1380 | -         try_parse_dates=try_parse_dates,
1381 | -         eol_char=eol_char,
1382 | -         raise_if_empty=raise_if_empty,
1383 | -         truncate_ragged_lines=truncate_ragged_lines,
1384 | -         decimal_comma=decimal_comma,
1385 | -         glob=glob,
1386 | -         retries=retries,
1387 | -         storage_options=storage_options,
1388 | -         credential_provider=credential_provider_builder,
1389 | -         file_cache_ttl=file_cache_ttl,
1390 | -         include_file_paths=include_file_paths,
1391 | -     )
1392 | -
1393 | -
|
-
def _scan_csv_impl(
|
|
1395
|
-
source: str
|
|
1396
|
-
| IO[str]
|
|
1397
|
-
| IO[bytes]
|
|
1398
|
-
| bytes
|
|
1399
|
-
| list[str]
|
|
1400
|
-
| list[Path]
|
|
1401
|
-
| list[IO[str]]
|
|
1402
|
-
| list[IO[bytes]]
|
|
1403
|
-
| list[bytes],
|
|
1404
|
-
*,
|
|
1405
|
-
has_header: bool = True,
|
|
1406
|
-
separator: str = ",",
|
|
1407
|
-
comment_prefix: str | None = None,
|
|
1408
|
-
quote_char: str | None = '"',
|
|
1409
|
-
skip_rows: int = 0,
|
|
1410
|
-
skip_lines: int = 0,
|
|
1411
|
-
schema: SchemaDict | None = None,
|
|
1412
|
-
schema_overrides: SchemaDict | None = None,
|
|
1413
|
-
null_values: str | Sequence[str] | dict[str, str] | None = None,
|
|
1414
|
-
missing_utf8_is_empty_string: bool = False,
|
|
1415
|
-
ignore_errors: bool = False,
|
|
1416
|
-
cache: bool = True,
|
|
1417
|
-
with_column_names: Callable[[list[str]], list[str]] | None = None,
|
|
1418
|
-
infer_schema_length: int | None = N_INFER_DEFAULT,
|
|
1419
|
-
n_rows: int | None = None,
|
|
1420
|
-
encoding: CsvEncoding = "utf8",
|
|
1421
|
-
low_memory: bool = False,
|
|
1422
|
-
rechunk: bool = False,
|
|
1423
|
-
skip_rows_after_header: int = 0,
|
|
1424
|
-
row_index_name: str | None = None,
|
|
1425
|
-
row_index_offset: int = 0,
|
|
1426
|
-
try_parse_dates: bool = False,
|
|
1427
|
-
eol_char: str = "\n",
|
|
1428
|
-
raise_if_empty: bool = True,
|
|
1429
|
-
truncate_ragged_lines: bool = True,
|
|
1430
|
-
decimal_comma: bool = False,
|
|
1431
|
-
glob: bool = True,
|
|
1432
|
-
storage_options: dict[str, Any] | None = None,
|
|
1433
|
-
credential_provider: CredentialProviderBuilder | None = None,
|
|
1434
|
-
retries: int = 2,
|
|
1435
|
-
file_cache_ttl: int | None = None,
|
|
1436
|
-
include_file_paths: str | None = None,
|
|
1437
|
-
) -> LazyFrame:
|
|
1438
|
-
dtype_list: list[tuple[str, PolarsDataType]] | None = None
|
|
1439
|
-
if schema_overrides is not None:
|
|
1440
|
-
if not isinstance(schema_overrides, dict):
|
|
1441
|
-
msg = "expected 'schema_overrides' dict, found 'list'"
|
|
1442
|
-
raise TypeError(msg)
|
|
1443
|
-
dtype_list = []
|
|
1444
|
-
for k, v in schema_overrides.items():
|
|
1445
|
-
dtype_list.append((k, parse_into_dtype(v)))
|
|
1446
|
-
processed_null_values = _process_null_values(null_values)
|
|
1447
|
-
|
|
1448
|
-
if isinstance(source, list):
|
|
1449
|
-
sources = source
|
|
1450
|
-
source = None # type: ignore[assignment]
|
|
1451
|
-
else:
|
|
1452
|
-
sources = []
|
|
1453
|
-
|
|
1454
|
-
if storage_options:
|
|
1455
|
-
storage_options = list(storage_options.items()) # type: ignore[assignment]
|
|
1456
|
-
else:
|
|
1457
|
-
# Handle empty dict input
|
|
1458
|
-
storage_options = None
|
|
1459
|
-
|
|
1460
|
-
pylf = PyLazyFrame.new_from_csv(
|
|
1461
|
-
source,
|
|
1462
|
-
sources,
|
|
1463
|
-
separator=separator,
|
|
1464
|
-
has_header=has_header,
|
|
1465
|
-
ignore_errors=ignore_errors,
|
|
1466
|
-
skip_rows=skip_rows,
|
|
1467
|
-
skip_lines=skip_lines,
|
|
1468
|
-
n_rows=n_rows,
|
|
1469
|
-
cache=cache,
|
|
1470
|
-
overwrite_dtype=dtype_list,
|
|
1471
|
-
low_memory=low_memory,
|
|
1472
|
-
comment_prefix=comment_prefix,
|
|
1473
|
-
quote_char=quote_char,
|
|
1474
|
-
null_values=processed_null_values,
|
|
1475
|
-
missing_utf8_is_empty_string=missing_utf8_is_empty_string,
|
|
1476
|
-
infer_schema_length=infer_schema_length,
|
|
1477
|
-
with_schema_modify=with_column_names,
|
|
1478
|
-
rechunk=rechunk,
|
|
1479
|
-
skip_rows_after_header=skip_rows_after_header,
|
|
1480
|
-
encoding=encoding,
|
|
1481
|
-
row_index=parse_row_index_args(row_index_name, row_index_offset),
|
|
1482
|
-
try_parse_dates=try_parse_dates,
|
|
1483
|
-
eol_char=eol_char,
|
|
1484
|
-
raise_if_empty=raise_if_empty,
|
|
1485
|
-
truncate_ragged_lines=truncate_ragged_lines,
|
|
1486
|
-
decimal_comma=decimal_comma,
|
|
1487
|
-
glob=glob,
|
|
1488
|
-
schema=schema,
|
|
1489
|
-
cloud_options=storage_options,
|
|
1490
|
-
credential_provider=credential_provider,
|
|
1491
|
-
retries=retries,
|
|
1492
|
-
file_cache_ttl=file_cache_ttl,
|
|
1493
|
-
include_file_paths=include_file_paths,
|
|
1494
|
-
)
|
|
1495
|
-
return wrap_ldf(pylf)
|
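One behavior worth noting from the deleted code: `scan_csv` translates `infer_schema=False` into `infer_schema_length=0` before delegating to `_scan_csv_impl`, which is why disabling inference reads every column not covered by `schema`/`schema_overrides` as `pl.String`. A minimal sketch (the file name is illustrative):

```python
import polars as pl

# With inference disabled, unspecified columns come back as pl.String.
lf = pl.scan_csv("data.csv", infer_schema=False)
print(lf.collect_schema())  # every dtype is String unless overridden
```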