polars-runtime-compat 1.34.0b2__cp39-abi3-win_amd64.whl → 1.34.0b4__cp39-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (203)
  1. _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
  2. {polars_runtime_compat-1.34.0b2.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
  3. polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -96
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b2.dist-info/RECORD +0 -203
  202. {polars_runtime_compat-1.34.0b2.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
  203. {polars_runtime_compat-1.34.0b2.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
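The listing above shows that, apart from the compiled `_polars_runtime_compat.pyd` extension and the wheel metadata, every pure-Python `polars/` module shipped in 1.34.0b2 is removed in 1.34.0b4. This can be confirmed locally by listing the members of both wheels, since a wheel is a ZIP archive. The sketch below uses only the standard library; the two filenames are assumptions for wherever the wheels were downloaded, not paths taken from this report.

import zipfile

OLD_WHEEL = "polars_runtime_compat-1.34.0b2-cp39-abi3-win_amd64.whl"  # assumed local path
NEW_WHEEL = "polars_runtime_compat-1.34.0b4-cp39-abi3-win_amd64.whl"  # assumed local path

def members(path: str) -> set:
    # A wheel is a ZIP archive; namelist() returns every stored file name.
    with zipfile.ZipFile(path) as zf:
        return set(zf.namelist())

old, new = members(OLD_WHEEL), members(NEW_WHEEL)
print(f"removed: {len(old - new)}, added: {len(new - old)}")
for name in sorted(old - new):
    print("-", name)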
polars/io/csv/functions.py
@@ -1,1495 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import contextlib
4
- import os
5
- from collections.abc import Sequence
6
- from io import BytesIO, StringIO
7
- from pathlib import Path
8
- from typing import IO, TYPE_CHECKING, Any, Callable, Literal
9
-
10
- import polars._reexport as pl
11
- import polars.functions as F
12
- from polars._utils.deprecation import deprecate_renamed_parameter
13
- from polars._utils.various import (
14
- _process_null_values,
15
- is_path_or_str_sequence,
16
- is_str_sequence,
17
- normalize_filepath,
18
- qualified_type_name,
19
- )
20
- from polars._utils.wrap import wrap_df, wrap_ldf
21
- from polars.datatypes import N_INFER_DEFAULT, String, parse_into_dtype
22
- from polars.io._utils import (
23
- is_glob_pattern,
24
- parse_columns_arg,
25
- parse_row_index_args,
26
- prepare_file_arg,
27
- )
28
- from polars.io.cloud.credential_provider._builder import (
29
- _init_credential_provider_builder,
30
- )
31
- from polars.io.csv._utils import _check_arg_is_1byte, _update_columns
32
- from polars.io.csv.batched_reader import BatchedCsvReader
33
-
34
- with contextlib.suppress(ImportError): # Module not available when building docs
35
- from polars._plr import PyDataFrame, PyLazyFrame
36
-
37
- if TYPE_CHECKING:
38
- from collections.abc import Mapping
39
-
40
- from polars import DataFrame, LazyFrame
41
- from polars._typing import CsvEncoding, PolarsDataType, SchemaDict
42
- from polars.io.cloud import CredentialProviderFunction
43
- from polars.io.cloud.credential_provider._builder import CredentialProviderBuilder
44
-
45
-
46
- @deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31")
47
- @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
48
- @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
49
- def read_csv(
50
- source: str | Path | IO[str] | IO[bytes] | bytes,
51
- *,
52
- has_header: bool = True,
53
- columns: Sequence[int] | Sequence[str] | None = None,
54
- new_columns: Sequence[str] | None = None,
55
- separator: str = ",",
56
- comment_prefix: str | None = None,
57
- quote_char: str | None = '"',
58
- skip_rows: int = 0,
59
- skip_lines: int = 0,
60
- schema: SchemaDict | None = None,
61
- schema_overrides: (
62
- Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None
63
- ) = None,
64
- null_values: str | Sequence[str] | dict[str, str] | None = None,
65
- missing_utf8_is_empty_string: bool = False,
66
- ignore_errors: bool = False,
67
- try_parse_dates: bool = False,
68
- n_threads: int | None = None,
69
- infer_schema: bool = True,
70
- infer_schema_length: int | None = N_INFER_DEFAULT,
71
- batch_size: int = 8192,
72
- n_rows: int | None = None,
73
- encoding: CsvEncoding | str = "utf8",
74
- low_memory: bool = False,
75
- rechunk: bool = False,
76
- use_pyarrow: bool = False,
77
- storage_options: dict[str, Any] | None = None,
78
- skip_rows_after_header: int = 0,
79
- row_index_name: str | None = None,
80
- row_index_offset: int = 0,
81
- sample_size: int = 1024,
82
- eol_char: str = "\n",
83
- raise_if_empty: bool = True,
84
- truncate_ragged_lines: bool = False,
85
- decimal_comma: bool = False,
86
- glob: bool = True,
87
- ) -> DataFrame:
88
- r"""
89
- Read a CSV file into a DataFrame.
90
-
91
- Polars expects CSV data to strictly conform to RFC 4180, unless documented
92
- otherwise. Malformed data, though common, may lead to undefined behavior.
93
-
94
- .. versionchanged:: 0.20.31
95
- The `dtypes` parameter was renamed `schema_overrides`.
96
- .. versionchanged:: 0.20.4
97
- * The `row_count_name` parameter was renamed `row_index_name`.
98
- * The `row_count_offset` parameter was renamed `row_index_offset`.
99
-
100
- Parameters
101
- ----------
102
- source
103
- Path to a file or a file-like object (by "file-like object" we refer to objects
104
- that have a `read()` method, such as a file handler like the builtin `open`
105
- function, or a `BytesIO` instance). If `fsspec` is installed, it will be used
106
- to open remote files. For file-like objects, the stream position may not be
107
- updated accordingly after reading.
108
- has_header
109
- Indicate if the first row of the dataset is a header or not. If set to False,
110
- column names will be autogenerated in the following format: `column_x`, with
111
- `x` being an enumeration over every column in the dataset, starting at 1.
112
- columns
113
- Columns to select. Accepts a list of column indices (starting
114
- at zero) or a list of column names.
115
- new_columns
116
- Rename columns right after parsing the CSV file. If the given
117
- list is shorter than the width of the DataFrame the remaining
118
- columns will have their original name.
119
- separator
120
- Single byte character to use as separator in the file.
121
- comment_prefix
122
- A string used to indicate the start of a comment line. Comment lines are skipped
123
- during parsing. Common examples of comment prefixes are `#` and `//`.
124
- quote_char
125
- Single byte character used for csv quoting, default = `"`.
126
- Set to None to turn off special handling and escaping of quotes.
127
- skip_rows
128
- Start reading after ``skip_rows`` rows. The header will be parsed at this
129
- offset. Note that we respect CSV escaping/comments when skipping rows.
130
- If you want to skip by newline char only, use `skip_lines`.
131
- skip_lines
132
- Start reading after `skip_lines` lines. The header will be parsed at this
133
- offset. Note that CSV escaping will not be respected when skipping lines.
134
- If you want to skip valid CSV rows, use ``skip_rows``.
135
- schema
136
- Provide the schema. This means that polars doesn't do schema inference.
137
- This argument expects the complete schema, whereas `schema_overrides` can be
138
- used to partially overwrite a schema. Note that the order of the columns in
139
- the provided `schema` must match the order of the columns in the CSV being read.
140
- schema_overrides
141
- Overwrite dtypes for specific or all columns during schema inference.
142
- null_values
143
- Values to interpret as null values. You can provide a:
144
-
145
- - `str`: All values equal to this string will be null.
146
- - `List[str]`: All values equal to any string in this list will be null.
147
- - `Dict[str, str]`: A dictionary that maps column name to a
148
- null value string.
149
-
150
- missing_utf8_is_empty_string
151
- By default a missing value is considered to be null; if you would prefer missing
152
- utf8 values to be treated as the empty string you can set this param True.
153
- ignore_errors
154
- Try to keep reading lines if some lines yield errors.
155
- Before using this option, try to increase the number of lines used for schema
156
- inference with e.g `infer_schema_length=10000` or override automatic dtype
157
- inference for specific columns with the `schema_overrides` option or use
158
- `infer_schema=False` to read all columns as `pl.String` to check which
159
- values might cause an issue.
160
- try_parse_dates
161
- Try to automatically parse dates. Most ISO8601-like formats can
162
- be inferred, as well as a handful of others. If this does not succeed,
163
- the column remains of data type `pl.String`.
164
- If `use_pyarrow=True`, dates will always be parsed.
165
- n_threads
166
- Number of threads to use in csv parsing.
167
- Defaults to the number of physical cpu's of your system.
168
- infer_schema
169
- When `True`, the schema is inferred from the data using the first
170
- `infer_schema_length` rows.
171
- When `False`, the schema is not inferred and will be `pl.String` if not
172
- specified in `schema` or `schema_overrides`.
173
- infer_schema_length
174
- The maximum number of rows to scan for schema inference.
175
- If set to `None`, the full data may be scanned *(this is slow)*.
176
- Set `infer_schema=False` to read all columns as `pl.String`.
177
- batch_size
178
- Number of lines to read into the buffer at once.
179
- Modify this to change performance.
180
- n_rows
181
- Stop reading from CSV file after reading `n_rows`.
182
- During multi-threaded parsing, an upper bound of `n_rows`
183
- rows cannot be guaranteed.
184
- encoding : {'utf8', 'utf8-lossy', 'windows-1252', 'windows-1252-lossy', ...}
185
- Lossy means that invalid utf8 values are replaced with `�`
186
- characters. When using other encodings than `utf8` or
187
- `utf8-lossy`, the input is first decoded in memory with
188
- python. Defaults to `utf8`.
189
- low_memory
190
- Reduce memory pressure at the expense of performance.
191
- rechunk
192
- Make sure that all columns are contiguous in memory by
193
- aggregating the chunks into a single array.
194
- use_pyarrow
195
- Try to use pyarrow's native CSV parser. This will always
196
- parse dates, even if `try_parse_dates=False`.
197
- This is not always possible. The set of arguments given to
198
- this function determines if it is possible to use pyarrow's
199
- native parser. Note that pyarrow and polars may have a
200
- different strategy regarding type inference.
201
- storage_options
202
- Extra options that make sense for `fsspec.open()` or a
203
- particular storage connection.
204
- e.g. host, port, username, password, etc.
205
- skip_rows_after_header
206
- Skip this number of rows when the header is parsed.
207
- row_index_name
208
- Insert a row index column with the given name into the DataFrame as the first
209
- column. If set to `None` (default), no row index column is created.
210
- row_index_offset
211
- Start the row index at this offset. Cannot be negative.
212
- Only used if `row_index_name` is set.
213
- sample_size
214
- Set the sample size. This is used to sample statistics to estimate the
215
- allocation needed.
216
-
217
- .. deprecated:: 1.10.0
218
- This parameter is now a no-op.
219
- eol_char
220
- Single byte end of line character (default: `\n`). When encountering a file
221
- with windows line endings (`\r\n`), one can go with the default `\n`. The extra
222
- `\r` will be removed when processed.
223
- raise_if_empty
224
- When there is no data in the source, `NoDataError` is raised. If this parameter
225
- is set to False, an empty DataFrame (with no columns) is returned instead.
226
- truncate_ragged_lines
227
- Truncate lines that are longer than the schema.
228
- decimal_comma
229
- Parse floats using a comma as the decimal separator instead of a period.
230
- glob
231
- Expand path given via globbing rules.
232
-
233
- Returns
234
- -------
235
- DataFrame
236
-
237
- See Also
238
- --------
239
- scan_csv : Lazily read from a CSV file or multiple files via glob patterns.
240
-
241
- Warnings
242
- --------
243
- Calling `read_csv().lazy()` is an antipattern as this forces Polars to materialize
244
- a full csv file and therefore cannot push any optimizations into the reader.
245
- Therefore always prefer `scan_csv` if you want to work with `LazyFrame` s.
246
-
247
- Notes
248
- -----
249
- If the schema is inferred incorrectly (e.g. as `pl.Int64` instead of `pl.Float64`),
250
- try to increase the number of lines used to infer the schema with
251
- `infer_schema_length` or override the inferred dtype for those columns with
252
- `schema_overrides`.
253
-
254
- Examples
255
- --------
256
- >>> pl.read_csv("data.csv", separator="|") # doctest: +SKIP
257
-
258
- Demonstrate use against a BytesIO object, parsing string dates.
259
-
260
- >>> from io import BytesIO
261
- >>> data = BytesIO(
262
- ... b"ID,Name,Birthday\n"
263
- ... b"1,Alice,1995-07-12\n"
264
- ... b"2,Bob,1990-09-20\n"
265
- ... b"3,Charlie,2002-03-08\n"
266
- ... )
267
- >>> pl.read_csv(data, try_parse_dates=True)
268
- shape: (3, 3)
269
- ┌─────┬─────────┬────────────┐
270
- │ ID ┆ Name ┆ Birthday │
271
- │ --- ┆ --- ┆ --- │
272
- │ i64 ┆ str ┆ date │
273
- ╞═════╪═════════╪════════════╡
274
- │ 1 ┆ Alice ┆ 1995-07-12 │
275
- │ 2 ┆ Bob ┆ 1990-09-20 │
276
- │ 3 ┆ Charlie ┆ 2002-03-08 │
277
- └─────┴─────────┴────────────┘
278
- """
279
- _check_arg_is_1byte("separator", separator, can_be_empty=False)
280
- _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
281
- _check_arg_is_1byte("eol_char", eol_char, can_be_empty=False)
282
-
283
- projection, columns = parse_columns_arg(columns)
284
- storage_options = storage_options or {}
285
-
286
- if columns and not has_header:
287
- for column in columns:
288
- if not column.startswith("column_"):
289
- msg = (
290
- "specified column names do not start with 'column_',"
291
- " but autogenerated header names were requested"
292
- )
293
- raise ValueError(msg)
294
-
295
- if schema_overrides is not None and not isinstance(
296
- schema_overrides, (dict, Sequence)
297
- ):
298
- msg = "`schema_overrides` should be of type list or dict"
299
- raise TypeError(msg)
300
-
301
- if (
302
- use_pyarrow
303
- and schema_overrides is None
304
- and n_rows is None
305
- and n_threads is None
306
- and not low_memory
307
- and null_values is None
308
- ):
309
- include_columns: Sequence[str] | None = None
310
- if columns:
311
- if not has_header:
312
- # Convert 'column_1', 'column_2', ... column names to 'f0', 'f1', ...
313
- # column names for pyarrow, if CSV file does not contain a header.
314
- include_columns = [f"f{int(column[7:]) - 1}" for column in columns]
315
- else:
316
- include_columns = columns
317
-
318
- if not columns and projection:
319
- # Convert column indices from projection to 'f0', 'f1', ... column names
320
- # for pyarrow.
321
- include_columns = [f"f{column_idx}" for column_idx in projection]
322
-
323
- with prepare_file_arg(
324
- source,
325
- encoding=None,
326
- use_pyarrow=True,
327
- raise_if_empty=raise_if_empty,
328
- storage_options=storage_options,
329
- ) as data:
330
- import pyarrow as pa
331
- import pyarrow.csv
332
-
333
- try:
334
- tbl = pa.csv.read_csv(
335
- data,
336
- pa.csv.ReadOptions(
337
- skip_rows=skip_rows,
338
- skip_rows_after_names=skip_rows_after_header,
339
- autogenerate_column_names=not has_header,
340
- encoding=encoding,
341
- ),
342
- pa.csv.ParseOptions(
343
- delimiter=separator,
344
- quote_char=quote_char if quote_char else False,
345
- double_quote=quote_char is not None and quote_char == '"',
346
- ),
347
- pa.csv.ConvertOptions(
348
- column_types=None,
349
- include_columns=include_columns,
350
- include_missing_columns=ignore_errors,
351
- ),
352
- )
353
- except pa.ArrowInvalid as err:
354
- if raise_if_empty or "Empty CSV" not in str(err):
355
- raise
356
- return pl.DataFrame()
357
-
358
- if not has_header:
359
- # Rename 'f0', 'f1', ... columns names autogenerated by pyarrow
360
- # to 'column_1', 'column_2', ...
361
- tbl = tbl.rename_columns(
362
- [f"column_{int(column[1:]) + 1}" for column in tbl.column_names]
363
- )
364
-
365
- df = pl.DataFrame._from_arrow(tbl, rechunk=rechunk)
366
- if new_columns:
367
- return _update_columns(df, new_columns)
368
- return df
369
-
370
- if projection and schema_overrides and isinstance(schema_overrides, list):
371
- if len(projection) < len(schema_overrides):
372
- msg = "more schema overrides are specified than there are selected columns"
373
- raise ValueError(msg)
374
-
375
- # Fix list of dtypes when used together with projection as polars CSV reader
376
- # wants a list of dtypes for the x first columns before it does the projection.
377
- dtypes_list: list[PolarsDataType] = [String] * (max(projection) + 1)
378
-
379
- for idx, column_idx in enumerate(projection):
380
- if idx < len(schema_overrides):
381
- dtypes_list[column_idx] = schema_overrides[idx]
382
-
383
- schema_overrides = dtypes_list
384
-
385
- if columns and schema_overrides and isinstance(schema_overrides, list):
386
- if len(columns) < len(schema_overrides):
387
- msg = "more dtypes overrides are specified than there are selected columns"
388
- raise ValueError(msg)
389
-
390
- # Map list of dtypes when used together with selected columns as a dtypes dict
391
- # so the dtypes are applied to the correct column instead of the first x
392
- # columns.
393
- schema_overrides = dict(zip(columns, schema_overrides))
394
-
395
- if new_columns and schema_overrides and isinstance(schema_overrides, dict):
396
- current_columns = None
397
-
398
- # As new column names are not available yet while parsing the CSV file, rename
399
- # column names in dtypes to old names (if possible) so they can be used during
400
- # CSV parsing.
401
- if columns:
402
- if len(columns) < len(new_columns):
403
- msg = (
404
- "more new column names are specified than there are selected"
405
- " columns"
406
- )
407
- raise ValueError(msg)
408
-
409
- # Get column names of requested columns.
410
- current_columns = columns[0 : len(new_columns)]
411
- elif not has_header:
412
- # When there are no header, column names are autogenerated (and known).
413
-
414
- if projection:
415
- if columns and len(columns) < len(new_columns):
416
- msg = (
417
- "more new column names are specified than there are selected"
418
- " columns"
419
- )
420
- raise ValueError(msg)
421
- # Convert column indices from projection to 'column_1', 'column_2', ...
422
- # column names.
423
- current_columns = [
424
- f"column_{column_idx + 1}" for column_idx in projection
425
- ]
426
- else:
427
- # Generate autogenerated 'column_1', 'column_2', ... column names for
428
- # new column names.
429
- current_columns = [
430
- f"column_{column_idx}"
431
- for column_idx in range(1, len(new_columns) + 1)
432
- ]
433
- else:
434
- # When a header is present, column names are not known yet.
435
-
436
- if len(schema_overrides) <= len(new_columns):
437
- # If dtypes dictionary contains less or same amount of values than new
438
- # column names a list of dtypes can be created if all listed column
439
- # names in dtypes dictionary appear in the first consecutive new column
440
- # names.
441
- dtype_list = [
442
- schema_overrides[new_column_name]
443
- for new_column_name in new_columns[0 : len(schema_overrides)]
444
- if new_column_name in schema_overrides
445
- ]
446
-
447
- if len(dtype_list) == len(schema_overrides):
448
- schema_overrides = dtype_list
449
-
450
- if current_columns and isinstance(schema_overrides, dict):
451
- new_to_current = dict(zip(new_columns, current_columns))
452
- # Change new column names to current column names in dtype.
453
- schema_overrides = {
454
- new_to_current.get(column_name, column_name): column_dtype
455
- for column_name, column_dtype in schema_overrides.items()
456
- }
457
-
458
- if not infer_schema:
459
- infer_schema_length = 0
460
-
461
- # TODO: scan_csv doesn't support a "dtype slice" (i.e. list[DataType])
462
- schema_overrides_is_list = isinstance(schema_overrides, Sequence)
463
- encoding_supported_in_lazy = encoding in {"utf8", "utf8-lossy"}
464
-
465
- new_streaming = (
466
- os.getenv("POLARS_FORCE_NEW_STREAMING") == "1"
467
- or os.getenv("POLARS_AUTO_NEW_STREAMING") == "1"
468
- )
469
-
470
- if new_streaming or (
471
- # Check that it is not a BytesIO object
472
- isinstance(v := source, (str, Path))
473
- and (
474
- # HuggingFace only for now ⊂( ◜◒◝ )⊃
475
- str(v).startswith("hf://")
476
- # Also dispatch on FORCE_ASYNC, so that this codepath gets run
477
- # through by our test suite during CI.
478
- or (
479
- os.getenv("POLARS_FORCE_ASYNC") == "1"
480
- and not schema_overrides_is_list
481
- and encoding_supported_in_lazy
482
- )
483
- # TODO: We can't dispatch this for all paths due to a few reasons:
484
- # * `scan_csv` does not support compressed files
485
- # * The `storage_options` configuration keys are different between
486
- # fsspec and object_store (would require a breaking change)
487
- )
488
- ):
489
- if isinstance(source, (str, Path)):
490
- source = normalize_filepath(source, check_not_directory=False)
491
- elif is_path_or_str_sequence(source, allow_str=False):
492
- source = [ # type: ignore[assignment]
493
- normalize_filepath(source, check_not_directory=False)
494
- for source in source
495
- ]
496
-
497
- if not new_streaming:
498
- if schema_overrides_is_list:
499
- msg = "passing a list to `schema_overrides` is unsupported for hf:// paths"
500
- raise ValueError(msg)
501
- if not encoding_supported_in_lazy:
502
- msg = f"unsupported encoding {encoding} for hf:// paths"
503
- raise ValueError(msg)
504
-
505
- lf = _scan_csv_impl(
506
- source,
507
- has_header=has_header,
508
- separator=separator,
509
- comment_prefix=comment_prefix,
510
- quote_char=quote_char,
511
- skip_rows=skip_rows,
512
- skip_lines=skip_lines,
513
- schema_overrides=schema_overrides, # type: ignore[arg-type]
514
- schema=schema,
515
- null_values=null_values,
516
- missing_utf8_is_empty_string=missing_utf8_is_empty_string,
517
- ignore_errors=ignore_errors,
518
- try_parse_dates=try_parse_dates,
519
- infer_schema_length=infer_schema_length,
520
- n_rows=n_rows,
521
- encoding=encoding, # type: ignore[arg-type]
522
- low_memory=low_memory,
523
- rechunk=rechunk,
524
- skip_rows_after_header=skip_rows_after_header,
525
- row_index_name=row_index_name,
526
- row_index_offset=row_index_offset,
527
- eol_char=eol_char,
528
- raise_if_empty=raise_if_empty,
529
- truncate_ragged_lines=truncate_ragged_lines,
530
- decimal_comma=decimal_comma,
531
- glob=glob,
532
- )
533
-
534
- if columns:
535
- lf = lf.select(columns)
536
- elif projection:
537
- lf = lf.select(F.nth(projection))
538
-
539
- df = lf.collect()
540
-
541
- else:
542
- with prepare_file_arg(
543
- source,
544
- encoding=encoding,
545
- use_pyarrow=False,
546
- raise_if_empty=raise_if_empty,
547
- storage_options=storage_options,
548
- ) as data:
549
- df = _read_csv_impl(
550
- data,
551
- has_header=has_header,
552
- columns=columns if columns else projection,
553
- separator=separator,
554
- comment_prefix=comment_prefix,
555
- quote_char=quote_char,
556
- skip_rows=skip_rows,
557
- skip_lines=skip_lines,
558
- schema_overrides=schema_overrides,
559
- schema=schema,
560
- null_values=null_values,
561
- missing_utf8_is_empty_string=missing_utf8_is_empty_string,
562
- ignore_errors=ignore_errors,
563
- try_parse_dates=try_parse_dates,
564
- n_threads=n_threads,
565
- infer_schema_length=infer_schema_length,
566
- batch_size=batch_size,
567
- n_rows=n_rows,
568
- encoding=encoding if encoding == "utf8-lossy" else "utf8",
569
- low_memory=low_memory,
570
- rechunk=rechunk,
571
- skip_rows_after_header=skip_rows_after_header,
572
- row_index_name=row_index_name,
573
- row_index_offset=row_index_offset,
574
- eol_char=eol_char,
575
- raise_if_empty=raise_if_empty,
576
- truncate_ragged_lines=truncate_ragged_lines,
577
- decimal_comma=decimal_comma,
578
- glob=glob,
579
- )
580
-
581
- if new_columns:
582
- return _update_columns(df, new_columns)
583
- return df
584
-
585
-
586
- def _read_csv_impl(
587
- source: str | Path | IO[bytes] | bytes,
588
- *,
589
- has_header: bool = True,
590
- columns: Sequence[int] | Sequence[str] | None = None,
591
- separator: str = ",",
592
- comment_prefix: str | None = None,
593
- quote_char: str | None = '"',
594
- skip_rows: int = 0,
595
- skip_lines: int = 0,
596
- schema: None | SchemaDict = None,
597
- schema_overrides: None | (SchemaDict | Sequence[PolarsDataType]) = None,
598
- null_values: str | Sequence[str] | dict[str, str] | None = None,
599
- missing_utf8_is_empty_string: bool = False,
600
- ignore_errors: bool = False,
601
- try_parse_dates: bool = False,
602
- n_threads: int | None = None,
603
- infer_schema_length: int | None = N_INFER_DEFAULT,
604
- batch_size: int = 8192,
605
- n_rows: int | None = None,
606
- encoding: CsvEncoding = "utf8",
607
- low_memory: bool = False,
608
- rechunk: bool = False,
609
- skip_rows_after_header: int = 0,
610
- row_index_name: str | None = None,
611
- row_index_offset: int = 0,
612
- sample_size: int = 1024,
613
- eol_char: str = "\n",
614
- raise_if_empty: bool = True,
615
- truncate_ragged_lines: bool = False,
616
- decimal_comma: bool = False,
617
- glob: bool = True,
618
- ) -> DataFrame:
619
- path: str | None
620
- if isinstance(source, (str, Path)):
621
- path = normalize_filepath(source, check_not_directory=False)
622
- else:
623
- path = None
624
- if isinstance(source, BytesIO):
625
- source = source.getvalue()
626
- if isinstance(source, StringIO):
627
- source = source.getvalue().encode()
628
-
629
- dtype_list: Sequence[tuple[str, PolarsDataType]] | None = None
630
- dtype_slice: Sequence[PolarsDataType] | None = None
631
- if schema_overrides is not None:
632
- if isinstance(schema_overrides, dict):
633
- dtype_list = []
634
- for k, v in schema_overrides.items():
635
- dtype_list.append((k, parse_into_dtype(v)))
636
- elif isinstance(schema_overrides, Sequence):
637
- dtype_slice = schema_overrides
638
- else:
639
- msg = f"`schema_overrides` should be of type list or dict, got {qualified_type_name(schema_overrides)!r}"
640
- raise TypeError(msg)
641
-
642
- processed_null_values = _process_null_values(null_values)
643
-
644
- if isinstance(columns, str):
645
- columns = [columns]
646
- if isinstance(source, str) and is_glob_pattern(source):
647
- dtypes_dict = None
648
- if dtype_list is not None:
649
- dtypes_dict = dict(dtype_list)
650
- if dtype_slice is not None:
651
- msg = (
652
- "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument"
653
- "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]"
654
- )
655
- raise ValueError(msg)
656
- from polars import scan_csv
657
-
658
- scan = scan_csv(
659
- source,
660
- has_header=has_header,
661
- separator=separator,
662
- comment_prefix=comment_prefix,
663
- quote_char=quote_char,
664
- skip_rows=skip_rows,
665
- skip_lines=skip_lines,
666
- schema=schema,
667
- schema_overrides=dtypes_dict,
668
- null_values=null_values,
669
- missing_utf8_is_empty_string=missing_utf8_is_empty_string,
670
- ignore_errors=ignore_errors,
671
- infer_schema_length=infer_schema_length,
672
- n_rows=n_rows,
673
- low_memory=low_memory,
674
- rechunk=rechunk,
675
- skip_rows_after_header=skip_rows_after_header,
676
- row_index_name=row_index_name,
677
- row_index_offset=row_index_offset,
678
- eol_char=eol_char,
679
- raise_if_empty=raise_if_empty,
680
- truncate_ragged_lines=truncate_ragged_lines,
681
- decimal_comma=decimal_comma,
682
- glob=glob,
683
- )
684
- if columns is None:
685
- return scan.collect()
686
- elif is_str_sequence(columns, allow_str=False):
687
- return scan.select(columns).collect()
688
- else:
689
- msg = (
690
- "cannot use glob patterns and integer based projection as `columns` argument"
691
- "\n\nUse columns: List[str]"
692
- )
693
- raise ValueError(msg)
694
-
695
- projection, columns = parse_columns_arg(columns)
696
-
697
- pydf = PyDataFrame.read_csv(
698
- source,
699
- infer_schema_length,
700
- batch_size,
701
- has_header,
702
- ignore_errors,
703
- n_rows,
704
- skip_rows,
705
- skip_lines,
706
- projection,
707
- separator,
708
- rechunk,
709
- columns,
710
- encoding,
711
- n_threads,
712
- path,
713
- dtype_list,
714
- dtype_slice,
715
- low_memory,
716
- comment_prefix,
717
- quote_char,
718
- processed_null_values,
719
- missing_utf8_is_empty_string,
720
- try_parse_dates,
721
- skip_rows_after_header,
722
- parse_row_index_args(row_index_name, row_index_offset),
723
- eol_char=eol_char,
724
- raise_if_empty=raise_if_empty,
725
- truncate_ragged_lines=truncate_ragged_lines,
726
- decimal_comma=decimal_comma,
727
- schema=schema,
728
- )
729
- return wrap_df(pydf)
730
-
731
-
732
- @deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31")
733
- @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
734
- @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
735
- def read_csv_batched(
736
- source: str | Path,
737
- *,
738
- has_header: bool = True,
739
- columns: Sequence[int] | Sequence[str] | None = None,
740
- new_columns: Sequence[str] | None = None,
741
- separator: str = ",",
742
- comment_prefix: str | None = None,
743
- quote_char: str | None = '"',
744
- skip_rows: int = 0,
745
- skip_lines: int = 0,
746
- schema_overrides: (
747
- Mapping[str, PolarsDataType] | Sequence[PolarsDataType] | None
748
- ) = None,
749
- null_values: str | Sequence[str] | dict[str, str] | None = None,
750
- missing_utf8_is_empty_string: bool = False,
751
- ignore_errors: bool = False,
752
- try_parse_dates: bool = False,
753
- n_threads: int | None = None,
754
- infer_schema_length: int | None = N_INFER_DEFAULT,
755
- batch_size: int = 50_000,
756
- n_rows: int | None = None,
757
- encoding: CsvEncoding | str = "utf8",
758
- low_memory: bool = False,
759
- rechunk: bool = False,
760
- skip_rows_after_header: int = 0,
761
- row_index_name: str | None = None,
762
- row_index_offset: int = 0,
763
- sample_size: int = 1024,
764
- eol_char: str = "\n",
765
- raise_if_empty: bool = True,
766
- truncate_ragged_lines: bool = False,
767
- decimal_comma: bool = False,
768
- ) -> BatchedCsvReader:
769
- r"""
770
- Read a CSV file in batches.
771
-
772
- Upon creation of the `BatchedCsvReader`, Polars will gather statistics and
773
- determine the file chunks. After that, work will only be done if `next_batches`
774
- is called, which will return a list of `n` frames of the given batch size.
775
-
776
- .. versionchanged:: 0.20.31
777
- The `dtypes` parameter was renamed `schema_overrides`.
778
- .. versionchanged:: 0.20.4
779
- * The `row_count_name` parameter was renamed `row_index_name`.
780
- * The `row_count_offset` parameter was renamed `row_index_offset`.
781
-
782
- Parameters
783
- ----------
784
- source
785
- Path to a file or a file-like object (by "file-like object" we refer to objects
786
- that have a `read()` method, such as a file handler like the builtin `open`
787
- function, or a `BytesIO` instance). If `fsspec` is installed, it will be used
788
- to open remote files. For file-like objects, the stream position may not be
789
- updated accordingly after reading.
790
- has_header
791
- Indicate if the first row of the dataset is a header or not. If set to False,
792
- column names will be autogenerated in the following format: `column_x`, with
793
- `x` being an enumeration over every column in the dataset, starting at 1.
794
- columns
795
- Columns to select. Accepts a list of column indices (starting
796
- at zero) or a list of column names.
797
- new_columns
798
- Rename columns right after parsing the CSV file. If the given
799
- list is shorter than the width of the DataFrame the remaining
800
- columns will have their original name.
801
- separator
802
- Single byte character to use as separator in the file.
803
- comment_prefix
804
- A string used to indicate the start of a comment line. Comment lines are skipped
805
- during parsing. Common examples of comment prefixes are `#` and `//`.
806
- quote_char
807
- Single byte character used for csv quoting, default = `"`.
808
- Set to None to turn off special handling and escaping of quotes.
809
- skip_rows
810
- Start reading after ``skip_rows`` rows. The header will be parsed at this
811
- offset. Note that we respect CSV escaping/comments when skipping rows.
812
- If you want to skip by newline char only, use `skip_lines`.
813
- skip_lines
814
- Start reading after `skip_lines` lines. The header will be parsed at this
815
- offset. Note that CSV escaping will not be respected when skipping lines.
816
- If you want to skip valid CSV rows, use ``skip_rows``.
817
- schema_overrides
818
- Overwrite dtypes during inference.
819
- null_values
820
- Values to interpret as null values. You can provide a:
821
-
822
- - `str`: All values equal to this string will be null.
823
- - `List[str]`: All values equal to any string in this list will be null.
824
- - `Dict[str, str]`: A dictionary that maps column name to a
825
- null value string.
826
-
827
- missing_utf8_is_empty_string
828
- By default a missing value is considered to be null; if you would prefer missing
829
- utf8 values to be treated as the empty string you can set this param True.
830
- ignore_errors
831
- Try to keep reading lines if some lines yield errors.
832
- First try `infer_schema_length=0` to read all columns as
833
- `pl.String` to check which values might cause an issue.
834
- try_parse_dates
835
- Try to automatically parse dates. Most ISO8601-like formats can
836
- be inferred, as well as a handful of others. If this does not succeed,
837
- the column remains of data type `pl.String`.
838
- n_threads
839
- Number of threads to use in csv parsing.
840
- Defaults to the number of physical cpu's of your system.
841
- infer_schema_length
842
- The maximum number of rows to scan for schema inference.
843
- If set to `0`, all columns will be read as `pl.String`.
844
- If set to `None`, the full data may be scanned *(this is slow)*.
845
- batch_size
846
- Number of lines to read into the buffer at once.
847
-
848
- Modify this to change performance.
849
- n_rows
850
- Stop reading from CSV file after reading `n_rows`.
851
- During multi-threaded parsing, an upper bound of `n_rows`
852
- rows cannot be guaranteed.
853
- encoding : {'utf8', 'utf8-lossy', ...}
854
- Lossy means that invalid utf8 values are replaced with `�`
855
- characters. When using other encodings than `utf8` or
856
- `utf8-lossy`, the input is first decoded in memory with
857
- python. Defaults to `utf8`.
858
- low_memory
859
- Reduce memory pressure at the expense of performance.
860
- rechunk
861
- Make sure that all columns are contiguous in memory by
862
- aggregating the chunks into a single array.
863
- skip_rows_after_header
864
- Skip this number of rows when the header is parsed.
865
- row_index_name
866
- Insert a row index column with the given name into the DataFrame as the first
867
- column. If set to `None` (default), no row index column is created.
868
- row_index_offset
869
- Start the row index at this offset. Cannot be negative.
870
- Only used if `row_index_name` is set.
871
- sample_size
872
- Set the sample size. This is used to sample statistics to estimate the
873
- allocation needed.
874
-
875
- .. deprecated:: 1.10.0
876
- Is a no-op.
877
- eol_char
878
- Single byte end of line character (default: `\n`). When encountering a file
879
- with windows line endings (`\r\n`), one can go with the default `\n`. The extra
880
- `\r` will be removed when processed.
881
- raise_if_empty
882
- When there is no data in the source,`NoDataError` is raised. If this parameter
883
- is set to False, `None` will be returned from `next_batches(n)` instead.
884
- truncate_ragged_lines
885
- Truncate lines that are longer than the schema.
886
- decimal_comma
887
- Parse floats using a comma as the decimal separator instead of a period.
888
-
889
- Returns
890
- -------
891
- BatchedCsvReader
892
-
893
- See Also
894
- --------
895
- scan_csv : Lazily read from a CSV file or multiple files via glob patterns.
896
-
897
- Examples
898
- --------
899
- >>> reader = pl.read_csv_batched(
900
- ... "./pdsh/tables_scale_100/lineitem.tbl",
901
- ... separator="|",
902
- ... try_parse_dates=True,
903
- ... ) # doctest: +SKIP
904
- >>> batches = reader.next_batches(5) # doctest: +SKIP
905
- >>> for df in batches: # doctest: +SKIP
906
- ... print(df)
907
-
908
- Read big CSV file in batches and write a CSV file for each "group" of interest.
909
-
910
- >>> seen_groups = set()
911
- >>> reader = pl.read_csv_batched("big_file.csv") # doctest: +SKIP
912
- >>> batches = reader.next_batches(100) # doctest: +SKIP
913
-
914
- >>> while batches: # doctest: +SKIP
915
- ... df_current_batches = pl.concat(batches)
916
- ... partition_dfs = df_current_batches.partition_by("group", as_dict=True)
917
- ...
918
- ... for group, df in partition_dfs.items():
919
- ... if group in seen_groups:
920
- ... with open(f"./data/{group}.csv", "a") as fh:
921
- ... fh.write(df.write_csv(file=None, include_header=False))
922
- ... else:
923
- ... df.write_csv(file=f"./data/{group}.csv", include_header=True)
924
- ... seen_groups.add(group)
925
- ...
926
- ... batches = reader.next_batches(100)
927
- """
928
- projection, columns = parse_columns_arg(columns)
929
-
930
- if columns and not has_header:
931
- for column in columns:
932
- if not column.startswith("column_"):
933
- msg = (
934
- "specified column names do not start with 'column_',"
935
- " but autogenerated header names were requested"
936
- )
937
- raise ValueError(msg)
938
-
939
- if projection and schema_overrides and isinstance(schema_overrides, list):
940
- if len(projection) < len(schema_overrides):
941
- msg = "more schema overrides are specified than there are selected columns"
942
- raise ValueError(msg)
943
-
944
- # Fix list of dtypes when used together with projection as polars CSV reader
945
- # wants a list of dtypes for the x first columns before it does the projection.
946
- dtypes_list: list[PolarsDataType] = [String] * (max(projection) + 1)
947
-
948
- for idx, column_idx in enumerate(projection):
949
- if idx < len(schema_overrides):
950
- dtypes_list[column_idx] = schema_overrides[idx]
951
-
952
- schema_overrides = dtypes_list
953
-
954
- if columns and schema_overrides and isinstance(schema_overrides, list):
955
- if len(columns) < len(schema_overrides):
956
- msg = "more schema overrides are specified than there are selected columns"
957
- raise ValueError(msg)
958
-
959
- # Map list of dtypes when used together with selected columns as a dtypes dict
960
- # so the dtypes are applied to the correct column instead of the first x
961
- # columns.
962
- schema_overrides = dict(zip(columns, schema_overrides))
963
-
964
- if new_columns and schema_overrides and isinstance(schema_overrides, dict):
965
- current_columns = None
966
-
967
- # As new column names are not available yet while parsing the CSV file, rename
968
- # column names in dtypes to old names (if possible) so they can be used during
969
- # CSV parsing.
970
- if columns:
971
- if len(columns) < len(new_columns):
972
- msg = "more new column names are specified than there are selected columns"
973
- raise ValueError(msg)
974
-
975
- # Get column names of requested columns.
976
- current_columns = columns[0 : len(new_columns)]
977
- elif not has_header:
978
- # When there are no header, column names are autogenerated (and known).
979
-
980
- if projection:
981
- if columns and len(columns) < len(new_columns):
982
- msg = "more new column names are specified than there are selected columns"
983
- raise ValueError(msg)
984
- # Convert column indices from projection to 'column_1', 'column_2', ...
985
- # column names.
986
- current_columns = [
987
- f"column_{column_idx + 1}" for column_idx in projection
988
- ]
989
- else:
990
- # Generate autogenerated 'column_1', 'column_2', ... column names for
991
- # new column names.
992
- current_columns = [
993
- f"column_{column_idx}"
994
- for column_idx in range(1, len(new_columns) + 1)
995
- ]
996
- else:
997
- # When a header is present, column names are not known yet.
998
-
999
- if len(schema_overrides) <= len(new_columns):
1000
- # If dtypes dictionary contains less or same amount of values than new
1001
- # column names a list of dtypes can be created if all listed column
1002
- # names in dtypes dictionary appear in the first consecutive new column
1003
- # names.
1004
- dtype_list = [
1005
- schema_overrides[new_column_name]
1006
- for new_column_name in new_columns[0 : len(schema_overrides)]
1007
- if new_column_name in schema_overrides
1008
- ]
1009
-
1010
- if len(dtype_list) == len(schema_overrides):
1011
- schema_overrides = dtype_list
1012
-
1013
- if current_columns and isinstance(schema_overrides, dict):
1014
- new_to_current = dict(zip(new_columns, current_columns))
1015
- # Change new column names to current column names in dtype.
1016
- schema_overrides = {
1017
- new_to_current.get(column_name, column_name): column_dtype
1018
- for column_name, column_dtype in schema_overrides.items()
1019
- }
1020
-
1021
- return BatchedCsvReader(
1022
- source,
1023
- has_header=has_header,
1024
- columns=columns if columns else projection,
1025
- separator=separator,
1026
- comment_prefix=comment_prefix,
1027
- quote_char=quote_char,
1028
- skip_rows=skip_rows,
1029
- skip_lines=skip_lines,
1030
- schema_overrides=schema_overrides,
1031
- null_values=null_values,
1032
- missing_utf8_is_empty_string=missing_utf8_is_empty_string,
1033
- ignore_errors=ignore_errors,
1034
- try_parse_dates=try_parse_dates,
1035
- n_threads=n_threads,
1036
- infer_schema_length=infer_schema_length,
1037
- batch_size=batch_size,
1038
- n_rows=n_rows,
1039
- encoding=encoding if encoding == "utf8-lossy" else "utf8",
1040
- low_memory=low_memory,
1041
- rechunk=rechunk,
1042
- skip_rows_after_header=skip_rows_after_header,
1043
- row_index_name=row_index_name,
1044
- row_index_offset=row_index_offset,
1045
- eol_char=eol_char,
1046
- new_columns=new_columns,
1047
- raise_if_empty=raise_if_empty,
1048
- truncate_ragged_lines=truncate_ragged_lines,
1049
- decimal_comma=decimal_comma,
1050
- )
1051
-
1052
-
1053
- @deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31")
1054
- @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
1055
- @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
1056
- def scan_csv(
1057
- source: (
1058
- str
1059
- | Path
1060
- | IO[str]
1061
- | IO[bytes]
1062
- | bytes
1063
- | list[str]
1064
- | list[Path]
1065
- | list[IO[str]]
1066
- | list[IO[bytes]]
1067
- | list[bytes]
1068
- ),
1069
- *,
1070
- has_header: bool = True,
1071
- separator: str = ",",
1072
- comment_prefix: str | None = None,
1073
- quote_char: str | None = '"',
1074
- skip_rows: int = 0,
1075
- skip_lines: int = 0,
1076
- schema: SchemaDict | None = None,
1077
- schema_overrides: SchemaDict | Sequence[PolarsDataType] | None = None,
1078
- null_values: str | Sequence[str] | dict[str, str] | None = None,
1079
- missing_utf8_is_empty_string: bool = False,
1080
- ignore_errors: bool = False,
1081
- cache: bool = True,
1082
- with_column_names: Callable[[list[str]], list[str]] | None = None,
1083
- infer_schema: bool = True,
1084
- infer_schema_length: int | None = N_INFER_DEFAULT,
1085
- n_rows: int | None = None,
1086
- encoding: CsvEncoding = "utf8",
1087
- low_memory: bool = False,
1088
- rechunk: bool = False,
1089
- skip_rows_after_header: int = 0,
1090
- row_index_name: str | None = None,
1091
- row_index_offset: int = 0,
1092
- try_parse_dates: bool = False,
1093
- eol_char: str = "\n",
1094
- new_columns: Sequence[str] | None = None,
1095
- raise_if_empty: bool = True,
1096
- truncate_ragged_lines: bool = False,
1097
- decimal_comma: bool = False,
1098
- glob: bool = True,
1099
- storage_options: dict[str, Any] | None = None,
1100
- credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
1101
- retries: int = 2,
1102
- file_cache_ttl: int | None = None,
1103
- include_file_paths: str | None = None,
1104
- ) -> LazyFrame:
1105
- r"""
1106
- Lazily read from a CSV file or multiple files via glob patterns.
1107
-
1108
- This allows the query optimizer to push down predicates and
1109
- projections to the scan level, thereby potentially reducing
1110
- memory overhead.
1111
-
1112
- .. versionchanged:: 0.20.31
1113
- The `dtypes` parameter was renamed `schema_overrides`.
1114
- .. versionchanged:: 0.20.4
1115
- * The `row_count_name` parameter was renamed `row_index_name`.
1116
- * The `row_count_offset` parameter was renamed `row_index_offset`.
1117
-
1118
- Parameters
1119
- ----------
1120
- source
1121
- Path(s) to a file or directory
1122
- When needing to authenticate for scanning cloud locations, see the
1123
- `storage_options` parameter.
1124
- has_header
1125
- Indicate if the first row of the dataset is a header or not. If set to False,
1126
-         column names will be autogenerated in the following format: `column_x`, with
-         `x` being an enumeration over every column in the dataset, starting at 1.
-     separator
-         Single byte character to use as separator in the file.
-     comment_prefix
-         A string used to indicate the start of a comment line. Comment lines are skipped
-         during parsing. Common examples of comment prefixes are `#` and `//`.
-     quote_char
-         Single byte character used for CSV quoting, default = `"`.
-         Set to None to turn off special handling and escaping of quotes.
-     skip_rows
-         Start reading after ``skip_rows`` rows. The header will be parsed at this
-         offset. Note that we respect CSV escaping/comments when skipping rows.
-         If you want to skip by newline char only, use `skip_lines`.
-     skip_lines
-         Start reading after `skip_lines` lines. The header will be parsed at this
-         offset. Note that CSV escaping will not be respected when skipping lines.
-         If you want to skip valid CSV rows, use ``skip_rows``.
-     schema
-         Provide the schema. This means that polars doesn't do schema inference.
-         This argument expects the complete schema, whereas `schema_overrides` can be
-         used to partially overwrite a schema. Note that the order of the columns in
-         the provided `schema` must match the order of the columns in the CSV being read.
-     schema_overrides
-         Overwrite dtypes during inference; should be a {colname:dtype,} dict or,
-         if providing a list of strings to `new_columns`, a list of dtypes of
-         the same length.
-     null_values
-         Values to interpret as null values. You can provide a:
-
-         - `str`: All values equal to this string will be null.
-         - `List[str]`: All values equal to any string in this list will be null.
-         - `Dict[str, str]`: A dictionary that maps column name to a
-           null value string.
-
-     missing_utf8_is_empty_string
-         By default a missing value is considered to be null; if you would prefer missing
-         utf8 values to be treated as the empty string, set this parameter to True.
-     ignore_errors
-         Try to keep reading lines if some lines yield errors.
-         First try `infer_schema=False` to read all columns as
-         `pl.String` to check which values might cause an issue.
-     cache
-         Cache the result after reading.
-     with_column_names
-         Apply a function over the column names just in time (when they are determined);
-         this function will receive (and should return) a list of column names.
-     infer_schema
-         When `True`, the schema is inferred from the data using the first
-         `infer_schema_length` rows.
-         When `False`, the schema is not inferred and will be `pl.String` if not
-         specified in `schema` or `schema_overrides`.
-     infer_schema_length
-         The maximum number of rows to scan for schema inference.
-         If set to `None`, the full data may be scanned *(this is slow)*.
-         Set `infer_schema=False` to read all columns as `pl.String`.
-     n_rows
-         Stop reading from CSV file after reading `n_rows`.
-     encoding : {'utf8', 'utf8-lossy'}
-         Lossy means that invalid utf8 values are replaced with `�`
-         characters. Defaults to "utf8".
-     low_memory
-         Reduce memory pressure at the expense of performance.
-     rechunk
-         Reallocate to contiguous memory when all chunks/files are parsed.
-     skip_rows_after_header
-         Skip this number of rows when the header is parsed.
-     row_index_name
-         If not None, this will insert a row index column with the given name into
-         the DataFrame.
-     row_index_offset
-         Offset to start the row index column (only used if the name is set).
-     try_parse_dates
-         Try to automatically parse dates. Most ISO8601-like formats
-         can be inferred, as well as a handful of others. If this does not succeed,
-         the column remains of data type `pl.String`.
-     eol_char
-         Single byte end of line character (default: `\n`). When encountering a file
-         with Windows line endings (`\r\n`), one can go with the default `\n`. The extra
-         `\r` will be removed when processed.
-     new_columns
-         Provide an explicit list of string column names to use (for example, when
-         scanning a headerless CSV file). If the given list is shorter than the width of
-         the DataFrame, the remaining columns will have their original name.
-     raise_if_empty
-         When there is no data in the source, `NoDataError` is raised. If this parameter
-         is set to False, an empty LazyFrame (with no columns) is returned instead.
-     truncate_ragged_lines
-         Truncate lines that are longer than the schema.
-     decimal_comma
-         Parse floats using a comma as the decimal separator instead of a period.
-     glob
-         Expand path given via globbing rules.
-     storage_options
-         Options that indicate how to connect to a cloud provider.
-
-         The cloud providers currently supported are AWS, GCP, and Azure.
-         See supported keys here:
-
-         * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
-         * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
-         * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
-         * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
-           `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
-
-         If `storage_options` is not provided, Polars will try to infer the information
-         from environment variables.
-     credential_provider
-         Provide a function that can be called to provide cloud storage
-         credentials. The function is expected to return a dictionary of
-         credential keys along with an optional credential expiry time.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     retries
-         Number of retries if accessing a cloud instance fails.
-     file_cache_ttl
-         Amount of time to keep downloaded cloud files since their last access time,
-         in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
-         (which defaults to 1 hour) if not given.
-     include_file_paths
-         Include the path of the source file(s) as a column with this name.
-
-     Returns
-     -------
-     LazyFrame
-
-     See Also
-     --------
-     read_csv : Read a CSV file into a DataFrame.
-
-     Examples
-     --------
-     >>> import pathlib
-     >>>
-     >>> (
-     ...     pl.scan_csv("my_long_file.csv")  # lazy, doesn't do a thing
-     ...     .select(
-     ...         ["a", "c"]
-     ...     )  # select only 2 columns (other columns will not be read)
-     ...     .filter(
-     ...         pl.col("a") > 10
-     ...     )  # the filter is pushed down the scan, so less data is read into memory
-     ...     .head(100)  # constrain number of returned results to 100
-     ... )  # doctest: +SKIP
-
-     We can use `with_column_names` to modify the header before scanning:
-
-     >>> df = pl.DataFrame(
-     ...     {"BrEeZaH": [1, 2, 3, 4], "LaNgUaGe": ["is", "hard", "to", "read"]}
-     ... )
-     >>> path: pathlib.Path = dirpath / "mydf.csv"
-     >>> df.write_csv(path)
-     >>> pl.scan_csv(
-     ...     path, with_column_names=lambda cols: [col.lower() for col in cols]
-     ... ).collect()
-     shape: (4, 2)
-     ┌─────────┬──────────┐
-     │ breezah ┆ language │
-     │ ---     ┆ ---      │
-     │ i64     ┆ str      │
-     ╞═════════╪══════════╡
-     │ 1       ┆ is       │
-     │ 2       ┆ hard     │
-     │ 3       ┆ to       │
-     │ 4       ┆ read     │
-     └─────────┴──────────┘
-
-     You can also simply replace column names (or provide them if the file has none)
-     by passing a list of new column names to the `new_columns` parameter:
-
-     >>> df.write_csv(path)
-     >>> pl.scan_csv(
-     ...     path,
-     ...     new_columns=["idx", "txt"],
-     ...     schema_overrides=[pl.UInt16, pl.String],
-     ... ).collect()
-     shape: (4, 2)
-     ┌─────┬──────┐
-     │ idx ┆ txt  │
-     │ --- ┆ ---  │
-     │ u16 ┆ str  │
-     ╞═════╪══════╡
-     │ 1   ┆ is   │
-     │ 2   ┆ hard │
-     │ 3   ┆ to   │
-     │ 4   ┆ read │
-     └─────┴──────┘
-     """
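A minimal sketch of how the parameters documented above compose in a single call; the file name and null markers are illustrative placeholders, not values taken from this package. As with the docstring's own examples, nothing is read until `.collect()` is called.

>>> lf = pl.scan_csv(
...     "measurements.csv",  # hypothetical input file
...     separator=";",
...     null_values=["NA", "n/a"],  # strings to read back as null
...     missing_utf8_is_empty_string=True,  # missing utf8 fields become "" instead of null
...     try_parse_dates=True,
...     n_rows=1_000,
... )  # doctest: +SKIP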
-     if schema_overrides is not None and not isinstance(
-         schema_overrides, (dict, Sequence)
-     ):
-         msg = "`schema_overrides` should be of type list or dict"
-         raise TypeError(msg)
-
-     if not new_columns and isinstance(schema_overrides, Sequence):
-         msg = f"expected 'schema_overrides' dict, found {qualified_type_name(schema_overrides)!r}"
-         raise TypeError(msg)
-     elif new_columns:
-         if with_column_names:
-             msg = "cannot set both `with_column_names` and `new_columns`; mutually exclusive"
-             raise ValueError(msg)
-         if schema_overrides and isinstance(schema_overrides, Sequence):
-             schema_overrides = dict(zip(new_columns, schema_overrides))
-
-         # wrap new column names as a callable
-         def with_column_names(cols: list[str]) -> list[str]:
-             if len(cols) > len(new_columns):
-                 return new_columns + cols[len(new_columns) :]  # type: ignore[operator]
-             else:
-                 return new_columns  # type: ignore[return-value]
-
-     _check_arg_is_1byte("separator", separator, can_be_empty=False)
-     _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
-
-     if isinstance(source, (str, Path)):
-         source = normalize_filepath(source, check_not_directory=False)
-     elif is_path_or_str_sequence(source, allow_str=False):
-         source = [
-             normalize_filepath(source, check_not_directory=False) for source in source
-         ]
-
-     if not infer_schema:
-         infer_schema_length = 0
-
-     credential_provider_builder = _init_credential_provider_builder(
-         credential_provider, source, storage_options, "scan_csv"
-     )
-     del credential_provider
-
-     return _scan_csv_impl(
-         source,
-         has_header=has_header,
-         separator=separator,
-         comment_prefix=comment_prefix,
-         quote_char=quote_char,
-         skip_rows=skip_rows,
-         skip_lines=skip_lines,
-         schema_overrides=schema_overrides,  # type: ignore[arg-type]
-         schema=schema,
-         null_values=null_values,
-         missing_utf8_is_empty_string=missing_utf8_is_empty_string,
-         ignore_errors=ignore_errors,
-         cache=cache,
-         with_column_names=with_column_names,
-         infer_schema_length=infer_schema_length,
-         n_rows=n_rows,
-         low_memory=low_memory,
-         rechunk=rechunk,
-         skip_rows_after_header=skip_rows_after_header,
-         encoding=encoding,
-         row_index_name=row_index_name,
-         row_index_offset=row_index_offset,
-         try_parse_dates=try_parse_dates,
-         eol_char=eol_char,
-         raise_if_empty=raise_if_empty,
-         truncate_ragged_lines=truncate_ragged_lines,
-         decimal_comma=decimal_comma,
-         glob=glob,
-         retries=retries,
-         storage_options=storage_options,
-         credential_provider=credential_provider_builder,
-         file_cache_ttl=file_cache_ttl,
-         include_file_paths=include_file_paths,
-     )
-
-
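As the body above shows, `scan_csv` itself only validates arguments, normalizes the source path(s), wraps any `credential_provider` callable in a `CredentialProviderBuilder`, and forwards everything to `_scan_csv_impl` below. A hedged sketch of a cloud scan using the cloud-related parameters; the bucket, prefix, and option key are placeholders, and the valid key names are those in the `object_store` documentation linked in the docstring.

>>> lf = pl.scan_csv(
...     "s3://my-bucket/data/*.csv",  # hypothetical bucket and glob pattern
...     storage_options={"aws_region": "eu-west-1"},  # assumed key name; see the aws link above
...     retries=3,
...     include_file_paths="source_file",  # adds a column with each row's source path
... )  # doctest: +SKIP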
- def _scan_csv_impl(
-     source: str
-     | IO[str]
-     | IO[bytes]
-     | bytes
-     | list[str]
-     | list[Path]
-     | list[IO[str]]
-     | list[IO[bytes]]
-     | list[bytes],
-     *,
-     has_header: bool = True,
-     separator: str = ",",
-     comment_prefix: str | None = None,
-     quote_char: str | None = '"',
-     skip_rows: int = 0,
-     skip_lines: int = 0,
-     schema: SchemaDict | None = None,
-     schema_overrides: SchemaDict | None = None,
-     null_values: str | Sequence[str] | dict[str, str] | None = None,
-     missing_utf8_is_empty_string: bool = False,
-     ignore_errors: bool = False,
-     cache: bool = True,
-     with_column_names: Callable[[list[str]], list[str]] | None = None,
-     infer_schema_length: int | None = N_INFER_DEFAULT,
-     n_rows: int | None = None,
-     encoding: CsvEncoding = "utf8",
-     low_memory: bool = False,
-     rechunk: bool = False,
-     skip_rows_after_header: int = 0,
-     row_index_name: str | None = None,
-     row_index_offset: int = 0,
-     try_parse_dates: bool = False,
-     eol_char: str = "\n",
-     raise_if_empty: bool = True,
-     truncate_ragged_lines: bool = True,
-     decimal_comma: bool = False,
-     glob: bool = True,
-     storage_options: dict[str, Any] | None = None,
-     credential_provider: CredentialProviderBuilder | None = None,
-     retries: int = 2,
-     file_cache_ttl: int | None = None,
-     include_file_paths: str | None = None,
- ) -> LazyFrame:
-     dtype_list: list[tuple[str, PolarsDataType]] | None = None
-     if schema_overrides is not None:
-         if not isinstance(schema_overrides, dict):
-             msg = "expected 'schema_overrides' dict, found 'list'"
-             raise TypeError(msg)
-         dtype_list = []
-         for k, v in schema_overrides.items():
-             dtype_list.append((k, parse_into_dtype(v)))
-     processed_null_values = _process_null_values(null_values)
-
-     if isinstance(source, list):
-         sources = source
-         source = None  # type: ignore[assignment]
-     else:
-         sources = []
-
-     if storage_options:
-         storage_options = list(storage_options.items())  # type: ignore[assignment]
-     else:
-         # Handle empty dict input
-         storage_options = None
-
-     pylf = PyLazyFrame.new_from_csv(
-         source,
-         sources,
-         separator=separator,
-         has_header=has_header,
-         ignore_errors=ignore_errors,
-         skip_rows=skip_rows,
-         skip_lines=skip_lines,
-         n_rows=n_rows,
-         cache=cache,
-         overwrite_dtype=dtype_list,
-         low_memory=low_memory,
-         comment_prefix=comment_prefix,
-         quote_char=quote_char,
-         null_values=processed_null_values,
-         missing_utf8_is_empty_string=missing_utf8_is_empty_string,
-         infer_schema_length=infer_schema_length,
-         with_schema_modify=with_column_names,
-         rechunk=rechunk,
-         skip_rows_after_header=skip_rows_after_header,
-         encoding=encoding,
-         row_index=parse_row_index_args(row_index_name, row_index_offset),
-         try_parse_dates=try_parse_dates,
-         eol_char=eol_char,
-         raise_if_empty=raise_if_empty,
-         truncate_ragged_lines=truncate_ragged_lines,
-         decimal_comma=decimal_comma,
-         glob=glob,
-         schema=schema,
-         cloud_options=storage_options,
-         credential_provider=credential_provider,
-         retries=retries,
-         file_cache_ttl=file_cache_ttl,
-         include_file_paths=include_file_paths,
-     )
-     return wrap_ldf(pylf)
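For reference, `_scan_csv_impl` normalizes a few arguments before calling `PyLazyFrame.new_from_csv`: a `schema_overrides` dict is expanded into a list of `(column, dtype)` pairs, a list source is moved into `sources`, and a non-empty `storage_options` dict is converted to a list of key/value tuples (an empty dict collapses to `None`). Below is a standalone sketch of that normalization, using plain dict/list operations in place of the removed helpers; the function name is hypothetical.

import polars as pl

def normalize_options(schema_overrides, storage_options):
    # Mirrors the normalization shown above, for illustration only.
    dtype_list = (
        [(name, dtype) for name, dtype in schema_overrides.items()]
        if schema_overrides is not None
        else None
    )
    cloud_options = list(storage_options.items()) if storage_options else None
    return dtype_list, cloud_options

dtype_list, cloud_options = normalize_options(
    {"idx": pl.UInt16, "txt": pl.String},  # overrides keyed by column name
    {},  # an empty dict collapses to None
)
assert dtype_list == [("idx", pl.UInt16), ("txt", pl.String)]
assert cloud_options is None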