polars-runtime-compat 1.34.0b3-cp39-abi3-manylinux_2_24_aarch64.whl → 1.34.0b5-cp39-abi3-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of polars-runtime-compat has been marked as a potentially problematic release.

Files changed (204)
  1. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  2. polars_runtime_compat-1.34.0b5.dist-info/METADATA +35 -0
  3. polars_runtime_compat-1.34.0b5.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -103
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b3.dist-info/METADATA +0 -190
  202. polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
  203. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/WHEEL +0 -0
  204. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/licenses/LICENSE +0 -0
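
The list above shows that, apart from the rebuilt native extension (item 1) and the refreshed dist-info metadata, every pure-Python `polars/` module shipped in 1.34.0b3 is removed in 1.34.0b5. A minimal sketch for reproducing this file-level comparison locally, assuming both wheels have already been downloaded (filenames are illustrative):

import zipfile

# Compare the file lists of the two wheels; adjust the paths to the downloaded artifacts.
old = set(zipfile.ZipFile("polars_runtime_compat-1.34.0b3-cp39-abi3-manylinux_2_24_aarch64.whl").namelist())
new = set(zipfile.ZipFile("polars_runtime_compat-1.34.0b5-cp39-abi3-manylinux_2_24_aarch64.whl").namelist())
print(f"removed: {len(old - new)} files")  # should correspond to the polars/*.py entries listed above
print(f"added: {len(new - old)} files")    # the new 1.34.0b5 dist-info metadata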
polars/io/spreadsheet/functions.py
@@ -1,1323 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import re
5
- import warnings
6
- from collections import defaultdict
7
- from collections.abc import Sequence
8
- from datetime import time
9
- from glob import glob
10
- from io import BufferedReader, BytesIO, StringIO, TextIOWrapper
11
- from pathlib import Path
12
- from typing import IO, TYPE_CHECKING, Any, Callable, NoReturn, overload
13
-
14
- import polars._reexport as pl
15
- from polars import from_arrow
16
- from polars import functions as F
17
- from polars._dependencies import import_optional
18
- from polars._utils.deprecation import (
19
- deprecate_renamed_parameter,
20
- issue_deprecation_warning,
21
- )
22
- from polars._utils.various import deduplicate_names, normalize_filepath, parse_version
23
- from polars.datatypes import (
24
- N_INFER_DEFAULT,
25
- Boolean,
26
- Date,
27
- Datetime,
28
- Duration,
29
- Int64,
30
- Null,
31
- String,
32
- Time,
33
- UInt8,
34
- )
35
- from polars.datatypes.group import FLOAT_DTYPES, INTEGER_DTYPES, NUMERIC_DTYPES
36
- from polars.exceptions import (
37
- ModuleUpgradeRequiredError,
38
- NoDataError,
39
- ParameterCollisionError,
40
- )
41
- from polars.functions import concat
42
- from polars.io._utils import looks_like_url, process_file_url
43
- from polars.io.csv.functions import read_csv
44
-
45
- if TYPE_CHECKING:
46
- from typing import Literal
47
-
48
- from polars._typing import ExcelSpreadsheetEngine, FileSource, SchemaDict
49
-
50
-
51
- def _sources(source: FileSource) -> tuple[Any, bool]:
52
- """Unpack any glob patterns, standardise file paths."""
53
- read_multiple_workbooks = True
54
- sources: list[Any] = []
55
-
56
- if isinstance(source, memoryview):
57
- source = source.tobytes()
58
- if not isinstance(source, Sequence) or isinstance(source, (bytes, str)):
59
- read_multiple_workbooks = False
60
- source = [source] # type: ignore[assignment]
61
-
62
- for src in source: # type: ignore[union-attr]
63
- if isinstance(src, (str, os.PathLike)) and not Path(src).exists():
64
- src = os.path.expanduser(str(src)) # noqa: PTH111
65
- if looks_like_url(src):
66
- sources.append(src)
67
- continue
68
- sources.extend(files := glob(src, recursive=True)) # noqa: PTH207
69
- if not files:
70
- msg = f"no workbook found at path {src!r}"
71
- raise FileNotFoundError(msg)
72
- read_multiple_workbooks = True
73
- else:
74
- if isinstance(src, os.PathLike):
75
- src = str(src)
76
- sources.append(src)
77
-
78
- return sources, read_multiple_workbooks
79
-
80
-
81
- def _standardize_duplicates(s: str) -> str:
82
- """Standardize columns with '_duplicated_n' names."""
83
- return re.sub(r"_duplicated_(\d+)", repl=r"\1", string=s)
84
-
85
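For illustration only, the effect of the private helper above is to collapse the synthetic suffix that duplicated column names receive:

>>> _standardize_duplicates("price_duplicated_2")
'price2'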
-
86
- def _unpack_read_results(
87
- frames: list[pl.DataFrame] | list[dict[str, pl.DataFrame]],
88
- *,
89
- read_multiple_workbooks: bool,
90
- ) -> Any:
91
- if not frames:
92
- msg = "no data found in the given workbook(s) and sheet(s)"
93
- raise NoDataError(msg)
94
-
95
- if not read_multiple_workbooks:
96
- # one sheet from one workbook
97
- return frames[0]
98
-
99
- if isinstance(frames[0], pl.DataFrame):
100
- # one sheet from multiple workbooks
101
- return concat(frames, how="vertical_relaxed") # type: ignore[type-var]
102
- else:
103
- # multiple sheets from multiple workbooks
104
- sheet_frames = defaultdict(list)
105
- for res in frames:
106
- for sheet, df in res.items(): # type: ignore[union-attr]
107
- sheet_frames[sheet].append(df)
108
- return {k: concat(v, how="vertical_relaxed") for k, v in sheet_frames.items()}
109
-
110
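The combination step above relies on polars' schema-relaxed vertical concatenation, so the same column read from different workbooks may be upcast to a common dtype. A small standalone sketch (made-up values):

>>> import polars as pl
>>> a = pl.DataFrame({"x": [1, 2]})   # Int64
>>> b = pl.DataFrame({"x": [3.5]})    # Float64
>>> pl.concat([a, b], how="vertical_relaxed").schema["x"]  # doctest: +SKIP
Float64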
-
111
- @overload
112
- def read_excel(
113
- source: FileSource,
114
- *,
115
- sheet_id: None = ...,
116
- sheet_name: str,
117
- table_name: str | None = ...,
118
- engine: ExcelSpreadsheetEngine = ...,
119
- engine_options: dict[str, Any] | None = ...,
120
- read_options: dict[str, Any] | None = ...,
121
- has_header: bool = ...,
122
- columns: Sequence[int] | Sequence[str] | str | None = ...,
123
- schema_overrides: SchemaDict | None = ...,
124
- infer_schema_length: int | None = ...,
125
- include_file_paths: str | None = ...,
126
- drop_empty_rows: bool = ...,
127
- drop_empty_cols: bool = ...,
128
- raise_if_empty: bool = ...,
129
- ) -> pl.DataFrame: ...
130
-
131
-
132
- @overload
133
- def read_excel(
134
- source: FileSource,
135
- *,
136
- sheet_id: None = ...,
137
- sheet_name: None = ...,
138
- table_name: str | None = ...,
139
- engine: ExcelSpreadsheetEngine = ...,
140
- engine_options: dict[str, Any] | None = ...,
141
- has_header: bool = ...,
142
- read_options: dict[str, Any] | None = ...,
143
- columns: Sequence[int] | Sequence[str] | str | None = ...,
144
- schema_overrides: SchemaDict | None = ...,
145
- infer_schema_length: int | None = ...,
146
- include_file_paths: str | None = ...,
147
- drop_empty_rows: bool = ...,
148
- drop_empty_cols: bool = ...,
149
- raise_if_empty: bool = ...,
150
- ) -> pl.DataFrame: ...
151
-
152
-
153
- @overload
154
- def read_excel(
155
- source: FileSource,
156
- *,
157
- sheet_id: int,
158
- sheet_name: str,
159
- table_name: str | None = ...,
160
- engine: ExcelSpreadsheetEngine = ...,
161
- engine_options: dict[str, Any] | None = ...,
162
- read_options: dict[str, Any] | None = ...,
163
- has_header: bool = ...,
164
- columns: Sequence[int] | Sequence[str] | str | None = ...,
165
- schema_overrides: SchemaDict | None = ...,
166
- infer_schema_length: int | None = ...,
167
- include_file_paths: str | None = ...,
168
- drop_empty_rows: bool = ...,
169
- drop_empty_cols: bool = ...,
170
- raise_if_empty: bool = ...,
171
- ) -> NoReturn: ...
172
-
173
-
174
- # note: 'ignore' required as mypy thinks that the return value for
175
- # Literal[0] overlaps with the return value for other integers
176
- @overload # type: ignore[overload-overlap]
177
- def read_excel(
178
- source: FileSource,
179
- *,
180
- sheet_id: Literal[0] | Sequence[int],
181
- sheet_name: None = ...,
182
- table_name: str | None = ...,
183
- engine: ExcelSpreadsheetEngine = ...,
184
- engine_options: dict[str, Any] | None = ...,
185
- read_options: dict[str, Any] | None = ...,
186
- has_header: bool = ...,
187
- columns: Sequence[int] | Sequence[str] | str | None = ...,
188
- schema_overrides: SchemaDict | None = ...,
189
- infer_schema_length: int | None = ...,
190
- include_file_paths: str | None = ...,
191
- drop_empty_rows: bool = ...,
192
- drop_empty_cols: bool = ...,
193
- raise_if_empty: bool = ...,
194
- ) -> dict[str, pl.DataFrame]: ...
195
-
196
-
197
- @overload
198
- def read_excel(
199
- source: FileSource,
200
- *,
201
- sheet_id: int,
202
- sheet_name: None = ...,
203
- table_name: str | None = ...,
204
- engine: ExcelSpreadsheetEngine = ...,
205
- engine_options: dict[str, Any] | None = ...,
206
- read_options: dict[str, Any] | None = ...,
207
- has_header: bool = ...,
208
- columns: Sequence[int] | Sequence[str] | str | None = ...,
209
- schema_overrides: SchemaDict | None = ...,
210
- infer_schema_length: int | None = ...,
211
- include_file_paths: str | None = ...,
212
- drop_empty_rows: bool = ...,
213
- drop_empty_cols: bool = ...,
214
- raise_if_empty: bool = ...,
215
- ) -> pl.DataFrame: ...
216
-
217
-
218
- @overload
219
- def read_excel(
220
- source: FileSource,
221
- *,
222
- sheet_id: None = ...,
223
- sheet_name: list[str] | tuple[str],
224
- table_name: str | None = ...,
225
- engine: ExcelSpreadsheetEngine = ...,
226
- engine_options: dict[str, Any] | None = ...,
227
- read_options: dict[str, Any] | None = ...,
228
- has_header: bool = ...,
229
- columns: Sequence[int] | Sequence[str] | str | None = ...,
230
- schema_overrides: SchemaDict | None = ...,
231
- infer_schema_length: int | None = ...,
232
- include_file_paths: str | None = ...,
233
- drop_empty_rows: bool = ...,
234
- drop_empty_cols: bool = ...,
235
- raise_if_empty: bool = ...,
236
- ) -> dict[str, pl.DataFrame]: ...
237
-
238
-
239
- @deprecate_renamed_parameter("xlsx2csv_options", "engine_options", version="0.20.6")
240
- @deprecate_renamed_parameter("read_csv_options", "read_options", version="0.20.7")
241
- def read_excel(
242
- source: FileSource,
243
- *,
244
- sheet_id: int | Sequence[int] | None = None,
245
- sheet_name: str | list[str] | tuple[str] | None = None,
246
- table_name: str | None = None,
247
- engine: ExcelSpreadsheetEngine = "calamine",
248
- engine_options: dict[str, Any] | None = None,
249
- read_options: dict[str, Any] | None = None,
250
- has_header: bool = True,
251
- columns: Sequence[int] | Sequence[str] | str | None = None,
252
- schema_overrides: SchemaDict | None = None,
253
- infer_schema_length: int | None = N_INFER_DEFAULT,
254
- include_file_paths: str | None = None,
255
- drop_empty_rows: bool = True,
256
- drop_empty_cols: bool = True,
257
- raise_if_empty: bool = True,
258
- ) -> pl.DataFrame | dict[str, pl.DataFrame]:
259
- """
260
- Read Excel spreadsheet data into a DataFrame.
261
-
262
- .. versionadded:: 1.20
263
- Support loading data from named table objects with `table_name` parameter.
264
- .. versionadded:: 1.18
265
- Support loading data from a list (or glob pattern) of multiple workbooks.
266
- .. versionchanged:: 1.0
267
- Default engine is now "calamine" (was "xlsx2csv").
268
- .. versionchanged:: 0.20.7
269
- The `read_csv_options` parameter was renamed `read_options`.
270
- .. versionchanged:: 0.20.6
271
- The `xlsx2csv_options` parameter was renamed `engine_options`.
272
-
273
- Parameters
274
- ----------
275
- source
276
- Path(s) to a file or a file-like object (by "file-like object" we refer to
277
- objects that have a `read()` method, such as a file handler like the builtin
278
- `open` function, or a `BytesIO` instance). For file-like objects, the stream
279
- position may not be updated after reading.
280
- sheet_id
281
- Sheet number(s) to convert (set `0` to load all sheets as DataFrames) and
282
- return a `{sheetname:frame,}` dict. (Defaults to `1` if neither this nor
283
- `sheet_name` are specified). Can also take a sequence of sheet numbers.
284
- sheet_name
285
- Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If
286
- more than one is given then a `{sheetname:frame,}` dict is returned.
287
- table_name
288
- Name of a specific table to read; note that table names are unique across
289
- the workbook, so additionally specifying a sheet id or name is optional;
290
- if one of those parameters *is* specified, an error will be raised if
291
- the named table is not found in that particular sheet.
292
- engine : {'calamine', 'openpyxl', 'xlsx2csv'}
293
- Library used to parse the spreadsheet file; defaults to "calamine".
294
-
295
- * "calamine": this engine can be used for reading all major types of Excel
296
- Workbook (`.xlsx`, `.xlsb`, `.xls`) and is dramatically faster than the
297
- other options, using the `fastexcel` module to bind the Rust-based Calamine
298
- parser.
299
- * "openpyxl": this engine is significantly slower than both `calamine` and
300
- `xlsx2csv`, but can provide a useful fallback if you are otherwise unable
301
- to read data from your workbook.
302
- * "xlsx2csv": converts the data to an in-memory CSV before using the native
303
- polars `read_csv` method to parse the result.
304
- engine_options
305
- Additional options passed to the underlying engine's primary parsing
306
- constructor (given below), if supported:
307
-
308
- * "calamine": n/a (can only provide `read_options`)
309
- * "openpyxl": `load_workbook <https://openpyxl.readthedocs.io/en/stable/api/openpyxl.reader.excel.html#openpyxl.reader.excel.load_workbook>`_
310
- * "xlsx2csv": `Xlsx2csv <https://github.com/dilshod/xlsx2csv/blob/f35734aa453d65102198a77e7b8cd04928e6b3a2/xlsx2csv.py#L157>`_
311
- read_options
312
- Options passed to the underlying engine method that reads the sheet data.
313
- Where supported, this allows for additional control over parsing. The
314
- specific read methods associated with each engine are:
315
-
316
- * "calamine": `load_sheet_by_name <https://fastexcel.toucantoco.dev/fastexcel.html#ExcelReader.load_sheet_by_name>`_
317
- (or `load_table <https://fastexcel.toucantoco.dev/fastexcel.html#ExcelReader.load_table>`_
318
- if using the `table_name` parameter).
319
- * "openpyxl": n/a (can only provide `engine_options`)
320
- * "xlsx2csv": see :meth:`read_csv`
321
- has_header
322
- Indicate if the first row of the table data is a header or not. If False,
323
- column names will be autogenerated in the following format: `column_x`, with
324
- `x` being an enumeration over every column in the dataset, starting at 1.
325
- columns
326
- Columns to read from the sheet; if not specified, all columns are read. Can
327
- be given as a sequence of column names or indices, or a single column name.
328
- schema_overrides
329
- Support type specification or override of one or more columns.
330
- infer_schema_length
331
- The maximum number of rows to scan for schema inference. If set to `None`, the
332
- entire dataset is scanned to determine the dtypes, which can slow parsing for
333
- large workbooks. Note that only the "calamine" and "xlsx2csv" engines support
334
- this parameter.
335
- include_file_paths
336
- Include the path of the source file(s) as a column with this name.
337
- drop_empty_rows
338
- Indicate whether to omit empty rows when reading data into the DataFrame.
339
- drop_empty_cols
340
- Indicate whether to omit empty columns (with no headers) when reading data into
341
- the DataFrame (note that empty column identification may vary depending on the
342
- underlying engine being used).
343
- raise_if_empty
344
- When there is no data in the sheet, `NoDataError` is raised. If this parameter
345
- is set to False, an empty DataFrame (with no columns) is returned instead.
346
-
347
- Returns
348
- -------
349
- DataFrame
350
- If reading a single sheet.
351
- dict
352
- If reading multiple sheets, a "{sheetname: DataFrame, ...}" dict is returned.
353
-
354
- See Also
355
- --------
356
- read_ods
357
-
358
- Notes
359
- -----
360
- * Where possible, prefer the default "calamine" engine for reading Excel Workbooks,
361
- as it is significantly faster than the other options.
362
- * When using the `xlsx2csv` engine the target Excel sheet is first converted
363
- to CSV using `xlsx2csv.Xlsx2csv(source).convert()` and then parsed with Polars'
364
- :func:`read_csv` function. You can pass additional options to `read_options`
365
- to influence this part of the parsing pipeline.
366
- * If you want to read multiple sheets and set *different* options (`read_options`,
367
- `schema_overrides`, etc), you should make separate calls as the options are set
368
- globally, not on a per-sheet basis.
369
-
370
- Examples
371
- --------
372
- Read the "data" worksheet from an Excel file into a DataFrame.
373
-
374
- >>> pl.read_excel(
375
- ... source="test.xlsx",
376
- ... sheet_name="data",
377
- ... ) # doctest: +SKIP
378
-
379
- If the correct dtypes can't be determined, use the `schema_overrides` parameter
380
- to specify them, or increase the inference length with `infer_schema_length`.
381
-
382
- >>> pl.read_excel(
383
- ... source="test.xlsx",
384
- ... schema_overrides={"dt": pl.Date},
385
- ... infer_schema_length=None,
386
- ... ) # doctest: +SKIP
387
-
388
- Using the `xlsx2csv` engine, read table data from sheet 3 in an Excel workbook as a
389
- DataFrame while skipping empty lines in the sheet. As sheet 3 does not have a header
390
- row, you can pass the necessary additional settings for this to the `read_options`
391
- parameter; these will be passed to :func:`read_csv`.
392
-
393
- >>> pl.read_excel(
394
- ... source="test.xlsx",
395
- ... sheet_id=3,
396
- ... engine="xlsx2csv",
397
- ... engine_options={"skip_empty_lines": True},
398
- ... read_options={"has_header": False, "new_columns": ["a", "b", "c"]},
399
- ... ) # doctest: +SKIP
400
- """
401
- sources, read_multiple_workbooks = _sources(source)
402
- frames: list[pl.DataFrame] | list[dict[str, pl.DataFrame]] = [ # type: ignore[assignment]
403
- _read_spreadsheet(
404
- src,
405
- sheet_id=sheet_id,
406
- sheet_name=sheet_name,
407
- table_name=table_name,
408
- engine=engine,
409
- engine_options=engine_options,
410
- read_options=read_options,
411
- schema_overrides=schema_overrides,
412
- infer_schema_length=infer_schema_length,
413
- include_file_paths=include_file_paths,
414
- raise_if_empty=raise_if_empty,
415
- has_header=has_header,
416
- columns=columns,
417
- drop_empty_rows=drop_empty_rows,
418
- drop_empty_cols=drop_empty_cols,
419
- )
420
- for src in sources
421
- ]
422
- return _unpack_read_results(
423
- frames=frames,
424
- read_multiple_workbooks=read_multiple_workbooks,
425
- )
426
-
427
-
428
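Two further usage sketches that follow directly from the parameters documented above (the workbook paths are hypothetical):

>>> # load every sheet of a workbook, returning a {sheetname: DataFrame} dict
>>> pl.read_excel("report.xlsx", sheet_id=0)  # doctest: +SKIP
>>> # read the same sheet from several workbooks via a glob pattern; the frames are
>>> # concatenated and the originating path is recorded in a "source" column
>>> pl.read_excel("reports/*.xlsx", sheet_name="data", include_file_paths="source")  # doctest: +SKIP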
- @overload
429
- def read_ods(
430
- source: FileSource,
431
- *,
432
- sheet_id: None = ...,
433
- sheet_name: str,
434
- has_header: bool = ...,
435
- columns: Sequence[int] | Sequence[str] | None = ...,
436
- schema_overrides: SchemaDict | None = ...,
437
- infer_schema_length: int | None = ...,
438
- include_file_paths: str | None = ...,
439
- drop_empty_rows: bool = ...,
440
- drop_empty_cols: bool = ...,
441
- raise_if_empty: bool = ...,
442
- ) -> pl.DataFrame: ...
443
-
444
-
445
- @overload
446
- def read_ods(
447
- source: FileSource,
448
- *,
449
- sheet_id: None = ...,
450
- sheet_name: None = ...,
451
- has_header: bool = ...,
452
- columns: Sequence[int] | Sequence[str] | None = ...,
453
- schema_overrides: SchemaDict | None = ...,
454
- infer_schema_length: int | None = ...,
455
- include_file_paths: str | None = ...,
456
- drop_empty_rows: bool = ...,
457
- drop_empty_cols: bool = ...,
458
- raise_if_empty: bool = ...,
459
- ) -> pl.DataFrame: ...
460
-
461
-
462
- @overload
463
- def read_ods(
464
- source: FileSource,
465
- *,
466
- sheet_id: int,
467
- sheet_name: str,
468
- has_header: bool = ...,
469
- columns: Sequence[int] | Sequence[str] | None = ...,
470
- schema_overrides: SchemaDict | None = ...,
471
- infer_schema_length: int | None = ...,
472
- include_file_paths: str | None = ...,
473
- drop_empty_rows: bool = ...,
474
- drop_empty_cols: bool = ...,
475
- raise_if_empty: bool = ...,
476
- ) -> NoReturn: ...
477
-
478
-
479
- @overload # type: ignore[overload-overlap]
480
- def read_ods(
481
- source: FileSource,
482
- *,
483
- sheet_id: Literal[0] | Sequence[int],
484
- sheet_name: None = ...,
485
- has_header: bool = ...,
486
- columns: Sequence[int] | Sequence[str] | None = ...,
487
- schema_overrides: SchemaDict | None = ...,
488
- infer_schema_length: int | None = ...,
489
- include_file_paths: str | None = ...,
490
- drop_empty_rows: bool = ...,
491
- drop_empty_cols: bool = ...,
492
- raise_if_empty: bool = ...,
493
- ) -> dict[str, pl.DataFrame]: ...
494
-
495
-
496
- @overload
497
- def read_ods(
498
- source: FileSource,
499
- *,
500
- sheet_id: int,
501
- sheet_name: None = ...,
502
- has_header: bool = ...,
503
- columns: Sequence[int] | Sequence[str] | None = ...,
504
- schema_overrides: SchemaDict | None = ...,
505
- infer_schema_length: int | None = ...,
506
- include_file_paths: str | None = ...,
507
- drop_empty_rows: bool = ...,
508
- drop_empty_cols: bool = ...,
509
- raise_if_empty: bool = ...,
510
- ) -> pl.DataFrame: ...
511
-
512
-
513
- @overload
514
- def read_ods(
515
- source: FileSource,
516
- *,
517
- sheet_id: None = ...,
518
- sheet_name: list[str] | tuple[str],
519
- has_header: bool = ...,
520
- columns: Sequence[int] | Sequence[str] | None = ...,
521
- schema_overrides: SchemaDict | None = ...,
522
- infer_schema_length: int | None = ...,
523
- include_file_paths: str | None = ...,
524
- drop_empty_rows: bool = ...,
525
- drop_empty_cols: bool = ...,
526
- raise_if_empty: bool = ...,
527
- ) -> dict[str, pl.DataFrame]: ...
528
-
529
-
530
- def read_ods(
531
- source: FileSource,
532
- *,
533
- sheet_id: int | Sequence[int] | None = None,
534
- sheet_name: str | list[str] | tuple[str] | None = None,
535
- has_header: bool = True,
536
- columns: Sequence[int] | Sequence[str] | None = None,
537
- schema_overrides: SchemaDict | None = None,
538
- infer_schema_length: int | None = N_INFER_DEFAULT,
539
- include_file_paths: str | None = None,
540
- drop_empty_rows: bool = True,
541
- drop_empty_cols: bool = True,
542
- raise_if_empty: bool = True,
543
- ) -> pl.DataFrame | dict[str, pl.DataFrame]:
544
- """
545
- Read OpenOffice (ODS) spreadsheet data into a DataFrame.
546
-
547
- Parameters
548
- ----------
549
- source
550
- Path to a file or a file-like object (by "file-like object" we refer to objects
551
- that have a `read()` method, such as a file handler like the builtin `open`
552
- function, or a `BytesIO` instance). For file-like objects, the stream position
553
- may not be updated accordingly after reading.
554
- sheet_id
555
- Sheet number(s) to convert, starting from 1 (set `0` to load *all* worksheets
556
- as DataFrames) and return a `{sheetname:frame,}` dict. (Defaults to `1` if
557
- neither this nor `sheet_name` are specified). Can also take a sequence of sheet
558
- numbers.
559
- sheet_name
560
- Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If
561
- more than one is given then a `{sheetname:frame,}` dict is returned.
562
- has_header
563
- Indicate if the first row of the table data is a header or not. If False,
564
- column names will be autogenerated in the following format: `column_x`, with
565
- `x` being an enumeration over every column in the dataset, starting at 1.
566
- columns
567
- Columns to read from the sheet; if not specified, all columns are read. Can
568
- be given as a sequence of column names or indices.
569
- schema_overrides
570
- Support type specification or override of one or more columns.
571
- infer_schema_length
572
- The maximum number of rows to scan for schema inference. If set to `None`, the
573
- entire dataset is scanned to determine the dtypes, which can slow parsing for
574
- large workbooks.
575
- include_file_paths
576
- Include the path of the source file(s) as a column with this name.
577
- drop_empty_rows
578
- Indicate whether to omit empty rows when reading data into the DataFrame.
579
- drop_empty_cols
580
- Indicate whether to omit empty columns (with no headers) when reading data into
581
- the DataFrame (note that empty column identification may vary depending on the
582
- underlying engine being used).
583
- raise_if_empty
584
- When there is no data in the sheet, `NoDataError` is raised. If this parameter
585
- is set to False, an empty DataFrame (with no columns) is returned instead.
586
-
587
- Returns
588
- -------
589
- DataFrame, or a `{sheetname: DataFrame, ...}` dict if reading multiple sheets.
590
-
591
- See Also
592
- --------
593
- read_excel
594
-
595
- Examples
596
- --------
597
- Read the "data" worksheet from an OpenOffice spreadsheet file into a DataFrame.
598
-
599
- >>> pl.read_ods(
600
- ... source="test.ods",
601
- ... sheet_name="data",
602
- ... ) # doctest: +SKIP
603
-
604
- If the correct dtypes can't be determined, use the `schema_overrides` parameter
605
- to specify them, or increase the inference length with `infer_schema_length`.
606
-
607
- >>> pl.read_ods(
608
- ... source="test.ods",
609
- ... sheet_id=3,
610
- ... schema_overrides={"dt": pl.Date},
611
- ... raise_if_empty=False,
612
- ... ) # doctest: +SKIP
613
- """
614
- sources, read_multiple_workbooks = _sources(source)
615
- frames: list[pl.DataFrame] | list[dict[str, pl.DataFrame]] = [ # type: ignore[assignment]
616
- _read_spreadsheet(
617
- src,
618
- sheet_id=sheet_id,
619
- sheet_name=sheet_name,
620
- table_name=None,
621
- engine="calamine",
622
- engine_options={},
623
- read_options=None,
624
- schema_overrides=schema_overrides,
625
- infer_schema_length=infer_schema_length,
626
- include_file_paths=include_file_paths,
627
- raise_if_empty=raise_if_empty,
628
- drop_empty_rows=drop_empty_rows,
629
- drop_empty_cols=drop_empty_cols,
630
- has_header=has_header,
631
- columns=columns,
632
- )
633
- for src in sources
634
- ]
635
- return _unpack_read_results(
636
- frames=frames,
637
- read_multiple_workbooks=read_multiple_workbooks,
638
- )
639
-
640
-
641
- def _read_spreadsheet(
642
- source: str | IO[bytes] | bytes,
643
- *,
644
- sheet_id: int | Sequence[int] | None,
645
- sheet_name: str | Sequence[str] | None,
646
- table_name: str | None,
647
- engine: ExcelSpreadsheetEngine,
648
- engine_options: dict[str, Any] | None = None,
649
- read_options: dict[str, Any] | None = None,
650
- schema_overrides: SchemaDict | None = None,
651
- infer_schema_length: int | None = N_INFER_DEFAULT,
652
- include_file_paths: str | None = None,
653
- columns: Sequence[int] | Sequence[str] | str | None = None,
654
- has_header: bool = True,
655
- raise_if_empty: bool = True,
656
- drop_empty_rows: bool = True,
657
- drop_empty_cols: bool = True,
658
- ) -> pl.DataFrame | dict[str, pl.DataFrame]:
659
- if isinstance(source, str):
660
- source = normalize_filepath(source)
661
- if looks_like_url(source):
662
- source = process_file_url(source)
663
-
664
- if isinstance(columns, str):
665
- columns = [columns]
666
-
667
- read_options = _get_read_options(
668
- read_options,
669
- engine=engine,
670
- columns=columns,
671
- has_header=has_header,
672
- infer_schema_length=infer_schema_length,
673
- )
674
- engine_options = (engine_options or {}).copy()
675
- schema_overrides = dict(schema_overrides or {})
676
-
677
- # establish the reading function, parser, and available worksheets
678
- reader_fn, parser, worksheets = _initialise_spreadsheet_parser(
679
- engine, source, engine_options
680
- )
681
- try:
682
- # parse data from the indicated sheet(s)
683
- sheet_names, return_multiple_sheets = _get_sheet_names(
684
- sheet_id, sheet_name, table_name, worksheets
685
- )
686
- parsed_sheets = {
687
- name: reader_fn(
688
- parser=parser,
689
- sheet_name=name,
690
- schema_overrides=schema_overrides,
691
- read_options=read_options,
692
- raise_if_empty=raise_if_empty,
693
- columns=columns,
694
- table_name=table_name,
695
- drop_empty_rows=drop_empty_rows,
696
- drop_empty_cols=drop_empty_cols,
697
- )
698
- for name in sheet_names
699
- }
700
- finally:
701
- if hasattr(parser, "close"):
702
- parser.close()
703
-
704
- if not parsed_sheets:
705
- param, value = ("id", sheet_id) if sheet_name is None else ("name", sheet_name)
706
- msg = f"no matching sheets found when `sheet_{param}` is {value!r}"
707
- raise ValueError(msg)
708
-
709
- if include_file_paths:
710
- workbook = source if isinstance(source, str) else "in-mem"
711
- parsed_sheets = {
712
- name: frame.with_columns(F.lit(workbook).alias(include_file_paths))
713
- for name, frame in parsed_sheets.items()
714
- }
715
- if return_multiple_sheets:
716
- return parsed_sheets
717
- return next(iter(parsed_sheets.values()))
718
-
719
-
720
- def _get_read_options(
721
- read_options: dict[str, Any] | None,
722
- *,
723
- engine: ExcelSpreadsheetEngine,
724
- columns: Sequence[int] | Sequence[str] | None,
725
- infer_schema_length: int | None,
726
- has_header: bool,
727
- ) -> dict[str, Any]:
728
- """Normalise top-level parameters to engine-specific 'read_options' dict."""
729
- read_options = (read_options or {}).copy()
730
-
731
- if engine == "calamine":
732
- if ("use_columns" in read_options) and columns:
733
- msg = 'cannot specify both `columns` and `read_options["use_columns"]`'
734
- raise ParameterCollisionError(msg)
735
- elif read_options.get("header_row") is not None and has_header is False:
736
- msg = 'the values of `has_header` and `read_options["header_row"]` are not compatible'
737
- raise ParameterCollisionError(msg)
738
- elif ("schema_sample_rows" in read_options) and (
739
- infer_schema_length != N_INFER_DEFAULT
740
- ):
741
- msg = 'cannot specify both `infer_schema_length` and `read_options["schema_sample_rows"]`'
742
- raise ParameterCollisionError(msg)
743
-
744
- read_options["schema_sample_rows"] = infer_schema_length
745
- if has_header is False and "header_row" not in read_options:
746
- read_options["header_row"] = None
747
-
748
- elif engine == "xlsx2csv":
749
- if ("columns" in read_options) and columns:
750
- msg = 'cannot specify both `columns` and `read_options["columns"]`'
751
- raise ParameterCollisionError(msg)
752
- elif (
753
- "has_header" in read_options
754
- and read_options["has_header"] is not has_header
755
- ):
756
- msg = 'the values of `has_header` and `read_options["has_header"]` are not compatible'
757
- raise ParameterCollisionError(msg)
758
- elif ("infer_schema_length" in read_options) and (
759
- infer_schema_length != N_INFER_DEFAULT
760
- ):
761
- msg = 'cannot specify both `infer_schema_length` and `read_options["infer_schema_length"]`'
762
- raise ParameterCollisionError(msg)
763
-
764
- read_options["infer_schema_length"] = infer_schema_length
765
- if "has_header" not in read_options:
766
- read_options["has_header"] = has_header
767
- else:
768
- read_options["infer_schema_length"] = infer_schema_length
769
- read_options["has_header"] = has_header
770
-
771
- return read_options
772
-
773
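A sketch of the collision rule enforced above: selecting columns both through the top-level `columns` parameter and through the engine-level `use_columns` option raises `ParameterCollisionError` (the workbook path is hypothetical):

>>> pl.read_excel(
...     "data.xlsx",
...     columns=["a", "b"],
...     read_options={"use_columns": ["a", "b"]},
... )  # doctest: +SKIP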
-
774
- def _get_sheet_names(
775
- sheet_id: int | Sequence[int] | None,
776
- sheet_name: str | Sequence[str] | None,
777
- table_name: str | None,
778
- worksheets: list[dict[str, Any]],
779
- ) -> tuple[list[str], bool]:
780
- """Establish sheets to read; indicate if we are returning a dict frames."""
781
- if sheet_id is not None and sheet_name is not None:
782
- msg = f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})"
783
- raise ValueError(msg)
784
-
785
- sheet_names = []
786
- if sheet_id is None and sheet_name is None:
787
- name = None if table_name else worksheets[0]["name"]
788
- sheet_names.append(name)
789
- return_multiple_sheets = False
790
- elif sheet_id == 0:
791
- sheet_names.extend(ws["name"] for ws in worksheets)
792
- return_multiple_sheets = True
793
- else:
794
- return_multiple_sheets = (
795
- (isinstance(sheet_name, Sequence) and not isinstance(sheet_name, str))
796
- or isinstance(sheet_id, Sequence)
797
- or sheet_id == 0
798
- )
799
- if names := (
800
- (sheet_name,) if isinstance(sheet_name, str) else sheet_name or ()
801
- ):
802
- known_sheet_names = {ws["name"] for ws in worksheets}
803
- for name in names:
804
- if name not in known_sheet_names:
805
- msg = f"no matching sheet found when `sheet_name` is {name!r}"
806
- raise ValueError(msg)
807
- sheet_names.append(name)
808
- else:
809
- ids = (sheet_id,) if isinstance(sheet_id, int) else sheet_id or ()
810
- sheet_names_by_idx = {
811
- idx: ws["name"]
812
- for idx, ws in enumerate(worksheets, start=1)
813
- if (sheet_id == 0 or ws["index"] in ids or ws["name"] in names)
814
- }
815
- for idx in ids:
816
- if (name := sheet_names_by_idx.get(idx)) is None:
817
- msg = f"no matching sheet found when `sheet_id` is {idx}"
818
- raise ValueError(msg)
819
- sheet_names.append(name)
820
-
821
- return sheet_names, return_multiple_sheets # type: ignore[return-value]
822
-
823
-
824
- def _initialise_spreadsheet_parser(
825
- engine: str | None,
826
- source: str | IO[bytes] | bytes,
827
- engine_options: dict[str, Any],
828
- ) -> tuple[Callable[..., pl.DataFrame], Any, list[dict[str, Any]]]:
829
- """Instantiate the indicated spreadsheet parser and establish related properties."""
830
- if isinstance(source, str) and not Path(source).exists():
831
- raise FileNotFoundError(source)
832
-
833
- if engine == "xlsx2csv": # default
834
- xlsx2csv = import_optional("xlsx2csv")
835
-
836
- # establish sensible defaults for unset options
837
- for option, value in {
838
- "exclude_hidden_sheets": False,
839
- "skip_empty_lines": False,
840
- "skip_hidden_rows": False,
841
- "floatformat": "%f",
842
- }.items():
843
- engine_options.setdefault(option, value)
844
-
845
- if isinstance(source, bytes):
846
- source = BytesIO(source)
847
-
848
- parser = xlsx2csv.Xlsx2csv(source, **engine_options)
849
- sheets = parser.workbook.sheets
850
- return _read_spreadsheet_xlsx2csv, parser, sheets
851
-
852
- elif engine == "openpyxl":
853
- openpyxl = import_optional("openpyxl")
854
- if isinstance(source, bytes):
855
- source = BytesIO(source)
856
-
857
- parser = openpyxl.load_workbook(source, data_only=True, **engine_options)
858
- sheets = [{"index": i + 1, "name": ws.title} for i, ws in enumerate(parser)]
859
- return _read_spreadsheet_openpyxl, parser, sheets
860
-
861
- elif engine == "calamine":
862
- fastexcel = import_optional("fastexcel", min_version="0.7.0")
863
- reading_bytesio, reading_bytes = (
864
- isinstance(source, BytesIO),
865
- isinstance(source, bytes),
866
- )
867
- if (reading_bytesio or reading_bytes) and parse_version(
868
- module_version := fastexcel.__version__
869
- ) < (0, 10):
870
- msg = f"`fastexcel` >= 0.10 is required to read bytes; found {module_version})"
871
- raise ModuleUpgradeRequiredError(msg)
872
-
873
- if reading_bytesio:
874
- source = source.getvalue() # type: ignore[union-attr]
875
- elif isinstance(source, (BufferedReader, TextIOWrapper)):
876
- if "b" not in source.mode:
877
- msg = f"file {source.name!r} must be opened in binary mode"
878
- raise OSError(msg)
879
- elif (filename := source.name) and Path(filename).exists():
880
- source = filename
881
- else:
882
- source = source.read()
883
-
884
- parser = fastexcel.read_excel(source, **engine_options)
885
- sheets = [
886
- {"index": i + 1, "name": nm} for i, nm in enumerate(parser.sheet_names)
887
- ]
888
- return _read_spreadsheet_calamine, parser, sheets
889
-
890
- msg = f"unrecognized engine: {engine!r}"
891
- raise NotImplementedError(msg)
892
-
893
-
894
- def _csv_buffer_to_frame(
895
- csv: StringIO,
896
- *,
897
- separator: str,
898
- read_options: dict[str, Any],
899
- schema_overrides: SchemaDict | None,
900
- drop_empty_rows: bool,
901
- drop_empty_cols: bool,
902
- raise_if_empty: bool,
903
- ) -> pl.DataFrame:
904
- """Translate StringIO buffer containing delimited data as a DataFrame."""
905
- # handle (completely) empty sheet data
906
- if csv.tell() == 0:
907
- return _empty_frame(raise_if_empty)
908
-
909
- # otherwise rewind the buffer and parse as csv
910
- csv.seek(0)
911
-
912
- if read_options is None:
913
- read_options = {}
914
-
915
- date_cols = []
916
- if schema_overrides:
917
- if csv_dtypes := read_options.get("dtypes", {}):
918
- issue_deprecation_warning(
919
- "the `dtypes` parameter for `read_csv` is deprecated. It has been renamed to `schema_overrides`.",
920
- version="0.20.31",
921
- )
922
-
923
- csv_schema_overrides = read_options.get("schema_overrides", csv_dtypes)
924
- if set(csv_schema_overrides).intersection(schema_overrides):
925
- msg = "cannot specify columns in both `schema_overrides` and `read_options['dtypes']`"
926
- raise ParameterCollisionError(msg)
927
-
928
- overrides, schema_overrides = {**csv_schema_overrides, **schema_overrides}, {}
929
- for nm, dtype in overrides.items():
930
- if dtype != Date:
931
- schema_overrides[nm] = dtype
932
- else:
933
- date_cols.append(nm)
934
-
935
- read_options = read_options.copy()
936
- read_options["schema_overrides"] = schema_overrides
937
-
938
- df = _drop_null_data(
939
- df=read_csv(
940
- csv,
941
- separator=separator,
942
- **read_options,
943
- ),
944
- raise_if_empty=raise_if_empty,
945
- drop_empty_rows=drop_empty_rows,
946
- drop_empty_cols=drop_empty_cols,
947
- )
948
- if date_cols:
949
- date_casts, schema = {}, df.schema
950
- for nm in date_cols:
951
- if schema[nm] == String:
952
- date_casts[nm] = (
953
- F.col(nm)
954
- .str.replace(r"(?:[ T]00:00:00(?:\.0+)?)$", "")
955
- .str.to_date()
956
- )
957
- if date_casts:
958
- df = df.with_columns(**date_casts)
959
- return df
960
-
961
-
962
- def _drop_null_data(
963
- df: pl.DataFrame,
964
- *,
965
- raise_if_empty: bool,
966
- drop_empty_rows: bool = True,
967
- drop_empty_cols: bool = True,
968
- ) -> pl.DataFrame:
969
- """If DataFrame contains columns/rows that contain only nulls, drop them."""
970
- null_cols: list[str] = []
971
- if drop_empty_cols:
972
- for col_name in df.columns:
973
- # note that if multiple unnamed columns are found then all but the first one
974
- # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine)
975
- if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name):
976
- col = df[col_name]
977
- if (
978
- col.dtype == Null
979
- or col.null_count() == df.height
980
- or (
981
- col.dtype in NUMERIC_DTYPES
982
- and col.replace(0, None).null_count() == df.height
983
- )
984
- ):
985
- null_cols.append(col_name)
986
- if null_cols:
987
- df = df.drop(*null_cols)
988
-
989
- if df.height == df.width == 0:
990
- return _empty_frame(raise_if_empty)
991
- if drop_empty_rows:
992
- return df.filter(~F.all_horizontal(F.all().is_null()))
993
- return df
994
-
995
-
996
- def _empty_frame(raise_if_empty: bool) -> pl.DataFrame: # noqa: FBT001
997
- if raise_if_empty:
998
- msg = (
999
- "empty Excel sheet"
1000
- "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`."
1001
- )
1002
- raise NoDataError(msg)
1003
- return pl.DataFrame()
1004
-
1005
-
1006
- def _reorder_columns(
1007
- df: pl.DataFrame, columns: Sequence[int] | Sequence[str] | None
1008
- ) -> pl.DataFrame:
1009
- if columns:
1010
- from polars.selectors import by_index, by_name
1011
-
1012
- cols = by_index(*columns) if isinstance(columns[0], int) else by_name(*columns)
1013
- df = df.select(cols)
1014
- return df
1015
-
1016
-
1017
- def _read_spreadsheet_calamine(
1018
- parser: Any,
1019
- *,
1020
- sheet_name: str | None,
1021
- read_options: dict[str, Any],
1022
- schema_overrides: SchemaDict | None,
1023
- columns: Sequence[int] | Sequence[str] | None,
1024
- table_name: str | None = None,
1025
- drop_empty_rows: bool,
1026
- drop_empty_cols: bool,
1027
- raise_if_empty: bool,
1028
- ) -> pl.DataFrame:
1029
- # if we have 'schema_overrides' and a more recent version of `fastexcel`
1030
- # we can pass translated dtypes to the engine to refine the initial parse
1031
- fastexcel = import_optional("fastexcel")
1032
- fastexcel_version = parse_version(original_version := fastexcel.__version__)
1033
-
1034
- if fastexcel_version < (0, 9) and "schema_sample_rows" in read_options:
1035
- msg = f"a more recent version of `fastexcel` is required for 'schema_sample_rows' (>= 0.9; found {original_version})"
1036
- raise ModuleUpgradeRequiredError(msg)
1037
- if fastexcel_version < (0, 10, 2) and "use_columns" in read_options:
1038
- msg = f"a more recent version of `fastexcel` is required for 'use_columns' (>= 0.10.2; found {original_version})"
1039
- raise ModuleUpgradeRequiredError(msg)
1040
- if table_name and fastexcel_version < (0, 12):
1041
- msg = f"a more recent version of `fastexcel` is required for 'table_name' (>= 0.12.0; found {original_version})"
1042
- raise ValueError(msg)
1043
-
1044
- if columns:
1045
- if not isinstance(columns, list):
1046
- columns = list(columns) # type: ignore[assignment]
1047
- read_options["use_columns"] = columns
1048
-
1049
- schema_overrides = schema_overrides or {}
1050
- if read_options.get("schema_sample_rows") == 0:
1051
- # ref: https://github.com/ToucanToco/fastexcel/issues/236
1052
- del read_options["schema_sample_rows"]
1053
- read_options["dtypes"] = (
1054
- "string"
1055
- if fastexcel_version >= (0, 12, 1)
1056
- else dict.fromkeys(range(16384), "string")
1057
- )
1058
- elif schema_overrides and fastexcel_version >= (0, 10):
1059
- parser_dtypes = read_options.get("dtypes", {})
1060
- for name, dtype in schema_overrides.items():
1061
- if name not in parser_dtypes:
1062
- if (base_dtype := dtype.base_type()) in INTEGER_DTYPES:
1063
- parser_dtypes[name] = "int"
1064
- elif base_dtype in FLOAT_DTYPES:
1065
- parser_dtypes[name] = "float"
1066
- elif base_dtype == String:
1067
- parser_dtypes[name] = "string"
1068
- elif base_dtype == Duration:
1069
- parser_dtypes[name] = "duration"
1070
- elif base_dtype == Boolean:
1071
- parser_dtypes[name] = "boolean"
1072
-
1073
- read_options["dtypes"] = parser_dtypes
1074
-
1075
- if fastexcel_version < (0, 11, 2):
1076
- ws = parser.load_sheet_by_name(name=sheet_name, **read_options)
1077
- df = ws.to_polars()
1078
- else:
1079
- if table_name:
1080
- xl_table = parser.load_table(table_name, **read_options)
1081
- if sheet_name and sheet_name != xl_table.sheet_name:
1082
- msg = f"table named {table_name!r} not found in sheet {sheet_name!r}"
1083
- raise RuntimeError(msg)
1084
- df = xl_table.to_polars()
1085
- else:
1086
- ws_arrow = parser.load_sheet_eager(sheet_name, **read_options)
1087
- df = from_arrow(ws_arrow)
1088
-
1089
- if read_options.get("header_row", False) is None and not read_options.get(
1090
- "column_names"
1091
- ):
1092
- df.columns = [f"column_{i}" for i in range(1, df.width + 1)]
1093
-
1094
- df = _drop_null_data(
1095
- df,
1096
- raise_if_empty=raise_if_empty,
1097
- drop_empty_rows=drop_empty_rows,
1098
- drop_empty_cols=drop_empty_cols,
1099
- )
1100
-
1101
- # note: even if we applied parser dtypes we still re-apply schema_overrides
1102
- # natively as we can refine integer/float types, temporal precision, etc.
1103
- if schema_overrides:
1104
- lf, schema = df.lazy(), df.schema
1105
- str_to_temporal, updated_overrides = [], {}
1106
- for nm, tp in schema_overrides.items():
1107
- if schema[nm] != String:
1108
- updated_overrides[nm] = tp
1109
- elif tp == Datetime:
1110
- str_to_temporal.append(
1111
- F.col(nm).str.to_datetime(
1112
- time_unit=getattr(tp, "time_unit", None),
1113
- time_zone=getattr(tp, "time_zone", None),
1114
- )
1115
- )
1116
- elif tp == Date:
1117
- dt_str = F.col(nm).str.replace(r"(?:[ T]00:00:00(?:\.0+)?)$", "")
1118
- str_to_temporal.append(dt_str.str.to_date())
1119
- elif tp == Time:
1120
- str_to_temporal.append(F.col(nm).str.to_time())
1121
- else:
1122
- updated_overrides[nm] = tp
1123
-
1124
- if str_to_temporal:
1125
- lf = lf.with_columns(*str_to_temporal)
1126
- if updated_overrides:
1127
- lf = lf.cast(dtypes=updated_overrides)
1128
- df = lf.collect()
1129
-
1130
- # standardise on string dtype for null columns in empty frame
1131
- if df.is_empty():
1132
- df = df.cast({Null: String})
1133
-
1134
- # further refine dtypes
1135
- type_checks = []
1136
- for c, dtype in df.schema.items():
1137
- if c not in schema_overrides:
1138
- # may read integer data as float; cast back to int where possible.
1139
- if dtype in FLOAT_DTYPES:
1140
- check_cast = [
1141
- F.col(c).floor().eq_missing(F.col(c)) & F.col(c).is_not_nan(),
1142
- F.col(c).cast(Int64),
1143
- ]
1144
- type_checks.append(check_cast)
1145
- # do a similar check for datetime columns that have only 00:00:00 times.
1146
- elif dtype == Datetime:
1147
- check_cast = [
1148
- F.col(c).dt.time().eq(time(0, 0, 0)),
1149
- F.col(c).cast(Date),
1150
- ]
1151
- type_checks.append(check_cast)
1152
-
1153
- if type_checks:
1154
- apply_cast = df.select(d[0].all(ignore_nulls=True) for d in type_checks).row(0)
1155
- if downcast := [
1156
- cast for apply, (_, cast) in zip(apply_cast, type_checks) if apply
1157
- ]:
1158
- df = df.with_columns(*downcast)
1159
-
1160
- return df
1161
-
1162
-
1163
- def _read_spreadsheet_openpyxl(
1164
- parser: Any,
1165
- *,
1166
- sheet_name: str | None,
1167
- read_options: dict[str, Any],
1168
- schema_overrides: SchemaDict | None,
1169
- columns: Sequence[int] | Sequence[str] | None,
1170
- table_name: str | None = None,
1171
- drop_empty_rows: bool,
1172
- drop_empty_cols: bool,
1173
- raise_if_empty: bool,
1174
- ) -> pl.DataFrame:
1175
- """Use the 'openpyxl' library to read data from the given worksheet."""
1176
- infer_schema_length = read_options.pop("infer_schema_length", None)
1177
- has_header = read_options.pop("has_header", True)
1178
- schema_overrides = schema_overrides or {}
1179
- no_inference = infer_schema_length == 0
1180
- header: list[str | None] = []
1181
-
1182
- if table_name and not sheet_name:
1183
- sheet_name, n_tables = None, 0
1184
- for sheet in parser.worksheets:
1185
- n_tables += 1
1186
- if table_name in sheet.tables:
1187
- ws, sheet_name = sheet, sheet.title
1188
- break
1189
- if sheet_name is None:
1190
- msg = (
1191
- f"table named {table_name!r} not found in sheet {sheet_name!r}"
1192
- if n_tables
1193
- else f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
1194
- )
1195
- raise RuntimeError(msg)
1196
- else:
1197
- ws = parser[sheet_name]
1198
-
1199
- # prefer detection of actual table objects; otherwise read
1200
- # data in the used worksheet range, dropping null columns
1201
- if tables := getattr(ws, "tables", None):
1202
- table = tables[table_name] if table_name else next(iter(tables.values()))
1203
- rows = list(ws[table.ref])
1204
- if not rows:
1205
- return _empty_frame(raise_if_empty)
1206
- if has_header:
1207
- header.extend(cell.value for cell in rows.pop(0))
1208
- else:
1209
- header.extend(f"column_{n}" for n in range(1, len(rows[0]) + 1))
1210
- if table.totalsRowCount:
1211
- rows = rows[: -table.totalsRowCount]
1212
- rows_iter = rows
1213
- elif table_name:
1214
- msg = f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
1215
- raise RuntimeError(msg)
1216
- else:
1217
- if not has_header:
1218
- if not (rows_iter := list(ws.iter_rows())):
1219
- return _empty_frame(raise_if_empty)
1220
- n_cols = len(rows_iter[0])
1221
- header = [f"column_{n}" for n in range(1, n_cols + 1)]
1222
- else:
1223
- rows_iter = ws.iter_rows()
1224
- for row in rows_iter:
1225
- row_values = [cell.value for cell in row]
1226
- if any(v is not None for v in row_values):
1227
- header.extend(row_values)
1228
- break
1229
-
1230
- dtype = String if no_inference else None
1231
- series_data = []
1232
- for name, column_data in zip(header, zip(*rows_iter)):
1233
- if name or not drop_empty_cols:
1234
- values = [cell.value for cell in column_data]
1235
- if no_inference or (dtype := schema_overrides.get(name)) == String: # type: ignore[assignment,arg-type]
1236
- # note: if we initialise the series with mixed-type data (eg: str/int)
1237
- # then the non-strings will become null, so we handle the cast here
1238
- values = [str(v) if (v is not None) else v for v in values]
1239
-
1240
- if (tp := schema_overrides.get(name)) in (Date, Datetime, Time): # type: ignore[operator,arg-type]
1241
- s = pl.Series(name, values, strict=False)
1242
- if s.dtype == String:
1243
- if tp == Datetime:
1244
- s = s.str.to_datetime(
1245
- time_unit=getattr(tp, "time_unit", None),
1246
- time_zone=getattr(tp, "time_zone", None),
1247
- )
1248
- elif tp == Date:
1249
- s = s.str.replace(
1250
- r"(?:[ T]00:00:00(?:\.0+)?)$", ""
1251
- ).str.to_date()
1252
- elif tp == Time:
1253
- s = s.str.to_time()
1254
- else:
1255
- s = pl.Series(name, values, dtype=dtype, strict=False)
1256
- series_data.append(s)
1257
-
1258
- names = deduplicate_names(s.name for s in series_data)
1259
- df = pl.DataFrame(
1260
- dict(zip(names, series_data)),
1261
- schema_overrides=schema_overrides,
1262
- infer_schema_length=infer_schema_length,
1263
- strict=False,
1264
- )
1265
- df = _drop_null_data(
1266
- df,
1267
- raise_if_empty=raise_if_empty,
1268
- drop_empty_rows=drop_empty_rows,
1269
- drop_empty_cols=drop_empty_cols,
1270
- )
1271
- df = _reorder_columns(df, columns)
1272
- return df
1273
-
1274
-
1275
- def _read_spreadsheet_xlsx2csv(
1276
- parser: Any,
1277
- *,
1278
- sheet_name: str | None,
1279
- read_options: dict[str, Any],
1280
- schema_overrides: SchemaDict | None,
1281
- columns: Sequence[int] | Sequence[str] | None,
1282
- table_name: str | None = None,
1283
- drop_empty_rows: bool,
1284
- drop_empty_cols: bool,
1285
- raise_if_empty: bool,
1286
- ) -> pl.DataFrame:
1287
- """Use the 'xlsx2csv' library to read data from the given worksheet."""
1288
- if table_name:
1289
- msg = "the `table_name` parameter is not supported by the 'xlsx2csv' engine"
1290
- raise ValueError(msg)
1291
-
1292
- csv_buffer = StringIO()
1293
- with warnings.catch_warnings():
1294
- # xlsx2csv version 0.8.4 throws a DeprecationWarning in Python 3.13
1295
- # https://github.com/dilshod/xlsx2csv/pull/287
1296
- warnings.filterwarnings("ignore", category=DeprecationWarning)
1297
- parser.convert(outfile=csv_buffer, sheetname=sheet_name)
1298
-
1299
- read_options.setdefault("truncate_ragged_lines", True)
1300
- if columns:
1301
- read_options["columns"] = columns
1302
-
1303
- cast_to_boolean = []
1304
- if schema_overrides:
1305
- for col, dtype in schema_overrides.items():
1306
- if dtype == Boolean:
1307
- schema_overrides[col] = UInt8 # type: ignore[index]
1308
- cast_to_boolean.append(F.col(col).cast(Boolean))
1309
-
1310
- df = _csv_buffer_to_frame(
1311
- csv_buffer,
1312
- separator=",",
1313
- read_options=read_options,
1314
- schema_overrides=schema_overrides,
1315
- raise_if_empty=raise_if_empty,
1316
- drop_empty_rows=drop_empty_rows,
1317
- drop_empty_cols=drop_empty_cols,
1318
- )
1319
- if cast_to_boolean:
1320
- df = df.with_columns(*cast_to_boolean)
1321
-
1322
- df = df.rename(_standardize_duplicates)
1323
- return _reorder_columns(df, columns)