polars-runtime-compat 1.34.0b3-cp39-abi3-macosx_11_0_arm64.whl → 1.34.0b5-cp39-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of polars-runtime-compat might be problematic.
- _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
- polars_runtime_compat-1.34.0b5.dist-info/METADATA +35 -0
- polars_runtime_compat-1.34.0b5.dist-info/RECORD +6 -0
- polars/__init__.py +0 -528
- polars/_cpu_check.py +0 -265
- polars/_dependencies.py +0 -355
- polars/_plr.py +0 -99
- polars/_plr.pyi +0 -2496
- polars/_reexport.py +0 -23
- polars/_typing.py +0 -478
- polars/_utils/__init__.py +0 -37
- polars/_utils/async_.py +0 -102
- polars/_utils/cache.py +0 -176
- polars/_utils/cloud.py +0 -40
- polars/_utils/constants.py +0 -29
- polars/_utils/construction/__init__.py +0 -46
- polars/_utils/construction/dataframe.py +0 -1397
- polars/_utils/construction/other.py +0 -72
- polars/_utils/construction/series.py +0 -560
- polars/_utils/construction/utils.py +0 -118
- polars/_utils/convert.py +0 -224
- polars/_utils/deprecation.py +0 -406
- polars/_utils/getitem.py +0 -457
- polars/_utils/logging.py +0 -11
- polars/_utils/nest_asyncio.py +0 -264
- polars/_utils/parquet.py +0 -15
- polars/_utils/parse/__init__.py +0 -12
- polars/_utils/parse/expr.py +0 -242
- polars/_utils/polars_version.py +0 -19
- polars/_utils/pycapsule.py +0 -53
- polars/_utils/scan.py +0 -27
- polars/_utils/serde.py +0 -63
- polars/_utils/slice.py +0 -215
- polars/_utils/udfs.py +0 -1251
- polars/_utils/unstable.py +0 -63
- polars/_utils/various.py +0 -782
- polars/_utils/wrap.py +0 -25
- polars/api.py +0 -370
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +0 -19
- polars/catalog/unity/client.py +0 -733
- polars/catalog/unity/models.py +0 -152
- polars/config.py +0 -1571
- polars/convert/__init__.py +0 -25
- polars/convert/general.py +0 -1046
- polars/convert/normalize.py +0 -261
- polars/dataframe/__init__.py +0 -5
- polars/dataframe/_html.py +0 -186
- polars/dataframe/frame.py +0 -12582
- polars/dataframe/group_by.py +0 -1067
- polars/dataframe/plotting.py +0 -257
- polars/datatype_expr/__init__.py +0 -5
- polars/datatype_expr/array.py +0 -56
- polars/datatype_expr/datatype_expr.py +0 -304
- polars/datatype_expr/list.py +0 -18
- polars/datatype_expr/struct.py +0 -69
- polars/datatypes/__init__.py +0 -122
- polars/datatypes/_parse.py +0 -195
- polars/datatypes/_utils.py +0 -48
- polars/datatypes/classes.py +0 -1213
- polars/datatypes/constants.py +0 -11
- polars/datatypes/constructor.py +0 -172
- polars/datatypes/convert.py +0 -366
- polars/datatypes/group.py +0 -130
- polars/exceptions.py +0 -230
- polars/expr/__init__.py +0 -7
- polars/expr/array.py +0 -964
- polars/expr/binary.py +0 -346
- polars/expr/categorical.py +0 -306
- polars/expr/datetime.py +0 -2620
- polars/expr/expr.py +0 -11272
- polars/expr/list.py +0 -1408
- polars/expr/meta.py +0 -444
- polars/expr/name.py +0 -321
- polars/expr/string.py +0 -3045
- polars/expr/struct.py +0 -357
- polars/expr/whenthen.py +0 -185
- polars/functions/__init__.py +0 -193
- polars/functions/aggregation/__init__.py +0 -33
- polars/functions/aggregation/horizontal.py +0 -298
- polars/functions/aggregation/vertical.py +0 -341
- polars/functions/as_datatype.py +0 -848
- polars/functions/business.py +0 -138
- polars/functions/col.py +0 -384
- polars/functions/datatype.py +0 -121
- polars/functions/eager.py +0 -524
- polars/functions/escape_regex.py +0 -29
- polars/functions/lazy.py +0 -2751
- polars/functions/len.py +0 -68
- polars/functions/lit.py +0 -210
- polars/functions/random.py +0 -22
- polars/functions/range/__init__.py +0 -19
- polars/functions/range/_utils.py +0 -15
- polars/functions/range/date_range.py +0 -303
- polars/functions/range/datetime_range.py +0 -370
- polars/functions/range/int_range.py +0 -348
- polars/functions/range/linear_space.py +0 -311
- polars/functions/range/time_range.py +0 -287
- polars/functions/repeat.py +0 -301
- polars/functions/whenthen.py +0 -353
- polars/interchange/__init__.py +0 -10
- polars/interchange/buffer.py +0 -77
- polars/interchange/column.py +0 -190
- polars/interchange/dataframe.py +0 -230
- polars/interchange/from_dataframe.py +0 -328
- polars/interchange/protocol.py +0 -303
- polars/interchange/utils.py +0 -170
- polars/io/__init__.py +0 -64
- polars/io/_utils.py +0 -317
- polars/io/avro.py +0 -49
- polars/io/clipboard.py +0 -36
- polars/io/cloud/__init__.py +0 -17
- polars/io/cloud/_utils.py +0 -80
- polars/io/cloud/credential_provider/__init__.py +0 -17
- polars/io/cloud/credential_provider/_builder.py +0 -520
- polars/io/cloud/credential_provider/_providers.py +0 -618
- polars/io/csv/__init__.py +0 -9
- polars/io/csv/_utils.py +0 -38
- polars/io/csv/batched_reader.py +0 -142
- polars/io/csv/functions.py +0 -1495
- polars/io/database/__init__.py +0 -6
- polars/io/database/_arrow_registry.py +0 -70
- polars/io/database/_cursor_proxies.py +0 -147
- polars/io/database/_executor.py +0 -578
- polars/io/database/_inference.py +0 -314
- polars/io/database/_utils.py +0 -144
- polars/io/database/functions.py +0 -516
- polars/io/delta.py +0 -499
- polars/io/iceberg/__init__.py +0 -3
- polars/io/iceberg/_utils.py +0 -697
- polars/io/iceberg/dataset.py +0 -556
- polars/io/iceberg/functions.py +0 -151
- polars/io/ipc/__init__.py +0 -8
- polars/io/ipc/functions.py +0 -514
- polars/io/json/__init__.py +0 -3
- polars/io/json/read.py +0 -101
- polars/io/ndjson.py +0 -332
- polars/io/parquet/__init__.py +0 -17
- polars/io/parquet/field_overwrites.py +0 -140
- polars/io/parquet/functions.py +0 -722
- polars/io/partition.py +0 -491
- polars/io/plugins.py +0 -187
- polars/io/pyarrow_dataset/__init__.py +0 -5
- polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
- polars/io/pyarrow_dataset/functions.py +0 -79
- polars/io/scan_options/__init__.py +0 -5
- polars/io/scan_options/_options.py +0 -59
- polars/io/scan_options/cast_options.py +0 -126
- polars/io/spreadsheet/__init__.py +0 -6
- polars/io/spreadsheet/_utils.py +0 -52
- polars/io/spreadsheet/_write_utils.py +0 -647
- polars/io/spreadsheet/functions.py +0 -1323
- polars/lazyframe/__init__.py +0 -9
- polars/lazyframe/engine_config.py +0 -61
- polars/lazyframe/frame.py +0 -8564
- polars/lazyframe/group_by.py +0 -669
- polars/lazyframe/in_process.py +0 -42
- polars/lazyframe/opt_flags.py +0 -333
- polars/meta/__init__.py +0 -14
- polars/meta/build.py +0 -33
- polars/meta/index_type.py +0 -27
- polars/meta/thread_pool.py +0 -50
- polars/meta/versions.py +0 -120
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +0 -213
- polars/ml/utilities.py +0 -30
- polars/plugins.py +0 -155
- polars/py.typed +0 -0
- polars/pyproject.toml +0 -103
- polars/schema.py +0 -265
- polars/selectors.py +0 -3117
- polars/series/__init__.py +0 -5
- polars/series/array.py +0 -776
- polars/series/binary.py +0 -254
- polars/series/categorical.py +0 -246
- polars/series/datetime.py +0 -2275
- polars/series/list.py +0 -1087
- polars/series/plotting.py +0 -191
- polars/series/series.py +0 -9197
- polars/series/string.py +0 -2367
- polars/series/struct.py +0 -154
- polars/series/utils.py +0 -191
- polars/sql/__init__.py +0 -7
- polars/sql/context.py +0 -677
- polars/sql/functions.py +0 -139
- polars/string_cache.py +0 -185
- polars/testing/__init__.py +0 -13
- polars/testing/asserts/__init__.py +0 -9
- polars/testing/asserts/frame.py +0 -231
- polars/testing/asserts/series.py +0 -219
- polars/testing/asserts/utils.py +0 -12
- polars/testing/parametric/__init__.py +0 -33
- polars/testing/parametric/profiles.py +0 -107
- polars/testing/parametric/strategies/__init__.py +0 -22
- polars/testing/parametric/strategies/_utils.py +0 -14
- polars/testing/parametric/strategies/core.py +0 -615
- polars/testing/parametric/strategies/data.py +0 -452
- polars/testing/parametric/strategies/dtype.py +0 -436
- polars/testing/parametric/strategies/legacy.py +0 -169
- polars/type_aliases.py +0 -24
- polars_runtime_compat-1.34.0b3.dist-info/METADATA +0 -190
- polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/WHEEL +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,1323 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
import re
|
|
5
|
-
import warnings
|
|
6
|
-
from collections import defaultdict
|
|
7
|
-
from collections.abc import Sequence
|
|
8
|
-
from datetime import time
|
|
9
|
-
from glob import glob
|
|
10
|
-
from io import BufferedReader, BytesIO, StringIO, TextIOWrapper
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
from typing import IO, TYPE_CHECKING, Any, Callable, NoReturn, overload
|
|
13
|
-
|
|
14
|
-
import polars._reexport as pl
|
|
15
|
-
from polars import from_arrow
|
|
16
|
-
from polars import functions as F
|
|
17
|
-
from polars._dependencies import import_optional
|
|
18
|
-
from polars._utils.deprecation import (
|
|
19
|
-
deprecate_renamed_parameter,
|
|
20
|
-
issue_deprecation_warning,
|
|
21
|
-
)
|
|
22
|
-
from polars._utils.various import deduplicate_names, normalize_filepath, parse_version
|
|
23
|
-
from polars.datatypes import (
|
|
24
|
-
N_INFER_DEFAULT,
|
|
25
|
-
Boolean,
|
|
26
|
-
Date,
|
|
27
|
-
Datetime,
|
|
28
|
-
Duration,
|
|
29
|
-
Int64,
|
|
30
|
-
Null,
|
|
31
|
-
String,
|
|
32
|
-
Time,
|
|
33
|
-
UInt8,
|
|
34
|
-
)
|
|
35
|
-
from polars.datatypes.group import FLOAT_DTYPES, INTEGER_DTYPES, NUMERIC_DTYPES
|
|
36
|
-
from polars.exceptions import (
|
|
37
|
-
ModuleUpgradeRequiredError,
|
|
38
|
-
NoDataError,
|
|
39
|
-
ParameterCollisionError,
|
|
40
|
-
)
|
|
41
|
-
from polars.functions import concat
|
|
42
|
-
from polars.io._utils import looks_like_url, process_file_url
|
|
43
|
-
from polars.io.csv.functions import read_csv
|
|
44
|
-
|
|
45
|
-
if TYPE_CHECKING:
|
|
46
|
-
from typing import Literal
|
|
47
|
-
|
|
48
|
-
from polars._typing import ExcelSpreadsheetEngine, FileSource, SchemaDict
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def _sources(source: FileSource) -> tuple[Any, bool]:
|
|
52
|
-
"""Unpack any glob patterns, standardise file paths."""
|
|
53
|
-
read_multiple_workbooks = True
|
|
54
|
-
sources: list[Any] = []
|
|
55
|
-
|
|
56
|
-
if isinstance(source, memoryview):
|
|
57
|
-
source = source.tobytes()
|
|
58
|
-
if not isinstance(source, Sequence) or isinstance(source, (bytes, str)):
|
|
59
|
-
read_multiple_workbooks = False
|
|
60
|
-
source = [source] # type: ignore[assignment]
|
|
61
|
-
|
|
62
|
-
for src in source: # type: ignore[union-attr]
|
|
63
|
-
if isinstance(src, (str, os.PathLike)) and not Path(src).exists():
|
|
64
|
-
src = os.path.expanduser(str(src)) # noqa: PTH111
|
|
65
|
-
if looks_like_url(src):
|
|
66
|
-
sources.append(src)
|
|
67
|
-
continue
|
|
68
|
-
sources.extend(files := glob(src, recursive=True)) # noqa: PTH207
|
|
69
|
-
if not files:
|
|
70
|
-
msg = f"no workbook found at path {src!r}"
|
|
71
|
-
raise FileNotFoundError(msg)
|
|
72
|
-
read_multiple_workbooks = True
|
|
73
|
-
else:
|
|
74
|
-
if isinstance(src, os.PathLike):
|
|
75
|
-
src = str(src)
|
|
76
|
-
sources.append(src)
|
|
77
|
-
|
|
78
|
-
return sources, read_multiple_workbooks
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def _standardize_duplicates(s: str) -> str:
|
|
82
|
-
"""Standardize columns with '_duplicated_n' names."""
|
|
83
|
-
return re.sub(r"_duplicated_(\d+)", repl=r"\1", string=s)
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def _unpack_read_results(
|
|
87
|
-
frames: list[pl.DataFrame] | list[dict[str, pl.DataFrame]],
|
|
88
|
-
*,
|
|
89
|
-
read_multiple_workbooks: bool,
|
|
90
|
-
) -> Any:
|
|
91
|
-
if not frames:
|
|
92
|
-
msg = "no data found in the given workbook(s) and sheet(s)"
|
|
93
|
-
raise NoDataError(msg)
|
|
94
|
-
|
|
95
|
-
if not read_multiple_workbooks:
|
|
96
|
-
# one sheet from one workbook
|
|
97
|
-
return frames[0]
|
|
98
|
-
|
|
99
|
-
if isinstance(frames[0], pl.DataFrame):
|
|
100
|
-
# one sheet from multiple workbooks
|
|
101
|
-
return concat(frames, how="vertical_relaxed") # type: ignore[type-var]
|
|
102
|
-
else:
|
|
103
|
-
# multiple sheets from multiple workbooks
|
|
104
|
-
sheet_frames = defaultdict(list)
|
|
105
|
-
for res in frames:
|
|
106
|
-
for sheet, df in res.items(): # type: ignore[union-attr]
|
|
107
|
-
sheet_frames[sheet].append(df)
|
|
108
|
-
return {k: concat(v, how="vertical_relaxed") for k, v in sheet_frames.items()}
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
@overload
|
|
112
|
-
def read_excel(
|
|
113
|
-
source: FileSource,
|
|
114
|
-
*,
|
|
115
|
-
sheet_id: None = ...,
|
|
116
|
-
sheet_name: str,
|
|
117
|
-
table_name: str | None = ...,
|
|
118
|
-
engine: ExcelSpreadsheetEngine = ...,
|
|
119
|
-
engine_options: dict[str, Any] | None = ...,
|
|
120
|
-
read_options: dict[str, Any] | None = ...,
|
|
121
|
-
has_header: bool = ...,
|
|
122
|
-
columns: Sequence[int] | Sequence[str] | str | None = ...,
|
|
123
|
-
schema_overrides: SchemaDict | None = ...,
|
|
124
|
-
infer_schema_length: int | None = ...,
|
|
125
|
-
include_file_paths: str | None = ...,
|
|
126
|
-
drop_empty_rows: bool = ...,
|
|
127
|
-
drop_empty_cols: bool = ...,
|
|
128
|
-
raise_if_empty: bool = ...,
|
|
129
|
-
) -> pl.DataFrame: ...
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
@overload
|
|
133
|
-
def read_excel(
|
|
134
|
-
source: FileSource,
|
|
135
|
-
*,
|
|
136
|
-
sheet_id: None = ...,
|
|
137
|
-
sheet_name: None = ...,
|
|
138
|
-
table_name: str | None = ...,
|
|
139
|
-
engine: ExcelSpreadsheetEngine = ...,
|
|
140
|
-
engine_options: dict[str, Any] | None = ...,
|
|
141
|
-
has_header: bool = ...,
|
|
142
|
-
read_options: dict[str, Any] | None = ...,
|
|
143
|
-
columns: Sequence[int] | Sequence[str] | str | None = ...,
|
|
144
|
-
schema_overrides: SchemaDict | None = ...,
|
|
145
|
-
infer_schema_length: int | None = ...,
|
|
146
|
-
include_file_paths: str | None = ...,
|
|
147
|
-
drop_empty_rows: bool = ...,
|
|
148
|
-
drop_empty_cols: bool = ...,
|
|
149
|
-
raise_if_empty: bool = ...,
|
|
150
|
-
) -> pl.DataFrame: ...
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
@overload
|
|
154
|
-
def read_excel(
|
|
155
|
-
source: FileSource,
|
|
156
|
-
*,
|
|
157
|
-
sheet_id: int,
|
|
158
|
-
sheet_name: str,
|
|
159
|
-
table_name: str | None = ...,
|
|
160
|
-
engine: ExcelSpreadsheetEngine = ...,
|
|
161
|
-
engine_options: dict[str, Any] | None = ...,
|
|
162
|
-
read_options: dict[str, Any] | None = ...,
|
|
163
|
-
has_header: bool = ...,
|
|
164
|
-
columns: Sequence[int] | Sequence[str] | str | None = ...,
|
|
165
|
-
schema_overrides: SchemaDict | None = ...,
|
|
166
|
-
infer_schema_length: int | None = ...,
|
|
167
|
-
include_file_paths: str | None = ...,
|
|
168
|
-
drop_empty_rows: bool = ...,
|
|
169
|
-
drop_empty_cols: bool = ...,
|
|
170
|
-
raise_if_empty: bool = ...,
|
|
171
|
-
) -> NoReturn: ...
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
# note: 'ignore' required as mypy thinks that the return value for
|
|
175
|
-
# Literal[0] overlaps with the return value for other integers
|
|
176
|
-
@overload # type: ignore[overload-overlap]
|
|
177
|
-
def read_excel(
|
|
178
|
-
source: FileSource,
|
|
179
|
-
*,
|
|
180
|
-
sheet_id: Literal[0] | Sequence[int],
|
|
181
|
-
sheet_name: None = ...,
|
|
182
|
-
table_name: str | None = ...,
|
|
183
|
-
engine: ExcelSpreadsheetEngine = ...,
|
|
184
|
-
engine_options: dict[str, Any] | None = ...,
|
|
185
|
-
read_options: dict[str, Any] | None = ...,
|
|
186
|
-
has_header: bool = ...,
|
|
187
|
-
columns: Sequence[int] | Sequence[str] | str | None = ...,
|
|
188
|
-
schema_overrides: SchemaDict | None = ...,
|
|
189
|
-
infer_schema_length: int | None = ...,
|
|
190
|
-
include_file_paths: str | None = ...,
|
|
191
|
-
drop_empty_rows: bool = ...,
|
|
192
|
-
drop_empty_cols: bool = ...,
|
|
193
|
-
raise_if_empty: bool = ...,
|
|
194
|
-
) -> dict[str, pl.DataFrame]: ...
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
@overload
|
|
198
|
-
def read_excel(
|
|
199
|
-
source: FileSource,
|
|
200
|
-
*,
|
|
201
|
-
sheet_id: int,
|
|
202
|
-
sheet_name: None = ...,
|
|
203
|
-
table_name: str | None = ...,
|
|
204
|
-
engine: ExcelSpreadsheetEngine = ...,
|
|
205
|
-
engine_options: dict[str, Any] | None = ...,
|
|
206
|
-
read_options: dict[str, Any] | None = ...,
|
|
207
|
-
has_header: bool = ...,
|
|
208
|
-
columns: Sequence[int] | Sequence[str] | str | None = ...,
|
|
209
|
-
schema_overrides: SchemaDict | None = ...,
|
|
210
|
-
infer_schema_length: int | None = ...,
|
|
211
|
-
include_file_paths: str | None = ...,
|
|
212
|
-
drop_empty_rows: bool = ...,
|
|
213
|
-
drop_empty_cols: bool = ...,
|
|
214
|
-
raise_if_empty: bool = ...,
|
|
215
|
-
) -> pl.DataFrame: ...
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
@overload
|
|
219
|
-
def read_excel(
|
|
220
|
-
source: FileSource,
|
|
221
|
-
*,
|
|
222
|
-
sheet_id: None = ...,
|
|
223
|
-
sheet_name: list[str] | tuple[str],
|
|
224
|
-
table_name: str | None = ...,
|
|
225
|
-
engine: ExcelSpreadsheetEngine = ...,
|
|
226
|
-
engine_options: dict[str, Any] | None = ...,
|
|
227
|
-
read_options: dict[str, Any] | None = ...,
|
|
228
|
-
has_header: bool = ...,
|
|
229
|
-
columns: Sequence[int] | Sequence[str] | str | None = ...,
|
|
230
|
-
schema_overrides: SchemaDict | None = ...,
|
|
231
|
-
infer_schema_length: int | None = ...,
|
|
232
|
-
include_file_paths: str | None = ...,
|
|
233
|
-
drop_empty_rows: bool = ...,
|
|
234
|
-
drop_empty_cols: bool = ...,
|
|
235
|
-
raise_if_empty: bool = ...,
|
|
236
|
-
) -> dict[str, pl.DataFrame]: ...
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
@deprecate_renamed_parameter("xlsx2csv_options", "engine_options", version="0.20.6")
|
|
240
|
-
@deprecate_renamed_parameter("read_csv_options", "read_options", version="0.20.7")
|
|
241
|
-
def read_excel(
|
|
242
|
-
source: FileSource,
|
|
243
|
-
*,
|
|
244
|
-
sheet_id: int | Sequence[int] | None = None,
|
|
245
|
-
sheet_name: str | list[str] | tuple[str] | None = None,
|
|
246
|
-
table_name: str | None = None,
|
|
247
|
-
engine: ExcelSpreadsheetEngine = "calamine",
|
|
248
|
-
engine_options: dict[str, Any] | None = None,
|
|
249
|
-
read_options: dict[str, Any] | None = None,
|
|
250
|
-
has_header: bool = True,
|
|
251
|
-
columns: Sequence[int] | Sequence[str] | str | None = None,
|
|
252
|
-
schema_overrides: SchemaDict | None = None,
|
|
253
|
-
infer_schema_length: int | None = N_INFER_DEFAULT,
|
|
254
|
-
include_file_paths: str | None = None,
|
|
255
|
-
drop_empty_rows: bool = True,
|
|
256
|
-
drop_empty_cols: bool = True,
|
|
257
|
-
raise_if_empty: bool = True,
|
|
258
|
-
) -> pl.DataFrame | dict[str, pl.DataFrame]:
|
|
259
|
-
"""
|
|
260
|
-
Read Excel spreadsheet data into a DataFrame.
|
|
261
|
-
|
|
262
|
-
.. versionadded:: 1.20
|
|
263
|
-
Support loading data from named table objects with `table_name` parameter.
|
|
264
|
-
.. versionadded:: 1.18
|
|
265
|
-
Support loading data from a list (or glob pattern) of multiple workbooks.
|
|
266
|
-
.. versionchanged:: 1.0
|
|
267
|
-
Default engine is now "calamine" (was "xlsx2csv").
|
|
268
|
-
.. versionchanged:: 0.20.7
|
|
269
|
-
The `read_csv_options` parameter was renamed `read_options`.
|
|
270
|
-
.. versionchanged:: 0.20.6
|
|
271
|
-
The `xlsx2csv_options` parameter was renamed `engine_options`.
|
|
272
|
-
|
|
273
|
-
Parameters
|
|
274
|
-
----------
|
|
275
|
-
source
|
|
276
|
-
Path(s) to a file or a file-like object (by "file-like object" we refer to
|
|
277
|
-
objects that have a `read()` method, such as a file handler like the builtin
|
|
278
|
-
`open` function, or a `BytesIO` instance). For file-like objects, the stream
|
|
279
|
-
position may not be updated after reading.
|
|
280
|
-
sheet_id
|
|
281
|
-
Sheet number(s) to convert (set `0` to load all sheets as DataFrames) and
|
|
282
|
-
return a `{sheetname:frame,}` dict. (Defaults to `1` if neither this nor
|
|
283
|
-
`sheet_name` are specified). Can also take a sequence of sheet numbers.
|
|
284
|
-
sheet_name
|
|
285
|
-
Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If
|
|
286
|
-
more than one is given then a `{sheetname:frame,}` dict is returned.
|
|
287
|
-
table_name
|
|
288
|
-
Name of a specific table to read; note that table names are unique across
|
|
289
|
-
the workbook, so additionally specifying a sheet id or name is optional;
|
|
290
|
-
if one of those parameters *is* specified, an error will be raised if
|
|
291
|
-
the named table is not found in that particular sheet.
|
|
292
|
-
engine : {'calamine', 'openpyxl', 'xlsx2csv'}
|
|
293
|
-
Library used to parse the spreadsheet file; defaults to "calamine".
|
|
294
|
-
|
|
295
|
-
* "calamine": this engine can be used for reading all major types of Excel
|
|
296
|
-
Workbook (`.xlsx`, `.xlsb`, `.xls`) and is dramatically faster than the
|
|
297
|
-
other options, using the `fastexcel` module to bind the Rust-based Calamine
|
|
298
|
-
parser.
|
|
299
|
-
* "openpyxl": this engine is significantly slower than both `calamine` and
|
|
300
|
-
`xlsx2csv`, but can provide a useful fallback if you are otherwise unable
|
|
301
|
-
to read data from your workbook.
|
|
302
|
-
* "xlsx2csv": converts the data to an in-memory CSV before using the native
|
|
303
|
-
polars `read_csv` method to parse the result.
|
|
304
|
-
engine_options
|
|
305
|
-
Additional options passed to the underlying engine's primary parsing
|
|
306
|
-
constructor (given below), if supported:
|
|
307
|
-
|
|
308
|
-
* "calamine": n/a (can only provide `read_options`)
|
|
309
|
-
* "openpyxl": `load_workbook <https://openpyxl.readthedocs.io/en/stable/api/openpyxl.reader.excel.html#openpyxl.reader.excel.load_workbook>`_
|
|
310
|
-
* "xlsx2csv": `Xlsx2csv <https://github.com/dilshod/xlsx2csv/blob/f35734aa453d65102198a77e7b8cd04928e6b3a2/xlsx2csv.py#L157>`_
|
|
311
|
-
read_options
|
|
312
|
-
Options passed to the underlying engine method that reads the sheet data.
|
|
313
|
-
Where supported, this allows for additional control over parsing. The
|
|
314
|
-
specific read methods associated with each engine are:
|
|
315
|
-
|
|
316
|
-
* "calamine": `load_sheet_by_name <https://fastexcel.toucantoco.dev/fastexcel.html#ExcelReader.load_sheet_by_name>`_
|
|
317
|
-
(or `load_table <https://fastexcel.toucantoco.dev/fastexcel.html#ExcelReader.load_table>`_
|
|
318
|
-
if using the `table_name` parameter).
|
|
319
|
-
* "openpyxl": n/a (can only provide `engine_options`)
|
|
320
|
-
* "xlsx2csv": see :meth:`read_csv`
|
|
321
|
-
has_header
|
|
322
|
-
Indicate if the first row of the table data is a header or not. If False,
|
|
323
|
-
column names will be autogenerated in the following format: `column_x`, with
|
|
324
|
-
`x` being an enumeration over every column in the dataset, starting at 1.
|
|
325
|
-
columns
|
|
326
|
-
Columns to read from the sheet; if not specified, all columns are read. Can
|
|
327
|
-
be given as a sequence of column names or indices, or a single column name.
|
|
328
|
-
schema_overrides
|
|
329
|
-
Support type specification or override of one or more columns.
|
|
330
|
-
infer_schema_length
|
|
331
|
-
The maximum number of rows to scan for schema inference. If set to `None`, the
|
|
332
|
-
entire dataset is scanned to determine the dtypes, which can slow parsing for
|
|
333
|
-
large workbooks. Note that only the "calamine" and "xlsx2csv" engines support
|
|
334
|
-
this parameter.
|
|
335
|
-
include_file_paths
|
|
336
|
-
Include the path of the source file(s) as a column with this name.
|
|
337
|
-
drop_empty_rows
|
|
338
|
-
Indicate whether to omit empty rows when reading data into the DataFrame.
|
|
339
|
-
drop_empty_cols
|
|
340
|
-
Indicate whether to omit empty columns (with no headers) when reading data into
|
|
341
|
-
the DataFrame (note that empty column identification may vary depending on the
|
|
342
|
-
underlying engine being used).
|
|
343
|
-
raise_if_empty
|
|
344
|
-
When there is no data in the sheet,`NoDataError` is raised. If this parameter
|
|
345
|
-
is set to False, an empty DataFrame (with no columns) is returned instead.
|
|
346
|
-
|
|
347
|
-
Returns
|
|
348
|
-
-------
|
|
349
|
-
DataFrame
|
|
350
|
-
If reading a single sheet.
|
|
351
|
-
dict
|
|
352
|
-
If reading multiple sheets, a "{sheetname: DataFrame, ...}" dict is returned.
|
|
353
|
-
|
|
354
|
-
See Also
|
|
355
|
-
--------
|
|
356
|
-
read_ods
|
|
357
|
-
|
|
358
|
-
Notes
|
|
359
|
-
-----
|
|
360
|
-
* Where possible, prefer the default "calamine" engine for reading Excel Workbooks,
|
|
361
|
-
as it is significantly faster than the other options.
|
|
362
|
-
* When using the `xlsx2csv` engine the target Excel sheet is first converted
|
|
363
|
-
to CSV using `xlsx2csv.Xlsx2csv(source).convert()` and then parsed with Polars'
|
|
364
|
-
:func:`read_csv` function. You can pass additional options to `read_options`
|
|
365
|
-
to influence this part of the parsing pipeline.
|
|
366
|
-
* If you want to read multiple sheets and set *different* options (`read_options`,
|
|
367
|
-
`schema_overrides`, etc), you should make separate calls as the options are set
|
|
368
|
-
globally, not on a per-sheet basis.
|
|
369
|
-
|
|
370
|
-
Examples
|
|
371
|
-
--------
|
|
372
|
-
Read the "data" worksheet from an Excel file into a DataFrame.
|
|
373
|
-
|
|
374
|
-
>>> pl.read_excel(
|
|
375
|
-
... source="test.xlsx",
|
|
376
|
-
... sheet_name="data",
|
|
377
|
-
... ) # doctest: +SKIP
|
|
378
|
-
|
|
379
|
-
If the correct dtypes can't be determined, use the `schema_overrides` parameter
|
|
380
|
-
to specify them, or increase the inference length with `infer_schema_length`.
|
|
381
|
-
|
|
382
|
-
>>> pl.read_excel(
|
|
383
|
-
... source="test.xlsx",
|
|
384
|
-
... schema_overrides={"dt": pl.Date},
|
|
385
|
-
... infer_schema_length=None,
|
|
386
|
-
... ) # doctest: +SKIP
|
|
387
|
-
|
|
388
|
-
Using the `xlsx2csv` engine, read table data from sheet 3 in an Excel workbook as a
|
|
389
|
-
DataFrame while skipping empty lines in the sheet. As sheet 3 does not have a header
|
|
390
|
-
row, you can pass the necessary additional settings for this to the `read_options`
|
|
391
|
-
parameter; these will be passed to :func:`read_csv`.
|
|
392
|
-
|
|
393
|
-
>>> pl.read_excel(
|
|
394
|
-
... source="test.xlsx",
|
|
395
|
-
... sheet_id=3,
|
|
396
|
-
... engine="xlsx2csv",
|
|
397
|
-
... engine_options={"skip_empty_lines": True},
|
|
398
|
-
... read_options={"has_header": False, "new_columns": ["a", "b", "c"]},
|
|
399
|
-
... ) # doctest: +SKIP
|
|
400
|
-
"""
|
|
401
|
-
sources, read_multiple_workbooks = _sources(source)
|
|
402
|
-
frames: list[pl.DataFrame] | list[dict[str, pl.DataFrame]] = [ # type: ignore[assignment]
|
|
403
|
-
_read_spreadsheet(
|
|
404
|
-
src,
|
|
405
|
-
sheet_id=sheet_id,
|
|
406
|
-
sheet_name=sheet_name,
|
|
407
|
-
table_name=table_name,
|
|
408
|
-
engine=engine,
|
|
409
|
-
engine_options=engine_options,
|
|
410
|
-
read_options=read_options,
|
|
411
|
-
schema_overrides=schema_overrides,
|
|
412
|
-
infer_schema_length=infer_schema_length,
|
|
413
|
-
include_file_paths=include_file_paths,
|
|
414
|
-
raise_if_empty=raise_if_empty,
|
|
415
|
-
has_header=has_header,
|
|
416
|
-
columns=columns,
|
|
417
|
-
drop_empty_rows=drop_empty_rows,
|
|
418
|
-
drop_empty_cols=drop_empty_cols,
|
|
419
|
-
)
|
|
420
|
-
for src in sources
|
|
421
|
-
]
|
|
422
|
-
return _unpack_read_results(
|
|
423
|
-
frames=frames,
|
|
424
|
-
read_multiple_workbooks=read_multiple_workbooks,
|
|
425
|
-
)
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
@overload
|
|
429
|
-
def read_ods(
|
|
430
|
-
source: FileSource,
|
|
431
|
-
*,
|
|
432
|
-
sheet_id: None = ...,
|
|
433
|
-
sheet_name: str,
|
|
434
|
-
has_header: bool = ...,
|
|
435
|
-
columns: Sequence[int] | Sequence[str] | None = ...,
|
|
436
|
-
schema_overrides: SchemaDict | None = ...,
|
|
437
|
-
infer_schema_length: int | None = ...,
|
|
438
|
-
include_file_paths: str | None = ...,
|
|
439
|
-
drop_empty_rows: bool = ...,
|
|
440
|
-
drop_empty_cols: bool = ...,
|
|
441
|
-
raise_if_empty: bool = ...,
|
|
442
|
-
) -> pl.DataFrame: ...
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
@overload
|
|
446
|
-
def read_ods(
|
|
447
|
-
source: FileSource,
|
|
448
|
-
*,
|
|
449
|
-
sheet_id: None = ...,
|
|
450
|
-
sheet_name: None = ...,
|
|
451
|
-
has_header: bool = ...,
|
|
452
|
-
columns: Sequence[int] | Sequence[str] | None = ...,
|
|
453
|
-
schema_overrides: SchemaDict | None = ...,
|
|
454
|
-
infer_schema_length: int | None = ...,
|
|
455
|
-
include_file_paths: str | None = ...,
|
|
456
|
-
drop_empty_rows: bool = ...,
|
|
457
|
-
drop_empty_cols: bool = ...,
|
|
458
|
-
raise_if_empty: bool = ...,
|
|
459
|
-
) -> pl.DataFrame: ...
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
@overload
|
|
463
|
-
def read_ods(
|
|
464
|
-
source: FileSource,
|
|
465
|
-
*,
|
|
466
|
-
sheet_id: int,
|
|
467
|
-
sheet_name: str,
|
|
468
|
-
has_header: bool = ...,
|
|
469
|
-
columns: Sequence[int] | Sequence[str] | None = ...,
|
|
470
|
-
schema_overrides: SchemaDict | None = ...,
|
|
471
|
-
infer_schema_length: int | None = ...,
|
|
472
|
-
include_file_paths: str | None = ...,
|
|
473
|
-
drop_empty_rows: bool = ...,
|
|
474
|
-
drop_empty_cols: bool = ...,
|
|
475
|
-
raise_if_empty: bool = ...,
|
|
476
|
-
) -> NoReturn: ...
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
@overload # type: ignore[overload-overlap]
|
|
480
|
-
def read_ods(
|
|
481
|
-
source: FileSource,
|
|
482
|
-
*,
|
|
483
|
-
sheet_id: Literal[0] | Sequence[int],
|
|
484
|
-
sheet_name: None = ...,
|
|
485
|
-
has_header: bool = ...,
|
|
486
|
-
columns: Sequence[int] | Sequence[str] | None = ...,
|
|
487
|
-
schema_overrides: SchemaDict | None = ...,
|
|
488
|
-
infer_schema_length: int | None = ...,
|
|
489
|
-
include_file_paths: str | None = ...,
|
|
490
|
-
drop_empty_rows: bool = ...,
|
|
491
|
-
drop_empty_cols: bool = ...,
|
|
492
|
-
raise_if_empty: bool = ...,
|
|
493
|
-
) -> dict[str, pl.DataFrame]: ...
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
@overload
|
|
497
|
-
def read_ods(
|
|
498
|
-
source: FileSource,
|
|
499
|
-
*,
|
|
500
|
-
sheet_id: int,
|
|
501
|
-
sheet_name: None = ...,
|
|
502
|
-
has_header: bool = ...,
|
|
503
|
-
columns: Sequence[int] | Sequence[str] | None = ...,
|
|
504
|
-
schema_overrides: SchemaDict | None = ...,
|
|
505
|
-
infer_schema_length: int | None = ...,
|
|
506
|
-
include_file_paths: str | None = ...,
|
|
507
|
-
drop_empty_rows: bool = ...,
|
|
508
|
-
drop_empty_cols: bool = ...,
|
|
509
|
-
raise_if_empty: bool = ...,
|
|
510
|
-
) -> pl.DataFrame: ...
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
@overload
|
|
514
|
-
def read_ods(
|
|
515
|
-
source: FileSource,
|
|
516
|
-
*,
|
|
517
|
-
sheet_id: None = ...,
|
|
518
|
-
sheet_name: list[str] | tuple[str],
|
|
519
|
-
has_header: bool = ...,
|
|
520
|
-
columns: Sequence[int] | Sequence[str] | None = ...,
|
|
521
|
-
schema_overrides: SchemaDict | None = ...,
|
|
522
|
-
infer_schema_length: int | None = ...,
|
|
523
|
-
include_file_paths: str | None = ...,
|
|
524
|
-
drop_empty_rows: bool = ...,
|
|
525
|
-
drop_empty_cols: bool = ...,
|
|
526
|
-
raise_if_empty: bool = ...,
|
|
527
|
-
) -> dict[str, pl.DataFrame]: ...
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
def read_ods(
|
|
531
|
-
source: FileSource,
|
|
532
|
-
*,
|
|
533
|
-
sheet_id: int | Sequence[int] | None = None,
|
|
534
|
-
sheet_name: str | list[str] | tuple[str] | None = None,
|
|
535
|
-
has_header: bool = True,
|
|
536
|
-
columns: Sequence[int] | Sequence[str] | None = None,
|
|
537
|
-
schema_overrides: SchemaDict | None = None,
|
|
538
|
-
infer_schema_length: int | None = N_INFER_DEFAULT,
|
|
539
|
-
include_file_paths: str | None = None,
|
|
540
|
-
drop_empty_rows: bool = True,
|
|
541
|
-
drop_empty_cols: bool = True,
|
|
542
|
-
raise_if_empty: bool = True,
|
|
543
|
-
) -> pl.DataFrame | dict[str, pl.DataFrame]:
|
|
544
|
-
"""
|
|
545
|
-
Read OpenOffice (ODS) spreadsheet data into a DataFrame.
|
|
546
|
-
|
|
547
|
-
Parameters
|
|
548
|
-
----------
|
|
549
|
-
source
|
|
550
|
-
Path to a file or a file-like object (by "file-like object" we refer to objects
|
|
551
|
-
that have a `read()` method, such as a file handler like the builtin `open`
|
|
552
|
-
function, or a `BytesIO` instance). For file-like objects, the stream position
|
|
553
|
-
may not be updated accordingly after reading.
|
|
554
|
-
sheet_id
|
|
555
|
-
Sheet number(s) to convert, starting from 1 (set `0` to load *all* worksheets
|
|
556
|
-
as DataFrames) and return a `{sheetname:frame,}` dict. (Defaults to `1` if
|
|
557
|
-
neither this nor `sheet_name` are specified). Can also take a sequence of sheet
|
|
558
|
-
numbers.
|
|
559
|
-
sheet_name
|
|
560
|
-
Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If
|
|
561
|
-
more than one is given then a `{sheetname:frame,}` dict is returned.
|
|
562
|
-
has_header
|
|
563
|
-
Indicate if the first row of the table data is a header or not. If False,
|
|
564
|
-
column names will be autogenerated in the following format: `column_x`, with
|
|
565
|
-
`x` being an enumeration over every column in the dataset, starting at 1.
|
|
566
|
-
columns
|
|
567
|
-
Columns to read from the sheet; if not specified, all columns are read. Can
|
|
568
|
-
be given as a sequence of column names or indices.
|
|
569
|
-
schema_overrides
|
|
570
|
-
Support type specification or override of one or more columns.
|
|
571
|
-
infer_schema_length
|
|
572
|
-
The maximum number of rows to scan for schema inference. If set to `None`, the
|
|
573
|
-
entire dataset is scanned to determine the dtypes, which can slow parsing for
|
|
574
|
-
large workbooks.
|
|
575
|
-
include_file_paths
|
|
576
|
-
Include the path of the source file(s) as a column with this name.
|
|
577
|
-
drop_empty_rows
|
|
578
|
-
Indicate whether to omit empty rows when reading data into the DataFrame.
|
|
579
|
-
drop_empty_cols
|
|
580
|
-
Indicate whether to omit empty columns (with no headers) when reading data into
|
|
581
|
-
the DataFrame (note that empty column identification may vary depending on the
|
|
582
|
-
underlying engine being used).
|
|
583
|
-
raise_if_empty
|
|
584
|
-
When there is no data in the sheet,`NoDataError` is raised. If this parameter
|
|
585
|
-
is set to False, an empty DataFrame (with no columns) is returned instead.
|
|
586
|
-
|
|
587
|
-
Returns
|
|
588
|
-
-------
|
|
589
|
-
DataFrame, or a `{sheetname: DataFrame, ...}` dict if reading multiple sheets.
|
|
590
|
-
|
|
591
|
-
See Also
|
|
592
|
-
--------
|
|
593
|
-
read_excel
|
|
594
|
-
|
|
595
|
-
Examples
|
|
596
|
-
--------
|
|
597
|
-
Read the "data" worksheet from an OpenOffice spreadsheet file into a DataFrame.
|
|
598
|
-
|
|
599
|
-
>>> pl.read_ods(
|
|
600
|
-
... source="test.ods",
|
|
601
|
-
... sheet_name="data",
|
|
602
|
-
... ) # doctest: +SKIP
|
|
603
|
-
|
|
604
|
-
If the correct dtypes can't be determined, use the `schema_overrides` parameter
|
|
605
|
-
to specify them, or increase the inference length with `infer_schema_length`.
|
|
606
|
-
|
|
607
|
-
>>> pl.read_ods(
|
|
608
|
-
... source="test.ods",
|
|
609
|
-
... sheet_id=3,
|
|
610
|
-
... schema_overrides={"dt": pl.Date},
|
|
611
|
-
... raise_if_empty=False,
|
|
612
|
-
... ) # doctest: +SKIP
|
|
613
|
-
"""
|
|
614
|
-
sources, read_multiple_workbooks = _sources(source)
|
|
615
|
-
frames: list[pl.DataFrame] | list[dict[str, pl.DataFrame]] = [ # type: ignore[assignment]
|
|
616
|
-
_read_spreadsheet(
|
|
617
|
-
src,
|
|
618
|
-
sheet_id=sheet_id,
|
|
619
|
-
sheet_name=sheet_name,
|
|
620
|
-
table_name=None,
|
|
621
|
-
engine="calamine",
|
|
622
|
-
engine_options={},
|
|
623
|
-
read_options=None,
|
|
624
|
-
schema_overrides=schema_overrides,
|
|
625
|
-
infer_schema_length=infer_schema_length,
|
|
626
|
-
include_file_paths=include_file_paths,
|
|
627
|
-
raise_if_empty=raise_if_empty,
|
|
628
|
-
drop_empty_rows=drop_empty_rows,
|
|
629
|
-
drop_empty_cols=drop_empty_cols,
|
|
630
|
-
has_header=has_header,
|
|
631
|
-
columns=columns,
|
|
632
|
-
)
|
|
633
|
-
for src in sources
|
|
634
|
-
]
|
|
635
|
-
return _unpack_read_results(
|
|
636
|
-
frames=frames,
|
|
637
|
-
read_multiple_workbooks=read_multiple_workbooks,
|
|
638
|
-
)
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
def _read_spreadsheet(
|
|
642
|
-
source: str | IO[bytes] | bytes,
|
|
643
|
-
*,
|
|
644
|
-
sheet_id: int | Sequence[int] | None,
|
|
645
|
-
sheet_name: str | Sequence[str] | None,
|
|
646
|
-
table_name: str | None,
|
|
647
|
-
engine: ExcelSpreadsheetEngine,
|
|
648
|
-
engine_options: dict[str, Any] | None = None,
|
|
649
|
-
read_options: dict[str, Any] | None = None,
|
|
650
|
-
schema_overrides: SchemaDict | None = None,
|
|
651
|
-
infer_schema_length: int | None = N_INFER_DEFAULT,
|
|
652
|
-
include_file_paths: str | None = None,
|
|
653
|
-
columns: Sequence[int] | Sequence[str] | str | None = None,
|
|
654
|
-
has_header: bool = True,
|
|
655
|
-
raise_if_empty: bool = True,
|
|
656
|
-
drop_empty_rows: bool = True,
|
|
657
|
-
drop_empty_cols: bool = True,
|
|
658
|
-
) -> pl.DataFrame | dict[str, pl.DataFrame]:
|
|
659
|
-
if isinstance(source, str):
|
|
660
|
-
source = normalize_filepath(source)
|
|
661
|
-
if looks_like_url(source):
|
|
662
|
-
source = process_file_url(source)
|
|
663
|
-
|
|
664
|
-
if isinstance(columns, str):
|
|
665
|
-
columns = [columns]
|
|
666
|
-
|
|
667
|
-
read_options = _get_read_options(
|
|
668
|
-
read_options,
|
|
669
|
-
engine=engine,
|
|
670
|
-
columns=columns,
|
|
671
|
-
has_header=has_header,
|
|
672
|
-
infer_schema_length=infer_schema_length,
|
|
673
|
-
)
|
|
674
|
-
engine_options = (engine_options or {}).copy()
|
|
675
|
-
schema_overrides = dict(schema_overrides or {})
|
|
676
|
-
|
|
677
|
-
# establish the reading function, parser, and available worksheets
|
|
678
|
-
reader_fn, parser, worksheets = _initialise_spreadsheet_parser(
|
|
679
|
-
engine, source, engine_options
|
|
680
|
-
)
|
|
681
|
-
try:
|
|
682
|
-
# parse data from the indicated sheet(s)
|
|
683
|
-
sheet_names, return_multiple_sheets = _get_sheet_names(
|
|
684
|
-
sheet_id, sheet_name, table_name, worksheets
|
|
685
|
-
)
|
|
686
|
-
parsed_sheets = {
|
|
687
|
-
name: reader_fn(
|
|
688
|
-
parser=parser,
|
|
689
|
-
sheet_name=name,
|
|
690
|
-
schema_overrides=schema_overrides,
|
|
691
|
-
read_options=read_options,
|
|
692
|
-
raise_if_empty=raise_if_empty,
|
|
693
|
-
columns=columns,
|
|
694
|
-
table_name=table_name,
|
|
695
|
-
drop_empty_rows=drop_empty_rows,
|
|
696
|
-
drop_empty_cols=drop_empty_cols,
|
|
697
|
-
)
|
|
698
|
-
for name in sheet_names
|
|
699
|
-
}
|
|
700
|
-
finally:
|
|
701
|
-
if hasattr(parser, "close"):
|
|
702
|
-
parser.close()
|
|
703
|
-
|
|
704
|
-
if not parsed_sheets:
|
|
705
|
-
param, value = ("id", sheet_id) if sheet_name is None else ("name", sheet_name)
|
|
706
|
-
msg = f"no matching sheets found when `sheet_{param}` is {value!r}"
|
|
707
|
-
raise ValueError(msg)
|
|
708
|
-
|
|
709
|
-
if include_file_paths:
|
|
710
|
-
workbook = source if isinstance(source, str) else "in-mem"
|
|
711
|
-
parsed_sheets = {
|
|
712
|
-
name: frame.with_columns(F.lit(workbook).alias(include_file_paths))
|
|
713
|
-
for name, frame in parsed_sheets.items()
|
|
714
|
-
}
|
|
715
|
-
if return_multiple_sheets:
|
|
716
|
-
return parsed_sheets
|
|
717
|
-
return next(iter(parsed_sheets.values()))
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
def _get_read_options(
|
|
721
|
-
read_options: dict[str, Any] | None,
|
|
722
|
-
*,
|
|
723
|
-
engine: ExcelSpreadsheetEngine,
|
|
724
|
-
columns: Sequence[int] | Sequence[str] | None,
|
|
725
|
-
infer_schema_length: int | None,
|
|
726
|
-
has_header: bool,
|
|
727
|
-
) -> dict[str, Any]:
|
|
728
|
-
"""Normalise top-level parameters to engine-specific 'read_options' dict."""
|
|
729
|
-
read_options = (read_options or {}).copy()
|
|
730
|
-
|
|
731
|
-
if engine == "calamine":
|
|
732
|
-
if ("use_columns" in read_options) and columns:
|
|
733
|
-
msg = 'cannot specify both `columns` and `read_options["use_columns"]`'
|
|
734
|
-
raise ParameterCollisionError(msg)
|
|
735
|
-
elif read_options.get("header_row") is not None and has_header is False:
|
|
736
|
-
msg = 'the values of `has_header` and `read_options["header_row"]` are not compatible'
|
|
737
|
-
raise ParameterCollisionError(msg)
|
|
738
|
-
elif ("schema_sample_rows" in read_options) and (
|
|
739
|
-
infer_schema_length != N_INFER_DEFAULT
|
|
740
|
-
):
|
|
741
|
-
msg = 'cannot specify both `infer_schema_length` and `read_options["schema_sample_rows"]`'
|
|
742
|
-
raise ParameterCollisionError(msg)
|
|
743
|
-
|
|
744
|
-
read_options["schema_sample_rows"] = infer_schema_length
|
|
745
|
-
if has_header is False and "header_row" not in read_options:
|
|
746
|
-
read_options["header_row"] = None
|
|
747
|
-
|
|
748
|
-
elif engine == "xlsx2csv":
|
|
749
|
-
if ("columns" in read_options) and columns:
|
|
750
|
-
msg = 'cannot specify both `columns` and `read_options["columns"]`'
|
|
751
|
-
raise ParameterCollisionError(msg)
|
|
752
|
-
elif (
|
|
753
|
-
"has_header" in read_options
|
|
754
|
-
and read_options["has_header"] is not has_header
|
|
755
|
-
):
|
|
756
|
-
msg = 'the values of `has_header` and `read_options["has_header"]` are not compatible'
|
|
757
|
-
raise ParameterCollisionError(msg)
|
|
758
|
-
elif ("infer_schema_length" in read_options) and (
|
|
759
|
-
infer_schema_length != N_INFER_DEFAULT
|
|
760
|
-
):
|
|
761
|
-
msg = 'cannot specify both `infer_schema_length` and `read_options["infer_schema_length"]`'
|
|
762
|
-
raise ParameterCollisionError(msg)
|
|
763
|
-
|
|
764
|
-
read_options["infer_schema_length"] = infer_schema_length
|
|
765
|
-
if "has_header" not in read_options:
|
|
766
|
-
read_options["has_header"] = has_header
|
|
767
|
-
else:
|
|
768
|
-
read_options["infer_schema_length"] = infer_schema_length
|
|
769
|
-
read_options["has_header"] = has_header
|
|
770
|
-
|
|
771
|
-
return read_options
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
def _get_sheet_names(
|
|
775
|
-
sheet_id: int | Sequence[int] | None,
|
|
776
|
-
sheet_name: str | Sequence[str] | None,
|
|
777
|
-
table_name: str | None,
|
|
778
|
-
worksheets: list[dict[str, Any]],
|
|
779
|
-
) -> tuple[list[str], bool]:
|
|
780
|
-
"""Establish sheets to read; indicate if we are returning a dict frames."""
|
|
781
|
-
if sheet_id is not None and sheet_name is not None:
|
|
782
|
-
msg = f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})"
|
|
783
|
-
raise ValueError(msg)
|
|
784
|
-
|
|
785
|
-
sheet_names = []
|
|
786
|
-
if sheet_id is None and sheet_name is None:
|
|
787
|
-
name = None if table_name else worksheets[0]["name"]
|
|
788
|
-
sheet_names.append(name)
|
|
789
|
-
return_multiple_sheets = False
|
|
790
|
-
elif sheet_id == 0:
|
|
791
|
-
sheet_names.extend(ws["name"] for ws in worksheets)
|
|
792
|
-
return_multiple_sheets = True
|
|
793
|
-
else:
|
|
794
|
-
return_multiple_sheets = (
|
|
795
|
-
(isinstance(sheet_name, Sequence) and not isinstance(sheet_name, str))
|
|
796
|
-
or isinstance(sheet_id, Sequence)
|
|
797
|
-
or sheet_id == 0
|
|
798
|
-
)
|
|
799
|
-
if names := (
|
|
800
|
-
(sheet_name,) if isinstance(sheet_name, str) else sheet_name or ()
|
|
801
|
-
):
|
|
802
|
-
known_sheet_names = {ws["name"] for ws in worksheets}
|
|
803
|
-
for name in names:
|
|
804
|
-
if name not in known_sheet_names:
|
|
805
|
-
msg = f"no matching sheet found when `sheet_name` is {name!r}"
|
|
806
|
-
raise ValueError(msg)
|
|
807
|
-
sheet_names.append(name)
|
|
808
|
-
else:
|
|
809
|
-
ids = (sheet_id,) if isinstance(sheet_id, int) else sheet_id or ()
|
|
810
|
-
sheet_names_by_idx = {
|
|
811
|
-
idx: ws["name"]
|
|
812
|
-
for idx, ws in enumerate(worksheets, start=1)
|
|
813
|
-
if (sheet_id == 0 or ws["index"] in ids or ws["name"] in names)
|
|
814
|
-
}
|
|
815
|
-
for idx in ids:
|
|
816
|
-
if (name := sheet_names_by_idx.get(idx)) is None:
|
|
817
|
-
msg = f"no matching sheet found when `sheet_id` is {idx}"
|
|
818
|
-
raise ValueError(msg)
|
|
819
|
-
sheet_names.append(name)
|
|
820
|
-
|
|
821
|
-
return sheet_names, return_multiple_sheets # type: ignore[return-value]
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
def _initialise_spreadsheet_parser(
|
|
825
|
-
engine: str | None,
|
|
826
|
-
source: str | IO[bytes] | bytes,
|
|
827
|
-
engine_options: dict[str, Any],
|
|
828
|
-
) -> tuple[Callable[..., pl.DataFrame], Any, list[dict[str, Any]]]:
|
|
829
|
-
"""Instantiate the indicated spreadsheet parser and establish related properties."""
|
|
830
|
-
if isinstance(source, str) and not Path(source).exists():
|
|
831
|
-
raise FileNotFoundError(source)
|
|
832
|
-
|
|
833
|
-
if engine == "xlsx2csv": # default
|
|
834
|
-
xlsx2csv = import_optional("xlsx2csv")
|
|
835
|
-
|
|
836
|
-
# establish sensible defaults for unset options
|
|
837
|
-
for option, value in {
|
|
838
|
-
"exclude_hidden_sheets": False,
|
|
839
|
-
"skip_empty_lines": False,
|
|
840
|
-
"skip_hidden_rows": False,
|
|
841
|
-
"floatformat": "%f",
|
|
842
|
-
}.items():
|
|
843
|
-
engine_options.setdefault(option, value)
|
|
844
|
-
|
|
845
|
-
if isinstance(source, bytes):
|
|
846
|
-
source = BytesIO(source)
|
|
847
|
-
|
|
848
|
-
parser = xlsx2csv.Xlsx2csv(source, **engine_options)
|
|
849
|
-
sheets = parser.workbook.sheets
|
|
850
|
-
return _read_spreadsheet_xlsx2csv, parser, sheets
|
|
851
|
-
|
|
852
|
-
elif engine == "openpyxl":
|
|
853
|
-
openpyxl = import_optional("openpyxl")
|
|
854
|
-
if isinstance(source, bytes):
|
|
855
|
-
source = BytesIO(source)
|
|
856
|
-
|
|
857
|
-
parser = openpyxl.load_workbook(source, data_only=True, **engine_options)
|
|
858
|
-
sheets = [{"index": i + 1, "name": ws.title} for i, ws in enumerate(parser)]
|
|
859
|
-
return _read_spreadsheet_openpyxl, parser, sheets
|
|
860
|
-
|
|
861
|
-
elif engine == "calamine":
|
|
862
|
-
fastexcel = import_optional("fastexcel", min_version="0.7.0")
|
|
863
|
-
reading_bytesio, reading_bytes = (
|
|
864
|
-
isinstance(source, BytesIO),
|
|
865
|
-
isinstance(source, bytes),
|
|
866
|
-
)
|
|
867
|
-
if (reading_bytesio or reading_bytes) and parse_version(
|
|
868
|
-
module_version := fastexcel.__version__
|
|
869
|
-
) < (0, 10):
|
|
870
|
-
msg = f"`fastexcel` >= 0.10 is required to read bytes; found {module_version})"
|
|
871
|
-
raise ModuleUpgradeRequiredError(msg)
|
|
872
|
-
|
|
873
|
-
if reading_bytesio:
|
|
874
|
-
source = source.getvalue() # type: ignore[union-attr]
|
|
875
|
-
elif isinstance(source, (BufferedReader, TextIOWrapper)):
|
|
876
|
-
if "b" not in source.mode:
|
|
877
|
-
msg = f"file {source.name!r} must be opened in binary mode"
|
|
878
|
-
raise OSError(msg)
|
|
879
|
-
elif (filename := source.name) and Path(filename).exists():
|
|
880
|
-
source = filename
|
|
881
|
-
else:
|
|
882
|
-
source = source.read()
|
|
883
|
-
|
|
884
|
-
parser = fastexcel.read_excel(source, **engine_options)
|
|
885
|
-
sheets = [
|
|
886
|
-
{"index": i + 1, "name": nm} for i, nm in enumerate(parser.sheet_names)
|
|
887
|
-
]
|
|
888
|
-
return _read_spreadsheet_calamine, parser, sheets
|
|
889
|
-
|
|
890
|
-
msg = f"unrecognized engine: {engine!r}"
|
|
891
|
-
raise NotImplementedError(msg)
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
def _csv_buffer_to_frame(
|
|
895
|
-
csv: StringIO,
|
|
896
|
-
*,
|
|
897
|
-
separator: str,
|
|
898
|
-
read_options: dict[str, Any],
|
|
899
|
-
schema_overrides: SchemaDict | None,
|
|
900
|
-
drop_empty_rows: bool,
|
|
901
|
-
drop_empty_cols: bool,
|
|
902
|
-
raise_if_empty: bool,
|
|
903
|
-
) -> pl.DataFrame:
|
|
904
|
-
"""Translate StringIO buffer containing delimited data as a DataFrame."""
|
|
905
|
-
# handle (completely) empty sheet data
|
|
906
|
-
if csv.tell() == 0:
|
|
907
|
-
return _empty_frame(raise_if_empty)
|
|
908
|
-
|
|
909
|
-
# otherwise rewind the buffer and parse as csv
|
|
910
|
-
csv.seek(0)
|
|
911
|
-
|
|
912
|
-
if read_options is None:
|
|
913
|
-
read_options = {}
|
|
914
|
-
|
|
915
|
-
date_cols = []
|
|
916
|
-
if schema_overrides:
|
|
917
|
-
if csv_dtypes := read_options.get("dtypes", {}):
|
|
918
|
-
issue_deprecation_warning(
|
|
919
|
-
"the `dtypes` parameter for `read_csv` is deprecated. It has been renamed to `schema_overrides`.",
|
|
920
|
-
version="0.20.31",
|
|
921
|
-
)
|
|
922
|
-
|
|
923
|
-
csv_schema_overrides = read_options.get("schema_overrides", csv_dtypes)
|
|
924
|
-
if set(csv_schema_overrides).intersection(schema_overrides):
|
|
925
|
-
msg = "cannot specify columns in both `schema_overrides` and `read_options['dtypes']`"
|
|
926
|
-
raise ParameterCollisionError(msg)
|
|
927
|
-
|
|
928
|
-
overrides, schema_overrides = {**csv_schema_overrides, **schema_overrides}, {}
|
|
929
|
-
for nm, dtype in overrides.items():
|
|
930
|
-
if dtype != Date:
|
|
931
|
-
schema_overrides[nm] = dtype
|
|
932
|
-
else:
|
|
933
|
-
date_cols.append(nm)
|
|
934
|
-
|
|
935
|
-
read_options = read_options.copy()
|
|
936
|
-
read_options["schema_overrides"] = schema_overrides
|
|
937
|
-
|
|
938
|
-
df = _drop_null_data(
|
|
939
|
-
df=read_csv(
|
|
940
|
-
csv,
|
|
941
|
-
separator=separator,
|
|
942
|
-
**read_options,
|
|
943
|
-
),
|
|
944
|
-
raise_if_empty=raise_if_empty,
|
|
945
|
-
drop_empty_rows=drop_empty_rows,
|
|
946
|
-
drop_empty_cols=drop_empty_cols,
|
|
947
|
-
)
|
|
948
|
-
if date_cols:
|
|
949
|
-
date_casts, schema = {}, df.schema
|
|
950
|
-
for nm in date_cols:
|
|
951
|
-
if schema[nm] == String:
|
|
952
|
-
date_casts[nm] = (
|
|
953
|
-
F.col(nm)
|
|
954
|
-
.str.replace(r"(?:[ T]00:00:00(?:\.0+)?)$", "")
|
|
955
|
-
.str.to_date()
|
|
956
|
-
)
|
|
957
|
-
if date_casts:
|
|
958
|
-
df = df.with_columns(**date_casts)
|
|
959
|
-
return df
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
def _drop_null_data(
|
|
963
|
-
df: pl.DataFrame,
|
|
964
|
-
*,
|
|
965
|
-
raise_if_empty: bool,
|
|
966
|
-
drop_empty_rows: bool = True,
|
|
967
|
-
drop_empty_cols: bool = True,
|
|
968
|
-
) -> pl.DataFrame:
|
|
969
|
-
"""If DataFrame contains columns/rows that contain only nulls, drop them."""
|
|
970
|
-
null_cols: list[str] = []
|
|
971
|
-
if drop_empty_cols:
|
|
972
|
-
for col_name in df.columns:
|
|
973
|
-
# note that if multiple unnamed columns are found then all but the first one
|
|
974
|
-
# will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine)
|
|
975
|
-
if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name):
|
|
976
|
-
col = df[col_name]
|
|
977
|
-
if (
|
|
978
|
-
col.dtype == Null
|
|
979
|
-
or col.null_count() == df.height
|
|
980
|
-
or (
|
|
981
|
-
col.dtype in NUMERIC_DTYPES
|
|
982
|
-
and col.replace(0, None).null_count() == df.height
|
|
983
|
-
)
|
|
984
|
-
):
|
|
985
|
-
null_cols.append(col_name)
|
|
986
|
-
if null_cols:
|
|
987
|
-
df = df.drop(*null_cols)
|
|
988
|
-
|
|
989
|
-
if df.height == df.width == 0:
|
|
990
|
-
return _empty_frame(raise_if_empty)
|
|
991
|
-
if drop_empty_rows:
|
|
992
|
-
return df.filter(~F.all_horizontal(F.all().is_null()))
|
|
993
|
-
return df
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
def _empty_frame(raise_if_empty: bool) -> pl.DataFrame: # noqa: FBT001
|
|
997
|
-
if raise_if_empty:
|
|
998
|
-
msg = (
|
|
999
|
-
"empty Excel sheet"
|
|
1000
|
-
"\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`."
|
|
1001
|
-
)
|
|
1002
|
-
raise NoDataError(msg)
|
|
1003
|
-
return pl.DataFrame()
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
def _reorder_columns(
|
|
1007
|
-
df: pl.DataFrame, columns: Sequence[int] | Sequence[str] | None
|
|
1008
|
-
) -> pl.DataFrame:
|
|
1009
|
-
if columns:
|
|
1010
|
-
from polars.selectors import by_index, by_name
|
|
1011
|
-
|
|
1012
|
-
cols = by_index(*columns) if isinstance(columns[0], int) else by_name(*columns)
|
|
1013
|
-
df = df.select(cols)
|
|
1014
|
-
return df
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
def _read_spreadsheet_calamine(
|
|
1018
|
-
parser: Any,
|
|
1019
|
-
*,
|
|
1020
|
-
sheet_name: str | None,
|
|
1021
|
-
read_options: dict[str, Any],
|
|
1022
|
-
schema_overrides: SchemaDict | None,
|
|
1023
|
-
columns: Sequence[int] | Sequence[str] | None,
|
|
1024
|
-
table_name: str | None = None,
|
|
1025
|
-
drop_empty_rows: bool,
|
|
1026
|
-
drop_empty_cols: bool,
|
|
1027
|
-
raise_if_empty: bool,
|
|
1028
|
-
) -> pl.DataFrame:
|
|
1029
|
-
# if we have 'schema_overrides' and a more recent version of `fastexcel`
|
|
1030
|
-
# we can pass translated dtypes to the engine to refine the initial parse
|
|
1031
|
-
fastexcel = import_optional("fastexcel")
|
|
1032
|
-
fastexcel_version = parse_version(original_version := fastexcel.__version__)
|
|
1033
|
-
|
|
1034
|
-
if fastexcel_version < (0, 9) and "schema_sample_rows" in read_options:
|
|
1035
|
-
msg = f"a more recent version of `fastexcel` is required for 'schema_sample_rows' (>= 0.9; found {original_version})"
|
|
1036
|
-
raise ModuleUpgradeRequiredError(msg)
|
|
1037
|
-
    if fastexcel_version < (0, 10, 2) and "use_columns" in read_options:
        msg = f"a more recent version of `fastexcel` is required for 'use_columns' (>= 0.10.2; found {original_version})"
        raise ModuleUpgradeRequiredError(msg)
    if table_name and fastexcel_version < (0, 12):
        msg = f"a more recent version of `fastexcel` is required for 'table_name' (>= 0.12.0; found {original_version})"
        raise ValueError(msg)

    if columns:
        if not isinstance(columns, list):
            columns = list(columns)  # type: ignore[assignment]
        read_options["use_columns"] = columns

    schema_overrides = schema_overrides or {}
    if read_options.get("schema_sample_rows") == 0:
        # ref: https://github.com/ToucanToco/fastexcel/issues/236
        del read_options["schema_sample_rows"]
        read_options["dtypes"] = (
            "string"
            if fastexcel_version >= (0, 12, 1)
            else dict.fromkeys(range(16384), "string")
        )
    elif schema_overrides and fastexcel_version >= (0, 10):
        parser_dtypes = read_options.get("dtypes", {})
        for name, dtype in schema_overrides.items():
            if name not in parser_dtypes:
                if (base_dtype := dtype.base_type()) in INTEGER_DTYPES:
                    parser_dtypes[name] = "int"
                elif base_dtype in FLOAT_DTYPES:
                    parser_dtypes[name] = "float"
                elif base_dtype == String:
                    parser_dtypes[name] = "string"
                elif base_dtype == Duration:
                    parser_dtypes[name] = "duration"
                elif base_dtype == Boolean:
                    parser_dtypes[name] = "boolean"

        read_options["dtypes"] = parser_dtypes

    if fastexcel_version < (0, 11, 2):
        ws = parser.load_sheet_by_name(name=sheet_name, **read_options)
        df = ws.to_polars()
    else:
        if table_name:
            xl_table = parser.load_table(table_name, **read_options)
            if sheet_name and sheet_name != xl_table.sheet_name:
                msg = f"table named {table_name!r} not found in sheet {sheet_name!r}"
                raise RuntimeError(msg)
            df = xl_table.to_polars()
        else:
            ws_arrow = parser.load_sheet_eager(sheet_name, **read_options)
            df = from_arrow(ws_arrow)

    if read_options.get("header_row", False) is None and not read_options.get(
        "column_names"
    ):
        df.columns = [f"column_{i}" for i in range(1, df.width + 1)]

    df = _drop_null_data(
        df,
        raise_if_empty=raise_if_empty,
        drop_empty_rows=drop_empty_rows,
        drop_empty_cols=drop_empty_cols,
    )

    # note: even if we applied parser dtypes we still re-apply schema_overrides
    # natively as we can refine integer/float types, temporal precision, etc.
    if schema_overrides:
        lf, schema = df.lazy(), df.schema
        str_to_temporal, updated_overrides = [], {}
        for nm, tp in schema_overrides.items():
            if schema[nm] != String:
                updated_overrides[nm] = tp
            elif tp == Datetime:
                str_to_temporal.append(
                    F.col(nm).str.to_datetime(
                        time_unit=getattr(tp, "time_unit", None),
                        time_zone=getattr(tp, "time_zone", None),
                    )
                )
            elif tp == Date:
                dt_str = F.col(nm).str.replace(r"(?:[ T]00:00:00(?:\.0+)?)$", "")
                str_to_temporal.append(dt_str.str.to_date())
            elif tp == Time:
                str_to_temporal.append(F.col(nm).str.to_time())
            else:
                updated_overrides[nm] = tp

        if str_to_temporal:
            lf = lf.with_columns(*str_to_temporal)
        if updated_overrides:
            lf = lf.cast(dtypes=updated_overrides)
        df = lf.collect()

    # standardise on string dtype for null columns in empty frame
    if df.is_empty():
        df = df.cast({Null: String})

    # further refine dtypes
    type_checks = []
    for c, dtype in df.schema.items():
        if c not in schema_overrides:
            # may read integer data as float; cast back to int where possible.
            if dtype in FLOAT_DTYPES:
                check_cast = [
                    F.col(c).floor().eq_missing(F.col(c)) & F.col(c).is_not_nan(),
                    F.col(c).cast(Int64),
                ]
                type_checks.append(check_cast)
            # do a similar check for datetime columns that have only 00:00:00 times.
            elif dtype == Datetime:
                check_cast = [
                    F.col(c).dt.time().eq(time(0, 0, 0)),
                    F.col(c).cast(Date),
                ]
                type_checks.append(check_cast)

    if type_checks:
        apply_cast = df.select(d[0].all(ignore_nulls=True) for d in type_checks).row(0)
        if downcast := [
            cast for apply, (_, cast) in zip(apply_cast, type_checks) if apply
        ]:
            df = df.with_columns(*downcast)

    return df

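Usage-level sketch of the fastexcel path above: after loading, `schema_overrides` is re-applied natively, and columns without an override may be downcast (whole-number floats to Int64, midnight-only datetimes to Date). The workbook and column names below are hypothetical; `engine` and `schema_overrides` are existing `pl.read_excel` parameters.

import polars as pl

# hypothetical file and columns, shown only to illustrate the dtype refinement
df = pl.read_excel(
    "report.xlsx",                             # assumed workbook
    sheet_name="Sheet1",
    engine="calamine",
    schema_overrides={"amount": pl.Float64},   # keep this column as Float64
)
# columns *not* listed in schema_overrides may come back as Int64 (whole-number
# floats) or Date (datetimes whose time component is always 00:00:00)
print(df.schema)
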
def _read_spreadsheet_openpyxl(
    parser: Any,
    *,
    sheet_name: str | None,
    read_options: dict[str, Any],
    schema_overrides: SchemaDict | None,
    columns: Sequence[int] | Sequence[str] | None,
    table_name: str | None = None,
    drop_empty_rows: bool,
    drop_empty_cols: bool,
    raise_if_empty: bool,
) -> pl.DataFrame:
    """Use the 'openpyxl' library to read data from the given worksheet."""
    infer_schema_length = read_options.pop("infer_schema_length", None)
    has_header = read_options.pop("has_header", True)
    schema_overrides = schema_overrides or {}
    no_inference = infer_schema_length == 0
    header: list[str | None] = []

    if table_name and not sheet_name:
        sheet_name, n_tables = None, 0
        for sheet in parser.worksheets:
            n_tables += 1
            if table_name in sheet.tables:
                ws, sheet_name = sheet, sheet.title
                break
        if sheet_name is None:
            msg = (
                f"table named {table_name!r} not found in sheet {sheet_name!r}"
                if n_tables
                else f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
            )
            raise RuntimeError(msg)
    else:
        ws = parser[sheet_name]

    # prefer detection of actual table objects; otherwise read
    # data in the used worksheet range, dropping null columns
    if tables := getattr(ws, "tables", None):
        table = tables[table_name] if table_name else next(iter(tables.values()))
        rows = list(ws[table.ref])
        if not rows:
            return _empty_frame(raise_if_empty)
        if has_header:
            header.extend(cell.value for cell in rows.pop(0))
        else:
            header.extend(f"column_{n}" for n in range(1, len(rows[0]) + 1))
        if table.totalsRowCount:
            rows = rows[: -table.totalsRowCount]
        rows_iter = rows
    elif table_name:
        msg = f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
        raise RuntimeError(msg)
    else:
        if not has_header:
            if not (rows_iter := list(ws.iter_rows())):
                return _empty_frame(raise_if_empty)
            n_cols = len(rows_iter[0])
            header = [f"column_{n}" for n in range(1, n_cols + 1)]
        else:
            rows_iter = ws.iter_rows()
            for row in rows_iter:
                row_values = [cell.value for cell in row]
                if any(v is not None for v in row_values):
                    header.extend(row_values)
                    break

    dtype = String if no_inference else None
    series_data = []
    for name, column_data in zip(header, zip(*rows_iter)):
        if name or not drop_empty_cols:
            values = [cell.value for cell in column_data]
            if no_inference or (dtype := schema_overrides.get(name)) == String:  # type: ignore[assignment,arg-type]
                # note: if we initialise the series with mixed-type data (eg: str/int)
                # then the non-strings will become null, so we handle the cast here
                values = [str(v) if (v is not None) else v for v in values]

            if (tp := schema_overrides.get(name)) in (Date, Datetime, Time):  # type: ignore[operator,arg-type]
                s = pl.Series(name, values, strict=False)
                if s.dtype == String:
                    if tp == Datetime:
                        s = s.str.to_datetime(
                            time_unit=getattr(tp, "time_unit", None),
                            time_zone=getattr(tp, "time_zone", None),
                        )
                    elif tp == Date:
                        s = s.str.replace(
                            r"(?:[ T]00:00:00(?:\.0+)?)$", ""
                        ).str.to_date()
                    elif tp == Time:
                        s = s.str.to_time()
            else:
                s = pl.Series(name, values, dtype=dtype, strict=False)
            series_data.append(s)

    names = deduplicate_names(s.name for s in series_data)
    df = pl.DataFrame(
        dict(zip(names, series_data)),
        schema_overrides=schema_overrides,
        infer_schema_length=infer_schema_length,
        strict=False,
    )
    df = _drop_null_data(
        df,
        raise_if_empty=raise_if_empty,
        drop_empty_rows=drop_empty_rows,
        drop_empty_cols=drop_empty_cols,
    )
    df = _reorder_columns(df, columns)
    return df

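Usage-level sketch of the openpyxl path above: with no header row the helper synthesises `column_1..column_n` names, and temporal overrides on string cells are parsed via the `str.to_date`/`to_datetime`/`to_time` branches. The file and column names are hypothetical; `engine`, `has_header`, and `schema_overrides` are existing `pl.read_excel` parameters.

import polars as pl

# hypothetical file; the column names below are illustrative only
df = pl.read_excel(
    "export.xlsx",
    sheet_name="data",
    engine="openpyxl",
    has_header=False,                        # helper synthesises column_1..column_n
    schema_overrides={"column_2": pl.Date},  # string cells parsed via str.to_date()
)
print(df.columns)  # ['column_1', 'column_2', ...]
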
def _read_spreadsheet_xlsx2csv(
    parser: Any,
    *,
    sheet_name: str | None,
    read_options: dict[str, Any],
    schema_overrides: SchemaDict | None,
    columns: Sequence[int] | Sequence[str] | None,
    table_name: str | None = None,
    drop_empty_rows: bool,
    drop_empty_cols: bool,
    raise_if_empty: bool,
) -> pl.DataFrame:
    """Use the 'xlsx2csv' library to read data from the given worksheet."""
    if table_name:
        msg = "the `table_name` parameter is not supported by the 'xlsx2csv' engine"
        raise ValueError(msg)

    csv_buffer = StringIO()
    with warnings.catch_warnings():
        # xlsx2csv version 0.8.4 throws a DeprecationWarning in Python 3.13
        # https://github.com/dilshod/xlsx2csv/pull/287
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        parser.convert(outfile=csv_buffer, sheetname=sheet_name)

    read_options.setdefault("truncate_ragged_lines", True)
    if columns:
        read_options["columns"] = columns

    cast_to_boolean = []
    if schema_overrides:
        for col, dtype in schema_overrides.items():
            if dtype == Boolean:
                schema_overrides[col] = UInt8  # type: ignore[index]
                cast_to_boolean.append(F.col(col).cast(Boolean))

    df = _csv_buffer_to_frame(
        csv_buffer,
        separator=",",
        read_options=read_options,
        schema_overrides=schema_overrides,
        raise_if_empty=raise_if_empty,
        drop_empty_rows=drop_empty_rows,
        drop_empty_cols=drop_empty_cols,
    )
    if cast_to_boolean:
        df = df.with_columns(*cast_to_boolean)

    df = df.rename(_standardize_duplicates)
    return _reorder_columns(df, columns)
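Usage-level sketch of the xlsx2csv path above: a Boolean override is routed through UInt8 while the intermediate CSV is parsed, then cast back to Boolean. The file and column names are hypothetical; `engine` and `schema_overrides` are existing `pl.read_excel` parameters.

import polars as pl

# hypothetical file/column; illustrates the Boolean override handling above
df = pl.read_excel(
    "flags.xlsx",
    sheet_name="Sheet1",
    engine="xlsx2csv",
    schema_overrides={"active": pl.Boolean},  # 0/1 cells -> Boolean
)
print(df.get_column("active").dtype)  # Boolean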