polars-runtime-compat 1.34.0b2 (cp39-abi3-macosx_10_12_x86_64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of polars-runtime-compat might be problematic.
- _polars_runtime_compat/.gitkeep +0 -0
- _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
- polars/__init__.py +528 -0
- polars/_cpu_check.py +265 -0
- polars/_dependencies.py +355 -0
- polars/_plr.py +99 -0
- polars/_plr.pyi +2496 -0
- polars/_reexport.py +23 -0
- polars/_typing.py +478 -0
- polars/_utils/__init__.py +37 -0
- polars/_utils/async_.py +102 -0
- polars/_utils/cache.py +176 -0
- polars/_utils/cloud.py +40 -0
- polars/_utils/constants.py +29 -0
- polars/_utils/construction/__init__.py +46 -0
- polars/_utils/construction/dataframe.py +1397 -0
- polars/_utils/construction/other.py +72 -0
- polars/_utils/construction/series.py +560 -0
- polars/_utils/construction/utils.py +118 -0
- polars/_utils/convert.py +224 -0
- polars/_utils/deprecation.py +406 -0
- polars/_utils/getitem.py +457 -0
- polars/_utils/logging.py +11 -0
- polars/_utils/nest_asyncio.py +264 -0
- polars/_utils/parquet.py +15 -0
- polars/_utils/parse/__init__.py +12 -0
- polars/_utils/parse/expr.py +242 -0
- polars/_utils/polars_version.py +19 -0
- polars/_utils/pycapsule.py +53 -0
- polars/_utils/scan.py +27 -0
- polars/_utils/serde.py +63 -0
- polars/_utils/slice.py +215 -0
- polars/_utils/udfs.py +1251 -0
- polars/_utils/unstable.py +63 -0
- polars/_utils/various.py +782 -0
- polars/_utils/wrap.py +25 -0
- polars/api.py +370 -0
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +19 -0
- polars/catalog/unity/client.py +733 -0
- polars/catalog/unity/models.py +152 -0
- polars/config.py +1571 -0
- polars/convert/__init__.py +25 -0
- polars/convert/general.py +1046 -0
- polars/convert/normalize.py +261 -0
- polars/dataframe/__init__.py +5 -0
- polars/dataframe/_html.py +186 -0
- polars/dataframe/frame.py +12582 -0
- polars/dataframe/group_by.py +1067 -0
- polars/dataframe/plotting.py +257 -0
- polars/datatype_expr/__init__.py +5 -0
- polars/datatype_expr/array.py +56 -0
- polars/datatype_expr/datatype_expr.py +304 -0
- polars/datatype_expr/list.py +18 -0
- polars/datatype_expr/struct.py +69 -0
- polars/datatypes/__init__.py +122 -0
- polars/datatypes/_parse.py +195 -0
- polars/datatypes/_utils.py +48 -0
- polars/datatypes/classes.py +1213 -0
- polars/datatypes/constants.py +11 -0
- polars/datatypes/constructor.py +172 -0
- polars/datatypes/convert.py +366 -0
- polars/datatypes/group.py +130 -0
- polars/exceptions.py +230 -0
- polars/expr/__init__.py +7 -0
- polars/expr/array.py +964 -0
- polars/expr/binary.py +346 -0
- polars/expr/categorical.py +306 -0
- polars/expr/datetime.py +2620 -0
- polars/expr/expr.py +11272 -0
- polars/expr/list.py +1408 -0
- polars/expr/meta.py +444 -0
- polars/expr/name.py +321 -0
- polars/expr/string.py +3045 -0
- polars/expr/struct.py +357 -0
- polars/expr/whenthen.py +185 -0
- polars/functions/__init__.py +193 -0
- polars/functions/aggregation/__init__.py +33 -0
- polars/functions/aggregation/horizontal.py +298 -0
- polars/functions/aggregation/vertical.py +341 -0
- polars/functions/as_datatype.py +848 -0
- polars/functions/business.py +138 -0
- polars/functions/col.py +384 -0
- polars/functions/datatype.py +121 -0
- polars/functions/eager.py +524 -0
- polars/functions/escape_regex.py +29 -0
- polars/functions/lazy.py +2751 -0
- polars/functions/len.py +68 -0
- polars/functions/lit.py +210 -0
- polars/functions/random.py +22 -0
- polars/functions/range/__init__.py +19 -0
- polars/functions/range/_utils.py +15 -0
- polars/functions/range/date_range.py +303 -0
- polars/functions/range/datetime_range.py +370 -0
- polars/functions/range/int_range.py +348 -0
- polars/functions/range/linear_space.py +311 -0
- polars/functions/range/time_range.py +287 -0
- polars/functions/repeat.py +301 -0
- polars/functions/whenthen.py +353 -0
- polars/interchange/__init__.py +10 -0
- polars/interchange/buffer.py +77 -0
- polars/interchange/column.py +190 -0
- polars/interchange/dataframe.py +230 -0
- polars/interchange/from_dataframe.py +328 -0
- polars/interchange/protocol.py +303 -0
- polars/interchange/utils.py +170 -0
- polars/io/__init__.py +64 -0
- polars/io/_utils.py +317 -0
- polars/io/avro.py +49 -0
- polars/io/clipboard.py +36 -0
- polars/io/cloud/__init__.py +17 -0
- polars/io/cloud/_utils.py +80 -0
- polars/io/cloud/credential_provider/__init__.py +17 -0
- polars/io/cloud/credential_provider/_builder.py +520 -0
- polars/io/cloud/credential_provider/_providers.py +618 -0
- polars/io/csv/__init__.py +9 -0
- polars/io/csv/_utils.py +38 -0
- polars/io/csv/batched_reader.py +142 -0
- polars/io/csv/functions.py +1495 -0
- polars/io/database/__init__.py +6 -0
- polars/io/database/_arrow_registry.py +70 -0
- polars/io/database/_cursor_proxies.py +147 -0
- polars/io/database/_executor.py +578 -0
- polars/io/database/_inference.py +314 -0
- polars/io/database/_utils.py +144 -0
- polars/io/database/functions.py +516 -0
- polars/io/delta.py +499 -0
- polars/io/iceberg/__init__.py +3 -0
- polars/io/iceberg/_utils.py +697 -0
- polars/io/iceberg/dataset.py +556 -0
- polars/io/iceberg/functions.py +151 -0
- polars/io/ipc/__init__.py +8 -0
- polars/io/ipc/functions.py +514 -0
- polars/io/json/__init__.py +3 -0
- polars/io/json/read.py +101 -0
- polars/io/ndjson.py +332 -0
- polars/io/parquet/__init__.py +17 -0
- polars/io/parquet/field_overwrites.py +140 -0
- polars/io/parquet/functions.py +722 -0
- polars/io/partition.py +491 -0
- polars/io/plugins.py +187 -0
- polars/io/pyarrow_dataset/__init__.py +5 -0
- polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
- polars/io/pyarrow_dataset/functions.py +79 -0
- polars/io/scan_options/__init__.py +5 -0
- polars/io/scan_options/_options.py +59 -0
- polars/io/scan_options/cast_options.py +126 -0
- polars/io/spreadsheet/__init__.py +6 -0
- polars/io/spreadsheet/_utils.py +52 -0
- polars/io/spreadsheet/_write_utils.py +647 -0
- polars/io/spreadsheet/functions.py +1323 -0
- polars/lazyframe/__init__.py +9 -0
- polars/lazyframe/engine_config.py +61 -0
- polars/lazyframe/frame.py +8564 -0
- polars/lazyframe/group_by.py +669 -0
- polars/lazyframe/in_process.py +42 -0
- polars/lazyframe/opt_flags.py +333 -0
- polars/meta/__init__.py +14 -0
- polars/meta/build.py +33 -0
- polars/meta/index_type.py +27 -0
- polars/meta/thread_pool.py +50 -0
- polars/meta/versions.py +120 -0
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +213 -0
- polars/ml/utilities.py +30 -0
- polars/plugins.py +155 -0
- polars/py.typed +0 -0
- polars/pyproject.toml +96 -0
- polars/schema.py +265 -0
- polars/selectors.py +3117 -0
- polars/series/__init__.py +5 -0
- polars/series/array.py +776 -0
- polars/series/binary.py +254 -0
- polars/series/categorical.py +246 -0
- polars/series/datetime.py +2275 -0
- polars/series/list.py +1087 -0
- polars/series/plotting.py +191 -0
- polars/series/series.py +9197 -0
- polars/series/string.py +2367 -0
- polars/series/struct.py +154 -0
- polars/series/utils.py +191 -0
- polars/sql/__init__.py +7 -0
- polars/sql/context.py +677 -0
- polars/sql/functions.py +139 -0
- polars/string_cache.py +185 -0
- polars/testing/__init__.py +13 -0
- polars/testing/asserts/__init__.py +9 -0
- polars/testing/asserts/frame.py +231 -0
- polars/testing/asserts/series.py +219 -0
- polars/testing/asserts/utils.py +12 -0
- polars/testing/parametric/__init__.py +33 -0
- polars/testing/parametric/profiles.py +107 -0
- polars/testing/parametric/strategies/__init__.py +22 -0
- polars/testing/parametric/strategies/_utils.py +14 -0
- polars/testing/parametric/strategies/core.py +615 -0
- polars/testing/parametric/strategies/data.py +452 -0
- polars/testing/parametric/strategies/dtype.py +436 -0
- polars/testing/parametric/strategies/legacy.py +169 -0
- polars/type_aliases.py +24 -0
- polars_runtime_compat-1.34.0b2.dist-info/METADATA +190 -0
- polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
- polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
- polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +20 -0
polars/io/pyarrow_dataset/anonymous_scan.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from functools import partial
+from typing import TYPE_CHECKING
+
+import polars._reexport as pl
+from polars._dependencies import pyarrow as pa
+
+if TYPE_CHECKING:
+    from polars import DataFrame, LazyFrame
+
+
+def _scan_pyarrow_dataset(
+    ds: pa.dataset.Dataset,
+    *,
+    allow_pyarrow_filter: bool = True,
+    batch_size: int | None = None,
+) -> LazyFrame:
+    """
+    Pickle the partially applied function `_scan_pyarrow_dataset_impl`.
+
+    The bytes are then sent to the polars logical plan, and the function is
+    deserialized and run once the plan executes.
+
+    Parameters
+    ----------
+    ds
+        pyarrow dataset
+    allow_pyarrow_filter
+        Allow predicates to be pushed down to pyarrow. This can lead to different
+        results if comparisons are done with null values, as pyarrow handles this
+        differently than polars does.
+    batch_size
+        The maximum row count for scanned pyarrow record batches.
+    """
+    func = partial(_scan_pyarrow_dataset_impl, ds, batch_size=batch_size)
+    return pl.LazyFrame._scan_python_function(
+        ds.schema, func, pyarrow=allow_pyarrow_filter
+    )
+
+
+def _scan_pyarrow_dataset_impl(
+    ds: pa.dataset.Dataset,
+    with_columns: list[str] | None,
+    predicate: str | None,
+    n_rows: int | None,
+    batch_size: int | None,
+) -> DataFrame:
+    """
+    Take the projected columns and materialize an arrow table.
+
+    Parameters
+    ----------
+    ds
+        pyarrow dataset
+    with_columns
+        Columns that are projected
+    predicate
+        pyarrow expression that can be evaluated with eval
+    n_rows
+        Materialize only n rows from the arrow dataset
+    batch_size
+        The maximum row count for scanned pyarrow record batches.
+
+    Warnings
+    --------
+    Don't use this if you accept untrusted user inputs. Predicates will be evaluated
+    with python 'eval'. There is sanitization in place, but it is a possible attack
+    vector.
+
+    Returns
+    -------
+    DataFrame
+    """
+    from polars import from_arrow
+
+    _filter = None
+
+    if predicate:
+        from polars._utils.convert import (
+            to_py_date,
+            to_py_datetime,
+            to_py_time,
+            to_py_timedelta,
+        )
+        from polars.datatypes import Date, Datetime, Duration
+
+        _filter = eval(
+            predicate,
+            {
+                "pa": pa,
+                "Date": Date,
+                "Datetime": Datetime,
+                "Duration": Duration,
+                "to_py_date": to_py_date,
+                "to_py_datetime": to_py_datetime,
+                "to_py_time": to_py_time,
+                "to_py_timedelta": to_py_timedelta,
+            },
+        )
+
+    common_params = {"columns": with_columns, "filter": _filter}
+    if batch_size is not None:
+        common_params["batch_size"] = batch_size
+
+    if n_rows:
+        return from_arrow(ds.head(n_rows, **common_params))  # type: ignore[return-value]
+
+    return from_arrow(ds.to_table(**common_params))  # type: ignore[return-value]
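
The docstring above describes the predicate handoff: polars serializes a pyarrow expression to a string, and `_scan_pyarrow_dataset_impl` rebuilds it with a restricted `eval`. Below is a minimal sketch of that mechanism outside of Polars (the predicate string is hypothetical, standing in for whatever polars actually generates):

```python
import pyarrow as pa
import pyarrow.compute as pc  # also makes `pa.compute` resolvable

# Hypothetical predicate string of the kind that reaches
# `_scan_pyarrow_dataset_impl`; it can only reference names bound in the
# restricted namespace ("pa", the datatype classes, the to_py_* helpers).
predicate = '(pa.compute.field("floats") > 2.0)'

# Evaluating against a namespace that binds only "pa" yields a pyarrow
# compute expression, which `ds.head`/`ds.to_table` accept as `filter=`.
_filter = eval(predicate, {"pa": pa})
assert isinstance(_filter, pc.Expression)
```
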
polars/io/pyarrow_dataset/functions.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from polars._utils.unstable import unstable
+from polars.io.pyarrow_dataset.anonymous_scan import _scan_pyarrow_dataset
+
+if TYPE_CHECKING:
+    from polars import LazyFrame
+    from polars._dependencies import pyarrow as pa
+
+
+@unstable()
+def scan_pyarrow_dataset(
+    source: pa.dataset.Dataset,
+    *,
+    allow_pyarrow_filter: bool = True,
+    batch_size: int | None = None,
+) -> LazyFrame:
+    """
+    Scan a pyarrow dataset.
+
+    .. warning::
+        This functionality is considered **unstable**. It may be changed
+        at any point without it being considered a breaking change.
+
+    This can be useful for connecting to cloud or partitioned datasets.
+
+    Parameters
+    ----------
+    source
+        Pyarrow dataset to scan.
+    allow_pyarrow_filter
+        Allow predicates to be pushed down to pyarrow. This can lead to different
+        results if comparisons are done with null values, as pyarrow handles this
+        differently than polars does.
+    batch_size
+        The maximum row count for scanned pyarrow record batches.
+
+    Warnings
+    --------
+    Don't use this if you accept untrusted user inputs. Predicates will be evaluated
+    with python 'eval'. There is sanitization in place, but it is a possible attack
+    vector.
+    This method can only push down predicates that are allowed by PyArrow
+    (e.g. not the full Polars API).
+
+    If :func:`scan_parquet` works for your source, you should use that instead.
+
+    Notes
+    -----
+    When using partitioning, the appropriate `partitioning` option must be set on
+    `pyarrow.dataset.dataset` before passing to Polars, or the partitioned-on column(s)
+    may not get passed to Polars.
+
+    Examples
+    --------
+    >>> import pyarrow.dataset as ds
+    >>> dset = ds.dataset("s3://my-partitioned-folder/", format="ipc")  # doctest: +SKIP
+    >>> (
+    ...     pl.scan_pyarrow_dataset(dset)
+    ...     .filter("bools")
+    ...     .select("bools", "floats", "date")
+    ...     .collect()
+    ... )  # doctest: +SKIP
+    shape: (1, 3)
+    ┌───────┬────────┬────────────┐
+    │ bools ┆ floats ┆ date       │
+    │ ---   ┆ ---    ┆ ---        │
+    │ bool  ┆ f64    ┆ date       │
+    ╞═══════╪════════╪════════════╡
+    │ true  ┆ 2.0    ┆ 1970-05-04 │
+    └───────┴────────┴────────────┘
+    """
+    return _scan_pyarrow_dataset(
+        source,
+        allow_pyarrow_filter=allow_pyarrow_filter,
+        batch_size=batch_size,
+    )
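
The docstring example above targets S3; a self-contained local variant is sketched below. The `./tmp_dataset` path and column names are illustrative only:

```python
import polars as pl
import pyarrow as pa
import pyarrow.dataset as ds

# Persist a tiny table as a parquet dataset (the path is illustrative).
table = pa.table({"bools": [True, False], "floats": [2.0, 3.5]})
ds.write_dataset(table, "./tmp_dataset", format="parquet")

# Scan lazily; simple predicates can be pushed down to pyarrow,
# subject to `allow_pyarrow_filter`.
dset = ds.dataset("./tmp_dataset", format="parquet")
out = pl.scan_pyarrow_dataset(dset).filter(pl.col("bools")).collect()
print(out)  # one row: bools=true, floats=2.0
```
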
polars/io/scan_options/_options.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from polars._typing import (
+        ColumnMapping,
+        DefaultFieldValues,
+        DeletionFiles,
+        SchemaDict,
+    )
+    from polars.dataframe.frame import DataFrame
+    from polars.io.cloud.credential_provider._builder import CredentialProviderBuilder
+    from polars.io.scan_options.cast_options import ScanCastOptions
+
+from dataclasses import dataclass
+
+
+# TODO: Add `kw_only=True` after 3.9 support dropped
+@dataclass
+class ScanOptions:
+    """
+    Holds scan options that are generic over scan type.
+
+    For internal use. Most of the options will parse into `UnifiedScanArgs`.
+    """
+
+    row_index: tuple[str, int] | None = None
+    # (i64, usize)
+    pre_slice: tuple[int, int] | None = None
+    cast_options: ScanCastOptions | None = None
+    extra_columns: Literal["ignore", "raise"] = "raise"
+    missing_columns: Literal["insert", "raise"] = "raise"
+    include_file_paths: str | None = None
+
+    # For path expansion
+    glob: bool = True
+    hidden_file_prefix: Sequence[str] | None = None
+
+    # Hive
+    # Note: `None` means auto.
+    hive_partitioning: bool | None = None
+    hive_schema: SchemaDict | None = None
+    try_parse_hive_dates: bool = True
+
+    rechunk: bool = False
+    cache: bool = True
+
+    # Cloud
+    storage_options: list[tuple[str, str]] | None = None
+    credential_provider: CredentialProviderBuilder | None = None
+    retries: int = 2
+
+    column_mapping: ColumnMapping | None = None
+    default_values: DefaultFieldValues | None = None
+    deletion_files: DeletionFiles | None = None
+    table_statistics: DataFrame | None = None
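
Since `ScanOptions` is a plain dataclass with defaults on every field, internal callers construct it with only the fields a given scan needs. A brief sketch (the field values are illustrative; the class is internal and subject to change):

```python
from polars.io.scan_options._options import ScanOptions

opts = ScanOptions(
    row_index=("row_nr", 0),  # (column name, starting offset)
    pre_slice=(0, 1_000),     # (offset, length), per the (i64, usize) note
    hive_partitioning=None,   # None means auto-detect
    retries=2,
)
print(opts.missing_columns)  # "raise"
```
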
polars/io/scan_options/cast_options.py
@@ -0,0 +1,126 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+from polars._utils.unstable import issue_unstable_warning
+
+if TYPE_CHECKING:
+    from collections.abc import Collection
+
+    from typing_extensions import TypeAlias
+
+
+FloatCastOption: TypeAlias = Literal["upcast", "downcast"]
+DatetimeCastOption: TypeAlias = Literal["nanosecond-downcast", "convert-timezone"]
+
+_DEFAULT_CAST_OPTIONS_ICEBERG: ScanCastOptions | None = None
+
+
+class ScanCastOptions:
+    """Options for scanning files."""
+
+    def __init__(
+        self,
+        *,
+        integer_cast: Literal["upcast", "forbid"] = "forbid",
+        float_cast: Literal["forbid"]
+        | FloatCastOption
+        | Collection[FloatCastOption] = "forbid",
+        datetime_cast: Literal["forbid"]
+        | DatetimeCastOption
+        | Collection[DatetimeCastOption] = "forbid",
+        missing_struct_fields: Literal["insert", "raise"] = "raise",
+        extra_struct_fields: Literal["ignore", "raise"] = "raise",
+        categorical_to_string: Literal["allow", "forbid"] = "forbid",
+        _internal_call: bool = False,
+    ) -> None:
+        """
+        Common configuration for scanning files.
+
+        .. warning::
+            This functionality is considered **unstable**. It may be changed
+            at any point without it being considered a breaking change.
+
+        Parameters
+        ----------
+        integer_cast
+            Configuration for casting from integer types:
+
+            * `upcast`: Allow lossless casting to wider integer types.
+            * `forbid`: Raises an error if dtypes do not match.
+
+        float_cast
+            Configuration for casting from float types:
+
+            * `upcast`: Allow casting to higher precision float types.
+            * `downcast`: Allow casting to lower precision float types.
+            * `forbid`: Raises an error if dtypes do not match.
+
+        datetime_cast
+            Configuration for casting from datetime types:
+
+            * `nanosecond-downcast`: Allow nanosecond precision datetime to be \
+              downcast to any lower precision. This has a similar effect to \
+              PyArrow's `coerce_int96_timestamp_unit`.
+            * `convert-timezone`: Allow casting to a different timezone.
+            * `forbid`: Raises an error if dtypes do not match.
+
+        missing_struct_fields
+            Configuration for behavior when struct fields defined in the schema
+            are missing from the data:
+
+            * `insert`: Inserts the missing fields.
+            * `raise`: Raises an error.
+
+        extra_struct_fields
+            Configuration for behavior when extra struct fields outside of the
+            defined schema are encountered in the data:
+
+            * `ignore`: Silently ignores.
+            * `raise`: Raises an error.
+
+        categorical_to_string
+            Configuration for behavior when reading in a column whose expected
+            type is string, but the type in the file is categorical:
+
+            * `allow`: Categorical is cast to string.
+            * `forbid`: Raises an error.
+
+        """
+        if not _internal_call:
+            issue_unstable_warning("ScanCastOptions is considered unstable.")
+
+        self.integer_cast = integer_cast
+        self.float_cast = float_cast
+        self.datetime_cast = datetime_cast
+        self.missing_struct_fields = missing_struct_fields
+        self.extra_struct_fields = extra_struct_fields
+        self.categorical_to_string = categorical_to_string
+
+    # Note: We don't cache this here, it's cached on the Rust side.
+    @staticmethod
+    def _default() -> ScanCastOptions:
+        return ScanCastOptions(_internal_call=True)
+
+    @classmethod
+    def _default_iceberg(cls) -> ScanCastOptions:
+        """
+        Default options suitable for Iceberg / Deltalake.
+
+        This in general has all casting options enabled. Note: do not modify the
+        returned config object, it is a cached global object.
+        """
+        global _DEFAULT_CAST_OPTIONS_ICEBERG
+
+        if _DEFAULT_CAST_OPTIONS_ICEBERG is None:
+            _DEFAULT_CAST_OPTIONS_ICEBERG = ScanCastOptions(
+                integer_cast="upcast",
+                float_cast=["upcast", "downcast"],
+                datetime_cast=["nanosecond-downcast", "convert-timezone"],
+                missing_struct_fields="insert",
+                extra_struct_fields="ignore",
+                categorical_to_string="allow",
+                _internal_call=True,
+            )
+
+        return _DEFAULT_CAST_OPTIONS_ICEBERG
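
As a usage sketch, the permissive defaults built by `_default_iceberg()` can be recreated through the public constructor, which emits the unstable-functionality warning. Wiring the result into a scan via `cast_options` (which `pl.scan_parquet` accepts in recent Polars versions) is shown as an assumption, and the file path is hypothetical:

```python
import polars as pl
from polars.io.scan_options.cast_options import ScanCastOptions

# Mirrors _default_iceberg(), but via the public constructor;
# expect an unstable-functionality warning on construction.
opts = ScanCastOptions(
    integer_cast="upcast",
    float_cast=["upcast", "downcast"],
    datetime_cast=["nanosecond-downcast", "convert-timezone"],
    missing_struct_fields="insert",
    extra_struct_fields="ignore",
    categorical_to_string="allow",
)

# Hypothetical file; `cast_options` on scan_parquet is itself unstable.
lf = pl.scan_parquet("data.parquet", cast_options=opts)
```
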
polars/io/spreadsheet/_utils.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, cast
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+
+@contextmanager
+def PortableTemporaryFile(
+    mode: str = "w+b",
+    *,
+    buffering: int = -1,
+    encoding: str | None = None,
+    newline: str | None = None,
+    suffix: str | None = None,
+    prefix: str | None = None,
+    dir: str | Path | None = None,
+    delete: bool = True,
+    errors: str | None = None,
+) -> Iterator[Any]:
+    """
+    Slightly more resilient version of the standard `NamedTemporaryFile`.
+
+    Plays better with Windows when using the 'delete' option.
+    """
+    from tempfile import NamedTemporaryFile
+
+    params = cast(
+        Any,
+        {
+            "mode": mode,
+            "buffering": buffering,
+            "encoding": encoding,
+            "newline": newline,
+            "suffix": suffix,
+            "prefix": prefix,
+            "dir": dir,
+            "delete": False,
+            "errors": errors,
+        },
+    )
+
+    with NamedTemporaryFile(**params) as tmp:
+        try:
+            yield tmp
+        finally:
+            tmp.close()
+            if delete:
+                Path(tmp.name).unlink(missing_ok=True)
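
A brief usage sketch: because the wrapper always passes `delete=False` to `NamedTemporaryFile` and instead unlinks by path in the `finally` block, the file can be reopened by name on Windows while the context is still active (a plain `NamedTemporaryFile(delete=True)` holds the handle exclusively there). The suffix and payload below are illustrative:

```python
from polars.io.spreadsheet._utils import PortableTemporaryFile

with PortableTemporaryFile(suffix=".xlsx") as tmp:
    tmp.write(b"illustrative payload")  # default mode is "w+b"
    tmp.flush()
    # `tmp.name` can now be handed to another writer/reader by path,
    # even on Windows, since deletion is deferred.
    path = tmp.name

# On exit the handle is closed and, because delete=True by default,
# the file is removed via Path(tmp.name).unlink(missing_ok=True).
```
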