polars-runtime-compat 1.34.0b3-cp39-abi3-manylinux_2_24_aarch64.whl → 1.34.0b4-cp39-abi3-manylinux_2_24_aarch64.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of polars-runtime-compat might be problematic.
- _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
- polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
- polars/__init__.py +0 -528
- polars/_cpu_check.py +0 -265
- polars/_dependencies.py +0 -355
- polars/_plr.py +0 -99
- polars/_plr.pyi +0 -2496
- polars/_reexport.py +0 -23
- polars/_typing.py +0 -478
- polars/_utils/__init__.py +0 -37
- polars/_utils/async_.py +0 -102
- polars/_utils/cache.py +0 -176
- polars/_utils/cloud.py +0 -40
- polars/_utils/constants.py +0 -29
- polars/_utils/construction/__init__.py +0 -46
- polars/_utils/construction/dataframe.py +0 -1397
- polars/_utils/construction/other.py +0 -72
- polars/_utils/construction/series.py +0 -560
- polars/_utils/construction/utils.py +0 -118
- polars/_utils/convert.py +0 -224
- polars/_utils/deprecation.py +0 -406
- polars/_utils/getitem.py +0 -457
- polars/_utils/logging.py +0 -11
- polars/_utils/nest_asyncio.py +0 -264
- polars/_utils/parquet.py +0 -15
- polars/_utils/parse/__init__.py +0 -12
- polars/_utils/parse/expr.py +0 -242
- polars/_utils/polars_version.py +0 -19
- polars/_utils/pycapsule.py +0 -53
- polars/_utils/scan.py +0 -27
- polars/_utils/serde.py +0 -63
- polars/_utils/slice.py +0 -215
- polars/_utils/udfs.py +0 -1251
- polars/_utils/unstable.py +0 -63
- polars/_utils/various.py +0 -782
- polars/_utils/wrap.py +0 -25
- polars/api.py +0 -370
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +0 -19
- polars/catalog/unity/client.py +0 -733
- polars/catalog/unity/models.py +0 -152
- polars/config.py +0 -1571
- polars/convert/__init__.py +0 -25
- polars/convert/general.py +0 -1046
- polars/convert/normalize.py +0 -261
- polars/dataframe/__init__.py +0 -5
- polars/dataframe/_html.py +0 -186
- polars/dataframe/frame.py +0 -12582
- polars/dataframe/group_by.py +0 -1067
- polars/dataframe/plotting.py +0 -257
- polars/datatype_expr/__init__.py +0 -5
- polars/datatype_expr/array.py +0 -56
- polars/datatype_expr/datatype_expr.py +0 -304
- polars/datatype_expr/list.py +0 -18
- polars/datatype_expr/struct.py +0 -69
- polars/datatypes/__init__.py +0 -122
- polars/datatypes/_parse.py +0 -195
- polars/datatypes/_utils.py +0 -48
- polars/datatypes/classes.py +0 -1213
- polars/datatypes/constants.py +0 -11
- polars/datatypes/constructor.py +0 -172
- polars/datatypes/convert.py +0 -366
- polars/datatypes/group.py +0 -130
- polars/exceptions.py +0 -230
- polars/expr/__init__.py +0 -7
- polars/expr/array.py +0 -964
- polars/expr/binary.py +0 -346
- polars/expr/categorical.py +0 -306
- polars/expr/datetime.py +0 -2620
- polars/expr/expr.py +0 -11272
- polars/expr/list.py +0 -1408
- polars/expr/meta.py +0 -444
- polars/expr/name.py +0 -321
- polars/expr/string.py +0 -3045
- polars/expr/struct.py +0 -357
- polars/expr/whenthen.py +0 -185
- polars/functions/__init__.py +0 -193
- polars/functions/aggregation/__init__.py +0 -33
- polars/functions/aggregation/horizontal.py +0 -298
- polars/functions/aggregation/vertical.py +0 -341
- polars/functions/as_datatype.py +0 -848
- polars/functions/business.py +0 -138
- polars/functions/col.py +0 -384
- polars/functions/datatype.py +0 -121
- polars/functions/eager.py +0 -524
- polars/functions/escape_regex.py +0 -29
- polars/functions/lazy.py +0 -2751
- polars/functions/len.py +0 -68
- polars/functions/lit.py +0 -210
- polars/functions/random.py +0 -22
- polars/functions/range/__init__.py +0 -19
- polars/functions/range/_utils.py +0 -15
- polars/functions/range/date_range.py +0 -303
- polars/functions/range/datetime_range.py +0 -370
- polars/functions/range/int_range.py +0 -348
- polars/functions/range/linear_space.py +0 -311
- polars/functions/range/time_range.py +0 -287
- polars/functions/repeat.py +0 -301
- polars/functions/whenthen.py +0 -353
- polars/interchange/__init__.py +0 -10
- polars/interchange/buffer.py +0 -77
- polars/interchange/column.py +0 -190
- polars/interchange/dataframe.py +0 -230
- polars/interchange/from_dataframe.py +0 -328
- polars/interchange/protocol.py +0 -303
- polars/interchange/utils.py +0 -170
- polars/io/__init__.py +0 -64
- polars/io/_utils.py +0 -317
- polars/io/avro.py +0 -49
- polars/io/clipboard.py +0 -36
- polars/io/cloud/__init__.py +0 -17
- polars/io/cloud/_utils.py +0 -80
- polars/io/cloud/credential_provider/__init__.py +0 -17
- polars/io/cloud/credential_provider/_builder.py +0 -520
- polars/io/cloud/credential_provider/_providers.py +0 -618
- polars/io/csv/__init__.py +0 -9
- polars/io/csv/_utils.py +0 -38
- polars/io/csv/batched_reader.py +0 -142
- polars/io/csv/functions.py +0 -1495
- polars/io/database/__init__.py +0 -6
- polars/io/database/_arrow_registry.py +0 -70
- polars/io/database/_cursor_proxies.py +0 -147
- polars/io/database/_executor.py +0 -578
- polars/io/database/_inference.py +0 -314
- polars/io/database/_utils.py +0 -144
- polars/io/database/functions.py +0 -516
- polars/io/delta.py +0 -499
- polars/io/iceberg/__init__.py +0 -3
- polars/io/iceberg/_utils.py +0 -697
- polars/io/iceberg/dataset.py +0 -556
- polars/io/iceberg/functions.py +0 -151
- polars/io/ipc/__init__.py +0 -8
- polars/io/ipc/functions.py +0 -514
- polars/io/json/__init__.py +0 -3
- polars/io/json/read.py +0 -101
- polars/io/ndjson.py +0 -332
- polars/io/parquet/__init__.py +0 -17
- polars/io/parquet/field_overwrites.py +0 -140
- polars/io/parquet/functions.py +0 -722
- polars/io/partition.py +0 -491
- polars/io/plugins.py +0 -187
- polars/io/pyarrow_dataset/__init__.py +0 -5
- polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
- polars/io/pyarrow_dataset/functions.py +0 -79
- polars/io/scan_options/__init__.py +0 -5
- polars/io/scan_options/_options.py +0 -59
- polars/io/scan_options/cast_options.py +0 -126
- polars/io/spreadsheet/__init__.py +0 -6
- polars/io/spreadsheet/_utils.py +0 -52
- polars/io/spreadsheet/_write_utils.py +0 -647
- polars/io/spreadsheet/functions.py +0 -1323
- polars/lazyframe/__init__.py +0 -9
- polars/lazyframe/engine_config.py +0 -61
- polars/lazyframe/frame.py +0 -8564
- polars/lazyframe/group_by.py +0 -669
- polars/lazyframe/in_process.py +0 -42
- polars/lazyframe/opt_flags.py +0 -333
- polars/meta/__init__.py +0 -14
- polars/meta/build.py +0 -33
- polars/meta/index_type.py +0 -27
- polars/meta/thread_pool.py +0 -50
- polars/meta/versions.py +0 -120
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +0 -213
- polars/ml/utilities.py +0 -30
- polars/plugins.py +0 -155
- polars/py.typed +0 -0
- polars/pyproject.toml +0 -103
- polars/schema.py +0 -265
- polars/selectors.py +0 -3117
- polars/series/__init__.py +0 -5
- polars/series/array.py +0 -776
- polars/series/binary.py +0 -254
- polars/series/categorical.py +0 -246
- polars/series/datetime.py +0 -2275
- polars/series/list.py +0 -1087
- polars/series/plotting.py +0 -191
- polars/series/series.py +0 -9197
- polars/series/string.py +0 -2367
- polars/series/struct.py +0 -154
- polars/series/utils.py +0 -191
- polars/sql/__init__.py +0 -7
- polars/sql/context.py +0 -677
- polars/sql/functions.py +0 -139
- polars/string_cache.py +0 -185
- polars/testing/__init__.py +0 -13
- polars/testing/asserts/__init__.py +0 -9
- polars/testing/asserts/frame.py +0 -231
- polars/testing/asserts/series.py +0 -219
- polars/testing/asserts/utils.py +0 -12
- polars/testing/parametric/__init__.py +0 -33
- polars/testing/parametric/profiles.py +0 -107
- polars/testing/parametric/strategies/__init__.py +0 -22
- polars/testing/parametric/strategies/_utils.py +0 -14
- polars/testing/parametric/strategies/core.py +0 -615
- polars/testing/parametric/strategies/data.py +0 -452
- polars/testing/parametric/strategies/dtype.py +0 -436
- polars/testing/parametric/strategies/legacy.py +0 -169
- polars/type_aliases.py +0 -24
- polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
polars/_utils/construction/dataframe.py
@@ -1,1397 +0,0 @@
-from __future__ import annotations
-
-import contextlib
-from collections.abc import Generator, Mapping, Sequence
-from datetime import date, datetime, time, timedelta
-from functools import singledispatch
-from itertools import islice, zip_longest
-from operator import itemgetter
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-)
-
-import polars._reexport as pl
-import polars._utils.construction as plc
-from polars import functions as F
-from polars._dependencies import (
-    _NUMPY_AVAILABLE,
-    _PYARROW_AVAILABLE,
-    _check_for_numpy,
-    _check_for_pandas,
-    dataclasses,
-)
-from polars._dependencies import numpy as np
-from polars._dependencies import pandas as pd
-from polars._dependencies import pyarrow as pa
-from polars._utils.construction.utils import (
-    contains_nested,
-    get_first_non_none,
-    is_namedtuple,
-    is_pydantic_model,
-    is_simple_numpy_backed_pandas_series,
-    is_sqlalchemy_row,
-    nt_unpack,
-    try_get_type_hints,
-)
-from polars._utils.various import (
-    _is_generator,
-    arrlen,
-    issue_warning,
-    parse_version,
-)
-from polars.datatypes import (
-    N_INFER_DEFAULT,
-    Categorical,
-    Enum,
-    String,
-    Struct,
-    Unknown,
-    is_polars_dtype,
-    parse_into_dtype,
-    try_parse_into_dtype,
-)
-from polars.exceptions import DataOrientationWarning, ShapeError
-from polars.meta import thread_pool_size
-
-with contextlib.suppress(ImportError):  # Module not available when building docs
-    from polars._plr import PyDataFrame
-
-if TYPE_CHECKING:
-    from collections.abc import Iterable, MutableMapping
-
-    from polars import DataFrame, Series
-    from polars._plr import PySeries
-    from polars._typing import (
-        Orientation,
-        PolarsDataType,
-        SchemaDefinition,
-        SchemaDict,
-    )
-
-_MIN_NUMPY_SIZE_FOR_MULTITHREADING = 1000
-
-
-def dict_to_pydf(
-    data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
-    schema: SchemaDefinition | None = None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    strict: bool = True,
-    nan_to_null: bool = False,
-    allow_multithreaded: bool = True,
-) -> PyDataFrame:
-    """Construct a PyDataFrame from a dictionary of sequences."""
-    if isinstance(schema, Mapping) and data:
-        if not all((col in schema) for col in data):
-            msg = "the given column-schema names do not match the data dictionary"
-            raise ValueError(msg)
-        data = {col: data[col] for col in schema}
-
-    column_names, schema_overrides = _unpack_schema(
-        schema, lookup_names=data.keys(), schema_overrides=schema_overrides
-    )
-    if not column_names:
-        column_names = list(data)
-
-    if data and _NUMPY_AVAILABLE:
-        # if there are 3 or more numpy arrays of sufficient size, we multi-thread:
-        count_numpy = sum(
-            int(
-                allow_multithreaded
-                and _check_for_numpy(val)
-                and isinstance(val, np.ndarray)
-                and len(val) > _MIN_NUMPY_SIZE_FOR_MULTITHREADING
-                # integers and non-nan floats are zero-copy
-                and nan_to_null
-                and val.dtype in (np.float32, np.float64)
-            )
-            for val in data.values()
-        )
-        if count_numpy >= 3:
-            # yes, multi-threading was easier in python here; we cannot have multiple
-            # threads running python and release the gil in pyo3 (it will deadlock).
-
-            # (note: 'dummy' is threaded)
-            # We catch FileNotFoundError: see 16675
-            try:
-                import multiprocessing.dummy
-
-                pool_size = thread_pool_size()
-                with multiprocessing.dummy.Pool(pool_size) as pool:
-                    data = dict(
-                        zip(
-                            column_names,
-                            pool.map(
-                                lambda t: (
-                                    pl.Series(t[0], t[1], nan_to_null=nan_to_null)
-                                    if isinstance(t[1], np.ndarray)
-                                    else t[1]
-                                ),
-                                list(data.items()),
-                            ),
-                        )
-                    )
-            except FileNotFoundError:
-                return dict_to_pydf(
-                    data=data,
-                    schema=schema,
-                    schema_overrides=schema_overrides,
-                    strict=strict,
-                    nan_to_null=nan_to_null,
-                    allow_multithreaded=False,
-                )
-
-    if not data and schema_overrides:
-        data_series = [
-            pl.Series(
-                name,
-                [],
-                dtype=schema_overrides.get(name),
-                strict=strict,
-                nan_to_null=nan_to_null,
-            )._s
-            for name in column_names
-        ]
-    else:
-        data_series = [
-            s._s
-            for s in _expand_dict_values(
-                data,
-                schema_overrides=schema_overrides,
-                strict=strict,
-                nan_to_null=nan_to_null,
-            ).values()
-        ]
-
-    data_series = _handle_columns_arg(data_series, columns=column_names, from_dict=True)
-    pydf = PyDataFrame(data_series)
-
-    if schema_overrides and pydf.dtypes() != list(schema_overrides.values()):
-        pydf = _post_apply_columns(
-            pydf, column_names, schema_overrides=schema_overrides, strict=strict
-        )
-    return pydf
-
-
-def _unpack_schema(
-    schema: SchemaDefinition | None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    n_expected: int | None = None,
-    lookup_names: Iterable[str] | None = None,
-) -> tuple[list[str], SchemaDict]:
-    """
-    Unpack column names and create dtype lookup.
-
-    Works for any (name, dtype) pairs or schema dict input,
-    overriding any inferred dtypes with explicit dtypes if supplied.
-    """
-
-    def _normalize_dtype(dtype: Any) -> PolarsDataType:
-        """Parse non-Polars data types as Polars data types."""
-        if is_polars_dtype(dtype, include_unknown=True):
-            return dtype
-        else:
-            return parse_into_dtype(dtype)
-
-    def _parse_schema_overrides(
-        schema_overrides: SchemaDict | None = None,
-    ) -> dict[str, PolarsDataType]:
-        """Parse schema overrides as a dictionary of name to Polars data type."""
-        if schema_overrides is None:
-            return {}
-
-        return {
-            name: _normalize_dtype(dtype) for name, dtype in schema_overrides.items()
-        }
-
-    schema_overrides = _parse_schema_overrides(schema_overrides)
-
-    # fast path for empty schema
-    if not schema:
-        columns = (
-            [f"column_{i}" for i in range(n_expected)] if n_expected is not None else []
-        )
-        return columns, schema_overrides
-
-    # determine column names from schema
-    if isinstance(schema, Mapping):
-        column_names: list[str] = list(schema)
-        schema = list(schema.items())
-    else:
-        column_names = []
-        for i, col in enumerate(schema):
-            if isinstance(col, str):
-                unnamed = not col and col not in schema_overrides
-                col = f"column_{i}" if unnamed else col
-            else:
-                col = col[0]
-            column_names.append(col)
-
-    if n_expected is not None and len(column_names) != n_expected:
-        msg = "data does not match the number of columns"
-        raise ShapeError(msg)
-
-    # determine column dtypes from schema and lookup_names
-    lookup: dict[str, str] | None = (
-        {
-            col: name
-            for col, name in zip_longest(column_names, lookup_names)
-            if name is not None
-        }
-        if lookup_names
-        else None
-    )
-
-    column_dtypes: dict[str, PolarsDataType] = {}
-    for col in schema:
-        if isinstance(col, str):
-            continue
-
-        name, dtype = col
-        if dtype is None:
-            continue
-        else:
-            dtype = _normalize_dtype(dtype)
-        name = lookup.get(name, name) if lookup else name
-        column_dtypes[name] = dtype  # type: ignore[assignment]
-
-    # apply schema overrides
-    if schema_overrides:
-        column_dtypes.update(schema_overrides)
-
-    return column_names, column_dtypes
-
-
-def _handle_columns_arg(
-    data: list[PySeries],
-    columns: Sequence[str] | None = None,
-    *,
-    from_dict: bool = False,
-) -> list[PySeries]:
-    """Rename data according to columns argument."""
-    if columns is None:
-        return data
-    elif not data:
-        return [pl.Series(name=c)._s for c in columns]
-    elif len(data) != len(columns):
-        msg = f"dimensions of columns arg ({len(columns)}) must match data dimensions ({len(data)})"
-        raise ValueError(msg)
-
-    if from_dict:
-        series_map = {s.name(): s for s in data}
-        if all((col in series_map) for col in columns):
-            return [series_map[col] for col in columns]
-
-    for i, c in enumerate(columns):
-        if c != data[i].name():
-            data[i] = data[i].clone()
-            data[i].rename(c)
-
-    return data
-
-
-def _post_apply_columns(
-    pydf: PyDataFrame,
-    columns: SchemaDefinition | None,
-    structs: dict[str, Struct] | None = None,
-    schema_overrides: SchemaDict | None = None,
-    *,
-    strict: bool = True,
-) -> PyDataFrame:
-    """Apply 'columns' param *after* PyDataFrame creation (if no alternative)."""
-    pydf_columns, pydf_dtypes = pydf.columns(), pydf.dtypes()
-    columns, dtypes = _unpack_schema(
-        (columns or pydf_columns), schema_overrides=schema_overrides
-    )
-    column_subset: list[str] = []
-    if columns != pydf_columns:
-        if len(columns) < len(pydf_columns) and columns == pydf_columns[: len(columns)]:
-            column_subset = columns
-        else:
-            pydf.set_column_names(columns)
-
-    column_casts = []
-    for i, col in enumerate(columns):
-        dtype = dtypes.get(col)
-        pydf_dtype = pydf_dtypes[i]
-        if dtype == Categorical != pydf_dtype:
-            column_casts.append(F.col(col).cast(Categorical, strict=strict)._pyexpr)
-        elif dtype == Enum != pydf_dtype:
-            column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
-        elif structs and (struct := structs.get(col)) and struct != pydf_dtype:
-            column_casts.append(F.col(col).cast(struct, strict=strict)._pyexpr)
-        elif dtype is not None and dtype != Unknown and dtype != pydf_dtype:
-            column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
-
-    if column_casts or column_subset:
-        pyldf = pydf.lazy()
-        if column_casts:
-            pyldf = pyldf.with_columns(column_casts)
-        if column_subset:
-            pyldf = pyldf.select([F.col(col)._pyexpr for col in column_subset])
-        pydf = pyldf.collect(engine="in-memory", lambda_post_opt=None)
-
-    return pydf
-
-
-def _expand_dict_values(
-    data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
-    *,
-    schema_overrides: SchemaDict | None = None,
-    strict: bool = True,
-    order: Sequence[str] | None = None,
-    nan_to_null: bool = False,
-) -> dict[str, Series]:
-    """Expand any scalar values in dict data (propagate literal as array)."""
-    updated_data = {}
-    if data:
-        if any(isinstance(val, pl.Expr) for val in data.values()):
-            msg = (
-                "passing Expr objects to the DataFrame constructor is not supported"
-                "\n\nHint: Try evaluating the expression first using `select`,"
-                " or if you meant to create an Object column containing expressions,"
-                " pass a list of Expr objects instead."
-            )
-            raise TypeError(msg)
-
-        dtypes = schema_overrides or {}
-        data = _expand_dict_data(data, dtypes, strict=strict)
-        array_len = max((arrlen(val) or 0) for val in data.values())
-        if array_len > 0:
-            for name, val in data.items():
-                dtype = dtypes.get(name)
-                if isinstance(val, dict) and dtype != Struct:
-                    vdf = pl.DataFrame(val, strict=strict)
-                    if (
-                        vdf.height == 1
-                        and array_len > 1
-                        and all(not d.is_nested() for d in vdf.schema.values())
-                    ):
-                        s_vals = {
-                            nm: vdf[nm].extend_constant(v, n=(array_len - 1))
-                            for nm, v in val.items()
-                        }
-                        st = pl.DataFrame(s_vals).to_struct(name)
-                    else:
-                        st = vdf.to_struct(name)
-                    updated_data[name] = st
-
-                elif isinstance(val, pl.Series):
-                    s = val.rename(name) if name != val.name else val
-                    if dtype and dtype != s.dtype:
-                        s = s.cast(dtype, strict=strict)
-                    updated_data[name] = s
-
-                elif arrlen(val) is not None or _is_generator(val):
-                    updated_data[name] = pl.Series(
-                        name=name,
-                        values=val,
-                        dtype=dtype,
-                        strict=strict,
-                        nan_to_null=nan_to_null,
-                    )
-                elif val is None or isinstance(  # type: ignore[redundant-expr]
-                    val, (int, float, str, bool, date, datetime, time, timedelta)
-                ):
-                    updated_data[name] = F.repeat(
-                        val, array_len, dtype=dtype, eager=True
-                    ).alias(name)
-                else:
-                    updated_data[name] = pl.Series(
-                        name=name, values=[val] * array_len, dtype=dtype, strict=strict
-                    )
-
-        elif all((arrlen(val) == 0) for val in data.values()):
-            for name, val in data.items():
-                updated_data[name] = pl.Series(
-                    name, values=val, dtype=dtypes.get(name), strict=strict
-                )
-
-        elif all((arrlen(val) is None) for val in data.values()):
-            for name, val in data.items():
-                updated_data[name] = pl.Series(
-                    name,
-                    values=(val if _is_generator(val) else [val]),
-                    dtype=dtypes.get(name),
-                    strict=strict,
-                )
-    if order and list(updated_data) != order:
-        return {col: updated_data.pop(col) for col in order}
-    return updated_data
-
-
-def _expand_dict_data(
-    data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
-    dtypes: SchemaDict,
-    *,
-    strict: bool = True,
-) -> Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series]:
-    """
-    Expand any unsized generators/iterators.
-
-    (Note that `range` is sized, and will take a fast-path on Series init).
-    """
-    expanded_data = {}
-    for name, val in data.items():
-        expanded_data[name] = (
-            pl.Series(name, val, dtypes.get(name), strict=strict)
-            if _is_generator(val)
-            else val
-        )
-    return expanded_data
-
-
-def sequence_to_pydf(
-    data: Sequence[Any],
-    schema: SchemaDefinition | None = None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    strict: bool = True,
-    orient: Orientation | None = None,
-    infer_schema_length: int | None = N_INFER_DEFAULT,
-    nan_to_null: bool = False,
-) -> PyDataFrame:
-    """Construct a PyDataFrame from a sequence."""
-    if not data:
-        return dict_to_pydf({}, schema=schema, schema_overrides=schema_overrides)
-
-    return _sequence_to_pydf_dispatcher(
-        get_first_non_none(data),
-        data=data,
-        schema=schema,
-        schema_overrides=schema_overrides,
-        strict=strict,
-        orient=orient,
-        infer_schema_length=infer_schema_length,
-        nan_to_null=nan_to_null,
-    )
-
-
-@singledispatch
-def _sequence_to_pydf_dispatcher(
-    first_element: Any,
-    data: Sequence[Any],
-    schema: SchemaDefinition | None,
-    *,
-    schema_overrides: SchemaDict | None,
-    strict: bool = True,
-    orient: Orientation | None,
-    infer_schema_length: int | None,
-    nan_to_null: bool = False,
-) -> PyDataFrame:
-    # note: ONLY python-native data should participate in singledispatch registration
-    # via top-level decorators, otherwise we have to import the associated module.
-    # third-party libraries (such as numpy/pandas) should be identified inline (below)
-    # and THEN registered for dispatch (here) so as not to break lazy-loading behaviour.
-
-    common_params = {
-        "data": data,
-        "schema": schema,
-        "schema_overrides": schema_overrides,
-        "strict": strict,
-        "orient": orient,
-        "infer_schema_length": infer_schema_length,
-        "nan_to_null": nan_to_null,
-    }
-    to_pydf: Callable[..., PyDataFrame]
-    register_with_singledispatch = True
-
-    if isinstance(first_element, Generator):
-        to_pydf = _sequence_of_sequence_to_pydf
-        data = [list(row) for row in data]
-        first_element = data[0]
-        register_with_singledispatch = False
-
-    elif isinstance(first_element, pl.Series):
-        to_pydf = _sequence_of_series_to_pydf
-
-    elif _check_for_numpy(first_element) and isinstance(first_element, np.ndarray):
-        to_pydf = _sequence_of_numpy_to_pydf
-
-    elif _check_for_pandas(first_element) and isinstance(
-        first_element, (pd.Series, pd.Index, pd.DatetimeIndex)
-    ):
-        to_pydf = _sequence_of_pandas_to_pydf
-
-    elif dataclasses.is_dataclass(first_element):
-        to_pydf = _sequence_of_dataclasses_to_pydf
-
-    elif is_pydantic_model(first_element):
-        to_pydf = _sequence_of_pydantic_models_to_pydf
-
-    elif is_sqlalchemy_row(first_element):
-        to_pydf = _sequence_of_tuple_to_pydf
-
-    elif isinstance(first_element, Sequence) and not isinstance(first_element, str):
-        to_pydf = _sequence_of_sequence_to_pydf
-    else:
-        to_pydf = _sequence_of_elements_to_pydf
-
-    if register_with_singledispatch:
-        _sequence_to_pydf_dispatcher.register(type(first_element), to_pydf)
-
-    common_params["first_element"] = first_element
-    return to_pydf(**common_params)
-
-
-@_sequence_to_pydf_dispatcher.register(list)
-def _sequence_of_sequence_to_pydf(
-    first_element: Sequence[Any] | np.ndarray[Any, Any],
-    data: Sequence[Any],
-    schema: SchemaDefinition | None,
-    *,
-    schema_overrides: SchemaDict | None,
-    strict: bool,
-    orient: Orientation | None,
-    infer_schema_length: int | None,
-    nan_to_null: bool = False,
-) -> PyDataFrame:
-    if orient is None:
-        if schema is None:
-            orient = "col"
-        else:
-            # Try to infer orientation from schema length and data dimensions
-            is_row_oriented = (len(schema) == len(first_element)) and (
-                len(schema) != len(data)
-            )
-            orient = "row" if is_row_oriented else "col"
-
-            if is_row_oriented:
-                issue_warning(
-                    "Row orientation inferred during DataFrame construction."
-                    ' Explicitly specify the orientation by passing `orient="row"` to silence this warning.',
-                    DataOrientationWarning,
-                )
-
-    if orient == "row":
-        column_names, schema_overrides = _unpack_schema(
-            schema, schema_overrides=schema_overrides, n_expected=len(first_element)
-        )
-        local_schema_override = (
-            _include_unknowns(schema_overrides, column_names)
-            if schema_overrides
-            else {}
-        )
-
-        unpack_nested = False
-        for col, tp in local_schema_override.items():
-            if tp in (Categorical, Enum):
-                local_schema_override[col] = String
-            elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
-                unpack_nested = contains_nested(
-                    getattr(first_element, col, None).__class__, is_namedtuple
-                )
-
-        if unpack_nested:
-            dicts = [nt_unpack(d) for d in data]
-            pydf = PyDataFrame.from_dicts(
-                dicts,
-                schema=None,
-                schema_overrides=None,
-                strict=strict,
-                infer_schema_length=infer_schema_length,
-            )
-        else:
-            pydf = PyDataFrame.from_rows(
-                data,
-                schema=local_schema_override or None,
-                infer_schema_length=infer_schema_length,
-            )
-        if column_names or schema_overrides:
-            pydf = _post_apply_columns(
-                pydf, column_names, schema_overrides=schema_overrides, strict=strict
-            )
-        return pydf
-
-    elif orient == "col":
-        column_names, schema_overrides = _unpack_schema(
-            schema, schema_overrides=schema_overrides, n_expected=len(data)
-        )
-        data_series: list[PySeries] = [
-            pl.Series(
-                column_names[i],
-                element,
-                dtype=schema_overrides.get(column_names[i]),
-                strict=strict,
-                nan_to_null=nan_to_null,
-            )._s
-            for i, element in enumerate(data)
-        ]
-        return PyDataFrame(data_series)
-
-    else:
-        msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
-        raise ValueError(msg)
-
-
-def _sequence_of_series_to_pydf(
-    first_element: Series,
-    data: Sequence[Any],
-    schema: SchemaDefinition | None,
-    *,
-    schema_overrides: SchemaDict | None,
-    strict: bool,
-    **kwargs: Any,
-) -> PyDataFrame:
-    series_names = [s.name for s in data]
-    column_names, schema_overrides = _unpack_schema(
-        schema or series_names,
-        schema_overrides=schema_overrides,
-        n_expected=len(data),
-    )
-    data_series: list[PySeries] = []
-    for i, s in enumerate(data):
-        if not s.name:
-            s = s.alias(column_names[i])
-        new_dtype = schema_overrides.get(column_names[i])
-        if new_dtype and new_dtype != s.dtype:
-            s = s.cast(new_dtype, strict=strict, wrap_numerical=False)
-        data_series.append(s._s)
-
-    data_series = _handle_columns_arg(data_series, columns=column_names)
-    return PyDataFrame(data_series)
-
-
-@_sequence_to_pydf_dispatcher.register(tuple)
-def _sequence_of_tuple_to_pydf(
-    first_element: tuple[Any, ...],
-    data: Sequence[Any],
-    schema: SchemaDefinition | None,
-    *,
-    schema_overrides: SchemaDict | None,
-    strict: bool,
-    orient: Orientation | None,
-    infer_schema_length: int | None,
-    nan_to_null: bool = False,
-) -> PyDataFrame:
-    # infer additional meta information if namedtuple
-    if is_namedtuple(first_element.__class__) or is_sqlalchemy_row(first_element):
-        if schema is None:
-            schema = first_element._fields  # type: ignore[attr-defined]
-            annotations = getattr(first_element, "__annotations__", None)
-            if annotations and len(annotations) == len(schema):
-                schema = [
-                    (name, try_parse_into_dtype(tp))
-                    for name, tp in first_element.__annotations__.items()
-                ]
-        if orient is None:
-            orient = "row"
-
-    # ...then defer to generic sequence processing
-    return _sequence_of_sequence_to_pydf(
-        first_element,
-        data=data,
-        schema=schema,
-        schema_overrides=schema_overrides,
-        strict=strict,
-        orient=orient,
-        infer_schema_length=infer_schema_length,
-        nan_to_null=nan_to_null,
-    )
-
-
-@_sequence_to_pydf_dispatcher.register(Mapping)
-@_sequence_to_pydf_dispatcher.register(dict)
-def _sequence_of_dict_to_pydf(
-    first_element: dict[str, Any],
-    data: Sequence[Any],
-    schema: SchemaDefinition | None,
-    *,
-    schema_overrides: SchemaDict | None,
-    strict: bool,
-    infer_schema_length: int | None,
-    **kwargs: Any,
-) -> PyDataFrame:
-    column_names, schema_overrides = _unpack_schema(
-        schema, schema_overrides=schema_overrides
-    )
-    dicts_schema = (
-        _include_unknowns(schema_overrides, column_names or list(schema_overrides))
-        if column_names
-        else None
-    )
-
-    pydf = PyDataFrame.from_dicts(
-        data,
-        dicts_schema,
-        schema_overrides,
-        strict=strict,
-        infer_schema_length=infer_schema_length,
-    )
-
-    # TODO: we can remove this `schema_overrides` block completely
-    # once https://github.com/pola-rs/polars/issues/11044 is fixed
-    if schema_overrides:
-        pydf = _post_apply_columns(
-            pydf, columns=column_names, schema_overrides=schema_overrides, strict=strict
-        )
-    return pydf
-
-
-@_sequence_to_pydf_dispatcher.register(str)
-def _sequence_of_elements_to_pydf(
-    first_element: Any,
-    data: Sequence[Any],
-    schema: SchemaDefinition | None,
-    schema_overrides: SchemaDict | None,
-    *,
-    strict: bool,
-    **kwargs: Any,
-) -> PyDataFrame:
-    column_names, schema_overrides = _unpack_schema(
-        schema, schema_overrides=schema_overrides, n_expected=1
-    )
-    data_series: list[PySeries] = [
-        pl.Series(
-            column_names[0],
-            data,
-            schema_overrides.get(column_names[0]),
-            strict=strict,
-        )._s
-    ]
-    data_series = _handle_columns_arg(data_series, columns=column_names)
-    return PyDataFrame(data_series)
-
-
-def _sequence_of_numpy_to_pydf(
-    first_element: np.ndarray[Any, Any],
-    **kwargs: Any,
-) -> PyDataFrame:
-    if first_element.ndim == 1:
-        return _sequence_of_sequence_to_pydf(first_element, **kwargs)
-    else:
-        return _sequence_of_elements_to_pydf(first_element, **kwargs)
-
-
-def _sequence_of_pandas_to_pydf(
-    first_element: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex,
-    data: Sequence[Any],
-    schema: SchemaDefinition | None,
-    schema_overrides: SchemaDict | None,
-    *,
-    strict: bool,
-    **kwargs: Any,
-) -> PyDataFrame:
-    if schema is None:
-        column_names: list[str] = []
-    else:
-        column_names, schema_overrides = _unpack_schema(
-            schema, schema_overrides=schema_overrides, n_expected=1
-        )
-
-    schema_overrides = schema_overrides or {}
-    data_series: list[PySeries] = []
-    for i, s in enumerate(data):
-        name = column_names[i] if column_names else s.name
-        pyseries = plc.pandas_to_pyseries(name=name, values=s)
-        dtype = schema_overrides.get(name)
-        if dtype is not None and dtype != pyseries.dtype():
-            pyseries = pyseries.cast(dtype, strict=strict, wrap_numerical=False)
-        data_series.append(pyseries)
-
-    return PyDataFrame(data_series)
-
-
-def _sequence_of_dataclasses_to_pydf(
-    first_element: Any,
-    data: Sequence[Any],
-    schema: SchemaDefinition | None,
-    schema_overrides: SchemaDict | None,
-    infer_schema_length: int | None,
-    *,
-    strict: bool = True,
-    **kwargs: Any,
-) -> PyDataFrame:
-    """Initialize DataFrame from Python dataclasses."""
-    from dataclasses import asdict, astuple
-
-    (
-        unpack_nested,
-        column_names,
-        schema_overrides,
-        overrides,
-    ) = _establish_dataclass_or_model_schema(
-        first_element, schema, schema_overrides, model_fields=None
-    )
-    if unpack_nested:
-        dicts = [asdict(md) for md in data]
-        pydf = PyDataFrame.from_dicts(
-            dicts,
-            schema=None,
-            schema_overrides=None,
-            strict=strict,
-            infer_schema_length=infer_schema_length,
-        )
-    else:
-        rows = [astuple(dc) for dc in data]
-        pydf = PyDataFrame.from_rows(
-            rows,  # type: ignore[arg-type]
-            schema=overrides or None,
-            infer_schema_length=infer_schema_length,
-        )
-
-    if overrides:
-        structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
-        pydf = _post_apply_columns(
-            pydf, column_names, structs, schema_overrides, strict=strict
-        )
-
-    return pydf
-
-
-def _sequence_of_pydantic_models_to_pydf(
-    first_element: Any,
-    data: Sequence[Any],
-    schema: SchemaDefinition | None,
-    schema_overrides: SchemaDict | None,
-    infer_schema_length: int | None,
-    *,
-    strict: bool,
-    **kwargs: Any,
-) -> PyDataFrame:
-    """Initialise DataFrame from pydantic model objects."""
-    import pydantic  # note: must already be available in the env here
-
-    old_pydantic = parse_version(pydantic.__version__) < (2, 0)
-    model_fields = list(
-        first_element.__fields__
-        if old_pydantic
-        else first_element.__class__.model_fields
-    )
-    (
-        unpack_nested,
-        column_names,
-        schema_overrides,
-        overrides,
-    ) = _establish_dataclass_or_model_schema(
-        first_element, schema, schema_overrides, model_fields
-    )
-    if unpack_nested:
-        # note: this is an *extremely* slow path, due to the requirement to
-        # use pydantic's 'dict()' method to properly unpack nested models
-        dicts = (
-            [md.dict() for md in data]
-            if old_pydantic
-            else [md.model_dump(mode="python") for md in data]
-        )
-        pydf = PyDataFrame.from_dicts(
-            dicts,
-            schema=None,
-            schema_overrides=None,
-            strict=strict,
-            infer_schema_length=infer_schema_length,
-        )
-
-    elif len(model_fields) > 50:
-        # 'from_rows' is the faster codepath for models with a lot of fields...
-        get_values = itemgetter(*model_fields)
-        rows = [get_values(md.__dict__) for md in data]
-        pydf = PyDataFrame.from_rows(
-            rows, schema=overrides, infer_schema_length=infer_schema_length
-        )
-    else:
-        # ...and 'from_dicts' is faster otherwise
-        dicts = [md.__dict__ for md in data]
-        pydf = PyDataFrame.from_dicts(
-            dicts,
-            schema=overrides,
-            schema_overrides=None,
-            strict=strict,
-            infer_schema_length=infer_schema_length,
-        )
-
-    if overrides:
-        structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
-        pydf = _post_apply_columns(
-            pydf, column_names, structs, schema_overrides, strict=strict
-        )
-
-    return pydf
-
-
-def _establish_dataclass_or_model_schema(
-    first_element: Any,
-    schema: SchemaDefinition | None,
-    schema_overrides: SchemaDict | None,
-    model_fields: list[str] | None,
-) -> tuple[bool, list[str], SchemaDict, SchemaDict]:
-    """Shared utility code for establishing dataclasses/pydantic model cols/schema."""
-    from dataclasses import asdict
-
-    unpack_nested = False
-    if schema:
-        column_names, schema_overrides = _unpack_schema(
-            schema, schema_overrides=schema_overrides
-        )
-        overrides = {col: schema_overrides.get(col, Unknown) for col in column_names}
-    else:
-        column_names = []
-        overrides = {
-            col: (try_parse_into_dtype(tp) or Unknown)
-            for col, tp in try_get_type_hints(first_element.__class__).items()
-            if ((col in model_fields) if model_fields else (col != "__slots__"))
-        }
-        if schema_overrides:
-            overrides.update(schema_overrides)
-        elif not model_fields:
-            dc_fields = set(asdict(first_element))
-            schema_overrides = overrides = {
-                nm: tp for nm, tp in overrides.items() if nm in dc_fields
-            }
-        else:
-            schema_overrides = overrides
-
-    for col, tp in overrides.items():
-        if tp in (Categorical, Enum):
-            overrides[col] = String
-        elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
-            unpack_nested = contains_nested(
-                getattr(first_element, col, None),
-                is_pydantic_model if model_fields else dataclasses.is_dataclass,  # type: ignore[arg-type]
-            )
-
-    if model_fields and len(model_fields) == len(overrides):
-        overrides = dict(zip(model_fields, overrides.values()))
-
-    return unpack_nested, column_names, schema_overrides, overrides
-
-
-def _include_unknowns(
-    schema: SchemaDict, cols: Sequence[str]
-) -> MutableMapping[str, PolarsDataType]:
-    """Complete partial schema dict by including Unknown type."""
-    return {
-        col: (schema.get(col, Unknown) or Unknown)  # type: ignore[truthy-bool]
-        for col in cols
-    }
-
-
-def iterable_to_pydf(
-    data: Iterable[Any],
-    schema: SchemaDefinition | None = None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    strict: bool = True,
-    orient: Orientation | None = None,
-    chunk_size: int | None = None,
-    infer_schema_length: int | None = N_INFER_DEFAULT,
-    rechunk: bool = True,
-) -> PyDataFrame:
-    """Construct a PyDataFrame from an iterable/generator."""
-    original_schema = schema
-    column_names: list[str] = []
-    dtypes_by_idx: dict[int, PolarsDataType] = {}
-    if schema is not None:
-        column_names, schema_overrides = _unpack_schema(
-            schema, schema_overrides=schema_overrides
-        )
-    elif schema_overrides:
-        _, schema_overrides = _unpack_schema(schema, schema_overrides=schema_overrides)
-
-    if not isinstance(data, Generator):
-        data = iter(data)
-
-    if orient == "col":
-        if column_names and schema_overrides:
-            dtypes_by_idx = {
-                idx: schema_overrides.get(col, Unknown)
-                for idx, col in enumerate(column_names)
-            }
-
-        return pl.DataFrame(
-            {
-                (column_names[idx] if column_names else f"column_{idx}"): pl.Series(
-                    coldata,
-                    dtype=dtypes_by_idx.get(idx),
-                    strict=strict,
-                )
-                for idx, coldata in enumerate(data)
-            },
-        )._df
-
-    def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFrame:
-        return pl.DataFrame(
-            data=values,
-            schema=schema,
-            strict=strict,
-            orient="row",
-            infer_schema_length=infer_schema_length,
-        )
-
-    n_chunks = 0
-    n_chunk_elems = 1_000_000
-
-    if chunk_size:
-        adaptive_chunk_size = chunk_size
-    elif column_names:
-        adaptive_chunk_size = n_chunk_elems // len(column_names)
-    else:
-        adaptive_chunk_size = None
-
-    df: DataFrame = None  # type: ignore[assignment]
-    chunk_size = (
-        None
-        if infer_schema_length is None
-        else max(infer_schema_length, adaptive_chunk_size or 1000)
-    )
-    while True:
-        values = list(islice(data, chunk_size))
-        if not values:
-            break
-        frame_chunk = to_frame_chunk(values, original_schema)
-        if df is None:
-            df = frame_chunk
-            if not original_schema:
-                original_schema = list(df.schema.items())
-            if chunk_size != adaptive_chunk_size:
-                if (n_columns := df.width) > 0:
-                    chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns
-        else:
-            df.vstack(frame_chunk, in_place=True)
-            n_chunks += 1
-
-    if df is None:
-        df = to_frame_chunk([], original_schema)
-
-    if n_chunks > 0 and rechunk:
-        df = df.rechunk()
-
-    return df._df
-
-
-def _check_pandas_columns(data: pd.DataFrame, *, include_index: bool) -> None:
-    """Check pandas dataframe columns can be converted to polars."""
-    stringified_cols: set[str] = {str(col) for col in data.columns}
-    stringified_index: set[str] = (
-        {str(idx) for idx in data.index.names} if include_index else set()
-    )
-
-    non_unique_cols: bool = len(stringified_cols) < len(data.columns)
-    non_unique_indices: bool = (
-        (len(stringified_index) < len(data.index.names)) if include_index else False
-    )
-    if non_unique_cols or non_unique_indices:
-        msg = (
-            "Pandas dataframe contains non-unique indices and/or column names. "
-            "Polars dataframes require unique string names for columns."
-        )
-        raise ValueError(msg)
-
-    overlapping_cols_and_indices: set[str] = stringified_cols & stringified_index
-    if len(overlapping_cols_and_indices) > 0:
-        msg = "Pandas indices and column names must not overlap."
-        raise ValueError(msg)
-
-
-def pandas_to_pydf(
-    data: pd.DataFrame,
-    schema: SchemaDefinition | None = None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    strict: bool = True,
-    rechunk: bool = True,
-    nan_to_null: bool = True,
-    include_index: bool = False,
-) -> PyDataFrame:
-    """Construct a PyDataFrame from a pandas DataFrame."""
-    _check_pandas_columns(data, include_index=include_index)
-
-    convert_index = include_index and not _pandas_has_default_index(data)
-    if not convert_index and all(
-        is_simple_numpy_backed_pandas_series(data[col]) for col in data.columns
-    ):
-        # Convert via NumPy directly, no PyArrow needed.
-        return pl.DataFrame(
-            {str(col): data[col].to_numpy() for col in data.columns},
-            schema=schema,
-            strict=strict,
-            schema_overrides=schema_overrides,
-            nan_to_null=nan_to_null,
-        )._df
-
-    if not _PYARROW_AVAILABLE:
-        msg = (
-            "pyarrow is required for converting a pandas dataframe to Polars, "
-            "unless each of its columns is a simple numpy-backed one "
-            "(e.g. 'int64', 'bool', 'float32' - not 'Int64')"
-        )
-        raise ImportError(msg)
-    arrow_dict = {}
-    length = data.shape[0]
-
-    if convert_index:
-        for idxcol in data.index.names:
-            arrow_dict[str(idxcol)] = plc.pandas_series_to_arrow(
-                # get_level_values accepts `int | str`
-                # but `index.names` returns `Hashable`
-                data.index.get_level_values(idxcol),  # type: ignore[arg-type, unused-ignore]
-                nan_to_null=nan_to_null,
-                length=length,
-            )
-
-    for col_idx, col_data in data.items():
-        arrow_dict[str(col_idx)] = plc.pandas_series_to_arrow(
-            col_data, nan_to_null=nan_to_null, length=length
-        )
-
-    arrow_table = pa.table(arrow_dict)
-    return arrow_to_pydf(
-        arrow_table,
-        schema=schema,
-        schema_overrides=schema_overrides,
-        strict=strict,
-        rechunk=rechunk,
-    )
-
-
-def _pandas_has_default_index(df: pd.DataFrame) -> bool:
-    """Identify if the pandas frame only has a default (or equivalent) index."""
-    from pandas.core.indexes.range import RangeIndex
-
-    index_cols = df.index.names
-
-    if len(index_cols) > 1 or index_cols not in ([None], [""]):
-        # not default: more than one index, or index is named
-        return False
-    elif df.index.equals(RangeIndex(start=0, stop=len(df), step=1)):
-        # is default: simple range index
-        return True
-    else:
-        # finally, is the index _equivalent_ to a default unnamed
-        # integer index with frame data that was previously sorted
-        return (
-            str(df.index.dtype).startswith("int")
-            and (df.index.sort_values() == np.arange(len(df))).all()
-        )
-
-
-def arrow_to_pydf(
-    data: pa.Table | pa.RecordBatch,
-    schema: SchemaDefinition | None = None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    strict: bool = True,
-    rechunk: bool = True,
-) -> PyDataFrame:
-    """Construct a PyDataFrame from an Arrow Table or RecordBatch."""
-    column_names, schema_overrides = _unpack_schema(
-        (schema or data.schema.names), schema_overrides=schema_overrides
-    )
-    try:
-        if column_names != data.schema.names:
-            data = data.rename_columns(column_names)
-    except pa.ArrowInvalid as e:
-        msg = "dimensions of columns arg must match data dimensions"
-        raise ValueError(msg) from e
-
-    batches: list[pa.RecordBatch]
-    if isinstance(data, pa.RecordBatch):
-        batches = [data]
-    else:
-        batches = data.to_batches()
-
-    # supply the arrow schema so the metadata is intact
-    pydf = PyDataFrame.from_arrow_record_batches(batches, data.schema)
-
-    if rechunk:
-        pydf = pydf.rechunk()
-
-    if schema_overrides is not None:
-        pydf = _post_apply_columns(
-            pydf,
-            column_names,
-            schema_overrides=schema_overrides,
-            strict=strict,
-        )
-
-    return pydf
-
-
-def numpy_to_pydf(
-    data: np.ndarray[Any, Any],
-    schema: SchemaDefinition | None = None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    orient: Orientation | None = None,
-    strict: bool = True,
-    nan_to_null: bool = False,
-) -> PyDataFrame:
-    """Construct a PyDataFrame from a NumPy ndarray (including structured ndarrays)."""
-    shape = data.shape
-    two_d = len(shape) == 2
-
-    if data.dtype.names is not None:
-        structured_array, orient = True, "col"
-        record_names = list(data.dtype.names)
-        n_columns = len(record_names)
-        for nm in record_names:
-            shape = data[nm].shape
-        if not schema:
-            schema = record_names
-    else:
-        # Unpack columns
-        structured_array, record_names = False, []
-        if shape == (0,):
-            n_columns = 0
-
-        elif len(shape) == 1:
-            n_columns = 1
-
-        elif len(shape) == 2:
-            if orient is None and schema is None:
-                # default convention; first axis is rows, second axis is columns
-                n_columns = shape[1]
-                orient = "row"
-
-            elif orient is None and schema is not None:
-                # infer orientation from 'schema' param; if square array
-                # we check the flags to establish row/column major order
-                n_schema_cols = len(schema)
-                if n_schema_cols == shape[0] and n_schema_cols != shape[1]:
-                    orient = "col"
-                    n_columns = shape[0]
-                elif data.flags["F_CONTIGUOUS"] and shape[0] == shape[1]:
-                    orient = "col"
-                    n_columns = n_schema_cols
-                else:
-                    orient = "row"
-                    n_columns = shape[1]
-
-            elif orient == "row":
-                n_columns = shape[1]
-            elif orient == "col":
-                n_columns = shape[0]
-            else:
-                msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
-                raise ValueError(msg)
-        else:
-            if shape == ():
-                msg = "cannot create DataFrame from zero-dimensional array"
-            else:
-                msg = f"cannot create DataFrame from array with more than two dimensions; shape = {shape}"
-            raise ValueError(msg)
-
-    if schema is not None and len(schema) != n_columns:
-        if (n_schema_cols := len(schema)) != 1:
-            msg = f"dimensions of `schema` ({n_schema_cols}) must match data dimensions ({n_columns})"
-            raise ValueError(msg)
-        n_columns = n_schema_cols
-
-    column_names, schema_overrides = _unpack_schema(
-        schema, schema_overrides=schema_overrides, n_expected=n_columns
-    )
-
-    # Convert data to series
-    if structured_array:
-        data_series = [
-            pl.Series(
-                name=series_name,
-                values=data[record_name],
-                dtype=schema_overrides.get(record_name),
-                strict=strict,
-                nan_to_null=nan_to_null,
-            )._s
-            for series_name, record_name in zip(column_names, record_names)
-        ]
-    elif shape == (0,) and n_columns == 0:
-        data_series = []
-
-    elif len(shape) == 1:
-        data_series = [
-            pl.Series(
-                name=column_names[0],
-                values=data,
-                dtype=schema_overrides.get(column_names[0]),
-                strict=strict,
-                nan_to_null=nan_to_null,
-            )._s
-        ]
-    else:
-        if orient == "row":
-            data_series = [
-                pl.Series(
-                    name=column_names[i],
-                    values=(
-                        data
-                        if two_d and n_columns == 1 and shape[1] > 1
-                        else data[:, i]
-                    ),
-                    dtype=schema_overrides.get(column_names[i]),
-                    strict=strict,
-                    nan_to_null=nan_to_null,
-                )._s
-                for i in range(n_columns)
-            ]
-        else:
-            data_series = [
-                pl.Series(
-                    name=column_names[i],
-                    values=(
-                        data if two_d and n_columns == 1 and shape[1] > 1 else data[i]
-                    ),
-                    dtype=schema_overrides.get(column_names[i]),
-                    strict=strict,
-                    nan_to_null=nan_to_null,
-                )._s
-                for i in range(n_columns)
-            ]
-
-    data_series = _handle_columns_arg(data_series, columns=column_names)
-    return PyDataFrame(data_series)
-
-
-def series_to_pydf(
-    data: Series,
-    schema: SchemaDefinition | None = None,
-    schema_overrides: SchemaDict | None = None,
-    *,
-    strict: bool = True,
-) -> PyDataFrame:
-    """Construct a PyDataFrame from a Polars Series."""
-    if schema is None and schema_overrides is None:
-        return PyDataFrame([data._s])
-
-    data_series = [data._s]
-    series_name = [s.name() for s in data_series]
-    column_names, schema_overrides = _unpack_schema(
-        schema or series_name, schema_overrides=schema_overrides, n_expected=1
-    )
-    if schema_overrides:
-        new_dtype = next(iter(schema_overrides.values()))
-        if new_dtype != data.dtype:
-            data_series[0] = data_series[0].cast(
-                new_dtype, strict=strict, wrap_numerical=False
-            )
-
-    data_series = _handle_columns_arg(data_series, columns=column_names)
-    return PyDataFrame(data_series)
-
-
-def dataframe_to_pydf(
-    data: DataFrame,
-    schema: SchemaDefinition | None = None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    strict: bool = True,
-) -> PyDataFrame:
-    """Construct a PyDataFrame from an existing Polars DataFrame."""
-    if schema is None and schema_overrides is None:
-        return data._df.clone()
-
-    data_series = {c.name: c._s for c in data}
-    column_names, schema_overrides = _unpack_schema(
-        schema or data.columns, schema_overrides=schema_overrides
-    )
-    if schema_overrides:
-        existing_schema = data.schema
-        for name, new_dtype in schema_overrides.items():
-            if new_dtype != existing_schema[name]:
-                data_series[name] = data_series[name].cast(
-                    new_dtype, strict=strict, wrap_numerical=False
-                )
-
-    series_cols = _handle_columns_arg(list(data_series.values()), columns=column_names)
-    return PyDataFrame(series_cols)