polars-runtime-compat 1.34.0b3-cp39-abi3-manylinux_2_24_aarch64.whl → 1.34.0b5-cp39-abi3-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of polars-runtime-compat might be problematic.

Files changed (204)
  1. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  2. polars_runtime_compat-1.34.0b5.dist-info/METADATA +35 -0
  3. polars_runtime_compat-1.34.0b5.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -103
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b3.dist-info/METADATA +0 -190
  202. polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
  203. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/WHEEL +0 -0
  204. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/licenses/LICENSE +0 -0
@@ -1,1397 +0,0 @@
-from __future__ import annotations
-
-import contextlib
-from collections.abc import Generator, Mapping, Sequence
-from datetime import date, datetime, time, timedelta
-from functools import singledispatch
-from itertools import islice, zip_longest
-from operator import itemgetter
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-)
-
-import polars._reexport as pl
-import polars._utils.construction as plc
-from polars import functions as F
-from polars._dependencies import (
-    _NUMPY_AVAILABLE,
-    _PYARROW_AVAILABLE,
-    _check_for_numpy,
-    _check_for_pandas,
-    dataclasses,
-)
-from polars._dependencies import numpy as np
-from polars._dependencies import pandas as pd
-from polars._dependencies import pyarrow as pa
-from polars._utils.construction.utils import (
-    contains_nested,
-    get_first_non_none,
-    is_namedtuple,
-    is_pydantic_model,
-    is_simple_numpy_backed_pandas_series,
-    is_sqlalchemy_row,
-    nt_unpack,
-    try_get_type_hints,
-)
-from polars._utils.various import (
-    _is_generator,
-    arrlen,
-    issue_warning,
-    parse_version,
-)
-from polars.datatypes import (
-    N_INFER_DEFAULT,
-    Categorical,
-    Enum,
-    String,
-    Struct,
-    Unknown,
-    is_polars_dtype,
-    parse_into_dtype,
-    try_parse_into_dtype,
-)
-from polars.exceptions import DataOrientationWarning, ShapeError
-from polars.meta import thread_pool_size
-
-with contextlib.suppress(ImportError):  # Module not available when building docs
-    from polars._plr import PyDataFrame
-
-if TYPE_CHECKING:
-    from collections.abc import Iterable, MutableMapping
-
-    from polars import DataFrame, Series
-    from polars._plr import PySeries
-    from polars._typing import (
-        Orientation,
-        PolarsDataType,
-        SchemaDefinition,
-        SchemaDict,
-    )
-
-_MIN_NUMPY_SIZE_FOR_MULTITHREADING = 1000
-
-
-def dict_to_pydf(
-    data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
-    schema: SchemaDefinition | None = None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    strict: bool = True,
-    nan_to_null: bool = False,
-    allow_multithreaded: bool = True,
-) -> PyDataFrame:
-    """Construct a PyDataFrame from a dictionary of sequences."""
-    if isinstance(schema, Mapping) and data:
-        if not all((col in schema) for col in data):
-            msg = "the given column-schema names do not match the data dictionary"
-            raise ValueError(msg)
-        data = {col: data[col] for col in schema}
-
-    column_names, schema_overrides = _unpack_schema(
-        schema, lookup_names=data.keys(), schema_overrides=schema_overrides
-    )
-    if not column_names:
-        column_names = list(data)
-
-    if data and _NUMPY_AVAILABLE:
-        # if there are 3 or more numpy arrays of sufficient size, we multi-thread:
-        count_numpy = sum(
-            int(
-                allow_multithreaded
-                and _check_for_numpy(val)
-                and isinstance(val, np.ndarray)
-                and len(val) > _MIN_NUMPY_SIZE_FOR_MULTITHREADING
-                # integers and non-nan floats are zero-copy
-                and nan_to_null
-                and val.dtype in (np.float32, np.float64)
-            )
-            for val in data.values()
-        )
-        if count_numpy >= 3:
-            # yes, multi-threading was easier in python here; we cannot have multiple
-            # threads running python and release the gil in pyo3 (it will deadlock).
-
-            # (note: 'dummy' is threaded)
-            # We catch FileNotFoundError: see 16675
-            try:
-                import multiprocessing.dummy
-
-                pool_size = thread_pool_size()
-                with multiprocessing.dummy.Pool(pool_size) as pool:
-                    data = dict(
-                        zip(
-                            column_names,
-                            pool.map(
-                                lambda t: (
-                                    pl.Series(t[0], t[1], nan_to_null=nan_to_null)
-                                    if isinstance(t[1], np.ndarray)
-                                    else t[1]
-                                ),
-                                list(data.items()),
-                            ),
-                        )
-                    )
-            except FileNotFoundError:
-                return dict_to_pydf(
-                    data=data,
-                    schema=schema,
-                    schema_overrides=schema_overrides,
-                    strict=strict,
-                    nan_to_null=nan_to_null,
-                    allow_multithreaded=False,
-                )
-
-    if not data and schema_overrides:
-        data_series = [
-            pl.Series(
-                name,
-                [],
-                dtype=schema_overrides.get(name),
-                strict=strict,
-                nan_to_null=nan_to_null,
-            )._s
-            for name in column_names
-        ]
-    else:
-        data_series = [
-            s._s
-            for s in _expand_dict_values(
-                data,
-                schema_overrides=schema_overrides,
-                strict=strict,
-                nan_to_null=nan_to_null,
-            ).values()
-        ]
-
-    data_series = _handle_columns_arg(data_series, columns=column_names, from_dict=True)
-    pydf = PyDataFrame(data_series)
-
-    if schema_overrides and pydf.dtypes() != list(schema_overrides.values()):
-        pydf = _post_apply_columns(
-            pydf, column_names, schema_overrides=schema_overrides, strict=strict
-        )
-    return pydf
-
-
-def _unpack_schema(
-    schema: SchemaDefinition | None,
-    *,
-    schema_overrides: SchemaDict | None = None,
-    n_expected: int | None = None,
-    lookup_names: Iterable[str] | None = None,
-) -> tuple[list[str], SchemaDict]:
-    """
-    Unpack column names and create dtype lookup.
-
-    Works for any (name, dtype) pairs or schema dict input,
-    overriding any inferred dtypes with explicit dtypes if supplied.
-    """
-
-    def _normalize_dtype(dtype: Any) -> PolarsDataType:
-        """Parse non-Polars data types as Polars data types."""
-        if is_polars_dtype(dtype, include_unknown=True):
-            return dtype
-        else:
-            return parse_into_dtype(dtype)
-
-    def _parse_schema_overrides(
-        schema_overrides: SchemaDict | None = None,
-    ) -> dict[str, PolarsDataType]:
-        """Parse schema overrides as a dictionary of name to Polars data type."""
-        if schema_overrides is None:
-            return {}
-
-        return {
-            name: _normalize_dtype(dtype) for name, dtype in schema_overrides.items()
-        }
-
-    schema_overrides = _parse_schema_overrides(schema_overrides)
-
-    # fast path for empty schema
-    if not schema:
-        columns = (
-            [f"column_{i}" for i in range(n_expected)] if n_expected is not None else []
-        )
-        return columns, schema_overrides
-
-    # determine column names from schema
-    if isinstance(schema, Mapping):
-        column_names: list[str] = list(schema)
-        schema = list(schema.items())
-    else:
-        column_names = []
-        for i, col in enumerate(schema):
-            if isinstance(col, str):
-                unnamed = not col and col not in schema_overrides
-                col = f"column_{i}" if unnamed else col
-            else:
-                col = col[0]
-            column_names.append(col)
-
-    if n_expected is not None and len(column_names) != n_expected:
-        msg = "data does not match the number of columns"
-        raise ShapeError(msg)
-
-    # determine column dtypes from schema and lookup_names
-    lookup: dict[str, str] | None = (
-        {
-            col: name
-            for col, name in zip_longest(column_names, lookup_names)
-            if name is not None
-        }
-        if lookup_names
-        else None
-    )
-
-    column_dtypes: dict[str, PolarsDataType] = {}
-    for col in schema:
-        if isinstance(col, str):
-            continue
-
-        name, dtype = col
-        if dtype is None:
-            continue
-        else:
-            dtype = _normalize_dtype(dtype)
-            name = lookup.get(name, name) if lookup else name
-        column_dtypes[name] = dtype  # type: ignore[assignment]
-
-    # apply schema overrides
-    if schema_overrides:
-        column_dtypes.update(schema_overrides)
-
-    return column_names, column_dtypes
-
-
-def _handle_columns_arg(
-    data: list[PySeries],
-    columns: Sequence[str] | None = None,
-    *,
-    from_dict: bool = False,
-) -> list[PySeries]:
-    """Rename data according to columns argument."""
-    if columns is None:
-        return data
-    elif not data:
-        return [pl.Series(name=c)._s for c in columns]
-    elif len(data) != len(columns):
-        msg = f"dimensions of columns arg ({len(columns)}) must match data dimensions ({len(data)})"
-        raise ValueError(msg)
-
-    if from_dict:
-        series_map = {s.name(): s for s in data}
-        if all((col in series_map) for col in columns):
-            return [series_map[col] for col in columns]
-
-    for i, c in enumerate(columns):
-        if c != data[i].name():
-            data[i] = data[i].clone()
-            data[i].rename(c)
-
-    return data
294
-
295
-
296
- def _post_apply_columns(
297
- pydf: PyDataFrame,
298
- columns: SchemaDefinition | None,
299
- structs: dict[str, Struct] | None = None,
300
- schema_overrides: SchemaDict | None = None,
301
- *,
302
- strict: bool = True,
303
- ) -> PyDataFrame:
304
- """Apply 'columns' param *after* PyDataFrame creation (if no alternative)."""
305
- pydf_columns, pydf_dtypes = pydf.columns(), pydf.dtypes()
306
- columns, dtypes = _unpack_schema(
307
- (columns or pydf_columns), schema_overrides=schema_overrides
308
- )
309
- column_subset: list[str] = []
310
- if columns != pydf_columns:
311
- if len(columns) < len(pydf_columns) and columns == pydf_columns[: len(columns)]:
312
- column_subset = columns
313
- else:
314
- pydf.set_column_names(columns)
315
-
316
- column_casts = []
317
- for i, col in enumerate(columns):
318
- dtype = dtypes.get(col)
319
- pydf_dtype = pydf_dtypes[i]
320
- if dtype == Categorical != pydf_dtype:
321
- column_casts.append(F.col(col).cast(Categorical, strict=strict)._pyexpr)
322
- elif dtype == Enum != pydf_dtype:
323
- column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
324
- elif structs and (struct := structs.get(col)) and struct != pydf_dtype:
325
- column_casts.append(F.col(col).cast(struct, strict=strict)._pyexpr)
326
- elif dtype is not None and dtype != Unknown and dtype != pydf_dtype:
327
- column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
328
-
329
- if column_casts or column_subset:
330
- pyldf = pydf.lazy()
331
- if column_casts:
332
- pyldf = pyldf.with_columns(column_casts)
333
- if column_subset:
334
- pyldf = pyldf.select([F.col(col)._pyexpr for col in column_subset])
335
- pydf = pyldf.collect(engine="in-memory", lambda_post_opt=None)
336
-
337
- return pydf
338
-
339
-
340
- def _expand_dict_values(
341
- data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
342
- *,
343
- schema_overrides: SchemaDict | None = None,
344
- strict: bool = True,
345
- order: Sequence[str] | None = None,
346
- nan_to_null: bool = False,
347
- ) -> dict[str, Series]:
348
- """Expand any scalar values in dict data (propagate literal as array)."""
349
- updated_data = {}
350
- if data:
351
- if any(isinstance(val, pl.Expr) for val in data.values()):
352
- msg = (
353
- "passing Expr objects to the DataFrame constructor is not supported"
354
- "\n\nHint: Try evaluating the expression first using `select`,"
355
- " or if you meant to create an Object column containing expressions,"
356
- " pass a list of Expr objects instead."
357
- )
358
- raise TypeError(msg)
359
-
360
- dtypes = schema_overrides or {}
361
- data = _expand_dict_data(data, dtypes, strict=strict)
362
- array_len = max((arrlen(val) or 0) for val in data.values())
363
- if array_len > 0:
364
- for name, val in data.items():
365
- dtype = dtypes.get(name)
366
- if isinstance(val, dict) and dtype != Struct:
367
- vdf = pl.DataFrame(val, strict=strict)
368
- if (
369
- vdf.height == 1
370
- and array_len > 1
371
- and all(not d.is_nested() for d in vdf.schema.values())
372
- ):
373
- s_vals = {
374
- nm: vdf[nm].extend_constant(v, n=(array_len - 1))
375
- for nm, v in val.items()
376
- }
377
- st = pl.DataFrame(s_vals).to_struct(name)
378
- else:
379
- st = vdf.to_struct(name)
380
- updated_data[name] = st
381
-
382
- elif isinstance(val, pl.Series):
383
- s = val.rename(name) if name != val.name else val
384
- if dtype and dtype != s.dtype:
385
- s = s.cast(dtype, strict=strict)
386
- updated_data[name] = s
387
-
388
- elif arrlen(val) is not None or _is_generator(val):
389
- updated_data[name] = pl.Series(
390
- name=name,
391
- values=val,
392
- dtype=dtype,
393
- strict=strict,
394
- nan_to_null=nan_to_null,
395
- )
396
- elif val is None or isinstance( # type: ignore[redundant-expr]
397
- val, (int, float, str, bool, date, datetime, time, timedelta)
398
- ):
399
- updated_data[name] = F.repeat(
400
- val, array_len, dtype=dtype, eager=True
401
- ).alias(name)
402
- else:
403
- updated_data[name] = pl.Series(
404
- name=name, values=[val] * array_len, dtype=dtype, strict=strict
405
- )
406
-
407
- elif all((arrlen(val) == 0) for val in data.values()):
408
- for name, val in data.items():
409
- updated_data[name] = pl.Series(
410
- name, values=val, dtype=dtypes.get(name), strict=strict
411
- )
412
-
413
- elif all((arrlen(val) is None) for val in data.values()):
414
- for name, val in data.items():
415
- updated_data[name] = pl.Series(
416
- name,
417
- values=(val if _is_generator(val) else [val]),
418
- dtype=dtypes.get(name),
419
- strict=strict,
420
- )
421
- if order and list(updated_data) != order:
422
- return {col: updated_data.pop(col) for col in order}
423
- return updated_data
424
-
425
-
426
- def _expand_dict_data(
427
- data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
428
- dtypes: SchemaDict,
429
- *,
430
- strict: bool = True,
431
- ) -> Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series]:
432
- """
433
- Expand any unsized generators/iterators.
434
-
435
- (Note that `range` is sized, and will take a fast-path on Series init).
436
- """
437
- expanded_data = {}
438
- for name, val in data.items():
439
- expanded_data[name] = (
440
- pl.Series(name, val, dtypes.get(name), strict=strict)
441
- if _is_generator(val)
442
- else val
443
- )
444
- return expanded_data
445
-
446
-
447
- def sequence_to_pydf(
448
- data: Sequence[Any],
449
- schema: SchemaDefinition | None = None,
450
- *,
451
- schema_overrides: SchemaDict | None = None,
452
- strict: bool = True,
453
- orient: Orientation | None = None,
454
- infer_schema_length: int | None = N_INFER_DEFAULT,
455
- nan_to_null: bool = False,
456
- ) -> PyDataFrame:
457
- """Construct a PyDataFrame from a sequence."""
458
- if not data:
459
- return dict_to_pydf({}, schema=schema, schema_overrides=schema_overrides)
460
-
461
- return _sequence_to_pydf_dispatcher(
462
- get_first_non_none(data),
463
- data=data,
464
- schema=schema,
465
- schema_overrides=schema_overrides,
466
- strict=strict,
467
- orient=orient,
468
- infer_schema_length=infer_schema_length,
469
- nan_to_null=nan_to_null,
470
- )
471
-
472
-
473
- @singledispatch
474
- def _sequence_to_pydf_dispatcher(
475
- first_element: Any,
476
- data: Sequence[Any],
477
- schema: SchemaDefinition | None,
478
- *,
479
- schema_overrides: SchemaDict | None,
480
- strict: bool = True,
481
- orient: Orientation | None,
482
- infer_schema_length: int | None,
483
- nan_to_null: bool = False,
484
- ) -> PyDataFrame:
485
- # note: ONLY python-native data should participate in singledispatch registration
486
- # via top-level decorators, otherwise we have to import the associated module.
487
- # third-party libraries (such as numpy/pandas) should be identified inline (below)
488
- # and THEN registered for dispatch (here) so as not to break lazy-loading behaviour.
489
-
490
- common_params = {
491
- "data": data,
492
- "schema": schema,
493
- "schema_overrides": schema_overrides,
494
- "strict": strict,
495
- "orient": orient,
496
- "infer_schema_length": infer_schema_length,
497
- "nan_to_null": nan_to_null,
498
- }
499
- to_pydf: Callable[..., PyDataFrame]
500
- register_with_singledispatch = True
501
-
502
- if isinstance(first_element, Generator):
503
- to_pydf = _sequence_of_sequence_to_pydf
504
- data = [list(row) for row in data]
505
- first_element = data[0]
506
- register_with_singledispatch = False
507
-
508
- elif isinstance(first_element, pl.Series):
509
- to_pydf = _sequence_of_series_to_pydf
510
-
511
- elif _check_for_numpy(first_element) and isinstance(first_element, np.ndarray):
512
- to_pydf = _sequence_of_numpy_to_pydf
513
-
514
- elif _check_for_pandas(first_element) and isinstance(
515
- first_element, (pd.Series, pd.Index, pd.DatetimeIndex)
516
- ):
517
- to_pydf = _sequence_of_pandas_to_pydf
518
-
519
- elif dataclasses.is_dataclass(first_element):
520
- to_pydf = _sequence_of_dataclasses_to_pydf
521
-
522
- elif is_pydantic_model(first_element):
523
- to_pydf = _sequence_of_pydantic_models_to_pydf
524
-
525
- elif is_sqlalchemy_row(first_element):
526
- to_pydf = _sequence_of_tuple_to_pydf
527
-
528
- elif isinstance(first_element, Sequence) and not isinstance(first_element, str):
529
- to_pydf = _sequence_of_sequence_to_pydf
530
- else:
531
- to_pydf = _sequence_of_elements_to_pydf
532
-
533
- if register_with_singledispatch:
534
- _sequence_to_pydf_dispatcher.register(type(first_element), to_pydf)
535
-
536
- common_params["first_element"] = first_element
537
- return to_pydf(**common_params)
538
-
539
-
540
- @_sequence_to_pydf_dispatcher.register(list)
541
- def _sequence_of_sequence_to_pydf(
542
- first_element: Sequence[Any] | np.ndarray[Any, Any],
543
- data: Sequence[Any],
544
- schema: SchemaDefinition | None,
545
- *,
546
- schema_overrides: SchemaDict | None,
547
- strict: bool,
548
- orient: Orientation | None,
549
- infer_schema_length: int | None,
550
- nan_to_null: bool = False,
551
- ) -> PyDataFrame:
552
- if orient is None:
553
- if schema is None:
554
- orient = "col"
555
- else:
556
- # Try to infer orientation from schema length and data dimensions
557
- is_row_oriented = (len(schema) == len(first_element)) and (
558
- len(schema) != len(data)
559
- )
560
- orient = "row" if is_row_oriented else "col"
561
-
562
- if is_row_oriented:
563
- issue_warning(
564
- "Row orientation inferred during DataFrame construction."
565
- ' Explicitly specify the orientation by passing `orient="row"` to silence this warning.',
566
- DataOrientationWarning,
567
- )
568
-
569
- if orient == "row":
570
- column_names, schema_overrides = _unpack_schema(
571
- schema, schema_overrides=schema_overrides, n_expected=len(first_element)
572
- )
573
- local_schema_override = (
574
- _include_unknowns(schema_overrides, column_names)
575
- if schema_overrides
576
- else {}
577
- )
578
-
579
- unpack_nested = False
580
- for col, tp in local_schema_override.items():
581
- if tp in (Categorical, Enum):
582
- local_schema_override[col] = String
583
- elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
584
- unpack_nested = contains_nested(
585
- getattr(first_element, col, None).__class__, is_namedtuple
586
- )
587
-
588
- if unpack_nested:
589
- dicts = [nt_unpack(d) for d in data]
590
- pydf = PyDataFrame.from_dicts(
591
- dicts,
592
- schema=None,
593
- schema_overrides=None,
594
- strict=strict,
595
- infer_schema_length=infer_schema_length,
596
- )
597
- else:
598
- pydf = PyDataFrame.from_rows(
599
- data,
600
- schema=local_schema_override or None,
601
- infer_schema_length=infer_schema_length,
602
- )
603
- if column_names or schema_overrides:
604
- pydf = _post_apply_columns(
605
- pydf, column_names, schema_overrides=schema_overrides, strict=strict
606
- )
607
- return pydf
608
-
609
- elif orient == "col":
610
- column_names, schema_overrides = _unpack_schema(
611
- schema, schema_overrides=schema_overrides, n_expected=len(data)
612
- )
613
- data_series: list[PySeries] = [
614
- pl.Series(
615
- column_names[i],
616
- element,
617
- dtype=schema_overrides.get(column_names[i]),
618
- strict=strict,
619
- nan_to_null=nan_to_null,
620
- )._s
621
- for i, element in enumerate(data)
622
- ]
623
- return PyDataFrame(data_series)
624
-
625
- else:
626
- msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
627
- raise ValueError(msg)
628
-
629
-
630
- def _sequence_of_series_to_pydf(
631
- first_element: Series,
632
- data: Sequence[Any],
633
- schema: SchemaDefinition | None,
634
- *,
635
- schema_overrides: SchemaDict | None,
636
- strict: bool,
637
- **kwargs: Any,
638
- ) -> PyDataFrame:
639
- series_names = [s.name for s in data]
640
- column_names, schema_overrides = _unpack_schema(
641
- schema or series_names,
642
- schema_overrides=schema_overrides,
643
- n_expected=len(data),
644
- )
645
- data_series: list[PySeries] = []
646
- for i, s in enumerate(data):
647
- if not s.name:
648
- s = s.alias(column_names[i])
649
- new_dtype = schema_overrides.get(column_names[i])
650
- if new_dtype and new_dtype != s.dtype:
651
- s = s.cast(new_dtype, strict=strict, wrap_numerical=False)
652
- data_series.append(s._s)
653
-
654
- data_series = _handle_columns_arg(data_series, columns=column_names)
655
- return PyDataFrame(data_series)
656
-
657
-
658
- @_sequence_to_pydf_dispatcher.register(tuple)
659
- def _sequence_of_tuple_to_pydf(
660
- first_element: tuple[Any, ...],
661
- data: Sequence[Any],
662
- schema: SchemaDefinition | None,
663
- *,
664
- schema_overrides: SchemaDict | None,
665
- strict: bool,
666
- orient: Orientation | None,
667
- infer_schema_length: int | None,
668
- nan_to_null: bool = False,
669
- ) -> PyDataFrame:
670
- # infer additional meta information if namedtuple
671
- if is_namedtuple(first_element.__class__) or is_sqlalchemy_row(first_element):
672
- if schema is None:
673
- schema = first_element._fields # type: ignore[attr-defined]
674
- annotations = getattr(first_element, "__annotations__", None)
675
- if annotations and len(annotations) == len(schema):
676
- schema = [
677
- (name, try_parse_into_dtype(tp))
678
- for name, tp in first_element.__annotations__.items()
679
- ]
680
- if orient is None:
681
- orient = "row"
682
-
683
- # ...then defer to generic sequence processing
684
- return _sequence_of_sequence_to_pydf(
685
- first_element,
686
- data=data,
687
- schema=schema,
688
- schema_overrides=schema_overrides,
689
- strict=strict,
690
- orient=orient,
691
- infer_schema_length=infer_schema_length,
692
- nan_to_null=nan_to_null,
693
- )
694
-
695
-
696
- @_sequence_to_pydf_dispatcher.register(Mapping)
697
- @_sequence_to_pydf_dispatcher.register(dict)
698
- def _sequence_of_dict_to_pydf(
699
- first_element: dict[str, Any],
700
- data: Sequence[Any],
701
- schema: SchemaDefinition | None,
702
- *,
703
- schema_overrides: SchemaDict | None,
704
- strict: bool,
705
- infer_schema_length: int | None,
706
- **kwargs: Any,
707
- ) -> PyDataFrame:
708
- column_names, schema_overrides = _unpack_schema(
709
- schema, schema_overrides=schema_overrides
710
- )
711
- dicts_schema = (
712
- _include_unknowns(schema_overrides, column_names or list(schema_overrides))
713
- if column_names
714
- else None
715
- )
716
-
717
- pydf = PyDataFrame.from_dicts(
718
- data,
719
- dicts_schema,
720
- schema_overrides,
721
- strict=strict,
722
- infer_schema_length=infer_schema_length,
723
- )
724
-
725
- # TODO: we can remove this `schema_overrides` block completely
726
- # once https://github.com/pola-rs/polars/issues/11044 is fixed
727
- if schema_overrides:
728
- pydf = _post_apply_columns(
729
- pydf, columns=column_names, schema_overrides=schema_overrides, strict=strict
730
- )
731
- return pydf
732
-
733
-
734
- @_sequence_to_pydf_dispatcher.register(str)
735
- def _sequence_of_elements_to_pydf(
736
- first_element: Any,
737
- data: Sequence[Any],
738
- schema: SchemaDefinition | None,
739
- schema_overrides: SchemaDict | None,
740
- *,
741
- strict: bool,
742
- **kwargs: Any,
743
- ) -> PyDataFrame:
744
- column_names, schema_overrides = _unpack_schema(
745
- schema, schema_overrides=schema_overrides, n_expected=1
746
- )
747
- data_series: list[PySeries] = [
748
- pl.Series(
749
- column_names[0],
750
- data,
751
- schema_overrides.get(column_names[0]),
752
- strict=strict,
753
- )._s
754
- ]
755
- data_series = _handle_columns_arg(data_series, columns=column_names)
756
- return PyDataFrame(data_series)
757
-
758
-
759
- def _sequence_of_numpy_to_pydf(
760
- first_element: np.ndarray[Any, Any],
761
- **kwargs: Any,
762
- ) -> PyDataFrame:
763
- if first_element.ndim == 1:
764
- return _sequence_of_sequence_to_pydf(first_element, **kwargs)
765
- else:
766
- return _sequence_of_elements_to_pydf(first_element, **kwargs)
767
-
768
-
769
- def _sequence_of_pandas_to_pydf(
770
- first_element: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex,
771
- data: Sequence[Any],
772
- schema: SchemaDefinition | None,
773
- schema_overrides: SchemaDict | None,
774
- *,
775
- strict: bool,
776
- **kwargs: Any,
777
- ) -> PyDataFrame:
778
- if schema is None:
779
- column_names: list[str] = []
780
- else:
781
- column_names, schema_overrides = _unpack_schema(
782
- schema, schema_overrides=schema_overrides, n_expected=1
783
- )
784
-
785
- schema_overrides = schema_overrides or {}
786
- data_series: list[PySeries] = []
787
- for i, s in enumerate(data):
788
- name = column_names[i] if column_names else s.name
789
- pyseries = plc.pandas_to_pyseries(name=name, values=s)
790
- dtype = schema_overrides.get(name)
791
- if dtype is not None and dtype != pyseries.dtype():
792
- pyseries = pyseries.cast(dtype, strict=strict, wrap_numerical=False)
793
- data_series.append(pyseries)
794
-
795
- return PyDataFrame(data_series)
796
-
797
-
798
- def _sequence_of_dataclasses_to_pydf(
799
- first_element: Any,
800
- data: Sequence[Any],
801
- schema: SchemaDefinition | None,
802
- schema_overrides: SchemaDict | None,
803
- infer_schema_length: int | None,
804
- *,
805
- strict: bool = True,
806
- **kwargs: Any,
807
- ) -> PyDataFrame:
808
- """Initialize DataFrame from Python dataclasses."""
809
- from dataclasses import asdict, astuple
810
-
811
- (
812
- unpack_nested,
813
- column_names,
814
- schema_overrides,
815
- overrides,
816
- ) = _establish_dataclass_or_model_schema(
817
- first_element, schema, schema_overrides, model_fields=None
818
- )
819
- if unpack_nested:
820
- dicts = [asdict(md) for md in data]
821
- pydf = PyDataFrame.from_dicts(
822
- dicts,
823
- schema=None,
824
- schema_overrides=None,
825
- strict=strict,
826
- infer_schema_length=infer_schema_length,
827
- )
828
- else:
829
- rows = [astuple(dc) for dc in data]
830
- pydf = PyDataFrame.from_rows(
831
- rows, # type: ignore[arg-type]
832
- schema=overrides or None,
833
- infer_schema_length=infer_schema_length,
834
- )
835
-
836
- if overrides:
837
- structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
838
- pydf = _post_apply_columns(
839
- pydf, column_names, structs, schema_overrides, strict=strict
840
- )
841
-
842
- return pydf
843
-
844
-
845
- def _sequence_of_pydantic_models_to_pydf(
846
- first_element: Any,
847
- data: Sequence[Any],
848
- schema: SchemaDefinition | None,
849
- schema_overrides: SchemaDict | None,
850
- infer_schema_length: int | None,
851
- *,
852
- strict: bool,
853
- **kwargs: Any,
854
- ) -> PyDataFrame:
855
- """Initialise DataFrame from pydantic model objects."""
856
- import pydantic # note: must already be available in the env here
857
-
858
- old_pydantic = parse_version(pydantic.__version__) < (2, 0)
859
- model_fields = list(
860
- first_element.__fields__
861
- if old_pydantic
862
- else first_element.__class__.model_fields
863
- )
864
- (
865
- unpack_nested,
866
- column_names,
867
- schema_overrides,
868
- overrides,
869
- ) = _establish_dataclass_or_model_schema(
870
- first_element, schema, schema_overrides, model_fields
871
- )
872
- if unpack_nested:
873
- # note: this is an *extremely* slow path, due to the requirement to
874
- # use pydantic's 'dict()' method to properly unpack nested models
875
- dicts = (
876
- [md.dict() for md in data]
877
- if old_pydantic
878
- else [md.model_dump(mode="python") for md in data]
879
- )
880
- pydf = PyDataFrame.from_dicts(
881
- dicts,
882
- schema=None,
883
- schema_overrides=None,
884
- strict=strict,
885
- infer_schema_length=infer_schema_length,
886
- )
887
-
888
- elif len(model_fields) > 50:
889
- # 'from_rows' is the faster codepath for models with a lot of fields...
890
- get_values = itemgetter(*model_fields)
891
- rows = [get_values(md.__dict__) for md in data]
892
- pydf = PyDataFrame.from_rows(
893
- rows, schema=overrides, infer_schema_length=infer_schema_length
894
- )
895
- else:
896
- # ...and 'from_dicts' is faster otherwise
897
- dicts = [md.__dict__ for md in data]
898
- pydf = PyDataFrame.from_dicts(
899
- dicts,
900
- schema=overrides,
901
- schema_overrides=None,
902
- strict=strict,
903
- infer_schema_length=infer_schema_length,
904
- )
905
-
906
- if overrides:
907
- structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
908
- pydf = _post_apply_columns(
909
- pydf, column_names, structs, schema_overrides, strict=strict
910
- )
911
-
912
- return pydf
913
-
914
-
915
- def _establish_dataclass_or_model_schema(
916
- first_element: Any,
917
- schema: SchemaDefinition | None,
918
- schema_overrides: SchemaDict | None,
919
- model_fields: list[str] | None,
920
- ) -> tuple[bool, list[str], SchemaDict, SchemaDict]:
921
- """Shared utility code for establishing dataclasses/pydantic model cols/schema."""
922
- from dataclasses import asdict
923
-
924
- unpack_nested = False
925
- if schema:
926
- column_names, schema_overrides = _unpack_schema(
927
- schema, schema_overrides=schema_overrides
928
- )
929
- overrides = {col: schema_overrides.get(col, Unknown) for col in column_names}
930
- else:
931
- column_names = []
932
- overrides = {
933
- col: (try_parse_into_dtype(tp) or Unknown)
934
- for col, tp in try_get_type_hints(first_element.__class__).items()
935
- if ((col in model_fields) if model_fields else (col != "__slots__"))
936
- }
937
- if schema_overrides:
938
- overrides.update(schema_overrides)
939
- elif not model_fields:
940
- dc_fields = set(asdict(first_element))
941
- schema_overrides = overrides = {
942
- nm: tp for nm, tp in overrides.items() if nm in dc_fields
943
- }
944
- else:
945
- schema_overrides = overrides
946
-
947
- for col, tp in overrides.items():
948
- if tp in (Categorical, Enum):
949
- overrides[col] = String
950
- elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
951
- unpack_nested = contains_nested(
952
- getattr(first_element, col, None),
953
- is_pydantic_model if model_fields else dataclasses.is_dataclass, # type: ignore[arg-type]
954
- )
955
-
956
- if model_fields and len(model_fields) == len(overrides):
957
- overrides = dict(zip(model_fields, overrides.values()))
958
-
959
- return unpack_nested, column_names, schema_overrides, overrides
960
-
961
-
962
- def _include_unknowns(
963
- schema: SchemaDict, cols: Sequence[str]
964
- ) -> MutableMapping[str, PolarsDataType]:
965
- """Complete partial schema dict by including Unknown type."""
966
- return {
967
- col: (schema.get(col, Unknown) or Unknown) # type: ignore[truthy-bool]
968
- for col in cols
969
- }
970
-
971
-
972
- def iterable_to_pydf(
973
- data: Iterable[Any],
974
- schema: SchemaDefinition | None = None,
975
- *,
976
- schema_overrides: SchemaDict | None = None,
977
- strict: bool = True,
978
- orient: Orientation | None = None,
979
- chunk_size: int | None = None,
980
- infer_schema_length: int | None = N_INFER_DEFAULT,
981
- rechunk: bool = True,
982
- ) -> PyDataFrame:
983
- """Construct a PyDataFrame from an iterable/generator."""
984
- original_schema = schema
985
- column_names: list[str] = []
986
- dtypes_by_idx: dict[int, PolarsDataType] = {}
987
- if schema is not None:
988
- column_names, schema_overrides = _unpack_schema(
989
- schema, schema_overrides=schema_overrides
990
- )
991
- elif schema_overrides:
992
- _, schema_overrides = _unpack_schema(schema, schema_overrides=schema_overrides)
993
-
994
- if not isinstance(data, Generator):
995
- data = iter(data)
996
-
997
- if orient == "col":
998
- if column_names and schema_overrides:
999
- dtypes_by_idx = {
1000
- idx: schema_overrides.get(col, Unknown)
1001
- for idx, col in enumerate(column_names)
1002
- }
1003
-
1004
- return pl.DataFrame(
1005
- {
1006
- (column_names[idx] if column_names else f"column_{idx}"): pl.Series(
1007
- coldata,
1008
- dtype=dtypes_by_idx.get(idx),
1009
- strict=strict,
1010
- )
1011
- for idx, coldata in enumerate(data)
1012
- },
1013
- )._df
1014
-
1015
- def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFrame:
1016
- return pl.DataFrame(
1017
- data=values,
1018
- schema=schema,
1019
- strict=strict,
1020
- orient="row",
1021
- infer_schema_length=infer_schema_length,
1022
- )
1023
-
1024
- n_chunks = 0
1025
- n_chunk_elems = 1_000_000
1026
-
1027
- if chunk_size:
1028
- adaptive_chunk_size = chunk_size
1029
- elif column_names:
1030
- adaptive_chunk_size = n_chunk_elems // len(column_names)
1031
- else:
1032
- adaptive_chunk_size = None
1033
-
1034
- df: DataFrame = None # type: ignore[assignment]
1035
- chunk_size = (
1036
- None
1037
- if infer_schema_length is None
1038
- else max(infer_schema_length, adaptive_chunk_size or 1000)
1039
- )
1040
- while True:
1041
- values = list(islice(data, chunk_size))
1042
- if not values:
1043
- break
1044
- frame_chunk = to_frame_chunk(values, original_schema)
1045
- if df is None:
1046
- df = frame_chunk
1047
- if not original_schema:
1048
- original_schema = list(df.schema.items())
1049
- if chunk_size != adaptive_chunk_size:
1050
- if (n_columns := df.width) > 0:
1051
- chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns
1052
- else:
1053
- df.vstack(frame_chunk, in_place=True)
1054
- n_chunks += 1
1055
-
1056
- if df is None:
1057
- df = to_frame_chunk([], original_schema)
1058
-
1059
- if n_chunks > 0 and rechunk:
1060
- df = df.rechunk()
1061
-
1062
- return df._df
1063
-
1064
-
1065
- def _check_pandas_columns(data: pd.DataFrame, *, include_index: bool) -> None:
1066
- """Check pandas dataframe columns can be converted to polars."""
1067
- stringified_cols: set[str] = {str(col) for col in data.columns}
1068
- stringified_index: set[str] = (
1069
- {str(idx) for idx in data.index.names} if include_index else set()
1070
- )
1071
-
1072
- non_unique_cols: bool = len(stringified_cols) < len(data.columns)
1073
- non_unique_indices: bool = (
1074
- (len(stringified_index) < len(data.index.names)) if include_index else False
1075
- )
1076
- if non_unique_cols or non_unique_indices:
1077
- msg = (
1078
- "Pandas dataframe contains non-unique indices and/or column names. "
1079
- "Polars dataframes require unique string names for columns."
1080
- )
1081
- raise ValueError(msg)
1082
-
1083
- overlapping_cols_and_indices: set[str] = stringified_cols & stringified_index
1084
- if len(overlapping_cols_and_indices) > 0:
1085
- msg = "Pandas indices and column names must not overlap."
1086
- raise ValueError(msg)
1087
-
1088
-
1089
- def pandas_to_pydf(
1090
- data: pd.DataFrame,
1091
- schema: SchemaDefinition | None = None,
1092
- *,
1093
- schema_overrides: SchemaDict | None = None,
1094
- strict: bool = True,
1095
- rechunk: bool = True,
1096
- nan_to_null: bool = True,
1097
- include_index: bool = False,
1098
- ) -> PyDataFrame:
1099
- """Construct a PyDataFrame from a pandas DataFrame."""
1100
- _check_pandas_columns(data, include_index=include_index)
1101
-
1102
- convert_index = include_index and not _pandas_has_default_index(data)
1103
- if not convert_index and all(
1104
- is_simple_numpy_backed_pandas_series(data[col]) for col in data.columns
1105
- ):
1106
- # Convert via NumPy directly, no PyArrow needed.
1107
- return pl.DataFrame(
1108
- {str(col): data[col].to_numpy() for col in data.columns},
1109
- schema=schema,
1110
- strict=strict,
1111
- schema_overrides=schema_overrides,
1112
- nan_to_null=nan_to_null,
1113
- )._df
1114
-
1115
- if not _PYARROW_AVAILABLE:
1116
- msg = (
1117
- "pyarrow is required for converting a pandas dataframe to Polars, "
1118
- "unless each of its columns is a simple numpy-backed one "
1119
- "(e.g. 'int64', 'bool', 'float32' - not 'Int64')"
1120
- )
1121
- raise ImportError(msg)
1122
- arrow_dict = {}
1123
- length = data.shape[0]
1124
-
1125
- if convert_index:
1126
- for idxcol in data.index.names:
1127
- arrow_dict[str(idxcol)] = plc.pandas_series_to_arrow(
1128
- # get_level_values accepts `int | str`
1129
- # but `index.names` returns `Hashable`
1130
- data.index.get_level_values(idxcol), # type: ignore[arg-type, unused-ignore]
1131
- nan_to_null=nan_to_null,
1132
- length=length,
1133
- )
1134
-
1135
- for col_idx, col_data in data.items():
1136
- arrow_dict[str(col_idx)] = plc.pandas_series_to_arrow(
1137
- col_data, nan_to_null=nan_to_null, length=length
1138
- )
1139
-
1140
- arrow_table = pa.table(arrow_dict)
1141
- return arrow_to_pydf(
1142
- arrow_table,
1143
- schema=schema,
1144
- schema_overrides=schema_overrides,
1145
- strict=strict,
1146
- rechunk=rechunk,
1147
- )
1148
-
1149
-
1150
- def _pandas_has_default_index(df: pd.DataFrame) -> bool:
1151
- """Identify if the pandas frame only has a default (or equivalent) index."""
1152
- from pandas.core.indexes.range import RangeIndex
1153
-
1154
- index_cols = df.index.names
1155
-
1156
- if len(index_cols) > 1 or index_cols not in ([None], [""]):
1157
- # not default: more than one index, or index is named
1158
- return False
1159
- elif df.index.equals(RangeIndex(start=0, stop=len(df), step=1)):
1160
- # is default: simple range index
1161
- return True
1162
- else:
1163
- # finally, is the index _equivalent_ to a default unnamed
1164
- # integer index with frame data that was previously sorted
1165
- return (
1166
- str(df.index.dtype).startswith("int")
1167
- and (df.index.sort_values() == np.arange(len(df))).all()
1168
- )
1169
-
1170
-
1171
- def arrow_to_pydf(
1172
- data: pa.Table | pa.RecordBatch,
1173
- schema: SchemaDefinition | None = None,
1174
- *,
1175
- schema_overrides: SchemaDict | None = None,
1176
- strict: bool = True,
1177
- rechunk: bool = True,
1178
- ) -> PyDataFrame:
1179
- """Construct a PyDataFrame from an Arrow Table or RecordBatch."""
1180
- column_names, schema_overrides = _unpack_schema(
1181
- (schema or data.schema.names), schema_overrides=schema_overrides
1182
- )
1183
- try:
1184
- if column_names != data.schema.names:
1185
- data = data.rename_columns(column_names)
1186
- except pa.ArrowInvalid as e:
1187
- msg = "dimensions of columns arg must match data dimensions"
1188
- raise ValueError(msg) from e
1189
-
1190
- batches: list[pa.RecordBatch]
1191
- if isinstance(data, pa.RecordBatch):
1192
- batches = [data]
1193
- else:
1194
- batches = data.to_batches()
1195
-
1196
- # supply the arrow schema so the metadata is intact
1197
- pydf = PyDataFrame.from_arrow_record_batches(batches, data.schema)
1198
-
1199
- if rechunk:
1200
- pydf = pydf.rechunk()
1201
-
1202
- if schema_overrides is not None:
1203
- pydf = _post_apply_columns(
1204
- pydf,
1205
- column_names,
1206
- schema_overrides=schema_overrides,
1207
- strict=strict,
1208
- )
1209
-
1210
- return pydf
1211
-
1212
-
1213
- def numpy_to_pydf(
1214
- data: np.ndarray[Any, Any],
1215
- schema: SchemaDefinition | None = None,
1216
- *,
1217
- schema_overrides: SchemaDict | None = None,
1218
- orient: Orientation | None = None,
1219
- strict: bool = True,
1220
- nan_to_null: bool = False,
1221
- ) -> PyDataFrame:
1222
- """Construct a PyDataFrame from a NumPy ndarray (including structured ndarrays)."""
1223
- shape = data.shape
1224
- two_d = len(shape) == 2
1225
-
1226
- if data.dtype.names is not None:
1227
- structured_array, orient = True, "col"
1228
- record_names = list(data.dtype.names)
1229
- n_columns = len(record_names)
1230
- for nm in record_names:
1231
- shape = data[nm].shape
1232
- if not schema:
1233
- schema = record_names
1234
- else:
1235
- # Unpack columns
1236
- structured_array, record_names = False, []
1237
- if shape == (0,):
1238
- n_columns = 0
1239
-
1240
- elif len(shape) == 1:
1241
- n_columns = 1
1242
-
1243
- elif len(shape) == 2:
1244
- if orient is None and schema is None:
1245
- # default convention; first axis is rows, second axis is columns
1246
- n_columns = shape[1]
1247
- orient = "row"
1248
-
1249
- elif orient is None and schema is not None:
1250
- # infer orientation from 'schema' param; if square array
1251
- # we check the flags to establish row/column major order
1252
- n_schema_cols = len(schema)
1253
- if n_schema_cols == shape[0] and n_schema_cols != shape[1]:
1254
- orient = "col"
1255
- n_columns = shape[0]
1256
- elif data.flags["F_CONTIGUOUS"] and shape[0] == shape[1]:
1257
- orient = "col"
1258
- n_columns = n_schema_cols
1259
- else:
1260
- orient = "row"
1261
- n_columns = shape[1]
1262
-
1263
- elif orient == "row":
1264
- n_columns = shape[1]
1265
- elif orient == "col":
1266
- n_columns = shape[0]
1267
- else:
1268
- msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
1269
- raise ValueError(msg)
1270
- else:
1271
- if shape == ():
1272
- msg = "cannot create DataFrame from zero-dimensional array"
1273
- else:
1274
- msg = f"cannot create DataFrame from array with more than two dimensions; shape = {shape}"
1275
- raise ValueError(msg)
1276
-
1277
- if schema is not None and len(schema) != n_columns:
1278
- if (n_schema_cols := len(schema)) != 1:
1279
- msg = f"dimensions of `schema` ({n_schema_cols}) must match data dimensions ({n_columns})"
1280
- raise ValueError(msg)
1281
- n_columns = n_schema_cols
1282
-
1283
- column_names, schema_overrides = _unpack_schema(
1284
- schema, schema_overrides=schema_overrides, n_expected=n_columns
1285
- )
1286
-
1287
- # Convert data to series
1288
- if structured_array:
1289
- data_series = [
1290
- pl.Series(
1291
- name=series_name,
1292
- values=data[record_name],
1293
- dtype=schema_overrides.get(record_name),
1294
- strict=strict,
1295
- nan_to_null=nan_to_null,
1296
- )._s
1297
- for series_name, record_name in zip(column_names, record_names)
1298
- ]
1299
- elif shape == (0,) and n_columns == 0:
1300
- data_series = []
1301
-
1302
- elif len(shape) == 1:
1303
- data_series = [
1304
- pl.Series(
1305
- name=column_names[0],
1306
- values=data,
1307
- dtype=schema_overrides.get(column_names[0]),
1308
- strict=strict,
1309
- nan_to_null=nan_to_null,
1310
- )._s
1311
- ]
1312
- else:
1313
- if orient == "row":
1314
- data_series = [
1315
- pl.Series(
1316
- name=column_names[i],
1317
- values=(
1318
- data
1319
- if two_d and n_columns == 1 and shape[1] > 1
1320
- else data[:, i]
1321
- ),
1322
- dtype=schema_overrides.get(column_names[i]),
1323
- strict=strict,
1324
- nan_to_null=nan_to_null,
1325
- )._s
1326
- for i in range(n_columns)
1327
- ]
1328
- else:
1329
- data_series = [
1330
- pl.Series(
1331
- name=column_names[i],
1332
- values=(
1333
- data if two_d and n_columns == 1 and shape[1] > 1 else data[i]
1334
- ),
1335
- dtype=schema_overrides.get(column_names[i]),
1336
- strict=strict,
1337
- nan_to_null=nan_to_null,
1338
- )._s
1339
- for i in range(n_columns)
1340
- ]
1341
-
1342
- data_series = _handle_columns_arg(data_series, columns=column_names)
1343
- return PyDataFrame(data_series)
1344
-
1345
-
1346
- def series_to_pydf(
1347
- data: Series,
1348
- schema: SchemaDefinition | None = None,
1349
- schema_overrides: SchemaDict | None = None,
1350
- *,
1351
- strict: bool = True,
1352
- ) -> PyDataFrame:
1353
- """Construct a PyDataFrame from a Polars Series."""
1354
- if schema is None and schema_overrides is None:
1355
- return PyDataFrame([data._s])
1356
-
1357
- data_series = [data._s]
1358
- series_name = [s.name() for s in data_series]
1359
- column_names, schema_overrides = _unpack_schema(
1360
- schema or series_name, schema_overrides=schema_overrides, n_expected=1
1361
- )
1362
- if schema_overrides:
1363
- new_dtype = next(iter(schema_overrides.values()))
1364
- if new_dtype != data.dtype:
1365
- data_series[0] = data_series[0].cast(
1366
- new_dtype, strict=strict, wrap_numerical=False
1367
- )
1368
-
1369
- data_series = _handle_columns_arg(data_series, columns=column_names)
1370
- return PyDataFrame(data_series)
1371
-
1372
-
1373
- def dataframe_to_pydf(
1374
- data: DataFrame,
1375
- schema: SchemaDefinition | None = None,
1376
- *,
1377
- schema_overrides: SchemaDict | None = None,
1378
- strict: bool = True,
1379
- ) -> PyDataFrame:
1380
- """Construct a PyDataFrame from an existing Polars DataFrame."""
1381
- if schema is None and schema_overrides is None:
1382
- return data._df.clone()
1383
-
1384
- data_series = {c.name: c._s for c in data}
1385
- column_names, schema_overrides = _unpack_schema(
1386
- schema or data.columns, schema_overrides=schema_overrides
1387
- )
1388
- if schema_overrides:
1389
- existing_schema = data.schema
1390
- for name, new_dtype in schema_overrides.items():
1391
- if new_dtype != existing_schema[name]:
1392
- data_series[name] = data_series[name].cast(
1393
- new_dtype, strict=strict, wrap_numerical=False
1394
- )
1395
-
1396
- series_cols = _handle_columns_arg(list(data_series.values()), columns=column_names)
1397
- return PyDataFrame(series_cols)
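
For context, the module removed above implements the private constructors behind polars' public DataFrame init paths. A minimal sketch of how those paths are normally reached through the public API (standard polars usage; the data and variable names are illustrative only, not taken from this package):

import polars as pl

# dict of columns (the dict_to_pydf path)
df_from_dict = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# row-oriented sequence of tuples; orient="row" makes the orientation explicit
# (the sequence_to_pydf / _sequence_of_sequence_to_pydf path)
df_from_rows = pl.DataFrame([(1, "x"), (2, "y")], schema=["a", "b"], orient="row")

# schema_overrides casts the named columns to the requested dtypes
df_typed = pl.DataFrame({"a": [1, 2]}, schema_overrides={"a": pl.Int8})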