polars-runtime-compat 1.34.0b2 (cp39-abi3-win_arm64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of polars-runtime-compat might be problematic.

Files changed (203)
  1. _polars_runtime_compat/.gitkeep +0 -0
  2. _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
  3. polars/__init__.py +528 -0
  4. polars/_cpu_check.py +265 -0
  5. polars/_dependencies.py +355 -0
  6. polars/_plr.py +99 -0
  7. polars/_plr.pyi +2496 -0
  8. polars/_reexport.py +23 -0
  9. polars/_typing.py +478 -0
  10. polars/_utils/__init__.py +37 -0
  11. polars/_utils/async_.py +102 -0
  12. polars/_utils/cache.py +176 -0
  13. polars/_utils/cloud.py +40 -0
  14. polars/_utils/constants.py +29 -0
  15. polars/_utils/construction/__init__.py +46 -0
  16. polars/_utils/construction/dataframe.py +1397 -0
  17. polars/_utils/construction/other.py +72 -0
  18. polars/_utils/construction/series.py +560 -0
  19. polars/_utils/construction/utils.py +118 -0
  20. polars/_utils/convert.py +224 -0
  21. polars/_utils/deprecation.py +406 -0
  22. polars/_utils/getitem.py +457 -0
  23. polars/_utils/logging.py +11 -0
  24. polars/_utils/nest_asyncio.py +264 -0
  25. polars/_utils/parquet.py +15 -0
  26. polars/_utils/parse/__init__.py +12 -0
  27. polars/_utils/parse/expr.py +242 -0
  28. polars/_utils/polars_version.py +19 -0
  29. polars/_utils/pycapsule.py +53 -0
  30. polars/_utils/scan.py +27 -0
  31. polars/_utils/serde.py +63 -0
  32. polars/_utils/slice.py +215 -0
  33. polars/_utils/udfs.py +1251 -0
  34. polars/_utils/unstable.py +63 -0
  35. polars/_utils/various.py +782 -0
  36. polars/_utils/wrap.py +25 -0
  37. polars/api.py +370 -0
  38. polars/catalog/__init__.py +0 -0
  39. polars/catalog/unity/__init__.py +19 -0
  40. polars/catalog/unity/client.py +733 -0
  41. polars/catalog/unity/models.py +152 -0
  42. polars/config.py +1571 -0
  43. polars/convert/__init__.py +25 -0
  44. polars/convert/general.py +1046 -0
  45. polars/convert/normalize.py +261 -0
  46. polars/dataframe/__init__.py +5 -0
  47. polars/dataframe/_html.py +186 -0
  48. polars/dataframe/frame.py +12582 -0
  49. polars/dataframe/group_by.py +1067 -0
  50. polars/dataframe/plotting.py +257 -0
  51. polars/datatype_expr/__init__.py +5 -0
  52. polars/datatype_expr/array.py +56 -0
  53. polars/datatype_expr/datatype_expr.py +304 -0
  54. polars/datatype_expr/list.py +18 -0
  55. polars/datatype_expr/struct.py +69 -0
  56. polars/datatypes/__init__.py +122 -0
  57. polars/datatypes/_parse.py +195 -0
  58. polars/datatypes/_utils.py +48 -0
  59. polars/datatypes/classes.py +1213 -0
  60. polars/datatypes/constants.py +11 -0
  61. polars/datatypes/constructor.py +172 -0
  62. polars/datatypes/convert.py +366 -0
  63. polars/datatypes/group.py +130 -0
  64. polars/exceptions.py +230 -0
  65. polars/expr/__init__.py +7 -0
  66. polars/expr/array.py +964 -0
  67. polars/expr/binary.py +346 -0
  68. polars/expr/categorical.py +306 -0
  69. polars/expr/datetime.py +2620 -0
  70. polars/expr/expr.py +11272 -0
  71. polars/expr/list.py +1408 -0
  72. polars/expr/meta.py +444 -0
  73. polars/expr/name.py +321 -0
  74. polars/expr/string.py +3045 -0
  75. polars/expr/struct.py +357 -0
  76. polars/expr/whenthen.py +185 -0
  77. polars/functions/__init__.py +193 -0
  78. polars/functions/aggregation/__init__.py +33 -0
  79. polars/functions/aggregation/horizontal.py +298 -0
  80. polars/functions/aggregation/vertical.py +341 -0
  81. polars/functions/as_datatype.py +848 -0
  82. polars/functions/business.py +138 -0
  83. polars/functions/col.py +384 -0
  84. polars/functions/datatype.py +121 -0
  85. polars/functions/eager.py +524 -0
  86. polars/functions/escape_regex.py +29 -0
  87. polars/functions/lazy.py +2751 -0
  88. polars/functions/len.py +68 -0
  89. polars/functions/lit.py +210 -0
  90. polars/functions/random.py +22 -0
  91. polars/functions/range/__init__.py +19 -0
  92. polars/functions/range/_utils.py +15 -0
  93. polars/functions/range/date_range.py +303 -0
  94. polars/functions/range/datetime_range.py +370 -0
  95. polars/functions/range/int_range.py +348 -0
  96. polars/functions/range/linear_space.py +311 -0
  97. polars/functions/range/time_range.py +287 -0
  98. polars/functions/repeat.py +301 -0
  99. polars/functions/whenthen.py +353 -0
  100. polars/interchange/__init__.py +10 -0
  101. polars/interchange/buffer.py +77 -0
  102. polars/interchange/column.py +190 -0
  103. polars/interchange/dataframe.py +230 -0
  104. polars/interchange/from_dataframe.py +328 -0
  105. polars/interchange/protocol.py +303 -0
  106. polars/interchange/utils.py +170 -0
  107. polars/io/__init__.py +64 -0
  108. polars/io/_utils.py +317 -0
  109. polars/io/avro.py +49 -0
  110. polars/io/clipboard.py +36 -0
  111. polars/io/cloud/__init__.py +17 -0
  112. polars/io/cloud/_utils.py +80 -0
  113. polars/io/cloud/credential_provider/__init__.py +17 -0
  114. polars/io/cloud/credential_provider/_builder.py +520 -0
  115. polars/io/cloud/credential_provider/_providers.py +618 -0
  116. polars/io/csv/__init__.py +9 -0
  117. polars/io/csv/_utils.py +38 -0
  118. polars/io/csv/batched_reader.py +142 -0
  119. polars/io/csv/functions.py +1495 -0
  120. polars/io/database/__init__.py +6 -0
  121. polars/io/database/_arrow_registry.py +70 -0
  122. polars/io/database/_cursor_proxies.py +147 -0
  123. polars/io/database/_executor.py +578 -0
  124. polars/io/database/_inference.py +314 -0
  125. polars/io/database/_utils.py +144 -0
  126. polars/io/database/functions.py +516 -0
  127. polars/io/delta.py +499 -0
  128. polars/io/iceberg/__init__.py +3 -0
  129. polars/io/iceberg/_utils.py +697 -0
  130. polars/io/iceberg/dataset.py +556 -0
  131. polars/io/iceberg/functions.py +151 -0
  132. polars/io/ipc/__init__.py +8 -0
  133. polars/io/ipc/functions.py +514 -0
  134. polars/io/json/__init__.py +3 -0
  135. polars/io/json/read.py +101 -0
  136. polars/io/ndjson.py +332 -0
  137. polars/io/parquet/__init__.py +17 -0
  138. polars/io/parquet/field_overwrites.py +140 -0
  139. polars/io/parquet/functions.py +722 -0
  140. polars/io/partition.py +491 -0
  141. polars/io/plugins.py +187 -0
  142. polars/io/pyarrow_dataset/__init__.py +5 -0
  143. polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
  144. polars/io/pyarrow_dataset/functions.py +79 -0
  145. polars/io/scan_options/__init__.py +5 -0
  146. polars/io/scan_options/_options.py +59 -0
  147. polars/io/scan_options/cast_options.py +126 -0
  148. polars/io/spreadsheet/__init__.py +6 -0
  149. polars/io/spreadsheet/_utils.py +52 -0
  150. polars/io/spreadsheet/_write_utils.py +647 -0
  151. polars/io/spreadsheet/functions.py +1323 -0
  152. polars/lazyframe/__init__.py +9 -0
  153. polars/lazyframe/engine_config.py +61 -0
  154. polars/lazyframe/frame.py +8564 -0
  155. polars/lazyframe/group_by.py +669 -0
  156. polars/lazyframe/in_process.py +42 -0
  157. polars/lazyframe/opt_flags.py +333 -0
  158. polars/meta/__init__.py +14 -0
  159. polars/meta/build.py +33 -0
  160. polars/meta/index_type.py +27 -0
  161. polars/meta/thread_pool.py +50 -0
  162. polars/meta/versions.py +120 -0
  163. polars/ml/__init__.py +0 -0
  164. polars/ml/torch.py +213 -0
  165. polars/ml/utilities.py +30 -0
  166. polars/plugins.py +155 -0
  167. polars/py.typed +0 -0
  168. polars/pyproject.toml +96 -0
  169. polars/schema.py +265 -0
  170. polars/selectors.py +3117 -0
  171. polars/series/__init__.py +5 -0
  172. polars/series/array.py +776 -0
  173. polars/series/binary.py +254 -0
  174. polars/series/categorical.py +246 -0
  175. polars/series/datetime.py +2275 -0
  176. polars/series/list.py +1087 -0
  177. polars/series/plotting.py +191 -0
  178. polars/series/series.py +9197 -0
  179. polars/series/string.py +2367 -0
  180. polars/series/struct.py +154 -0
  181. polars/series/utils.py +191 -0
  182. polars/sql/__init__.py +7 -0
  183. polars/sql/context.py +677 -0
  184. polars/sql/functions.py +139 -0
  185. polars/string_cache.py +185 -0
  186. polars/testing/__init__.py +13 -0
  187. polars/testing/asserts/__init__.py +9 -0
  188. polars/testing/asserts/frame.py +231 -0
  189. polars/testing/asserts/series.py +219 -0
  190. polars/testing/asserts/utils.py +12 -0
  191. polars/testing/parametric/__init__.py +33 -0
  192. polars/testing/parametric/profiles.py +107 -0
  193. polars/testing/parametric/strategies/__init__.py +22 -0
  194. polars/testing/parametric/strategies/_utils.py +14 -0
  195. polars/testing/parametric/strategies/core.py +615 -0
  196. polars/testing/parametric/strategies/data.py +452 -0
  197. polars/testing/parametric/strategies/dtype.py +436 -0
  198. polars/testing/parametric/strategies/legacy.py +169 -0
  199. polars/type_aliases.py +24 -0
  200. polars_runtime_compat-1.34.0b2.dist-info/METADATA +31 -0
  201. polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
  202. polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
  203. polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +1 -0
polars/_utils/construction/dataframe.py
@@ -0,0 +1,1397 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ from collections.abc import Generator, Mapping, Sequence
5
+ from datetime import date, datetime, time, timedelta
6
+ from functools import singledispatch
7
+ from itertools import islice, zip_longest
8
+ from operator import itemgetter
9
+ from typing import (
10
+ TYPE_CHECKING,
11
+ Any,
12
+ Callable,
13
+ )
14
+
15
+ import polars._reexport as pl
16
+ import polars._utils.construction as plc
17
+ from polars import functions as F
18
+ from polars._dependencies import (
19
+ _NUMPY_AVAILABLE,
20
+ _PYARROW_AVAILABLE,
21
+ _check_for_numpy,
22
+ _check_for_pandas,
23
+ dataclasses,
24
+ )
25
+ from polars._dependencies import numpy as np
26
+ from polars._dependencies import pandas as pd
27
+ from polars._dependencies import pyarrow as pa
28
+ from polars._utils.construction.utils import (
29
+ contains_nested,
30
+ get_first_non_none,
31
+ is_namedtuple,
32
+ is_pydantic_model,
33
+ is_simple_numpy_backed_pandas_series,
34
+ is_sqlalchemy_row,
35
+ nt_unpack,
36
+ try_get_type_hints,
37
+ )
38
+ from polars._utils.various import (
39
+ _is_generator,
40
+ arrlen,
41
+ issue_warning,
42
+ parse_version,
43
+ )
44
+ from polars.datatypes import (
45
+ N_INFER_DEFAULT,
46
+ Categorical,
47
+ Enum,
48
+ String,
49
+ Struct,
50
+ Unknown,
51
+ is_polars_dtype,
52
+ parse_into_dtype,
53
+ try_parse_into_dtype,
54
+ )
55
+ from polars.exceptions import DataOrientationWarning, ShapeError
56
+ from polars.meta import thread_pool_size
57
+
58
+ with contextlib.suppress(ImportError): # Module not available when building docs
59
+ from polars._plr import PyDataFrame
60
+
61
+ if TYPE_CHECKING:
62
+ from collections.abc import Iterable, MutableMapping
63
+
64
+ from polars import DataFrame, Series
65
+ from polars._plr import PySeries
66
+ from polars._typing import (
67
+ Orientation,
68
+ PolarsDataType,
69
+ SchemaDefinition,
70
+ SchemaDict,
71
+ )
72
+
73
+ _MIN_NUMPY_SIZE_FOR_MULTITHREADING = 1000
74
+
75
+
76
+ def dict_to_pydf(
77
+ data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
78
+ schema: SchemaDefinition | None = None,
79
+ *,
80
+ schema_overrides: SchemaDict | None = None,
81
+ strict: bool = True,
82
+ nan_to_null: bool = False,
83
+ allow_multithreaded: bool = True,
84
+ ) -> PyDataFrame:
85
+ """Construct a PyDataFrame from a dictionary of sequences."""
86
+ if isinstance(schema, Mapping) and data:
87
+ if not all((col in schema) for col in data):
88
+ msg = "the given column-schema names do not match the data dictionary"
89
+ raise ValueError(msg)
90
+ data = {col: data[col] for col in schema}
91
+
92
+ column_names, schema_overrides = _unpack_schema(
93
+ schema, lookup_names=data.keys(), schema_overrides=schema_overrides
94
+ )
95
+ if not column_names:
96
+ column_names = list(data)
97
+
98
+ if data and _NUMPY_AVAILABLE:
99
+ # if there are 3 or more numpy arrays of sufficient size, we multi-thread:
100
+ count_numpy = sum(
101
+ int(
102
+ allow_multithreaded
103
+ and _check_for_numpy(val)
104
+ and isinstance(val, np.ndarray)
105
+ and len(val) > _MIN_NUMPY_SIZE_FOR_MULTITHREADING
106
+ # integers and non-nan floats are zero-copy
107
+ and nan_to_null
108
+ and val.dtype in (np.float32, np.float64)
109
+ )
110
+ for val in data.values()
111
+ )
112
+ if count_numpy >= 3:
113
+ # yes, multi-threading was easier in python here; we cannot have multiple
114
+ # threads running python and release the gil in pyo3 (it will deadlock).
115
+
116
+ # (note: 'dummy' is threaded)
117
+ # We catch FileNotFoundError: see 16675
118
+ try:
119
+ import multiprocessing.dummy
120
+
121
+ pool_size = thread_pool_size()
122
+ with multiprocessing.dummy.Pool(pool_size) as pool:
123
+ data = dict(
124
+ zip(
125
+ column_names,
126
+ pool.map(
127
+ lambda t: (
128
+ pl.Series(t[0], t[1], nan_to_null=nan_to_null)
129
+ if isinstance(t[1], np.ndarray)
130
+ else t[1]
131
+ ),
132
+ list(data.items()),
133
+ ),
134
+ )
135
+ )
136
+ except FileNotFoundError:
137
+ return dict_to_pydf(
138
+ data=data,
139
+ schema=schema,
140
+ schema_overrides=schema_overrides,
141
+ strict=strict,
142
+ nan_to_null=nan_to_null,
143
+ allow_multithreaded=False,
144
+ )
145
+
146
+ if not data and schema_overrides:
147
+ data_series = [
148
+ pl.Series(
149
+ name,
150
+ [],
151
+ dtype=schema_overrides.get(name),
152
+ strict=strict,
153
+ nan_to_null=nan_to_null,
154
+ )._s
155
+ for name in column_names
156
+ ]
157
+ else:
158
+ data_series = [
159
+ s._s
160
+ for s in _expand_dict_values(
161
+ data,
162
+ schema_overrides=schema_overrides,
163
+ strict=strict,
164
+ nan_to_null=nan_to_null,
165
+ ).values()
166
+ ]
167
+
168
+ data_series = _handle_columns_arg(data_series, columns=column_names, from_dict=True)
169
+ pydf = PyDataFrame(data_series)
170
+
171
+ if schema_overrides and pydf.dtypes() != list(schema_overrides.values()):
172
+ pydf = _post_apply_columns(
173
+ pydf, column_names, schema_overrides=schema_overrides, strict=strict
174
+ )
175
+ return pydf
176
+
177
+
178
+ def _unpack_schema(
179
+ schema: SchemaDefinition | None,
180
+ *,
181
+ schema_overrides: SchemaDict | None = None,
182
+ n_expected: int | None = None,
183
+ lookup_names: Iterable[str] | None = None,
184
+ ) -> tuple[list[str], SchemaDict]:
185
+ """
186
+ Unpack column names and create dtype lookup.
187
+
188
+ Works for any (name, dtype) pairs or schema dict input,
189
+ overriding any inferred dtypes with explicit dtypes if supplied.
190
+ """
191
+
192
+ def _normalize_dtype(dtype: Any) -> PolarsDataType:
193
+ """Parse non-Polars data types as Polars data types."""
194
+ if is_polars_dtype(dtype, include_unknown=True):
195
+ return dtype
196
+ else:
197
+ return parse_into_dtype(dtype)
198
+
199
+ def _parse_schema_overrides(
200
+ schema_overrides: SchemaDict | None = None,
201
+ ) -> dict[str, PolarsDataType]:
202
+ """Parse schema overrides as a dictionary of name to Polars data type."""
203
+ if schema_overrides is None:
204
+ return {}
205
+
206
+ return {
207
+ name: _normalize_dtype(dtype) for name, dtype in schema_overrides.items()
208
+ }
209
+
210
+ schema_overrides = _parse_schema_overrides(schema_overrides)
211
+
212
+ # fast path for empty schema
213
+ if not schema:
214
+ columns = (
215
+ [f"column_{i}" for i in range(n_expected)] if n_expected is not None else []
216
+ )
217
+ return columns, schema_overrides
218
+
219
+ # determine column names from schema
220
+ if isinstance(schema, Mapping):
221
+ column_names: list[str] = list(schema)
222
+ schema = list(schema.items())
223
+ else:
224
+ column_names = []
225
+ for i, col in enumerate(schema):
226
+ if isinstance(col, str):
227
+ unnamed = not col and col not in schema_overrides
228
+ col = f"column_{i}" if unnamed else col
229
+ else:
230
+ col = col[0]
231
+ column_names.append(col)
232
+
233
+ if n_expected is not None and len(column_names) != n_expected:
234
+ msg = "data does not match the number of columns"
235
+ raise ShapeError(msg)
236
+
237
+ # determine column dtypes from schema and lookup_names
238
+ lookup: dict[str, str] | None = (
239
+ {
240
+ col: name
241
+ for col, name in zip_longest(column_names, lookup_names)
242
+ if name is not None
243
+ }
244
+ if lookup_names
245
+ else None
246
+ )
247
+
248
+ column_dtypes: dict[str, PolarsDataType] = {}
249
+ for col in schema:
250
+ if isinstance(col, str):
251
+ continue
252
+
253
+ name, dtype = col
254
+ if dtype is None:
255
+ continue
256
+ else:
257
+ dtype = _normalize_dtype(dtype)
258
+ name = lookup.get(name, name) if lookup else name
259
+ column_dtypes[name] = dtype # type: ignore[assignment]
260
+
261
+ # apply schema overrides
262
+ if schema_overrides:
263
+ column_dtypes.update(schema_overrides)
264
+
265
+ return column_names, column_dtypes
266
+
267
+
268
+ def _handle_columns_arg(
269
+ data: list[PySeries],
270
+ columns: Sequence[str] | None = None,
271
+ *,
272
+ from_dict: bool = False,
273
+ ) -> list[PySeries]:
274
+ """Rename data according to columns argument."""
275
+ if columns is None:
276
+ return data
277
+ elif not data:
278
+ return [pl.Series(name=c)._s for c in columns]
279
+ elif len(data) != len(columns):
280
+ msg = f"dimensions of columns arg ({len(columns)}) must match data dimensions ({len(data)})"
281
+ raise ValueError(msg)
282
+
283
+ if from_dict:
284
+ series_map = {s.name(): s for s in data}
285
+ if all((col in series_map) for col in columns):
286
+ return [series_map[col] for col in columns]
287
+
288
+ for i, c in enumerate(columns):
289
+ if c != data[i].name():
290
+ data[i] = data[i].clone()
291
+ data[i].rename(c)
292
+
293
+ return data
294
+
295
+
296
+ def _post_apply_columns(
297
+ pydf: PyDataFrame,
298
+ columns: SchemaDefinition | None,
299
+ structs: dict[str, Struct] | None = None,
300
+ schema_overrides: SchemaDict | None = None,
301
+ *,
302
+ strict: bool = True,
303
+ ) -> PyDataFrame:
304
+ """Apply 'columns' param *after* PyDataFrame creation (if no alternative)."""
305
+ pydf_columns, pydf_dtypes = pydf.columns(), pydf.dtypes()
306
+ columns, dtypes = _unpack_schema(
307
+ (columns or pydf_columns), schema_overrides=schema_overrides
308
+ )
309
+ column_subset: list[str] = []
310
+ if columns != pydf_columns:
311
+ if len(columns) < len(pydf_columns) and columns == pydf_columns[: len(columns)]:
312
+ column_subset = columns
313
+ else:
314
+ pydf.set_column_names(columns)
315
+
316
+ column_casts = []
317
+ for i, col in enumerate(columns):
318
+ dtype = dtypes.get(col)
319
+ pydf_dtype = pydf_dtypes[i]
320
+ if dtype == Categorical != pydf_dtype:
321
+ column_casts.append(F.col(col).cast(Categorical, strict=strict)._pyexpr)
322
+ elif dtype == Enum != pydf_dtype:
323
+ column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
324
+ elif structs and (struct := structs.get(col)) and struct != pydf_dtype:
325
+ column_casts.append(F.col(col).cast(struct, strict=strict)._pyexpr)
326
+ elif dtype is not None and dtype != Unknown and dtype != pydf_dtype:
327
+ column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
328
+
329
+ if column_casts or column_subset:
330
+ pyldf = pydf.lazy()
331
+ if column_casts:
332
+ pyldf = pyldf.with_columns(column_casts)
333
+ if column_subset:
334
+ pyldf = pyldf.select([F.col(col)._pyexpr for col in column_subset])
335
+ pydf = pyldf.collect(engine="in-memory", lambda_post_opt=None)
336
+
337
+ return pydf
338
+
339
+
340
+ def _expand_dict_values(
341
+ data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
342
+ *,
343
+ schema_overrides: SchemaDict | None = None,
344
+ strict: bool = True,
345
+ order: Sequence[str] | None = None,
346
+ nan_to_null: bool = False,
347
+ ) -> dict[str, Series]:
348
+ """Expand any scalar values in dict data (propagate literal as array)."""
349
+ updated_data = {}
350
+ if data:
351
+ if any(isinstance(val, pl.Expr) for val in data.values()):
352
+ msg = (
353
+ "passing Expr objects to the DataFrame constructor is not supported"
354
+ "\n\nHint: Try evaluating the expression first using `select`,"
355
+ " or if you meant to create an Object column containing expressions,"
356
+ " pass a list of Expr objects instead."
357
+ )
358
+ raise TypeError(msg)
359
+
360
+ dtypes = schema_overrides or {}
361
+ data = _expand_dict_data(data, dtypes, strict=strict)
362
+ array_len = max((arrlen(val) or 0) for val in data.values())
363
+ if array_len > 0:
364
+ for name, val in data.items():
365
+ dtype = dtypes.get(name)
366
+ if isinstance(val, dict) and dtype != Struct:
367
+ vdf = pl.DataFrame(val, strict=strict)
368
+ if (
369
+ vdf.height == 1
370
+ and array_len > 1
371
+ and all(not d.is_nested() for d in vdf.schema.values())
372
+ ):
373
+ s_vals = {
374
+ nm: vdf[nm].extend_constant(v, n=(array_len - 1))
375
+ for nm, v in val.items()
376
+ }
377
+ st = pl.DataFrame(s_vals).to_struct(name)
378
+ else:
379
+ st = vdf.to_struct(name)
380
+ updated_data[name] = st
381
+
382
+ elif isinstance(val, pl.Series):
383
+ s = val.rename(name) if name != val.name else val
384
+ if dtype and dtype != s.dtype:
385
+ s = s.cast(dtype, strict=strict)
386
+ updated_data[name] = s
387
+
388
+ elif arrlen(val) is not None or _is_generator(val):
389
+ updated_data[name] = pl.Series(
390
+ name=name,
391
+ values=val,
392
+ dtype=dtype,
393
+ strict=strict,
394
+ nan_to_null=nan_to_null,
395
+ )
396
+ elif val is None or isinstance( # type: ignore[redundant-expr]
397
+ val, (int, float, str, bool, date, datetime, time, timedelta)
398
+ ):
399
+ updated_data[name] = F.repeat(
400
+ val, array_len, dtype=dtype, eager=True
401
+ ).alias(name)
402
+ else:
403
+ updated_data[name] = pl.Series(
404
+ name=name, values=[val] * array_len, dtype=dtype, strict=strict
405
+ )
406
+
407
+ elif all((arrlen(val) == 0) for val in data.values()):
408
+ for name, val in data.items():
409
+ updated_data[name] = pl.Series(
410
+ name, values=val, dtype=dtypes.get(name), strict=strict
411
+ )
412
+
413
+ elif all((arrlen(val) is None) for val in data.values()):
414
+ for name, val in data.items():
415
+ updated_data[name] = pl.Series(
416
+ name,
417
+ values=(val if _is_generator(val) else [val]),
418
+ dtype=dtypes.get(name),
419
+ strict=strict,
420
+ )
421
+ if order and list(updated_data) != order:
422
+ return {col: updated_data.pop(col) for col in order}
423
+ return updated_data
424
+
425
+
426
+ def _expand_dict_data(
427
+ data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
428
+ dtypes: SchemaDict,
429
+ *,
430
+ strict: bool = True,
431
+ ) -> Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series]:
432
+ """
433
+ Expand any unsized generators/iterators.
434
+
435
+ (Note that `range` is sized, and will take a fast-path on Series init).
436
+ """
437
+ expanded_data = {}
438
+ for name, val in data.items():
439
+ expanded_data[name] = (
440
+ pl.Series(name, val, dtypes.get(name), strict=strict)
441
+ if _is_generator(val)
442
+ else val
443
+ )
444
+ return expanded_data
445
+
446
+
447
+ def sequence_to_pydf(
448
+ data: Sequence[Any],
449
+ schema: SchemaDefinition | None = None,
450
+ *,
451
+ schema_overrides: SchemaDict | None = None,
452
+ strict: bool = True,
453
+ orient: Orientation | None = None,
454
+ infer_schema_length: int | None = N_INFER_DEFAULT,
455
+ nan_to_null: bool = False,
456
+ ) -> PyDataFrame:
457
+ """Construct a PyDataFrame from a sequence."""
458
+ if not data:
459
+ return dict_to_pydf({}, schema=schema, schema_overrides=schema_overrides)
460
+
461
+ return _sequence_to_pydf_dispatcher(
462
+ get_first_non_none(data),
463
+ data=data,
464
+ schema=schema,
465
+ schema_overrides=schema_overrides,
466
+ strict=strict,
467
+ orient=orient,
468
+ infer_schema_length=infer_schema_length,
469
+ nan_to_null=nan_to_null,
470
+ )
471
+
472
+
473
+ @singledispatch
474
+ def _sequence_to_pydf_dispatcher(
475
+ first_element: Any,
476
+ data: Sequence[Any],
477
+ schema: SchemaDefinition | None,
478
+ *,
479
+ schema_overrides: SchemaDict | None,
480
+ strict: bool = True,
481
+ orient: Orientation | None,
482
+ infer_schema_length: int | None,
483
+ nan_to_null: bool = False,
484
+ ) -> PyDataFrame:
485
+ # note: ONLY python-native data should participate in singledispatch registration
486
+ # via top-level decorators, otherwise we have to import the associated module.
487
+ # third-party libraries (such as numpy/pandas) should be identified inline (below)
488
+ # and THEN registered for dispatch (here) so as not to break lazy-loading behaviour.
489
+
490
+ common_params = {
491
+ "data": data,
492
+ "schema": schema,
493
+ "schema_overrides": schema_overrides,
494
+ "strict": strict,
495
+ "orient": orient,
496
+ "infer_schema_length": infer_schema_length,
497
+ "nan_to_null": nan_to_null,
498
+ }
499
+ to_pydf: Callable[..., PyDataFrame]
500
+ register_with_singledispatch = True
501
+
502
+ if isinstance(first_element, Generator):
503
+ to_pydf = _sequence_of_sequence_to_pydf
504
+ data = [list(row) for row in data]
505
+ first_element = data[0]
506
+ register_with_singledispatch = False
507
+
508
+ elif isinstance(first_element, pl.Series):
509
+ to_pydf = _sequence_of_series_to_pydf
510
+
511
+ elif _check_for_numpy(first_element) and isinstance(first_element, np.ndarray):
512
+ to_pydf = _sequence_of_numpy_to_pydf
513
+
514
+ elif _check_for_pandas(first_element) and isinstance(
515
+ first_element, (pd.Series, pd.Index, pd.DatetimeIndex)
516
+ ):
517
+ to_pydf = _sequence_of_pandas_to_pydf
518
+
519
+ elif dataclasses.is_dataclass(first_element):
520
+ to_pydf = _sequence_of_dataclasses_to_pydf
521
+
522
+ elif is_pydantic_model(first_element):
523
+ to_pydf = _sequence_of_pydantic_models_to_pydf
524
+
525
+ elif is_sqlalchemy_row(first_element):
526
+ to_pydf = _sequence_of_tuple_to_pydf
527
+
528
+ elif isinstance(first_element, Sequence) and not isinstance(first_element, str):
529
+ to_pydf = _sequence_of_sequence_to_pydf
530
+ else:
531
+ to_pydf = _sequence_of_elements_to_pydf
532
+
533
+ if register_with_singledispatch:
534
+ _sequence_to_pydf_dispatcher.register(type(first_element), to_pydf)
535
+
536
+ common_params["first_element"] = first_element
537
+ return to_pydf(**common_params)
538
+
539
+
540
+ @_sequence_to_pydf_dispatcher.register(list)
541
+ def _sequence_of_sequence_to_pydf(
542
+ first_element: Sequence[Any] | np.ndarray[Any, Any],
543
+ data: Sequence[Any],
544
+ schema: SchemaDefinition | None,
545
+ *,
546
+ schema_overrides: SchemaDict | None,
547
+ strict: bool,
548
+ orient: Orientation | None,
549
+ infer_schema_length: int | None,
550
+ nan_to_null: bool = False,
551
+ ) -> PyDataFrame:
552
+ if orient is None:
553
+ if schema is None:
554
+ orient = "col"
555
+ else:
556
+ # Try to infer orientation from schema length and data dimensions
557
+ is_row_oriented = (len(schema) == len(first_element)) and (
558
+ len(schema) != len(data)
559
+ )
560
+ orient = "row" if is_row_oriented else "col"
561
+
562
+ if is_row_oriented:
563
+ issue_warning(
564
+ "Row orientation inferred during DataFrame construction."
565
+ ' Explicitly specify the orientation by passing `orient="row"` to silence this warning.',
566
+ DataOrientationWarning,
567
+ )
568
+
569
+ if orient == "row":
570
+ column_names, schema_overrides = _unpack_schema(
571
+ schema, schema_overrides=schema_overrides, n_expected=len(first_element)
572
+ )
573
+ local_schema_override = (
574
+ _include_unknowns(schema_overrides, column_names)
575
+ if schema_overrides
576
+ else {}
577
+ )
578
+
579
+ unpack_nested = False
580
+ for col, tp in local_schema_override.items():
581
+ if tp in (Categorical, Enum):
582
+ local_schema_override[col] = String
583
+ elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
584
+ unpack_nested = contains_nested(
585
+ getattr(first_element, col, None).__class__, is_namedtuple
586
+ )
587
+
588
+ if unpack_nested:
589
+ dicts = [nt_unpack(d) for d in data]
590
+ pydf = PyDataFrame.from_dicts(
591
+ dicts,
592
+ schema=None,
593
+ schema_overrides=None,
594
+ strict=strict,
595
+ infer_schema_length=infer_schema_length,
596
+ )
597
+ else:
598
+ pydf = PyDataFrame.from_rows(
599
+ data,
600
+ schema=local_schema_override or None,
601
+ infer_schema_length=infer_schema_length,
602
+ )
603
+ if column_names or schema_overrides:
604
+ pydf = _post_apply_columns(
605
+ pydf, column_names, schema_overrides=schema_overrides, strict=strict
606
+ )
607
+ return pydf
608
+
609
+ elif orient == "col":
610
+ column_names, schema_overrides = _unpack_schema(
611
+ schema, schema_overrides=schema_overrides, n_expected=len(data)
612
+ )
613
+ data_series: list[PySeries] = [
614
+ pl.Series(
615
+ column_names[i],
616
+ element,
617
+ dtype=schema_overrides.get(column_names[i]),
618
+ strict=strict,
619
+ nan_to_null=nan_to_null,
620
+ )._s
621
+ for i, element in enumerate(data)
622
+ ]
623
+ return PyDataFrame(data_series)
624
+
625
+ else:
626
+ msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
627
+ raise ValueError(msg)
628
+
629
+
630
+ def _sequence_of_series_to_pydf(
631
+ first_element: Series,
632
+ data: Sequence[Any],
633
+ schema: SchemaDefinition | None,
634
+ *,
635
+ schema_overrides: SchemaDict | None,
636
+ strict: bool,
637
+ **kwargs: Any,
638
+ ) -> PyDataFrame:
639
+ series_names = [s.name for s in data]
640
+ column_names, schema_overrides = _unpack_schema(
641
+ schema or series_names,
642
+ schema_overrides=schema_overrides,
643
+ n_expected=len(data),
644
+ )
645
+ data_series: list[PySeries] = []
646
+ for i, s in enumerate(data):
647
+ if not s.name:
648
+ s = s.alias(column_names[i])
649
+ new_dtype = schema_overrides.get(column_names[i])
650
+ if new_dtype and new_dtype != s.dtype:
651
+ s = s.cast(new_dtype, strict=strict, wrap_numerical=False)
652
+ data_series.append(s._s)
653
+
654
+ data_series = _handle_columns_arg(data_series, columns=column_names)
655
+ return PyDataFrame(data_series)
656
+
657
+
658
+ @_sequence_to_pydf_dispatcher.register(tuple)
659
+ def _sequence_of_tuple_to_pydf(
660
+ first_element: tuple[Any, ...],
661
+ data: Sequence[Any],
662
+ schema: SchemaDefinition | None,
663
+ *,
664
+ schema_overrides: SchemaDict | None,
665
+ strict: bool,
666
+ orient: Orientation | None,
667
+ infer_schema_length: int | None,
668
+ nan_to_null: bool = False,
669
+ ) -> PyDataFrame:
670
+ # infer additional meta information if namedtuple
671
+ if is_namedtuple(first_element.__class__) or is_sqlalchemy_row(first_element):
672
+ if schema is None:
673
+ schema = first_element._fields # type: ignore[attr-defined]
674
+ annotations = getattr(first_element, "__annotations__", None)
675
+ if annotations and len(annotations) == len(schema):
676
+ schema = [
677
+ (name, try_parse_into_dtype(tp))
678
+ for name, tp in first_element.__annotations__.items()
679
+ ]
680
+ if orient is None:
681
+ orient = "row"
682
+
683
+ # ...then defer to generic sequence processing
684
+ return _sequence_of_sequence_to_pydf(
685
+ first_element,
686
+ data=data,
687
+ schema=schema,
688
+ schema_overrides=schema_overrides,
689
+ strict=strict,
690
+ orient=orient,
691
+ infer_schema_length=infer_schema_length,
692
+ nan_to_null=nan_to_null,
693
+ )
694
+
695
+
696
+ @_sequence_to_pydf_dispatcher.register(Mapping)
697
+ @_sequence_to_pydf_dispatcher.register(dict)
698
+ def _sequence_of_dict_to_pydf(
699
+ first_element: dict[str, Any],
700
+ data: Sequence[Any],
701
+ schema: SchemaDefinition | None,
702
+ *,
703
+ schema_overrides: SchemaDict | None,
704
+ strict: bool,
705
+ infer_schema_length: int | None,
706
+ **kwargs: Any,
707
+ ) -> PyDataFrame:
708
+ column_names, schema_overrides = _unpack_schema(
709
+ schema, schema_overrides=schema_overrides
710
+ )
711
+ dicts_schema = (
712
+ _include_unknowns(schema_overrides, column_names or list(schema_overrides))
713
+ if column_names
714
+ else None
715
+ )
716
+
717
+ pydf = PyDataFrame.from_dicts(
718
+ data,
719
+ dicts_schema,
720
+ schema_overrides,
721
+ strict=strict,
722
+ infer_schema_length=infer_schema_length,
723
+ )
724
+
725
+ # TODO: we can remove this `schema_overrides` block completely
726
+ # once https://github.com/pola-rs/polars/issues/11044 is fixed
727
+ if schema_overrides:
728
+ pydf = _post_apply_columns(
729
+ pydf, columns=column_names, schema_overrides=schema_overrides, strict=strict
730
+ )
731
+ return pydf
732
+
733
+
734
+ @_sequence_to_pydf_dispatcher.register(str)
735
+ def _sequence_of_elements_to_pydf(
736
+ first_element: Any,
737
+ data: Sequence[Any],
738
+ schema: SchemaDefinition | None,
739
+ schema_overrides: SchemaDict | None,
740
+ *,
741
+ strict: bool,
742
+ **kwargs: Any,
743
+ ) -> PyDataFrame:
744
+ column_names, schema_overrides = _unpack_schema(
745
+ schema, schema_overrides=schema_overrides, n_expected=1
746
+ )
747
+ data_series: list[PySeries] = [
748
+ pl.Series(
749
+ column_names[0],
750
+ data,
751
+ schema_overrides.get(column_names[0]),
752
+ strict=strict,
753
+ )._s
754
+ ]
755
+ data_series = _handle_columns_arg(data_series, columns=column_names)
756
+ return PyDataFrame(data_series)
757
+
758
+
759
+ def _sequence_of_numpy_to_pydf(
760
+ first_element: np.ndarray[Any, Any],
761
+ **kwargs: Any,
762
+ ) -> PyDataFrame:
763
+ if first_element.ndim == 1:
764
+ return _sequence_of_sequence_to_pydf(first_element, **kwargs)
765
+ else:
766
+ return _sequence_of_elements_to_pydf(first_element, **kwargs)
767
+
768
+
769
+ def _sequence_of_pandas_to_pydf(
770
+ first_element: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex,
771
+ data: Sequence[Any],
772
+ schema: SchemaDefinition | None,
773
+ schema_overrides: SchemaDict | None,
774
+ *,
775
+ strict: bool,
776
+ **kwargs: Any,
777
+ ) -> PyDataFrame:
778
+ if schema is None:
779
+ column_names: list[str] = []
780
+ else:
781
+ column_names, schema_overrides = _unpack_schema(
782
+ schema, schema_overrides=schema_overrides, n_expected=1
783
+ )
784
+
785
+ schema_overrides = schema_overrides or {}
786
+ data_series: list[PySeries] = []
787
+ for i, s in enumerate(data):
788
+ name = column_names[i] if column_names else s.name
789
+ pyseries = plc.pandas_to_pyseries(name=name, values=s)
790
+ dtype = schema_overrides.get(name)
791
+ if dtype is not None and dtype != pyseries.dtype():
792
+ pyseries = pyseries.cast(dtype, strict=strict, wrap_numerical=False)
793
+ data_series.append(pyseries)
794
+
795
+ return PyDataFrame(data_series)
796
+
797
+
798
+ def _sequence_of_dataclasses_to_pydf(
799
+ first_element: Any,
800
+ data: Sequence[Any],
801
+ schema: SchemaDefinition | None,
802
+ schema_overrides: SchemaDict | None,
803
+ infer_schema_length: int | None,
804
+ *,
805
+ strict: bool = True,
806
+ **kwargs: Any,
807
+ ) -> PyDataFrame:
808
+ """Initialize DataFrame from Python dataclasses."""
809
+ from dataclasses import asdict, astuple
810
+
811
+ (
812
+ unpack_nested,
813
+ column_names,
814
+ schema_overrides,
815
+ overrides,
816
+ ) = _establish_dataclass_or_model_schema(
817
+ first_element, schema, schema_overrides, model_fields=None
818
+ )
819
+ if unpack_nested:
820
+ dicts = [asdict(md) for md in data]
821
+ pydf = PyDataFrame.from_dicts(
822
+ dicts,
823
+ schema=None,
824
+ schema_overrides=None,
825
+ strict=strict,
826
+ infer_schema_length=infer_schema_length,
827
+ )
828
+ else:
829
+ rows = [astuple(dc) for dc in data]
830
+ pydf = PyDataFrame.from_rows(
831
+ rows, # type: ignore[arg-type]
832
+ schema=overrides or None,
833
+ infer_schema_length=infer_schema_length,
834
+ )
835
+
836
+ if overrides:
837
+ structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
838
+ pydf = _post_apply_columns(
839
+ pydf, column_names, structs, schema_overrides, strict=strict
840
+ )
841
+
842
+ return pydf
843
+
844
+
845
+ def _sequence_of_pydantic_models_to_pydf(
846
+ first_element: Any,
847
+ data: Sequence[Any],
848
+ schema: SchemaDefinition | None,
849
+ schema_overrides: SchemaDict | None,
850
+ infer_schema_length: int | None,
851
+ *,
852
+ strict: bool,
853
+ **kwargs: Any,
854
+ ) -> PyDataFrame:
855
+ """Initialise DataFrame from pydantic model objects."""
856
+ import pydantic # note: must already be available in the env here
857
+
858
+ old_pydantic = parse_version(pydantic.__version__) < (2, 0)
859
+ model_fields = list(
860
+ first_element.__fields__
861
+ if old_pydantic
862
+ else first_element.__class__.model_fields
863
+ )
864
+ (
865
+ unpack_nested,
866
+ column_names,
867
+ schema_overrides,
868
+ overrides,
869
+ ) = _establish_dataclass_or_model_schema(
870
+ first_element, schema, schema_overrides, model_fields
871
+ )
872
+ if unpack_nested:
873
+ # note: this is an *extremely* slow path, due to the requirement to
874
+ # use pydantic's 'dict()' method to properly unpack nested models
875
+ dicts = (
876
+ [md.dict() for md in data]
877
+ if old_pydantic
878
+ else [md.model_dump(mode="python") for md in data]
879
+ )
880
+ pydf = PyDataFrame.from_dicts(
881
+ dicts,
882
+ schema=None,
883
+ schema_overrides=None,
884
+ strict=strict,
885
+ infer_schema_length=infer_schema_length,
886
+ )
887
+
888
+ elif len(model_fields) > 50:
889
+ # 'from_rows' is the faster codepath for models with a lot of fields...
890
+ get_values = itemgetter(*model_fields)
891
+ rows = [get_values(md.__dict__) for md in data]
892
+ pydf = PyDataFrame.from_rows(
893
+ rows, schema=overrides, infer_schema_length=infer_schema_length
894
+ )
895
+ else:
896
+ # ...and 'from_dicts' is faster otherwise
897
+ dicts = [md.__dict__ for md in data]
898
+ pydf = PyDataFrame.from_dicts(
899
+ dicts,
900
+ schema=overrides,
901
+ schema_overrides=None,
902
+ strict=strict,
903
+ infer_schema_length=infer_schema_length,
904
+ )
905
+
906
+ if overrides:
907
+ structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
908
+ pydf = _post_apply_columns(
909
+ pydf, column_names, structs, schema_overrides, strict=strict
910
+ )
911
+
912
+ return pydf
913
+
914
+
915
+ def _establish_dataclass_or_model_schema(
916
+ first_element: Any,
917
+ schema: SchemaDefinition | None,
918
+ schema_overrides: SchemaDict | None,
919
+ model_fields: list[str] | None,
920
+ ) -> tuple[bool, list[str], SchemaDict, SchemaDict]:
921
+ """Shared utility code for establishing dataclasses/pydantic model cols/schema."""
922
+ from dataclasses import asdict
923
+
924
+ unpack_nested = False
925
+ if schema:
926
+ column_names, schema_overrides = _unpack_schema(
927
+ schema, schema_overrides=schema_overrides
928
+ )
929
+ overrides = {col: schema_overrides.get(col, Unknown) for col in column_names}
930
+ else:
931
+ column_names = []
932
+ overrides = {
933
+ col: (try_parse_into_dtype(tp) or Unknown)
934
+ for col, tp in try_get_type_hints(first_element.__class__).items()
935
+ if ((col in model_fields) if model_fields else (col != "__slots__"))
936
+ }
937
+ if schema_overrides:
938
+ overrides.update(schema_overrides)
939
+ elif not model_fields:
940
+ dc_fields = set(asdict(first_element))
941
+ schema_overrides = overrides = {
942
+ nm: tp for nm, tp in overrides.items() if nm in dc_fields
943
+ }
944
+ else:
945
+ schema_overrides = overrides
946
+
947
+ for col, tp in overrides.items():
948
+ if tp in (Categorical, Enum):
949
+ overrides[col] = String
950
+ elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
951
+ unpack_nested = contains_nested(
952
+ getattr(first_element, col, None),
953
+ is_pydantic_model if model_fields else dataclasses.is_dataclass, # type: ignore[arg-type]
954
+ )
955
+
956
+ if model_fields and len(model_fields) == len(overrides):
957
+ overrides = dict(zip(model_fields, overrides.values()))
958
+
959
+ return unpack_nested, column_names, schema_overrides, overrides
960
+
961
+
962
+ def _include_unknowns(
963
+ schema: SchemaDict, cols: Sequence[str]
964
+ ) -> MutableMapping[str, PolarsDataType]:
965
+ """Complete partial schema dict by including Unknown type."""
966
+ return {
967
+ col: (schema.get(col, Unknown) or Unknown) # type: ignore[truthy-bool]
968
+ for col in cols
969
+ }
970
+
971
+
972
+ def iterable_to_pydf(
973
+ data: Iterable[Any],
974
+ schema: SchemaDefinition | None = None,
975
+ *,
976
+ schema_overrides: SchemaDict | None = None,
977
+ strict: bool = True,
978
+ orient: Orientation | None = None,
979
+ chunk_size: int | None = None,
980
+ infer_schema_length: int | None = N_INFER_DEFAULT,
981
+ rechunk: bool = True,
982
+ ) -> PyDataFrame:
983
+ """Construct a PyDataFrame from an iterable/generator."""
984
+ original_schema = schema
985
+ column_names: list[str] = []
986
+ dtypes_by_idx: dict[int, PolarsDataType] = {}
987
+ if schema is not None:
988
+ column_names, schema_overrides = _unpack_schema(
989
+ schema, schema_overrides=schema_overrides
990
+ )
991
+ elif schema_overrides:
992
+ _, schema_overrides = _unpack_schema(schema, schema_overrides=schema_overrides)
993
+
994
+ if not isinstance(data, Generator):
995
+ data = iter(data)
996
+
997
+ if orient == "col":
998
+ if column_names and schema_overrides:
999
+ dtypes_by_idx = {
1000
+ idx: schema_overrides.get(col, Unknown)
1001
+ for idx, col in enumerate(column_names)
1002
+ }
1003
+
1004
+ return pl.DataFrame(
1005
+ {
1006
+ (column_names[idx] if column_names else f"column_{idx}"): pl.Series(
1007
+ coldata,
1008
+ dtype=dtypes_by_idx.get(idx),
1009
+ strict=strict,
1010
+ )
1011
+ for idx, coldata in enumerate(data)
1012
+ },
1013
+ )._df
1014
+
1015
+ def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFrame:
1016
+ return pl.DataFrame(
1017
+ data=values,
1018
+ schema=schema,
1019
+ strict=strict,
1020
+ orient="row",
1021
+ infer_schema_length=infer_schema_length,
1022
+ )
1023
+
1024
+ n_chunks = 0
1025
+ n_chunk_elems = 1_000_000
1026
+
1027
+ if chunk_size:
1028
+ adaptive_chunk_size = chunk_size
1029
+ elif column_names:
1030
+ adaptive_chunk_size = n_chunk_elems // len(column_names)
1031
+ else:
1032
+ adaptive_chunk_size = None
1033
+
1034
+ df: DataFrame = None # type: ignore[assignment]
1035
+ chunk_size = (
1036
+ None
1037
+ if infer_schema_length is None
1038
+ else max(infer_schema_length, adaptive_chunk_size or 1000)
1039
+ )
1040
+ while True:
1041
+ values = list(islice(data, chunk_size))
1042
+ if not values:
1043
+ break
1044
+ frame_chunk = to_frame_chunk(values, original_schema)
1045
+ if df is None:
1046
+ df = frame_chunk
1047
+ if not original_schema:
1048
+ original_schema = list(df.schema.items())
1049
+ if chunk_size != adaptive_chunk_size:
1050
+ if (n_columns := df.width) > 0:
1051
+ chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns
1052
+ else:
1053
+ df.vstack(frame_chunk, in_place=True)
1054
+ n_chunks += 1
1055
+
1056
+ if df is None:
1057
+ df = to_frame_chunk([], original_schema)
1058
+
1059
+ if n_chunks > 0 and rechunk:
1060
+ df = df.rechunk()
1061
+
1062
+ return df._df
1063
+
1064
+
1065
+ def _check_pandas_columns(data: pd.DataFrame, *, include_index: bool) -> None:
1066
+ """Check pandas dataframe columns can be converted to polars."""
1067
+ stringified_cols: set[str] = {str(col) for col in data.columns}
1068
+ stringified_index: set[str] = (
1069
+ {str(idx) for idx in data.index.names} if include_index else set()
1070
+ )
1071
+
1072
+ non_unique_cols: bool = len(stringified_cols) < len(data.columns)
1073
+ non_unique_indices: bool = (
1074
+ (len(stringified_index) < len(data.index.names)) if include_index else False
1075
+ )
1076
+ if non_unique_cols or non_unique_indices:
1077
+ msg = (
1078
+ "Pandas dataframe contains non-unique indices and/or column names. "
1079
+ "Polars dataframes require unique string names for columns."
1080
+ )
1081
+ raise ValueError(msg)
1082
+
1083
+ overlapping_cols_and_indices: set[str] = stringified_cols & stringified_index
1084
+ if len(overlapping_cols_and_indices) > 0:
1085
+ msg = "Pandas indices and column names must not overlap."
1086
+ raise ValueError(msg)
1087
+
1088
+
1089
+ def pandas_to_pydf(
1090
+ data: pd.DataFrame,
1091
+ schema: SchemaDefinition | None = None,
1092
+ *,
1093
+ schema_overrides: SchemaDict | None = None,
1094
+ strict: bool = True,
1095
+ rechunk: bool = True,
1096
+ nan_to_null: bool = True,
1097
+ include_index: bool = False,
1098
+ ) -> PyDataFrame:
1099
+ """Construct a PyDataFrame from a pandas DataFrame."""
1100
+ _check_pandas_columns(data, include_index=include_index)
1101
+
1102
+ convert_index = include_index and not _pandas_has_default_index(data)
1103
+ if not convert_index and all(
1104
+ is_simple_numpy_backed_pandas_series(data[col]) for col in data.columns
1105
+ ):
1106
+ # Convert via NumPy directly, no PyArrow needed.
1107
+ return pl.DataFrame(
1108
+ {str(col): data[col].to_numpy() for col in data.columns},
1109
+ schema=schema,
1110
+ strict=strict,
1111
+ schema_overrides=schema_overrides,
1112
+ nan_to_null=nan_to_null,
1113
+ )._df
1114
+
1115
+ if not _PYARROW_AVAILABLE:
1116
+ msg = (
1117
+ "pyarrow is required for converting a pandas dataframe to Polars, "
1118
+ "unless each of its columns is a simple numpy-backed one "
1119
+ "(e.g. 'int64', 'bool', 'float32' - not 'Int64')"
1120
+ )
1121
+ raise ImportError(msg)
1122
+ arrow_dict = {}
1123
+ length = data.shape[0]
1124
+
1125
+ if convert_index:
1126
+ for idxcol in data.index.names:
1127
+ arrow_dict[str(idxcol)] = plc.pandas_series_to_arrow(
1128
+ # get_level_values accepts `int | str`
1129
+ # but `index.names` returns `Hashable`
1130
+ data.index.get_level_values(idxcol), # type: ignore[arg-type, unused-ignore]
1131
+ nan_to_null=nan_to_null,
1132
+ length=length,
1133
+ )
1134
+
1135
+ for col_idx, col_data in data.items():
1136
+ arrow_dict[str(col_idx)] = plc.pandas_series_to_arrow(
1137
+ col_data, nan_to_null=nan_to_null, length=length
1138
+ )
1139
+
1140
+ arrow_table = pa.table(arrow_dict)
1141
+ return arrow_to_pydf(
1142
+ arrow_table,
1143
+ schema=schema,
1144
+ schema_overrides=schema_overrides,
1145
+ strict=strict,
1146
+ rechunk=rechunk,
1147
+ )
1148
+
1149
+
1150
+ def _pandas_has_default_index(df: pd.DataFrame) -> bool:
1151
+ """Identify if the pandas frame only has a default (or equivalent) index."""
1152
+ from pandas.core.indexes.range import RangeIndex
1153
+
1154
+ index_cols = df.index.names
1155
+
1156
+ if len(index_cols) > 1 or index_cols not in ([None], [""]):
1157
+ # not default: more than one index, or index is named
1158
+ return False
1159
+ elif df.index.equals(RangeIndex(start=0, stop=len(df), step=1)):
1160
+ # is default: simple range index
1161
+ return True
1162
+ else:
1163
+ # finally, is the index _equivalent_ to a default unnamed
1164
+ # integer index with frame data that was previously sorted
1165
+ return (
1166
+ str(df.index.dtype).startswith("int")
1167
+ and (df.index.sort_values() == np.arange(len(df))).all()
1168
+ )
1169
+
1170
+
1171
+ def arrow_to_pydf(
1172
+ data: pa.Table | pa.RecordBatch,
1173
+ schema: SchemaDefinition | None = None,
1174
+ *,
1175
+ schema_overrides: SchemaDict | None = None,
1176
+ strict: bool = True,
1177
+ rechunk: bool = True,
1178
+ ) -> PyDataFrame:
1179
+ """Construct a PyDataFrame from an Arrow Table or RecordBatch."""
1180
+ column_names, schema_overrides = _unpack_schema(
1181
+ (schema or data.schema.names), schema_overrides=schema_overrides
1182
+ )
1183
+ try:
1184
+ if column_names != data.schema.names:
1185
+ data = data.rename_columns(column_names)
1186
+ except pa.ArrowInvalid as e:
1187
+ msg = "dimensions of columns arg must match data dimensions"
1188
+ raise ValueError(msg) from e
1189
+
1190
+ batches: list[pa.RecordBatch]
1191
+ if isinstance(data, pa.RecordBatch):
1192
+ batches = [data]
1193
+ else:
1194
+ batches = data.to_batches()
1195
+
1196
+ # supply the arrow schema so the metadata is intact
1197
+ pydf = PyDataFrame.from_arrow_record_batches(batches, data.schema)
1198
+
1199
+ if rechunk:
1200
+ pydf = pydf.rechunk()
1201
+
1202
+ if schema_overrides is not None:
1203
+ pydf = _post_apply_columns(
1204
+ pydf,
1205
+ column_names,
1206
+ schema_overrides=schema_overrides,
1207
+ strict=strict,
1208
+ )
1209
+
1210
+ return pydf
1211
+
1212
+
1213
+ def numpy_to_pydf(
1214
+ data: np.ndarray[Any, Any],
1215
+ schema: SchemaDefinition | None = None,
1216
+ *,
1217
+ schema_overrides: SchemaDict | None = None,
1218
+ orient: Orientation | None = None,
1219
+ strict: bool = True,
1220
+ nan_to_null: bool = False,
1221
+ ) -> PyDataFrame:
1222
+ """Construct a PyDataFrame from a NumPy ndarray (including structured ndarrays)."""
1223
+ shape = data.shape
1224
+ two_d = len(shape) == 2
1225
+
1226
+ if data.dtype.names is not None:
1227
+ structured_array, orient = True, "col"
1228
+ record_names = list(data.dtype.names)
1229
+ n_columns = len(record_names)
1230
+ for nm in record_names:
1231
+ shape = data[nm].shape
1232
+ if not schema:
1233
+ schema = record_names
1234
+ else:
1235
+ # Unpack columns
1236
+ structured_array, record_names = False, []
1237
+ if shape == (0,):
1238
+ n_columns = 0
1239
+
1240
+ elif len(shape) == 1:
1241
+ n_columns = 1
1242
+
1243
+ elif len(shape) == 2:
1244
+ if orient is None and schema is None:
1245
+ # default convention; first axis is rows, second axis is columns
1246
+ n_columns = shape[1]
1247
+ orient = "row"
1248
+
1249
+ elif orient is None and schema is not None:
1250
+ # infer orientation from 'schema' param; if square array
1251
+ # we check the flags to establish row/column major order
1252
+ n_schema_cols = len(schema)
1253
+ if n_schema_cols == shape[0] and n_schema_cols != shape[1]:
1254
+ orient = "col"
1255
+ n_columns = shape[0]
1256
+ elif data.flags["F_CONTIGUOUS"] and shape[0] == shape[1]:
1257
+ orient = "col"
1258
+ n_columns = n_schema_cols
1259
+ else:
1260
+ orient = "row"
1261
+ n_columns = shape[1]
1262
+
1263
+ elif orient == "row":
1264
+ n_columns = shape[1]
1265
+ elif orient == "col":
1266
+ n_columns = shape[0]
1267
+ else:
1268
+ msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
1269
+ raise ValueError(msg)
1270
+ else:
1271
+ if shape == ():
1272
+ msg = "cannot create DataFrame from zero-dimensional array"
1273
+ else:
1274
+ msg = f"cannot create DataFrame from array with more than two dimensions; shape = {shape}"
1275
+ raise ValueError(msg)
1276
+
1277
+ if schema is not None and len(schema) != n_columns:
1278
+ if (n_schema_cols := len(schema)) != 1:
1279
+ msg = f"dimensions of `schema` ({n_schema_cols}) must match data dimensions ({n_columns})"
1280
+ raise ValueError(msg)
1281
+ n_columns = n_schema_cols
1282
+
1283
+ column_names, schema_overrides = _unpack_schema(
1284
+ schema, schema_overrides=schema_overrides, n_expected=n_columns
1285
+ )
1286
+
1287
+ # Convert data to series
1288
+ if structured_array:
1289
+ data_series = [
1290
+ pl.Series(
1291
+ name=series_name,
1292
+ values=data[record_name],
1293
+ dtype=schema_overrides.get(record_name),
1294
+ strict=strict,
1295
+ nan_to_null=nan_to_null,
1296
+ )._s
1297
+ for series_name, record_name in zip(column_names, record_names)
1298
+ ]
1299
+ elif shape == (0,) and n_columns == 0:
1300
+ data_series = []
1301
+
1302
+ elif len(shape) == 1:
1303
+ data_series = [
1304
+ pl.Series(
1305
+ name=column_names[0],
1306
+ values=data,
1307
+ dtype=schema_overrides.get(column_names[0]),
1308
+ strict=strict,
1309
+ nan_to_null=nan_to_null,
1310
+ )._s
1311
+ ]
1312
+ else:
1313
+ if orient == "row":
1314
+ data_series = [
1315
+ pl.Series(
1316
+ name=column_names[i],
1317
+ values=(
1318
+ data
1319
+ if two_d and n_columns == 1 and shape[1] > 1
1320
+ else data[:, i]
1321
+ ),
1322
+ dtype=schema_overrides.get(column_names[i]),
1323
+ strict=strict,
1324
+ nan_to_null=nan_to_null,
1325
+ )._s
1326
+ for i in range(n_columns)
1327
+ ]
1328
+ else:
1329
+ data_series = [
1330
+ pl.Series(
1331
+ name=column_names[i],
1332
+ values=(
1333
+ data if two_d and n_columns == 1 and shape[1] > 1 else data[i]
1334
+ ),
1335
+ dtype=schema_overrides.get(column_names[i]),
1336
+ strict=strict,
1337
+ nan_to_null=nan_to_null,
1338
+ )._s
1339
+ for i in range(n_columns)
1340
+ ]
1341
+
1342
+ data_series = _handle_columns_arg(data_series, columns=column_names)
1343
+ return PyDataFrame(data_series)
1344
+
1345
+
1346
+ def series_to_pydf(
1347
+ data: Series,
1348
+ schema: SchemaDefinition | None = None,
1349
+ schema_overrides: SchemaDict | None = None,
1350
+ *,
1351
+ strict: bool = True,
1352
+ ) -> PyDataFrame:
1353
+ """Construct a PyDataFrame from a Polars Series."""
1354
+ if schema is None and schema_overrides is None:
1355
+ return PyDataFrame([data._s])
1356
+
1357
+ data_series = [data._s]
1358
+ series_name = [s.name() for s in data_series]
1359
+ column_names, schema_overrides = _unpack_schema(
1360
+ schema or series_name, schema_overrides=schema_overrides, n_expected=1
1361
+ )
1362
+ if schema_overrides:
1363
+ new_dtype = next(iter(schema_overrides.values()))
1364
+ if new_dtype != data.dtype:
1365
+ data_series[0] = data_series[0].cast(
1366
+ new_dtype, strict=strict, wrap_numerical=False
1367
+ )
1368
+
1369
+ data_series = _handle_columns_arg(data_series, columns=column_names)
1370
+ return PyDataFrame(data_series)
1371
+
1372
+
1373
+ def dataframe_to_pydf(
1374
+ data: DataFrame,
1375
+ schema: SchemaDefinition | None = None,
1376
+ *,
1377
+ schema_overrides: SchemaDict | None = None,
1378
+ strict: bool = True,
1379
+ ) -> PyDataFrame:
1380
+ """Construct a PyDataFrame from an existing Polars DataFrame."""
1381
+ if schema is None and schema_overrides is None:
1382
+ return data._df.clone()
1383
+
1384
+ data_series = {c.name: c._s for c in data}
1385
+ column_names, schema_overrides = _unpack_schema(
1386
+ schema or data.columns, schema_overrides=schema_overrides
1387
+ )
1388
+ if schema_overrides:
1389
+ existing_schema = data.schema
1390
+ for name, new_dtype in schema_overrides.items():
1391
+ if new_dtype != existing_schema[name]:
1392
+ data_series[name] = data_series[name].cast(
1393
+ new_dtype, strict=strict, wrap_numerical=False
1394
+ )
1395
+
1396
+ series_cols = _handle_columns_arg(list(data_series.values()), columns=column_names)
1397
+ return PyDataFrame(series_cols)
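
For context, a minimal usage sketch (not part of the diff) of how the private *_to_pydf constructors added in this file are typically reached through the public polars API. The dispatch follows the dict/sequence branches defined above, and the example assumes only a standard polars install:

import polars as pl

# A mapping input is routed to dict_to_pydf.
df_from_dict = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# A sequence of tuples with explicit row orientation is routed through
# sequence_to_pydf / _sequence_of_tuple_to_pydf.
df_from_rows = pl.DataFrame([(1, "x"), (2, "y")], schema=["a", "b"], orient="row")

# schema_overrides is resolved by _unpack_schema before the series are built.
df_cast = pl.DataFrame({"a": [1, 2]}, schema_overrides={"a": pl.Float64})

print(df_from_dict.schema, df_from_rows.shape, df_cast.dtypes)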