polars-runtime-compat 1.34.0b2__cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of polars-runtime-compat might be problematic. Click here for more details.
- _polars_runtime_compat/.gitkeep +0 -0
- _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
- polars/__init__.py +528 -0
- polars/_cpu_check.py +265 -0
- polars/_dependencies.py +355 -0
- polars/_plr.py +99 -0
- polars/_plr.pyi +2496 -0
- polars/_reexport.py +23 -0
- polars/_typing.py +478 -0
- polars/_utils/__init__.py +37 -0
- polars/_utils/async_.py +102 -0
- polars/_utils/cache.py +176 -0
- polars/_utils/cloud.py +40 -0
- polars/_utils/constants.py +29 -0
- polars/_utils/construction/__init__.py +46 -0
- polars/_utils/construction/dataframe.py +1397 -0
- polars/_utils/construction/other.py +72 -0
- polars/_utils/construction/series.py +560 -0
- polars/_utils/construction/utils.py +118 -0
- polars/_utils/convert.py +224 -0
- polars/_utils/deprecation.py +406 -0
- polars/_utils/getitem.py +457 -0
- polars/_utils/logging.py +11 -0
- polars/_utils/nest_asyncio.py +264 -0
- polars/_utils/parquet.py +15 -0
- polars/_utils/parse/__init__.py +12 -0
- polars/_utils/parse/expr.py +242 -0
- polars/_utils/polars_version.py +19 -0
- polars/_utils/pycapsule.py +53 -0
- polars/_utils/scan.py +27 -0
- polars/_utils/serde.py +63 -0
- polars/_utils/slice.py +215 -0
- polars/_utils/udfs.py +1251 -0
- polars/_utils/unstable.py +63 -0
- polars/_utils/various.py +782 -0
- polars/_utils/wrap.py +25 -0
- polars/api.py +370 -0
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +19 -0
- polars/catalog/unity/client.py +733 -0
- polars/catalog/unity/models.py +152 -0
- polars/config.py +1571 -0
- polars/convert/__init__.py +25 -0
- polars/convert/general.py +1046 -0
- polars/convert/normalize.py +261 -0
- polars/dataframe/__init__.py +5 -0
- polars/dataframe/_html.py +186 -0
- polars/dataframe/frame.py +12582 -0
- polars/dataframe/group_by.py +1067 -0
- polars/dataframe/plotting.py +257 -0
- polars/datatype_expr/__init__.py +5 -0
- polars/datatype_expr/array.py +56 -0
- polars/datatype_expr/datatype_expr.py +304 -0
- polars/datatype_expr/list.py +18 -0
- polars/datatype_expr/struct.py +69 -0
- polars/datatypes/__init__.py +122 -0
- polars/datatypes/_parse.py +195 -0
- polars/datatypes/_utils.py +48 -0
- polars/datatypes/classes.py +1213 -0
- polars/datatypes/constants.py +11 -0
- polars/datatypes/constructor.py +172 -0
- polars/datatypes/convert.py +366 -0
- polars/datatypes/group.py +130 -0
- polars/exceptions.py +230 -0
- polars/expr/__init__.py +7 -0
- polars/expr/array.py +964 -0
- polars/expr/binary.py +346 -0
- polars/expr/categorical.py +306 -0
- polars/expr/datetime.py +2620 -0
- polars/expr/expr.py +11272 -0
- polars/expr/list.py +1408 -0
- polars/expr/meta.py +444 -0
- polars/expr/name.py +321 -0
- polars/expr/string.py +3045 -0
- polars/expr/struct.py +357 -0
- polars/expr/whenthen.py +185 -0
- polars/functions/__init__.py +193 -0
- polars/functions/aggregation/__init__.py +33 -0
- polars/functions/aggregation/horizontal.py +298 -0
- polars/functions/aggregation/vertical.py +341 -0
- polars/functions/as_datatype.py +848 -0
- polars/functions/business.py +138 -0
- polars/functions/col.py +384 -0
- polars/functions/datatype.py +121 -0
- polars/functions/eager.py +524 -0
- polars/functions/escape_regex.py +29 -0
- polars/functions/lazy.py +2751 -0
- polars/functions/len.py +68 -0
- polars/functions/lit.py +210 -0
- polars/functions/random.py +22 -0
- polars/functions/range/__init__.py +19 -0
- polars/functions/range/_utils.py +15 -0
- polars/functions/range/date_range.py +303 -0
- polars/functions/range/datetime_range.py +370 -0
- polars/functions/range/int_range.py +348 -0
- polars/functions/range/linear_space.py +311 -0
- polars/functions/range/time_range.py +287 -0
- polars/functions/repeat.py +301 -0
- polars/functions/whenthen.py +353 -0
- polars/interchange/__init__.py +10 -0
- polars/interchange/buffer.py +77 -0
- polars/interchange/column.py +190 -0
- polars/interchange/dataframe.py +230 -0
- polars/interchange/from_dataframe.py +328 -0
- polars/interchange/protocol.py +303 -0
- polars/interchange/utils.py +170 -0
- polars/io/__init__.py +64 -0
- polars/io/_utils.py +317 -0
- polars/io/avro.py +49 -0
- polars/io/clipboard.py +36 -0
- polars/io/cloud/__init__.py +17 -0
- polars/io/cloud/_utils.py +80 -0
- polars/io/cloud/credential_provider/__init__.py +17 -0
- polars/io/cloud/credential_provider/_builder.py +520 -0
- polars/io/cloud/credential_provider/_providers.py +618 -0
- polars/io/csv/__init__.py +9 -0
- polars/io/csv/_utils.py +38 -0
- polars/io/csv/batched_reader.py +142 -0
- polars/io/csv/functions.py +1495 -0
- polars/io/database/__init__.py +6 -0
- polars/io/database/_arrow_registry.py +70 -0
- polars/io/database/_cursor_proxies.py +147 -0
- polars/io/database/_executor.py +578 -0
- polars/io/database/_inference.py +314 -0
- polars/io/database/_utils.py +144 -0
- polars/io/database/functions.py +516 -0
- polars/io/delta.py +499 -0
- polars/io/iceberg/__init__.py +3 -0
- polars/io/iceberg/_utils.py +697 -0
- polars/io/iceberg/dataset.py +556 -0
- polars/io/iceberg/functions.py +151 -0
- polars/io/ipc/__init__.py +8 -0
- polars/io/ipc/functions.py +514 -0
- polars/io/json/__init__.py +3 -0
- polars/io/json/read.py +101 -0
- polars/io/ndjson.py +332 -0
- polars/io/parquet/__init__.py +17 -0
- polars/io/parquet/field_overwrites.py +140 -0
- polars/io/parquet/functions.py +722 -0
- polars/io/partition.py +491 -0
- polars/io/plugins.py +187 -0
- polars/io/pyarrow_dataset/__init__.py +5 -0
- polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
- polars/io/pyarrow_dataset/functions.py +79 -0
- polars/io/scan_options/__init__.py +5 -0
- polars/io/scan_options/_options.py +59 -0
- polars/io/scan_options/cast_options.py +126 -0
- polars/io/spreadsheet/__init__.py +6 -0
- polars/io/spreadsheet/_utils.py +52 -0
- polars/io/spreadsheet/_write_utils.py +647 -0
- polars/io/spreadsheet/functions.py +1323 -0
- polars/lazyframe/__init__.py +9 -0
- polars/lazyframe/engine_config.py +61 -0
- polars/lazyframe/frame.py +8564 -0
- polars/lazyframe/group_by.py +669 -0
- polars/lazyframe/in_process.py +42 -0
- polars/lazyframe/opt_flags.py +333 -0
- polars/meta/__init__.py +14 -0
- polars/meta/build.py +33 -0
- polars/meta/index_type.py +27 -0
- polars/meta/thread_pool.py +50 -0
- polars/meta/versions.py +120 -0
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +213 -0
- polars/ml/utilities.py +30 -0
- polars/plugins.py +155 -0
- polars/py.typed +0 -0
- polars/pyproject.toml +96 -0
- polars/schema.py +265 -0
- polars/selectors.py +3117 -0
- polars/series/__init__.py +5 -0
- polars/series/array.py +776 -0
- polars/series/binary.py +254 -0
- polars/series/categorical.py +246 -0
- polars/series/datetime.py +2275 -0
- polars/series/list.py +1087 -0
- polars/series/plotting.py +191 -0
- polars/series/series.py +9197 -0
- polars/series/string.py +2367 -0
- polars/series/struct.py +154 -0
- polars/series/utils.py +191 -0
- polars/sql/__init__.py +7 -0
- polars/sql/context.py +677 -0
- polars/sql/functions.py +139 -0
- polars/string_cache.py +185 -0
- polars/testing/__init__.py +13 -0
- polars/testing/asserts/__init__.py +9 -0
- polars/testing/asserts/frame.py +231 -0
- polars/testing/asserts/series.py +219 -0
- polars/testing/asserts/utils.py +12 -0
- polars/testing/parametric/__init__.py +33 -0
- polars/testing/parametric/profiles.py +107 -0
- polars/testing/parametric/strategies/__init__.py +22 -0
- polars/testing/parametric/strategies/_utils.py +14 -0
- polars/testing/parametric/strategies/core.py +615 -0
- polars/testing/parametric/strategies/data.py +452 -0
- polars/testing/parametric/strategies/dtype.py +436 -0
- polars/testing/parametric/strategies/legacy.py +169 -0
- polars/type_aliases.py +24 -0
- polars_runtime_compat-1.34.0b2.dist-info/METADATA +190 -0
- polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
- polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
- polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +20 -0
|
@@ -0,0 +1,524 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
from collections.abc import Generator, Iterator, Sequence
|
|
5
|
+
from functools import reduce
|
|
6
|
+
from itertools import chain
|
|
7
|
+
from typing import TYPE_CHECKING, get_args
|
|
8
|
+
|
|
9
|
+
import polars._reexport as pl
|
|
10
|
+
from polars import functions as F
|
|
11
|
+
from polars._typing import ConcatMethod
|
|
12
|
+
from polars._utils.various import ordered_unique, qualified_type_name
|
|
13
|
+
from polars._utils.wrap import wrap_df, wrap_expr, wrap_ldf, wrap_s
|
|
14
|
+
from polars.exceptions import InvalidOperationError
|
|
15
|
+
|
|
16
|
+
with contextlib.suppress(ImportError): # Module not available when building docs
|
|
17
|
+
import polars._plr as plr
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from collections.abc import Iterable
|
|
21
|
+
|
|
22
|
+
from polars import DataFrame, Expr, LazyFrame, Series
|
|
23
|
+
from polars._typing import FrameType, JoinStrategy, PolarsType
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def concat(
    items: Iterable[PolarsType],
    *,
    how: ConcatMethod = "vertical",
    rechunk: bool = False,
    parallel: bool = True,
) -> PolarsType:
    """
    Combine multiple DataFrames, LazyFrames, or Series into a single object.

    Parameters
    ----------
    items
        DataFrames, LazyFrames, or Series to concatenate.
    how : {'vertical', 'vertical_relaxed', 'diagonal', 'diagonal_relaxed', 'horizontal', 'align', 'align_full', 'align_inner', 'align_left', 'align_right'}
        Note that `Series` only support the `vertical` strategy.

        * vertical: Applies multiple `vstack` operations.
        * vertical_relaxed: Same as `vertical`, but additionally coerces columns to
          their common supertype *if* they are mismatched (eg: Int32 → Int64).
        * diagonal: Finds a union between the column schemas and fills missing column
          values with `null`.
        * diagonal_relaxed: Same as `diagonal`, but additionally coerces columns to
          their common supertype *if* they are mismatched (eg: Int32 → Int64).
        * horizontal: Stacks Series from DataFrames horizontally and fills with `null`
          if the lengths don't match.
        * align, align_full, align_inner, align_left, align_right: Combines frames
          horizontally, auto-determining the common key columns and aligning rows
          using the same logic as `align_frames` (note that "align" is an alias for
          "align_full"). The "align" strategy determines the type of join used to
          align the frames, equivalent to the "how" parameter on `align_frames`.
          Note that the common join columns are automatically coalesced, but other
          column collisions will raise an error (if you need more control over this
          you should use a suitable `join` method directly).
    rechunk
        Make sure that the result data is in contiguous memory.
    parallel
        Only relevant for LazyFrames. This determines if the concatenated
        lazy computations may be executed in parallel.

    Raises
    ------
    ValueError
        If `items` is empty, if `how` is not a recognised strategy for
        DataFrame/LazyFrame input, or if a non-"vertical" strategy is requested
        for Series input.
    TypeError
        If an "align" strategy is requested for non-frame input, or if the first
        item is not a DataFrame, LazyFrame, Series or Expr.
    InvalidOperationError
        If an "align" strategy is requested but the frames share no common column.

    Notes
    -----
    * The type of the *first* item decides how all items are combined.
    * A single DataFrame, LazyFrame or Series is returned as-is, before `how` is
      inspected.
    * The "align" strategies do not consult `rechunk` or `parallel`.

    Examples
    --------
    >>> df1 = pl.DataFrame({"a": [1], "b": [3]})
    >>> df2 = pl.DataFrame({"a": [2], "b": [4]})
    >>> pl.concat([df1, df2])  # default is 'vertical' strategy
    shape: (2, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ i64 │
    ╞═════╪═════╡
    │ 1   ┆ 3   │
    │ 2   ┆ 4   │
    └─────┴─────┘

    >>> df1 = pl.DataFrame({"a": [1], "b": [3]})
    >>> df2 = pl.DataFrame({"a": [2.5], "b": [4]})
    >>> pl.concat([df1, df2], how="vertical_relaxed")  # 'a' coerced into f64
    shape: (2, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ f64 ┆ i64 │
    ╞═════╪═════╡
    │ 1.0 ┆ 3   │
    │ 2.5 ┆ 4   │
    └─────┴─────┘

    >>> df_h1 = pl.DataFrame({"l1": [1, 2], "l2": [3, 4]})
    >>> df_h2 = pl.DataFrame({"r1": [5, 6], "r2": [7, 8], "r3": [9, 10]})
    >>> pl.concat([df_h1, df_h2], how="horizontal")
    shape: (2, 5)
    ┌─────┬─────┬─────┬─────┬─────┐
    │ l1  ┆ l2  ┆ r1  ┆ r2  ┆ r3  │
    │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╪═════╪═════╡
    │ 1   ┆ 3   ┆ 5   ┆ 7   ┆ 9   │
    │ 2   ┆ 4   ┆ 6   ┆ 8   ┆ 10  │
    └─────┴─────┴─────┴─────┴─────┘

    The "diagonal" strategy allows for some frames to have missing columns,
    the values for which are filled with `null`:

    >>> df_d1 = pl.DataFrame({"a": [1], "b": [3]})
    >>> df_d2 = pl.DataFrame({"a": [2], "c": [4]})
    >>> pl.concat([df_d1, df_d2], how="diagonal")
    shape: (2, 3)
    ┌─────┬──────┬──────┐
    │ a   ┆ b    ┆ c    │
    │ --- ┆ ---  ┆ ---  │
    │ i64 ┆ i64  ┆ i64  │
    ╞═════╪══════╪══════╡
    │ 1   ┆ 3    ┆ null │
    │ 2   ┆ null ┆ 4    │
    └─────┴──────┴──────┘

    The "align" strategies require at least one common column to align on:

    >>> df_a1 = pl.DataFrame({"id": [1, 2], "x": [3, 4]})
    >>> df_a2 = pl.DataFrame({"id": [2, 3], "y": [5, 6]})
    >>> df_a3 = pl.DataFrame({"id": [1, 3], "z": [7, 8]})
    >>> pl.concat([df_a1, df_a2, df_a3], how="align")  # equivalent to "align_full"
    shape: (3, 4)
    ┌─────┬──────┬──────┬──────┐
    │ id  ┆ x    ┆ y    ┆ z    │
    │ --- ┆ ---  ┆ ---  ┆ ---  │
    │ i64 ┆ i64  ┆ i64  ┆ i64  │
    ╞═════╪══════╪══════╪══════╡
    │ 1   ┆ 3    ┆ null ┆ 7    │
    │ 2   ┆ 4    ┆ 5    ┆ null │
    │ 3   ┆ null ┆ 6    ┆ 8    │
    └─────┴──────┴──────┴──────┘
    >>> pl.concat([df_a1, df_a2, df_a3], how="align_left")
    shape: (2, 4)
    ┌─────┬─────┬──────┬──────┐
    │ id  ┆ x   ┆ y    ┆ z    │
    │ --- ┆ --- ┆ ---  ┆ ---  │
    │ i64 ┆ i64 ┆ i64  ┆ i64  │
    ╞═════╪═════╪══════╪══════╡
    │ 1   ┆ 3   ┆ null ┆ 7    │
    │ 2   ┆ 4   ┆ 5    ┆ null │
    └─────┴─────┴──────┴──────┘
    >>> pl.concat([df_a1, df_a2, df_a3], how="align_right")
    shape: (2, 4)
    ┌─────┬──────┬──────┬─────┐
    │ id  ┆ x    ┆ y    ┆ z   │
    │ --- ┆ ---  ┆ ---  ┆ --- │
    │ i64 ┆ i64  ┆ i64  ┆ i64 │
    ╞═════╪══════╪══════╪═════╡
    │ 1   ┆ null ┆ null ┆ 7   │
    │ 3   ┆ null ┆ 6    ┆ 8   │
    └─────┴──────┴──────┴─────┘
    >>> pl.concat([df_a1, df_a2, df_a3], how="align_inner")
    shape: (0, 4)
    ┌─────┬─────┬─────┬─────┐
    │ id  ┆ x   ┆ y   ┆ z   │
    │ --- ┆ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╪═════╡
    └─────┴─────┴─────┴─────┘
    """  # noqa: W505
    # unpack/standardise (handles generator input)
    elems = list(items)

    if not elems:
        msg = "cannot concat empty list"
        raise ValueError(msg)
    elif len(elems) == 1 and isinstance(
        elems[0], (pl.DataFrame, pl.Series, pl.LazyFrame)
    ):
        # nothing to combine; note that `how` is not validated on this path
        return elems[0]

    if how.startswith("align"):
        if not isinstance(elems[0], (pl.DataFrame, pl.LazyFrame)):
            msg = f"{how!r} strategy is not supported for {qualified_type_name(elems[0])!r}"
            raise TypeError(msg)

        # resolve each frame's schema exactly once; it feeds both the output
        # column order and the common-column computation below
        schemas = [e.collect_schema() for e in elems]

        # establish common columns, maintaining the order in which they appear
        all_columns = list(chain.from_iterable(schemas))
        key = {v: k for k, v in enumerate(ordered_unique(all_columns))}
        output_column_order = list(key)
        common_cols = sorted(
            reduce(
                lambda x, y: set(x) & set(y),  # type: ignore[arg-type, return-value]
                schemas,
            ),
            key=lambda k: key.get(k, 0),
        )
        # we require at least one key column for 'align' strategies
        if not common_cols:
            msg = f"{how!r} strategy requires at least one common column"
            raise InvalidOperationError(msg)

        # align frame data using a join, with no suffix-resolution (will raise
        # a DuplicateError in case of column collision, same as "horizontal")
        join_method: JoinStrategy = (
            "full" if how == "align" else how.removeprefix("align_")  # type: ignore[assignment]
        )
        lf: LazyFrame = (
            reduce(
                lambda x, y: (
                    x.join(
                        y,
                        on=common_cols,
                        how=join_method,
                        maintain_order="right_left",
                        coalesce=True,
                    )
                ),
                [df.lazy() for df in elems],
            )
            .sort(by=common_cols, maintain_order=True)
            .select(*output_column_order)
        )
        # the join is always built lazily; only materialise for eager input
        # (`rechunk` and `parallel` are not consulted on this path)
        eager = isinstance(elems[0], pl.DataFrame)
        return lf.collect() if eager else lf  # type: ignore[return-value]

    out: Series | DataFrame | LazyFrame | Expr
    first = elems[0]

    from polars.lazyframe.opt_flags import QueryOptFlags

    if isinstance(first, pl.DataFrame):
        if how == "vertical":
            out = wrap_df(plr.concat_df(elems))
        elif how == "vertical_relaxed":
            # supertype coercion is routed through the lazy engine
            out = wrap_ldf(
                plr.concat_lf(
                    [df.lazy() for df in elems],
                    rechunk=rechunk,
                    parallel=parallel,
                    to_supertypes=True,
                )
            ).collect(optimizations=QueryOptFlags._eager())

        elif how == "diagonal":
            out = wrap_df(plr.concat_df_diagonal(elems))
        elif how == "diagonal_relaxed":
            out = wrap_ldf(
                plr.concat_lf_diagonal(
                    [df.lazy() for df in elems],
                    rechunk=rechunk,
                    parallel=parallel,
                    to_supertypes=True,
                )
            ).collect(optimizations=QueryOptFlags._eager())
        elif how == "horizontal":
            out = wrap_df(plr.concat_df_horizontal(elems))
        else:
            allowed = ", ".join(repr(m) for m in get_args(ConcatMethod))
            msg = f"DataFrame `how` must be one of {{{allowed}}}, got {how!r}"
            raise ValueError(msg)

    elif isinstance(first, pl.LazyFrame):
        # lazy results are returned directly; `rechunk` is forwarded to the
        # vertical/diagonal concat calls rather than applied afterwards
        if how in ("vertical", "vertical_relaxed"):
            return wrap_ldf(
                plr.concat_lf(
                    elems,
                    rechunk=rechunk,
                    parallel=parallel,
                    to_supertypes=how.endswith("relaxed"),
                )
            )
        elif how in ("diagonal", "diagonal_relaxed"):
            return wrap_ldf(
                plr.concat_lf_diagonal(
                    elems,
                    rechunk=rechunk,
                    parallel=parallel,
                    to_supertypes=how.endswith("relaxed"),
                )
            )
        elif how == "horizontal":
            return wrap_ldf(
                plr.concat_lf_horizontal(
                    elems,
                    parallel=parallel,
                )
            )
        else:
            allowed = ", ".join(repr(m) for m in get_args(ConcatMethod))
            msg = f"LazyFrame `how` must be one of {{{allowed}}}, got {how!r}"
            raise ValueError(msg)

    elif isinstance(first, pl.Series):
        if how == "vertical":
            out = wrap_s(plr.concat_series(elems))
        else:
            msg = "Series only supports 'vertical' concat strategy"
            raise ValueError(msg)

    elif isinstance(first, pl.Expr):
        # expression input: `how` is not consulted on this path
        return wrap_expr(plr.concat_expr([e._pyexpr for e in elems], rechunk))
    else:
        msg = f"did not expect type: {qualified_type_name(first)!r} in `concat`"
        raise TypeError(msg)

    if rechunk:
        return out.rechunk()
    return out
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _alignment_join(
    *idx_frames: tuple[int, LazyFrame],
    align_on: list[str],
    how: JoinStrategy = "full",
    descending: bool | Sequence[bool] = False,
) -> LazyFrame:
    """
    Fold the indexed frames into one key-aligned, key-sorted LazyFrame.

    Parameters
    ----------
    *idx_frames
        `(index, LazyFrame)` pairs; each frame is joined onto the running result
        in turn, using `":<index>"` of the incoming frame as the join suffix.
    align_on
        Names of the key columns to join and sort on.
    how
        Join strategy passed to every pairwise join.
    descending
        Sort direction(s) for the key columns of the final frame.
    """
    from polars.lazyframe import QueryOptFlags

    def _merge(
        acc: tuple[int, LazyFrame],
        incoming: tuple[int, LazyFrame],
    ) -> tuple[int, LazyFrame]:
        # carry the incoming frame's index forward alongside the merged frame
        right_idx, right = incoming
        merged = acc[1].join(
            right,
            on=align_on,
            how=how,
            suffix=f":{right_idx}",
            coalesce=True,
            nulls_equal=True,
            maintain_order="right_left",
        )
        return right_idx, merged

    _, combined = reduce(_merge, idx_frames)
    aligned = combined.sort(
        by=align_on, descending=descending, maintain_order=True
    )

    # note: can stack overflow if the join becomes too large, so we
    # collect eagerly when hitting a large enough number of frames
    if len(idx_frames) < 250:
        return aligned
    return aligned.collect(optimizations=QueryOptFlags.none()).lazy()
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def align_frames(
    *frames: FrameType | Iterable[FrameType],
    on: str | Expr | Sequence[str] | Sequence[Expr] | Sequence[str | Expr],
    how: JoinStrategy = "full",
    select: str | Expr | Sequence[str | Expr] | None = None,
    descending: bool | Sequence[bool] = False,
) -> list[FrameType]:
    r"""
    Align a sequence of frames using common values from one or more columns as a key.

    Frames that do not contain the given key values have rows injected (with nulls
    filling the non-key columns), and each resulting frame is sorted by the key.

    The original column order of input frames is not changed unless `select` is
    specified (in which case the final column order is determined from that). In the
    case where duplicate key values exist, the alignment behaviour is determined by
    the given alignment strategy specified in the `how` parameter (by default this
    is a full outer join, but if your data is suitable you can get a large speedup
    by setting `how="left"` instead).

    Note that this function does not result in a joined frame - you receive the same
    number of frames back that you passed in, but each is now aligned by key and has
    the same number of rows.

    Parameters
    ----------
    frames
        Sequence of DataFrames or LazyFrames; these can be given as individual
        positional arguments or as a single iterable of frames.
    on
        One or more columns whose unique values will be used to align the frames.
    how
        By default the row alignment values are determined using a full outer join
        strategy across all frames; if you know that the first frame contains all
        required keys, you can set `how="left"` for a large performance increase.
    select
        Optional post-alignment column select to constrain and/or order
        the columns returned from the newly aligned frames.
    descending
        Sort the alignment column values in descending order; can be a single
        boolean or a list of booleans associated with each column in `on`.

    Returns
    -------
    list
        One aligned frame per input frame (an empty list if no frames are given).

    Raises
    ------
    TypeError
        If the input frames are not all of the same type.

    Examples
    --------
    >>> from datetime import date
    >>> df1 = pl.DataFrame(
    ...     {
    ...         "dt": [date(2022, 9, 1), date(2022, 9, 2), date(2022, 9, 3)],
    ...         "x": [3.5, 4.0, 1.0],
    ...         "y": [10.0, 2.5, 1.5],
    ...     }
    ... )
    >>> df2 = pl.DataFrame(
    ...     {
    ...         "dt": [date(2022, 9, 2), date(2022, 9, 3), date(2022, 9, 1)],
    ...         "x": [8.0, 1.0, 3.5],
    ...         "y": [1.5, 12.0, 5.0],
    ...     }
    ... )
    >>> df3 = pl.DataFrame(
    ...     {
    ...         "dt": [date(2022, 9, 3), date(2022, 9, 2)],
    ...         "x": [2.0, 5.0],
    ...         "y": [2.5, 2.0],
    ...     }
    ... )  # doctest: +IGNORE_RESULT
    >>> pl.Config.set_tbl_formatting("UTF8_FULL")  # doctest: +IGNORE_RESULT
    #
    # df1                              df2                              df3
    # shape: (3, 3)                    shape: (3, 3)                    shape: (2, 3)
    # ┌────────────┬─────┬──────┐      ┌────────────┬─────┬──────┐      ┌────────────┬─────┬─────┐
    # │ dt         ┆ x   ┆ y    │      │ dt         ┆ x   ┆ y    │      │ dt         ┆ x   ┆ y   │
    # │ ---        ┆ --- ┆ ---  │      │ ---        ┆ --- ┆ ---  │      │ ---        ┆ --- ┆ --- │
    # │ date       ┆ f64 ┆ f64  │      │ date       ┆ f64 ┆ f64  │      │ date       ┆ f64 ┆ f64 │
    # ╞════════════╪═════╪══════╡      ╞════════════╪═════╪══════╡      ╞════════════╪═════╪═════╡
    # │ 2022-09-01 ┆ 3.5 ┆ 10.0 │\  ,->│ 2022-09-02 ┆ 8.0 ┆ 1.5  │\  ,->│ 2022-09-03 ┆ 2.0 ┆ 2.5 │
    # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤ \/   ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤ \/   ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
    # │ 2022-09-02 ┆ 4.0 ┆ 2.5  │_/\,->│ 2022-09-03 ┆ 1.0 ┆ 12.0 │_/`-->│ 2022-09-02 ┆ 5.0 ┆ 2.0 │
    # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤  /\  ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤      └────────────┴─────┴─────┘
    # │ 2022-09-03 ┆ 1.0 ┆ 1.5  │_/  `>│ 2022-09-01 ┆ 3.5 ┆ 5.0  │-//-
    # └────────────┴─────┴──────┘      └────────────┴─────┴──────┘
    ...

    Align frames by the "dt" column:

    >>> af1, af2, af3 = pl.align_frames(
    ...     df1, df2, df3, on="dt"
    ... )  # doctest: +IGNORE_RESULT
    #
    # af1                              af2                              af3
    # shape: (3, 3)                    shape: (3, 3)                    shape: (3, 3)
    # ┌────────────┬─────┬──────┐      ┌────────────┬─────┬──────┐      ┌────────────┬──────┬──────┐
    # │ dt         ┆ x   ┆ y    │      │ dt         ┆ x   ┆ y    │      │ dt         ┆ x    ┆ y    │
    # │ ---        ┆ --- ┆ ---  │      │ ---        ┆ --- ┆ ---  │      │ ---        ┆ ---  ┆ ---  │
    # │ date       ┆ f64 ┆ f64  │      │ date       ┆ f64 ┆ f64  │      │ date       ┆ f64  ┆ f64  │
    # ╞════════════╪═════╪══════╡      ╞════════════╪═════╪══════╡      ╞════════════╪══════╪══════╡
    # │ 2022-09-01 ┆ 3.5 ┆ 10.0 │----->│ 2022-09-01 ┆ 3.5 ┆ 5.0  │----->│ 2022-09-01 ┆ null ┆ null │
    # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤      ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤      ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    # │ 2022-09-02 ┆ 4.0 ┆ 2.5  │----->│ 2022-09-02 ┆ 8.0 ┆ 1.5  │----->│ 2022-09-02 ┆ 5.0  ┆ 2.0  │
    # ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤      ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤      ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    # │ 2022-09-03 ┆ 1.0 ┆ 1.5  │----->│ 2022-09-03 ┆ 1.0 ┆ 12.0 │----->│ 2022-09-03 ┆ 2.0  ┆ 2.5  │
    # └────────────┴─────┴──────┘      └────────────┴─────┴──────┘      └────────────┴──────┴──────┘
    ...

    Align frames by "dt" using "left" alignment, but keep only cols "x" and "y":

    >>> af1, af2, af3 = pl.align_frames(
    ...     df1, df2, df3, on="dt", select=["x", "y"], how="left"
    ... )  # doctest: +IGNORE_RESULT
    #
    # af1                 af2                 af3
    # shape: (3, 2)       shape: (3, 2)       shape: (3, 2)
    # ┌─────┬──────┐      ┌─────┬──────┐      ┌──────┬──────┐
    # │ x   ┆ y    │      │ x   ┆ y    │      │ x    ┆ y    │
    # │ --- ┆ ---  │      │ --- ┆ ---  │      │ ---  ┆ ---  │
    # │ f64 ┆ f64  │      │ f64 ┆ f64  │      │ f64  ┆ f64  │
    # ╞═════╪══════╡      ╞═════╪══════╡      ╞══════╪══════╡
    # │ 3.5 ┆ 10.0 │      │ 3.5 ┆ 5.0  │      │ null ┆ null │
    # ├╌╌╌╌╌┼╌╌╌╌╌╌┤      ├╌╌╌╌╌┼╌╌╌╌╌╌┤      ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    # │ 4.0 ┆ 2.5  │      │ 8.0 ┆ 1.5  │      │ 5.0  ┆ 2.0  │
    # ├╌╌╌╌╌┼╌╌╌╌╌╌┤      ├╌╌╌╌╌┼╌╌╌╌╌╌┤      ├╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    # │ 1.0 ┆ 1.5  │      │ 1.0 ┆ 12.0 │      │ 2.0  ┆ 2.5  │
    # └─────┴──────┘      └─────┴──────┘      └──────┴──────┘
    ...

    Now data is aligned, and you can easily calculate the row-wise dot product:

    >>> (af1 * af2 * af3).fill_null(0).select(pl.sum_horizontal("*").alias("dot"))
    shape: (3, 1)
    ┌───────┐
    │ dot   │
    │ ---   │
    │ f64   │
    ╞═══════╡
    │ 0.0   │
    ├╌╌╌╌╌╌╌┤
    │ 167.5 │
    ├╌╌╌╌╌╌╌┤
    │ 47.0  │
    └───────┘
    """  # noqa: W505
    if not frames:
        return []

    # a single non-frame positional argument is taken to be an iterable of frames
    if len(frames) == 1 and not isinstance(frames[0], (pl.DataFrame, pl.LazyFrame)):
        frames = frames[0]  # type: ignore[assignment]
        # materialise anything that cannot be indexed (generators, sets, ...)
        if not isinstance(frames, Sequence):
            frames = tuple(frames)
        # an empty iterable is equivalent to passing no frames at all
        if not frames:
            return []

    if len({type(f) for f in frames}) != 1:
        msg = (
            "input frames must be of a consistent type (all LazyFrame or all DataFrame)"
        )
        raise TypeError(msg)

    eager = isinstance(frames[0], pl.DataFrame)
    on = [on] if (isinstance(on, str) or not isinstance(on, Sequence)) else on
    # expressions are referred to by their output name when joining/sorting
    align_on = [(c.meta.output_name() if isinstance(c, pl.Expr) else c) for c in on]

    # create aligned master frame (this is the most expensive part; after
    # we just select out the columns representing the component frames)
    idx_frames = [(idx, frame.lazy()) for idx, frame in enumerate(frames)]  # type: ignore[union-attr]
    alignment_frame = _alignment_join(
        *idx_frames, align_on=align_on, how=how, descending=descending
    )

    # select-out aligned components from the master frame
    aligned_cols = set(alignment_frame.collect_schema())
    aligned_frames = []
    for idx, lf in idx_frames:
        # colliding columns were suffixed with ":<idx>" by the join; map each
        # one back to its original name for the frame it came from
        sfx = f":{idx}"
        df_cols = [
            F.col(f"{c}{sfx}").alias(c) if f"{c}{sfx}" in aligned_cols else F.col(c)
            for c in lf.collect_schema()
        ]
        f = alignment_frame.select(*df_cols)
        if select is not None:
            f = f.select(select)
        aligned_frames.append(f)

    return F.collect_all(aligned_frames) if eager else aligned_frames  # type: ignore[return-value]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
|
|
5
|
+
from polars._utils.various import qualified_type_name
|
|
6
|
+
|
|
7
|
+
with contextlib.suppress(ImportError): # Module not available when building docs
|
|
8
|
+
import polars._plr as plr
|
|
9
|
+
import polars._reexport as pl
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def escape_regex(s: str) -> str:
    r"""
    Escape the regex meta characters of a plain Python string.

    Parameters
    ----------
    s
        The string whose meta characters will be escaped.

    Returns
    -------
    str
        The escaped string.

    Raises
    ------
    TypeError
        If `s` is an `Expr` (use `Expr.str.escape_regex` for that) or any other
        non-`str` value.
    """
    # happy path first: only genuine strings are handed to the native routine
    if isinstance(s, str):
        return plr.escape_regex(s)

    # expressions get a dedicated hint pointing at the expression-level method
    if isinstance(s, pl.Expr):
        msg = "escape_regex function is unsupported for `Expr`, you may want use `Expr.str.escape_regex` instead"
    else:
        msg = f"escape_regex function supports only `str` type, got `{qualified_type_name(s)}`"
    raise TypeError(msg)
|