polars-runtime-compat 1.34.0b2__cp39-abi3-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of polars-runtime-compat might be problematic. Click here for more details.
- _polars_runtime_compat/.gitkeep +0 -0
- _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
- polars/__init__.py +528 -0
- polars/_cpu_check.py +265 -0
- polars/_dependencies.py +355 -0
- polars/_plr.py +99 -0
- polars/_plr.pyi +2496 -0
- polars/_reexport.py +23 -0
- polars/_typing.py +478 -0
- polars/_utils/__init__.py +37 -0
- polars/_utils/async_.py +102 -0
- polars/_utils/cache.py +176 -0
- polars/_utils/cloud.py +40 -0
- polars/_utils/constants.py +29 -0
- polars/_utils/construction/__init__.py +46 -0
- polars/_utils/construction/dataframe.py +1397 -0
- polars/_utils/construction/other.py +72 -0
- polars/_utils/construction/series.py +560 -0
- polars/_utils/construction/utils.py +118 -0
- polars/_utils/convert.py +224 -0
- polars/_utils/deprecation.py +406 -0
- polars/_utils/getitem.py +457 -0
- polars/_utils/logging.py +11 -0
- polars/_utils/nest_asyncio.py +264 -0
- polars/_utils/parquet.py +15 -0
- polars/_utils/parse/__init__.py +12 -0
- polars/_utils/parse/expr.py +242 -0
- polars/_utils/polars_version.py +19 -0
- polars/_utils/pycapsule.py +53 -0
- polars/_utils/scan.py +27 -0
- polars/_utils/serde.py +63 -0
- polars/_utils/slice.py +215 -0
- polars/_utils/udfs.py +1251 -0
- polars/_utils/unstable.py +63 -0
- polars/_utils/various.py +782 -0
- polars/_utils/wrap.py +25 -0
- polars/api.py +370 -0
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +19 -0
- polars/catalog/unity/client.py +733 -0
- polars/catalog/unity/models.py +152 -0
- polars/config.py +1571 -0
- polars/convert/__init__.py +25 -0
- polars/convert/general.py +1046 -0
- polars/convert/normalize.py +261 -0
- polars/dataframe/__init__.py +5 -0
- polars/dataframe/_html.py +186 -0
- polars/dataframe/frame.py +12582 -0
- polars/dataframe/group_by.py +1067 -0
- polars/dataframe/plotting.py +257 -0
- polars/datatype_expr/__init__.py +5 -0
- polars/datatype_expr/array.py +56 -0
- polars/datatype_expr/datatype_expr.py +304 -0
- polars/datatype_expr/list.py +18 -0
- polars/datatype_expr/struct.py +69 -0
- polars/datatypes/__init__.py +122 -0
- polars/datatypes/_parse.py +195 -0
- polars/datatypes/_utils.py +48 -0
- polars/datatypes/classes.py +1213 -0
- polars/datatypes/constants.py +11 -0
- polars/datatypes/constructor.py +172 -0
- polars/datatypes/convert.py +366 -0
- polars/datatypes/group.py +130 -0
- polars/exceptions.py +230 -0
- polars/expr/__init__.py +7 -0
- polars/expr/array.py +964 -0
- polars/expr/binary.py +346 -0
- polars/expr/categorical.py +306 -0
- polars/expr/datetime.py +2620 -0
- polars/expr/expr.py +11272 -0
- polars/expr/list.py +1408 -0
- polars/expr/meta.py +444 -0
- polars/expr/name.py +321 -0
- polars/expr/string.py +3045 -0
- polars/expr/struct.py +357 -0
- polars/expr/whenthen.py +185 -0
- polars/functions/__init__.py +193 -0
- polars/functions/aggregation/__init__.py +33 -0
- polars/functions/aggregation/horizontal.py +298 -0
- polars/functions/aggregation/vertical.py +341 -0
- polars/functions/as_datatype.py +848 -0
- polars/functions/business.py +138 -0
- polars/functions/col.py +384 -0
- polars/functions/datatype.py +121 -0
- polars/functions/eager.py +524 -0
- polars/functions/escape_regex.py +29 -0
- polars/functions/lazy.py +2751 -0
- polars/functions/len.py +68 -0
- polars/functions/lit.py +210 -0
- polars/functions/random.py +22 -0
- polars/functions/range/__init__.py +19 -0
- polars/functions/range/_utils.py +15 -0
- polars/functions/range/date_range.py +303 -0
- polars/functions/range/datetime_range.py +370 -0
- polars/functions/range/int_range.py +348 -0
- polars/functions/range/linear_space.py +311 -0
- polars/functions/range/time_range.py +287 -0
- polars/functions/repeat.py +301 -0
- polars/functions/whenthen.py +353 -0
- polars/interchange/__init__.py +10 -0
- polars/interchange/buffer.py +77 -0
- polars/interchange/column.py +190 -0
- polars/interchange/dataframe.py +230 -0
- polars/interchange/from_dataframe.py +328 -0
- polars/interchange/protocol.py +303 -0
- polars/interchange/utils.py +170 -0
- polars/io/__init__.py +64 -0
- polars/io/_utils.py +317 -0
- polars/io/avro.py +49 -0
- polars/io/clipboard.py +36 -0
- polars/io/cloud/__init__.py +17 -0
- polars/io/cloud/_utils.py +80 -0
- polars/io/cloud/credential_provider/__init__.py +17 -0
- polars/io/cloud/credential_provider/_builder.py +520 -0
- polars/io/cloud/credential_provider/_providers.py +618 -0
- polars/io/csv/__init__.py +9 -0
- polars/io/csv/_utils.py +38 -0
- polars/io/csv/batched_reader.py +142 -0
- polars/io/csv/functions.py +1495 -0
- polars/io/database/__init__.py +6 -0
- polars/io/database/_arrow_registry.py +70 -0
- polars/io/database/_cursor_proxies.py +147 -0
- polars/io/database/_executor.py +578 -0
- polars/io/database/_inference.py +314 -0
- polars/io/database/_utils.py +144 -0
- polars/io/database/functions.py +516 -0
- polars/io/delta.py +499 -0
- polars/io/iceberg/__init__.py +3 -0
- polars/io/iceberg/_utils.py +697 -0
- polars/io/iceberg/dataset.py +556 -0
- polars/io/iceberg/functions.py +151 -0
- polars/io/ipc/__init__.py +8 -0
- polars/io/ipc/functions.py +514 -0
- polars/io/json/__init__.py +3 -0
- polars/io/json/read.py +101 -0
- polars/io/ndjson.py +332 -0
- polars/io/parquet/__init__.py +17 -0
- polars/io/parquet/field_overwrites.py +140 -0
- polars/io/parquet/functions.py +722 -0
- polars/io/partition.py +491 -0
- polars/io/plugins.py +187 -0
- polars/io/pyarrow_dataset/__init__.py +5 -0
- polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
- polars/io/pyarrow_dataset/functions.py +79 -0
- polars/io/scan_options/__init__.py +5 -0
- polars/io/scan_options/_options.py +59 -0
- polars/io/scan_options/cast_options.py +126 -0
- polars/io/spreadsheet/__init__.py +6 -0
- polars/io/spreadsheet/_utils.py +52 -0
- polars/io/spreadsheet/_write_utils.py +647 -0
- polars/io/spreadsheet/functions.py +1323 -0
- polars/lazyframe/__init__.py +9 -0
- polars/lazyframe/engine_config.py +61 -0
- polars/lazyframe/frame.py +8564 -0
- polars/lazyframe/group_by.py +669 -0
- polars/lazyframe/in_process.py +42 -0
- polars/lazyframe/opt_flags.py +333 -0
- polars/meta/__init__.py +14 -0
- polars/meta/build.py +33 -0
- polars/meta/index_type.py +27 -0
- polars/meta/thread_pool.py +50 -0
- polars/meta/versions.py +120 -0
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +213 -0
- polars/ml/utilities.py +30 -0
- polars/plugins.py +155 -0
- polars/py.typed +0 -0
- polars/pyproject.toml +96 -0
- polars/schema.py +265 -0
- polars/selectors.py +3117 -0
- polars/series/__init__.py +5 -0
- polars/series/array.py +776 -0
- polars/series/binary.py +254 -0
- polars/series/categorical.py +246 -0
- polars/series/datetime.py +2275 -0
- polars/series/list.py +1087 -0
- polars/series/plotting.py +191 -0
- polars/series/series.py +9197 -0
- polars/series/string.py +2367 -0
- polars/series/struct.py +154 -0
- polars/series/utils.py +191 -0
- polars/sql/__init__.py +7 -0
- polars/sql/context.py +677 -0
- polars/sql/functions.py +139 -0
- polars/string_cache.py +185 -0
- polars/testing/__init__.py +13 -0
- polars/testing/asserts/__init__.py +9 -0
- polars/testing/asserts/frame.py +231 -0
- polars/testing/asserts/series.py +219 -0
- polars/testing/asserts/utils.py +12 -0
- polars/testing/parametric/__init__.py +33 -0
- polars/testing/parametric/profiles.py +107 -0
- polars/testing/parametric/strategies/__init__.py +22 -0
- polars/testing/parametric/strategies/_utils.py +14 -0
- polars/testing/parametric/strategies/core.py +615 -0
- polars/testing/parametric/strategies/data.py +452 -0
- polars/testing/parametric/strategies/dtype.py +436 -0
- polars/testing/parametric/strategies/legacy.py +169 -0
- polars/type_aliases.py +24 -0
- polars_runtime_compat-1.34.0b2.dist-info/METADATA +190 -0
- polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
- polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
- polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +20 -0
polars/_utils/udfs.py
ADDED
|
@@ -0,0 +1,1251 @@
|
|
|
1
|
+
"""Utilities related to user defined functions (such as those passed to `apply`)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import datetime
|
|
6
|
+
import dis
|
|
7
|
+
import inspect
|
|
8
|
+
import re
|
|
9
|
+
import sys
|
|
10
|
+
import warnings
|
|
11
|
+
from bisect import bisect_left
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
from dis import get_instructions
|
|
14
|
+
from inspect import signature
|
|
15
|
+
from itertools import count, zip_longest
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import (
|
|
18
|
+
TYPE_CHECKING,
|
|
19
|
+
Any,
|
|
20
|
+
Callable,
|
|
21
|
+
ClassVar,
|
|
22
|
+
Literal,
|
|
23
|
+
NamedTuple,
|
|
24
|
+
Union,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
from polars._utils.cache import LRUCache
|
|
28
|
+
from polars._utils.various import no_default, re_escape
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from collections.abc import Iterator, MutableMapping
|
|
32
|
+
from collections.abc import Set as AbstractSet
|
|
33
|
+
from dis import Instruction
|
|
34
|
+
|
|
35
|
+
from polars._utils.various import NoDefault
|
|
36
|
+
|
|
37
|
+
if sys.version_info >= (3, 10):
|
|
38
|
+
from typing import TypeAlias
|
|
39
|
+
else:
|
|
40
|
+
from typing_extensions import TypeAlias
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class StackValue(NamedTuple):
    """Named record for one operator together with its operand(s)."""

    # operator text (a string)
    operator: str
    # number of operands the operator takes
    operator_arity: int
    # left-hand operand, as a string
    left_operand: str
    # right-hand operand, as a string
    right_operand: str
    # optional module name; defaults to None
    # NOTE(review): presumably the module the operator/function was loaded from
    from_module: str | None = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# the kinds of object a map operation can be invoked against
MapTarget: TypeAlias = Literal["expr", "frame", "series"]
# a stack entry is either a plain string or a structured `StackValue`
StackEntry: TypeAlias = Union[str, StackValue]

# interpreter version flags; each flag can only be True if the previous one is
_MIN_PY311 = sys.version_info >= (3, 11)
_MIN_PY312 = _MIN_PY311 and sys.version_info >= (3, 12)
_MIN_PY314 = _MIN_PY312 and sys.version_info >= (3, 14)

# cache of `BytecodeParser` objects keyed by a (callable, str) tuple; the
# `LRUCache` is constructed with 32 (presumably its maximum size - confirm
# against `polars._utils.cache.LRUCache`)
_BYTECODE_PARSER_CACHE_: MutableMapping[
    tuple[Callable[[Any], Any], str], BytecodeParser
] = LRUCache(32)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class OpNames:
    """Groupings of bytecode opnames (and their operator symbols) used by the parser."""

    # binary opname -> python operator symbol
    BINARY: ClassVar[dict[str, str]] = {
        "BINARY_ADD": "+",
        "BINARY_AND": "&",
        "BINARY_FLOOR_DIVIDE": "//",
        "BINARY_LSHIFT": "<<",
        "BINARY_RSHIFT": ">>",
        "BINARY_MODULO": "%",
        "BINARY_MULTIPLY": "*",
        "BINARY_OR": "|",
        "BINARY_POWER": "**",
        "BINARY_SUBTRACT": "-",
        "BINARY_TRUE_DIVIDE": "/",
        "BINARY_XOR": "^",
    }
    # call opnames; a single "CALL" on 3.11+, otherwise the two older names
    CALL = frozenset({"CALL"} if _MIN_PY311 else {"CALL_FUNCTION", "CALL_METHOD"})
    # jump opname -> the "&" / "|" symbol it is translated to; the set of
    # opnames used is chosen once, at class creation, from the version flags
    CONTROL_FLOW: ClassVar[dict[str, str]] = (
        {
            "POP_JUMP_FORWARD_IF_FALSE": "&",
            "POP_JUMP_FORWARD_IF_TRUE": "|",
            "JUMP_IF_FALSE_OR_POP": "&",
            "JUMP_IF_TRUE_OR_POP": "|",
        }
        # note: 3.12 dropped POP_JUMP_FORWARD_IF_* opcodes
        if _MIN_PY311 and not _MIN_PY312
        else {
            "POP_JUMP_IF_FALSE": "&",
            "POP_JUMP_IF_TRUE": "|",
            "JUMP_IF_FALSE_OR_POP": "&",
            "JUMP_IF_TRUE_OR_POP": "|",
        }
    )
    # opnames that load a value, an attribute/method, and the union of both
    LOAD_VALUES = frozenset(("LOAD_CONST", "LOAD_DEREF", "LOAD_FAST", "LOAD_GLOBAL"))
    LOAD_ATTR = frozenset({"LOAD_METHOD", "LOAD_ATTR"})
    LOAD = LOAD_VALUES | LOAD_ATTR
    # specialised opname -> the generic opname it is treated as
    SIMPLIFY_SPECIALIZED: ClassVar[dict[str, str]] = {
        "LOAD_FAST_BORROW": "LOAD_FAST",
        "LOAD_SMALL_INT": "LOAD_CONST",
    }
    # opnames that are not real bytecode ops (name -> int)
    # NOTE(review): the int is presumably an opcode/arity marker - confirm at use site
    SYNTHETIC: ClassVar[dict[str, int]] = {
        "POLARS_EXPRESSION": 1,
    }
    # unary opname -> operator symbol (note that "not" is rendered as "~")
    UNARY: ClassVar[dict[str, str]] = {
        "UNARY_NEGATIVE": "-",
        "UNARY_POSITIVE": "+",
        "UNARY_NOT": "~",
    }
    # every opname `BytecodeParser.can_attempt_rewrite` accepts in the
    # rewritten instruction stream
    PARSEABLE_OPS = frozenset(
        {"BINARY_OP", "BINARY_SUBSCR", "COMPARE_OP", "CONTAINS_OP", "IS_OP"}
        | set(UNARY)
        | set(CONTROL_FLOW)
        | set(SYNTHETIC)
        | LOAD_VALUES
    )
    # superset of PARSEABLE_OPS that also includes the specialised, binary,
    # attribute-load and call opnames
    MATCHABLE_OPS = (
        set(SIMPLIFY_SPECIALIZED) | PARSEABLE_OPS | set(BINARY) | LOAD_ATTR | CALL
    )
    # just the operator symbols of the unary ops
    UNARY_VALUES = frozenset(UNARY.values())
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# math module funcs that we can map to native expressions
# (names as they appear in the stdlib `math` module)
_MATH_FUNCTIONS = frozenset(
    (
        "acos",
        "acosh",
        "asin",
        "asinh",
        "atan",
        "atanh",
        "cbrt",
        "ceil",
        "cos",
        "cosh",
        "degrees",
        "exp",
        "floor",
        "log",
        "log10",
        "log1p",
        "pow",
        "radians",
        "sin",
        "sinh",
        "sqrt",
        "tan",
        "tanh",
    )
)

# numpy functions that we can map to native expressions
# (`_NUMPY_MODULE_ALIASES` holds the names the numpy module is recognised under)
_NUMPY_MODULE_ALIASES = frozenset(("np", "numpy"))
_NUMPY_FUNCTIONS = frozenset(
    (
        # "abs",  # TODO: this one clashes with Python builtin abs
        "arccos",
        "arccosh",
        "arcsin",
        "arcsinh",
        "arctan",
        "arctanh",
        "cbrt",
        "ceil",
        "cos",
        "cosh",
        "degrees",
        "exp",
        "floor",
        "log",
        "log10",
        "log1p",
        "radians",
        "sign",
        "sin",
        "sinh",
        "sqrt",
        "tan",
        "tanh",
    )
)
|
|
182
|
+
|
|
183
|
+
# python attrs/funcs that map to native expressions
# attribute name -> expression text (already includes the call parentheses)
_PYTHON_ATTRS_MAP = {
    "date": "dt.date()",
    "day": "dt.day()",
    "hour": "dt.hour()",
    "microsecond": "dt.microsecond()",
    "minute": "dt.minute()",
    "month": "dt.month()",
    "second": "dt.second()",
    "year": "dt.year()",
}
# python builtin cast name -> polars dtype name
_PYTHON_CASTS_MAP = {"float": "Float64", "int": "Int64", "str": "String"}
# the recognised builtins: the cast names above, plus "abs"
_PYTHON_BUILTINS = frozenset(_PYTHON_CASTS_MAP) | {"abs"}
# python method name -> namespaced expression method (no call parentheses)
_PYTHON_METHODS_MAP = {
    # string
    "endswith": "str.ends_with",
    "lower": "str.to_lowercase",
    "lstrip": "str.strip_chars_start",
    "removeprefix": "str.strip_prefix",
    "removesuffix": "str.strip_suffix",
    "replace": "str.replace",
    "rstrip": "str.strip_chars_end",
    "startswith": "str.starts_with",
    "strip": "str.strip_chars",
    "title": "str.to_titlecase",
    "upper": "str.to_uppercase",
    "zfill": "str.zfill",
    # temporal
    "date": "dt.date",
    "day": "dt.day",
    "hour": "dt.hour",
    "isoweekday": "dt.weekday",
    "microsecond": "dt.microsecond",
    "month": "dt.month",
    "second": "dt.second",
    "strftime": "dt.strftime",
    "time": "dt.time",
    "year": "dt.year",
}
|
|
222
|
+
|
|
223
|
+
# Each entry describes one recognisable "module function call" shape: for every
# key, the value is a list of sets of acceptable opnames/names (an empty list
# is used where that part of the pattern is absent).
_MODULE_FUNCTIONS: list[dict[str, list[AbstractSet[str]]]] = [
    # lambda x: numpy.func(x)
    # lambda x: numpy.func(CONSTANT)
    {
        "argument_1_opname": [{"LOAD_FAST", "LOAD_CONST"}],
        "argument_2_opname": [],
        "module_opname": [OpNames.LOAD_ATTR],
        "attribute_opname": [],
        "module_name": [_NUMPY_MODULE_ALIASES],
        "attribute_name": [],
        "function_name": [_NUMPY_FUNCTIONS],
    },
    # lambda x: math.func(x)
    # lambda x: math.func(CONSTANT)
    {
        "argument_1_opname": [{"LOAD_FAST", "LOAD_CONST"}],
        "argument_2_opname": [],
        "module_opname": [OpNames.LOAD_ATTR],
        "attribute_opname": [],
        "module_name": [{"math"}],
        "attribute_name": [],
        "function_name": [_MATH_FUNCTIONS],
    },
    # lambda x: json.loads(x)
    {
        "argument_1_opname": [{"LOAD_FAST"}],
        "argument_2_opname": [],
        "module_opname": [OpNames.LOAD_ATTR],
        "attribute_opname": [],
        "module_name": [{"json"}],
        "attribute_name": [],
        "function_name": [{"loads"}],
    },
    # lambda x: datetime.strptime(x, CONSTANT)
    {
        "argument_1_opname": [{"LOAD_FAST"}],
        "argument_2_opname": [{"LOAD_CONST"}],
        "module_opname": [OpNames.LOAD_ATTR],
        "attribute_opname": [],
        "module_name": [{"datetime"}],
        "attribute_name": [],
        "function_name": [{"strptime"}],
        "check_load_global": False,  # type: ignore[dict-item]
    },
    # lambda x: module.attribute.func(x, CONSTANT)
    {
        "argument_1_opname": [{"LOAD_FAST"}],
        "argument_2_opname": [{"LOAD_CONST"}],
        "module_opname": [{"LOAD_ATTR"}],
        "attribute_opname": [OpNames.LOAD_ATTR],
        "module_name": [{"datetime", "dt"}],
        "attribute_name": [{"datetime"}],
        "function_name": [{"strptime"}],
        "check_load_global": False,  # type: ignore[dict-item]
    },
]
# In addition to `lambda x: func(x)`, also support cases when a unary operation
# has been applied to `x`, like `lambda x: func(-x)` or `lambda x: func(~x)`.
# (every entry above is duplicated: once with the unary opnames, once with an
# empty list, so the list defined above is doubled in length)
_MODULE_FUNCTIONS = [
    {**kind, "argument_1_unary_opname": unary}  # type: ignore[dict-item]
    for kind in _MODULE_FUNCTIONS
    for unary in [[set(OpNames.UNARY)], []]
]
# Lookup for module functions that have different names as polars expressions
_MODULE_FUNC_TO_EXPR_NAME = {
    "math.acos": "arccos",
    "math.acosh": "arccosh",
    "math.asin": "arcsin",
    "math.asinh": "arcsinh",
    "math.atan": "arctan",
    "math.atanh": "arctanh",
    "json.loads": "str.json_decode",
}
# matches `pl.col("x") & pl.col("x").<rest>`, i.e. the same column on both sides
_RE_IMPLICIT_BOOL = re.compile(r'pl\.col\("([^"]*)"\) & pl\.col\("\1"\)\.(.+)')
# matches a leading `s.`, `srs.`, `srs<digit>.` or `series.` prefix
_RE_SERIES_NAMES = re.compile(r"^(s|srs\d?|series)\.")
# matches a string that is entirely wrapped in `bool(...)`, capturing the inside
_RE_STRIP_BOOL = re.compile(r"^bool\((.+)\)$")
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _get_all_caller_variables() -> dict[str, Any]:
|
|
302
|
+
"""Get all local and global variables from caller's frame."""
|
|
303
|
+
pkg_dir = Path(__file__).parent.parent
|
|
304
|
+
|
|
305
|
+
# https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow
|
|
306
|
+
frame = inspect.currentframe()
|
|
307
|
+
n = 0
|
|
308
|
+
try:
|
|
309
|
+
while frame:
|
|
310
|
+
fname = inspect.getfile(frame)
|
|
311
|
+
if fname.startswith(str(pkg_dir)):
|
|
312
|
+
frame = frame.f_back
|
|
313
|
+
n += 1
|
|
314
|
+
else:
|
|
315
|
+
break
|
|
316
|
+
variables: dict[str, Any]
|
|
317
|
+
if frame is None:
|
|
318
|
+
variables = {}
|
|
319
|
+
else:
|
|
320
|
+
variables = {**frame.f_locals, **frame.f_globals}
|
|
321
|
+
finally:
|
|
322
|
+
# https://docs.python.org/3/library/inspect.html
|
|
323
|
+
# > Though the cycle detector will catch these, destruction of the frames
|
|
324
|
+
# > (and local variables) can be made deterministic by removing the cycle
|
|
325
|
+
# > in a finally clause.
|
|
326
|
+
del frame
|
|
327
|
+
return variables
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def _get_target_name(col: str, expression: str, map_target: str) -> str:
|
|
331
|
+
"""The name of the object against which the 'map' is being invoked."""
|
|
332
|
+
col_expr = f'pl.col("{col}")'
|
|
333
|
+
if map_target == "expr":
|
|
334
|
+
return col_expr
|
|
335
|
+
elif map_target == "series":
|
|
336
|
+
if _RE_SERIES_NAMES.match(expression):
|
|
337
|
+
return expression.split(".", 1)[0]
|
|
338
|
+
|
|
339
|
+
# note: handle overlapping name from global variables; fallback
|
|
340
|
+
# through "s", "srs", "series" and (finally) srs0 -> srsN...
|
|
341
|
+
search_expr = expression.replace(col_expr, "")
|
|
342
|
+
for name in ("s", "srs", "series"):
|
|
343
|
+
if not re.search(rf"\b{name}\b", search_expr):
|
|
344
|
+
return name
|
|
345
|
+
n = count()
|
|
346
|
+
while True:
|
|
347
|
+
name = f"srs{next(n)}"
|
|
348
|
+
if not re.search(rf"\b{name}\b", search_expr):
|
|
349
|
+
return name
|
|
350
|
+
|
|
351
|
+
msg = f"TODO: map_target = {map_target!r}"
|
|
352
|
+
raise NotImplementedError(msg)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
class BytecodeParser:
    """Introspect UDF bytecode and determine if we can rewrite as native expression."""

    # variable name used for the map target (reset by `to_expression`)
    _map_target_name: str | None = None
    # memoised result of `can_attempt_rewrite` (None = not yet computed)
    _can_attempt_rewrite: bool | None = None
    # caller variables handed on to the instruction rewriter/translator
    _caller_variables: dict[str, Any] | None = None
    # memoised `(column name, expression)` from `to_expression`; `no_default`
    # means "not yet computed", None means "no expression could be produced"
    _col_expression: tuple[str, str] | NoDefault | None = no_default

    def __init__(self, function: Callable[[Any], Any], map_target: MapTarget) -> None:
        """
        Initialize BytecodeParser instance and prepare to introspect UDFs.

        Parameters
        ----------
        function : callable
            The function/lambda to disassemble and introspect.
        map_target : {'expr','series','frame'}
            The underlying target object type of the map operation.
        """
        try:
            original_instructions = get_instructions(function)
        except TypeError:
            # in case we hit something that can't be disassembled (eg: code object
            # unavailable, like a bare numpy ufunc that isn't in a lambda/function)
            original_instructions = iter([])

        self._function = function
        self._map_target = map_target
        self._param_name = self._get_param_name(function)
        self._rewritten_instructions = RewrittenInstructions(
            instructions=original_instructions,
            caller_variables=self._caller_variables,
            function=function,
        )

    def _omit_implicit_bool(self, expr: str) -> str:
        """Drop extraneous/implied bool (eg: `pl.col("d") & pl.col("d").dt.date()`)."""
        # repeat until no `pl.col("x") & pl.col("x").<...>` pattern remains
        while _RE_IMPLICIT_BOOL.search(expr):
            expr = _RE_IMPLICIT_BOOL.sub(repl=r'pl.col("\1").\2', string=expr)
        return expr

    @staticmethod
    def _get_param_name(function: Callable[[Any], Any]) -> str | None:
        """
        Return single function parameter name.

        Returns None if the signature cannot be determined, or if the function
        does not take exactly one parameter.
        """
        try:
            # note: we do not parse/handle functions with > 1 params
            sig = signature(function)
        except ValueError:
            return None
        return (
            next(iter(parameters.keys()))
            if len(parameters := sig.parameters) == 1
            else None
        )

    def _inject_nesting(
        self,
        expression_blocks: dict[int, str],
        logical_instructions: list[Instruction],
    ) -> list[tuple[int, str]]:
        """
        Inject nesting boundaries into expression blocks (as parentheses).

        Parameters
        ----------
        expression_blocks
            Expression strings keyed by bytecode offset; modified in place.
        logical_instructions
            The control flow ("and"/"or") instructions joining the blocks.

        Returns
        -------
        list of (offset, expression) tuples, sorted by offset.
        """
        if logical_instructions:
            # reconstruct nesting for mixed 'and'/'or' ops by associating control flow
            # jump offsets with their target expression blocks and applying parens
            if len({inst.opname for inst in logical_instructions}) > 1:
                block_offsets: list[int] = list(expression_blocks.keys())
                prev_end = -1
                for inst in logical_instructions:
                    start = block_offsets[bisect_left(block_offsets, inst.offset) - 1]
                    end = block_offsets[bisect_left(block_offsets, inst.argval) - 1]
                    # no parens needed if the span covers the whole expression
                    if not (start == 0 and end == block_offsets[-1]):
                        if prev_end not in (start, end):
                            expression_blocks[start] = "(" + expression_blocks[start]
                            expression_blocks[end] += ")"
                            prev_end = end

            for inst in logical_instructions:  # inject connecting "&" and "|" ops
                expression_blocks[inst.offset] = OpNames.CONTROL_FLOW[inst.opname]

        return sorted(expression_blocks.items())

    @property
    def map_target(self) -> MapTarget:
        """The map target, eg: one of 'expr', 'frame', or 'series'."""
        return self._map_target

    def can_attempt_rewrite(self) -> bool:
        """
        Determine if we may be able to offer a native polars expression instead.

        Note that `lambda x: x` is inefficient, but we ignore it because it is not
        guaranteed that using the equivalent bare constant value will return the
        same output. (Hopefully nobody is writing lambdas like that anyway...)
        """
        # the result is computed once and then memoised on the instance
        if self._can_attempt_rewrite is None:
            self._can_attempt_rewrite = (
                self._param_name is not None
                # check minimum number of ops, ensuring all are parseable
                and len(self._rewritten_instructions) >= 2
                and all(
                    inst.opname in OpNames.PARSEABLE_OPS
                    for inst in self._rewritten_instructions
                )
                # exclude constructs/functions with multiple RETURN_VALUE ops
                and sum(
                    1
                    for inst in self.original_instructions
                    if inst.opname == "RETURN_VALUE"
                )
                == 1
            )
        return self._can_attempt_rewrite

    def dis(self) -> None:
        """Print disassembled function bytecode."""
        dis.dis(self._function)

    @property
    def function(self) -> Callable[[Any], Any]:
        """The function being parsed."""
        return self._function

    @property
    def original_instructions(self) -> list[Instruction]:
        """The original bytecode instructions from the function we are parsing."""
        return list(self._rewritten_instructions._original_instructions)

    @property
    def param_name(self) -> str | None:
        """The parameter name of the function being parsed."""
        return self._param_name

    @property
    def rewritten_instructions(self) -> list[Instruction]:
        """The rewritten bytecode instructions from the function we are parsing."""
        return list(self._rewritten_instructions)

    def to_expression(self, col: str) -> str | None:
        """
        Translate postfix bytecode instructions to polars expression/string.

        Parameters
        ----------
        col
            Name of the column the expression should refer to.

        Returns
        -------
        The expression string, or None if the function has no single parameter,
        if translation is not implemented for it, or if the result does not
        reference `pl.col(...)` at all.
        """
        if self._col_expression is not no_default and self._col_expression is not None:
            # reuse the memoised expression, retargeting it if the column differs
            col_name, expr = self._col_expression
            if col != col_name:
                # plain literal replacement: the new column name must be inserted
                # verbatim (regex-escaping it inside an `re.sub` replacement
                # template leaves stray backslashes in names such as "a.b")
                expr = expr.replace(f'pl.col("{col_name}")', f'pl.col("{col}")')
                self._col_expression = (col, expr)
            return expr

        self._map_target_name = None
        if self._param_name is None:
            self._col_expression = None
            return None

        # decompose bytecode into logical 'and'/'or' expression blocks (if present)
        control_flow_blocks = defaultdict(list)
        logical_instructions = []
        jump_offset = 0
        for idx, inst in enumerate(self._rewritten_instructions):
            if inst.opname in OpNames.CONTROL_FLOW:
                # the next block is keyed by the offset of the following instruction
                jump_offset = self._rewritten_instructions[idx + 1].offset
                logical_instructions.append(inst)
            else:
                control_flow_blocks[jump_offset].append(inst)

        # convert each block to a polars expression string
        try:
            expression_strings = self._inject_nesting(
                {
                    offset: InstructionTranslator(
                        instructions=ops,
                        caller_variables=self._caller_variables,
                        map_target=self._map_target,
                        function=self._function,
                    ).to_expression(
                        col=col,
                        param_name=self._param_name,
                        depth=int(bool(logical_instructions)),
                    )
                    for offset, ops in control_flow_blocks.items()
                },
                logical_instructions,
            )
        except NotImplementedError:
            self._col_expression = None
            return None

        polars_expr = " ".join(expr for _offset, expr in expression_strings)

        # note: if no 'pl.col' in the expression, it likely represents a compound
        # constant value (e.g. `lambda x: CONST + 123`), so we don't want to warn
        if "pl.col(" not in polars_expr:
            self._col_expression = None
            return None
        else:
            polars_expr = self._omit_implicit_bool(polars_expr)
            if self._map_target == "series":
                # for a series, refer to a variable name instead of `pl.col(...)`
                if (target_name := self._map_target_name) is None:
                    target_name = _get_target_name(col, polars_expr, self._map_target)
                polars_expr = polars_expr.replace(f'pl.col("{col}")', target_name)

            self._col_expression = (col, polars_expr)
            return polars_expr

    def warn(
        self,
        col: str,
        *,
        suggestion_override: str | None = None,
        udf_override: str | None = None,
    ) -> None:
        """
        Generate warning that suggests an equivalent native polars expression.

        Parameters
        ----------
        col
            Name of the column the map operation is applied to.
        suggestion_override
            Expression text to suggest instead of the one derived from bytecode.
        udf_override
            Text to display for the UDF instead of the function's own name.

        Notes
        -----
        No warning is issued if no suggested expression is available.
        """
        # Import these here so that udfs can be imported without polars installed.

        from polars._utils.various import (
            find_stacklevel,
            in_terminal_that_supports_colour,
        )
        from polars.exceptions import PolarsInefficientMapWarning

        suggested_expression = suggestion_override or self.to_expression(col)

        if suggested_expression is not None:
            if (target_name := self._map_target_name) is None:
                target_name = _get_target_name(
                    col, suggested_expression, self._map_target
                )
            func_name = udf_override or self._function.__name__ or "..."
            if func_name == "<lambda>":
                func_name = f"lambda {self._param_name}: ..."

            addendum = (
                'Note: in list.eval context, pl.col("") should be written as pl.element()'
                if 'pl.col("")' in suggested_expression
                else ""
            )
            apitype, clsname = (
                ("expressions", "Expr")
                if self._map_target == "expr"
                else ("series", "Series")
            )
            # red "-" / green "+" lines via ANSI escapes when colour is supported
            before, after = (
                (
                    f"  \033[31m- {target_name}.map_elements({func_name})\033[0m\n",
                    f"  \033[32m+ {suggested_expression}\033[0m\n{addendum}",
                )
                if in_terminal_that_supports_colour()
                else (
                    f"  - {target_name}.map_elements({func_name})\n",
                    f"  + {suggested_expression}\n{addendum}",
                )
            )
            warnings.warn(
                f"\n{clsname}.map_elements is significantly slower than the native {apitype} API.\n"
                "Only use if you absolutely CANNOT implement your logic otherwise.\n"
                "Replace this expression...\n"
                f"{before}"
                "with this one instead:\n"
                f"{after}",
                PolarsInefficientMapWarning,
                stacklevel=find_stacklevel(),
            )
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
class InstructionTranslator:
    """
    Translates Instruction bytecode to a polars expression string.

    The (already rewritten) instruction stream is first folded into a nested
    `StackValue` tree by `_to_intermediate_stack`; `to_expression` then renders
    that tree as the text of a suggested native expression.
    """

    def __init__(
        self,
        instructions: list[Instruction],
        caller_variables: dict[str, Any] | None,
        function: Callable[[Any], Any],
        map_target: MapTarget,
    ) -> None:
        self._stack = self._to_intermediate_stack(instructions, map_target)
        self._caller_variables = caller_variables
        self._function = function

    def to_expression(self, col: str, param_name: str, depth: int) -> str:
        """
        Convert intermediate stack to polars expression string.

        Parameters
        ----------
        col
            Column name substituted wherever the function parameter is used.
        param_name
            Name of the function's parameter, as it appears in the bytecode.
        depth
            Starting nesting depth; a non-zero depth parenthesises the
            generic binary-operator form.
        """
        return self._expr(self._stack, col, param_name, depth)

    @staticmethod
    def op(inst: Instruction) -> str:
        """
        Convert bytecode instruction to suitable intermediate op string.

        Raises
        ------
        AssertionError
            If the instruction is not one of the recognised forms.
        """
        if (opname := inst.opname) in OpNames.CONTROL_FLOW:
            return OpNames.CONTROL_FLOW[opname]
        elif inst.argrepr:
            return inst.argrepr
        elif opname == "IS_OP":
            return "is not" if inst.argval else "is"
        elif opname == "CONTAINS_OP":
            return "not in" if inst.argval else "in"
        elif opname in OpNames.UNARY:
            return OpNames.UNARY[opname]
        elif opname == "BINARY_SUBSCR":
            # subscripting is only translated for dict lookups (see `_expr`)
            return "replace_strict"
        else:
            msg = (
                f"unexpected or unrecognised op name ({opname})\n\n"
                "Please report a bug to https://github.com/pola-rs/polars/issues "
                "with the content of function you were passing to the `map` "
                f"expression and the following instruction object:\n{inst!r}"
            )
            raise AssertionError(msg)

    @staticmethod
    def _null_check(op: str, e1: str, e2: str) -> str | None:
        """
        Render an identity comparison against `None` as a native null check.

        Parameters
        ----------
        op
            The binary operator string.
        e1, e2
            The rendered left and right operands.

        Returns
        -------
        str or None
            `<operand>.is_null()` / `<operand>.is_not_null()` when `op` is
            "is" / "is not" and one operand is the literal "None"; otherwise
            None, meaning the caller should use another translation.
        """
        if op not in ("is", "is not"):
            return None
        # the null check applies to whichever operand is NOT the None literal
        if e2 == "None":
            target = e1
        elif e1 == "None":
            target = e2
        else:
            return None
        not_ = "" if op == "is" else "not_"
        return f"{target}.is_{not_}null()"

    def _expr(self, value: StackEntry, col: str, param_name: str, depth: int) -> str:
        """
        Take stack entry value and convert to polars expression string.

        Plain string entries are returned unchanged, except that the function
        parameter name becomes `pl.col("<col>")`. `StackValue` entries are
        rendered recursively according to their operator and arity.

        Raises
        ------
        NotImplementedError
            If a subscript is applied to something that is not a dict in the
            caller's variables.
        """
        if isinstance(value, StackValue):
            op = _RE_STRIP_BOOL.sub(r"\1", value.operator)
            e1 = self._expr(value.left_operand, col, param_name, depth + 1)
            if value.operator_arity == 1:
                if op not in OpNames.UNARY_VALUES:
                    # synthetic expression op: render as a method call on the operand
                    if e1.startswith("pl.col("):
                        call = "" if op.endswith(")") else "()"
                        return f"{e1}.{op}{call}"
                    if e1[0] in OpNames.UNARY_VALUES and e1[1:].startswith("pl.col("):
                        # operand carries a leading unary operator; wrap it so
                        # the method applies to the whole operand
                        call = "" if op.endswith(")") else "()"
                        return f"({e1}).{op}{call}"

                    # support use of consts as numpy/builtin params, eg:
                    # "np.sin(3) + np.cos(x)", or "len('const_string') + len(x)"
                    if (
                        value.from_module in _NUMPY_MODULE_ALIASES
                        and op in _NUMPY_FUNCTIONS
                    ):
                        pfx = "np."
                    elif (
                        value.from_module == "math"
                        and _MODULE_FUNC_TO_EXPR_NAME.get(f"math.{op}", op)
                        in _MATH_FUNCTIONS
                    ):
                        pfx = "math."
                    else:
                        pfx = ""
                    return f"{pfx}{op}({e1})"
                return f"{op}{e1}"
            else:
                e2 = self._expr(value.right_operand, col, param_name, depth + 1)
                null_check = self._null_check(op, e1, e2)
                if null_check is not None:
                    return null_check
                elif op in ("in", "not in"):
                    not_ = "" if op == "in" else "~"
                    return (
                        f"{not_}({e1}.is_in({e2}))"
                        if " " in e1
                        else f"{not_}{e1}.is_in({e2})"
                    )
                elif op == "replace_strict":
                    # here e1 is the subscripted object and e2 the subscript
                    if not self._caller_variables:
                        self._caller_variables = _get_all_caller_variables()
                    if not isinstance(self._caller_variables.get(e1, None), dict):
                        msg = "require dict mapping"
                        raise NotImplementedError(msg)
                    return f"{e2}.{op}({e1})"
                elif op == "<<":
                    # 2**e2 may be float if e2 was -ve, but if e1 << e2 was valid then
                    # e2 must have been +ve. therefore 2**e2 can be safely cast to
                    # i64, which may be necessary if chaining ops that assume i64.
                    return f"({e1} * 2**{e2}).cast(pl.Int64)"
                elif op == ">>":
                    # (motivation for the cast is same as the '<<' case above)
                    return f"({e1} / 2**{e2}).cast(pl.Int64)"
                else:
                    expr = f"{e1} {op} {e2}"
                    return f"({expr})" if depth else expr

        elif value == param_name:
            return f'pl.col("{col}")'

        return value

    def _to_intermediate_stack(
        self, instructions: list[Instruction], map_target: MapTarget
    ) -> StackEntry:
        """
        Take postfix bytecode and convert to an intermediate natural-order stack.

        Raises
        ------
        NotImplementedError
            If `map_target` is anything other than "expr" or "series".
        """
        if map_target in ("expr", "series"):
            stack: list[StackEntry] = []
            for inst in instructions:
                entry: StackEntry
                if inst.opname in OpNames.LOAD:
                    # operands are pushed as their textual representation
                    entry = inst.argrepr
                elif (
                    inst.opname in OpNames.UNARY
                    or OpNames.SYNTHETIC.get(inst.opname) == 1
                ):
                    entry = StackValue(
                        operator=self.op(inst),
                        operator_arity=1,
                        left_operand=stack.pop(),  # type: ignore[arg-type]
                        right_operand=None,  # type: ignore[arg-type]
                        from_module=getattr(inst, "_from_module", None),
                    )
                else:
                    entry = StackValue(
                        operator=self.op(inst),
                        operator_arity=2,
                        left_operand=stack.pop(-2),  # type: ignore[arg-type]
                        right_operand=stack.pop(-1),  # type: ignore[arg-type]
                        from_module=getattr(inst, "_from_module", None),
                    )
                stack.append(entry)
            return stack[0]

        # TODO: dataframe.map... ?
        msg = f"TODO: {map_target!r} map target not yet supported."
        raise NotImplementedError(msg)
|
|
765
|
+
|
|
766
|
+
|
|
767
|
+
class RewrittenInstructions:
    """
    Standalone class that applies Instruction rewrite/filtering rules.

    This significantly simplifies subsequent parsing by injecting
    synthetic POLARS_EXPRESSION ops into the Instruction stream for
    easy identification/translation, and separates the parsing logic
    from the identification of expression translation opportunities.

    Instances behave like a read-only sequence of the rewritten instructions
    (`len`, iteration and indexing are supported). The sequence is empty when
    the function contains an op that cannot be matched.
    """

    # ops that carry no information needed for translation; dropped up-front
    _ignored_ops = frozenset(
        [
            "COPY",
            "COPY_FREE_VARS",
            "NOT_TAKEN",
            "POP_TOP",
            "PRECALL",
            "PUSH_NULL",
            "RESUME",
            "RETURN_VALUE",
            "TO_BOOL",
        ]
    )

    def __init__(
        self,
        instructions: Iterator[Instruction],
        function: Callable[[Any], Any],
        caller_variables: dict[str, Any] | None,
    ) -> None:
        self._function = function
        self._caller_variables = caller_variables
        self._original_instructions = list(instructions)

        normalised_instructions = []

        for inst in self._unpack_superinstructions(self._original_instructions):
            if inst.opname not in self._ignored_ops:
                if inst.opname not in OpNames.MATCHABLE_OPS:
                    # a single unmatchable op means no rewrite is attempted
                    self._rewritten_instructions = []
                    return
                upgraded_inst = self._update_instruction(inst)
                normalised_instructions.append(upgraded_inst)

        self._rewritten_instructions = self._rewrite(normalised_instructions)

    def __len__(self) -> int:
        return len(self._rewritten_instructions)

    def __iter__(self) -> Iterator[Instruction]:
        return iter(self._rewritten_instructions)

    def __getitem__(self, item: Any) -> Instruction:
        return self._rewritten_instructions[item]

    def _matches(
        self,
        idx: int,
        *,
        opnames: list[AbstractSet[str]],
        argvals: list[AbstractSet[Any] | dict[Any, Any] | None] | None,
        is_attr: bool = False,
    ) -> list[Instruction]:
        """
        Check if a sequence of Instructions matches the specified ops/argvals.

        Parameters
        ----------
        idx
            The index of the first instruction to check.
        opnames
            The full opname sequence that defines a match.
        argvals
            Associated argvals that must also match (in same position as opnames).
            A `None` entry (or a missing trailing entry) matches any argval.
        is_attr
            Indicate if the match represents pure attribute access (cannot be called).

        Returns
        -------
        list of Instruction
            The matching instructions, or an empty list if there is no match.
        """
        n_required_ops, argvals = len(opnames), argvals or []
        idx_offset = idx + n_required_ops
        if (
            is_attr
            and (trailing_inst := self._instructions[idx_offset : idx_offset + 1])
            and trailing_inst[0].opname in OpNames.CALL  # not pure attr if called
        ):
            return []

        instructions = self._instructions[idx:idx_offset]
        # the length check rejects windows that run off the end of the stream;
        # zip_longest pads a shorter `argvals` with None (= match anything)
        if len(instructions) == n_required_ops and all(
            inst.opname in match_opnames
            and (match_argval is None or inst.argval in match_argval)
            for inst, match_opnames, match_argval in zip_longest(
                instructions, opnames, argvals
            )
        ):
            return instructions
        return []

    def _rewrite(self, instructions: list[Instruction]) -> list[Instruction]:
        """
        Apply rewrite rules, potentially injecting synthetic operations.

        Rules operate on the instruction stream and can examine/modify
        it as needed, pushing updates into "updated_instructions" and
        returning the number of original instructions they consumed
        (zero if the rule did not apply).
        """
        self._instructions = instructions
        updated_instructions: list[Instruction] = []
        idx = 0
        while idx < len(self._instructions):
            inst, increment = self._instructions[idx], 1
            # rules are only tried on LOAD ops; `any` stops at the first rule
            # that consumes instructions, leaving its count in `increment`
            if inst.opname not in OpNames.LOAD or not any(
                (increment := map_rewrite(idx, updated_instructions))
                for map_rewrite in (
                    # add any other rewrite methods here
                    self._rewrite_functions,
                    self._rewrite_methods,
                    self._rewrite_builtins,
                    self._rewrite_attrs,
                )
            ):
                # nothing matched: keep the instruction as-is
                updated_instructions.append(inst)
            # `increment` is 0 when every rule declined; always advance
            idx += increment or 1
        return updated_instructions

    def _rewrite_attrs(self, idx: int, updated_instructions: list[Instruction]) -> int:
        """Replace python attribute lookup with synthetic POLARS_EXPRESSION op."""
        if matching_instructions := self._matches(
            idx,
            opnames=[{"LOAD_FAST"}, {"LOAD_ATTR"}],
            argvals=[None, _PYTHON_ATTRS_MAP],
            is_attr=True,
        ):
            inst = matching_instructions[1]
            expr_name = _PYTHON_ATTRS_MAP[inst.argval]
            px = inst._replace(
                opname="POLARS_EXPRESSION", argval=expr_name, argrepr=expr_name
            )
            updated_instructions.extend([matching_instructions[0], px])

        return len(matching_instructions)

    def _rewrite_builtins(
        self, idx: int, updated_instructions: list[Instruction]
    ) -> int:
        """Replace builtin function calls with a synthetic POLARS_EXPRESSION op."""
        if matching_instructions := self._matches(
            idx,
            opnames=[{"LOAD_GLOBAL"}, {"LOAD_FAST", "LOAD_CONST"}, OpNames.CALL],
            argvals=[_PYTHON_BUILTINS],
        ):
            inst1, inst2 = matching_instructions[:2]
            if (argval := inst1.argval) in _PYTHON_CASTS_MAP:
                dtype = _PYTHON_CASTS_MAP[argval]
                argval = f"cast(pl.{dtype})"

            px = inst1._replace(
                opname="POLARS_EXPRESSION",
                argval=argval,
                argrepr=argval,
                offset=inst2.offset,
            )
            # POLARS_EXPRESSION is mapped as a unary op, so switch instruction order
            operand = inst2._replace(offset=inst1.offset)
            updated_instructions.extend((operand, px))

        return len(matching_instructions)

    def _rewrite_functions(
        self, idx: int, updated_instructions: list[Instruction]
    ) -> int:
        """
        Replace function calls with a synthetic POLARS_EXPRESSION op.

        Two passes are made over `_MODULE_FUNCTIONS`: first matching
        module-qualified calls (eg: "np.sin(x)"), then bare names that were
        imported directly (eg: "sin(x)"), which are resolved against the
        caller's variables.
        """
        for check_globals in (False, True):
            for function_kind in _MODULE_FUNCTIONS:
                # NOTE(review): this abandons the whole pass rather than just
                # skipping this kind; presumably kinds that opt out of the
                # globals check are listed last - confirm against the table.
                if check_globals and not function_kind.get("check_load_global", True):
                    return 0

                opnames: list[AbstractSet[str]] = (
                    [
                        {"LOAD_GLOBAL", "LOAD_DEREF"},
                        *function_kind["argument_1_opname"],
                        *function_kind["argument_1_unary_opname"],
                        *function_kind["argument_2_opname"],
                        OpNames.CALL,
                    ]
                    if check_globals
                    else [
                        {"LOAD_GLOBAL", "LOAD_DEREF"},
                        *function_kind["module_opname"],
                        *function_kind["attribute_opname"],
                        *function_kind["argument_1_opname"],
                        *function_kind["argument_1_unary_opname"],
                        *function_kind["argument_2_opname"],
                        OpNames.CALL,
                    ]
                )
                module_aliases = function_kind["module_name"]
                if matching_instructions := self._matches(
                    idx,
                    opnames=opnames,
                    argvals=[
                        *function_kind["function_name"],
                    ]
                    if check_globals
                    else [
                        *function_kind["module_name"],
                        *function_kind["attribute_name"],
                        *function_kind["function_name"],
                    ],
                ):
                    attribute_count = len(function_kind["attribute_name"])
                    inst1, inst2, inst3 = matching_instructions[
                        attribute_count : 3 + attribute_count
                    ]
                    if check_globals:
                        if not self._caller_variables:
                            self._caller_variables = _get_all_caller_variables()
                        if (expr_name := inst1.argval) not in self._caller_variables:
                            continue
                        else:
                            module_name = self._caller_variables[expr_name].__module__
                            if not any((module_name in m) for m in module_aliases):
                                continue
                            expr_name = _MODULE_FUNC_TO_EXPR_NAME.get(
                                f"{module_name}.{expr_name}", expr_name
                            )
                    elif inst1.argval == "json":
                        expr_name = "str.json_decode"
                    elif inst1.argval == "datetime":
                        fmt = matching_instructions[attribute_count + 3].argval
                        expr_name = f'str.to_datetime(format="{fmt}")'
                        if not self._is_stdlib_datetime(
                            inst1.argval,
                            matching_instructions[0].argval,
                            attribute_count,
                        ):
                            # skip these instructions if not stdlib datetime function
                            return len(matching_instructions)
                    elif inst1.argval == "math":
                        expr_name = _MODULE_FUNC_TO_EXPR_NAME.get(
                            f"math.{inst2.argval}", inst2.argval
                        )
                    else:
                        expr_name = inst2.argval

                    # note: POLARS_EXPRESSION is mapped as unary op, so switch
                    # instruction order/offsets (for later RPE-type stack walk)
                    swap_inst = inst2 if check_globals else inst3
                    px = inst1._replace(
                        opname="POLARS_EXPRESSION",
                        argval=expr_name,
                        argrepr=expr_name,
                        offset=swap_inst.offset,
                    )
                    px._from_module = None if check_globals else (inst1.argval or None)  # type: ignore[attr-defined]
                    operand = swap_inst._replace(offset=inst1.offset)
                    updated_instructions.extend(
                        (
                            operand,
                            matching_instructions[3 + attribute_count],
                            px,
                        )
                        if function_kind["argument_1_unary_opname"]
                        else (operand, px)
                    )
                    return len(matching_instructions)

        return 0

    @staticmethod
    def _anchored_contains(expr: str, rx: str) -> str:
        """
        Build the `str.contains` call replacing a multi-option affix test.

        Parameters
        ----------
        expr
            The mapped expression name; a name containing "starts" anchors
            the pattern at the start, anything else anchors it at the end.
        rx
            The already-escaped alternation pattern (eg: "a|b").
        """
        starts, ends = ("^", "") if "starts" in expr else ("", "$")
        # choose the quote from the pattern text itself, so an embedded single
        # quote cannot terminate the generated literal early (a pattern that
        # holds both quote kinds is still not representable this way)
        q = '"' if "'" in rx else "'"
        return f"str.contains(r{q}{starts}({rx}){ends}{q})"

    def _rewrite_methods(
        self, idx: int, updated_instructions: list[Instruction]
    ) -> int:
        """Replace python method calls with synthetic POLARS_EXPRESSION op."""
        LOAD_METHOD = OpNames.LOAD_ATTR if _MIN_PY312 else {"LOAD_METHOD"}
        if matching_instructions := (
            # method call with one arg, eg: "s.endswith('!')"
            self._matches(
                idx,
                opnames=[LOAD_METHOD, {"LOAD_CONST"}, OpNames.CALL],
                argvals=[_PYTHON_METHODS_MAP],
            )
            or
            # method call with no arg, eg: "s.lower()"
            self._matches(
                idx,
                opnames=[LOAD_METHOD, OpNames.CALL],
                argvals=[_PYTHON_METHODS_MAP],
            )
        ):
            inst = matching_instructions[0]
            expr = _PYTHON_METHODS_MAP[inst.argval]

            if matching_instructions[1].opname == "LOAD_CONST":
                param_value = matching_instructions[1].argval
                if isinstance(param_value, tuple) and expr in (
                    "str.starts_with",
                    "str.ends_with",
                ):
                    # a tuple of affixes becomes one anchored regex alternation
                    rx = "|".join(re_escape(v) for v in param_value)
                    expr = self._anchored_contains(expr, rx)
                else:
                    expr += f"({param_value!r})"

            px = inst._replace(opname="POLARS_EXPRESSION", argval=expr, argrepr=expr)
            updated_instructions.append(px)

        elif matching_instructions := (
            # method call with three args, eg: "s.replace('!','?',count=2)"
            self._matches(
                idx,
                opnames=[
                    LOAD_METHOD,
                    {"LOAD_CONST"},
                    {"LOAD_CONST"},
                    {"LOAD_CONST"},
                    OpNames.CALL,
                ],
                argvals=[_PYTHON_METHODS_MAP],
            )
            or
            # method call with two args, eg: "s.replace('!','?')"
            self._matches(
                idx,
                opnames=[LOAD_METHOD, {"LOAD_CONST"}, {"LOAD_CONST"}, OpNames.CALL],
                argvals=[_PYTHON_METHODS_MAP],
            )
        ):
            inst = matching_instructions[0]
            expr = _PYTHON_METHODS_MAP[inst.argval]

            # everything between the method load and the call is an argument
            param_values = [
                i.argval
                for i in matching_instructions[1 : len(matching_instructions) - 1]
            ]
            if expr == "str.replace":
                if len(param_values) == 3:
                    old, new, count = param_values
                    expr += f"({old!r},{new!r},n={count},literal=True)"
                else:
                    old, new = param_values
                    expr = f"str.replace_all({old!r},{new!r},literal=True)"
            else:
                expr += f"({','.join(repr(v) for v in param_values)})"

            px = inst._replace(opname="POLARS_EXPRESSION", argval=expr, argrepr=expr)
            updated_instructions.append(px)

        return len(matching_instructions)

    @staticmethod
    def _unpack_superinstructions(
        instructions: list[Instruction],
    ) -> Iterator[Instruction]:
        """Expand known 'superinstructions' into their component parts."""
        for inst in instructions:
            if inst.opname in (
                "LOAD_FAST_LOAD_FAST",
                "LOAD_FAST_BORROW_LOAD_FAST_BORROW",
            ):
                # argval holds both variable names; emit one LOAD_FAST for each
                for idx in (0, 1):
                    yield inst._replace(
                        opname="LOAD_FAST",
                        argval=inst.argval[idx],
                        argrepr=inst.argval[idx],
                    )
            else:
                yield inst

    @staticmethod
    def _update_instruction(inst: Instruction) -> Instruction:
        """Update/modify specific instructions to simplify multi-version parsing."""
        if not _MIN_PY311 and inst.opname in OpNames.BINARY:
            # update older binary opcodes using py >= 3.11 'BINARY_OP' instead
            inst = inst._replace(
                argrepr=OpNames.BINARY[inst.opname],
                opname="BINARY_OP",
            )
        elif _MIN_PY314:
            if (opname := inst.opname) in OpNames.SIMPLIFY_SPECIALIZED:
                # simplify specialised opcode variants to their more generic form
                # (eg: 'LOAD_FAST_BORROW' -> 'LOAD_FAST', etc)
                updated_params = {"opname": OpNames.SIMPLIFY_SPECIALIZED[inst.opname]}
                if opname == "LOAD_SMALL_INT":
                    updated_params["argrepr"] = str(inst.argval)
                inst = inst._replace(**updated_params)  # type: ignore[arg-type]

            elif opname == "BINARY_OP" and inst.argrepr == "[]":
                # special case for new 'BINARY_OP ([])'; revert to 'BINARY_SUBSCR'
                inst = inst._replace(opname="BINARY_SUBSCR", argrepr="")

        return inst

    def _is_stdlib_datetime(
        self, function_name: str, module_name: str, attribute_count: int
    ) -> bool:
        """
        Check that a matched name really refers to the stdlib `datetime`.

        With no attribute access the name must be bound to the
        `datetime.datetime` class in the caller's variables; with one level
        of attribute access it must be bound to the `datetime` module.
        """
        if not self._caller_variables:
            self._caller_variables = _get_all_caller_variables()
        caller_vars = self._caller_variables
        return (
            attribute_count == 0
            and caller_vars.get(function_name) is datetime.datetime
        ) or (attribute_count == 1 and caller_vars.get(module_name) is datetime)
|
|
1169
|
+
|
|
1170
|
+
|
|
1171
|
+
def _raw_function_meta(function: Callable[[Any], Any]) -> tuple[str, str]:
|
|
1172
|
+
"""Identify translatable calls that aren't wrapped inside a lambda/function."""
|
|
1173
|
+
try:
|
|
1174
|
+
func_module = function.__class__.__module__
|
|
1175
|
+
func_name = function.__name__
|
|
1176
|
+
except AttributeError:
|
|
1177
|
+
return "", ""
|
|
1178
|
+
|
|
1179
|
+
# numpy function calls
|
|
1180
|
+
if func_module == "numpy" and func_name in _NUMPY_FUNCTIONS:
|
|
1181
|
+
return "np", f"{func_name}()"
|
|
1182
|
+
|
|
1183
|
+
# python function calls
|
|
1184
|
+
elif func_module == "builtins":
|
|
1185
|
+
if func_name in _PYTHON_CASTS_MAP:
|
|
1186
|
+
return "builtins", f"cast(pl.{_PYTHON_CASTS_MAP[func_name]})"
|
|
1187
|
+
elif func_name in _MATH_FUNCTIONS:
|
|
1188
|
+
import math
|
|
1189
|
+
|
|
1190
|
+
if function is getattr(math, func_name):
|
|
1191
|
+
expr_name = _MODULE_FUNC_TO_EXPR_NAME.get(
|
|
1192
|
+
f"math.{func_name}", func_name
|
|
1193
|
+
)
|
|
1194
|
+
return "math", f"{expr_name}()"
|
|
1195
|
+
elif func_name == "loads":
|
|
1196
|
+
import json # double-check since it is referenced via 'builtins'
|
|
1197
|
+
|
|
1198
|
+
if function is json.loads:
|
|
1199
|
+
return "json", "str.json_decode()"
|
|
1200
|
+
|
|
1201
|
+
return "", ""
|
|
1202
|
+
|
|
1203
|
+
|
|
1204
|
+
def warn_on_inefficient_map(
    function: Callable[[Any], Any], columns: list[str], map_target: MapTarget
) -> None:
    """
    Emit a `PolarsInefficientMapWarning` when a `map` call has a native equivalent.

    Parameters
    ----------
    function
        The function passed to `map`.
    columns
        The column name(s) of the original object; in the case of an `Expr` this
        will be a list of length 1, containing the expression's root name.
    map_target
        The target of the `map` call. One of `"expr"`, `"frame"`, or `"series"`.

    Raises
    ------
    NotImplementedError
        If `map_target` is `"frame"`.
    """
    if map_target == "frame":
        msg = "TODO: 'frame' map-function parsing"
        raise NotImplementedError(msg)

    # only simple single-column/single-parameter functions are considered;
    # an empty-string column name is valid, but no column at all is not
    col: str = columns and columns[0]  # type: ignore[assignment]
    if not col and col != "":
        return None

    # bytecode introspection decides whether the function could be rewritten
    # as a native expression; parsers are cached per (function, target) pair
    cache_key = (function, map_target)
    parser = _BYTECODE_PARSER_CACHE_.get(cache_key)
    if parser is None:
        parser = BytecodeParser(function, map_target)
        _BYTECODE_PARSER_CACHE_[cache_key] = parser

    if parser.can_attempt_rewrite():
        parser.warn(col)
        return None

    # otherwise, the callable may be a bare numpy/json/builtin function
    module, suggestion = _raw_function_meta(function)
    if not (module and suggestion):
        return None

    target_name = _get_target_name(col, suggestion, map_target)
    parser._map_target_name = target_name
    fn = function.__name__
    udf_name = fn if module == "builtins" else f"{module}.{fn}"
    parser.warn(
        col,
        suggestion_override=f"{target_name}.{suggestion}",
        udf_override=udf_name,
    )
    return None
|
|
1249
|
+
|
|
1250
|
+
|
|
1251
|
+
# names exported by `from ... import *`
__all__ = ["BytecodeParser", "warn_on_inefficient_map"]
|