Flowfile 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
flowfile_frame/expr.py
ADDED
|
@@ -0,0 +1,1163 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Union, TYPE_CHECKING, List, Literal, TypeVar
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
from polars.expr.string import ExprStringNameSpace
|
|
7
|
+
|
|
8
|
+
from flowfile_core.schemas import transform_schema
|
|
9
|
+
|
|
10
|
+
from builtins import len as built_in_len
|
|
11
|
+
|
|
12
|
+
# --- TYPE CHECKING IMPORTS ---
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from flowfile_frame.selectors import Selector
|
|
15
|
+
ExprType = TypeVar('ExprType', bound='Expr')
|
|
16
|
+
ColumnType = "Column" # Use string literal instead of direct class reference
|
|
17
|
+
|
|
18
|
+
ExprOrStr = Union['Expr', str]
|
|
19
|
+
ExprOrStrList = List[ExprOrStr]
|
|
20
|
+
ExprStrOrList = Union[ExprOrStr, ExprOrStrList]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _repr_args(*args, **kwargs):
|
|
24
|
+
"""Helper to represent arguments for __repr__."""
|
|
25
|
+
arg_reprs = [repr(a) for a in args]
|
|
26
|
+
kwarg_reprs = []
|
|
27
|
+
for k, v in kwargs.items():
|
|
28
|
+
if isinstance(v, pl.DataType):
|
|
29
|
+
kwarg_reprs.append(f"{k}={v!s}")
|
|
30
|
+
elif isinstance(v, type) and issubclass(v, pl.DataType):
|
|
31
|
+
kwarg_reprs.append(f"{k}=pl.{v.__name__}")
|
|
32
|
+
else:
|
|
33
|
+
kwarg_reprs.append(f"{k}={repr(v)}")
|
|
34
|
+
return ", ".join(arg_reprs + kwarg_reprs)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_expr_and_repr(value: Any) -> tuple[Optional[pl.Expr], str]:
    """Resolve an operand into a ``(polars expression, repr string)`` pair.

    * A wrapper ``Expr`` yields its own polars expression (which may be
      ``None``) together with its stored repr string.
    * A raw ``pl.Expr`` is returned unchanged; its repr is derived from
      ``str(value)``, prefixed with ``pl.`` for ``col(...)``/``lit(...)``
      and wrapped as ``pl.Expr(...)`` otherwise.
    * Any other value is treated as a literal and wrapped with ``pl.lit``.
    """
    if isinstance(value, Expr):
        return value.expr, value._repr_str

    if not isinstance(value, pl.Expr):
        # Not an expression of either kind: assume a literal value.
        return pl.lit(value), repr(value)

    text = str(value)
    if text.startswith(("col(", "lit(")):
        return value, f"pl.{text}"
    return value, f"pl.Expr({text})"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# --- Namespaces ---
|
|
56
|
+
|
|
57
|
+
class StringMethods:
    """Wrapper around the ``.str`` namespace of a parent ``Expr``.

    Each method returns a new ``Expr`` whose repr string is the parent's repr
    string with ``.str.<method>(<args>)`` appended. When the parent has no
    polars expression, ``self.expr`` is ``None`` and the explicitly defined
    methods still build the repr string while passing ``None`` as the polars
    expression.
    """

    expr: Optional[ExprStringNameSpace]

    def __init__(self, parent_expr: 'Expr', parent_repr_str: str):
        self.parent = parent_expr
        self.expr = parent_expr.expr.str if parent_expr.expr is not None else None
        self.parent_repr_str = parent_repr_str

    def _create_next_expr(self, *args, method_name: str, result_expr: Optional[pl.Expr], is_complex: bool, **kwargs) -> 'Expr':
        """Build the ``Expr`` produced by calling ``.str.<method_name>(*args, **kwargs)``.

        ``args``/``kwargs`` are only used to render the repr string; the name,
        initial column name and ``agg_func`` are inherited from the parent and
        any selector link is dropped.
        """
        args_repr = _repr_args(*args, **kwargs)
        new_repr = f"{self.parent_repr_str}.str.{method_name}({args_repr})"
        new_expr = Expr(result_expr, self.parent.name, repr_str=new_repr,
                        initial_column_name=self.parent._initial_column_name,
                        selector=None,
                        agg_func=self.parent.agg_func,
                        is_complex=is_complex)
        return new_expr

    def contains(self, pattern, *, literal=False):
        """Wrap ``str.contains(pattern, literal=literal)``."""
        res_expr = self.expr.contains(pattern, literal=literal) if self.expr is not None else None
        return self._create_next_expr(pattern, literal=literal, method_name="contains", result_expr=res_expr, is_complex=True)

    def starts_with(self, prefix):
        """Wrap ``str.starts_with(prefix)``."""
        res_expr = self.expr.starts_with(prefix) if self.expr is not None else None
        return self._create_next_expr(prefix, is_complex=True, method_name="starts_with", result_expr=res_expr)

    def ends_with(self, suffix):
        """Wrap ``str.ends_with(suffix)``."""
        res_expr = self.expr.ends_with(suffix) if self.expr is not None else None
        return self._create_next_expr(suffix, result_expr=res_expr, method_name="ends_with", is_complex=True)

    def replace(self, pattern, replacement, *, literal=False):
        """Wrap ``str.replace(pattern, replacement, literal=literal)``."""
        res_expr = self.expr.replace(pattern, replacement, literal=literal) if self.expr is not None else None
        return self._create_next_expr(pattern, replacement, method_name="replace",
                                      result_expr=res_expr, literal=literal, is_complex=True)

    def to_uppercase(self):
        """Wrap ``str.to_uppercase()``."""
        res_expr = self.expr.to_uppercase() if self.expr is not None else None
        return self._create_next_expr(method_name="to_uppercase", result_expr=res_expr, is_complex=True)

    def to_lowercase(self):
        """Wrap ``str.to_lowercase()``."""
        res_expr = self.expr.to_lowercase() if self.expr is not None else None
        return self._create_next_expr(method_name="to_lowercase", result_expr=res_expr, is_complex=True)

    def len_chars(self):
        """Wrap ``str.len_chars()``."""
        res_expr = self.expr.len_chars() if self.expr is not None else None
        return self._create_next_expr(method_name="len_chars", result_expr=res_expr, is_complex=True)

    def len_bytes(self):
        """Wrap ``str.len_bytes()``."""
        res_expr = self.expr.len_bytes() if self.expr is not None else None
        return self._create_next_expr(method_name="len_bytes", result_expr=res_expr, is_complex=True)

    def to_titlecase(self):
        """Wrap ``str.to_titlecase()``."""
        res_expr = self.expr.to_titlecase() if self.expr is not None else None
        return self._create_next_expr(method_name="to_titlecase", result_expr=res_expr, is_complex=True)

    def __getattr__(self, name):
        """Forward attributes not defined here to the underlying polars namespace.

        Callable attributes are wrapped so that their result is returned as a
        new ``Expr`` (flagged complex, like the explicit methods above).

        Raises
        ------
        AttributeError
            If there is no underlying namespace, or it lacks ``name``.
        """
        # __getattr__ only runs after normal lookup failed. If ``expr`` itself
        # is missing (instance created without __init__, e.g. by copy/pickle),
        # reading ``self.expr`` below would recurse without end.
        if name == "expr":
            raise AttributeError(f"'StringMethods' object has no attribute '{name}'")

        if self.expr is None:
            raise AttributeError(
                f"'StringMethods' cannot call '{name}' because underlying expression is not set "
                f"(e.g., created from selector). Apply aggregation first."
            )
        if not hasattr(self.expr, name):
            raise AttributeError(f"'StringMethods' underlying expression has no attribute '{name}'")

        pl_attr = getattr(self.expr, name)
        if not callable(pl_attr):
            return pl_attr

        def wrapper(*args, **kwargs):
            result = pl_attr(*args, **kwargs)
            # method_name/result_expr/is_complex are keyword-only on
            # _create_next_expr; passing them positionally raised TypeError.
            return self._create_next_expr(*args, method_name=name, result_expr=result,
                                          is_complex=True, **kwargs)

        return wrapper
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class DateTimeMethods:
    """Wrapper around the ``.dt`` namespace of a parent ``Expr``.

    Each method returns a new ``Expr`` whose repr string is the parent's repr
    string with ``.dt.<method>(<args>)`` appended. When the parent has no
    polars expression, ``self.expr`` is ``None`` and the explicitly defined
    methods still build the repr string while passing ``None`` as the polars
    expression.
    """

    expr: Optional[Any]

    def __init__(self, parent_expr: 'Expr', parent_repr_str: str):
        self.parent = parent_expr
        self.expr = parent_expr.expr.dt if parent_expr.expr is not None else None
        self.parent_repr_str = parent_repr_str

    def _create_next_expr(self, method_name: str, result_expr: Optional[pl.Expr], *args, **kwargs) -> 'Expr':
        """Build the ``Expr`` produced by calling ``.dt.<method_name>(*args, **kwargs)``.

        ``args``/``kwargs`` are only used to render the repr string. The
        result is always flagged complex; name, initial column name and
        ``agg_func`` are inherited from the parent.
        """
        args_repr = _repr_args(*args, **kwargs)
        new_repr = f"{self.parent_repr_str}.dt.{method_name}({args_repr})"

        new_expr = Expr(result_expr, self.parent.name, repr_str=new_repr,
                        initial_column_name=self.parent._initial_column_name,
                        selector=None,
                        agg_func=self.parent.agg_func,
                        is_complex=True)
        return new_expr

    def year(self):
        """Wrap ``dt.year()``."""
        res_expr = self.expr.year() if self.expr is not None else None
        return self._create_next_expr("year", res_expr)

    def month(self):
        """Wrap ``dt.month()``."""
        res_expr = self.expr.month() if self.expr is not None else None
        return self._create_next_expr("month", res_expr)

    def day(self):
        """Wrap ``dt.day()``."""
        res_expr = self.expr.day() if self.expr is not None else None
        return self._create_next_expr("day", res_expr)

    def hour(self):
        """Wrap ``dt.hour()``."""
        res_expr = self.expr.hour() if self.expr is not None else None
        return self._create_next_expr("hour", res_expr)

    def minute(self):
        """Wrap ``dt.minute()``."""
        res_expr = self.expr.minute() if self.expr is not None else None
        return self._create_next_expr("minute", res_expr)

    def second(self):
        """Wrap ``dt.second()``."""
        res_expr = self.expr.second() if self.expr is not None else None
        return self._create_next_expr("second", res_expr)

    def __getattr__(self, name):
        """Forward attributes not defined here to the underlying polars namespace.

        Callable attributes are wrapped so that their result is returned as a
        new ``Expr``.

        Raises
        ------
        AttributeError
            If there is no underlying namespace, or it lacks ``name``.
        """
        # __getattr__ only runs after normal lookup failed. If ``expr`` itself
        # is missing (instance created without __init__, e.g. by copy/pickle),
        # reading ``self.expr`` below would recurse without end.
        if name == "expr":
            raise AttributeError(f"'DateTimeMethods' object has no attribute '{name}'")

        if self.expr is None:
            raise AttributeError(
                f"'DateTimeMethods' cannot call '{name}' because underlying expression is not set "
                f"(e.g., created from selector). Apply aggregation first."
            )
        if not hasattr(self.expr, name):
            raise AttributeError(f"'DateTimeMethods' underlying expression has no attribute '{name}'")

        pl_attr = getattr(self.expr, name)
        if not callable(pl_attr):
            return pl_attr

        def wrapper(*args, **kwargs):
            result = pl_attr(*args, **kwargs)
            return self._create_next_expr(name, result, *args, **kwargs)

        return wrapper
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
class Expr:
|
|
196
|
+
_initial_column_name: Optional[str]
|
|
197
|
+
selector: Optional['Selector']
|
|
198
|
+
expr: Optional[pl.Expr]
|
|
199
|
+
agg_func: Optional[str]
|
|
200
|
+
_repr_str: str
|
|
201
|
+
name: Optional[str]
|
|
202
|
+
is_complex: bool = False
|
|
203
|
+
|
|
204
|
+
def __init__(self,
|
|
205
|
+
expr: Optional[pl.Expr],
|
|
206
|
+
column_name: Optional[str] = None,
|
|
207
|
+
repr_str: Optional[str] = None,
|
|
208
|
+
initial_column_name: Optional[str] = None,
|
|
209
|
+
selector: Optional['Selector'] = None,
|
|
210
|
+
agg_func: Optional[str] = None,
|
|
211
|
+
ddof: Optional[int] = None,
|
|
212
|
+
is_complex: bool = False):
|
|
213
|
+
|
|
214
|
+
self.expr = expr
|
|
215
|
+
self.name = column_name
|
|
216
|
+
self.agg_func = agg_func
|
|
217
|
+
self.selector = selector
|
|
218
|
+
self._initial_column_name = initial_column_name or column_name
|
|
219
|
+
self.is_complex = is_complex
|
|
220
|
+
# --- Determine Representation String ---
|
|
221
|
+
if repr_str is not None:
|
|
222
|
+
self._repr_str = repr_str
|
|
223
|
+
elif self.selector is not None and self.agg_func is not None:
|
|
224
|
+
selector_repr = self.selector.repr_str
|
|
225
|
+
func_name = self.agg_func
|
|
226
|
+
kwargs_dict = {}
|
|
227
|
+
if func_name in ("std", "var") and ddof is not None:
|
|
228
|
+
kwargs_dict['ddof'] = ddof
|
|
229
|
+
kwargs_repr = _repr_args(**kwargs_dict)
|
|
230
|
+
self._repr_str = f"{selector_repr}.{func_name}({kwargs_repr})"
|
|
231
|
+
self.expr = None
|
|
232
|
+
elif self.selector is not None:
|
|
233
|
+
self._repr_str = f"{self.selector.repr_str}"
|
|
234
|
+
self.expr = None
|
|
235
|
+
elif self.expr is not None:
|
|
236
|
+
_, default_repr = _get_expr_and_repr(self.expr)
|
|
237
|
+
self._repr_str = default_repr
|
|
238
|
+
else:
|
|
239
|
+
raise ValueError("Cannot initialize Expr without expr, repr_str, or selector+agg_func")
|
|
240
|
+
|
|
241
|
+
if self.name is None and self.selector is None and self.expr is not None:
|
|
242
|
+
try:
|
|
243
|
+
self.name = self.expr._output_name
|
|
244
|
+
except AttributeError:
|
|
245
|
+
try:
|
|
246
|
+
self.name = self.expr._name
|
|
247
|
+
except AttributeError:
|
|
248
|
+
pass
|
|
249
|
+
|
|
250
|
+
self._str_namespace: Optional['StringMethods'] = None
|
|
251
|
+
self._dt_namespace: Optional['DateTimeMethods'] = None
|
|
252
|
+
|
|
253
|
+
def __repr__(self) -> str:
    """Return the repr string stored on this expression (``_repr_str``)."""
    return self._repr_str
|
|
255
|
+
|
|
256
|
+
@property
def is_simple(self) -> bool:
    """
    Determines if this expression is a "simple" expression that can be directly
    converted to a GroupBy's AggColl structure.

    A simple expression is one that:
    1. References a single column directly (not through arithmetic/logical operations)
    2. May have an aggregation function applied (sum, mean, etc.)
    3. May have been aliased with a new name

    The check is textual: it scans the repr string for operators and for a
    few function names, after blanking out quoted string literals so that
    column or alias names such as ``'my-col'`` or ``'a+b'`` are not mistaken
    for operators.

    Returns
    -------
    bool
        True if this is a simple expression, False otherwise
    """
    import re  # local import: only this property needs it

    # Selector expressions are complex - they select multiple columns
    if self.selector is not None:
        return False

    repr_text = getattr(self, "_repr_str", None)
    if repr_text is None:
        return True

    # Replace every '...' or "..." literal (backslash escapes honoured) with
    # an empty literal, leaving only the code structure to inspect.
    code = re.sub(r"'(?:\\.|[^'\\])*'|\"(?:\\.|[^\"\\])*\"", "''", repr_text)

    complex_markers = (
        # when/then/otherwise expressions
        "when(", ".then(", ".otherwise(",
        # arithmetic / logical / comparison operators
        # ("//", "**", "<=", ">=" are covered by "/", "*", "<", ">")
        "+", "-", "*", "/", "%", "&", "|", "==", "!=", "<", ">",
        # other functions that create complex expressions
        "filter(", "where(", "if_else(", "case_when(", "apply(", "map(",
    )
    return not any(marker in code for marker in complex_markers)
|
|
310
|
+
|
|
311
|
+
def _create_next_expr(self, *args, method_name: str, result_expr: Optional[pl.Expr], is_complex: bool, **kwargs) -> 'Expr':
    """Return the Expr produced by calling ``method_name`` on this one.

    The new repr string is this expression's repr with
    ``.<method_name>(<args>)`` appended. Name, initial column name and
    ``agg_func`` are carried over; the selector link is dropped.
    """
    call_text = f"{self._repr_str}.{method_name}({_repr_args(*args, **kwargs)})"
    return Expr(
        result_expr,
        self.name,
        repr_str=call_text,
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=self.agg_func,
        is_complex=is_complex,
    )
|
|
323
|
+
|
|
324
|
+
def _create_binary_op_expr(
    self, op_symbol: str, other: Any, result_expr: Optional[pl.Expr]
) -> "Expr":
    """Return the Expr for ``self <op_symbol> other``.

    The result has no name, no selector and no aggregation function, is
    flagged complex, and its repr string is the whole operation wrapped in
    parentheses.

    Raises
    ------
    ValueError
        If this expression has no polars expression, or ``other`` resolves
        to no polars expression and is not a plain literal value.
    """
    if self.expr is None:
        raise ValueError(
            f"Cannot perform binary operation '{op_symbol}' on Expr without underlying polars expression."
        )

    other_expr, other_repr = _get_expr_and_repr(other)

    literal_types = (int, float, str, bool, type(None))
    if other_expr is None and not isinstance(other, literal_types):
        raise ValueError(
            f"Cannot perform binary operation '{op_symbol}' with operand without underlying polars expression or literal value: {other_repr}"
        )

    # Parenthesise the ENTIRE operation so it composes safely with later calls.
    wrapped_repr = f"({self._repr_str} {op_symbol} {other_repr})"
    return Expr(
        result_expr,
        None,
        repr_str=wrapped_repr,
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=None,
        is_complex=True,
    )
|
|
355
|
+
|
|
356
|
+
@property
def str(self) -> StringMethods:
    """String namespace for this expression, created on first access and cached."""
    namespace = self._str_namespace
    if namespace is None:
        namespace = self._str_namespace = StringMethods(self, self._repr_str)
    return namespace
|
|
361
|
+
|
|
362
|
+
@property
def dt(self) -> DateTimeMethods:
    """Datetime namespace for this expression, created on first access and cached."""
    namespace = self._dt_namespace
    if namespace is None:
        namespace = self._dt_namespace = DateTimeMethods(self, self._repr_str)
    return namespace
|
|
367
|
+
|
|
368
|
+
def sum(self):
    """Return a new Expr with ``.sum()`` applied and ``agg_func`` set to ``"sum"``.

    The polars expression of the result is ``None`` when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.sum()
    aggregated = self._create_next_expr(method_name="sum", result_expr=polars_expr, is_complex=self.is_complex)
    aggregated.agg_func = "sum"
    return aggregated
|
|
373
|
+
|
|
374
|
+
def mean(self):
    """Return a new Expr with ``.mean()`` applied and ``agg_func`` set to ``"mean"``.

    The polars expression of the result is ``None`` when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.mean()
    aggregated = self._create_next_expr(method_name="mean", result_expr=polars_expr, is_complex=self.is_complex)
    aggregated.agg_func = "mean"
    return aggregated
|
|
379
|
+
|
|
380
|
+
def min(self):
    """Return a new Expr with ``.min()`` applied and ``agg_func`` set to ``"min"``.

    The polars expression of the result is ``None`` when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.min()
    aggregated = self._create_next_expr(method_name="min", result_expr=polars_expr, is_complex=self.is_complex)
    aggregated.agg_func = "min"
    return aggregated
|
|
385
|
+
|
|
386
|
+
def max(self):
    """Return a new Expr with ``.max()`` applied and ``agg_func`` set to ``"max"``.

    The polars expression of the result is ``None`` when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.max()
    aggregated = self._create_next_expr(method_name="max", result_expr=polars_expr, is_complex=self.is_complex)
    aggregated.agg_func = "max"
    return aggregated
|
|
391
|
+
|
|
392
|
+
def median(self):
    """Return a new Expr with ``.median()`` applied and ``agg_func`` set to ``"median"``.

    The polars expression of the result is ``None`` when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.median()
    aggregated = self._create_next_expr(method_name="median", result_expr=polars_expr, is_complex=self.is_complex)
    aggregated.agg_func = "median"
    return aggregated
|
|
397
|
+
|
|
398
|
+
def count(self):
    """Return a new Expr with ``.count()`` applied and ``agg_func`` set to ``"count"``.

    The polars expression of the result is ``None`` when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.count()
    aggregated = self._create_next_expr(method_name="count", result_expr=polars_expr, is_complex=self.is_complex)
    aggregated.agg_func = "count"
    return aggregated
|
|
403
|
+
|
|
404
|
+
def first(self):
    """Return a new Expr with ``.first()`` applied and ``agg_func`` set to ``"first"``.

    The polars expression of the result is ``None`` when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.first()
    aggregated = self._create_next_expr(method_name="first", result_expr=polars_expr, is_complex=self.is_complex)
    aggregated.agg_func = "first"
    return aggregated
|
|
409
|
+
|
|
410
|
+
def last(self):
    """Return a new Expr with ``.last()`` applied and ``agg_func`` set to ``"last"``.

    The polars expression of the result is ``None`` when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.last()
    aggregated = self._create_next_expr(method_name="last", result_expr=polars_expr, is_complex=self.is_complex)
    aggregated.agg_func = "last"
    return aggregated
|
|
415
|
+
|
|
416
|
+
def n_unique(self):
    """Return a new Expr with ``.n_unique()`` applied and ``agg_func`` set to ``"n_unique"``.

    The polars expression of the result is ``None`` when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.n_unique()
    aggregated = self._create_next_expr(method_name="n_unique", result_expr=polars_expr, is_complex=self.is_complex)
    aggregated.agg_func = "n_unique"
    return aggregated
|
|
421
|
+
|
|
422
|
+
def std(self, ddof=1):
    """Return a new Expr with ``.std(ddof=ddof)`` applied and ``agg_func`` set to ``"std"``.

    The result is always flagged complex; its polars expression is ``None``
    when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.std(ddof=ddof)
    aggregated = self._create_next_expr(method_name="std", result_expr=polars_expr, ddof=ddof, is_complex=True)
    aggregated.agg_func = "std"
    return aggregated
|
|
427
|
+
|
|
428
|
+
def cum_count(self, reverse: bool = False) -> "Expr":
    """
    Return the cumulative count of the non-null values in the column.

    The result is flagged complex and has its ``agg_func`` cleared.

    Parameters
    ----------
    reverse : bool, default False
        Reverse the operation

    Returns
    -------
    Expr
        A new expression with the cumulative count
    """
    polars_expr = None if self.expr is None else self.expr.cum_count(reverse=reverse)
    counted = self._create_next_expr(method_name="cum_count", result_expr=polars_expr, reverse=reverse, is_complex=True)
    counted.agg_func = None
    return counted
|
|
448
|
+
|
|
449
|
+
def var(self, ddof=1):
    """Return a new Expr with ``.var(ddof=ddof)`` applied and ``agg_func`` set to ``"var"``.

    The result is always flagged complex; its polars expression is ``None``
    when this Expr has none.
    """
    polars_expr = None if self.expr is None else self.expr.var(ddof=ddof)
    aggregated = self._create_next_expr(method_name="var", result_expr=polars_expr, ddof=ddof, is_complex=True)
    aggregated.agg_func = "var"
    return aggregated
|
|
454
|
+
|
|
455
|
+
def __add__(self, other):
    """Implement ``self + other``; the polars result is ``None`` if either side has no polars expression."""
    rhs, _ = _get_expr_and_repr(other)
    combined = None
    if self.expr is not None and rhs is not None:
        combined = self.expr + rhs
    return self._create_binary_op_expr("+", other, combined)
|
|
459
|
+
|
|
460
|
+
def __sub__(self, other):
    """Implement ``self - other``; the polars result is ``None`` if either side has no polars expression."""
    rhs, _ = _get_expr_and_repr(other)
    combined = None
    if self.expr is not None and rhs is not None:
        combined = self.expr - rhs
    return self._create_binary_op_expr("-", other, combined)
|
|
464
|
+
|
|
465
|
+
def __mul__(self, other):
    """Implement ``self * other``; the polars result is ``None`` if either side has no polars expression."""
    rhs, _ = _get_expr_and_repr(other)
    combined = None
    if self.expr is not None and rhs is not None:
        combined = self.expr * rhs
    return self._create_binary_op_expr("*", other, combined)
|
|
469
|
+
|
|
470
|
+
def __truediv__(self, other):
    """Implement ``self / other``; the polars result is ``None`` if either side has no polars expression."""
    rhs, _ = _get_expr_and_repr(other)
    combined = None
    if self.expr is not None and rhs is not None:
        combined = self.expr / rhs
    return self._create_binary_op_expr("/", other, combined)
|
|
474
|
+
|
|
475
|
+
def __floordiv__(self, other):
    """Implement ``self // other``; the polars result is ``None`` if either side has no polars expression."""
    rhs, _ = _get_expr_and_repr(other)
    combined = None
    if self.expr is not None and rhs is not None:
        combined = self.expr // rhs
    return self._create_binary_op_expr("//", other, combined)
|
|
479
|
+
|
|
480
|
+
def __pow__(self, exponent):
    """Build the expression for ``self ** exponent``.

    Uses the polars ``pow`` method when both the base and the exponent carry
    a polars expression; otherwise ``None`` is handed to
    ``_create_binary_op_expr``.
    """
    power, _ = _get_expr_and_repr(exponent)
    raised = None
    if self.expr is not None and power is not None:
        raised = self.expr.pow(power)
    return self._create_binary_op_expr("**", exponent, raised)
|
|
484
|
+
|
|
485
|
+
def __mod__(self, other):
    """Build the expression for ``self % other``.

    The polars expression is only computed when both operands carry one;
    otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs, _ = _get_expr_and_repr(other)
    combined = None
    if self.expr is not None and rhs is not None:
        combined = self.expr % rhs
    return self._create_binary_op_expr("%", other, combined)
|
|
489
|
+
|
|
490
|
+
# --- Right-side Arithmetic ---
|
|
491
|
+
def __radd__(self, other):
    """Build the expression for ``other + self`` (reflected addition).

    The result is a fresh ``Expr`` with no column name and no aggregation
    function; its polars expression is ``None`` unless both operands carry one.
    """
    lhs, lhs_repr = _get_expr_and_repr(other)
    combined = None
    if lhs is not None and self.expr is not None:
        combined = lhs + self.expr
    # Right-side operations always clear agg_func.
    return Expr(
        combined,
        None,
        repr_str=f"{lhs_repr} + {self._repr_str}",
        agg_func=None,
        is_complex=True,
    )
|
|
497
|
+
|
|
498
|
+
def __rsub__(self, other):
    """Build the expression for ``other - self`` (reflected subtraction).

    The result is a fresh ``Expr`` with no column name and no aggregation
    function; its polars expression is ``None`` unless both operands carry one.
    """
    lhs, lhs_repr = _get_expr_and_repr(other)
    combined = None
    if lhs is not None and self.expr is not None:
        combined = lhs - self.expr
    return Expr(
        combined,
        None,
        repr_str=f"{lhs_repr} - {self._repr_str}",
        agg_func=None,
        is_complex=True,
    )
|
|
503
|
+
|
|
504
|
+
def __rmul__(self, other):
    """Build the expression for ``other * self`` (reflected multiplication).

    The result is a fresh ``Expr`` with no column name and no aggregation
    function; its polars expression is ``None`` unless both operands carry one.
    """
    lhs, lhs_repr = _get_expr_and_repr(other)
    combined = None
    if lhs is not None and self.expr is not None:
        combined = lhs * self.expr
    return Expr(
        combined,
        None,
        repr_str=f"{lhs_repr} * {self._repr_str}",
        agg_func=None,
        is_complex=True,
    )
|
|
509
|
+
|
|
510
|
+
def __rtruediv__(self, other):
    """Build the expression for ``other / self`` (reflected true division).

    The result is a fresh ``Expr`` with no column name and no aggregation
    function; its polars expression is ``None`` unless both operands carry one.
    """
    lhs, lhs_repr = _get_expr_and_repr(other)
    combined = None
    if lhs is not None and self.expr is not None:
        combined = lhs / self.expr
    return Expr(
        combined,
        None,
        repr_str=f"{lhs_repr} / {self._repr_str}",
        agg_func=None,
        is_complex=True,
    )
|
|
515
|
+
|
|
516
|
+
def __rfloordiv__(self, other):
    """Build the expression for ``other // self`` (reflected floor division).

    The result is a fresh ``Expr`` with no column name and no aggregation
    function; its polars expression is ``None`` unless both operands carry one.
    """
    lhs, lhs_repr = _get_expr_and_repr(other)
    combined = None
    if lhs is not None and self.expr is not None:
        combined = lhs // self.expr
    return Expr(
        combined,
        None,
        repr_str=f"{lhs_repr} // {self._repr_str}",
        agg_func=None,
        is_complex=True,
    )
|
|
521
|
+
|
|
522
|
+
def __rmod__(self, other):
    """Build the expression for ``other % self`` (reflected modulo).

    The result is a fresh ``Expr`` with no column name and no aggregation
    function; its polars expression is ``None`` unless both operands carry one.
    """
    lhs, lhs_repr = _get_expr_and_repr(other)
    combined = None
    if lhs is not None and self.expr is not None:
        combined = lhs % self.expr
    return Expr(
        combined,
        None,
        repr_str=f"{lhs_repr} % {self._repr_str}",
        agg_func=None,
        is_complex=True,
    )
|
|
527
|
+
|
|
528
|
+
def __rpow__(self, other):
    """Build the expression for ``other ** self`` (reflected power).

    A base that is neither an ``Expr`` nor a ``pl.Expr`` is wrapped in
    ``pl.lit`` so that the polars ``pow`` method can be called on it.
    """
    other_expr, other_repr = _get_expr_and_repr(other)
    if isinstance(other, (Expr, pl.Expr)):
        base = other_expr
    else:
        base = pl.lit(other)
    raised = None
    if self.expr is not None and base is not None:
        raised = base.pow(self.expr)
    return Expr(
        raised,
        None,
        repr_str=f"{other_repr} ** {self._repr_str}",
        agg_func=None,
        is_complex=True,
    )
|
|
534
|
+
|
|
535
|
+
# --- Comparison operations ---
|
|
536
|
+
def __eq__(self, other):
    """Build the comparison expression ``self == other``.

    Returns whatever ``_create_binary_op_expr`` produces rather than a plain
    boolean; the polars comparison is only built when both operands carry a
    polars expression.
    """
    rhs, _ = _get_expr_and_repr(other)
    compared = None
    if self.expr is not None and rhs is not None:
        compared = self.expr == rhs
    return self._create_binary_op_expr("==", other, compared)
|
|
540
|
+
|
|
541
|
+
def __ne__(self, other):
    """Build the comparison expression ``self != other``.

    Returns whatever ``_create_binary_op_expr`` produces rather than a plain
    boolean; the polars comparison is only built when both operands carry a
    polars expression.
    """
    rhs, _ = _get_expr_and_repr(other)
    compared = None
    if self.expr is not None and rhs is not None:
        compared = self.expr != rhs
    return self._create_binary_op_expr("!=", other, compared)
|
|
545
|
+
|
|
546
|
+
def __gt__(self, other):
    """Build the comparison expression ``self > other``.

    The polars comparison is only built when both operands carry a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs, _ = _get_expr_and_repr(other)
    compared = None
    if self.expr is not None and rhs is not None:
        compared = self.expr > rhs
    return self._create_binary_op_expr(">", other, compared)
|
|
550
|
+
|
|
551
|
+
def __lt__(self, other):
    """Build the comparison expression ``self < other``.

    The polars comparison is only built when both operands carry a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs, _ = _get_expr_and_repr(other)
    compared = None
    if self.expr is not None and rhs is not None:
        compared = self.expr < rhs
    return self._create_binary_op_expr("<", other, compared)
|
|
555
|
+
|
|
556
|
+
def __ge__(self, other):
    """Build the comparison expression ``self >= other``.

    The polars comparison is only built when both operands carry a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs, _ = _get_expr_and_repr(other)
    compared = None
    if self.expr is not None and rhs is not None:
        compared = self.expr >= rhs
    return self._create_binary_op_expr(">=", other, compared)
|
|
560
|
+
|
|
561
|
+
def __le__(self, other):
    """Build the comparison expression ``self <= other``.

    The polars comparison is only built when both operands carry a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs, _ = _get_expr_and_repr(other)
    compared = None
    if self.expr is not None and rhs is not None:
        compared = self.expr <= rhs
    return self._create_binary_op_expr("<=", other, compared)
|
|
565
|
+
|
|
566
|
+
# --- Logical operations ---
|
|
567
|
+
def __and__(self, other):
    """Build the logical expression ``self & other``.

    Raises:
        TypeError: If ``other`` is a ``Selector``; that combination is not supported.
    """
    # Imported lazily inside the method, as in the original implementation.
    from flowfile_frame.selectors import Selector

    if isinstance(other, Selector):
        raise TypeError("Unsupported operation: Expr & Selector")

    rhs, _ = _get_expr_and_repr(other)
    combined = None
    if self.expr is not None and rhs is not None:
        combined = self.expr & rhs
    return self._create_binary_op_expr("&", other, combined)
|
|
574
|
+
|
|
575
|
+
def __or__(self, other):
    """Build the logical expression ``self | other``.

    Raises:
        TypeError: If ``other`` is a ``Selector``; that combination is not supported.
    """
    # Imported lazily inside the method, as in the original implementation.
    from flowfile_frame.selectors import Selector

    if isinstance(other, Selector):
        raise TypeError("Unsupported operation: Expr | Selector")

    rhs, _ = _get_expr_and_repr(other)
    combined = None
    if self.expr is not None and rhs is not None:
        combined = self.expr | rhs
    return self._create_binary_op_expr("|", other, combined)
|
|
582
|
+
|
|
583
|
+
def __invert__(self):
    """Build the negated expression ``~self``.

    The operand's representation is wrapped in parentheses, and the returned
    ``Expr`` keeps the initial column name but carries no aggregation function.
    """
    negated = None if self.expr is None else ~self.expr
    # Inverting always clears agg_func.
    return Expr(
        negated,
        None,
        repr_str=f"~({self._repr_str})",
        initial_column_name=self._initial_column_name,
        agg_func=None,
    )
|
|
589
|
+
|
|
590
|
+
# --- Other useful methods ---
|
|
591
|
+
def is_null(self):
    """Build the ``is_null`` check for this expression.

    Returns:
        The expression returned by ``_create_next_expr`` with ``agg_func``
        reset to ``None`` (a null check is not an aggregation).
    """
    null_check = None
    if self.expr is not None:
        null_check = self.expr.is_null()
    next_expr = self._create_next_expr(
        method_name="is_null", result_expr=null_check, is_complex=True
    )
    next_expr.agg_func = None
    return next_expr
|
|
597
|
+
|
|
598
|
+
def filter(self, *predicates, **constraints) -> "Expr":
    """
    Filter expression

    Positional ``predicates`` are forwarded to the polars ``filter`` call;
    each ``name=value`` constraint becomes ``pl.col(name).eq(value)``.
    ``Expr`` predicates that carry no polars expression still appear in the
    string representation but are left out of the polars call. If polars
    rejects the call, a warning is printed and the returned ``Expr`` has no
    polars expression.
    """
    # --- String representation of all arguments ---
    rendered_args = []
    for predicate in predicates:
        if isinstance(predicate, Expr):
            rendered_args.append(str(predicate))
        elif isinstance(predicate, pl.Expr):
            _, predicate_repr = _get_expr_and_repr(predicate)
            rendered_args.append(predicate_repr)
        else:
            rendered_args.append(repr(predicate))
    rendered_args.extend(f"{key}={repr(val)}" for key, val in constraints.items())
    all_args_str = ", ".join(rendered_args)

    # --- Arguments for the actual polars call ---
    polars_predicates = []
    for predicate in predicates:
        if not isinstance(predicate, Expr):
            # Non-Expr predicates are handed to polars unchanged.
            polars_predicates.append(predicate)
        elif predicate.expr is not None:
            polars_predicates.append(predicate.expr)
    for col_name, value in constraints.items():
        polars_predicates.append(pl.col(col_name).eq(value))

    # --- Build the polars expression when possible ---
    filtered = None
    if self.expr is not None:
        try:
            filtered = self.expr.filter(*polars_predicates)
        except Exception as e:
            # Best effort: keep the representation even if polars rejects the call.
            print(f"Warning: Could not create polars expression for filter(): {e}")

    return Expr(
        filtered,
        self.name,
        repr_str=f"{self._repr_str}.filter({all_args_str})",
        initial_column_name=self._initial_column_name,
        selector=None,  # Filter drops the selector link
        agg_func=self.agg_func,  # Aggregation status is preserved
    )
|
|
650
|
+
|
|
651
|
+
def is_not_null(self):
    """Build the ``is_not_null`` check for this expression.

    Returns:
        The expression returned by ``_create_next_expr`` with ``agg_func``
        reset to ``None``.
    """
    not_null_check = None
    if self.expr is not None:
        not_null_check = self.expr.is_not_null()
    next_expr = self._create_next_expr(
        method_name="is_not_null", result_expr=not_null_check, is_complex=True
    )
    next_expr.agg_func = None
    return next_expr
|
|
656
|
+
|
|
657
|
+
def is_in(self, values):
    """Build the membership check ``self.is_in(values)``.

    Args:
        values: Passed unchanged to the polars ``is_in`` call and, as the
            positional argument, to ``_create_next_expr``.

    Returns:
        The expression returned by ``_create_next_expr`` with ``agg_func``
        reset to ``None`` (membership is not an aggregation).
    """
    membership = None
    if self.expr is not None:
        membership = self.expr.is_in(values)
    next_expr = self._create_next_expr(
        values, method_name="is_in", result_expr=membership, is_complex=True
    )
    next_expr.agg_func = None
    return next_expr
|
|
663
|
+
|
|
664
|
+
def alias(self, name):
    """Rename the expression result.

    The new ``Expr`` uses ``name`` as its column name, drops any selector
    link, and keeps the aggregation function and complexity flag of the
    original.
    """
    aliased = None if self.expr is None else self.expr.alias(name)
    return Expr(
        aliased,
        name,
        repr_str=f"{self._repr_str}.alias({repr(name)})",
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=self.agg_func,  # aliasing preserves aggregation status
        is_complex=self.is_complex,
    )
|
|
675
|
+
|
|
676
|
+
def fill_null(self, value):
    """Build the expression that replaces nulls with ``value``.

    Returns:
        The expression returned by ``_create_next_expr`` with ``agg_func``
        reset to ``None`` (filling nulls is not an aggregation).
    """
    filled = None
    if self.expr is not None:
        filled = self.expr.fill_null(value)
    next_expr = self._create_next_expr(
        value, method_name="fill_null", result_expr=filled, is_complex=True
    )
    next_expr.agg_func = None
    return next_expr
|
|
682
|
+
|
|
683
|
+
def fill_nan(self, value):
    """Build the expression that replaces NaN values with ``value``.

    The polars call is only made when an underlying expression exists and
    exposes ``fill_nan``; otherwise the result carries no polars expression.

    Returns:
        The expression returned by ``_create_next_expr`` with ``agg_func``
        reset to ``None``.
    """
    can_fill = self.expr is not None and hasattr(self.expr, 'fill_nan')
    filled = self.expr.fill_nan(value) if can_fill else None
    next_expr = self._create_next_expr(
        value, method_name="fill_nan", result_expr=filled, is_complex=True
    )
    next_expr.agg_func = None
    return next_expr
|
|
690
|
+
|
|
691
|
+
@staticmethod
def _get_expr_repr(expr):
    """Helper to get appropriate string representation for an expression.

    * ``Expr`` / ``Column`` -> their stored ``_repr_str``.
    * ``str`` -> a ``pl.col(...)`` call naming that column.
    * ``pl.Expr`` -> ``pl.<text>`` when its string form starts with ``col(``
      or ``lit(``, otherwise ``pl.Expr(<text>)``.
    * anything else -> ``repr(expr)``.
    """
    if isinstance(expr, (Expr, Column)):
        return expr._repr_str
    if isinstance(expr, str):
        # repr() quotes and escapes the name, so column names containing
        # quotes or backslashes still yield a valid Python string literal.
        return f"pl.col({expr!r})"
    if isinstance(expr, pl.Expr):
        base_str = str(expr)
        if base_str.startswith(("col(", "lit(")):
            return f"pl.{base_str}"
        return f"pl.Expr({base_str})"
    return repr(expr)
|
|
707
|
+
|
|
708
|
+
def over(self,
         partition_by: ExprStrOrList,
         *more_exprs: ExprOrStr,
         order_by: Optional[ExprStrOrList] = None,
         descending: bool = False,
         nulls_last: bool = False,
         mapping_strategy: Literal["group_to_rows", "join", "explode"] = "group_to_rows",
         ) -> "Expr":
    """
    Compute expressions over the given groups.

    String representation will show 'descending' and 'nulls_last' if they are True,
    regardless of 'order_by' presence. For the actual polars call they are only
    forwarded together with ``order_by``.

    Parameters
    ----------
    partition_by : str, Expr or list of these
        Column(s) to partition by; strings are converted with ``col``.
    *more_exprs : str or Expr
        Additional partition columns.
    order_by : str, Expr or list of these, optional
        Ordering inside each partition; strings are converted with ``col``.
    descending, nulls_last : bool
        Sort options tied to ``order_by``.
    mapping_strategy : {"group_to_rows", "join", "explode"}
        Forwarded to polars; only shown in the representation when not the default.

    Returns
    -------
    Expr
        New expression without selector or aggregation function. If polars
        rejects the call, a warning is printed and the polars expression is ``None``.
    """
    def _as_column(item):
        # Plain strings are treated as column names.
        return col(item) if isinstance(item, str) else item

    def _unwrap(item):
        # Wrapper expressions expose the polars expression via ``.expr``.
        return item.expr if hasattr(item, "expr") else item

    # Flatten partition_by + more_exprs into a single list of partition columns.
    partition_cols = []
    for candidate in (partition_by, *more_exprs):
        if isinstance(candidate, list):
            partition_cols.extend(_as_column(item) for item in candidate)
        else:
            partition_cols.append(_as_column(candidate))

    if isinstance(order_by, list):
        ordering = [_as_column(item) for item in order_by]
    elif order_by is not None:
        ordering = _as_column(order_by)
    else:
        ordering = None

    # NOTE: ``len`` is shadowed at module level by the record-count helper,
    # so the aliased built-in must be used for every length check here.
    single_partition = built_in_len(partition_cols) == 1

    # --- String representation ---
    repr_parts = []
    if single_partition:
        repr_parts.append(self._get_expr_repr(partition_cols[0]))
    else:
        col_reprs = [self._get_expr_repr(p) for p in partition_cols]
        repr_parts.append(f"[{', '.join(col_reprs)}]")

    if ordering is not None:
        if isinstance(ordering, list):
            order_by_repr_val = f"[{', '.join([self._get_expr_repr(o) for o in ordering])}]"
        else:
            order_by_repr_val = self._get_expr_repr(ordering)
        repr_parts.append(f"order_by={order_by_repr_val}")
    if descending:
        repr_parts.append(f"descending={repr(descending)}")
    if nulls_last:
        repr_parts.append(f"nulls_last={repr(nulls_last)}")
    if mapping_strategy != "group_to_rows":
        repr_parts.append(f"mapping_strategy='{mapping_strategy}'")
    args_str_for_repr = ", ".join(repr_parts)

    # --- Actual polars expression ---
    res_expr = None
    if self.expr is not None:
        try:
            if single_partition:
                partition_arg = _unwrap(partition_cols[0])
            else:
                partition_arg = [_unwrap(p) for p in partition_cols]

            polars_call_kwargs = {"mapping_strategy": mapping_strategy}
            if ordering is not None:
                if isinstance(ordering, list):
                    polars_call_kwargs["order_by"] = [_unwrap(o) for o in ordering]
                else:
                    polars_call_kwargs["order_by"] = _unwrap(ordering)
                # These are tied to order_by for the actual Polars call.
                polars_call_kwargs["descending"] = descending
                polars_call_kwargs["nulls_last"] = nulls_last

            res_expr = self.expr.over(partition_by=partition_arg, **polars_call_kwargs)
        except Exception as e:
            # Best effort: keep the representation even if polars rejects the call.
            print(f"Warning: Could not create polars expression for over(): {e}")

    return Expr(
        res_expr,
        self.name,
        repr_str=f"{self._repr_str}.over({args_str_for_repr})",
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=None,
    )
|
|
831
|
+
|
|
832
|
+
def sort(self, *, descending=False, nulls_last=False):
    """Build the sorted version of this expression.

    Both options are keyword-only and are forwarded to the polars ``sort``
    call; the result keeps the column name and carries no aggregation function.
    """
    sorted_expr = None
    if self.expr is not None:
        sorted_expr = self.expr.sort(descending=descending, nulls_last=nulls_last)
    return Expr(
        sorted_expr,
        self.name,
        repr_str=f"{self._repr_str}.sort(descending={descending}, nulls_last={nulls_last})",
        initial_column_name=self._initial_column_name,
        agg_func=None,
    )
|
|
837
|
+
|
|
838
|
+
def cast(self, dtype: Union[pl.DataType, str, pl.datatypes.classes.DataTypeClass], *, strict=True):
    """Casts the Expr to a specified data type.

    A string ``dtype`` is looked up as an attribute of ``pl``; when no such
    attribute exists the string itself is passed on to polars. The
    aggregation function of the original expression is preserved.
    """
    target_dtype = dtype
    dtype_repr = repr(dtype)

    if isinstance(dtype, str):
        missing = object()
        looked_up = getattr(pl, dtype, missing)
        if looked_up is not missing:
            target_dtype = looked_up
            dtype_repr = f"pl.{dtype}"
    elif hasattr(dtype, '__name__'):
        dtype_repr = f"pl.{dtype.__name__}"
    elif isinstance(dtype, pl.DataType):
        dtype_repr = f"pl.{dtype!s}"

    casted = None
    if self.expr is not None:
        casted = self.expr.cast(target_dtype, strict=strict)

    # Cast preserves aggregation status (e.g., cast(col('a').sum())).
    return Expr(
        casted,
        self.name,
        repr_str=f"{self._repr_str}.cast({dtype_repr}, strict={strict})",
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=self.agg_func,
        is_complex=True,
    )
|
|
863
|
+
|
|
864
|
+
|
|
865
|
+
class Column(Expr):
    """Special Expr representing a single column, preserving column identity through alias/cast."""
    # Select-schema record carrying the original name, new name and dtype change.
    _select_input: transform_schema.SelectInput

    def __init__(self, name: str, select_input: Optional[transform_schema.SelectInput] = None):
        """Create a column expression for ``name``.

        Args:
            name: Column name handed to ``pl.col``.
            select_input: Optional existing select record; when omitted a
                fresh one with ``old_name=name`` is created.
        """
        super().__init__(expr=pl.col(name),
                         column_name=name,
                         # repr() escapes quotes/backslashes so the representation
                         # stays a valid Python literal for any column name.
                         repr_str=f"pl.col({name!r})",
                         initial_column_name=select_input.old_name if select_input else name,
                         selector=None,
                         agg_func=None)
        self._select_input = select_input or transform_schema.SelectInput(old_name=name)

    def alias(self, new_name: str) -> "Column":
        """Rename a column, returning a new Column instance.

        Raises:
            ValueError: If this column has no underlying polars expression.
        """
        new_select = transform_schema.SelectInput(
            old_name=self._select_input.old_name,
            new_name=new_name,
            data_type=self._select_input.data_type,
            data_type_change=self._select_input.data_type_change,
            is_altered=True
        )
        if self.expr is None:
            raise ValueError("Cannot alias Column without underlying polars expression.")

        new_pl_expr = self.expr.alias(new_name)
        new_repr = f"{self._repr_str}.alias({repr(new_name)})"

        # The constructor builds a plain pl.col(new_name); overwrite it with
        # the aliased expression and representation derived from this column.
        new_column = Column(new_name, new_select)
        new_column.expr = new_pl_expr
        new_column._repr_str = new_repr
        new_column.agg_func = self.agg_func
        new_column.is_complex = self.is_complex
        return new_column

    def cast(self, dtype: Union[pl.DataType, str, pl.datatypes.classes.DataTypeClass], *, strict=True) -> "Column":
        """Change the data type of a column, returning a new Column instance.

        Raises:
            TypeError: If ``dtype`` cannot be turned into a polars data type.
            ValueError: If this column has no underlying polars expression.
        """
        pl_dtype = dtype
        dtype_repr = repr(dtype)

        if isinstance(dtype, str):
            try:
                pl_dtype = getattr(pl, dtype)
                dtype_repr = f"pl.{dtype}"
            except AttributeError:
                # Unknown names fall through and are rejected below.
                pass
        elif hasattr(dtype, '__name__'):
            dtype_repr = f"pl.{dtype.__name__}"
        elif isinstance(dtype, pl.DataType):
            dtype_repr = f"pl.{dtype!s}"

        if not isinstance(pl_dtype, pl.DataType):
            # Data type classes are instantiated so that str(pl_dtype) below
            # describes an instance.
            try:
                pl_dtype_instance = pl_dtype()
                if isinstance(pl_dtype_instance, pl.DataType):
                    pl_dtype = pl_dtype_instance
            except TypeError as exc:
                raise TypeError(f"Invalid Polars data type specified for cast: {dtype}") from exc

        new_select = transform_schema.SelectInput(
            old_name=self._select_input.old_name,
            new_name=self._select_input.new_name,
            data_type=str(pl_dtype),
            data_type_change=True,
            is_altered=True
        )
        if self.expr is None:
            raise ValueError("Cannot cast Column without underlying polars expression.")

        new_pl_expr = self.expr.cast(pl_dtype, strict=strict)
        new_repr = f"{self._repr_str}.cast({dtype_repr}, strict={strict})"
        display_name = self._select_input.new_name or self._select_input.old_name

        new_column = Column(display_name, new_select)
        new_column.expr = new_pl_expr
        new_column._repr_str = new_repr
        new_column.agg_func = self.agg_func
        new_column.is_complex = True
        return new_column

    def to_select_input(self) -> transform_schema.SelectInput:
        """Convert Column state back to a SelectInput schema object."""
        current_name = self.name
        original_name = self._select_input.old_name
        new_name_attr = self._select_input.new_name

        # Prefer the current expression name when it differs from the original;
        # otherwise fall back to the new name recorded on the select record.
        final_new_name = current_name if current_name != original_name else new_name_attr
        final_data_type = self._select_input.data_type if self._select_input.data_type_change else None
        final_data_type_change = bool(final_data_type)
        final_is_altered = bool(final_new_name or final_data_type_change)

        return transform_schema.SelectInput(
            old_name=original_name,
            new_name=final_new_name,
            data_type=final_data_type,
            data_type_change=final_data_type_change,
            is_altered=final_is_altered
        )

    @property
    def str(self) -> StringMethods:
        """String-method namespace, delegated to the parent class."""
        return super().str

    @property
    def dt(self) -> DateTimeMethods:
        """Datetime-method namespace, delegated to the parent class."""
        return super().dt
|
|
973
|
+
|
|
974
|
+
|
|
975
|
+
class When(Expr):
    """Class that represents a when-then-otherwise expression chain."""

    def __init__(self, condition):
        """Initialize a When expression with a condition."""
        condition_expr, condition_repr = self._get_expr_and_repr(condition)
        self.condition = condition_expr

        repr_str = f"pl.when({condition_repr})"
        super().__init__(expr=None, repr_str=repr_str, is_complex=True)
        # Polars object for the chain built so far (None until then() succeeds).
        self._branch_expr = None
        # True while a chained when() is waiting for its matching then().
        self._awaiting_then = False

    @staticmethod
    def _get_expr_and_repr(value):
        """Extract expression and representation from a value.

        Objects exposing ``expr`` and ``_repr_str`` are unwrapped, strings not
        starting with ``"pl."`` are treated as column names, and everything
        else is returned unchanged together with its ``repr``.
        """
        if hasattr(value, 'expr') and hasattr(value, '_repr_str'):
            return value.expr, value._repr_str
        elif isinstance(value, str) and not value.startswith("pl."):
            col_obj = col(value)
            return col_obj.expr, f"'{value}'"
        else:
            return value, repr(value)

    def then(self, value):
        """Set the value to use when the condition is True.

        After a chained ``when()`` the value is attached to that pending
        branch, so earlier branches are kept; otherwise a new chain is started
        from the initial condition. Errors from polars are reported with a
        printed warning and leave the chain unchanged.
        """
        value_expr, value_repr = self._get_expr_and_repr(value)

        self._repr_str = f"{self._repr_str}.then({value_repr})"
        try:
            if self._awaiting_then and self._branch_expr is not None:
                # Continue the existing chain instead of restarting it, which
                # would silently drop every earlier when/then branch.
                self._branch_expr = self._branch_expr.then(value_expr)
            else:
                self._branch_expr = pl.when(self.condition).then(value_expr)
            self._awaiting_then = False
        except Exception as e:
            print(f"Warning: Error in then() creation: {e}")

        return self

    def otherwise(self, value):
        """Set the value to use when no condition is True.

        Returns a plain ``Expr``; its polars expression is ``None`` when no
        branch was built or polars rejects the call (a warning is printed).
        """
        value_expr, value_repr = self._get_expr_and_repr(value)
        final_repr = f"{self._repr_str}.otherwise({value_repr})"

        pl_expr = None
        try:
            if self._branch_expr is not None:
                pl_expr = self._branch_expr.otherwise(value_expr)
        except Exception as e:
            print(f"Warning: Could not create when-then-otherwise expression: {e}")

        return Expr(pl_expr, repr_str=final_repr)

    def when(self, condition):
        """Create a new branch in the chain.

        Requires a preceding successful ``then()``; otherwise a warning is
        printed and the chain is returned unchanged.
        """
        if self._branch_expr is None:
            print("Warning: Cannot add new branch without a then() first")
            return self

        condition_expr, condition_repr = self._get_expr_and_repr(condition)

        self._repr_str = f"{self._repr_str}.when({condition_repr})"

        try:
            self._branch_expr = self._branch_expr.when(condition_expr)
            # Only mark the branch as pending once polars accepted it.
            self._awaiting_then = True
        except Exception as e:
            print(f"Warning: Error adding new when() branch: {e}")

        # Return self for chaining
        return self
|
|
1045
|
+
|
|
1046
|
+
|
|
1047
|
+
# --- Top-Level Functions ---
|
|
1048
|
+
def col(name: str) -> Column:
    """Creates a Column expression for the column called ``name``."""
    column_expr = Column(name)
    return column_expr
|
|
1051
|
+
|
|
1052
|
+
|
|
1053
|
+
def column(name: str) -> Column:
    """Alias for col(): creates a Column expression for ``name``."""
    column_expr = Column(name)
    return column_expr
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
def lit(value: Any) -> Expr:
    """Creates a Literal expression wrapping ``pl.lit(value)``."""
    literal_repr = f"pl.lit({repr(value)})"
    # Literals never carry an aggregation function.
    return Expr(pl.lit(value), repr_str=literal_repr, agg_func=None)
|
|
1062
|
+
|
|
1063
|
+
|
|
1064
|
+
def len() -> Expr:
    """Return the ``pl.len()`` expression aliased as ``'number_of_records'``.

    Note that this function shadows the built-in ``len`` inside this module.
    """
    record_count = Expr(pl.len())
    return record_count.alias('number_of_records')
|
|
1066
|
+
|
|
1067
|
+
|
|
1068
|
+
def agg_function(func):
    """
    Decorator for aggregation functions that sets appropriate properties based on number of arguments.
    Uses the function name as the aggregation function name.

    Parameters:
    -----------
    func : function
        The aggregation function to decorate

    Returns:
    --------
    wrapper
        A wrapped function that returns the properly configured Expr.
        The wrapper keeps the name and docstring of ``func``.
    """
    # Imported locally so the block does not depend on a module-level import.
    import functools

    agg_func_name = func.__name__  # Use the function name as the agg_func

    @functools.wraps(func)
    def wrapper(*names):
        # Get the Polars expression from the original function
        pl_expr = func(*names)
        if built_in_len(names) == 1:
            only = names[0]
            if isinstance(only, str):
                # Single column name: a simple aggregation on that column.
                return Expr(pl_expr, agg_func=agg_func_name, initial_column_name=only, is_complex=False)
            if isinstance(only, Expr):
                # Single expression: inherit its name and complexity flag.
                return Expr(pl_expr, agg_func=agg_func_name, initial_column_name=only.name,
                            is_complex=only.is_complex)
        # Any other argument shape is treated as a complex aggregation.
        return Expr(pl_expr, agg_func=agg_func_name, is_complex=True)

    return wrapper
|
|
1095
|
+
|
|
1096
|
+
|
|
1097
|
+
@agg_function
def max(*names) -> Expr:
    """Maximum aggregation: forwards ``names`` to ``pl.max``."""
    polars_expr = pl.max(*names)
    return polars_expr
|
|
1100
|
+
|
|
1101
|
+
|
|
1102
|
+
@agg_function
def min(*names) -> Expr:
    """Minimum aggregation: forwards ``names`` to ``pl.min``."""
    polars_expr = pl.min(*names)
    return polars_expr
|
|
1105
|
+
|
|
1106
|
+
|
|
1107
|
+
@agg_function
def first(*names) -> Expr:
    """First-value aggregation: forwards ``names`` to ``pl.first``."""
    polars_expr = pl.first(*names)
    return polars_expr
|
|
1110
|
+
|
|
1111
|
+
|
|
1112
|
+
@agg_function
def last(*names) -> Expr:
    """Last-value aggregation: forwards ``names`` to ``pl.last``."""
    polars_expr = pl.last(*names)
    return polars_expr
|
|
1115
|
+
|
|
1116
|
+
|
|
1117
|
+
@agg_function
def mean(*names) -> Expr:
    """Mean aggregation: forwards ``names`` to ``pl.mean``."""
    polars_expr = pl.mean(*names)
    return polars_expr
|
|
1120
|
+
|
|
1121
|
+
|
|
1122
|
+
@agg_function
def count(*names) -> Expr:
    """Count aggregation: forwards ``names`` to ``pl.count``."""
    polars_expr = pl.count(*names)
    return polars_expr
|
|
1125
|
+
|
|
1126
|
+
|
|
1127
|
+
@agg_function
def sum(*names) -> Expr:
    """Sum aggregation: forwards ``names`` to ``pl.sum``."""
    polars_expr = pl.sum(*names)
    return polars_expr
|
|
1130
|
+
|
|
1131
|
+
|
|
1132
|
+
def std(column, ddof) -> Expr:
    """Wrap ``column`` in an Expr tagged with the ``'std'`` aggregation.

    NOTE(review): ``column`` is passed straight through as the first argument
    of ``Expr`` -- confirm what callers are expected to supply here.
    """
    std_expr = Expr(column, ddof=ddof, agg_func='std')
    return std_expr
|
|
1134
|
+
|
|
1135
|
+
|
|
1136
|
+
def var(column, ddof) -> Expr:
    """Wrap ``column`` in an Expr tagged with the ``"var"`` aggregation.

    NOTE(review): ``column`` is passed straight through as the first argument
    of ``Expr`` -- confirm what callers are expected to supply here.
    """
    var_expr = Expr(column, ddof=ddof, agg_func="var")
    return var_expr
|
|
1138
|
+
|
|
1139
|
+
|
|
1140
|
+
def cum_count(expr, reverse: bool = False) -> Expr:
    """
    Return the cumulative count of the non-null values in the column.

    Parameters
    ----------
    expr : str or Expr
        Expression to compute the cumulative count on; a string is first
        converted with ``col``.
    reverse : bool, default False
        Reverse the operation

    Returns
    -------
    Expr
        The result of calling ``cum_count(reverse=reverse)`` on the expression
    """
    target = col(expr) if isinstance(expr, str) else expr
    return target.cum_count(reverse=reverse)
|
|
1159
|
+
|
|
1160
|
+
|
|
1161
|
+
def when(condition):
    """Start a when-then-otherwise expression for ``condition``."""
    chain = When(condition)
    return chain
|