Flowfile 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic. Click here for more details.

Files changed (171) hide show
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
flowfile_frame/expr.py ADDED
@@ -0,0 +1,1163 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Optional, Union, TYPE_CHECKING, List, Literal, TypeVar
4
+
5
+ import polars as pl
6
+ from polars.expr.string import ExprStringNameSpace
7
+
8
+ from flowfile_core.schemas import transform_schema
9
+
10
+ from builtins import len as built_in_len
11
+
12
+ # --- TYPE CHECKING IMPORTS ---
13
+ if TYPE_CHECKING:
14
+ from flowfile_frame.selectors import Selector
15
+ ExprType = TypeVar('ExprType', bound='Expr')
16
+ ColumnType = "Column" # Use string literal instead of direct class reference
17
+
18
+ ExprOrStr = Union['Expr', str]
19
+ ExprOrStrList = List[ExprOrStr]
20
+ ExprStrOrList = Union[ExprOrStr, ExprOrStrList]
21
+
22
+
23
+ def _repr_args(*args, **kwargs):
24
+ """Helper to represent arguments for __repr__."""
25
+ arg_reprs = [repr(a) for a in args]
26
+ kwarg_reprs = []
27
+ for k, v in kwargs.items():
28
+ if isinstance(v, pl.DataType):
29
+ kwarg_reprs.append(f"{k}={v!s}")
30
+ elif isinstance(v, type) and issubclass(v, pl.DataType):
31
+ kwarg_reprs.append(f"{k}=pl.{v.__name__}")
32
+ else:
33
+ kwarg_reprs.append(f"{k}={repr(v)}")
34
+ return ", ".join(arg_reprs + kwarg_reprs)
35
+
36
+
37
def _get_expr_and_repr(value: Any) -> tuple[Optional[pl.Expr], str]:
    """Resolve an operand to a ``(polars expression, repr text)`` pair.

    * ``Expr`` wrappers yield their inner expression (possibly ``None``) and
      their stored repr string.
    * Raw ``pl.Expr`` objects are returned unchanged; their text is prefixed
      with ``pl.`` when it starts with ``col(`` or ``lit(``, otherwise it is
      wrapped as ``pl.Expr(...)``.
    * Anything else is treated as a literal: ``pl.lit(value)`` and ``repr(value)``.
    """
    if isinstance(value, Expr):
        return value.expr, value._repr_str

    if not isinstance(value, pl.Expr):
        # Plain Python value: treat it as a literal.
        return pl.lit(value), repr(value)

    rendered = str(value)
    if rendered.startswith(("col(", "lit(")):
        return value, f"pl.{rendered}"
    return value, f"pl.Expr({rendered})"
53
+
54
+
55
+ # --- Namespaces ---
56
+
57
class StringMethods:
    """Wrapper for the ``.str`` namespace of a parent ``Expr``.

    Each method applies the matching polars string operation (when the parent
    has an underlying polars expression) and returns a new ``Expr`` whose repr
    string is the parent's repr with ``.str.<method>(<args>)`` appended.
    """
    expr: Optional[ExprStringNameSpace]

    def __init__(self, parent_expr: 'Expr', parent_repr_str: str):
        self.parent = parent_expr
        self.expr = parent_expr.expr.str if parent_expr.expr is not None else None
        self.parent_repr_str = parent_repr_str

    def _create_next_expr(self, *args, method_name: str, result_expr: Optional[pl.Expr], is_complex: bool, **kwargs) -> 'Expr':
        """Build the ``Expr`` for ``<parent>.str.<method_name>(*args, **kwargs)``.

        ``method_name``, ``result_expr`` and ``is_complex`` are keyword-only;
        every remaining positional/keyword argument is only used for the repr.
        """
        args_repr = _repr_args(*args, **kwargs)
        new_repr = f"{self.parent_repr_str}.str.{method_name}({args_repr})"
        return Expr(result_expr, self.parent.name, repr_str=new_repr,
                    initial_column_name=self.parent._initial_column_name,
                    selector=None,
                    agg_func=self.parent.agg_func,
                    is_complex=is_complex)

    def contains(self, pattern, *, literal=False):
        res_expr = self.expr.contains(pattern, literal=literal) if self.expr is not None else None
        return self._create_next_expr(pattern, literal=literal, method_name="contains", result_expr=res_expr, is_complex=True)

    def starts_with(self, prefix):
        res_expr = self.expr.starts_with(prefix) if self.expr is not None else None
        return self._create_next_expr(prefix, is_complex=True, method_name="starts_with", result_expr=res_expr)

    def ends_with(self, suffix):
        res_expr = self.expr.ends_with(suffix) if self.expr is not None else None
        return self._create_next_expr(suffix, result_expr=res_expr, method_name="ends_with", is_complex=True)

    def replace(self, pattern, replacement, *, literal=False):
        res_expr = self.expr.replace(pattern, replacement, literal=literal) if self.expr is not None else None
        return self._create_next_expr(pattern, replacement, method_name="replace",
                                      result_expr=res_expr, literal=literal, is_complex=True)

    def to_uppercase(self):
        res_expr = self.expr.to_uppercase() if self.expr is not None else None
        return self._create_next_expr(method_name="to_uppercase", result_expr=res_expr, is_complex=True)

    def to_lowercase(self):
        res_expr = self.expr.to_lowercase() if self.expr is not None else None
        return self._create_next_expr(method_name="to_lowercase", result_expr=res_expr, is_complex=True)

    def len_chars(self):
        res_expr = self.expr.len_chars() if self.expr is not None else None
        return self._create_next_expr(method_name="len_chars", result_expr=res_expr, is_complex=True)

    def len_bytes(self):
        res_expr = self.expr.len_bytes() if self.expr is not None else None
        return self._create_next_expr(method_name="len_bytes", result_expr=res_expr, is_complex=True)

    def to_titlecase(self):
        res_expr = self.expr.to_titlecase() if self.expr is not None else None
        return self._create_next_expr(method_name="to_titlecase", result_expr=res_expr, is_complex=True)

    def __getattr__(self, name):
        """Forward attributes not defined above to the underlying namespace.

        Callable attributes are wrapped so that their result is returned as a
        new ``Expr``; non-callable attributes are returned unchanged.

        Raises
        ------
        AttributeError
            If no underlying namespace is set, or it has no attribute ``name``.
        """
        # Read from the instance dict directly: going through ``self.expr``
        # would re-enter __getattr__ (and recurse forever) whenever ``expr``
        # has not been assigned yet, e.g. on an object rebuilt by ``copy``.
        namespace = self.__dict__.get("expr")
        if namespace is None:
            raise AttributeError(
                f"'StringMethods' cannot call '{name}' because underlying expression is not set "
                f"(e.g., created from selector). Apply aggregation first."
            )
        if not hasattr(namespace, name):
            raise AttributeError(f"'StringMethods' underlying expression has no attribute '{name}'")

        pl_attr = getattr(namespace, name)
        if not callable(pl_attr):
            return pl_attr

        def wrapper(*args, **kwargs):
            result = pl_attr(*args, **kwargs)
            # method_name/result_expr/is_complex are keyword-only on
            # _create_next_expr, so they must be passed by keyword.
            return self._create_next_expr(
                *args, method_name=name, result_expr=result, is_complex=True, **kwargs
            )
        return wrapper
130
+
131
+
132
class DateTimeMethods:
    """Wrapper for the ``.dt`` namespace of a parent ``Expr``.

    Each method applies the matching polars datetime operation (when the
    parent has an underlying polars expression) and returns a new ``Expr``
    whose repr string is the parent's repr with ``.dt.<method>(<args>)``
    appended. Results are always flagged ``is_complex=True``.
    """
    expr: Optional[Any]

    def __init__(self, parent_expr: 'Expr', parent_repr_str: str):
        self.parent = parent_expr
        self.expr = parent_expr.expr.dt if parent_expr.expr is not None else None
        self.parent_repr_str = parent_repr_str

    def _create_next_expr(self, method_name: str, result_expr: Optional[pl.Expr], *args, **kwargs) -> 'Expr':
        """Build the ``Expr`` for ``<parent>.dt.<method_name>(*args, **kwargs)``."""
        args_repr = _repr_args(*args, **kwargs)
        new_repr = f"{self.parent_repr_str}.dt.{method_name}({args_repr})"
        return Expr(result_expr, self.parent.name, repr_str=new_repr,
                    initial_column_name=self.parent._initial_column_name,
                    selector=None,
                    agg_func=self.parent.agg_func,
                    is_complex=True)

    def _simple_component(self, method_name: str) -> 'Expr':
        """Call a zero-argument ``.dt`` method and wrap the result."""
        res_expr = getattr(self.expr, method_name)() if self.expr is not None else None
        return self._create_next_expr(method_name, res_expr)

    def year(self):
        return self._simple_component("year")

    def month(self):
        return self._simple_component("month")

    def day(self):
        return self._simple_component("day")

    def hour(self):
        return self._simple_component("hour")

    def minute(self):
        return self._simple_component("minute")

    def second(self):
        return self._simple_component("second")

    def __getattr__(self, name):
        """Forward attributes not defined above to the underlying namespace.

        Callable attributes are wrapped so that their result is returned as a
        new ``Expr``; non-callable attributes are returned unchanged.

        Raises
        ------
        AttributeError
            If no underlying namespace is set, or it has no attribute ``name``.
        """
        # Read from the instance dict directly: going through ``self.expr``
        # would re-enter __getattr__ (and recurse forever) whenever ``expr``
        # has not been assigned yet, e.g. on an object rebuilt by ``copy``.
        namespace = self.__dict__.get("expr")
        if namespace is None:
            raise AttributeError(
                f"'DateTimeMethods' cannot call '{name}' because underlying expression is not set "
                f"(e.g., created from selector). Apply aggregation first."
            )
        if not hasattr(namespace, name):
            raise AttributeError(f"'DateTimeMethods' underlying expression has no attribute '{name}'")

        pl_attr = getattr(namespace, name)
        if not callable(pl_attr):
            return pl_attr

        def wrapper(*args, **kwargs):
            result = pl_attr(*args, **kwargs)
            return self._create_next_expr(name, result, *args, **kwargs)
        return wrapper
193
+
194
+
195
+ class Expr:
196
+ _initial_column_name: Optional[str]
197
+ selector: Optional['Selector']
198
+ expr: Optional[pl.Expr]
199
+ agg_func: Optional[str]
200
+ _repr_str: str
201
+ name: Optional[str]
202
+ is_complex: bool = False
203
+
204
+ def __init__(self,
205
+ expr: Optional[pl.Expr],
206
+ column_name: Optional[str] = None,
207
+ repr_str: Optional[str] = None,
208
+ initial_column_name: Optional[str] = None,
209
+ selector: Optional['Selector'] = None,
210
+ agg_func: Optional[str] = None,
211
+ ddof: Optional[int] = None,
212
+ is_complex: bool = False):
213
+
214
+ self.expr = expr
215
+ self.name = column_name
216
+ self.agg_func = agg_func
217
+ self.selector = selector
218
+ self._initial_column_name = initial_column_name or column_name
219
+ self.is_complex = is_complex
220
+ # --- Determine Representation String ---
221
+ if repr_str is not None:
222
+ self._repr_str = repr_str
223
+ elif self.selector is not None and self.agg_func is not None:
224
+ selector_repr = self.selector.repr_str
225
+ func_name = self.agg_func
226
+ kwargs_dict = {}
227
+ if func_name in ("std", "var") and ddof is not None:
228
+ kwargs_dict['ddof'] = ddof
229
+ kwargs_repr = _repr_args(**kwargs_dict)
230
+ self._repr_str = f"{selector_repr}.{func_name}({kwargs_repr})"
231
+ self.expr = None
232
+ elif self.selector is not None:
233
+ self._repr_str = f"{self.selector.repr_str}"
234
+ self.expr = None
235
+ elif self.expr is not None:
236
+ _, default_repr = _get_expr_and_repr(self.expr)
237
+ self._repr_str = default_repr
238
+ else:
239
+ raise ValueError("Cannot initialize Expr without expr, repr_str, or selector+agg_func")
240
+
241
+ if self.name is None and self.selector is None and self.expr is not None:
242
+ try:
243
+ self.name = self.expr._output_name
244
+ except AttributeError:
245
+ try:
246
+ self.name = self.expr._name
247
+ except AttributeError:
248
+ pass
249
+
250
+ self._str_namespace: Optional['StringMethods'] = None
251
+ self._dt_namespace: Optional['DateTimeMethods'] = None
252
+
253
+ def __repr__(self) -> str:
254
+ return self._repr_str
255
+
256
+ @property
257
+ def is_simple(self) -> bool:
258
+ """
259
+ Determines if this expression is a "simple" expression that can be directly
260
+ converted to a GroupBy's AggColl structure.
261
+
262
+ A simple expression is one that:
263
+ 1. References a single column directly (not through arithmetic/logical operations)
264
+ 2. May have an aggregation function applied (sum, mean, etc.)
265
+ 3. May have been aliased with a new name
266
+
267
+ Returns
268
+ -------
269
+ bool
270
+ True if this is a simple expression, False otherwise
271
+ """
272
+ # Check for selector expressions
273
+ if self.selector is not None:
274
+ # Selector expressions are complex - they select multiple columns
275
+ return False
276
+
277
+ # Check if this expression has any arithmetic/logical operators
278
+ if hasattr(self, "_repr_str"):
279
+ # Check for when/then/otherwise expressions
280
+ if any(
281
+ marker in self._repr_str
282
+ for marker in ["when(", ".then(", ".otherwise("]
283
+ ):
284
+ return False
285
+
286
+ # Look for arithmetic operators in the expression string
287
+ for op in ["+", "-", "*", "/", "//", "%", "**", "&", "|", "==", "!=", "<", ">", "<=", ">=",]:
288
+ if op in self._repr_str:
289
+ # If the operator is in a .alias() part, it's still simple
290
+ if f".alias('{op}" in self._repr_str:
291
+ continue
292
+
293
+ # Otherwise, we have a complex expression
294
+ return False
295
+
296
+ # Check for other functions that might create complex expressions
297
+ for func in [
298
+ "filter(",
299
+ "where(",
300
+ "if_else(",
301
+ "case_when(",
302
+ "apply(",
303
+ "map(",
304
+ ]:
305
+ if func in self._repr_str:
306
+ return False
307
+
308
+ # If we reach here, it's a simple expression (just column reference and maybe aggregation)
309
+ return True
310
+
311
def _create_next_expr(self, *args, method_name: str, result_expr: Optional[pl.Expr], is_complex: bool, **kwargs) -> 'Expr':
    """Return a new ``Expr`` whose repr is this one's plus ``.<method_name>(<args>)``.

    The new instance keeps this expression's name, initial column name and
    ``agg_func``; its selector is always ``None``.
    """
    rendered_args = _repr_args(*args, **kwargs)
    chained_repr = f"{self._repr_str}.{method_name}({rendered_args})"
    return Expr(
        result_expr,
        self.name,
        repr_str=chained_repr,
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=self.agg_func,
        is_complex=is_complex,
    )
323
+
324
def _create_binary_op_expr(
    self, op_symbol: str, other: Any, result_expr: Optional[pl.Expr]
) -> "Expr":
    """Wrap the result of ``self <op_symbol> other`` in a new ``Expr``.

    The new repr is ``(<self repr> <op_symbol> <other repr>)``. The result
    has no name, no selector and no ``agg_func``, and is flagged complex.

    Raises
    ------
    ValueError
        If this expression has no underlying polars expression, or if
        ``other`` resolves to no expression and is not an ``int``, ``float``,
        ``str``, ``bool`` or ``None``.
    """
    if self.expr is None:
        raise ValueError(
            f"Cannot perform binary operation '{op_symbol}' on Expr without underlying polars expression."
        )

    rhs_expr, other_repr = _get_expr_and_repr(other)

    literal_types = (int, float, str, bool, type(None))
    if rhs_expr is None and not isinstance(other, literal_types):
        raise ValueError(
            f"Cannot perform binary operation '{op_symbol}' with operand without underlying polars expression or literal value: {other_repr}"
        )

    # Parenthesise the whole operation so that later chaining keeps precedence.
    wrapped_repr = f"({self._repr_str} {op_symbol} {other_repr})"
    return Expr(
        result_expr,
        None,
        repr_str=wrapped_repr,
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=None,
        is_complex=True,
    )
355
+
356
+ @property
357
+ def str(self) -> StringMethods:
358
+ if self._str_namespace is None:
359
+ self._str_namespace = StringMethods(self, self._repr_str)
360
+ return self._str_namespace
361
+
362
+ @property
363
+ def dt(self) -> DateTimeMethods:
364
+ if self._dt_namespace is None:
365
+ self._dt_namespace = DateTimeMethods(self, self._repr_str)
366
+ return self._dt_namespace
367
+
368
def sum(self):
    """Return a new expression for ``.sum()``, tagged with ``agg_func="sum"``."""
    inner = None if self.expr is None else self.expr.sum()
    aggregated = self._create_next_expr(
        method_name="sum", result_expr=inner, is_complex=self.is_complex
    )
    aggregated.agg_func = "sum"
    return aggregated
373
+
374
def mean(self):
    """Return a new expression for ``.mean()``, tagged with ``agg_func="mean"``."""
    inner = None if self.expr is None else self.expr.mean()
    aggregated = self._create_next_expr(
        method_name="mean", result_expr=inner, is_complex=self.is_complex
    )
    aggregated.agg_func = "mean"
    return aggregated
379
+
380
def min(self):
    """Return a new expression for ``.min()``, tagged with ``agg_func="min"``."""
    inner = None if self.expr is None else self.expr.min()
    aggregated = self._create_next_expr(
        method_name="min", result_expr=inner, is_complex=self.is_complex
    )
    aggregated.agg_func = "min"
    return aggregated
385
+
386
def max(self):
    """Return a new expression for ``.max()``, tagged with ``agg_func="max"``."""
    inner = None if self.expr is None else self.expr.max()
    aggregated = self._create_next_expr(
        method_name="max", result_expr=inner, is_complex=self.is_complex
    )
    aggregated.agg_func = "max"
    return aggregated
391
+
392
def median(self):
    """Return a new expression for ``.median()``, tagged with ``agg_func="median"``."""
    inner = None if self.expr is None else self.expr.median()
    aggregated = self._create_next_expr(
        method_name="median", result_expr=inner, is_complex=self.is_complex
    )
    aggregated.agg_func = "median"
    return aggregated
397
+
398
def count(self):
    """Return a new expression for ``.count()``, tagged with ``agg_func="count"``."""
    inner = None if self.expr is None else self.expr.count()
    aggregated = self._create_next_expr(
        method_name="count", result_expr=inner, is_complex=self.is_complex
    )
    aggregated.agg_func = "count"
    return aggregated
403
+
404
def first(self):
    """Return a new expression for ``.first()``, tagged with ``agg_func="first"``."""
    inner = None if self.expr is None else self.expr.first()
    aggregated = self._create_next_expr(
        method_name="first", result_expr=inner, is_complex=self.is_complex
    )
    aggregated.agg_func = "first"
    return aggregated
409
+
410
def last(self):
    """Return a new expression for ``.last()``, tagged with ``agg_func="last"``."""
    inner = None if self.expr is None else self.expr.last()
    aggregated = self._create_next_expr(
        method_name="last", result_expr=inner, is_complex=self.is_complex
    )
    aggregated.agg_func = "last"
    return aggregated
415
+
416
def n_unique(self):
    """Return a new expression for ``.n_unique()``, tagged with ``agg_func="n_unique"``."""
    inner = None if self.expr is None else self.expr.n_unique()
    aggregated = self._create_next_expr(
        method_name="n_unique", result_expr=inner, is_complex=self.is_complex
    )
    aggregated.agg_func = "n_unique"
    return aggregated
421
+
422
def std(self, ddof=1):
    """Return a new expression for ``.std(ddof=ddof)``, tagged with ``agg_func="std"``.

    The result is always flagged ``is_complex=True``.
    """
    inner = None if self.expr is None else self.expr.std(ddof=ddof)
    aggregated = self._create_next_expr(
        method_name="std", result_expr=inner, ddof=ddof, is_complex=True
    )
    aggregated.agg_func = "std"
    return aggregated
427
+
428
def cum_count(self, reverse: bool = False) -> "Expr":
    """
    Return a new expression for ``.cum_count(reverse=reverse)``.

    Parameters
    ----------
    reverse : bool, default False
        Passed through to the underlying ``cum_count`` call

    Returns
    -------
    Expr
        A new expression flagged ``is_complex=True`` with ``agg_func`` cleared
    """
    inner = None if self.expr is None else self.expr.cum_count(reverse=reverse)
    running = self._create_next_expr(
        method_name="cum_count", result_expr=inner, reverse=reverse, is_complex=True
    )
    running.agg_func = None
    return running
448
+
449
def var(self, ddof=1):
    """Return a new expression for ``.var(ddof=ddof)``, tagged with ``agg_func="var"``.

    The result is always flagged ``is_complex=True``.
    """
    inner = None if self.expr is None else self.expr.var(ddof=ddof)
    aggregated = self._create_next_expr(
        method_name="var", result_expr=inner, ddof=ddof, is_complex=True
    )
    aggregated.agg_func = "var"
    return aggregated
454
+
455
def __add__(self, other):
    """Build the expression for ``self + other``."""
    rhs, _ = _get_expr_and_repr(other)
    if self.expr is None or rhs is None:
        combined = None
    else:
        combined = self.expr + rhs
    return self._create_binary_op_expr("+", other, combined)
459
+
460
def __sub__(self, other):
    """Build the expression for ``self - other``."""
    rhs, _ = _get_expr_and_repr(other)
    if self.expr is None or rhs is None:
        combined = None
    else:
        combined = self.expr - rhs
    return self._create_binary_op_expr("-", other, combined)
464
+
465
def __mul__(self, other):
    """Build the expression for ``self * other``."""
    rhs, _ = _get_expr_and_repr(other)
    if self.expr is None or rhs is None:
        combined = None
    else:
        combined = self.expr * rhs
    return self._create_binary_op_expr("*", other, combined)
469
+
470
def __truediv__(self, other):
    """Build the expression for ``self / other``."""
    rhs, _ = _get_expr_and_repr(other)
    if self.expr is None or rhs is None:
        combined = None
    else:
        combined = self.expr / rhs
    return self._create_binary_op_expr("/", other, combined)
474
+
475
def __floordiv__(self, other):
    """Build the expression for ``self // other``."""
    rhs, _ = _get_expr_and_repr(other)
    if self.expr is None or rhs is None:
        combined = None
    else:
        combined = self.expr // rhs
    return self._create_binary_op_expr("//", other, combined)
479
+
480
def __pow__(self, exponent):
    """Build the expression for ``self ** exponent`` (via the expression's ``pow``)."""
    power, _ = _get_expr_and_repr(exponent)
    if self.expr is None or power is None:
        combined = None
    else:
        combined = self.expr.pow(power)
    return self._create_binary_op_expr("**", exponent, combined)
484
+
485
def __mod__(self, other):
    """Build the expression for ``self % other``."""
    rhs, _ = _get_expr_and_repr(other)
    if self.expr is None or rhs is None:
        combined = None
    else:
        combined = self.expr % rhs
    return self._create_binary_op_expr("%", other, combined)
489
+
490
+ # --- Right-side Arithmetic ---
491
def __radd__(self, other):
    """Build the expression for the reflected addition ``other + self``.

    The result has no name and no ``agg_func`` and is flagged complex. If
    either side has no underlying polars expression, the new ``Expr`` wraps
    ``None`` and only carries the repr string.
    """
    other_expr, other_repr = _get_expr_and_repr(other)
    # Parenthesise the whole operation, as the forward operators do, so that
    # chaining a further operator onto the result keeps the right precedence.
    new_repr = f"({other_repr} + {self._repr_str})"
    res_expr = other_expr + self.expr if other_expr is not None and self.expr is not None else None
    return Expr(res_expr, None, repr_str=new_repr, agg_func=None, is_complex=True)
497
+
498
def __rsub__(self, other):
    """Build the expression for the reflected subtraction ``other - self``.

    The result has no name and no ``agg_func`` and is flagged complex. If
    either side has no underlying polars expression, the new ``Expr`` wraps
    ``None`` and only carries the repr string.
    """
    other_expr, other_repr = _get_expr_and_repr(other)
    # Parenthesise the whole operation, as the forward operators do, so that
    # chaining a further operator onto the result keeps the right precedence.
    new_repr = f"({other_repr} - {self._repr_str})"
    res_expr = other_expr - self.expr if other_expr is not None and self.expr is not None else None
    return Expr(res_expr, None, repr_str=new_repr, agg_func=None, is_complex=True)
503
+
504
def __rmul__(self, other):
    """Build the expression for the reflected multiplication ``other * self``.

    The result has no name and no ``agg_func`` and is flagged complex. If
    either side has no underlying polars expression, the new ``Expr`` wraps
    ``None`` and only carries the repr string.
    """
    other_expr, other_repr = _get_expr_and_repr(other)
    # Parenthesise the whole operation, as the forward operators do, so that
    # chaining a further operator onto the result keeps the right precedence.
    new_repr = f"({other_repr} * {self._repr_str})"
    res_expr = other_expr * self.expr if other_expr is not None and self.expr is not None else None
    return Expr(res_expr, None, repr_str=new_repr, agg_func=None, is_complex=True)
509
+
510
def __rtruediv__(self, other):
    """Build the expression for the reflected division ``other / self``.

    The result has no name and no ``agg_func`` and is flagged complex. If
    either side has no underlying polars expression, the new ``Expr`` wraps
    ``None`` and only carries the repr string.
    """
    other_expr, other_repr = _get_expr_and_repr(other)
    # Parenthesise the whole operation, as the forward operators do, so that
    # chaining a further operator onto the result keeps the right precedence.
    new_repr = f"({other_repr} / {self._repr_str})"
    res_expr = other_expr / self.expr if other_expr is not None and self.expr is not None else None
    return Expr(res_expr, None, repr_str=new_repr, agg_func=None, is_complex=True)
515
+
516
def __rfloordiv__(self, other):
    """Build the expression for the reflected floor division ``other // self``.

    The result has no name and no ``agg_func`` and is flagged complex. If
    either side has no underlying polars expression, the new ``Expr`` wraps
    ``None`` and only carries the repr string.
    """
    other_expr, other_repr = _get_expr_and_repr(other)
    # Parenthesise the whole operation, as the forward operators do, so that
    # chaining a further operator onto the result keeps the right precedence.
    new_repr = f"({other_repr} // {self._repr_str})"
    res_expr = other_expr // self.expr if other_expr is not None and self.expr is not None else None
    return Expr(res_expr, None, repr_str=new_repr, agg_func=None, is_complex=True)
521
+
522
def __rmod__(self, other):
    """Reflected modulo: build the expression for ``other % self``."""
    lhs_expr, lhs_repr = _get_expr_and_repr(other)
    if lhs_expr is None or self.expr is None:
        remainder = None
    else:
        remainder = lhs_expr % self.expr
    # Fresh complex expression: no column name, no aggregation marker.
    return Expr(
        remainder,
        None,
        repr_str=f"{lhs_repr} % {self._repr_str}",
        agg_func=None,
        is_complex=True,
    )
527
+
528
def __rpow__(self, other):
    """Reflected power: build the expression for ``other ** self``."""
    lhs_expr, lhs_repr = _get_expr_and_repr(other)
    power_repr = f"{lhs_repr} ** {self._repr_str}"

    # Plain Python values are wrapped in a polars literal so that .pow() is
    # available on the base; expression operands use their extracted form.
    if isinstance(other, (Expr, pl.Expr)):
        base = lhs_expr
    else:
        base = pl.lit(other)

    if self.expr is None or base is None:
        raised = None
    else:
        raised = base.pow(self.expr)
    return Expr(raised, None, repr_str=power_repr, agg_func=None, is_complex=True)
534
+
535
+ # --- Comparison operations ---
536
def __eq__(self, other):
    """Build the expression for ``self == other``.

    The comparison is computed only when both operands have a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs_expr, _unused_repr = _get_expr_and_repr(other)
    if self.expr is None or rhs_expr is None:
        comparison = None
    else:
        comparison = self.expr == rhs_expr
    return self._create_binary_op_expr("==", other, comparison)
540
+
541
def __ne__(self, other):
    """Build the expression for ``self != other``.

    The comparison is computed only when both operands have a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs_expr, _unused_repr = _get_expr_and_repr(other)
    if self.expr is None or rhs_expr is None:
        comparison = None
    else:
        comparison = self.expr != rhs_expr
    return self._create_binary_op_expr("!=", other, comparison)
545
+
546
def __gt__(self, other):
    """Build the expression for ``self > other``.

    The comparison is computed only when both operands have a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs_expr, _unused_repr = _get_expr_and_repr(other)
    if self.expr is None or rhs_expr is None:
        comparison = None
    else:
        comparison = self.expr > rhs_expr
    return self._create_binary_op_expr(">", other, comparison)
550
+
551
def __lt__(self, other):
    """Build the expression for ``self < other``.

    The comparison is computed only when both operands have a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs_expr, _unused_repr = _get_expr_and_repr(other)
    if self.expr is None or rhs_expr is None:
        comparison = None
    else:
        comparison = self.expr < rhs_expr
    return self._create_binary_op_expr("<", other, comparison)
555
+
556
def __ge__(self, other):
    """Build the expression for ``self >= other``.

    The comparison is computed only when both operands have a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs_expr, _unused_repr = _get_expr_and_repr(other)
    if self.expr is None or rhs_expr is None:
        comparison = None
    else:
        comparison = self.expr >= rhs_expr
    return self._create_binary_op_expr(">=", other, comparison)
560
+
561
def __le__(self, other):
    """Build the expression for ``self <= other``.

    The comparison is computed only when both operands have a polars
    expression; otherwise ``None`` is handed to ``_create_binary_op_expr``.
    """
    rhs_expr, _unused_repr = _get_expr_and_repr(other)
    if self.expr is None or rhs_expr is None:
        comparison = None
    else:
        comparison = self.expr <= rhs_expr
    return self._create_binary_op_expr("<=", other, comparison)
565
+
566
+ # --- Logical operations ---
567
def __and__(self, other):
    """Build the expression for ``self & other``.

    Raises:
        TypeError: if *other* is a ``Selector``; that combination is not supported.
    """
    # Imported at call time, as in the rest of this class's logical operators.
    from flowfile_frame.selectors import Selector

    if isinstance(other, Selector):
        raise TypeError("Unsupported operation: Expr & Selector")

    rhs_expr, _unused_repr = _get_expr_and_repr(other)
    if self.expr is None or rhs_expr is None:
        conjunction = None
    else:
        conjunction = self.expr & rhs_expr
    return self._create_binary_op_expr("&", other, conjunction)
574
+
575
def __or__(self, other):
    """Build the expression for ``self | other``.

    Raises:
        TypeError: if *other* is a ``Selector``; that combination is not supported.
    """
    # Imported at call time, as in the rest of this class's logical operators.
    from flowfile_frame.selectors import Selector

    if isinstance(other, Selector):
        raise TypeError("Unsupported operation: Expr | Selector")

    rhs_expr, _unused_repr = _get_expr_and_repr(other)
    if self.expr is None or rhs_expr is None:
        disjunction = None
    else:
        disjunction = self.expr | rhs_expr
    return self._create_binary_op_expr("|", other, disjunction)
582
+
583
def __invert__(self):
    """Build the expression for ``~self`` (logical negation)."""
    if self.expr is None:
        negated = None
    else:
        negated = ~self.expr
    # The negated expression keeps the originating column name but carries
    # no output name and no aggregation marker.
    return Expr(
        negated,
        None,
        repr_str=f"~({self._repr_str})",
        initial_column_name=self._initial_column_name,
        agg_func=None,
    )
589
+
590
+ # --- Other useful methods ---
591
def is_null(self):
    """Return an expression that tests each value for null."""
    null_check = None if self.expr is None else self.expr.is_null()
    derived = self._create_next_expr(
        method_name="is_null",
        result_expr=null_check,
        is_complex=True,
    )
    # A null test is not an aggregation, so the marker is cleared explicitly.
    derived.agg_func = None
    return derived
597
+
598
def filter(self, *predicates, **constraints) -> "Expr":
    """
    Filter this expression by predicates and column-equality constraints.

    Each positional predicate may be an ``Expr``, a polars expression, or any
    other value (passed to polars unchanged). Each keyword ``name=value`` is
    turned into ``pl.col(name).eq(value)``.

    If polars rejects the filter, a warning is printed and the returned
    ``Expr`` carries no polars expression; its representation string is
    produced either way.
    """
    # --- Pass 1: textual form of every argument, for the representation. ---
    rendered_args = []
    for predicate in predicates:
        if isinstance(predicate, Expr):
            rendered = str(predicate)
        elif isinstance(predicate, pl.Expr):
            rendered = _get_expr_and_repr(predicate)[1]
        else:
            rendered = repr(predicate)
        rendered_args.append(rendered)
    rendered_args.extend(f"{key}={value!r}" for key, value in constraints.items())
    all_args_text = ", ".join(rendered_args)

    # --- Pass 2: the arguments actually handed to polars. ---
    polars_predicates = []
    for predicate in predicates:
        if not isinstance(predicate, Expr):
            # Non-Expr predicates are forwarded untouched.
            polars_predicates.append(predicate)
        elif predicate.expr is not None:
            # An Expr without an underlying polars expression is left out.
            polars_predicates.append(predicate.expr)
    polars_predicates.extend(
        pl.col(column_name).eq(value) for column_name, value in constraints.items()
    )

    filtered = None
    if self.expr is not None:
        try:
            filtered = self.expr.filter(*polars_predicates)
        except Exception as e:
            # Best effort: keep the representation, drop the polars expression.
            print(f"Warning: Could not create polars expression for filter(): {e}")

    return Expr(
        filtered,
        self.name,
        repr_str=f"{self._repr_str}.filter({all_args_text})",
        initial_column_name=self._initial_column_name,
        selector=None,  # the filtered result is no longer tied to a selector
        agg_func=self.agg_func,  # filtering keeps the aggregation marker
    )
650
+
651
def is_not_null(self):
    """Return an expression that tests each value for being non-null."""
    not_null_check = None if self.expr is None else self.expr.is_not_null()
    derived = self._create_next_expr(
        method_name="is_not_null",
        result_expr=not_null_check,
        is_complex=True,
    )
    # Not an aggregation, so the marker is cleared explicitly.
    derived.agg_func = None
    return derived
656
+
657
def is_in(self, values):
    """Return an expression that tests membership of each value in *values*."""
    if self.expr is None:
        membership = None
    else:
        membership = self.expr.is_in(values)
    derived = self._create_next_expr(
        values,
        method_name="is_in",
        result_expr=membership,
        is_complex=True,
    )
    # A membership test is not an aggregation, so the marker is cleared.
    derived.agg_func = None
    return derived
663
+
664
def alias(self, name):
    """Return a new expression whose result is named *name*."""
    if self.expr is None:
        aliased = None
    else:
        aliased = self.expr.alias(name)
    alias_repr = f"{self._repr_str}.alias({name!r})"
    # Renaming keeps the aggregation marker and the complexity flag; the
    # selector link is not carried over.
    return Expr(
        aliased,
        name,
        repr_str=alias_repr,
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=self.agg_func,
        is_complex=self.is_complex,
    )
675
+
676
def fill_null(self, value):
    """Return an expression in which nulls are replaced by *value*."""
    if self.expr is None:
        filled = None
    else:
        filled = self.expr.fill_null(value)
    derived = self._create_next_expr(
        value,
        method_name="fill_null",
        result_expr=filled,
        is_complex=True,
    )
    # Filling nulls is not an aggregation, so the marker is cleared.
    derived.agg_func = None
    return derived
682
+
683
def fill_nan(self, value):
    """Return an expression in which NaN values are replaced by *value*.

    The polars expression is built only when one exists and exposes
    ``fill_nan``; otherwise the result carries ``None``.
    """
    can_fill = self.expr is not None and hasattr(self.expr, 'fill_nan')
    filled = self.expr.fill_nan(value) if can_fill else None
    derived = self._create_next_expr(
        value,
        method_name="fill_nan",
        result_expr=filled,
        is_complex=True,
    )
    # Not an aggregation, so the marker is cleared.
    derived.agg_func = None
    return derived
690
+
691
@staticmethod
def _get_expr_repr(expr):
    """Return the string used to show *expr* inside a representation.

    * ``Expr`` / ``Column``: their own ``_repr_str``.
    * ``str``: rendered as a ``pl.col('...')`` call.
    * polars expression: its ``str()`` prefixed with ``pl.`` when it starts
      with ``col(`` or ``lit(``, otherwise wrapped as ``pl.Expr(...)``.
    * anything else: ``repr(expr)``.
    """
    if isinstance(expr, (Expr, Column)):
        return expr._repr_str
    if isinstance(expr, str):
        return f"pl.col('{expr}')"
    if not isinstance(expr, pl.Expr):
        return repr(expr)

    rendered = str(expr)
    if rendered.startswith(("col(", "lit(")):
        return f"pl.{rendered}"
    return f"pl.Expr({rendered})"
707
+
708
def over(self,
         partition_by: ExprStrOrList,
         *more_exprs: ExprOrStr,
         order_by: Optional[ExprStrOrList] = None,
         descending: bool = False,
         nulls_last: bool = False,
         mapping_strategy: Literal["group_to_rows", "join", "explode"] = "group_to_rows",
         ) -> "Expr":
    """
    Compute this expression over the given groups (window expression).

    Parameters
    ----------
    partition_by, *more_exprs
        Grouping keys. Strings become columns via ``col()``; a list is
        flattened and its string items are converted the same way; anything
        else is used as given.
    order_by
        Optional ordering within each group: a string, a list, or an
        expression. Strings are converted with ``col()``.
    descending, nulls_last
        Forwarded to polars only when ``order_by`` is given. In the string
        representation they are shown whenever they are true, regardless of
        ``order_by``.
    mapping_strategy
        Always forwarded to polars; shown in the representation only when it
        differs from ``"group_to_rows"``.

    Returns
    -------
    Expr
        New expression with the aggregation marker cleared. If the polars
        call fails, a warning is printed and the result carries no polars
        expression; the representation string is produced either way.
    """
    def _to_polars(item):
        # Objects exposing ``.expr`` are unwrapped; everything else is passed as is.
        return item.expr if hasattr(item, "expr") else item

    # Flatten partition_by + more_exprs into a single list of column objects.
    processed_partition_cols = []
    for col_expr in (partition_by, *more_exprs):
        if isinstance(col_expr, str):
            processed_partition_cols.append(col(col_expr))
        elif isinstance(col_expr, list):
            processed_partition_cols.extend(
                col(item) if isinstance(item, str) else item for item in col_expr
            )
        else:
            processed_partition_cols.append(col_expr)

    processed_order_by = None
    if order_by is not None:
        if isinstance(order_by, str):
            processed_order_by = col(order_by)
        elif isinstance(order_by, list):
            processed_order_by = [col(o) if isinstance(o, str) else o for o in order_by]
        else:
            processed_order_by = order_by

    # NOTE: this module defines its own zero-argument ``len()`` expression
    # function, which shadows the builtin. ``built_in_len`` must be used for
    # every length check here; calling ``len(list)`` raises TypeError, which
    # previously was swallowed below and left the result without a polars
    # expression on every call.
    single_partition = built_in_len(processed_partition_cols) == 1

    # --- String representation of the call ---
    over_arg_strings_for_repr = []
    if single_partition:
        over_arg_strings_for_repr.append(self._get_expr_repr(processed_partition_cols[0]))
    else:
        col_reprs = [self._get_expr_repr(p) for p in processed_partition_cols]
        over_arg_strings_for_repr.append(f"[{', '.join(col_reprs)}]")

    if processed_order_by is not None:
        if isinstance(processed_order_by, list):
            order_by_repr_val = f"[{', '.join([self._get_expr_repr(o) for o in processed_order_by])}]"
        else:
            order_by_repr_val = self._get_expr_repr(processed_order_by)
        over_arg_strings_for_repr.append(f"order_by={order_by_repr_val}")

    if descending:
        over_arg_strings_for_repr.append(f"descending={repr(descending)}")

    if nulls_last:
        over_arg_strings_for_repr.append(f"nulls_last={repr(nulls_last)}")

    if mapping_strategy != "group_to_rows":
        over_arg_strings_for_repr.append(f"mapping_strategy='{mapping_strategy}'")

    args_str_for_repr = ", ".join(over_arg_strings_for_repr)

    # --- Actual polars expression (best effort) ---
    res_expr = None
    if self.expr is not None:
        try:
            if single_partition:
                partition_arg = _to_polars(processed_partition_cols[0])
            else:
                partition_arg = [_to_polars(p) for p in processed_partition_cols]

            polars_call_kwargs = {"mapping_strategy": mapping_strategy}

            if processed_order_by is not None:
                if isinstance(processed_order_by, list):
                    polars_order_by_arg = [_to_polars(o) for o in processed_order_by]
                else:
                    polars_order_by_arg = _to_polars(processed_order_by)
                polars_call_kwargs["order_by"] = polars_order_by_arg
                # Only meaningful together with order_by in the polars call.
                polars_call_kwargs["descending"] = descending
                polars_call_kwargs["nulls_last"] = nulls_last

            res_expr = self.expr.over(partition_by=partition_arg, **polars_call_kwargs)

        except Exception as e:
            print(f"Warning: Could not create polars expression for over(): {e}")

    return Expr(
        res_expr,
        self.name,
        repr_str=f"{self._repr_str}.over({args_str_for_repr})",
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=None,
    )
831
+
832
def sort(self, *, descending=False, nulls_last=False):
    """Return a sorted version of this expression (aggregation marker cleared)."""
    if self.expr is None:
        ordered = None
    else:
        ordered = self.expr.sort(descending=descending, nulls_last=nulls_last)
    sort_repr = f"{self._repr_str}.sort(descending={descending}, nulls_last={nulls_last})"
    return Expr(
        ordered,
        self.name,
        repr_str=sort_repr,
        initial_column_name=self._initial_column_name,
        agg_func=None,
    )
837
+
838
def cast(self, dtype: Union[pl.DataType, str, pl.datatypes.classes.DataTypeClass], *, strict=True):
    """Cast this expression to *dtype*.

    *dtype* may be a polars data type (class or instance) or the name of an
    attribute of the ``polars`` module. A name that cannot be resolved is
    passed to polars unchanged.
    """
    target_dtype = dtype
    dtype_text = repr(dtype)

    if isinstance(dtype, str):
        unresolved = object()  # sentinel: distinguishes "missing" from any real attribute
        looked_up = getattr(pl, dtype, unresolved)
        if looked_up is not unresolved:
            target_dtype = looked_up
            dtype_text = f"pl.{dtype}"
    elif hasattr(dtype, '__name__'):
        dtype_text = f"pl.{dtype.__name__}"
    elif isinstance(dtype, pl.DataType):
        dtype_text = f"pl.{dtype!s}"

    if self.expr is None:
        casted = None
    else:
        casted = self.expr.cast(target_dtype, strict=strict)

    # A cast keeps the aggregation marker (e.g. casting the result of a sum)
    # but always counts as a complex expression.
    return Expr(
        casted,
        self.name,
        repr_str=f"{self._repr_str}.cast({dtype_text}, strict={strict})",
        initial_column_name=self._initial_column_name,
        selector=None,
        agg_func=self.agg_func,
        is_complex=True,
    )
863
+
864
+
865
class Column(Expr):
    """An Expr bound to a single column.

    ``alias`` and ``cast`` return new ``Column`` objects, so the link to the
    originating column (tracked in a ``SelectInput``) is kept.
    """
    # Schema record describing the column's original name, rename and type change.
    _select_input: transform_schema.SelectInput

    def __init__(self, name: str, select_input: Optional[transform_schema.SelectInput] = None):
        """Create a column named *name*, optionally carrying an existing SelectInput."""
        if select_input:
            origin_name = select_input.old_name
        else:
            origin_name = name
        super().__init__(
            expr=pl.col(name),
            column_name=name,
            repr_str=f"pl.col('{name}')",
            initial_column_name=origin_name,
            selector=None,
            agg_func=None,
        )
        if select_input:
            self._select_input = select_input
        else:
            self._select_input = transform_schema.SelectInput(old_name=name)

    def alias(self, new_name: str) -> "Column":
        """Return a new Column that exposes this column under *new_name*.

        Raises:
            ValueError: if this column has no underlying polars expression.
        """
        current = self._select_input
        renamed_select = transform_schema.SelectInput(
            old_name=current.old_name,
            new_name=new_name,
            data_type=current.data_type,
            data_type_change=current.data_type_change,
            is_altered=True,
        )
        if self.expr is None:
            raise ValueError("Cannot alias Column without underlying polars expression.")

        aliased_expr = self.expr.alias(new_name)
        aliased_repr = f"{self._repr_str}.alias({new_name!r})"

        # Start from a plain Column, then overwrite the parts that must
        # reflect the alias and this column's current state.
        renamed = Column(new_name, renamed_select)
        renamed.expr = aliased_expr
        renamed._repr_str = aliased_repr
        renamed.agg_func = self.agg_func
        renamed.is_complex = self.is_complex
        return renamed

    def cast(self, dtype: Union[pl.DataType, str, pl.datatypes.classes.DataTypeClass], *, strict=True) -> "Column":
        """Return a new Column whose values are cast to *dtype*.

        Raises:
            TypeError: if *dtype* is not a data type instance and calling it fails.
            ValueError: if this column has no underlying polars expression.
        """
        target_dtype = dtype
        dtype_text = repr(dtype)

        if isinstance(dtype, str):
            unresolved = object()  # sentinel: distinguishes "missing" from any real attribute
            looked_up = getattr(pl, dtype, unresolved)
            if looked_up is not unresolved:
                target_dtype = looked_up
                dtype_text = f"pl.{dtype}"
        elif hasattr(dtype, '__name__'):
            dtype_text = f"pl.{dtype.__name__}"
        elif isinstance(dtype, pl.DataType):
            dtype_text = f"pl.{dtype!s}"

        # Data type classes are instantiated so the schema stores an instance's string form.
        if not isinstance(target_dtype, pl.DataType):
            try:
                instantiated = target_dtype()
                if isinstance(instantiated, pl.DataType):
                    target_dtype = instantiated
            except TypeError:
                raise TypeError(f"Invalid Polars data type specified for cast: {dtype}")

        current = self._select_input
        retyped_select = transform_schema.SelectInput(
            old_name=current.old_name,
            new_name=current.new_name,
            data_type=str(target_dtype),
            data_type_change=True,
            is_altered=True,
        )
        if self.expr is None:
            raise ValueError("Cannot cast Column without underlying polars expression.")

        casted_expr = self.expr.cast(target_dtype, strict=strict)
        casted_repr = f"{self._repr_str}.cast({dtype_text}, strict={strict})"
        shown_name = current.new_name or current.old_name

        retyped = Column(shown_name, retyped_select)
        retyped.expr = casted_expr
        retyped._repr_str = casted_repr
        retyped.agg_func = self.agg_func
        retyped.is_complex = True
        return retyped

    def to_select_input(self) -> transform_schema.SelectInput:
        """Export this column's rename / type-change state as a fresh SelectInput."""
        source_name = self._select_input.old_name

        # The current name wins when it differs from the original name;
        # otherwise fall back to the stored new_name.
        if self.name != source_name:
            exported_name = self.name
        else:
            exported_name = self._select_input.new_name

        # The data type is exported only when a type change was recorded.
        if self._select_input.data_type_change:
            exported_type = self._select_input.data_type
        else:
            exported_type = None
        type_changed = bool(exported_type)

        return transform_schema.SelectInput(
            old_name=source_name,
            new_name=exported_name,
            data_type=exported_type,
            data_type_change=type_changed,
            is_altered=bool(exported_name or type_changed),
        )

    @property
    def str(self) -> StringMethods:
        """String-method accessor; delegates to the parent class."""
        return super().str

    @property
    def dt(self) -> DateTimeMethods:
        """Datetime-method accessor; delegates to the parent class."""
        return super().dt
973
+
974
+
975
class When(Expr):
    """Builder for a ``pl.when(...).then(...)[.when(...).then(...)]*.otherwise(...)`` chain.

    ``then()`` and ``when()`` mutate this object and return it, so calls can be
    chained; ``otherwise()`` finishes the chain and returns a plain ``Expr``.

    Internal state:
      * ``condition``      - polars form of the first condition.
      * ``_branch_expr``   - the polars chain up to the last completed ``then()``.
      * ``_pending_when``  - a chained ``when()`` still waiting for its ``then()``.
    """

    def __init__(self, condition):
        """Start the chain with *condition*."""
        condition_expr, condition_repr = self._get_expr_and_repr(condition)
        self.condition = condition_expr

        repr_str = f"pl.when({condition_repr})"
        super().__init__(expr=None, repr_str=repr_str, is_complex=True)
        self._branch_expr = None
        self._pending_when = None

    @staticmethod
    def _get_expr_and_repr(value):
        """Return ``(polars_value, representation)`` for *value*.

        Objects with ``expr`` and ``_repr_str`` are unwrapped; a string that
        does not start with ``"pl."`` is treated as a column name; any other
        value is returned as is, together with its ``repr``.
        """
        if hasattr(value, 'expr') and hasattr(value, '_repr_str'):
            return value.expr, value._repr_str
        elif isinstance(value, str) and not value.startswith("pl."):
            col_obj = col(value)
            return col_obj.expr, f"'{value}'"
        else:
            return value, repr(value)

    def then(self, value):
        """Set the value for the most recent condition and return ``self``."""
        value_expr, value_repr = self._get_expr_and_repr(value)

        self._repr_str = f"{self._repr_str}.then({value_repr})"
        pending, self._pending_when = self._pending_when, None
        try:
            if pending is not None:
                # Complete the chained branch opened by when(). Rebuilding from
                # self.condition here would discard every earlier branch and
                # pair the first condition with this value.
                self._branch_expr = pending.then(value_expr)
            else:
                self._branch_expr = pl.when(self.condition).then(value_expr)
        except Exception as e:
            print(f"Warning: Error in then() creation: {e}")
            if pending is not None:
                # The chain no longer matches the representation; make sure
                # otherwise() does not return a partial expression.
                self._branch_expr = None

        return self

    def otherwise(self, value):
        """Finish the chain with a fallback value and return the resulting ``Expr``.

        The returned ``Expr`` carries no polars expression when the chain could
        not be built (no ``then()`` yet, a ``when()`` without its ``then()``, or
        an error raised by polars); a warning is printed in the latter cases.
        """
        value_expr, value_repr = self._get_expr_and_repr(value)
        final_repr = f"{self._repr_str}.otherwise({value_repr})"

        pl_expr = None
        if self._pending_when is not None:
            print("Warning: Could not create when-then-otherwise expression: "
                  "when() branch is missing its then()")
        elif self._branch_expr is not None:
            try:
                pl_expr = self._branch_expr.otherwise(value_expr)
            except Exception as e:
                print(f"Warning: Could not create when-then-otherwise expression: {e}")

        return Expr(pl_expr, repr_str=final_repr)

    def when(self, condition):
        """Open a further branch in the chain and return ``self``.

        A branch can only be opened after a completed ``then()``; otherwise a
        warning is printed and the chain is left untouched.
        """
        if self._branch_expr is None or self._pending_when is not None:
            print("Warning: Cannot add new branch without a then() first")
            return self

        condition_expr, condition_repr = self._get_expr_and_repr(condition)

        self._repr_str = f"{self._repr_str}.when({condition_repr})"

        try:
            # Kept separate from _branch_expr until then() supplies the value.
            self._pending_when = self._branch_expr.when(condition_expr)
        except Exception as e:
            print(f"Warning: Error adding new when() branch: {e}")

        return self
1045
+
1046
+
1047
+ # --- Top-Level Functions ---
1048
def col(name: str) -> Column:
    """Return a ``Column`` expression for the column called *name*."""
    column_expr = Column(name)
    return column_expr
1051
+
1052
+
1053
def column(name: str) -> Column:
    """Return a ``Column`` expression for *name* (same result as ``col``)."""
    column_expr = Column(name)
    return column_expr
1056
+
1057
+
1058
def lit(value: Any) -> Expr:
    """Return an expression wrapping *value* as a polars literal."""
    literal_repr = f"pl.lit({value!r})"
    # A literal is never an aggregation.
    return Expr(pl.lit(value), repr_str=literal_repr, agg_func=None)
1062
+
1063
+
1064
def len() -> Expr:
    """Return a ``pl.len()`` expression aliased to ``'number_of_records'``.

    Note: this name shadows the builtin ``len`` throughout this module.
    """
    record_count = Expr(pl.len())
    return record_count.alias('number_of_records')
1066
+
1067
+
1068
def agg_function(func):
    """
    Decorator for aggregation functions.

    The decorated function must return a polars expression. The wrapper turns
    it into an ``Expr`` whose ``agg_func`` is the decorated function's name,
    and sets the remaining properties from the arguments:

    * one string argument: ``initial_column_name`` is that string, not complex;
    * one ``Expr`` argument: ``initial_column_name`` and ``is_complex`` are
      taken from that expression;
    * anything else: marked as complex.

    Parameters:
    -----------
    func : function
        The aggregation function to decorate

    Returns:
    --------
    wrapper
        A wrapped function that returns the properly configured Expr; it keeps
        the name and docstring of ``func``.
    """
    import functools  # local import so this block does not depend on file-level imports

    agg_func_name = func.__name__  # the function name doubles as the aggregation name

    @functools.wraps(func)  # keep __name__/__doc__ of e.g. ``max`` instead of 'wrapper'
    def wrapper(*names):
        # Polars expression produced by the decorated function.
        pl_expr = func(*names)
        # built_in_len: the module-level ``len`` is an expression function, not the builtin.
        if built_in_len(names) == 1 and isinstance(names[0], str):
            return Expr(pl_expr, agg_func=agg_func_name, initial_column_name=names[0], is_complex=False)
        elif built_in_len(names) == 1 and isinstance(names[0], Expr):
            return Expr(pl_expr, agg_func=agg_func_name, initial_column_name=names[0].name, is_complex=names[0].is_complex)
        else:
            return Expr(pl_expr, agg_func=agg_func_name, is_complex=True)
    return wrapper
1095
+
1096
+
1097
@agg_function
def max(*names) -> Expr:
    """Maximum aggregation; ``agg_function`` wraps the polars result in an ``Expr``."""
    polars_expr = pl.max(*names)
    return polars_expr
1100
+
1101
+
1102
@agg_function
def min(*names) -> Expr:
    """Minimum aggregation; ``agg_function`` wraps the polars result in an ``Expr``."""
    polars_expr = pl.min(*names)
    return polars_expr
1105
+
1106
+
1107
@agg_function
def first(*names) -> Expr:
    """First-value aggregation; ``agg_function`` wraps the polars result in an ``Expr``."""
    polars_expr = pl.first(*names)
    return polars_expr
1110
+
1111
+
1112
@agg_function
def last(*names) -> Expr:
    """Last-value aggregation; ``agg_function`` wraps the polars result in an ``Expr``."""
    polars_expr = pl.last(*names)
    return polars_expr
1115
+
1116
+
1117
@agg_function
def mean(*names) -> Expr:
    """Mean aggregation; ``agg_function`` wraps the polars result in an ``Expr``."""
    polars_expr = pl.mean(*names)
    return polars_expr
1120
+
1121
+
1122
@agg_function
def count(*names) -> Expr:
    """Count aggregation; ``agg_function`` wraps the polars result in an ``Expr``."""
    polars_expr = pl.count(*names)
    return polars_expr
1125
+
1126
+
1127
@agg_function
def sum(*names) -> Expr:
    """Sum aggregation; ``agg_function`` wraps the polars result in an ``Expr``."""
    polars_expr = pl.sum(*names)
    return polars_expr
1130
+
1131
+
1132
def std(column, ddof) -> Expr:
    """Return an ``Expr`` tagged with the ``'std'`` aggregation.

    *column* is passed through as the ``Expr``'s first argument and *ddof* as
    a keyword; no polars call is made here.
    NOTE(review): how ``Expr`` turns this into a polars expression is not
    visible from this function - confirm against ``Expr.__init__``.
    """
    std_expr = Expr(column, ddof=ddof, agg_func='std')
    return std_expr
1134
+
1135
+
1136
def var(column, ddof) -> Expr:
    """Return an ``Expr`` tagged with the ``"var"`` aggregation.

    *column* is passed through as the ``Expr``'s first argument and *ddof* as
    a keyword; no polars call is made here.
    NOTE(review): how ``Expr`` turns this into a polars expression is not
    visible from this function - confirm against ``Expr.__init__``.
    """
    var_expr = Expr(column, ddof=ddof, agg_func="var")
    return var_expr
1138
+
1139
+
1140
def cum_count(expr, reverse: bool = False) -> Expr:
    """
    Cumulative count of the non-null values in a column.

    Parameters
    ----------
    expr : str or Expr
        Column name (converted with ``col``) or expression to count over
    reverse : bool, default False
        Passed through to the expression's ``cum_count``

    Returns
    -------
    Expr
        The result of calling ``cum_count`` on the expression
    """
    target = col(expr) if isinstance(expr, str) else expr
    return target.cum_count(reverse=reverse)
1159
+
1160
+
1161
def when(condition):
    """Begin a when-then-otherwise chain by returning a ``When`` for *condition*."""
    chain = When(condition)
    return chain