Flowfile 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic. Click here for more details.

Files changed (171) hide show
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
@@ -0,0 +1,199 @@
1
+
2
+ from flowfile_frame.expr import col, Expr
3
+ from flowfile_frame.selectors import Selector
4
+ from flowfile_frame.utils import _parse_inputs_as_iterable
5
+ from flowfile_core.schemas import transform_schema, input_schema
6
+ from typing import TYPE_CHECKING
7
+
8
+ # FlowFrame is imported only for static type checking; at runtime the name is bound to None.
9
+ if TYPE_CHECKING:
10
+ from flowfile_frame.flow_frame import FlowFrame
11
+ else:
12
+ FlowFrame = None
13
+
14
+
15
class GroupByFrame:
    """Represents a grouped DataFrame for aggregation operations.

    An aggregation either registers a native group-by node on the parent's
    flow graph or, when the request cannot be expressed that way, adds a node
    holding generated Polars code that operates on ``input_df``.
    """

    def __init__(self, node_id: int, parent_frame, by_cols, maintain_order=False, description: str = None):
        """Store the grouping configuration.

        Args:
            node_id: Id used for the node created by the aggregation.
            parent_frame: Frame the group-by was started from.
            by_cols: Grouping columns; normalised with ``_parse_inputs_as_iterable``.
            maintain_order: Forwarded to the generated ``group_by`` call. When
                truthy, ``agg`` always uses the Polars-code path.
            description: Optional node description overriding the default one.
        """
        self.parent = parent_frame
        self.by_cols = _parse_inputs_as_iterable(by_cols)
        self.maintain_order = maintain_order
        self.description = description
        self.node_id = node_id

    @staticmethod
    def _quote_literal(value) -> str:
        """Render ``value`` as a Python string literal for generated code.

        Ordinary names keep the historical double-quoted form. Names that
        contain a double quote, a backslash or a non-printable character fall
        back to ``repr`` so the generated code stays syntactically valid.
        """
        text = str(value)
        if text.isprintable() and '"' not in text and "\\" not in text:
            return f'"{text}"'
        return repr(text)

    def readable_group(self):
        """Return the grouping columns as a comma-separated code fragment.

        Strings are rendered as string literals. Expr and Selector objects are
        rendered through ``str`` so they appear as code rather than as a quoted
        column name. Anything else is stringified and quoted.
        """
        parts = []
        for c in self.by_cols:
            # Check str first: it is the common case.
            if isinstance(c, str):
                parts.append(self._quote_literal(c))
            elif isinstance(c, (Expr, Selector)):
                parts.append(str(c))
            else:
                parts.append(self._quote_literal(c))
        return ", ".join(parts)

    def len(self) -> "FlowFrame":
        """
        Count number of rows per group. Output column is named 'len'.
        """
        return self._generate_direct_polars_code("len")

    def count(self) -> "FlowFrame":
        """
        Count number of rows per group. Output column is named 'count'.
        """
        return self._generate_direct_polars_code("count")

    def agg(self, *agg_exprs, **named_agg_exprs) -> "FlowFrame":
        """
        Apply EXPLICIT aggregations to grouped data using expressions.

        The request is converted to a native group-by node when
        ``maintain_order`` is falsy and every grouping column and aggregation
        can be expressed as an ``AggColl``. Otherwise Polars code is generated.
        """
        agg_expressions = _parse_inputs_as_iterable(agg_exprs)
        agg_cols = []
        # Short-circuit: stop collecting as soon as one stage cannot be converted.
        can_be_converted = (
            not self.maintain_order
            and self._process_group_columns(agg_cols)
            and self._process_agg_expressions(agg_cols, agg_expressions)
            and self._process_named_agg_expressions(agg_cols, named_agg_exprs)
        )
        node_desc = self.description or f"Aggregate after grouping by {self.readable_group()}"
        return self._create_agg_node(
            self.node_id, can_be_converted, agg_cols, agg_expressions, named_agg_exprs, node_desc
        )

    def _process_group_columns(self, agg_cols: "list[transform_schema.AggColl]") -> bool:
        """Append a ``groupby`` AggColl per grouping column.

        Returns:
            False as soon as a grouping column is neither a string nor an Expr
            (for example a Selector), True otherwise.
        """
        for col_expr in self.by_cols:
            if isinstance(col_expr, str):
                agg_cols.append(transform_schema.AggColl(old_name=col_expr, agg="groupby"))
            elif isinstance(col_expr, Expr):
                agg_cols.append(transform_schema.AggColl(old_name=col_expr.name, agg="groupby"))
            else:
                return False
        return True

    @staticmethod
    def _process_agg_expressions(agg_cols: "list[transform_schema.AggColl]", agg_expressions) -> bool:
        """Append an AggColl per positional aggregation expression.

        Strings and Exprs without an ``agg_func`` are aggregated with "first".

        Returns:
            False when an expression is neither a string nor an Expr.
        """
        # NOTE(review): unlike the named path, `is_complex` is not consulted here - confirm intended.
        for expr in agg_expressions:
            if isinstance(expr, str):
                agg_cols.append(transform_schema.AggColl(old_name=expr, agg="first"))
            elif isinstance(expr, Expr):
                agg_func = getattr(expr, "agg_func", None)
                old_name = getattr(expr, "_initial_column_name", expr.name) or expr.name
                if agg_func:
                    agg_cols.append(
                        transform_schema.AggColl(old_name=old_name, agg=agg_func, new_name=expr.name)
                    )
                else:
                    agg_cols.append(transform_schema.AggColl(old_name=expr.name, agg="first"))
            else:
                return False
        return True

    @staticmethod
    def _process_named_agg_expressions(agg_cols: "list[transform_schema.AggColl]", named_agg_exprs: dict) -> bool:
        """Append an AggColl per keyword aggregation (``new_name=expr``).

        Accepted values are a column name, an Expr, or a ``(column, agg)``
        tuple whose second item is a string.

        Returns:
            False when a value has another shape or is an Expr flagged as
            ``is_complex``.
        """
        for name, expr in named_agg_exprs.items():
            if isinstance(expr, str):
                agg_cols.append(transform_schema.AggColl(old_name=expr, agg="first", new_name=name))
            elif isinstance(expr, tuple) and len(expr) == 2:
                col_spec, agg_func_str = expr
                if isinstance(col_spec, str):
                    old_name = col_spec
                elif isinstance(col_spec, Expr):
                    old_name = getattr(col_spec, "_initial_column_name", col_spec.name) or col_spec.name
                else:
                    return False
                if not isinstance(agg_func_str, str):
                    return False
                agg_cols.append(transform_schema.AggColl(old_name=old_name, agg=agg_func_str, new_name=name))
            elif isinstance(expr, Expr):
                # Only Expr objects carry `is_complex`; checking it after the type
                # test keeps plain strings and tuples from raising AttributeError.
                if expr.is_complex:
                    return False
                # An unset (None) agg_func falls back to "first", as in the positional path.
                agg_func = getattr(expr, "agg_func", None) or "first"
                old_name = getattr(expr, "_initial_column_name", expr.name) or expr.name
                agg_cols.append(transform_schema.AggColl(old_name=old_name, agg=agg_func, new_name=name))
            else:
                return False
        return True

    def _create_agg_node(self, node_id_to_use: int, can_be_converted: bool, agg_cols: list, agg_expressions, named_agg_exprs, description: str):
        """Creates node for explicit aggregations via self.agg()

        Adds a ``NodeGroupBy`` to the parent's flow graph when
        ``can_be_converted`` is true; otherwise adds generated Polars code to
        the parent. Returns the child frame created for ``node_id_to_use``.
        """
        if can_be_converted:
            group_by_settings = input_schema.NodeGroupBy(
                flow_id=self.parent.flow_graph.flow_id,
                node_id=node_id_to_use,
                groupby_input=transform_schema.GroupByInput(agg_cols=agg_cols),
                pos_x=200, pos_y=200, is_setup=True,
                depending_on_id=self.parent.node_id,
                description=description
            )
            self.parent.flow_graph.add_group_by(group_by_settings)
        else:
            code = self._generate_polars_agg_code(agg_expressions, named_agg_exprs)
            self.parent._add_polars_code(new_node_id=node_id_to_use, code=code, description=description)
        return self.parent._create_child_frame(node_id_to_use)

    def _generate_direct_polars_code(self, method_name: str) -> "FlowFrame":
        """
        Generates Polars code for simple GroupBy methods like sum(), mean(), len(), count()
        which operate implicitly or have a standard Polars counterpart.
        Always uses the Polars code path.
        """
        readable_group_str = self.readable_group()
        code = f"input_df.group_by([{readable_group_str}], maintain_order={self.maintain_order}).{method_name}()"
        node_description = self.description or f"{method_name.capitalize()} after grouping by {readable_group_str}"
        self.parent._add_polars_code(new_node_id=self.node_id, code=code, description=node_description)
        return self.parent._create_child_frame(self.node_id)

    def _generate_polars_agg_code(self, agg_expressions, named_agg_exprs) -> str:
        """Generate Polars code specifically for explicit .agg() calls.

        String aggregations are emitted as string literals so the generated
        call reads ``.agg("a")`` rather than the bare identifier ``.agg(a)``.
        Every other value is rendered through ``str``.
        """
        def render(expr) -> str:
            return self._quote_literal(expr) if isinstance(expr, str) else str(expr)

        readable_group_str = self.readable_group()
        rendered = [render(expr) for expr in agg_expressions]
        rendered.extend(f"{name}={render(expr)}" for name, expr in named_agg_exprs.items())
        agg_combined = ", ".join(rendered)
        # The input dataframe is referred to as 'input_df' in the generated code.
        return f"input_df.group_by([{readable_group_str}], maintain_order={self.maintain_order}).agg({agg_combined})"

    # --- Convenience Methods (No Column Args - Use Direct Code Gen) ---

    def sum(self):
        """Calculate sum for all non-grouping columns."""
        return self._generate_direct_polars_code("sum")

    def mean(self):
        """Calculate mean for all non-grouping columns."""
        return self._generate_direct_polars_code("mean")

    def median(self):
        """Calculate median for all non-grouping columns."""
        return self._generate_direct_polars_code("median")

    def min(self):
        """Calculate minimum for all non-grouping columns."""
        return self._generate_direct_polars_code("min")

    def max(self):
        """Calculate maximum for all non-grouping columns."""
        return self._generate_direct_polars_code("max")

    def first(self):
        """Get first value for all non-grouping columns."""
        return self._generate_direct_polars_code("first")

    def last(self):
        """Get last value for all non-grouping columns."""
        return self._generate_direct_polars_code("last")
flowfile_frame/join.py ADDED
@@ -0,0 +1,75 @@
1
+ # Project-local imports: the Column expression type and the transform schemas.
2
+ from flowfile_frame.expr import Column
3
+ from flowfile_core.schemas import transform_schema
4
+
5
+
6
+ def _normalize_columns_to_list(columns):
7
+ """Convert a column specification to a list format.
8
+
9
+ Args:
10
+ columns: Column name or list of column names
11
+
12
+ Returns:
13
+ List of column names/expressions
14
+ """
15
+ if columns is None:
16
+ return []
17
+ elif isinstance(columns, str):
18
+ return [columns]
19
+ elif isinstance(columns, (list, tuple)):
20
+ return list(columns)
21
+ else:
22
+ return [columns] # Single non-string item
23
+
24
+
25
+ def _extract_column_name(col_expr):
26
+ """Extract a simple column name from various column representations.
27
+
28
+ Args:
29
+ col_expr: Column expression (string, Column object, etc.)
30
+
31
+ Returns:
32
+ tuple: (column_name, needs_polars_code)
33
+ - column_name is the string name if possible
34
+ - needs_polars_code is True if this expression requires polars code generation
35
+ """
36
+ if isinstance(col_expr, str):
37
+ return col_expr, False
38
+
39
+ if isinstance(col_expr, Column):
40
+ # If it's a simple unaltered column, use its name
41
+ if not col_expr._select_input.is_altered:
42
+ return col_expr.name, False
43
+ # Otherwise, this requires polars code
44
+ return col_expr, True
45
+
46
+ # Any other expression type needs polars code
47
+ return col_expr, True
48
+
49
+
50
+ def _create_join_mappings(left_columns, right_columns):
51
+ """Create join mappings between left and right columns.
52
+
53
+ Args:
54
+ left_columns: List of left join columns
55
+ right_columns: List of right join columns
56
+
57
+ Returns:
58
+ tuple: (join_mappings, needs_polars_code)
59
+ - join_mappings is a list of JoinMap objects
60
+ - needs_polars_code is True if any column requires polars code generation
61
+ """
62
+ join_mappings = []
63
+ needs_polars_code = False
64
+
65
+ for left_col, right_col in zip(left_columns, right_columns):
66
+ left_name, left_needs_code = _extract_column_name(left_col)
67
+ right_name, right_needs_code = _extract_column_name(right_col)
68
+
69
+ needs_polars_code = needs_polars_code or left_needs_code or right_needs_code
70
+
71
+ # Only create standard join mappings if both columns are simple strings
72
+ if not left_needs_code and not right_needs_code:
73
+ join_mappings.append(transform_schema.JoinMap(left_col=left_name, right_col=right_name))
74
+
75
+ return join_mappings, needs_polars_code
@@ -0,0 +1,242 @@
1
+ from typing import List, Union, TYPE_CHECKING # Added TYPE_CHECKING
2
+
3
+ import polars as pl
4
+
5
+ # --- TYPE CHECKING IMPORTS ---
6
+ # NOTE: Expr is imported unconditionally (not under TYPE_CHECKING) because
7
+ # the Selector aggregation methods below construct Expr instances at runtime.
8
+ from flowfile_frame.expr import Expr
9
+
10
+
11
+ # --- Selector Base Classes (Compound, Complement) ---
12
+
13
class Selector:
    """Base class for column selectors, inspired by polars.selectors

    The textual representation is computed once in ``__init__`` through
    ``_get_repr_str``. Subclasses that need attributes for that representation
    must set them before calling ``super().__init__()``.
    """

    def __init__(self):
        # Freeze the representation at construction time.
        self._repr_str = self._get_repr_str()

    @property
    def repr_str(self):
        """The representation string computed at construction."""
        return self._repr_str

    def _get_repr_str(self) -> str:
        """Build the representation string; specific selectors override this."""
        return f"pl.selectors.{self.__class__.__name__}()"

    def __repr__(self) -> str:
        return self._repr_str

    # --- Set-like operators ---

    def __or__(self, other: 'Selector') -> 'CompoundSelector':
        return CompoundSelector(self, other, 'union')

    def __and__(self, other: 'Selector') -> 'CompoundSelector':
        return CompoundSelector(self, other, 'intersection')

    def __sub__(self, other: 'Selector') -> 'CompoundSelector':
        return CompoundSelector(self, other, 'difference')

    def __xor__(self, other: 'Selector') -> 'CompoundSelector':
        return CompoundSelector(self, other, 'symmetric_difference')

    def __invert__(self) -> 'ComplementSelector':
        return ComplementSelector(self)

    # --- Aggregation Methods ---
    # Each method returns an Expr built from this selector and an aggregation name.

    def _make_agg_expr(self, agg_func: str, **options) -> 'Expr':
        """Build the Expr for ``agg_func`` applied to this selector."""
        return Expr(expr=None, selector=self, agg_func=agg_func, **options)

    def sum(self) -> 'Expr':
        """Create an expression to sum columns selected by this selector."""
        return self._make_agg_expr("sum")

    def mean(self) -> 'Expr':
        """Create an expression to average columns selected by this selector."""
        return self._make_agg_expr("mean")

    def median(self) -> 'Expr':
        """Create an expression to find the median of columns selected by this selector."""
        return self._make_agg_expr("median")

    def min(self) -> 'Expr':
        """Create an expression to find the minimum of columns selected by this selector."""
        return self._make_agg_expr("min")

    def max(self) -> 'Expr':
        """Create an expression to find the maximum of columns selected by this selector."""
        return self._make_agg_expr("max")

    def std(self, ddof: int = 1) -> 'Expr':
        """Create an expression to find the standard deviation of columns selected by this selector."""
        return self._make_agg_expr("std", ddof=ddof)

    def var(self, ddof: int = 1) -> 'Expr':
        """Create an expression to find the variance of columns selected by this selector."""
        return self._make_agg_expr("var", ddof=ddof)

    def first(self) -> 'Expr':
        """Create an expression to get the first element of columns selected by this selector."""
        return self._make_agg_expr("first")

    def last(self) -> 'Expr':
        """Create an expression to get the last element of columns selected by this selector."""
        return self._make_agg_expr("last")

    def count(self) -> 'Expr':
        """Create an expression to count elements in columns selected by this selector."""
        return self._make_agg_expr("count")

    def n_unique(self) -> 'Expr':
        """Create an expression to count unique elements in columns selected by this selector."""
        return self._make_agg_expr("n_unique")
93
+
94
+ # Removed alias method - belongs on Expr
95
+
96
+
97
class CompoundSelector(Selector):
    """Selector representing a compound operation between two selectors"""

    def __init__(self, left: Selector, right: Selector, operation: str):
        # Operands must be set before the base __init__ computes the repr.
        self.left = left
        self.right = right
        self.operation = operation
        super().__init__()

    def _get_repr_str(self) -> str:
        """Render ``left <op> right``, parenthesising compound operands."""
        symbols = {'union': '|', 'intersection': '&', 'difference': '-', 'symmetric_difference': '^'}
        # Unknown operation names fall back to the union symbol.
        symbol = symbols.get(self.operation, '|')
        rendered = [
            f"({side._repr_str})" if isinstance(side, CompoundSelector) else side._repr_str
            for side in (self.left, self.right)
        ]
        return f"{rendered[0]} {symbol} {rendered[1]}"
112
+
113
+
114
class ComplementSelector(Selector):
    """Selector representing the complement (NOT) of another selector"""

    def __init__(self, selector: Selector):
        # The wrapped selector must be set before the base __init__ computes the repr.
        self.selector = selector
        super().__init__()

    def _get_repr_str(self) -> str:
        """Render ``~inner``, parenthesising a compound inner selector."""
        inner = self.selector._repr_str
        if isinstance(self.selector, CompoundSelector):
            inner = f"({inner})"
        return f"~{inner}"
123
+
124
+
125
class NumericSelector(Selector):
    """Selector rendered as ``pl.selectors.numeric()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.numeric()"


class FloatSelector(Selector):
    """Selector rendered as ``pl.selectors.float()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.float()"


class IntegerSelector(Selector):
    """Selector rendered as ``pl.selectors.integer()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.integer()"


class StringSelector(Selector):
    """Selector rendered as ``pl.selectors.string()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.string()"


class TemporalSelector(Selector):
    """Selector rendered as ``pl.selectors.temporal()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.temporal()"


class DatetimeSelector(Selector):
    """Selector rendered as ``pl.selectors.datetime()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.datetime()"


class DateSelector(Selector):
    """Selector rendered as ``pl.selectors.date()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.date()"


class TimeSelector(Selector):
    """Selector rendered as ``pl.selectors.time()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.time()"


class DurationSelector(Selector):
    """Selector rendered as ``pl.selectors.duration()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.duration()"


class BooleanSelector(Selector):
    """Selector rendered as ``pl.selectors.boolean()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.boolean()"


class CategoricalSelector(Selector):
    """Selector rendered as ``pl.selectors.categorical()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.categorical()"


class ObjectSelector(Selector):
    """Selector rendered as ``pl.selectors.object()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.object()"


class ListSelector(Selector):
    """Selector rendered as ``pl.selectors.list()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.list()"


class StructSelector(Selector):
    """Selector rendered as ``pl.selectors.struct()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.struct()"


class AllSelector(Selector):
    """Selector rendered as ``pl.selectors.all()``."""

    def _get_repr_str(self) -> str:
        return "pl.selectors.all()"
183
+
184
+
185
class DtypeSelector(Selector):
    """Selector rendered as ``pl.selectors.by_dtype(...)`` for the given dtypes."""

    def __init__(self, dtypes: Union[pl.DataType, List[pl.DataType]]):
        """Store the dtypes and compute the representation.

        Args:
            dtypes: A single dtype, or a list or tuple of dtypes. A tuple is
                expanded like a list instead of being treated as one dtype.
        """
        if isinstance(dtypes, (list, tuple)):
            # Copy so later mutation of the caller's sequence cannot drift
            # away from the representation computed below.
            self.dtypes = list(dtypes)
        else:
            self.dtypes = [dtypes]
        super().__init__()

    def _get_repr_str(self) -> str:
        """Render the ``by_dtype`` call; more than one dtype is emitted as a list."""
        dtype_strs = []
        for dt in self.dtypes:
            dt_repr = repr(dt)
            # Reprs starting with "DataType" are replaced by the capitalised str() form.
            if dt_repr.startswith("DataType"):
                dt_repr = str(dt).capitalize()
            dtype_strs.append(f"pl.{dt_repr}")
        dtype_repr_arg = dtype_strs[0] if len(dtype_strs) == 1 else f"[{', '.join(dtype_strs)}]"
        return f"pl.selectors.by_dtype({dtype_repr_arg})"
199
+
200
+
201
class PatternSelector(Selector):
    """Base for selectors parameterised by a string pattern."""

    def __init__(self, pattern: str):
        # The pattern must be set before the base __init__ computes the repr.
        self.pattern = pattern
        super().__init__()


class ContainsSelector(PatternSelector):
    """Selector rendered as ``pl.selectors.contains(<pattern>)``."""

    def _get_repr_str(self) -> str:
        return f"pl.selectors.contains({self.pattern!r})"


class StartsWithSelector(PatternSelector):
    """Selector rendered as ``pl.selectors.starts_with(<pattern>)``."""

    def _get_repr_str(self) -> str:
        return f"pl.selectors.starts_with({self.pattern!r})"


class EndsWithSelector(PatternSelector):
    """Selector rendered as ``pl.selectors.ends_with(<pattern>)``."""

    def _get_repr_str(self) -> str:
        return f"pl.selectors.ends_with({self.pattern!r})"


class MatchesSelector(PatternSelector):
    """Selector rendered as ``pl.selectors.matches(<pattern>)``."""

    def _get_repr_str(self) -> str:
        return f"pl.selectors.matches({self.pattern!r})"
221
+
222
+
223
def numeric() -> NumericSelector:
    """Return a new NumericSelector."""
    return NumericSelector()


def float_() -> FloatSelector:
    """Return a new FloatSelector."""
    return FloatSelector()


def integer() -> IntegerSelector:
    """Return a new IntegerSelector."""
    return IntegerSelector()


def string() -> StringSelector:
    """Return a new StringSelector."""
    return StringSelector()


def temporal() -> TemporalSelector:
    """Return a new TemporalSelector."""
    return TemporalSelector()


def datetime() -> DatetimeSelector:
    """Return a new DatetimeSelector."""
    return DatetimeSelector()


def date() -> DateSelector:
    """Return a new DateSelector."""
    return DateSelector()


def time() -> TimeSelector:
    """Return a new TimeSelector."""
    return TimeSelector()


def duration() -> DurationSelector:
    """Return a new DurationSelector."""
    return DurationSelector()


def boolean() -> BooleanSelector:
    """Return a new BooleanSelector."""
    return BooleanSelector()


def categorical() -> CategoricalSelector:
    """Return a new CategoricalSelector."""
    return CategoricalSelector()


def object_() -> ObjectSelector:
    """Return a new ObjectSelector."""
    return ObjectSelector()


def list_() -> ListSelector:
    """Return a new ListSelector."""
    return ListSelector()


def struct() -> StructSelector:
    """Return a new StructSelector."""
    return StructSelector()


def all_() -> AllSelector:
    """Return a new AllSelector."""
    return AllSelector()


def by_dtype(dtypes: Union[pl.DataType, List[pl.DataType]]) -> DtypeSelector:
    """Return a DtypeSelector for the given dtype(s)."""
    return DtypeSelector(dtypes)


def contains(pattern: str) -> ContainsSelector:
    """Return a ContainsSelector for ``pattern``."""
    return ContainsSelector(pattern)


def starts_with(pattern: str) -> StartsWithSelector:
    """Return a StartsWithSelector for ``pattern``."""
    return StartsWithSelector(pattern)


def ends_with(pattern: str) -> EndsWithSelector:
    """Return an EndsWithSelector for ``pattern``."""
    return EndsWithSelector(pattern)


def matches(pattern: str) -> MatchesSelector:
    """Return a MatchesSelector for ``pattern``."""
    return MatchesSelector(pattern)