flowfile-0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
flowfile_core/flowfile/flow_data_engine/flow_data_engine.py
@@ -0,0 +1,1521 @@
1
+ # Standard library imports
2
+ import logging
3
+ import os
4
+ from copy import deepcopy
5
+ from dataclasses import dataclass
6
+ from math import ceil
7
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
8
+
9
+ # Third-party imports
10
+ from loky import Future
11
+ import polars as pl
12
+ from polars.exceptions import PanicException
13
+ from polars_grouper import graph_solver
14
+ from polars_expr_transformer import simple_function_to_expr as to_expr
15
+ from pyarrow.parquet import ParquetFile
16
+
17
+ # Local imports - Core
18
+ from flowfile_core.configs import logger
19
+ from flowfile_core.configs.flow_logger import NodeLogger
20
+ from flowfile_core.schemas import (
21
+ input_schema,
22
+ transform_schema as transform_schemas
23
+ )
24
+
25
+ # Local imports - Flow File Components
26
+ from flowfile_core.flowfile.flow_data_engine import utils
27
+ from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
28
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
29
+ FlowfileColumn,
30
+ convert_stats_to_column_info
31
+ )
32
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars
33
+ from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
34
+ from flowfile_core.flowfile.flow_data_engine.join import (
35
+ verify_join_select_integrity,
36
+ verify_join_map_integrity
37
+ )
38
+ from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
39
+ from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
40
+ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (
41
+ ExternalCreateFetcher,
42
+ ExternalDfFetcher,
43
+ ExternalExecutorTracker,
44
+ ExternalFuzzyMatchFetcher,
45
+ fetch_unique_values
46
+ )
47
+ from flowfile_core.flowfile.flow_data_engine.threaded_processes import (
48
+ get_join_count,
49
+ write_threaded
50
+ )
51
+
52
+ from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
53
+
54
+
55
+ @dataclass
56
+ class FlowDataEngine:
57
+ """
58
+ A class that provides a unified interface for working with tabular data, supporting both eager and lazy evaluation.
59
+
60
+ The class is organized into several logical sections:
61
+ 1. Core properties and initialization
62
+ 2. Data access and manipulation
63
+ 3. Schema and metadata operations
64
+ 4. Transformations and operations
65
+ 5. I/O operations
66
+ """
67
+
68
+ # Core attributes
69
+ _data_frame: Union[pl.DataFrame, pl.LazyFrame]
70
+ columns: List[Any]
71
+
72
+ # Metadata attributes
73
+ name: str = None
74
+ number_of_records: int = None
75
+ errors: List = None
76
+ _schema: Optional[List['FlowfileColumn']] = None
77
+
78
+ # Configuration attributes
79
+ _optimize_memory: bool = False
80
+ _lazy: bool = None
81
+ _streamable: bool = True
82
+ _calculate_schema_stats: bool = False
83
+
84
+ # Cache and optimization attributes
85
+ __col_name_idx_map: Dict = None
86
+ __data_map: Dict = None
87
+ __optimized_columns: List = None
88
+ __sample__: str = None
89
+ __number_of_fields: int = None
90
+ _col_idx: Dict[str, int] = None
91
+
92
+ # Source tracking
93
+ _org_path: Optional[str] = None
94
+ _external_source: Optional[ExternalDataSource] = None
95
+
96
+ # State tracking
97
+ sorted_by: int = None
98
+ is_future: bool = False
99
+ is_collected: bool = True
100
+ ind_schema_calculated: bool = False
101
+
102
+ # Callbacks
103
+ _future: Future = None
104
+ _number_of_records_callback: Callable = None
105
+ _data_callback: Callable = None
106
+
107
+ # Tracking info
108
+ # node_id: int = None # TODO: Implement node_id
109
+ # flow_id: int = None # TODO: Implement flow_id
110
+
111
+ def __init__(self,
112
+ raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame] = None,
113
+ path_ref: str = None,
114
+ name: str = None,
115
+ optimize_memory: bool = True,
116
+ schema: List['FlowfileColumn'] | List[str] | pl.Schema = None,
117
+ number_of_records: int = None,
118
+ calculate_schema_stats: bool = False,
119
+ streamable: bool = True,
120
+ number_of_records_callback: Callable = None,
121
+ data_callback: Callable = None):
122
+ """Initialize FlowDataEngine with various data sources and configuration options."""
123
+ self._initialize_attributes(number_of_records_callback, data_callback, streamable)
124
+
125
+ if raw_data is not None:
126
+ self._handle_raw_data(raw_data, number_of_records, optimize_memory)
127
+ elif path_ref:
128
+ self._handle_path_ref(path_ref, optimize_memory)
129
+ else:
130
+ self.initialize_empty_fl()
131
+
132
+ self._finalize_initialization(name, optimize_memory, schema, calculate_schema_stats)
133
+
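The constructor above accepts eager frames, lazy frames, parquet path references, and plain Python data. A minimal usage sketch, assuming the module path shown in the file list:

    from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
    import polars as pl

    # From plain Python records
    fde = FlowDataEngine([{"id": 1, "name": "a"}, {"id": 2, "name": "b"}])

    # From a LazyFrame; optimize_memory=True keeps it lazy and defers the row count
    fde_lazy = FlowDataEngine(pl.LazyFrame({"id": [1, 2, 3]}), optimize_memory=True)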
134
+ def _initialize_attributes(self, number_of_records_callback, data_callback, streamable):
135
+ """Initialize basic attributes with default values."""
136
+ self._external_source = None
137
+ self._number_of_records_callback = number_of_records_callback
138
+ self._data_callback = data_callback
139
+ self.ind_schema_calculated = False
140
+ self._streamable = streamable
141
+ self._org_path = None
142
+ self._lazy = False
143
+ self.errors = []
144
+ self._calculate_schema_stats = False
145
+ self.is_collected = True
146
+ self.is_future = False
147
+
148
+ def _handle_raw_data(self, raw_data, number_of_records, optimize_memory):
149
+ """Process different types of input data."""
150
+ if isinstance(raw_data, pl.DataFrame):
151
+ self._handle_polars_dataframe(raw_data, number_of_records)
152
+ elif isinstance(raw_data, pl.LazyFrame):
153
+ self._handle_polars_lazy_frame(raw_data, number_of_records, optimize_memory)
154
+ elif isinstance(raw_data, (list, dict)):
155
+ self._handle_python_data(raw_data)
156
+
157
+ def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: Optional[int]):
158
+ """Handle Polars DataFrame input."""
159
+ self.data_frame = df
160
+ self.number_of_records = number_of_records or df.select(pl.len())[0, 0]
161
+
162
+ def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: Optional[int], optimize_memory: bool):
163
+ """Handle Polars LazyFrame input."""
164
+ self.data_frame = lf
165
+ self._lazy = True
166
+ if number_of_records is not None:
167
+ self.number_of_records = number_of_records
168
+ elif optimize_memory:
169
+ self.number_of_records = -1
170
+ else:
171
+ self.number_of_records = lf.select(pl.len()).collect()[0, 0]
172
+
173
+ def _handle_python_data(self, data: Union[List, Dict]):
174
+ """Handle Python list or dict input."""
175
+ if isinstance(data, dict):
176
+ self._handle_dict_input(data)
177
+ else:
178
+ self._handle_list_input(data)
179
+
180
+ def _handle_dict_input(self, data: Dict):
181
+ """Handle dictionary input."""
182
+ if len(data) == 0:
183
+ self.initialize_empty_fl()
+ return
184
+ lengths = [len(v) if isinstance(v, (list, tuple)) else 1 for v in data.values()]
185
+
186
+ if len(set(lengths)) == 1 and lengths[0] > 1:
187
+ self.number_of_records = lengths[0]
188
+ self.data_frame = pl.DataFrame(data)
189
+ else:
190
+ self.number_of_records = 1
191
+ self.data_frame = pl.DataFrame([data])
192
+
193
+ def _handle_list_input(self, data: List):
194
+ """Handle list input."""
195
+ number_of_records = len(data)
196
+ if number_of_records > 0:
197
+ processed_data = self._process_list_data(data)
198
+ self.number_of_records = number_of_records
199
+ self.data_frame = pl.DataFrame(processed_data)
200
+ self.lazy = True
201
+ else:
202
+ self.initialize_empty_fl()
203
+ self.number_of_records = 0
204
+
205
+ @staticmethod
206
+ def _process_list_data(data: List) -> List[Dict]:
207
+ """Process list data into a format suitable for DataFrame creation."""
208
+ if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
209
+ try:
210
+ return pl.DataFrame(data).to_dicts()
211
+ except Exception as e:
212
+ raise Exception('Values must be convertible to dictionaries') from e
213
+
214
+ if not isinstance(data[0], dict):
215
+ data = [row.__dict__ for row in data]
216
+
217
+ return utils.ensure_similarity_dicts(data)
218
+
219
+ def _handle_path_ref(self, path_ref: str, optimize_memory: bool):
220
+ """Handle file path reference input."""
221
+ try:
222
+ pf = ParquetFile(path_ref)
223
+ except Exception as e:
224
+ logger.error(e)
225
+ raise Exception("Provided ref is not a parquet file")
226
+
227
+ self.number_of_records = pf.metadata.num_rows
228
+ if optimize_memory:
229
+ self._lazy = True
230
+ self.data_frame = pl.scan_parquet(path_ref)
231
+ else:
232
+ self.data_frame = pl.read_parquet(path_ref)
233
+
234
+ def _finalize_initialization(self, name: str, optimize_memory: bool, schema: Optional[Any],
235
+ calculate_schema_stats: bool):
236
+ """Finalize initialization by setting remaining attributes."""
237
+ _ = calculate_schema_stats
238
+ self.name = name
239
+ self._optimize_memory = optimize_memory
240
+ pl_schema = self.data_frame.collect_schema()
241
+ self._schema = self._handle_schema(schema, pl_schema)
242
+ self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()
243
+
244
+ def __getitem__(self, item):
245
+ """Access a specific column or item from the DataFrame."""
246
+ return self.data_frame.select([item])
247
+
248
+ @property
249
+ def data_frame(self) -> pl.LazyFrame | pl.DataFrame:
250
+ """Get the underlying DataFrame with appropriate handling of different states."""
251
+ if self._data_frame is not None and not self.is_future:
252
+ return self._data_frame
253
+ elif self.is_future:
254
+ return self._data_frame
255
+ elif self._external_source is not None and self.lazy:
256
+ return self._data_frame
257
+ elif self._external_source is not None and not self.lazy:
258
+ if self._external_source.get_pl_df() is None:
259
+ data_frame = list(self._external_source.get_iter())
260
+ if len(data_frame) > 0:
261
+ self.data_frame = pl.DataFrame(data_frame)
262
+ else:
263
+ self.data_frame = self._external_source.get_pl_df()
264
+ self.calculate_schema()
265
+ return self._data_frame
266
+
267
+ @data_frame.setter
268
+ def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
269
+ """Set the underlying DataFrame with validation."""
270
+ if self.lazy and isinstance(df, pl.DataFrame):
271
+ raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
272
+ self._data_frame = df
273
+
274
+ @property
275
+ def schema(self) -> List[FlowfileColumn]:
276
+ """Get the schema of the DataFrame, calculating if necessary."""
277
+ if self.number_of_fields == 0:
278
+ return []
279
+ if self._schema is None or (self._calculate_schema_stats and not self.ind_schema_calculated):
280
+ if self._calculate_schema_stats and not self.ind_schema_calculated:
281
+ schema_stats = self._calculate_schema()
282
+ self.ind_schema_calculated = True
283
+ else:
284
+ schema_stats = [
285
+ dict(column_name=k, pl_datatype=v, col_index=i)
286
+ for i, (k, v) in enumerate(self.data_frame.collect_schema().items())
287
+ ]
288
+ self._schema = convert_stats_to_column_info(schema_stats)
289
+ return self._schema
290
+
291
+ @property
292
+ def number_of_fields(self) -> int:
293
+ """Get the number of fields in the DataFrame."""
294
+ if self.__number_of_fields is None:
295
+ self.__number_of_fields = len(self.columns)
296
+ return self.__number_of_fields
297
+
298
+ # Data Collection and Sampling Methods
299
+
300
+ def collect(self, n_records: int = None) -> pl.DataFrame:
301
+ """
302
+ Collect data from the DataFrame, optionally limiting the number of records.
303
+ Handles streaming and error cases appropriately.
304
+ """
305
+ if n_records is None:
306
+ logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
307
+ else:
308
+ logger.info(f'Fetching {n_records} record(s) for Table object "{id(self)}". '
309
+ f'Settings: streaming={self._streamable}')
310
+
311
+ if not self.lazy:
312
+ return self.data_frame
313
+
314
+ try:
315
+ return self._collect_data(n_records)
316
+ except Exception as e:
317
+ self.errors = [e]
318
+ return self._handle_collection_error(n_records)
319
+
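A short sketch of collect() on a lazy engine, continuing the construction example above:

    fde = FlowDataEngine(pl.LazyFrame({"x": range(1_000)}), optimize_memory=True)
    head_df = fde.collect(n_records=10)   # polars DataFrame with the first 10 rows
    full_df = fde.collect()               # collects everything, streaming when possible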
320
+ def _collect_data(self, n_records: int = None) -> pl.DataFrame:
321
+ """Internal method to handle data collection."""
322
+ if n_records is None:
323
+ self.collect_external()
324
+ if self._streamable:
325
+ try:
326
+ logger.info('Collecting data in streaming mode')
327
+ return self.data_frame.collect(engine="streaming")
328
+ except PanicException:
329
+ self._streamable = False
330
+
331
+ logger.info('Collecting data in non-streaming mode')
332
+ return self.data_frame.collect()
333
+
334
+ if self.external_source is not None:
335
+ return self._collect_from_external_source(n_records)
336
+
337
+ if self._streamable:
338
+ return self.data_frame.head(n_records).collect(engine="streaming", comm_subplan_elim=False)
339
+ return self.data_frame.head(n_records).collect()
340
+
341
+ def _collect_from_external_source(self, n_records: int) -> pl.DataFrame:
342
+ """Handle collection from external source."""
343
+ if self.external_source.get_pl_df() is not None:
344
+ all_data = self.external_source.get_pl_df().head(n_records)
345
+ self.data_frame = all_data
346
+ else:
347
+ all_data = self.external_source.get_sample(n_records)
348
+ self.data_frame = pl.LazyFrame(all_data)
349
+ return self.data_frame
350
+
351
+ def _handle_collection_error(self, n_records: int) -> pl.DataFrame:
352
+ """Handle errors during collection by attempting partial collection."""
353
+ n_records = 100000000 if n_records is None else n_records
354
+ ok_cols, error_cols = self._identify_valid_columns(n_records)
355
+
356
+ if len(ok_cols) > 0:
357
+ return self._create_partial_dataframe(ok_cols, error_cols, n_records)
358
+ return self._create_empty_dataframe(n_records)
359
+
360
+ def _identify_valid_columns(self, n_records: int) -> Tuple[List[str], List[Tuple[str, Any]]]:
361
+ """Identify which columns can be collected successfully."""
362
+ ok_cols = []
363
+ error_cols = []
364
+ for c in self.columns:
365
+ try:
366
+ _ = self.data_frame.select(c).head(n_records).collect()
367
+ ok_cols.append(c)
368
+ except Exception:
369
+ error_cols.append((c, self.data_frame.schema[c]))
370
+ return ok_cols, error_cols
371
+
372
+ def _create_partial_dataframe(self, ok_cols: List[str], error_cols: List[Tuple[str, Any]],
373
+ n_records: int) -> pl.DataFrame:
374
+ """Create a DataFrame with partial data for columns that could be collected."""
375
+ df = self.data_frame.select(ok_cols)
376
+ df = df.with_columns([
377
+ pl.lit(None).alias(column_name).cast(data_type)
378
+ for column_name, data_type in error_cols
379
+ ])
380
+ return df.select(self.columns).head(n_records).collect()
381
+
382
+ def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
383
+ """Create an empty DataFrame with the correct schema."""
384
+ if self.number_of_records > 0:
385
+ return pl.DataFrame({
386
+ column_name: pl.Series(
387
+ name=column_name,
388
+ values=[None] * min(self.number_of_records, n_records)
389
+ ).cast(data_type)
390
+ for column_name, data_type in self.data_frame.schema.items()
391
+ })
392
+ return pl.DataFrame(schema=self.data_frame.schema)
393
+
394
+ # Data Transformation Methods
395
+
396
+ def do_group_by(self, group_by_input: transform_schemas.GroupByInput,
397
+ calculate_schema_stats: bool = True) -> "FlowDataEngine":
398
+ """Perform group by operations on the DataFrame."""
399
+ aggregations = [c for c in group_by_input.agg_cols if c.agg != 'groupby']
400
+ group_columns = [c for c in group_by_input.agg_cols if c.agg == 'groupby']
401
+
402
+ if len(group_columns) == 0:
403
+ return FlowDataEngine(
404
+ self.data_frame.select(
405
+ ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
406
+ ),
407
+ calculate_schema_stats=calculate_schema_stats
408
+ )
409
+
410
+ df = self.data_frame.rename({c.old_name: c.new_name for c in group_columns})
411
+ group_by_columns = [n_c.new_name for n_c in group_columns]
412
+ return FlowDataEngine(
413
+ df.group_by(*group_by_columns).agg(
414
+ ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
415
+ ),
416
+ calculate_schema_stats=calculate_schema_stats
417
+ )
418
+
419
+ def do_sort(self, sorts: List[transform_schemas.SortByInput]) -> "FlowDataEngine":
420
+ """Sort the DataFrame based on specified columns and directions."""
421
+ if not sorts:
422
+ return self
423
+
424
+ descending = [s.how == 'desc' or s.how.lower() == 'descending' for s in sorts]
425
+ df = self.data_frame.sort([sort_by.column for sort_by in sorts], descending=descending)
426
+ return FlowDataEngine(df, number_of_records=self.number_of_records, schema=self.schema)
427
+
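A sketch of do_sort(), reusing the fde engine from the earlier sketches; it assumes transform_schema.SortByInput can be constructed with the column and how fields the method reads:

    from flowfile_core.schemas import transform_schema as transform_schemas

    sorted_fde = fde.do_sort([
        transform_schemas.SortByInput(column="x", how="desc"),  # field names assumed
    ])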
428
+ def change_column_types(self, transforms: List[transform_schemas.SelectInput],
429
+ calculate_schema: bool = False) -> "FlowDataEngine":
430
+ """Change the data types of specified columns."""
431
+ dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
432
+ idx_mapping = list(
433
+ (transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
434
+ for transform in transforms if transform.data_type is not None
435
+ )
436
+
437
+ actual_transforms = [c for c in idx_mapping if c[2] != dtypes[c[1]]]
438
+ transformations = [
439
+ utils.define_pl_col_transformation(col_name=transform[0], col_type=transform[2])
440
+ for transform in actual_transforms
441
+ ]
442
+
443
+ df = self.data_frame.with_columns(transformations)
444
+ return FlowDataEngine(
445
+ df,
446
+ number_of_records=self.number_of_records,
447
+ calculate_schema_stats=calculate_schema,
448
+ streamable=self._streamable
449
+ )
450
+
451
+ # Data Export and Conversion Methods
452
+
453
+ def save(self, path: str, data_type: str = 'parquet') -> Future:
454
+ """Save the DataFrame to a file."""
455
+ estimated_size = deepcopy(self.get_estimated_file_size() * 4)
456
+ df = deepcopy(self.data_frame)
457
+ return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)
458
+
459
+ def to_pylist(self) -> List[Dict]:
460
+ """Convert the DataFrame to a list of dictionaries."""
461
+ if self.lazy:
462
+ return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
463
+ return self.data_frame.to_dicts()
464
+
465
+ @classmethod
466
+ def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
467
+ """Create a FlowDataEngine from an external data source."""
468
+ if external_source.schema is not None:
469
+ ff = cls.create_from_schema(external_source.schema)
470
+ elif external_source.initial_data_getter is not None:
471
+ ff = cls(raw_data=external_source.initial_data_getter())
472
+ else:
473
+ ff = cls()
474
+ ff._external_source = external_source
475
+ return ff
476
+
477
+ @classmethod
478
+ def create_from_sql(cls, sql: str, conn: Any) -> "FlowDataEngine":
479
+ """Create a FlowDataEngine from a SQL query."""
480
+ return cls(pl.read_sql(sql, conn))
481
+
482
+ @classmethod
483
+ def create_from_schema(cls, schema: List[FlowfileColumn]) -> "FlowDataEngine":
484
+ """Create a FlowDataEngine from a schema definition."""
485
+ pl_schema = []
486
+ for i, flow_file_column in enumerate(schema):
487
+ pl_schema.append((flow_file_column.name, type_to_polars(flow_file_column.data_type)))
488
+ schema[i].col_index = i
489
+ df = pl.LazyFrame(schema=pl_schema)
490
+ return cls(df, schema=schema, calculate_schema_stats=False, number_of_records=0)
491
+
492
+ @classmethod
493
+ def create_from_path(cls, received_table: input_schema.ReceivedTableBase) -> "FlowDataEngine":
494
+ """Create a FlowDataEngine from a file path."""
495
+ received_table.set_absolute_filepath()
496
+
497
+ file_type_handlers = {
498
+ 'csv': create_funcs.create_from_path_csv,
499
+ 'parquet': create_funcs.create_from_path_parquet,
500
+ 'excel': create_funcs.create_from_path_excel
501
+ }
502
+
503
+ handler = file_type_handlers.get(received_table.file_type)
504
+ if not handler:
505
+ raise Exception(f'Cannot create from {received_table.file_type}')
506
+
507
+ flow_file = cls(handler(received_table))
508
+ flow_file._org_path = received_table.abs_file_path
509
+ return flow_file
510
+
511
+ @classmethod
512
+ def create_random(cls, number_of_records: int = 1000) -> "FlowDataEngine":
513
+ """Create a FlowDataEngine with random data."""
514
+ return cls(create_fake_data(number_of_records))
515
+
516
+ @classmethod
517
+ def generate_enumerator(cls, length: int = 1000, output_name: str = 'output_column') -> "FlowDataEngine":
518
+ """Generate a sequence of numbers as a FlowDataEngine."""
519
+ if length > 10_000_000:
520
+ length = 10_000_000
521
+ return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
522
+
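Both factory helpers are self-contained; a brief sketch:

    demo = FlowDataEngine.create_random(number_of_records=500)
    ids = FlowDataEngine.generate_enumerator(length=5, output_name="idx")
    print(ids.collect())  # one UInt32 column "idx" holding 0..4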
523
+ # Schema Handling Methods
524
+
525
+ def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema,
526
+ pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
527
+ """Handle schema processing and validation."""
528
+ if schema is None:
529
+ return None
530
+
531
+ if len(schema) != len(pl_schema):
532
+ raise Exception(
533
+ f'Schema does not match the data: got {len(schema)} columns, expected {len(pl_schema)}')
534
+
535
+ if isinstance(schema, pl.Schema):
536
+ return self._handle_polars_schema(schema, pl_schema)
537
+ elif isinstance(schema, list) and len(schema) == 0:
538
+ return []
539
+ elif isinstance(schema[0], str):
540
+ return self._handle_string_schema(schema, pl_schema)
541
+ return schema
542
+
543
+ def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
544
+ """Handle Polars schema conversion."""
545
+ flow_file_columns = [
546
+ FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
547
+ for col_name, dtype in zip(schema.names(), schema.dtypes())
548
+ ]
549
+
550
+ select_arg = [
551
+ pl.col(o).alias(n).cast(schema_dtype)
552
+ for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes())
553
+ ]
554
+
555
+ self.data_frame = self.data_frame.select(select_arg)
556
+ return flow_file_columns
557
+
558
+ def _handle_string_schema(self, schema: List[str], pl_schema: pl.Schema) -> List[FlowfileColumn]:
559
+ """Handle string-based schema conversion."""
560
+ flow_file_columns = [
561
+ FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
562
+ for col_name, dtype in zip(schema, pl_schema.dtypes())
563
+ ]
564
+
565
+ self.data_frame = self.data_frame.rename({
566
+ o: n for o, n in zip(pl_schema.names(), schema)
567
+ })
568
+
569
+ return flow_file_columns
570
+
571
+ # Data Manipulation Methods
572
+
573
+ def split(self, split_input: transform_schemas.TextToRowsInput) -> "FlowDataEngine":
574
+ """Split a column into multiple rows based on a delimiter."""
575
+ output_column_name = (
576
+ split_input.output_column_name
577
+ if split_input.output_column_name
578
+ else split_input.column_to_split
579
+ )
580
+
581
+ split_value = (
582
+ split_input.split_fixed_value
583
+ if split_input.split_by_fixed_value
584
+ else pl.col(split_input.split_by_column)
585
+ )
586
+
587
+ df = (
588
+ self.data_frame.with_columns(
589
+ pl.col(split_input.column_to_split)
590
+ .str.split(by=split_value)
591
+ .alias(output_column_name)
592
+ )
593
+ .explode(output_column_name)
594
+ )
595
+
596
+ return FlowDataEngine(df)
597
+
598
+ def unpivot(self, unpivot_input: transform_schemas.UnpivotInput) -> "FlowDataEngine":
599
+ """Convert data from wide to long format."""
600
+ lf = self.data_frame
601
+
602
+ if unpivot_input.data_type_selector_expr is not None:
603
+ result = lf.unpivot(
604
+ on=unpivot_input.data_type_selector_expr(),
605
+ index=unpivot_input.index_columns
606
+ )
607
+ elif unpivot_input.value_columns is not None:
608
+ result = lf.unpivot(
609
+ on=unpivot_input.value_columns,
610
+ index=unpivot_input.index_columns
611
+ )
612
+ else:
613
+ result = lf.unpivot()
614
+
615
+ return FlowDataEngine(result)
616
+
617
+ def do_pivot(self, pivot_input: transform_schemas.PivotInput, node_logger: NodeLogger = None) -> "FlowDataEngine":
618
+ """Convert data from long to wide format with aggregations."""
619
+ # Get unique values for pivot columns
620
+ max_unique_vals = 200
621
+ new_cols_unique = fetch_unique_values(self.data_frame.select(pivot_input.pivot_column)
622
+ .unique()
623
+ .sort(pivot_input.pivot_column)
624
+ .limit(max_unique_vals).cast(pl.String))
625
+ if len(new_cols_unique) >= max_unique_vals:
626
+ if node_logger:
627
+ node_logger.warning('Pivot column has too many unique values. Please consider using a different column.'
628
+ f' Max unique values: {max_unique_vals}')
629
+
630
+ if len(pivot_input.index_columns) == 0:
631
+ no_index_cols = True
632
+ pivot_input.index_columns = ['__temp__']
633
+ ff = self.apply_flowfile_formula('1', col_name='__temp__')
634
+ else:
635
+ no_index_cols = False
636
+ ff = self
637
+
638
+ # Perform pivot operations
639
+ index_columns = pivot_input.get_index_columns()
640
+ grouped_ff = ff.do_group_by(pivot_input.get_group_by_input(), False)
641
+ pivot_column = pivot_input.get_pivot_column()
642
+
643
+ input_df = grouped_ff.data_frame.with_columns(
644
+ pivot_column.cast(pl.String).alias(pivot_input.pivot_column)
645
+ )
646
+ number_of_aggregations = len(pivot_input.aggregations)
647
+ df = (
648
+ input_df.select(
649
+ *index_columns,
650
+ pivot_column,
651
+ pivot_input.get_values_expr()
652
+ )
653
+ .group_by(*index_columns)
654
+ .agg([
655
+ (pl.col('vals').filter(pivot_column == new_col_value))
656
+ .first()
657
+ .alias(new_col_value)
658
+ for new_col_value in new_cols_unique
659
+ ])
660
+ .select(
661
+ *index_columns,
662
+ *[
663
+ pl.col(new_col).struct.field(agg).alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
664
+ for new_col in new_cols_unique
665
+ for agg in pivot_input.aggregations
666
+ ]
667
+ )
668
+ )
669
+
670
+ # Clean up temporary columns if needed
671
+ if no_index_cols:
672
+ df = df.drop('__temp__')
673
+ pivot_input.index_columns = []
674
+
675
+ return FlowDataEngine(df, calculate_schema_stats=False)
676
+
677
+ def do_filter(self, predicate: str) -> "FlowDataEngine":
678
+ """Filter the DataFrame based on a predicate expression."""
679
+ try:
680
+ f = to_expr(predicate)
681
+ except Exception as e:
682
+ logger.warning(f'Error in filter expression: {e}')
683
+ f = to_expr("False")
684
+ df = self.data_frame.filter(f)
685
+ return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)
686
+
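do_filter() parses the predicate with polars_expr_transformer and silently falls back to a constant False filter when parsing fails. A sketch, with the predicate syntax itself being an assumption about that parser:

    filtered = fde.do_filter("[x] > 100")  # predicate syntax is an assumption
    print(filtered.collect())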
687
+ def add_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
688
+ """Add a record ID column with optional grouping."""
689
+ if record_id_settings.group_by and len(record_id_settings.group_by_columns) > 0:
690
+ return self._add_grouped_record_id(record_id_settings)
691
+ return self._add_simple_record_id(record_id_settings)
692
+
693
+ def _add_grouped_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
694
+ """Add a record ID column with grouping."""
695
+ select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]
696
+
697
+ df = (
698
+ self.data_frame
699
+ .with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
700
+ .with_columns(
701
+ (pl.cum_count(record_id_settings.output_column_name)
702
+ .over(record_id_settings.group_by_columns) + record_id_settings.offset - 1)
703
+ .alias(record_id_settings.output_column_name)
704
+ )
705
+ .select(select_cols)
706
+ )
707
+
708
+ output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
709
+ output_schema.extend(self.schema)
710
+
711
+ return FlowDataEngine(df, schema=output_schema)
712
+
713
+ def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
714
+ """Add a simple sequential record ID column."""
715
+ df = self.data_frame.with_row_index(
716
+ record_id_settings.output_column_name,
717
+ record_id_settings.offset
718
+ )
719
+
720
+ output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
721
+ output_schema.extend(self.schema)
722
+
723
+ return FlowDataEngine(df, schema=output_schema)
724
+
725
+ # Utility Methods
726
+
727
+ def get_schema_column(self, col_name: str) -> FlowfileColumn:
728
+ """Get schema information for a specific column."""
729
+ for s in self.schema:
730
+ if s.name == col_name:
731
+ return s
732
+
733
+ def get_estimated_file_size(self) -> int:
734
+ """Get the estimated size of the file in bytes."""
735
+ if self._org_path is not None:
736
+ return os.path.getsize(self._org_path)
737
+ return 0
738
+
739
+ def __repr__(self) -> str:
740
+ """Return string representation of the FlowDataEngine."""
741
+ return f'flowfile table\n{self.data_frame.__repr__()}'
742
+
743
+ def __call__(self) -> "FlowDataEngine":
744
+ """Make the class callable, returning self."""
745
+ return self
746
+
747
+ def __len__(self) -> int:
748
+ """Get the number of records in the table."""
749
+ return self.number_of_records if self.number_of_records >= 0 else self.get_number_of_records()
750
+
751
+ def cache(self) -> "FlowDataEngine":
752
+ """
753
+ Cache the data in the background and update the DataFrame reference.
754
+
755
+ Returns:
756
+ FlowDataEngine: Self with cached data
757
+ """
758
+ edf = ExternalDfFetcher(lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False,
759
+ flow_id=-1,
760
+ node_id=-1)
761
+ logger.info('Caching data in background')
762
+ result = edf.get_result()
763
+ if isinstance(result, pl.LazyFrame):
764
+ logger.info('Data cached')
765
+ del self._data_frame
766
+ self.data_frame = result
767
+ logger.info('Data loaded from cache')
768
+ return self
769
+
770
+ def collect_external(self):
771
+ """Collect data from external source if present."""
772
+ if self._external_source is not None:
773
+ logger.info('Collecting external source')
774
+ if self.external_source.get_pl_df() is not None:
775
+ self.data_frame = self.external_source.get_pl_df().lazy()
776
+ else:
777
+ self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
778
+ self._schema = None # enforce reset schema
779
+
780
+ # Data Access Methods
781
+ def get_output_sample(self, n_rows: int = 10) -> List[Dict]:
782
+ """
783
+ Get a sample of the data as a list of dictionaries.
784
+
785
+ Args:
786
+ n_rows: Number of rows to sample
787
+
788
+ Returns:
789
+ List[Dict]: Sample data as dictionaries
790
+ """
791
+ if self.number_of_records > n_rows or self.number_of_records < 0:
792
+ df = self.collect(n_rows)
793
+ else:
794
+ df = self.collect()
795
+ return df.to_dicts()
796
+
797
+ def __get_sample__(self, n_rows: int = 100, streamable: bool = True) -> "FlowDataEngine":
798
+ if not self.lazy:
799
+ df = self.data_frame.lazy()
800
+ else:
801
+ df = self.data_frame
802
+
803
+ if streamable:
804
+ try:
805
+ df = df.head(n_rows).collect()
806
+ except Exception as e:
807
+ logger.warning(f'Error in getting sample: {e}')
808
+ df = df.head(n_rows).collect(engine="auto")
809
+ else:
810
+ df = self.collect()
811
+ return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
812
+
813
+ def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
814
+ seed: int = None) -> "FlowDataEngine":
815
+ """
816
+ Get a sample of rows from the DataFrame.
817
+
818
+ Args:
819
+ n_rows: Number of rows to sample
820
+ random: Whether to randomly sample
821
+ shuffle: Whether to shuffle the sample
822
+ seed: Random seed for reproducibility
823
+
824
+ Returns:
825
+ FlowDataEngine: New instance with sampled data
826
+ """
827
+ n_records = min(n_rows, self.number_of_records)
828
+ logging.info(f'Getting sample of {n_rows} rows')
829
+
830
+ if random:
831
+ if self.lazy and self.external_source is not None:
832
+ self.collect_external()
833
+
834
+ if self.lazy and shuffle:
835
+ sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(n_rows,
836
+ seed=seed,
837
+ shuffle=shuffle)
838
+ elif shuffle:
839
+ sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
840
+ else:
841
+ every_n_records = ceil(self.number_of_records / n_rows)
842
+ sample_df = self.data_frame.gather_every(every_n_records)
843
+ else:
844
+ if self.external_source:
845
+ self.collect(n_rows)
846
+ sample_df = self.data_frame.head(n_rows)
847
+
848
+ return FlowDataEngine(sample_df, schema=self.schema, number_of_records=n_records)
849
+
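A sampling sketch:

    first_50 = fde.get_sample(n_rows=50)                        # head-based sample
    random_50 = fde.get_sample(n_rows=50, random=True, seed=1)  # spread or shuffled sample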
850
+ def get_subset(self, n_rows: int = 100) -> "FlowDataEngine":
851
+ """
852
+ Get a subset of rows from the DataFrame.
853
+
854
+ Args:
855
+ n_rows: Number of rows to include
856
+
857
+ Returns:
858
+ FlowDataEngine: New instance with subset of data
859
+ """
860
+ return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
864
+
865
+ # Iterator Methods
866
+ def iter_batches(self, batch_size: int = 1000, columns: Union[List, Tuple, str] = None):
867
+ """
868
+ Iterate over the DataFrame in batches.
869
+
870
+ Args:
871
+ batch_size: Size of each batch
872
+ columns: Columns to include
873
+
874
+ Yields:
875
+ FlowDataEngine: New instance for each batch
876
+ """
877
+ if columns:
878
+ self.data_frame = self.data_frame.select(columns)
879
+ self.lazy = False
880
+ batches = self.data_frame.iter_slices(batch_size)
881
+ for batch in batches:
882
+ yield FlowDataEngine(batch)
883
+
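iter_batches() materializes the frame (lazy is set to False) and yields eager slices wrapped as new engines; a sketch, where handle_batch stands in for caller code:

    for batch in fde.iter_batches(batch_size=250, columns=["x"]):
        handle_batch(batch.data_frame)  # each batch wraps up to 250 rows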
884
+ def start_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
885
+ other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
886
+ node_id: int | str = -1) -> ExternalFuzzyMatchFetcher:
887
+ """
888
+ Starts a fuzzy join with another DataFrame and returns the object to track.
889
+
890
+ Args:
891
+ fuzzy_match_input: Fuzzy matching parameters
892
+ other: Right DataFrame for join
893
+ file_ref: Reference for temporary files
894
+ flow_id: Flow ID for tracking
895
+ node_id: Node ID for tracking
896
+ Returns:
897
+ ExternalFuzzyMatchFetcher: tracker for the fuzzy join running in the background
898
+ """
899
+ left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
900
+ fuzzy_match_input=fuzzy_match_input)
901
+ return ExternalFuzzyMatchFetcher(left_df, right_df,
902
+ fuzzy_maps=fuzzy_match_input.fuzzy_maps,
903
+ file_ref=file_ref + '_fm',
904
+ wait_on_completion=False,
905
+ flow_id=flow_id,
906
+ node_id=node_id)
907
+
908
+ def do_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
909
+ other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
910
+ node_id: int | str = -1) -> "FlowDataEngine":
911
+ """
912
+ Perform a fuzzy join with another DataFrame.
913
+
914
+ Args:
915
+ fuzzy_match_input: Fuzzy matching parameters
916
+ other: Right DataFrame for join
917
+ file_ref: Reference for temporary files
918
+ flow_id: Flow ID for tracking
919
+ node_id: Node ID for tracking
920
+ Returns:
921
+ FlowDataEngine: New instance with joined data
922
+ """
923
+ left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
924
+ fuzzy_match_input=fuzzy_match_input)
925
+ f = ExternalFuzzyMatchFetcher(left_df, right_df,
926
+ fuzzy_maps=fuzzy_match_input.fuzzy_maps,
927
+ file_ref=file_ref + '_fm',
928
+ wait_on_completion=True,
929
+ flow_id=flow_id,
930
+ node_id=node_id)
931
+ return FlowDataEngine(f.get_result())
932
+
933
+ def fuzzy_match(self, right: "FlowDataEngine", left_on: str, right_on: str,
934
+ fuzzy_method: str = 'levenshtein', threshold: float = 0.75) -> "FlowDataEngine":
935
+ """
936
+ Perform fuzzy matching between two DataFrames.
937
+
938
+ Args:
939
+ right: Right DataFrame for matching
940
+ left_on: Column from left DataFrame
941
+ right_on: Column from right DataFrame
942
+ fuzzy_method: Method for fuzzy matching
943
+ threshold: Matching threshold
944
+
945
+ Returns:
946
+ FlowDataEngine: New instance with matched data
947
+ """
948
+ fuzzy_match_input = transform_schemas.FuzzyMatchInput(
949
+ [transform_schemas.FuzzyMap(
950
+ left_on, right_on,
951
+ fuzzy_type=fuzzy_method,
952
+ threshold_score=threshold
953
+ )],
954
+ left_select=self.columns,
955
+ right_select=right.columns
956
+ )
957
+ return self.do_fuzzy_join(fuzzy_match_input, right, str(id(self)))
958
+
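A sketch of the fuzzy_match() convenience wrapper; note that it offloads the matching to the worker process machinery, which must be available:

    left = FlowDataEngine([{"name": "John Smith"}, {"name": "Jane Doe"}])
    right = FlowDataEngine([{"name": "Jon Smith"}, {"name": "Janet Doe"}])
    matched = left.fuzzy_match(right, left_on="name", right_on="name",
                               fuzzy_method="levenshtein", threshold=0.75)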
959
+ def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
960
+ auto_generate_selection: bool, verify_integrity: bool,
961
+ other: "FlowDataEngine") -> "FlowDataEngine":
962
+ """
963
+ Perform a cross join with another DataFrame.
964
+
965
+ Args:
966
+ cross_join_input: Cross join parameters
967
+ auto_generate_selection: Whether to auto-generate column selection
968
+ verify_integrity: Whether to verify join integrity
969
+ other: Right DataFrame for join
970
+
971
+ Returns:
972
+ FlowDataEngine: New instance with joined data
973
+
974
+ Raises:
975
+ Exception: If join would result in too many records
976
+ """
977
+ self.lazy = True
978
+ other.lazy = True
979
+
980
+ verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)
981
+
982
+ # if auto_generate_selection:
983
+ # cross_join_input.auto_rename()
984
+
985
+ right_select = [v.old_name for v in cross_join_input.right_select.renames
986
+ if (v.keep or v.join_key) and v.is_available]
987
+ left_select = [v.old_name for v in cross_join_input.left_select.renames
988
+ if (v.keep or v.join_key) and v.is_available]
989
+
990
+ left = self.data_frame.select(left_select).rename(cross_join_input.left_select.rename_table)
991
+ right = other.data_frame.select(right_select).rename(cross_join_input.right_select.rename_table)
992
+
993
+ if verify_integrity:
994
+ n_records = self.get_number_of_records() * other.get_number_of_records()
995
+ if n_records > 1_000_000_000:
996
+ raise Exception("Join will result in too many records, ending process")
997
+ else:
998
+ n_records = -1
999
+
1000
+ joined_df = left.join(right, how='cross')
1001
+
1002
+ cols_to_delete_after = [col.new_name for col in
1003
+ cross_join_input.left_select.renames + cross_join_input.right_select.renames
1004
+ if col.join_key and not col.keep and col.is_available]
1005
+
1006
+ if verify_integrity:
1007
+ return FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False,
1008
+ number_of_records=n_records, streamable=False)
1009
+ else:
1010
+ fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False,
1011
+ number_of_records=0, streamable=False)
1012
+ return fl
1013
+
1014
+ def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
1015
+ verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
1016
+ """
1017
+ Perform a join operation with another DataFrame.
1018
+
1019
+ Args:
1020
+ join_input: Join parameters
1021
+ auto_generate_selection: Whether to auto-generate column selection
1022
+ verify_integrity: Whether to verify join integrity
1023
+ other: Right DataFrame for join
1024
+
1025
+ Returns:
1026
+ FlowDataEngine: New instance with joined data
1027
+
1028
+ Raises:
1029
+ Exception: If join would result in too many records or is invalid
1030
+ """
1031
+ # self.lazy = False if join_input.how == 'right' else True
1032
+ # other.lazy = False if join_input.how == 'right' else True
1033
+
1034
+ verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
1035
+ if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
1036
+ raise Exception('Join is not valid by the data fields')
1037
+ if auto_generate_selection:
1038
+ join_input.auto_rename()
1039
+
1040
+ right_select = [v.old_name for v in join_input.right_select.renames
1041
+ if (v.keep or v.join_key) and v.is_available]
1042
+ left_select = [v.old_name for v in join_input.left_select.renames
1043
+ if (v.keep or v.join_key) and v.is_available]
1044
+ left = self.data_frame.select(left_select).rename(join_input.left_select.rename_table)
1045
+ right = other.data_frame.select(right_select).rename(join_input.right_select.rename_table)
1046
+
1047
+ if verify_integrity and join_input.how != 'right':
1048
+ n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
1049
+ right_on_keys=join_input.right_join_keys, how=join_input.how)
1050
+ if n_records > 1_000_000_000:
1051
+ raise Exception("Join will result in too many records, ending process")
1052
+ else:
1053
+ n_records = -1
1054
+ if join_input.how == 'right':
1055
+ # Default to left join since right join can give panic issues in execution plan downstream
1056
+ joined_df = right.join(left, left_on=join_input.right_join_keys,
1057
+ right_on=join_input.left_join_keys, how="left", suffix="")
1058
+ else:
1059
+ joined_df = left.join(right, left_on=join_input.left_join_keys,
1060
+ right_on=join_input.right_join_keys,
1061
+ how=join_input.how, suffix="")
1062
+ cols_to_delete_after = [col.new_name for col in
1063
+ join_input.left_select.renames + join_input.right_select.renames
1064
+ if col.join_key and not col.keep and col.is_available]
1065
+ if len(cols_to_delete_after) > 0:
1066
+ joined_df = joined_df.drop(cols_to_delete_after)
1067
+ if verify_integrity:
1068
+ return FlowDataEngine(joined_df, calculate_schema_stats=True,
1069
+ number_of_records=n_records, streamable=False)
1070
+ else:
1071
+ fl = FlowDataEngine(joined_df, calculate_schema_stats=False,
1072
+ number_of_records=0, streamable=False)
1073
+ return fl
1074
+
1075
+ # Graph Operations
1076
+ def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
1077
+ """
1078
+ Solve a graph problem using the specified columns.
1079
+
1080
+ Args:
1081
+ graph_solver_input: Graph solving parameters
1082
+
1083
+ Returns:
1084
+ FlowDataEngine: New instance with solved graph data
1085
+ """
1086
+ lf = self.data_frame.with_columns(
1087
+ graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
1088
+ .alias(graph_solver_input.output_column_name)
1089
+ )
1090
+ return FlowDataEngine(lf)
1091
+
1092
+ # Data Modification Methods
1093
+ def add_new_values(self, values: Iterable, col_name: str = None) -> "FlowDataEngine":
1094
+ """
1095
+ Add a new column with specified values.
1096
+
1097
+ Args:
1098
+ values: Values to add
1099
+ col_name: Name for new column
1100
+
1101
+ Returns:
1102
+ FlowDataEngine: New instance with added column
1103
+ """
1104
+ if col_name is None:
1105
+ col_name = 'new_values'
1106
+ return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))
1107
+
1108
+ def get_record_count(self) -> "FlowDataEngine":
1109
+ """
1110
+ Get the total number of records.
1111
+
1112
+ Returns:
1113
+ FlowDataEngine: New instance with record count
1114
+ """
1115
+ return FlowDataEngine(self.data_frame.select(pl.len().alias('number_of_records')))
1116
+
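A sketch of the two small helpers above:

    with_flag = fde.add_new_values([True] * len(fde), col_name="is_valid")
    counts = fde.get_record_count()  # one-row frame with a "number_of_records" column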
1117
+ def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
1118
+ """
1119
+ Assert that this DataFrame is equal to another.
1120
+
1121
+ Args:
1122
+ other: DataFrame to compare with
1123
+ ordered: Whether to consider row order
1124
+ strict_schema: Whether to strictly compare schemas
1125
+
1126
+ Raises:
1127
+ Exception: If DataFrames are not equal
1128
+ """
1129
+ org_laziness = self.lazy, other.lazy
1130
+ self.lazy = False
1131
+ other.lazy = False
1132
+ self.number_of_records = -1
1133
+ other.number_of_records = -1
1134
+
1135
+ if self.get_number_of_records() != other.get_number_of_records():
1136
+ raise Exception('Number of records is not equal')
1137
+
1138
+ if self.columns != other.columns:
1139
+ raise Exception('Column names are not equal')
1140
+
1141
+ if strict_schema:
1142
+ assert self.data_frame.schema == other.data_frame.schema, 'Data types do not match'
1143
+
1144
+ if ordered:
1145
+ self_lf = self.data_frame.sort(by=self.columns)
1146
+ other_lf = other.data_frame.sort(by=other.columns)
1147
+ else:
1148
+ self_lf = self.data_frame
1149
+ other_lf = other.data_frame
1150
+
1151
+ self.lazy, other.lazy = org_laziness
1152
+ assert self_lf.equals(other_lf), 'Data is not equal'
1153
+
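assert_equal() is mainly useful in tests; a sketch:

    a = FlowDataEngine([{"id": 1}, {"id": 2}])
    b = FlowDataEngine([{"id": 2}, {"id": 1}])
    a.assert_equal(b, ordered=True)  # passes: both frames are sorted by all columns before comparison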
1154
+ # Initialization Methods
1155
+ def initialize_empty_fl(self):
1156
+ """Initialize an empty LazyFrame."""
1157
+ self.data_frame = pl.LazyFrame()
1158
+ self.number_of_records = 0
1159
+ self._lazy = True
1160
+
1161
+ def get_number_of_records(self, warn: bool = False, force_calculate: bool = False) -> int:
1162
+ """
1163
+ Get the total number of records in the DataFrame.
1164
+
1165
+ Args:
1166
+ warn: Whether to warn about expensive operations
1167
+ force_calculate: Whether to force recalculation
1168
+
1169
+ Returns:
1170
+ int: Number of records
1171
+
1172
+ Raises:
1173
+ Exception: If unable to get number of records
1174
+ """
1175
+ if self.is_future and not self.is_collected:
1176
+ return -1
1177
+
1178
+ if self.number_of_records is None or self.number_of_records < 0 or force_calculate:
1179
+ if self._number_of_records_callback is not None:
1180
+ self._number_of_records_callback(self)
1181
+
1182
+ if self.lazy:
1183
+ if warn:
1184
+ logger.warning('Calculating the number of records; this can be expensive on a lazy frame')
1185
+ try:
1186
+ self.number_of_records = self.data_frame.select(pl.len()).collect(
1187
+ engine="streaming" if self._streamable else "auto")[0, 0]
1188
+ except Exception:
1189
+ raise Exception('Could not get number of records')
1190
+ else:
1191
+ self.number_of_records = self.data_frame.__len__()
1192
+
1193
+ return self.number_of_records
1194
+
+     # Properties
+     @property
+     def has_errors(self) -> bool:
+         """Check if there are any errors."""
+         return len(self.errors) > 0
+
+     @property
+     def lazy(self) -> bool:
+         """Check if DataFrame is lazy."""
+         return self._lazy
+
+     @lazy.setter
+     def lazy(self, exec_lazy: bool = False):
+         """
+         Set the laziness of the DataFrame.
+
+         Args:
+             exec_lazy: Whether to make DataFrame lazy
+         """
+         if exec_lazy != self._lazy:
+             if exec_lazy:
+                 self.data_frame = self.data_frame.lazy()
+             else:
+                 self._lazy = exec_lazy
+                 if self.external_source is not None:
+                     df = self.collect()
+                     self.data_frame = df
+                 else:
+                     self.data_frame = self.data_frame.collect(engine="streaming" if self._streamable else "auto")
+             self._lazy = exec_lazy
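Sketch of toggling laziness through the property; the engine keeps a single data_frame attribute that is either a LazyFrame or a DataFrame:

    engine.lazy = False   # collects the LazyFrame (streaming engine when marked streamable)
    assert isinstance(engine.data_frame, pl.DataFrame)
    engine.lazy = True    # wraps the eager DataFrame back into a LazyFrame
    assert isinstance(engine.data_frame, pl.LazyFrame)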
+
+     @property
+     def external_source(self) -> ExternalDataSource:
+         """Get the external data source."""
+         return self._external_source
+
+     @property
+     def cols_idx(self) -> Dict[str, int]:
+         """Get column index mapping."""
+         if self._col_idx is None:
+             self._col_idx = {c: i for i, c in enumerate(self.columns)}
+         return self._col_idx
+
+     @property
+     def __name__(self) -> str:
+         """Get table name."""
+         return self.name
+
+     # Schema and Column Operations
+     def get_select_inputs(self) -> transform_schemas.SelectInputs:
+         """
+         Get select inputs for all columns.
+
+         Returns:
+             SelectInputs: Input specifications for all columns
+         """
+         return transform_schemas.SelectInputs(
+             [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
+         )
+
+     def select_columns(self, list_select: Union[List[str], Tuple[str], str]) -> "FlowDataEngine":
+         """
+         Select specific columns from the DataFrame.
+
+         Args:
+             list_select: Columns to select
+
+         Returns:
+             FlowDataEngine: New instance with selected columns
+         """
+         if isinstance(list_select, str):
+             list_select = [list_select]
+
+         idx_to_keep = [self.cols_idx.get(c) for c in list_select]
+         selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep) if id_to_keep is not None]
+         new_schema = [self.schema[i] for i in idx_to_keep if i is not None]
+
+         return FlowDataEngine(
+             self.data_frame.select(selects),
+             number_of_records=self.number_of_records,
+             schema=new_schema,
+             streamable=self._streamable
+         )
+
+     def drop_columns(self, columns: List[str]) -> "FlowDataEngine":
+         """
+         Drop specified columns from the DataFrame.
+
+         Args:
+             columns: Columns to drop
+
+         Returns:
+             FlowDataEngine: New instance without dropped columns
+         """
+         cols_for_select = tuple(set(self.columns) - set(columns))
+         idx_to_keep = [self.cols_idx.get(c) for c in cols_for_select]
+         new_schema = [self.schema[i] for i in idx_to_keep]
+
+         return FlowDataEngine(
+             self.data_frame.select(cols_for_select),
+             number_of_records=self.number_of_records,
+             schema=new_schema
+         )
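Both selectors return new instances and quietly skip names that do not exist, because unknown columns resolve to None in the cols_idx lookup. A hedged sketch:

    engine = FlowDataEngine(pl.LazyFrame({'a': [1], 'b': [2], 'c': [3]}))
    subset = engine.select_columns(['a', 'b', 'missing'])   # 'missing' is ignored; keeps 'a' and 'b'
    narrower = engine.drop_columns(['c'])                    # keeps 'a' and 'b' (column order not guaranteed)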
+
+     def reorganize_order(self, column_order: List[str]) -> "FlowDataEngine":
+         """
+         Reorganize columns in specified order.
+
+         Args:
+             column_order: Desired column order
+
+         Returns:
+             FlowDataEngine: New instance with reordered columns
+         """
+         df = self.data_frame.select(column_order)
+         schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
+         return FlowDataEngine(df, schema=schema, number_of_records=self.number_of_records)
+
+     def apply_flowfile_formula(self, func: str, col_name: str,
+                                output_data_type: pl.DataType = None) -> "FlowDataEngine":
+         """
+         Apply a formula to create a new column.
+
+         Args:
+             func: Formula to apply
+             col_name: Name for new column
+             output_data_type: Data type for output
+
+         Returns:
+             FlowDataEngine: New instance with added column
+         """
+         parsed_func = to_expr(func)
+         if output_data_type is not None:
+             df2 = self.data_frame.with_columns(parsed_func.cast(output_data_type).alias(col_name))
+         else:
+             df2 = self.data_frame.with_columns(parsed_func.alias(col_name))
+
+         return FlowDataEngine(df2, number_of_records=self.number_of_records)
+
+     def apply_sql_formula(self, func: str, col_name: str,
+                           output_data_type: pl.DataType = None) -> "FlowDataEngine":
+         """
+         Apply an SQL-style formula to create a new column.
+
+         Args:
+             func: SQL formula to apply
+             col_name: Name for new column
+             output_data_type: Data type for output
+
+         Returns:
+             FlowDataEngine: New instance with added column
+         """
+         expr = to_expr(func)
+         if output_data_type is not None:
+             df = self.data_frame.with_columns(expr.cast(output_data_type).alias(col_name))
+         else:
+             df = self.data_frame.with_columns(expr.alias(col_name))
+
+         return FlowDataEngine(df, number_of_records=self.number_of_records)
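Both formula helpers delegate parsing to to_expr, so the accepted syntax is whatever that parser supports; the bracketed column references below are an assumption used purely for illustration:

    engine = FlowDataEngine(pl.LazyFrame({'price': [10.0, 20.0], 'qty': [2, 3]}))
    totals = engine.apply_flowfile_formula('[price] * [qty]', col_name='total')
    totals_f64 = engine.apply_sql_formula('[price] * [qty]', col_name='total', output_data_type=pl.Float64)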
+
+     def output(self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str,
+                execute_remote: bool = True) -> "FlowDataEngine":
+         """
+         Write the DataFrame to an output file.
+
+         Args:
+             output_fs: Output settings.
+             flow_id: Flow ID for tracking.
+             node_id: Node ID for tracking.
+             execute_remote: Whether the write should be executed by the flowfile worker process.
+
+         Returns:
+             FlowDataEngine: Self for chaining
+         """
+         logger.info('Starting to write output')
+         if execute_remote:
+             status = utils.write_output(
+                 self.data_frame,
+                 data_type=output_fs.file_type,
+                 path=output_fs.abs_file_path,
+                 write_mode=output_fs.write_mode,
+                 sheet_name=output_fs.output_excel_table.sheet_name,
+                 delimiter=output_fs.output_csv_table.delimiter,
+                 flow_id=flow_id,
+                 node_id=node_id
+             )
+             tracker = ExternalExecutorTracker(status)
+             tracker.get_result()
+             logger.info('Finished writing output')
+         else:
+             logger.info("Starting to write results locally")
+             utils.local_write_output(
+                 self.data_frame,
+                 data_type=output_fs.file_type,
+                 path=output_fs.abs_file_path,
+                 write_mode=output_fs.write_mode,
+                 sheet_name=output_fs.output_excel_table.sheet_name,
+                 delimiter=output_fs.output_csv_table.delimiter,
+                 flow_id=flow_id,
+                 node_id=node_id,
+             )
+             logger.info("Finished writing output")
+         return self
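A hedged sketch of invoking the writer; it assumes `settings` is an already-populated input_schema.OutputSettings (built elsewhere in the package) carrying the file_type, abs_file_path, write_mode and per-format options read above:

    settings = ...  # input_schema.OutputSettings instance prepared by the caller
    engine.output(output_fs=settings, flow_id=1, node_id='write_1', execute_remote=False)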
+
+     # Data Operations
+     def make_unique(self, unique_input: transform_schemas.UniqueInput = None) -> "FlowDataEngine":
+         """
+         Get unique rows based on specified columns.
+
+         Args:
+             unique_input: Unique operation parameters
+
+         Returns:
+             FlowDataEngine: New instance with unique rows
+         """
+         if unique_input is None or unique_input.columns is None:
+             return FlowDataEngine(self.data_frame.unique())
+         return FlowDataEngine(self.data_frame.unique(unique_input.columns, keep=unique_input.strategy))
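Sketch of deduplication; the UniqueInput keyword names are assumed from the attributes read above (columns, strategy), and the strategy values are the keep options Polars accepts ('first', 'last', 'any', 'none'):

    engine = FlowDataEngine(pl.LazyFrame({'id': [1, 1, 2], 'v': ['a', 'b', 'c']}))
    distinct_rows = engine.make_unique()   # polars .unique() over all columns
    by_id = engine.make_unique(transform_schemas.UniqueInput(columns=['id'], strategy='first'))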
+
+     def concat(self, other: Iterable["FlowDataEngine"] | "FlowDataEngine") -> "FlowDataEngine":
+         """
+         Concatenate with other DataFrames.
+
+         Args:
+             other: DataFrames to concatenate
+
+         Returns:
+             FlowDataEngine: Concatenated DataFrame
+         """
+         if isinstance(other, FlowDataEngine):
+             other = [other]
+
+         dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
+         return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
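Because the concatenation uses how='diagonal_relaxed', inputs do not need matching schemas: missing columns are filled with nulls and compatible dtypes are widened. Sketch:

    a = FlowDataEngine(pl.LazyFrame({'id': [1], 'x': ['a']}))
    b = FlowDataEngine(pl.LazyFrame({'id': [2], 'y': [1.5]}))
    combined = a.concat(b)         # columns id, x, y; unmatched cells become null
    combined_3 = a.concat([b, b])  # any iterable of FlowDataEngine also works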
+
+     def do_select(self, select_inputs: transform_schemas.SelectInputs,
+                   keep_missing: bool = True) -> "FlowDataEngine":
+         """
+         Perform complex column selection and transformation.
+
+         Args:
+             select_inputs: Selection specifications
+             keep_missing: Whether to keep columns not specified
+
+         Returns:
+             FlowDataEngine: New instance with selected/transformed columns
+         """
+         new_schema = deepcopy(self.schema)
+         renames = [r for r in select_inputs.renames if r.is_available]
+
+         if not keep_missing:
+             drop_cols = set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames).union(
+                 set(r.old_name for r in renames if not r.keep))
+             keep_cols = []
+         else:
+             keep_cols = list(set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames))
+             drop_cols = set(r.old_name for r in renames if not r.keep)
+
+         if len(drop_cols) > 0:
+             new_schema = [s for s in new_schema if s.name not in drop_cols]
+         new_schema_mapping = {v.name: v for v in new_schema}
+
+         available_renames = []
+         for rename in renames:
+             if (rename.new_name != rename.old_name or rename.new_name not in new_schema_mapping) and rename.keep:
+                 schema_entry = new_schema_mapping.get(rename.old_name)
+                 if schema_entry is not None:
+                     available_renames.append(rename)
+                     schema_entry.column_name = rename.new_name
+
+         rename_dict = {r.old_name: r.new_name for r in available_renames}
+         fl = self.select_columns(
+             list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols)
+         fl = fl.change_column_types(transforms=[r for r in renames if r.keep])
+         ndf = fl.data_frame.rename(rename_dict)
+         renames.sort(key=lambda r: 0 if r.position is None else r.position)
+         sorted_cols = utils.match_order(ndf.collect_schema().names(),
+                                         [r.new_name for r in renames] + self.data_frame.collect_schema().names())
+         output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
+         return output_file.reorganize_order(sorted_cols)
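A hedged sketch of driving do_select with a rename specification. Only old_name and data_type are confirmed SelectInput keywords in this file, so the new_name/keep keywords below are assumptions based on the attributes the method reads:

    engine = FlowDataEngine(pl.LazyFrame({'price': [10.0], 'qty': [2], 'note': ['x']}))
    spec = transform_schemas.SelectInputs([
        transform_schemas.SelectInput(old_name='price', new_name='unit_price'),
        transform_schemas.SelectInput(old_name='qty', keep=False),
    ])
    reshaped = engine.do_select(spec, keep_missing=True)  # renames price, drops qty, keeps note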
+
+     # Utility Methods
+     def set_streamable(self, streamable: bool = False):
+         """Set whether DataFrame operations should be streamable."""
+         self._streamable = streamable
+
+     def _calculate_schema(self) -> List[Dict]:
+         """Calculate schema statistics."""
+         if self.external_source is not None:
+             self.collect_external()
+         v = utils.calculate_schema(self.data_frame)
+         return v
+
+     def calculate_schema(self):
+         """Calculate and return schema."""
+         self._calculate_schema_stats = True
+         return self.schema
+
+     def count(self) -> int:
+         """Get total number of records."""
+         return self.get_number_of_records()
+
+     @classmethod
+     def create_from_path_worker(cls, received_table: input_schema.ReceivedTable, flow_id: int, node_id: int | str):
+         """Create a FlowDataEngine by fetching the file described in received_table via the flowfile worker."""
+         received_table.set_absolute_filepath()
+         external_fetcher = ExternalCreateFetcher(received_table=received_table,
+                                                  file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
+         return cls(external_fetcher.get_result())
+
+
+ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
+     """
+     Execute arbitrary Polars code.
+
+     Args:
+         flowfile_tables: Input FlowDataEngine objects, exposed to the code as input_df (single input)
+             or input_df_1 ... input_df_n (multiple inputs)
+         code: Polars code to execute
+
+     Returns:
+         FlowDataEngine: Result of code execution
+     """
+     polars_executable = polars_code_parser.get_executable(code, num_inputs=len(flowfile_tables))
+     if len(flowfile_tables) == 0:
+         kwargs = {}
+     elif len(flowfile_tables) == 1:
+         kwargs = {'input_df': flowfile_tables[0].data_frame}
+     else:
+         kwargs = {f'input_df_{i+1}': flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
+     return FlowDataEngine(polars_executable(**kwargs))
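The generated callable receives each input frame under the names built above (input_df for one input, input_df_1 ... input_df_n otherwise), so the code string can reference them directly; whether plain expressions like these are accepted depends on polars_code_parser, so treat the strings below as assumptions:

    a = FlowDataEngine(pl.LazyFrame({'id': [1, 2], 'x': [10, 20]}))
    b = FlowDataEngine(pl.LazyFrame({'id': [1, 2], 'y': [0.1, 0.2]}))
    filtered = execute_polars_code(a, code='input_df.filter(pl.col("x") > 10)')
    joined = execute_polars_code(a, b, code='input_df_1.join(input_df_2, on="id")')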