Flowfile 0.3.5-py3-none-any.whl → 0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (145)
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +1 -0
  3. flowfile/web/__init__.py +2 -2
  4. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  5. flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
  6. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  7. flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
  8. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  9. flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
  10. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
  11. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  12. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
  13. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
  14. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
  15. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
  16. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
  17. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
  18. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  19. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
  20. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
  21. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
  22. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
  23. flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
  24. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  25. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
  26. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  27. flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
  28. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
  29. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
  30. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
  31. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
  32. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
  33. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
  34. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
  35. flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
  36. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
  37. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
  38. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
  39. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
  40. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
  41. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
  42. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
  43. flowfile/web/static/assets/api-6ef0dcef.js +80 -0
  44. flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
  45. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  46. flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
  47. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
  48. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
  49. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
  50. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
  51. flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
  52. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
  53. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
  54. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
  55. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
  56. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
  57. flowfile/web/static/index.html +1 -1
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
  59. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
  60. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
  61. flowfile_core/__init__.py +3 -0
  62. flowfile_core/auth/jwt.py +39 -0
  63. flowfile_core/configs/node_store/nodes.py +9 -6
  64. flowfile_core/configs/settings.py +6 -5
  65. flowfile_core/database/connection.py +63 -15
  66. flowfile_core/database/init_db.py +0 -1
  67. flowfile_core/database/models.py +49 -2
  68. flowfile_core/flowfile/code_generator/code_generator.py +472 -17
  69. flowfile_core/flowfile/connection_manager/models.py +1 -1
  70. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  71. flowfile_core/flowfile/extensions.py +1 -1
  72. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  73. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  74. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
  75. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  76. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  77. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  78. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  79. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  80. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  81. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  82. flowfile_core/flowfile/flow_graph.py +718 -253
  83. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  84. flowfile_core/flowfile/flow_node/flow_node.py +563 -117
  85. flowfile_core/flowfile/flow_node/models.py +154 -20
  86. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  87. flowfile_core/flowfile/handler.py +2 -33
  88. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  89. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  90. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  91. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  92. flowfile_core/flowfile/utils.py +35 -26
  93. flowfile_core/main.py +35 -15
  94. flowfile_core/routes/cloud_connections.py +77 -0
  95. flowfile_core/routes/logs.py +2 -7
  96. flowfile_core/routes/public.py +1 -0
  97. flowfile_core/routes/routes.py +130 -90
  98. flowfile_core/routes/secrets.py +72 -14
  99. flowfile_core/schemas/__init__.py +8 -0
  100. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  101. flowfile_core/schemas/input_schema.py +121 -71
  102. flowfile_core/schemas/output_model.py +19 -3
  103. flowfile_core/schemas/schemas.py +150 -12
  104. flowfile_core/schemas/transform_schema.py +175 -35
  105. flowfile_core/utils/utils.py +40 -1
  106. flowfile_core/utils/validate_setup.py +41 -0
  107. flowfile_frame/__init__.py +9 -1
  108. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  109. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  110. flowfile_frame/expr.py +28 -1
  111. flowfile_frame/expr.pyi +76 -61
  112. flowfile_frame/flow_frame.py +481 -208
  113. flowfile_frame/flow_frame.pyi +140 -91
  114. flowfile_frame/flow_frame_methods.py +160 -22
  115. flowfile_frame/group_frame.py +3 -0
  116. flowfile_frame/utils.py +25 -3
  117. flowfile_worker/external_sources/s3_source/main.py +216 -0
  118. flowfile_worker/external_sources/s3_source/models.py +142 -0
  119. flowfile_worker/funcs.py +51 -6
  120. flowfile_worker/models.py +22 -2
  121. flowfile_worker/routes.py +40 -38
  122. flowfile_worker/utils.py +1 -1
  123. test_utils/s3/commands.py +46 -0
  124. test_utils/s3/data_generator.py +292 -0
  125. test_utils/s3/demo_data_generator.py +186 -0
  126. test_utils/s3/fixtures.py +214 -0
  127. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  128. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  129. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  130. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  131. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  132. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  133. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  134. flowfile_core/schemas/defaults.py +0 -9
  135. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  136. flowfile_core/schemas/models.py +0 -193
  137. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  138. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  139. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  140. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  141. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
  142. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
  143. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
  144. {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  145. {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
@@ -4,7 +4,7 @@ import os
  from copy import deepcopy
  from dataclasses import dataclass
  from math import ceil
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator

  # Third-party imports
  from loky import Future
@@ -12,29 +12,39 @@ import polars as pl
  from polars.exceptions import PanicException
  from polars_grouper import graph_solver
  from polars_expr_transformer import simple_function_to_expr as to_expr
+ from pyarrow import Table as PaTable
  from pyarrow.parquet import ParquetFile

  # Local imports - Core
  from flowfile_core.configs import logger
+ from flowfile_core.utils.utils import ensure_similarity_dicts
  from flowfile_core.configs.flow_logger import NodeLogger
  from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
  from flowfile_core.schemas import (
+     cloud_storage_schemas,
      input_schema,
      transform_schema as transform_schemas
  )

  # Local imports - Flow File Components
  from flowfile_core.flowfile.flow_data_engine import utils
+ from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (CloudStorageReader,
+                                                                           ensure_path_has_wildcard_pattern,
+                                                                           get_first_file_from_s3_dir)
  from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
      FlowfileColumn,
+     assert_if_flowfile_schema,
      convert_stats_to_column_info
  )
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
  from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
  from flowfile_core.flowfile.flow_data_engine.join import (
      verify_join_select_integrity,
-     verify_join_map_integrity
+     verify_join_map_integrity,
+     rename_df_table_for_join,
+     get_undo_rename_mapping_join,
+     get_col_name_to_delete
  )
  from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
  from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
@@ -52,20 +62,95 @@ from flowfile_core.flowfile.flow_data_engine.threaded_processes import (

  from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource

+ T = TypeVar('T', pl.DataFrame, pl.LazyFrame)

- @dataclass
- class FlowDataEngine:
+ def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
+     """Temporarily renames join keys to avoid conflicts during a join.
+
+     This helper function checks the join type and renames the join key columns
+     in either the left or right DataFrame to a temporary name (`__FL_TEMP__...`).
+     This prevents Polars from automatically suffixing columns with `_right` when
+     join keys have the same name.
+
+     Args:
+         left_df: The left Polars DataFrame or LazyFrame.
+         right_df: The right Polars DataFrame or LazyFrame.
+         join_input: The JoinInput settings object defining the join.
+
+     Returns:
+         A tuple containing:
+         - The (potentially modified) left DataFrame.
+         - The (potentially modified) right DataFrame.
+         - A dictionary mapping the temporary names back to their desired final names.
      """
-     A class that provides a unified interface for working with tabular data, supporting both eager and lazy evaluation.
-
-     The class is organized into several logical sections:
-     1. Core properties and initialization
-     2. Data access and manipulation
-     3. Schema and metadata operations
-     4. Transformations and operations
-     5. I/O operations
+     def _construct_temp_name(column_name: str) -> str:
+         return "__FL_TEMP__"+column_name
+     if join_input.how == 'right':
+         left_df = left_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+                                        for jk in join_input.left_select.join_key_selects)
+         reverse_actions = {
+             _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
+             for jk in join_input.left_select.join_key_selects}
+     elif join_input.how in ('left', 'inner'):
+         right_df = right_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+                                          for jk in join_input.right_select.join_key_selects)
+         reverse_actions = {
+             _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
+             for jk in join_input.right_select.join_key_selects}
+     else:
+         reverse_actions = {}
+     return left_df, right_df, reverse_actions
+
+
+ def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.JoinInput) -> None:
+     """Modifies JoinInput for semi/anti joins to not keep right-side columns.
+
+     For 'semi' and 'anti' joins, Polars only returns columns from the left
+     DataFrame. This function enforces that behavior by modifying the `join_input`
+     in-place, setting the `keep` flag to `False` for all columns in the
+     right-side selection.
+
+     Args:
+         join_input: The JoinInput settings object to modify.
+     """
+     if join_input.how in ('semi', 'anti'):
+         for jk in join_input.right_select.renames:
+             jk.keep = False
+
+
+ def get_select_columns(full_select_input: List[transform_schemas.SelectInput]) -> List[str]:
+     """Extracts a list of column names to be selected from a SelectInput list.
+
+     This function filters a list of `SelectInput` objects to return the names
+     of columns that are marked as available and are either a join key or
+     explicitly marked to be kept.
+
+     Args:
+         full_select_input: A list of SelectInput objects.
+
+     Returns:
+         A list of column names to be selected.
      """
+     return [v.old_name for v in full_select_input if (v.keep or v.join_key) and v.is_available]

+
+ @dataclass
+ class FlowDataEngine:
+     """The core data handling engine for Flowfile.
+
+     This class acts as a high-level wrapper around a Polars DataFrame or
+     LazyFrame, providing a unified API for data ingestion, transformation,
+     and output. It manages data state (lazy vs. eager), schema information,
+     and execution logic.
+
+     Attributes:
+         _data_frame: The underlying Polars DataFrame or LazyFrame.
+         columns: A list of column names in the current data frame.
+         name: An optional name for the data engine instance.
+         number_of_records: The number of records. Can be -1 for lazy frames.
+         errors: A list of errors encountered during operations.
+         _schema: A cached list of `FlowfileColumn` objects representing the schema.
+     """
      # Core attributes
      _data_frame: Union[pl.DataFrame, pl.LazyFrame]
      columns: List[Any]
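The new `_handle_duplication_join_keys` helper works around Polars' default behaviour of suffixing clashing right-hand columns with `_right`. A minimal, standalone sketch of that behaviour and of the temporary-rename trick, using illustrative column names that are not taken from Flowfile:

import polars as pl

left = pl.DataFrame({"id": [1, 2], "value": ["a", "b"]})
right = pl.DataFrame({"id": [1, 2], "value": ["x", "y"]})

# Default Polars behaviour: the clashing non-key column comes back as "value_right".
joined = left.join(right, on="id", how="inner")
print(joined.columns)  # ['id', 'value', 'value_right']

# Temporary-rename trick: copy the right-hand join key under a prefixed alias before
# joining, then rename it back afterwards, so the key survives without automatic suffixing.
tmp = "__FL_TEMP__id"
right_renamed = right.with_columns(pl.col("id").alias(tmp))
joined2 = left.join(right_renamed, on="id", how="inner").rename({tmp: "right_id"})
print(joined2.columns)  # ['id', 'value', 'value_right', 'right_id']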
@@ -105,12 +190,9 @@ class FlowDataEngine:
      _number_of_records_callback: Callable = None
      _data_callback: Callable = None

-     # Tracking info
-     # node_id: int = None # TODO: Implement node_id
-     # flow_id: int = None # TODO: Implement flow_id

      def __init__(self,
-                  raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
+                  raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
                   path_ref: str = None,
                   name: str = None,
                   optimize_memory: bool = True,
@@ -120,7 +202,22 @@ class FlowDataEngine:
                   streamable: bool = True,
                   number_of_records_callback: Callable = None,
                   data_callback: Callable = None):
-         """Initialize FlowDataEngine with various data sources and configuration options."""
+         """Initializes the FlowDataEngine from various data sources.
+
+         Args:
+             raw_data: The input data. Can be a list of dicts, a Polars DataFrame/LazyFrame,
+                 or a `RawData` schema object.
+             path_ref: A string path to a Parquet file.
+             name: An optional name for the data engine instance.
+             optimize_memory: If True, prefers lazy operations to conserve memory.
+             schema: An optional schema definition. Can be a list of `FlowfileColumn` objects,
+                 a list of column names, or a Polars `Schema`.
+             number_of_records: The number of records, if known.
+             calculate_schema_stats: If True, computes detailed statistics for each column.
+             streamable: If True, allows for streaming operations when possible.
+             number_of_records_callback: A callback function to retrieve the number of records.
+             data_callback: A callback function to retrieve the data.
+         """
          self._initialize_attributes(number_of_records_callback, data_callback, streamable)

          if raw_data is not None:
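A hedged usage sketch of the constructor parameters documented in the hunk above. The import path is assumed from the file list earlier in this diff (`flowfile_core/flowfile/flow_data_engine/flow_data_engine.py`), and the file path is a placeholder:

import polars as pl
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine

# From a list of dicts (one record per dict).
engine = FlowDataEngine(raw_data=[{"id": 1, "name": "a"}, {"id": 2, "name": "b"}])

# From an existing Polars LazyFrame, keeping evaluation lazy to conserve memory.
lazy_engine = FlowDataEngine(raw_data=pl.LazyFrame({"id": [1, 2, 3]}), optimize_memory=True)

# From a Parquet file on disk (hypothetical path).
file_engine = FlowDataEngine(path_ref="data/example.parquet", name="example")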
@@ -129,11 +226,14 @@ class FlowDataEngine:
              self._handle_raw_data(raw_data, number_of_records, optimize_memory)
          elif path_ref:
              self._handle_path_ref(path_ref, optimize_memory)
          else:
              self.initialize_empty_fl()
-
          self._finalize_initialization(name, optimize_memory, schema, calculate_schema_stats)

      def _initialize_attributes(self, number_of_records_callback, data_callback, streamable):
-         """Initialize basic attributes with default values."""
+         """(Internal) Sets the initial default attributes for a new instance.
+
+         This helper is called first during initialization to ensure all state-tracking
+         and configuration attributes have a clean default value before data is processed.
+         """
          self._external_source = None
@@ -147,8 +247,11 @@ class FlowDataEngine:
          self.is_future = False

      def _handle_raw_data(self, raw_data, number_of_records, optimize_memory):
-         """Process different types of input data."""
+         """(Internal) Dispatches raw data to the appropriate handler based on its type.

+         This acts as a router during initialization, inspecting the type of `raw_data`
+         and calling the corresponding specialized `_handle_*` method to process it.
+         """
          if isinstance(raw_data, input_schema.RawData):
              self._handle_raw_data_format(raw_data)
          elif isinstance(raw_data, pl.DataFrame):
@@ -159,12 +262,12 @@ class FlowDataEngine:
              self._handle_python_data(raw_data)

      def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: Optional[int]):
-         """Handle Polars DataFrame input."""
+         """(Internal) Initializes the engine from an eager Polars DataFrame."""
          self.data_frame = df
          self.number_of_records = number_of_records or df.select(pl.len())[0, 0]

      def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: Optional[int], optimize_memory: bool):
-         """Handle Polars LazyFrame input."""
+         """(Internal) Initializes the engine from a Polars LazyFrame."""
          self.data_frame = lf
          self._lazy = True
          if number_of_records is not None:
@@ -175,27 +278,35 @@ class FlowDataEngine:
              self.number_of_records = lf.select(pl.len()).collect()[0, 0]

      def _handle_python_data(self, data: Union[List, Dict]):
-         """Handle Python list or dict input."""
+         """(Internal) Dispatches Python collections to the correct handler."""
          if isinstance(data, dict):
              self._handle_dict_input(data)
          else:
              self._handle_list_input(data)

      def _handle_dict_input(self, data: Dict):
-         """Handle dictionary input."""
+         """(Internal) Initializes the engine from a Python dictionary."""
          if len(data) == 0:
              self.initialize_empty_fl()
          lengths = [len(v) if isinstance(v, (list, tuple)) else 1 for v in data.values()]

-         if len(set(lengths)) == 1 and lengths[0]>1:
+         if len(set(lengths)) == 1 and lengths[0] > 1:
              self.number_of_records = lengths[0]
              self.data_frame = pl.DataFrame(data)
          else:
              self.number_of_records = 1
              self.data_frame = pl.DataFrame([data])
+         self.lazy = True

      def _handle_raw_data_format(self, raw_data: input_schema.RawData):
-         """Create a FlowDataEngine from a RawData object."""
+         """(Internal) Initializes the engine from a `RawData` schema object.
+
+         This method uses the schema provided in the `RawData` object to correctly
+         infer data types when creating the Polars DataFrame.
+
+         Args:
+             raw_data: An instance of `RawData` containing the data and schema.
+         """
          flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
          polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
                                     for flowfile_column in flowfile_schema])
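The `_handle_raw_data_format` handler above builds an explicit `pl.Schema` so dtypes come from the stored column metadata rather than from Polars' inference. A minimal illustration of that pattern with plain Polars, using made-up column names and types:

import polars as pl

# Schema as (name, dtype) pairs, mirroring how the handler builds it from FlowfileColumn info.
schema = pl.Schema([("id", pl.Int64), ("amount", pl.Float64), ("label", pl.Utf8)])

# Passing the schema pins the dtypes instead of letting Polars infer them from the values.
df = pl.DataFrame({"id": [1, 2], "amount": [1, 2], "label": ["a", "b"]}, schema=schema)
print(df.schema)  # id: Int64, amount: Float64, label: String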
@@ -209,7 +320,7 @@ class FlowDataEngine:
          self.lazy = True

      def _handle_list_input(self, data: List):
-         """Handle list input."""
+         """(Internal) Initializes the engine from a list of records."""
          number_of_records = len(data)
          if number_of_records > 0:
              processed_data = self._process_list_data(data)
@@ -222,20 +333,411 @@ class FlowDataEngine:

      @staticmethod
      def _process_list_data(data: List) -> List[Dict]:
-         """Process list data into a format suitable for DataFrame creation."""
+         """(Internal) Normalizes list data into a list of dictionaries.
+
+         Ensures that a list of objects or non-dict items is converted into a
+         uniform list of dictionaries suitable for Polars DataFrame creation.
+         """
          if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
              try:
                  return pl.DataFrame(data).to_dicts()
-             except:
+             except TypeError:
                  raise Exception('Value must be able to be converted to dictionary')
+             except Exception as e:
+                 raise Exception(f'Value must be able to be converted to dictionary: {e}')

          if not isinstance(data[0], dict):
              data = [row.__dict__ for row in data]

-         return utils.ensure_similarity_dicts(data)
+         return ensure_similarity_dicts(data)
+
+     def to_cloud_storage_obj(self, settings: cloud_storage_schemas.CloudStorageWriteSettingsInternal):
+         """Writes the DataFrame to an object in cloud storage.
+
+         This method supports writing to various cloud storage providers like AWS S3,
+         Azure Data Lake Storage, and Google Cloud Storage.
+
+         Args:
+             settings: A `CloudStorageWriteSettingsInternal` object containing connection
+                 details, file format, and write options.
+
+         Raises:
+             ValueError: If the specified file format is not supported for writing.
+             NotImplementedError: If the 'append' write mode is used with an unsupported format.
+             Exception: If the write operation to cloud storage fails for any reason.
+         """
+         connection = settings.connection
+         write_settings = settings.write_settings
+
+         logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")
+
+         if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
+             raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
+         storage_options = CloudStorageReader.get_storage_options(connection)
+         credential_provider = CloudStorageReader.get_credential_provider(connection)
+         # Dispatch to the correct writer based on file format
+         if write_settings.file_format == "parquet":
+             self._write_parquet_to_cloud(
+                 write_settings.resource_path,
+                 storage_options,
+                 credential_provider,
+                 write_settings
+             )
+         elif write_settings.file_format == "delta":
+             self._write_delta_to_cloud(
+                 write_settings.resource_path,
+                 storage_options,
+                 credential_provider,
+                 write_settings
+             )
+         elif write_settings.file_format == "csv":
+             self._write_csv_to_cloud(
+                 write_settings.resource_path,
+                 storage_options,
+                 credential_provider,
+                 write_settings
+             )
+         elif write_settings.file_format == "json":
+             self._write_json_to_cloud(
+                 write_settings.resource_path,
+                 storage_options,
+                 credential_provider,
+                 write_settings
+             )
+         else:
+             raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")
+
+         logger.info(f"Successfully wrote data to {write_settings.resource_path}")
+
+     def _write_parquet_to_cloud(self,
+                                 resource_path: str,
+                                 storage_options: Dict[str, Any],
+                                 credential_provider: Optional[Callable],
+                                 write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+         """(Internal) Writes the DataFrame to a Parquet file in cloud storage.
+
+         Uses `sink_parquet` for efficient streaming writes. Falls back to a
+         collect-then-write pattern if sinking fails.
+         """
+         try:
+             sink_kwargs = {
+                 "path": resource_path,
+                 "compression": write_settings.parquet_compression,
+             }
+             if storage_options:
+                 sink_kwargs["storage_options"] = storage_options
+             if credential_provider:
+                 sink_kwargs["credential_provider"] = credential_provider
+             try:
+                 self.data_frame.sink_parquet(**sink_kwargs)
+             except Exception as e:
+                 logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
+                 pl_df = self.collect()
+                 sink_kwargs['file'] = sink_kwargs.pop("path")
+                 pl_df.write_parquet(**sink_kwargs)

+         except Exception as e:
+             logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
+             raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")
+
+     def _write_delta_to_cloud(self,
+                               resource_path: str,
+                               storage_options: Dict[str, Any],
+                               credential_provider: Optional[Callable],
+                               write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+         """(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.
+
+         This operation requires collecting the data first, as `write_delta` operates
+         on an eager DataFrame.
+         """
+         sink_kwargs = {
+             "target": resource_path,
+             "mode": write_settings.write_mode,
+         }
+         if storage_options:
+             sink_kwargs["storage_options"] = storage_options
+         if credential_provider:
+             sink_kwargs["credential_provider"] = credential_provider
+         self.collect().write_delta(**sink_kwargs)
+
+     def _write_csv_to_cloud(self,
+                             resource_path: str,
+                             storage_options: Dict[str, Any],
+                             credential_provider: Optional[Callable],
+                             write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+         """(Internal) Writes the DataFrame to a CSV file in cloud storage.
+
+         Uses `sink_csv` for efficient, streaming writes of the data.
+         """
+         try:
+             sink_kwargs = {
+                 "path": resource_path,
+                 "separator": write_settings.csv_delimiter,
+             }
+             if storage_options:
+                 sink_kwargs["storage_options"] = storage_options
+             if credential_provider:
+                 sink_kwargs["credential_provider"] = credential_provider
+
+             # sink_csv executes the lazy query and writes the result
+             self.data_frame.sink_csv(**sink_kwargs)
+
+         except Exception as e:
+             logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
+             raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")
+
+     def _write_json_to_cloud(self,
+                              resource_path: str,
+                              storage_options: Dict[str, Any],
+                              credential_provider: Optional[Callable],
+                              write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+         """(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.
+
+         Uses `sink_ndjson` for efficient, streaming writes.
+         """
+         try:
+             sink_kwargs = {"path": resource_path}
+             if storage_options:
+                 sink_kwargs["storage_options"] = storage_options
+             if credential_provider:
+                 sink_kwargs["credential_provider"] = credential_provider
+             self.data_frame.sink_ndjson(**sink_kwargs)
+
+         except Exception as e:
+             logger.error(f"Failed to write JSON to {resource_path}: {str(e)}")
+             raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")
+
+     @classmethod
+     def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal) -> "FlowDataEngine":
+         """Creates a FlowDataEngine from an object in cloud storage.
+
+         This method supports reading from various cloud storage providers like AWS S3,
+         Azure Data Lake Storage, and Google Cloud Storage, with support for
+         various authentication methods.
+
+         Args:
+             settings: A `CloudStorageReadSettingsInternal` object containing connection
+                 details, file format, and read options.
+
+         Returns:
+             A new `FlowDataEngine` instance containing the data from cloud storage.
+
+         Raises:
+             ValueError: If the storage type or file format is not supported.
+             NotImplementedError: If a requested file format like "delta" or "iceberg"
+                 is not yet implemented.
+             Exception: If reading from cloud storage fails.
+         """
+         connection = settings.connection
+         read_settings = settings.read_settings
+
+         logger.info(f"Reading from {connection.storage_type} storage: {read_settings.resource_path}")
+         # Get storage options based on connection type
+         storage_options = CloudStorageReader.get_storage_options(connection)
+         # Get credential provider if needed
+         credential_provider = CloudStorageReader.get_credential_provider(connection)
+         if read_settings.file_format == "parquet":
+             return cls._read_parquet_from_cloud(
+                 read_settings.resource_path,
+                 storage_options,
+                 credential_provider,
+                 read_settings.scan_mode == "directory",
+             )
+         elif read_settings.file_format == "delta":
+             return cls._read_delta_from_cloud(
+                 read_settings.resource_path,
+                 storage_options,
+                 credential_provider,
+                 read_settings
+             )
+         elif read_settings.file_format == "csv":
+             return cls._read_csv_from_cloud(
+                 read_settings.resource_path,
+                 storage_options,
+                 credential_provider,
+                 read_settings
+             )
+         elif read_settings.file_format == "json":
+             return cls._read_json_from_cloud(
+                 read_settings.resource_path,
+                 storage_options,
+                 credential_provider,
+                 read_settings.scan_mode == "directory"
+             )
+         elif read_settings.file_format == "iceberg":
+             return cls._read_iceberg_from_cloud(
+                 read_settings.resource_path,
+                 storage_options,
+                 credential_provider,
+                 read_settings
+             )
+
+         elif read_settings.file_format in ["delta", "iceberg"]:
+             # These would require additional libraries
+             raise NotImplementedError(f"File format {read_settings.file_format} not yet implemented")
+         else:
+             raise ValueError(f"Unsupported file format: {read_settings.file_format}")
+
+     @staticmethod
+     def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any],
+                                            file_format: Literal["csv", "parquet", "json", "delta"]) -> List[FlowfileColumn] | None:
+         """Infers the schema by scanning the first file in a cloud directory."""
+         try:
+             scan_func = getattr(pl, "scan_" + file_format)
+             first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
+             return convert_stats_to_column_info(FlowDataEngine._create_schema_stats_from_pl_schema(
+                 scan_func(first_file_ref, storage_options=storage_options).collect_schema()))
+         except Exception as e:
+             logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")
+
+
+     @classmethod
+     def _read_iceberg_from_cloud(cls,
+                                  resource_path: str,
+                                  storage_options: Dict[str, Any],
+                                  credential_provider: Optional[Callable],
+                                  read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+         """Reads Iceberg table(s) from cloud storage."""
+         raise NotImplementedError(f"Failed to read Iceberg table from cloud storage: Not yet implemented")
+
+     @classmethod
+     def _read_parquet_from_cloud(cls,
+                                  resource_path: str,
+                                  storage_options: Dict[str, Any],
+                                  credential_provider: Optional[Callable],
+                                  is_directory: bool) -> "FlowDataEngine":
+         """Reads Parquet file(s) from cloud storage."""
+         try:
+             # Use scan_parquet for lazy evaluation
+             if is_directory:
+                 resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="parquet")
+             scan_kwargs = {"source": resource_path}
+
+             if storage_options:
+                 scan_kwargs["storage_options"] = storage_options
+
+             if credential_provider:
+                 scan_kwargs["credential_provider"] = credential_provider
+             if storage_options and is_directory:
+                 schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options, "parquet")
+             else:
+                 schema = None
+             lf = pl.scan_parquet(**scan_kwargs)
+
+             return cls(
+                 lf,
+                 number_of_records=6_666_666,  # Set so the provider is not accessed for this stat
+                 optimize_memory=True,
+                 streamable=True,
+                 schema=schema
+             )
+
+         except Exception as e:
+             logger.error(f"Failed to read Parquet from {resource_path}: {str(e)}")
+             raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")
+
+     @classmethod
+     def _read_delta_from_cloud(cls,
+                                resource_path: str,
+                                storage_options: Dict[str, Any],
+                                credential_provider: Optional[Callable],
+                                read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+         """Reads a Delta Lake table from cloud storage."""
+         try:
+             logger.info("Reading Delta file from cloud storage...")
+             logger.info(f"read_settings: {read_settings}")
+             scan_kwargs = {"source": resource_path}
+             if read_settings.delta_version:
+                 scan_kwargs['version'] = read_settings.delta_version
+             if storage_options:
+                 scan_kwargs["storage_options"] = storage_options
+             if credential_provider:
+                 scan_kwargs["credential_provider"] = credential_provider
+             lf = pl.scan_delta(**scan_kwargs)
+
+             return cls(
+                 lf,
+                 number_of_records=6_666_666,  # Set so the provider is not accessed for this stat
+                 optimize_memory=True,
+                 streamable=True
+             )
+         except Exception as e:
+             logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
+             raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")
+
+     @classmethod
+     def _read_csv_from_cloud(cls,
+                              resource_path: str,
+                              storage_options: Dict[str, Any],
+                              credential_provider: Optional[Callable],
+                              read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+         """Reads CSV file(s) from cloud storage."""
+         try:
+             scan_kwargs = {
+                 "source": resource_path,
+                 "has_header": read_settings.csv_has_header,
+                 "separator": read_settings.csv_delimiter,
+                 "encoding": read_settings.csv_encoding,
+             }
+             if storage_options:
+                 scan_kwargs["storage_options"] = storage_options
+             if credential_provider:
+                 scan_kwargs["credential_provider"] = credential_provider
+
+             if read_settings.scan_mode == "directory":
+                 resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="csv")
+                 scan_kwargs["source"] = resource_path
+             if storage_options and read_settings.scan_mode == "directory":
+                 schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options, "csv")
+             else:
+                 schema = None
+
+             lf = pl.scan_csv(**scan_kwargs)
+
+             return cls(
+                 lf,
+                 number_of_records=6_666_666,  # Will be calculated lazily
+                 optimize_memory=True,
+                 streamable=True,
+                 schema=schema
+             )
+
+         except Exception as e:
+             logger.error(f"Failed to read CSV from {resource_path}: {str(e)}")
+             raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")
+
+     @classmethod
+     def _read_json_from_cloud(cls,
+                               resource_path: str,
+                               storage_options: Dict[str, Any],
+                               credential_provider: Optional[Callable],
+                               is_directory: bool) -> "FlowDataEngine":
+         """Reads JSON file(s) from cloud storage."""
+         try:
+             if is_directory:
+                 resource_path = ensure_path_has_wildcard_pattern(resource_path, "json")
+             scan_kwargs = {"source": resource_path}
+
+             if storage_options:
+                 scan_kwargs["storage_options"] = storage_options
+             if credential_provider:
+                 scan_kwargs["credential_provider"] = credential_provider
+
+             lf = pl.scan_ndjson(**scan_kwargs)  # Using NDJSON for line-delimited JSON
+
+             return cls(
+                 lf,
+                 number_of_records=-1,
+                 optimize_memory=True,
+                 streamable=True,
+             )
+
+         except Exception as e:
+             logger.error(f"Failed to read JSON from {resource_path}: {str(e)}")
+             raise Exception(f"Failed to read JSON from cloud storage: {str(e)}")

      def _handle_path_ref(self, path_ref: str, optimize_memory: bool):
-         """Handle file path reference input."""
+         """Handles file path reference input."""
          try:
              pf = ParquetFile(path_ref)
          except Exception as e:
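The cloud readers and writers added above are thin wrappers around Polars' own object-store support (`scan_*` plus `sink_*` with `storage_options`, and optionally a `credential_provider`). A hedged, standalone sketch of that underlying pattern; the bucket, key, column name, and credentials below are placeholders:

import polars as pl

# Placeholder S3 credentials and region -- replace with real values or a credential provider.
storage_options = {
    "aws_access_key_id": "YOUR_KEY",
    "aws_secret_access_key": "YOUR_SECRET",
    "aws_region": "eu-west-1",
}

# Lazily scan Parquet objects from S3; nothing is downloaded until collect/sink.
lf = pl.scan_parquet("s3://my-bucket/input/*.parquet", storage_options=storage_options)

# Stream the (possibly transformed) result back to S3 without materialising it in memory.
lf.filter(pl.col("amount") > 0).sink_parquet(
    "s3://my-bucket/output/result.parquet",
    storage_options=storage_options,
)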
@@ -251,21 +753,32 @@ class FlowDataEngine:

      def _finalize_initialization(self, name: str, optimize_memory: bool, schema: Optional[Any],
                                   calculate_schema_stats: bool):
-         """Finalize initialization by setting remaining attributes."""
+         """Finalizes initialization by setting remaining attributes."""
          _ = calculate_schema_stats
          self.name = name
          self._optimize_memory = optimize_memory
-         pl_schema = self.data_frame.collect_schema()
-         self._schema = self._handle_schema(schema, pl_schema)
-         self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()
+         if assert_if_flowfile_schema(schema):
+             self._schema = schema
+             self.columns = [c.column_name for c in self._schema]
+         else:
+             pl_schema = self.data_frame.collect_schema()
+             self._schema = self._handle_schema(schema, pl_schema)
+             self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()

      def __getitem__(self, item):
-         """Access a specific column or item from the DataFrame."""
+         """Accesses a specific column or item from the DataFrame."""
          return self.data_frame.select([item])

      @property
-     def data_frame(self) -> pl.LazyFrame | pl.DataFrame:
-         """Get the underlying DataFrame with appropriate handling of different states."""
+     def data_frame(self) -> pl.LazyFrame | pl.DataFrame | None:
+         """The underlying Polars DataFrame or LazyFrame.
+
+         This property provides access to the Polars object that backs the
+         FlowDataEngine. It handles lazy-loading from external sources if necessary.
+
+         Returns:
+             The active Polars `DataFrame` or `LazyFrame`.
+         """
          if self._data_frame is not None and not self.is_future:
              return self._data_frame
          elif self.is_future:
@@ -284,14 +797,32 @@ class FlowDataEngine:

      @data_frame.setter
      def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
-         """Set the underlying DataFrame with validation."""
+         """Sets the underlying Polars DataFrame or LazyFrame."""
          if self.lazy and isinstance(df, pl.DataFrame):
              raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
          self._data_frame = df

+     @staticmethod
+     def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> List[Dict]:
+         """Converts a Polars Schema into a list of schema statistics dictionaries."""
+         return [
+             dict(column_name=k, pl_datatype=v, col_index=i)
+             for i, (k, v) in enumerate(pl_schema.items())
+         ]
+
+     def _add_schema_from_schema_stats(self, schema_stats: List[Dict]):
+         """Populates the schema from a list of schema statistics dictionaries."""
+         self._schema = convert_stats_to_column_info(schema_stats)
+
      @property
      def schema(self) -> List[FlowfileColumn]:
-         """Get the schema of the DataFrame, calculating if necessary."""
+         """The schema of the DataFrame as a list of `FlowfileColumn` objects.
+
+         This property lazily calculates the schema if it hasn't been determined yet.
+
+         Returns:
+             A list of `FlowfileColumn` objects describing the schema.
+         """
          if self.number_of_fields == 0:
              return []
          if self._schema is None or (self._calculate_schema_stats and not self.ind_schema_calculated):
@@ -299,26 +830,34 @@ class FlowDataEngine:
              schema_stats = self._calculate_schema()
              self.ind_schema_calculated = True
          else:
-             schema_stats = [
-                 dict(column_name=k, pl_datatype=v, col_index=i)
-                 for i, (k, v) in enumerate(self.data_frame.collect_schema().items())
-             ]
-         self._schema = convert_stats_to_column_info(schema_stats)
+             schema_stats = self._create_schema_stats_from_pl_schema(self.data_frame.collect_schema())
+         self._add_schema_from_schema_stats(schema_stats)
          return self._schema

      @property
      def number_of_fields(self) -> int:
-         """Get the number of fields in the DataFrame."""
+         """The number of columns (fields) in the DataFrame.
+
+         Returns:
+             The integer count of columns.
+         """
          if self.__number_of_fields is None:
              self.__number_of_fields = len(self.columns)
          return self.__number_of_fields

-     # Data Collection and Sampling Methods
-
      def collect(self, n_records: int = None) -> pl.DataFrame:
-         """
-         Collect data from the DataFrame, optionally limiting the number of records.
-         Handles streaming and error cases appropriately.
+         """Collects the data and returns it as a Polars DataFrame.
+
+         This method triggers the execution of the lazy query plan (if applicable)
+         and returns the result. It supports streaming to optimize memory usage
+         for large datasets.
+
+         Args:
+             n_records: The maximum number of records to collect. If None, all
+                 records are collected.
+
+         Returns:
+             A Polars `DataFrame` containing the collected data.
          """
          if n_records is None:
              logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
@@ -336,8 +875,9 @@ class FlowDataEngine:
              return self._handle_collection_error(n_records)

      def _collect_data(self, n_records: int = None) -> pl.DataFrame:
-         """Internal method to handle data collection."""
+         """Internal method to handle data collection logic."""
          if n_records is None:
+
              self.collect_external()
              if self._streamable:
                  try:
@@ -353,11 +893,11 @@ class FlowDataEngine:
              return self._collect_from_external_source(n_records)

          if self._streamable:
-             return self.data_frame.head(n_records).collect(engine="streaming", comm_subplan_elim=False)
+             return self.data_frame.head(n_records).collect(engine="streaming")
          return self.data_frame.head(n_records).collect()

      def _collect_from_external_source(self, n_records: int) -> pl.DataFrame:
-         """Handle collection from external source."""
+         """Handles collection from an external source."""
          if self.external_source.get_pl_df() is not None:
              all_data = self.external_source.get_pl_df().head(n_records)
              self.data_frame = all_data
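The change in `_collect_data` above drops the `comm_subplan_elim` flag and keeps only `engine="streaming"`. A small, self-contained illustration of streaming collection on a lazy frame, using synthetic data:

import polars as pl

lf = pl.LazyFrame({"x": range(1_000_000)})

# Streaming execution processes the query in batches instead of materialising
# the whole intermediate result at once.
head = lf.filter(pl.col("x") % 2 == 0).head(10).collect(engine="streaming")
print(head.shape)  # (10, 1)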
@@ -367,7 +907,7 @@ class FlowDataEngine:
          return self.data_frame

      def _handle_collection_error(self, n_records: int) -> pl.DataFrame:
-         """Handle errors during collection by attempting partial collection."""
+         """Handles errors during collection by attempting partial collection."""
          n_records = 100000000 if n_records is None else n_records
          ok_cols, error_cols = self._identify_valid_columns(n_records)

@@ -376,7 +916,7 @@ class FlowDataEngine:
          return self._create_empty_dataframe(n_records)

      def _identify_valid_columns(self, n_records: int) -> Tuple[List[str], List[Tuple[str, Any]]]:
-         """Identify which columns can be collected successfully."""
+         """Identifies which columns can be collected successfully."""
          ok_cols = []
          error_cols = []
          for c in self.columns:
@@ -389,7 +929,7 @@ class FlowDataEngine:

      def _create_partial_dataframe(self, ok_cols: List[str], error_cols: List[Tuple[str, Any]],
                                    n_records: int) -> pl.DataFrame:
-         """Create a DataFrame with partial data for columns that could be collected."""
+         """Creates a DataFrame with partial data for columns that could be collected."""
          df = self.data_frame.select(ok_cols)
          df = df.with_columns([
              pl.lit(None).alias(column_name).cast(data_type)
@@ -398,7 +938,7 @@ class FlowDataEngine:
          return df.select(self.columns).head(n_records).collect()

      def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
-         """Create an empty DataFrame with the correct schema."""
+         """Creates an empty DataFrame with the correct schema."""
          if self.number_of_records > 0:
              return pl.DataFrame({
                  column_name: pl.Series(
@@ -409,11 +949,19 @@ class FlowDataEngine:
              })
          return pl.DataFrame(schema=self.data_frame.schema)

-     # Data Transformation Methods
-
      def do_group_by(self, group_by_input: transform_schemas.GroupByInput,
                      calculate_schema_stats: bool = True) -> "FlowDataEngine":
-         """Perform group by operations on the DataFrame."""
+         """Performs a group-by operation on the DataFrame.
+
+         Args:
+             group_by_input: A `GroupByInput` object defining the grouping columns
+                 and aggregations.
+             calculate_schema_stats: If True, calculates schema statistics for the
+                 resulting DataFrame.
+
+         Returns:
+             A new `FlowDataEngine` instance with the grouped and aggregated data.
+         """
          aggregations = [c for c in group_by_input.agg_cols if c.agg != 'groupby']
          group_columns = [c for c in group_by_input.agg_cols if c.agg == 'groupby']

@@ -435,7 +983,15 @@ class FlowDataEngine:
          )

      def do_sort(self, sorts: List[transform_schemas.SortByInput]) -> "FlowDataEngine":
-         """Sort the DataFrame based on specified columns and directions."""
+         """Sorts the DataFrame by one or more columns.
+
+         Args:
+             sorts: A list of `SortByInput` objects, each specifying a column
+                 and sort direction ('asc' or 'desc').
+
+         Returns:
+             A new `FlowDataEngine` instance with the sorted data.
+         """
          if not sorts:
              return self

@@ -445,7 +1001,16 @@ class FlowDataEngine:

      def change_column_types(self, transforms: List[transform_schemas.SelectInput],
                              calculate_schema: bool = False) -> "FlowDataEngine":
-         """Change the data types of specified columns."""
+         """Changes the data type of one or more columns.
+
+         Args:
+             transforms: A list of `SelectInput` objects, where each object specifies
+                 the column and its new `polars_type`.
+             calculate_schema: If True, recalculates the schema after the type change.
+
+         Returns:
+             A new `FlowDataEngine` instance with the updated column types.
+         """
          dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
          idx_mapping = list(
              (transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
@@ -466,26 +1031,79 @@ class FlowDataEngine:
              streamable=self._streamable
          )

-     # Data Export and Conversion Methods
-
      def save(self, path: str, data_type: str = 'parquet') -> Future:
-         """Save the DataFrame to a file."""
+         """Saves the DataFrame to a file in a separate thread.
+
+         Args:
+             path: The file path to save to.
+             data_type: The format to save in (e.g., 'parquet', 'csv').
+
+         Returns:
+             A `loky.Future` object representing the asynchronous save operation.
+         """
          estimated_size = deepcopy(self.get_estimated_file_size() * 4)
          df = deepcopy(self.data_frame)
          return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)

      def to_pylist(self) -> List[Dict]:
-         """Convert the DataFrame to a list of dictionaries."""
+         """Converts the DataFrame to a list of Python dictionaries.
+
+         Returns:
+             A list where each item is a dictionary representing a row.
+         """
          if self.lazy:
              return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
          return self.data_frame.to_dicts()

+     def to_arrow(self) -> PaTable:
+         """Converts the DataFrame to a PyArrow Table.
+
+         This method triggers a `.collect()` call if the data is lazy,
+         then converts the resulting eager DataFrame into a `pyarrow.Table`.
+
+         Returns:
+             A `pyarrow.Table` instance representing the data.
+         """
+         if self.lazy:
+             return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_arrow()
+         else:
+             return self.data_frame.to_arrow()
+
+     def to_raw_data(self) -> input_schema.RawData:
+         """Converts the DataFrame to a `RawData` schema object.
+
+         Returns:
+             An `input_schema.RawData` object containing the schema and data.
+         """
+         columns = [c.get_minimal_field_info() for c in self.schema]
+         data = list(self.to_dict().values())
+         return input_schema.RawData(columns=columns, data=data)
+
      def to_dict(self) -> Dict[str, List]:
-         return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
+         """Converts the DataFrame to a Python dictionary of columns.
+
+         Each key in the dictionary is a column name, and the corresponding value
+         is a list of the data in that column.
+
+         Returns:
+             A dictionary mapping column names to lists of their values.
+         """
+         if self.lazy:
+             return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
+         else:
+             return self.data_frame.to_dict(as_series=False)

      @classmethod
      def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
-         """Create a FlowDataEngine from an external data source."""
+         """Creates a FlowDataEngine from an external data source.
+
+         Args:
+             external_source: An object that conforms to the `ExternalDataSource`
+                 interface.
+
+         Returns:
+             A new `FlowDataEngine` instance.
+         """
          if external_source.schema is not None:
              ff = cls.create_from_schema(external_source.schema)
          elif external_source.initial_data_getter is not None:
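The export helpers above branch on `self.lazy` because only an eager DataFrame exposes `.to_arrow()`, `.to_dicts()`, and `.to_dict()`; a LazyFrame must be collected first. A minimal standalone illustration of that distinction, on synthetic data:

import polars as pl

lf = pl.LazyFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# LazyFrame has no to_arrow()/to_dict(); collect to an eager DataFrame first.
df = lf.collect(engine="streaming")

arrow_table = df.to_arrow()             # pyarrow.Table
columnar = df.to_dict(as_series=False)  # {'id': [1, 2, 3], 'name': ['a', 'b', 'c']}
rows = df.to_dicts()                    # [{'id': 1, 'name': 'a'}, ...]
print(type(arrow_table), columnar, rows[0])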
@@ -497,12 +1115,27 @@ class FlowDataEngine:
497
1115
 
498
1116
  @classmethod
499
1117
  def create_from_sql(cls, sql: str, conn: Any) -> "FlowDataEngine":
500
- """Create a FlowDataEngine from a SQL query."""
1118
+ """Creates a FlowDataEngine by executing a SQL query.
1119
+
1120
+ Args:
1121
+ sql: The SQL query string to execute.
1122
+ conn: A database connection object or connection URI string.
1123
+
1124
+ Returns:
1125
+ A new `FlowDataEngine` instance with the query result.
1126
+ """
501
1127
  return cls(pl.read_sql(sql, conn))
502
1128
 
503
1129
  @classmethod
504
1130
  def create_from_schema(cls, schema: List[FlowfileColumn]) -> "FlowDataEngine":
505
- """Create a FlowDataEngine from a schema definition."""
1131
+ """Creates an empty FlowDataEngine from a schema definition.
1132
+
1133
+ Args:
1134
+ schema: A list of `FlowfileColumn` objects defining the schema.
1135
+
1136
+ Returns:
1137
+ A new, empty `FlowDataEngine` instance with the specified schema.
1138
+ """
506
1139
  pl_schema = []
507
1140
  for i, flow_file_column in enumerate(schema):
508
1141
  pl_schema.append((flow_file_column.name, cast_str_to_polars_type(flow_file_column.data_type)))
@@ -512,9 +1145,18 @@ class FlowDataEngine:
512
1145
 
513
1146
  @classmethod
514
1147
  def create_from_path(cls, received_table: input_schema.ReceivedTableBase) -> "FlowDataEngine":
515
- """Create a FlowDataEngine from a file path."""
516
- received_table.set_absolute_filepath()
1148
+ """Creates a FlowDataEngine from a local file path.
1149
+
1150
+ Supports various file types like CSV, Parquet, and Excel.
1151
+
1152
+ Args:
1153
+ received_table: A `ReceivedTableBase` object containing the file path
1154
+ and format details.
517
1155
 
1156
+ Returns:
1157
+ A new `FlowDataEngine` instance with data from the file.
1158
+ """
1159
+ received_table.set_absolute_filepath()
518
1160
  file_type_handlers = {
519
1161
  'csv': create_funcs.create_from_path_csv,
520
1162
  'parquet': create_funcs.create_from_path_parquet,
@@ -531,38 +1173,56 @@ class FlowDataEngine:
531
1173
 
532
1174
  @classmethod
533
1175
  def create_random(cls, number_of_records: int = 1000) -> "FlowDataEngine":
534
- """Create a FlowDataEngine with random data."""
1176
+ """Creates a FlowDataEngine with randomly generated data.
1177
+
1178
+ Useful for testing and examples.
1179
+
1180
+ Args:
1181
+ number_of_records: The number of random records to generate.
1182
+
1183
+ Returns:
1184
+ A new `FlowDataEngine` instance with fake data.
1185
+ """
535
1186
  return cls(create_fake_data(number_of_records))
536
1187
 
537
1188
  @classmethod
538
1189
  def generate_enumerator(cls, length: int = 1000, output_name: str = 'output_column') -> "FlowDataEngine":
539
- """Generate a sequence of numbers as a FlowDataEngine."""
1190
+ """Generates a FlowDataEngine with a single column containing a sequence of integers.
1191
+
1192
+ Args:
1193
+ length: The number of integers to generate in the sequence.
1194
+ output_name: The name of the output column.
1195
+
1196
+ Returns:
1197
+ A new `FlowDataEngine` instance.
1198
+ """
540
1199
  if length > 10_000_000:
541
1200
  length = 10_000_000
542
1201
  return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
543
1202
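Quick sketch of the two factory helpers above, handy for building small test frames (signatures as documented in this diff):

    fake = FlowDataEngine.create_random(100)            # 100 rows of generated fake data
    seq = FlowDataEngine.generate_enumerator(5, "idx")  # single UInt32 column idx: 0..4
    # generate_enumerator wraps the plain-Polars expression shown above:
    # pl.LazyFrame().select(pl.int_range(0, 5, dtype=pl.UInt32).alias("idx"))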
 
544
- # Schema Handling Methods
545
-
546
- def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema,
1203
+ def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema | None,
547
1204
  pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
548
- """Handle schema processing and validation."""
549
- if schema is None:
1205
+ """Handles schema processing and validation during initialization."""
1206
+ if schema is None and pl_schema is not None:
1207
+ return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
1208
+ elif schema is None and pl_schema is None:
550
1209
  return None
551
-
552
- if schema.__len__() != pl_schema.__len__():
553
- raise Exception(
554
- f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
555
-
556
- if isinstance(schema, pl.Schema):
557
- return self._handle_polars_schema(schema, pl_schema)
558
- elif isinstance(schema, list) and len(schema) == 0:
559
- return []
560
- elif isinstance(schema[0], str):
561
- return self._handle_string_schema(schema, pl_schema)
562
- return schema
1210
+ elif assert_if_flowfile_schema(schema) and pl_schema is None:
1211
+ return schema
1212
+ elif pl_schema is not None and schema is not None:
1213
+ if len(schema) != len(pl_schema):
1214
+ raise Exception(
1215
+ f'Schema does not match the data: got {len(schema)} columns, expected {len(pl_schema)}')
1216
+ if isinstance(schema, pl.Schema):
1217
+ return self._handle_polars_schema(schema, pl_schema)
1218
+ elif isinstance(schema, list) and len(schema) == 0:
1219
+ return []
1220
+ elif isinstance(schema[0], str):
1221
+ return self._handle_string_schema(schema, pl_schema)
1222
+ return schema
563
1223
 
564
1224
  def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
565
- """Handle Polars schema conversion."""
1225
+ """Handles Polars schema conversion."""
566
1226
  flow_file_columns = [
567
1227
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
568
1228
  for col_name, dtype in zip(schema.names(), schema.dtypes())
@@ -577,7 +1237,7 @@ class FlowDataEngine:
577
1237
  return flow_file_columns
578
1238
 
579
1239
  def _handle_string_schema(self, schema: List[str], pl_schema: pl.Schema) -> List[FlowfileColumn]:
580
- """Handle string-based schema conversion."""
1240
+ """Handles string-based schema conversion."""
581
1241
  flow_file_columns = [
582
1242
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
583
1243
  for col_name, dtype in zip(schema, pl_schema.dtypes())
@@ -589,10 +1249,19 @@ class FlowDataEngine:
589
1249
 
590
1250
  return flow_file_columns
591
1251
 
592
- # Data Manipulation Methods
593
-
594
1252
  def split(self, split_input: transform_schemas.TextToRowsInput) -> "FlowDataEngine":
595
- """Split a column into multiple rows based on a delimiter."""
1253
+ """Splits a column's text values into multiple rows based on a delimiter.
1254
+
1255
+ This operation is often referred to as "exploding" the DataFrame, as it
1256
+ increases the number of rows.
1257
+
1258
+ Args:
1259
+ split_input: A `TextToRowsInput` object specifying the column to split,
1260
+ the delimiter, and the output column name.
1261
+
1262
+ Returns:
1263
+ A new `FlowDataEngine` instance with the exploded rows.
1264
+ """
596
1265
  output_column_name = (
597
1266
  split_input.output_column_name
598
1267
  if split_input.output_column_name
@@ -617,7 +1286,18 @@ class FlowDataEngine:
617
1286
  return FlowDataEngine(df)
618
1287
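The "explode" behaviour described in the split() docstring corresponds to this plain-Polars pattern (hypothetical column names):

    import polars as pl

    lf = pl.LazyFrame({"id": [1, 2], "tags": ["a,b", "c"]})
    # Split the string column on the delimiter, then emit one row per list element.
    lf.with_columns(pl.col("tags").str.split(",")).explode("tags").collect()
    # -> rows: (1, "a"), (1, "b"), (2, "c")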
 
619
1288
  def unpivot(self, unpivot_input: transform_schemas.UnpivotInput) -> "FlowDataEngine":
620
- """Convert data from wide to long format."""
1289
+ """Converts the DataFrame from a wide to a long format.
1290
+
1291
+ This is the inverse of a pivot operation, taking columns and transforming
1292
+ them into `variable` and `value` rows.
1293
+
1294
+ Args:
1295
+ unpivot_input: An `UnpivotInput` object specifying which columns to
1296
+ unpivot and which to keep as index columns.
1297
+
1298
+ Returns:
1299
+ A new, unpivoted `FlowDataEngine` instance.
1300
+ """
621
1301
  lf = self.data_frame
622
1302
 
623
1303
  if unpivot_input.data_type_selector_expr is not None:
@@ -636,7 +1316,17 @@ class FlowDataEngine:
636
1316
  return FlowDataEngine(result)
637
1317
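The wide-to-long transformation described above maps onto Polars' unpivot; a sketch with hypothetical columns, assuming a recent Polars version where melt is exposed as unpivot:

    import polars as pl

    lf = pl.LazyFrame({"id": [1, 2], "q1": [10, 20], "q2": [30, 40]})
    # Keep "id" as the index; q1/q2 become rows in the variable/value columns.
    lf.unpivot(index="id", on=["q1", "q2"]).collect()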
 
638
1318
  def do_pivot(self, pivot_input: transform_schemas.PivotInput, node_logger: NodeLogger = None) -> "FlowDataEngine":
639
- """Convert data from long to wide format with aggregations."""
1319
+ """Converts the DataFrame from a long to a wide format, aggregating values.
1320
+
1321
+ Args:
1322
+ pivot_input: A `PivotInput` object defining the index, pivot, and value
1323
+ columns, along with the aggregation logic.
1324
+ node_logger: An optional logger for reporting warnings, e.g., if the
1325
+ pivot column has too many unique values.
1326
+
1327
+ Returns:
1328
+ A new, pivoted `FlowDataEngine` instance.
1329
+ """
640
1330
  # Get unique values for pivot columns
641
1331
  max_unique_vals = 200
642
1332
  new_cols_unique = fetch_unique_values(self.data_frame.select(pivot_input.pivot_column)
@@ -696,7 +1386,16 @@ class FlowDataEngine:
696
1386
  return FlowDataEngine(df, calculate_schema_stats=False)
697
1387
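And the inverse, long-to-wide, in plain Polars (pivot is eager, hence the DataFrame; hypothetical data, recent Polars signature assumed):

    import polars as pl

    df = pl.DataFrame({"id": [1, 1, 2], "quarter": ["q1", "q2", "q1"], "value": [10, 20, 30]})
    # One row per id, one column per quarter, values summed when duplicated.
    df.pivot(on="quarter", index="id", values="value", aggregate_function="sum")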
 
698
1388
  def do_filter(self, predicate: str) -> "FlowDataEngine":
699
- """Filter the DataFrame based on a predicate expression."""
1389
+ """Filters rows based on a predicate expression.
1390
+
1391
+ Args:
1392
+ predicate: A string containing a Polars expression that evaluates to
1393
+ a boolean value.
1394
+
1395
+ Returns:
1396
+ A new `FlowDataEngine` instance containing only the rows that match
1397
+ the predicate.
1398
+ """
700
1399
  try:
701
1400
  f = to_expr(predicate)
702
1401
  except Exception as e:
@@ -706,13 +1405,24 @@ class FlowDataEngine:
706
1405
  return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)
707
1406
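The predicate string is turned into a Polars expression by to_expr; conceptually the filter reduces to (hypothetical column name):

    import polars as pl

    lf = pl.LazyFrame({"amount": [5, 150, 80]})
    lf.filter(pl.col("amount") > 100).collect()   # keeps only the 150 row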
 
708
1407
  def add_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
709
- """Add a record ID column with optional grouping."""
1408
+ """Adds a record ID (row number) column to the DataFrame.
1409
+
1410
+ Can generate a simple sequential ID or a grouped ID that resets for
1411
+ each group.
1412
+
1413
+ Args:
1414
+ record_id_settings: A `RecordIdInput` object specifying the output
1415
+ column name, offset, and optional grouping columns.
1416
+
1417
+ Returns:
1418
+ A new `FlowDataEngine` instance with the added record ID column.
1419
+ """
710
1420
  if record_id_settings.group_by and len(record_id_settings.group_by_columns) > 0:
711
1421
  return self._add_grouped_record_id(record_id_settings)
712
1422
  return self._add_simple_record_id(record_id_settings)
713
1423
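Plain-Polars sketch of the two record-id flavours dispatched above (hypothetical column names):

    import polars as pl

    lf = pl.LazyFrame({"group": ["a", "a", "b"], "value": [10, 20, 30]})
    # Simple sequential id, as in _add_simple_record_id:
    lf.with_row_index("record_id", offset=1).collect()
    # One possible formulation of a grouped id that restarts per group:
    lf.with_columns(pl.col("value").cum_count().over("group").alias("record_id")).collect()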
 
714
1424
  def _add_grouped_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
715
- """Add a record ID column with grouping."""
1425
+ """Adds a record ID column with grouping."""
716
1426
  select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]
717
1427
 
718
1428
  df = (
@@ -732,7 +1442,7 @@ class FlowDataEngine:
732
1442
  return FlowDataEngine(df, schema=output_schema)
733
1443
 
734
1444
  def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
735
- """Add a simple sequential record ID column."""
1445
+ """Adds a simple sequential record ID column."""
736
1446
  df = self.data_frame.with_row_index(
737
1447
  record_id_settings.output_column_name,
738
1448
  record_id_settings.offset
@@ -743,38 +1453,52 @@ class FlowDataEngine:
743
1453
 
744
1454
  return FlowDataEngine(df, schema=output_schema)
745
1455
 
746
- # Utility Methods
747
-
748
1456
  def get_schema_column(self, col_name: str) -> FlowfileColumn:
749
- """Get schema information for a specific column."""
1457
+ """Retrieves the schema information for a single column by its name.
1458
+
1459
+ Args:
1460
+ col_name: The name of the column to retrieve.
1461
+
1462
+ Returns:
1463
+ A `FlowfileColumn` object for the specified column, or `None` if not found.
1464
+ """
750
1465
  for s in self.schema:
751
1466
  if s.name == col_name:
752
1467
  return s
753
1468
 
754
1469
  def get_estimated_file_size(self) -> int:
755
- """Get the estimated size of the file in bytes."""
1470
+ """Estimates the file size in bytes if the data originated from a local file.
1471
+
1472
+ This relies on the original path being tracked during file ingestion.
1473
+
1474
+ Returns:
1475
+ The file size in bytes, or 0 if the original path is unknown.
1476
+ """
756
1477
  if self._org_path is not None:
757
1478
  return os.path.getsize(self._org_path)
758
1479
  return 0
759
1480
 
760
1481
  def __repr__(self) -> str:
761
- """Return string representation of the FlowDataEngine."""
762
- return f'flowfile table\n{self.data_frame.__repr__()}'
1482
+ """Returns a string representation of the FlowDataEngine."""
1483
+ return f'flow data engine\n{self.data_frame.__repr__()}'
763
1484
 
764
1485
  def __call__(self) -> "FlowDataEngine":
765
- """Make the class callable, returning self."""
1486
+ """Makes the class instance callable, returning itself."""
766
1487
  return self
767
1488
 
768
1489
  def __len__(self) -> int:
769
- """Get the number of records in the table."""
1490
+ """Returns the number of records in the table."""
770
1491
  return self.number_of_records if self.number_of_records >= 0 else self.get_number_of_records()
771
1492
 
772
1493
  def cache(self) -> "FlowDataEngine":
773
- """
774
- Cache the data in background and update the DataFrame reference.
1494
+ """Caches the current DataFrame to disk and updates the internal reference.
1495
+
1496
+ This triggers a background process to write the current LazyFrame's result
1497
+ to a temporary file. Subsequent operations on this `FlowDataEngine` instance
1498
+ will read from the cached file, which can speed up downstream computations.
775
1499
 
776
1500
  Returns:
777
- FlowDataEngine: Self with cached data
1501
+ The same `FlowDataEngine` instance, now backed by the cached data.
778
1502
  """
779
1503
  edf = ExternalDfFetcher(lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False,
780
1504
  flow_id=-1,
@@ -789,7 +1513,13 @@ class FlowDataEngine:
789
1513
  return self
790
1514
 
791
1515
  def collect_external(self):
792
- """Collect data from external source if present."""
1516
+ """Materializes data from a tracked external source.
1517
+
1518
+ If the `FlowDataEngine` was created from an `ExternalDataSource`, this
1519
+ method will trigger the data retrieval, update the internal `_data_frame`
1520
+ to a `LazyFrame` of the collected data, and reset the schema to be
1521
+ re-evaluated.
1522
+ """
793
1523
  if self._external_source is not None:
794
1524
  logger.info('Collecting external source')
795
1525
  if self.external_source.get_pl_df() is not None:
@@ -798,16 +1528,16 @@ class FlowDataEngine:
798
1528
  self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
799
1529
  self._schema = None # enforce reset schema
800
1530
 
801
- # Data Access Methods
802
1531
  def get_output_sample(self, n_rows: int = 10) -> List[Dict]:
803
- """
804
- Get a sample of the data as a list of dictionaries.
1532
+ """Gets a sample of the data as a list of dictionaries.
1533
+
1534
+ This is typically used to display a preview of the data in a UI.
805
1535
 
806
1536
  Args:
807
- n_rows: Number of rows to sample
1537
+ n_rows: The number of rows to sample.
808
1538
 
809
1539
  Returns:
810
- List[Dict]: Sample data as dictionaries
1540
+ A list of dictionaries, where each dictionary represents a row.
811
1541
  """
812
1542
  if self.number_of_records > n_rows or self.number_of_records < 0:
813
1543
  df = self.collect(n_rows)
@@ -816,6 +1546,7 @@ class FlowDataEngine:
816
1546
  return df.to_dicts()
817
1547
 
818
1548
  def __get_sample__(self, n_rows: int = 100, streamable: bool = True) -> "FlowDataEngine":
1549
+ """Internal method to get a sample of the data."""
819
1550
  if not self.lazy:
820
1551
  df = self.data_frame.lazy()
821
1552
  else:
@@ -833,19 +1564,18 @@ class FlowDataEngine:
833
1564
 
834
1565
  def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
835
1566
  seed: int = None) -> "FlowDataEngine":
836
- """
837
- Get a sample of rows from the DataFrame.
1567
+ """Gets a sample of rows from the DataFrame.
838
1568
 
839
1569
  Args:
840
- n_rows: Number of rows to sample
841
- random: Whether to randomly sample
842
- shuffle: Whether to shuffle the sample
843
- seed: Random seed for reproducibility
1570
+ n_rows: The number of rows to sample.
1571
+ random: If True, performs random sampling. If False, takes the first n_rows.
1572
+ shuffle: If True (and `random` is True), shuffles the data before sampling.
1573
+ seed: A random seed for reproducibility.
844
1574
 
845
1575
  Returns:
846
- FlowDataEngine: New instance with sampled data
1576
+ A new `FlowDataEngine` instance containing the sampled data.
847
1577
  """
848
- n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=True))
1578
+ n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=OFFLOAD_TO_WORKER))
849
1579
  logging.info(f'Getting sample of {n_rows} rows')
850
1580
 
851
1581
  if random:
@@ -869,31 +1599,30 @@ class FlowDataEngine:
869
1599
  return FlowDataEngine(sample_df, schema=self.schema, number_of_records=n_records)
870
1600
 
871
1601
  def get_subset(self, n_rows: int = 100) -> "FlowDataEngine":
872
- """
873
- Get a subset of rows from the DataFrame.
1602
+ """Gets the first `n_rows` from the DataFrame.
874
1603
 
875
1604
  Args:
876
- n_rows: Number of rows to include
1605
+ n_rows: The number of rows to include in the subset.
877
1606
 
878
1607
  Returns:
879
- FlowDataEngine: New instance with subset of data
1608
+ A new `FlowDataEngine` instance containing the subset of data.
880
1609
  """
881
1610
  if not self.lazy:
882
1611
  return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
883
1612
  else:
884
1613
  return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
885
1614
 
886
- # Iterator Methods
887
- def iter_batches(self, batch_size: int = 1000, columns: Union[List, Tuple, str] = None):
888
- """
889
- Iterate over the DataFrame in batches.
1615
+ def iter_batches(self, batch_size: int = 1000,
1616
+ columns: Union[List, Tuple, str] = None) -> Generator["FlowDataEngine", None, None]:
1617
+ """Iterates over the DataFrame in batches.
890
1618
 
891
1619
  Args:
892
- batch_size: Size of each batch
893
- columns: Columns to include
1620
+ batch_size: The size of each batch.
1621
+ columns: A list of column names to include in the batches. If None,
1622
+ all columns are included.
894
1623
 
895
1624
  Yields:
896
- FlowDataEngine: New instance for each batch
1625
+ A `FlowDataEngine` instance for each batch.
897
1626
  """
898
1627
  if columns:
899
1628
  self.data_frame = self.data_frame.select(columns)
@@ -905,17 +1634,21 @@ class FlowDataEngine:
905
1634
  def start_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
906
1635
  other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
907
1636
  node_id: int | str = -1) -> ExternalFuzzyMatchFetcher:
908
- """
909
- Starts a fuzzy join with another DataFrame and returns the object to track.
1637
+ """Starts a fuzzy join operation in a background process.
1638
+
1639
+ This method prepares the data and initiates the fuzzy matching in a
1640
+ separate process, returning a tracker object immediately.
910
1641
 
911
1642
  Args:
912
- fuzzy_match_input: Fuzzy matching parameters
913
- other: Right DataFrame for join
914
- file_ref: Reference for temporary files
915
- flow_id: Flow ID for tracking
916
- node_id: Node ID for tracking
1643
+ fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
1644
+ other: The right `FlowDataEngine` to join with.
1645
+ file_ref: A reference string for temporary files.
1646
+ flow_id: The flow ID for tracking.
1647
+ node_id: The node ID for tracking.
1648
+
917
1649
  Returns:
918
- FlowDataEngine: New instance with joined data
1650
+ An `ExternalFuzzyMatchFetcher` object that can be used to track the
1651
+ progress and retrieve the result of the fuzzy join.
919
1652
  """
920
1653
  left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
921
1654
  fuzzy_match_input=fuzzy_match_input)
@@ -929,17 +1662,19 @@ class FlowDataEngine:
929
1662
  def do_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
930
1663
  other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
931
1664
  node_id: int | str = -1) -> "FlowDataEngine":
932
- """
933
- Perform a fuzzy join with another DataFrame.
1665
+ """Performs a fuzzy join with another DataFrame.
1666
+
1667
+ This method blocks until the fuzzy join operation is complete.
934
1668
 
935
1669
  Args:
936
- fuzzy_match_input: Fuzzy matching parameters
937
- other: Right DataFrame for join
938
- file_ref: Reference for temporary files
939
- flow_id: Flow ID for tracking
940
- node_id: Node ID for tracking
1670
+ fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
1671
+ other: The right `FlowDataEngine` to join with.
1672
+ file_ref: A reference string for temporary files.
1673
+ flow_id: The flow ID for tracking.
1674
+ node_id: The node ID for tracking.
1675
+
941
1676
  Returns:
942
- FlowDataEngine: New instance with joined data
1677
+ A new `FlowDataEngine` instance with the result of the fuzzy join.
943
1678
  """
944
1679
  left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
945
1680
  fuzzy_match_input=fuzzy_match_input)
@@ -953,18 +1688,19 @@ class FlowDataEngine:
953
1688
 
954
1689
  def fuzzy_match(self, right: "FlowDataEngine", left_on: str, right_on: str,
955
1690
  fuzzy_method: str = 'levenshtein', threshold: float = 0.75) -> "FlowDataEngine":
956
- """
957
- Perform fuzzy matching between two DataFrames.
1691
+ """Performs a simple fuzzy match between two DataFrames on a single column pair.
1692
+
1693
+ This is a convenience method for a common fuzzy join scenario.
958
1694
 
959
1695
  Args:
960
- right: Right DataFrame for matching
961
- left_on: Column from left DataFrame
962
- right_on: Column from right DataFrame
963
- fuzzy_method: Method for fuzzy matching
964
- threshold: Matching threshold
1696
+ right: The right `FlowDataEngine` to match against.
1697
+ left_on: The column name from the left DataFrame to match on.
1698
+ right_on: The column name from the right DataFrame to match on.
1699
+ fuzzy_method: The fuzzy matching algorithm to use (e.g., 'levenshtein').
1700
+ threshold: The similarity score threshold (0.0 to 1.0) for a match.
965
1701
 
966
1702
  Returns:
967
- FlowDataEngine: New instance with matched data
1703
+ A new `FlowDataEngine` with the matched data.
968
1704
  """
969
1705
  fuzzy_match_input = transform_schemas.FuzzyMatchInput(
970
1706
  [transform_schemas.FuzzyMap(
@@ -980,29 +1716,28 @@ class FlowDataEngine:
980
1716
  def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
981
1717
  auto_generate_selection: bool, verify_integrity: bool,
982
1718
  other: "FlowDataEngine") -> "FlowDataEngine":
983
- """
984
- Perform a cross join with another DataFrame.
1719
+ """Performs a cross join with another DataFrame.
1720
+
1721
+ A cross join produces the Cartesian product of the two DataFrames.
985
1722
 
986
1723
  Args:
987
- cross_join_input: Cross join parameters
988
- auto_generate_selection: Whether to auto-generate column selection
989
- verify_integrity: Whether to verify join integrity
990
- other: Right DataFrame for join
1724
+ cross_join_input: A `CrossJoinInput` object specifying column selections.
1725
+ auto_generate_selection: If True, automatically renames columns to avoid conflicts.
1726
+ verify_integrity: If True, checks if the resulting join would be too large.
1727
+ other: The right `FlowDataEngine` to join with.
991
1728
 
992
1729
  Returns:
993
- FlowDataEngine: New instance with joined data
1730
+ A new `FlowDataEngine` with the result of the cross join.
994
1731
 
995
1732
  Raises:
996
- Exception: If join would result in too many records
1733
+ Exception: If `verify_integrity` is True and the join would result in
1734
+ an excessively large number of records.
997
1735
  """
998
1736
  self.lazy = True
999
1737
  other.lazy = True
1000
1738
 
1001
1739
  verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)
1002
1740
 
1003
- # if auto_generate_selection:
1004
- # cross_join_input.auto_rename()
1005
-
1006
1741
  right_select = [v.old_name for v in cross_join_input.right_select.renames
1007
1742
  if (v.keep or v.join_key) and v.is_available]
1008
1743
  left_select = [v.old_name for v in cross_join_input.left_select.renames
@@ -1034,37 +1769,32 @@ class FlowDataEngine:
1034
1769
 
1035
1770
  def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
1036
1771
  verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
1037
- """
1038
- Perform a join operation with another DataFrame.
1772
+ """Performs a standard SQL-style join with another DataFrame.
1773
+
1774
+ Supports various join types like 'inner', 'left', 'right', 'outer', 'semi', and 'anti'.
1039
1775
 
1040
1776
  Args:
1041
- join_input: Join parameters
1042
- auto_generate_selection: Whether to auto-generate column selection
1043
- verify_integrity: Whether to verify join integrity
1044
- other: Right DataFrame for join
1777
+ join_input: A `JoinInput` object defining the join keys, join type,
1778
+ and column selections.
1779
+ auto_generate_selection: If True, automatically handles column renaming.
1780
+ verify_integrity: If True, performs checks to prevent excessively large joins.
1781
+ other: The right `FlowDataEngine` to join with.
1045
1782
 
1046
1783
  Returns:
1047
- FlowDataEngine: New instance with joined data
1784
+ A new `FlowDataEngine` with the joined data.
1048
1785
 
1049
1786
  Raises:
1050
- Exception: If join would result in too many records or is invalid
1787
+ Exception: If the join configuration is invalid or if `verify_integrity`
1788
+ is True and the join is predicted to be too large.
1051
1789
  """
1052
- # self.lazy = False if join_input.how == 'right' else True
1053
- # other.lazy = False if join_input.how == 'right' else True
1054
-
1790
+ ensure_right_unselect_for_semi_and_anti_joins(join_input)
1055
1791
  verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
1056
1792
  if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
1057
1793
  raise Exception('Join is not valid by the data fields')
1058
1794
  if auto_generate_selection:
1059
1795
  join_input.auto_rename()
1060
-
1061
- right_select = [v.old_name for v in join_input.right_select.renames
1062
- if (v.keep or v.join_key) and v.is_available]
1063
- left_select = [v.old_name for v in join_input.left_select.renames
1064
- if (v.keep or v.join_key) and v.is_available]
1065
- left = self.data_frame.select(left_select).rename(join_input.left_select.rename_table)
1066
- right = other.data_frame.select(right_select).rename(join_input.right_select.rename_table)
1067
-
1796
+ left = self.data_frame.select(get_select_columns(join_input.left_select.renames)).rename(join_input.left_select.rename_table)
1797
+ right = other.data_frame.select(get_select_columns(join_input.right_select.renames)).rename(join_input.right_select.rename_table)
1068
1798
  if verify_integrity and join_input.how != 'right':
1069
1799
  n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
1070
1800
  right_on_keys=join_input.right_join_keys, how=join_input.how)
@@ -1072,37 +1802,55 @@ class FlowDataEngine:
1072
1802
  raise Exception("Join will result in too many records, ending process")
1073
1803
  else:
1074
1804
  n_records = -1
1805
+ left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_input)
1806
+ left, right = rename_df_table_for_join(left, right, join_input.get_join_key_renames())
1075
1807
  if join_input.how == 'right':
1076
- # Default to left join since right join can give panic issues in execution plan downstream
1077
- joined_df = right.join(left, left_on=join_input.right_join_keys,
1078
- right_on=join_input.left_join_keys, how="left", suffix="")
1808
+ joined_df = right.join(
1809
+ other=left,
1810
+ left_on=join_input.right_join_keys,
1811
+ right_on=join_input.left_join_keys,
1812
+ how="left",
1813
+ suffix="").rename(reverse_join_key_mapping)
1079
1814
  else:
1080
- joined_df = left.join(right, left_on=join_input.left_join_keys,
1081
- right_on=join_input.right_join_keys,
1082
- how=join_input.how, suffix="")
1083
- cols_to_delete_after = [col.new_name for col in
1084
- join_input.left_select.renames + join_input.left_select.renames
1085
- if col.join_key and not col.keep and col.is_available]
1086
- if len(cols_to_delete_after) > 0:
1087
- joined_df = joined_df.drop(cols_to_delete_after)
1815
+ joined_df = left.join(
1816
+ other=right,
1817
+ left_on=join_input.left_join_keys,
1818
+ right_on=join_input.right_join_keys,
1819
+ how=join_input.how,
1820
+ suffix="").rename(reverse_join_key_mapping)
1821
+ left_cols_to_delete_after = [get_col_name_to_delete(col, 'left') for col in join_input.left_select.renames
1822
+ if not col.keep
1823
+ and col.is_available and col.join_key
1824
+ ]
1825
+ right_cols_to_delete_after = [get_col_name_to_delete(col, 'right') for col in join_input.right_select.renames
1826
+ if not col.keep
1827
+ and col.is_available and col.join_key
1828
+ and join_input.how in ("left", "right", "inner", "cross", "outer")
1829
+ ]
1830
+ if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
1831
+ joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
1832
+ undo_join_key_remapping = get_undo_rename_mapping_join(join_input)
1833
+ joined_df = joined_df.rename(undo_join_key_remapping)
1834
+
1088
1835
  if verify_integrity:
1089
1836
  return FlowDataEngine(joined_df, calculate_schema_stats=True,
1090
- number_of_records=n_records, streamable=False)
1837
+ number_of_records=n_records, streamable=False)
1091
1838
  else:
1092
1839
  fl = FlowDataEngine(joined_df, calculate_schema_stats=False,
1093
- number_of_records=0, streamable=False)
1840
+ number_of_records=0, streamable=False)
1094
1841
  return fl
1095
1842
 
1096
- # Graph Operations
1097
1843
  def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
1098
- """
1099
- Solve a graph problem using the specified columns.
1844
+ """Solves a graph problem represented by 'from' and 'to' columns.
1845
+
1846
+ This is used for operations like finding connected components in a graph.
1100
1847
 
1101
1848
  Args:
1102
- graph_solver_input: Graph solving parameters
1849
+ graph_solver_input: A `GraphSolverInput` object defining the source,
1850
+ destination, and output column names.
1103
1851
 
1104
1852
  Returns:
1105
- FlowDataEngine: New instance with solved graph data
1853
+ A new `FlowDataEngine` instance with the solved graph data.
1106
1854
  """
1107
1855
  lf = self.data_frame.with_columns(
1108
1856
  graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
@@ -1110,48 +1858,48 @@ class FlowDataEngine:
1110
1858
  )
1111
1859
  return FlowDataEngine(lf)
1112
1860
 
1113
- # Data Modification Methods
1114
1861
  def add_new_values(self, values: Iterable, col_name: str = None) -> "FlowDataEngine":
1115
- """
1116
- Add a new column with specified values.
1862
+ """Adds a new column with the provided values.
1117
1863
 
1118
1864
  Args:
1119
- values: Values to add
1120
- col_name: Name for new column
1865
+ values: An iterable (e.g., list, tuple) of values to add as a new column.
1866
+ col_name: The name for the new column. Defaults to 'new_values'.
1121
1867
 
1122
1868
  Returns:
1123
- FlowDataEngine: New instance with added column
1869
+ A new `FlowDataEngine` instance with the added column.
1124
1870
  """
1125
1871
  if col_name is None:
1126
1872
  col_name = 'new_values'
1127
1873
  return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))
1128
1874
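Usage sketch for add_new_values; the iterable is wrapped in a pl.Series, so its length has to line up with the frame's height:

    engine = FlowDataEngine.generate_enumerator(3, "idx")
    labelled = engine.add_new_values(["a", "b", "c"], col_name="label")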
 
1129
1875
  def get_record_count(self) -> "FlowDataEngine":
1130
- """
1131
- Get the total number of records.
1876
+ """Returns a new FlowDataEngine with a single column 'number_of_records'
1877
+ containing the total number of records.
1132
1878
 
1133
1879
  Returns:
1134
- FlowDataEngine: New instance with record count
1880
+ A new `FlowDataEngine` instance.
1135
1881
  """
1136
1882
  return FlowDataEngine(self.data_frame.select(pl.len().alias('number_of_records')))
1137
1883
 
1138
1884
  def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
1139
- """
1140
- Assert that this DataFrame is equal to another.
1885
+ """Asserts that this DataFrame is equal to another.
1886
+
1887
+ Useful for testing.
1141
1888
 
1142
1889
  Args:
1143
- other: DataFrame to compare with
1144
- ordered: Whether to consider row order
1145
- strict_schema: Whether to strictly compare schemas
1890
+ other: The other `FlowDataEngine` to compare with.
1891
+ ordered: If True, the row order must be identical.
1892
+ strict_schema: If True, the data types of the schemas must be identical.
1146
1893
 
1147
1894
  Raises:
1148
- Exception: If DataFrames are not equal
1895
+ Exception: If the DataFrames are not equal based on the specified criteria.
1149
1896
  """
1150
1897
  org_laziness = self.lazy, other.lazy
1151
1898
  self.lazy = False
1152
1899
  other.lazy = False
1153
1900
  self.number_of_records = -1
1154
1901
  other.number_of_records = -1
1902
+ other = other.select_columns(self.columns)
1155
1903
 
1156
1904
  if self.get_number_of_records() != other.get_number_of_records():
1157
1905
  raise Exception('Number of records is not equal')
@@ -1172,14 +1920,14 @@ class FlowDataEngine:
1172
1920
  self.lazy, other.lazy = org_laziness
1173
1921
  assert self_lf.equals(other_lf), 'Data is not equal'
1174
1922
 
1175
- # Initialization Methods
1176
1923
  def initialize_empty_fl(self):
1177
- """Initialize an empty LazyFrame."""
1924
+ """Initializes an empty LazyFrame."""
1178
1925
  self.data_frame = pl.LazyFrame()
1179
1926
  self.number_of_records = 0
1180
1927
  self._lazy = True
1181
1928
 
1182
1929
  def _calculate_number_of_records_in_worker(self) -> int:
1930
+ """Calculates the number of records in a worker process."""
1183
1931
  number_of_records = ExternalDfFetcher(
1184
1932
  lf=self.data_frame,
1185
1933
  operation_type="calculate_number_of_records",
@@ -1191,18 +1939,20 @@ class FlowDataEngine:
1191
1939
 
1192
1940
  def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
1193
1941
  calculate_in_worker_process: bool = False) -> int:
1194
- """
1195
- Get the total number of records in the DataFrame.
1942
+ """Gets the total number of records in the DataFrame.
1943
+
1944
+ For lazy frames, this may trigger a full data scan, which can be expensive.
1196
1945
 
1197
1946
  Args:
1198
- warn: Whether to warn about expensive operations
1199
- force_calculate: Whether to force recalculation
1200
- calculate_in_worker_process: Whether to offload compute to the worker process
1947
+ warn: If True, logs a warning if a potentially expensive calculation is triggered.
1948
+ force_calculate: If True, forces recalculation even if a value is cached.
1949
+ calculate_in_worker_process: If True, offloads the calculation to a worker process.
1950
+
1201
1951
  Returns:
1202
- int: Number of records
1952
+ The total number of records.
1203
1953
 
1204
1954
  Raises:
1205
- Exception: If unable to get number of records
1955
+ ValueError: If the number of records could not be determined.
1206
1956
  """
1207
1957
  if self.is_future and not self.is_collected:
1208
1958
  return -1
@@ -1213,37 +1963,39 @@ class FlowDataEngine:
1213
1963
 
1214
1964
  if self.lazy:
1215
1965
  if calculate_in_worker_process:
1216
- self.number_of_records = self._calculate_number_of_records_in_worker()
1217
- else:
1218
- if warn:
1219
- logger.warning('Calculating the number of records this can be expensive on a lazy frame')
1220
1966
  try:
1221
- self.number_of_records = self.data_frame.select(pl.len()).collect(
1222
- engine="streaming" if self._streamable else "auto")[0, 0]
1223
- except Exception:
1224
- raise ValueError('Could not get number of records')
1967
+ self.number_of_records = self._calculate_number_of_records_in_worker()
1968
+ return self.number_of_records
1969
+ except Exception as e:
1970
+ logger.error(f"Error: {e}")
1971
+ if warn:
1972
+ logger.warning('Calculating the number of records; this can be expensive on a lazy frame')
1973
+ try:
1974
+ self.number_of_records = self.data_frame.select(pl.len()).collect(
1975
+ engine="streaming" if self._streamable else "auto")[0, 0]
1976
+ except Exception:
1977
+ raise ValueError('Could not get number of records')
1225
1978
  else:
1226
1979
  self.number_of_records = self.data_frame.__len__()
1227
1980
  return self.number_of_records
1228
1981
 
1229
- # Properties
1230
1982
  @property
1231
1983
  def has_errors(self) -> bool:
1232
- """Check if there are any errors."""
1984
+ """Checks if there are any errors."""
1233
1985
  return len(self.errors) > 0
1234
1986
 
1235
1987
  @property
1236
1988
  def lazy(self) -> bool:
1237
- """Check if DataFrame is lazy."""
1989
+ """Indicates if the DataFrame is in lazy mode."""
1238
1990
  return self._lazy
1239
1991
 
1240
1992
  @lazy.setter
1241
1993
  def lazy(self, exec_lazy: bool = False):
1242
- """
1243
- Set the laziness of the DataFrame.
1994
+ """Sets the laziness of the DataFrame.
1244
1995
 
1245
1996
  Args:
1246
- exec_lazy: Whether to make DataFrame lazy
1997
+ exec_lazy: If True, converts the DataFrame to a LazyFrame. If False,
1998
+ collects the data and converts it to an eager DataFrame.
1247
1999
  """
1248
2000
  if exec_lazy != self._lazy:
1249
2001
  if exec_lazy:
@@ -1259,42 +2011,40 @@ class FlowDataEngine:
1259
2011
 
1260
2012
  @property
1261
2013
  def external_source(self) -> ExternalDataSource:
1262
- """Get the external data source."""
2014
+ """The external data source, if any."""
1263
2015
  return self._external_source
1264
2016
 
1265
2017
  @property
1266
2018
  def cols_idx(self) -> Dict[str, int]:
1267
- """Get column index mapping."""
2019
+ """A dictionary mapping column names to their integer index."""
1268
2020
  if self._col_idx is None:
1269
2021
  self._col_idx = {c: i for i, c in enumerate(self.columns)}
1270
2022
  return self._col_idx
1271
2023
 
1272
2024
  @property
1273
2025
  def __name__(self) -> str:
1274
- """Get table name."""
2026
+ """The name of the table."""
1275
2027
  return self.name
1276
2028
 
1277
- # Schema and Column Operations
1278
2029
  def get_select_inputs(self) -> transform_schemas.SelectInputs:
1279
- """
1280
- Get select inputs for all columns.
2030
+ """Gets `SelectInput` specifications for all columns in the current schema.
1281
2031
 
1282
2032
  Returns:
1283
- SelectInputs: Input specifications for all columns
2033
+ A `SelectInputs` object that can be used to configure selection or
2034
+ transformation operations.
1284
2035
  """
1285
2036
  return transform_schemas.SelectInputs(
1286
2037
  [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
1287
2038
  )
1288
2039
 
1289
2040
  def select_columns(self, list_select: Union[List[str], Tuple[str], str]) -> "FlowDataEngine":
1290
- """
1291
- Select specific columns from the DataFrame.
2041
+ """Selects a subset of columns from the DataFrame.
1292
2042
 
1293
2043
  Args:
1294
- list_select: Columns to select
2044
+ list_select: A list, tuple, or single string of column names to select.
1295
2045
 
1296
2046
  Returns:
1297
- FlowDataEngine: New instance with selected columns
2047
+ A new `FlowDataEngine` instance containing only the selected columns.
1298
2048
  """
1299
2049
  if isinstance(list_select, str):
1300
2050
  list_select = [list_select]
@@ -1311,14 +2061,13 @@ class FlowDataEngine:
1311
2061
  )
1312
2062
 
1313
2063
  def drop_columns(self, columns: List[str]) -> "FlowDataEngine":
1314
- """
1315
- Drop specified columns from the DataFrame.
2064
+ """Drops specified columns from the DataFrame.
1316
2065
 
1317
2066
  Args:
1318
- columns: Columns to drop
2067
+ columns: A list of column names to drop.
1319
2068
 
1320
2069
  Returns:
1321
- FlowDataEngine: New instance without dropped columns
2070
+ A new `FlowDataEngine` instance without the dropped columns.
1322
2071
  """
1323
2072
  cols_for_select = tuple(set(self.columns) - set(columns))
1324
2073
  idx_to_keep = [self.cols_idx.get(c) for c in cols_for_select]
@@ -1331,14 +2080,13 @@ class FlowDataEngine:
1331
2080
  )
1332
2081
 
1333
2082
  def reorganize_order(self, column_order: List[str]) -> "FlowDataEngine":
1334
- """
1335
- Reorganize columns in specified order.
2083
+ """Reorganizes columns into a specified order.
1336
2084
 
1337
2085
  Args:
1338
- column_order: Desired column order
2086
+ column_order: A list of column names in the desired order.
1339
2087
 
1340
2088
  Returns:
1341
- FlowDataEngine: New instance with reordered columns
2089
+ A new `FlowDataEngine` instance with the columns reordered.
1342
2090
  """
1343
2091
  df = self.data_frame.select(column_order)
1344
2092
  schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
@@ -1346,16 +2094,15 @@ class FlowDataEngine:
1346
2094
 
1347
2095
  def apply_flowfile_formula(self, func: str, col_name: str,
1348
2096
  output_data_type: pl.DataType = None) -> "FlowDataEngine":
1349
- """
1350
- Apply a formula to create a new column.
2097
+ """Applies a formula to create a new column or transform an existing one.
1351
2098
 
1352
2099
  Args:
1353
- func: Formula to apply
1354
- col_name: Name for new column
1355
- output_data_type: Data type for output
2100
+ func: A string containing a Polars expression formula.
2101
+ col_name: The name of the new or transformed column.
2102
+ output_data_type: The desired Polars data type for the output column.
1356
2103
 
1357
2104
  Returns:
1358
- FlowDataEngine: New instance with added column
2105
+ A new `FlowDataEngine` instance with the applied formula.
1359
2106
  """
1360
2107
  parsed_func = to_expr(func)
1361
2108
  if output_data_type is not None:
@@ -1367,16 +2114,15 @@ class FlowDataEngine:
1367
2114
 
1368
2115
  def apply_sql_formula(self, func: str, col_name: str,
1369
2116
  output_data_type: pl.DataType = None) -> "FlowDataEngine":
1370
- """
1371
- Apply an SQL-style formula to create a new column.
2117
+ """Applies an SQL-style formula using `pl.sql_expr`.
1372
2118
 
1373
2119
  Args:
1374
- func: SQL formula to apply
1375
- col_name: Name for new column
1376
- output_data_type: Data type for output
2120
+ func: A string containing an SQL expression.
2121
+ col_name: The name of the new or transformed column.
2122
+ output_data_type: The desired Polars data type for the output column.
1377
2123
 
1378
2124
  Returns:
1379
- FlowDataEngine: New instance with added column
2125
+ A new `FlowDataEngine` instance with the applied formula.
1380
2126
  """
1381
2127
  expr = to_expr(func)
1382
2128
  if output_data_type not in (None, "Auto"):
@@ -1388,16 +2134,18 @@ class FlowDataEngine:
1388
2134
 
1389
2135
  def output(self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str,
1390
2136
  execute_remote: bool = True) -> "FlowDataEngine":
1391
- """
1392
- Write DataFrame to output file.
2137
+ """Writes the DataFrame to an output file.
2138
+
2139
+ Can execute the write operation locally or in a remote worker process.
1393
2140
 
1394
2141
  Args:
1395
- output_fs: Output settings.
1396
- flow_id: Flow ID for tracking.
1397
- node_id: Node ID for tracking.
1398
- execute_remote: If the output should be executed at the flowfile worker process.
2142
+ output_fs: An `OutputSettings` object with details about the output file.
2143
+ flow_id: The flow ID for tracking.
2144
+ node_id: The node ID for tracking.
2145
+ execute_remote: If True, executes the write in a worker process.
2146
+
1399
2147
  Returns:
1400
- FlowDataEngine: Self for chaining
2148
+ The same `FlowDataEngine` instance for chaining.
1401
2149
  """
1402
2150
  logger.info('Starting to write output')
1403
2151
  if execute_remote:
@@ -1429,30 +2177,28 @@ class FlowDataEngine:
1429
2177
  logger.info("Finished writing output")
1430
2178
  return self
1431
2179
 
1432
- # Data Operations
1433
2180
  def make_unique(self, unique_input: transform_schemas.UniqueInput = None) -> "FlowDataEngine":
1434
- """
1435
- Get unique rows based on specified columns.
2181
+ """Gets the unique rows from the DataFrame.
1436
2182
 
1437
2183
  Args:
1438
- unique_input: Unique operation parameters
2184
+ unique_input: A `UniqueInput` object specifying a subset of columns
2185
+ to consider for uniqueness and a strategy for keeping rows.
1439
2186
 
1440
2187
  Returns:
1441
- FlowDataEngine: New instance with unique rows
2188
+ A new `FlowDataEngine` instance with unique rows.
1442
2189
  """
1443
2190
  if unique_input is None or unique_input.columns is None:
1444
2191
  return FlowDataEngine(self.data_frame.unique())
1445
2192
  return FlowDataEngine(self.data_frame.unique(unique_input.columns, keep=unique_input.strategy))
1446
2193
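The two branches above correspond to these plain-Polars calls (hypothetical data; keep accepts the usual 'first'/'last'/'any'/'none' strategies):

    import polars as pl

    lf = pl.LazyFrame({"id": [1, 1, 2], "v": ["x", "x", "y"]})
    lf.unique().collect()                             # drop fully duplicated rows
    lf.unique(subset=["id"], keep="first").collect()  # one row per id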
 
1447
2194
  def concat(self, other: Iterable["FlowDataEngine"] | "FlowDataEngine") -> "FlowDataEngine":
1448
- """
1449
- Concatenate with other DataFrames.
2195
+ """Concatenates this DataFrame with one or more other DataFrames.
1450
2196
 
1451
2197
  Args:
1452
- other: DataFrames to concatenate
2198
+ other: A single `FlowDataEngine` or an iterable of them.
1453
2199
 
1454
2200
  Returns:
1455
- FlowDataEngine: Concatenated DataFrame
2201
+ A new `FlowDataEngine` containing the concatenated data.
1456
2202
  """
1457
2203
  if isinstance(other, FlowDataEngine):
1458
2204
  other = [other]
@@ -1462,15 +2208,15 @@ class FlowDataEngine:
1462
2208
 
1463
2209
  def do_select(self, select_inputs: transform_schemas.SelectInputs,
1464
2210
  keep_missing: bool = True) -> "FlowDataEngine":
1465
- """
1466
- Perform complex column selection and transformation.
2211
+ """Performs a complex column selection, renaming, and reordering operation.
1467
2212
 
1468
2213
  Args:
1469
- select_inputs: Selection specifications
1470
- keep_missing: Whether to keep columns not specified
2214
+ select_inputs: A `SelectInputs` object defining the desired transformations.
2215
+ keep_missing: If True, columns not specified in `select_inputs` are kept.
2216
+ If False, they are dropped.
1471
2217
 
1472
2218
  Returns:
1473
- FlowDataEngine: New instance with selected/transformed columns
2219
+ A new `FlowDataEngine` with the transformed selection.
1474
2220
  """
1475
2221
  new_schema = deepcopy(self.schema)
1476
2222
  renames = [r for r in select_inputs.renames if r.is_available]
@@ -1506,29 +2252,29 @@ class FlowDataEngine:
1506
2252
  output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
1507
2253
  return output_file.reorganize_order(sorted_cols)
1508
2254
 
1509
- # Utility Methods
1510
2255
  def set_streamable(self, streamable: bool = False):
1511
- """Set whether DataFrame operations should be streamable."""
2256
+ """Sets whether DataFrame operations should be streamable."""
1512
2257
  self._streamable = streamable
1513
2258
 
1514
2259
  def _calculate_schema(self) -> List[Dict]:
1515
- """Calculate schema statistics."""
2260
+ """Calculates schema statistics."""
1516
2261
  if self.external_source is not None:
1517
2262
  self.collect_external()
1518
2263
  v = utils.calculate_schema(self.data_frame)
1519
2264
  return v
1520
2265
 
1521
2266
  def calculate_schema(self):
1522
- """Calculate and return schema."""
2267
+ """Calculates and returns the schema."""
1523
2268
  self._calculate_schema_stats = True
1524
2269
  return self.schema
1525
2270
 
1526
2271
  def count(self) -> int:
1527
- """Get total number of records."""
2272
+ """Gets the total number of records."""
1528
2273
  return self.get_number_of_records()
1529
2274
 
1530
2275
  @classmethod
1531
2276
  def create_from_path_worker(cls, received_table: input_schema.ReceivedTable, flow_id: int, node_id: int | str):
2277
+ """Creates a FlowDataEngine from a path in a worker process."""
1532
2278
  received_table.set_absolute_filepath()
1533
2279
  external_fetcher = ExternalCreateFetcher(received_table=received_table,
1534
2280
  file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
@@ -1536,14 +2282,19 @@ class FlowDataEngine:
1536
2282
 
1537
2283
 
1538
2284
  def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
1539
- """
1540
- Execute arbitrary Polars code.
2285
+ """Executes arbitrary Polars code on one or more FlowDataEngine objects.
2286
+
2287
+ This function takes a string of Python code that uses Polars and executes it.
2288
+ Input `FlowDataEngine` objects are made available in the code's scope as
2289
+ `input_df` (for a single input) or `input_df_1`, `input_df_2`, etc.
1541
2290
 
1542
2291
  Args:
1543
- code: Polars code to execute
2292
+ *flowfile_tables: A variable number of `FlowDataEngine` objects to be
2293
+ used as input to the code.
2294
+ code: A string containing the Polars code to execute.
1544
2295
 
1545
2296
  Returns:
1546
- FlowDataEngine: Result of code execution
2297
+ A new `FlowDataEngine` instance containing the result of the executed code.
1547
2298
  """
1548
2299
  polars_executable = polars_code_parser.get_executable(code, num_inputs=len(flowfile_tables))
1549
2300
  if len(flowfile_tables) == 0:
@@ -1555,4 +2306,4 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
1555
2306
  df = polars_executable(**kwargs)
1556
2307
  if isinstance(df, pl.DataFrame):
1557
2308
  logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
1558
- return FlowDataEngine(df)
2309
+ return FlowDataEngine(df)
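Call-shape sketch for execute_polars_code; the exact contract of the code string (for example, whether it may be a bare expression or must assign to a result variable) is handled by polars_code_parser and is not visible in this diff, so the snippet below is an assumption:

    result = execute_polars_code(
        FlowDataEngine.create_random(100),
        code="input_df.select(pl.len().alias('row_count'))",  # assumes a bare-expression style is accepted
    )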