Flowfile 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (98)
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +1 -0
  3. flowfile/web/__init__.py +2 -2
  4. flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
  5. flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
  6. flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
  7. flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
  8. flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
  11. flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
  12. flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
  13. flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
  14. flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
  15. flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
  16. flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
  17. flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
  18. flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
  19. flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
  20. flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
  21. flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
  22. flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
  23. flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
  24. flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
  26. flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
  27. flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
  28. flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
  29. flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
  31. flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
  33. flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
  34. flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
  35. flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
  37. flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
  38. flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
  39. flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
  40. flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
  41. flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
  42. flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
  43. flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
  44. flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
  45. flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
  49. flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/METADATA +2 -2
  52. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/RECORD +96 -94
  53. flowfile_core/__init__.py +1 -0
  54. flowfile_core/auth/jwt.py +39 -0
  55. flowfile_core/configs/node_store/nodes.py +1 -0
  56. flowfile_core/configs/settings.py +6 -5
  57. flowfile_core/flowfile/code_generator/code_generator.py +71 -0
  58. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +597 -309
  60. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  61. flowfile_core/flowfile/flow_graph.py +619 -191
  62. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  63. flowfile_core/flowfile/flow_node/flow_node.py +500 -89
  64. flowfile_core/flowfile/flow_node/models.py +125 -20
  65. flowfile_core/flowfile/handler.py +2 -33
  66. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  67. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  68. flowfile_core/flowfile/utils.py +36 -5
  69. flowfile_core/main.py +32 -13
  70. flowfile_core/routes/cloud_connections.py +7 -11
  71. flowfile_core/routes/logs.py +2 -6
  72. flowfile_core/routes/public.py +1 -0
  73. flowfile_core/routes/routes.py +127 -51
  74. flowfile_core/routes/secrets.py +72 -14
  75. flowfile_core/schemas/__init__.py +8 -0
  76. flowfile_core/schemas/input_schema.py +92 -64
  77. flowfile_core/schemas/output_model.py +19 -3
  78. flowfile_core/schemas/schemas.py +144 -11
  79. flowfile_core/schemas/transform_schema.py +82 -17
  80. flowfile_frame/__init__.py +9 -1
  81. flowfile_frame/cloud_storage/__init__.py +0 -0
  82. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  83. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  84. flowfile_frame/expr.py +28 -1
  85. flowfile_frame/expr.pyi +76 -61
  86. flowfile_frame/flow_frame.py +232 -110
  87. flowfile_frame/flow_frame.pyi +140 -91
  88. flowfile_frame/flow_frame_methods.py +150 -12
  89. flowfile_frame/group_frame.py +3 -0
  90. flowfile_frame/utils.py +25 -3
  91. test_utils/s3/data_generator.py +1 -0
  92. test_utils/s3/demo_data_generator.py +186 -0
  93. test_utils/s3/fixtures.py +6 -1
  94. flowfile_core/schemas/defaults.py +0 -9
  95. flowfile_core/schemas/models.py +0 -193
  96. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
  97. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
  98. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +0 -0
@@ -4,7 +4,7 @@ import os
  from copy import deepcopy
  from dataclasses import dataclass
  from math import ceil
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator

  # Third-party imports
  from loky import Future
@@ -12,6 +12,7 @@ import polars as pl
  from polars.exceptions import PanicException
  from polars_grouper import graph_solver
  from polars_expr_transformer import simple_function_to_expr as to_expr
+ from pyarrow import Table as PaTable
  from pyarrow.parquet import ParquetFile

  # Local imports - Core
@@ -64,7 +65,24 @@ from flowfile_core.flowfile.sources.external_sources.base_class import ExternalD
  T = TypeVar('T', pl.DataFrame, pl.LazyFrame)

  def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
+ """Temporarily renames join keys to avoid conflicts during a join.

+ This helper function checks the join type and renames the join key columns
+ in either the left or right DataFrame to a temporary name (`__FL_TEMP__...`).
+ This prevents Polars from automatically suffixing columns with `_right` when
+ join keys have the same name.
+
+ Args:
+ left_df: The left Polars DataFrame or LazyFrame.
+ right_df: The right Polars DataFrame or LazyFrame.
+ join_input: The JoinInput settings object defining the join.
+
+ Returns:
+ A tuple containing:
+ - The (potentially modified) left DataFrame.
+ - The (potentially modified) right DataFrame.
+ - A dictionary mapping the temporary names back to their desired final names.
+ """
  def _construct_temp_name(column_name: str) -> str:
  return "__FL_TEMP__"+column_name
  if join_input.how == 'right':
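
The docstring added above describes renaming a join key to a `__FL_TEMP__`-prefixed name so Polars never has to disambiguate same-named key columns. A minimal, illustrative Polars sketch of that renaming idea (not the package's actual implementation; the frames and names are made up):

    import polars as pl

    left = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})
    right = pl.DataFrame({"id": [1, 2], "value": [10, 20]})

    # Give the right-hand key a temporary name before the join, roughly
    # mirroring the "__FL_TEMP__" prefixing described in the docstring.
    tmp_name = "__FL_TEMP__id"
    joined = left.join(right.rename({"id": tmp_name}), left_on="id", right_on=tmp_name, how="left")
    print(joined.columns)  # ['id', 'name', 'value'] -- no automatic '_right' suffix
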
@@ -85,13 +103,15 @@ def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform


  def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.JoinInput) -> None:
- """
- Updates the right columns of the join input by deselecting them.
- Args:
- join_input ():
+ """Modifies JoinInput for semi/anti joins to not keep right-side columns.

- Returns:
- None
+ For 'semi' and 'anti' joins, Polars only returns columns from the left
+ DataFrame. This function enforces that behavior by modifying the `join_input`
+ in-place, setting the `keep` flag to `False` for all columns in the
+ right-side selection.
+
+ Args:
+ join_input: The JoinInput settings object to modify.
  """
  if join_input.how in ('semi', 'anti'):
  for jk in join_input.right_select.renames:
@@ -99,31 +119,38 @@ def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.


  def get_select_columns(full_select_input: List[transform_schemas.SelectInput]) -> List[str]:
- """
- Gets the list of column names to select from the full select input.
- It filters out columns that are not marked to keep or join keys, and only includes those that are available.
+ """Extracts a list of column names to be selected from a SelectInput list.
+
+ This function filters a list of `SelectInput` objects to return the names
+ of columns that are marked as available and are either a join key or
+ explicitly marked to be kept.
+
  Args:
- full_select_input (): List of SelectInput objects containing column information.
+ full_select_input: A list of SelectInput objects.

  Returns:
- List of column names to select.
+ A list of column names to be selected.
  """
  return [v.old_name for v in full_select_input if (v.keep or v.join_key) and v.is_available]


  @dataclass
  class FlowDataEngine:
+ """The core data handling engine for Flowfile.
+
+ This class acts as a high-level wrapper around a Polars DataFrame or
+ LazyFrame, providing a unified API for data ingestion, transformation,
+ and output. It manages data state (lazy vs. eager), schema information,
+ and execution logic.
+
+ Attributes:
+ _data_frame: The underlying Polars DataFrame or LazyFrame.
+ columns: A list of column names in the current data frame.
+ name: An optional name for the data engine instance.
+ number_of_records: The number of records. Can be -1 for lazy frames.
+ errors: A list of errors encountered during operations.
+ _schema: A cached list of `FlowfileColumn` objects representing the schema.
  """
- A class that provides a unified interface for working with tabular data, supporting both eager and lazy evaluation.
-
- The class is organized into several logical sections:
- 1. Core properties and initialization
- 2. Data access and manipulation
- 3. Schema and metadata operations
- 4. Transformations and operations
- 5. I/O operations
- """
-
  # Core attributes
  _data_frame: Union[pl.DataFrame, pl.LazyFrame]
  columns: List[Any]
@@ -163,9 +190,6 @@ class FlowDataEngine:
  _number_of_records_callback: Callable = None
  _data_callback: Callable = None

- # Tracking info
- # node_id: int = None # TODO: Implement node_id
- # flow_id: int = None # TODO: Implement flow_id

  def __init__(self,
  raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
@@ -178,7 +202,22 @@ class FlowDataEngine:
  streamable: bool = True,
  number_of_records_callback: Callable = None,
  data_callback: Callable = None):
- """Initialize FlowDataEngine with various data sources and configuration options."""
+ """Initializes the FlowDataEngine from various data sources.
+
+ Args:
+ raw_data: The input data. Can be a list of dicts, a Polars DataFrame/LazyFrame,
+ or a `RawData` schema object.
+ path_ref: A string path to a Parquet file.
+ name: An optional name for the data engine instance.
+ optimize_memory: If True, prefers lazy operations to conserve memory.
+ schema: An optional schema definition. Can be a list of `FlowfileColumn` objects,
+ a list of column names, or a Polars `Schema`.
+ number_of_records: The number of records, if known.
+ calculate_schema_stats: If True, computes detailed statistics for each column.
+ streamable: If True, allows for streaming operations when possible.
+ number_of_records_callback: A callback function to retrieve the number of records.
+ data_callback: A callback function to retrieve the data.
+ """
  self._initialize_attributes(number_of_records_callback, data_callback, streamable)

  if raw_data is not None:
@@ -190,7 +229,11 @@ class FlowDataEngine:
  self._finalize_initialization(name, optimize_memory, schema, calculate_schema_stats)

  def _initialize_attributes(self, number_of_records_callback, data_callback, streamable):
- """Initialize basic attributes with default values."""
+ """(Internal) Sets the initial default attributes for a new instance.
+
+ This helper is called first during initialization to ensure all state-tracking
+ and configuration attributes have a clean default value before data is processed.
+ """
  self._external_source = None
  self._number_of_records_callback = number_of_records_callback
  self._data_callback = data_callback
@@ -204,8 +247,11 @@ class FlowDataEngine:
  self.is_future = False

  def _handle_raw_data(self, raw_data, number_of_records, optimize_memory):
- """Process different types of input data."""
+ """(Internal) Dispatches raw data to the appropriate handler based on its type.

+ This acts as a router during initialization, inspecting the type of `raw_data`
+ and calling the corresponding specialized `_handle_*` method to process it.
+ """
  if isinstance(raw_data, input_schema.RawData):
  self._handle_raw_data_format(raw_data)
  elif isinstance(raw_data, pl.DataFrame):
@@ -216,12 +262,12 @@ class FlowDataEngine:
  self._handle_python_data(raw_data)

  def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: Optional[int]):
- """Handle Polars DataFrame input."""
+ """(Internal) Initializes the engine from an eager Polars DataFrame."""
  self.data_frame = df
  self.number_of_records = number_of_records or df.select(pl.len())[0, 0]

  def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: Optional[int], optimize_memory: bool):
- """Handle Polars LazyFrame input."""
+ """(Internal) Initializes the engine from a Polars LazyFrame."""
  self.data_frame = lf
  self._lazy = True
  if number_of_records is not None:
@@ -229,18 +275,17 @@ class FlowDataEngine:
  elif optimize_memory:
  self.number_of_records = -1
  else:
- # TODO: assess whether this leads to slow downs with multi remote files
  self.number_of_records = lf.select(pl.len()).collect()[0, 0]

  def _handle_python_data(self, data: Union[List, Dict]):
- """Handle Python list or dict input."""
+ """(Internal) Dispatches Python collections to the correct handler."""
  if isinstance(data, dict):
  self._handle_dict_input(data)
  else:
  self._handle_list_input(data)

  def _handle_dict_input(self, data: Dict):
- """Handle dictionary input."""
+ """(Internal) Initializes the engine from a Python dictionary."""
  if len(data) == 0:
  self.initialize_empty_fl()
  lengths = [len(v) if isinstance(v, (list, tuple)) else 1 for v in data.values()]
@@ -254,7 +299,14 @@ class FlowDataEngine:
  self.lazy = True

  def _handle_raw_data_format(self, raw_data: input_schema.RawData):
- """Create a FlowDataEngine from a RawData object."""
+ """(Internal) Initializes the engine from a `RawData` schema object.
+
+ This method uses the schema provided in the `RawData` object to correctly
+ infer data types when creating the Polars DataFrame.
+
+ Args:
+ raw_data: An instance of `RawData` containing the data and schema.
+ """
  flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
  polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
  for flowfile_column in flowfile_schema])
@@ -268,7 +320,7 @@ class FlowDataEngine:
  self.lazy = True

  def _handle_list_input(self, data: List):
- """Handle list input."""
+ """(Internal) Initializes the engine from a list of records."""
  number_of_records = len(data)
  if number_of_records > 0:
  processed_data = self._process_list_data(data)
@@ -281,7 +333,11 @@ class FlowDataEngine:

  @staticmethod
  def _process_list_data(data: List) -> List[Dict]:
- """Process list data into a format suitable for DataFrame creation."""
+ """(Internal) Normalizes list data into a list of dictionaries.
+
+ Ensures that a list of objects or non-dict items is converted into a
+ uniform list of dictionaries suitable for Polars DataFrame creation.
+ """
  if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
  try:
  return pl.DataFrame(data).to_dicts()
@@ -296,19 +352,19 @@ class FlowDataEngine:
  return ensure_similarity_dicts(data)

  def to_cloud_storage_obj(self, settings: cloud_storage_schemas.CloudStorageWriteSettingsInternal):
- """
- Write the FlowDataEngine's data to an object in cloud storage.
+ """Writes the DataFrame to an object in cloud storage.

- Supports writing to S3, Azure ADLS, and Google Cloud Storage. The 'overwrite'
- write mode is supported. The 'append' mode is not yet implemented.
+ This method supports writing to various cloud storage providers like AWS S3,
+ Azure Data Lake Storage, and Google Cloud Storage.

  Args:
- settings: Cloud storage write settings with connection details and write options.
+ settings: A `CloudStorageWriteSettingsInternal` object containing connection
+ details, file format, and write options.

  Raises:
- ValueError: If file format is not supported.
- NotImplementedError: If the 'append' write mode is used.
- Exception: If writing to cloud storage fails.
+ ValueError: If the specified file format is not supported for writing.
+ NotImplementedError: If the 'append' write mode is used with an unsupported format.
+ Exception: If the write operation to cloud storage fails for any reason.
  """
  connection = settings.connection
  write_settings = settings.write_settings
@@ -317,7 +373,6 @@ class FlowDataEngine:

  if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
  raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
-
  storage_options = CloudStorageReader.get_storage_options(connection)
  credential_provider = CloudStorageReader.get_credential_provider(connection)
  # Dispatch to the correct writer based on file format
@@ -359,7 +414,11 @@ class FlowDataEngine:
  storage_options: Dict[str, Any],
  credential_provider: Optional[Callable],
  write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
- """Write LazyFrame to a Parquet file in cloud storage."""
+ """(Internal) Writes the DataFrame to a Parquet file in cloud storage.
+
+ Uses `sink_parquet` for efficient streaming writes. Falls back to a
+ collect-then-write pattern if sinking fails.
+ """
  try:
  sink_kwargs = {
  "path": resource_path,
@@ -371,7 +430,8 @@ class FlowDataEngine:
  sink_kwargs["credential_provider"] = credential_provider
  try:
  self.data_frame.sink_parquet(**sink_kwargs)
- except:
+ except Exception as e:
+ logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
  pl_df = self.collect()
  sink_kwargs['file'] = sink_kwargs.pop("path")
  pl_df.write_parquet(**sink_kwargs)
@@ -385,6 +445,11 @@ class FlowDataEngine:
  storage_options: Dict[str, Any],
  credential_provider: Optional[Callable],
  write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+ """(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.
+
+ This operation requires collecting the data first, as `write_delta` operates
+ on an eager DataFrame.
+ """
  sink_kwargs = {
  "target": resource_path,
  "mode": write_settings.write_mode,
@@ -400,7 +465,10 @@ class FlowDataEngine:
  storage_options: Dict[str, Any],
  credential_provider: Optional[Callable],
  write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
- """Write LazyFrame to a CSV file in cloud storage."""
+ """(Internal) Writes the DataFrame to a CSV file in cloud storage.
+
+ Uses `sink_csv` for efficient, streaming writes of the data.
+ """
  try:
  sink_kwargs = {
  "path": resource_path,
@@ -423,7 +491,10 @@ class FlowDataEngine:
  storage_options: Dict[str, Any],
  credential_provider: Optional[Callable],
  write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
- """Write LazyFrame to a line-delimited JSON (NDJSON) file in cloud storage."""
+ """(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.
+
+ Uses `sink_ndjson` for efficient, streaming writes.
+ """
  try:
  sink_kwargs = {"path": resource_path}
  if storage_options:
@@ -437,22 +508,25 @@ class FlowDataEngine:
  raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")

  @classmethod
- def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal):
- """
- Create a FlowDataEngine from an object in cloud storage.
+ def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal) -> "FlowDataEngine":
+ """Creates a FlowDataEngine from an object in cloud storage.

- Supports reading from S3, Azure ADLS, and Google Cloud Storage with various
- authentication methods including access keys, IAM roles, and CLI credentials.
+ This method supports reading from various cloud storage providers like AWS S3,
+ Azure Data Lake Storage, and Google Cloud Storage, with support for
+ various authentication methods.

  Args:
- settings: Cloud storage read settings with connection details and read options
+ settings: A `CloudStorageReadSettingsInternal` object containing connection
+ details, file format, and read options.

  Returns:
- FlowDataEngine: New instance with data from cloud storage
+ A new `FlowDataEngine` instance containing the data from cloud storage.

  Raises:
- ValueError: If storage type or file format is not supported
- Exception: If reading from cloud storage fails
+ ValueError: If the storage type or file format is not supported.
+ NotImplementedError: If a requested file format like "delta" or "iceberg"
+ is not yet implemented.
+ Exception: If reading from cloud storage fails.
  """
  connection = settings.connection
  read_settings = settings.read_settings
@@ -505,11 +579,14 @@ class FlowDataEngine:
  raise ValueError(f"Unsupported file format: {read_settings.file_format}")

  @staticmethod
- def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any]) -> List[FlowfileColumn] | None:
+ def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any],
+ file_format: Literal["csv", "parquet", "json", "delta"]) -> List[FlowfileColumn] | None:
+ """Infers the schema by scanning the first file in a cloud directory."""
  try:
+ scan_func = getattr(pl, "scan_" + file_format)
  first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
  return convert_stats_to_column_info(FlowDataEngine._create_schema_stats_from_pl_schema(
- pl.scan_parquet(first_file_ref, storage_options=storage_options).collect_schema()))
+ scan_func(first_file_ref, storage_options=storage_options).collect_schema()))
  except Exception as e:
  logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")

@@ -520,7 +597,7 @@ class FlowDataEngine:
  storage_options: Dict[str, Any],
  credential_provider: Optional[Callable],
  read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
- """Read Iceberg table(s) from cloud storage."""
+ """Reads Iceberg table(s) from cloud storage."""
  raise NotImplementedError(f"Failed to read Iceberg table from cloud storage: Not yet implemented")

  @classmethod
@@ -529,7 +606,7 @@ class FlowDataEngine:
  storage_options: Dict[str, Any],
  credential_provider: Optional[Callable],
  is_directory: bool) -> "FlowDataEngine":
- """Read Parquet file(s) from cloud storage."""
+ """Reads Parquet file(s) from cloud storage."""
  try:
  # Use scan_parquet for lazy evaluation
  if is_directory:
@@ -542,14 +619,14 @@ class FlowDataEngine:
  if credential_provider:
  scan_kwargs["credential_provider"] = credential_provider
  if storage_options and is_directory:
- schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
+ schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options, "parquet")
  else:
  schema = None
  lf = pl.scan_parquet(**scan_kwargs)

  return cls(
  lf,
- number_of_records=6_666_666, # Set to 6666666 so that the provider is not accessed for this stat
+ number_of_records=6_666_666, # Set so the provider is not accessed for this stat
  optimize_memory=True,
  streamable=True,
  schema=schema
@@ -565,6 +642,7 @@ class FlowDataEngine:
  storage_options: Dict[str, Any],
  credential_provider: Optional[Callable],
  read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+ """Reads a Delta Lake table from cloud storage."""
  try:
  logger.info("Reading Delta file from cloud storage...")
  logger.info(f"read_settings: {read_settings}")
@@ -579,7 +657,7 @@ class FlowDataEngine:

  return cls(
  lf,
- number_of_records=6_666_666, # Set to 6666666 so that the provider is not accessed for this stat
+ number_of_records=6_666_666, # Set so the provider is not accessed for this stat
  optimize_memory=True,
  streamable=True
  )
@@ -593,7 +671,7 @@ class FlowDataEngine:
  storage_options: Dict[str, Any],
  credential_provider: Optional[Callable],
  read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
- """Read CSV file(s) from cloud storage."""
+ """Reads CSV file(s) from cloud storage."""
  try:
  scan_kwargs = {
  "source": resource_path,
@@ -610,7 +688,7 @@ class FlowDataEngine:
  resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="csv")
  scan_kwargs["source"] = resource_path
  if storage_options and read_settings.scan_mode == "directory":
- schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
+ schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options, "csv")
  else:
  schema = None

@@ -634,8 +712,10 @@ class FlowDataEngine:
  storage_options: Dict[str, Any],
  credential_provider: Optional[Callable],
  is_directory: bool) -> "FlowDataEngine":
- """Read JSON file(s) from cloud storage."""
+ """Reads JSON file(s) from cloud storage."""
  try:
+ if is_directory:
+ resource_path = ensure_path_has_wildcard_pattern(resource_path, "json")
  scan_kwargs = {"source": resource_path}

  if storage_options:
@@ -643,13 +723,6 @@ class FlowDataEngine:
  if credential_provider:
  scan_kwargs["credential_provider"] = credential_provider

- if is_directory:
- resource_path = ensure_path_has_wildcard_pattern(resource_path, "json")
- if storage_options and is_directory:
- schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
- else:
- schema = None
-
  lf = pl.scan_ndjson(**scan_kwargs) # Using NDJSON for line-delimited JSON

  return cls(
@@ -657,7 +730,6 @@ class FlowDataEngine:
  number_of_records=-1,
  optimize_memory=True,
  streamable=True,
- schema=schema
  )

  except Exception as e:
@@ -665,7 +737,7 @@ class FlowDataEngine:
  raise Exception(f"Failed to read JSON from cloud storage: {str(e)}")

  def _handle_path_ref(self, path_ref: str, optimize_memory: bool):
- """Handle file path reference input."""
+ """Handles file path reference input."""
  try:
  pf = ParquetFile(path_ref)
  except Exception as e:
@@ -681,7 +753,7 @@ class FlowDataEngine:

  def _finalize_initialization(self, name: str, optimize_memory: bool, schema: Optional[Any],
  calculate_schema_stats: bool):
- """Finalize initialization by setting remaining attributes."""
+ """Finalizes initialization by setting remaining attributes."""
  _ = calculate_schema_stats
  self.name = name
  self._optimize_memory = optimize_memory
@@ -694,12 +766,19 @@ class FlowDataEngine:
  self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()

  def __getitem__(self, item):
- """Access a specific column or item from the DataFrame."""
+ """Accesses a specific column or item from the DataFrame."""
  return self.data_frame.select([item])

  @property
  def data_frame(self) -> pl.LazyFrame | pl.DataFrame | None:
- """Get the underlying DataFrame with appropriate handling of different states."""
+ """The underlying Polars DataFrame or LazyFrame.
+
+ This property provides access to the Polars object that backs the
+ FlowDataEngine. It handles lazy-loading from external sources if necessary.
+
+ Returns:
+ The active Polars `DataFrame` or `LazyFrame`.
+ """
  if self._data_frame is not None and not self.is_future:
  return self._data_frame
  elif self.is_future:
@@ -718,24 +797,32 @@ class FlowDataEngine:

  @data_frame.setter
  def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
- """Set the underlying DataFrame with validation."""
+ """Sets the underlying Polars DataFrame or LazyFrame."""
  if self.lazy and isinstance(df, pl.DataFrame):
  raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
  self._data_frame = df

  @staticmethod
  def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> List[Dict]:
+ """Converts a Polars Schema into a list of schema statistics dictionaries."""
  return [
  dict(column_name=k, pl_datatype=v, col_index=i)
  for i, (k, v) in enumerate(pl_schema.items())
  ]

  def _add_schema_from_schema_stats(self, schema_stats: List[Dict]):
+ """Populates the schema from a list of schema statistics dictionaries."""
  self._schema = convert_stats_to_column_info(schema_stats)

  @property
  def schema(self) -> List[FlowfileColumn]:
- """Get the schema of the DataFrame, calculating if necessary."""
+ """The schema of the DataFrame as a list of `FlowfileColumn` objects.
+
+ This property lazily calculates the schema if it hasn't been determined yet.
+
+ Returns:
+ A list of `FlowfileColumn` objects describing the schema.
+ """
  if self.number_of_fields == 0:
  return []
  if self._schema is None or (self._calculate_schema_stats and not self.ind_schema_calculated):
@@ -749,17 +836,28 @@ class FlowDataEngine:

  @property
  def number_of_fields(self) -> int:
- """Get the number of fields in the DataFrame."""
+ """The number of columns (fields) in the DataFrame.
+
+ Returns:
+ The integer count of columns.
+ """
  if self.__number_of_fields is None:
  self.__number_of_fields = len(self.columns)
  return self.__number_of_fields

- # Data Collection and Sampling Methods
-
  def collect(self, n_records: int = None) -> pl.DataFrame:
- """
- Collect data from the DataFrame, optionally limiting the number of records.
- Handles streaming and error cases appropriately.
+ """Collects the data and returns it as a Polars DataFrame.
+
+ This method triggers the execution of the lazy query plan (if applicable)
+ and returns the result. It supports streaming to optimize memory usage
+ for large datasets.
+
+ Args:
+ n_records: The maximum number of records to collect. If None, all
+ records are collected.
+
+ Returns:
+ A Polars `DataFrame` containing the collected data.
  """
  if n_records is None:
  logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
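
The `collect` docstring above mirrors plain Polars behaviour: execute the lazy query plan, optionally with the streaming engine. A small sketch of the equivalent raw Polars calls, assuming the recent Polars API used elsewhere in this file:

    import polars as pl

    lf = pl.LazyFrame({"x": list(range(5))})

    # Execute the full lazy plan; the streaming engine keeps memory use low
    # for large inputs, matching the streamable=True path described above.
    df_all = lf.collect(engine="streaming")

    # Limit the number of records before collecting, as collect(n_records) does.
    df_head = lf.head(2).collect()
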
@@ -777,7 +875,7 @@ class FlowDataEngine:
  return self._handle_collection_error(n_records)

  def _collect_data(self, n_records: int = None) -> pl.DataFrame:
- """Internal method to handle data collection."""
+ """Internal method to handle data collection logic."""
  if n_records is None:

  self.collect_external()
@@ -799,7 +897,7 @@ class FlowDataEngine:
  return self.data_frame.head(n_records).collect()

  def _collect_from_external_source(self, n_records: int) -> pl.DataFrame:
- """Handle collection from external source."""
+ """Handles collection from an external source."""
  if self.external_source.get_pl_df() is not None:
  all_data = self.external_source.get_pl_df().head(n_records)
  self.data_frame = all_data
@@ -809,7 +907,7 @@ class FlowDataEngine:
  return self.data_frame

  def _handle_collection_error(self, n_records: int) -> pl.DataFrame:
- """Handle errors during collection by attempting partial collection."""
+ """Handles errors during collection by attempting partial collection."""
  n_records = 100000000 if n_records is None else n_records
  ok_cols, error_cols = self._identify_valid_columns(n_records)

@@ -818,7 +916,7 @@ class FlowDataEngine:
  return self._create_empty_dataframe(n_records)

  def _identify_valid_columns(self, n_records: int) -> Tuple[List[str], List[Tuple[str, Any]]]:
- """Identify which columns can be collected successfully."""
+ """Identifies which columns can be collected successfully."""
  ok_cols = []
  error_cols = []
  for c in self.columns:
@@ -831,7 +929,7 @@ class FlowDataEngine:

  def _create_partial_dataframe(self, ok_cols: List[str], error_cols: List[Tuple[str, Any]],
  n_records: int) -> pl.DataFrame:
- """Create a DataFrame with partial data for columns that could be collected."""
+ """Creates a DataFrame with partial data for columns that could be collected."""
  df = self.data_frame.select(ok_cols)
  df = df.with_columns([
  pl.lit(None).alias(column_name).cast(data_type)
@@ -840,7 +938,7 @@ class FlowDataEngine:
  return df.select(self.columns).head(n_records).collect()

  def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
- """Create an empty DataFrame with the correct schema."""
+ """Creates an empty DataFrame with the correct schema."""
  if self.number_of_records > 0:
  return pl.DataFrame({
  column_name: pl.Series(
@@ -851,11 +949,19 @@ class FlowDataEngine:
  })
  return pl.DataFrame(schema=self.data_frame.schema)

- # Data Transformation Methods
-
  def do_group_by(self, group_by_input: transform_schemas.GroupByInput,
  calculate_schema_stats: bool = True) -> "FlowDataEngine":
- """Perform group by operations on the DataFrame."""
+ """Performs a group-by operation on the DataFrame.
+
+ Args:
+ group_by_input: A `GroupByInput` object defining the grouping columns
+ and aggregations.
+ calculate_schema_stats: If True, calculates schema statistics for the
+ resulting DataFrame.
+
+ Returns:
+ A new `FlowDataEngine` instance with the grouped and aggregated data.
+ """
  aggregations = [c for c in group_by_input.agg_cols if c.agg != 'groupby']
  group_columns = [c for c in group_by_input.agg_cols if c.agg == 'groupby']

@@ -877,7 +983,15 @@ class FlowDataEngine:
  )

  def do_sort(self, sorts: List[transform_schemas.SortByInput]) -> "FlowDataEngine":
- """Sort the DataFrame based on specified columns and directions."""
+ """Sorts the DataFrame by one or more columns.
+
+ Args:
+ sorts: A list of `SortByInput` objects, each specifying a column
+ and sort direction ('asc' or 'desc').
+
+ Returns:
+ A new `FlowDataEngine` instance with the sorted data.
+ """
  if not sorts:
  return self

@@ -887,7 +1001,16 @@ class FlowDataEngine:

  def change_column_types(self, transforms: List[transform_schemas.SelectInput],
  calculate_schema: bool = False) -> "FlowDataEngine":
- """Change the data types of specified columns."""
+ """Changes the data type of one or more columns.
+
+ Args:
+ transforms: A list of `SelectInput` objects, where each object specifies
+ the column and its new `polars_type`.
+ calculate_schema: If True, recalculates the schema after the type change.
+
+ Returns:
+ A new `FlowDataEngine` instance with the updated column types.
+ """
  dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
  idx_mapping = list(
  (transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
@@ -908,27 +1031,63 @@ class FlowDataEngine:
  streamable=self._streamable
  )

- # Data Export and Conversion Methods
-
  def save(self, path: str, data_type: str = 'parquet') -> Future:
- """Save the DataFrame to a file."""
+ """Saves the DataFrame to a file in a separate thread.
+
+ Args:
+ path: The file path to save to.
+ data_type: The format to save in (e.g., 'parquet', 'csv').
+
+ Returns:
+ A `loky.Future` object representing the asynchronous save operation.
+ """
  estimated_size = deepcopy(self.get_estimated_file_size() * 4)
  df = deepcopy(self.data_frame)
  return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)

  def to_pylist(self) -> List[Dict]:
- """Convert the DataFrame to a list of dictionaries."""
+ """Converts the DataFrame to a list of Python dictionaries.
+
+ Returns:
+ A list where each item is a dictionary representing a row.
+ """
  if self.lazy:
  return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
  return self.data_frame.to_dicts()

+ def to_arrow(self) -> PaTable:
+ """Converts the DataFrame to a PyArrow Table.
+
+ This method triggers a `.collect()` call if the data is lazy,
+ then converts the resulting eager DataFrame into a `pyarrow.Table`.
+
+ Returns:
+ A `pyarrow.Table` instance representing the data.
+ """
+ if self.lazy:
+ return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_arrow()
+ else:
+ return self.data_frame.to_arrow()
+
  def to_raw_data(self) -> input_schema.RawData:
- """Convert the DataFrame to a list of values."""
+ """Converts the DataFrame to a `RawData` schema object.
+
+ Returns:
+ An `input_schema.RawData` object containing the schema and data.
+ """
  columns = [c.get_minimal_field_info() for c in self.schema]
  data = list(self.to_dict().values())
  return input_schema.RawData(columns=columns, data=data)

  def to_dict(self) -> Dict[str, List]:
+ """Converts the DataFrame to a Python dictionary of columns.
+
+ Each key in the dictionary is a column name, and the corresponding value
+ is a list of the data in that column.
+
+ Returns:
+ A dictionary mapping column names to lists of their values.
+ """
  if self.lazy:
  return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
  else:
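
A hypothetical usage sketch of the conversion helpers shown in this hunk, including the new `to_arrow` method; the constructor call matches the `__init__` signature above, and the import path is assumed from the file's location in this diff:

    import polars as pl
    from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine

    # Build an engine from an eager frame, then convert it with the helpers above.
    engine = FlowDataEngine(pl.DataFrame({"a": [1, 2, 3]}))
    arrow_table = engine.to_arrow()  # pyarrow.Table with a single column "a"
    columns = engine.to_dict()       # {"a": [1, 2, 3]}
    rows = engine.to_pylist()        # [{"a": 1}, {"a": 2}, {"a": 3}]
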
@@ -936,7 +1095,15 @@ class FlowDataEngine:

  @classmethod
  def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
- """Create a FlowDataEngine from an external data source."""
+ """Creates a FlowDataEngine from an external data source.
+
+ Args:
+ external_source: An object that conforms to the `ExternalDataSource`
+ interface.
+
+ Returns:
+ A new `FlowDataEngine` instance.
+ """
  if external_source.schema is not None:
  ff = cls.create_from_schema(external_source.schema)
  elif external_source.initial_data_getter is not None:
@@ -948,12 +1115,27 @@ class FlowDataEngine:

  @classmethod
  def create_from_sql(cls, sql: str, conn: Any) -> "FlowDataEngine":
- """Create a FlowDataEngine from a SQL query."""
+ """Creates a FlowDataEngine by executing a SQL query.
+
+ Args:
+ sql: The SQL query string to execute.
+ conn: A database connection object or connection URI string.
+
+ Returns:
+ A new `FlowDataEngine` instance with the query result.
+ """
  return cls(pl.read_sql(sql, conn))

  @classmethod
  def create_from_schema(cls, schema: List[FlowfileColumn]) -> "FlowDataEngine":
- """Create a FlowDataEngine from a schema definition."""
+ """Creates an empty FlowDataEngine from a schema definition.
+
+ Args:
+ schema: A list of `FlowfileColumn` objects defining the schema.
+
+ Returns:
+ A new, empty `FlowDataEngine` instance with the specified schema.
+ """
  pl_schema = []
  for i, flow_file_column in enumerate(schema):
  pl_schema.append((flow_file_column.name, cast_str_to_polars_type(flow_file_column.data_type)))
@@ -963,7 +1145,17 @@ class FlowDataEngine:

  @classmethod
  def create_from_path(cls, received_table: input_schema.ReceivedTableBase) -> "FlowDataEngine":
- """Create a FlowDataEngine from a file path."""
+ """Creates a FlowDataEngine from a local file path.
+
+ Supports various file types like CSV, Parquet, and Excel.
+
+ Args:
+ received_table: A `ReceivedTableBase` object containing the file path
+ and format details.
+
+ Returns:
+ A new `FlowDataEngine` instance with data from the file.
+ """
  received_table.set_absolute_filepath()
  file_type_handlers = {
  'csv': create_funcs.create_from_path_csv,
@@ -981,19 +1173,36 @@ class FlowDataEngine:

  @classmethod
  def create_random(cls, number_of_records: int = 1000) -> "FlowDataEngine":
- """Create a FlowDataEngine with random data."""
+ """Creates a FlowDataEngine with randomly generated data.
+
+ Useful for testing and examples.
+
+ Args:
+ number_of_records: The number of random records to generate.
+
+ Returns:
+ A new `FlowDataEngine` instance with fake data.
+ """
  return cls(create_fake_data(number_of_records))

  @classmethod
  def generate_enumerator(cls, length: int = 1000, output_name: str = 'output_column') -> "FlowDataEngine":
- """Generate a sequence of numbers as a FlowDataEngine."""
+ """Generates a FlowDataEngine with a single column containing a sequence of integers.
+
+ Args:
+ length: The number of integers to generate in the sequence.
+ output_name: The name of the output column.
+
+ Returns:
+ A new `FlowDataEngine` instance.
+ """
  if length > 10_000_000:
  length = 10_000_000
  return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))

  def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema | None,
  pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
- """Handle schema processing and validation."""
+ """Handles schema processing and validation during initialization."""
  if schema is None and pl_schema is not None:
  return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
  elif schema is None and pl_schema is None:
@@ -1013,7 +1222,7 @@ class FlowDataEngine:
  return schema

  def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
- """Handle Polars schema conversion."""
+ """Handles Polars schema conversion."""
  flow_file_columns = [
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
  for col_name, dtype in zip(schema.names(), schema.dtypes())
@@ -1028,7 +1237,7 @@ class FlowDataEngine:
  return flow_file_columns

  def _handle_string_schema(self, schema: List[str], pl_schema: pl.Schema) -> List[FlowfileColumn]:
- """Handle string-based schema conversion."""
+ """Handles string-based schema conversion."""
  flow_file_columns = [
  FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
  for col_name, dtype in zip(schema, pl_schema.dtypes())
@@ -1040,10 +1249,19 @@ class FlowDataEngine:

  return flow_file_columns

- # Data Manipulation Methods
-
  def split(self, split_input: transform_schemas.TextToRowsInput) -> "FlowDataEngine":
- """Split a column into multiple rows based on a delimiter."""
+ """Splits a column's text values into multiple rows based on a delimiter.
+
+ This operation is often referred to as "exploding" the DataFrame, as it
+ increases the number of rows.
+
+ Args:
+ split_input: A `TextToRowsInput` object specifying the column to split,
+ the delimiter, and the output column name.
+
+ Returns:
+ A new `FlowDataEngine` instance with the exploded rows.
+ """
  output_column_name = (
  split_input.output_column_name
  if split_input.output_column_name
@@ -1068,7 +1286,18 @@ class FlowDataEngine:
  return FlowDataEngine(df)

  def unpivot(self, unpivot_input: transform_schemas.UnpivotInput) -> "FlowDataEngine":
- """Convert data from wide to long format."""
+ """Converts the DataFrame from a wide to a long format.
+
+ This is the inverse of a pivot operation, taking columns and transforming
+ them into `variable` and `value` rows.
+
+ Args:
+ unpivot_input: An `UnpivotInput` object specifying which columns to
+ unpivot and which to keep as index columns.
+
+ Returns:
+ A new, unpivoted `FlowDataEngine` instance.
+ """
  lf = self.data_frame

  if unpivot_input.data_type_selector_expr is not None:
@@ -1087,7 +1316,17 @@ class FlowDataEngine:
  return FlowDataEngine(result)

  def do_pivot(self, pivot_input: transform_schemas.PivotInput, node_logger: NodeLogger = None) -> "FlowDataEngine":
- """Convert data from long to wide format with aggregations."""
+ """Converts the DataFrame from a long to a wide format, aggregating values.
+
+ Args:
+ pivot_input: A `PivotInput` object defining the index, pivot, and value
+ columns, along with the aggregation logic.
+ node_logger: An optional logger for reporting warnings, e.g., if the
+ pivot column has too many unique values.
+
+ Returns:
+ A new, pivoted `FlowDataEngine` instance.
+ """
  # Get unique values for pivot columns
  max_unique_vals = 200
  new_cols_unique = fetch_unique_values(self.data_frame.select(pivot_input.pivot_column)
@@ -1147,7 +1386,16 @@ class FlowDataEngine:
  return FlowDataEngine(df, calculate_schema_stats=False)

  def do_filter(self, predicate: str) -> "FlowDataEngine":
- """Filter the DataFrame based on a predicate expression."""
+ """Filters rows based on a predicate expression.
+
+ Args:
+ predicate: A string containing a Polars expression that evaluates to
+ a boolean value.
+
+ Returns:
+ A new `FlowDataEngine` instance containing only the rows that match
+ the predicate.
+ """
  try:
  f = to_expr(predicate)
  except Exception as e:
@@ -1157,13 +1405,24 @@ class FlowDataEngine:
  return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)

  def add_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
- """Add a record ID column with optional grouping."""
+ """Adds a record ID (row number) column to the DataFrame.
+
+ Can generate a simple sequential ID or a grouped ID that resets for
+ each group.
+
+ Args:
+ record_id_settings: A `RecordIdInput` object specifying the output
+ column name, offset, and optional grouping columns.
+
+ Returns:
+ A new `FlowDataEngine` instance with the added record ID column.
+ """
  if record_id_settings.group_by and len(record_id_settings.group_by_columns) > 0:
  return self._add_grouped_record_id(record_id_settings)
  return self._add_simple_record_id(record_id_settings)

  def _add_grouped_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
- """Add a record ID column with grouping."""
+ """Adds a record ID column with grouping."""
  select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]

  df = (
@@ -1183,7 +1442,7 @@ class FlowDataEngine:
  return FlowDataEngine(df, schema=output_schema)

  def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
- """Add a simple sequential record ID column."""
+ """Adds a simple sequential record ID column."""
  df = self.data_frame.with_row_index(
  record_id_settings.output_column_name,
  record_id_settings.offset
@@ -1194,38 +1453,52 @@ class FlowDataEngine:
1194
1453
 
1195
1454
  return FlowDataEngine(df, schema=output_schema)

- # Utility Methods
-
  def get_schema_column(self, col_name: str) -> FlowfileColumn:
- """Get schema information for a specific column."""
+ """Retrieves the schema information for a single column by its name.
+
+ Args:
+ col_name: The name of the column to retrieve.
+
+ Returns:
+ A `FlowfileColumn` object for the specified column, or `None` if not found.
+ """
  for s in self.schema:
  if s.name == col_name:
  return s

  def get_estimated_file_size(self) -> int:
- """Get the estimated size of the file in bytes."""
+ """Estimates the file size in bytes if the data originated from a local file.
+
+ This relies on the original path being tracked during file ingestion.
+
+ Returns:
+ The file size in bytes, or 0 if the original path is unknown.
+ """
  if self._org_path is not None:
  return os.path.getsize(self._org_path)
  return 0

  def __repr__(self) -> str:
- """Return string representation of the FlowDataEngine."""
- return f'flowfile table\n{self.data_frame.__repr__()}'
+ """Returns a string representation of the FlowDataEngine."""
+ return f'flow data engine\n{self.data_frame.__repr__()}'

  def __call__(self) -> "FlowDataEngine":
- """Make the class callable, returning self."""
+ """Makes the class instance callable, returning itself."""
  return self

  def __len__(self) -> int:
- """Get the number of records in the table."""
+ """Returns the number of records in the table."""
  return self.number_of_records if self.number_of_records >= 0 else self.get_number_of_records()

  def cache(self) -> "FlowDataEngine":
- """
- Cache the data in background and update the DataFrame reference.
+ """Caches the current DataFrame to disk and updates the internal reference.
+
+ This triggers a background process to write the current LazyFrame's result
+ to a temporary file. Subsequent operations on this `FlowDataEngine` instance
+ will read from the cached file, which can speed up downstream computations.

  Returns:
- FlowDataEngine: Self with cached data
+ The same `FlowDataEngine` instance, now backed by the cached data.
  """
  edf = ExternalDfFetcher(lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False,
  flow_id=-1,
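
`cache` delegates the materialization to an `ExternalDfFetcher` running in the background. A synchronous, plain-Polars sketch of the same cache-to-disk idea (the temp-file handling is illustrative, not Flowfile's worker protocol):

```python
import tempfile
from pathlib import Path

import polars as pl

def cache_to_disk(lf: pl.LazyFrame) -> pl.LazyFrame:
    """Materialize a LazyFrame to a temporary Parquet file and scan it back."""
    path = Path(tempfile.mkdtemp()) / "cached.parquet"
    lf.sink_parquet(path)          # run the plan once and write the result
    return pl.scan_parquet(path)   # downstream work now reads the cached file

lf = pl.LazyFrame({"x": range(5)}).with_columns((pl.col("x") * 2).alias("y"))
cached = cache_to_disk(lf)
print(cached.collect())
```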
@@ -1240,7 +1513,13 @@ class FlowDataEngine:
  return self

  def collect_external(self):
- """Collect data from external source if present."""
+ """Materializes data from a tracked external source.
+
+ If the `FlowDataEngine` was created from an `ExternalDataSource`, this
+ method will trigger the data retrieval, update the internal `_data_frame`
+ to a `LazyFrame` of the collected data, and reset the schema to be
+ re-evaluated.
+ """
  if self._external_source is not None:
  logger.info('Collecting external source')
  if self.external_source.get_pl_df() is not None:
@@ -1249,16 +1528,16 @@ class FlowDataEngine:
  self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
  self._schema = None # enforce reset schema

- # Data Access Methods
  def get_output_sample(self, n_rows: int = 10) -> List[Dict]:
- """
- Get a sample of the data as a list of dictionaries.
+ """Gets a sample of the data as a list of dictionaries.
+
+ This is typically used to display a preview of the data in a UI.

  Args:
- n_rows: Number of rows to sample
+ n_rows: The number of rows to sample.

  Returns:
- List[Dict]: Sample data as dictionaries
+ A list of dictionaries, where each dictionary represents a row.
  """
  if self.number_of_records > n_rows or self.number_of_records < 0:
  df = self.collect(n_rows)
@@ -1267,6 +1546,7 @@ class FlowDataEngine:
  return df.to_dicts()

  def __get_sample__(self, n_rows: int = 100, streamable: bool = True) -> "FlowDataEngine":
+ """Internal method to get a sample of the data."""
  if not self.lazy:
  df = self.data_frame.lazy()
  else:
@@ -1284,20 +1564,20 @@ class FlowDataEngine:

  def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
  seed: int = None) -> "FlowDataEngine":
- """
- Get a sample of rows from the DataFrame.
+ """Gets a sample of rows from the DataFrame.

  Args:
- n_rows: Number of rows to sample
- random: Whether to randomly sample
- shuffle: Whether to shuffle the sample
- seed: Random seed for reproducibility
+ n_rows: The number of rows to sample.
+ random: If True, performs random sampling. If False, takes the first n_rows.
+ shuffle: If True (and `random` is True), shuffles the data before sampling.
+ seed: A random seed for reproducibility.

  Returns:
- FlowDataEngine: New instance with sampled data
+ A new `FlowDataEngine` instance containing the sampled data.
  """
- n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=True))
+ n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=OFFLOAD_TO_WORKER))
  logging.info(f'Getting sample of {n_rows} rows')
+
  if random:
  if self.lazy and self.external_source is not None:
  self.collect_external()
@@ -1319,31 +1599,30 @@ class FlowDataEngine:
  return FlowDataEngine(sample_df, schema=self.schema, number_of_records=n_records)
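
The head-versus-random distinction that the new `get_sample` docstring spells out maps directly onto plain Polars (a sketch; the seed and sizes are arbitrary):

```python
import polars as pl

df = pl.DataFrame({"id": range(100)})

first_n = df.head(10)                              # deterministic: first 10 rows
random_n = df.sample(n=10, shuffle=True, seed=42)  # random sample, reproducible via seed
print(first_n.height, random_n.height)             # 10 10
```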

  def get_subset(self, n_rows: int = 100) -> "FlowDataEngine":
- """
- Get a subset of rows from the DataFrame.
+ """Gets the first `n_rows` from the DataFrame.

  Args:
- n_rows: Number of rows to include
+ n_rows: The number of rows to include in the subset.

  Returns:
- FlowDataEngine: New instance with subset of data
+ A new `FlowDataEngine` instance containing the subset of data.
  """
  if not self.lazy:
  return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
  else:
  return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)

- # Iterator Methods
- def iter_batches(self, batch_size: int = 1000, columns: Union[List, Tuple, str] = None):
- """
- Iterate over the DataFrame in batches.
+ def iter_batches(self, batch_size: int = 1000,
+ columns: Union[List, Tuple, str] = None) -> Generator["FlowDataEngine", None, None]:
+ """Iterates over the DataFrame in batches.

  Args:
- batch_size: Size of each batch
- columns: Columns to include
+ batch_size: The size of each batch.
+ columns: A list of column names to include in the batches. If None,
+ all columns are included.

  Yields:
- FlowDataEngine: New instance for each batch
+ A `FlowDataEngine` instance for each batch.
  """
  if columns:
  self.data_frame = self.data_frame.select(columns)
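
A plain-Polars sketch of the batching idea behind `iter_batches`, slicing a LazyFrame into fixed-size windows (the helper name is ours, not Flowfile's):

```python
from typing import Iterator

import polars as pl

def iter_lazy_batches(lf: pl.LazyFrame, batch_size: int = 1000) -> Iterator[pl.DataFrame]:
    """Yield eager DataFrame batches of at most `batch_size` rows."""
    total = lf.select(pl.len()).collect().item()
    for offset in range(0, total, batch_size):
        yield lf.slice(offset, batch_size).collect()

lf = pl.LazyFrame({"x": range(2500)})
print([batch.height for batch in iter_lazy_batches(lf)])  # [1000, 1000, 500]
```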
@@ -1355,17 +1634,21 @@ class FlowDataEngine:
  def start_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
  other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
  node_id: int | str = -1) -> ExternalFuzzyMatchFetcher:
- """
- Starts a fuzzy join with another DataFrame and returns the object to track.
+ """Starts a fuzzy join operation in a background process.
+
+ This method prepares the data and initiates the fuzzy matching in a
+ separate process, returning a tracker object immediately.

  Args:
- fuzzy_match_input: Fuzzy matching parameters
- other: Right DataFrame for join
- file_ref: Reference for temporary files
- flow_id: Flow ID for tracking
- node_id: Node ID for tracking
+ fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
+ other: The right `FlowDataEngine` to join with.
+ file_ref: A reference string for temporary files.
+ flow_id: The flow ID for tracking.
+ node_id: The node ID for tracking.
+
  Returns:
- FlowDataEngine: New instance with joined data
+ An `ExternalFuzzyMatchFetcher` object that can be used to track the
+ progress and retrieve the result of the fuzzy join.
  """
  left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
  fuzzy_match_input=fuzzy_match_input)
@@ -1379,17 +1662,19 @@ class FlowDataEngine:
  def do_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
  other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
  node_id: int | str = -1) -> "FlowDataEngine":
- """
- Perform a fuzzy join with another DataFrame.
+ """Performs a fuzzy join with another DataFrame.
+
+ This method blocks until the fuzzy join operation is complete.

  Args:
- fuzzy_match_input: Fuzzy matching parameters
- other: Right DataFrame for join
- file_ref: Reference for temporary files
- flow_id: Flow ID for tracking
- node_id: Node ID for tracking
+ fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
+ other: The right `FlowDataEngine` to join with.
+ file_ref: A reference string for temporary files.
+ flow_id: The flow ID for tracking.
+ node_id: The node ID for tracking.
+
  Returns:
- FlowDataEngine: New instance with joined data
+ A new `FlowDataEngine` instance with the result of the fuzzy join.
  """
  left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
  fuzzy_match_input=fuzzy_match_input)
@@ -1403,18 +1688,19 @@ class FlowDataEngine:

  def fuzzy_match(self, right: "FlowDataEngine", left_on: str, right_on: str,
  fuzzy_method: str = 'levenshtein', threshold: float = 0.75) -> "FlowDataEngine":
- """
- Perform fuzzy matching between two DataFrames.
+ """Performs a simple fuzzy match between two DataFrames on a single column pair.
+
+ This is a convenience method for a common fuzzy join scenario.

  Args:
- right: Right DataFrame for matching
- left_on: Column from left DataFrame
- right_on: Column from right DataFrame
- fuzzy_method: Method for fuzzy matching
- threshold: Matching threshold
+ right: The right `FlowDataEngine` to match against.
+ left_on: The column name from the left DataFrame to match on.
+ right_on: The column name from the right DataFrame to match on.
+ fuzzy_method: The fuzzy matching algorithm to use (e.g., 'levenshtein').
+ threshold: The similarity score threshold (0.0 to 1.0) for a match.

  Returns:
- FlowDataEngine: New instance with matched data
+ A new `FlowDataEngine` with the matched data.
  """
  fuzzy_match_input = transform_schemas.FuzzyMatchInput(
  [transform_schemas.FuzzyMap(
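
Based only on the signature shown above, a hypothetical call to the convenience method could look as follows; the data, column names, and threshold are invented, and the constructor usage assumes `FlowDataEngine` accepts a Polars frame as its first argument, as elsewhere in this diff:

```python
import polars as pl

# Hypothetical usage sketch; assumes FlowDataEngine has been imported from Flowfile.
customers = FlowDataEngine(pl.DataFrame({"name": ["Jon Smith", "Ann Lee"]}))
orders = FlowDataEngine(pl.DataFrame({"customer_name": ["John Smith", "Anne Lee"]}))

matched = customers.fuzzy_match(
    right=orders,
    left_on="name",
    right_on="customer_name",
    fuzzy_method="levenshtein",  # matching algorithm, per the docstring
    threshold=0.8,               # similarity cut-off between 0.0 and 1.0
)
```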
@@ -1430,29 +1716,28 @@ class FlowDataEngine:
  def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
  auto_generate_selection: bool, verify_integrity: bool,
  other: "FlowDataEngine") -> "FlowDataEngine":
- """
- Perform a cross join with another DataFrame.
+ """Performs a cross join with another DataFrame.
+
+ A cross join produces the Cartesian product of the two DataFrames.

  Args:
- cross_join_input: Cross join parameters
- auto_generate_selection: Whether to auto-generate column selection
- verify_integrity: Whether to verify join integrity
- other: Right DataFrame for join
+ cross_join_input: A `CrossJoinInput` object specifying column selections.
+ auto_generate_selection: If True, automatically renames columns to avoid conflicts.
+ verify_integrity: If True, checks if the resulting join would be too large.
+ other: The right `FlowDataEngine` to join with.

  Returns:
- FlowDataEngine: New instance with joined data
+ A new `FlowDataEngine` with the result of the cross join.

  Raises:
- Exception: If join would result in too many records
+ Exception: If `verify_integrity` is True and the join would result in
+ an excessively large number of records.
  """
  self.lazy = True
  other.lazy = True

  verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)

- # if auto_generate_selection:
- # cross_join_input.auto_rename()
-
  right_select = [v.old_name for v in cross_join_input.right_select.renames
  if (v.keep or v.join_key) and v.is_available]
  left_select = [v.old_name for v in cross_join_input.left_select.renames
@@ -1484,31 +1769,32 @@ class FlowDataEngine:

  def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
  verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
- """
- Perform a join operation with another DataFrame.
+ """Performs a standard SQL-style join with another DataFrame.
+
+ Supports various join types like 'inner', 'left', 'right', 'outer', 'semi', and 'anti'.

  Args:
- join_input: Join parameters
- auto_generate_selection: Whether to auto-generate column selection
- verify_integrity: Whether to verify join integrity
- other: Right DataFrame for join
+ join_input: A `JoinInput` object defining the join keys, join type,
+ and column selections.
+ auto_generate_selection: If True, automatically handles column renaming.
+ verify_integrity: If True, performs checks to prevent excessively large joins.
+ other: The right `FlowDataEngine` to join with.

  Returns:
- FlowDataEngine: New instance with joined data
+ A new `FlowDataEngine` with the joined data.

  Raises:
- Exception: If join would result in too many records or is invalid
+ Exception: If the join configuration is invalid or if `verify_integrity`
+ is True and the join is predicted to be too large.
  """
  ensure_right_unselect_for_semi_and_anti_joins(join_input)
  verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
  if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
  raise Exception('Join is not valid by the data fields')
-
  if auto_generate_selection:
  join_input.auto_rename()
  left = self.data_frame.select(get_select_columns(join_input.left_select.renames)).rename(join_input.left_select.rename_table)
  right = other.data_frame.select(get_select_columns(join_input.right_select.renames)).rename(join_input.right_select.rename_table)
-
  if verify_integrity and join_input.how != 'right':
  n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
  right_on_keys=join_input.right_join_keys, how=join_input.how)
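
The cross-join and keyed-join hunks above both end up in Polars' `join`; a minimal sketch of the two shapes with invented columns (the rename mirrors what the `rename_table` does before joining):

```python
import polars as pl

left = pl.LazyFrame({"id": [1, 2, 3], "city": ["A", "B", "C"]})
right = pl.LazyFrame({"customer_id": [1, 2, 4], "amount": [10.0, 20.0, 40.0]})

# Cross join: the Cartesian product, 3 x 3 = 9 rows, no keys involved.
cartesian = left.join(right, how="cross")

# Keyed join: align the key names first, then join on them.
keyed = left.rename({"id": "customer_id"}).join(right, on="customer_id", how="left")

print(cartesian.collect().height, keyed.collect().height)  # 9 3
```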
@@ -1554,16 +1840,17 @@ class FlowDataEngine:
  number_of_records=0, streamable=False)
  return fl

- # Graph Operations
  def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
- """
- Solve a graph problem using the specified columns.
+ """Solves a graph problem represented by 'from' and 'to' columns.
+
+ This is used for operations like finding connected components in a graph.

  Args:
- graph_solver_input: Graph solving parameters
+ graph_solver_input: A `GraphSolverInput` object defining the source,
+ destination, and output column names.

  Returns:
- FlowDataEngine: New instance with solved graph data
+ A new `FlowDataEngine` instance with the solved graph data.
  """
  lf = self.data_frame.with_columns(
  graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
@@ -1571,42 +1858,41 @@ class FlowDataEngine:
  )
  return FlowDataEngine(lf)
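
The new docstring mentions connected components over 'from'/'to' columns. A small, self-contained union-find sketch of that idea (an illustration of the concept, not Flowfile's `graph_solver` implementation):

```python
import polars as pl

def connected_components(df: pl.DataFrame, col_from: str, col_to: str) -> pl.DataFrame:
    """Label each edge row with the connected component it belongs to."""
    parent: dict = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    for a, b in df.select(col_from, col_to).iter_rows():
        parent[find(a)] = find(b)          # union the two endpoints

    labels = [find(a) for a in df[col_from]]
    return df.with_columns(pl.Series("component", labels))

edges = pl.DataFrame({"src": [1, 2, 4], "dst": [2, 3, 5]})
print(connected_components(edges, "src", "dst"))  # rows 1-2 and 2-3 share a component
```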

- # Data Modification Methods
  def add_new_values(self, values: Iterable, col_name: str = None) -> "FlowDataEngine":
- """
- Add a new column with specified values.
+ """Adds a new column with the provided values.

  Args:
- values: Values to add
- col_name: Name for new column
+ values: An iterable (e.g., list, tuple) of values to add as a new column.
+ col_name: The name for the new column. Defaults to 'new_values'.

  Returns:
- FlowDataEngine: New instance with added column
+ A new `FlowDataEngine` instance with the added column.
  """
  if col_name is None:
  col_name = 'new_values'
  return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))

  def get_record_count(self) -> "FlowDataEngine":
- """
- Get the total number of records.
+ """Returns a new FlowDataEngine with a single column 'number_of_records'
+ containing the total number of records.

  Returns:
- FlowDataEngine: New instance with record count
+ A new `FlowDataEngine` instance.
  """
  return FlowDataEngine(self.data_frame.select(pl.len().alias('number_of_records')))

  def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
- """
- Assert that this DataFrame is equal to another.
+ """Asserts that this DataFrame is equal to another.
+
+ Useful for testing.

  Args:
- other: DataFrame to compare with
- ordered: Whether to consider row order
- strict_schema: Whether to strictly compare schemas
+ other: The other `FlowDataEngine` to compare with.
+ ordered: If True, the row order must be identical.
+ strict_schema: If True, the data types of the schemas must be identical.

  Raises:
- Exception: If DataFrames are not equal
+ Exception: If the DataFrames are not equal based on the specified criteria.
  """
  org_laziness = self.lazy, other.lazy
  self.lazy = False
@@ -1634,14 +1920,14 @@ class FlowDataEngine:
  self.lazy, other.lazy = org_laziness
  assert self_lf.equals(other_lf), 'Data is not equal'

- # Initialization Methods
  def initialize_empty_fl(self):
- """Initialize an empty LazyFrame."""
+ """Initializes an empty LazyFrame."""
  self.data_frame = pl.LazyFrame()
  self.number_of_records = 0
  self._lazy = True

  def _calculate_number_of_records_in_worker(self) -> int:
+ """Calculates the number of records in a worker process."""
  number_of_records = ExternalDfFetcher(
  lf=self.data_frame,
  operation_type="calculate_number_of_records",
@@ -1653,18 +1939,20 @@ class FlowDataEngine:

  def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
  calculate_in_worker_process: bool = False) -> int:
- """
- Get the total number of records in the DataFrame.
+ """Gets the total number of records in the DataFrame.
+
+ For lazy frames, this may trigger a full data scan, which can be expensive.

  Args:
- warn: Whether to warn about expensive operations
- force_calculate: Whether to force recalculation
- calculate_in_worker_process: Whether to offload compute to the worker process
+ warn: If True, logs a warning if a potentially expensive calculation is triggered.
+ force_calculate: If True, forces recalculation even if a value is cached.
+ calculate_in_worker_process: If True, offloads the calculation to a worker process.
+
  Returns:
- int: Number of records
+ The total number of records.

  Raises:
- Exception: If unable to get number of records
+ ValueError: If the number of records could not be determined.
  """
  if self.is_future and not self.is_collected:
  return -1
@@ -1675,37 +1963,39 @@ class FlowDataEngine:

  if self.lazy:
  if calculate_in_worker_process:
- self.number_of_records = self._calculate_number_of_records_in_worker()
- else:
- if warn:
- logger.warning('Calculating the number of records this can be expensive on a lazy frame')
  try:
- self.number_of_records = self.data_frame.select(pl.len()).collect(
- engine="streaming" if self._streamable else "auto")[0, 0]
- except Exception:
- raise ValueError('Could not get number of records')
+ self.number_of_records = self._calculate_number_of_records_in_worker()
+ return self.number_of_records
+ except Exception as e:
+ logger.error(f"Error: {e}")
+ if warn:
+ logger.warning('Calculating the number of records this can be expensive on a lazy frame')
+ try:
+ self.number_of_records = self.data_frame.select(pl.len()).collect(
+ engine="streaming" if self._streamable else "auto")[0, 0]
+ except Exception:
+ raise ValueError('Could not get number of records')
  else:
  self.number_of_records = self.data_frame.__len__()
  return self.number_of_records
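
The fallback path shown above is simply a `pl.len()` aggregation; a sketch of why counting a LazyFrame can be expensive (it has to execute the query plan, which the diff optionally streams or offloads to a worker):

```python
import polars as pl

lf = pl.LazyFrame({"x": range(1_000)}).filter(pl.col("x") % 2 == 0)

# Counting requires running the plan; on large scans this is the costly step
# the new docstring warns about.
n = lf.select(pl.len()).collect()[0, 0]
print(n)  # 500
```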

- # Properties
  @property
  def has_errors(self) -> bool:
- """Check if there are any errors."""
+ """Checks if there are any errors."""
  return len(self.errors) > 0

  @property
  def lazy(self) -> bool:
- """Check if DataFrame is lazy."""
+ """Indicates if the DataFrame is in lazy mode."""
  return self._lazy

  @lazy.setter
  def lazy(self, exec_lazy: bool = False):
- """
- Set the laziness of the DataFrame.
+ """Sets the laziness of the DataFrame.

  Args:
- exec_lazy: Whether to make DataFrame lazy
+ exec_lazy: If True, converts the DataFrame to a LazyFrame. If False,
+ collects the data and converts it to an eager DataFrame.
  """
  if exec_lazy != self._lazy:
  if exec_lazy:
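
The setter toggles between the two Polars frame types; the two directions in plain Polars are just:

```python
import polars as pl

eager = pl.DataFrame({"x": [1, 2, 3]})
lazy = eager.lazy()           # eager -> lazy: defer execution
eager_again = lazy.collect()  # lazy -> eager: run the plan and materialize
print(type(lazy).__name__, type(eager_again).__name__)  # LazyFrame DataFrame
```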
@@ -1721,42 +2011,40 @@ class FlowDataEngine:

  @property
  def external_source(self) -> ExternalDataSource:
- """Get the external data source."""
+ """The external data source, if any."""
  return self._external_source

  @property
  def cols_idx(self) -> Dict[str, int]:
- """Get column index mapping."""
+ """A dictionary mapping column names to their integer index."""
  if self._col_idx is None:
  self._col_idx = {c: i for i, c in enumerate(self.columns)}
  return self._col_idx

  @property
  def __name__(self) -> str:
- """Get table name."""
+ """The name of the table."""
  return self.name

- # Schema and Column Operations
  def get_select_inputs(self) -> transform_schemas.SelectInputs:
- """
- Get select inputs for all columns.
+ """Gets `SelectInput` specifications for all columns in the current schema.

  Returns:
- SelectInputs: Input specifications for all columns
+ A `SelectInputs` object that can be used to configure selection or
+ transformation operations.
  """
  return transform_schemas.SelectInputs(
  [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
  )

  def select_columns(self, list_select: Union[List[str], Tuple[str], str]) -> "FlowDataEngine":
- """
- Select specific columns from the DataFrame.
+ """Selects a subset of columns from the DataFrame.

  Args:
- list_select: Columns to select
+ list_select: A list, tuple, or single string of column names to select.

  Returns:
- FlowDataEngine: New instance with selected columns
+ A new `FlowDataEngine` instance containing only the selected columns.
  """
  if isinstance(list_select, str):
  list_select = [list_select]
@@ -1773,14 +2061,13 @@ class FlowDataEngine:
  )

  def drop_columns(self, columns: List[str]) -> "FlowDataEngine":
- """
- Drop specified columns from the DataFrame.
+ """Drops specified columns from the DataFrame.

  Args:
- columns: Columns to drop
+ columns: A list of column names to drop.

  Returns:
- FlowDataEngine: New instance without dropped columns
+ A new `FlowDataEngine` instance without the dropped columns.
  """
  cols_for_select = tuple(set(self.columns) - set(columns))
  idx_to_keep = [self.cols_idx.get(c) for c in cols_for_select]
@@ -1793,14 +2080,13 @@ class FlowDataEngine:
  )

  def reorganize_order(self, column_order: List[str]) -> "FlowDataEngine":
- """
- Reorganize columns in specified order.
+ """Reorganizes columns into a specified order.

  Args:
- column_order: Desired column order
+ column_order: A list of column names in the desired order.

  Returns:
- FlowDataEngine: New instance with reordered columns
+ A new `FlowDataEngine` instance with the columns reordered.
  """
  df = self.data_frame.select(column_order)
  schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
@@ -1808,16 +2094,15 @@ class FlowDataEngine:

  def apply_flowfile_formula(self, func: str, col_name: str,
  output_data_type: pl.DataType = None) -> "FlowDataEngine":
- """
- Apply a formula to create a new column.
+ """Applies a formula to create a new column or transform an existing one.

  Args:
- func: Formula to apply
- col_name: Name for new column
- output_data_type: Data type for output
+ func: A string containing a Polars expression formula.
+ col_name: The name of the new or transformed column.
+ output_data_type: The desired Polars data type for the output column.

  Returns:
- FlowDataEngine: New instance with added column
+ A new `FlowDataEngine` instance with the applied formula.
  """
  parsed_func = to_expr(func)
  if output_data_type is not None:
@@ -1829,16 +2114,15 @@ class FlowDataEngine:

  def apply_sql_formula(self, func: str, col_name: str,
  output_data_type: pl.DataType = None) -> "FlowDataEngine":
- """
- Apply an SQL-style formula to create a new column.
+ """Applies an SQL-style formula using `pl.sql_expr`.

  Args:
- func: SQL formula to apply
- col_name: Name for new column
- output_data_type: Data type for output
+ func: A string containing an SQL expression.
+ col_name: The name of the new or transformed column.
+ output_data_type: The desired Polars data type for the output column.

  Returns:
- FlowDataEngine: New instance with added column
+ A new `FlowDataEngine` instance with the applied formula.
  """
  expr = to_expr(func)
  if output_data_type not in (None, "Auto"):
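
The new docstring pins `apply_sql_formula` to `pl.sql_expr`; a minimal sketch of that building block (column and alias names are invented, and the cast mirrors the optional `output_data_type` handling):

```python
import polars as pl

lf = pl.LazyFrame({"price": [10.0, 20.0], "qty": [3, 1]})

# pl.sql_expr turns an SQL fragment into a Polars expression.
revenue = pl.sql_expr("price * qty").cast(pl.Float64).alias("revenue")
print(lf.with_columns(revenue).collect())
```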
@@ -1850,16 +2134,18 @@ class FlowDataEngine:

  def output(self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str,
  execute_remote: bool = True) -> "FlowDataEngine":
- """
- Write DataFrame to output file.
+ """Writes the DataFrame to an output file.
+
+ Can execute the write operation locally or in a remote worker process.

  Args:
- output_fs: Output settings.
- flow_id: Flow ID for tracking.
- node_id: Node ID for tracking.
- execute_remote: If the output should be executed at the flowfile worker process.
+ output_fs: An `OutputSettings` object with details about the output file.
+ flow_id: The flow ID for tracking.
+ node_id: The node ID for tracking.
+ execute_remote: If True, executes the write in a worker process.
+
  Returns:
- FlowDataEngine: Self for chaining
+ The same `FlowDataEngine` instance for chaining.
  """
  logger.info('Starting to write output')
  if execute_remote:
@@ -1891,30 +2177,28 @@ class FlowDataEngine:
  logger.info("Finished writing output")
  return self

- # Data Operations
  def make_unique(self, unique_input: transform_schemas.UniqueInput = None) -> "FlowDataEngine":
- """
- Get unique rows based on specified columns.
+ """Gets the unique rows from the DataFrame.

  Args:
- unique_input: Unique operation parameters
+ unique_input: A `UniqueInput` object specifying a subset of columns
+ to consider for uniqueness and a strategy for keeping rows.

  Returns:
- FlowDataEngine: New instance with unique rows
+ A new `FlowDataEngine` instance with unique rows.
  """
  if unique_input is None or unique_input.columns is None:
  return FlowDataEngine(self.data_frame.unique())
  return FlowDataEngine(self.data_frame.unique(unique_input.columns, keep=unique_input.strategy))

  def concat(self, other: Iterable["FlowDataEngine"] | "FlowDataEngine") -> "FlowDataEngine":
- """
- Concatenate with other DataFrames.
+ """Concatenates this DataFrame with one or more other DataFrames.

  Args:
- other: DataFrames to concatenate
+ other: A single `FlowDataEngine` or an iterable of them.

  Returns:
- FlowDataEngine: Concatenated DataFrame
+ A new `FlowDataEngine` containing the concatenated data.
  """
  if isinstance(other, FlowDataEngine):
  other = [other]
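
`make_unique` and `concat` are thin wrappers over the corresponding Polars calls; a sketch with invented data (the `keep` strategy mirrors Polars' 'first'/'last'/'any'/'none' options):

```python
import polars as pl

a = pl.LazyFrame({"id": [1, 2], "val": ["x", "y"]})
b = pl.LazyFrame({"id": [2, 3], "val": ["y", "z"]})

combined = pl.concat([a, b])                            # stack the frames vertically
deduped = combined.unique(subset=["id"], keep="first")  # one row per id
print(deduped.collect().sort("id"))
```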
@@ -1924,15 +2208,15 @@ class FlowDataEngine:

  def do_select(self, select_inputs: transform_schemas.SelectInputs,
  keep_missing: bool = True) -> "FlowDataEngine":
- """
- Perform complex column selection and transformation.
+ """Performs a complex column selection, renaming, and reordering operation.

  Args:
- select_inputs: Selection specifications
- keep_missing: Whether to keep columns not specified
+ select_inputs: A `SelectInputs` object defining the desired transformations.
+ keep_missing: If True, columns not specified in `select_inputs` are kept.
+ If False, they are dropped.

  Returns:
- FlowDataEngine: New instance with selected/transformed columns
+ A new `FlowDataEngine` with the transformed selection.
  """
  new_schema = deepcopy(self.schema)
  renames = [r for r in select_inputs.renames if r.is_available]
@@ -1968,29 +2252,29 @@ class FlowDataEngine:
  output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
  return output_file.reorganize_order(sorted_cols)

- # Utility Methods
  def set_streamable(self, streamable: bool = False):
- """Set whether DataFrame operations should be streamable."""
+ """Sets whether DataFrame operations should be streamable."""
  self._streamable = streamable

  def _calculate_schema(self) -> List[Dict]:
- """Calculate schema statistics."""
+ """Calculates schema statistics."""
  if self.external_source is not None:
  self.collect_external()
  v = utils.calculate_schema(self.data_frame)
  return v

  def calculate_schema(self):
- """Calculate and return schema."""
+ """Calculates and returns the schema."""
  self._calculate_schema_stats = True
  return self.schema

  def count(self) -> int:
- """Get total number of records."""
+ """Gets the total number of records."""
  return self.get_number_of_records()

  @classmethod
  def create_from_path_worker(cls, received_table: input_schema.ReceivedTable, flow_id: int, node_id: int | str):
+ """Creates a FlowDataEngine from a path in a worker process."""
  received_table.set_absolute_filepath()
  external_fetcher = ExternalCreateFetcher(received_table=received_table,
  file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
@@ -1998,14 +2282,19 @@ class FlowDataEngine:


  def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
- """
- Execute arbitrary Polars code.
+ """Executes arbitrary Polars code on one or more FlowDataEngine objects.
+
+ This function takes a string of Python code that uses Polars and executes it.
+ Input `FlowDataEngine` objects are made available in the code's scope as
+ `input_df` (for a single input) or `input_df_1`, `input_df_2`, etc.

  Args:
- code: Polars code to execute
+ *flowfile_tables: A variable number of `FlowDataEngine` objects to be
+ used as input to the code.
+ code: A string containing the Polars code to execute.

  Returns:
- FlowDataEngine: Result of code execution
+ A new `FlowDataEngine` instance containing the result of the executed code.
  """
  polars_executable = polars_code_parser.get_executable(code, num_inputs=len(flowfile_tables))
  if len(flowfile_tables) == 0:
@@ -2017,5 +2306,4 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
  df = polars_executable(**kwargs)
  if isinstance(df, pl.DataFrame):
  logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
- return FlowDataEngine(df)
-
+ return FlowDataEngine(df)
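
To close, a rough sketch of how a code string with an `input_df` in scope can be turned into a result, in the spirit of the docstring above; this illustrates the mechanism only and is not Flowfile's `polars_code_parser`, which also handles multiple inputs and validation:

```python
import polars as pl

def run_polars_snippet(code: str, input_df: pl.LazyFrame) -> pl.LazyFrame:
    """Evaluate a single Polars expression string with `input_df` in scope."""
    namespace = {"pl": pl, "input_df": input_df}
    return eval(code, namespace)  # assumption: `code` is one expression, trusted input

lf = pl.LazyFrame({"x": [1, 2, 3, 4]})
result = run_polars_snippet("input_df.filter(pl.col('x') > 2)", lf)
print(result.collect())
```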