Flowfile 0.3.4.1__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (122)
  1. flowfile/__init__.py +3 -3
  2. flowfile/api.py +36 -15
  3. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  4. flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
  5. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  6. flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
  7. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  8. flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
  9. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
  10. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
  11. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
  12. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
  13. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
  14. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
  15. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
  16. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
  17. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
  18. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  19. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
  20. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
  21. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
  22. flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
  23. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  24. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
  25. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  26. flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
  27. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
  28. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
  29. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
  30. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
  31. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
  32. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
  33. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
  34. flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
  35. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
  36. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
  37. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
  38. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
  39. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
  40. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
  41. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
  42. flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
  43. flowfile/web/static/assets/api-fb67319c.js +80 -0
  44. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  45. flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
  46. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
  47. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
  48. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
  49. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
  50. flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
  51. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
  52. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
  53. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
  54. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
  55. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
  56. flowfile/web/static/index.html +1 -1
  57. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
  58. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/RECORD +109 -104
  59. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
  60. flowfile_core/__init__.py +2 -0
  61. flowfile_core/configs/node_store/nodes.py +8 -6
  62. flowfile_core/database/connection.py +63 -15
  63. flowfile_core/database/init_db.py +0 -1
  64. flowfile_core/database/models.py +49 -2
  65. flowfile_core/flowfile/code_generator/code_generator.py +402 -18
  66. flowfile_core/flowfile/connection_manager/models.py +1 -1
  67. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  68. flowfile_core/flowfile/extensions.py +1 -1
  69. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  70. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  71. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
  72. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  73. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  74. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  75. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  76. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  77. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  78. flowfile_core/flowfile/flow_graph.py +119 -82
  79. flowfile_core/flowfile/flow_node/flow_node.py +68 -33
  80. flowfile_core/flowfile/flow_node/models.py +32 -3
  81. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  82. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  83. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  84. flowfile_core/flowfile/utils.py +1 -23
  85. flowfile_core/main.py +3 -2
  86. flowfile_core/routes/cloud_connections.py +81 -0
  87. flowfile_core/routes/logs.py +0 -1
  88. flowfile_core/routes/routes.py +3 -39
  89. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  90. flowfile_core/schemas/input_schema.py +37 -15
  91. flowfile_core/schemas/schemas.py +7 -2
  92. flowfile_core/schemas/transform_schema.py +97 -22
  93. flowfile_core/utils/utils.py +40 -1
  94. flowfile_core/utils/validate_setup.py +41 -0
  95. flowfile_frame/flow_frame.py +253 -102
  96. flowfile_frame/flow_frame_methods.py +13 -13
  97. flowfile_worker/external_sources/s3_source/main.py +216 -0
  98. flowfile_worker/external_sources/s3_source/models.py +142 -0
  99. flowfile_worker/funcs.py +51 -6
  100. flowfile_worker/models.py +22 -2
  101. flowfile_worker/routes.py +40 -38
  102. flowfile_worker/utils.py +1 -1
  103. test_utils/s3/commands.py +46 -0
  104. test_utils/s3/data_generator.py +291 -0
  105. test_utils/s3/fixtures.py +209 -0
  106. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  107. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  108. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  109. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  110. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  111. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  112. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  114. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  115. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  116. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  117. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  118. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  119. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
  120. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
  121. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  122. {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
flowfile_core/flowfile/flow_data_engine/flow_data_engine.py

@@ -4,7 +4,7 @@ import os
 from copy import deepcopy
 from dataclasses import dataclass
 from math import ceil
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar
 
 # Third-party imports
 from loky import Future
@@ -16,25 +16,34 @@ from pyarrow.parquet import ParquetFile
 
 # Local imports - Core
 from flowfile_core.configs import logger
+from flowfile_core.utils.utils import ensure_similarity_dicts
 from flowfile_core.configs.flow_logger import NodeLogger
 from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
 from flowfile_core.schemas import (
+    cloud_storage_schemas,
     input_schema,
     transform_schema as transform_schemas
 )
 
 # Local imports - Flow File Components
 from flowfile_core.flowfile.flow_data_engine import utils
+from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (CloudStorageReader,
+                                                                          ensure_path_has_wildcard_pattern,
+                                                                          get_first_file_from_s3_dir)
 from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
     FlowfileColumn,
+    assert_if_flowfile_schema,
     convert_stats_to_column_info
 )
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
 from flowfile_core.flowfile.flow_data_engine.join import (
     verify_join_select_integrity,
-    verify_join_map_integrity
+    verify_join_map_integrity,
+    rename_df_table_for_join,
+    get_undo_rename_mapping_join,
+    get_col_name_to_delete
 )
 from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
 from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
@@ -52,6 +61,55 @@ from flowfile_core.flowfile.flow_data_engine.threaded_processes import (
 
 from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
 
+T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
+
+
+def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
+
+    def _construct_temp_name(column_name: str) -> str:
+        return "__FL_TEMP__"+column_name
+    if join_input.how == 'right':
+        left_df = left_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+                                       for jk in join_input.left_select.join_key_selects)
+        reverse_actions = {
+            _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
+            for jk in join_input.left_select.join_key_selects}
+    elif join_input.how in ('left', 'inner'):
+        right_df = right_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+                                         for jk in join_input.right_select.join_key_selects)
+        reverse_actions = {
+            _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
+            for jk in join_input.right_select.join_key_selects}
+    else:
+        reverse_actions = {}
+    return left_df, right_df, reverse_actions
+
+
+def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.JoinInput) -> None:
+    """
+    Updates the right columns of the join input by deselecting them.
+    Args:
+        join_input ():
+
+    Returns:
+        None
+    """
+    if join_input.how in ('semi', 'anti'):
+        for jk in join_input.right_select.renames:
+            jk.keep = False
+
+
+def get_select_columns(full_select_input: List[transform_schemas.SelectInput]) -> List[str]:
+    """
+    Gets the list of column names to select from the full select input.
+    It filters out columns that are not marked to keep or join keys, and only includes those that are available.
+    Args:
+        full_select_input (): List of SelectInput objects containing column information.
+
+    Returns:
+        List of column names to select.
+    """
+    return [v.old_name for v in full_select_input if (v.keep or v.join_key) and v.is_available]
+
 
 @dataclass
 class FlowDataEngine:
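
Example (not part of the diff): the temporary-rename idea that _handle_duplication_join_keys relies on, shown with plain Polars. Copying the right join key to a __FL_TEMP__ column before a left join keeps that key available after Polars coalesces the key columns, and it is renamed back afterwards. The frame contents and the right_id target name are illustrative only.

    import polars as pl

    left = pl.LazyFrame({"id": [1, 2, 3], "l_val": ["a", "b", "c"]})
    right = pl.LazyFrame({"id": [2, 3, 4], "r_val": ["x", "y", "z"]})

    # Keep a copy of the right join key under a temporary name so it survives the join.
    right_tmp = right.with_columns(pl.col("id").alias("__FL_TEMP__id"))
    joined = (
        left.join(right_tmp, left_on="id", right_on="id", how="left", suffix="")
        .rename({"__FL_TEMP__id": "right_id"})
    )
    print(joined.collect())
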
@@ -110,7 +168,7 @@ class FlowDataEngine:
     # flow_id: int = None # TODO: Implement flow_id
 
     def __init__(self,
-                 raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
+                 raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
                  path_ref: str = None,
                  name: str = None,
                  optimize_memory: bool = True,
@@ -129,7 +187,6 @@
             self._handle_path_ref(path_ref, optimize_memory)
         else:
             self.initialize_empty_fl()
-
         self._finalize_initialization(name, optimize_memory, schema, calculate_schema_stats)
 
     def _initialize_attributes(self, number_of_records_callback, data_callback, streamable):
@@ -172,6 +229,7 @@
         elif optimize_memory:
             self.number_of_records = -1
         else:
+            # TODO: assess whether this leads to slow downs with multi remote files
             self.number_of_records = lf.select(pl.len()).collect()[0, 0]
 
     def _handle_python_data(self, data: Union[List, Dict]):
@@ -187,12 +245,13 @@
             self.initialize_empty_fl()
         lengths = [len(v) if isinstance(v, (list, tuple)) else 1 for v in data.values()]
 
-        if len(set(lengths)) == 1 and lengths[0]>1:
+        if len(set(lengths)) == 1 and lengths[0] > 1:
             self.number_of_records = lengths[0]
             self.data_frame = pl.DataFrame(data)
         else:
             self.number_of_records = 1
             self.data_frame = pl.DataFrame([data])
+        self.lazy = True
 
     def _handle_raw_data_format(self, raw_data: input_schema.RawData):
         """Create a FlowDataEngine from a RawData object."""
@@ -226,13 +285,384 @@ class FlowDataEngine:
         if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
             try:
                 return pl.DataFrame(data).to_dicts()
-            except:
+            except TypeError:
                 raise Exception('Value must be able to be converted to dictionary')
+            except Exception as e:
+                raise Exception(f'Value must be able to be converted to dictionary: {e}')
 
         if not isinstance(data[0], dict):
             data = [row.__dict__ for row in data]
 
-        return utils.ensure_similarity_dicts(data)
+        return ensure_similarity_dicts(data)
+
+    def to_cloud_storage_obj(self, settings: cloud_storage_schemas.CloudStorageWriteSettingsInternal):
+        """
+        Write the FlowDataEngine's data to an object in cloud storage.
+
+        Supports writing to S3, Azure ADLS, and Google Cloud Storage. The 'overwrite'
+        write mode is supported. The 'append' mode is not yet implemented.
+
+        Args:
+            settings: Cloud storage write settings with connection details and write options.
+
+        Raises:
+            ValueError: If file format is not supported.
+            NotImplementedError: If the 'append' write mode is used.
+            Exception: If writing to cloud storage fails.
+        """
+        connection = settings.connection
+        write_settings = settings.write_settings
+
+        logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")
+
+        if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
+            raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
+
+        storage_options = CloudStorageReader.get_storage_options(connection)
+        credential_provider = CloudStorageReader.get_credential_provider(connection)
+        # Dispatch to the correct writer based on file format
+        if write_settings.file_format == "parquet":
+            self._write_parquet_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        elif write_settings.file_format == "delta":
+            self._write_delta_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        elif write_settings.file_format == "csv":
+            self._write_csv_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        elif write_settings.file_format == "json":
+            self._write_json_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        else:
+            raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")
+
+        logger.info(f"Successfully wrote data to {write_settings.resource_path}")
+
+    def _write_parquet_to_cloud(self,
+                                resource_path: str,
+                                storage_options: Dict[str, Any],
+                                credential_provider: Optional[Callable],
+                                write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """Write LazyFrame to a Parquet file in cloud storage."""
+        try:
+            sink_kwargs = {
+                "path": resource_path,
+                "compression": write_settings.parquet_compression,
+            }
+            if storage_options:
+                sink_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                sink_kwargs["credential_provider"] = credential_provider
+            try:
+                self.data_frame.sink_parquet(**sink_kwargs)
+            except:
+                pl_df = self.collect()
+                sink_kwargs['file'] = sink_kwargs.pop("path")
+                pl_df.write_parquet(**sink_kwargs)
+
+        except Exception as e:
+            logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
+            raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")
+
+    def _write_delta_to_cloud(self,
+                              resource_path: str,
+                              storage_options: Dict[str, Any],
+                              credential_provider: Optional[Callable],
+                              write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        sink_kwargs = {
+            "target": resource_path,
+            "mode": write_settings.write_mode,
+        }
+        if storage_options:
+            sink_kwargs["storage_options"] = storage_options
+        if credential_provider:
+            sink_kwargs["credential_provider"] = credential_provider
+        self.collect().write_delta(**sink_kwargs)
+
+    def _write_csv_to_cloud(self,
+                            resource_path: str,
+                            storage_options: Dict[str, Any],
+                            credential_provider: Optional[Callable],
+                            write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """Write LazyFrame to a CSV file in cloud storage."""
+        try:
+            sink_kwargs = {
+                "path": resource_path,
+                "separator": write_settings.csv_delimiter,
+            }
+            if storage_options:
+                sink_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                sink_kwargs["credential_provider"] = credential_provider
+
+            # sink_csv executes the lazy query and writes the result
+            self.data_frame.sink_csv(**sink_kwargs)
+
+        except Exception as e:
+            logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
+            raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")
+
+    def _write_json_to_cloud(self,
+                             resource_path: str,
+                             storage_options: Dict[str, Any],
+                             credential_provider: Optional[Callable],
+                             write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """Write LazyFrame to a line-delimited JSON (NDJSON) file in cloud storage."""
+        try:
+            sink_kwargs = {"path": resource_path}
+            if storage_options:
+                sink_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                sink_kwargs["credential_provider"] = credential_provider
+            self.data_frame.sink_ndjson(**sink_kwargs)
+
+        except Exception as e:
+            logger.error(f"Failed to write JSON to {resource_path}: {str(e)}")
+            raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")
+
+    @classmethod
+    def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal):
+        """
+        Create a FlowDataEngine from an object in cloud storage.
+
+        Supports reading from S3, Azure ADLS, and Google Cloud Storage with various
+        authentication methods including access keys, IAM roles, and CLI credentials.
+
+        Args:
+            settings: Cloud storage read settings with connection details and read options
+
+        Returns:
+            FlowDataEngine: New instance with data from cloud storage
+
+        Raises:
+            ValueError: If storage type or file format is not supported
+            Exception: If reading from cloud storage fails
+        """
+        connection = settings.connection
+        read_settings = settings.read_settings
+
+        logger.info(f"Reading from {connection.storage_type} storage: {read_settings.resource_path}")
+        # Get storage options based on connection type
+        storage_options = CloudStorageReader.get_storage_options(connection)
+        # Get credential provider if needed
+        credential_provider = CloudStorageReader.get_credential_provider(connection)
+        if read_settings.file_format == "parquet":
+            return cls._read_parquet_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings.scan_mode == "directory",
+            )
+        elif read_settings.file_format == "delta":
+            return cls._read_delta_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings
+            )
+        elif read_settings.file_format == "csv":
+            return cls._read_csv_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings
+            )
+        elif read_settings.file_format == "json":
+            return cls._read_json_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings.scan_mode == "directory"
+            )
+        elif read_settings.file_format == "iceberg":
+            return cls._read_iceberg_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings
+            )
+
+        elif read_settings.file_format in ["delta", "iceberg"]:
+            # These would require additional libraries
+            raise NotImplementedError(f"File format {read_settings.file_format} not yet implemented")
+        else:
+            raise ValueError(f"Unsupported file format: {read_settings.file_format}")
+
+    @staticmethod
+    def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any]) -> List[FlowfileColumn] | None:
+        try:
+            first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
+            return convert_stats_to_column_info(FlowDataEngine._create_schema_stats_from_pl_schema(
+                pl.scan_parquet(first_file_ref, storage_options=storage_options).collect_schema()))
+        except Exception as e:
+            logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")
+
+
+    @classmethod
+    def _read_iceberg_from_cloud(cls,
+                                 resource_path: str,
+                                 storage_options: Dict[str, Any],
+                                 credential_provider: Optional[Callable],
+                                 read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+        """Read Iceberg table(s) from cloud storage."""
+        raise NotImplementedError(f"Failed to read Iceberg table from cloud storage: Not yet implemented")
+
+    @classmethod
+    def _read_parquet_from_cloud(cls,
+                                 resource_path: str,
+                                 storage_options: Dict[str, Any],
+                                 credential_provider: Optional[Callable],
+                                 is_directory: bool) -> "FlowDataEngine":
+        """Read Parquet file(s) from cloud storage."""
+        try:
+            # Use scan_parquet for lazy evaluation
+            if is_directory:
+                resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="parquet")
+            scan_kwargs = {"source": resource_path}
+
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+            if storage_options and is_directory:
+                schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
+            else:
+                schema = None
+            lf = pl.scan_parquet(**scan_kwargs)
+
+            return cls(
+                lf,
+                number_of_records=6_666_666,  # Set to 6666666 so that the provider is not accessed for this stat
+                optimize_memory=True,
+                streamable=True,
+                schema=schema
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to read Parquet from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")
+
+    @classmethod
+    def _read_delta_from_cloud(cls,
+                               resource_path: str,
+                               storage_options: Dict[str, Any],
+                               credential_provider: Optional[Callable],
+                               read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+        try:
+            logger.info("Reading Delta file from cloud storage...")
+            logger.info(f"read_settings: {read_settings}")
+            scan_kwargs = {"source": resource_path}
+            if read_settings.delta_version:
+                scan_kwargs['version'] = read_settings.delta_version
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+            lf = pl.scan_delta(**scan_kwargs)
+
+            return cls(
+                lf,
+                number_of_records=6_666_666,  # Set to 6666666 so that the provider is not accessed for this stat
+                optimize_memory=True,
+                streamable=True
+            )
+        except Exception as e:
+            logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")
+
+    @classmethod
+    def _read_csv_from_cloud(cls,
+                             resource_path: str,
+                             storage_options: Dict[str, Any],
+                             credential_provider: Optional[Callable],
+                             read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+        """Read CSV file(s) from cloud storage."""
+        try:
+            scan_kwargs = {
+                "source": resource_path,
+                "has_header": read_settings.csv_has_header,
+                "separator": read_settings.csv_delimiter,
+                "encoding": read_settings.csv_encoding,
+            }
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+
+            if read_settings.scan_mode == "directory":
+                resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="csv")
+                scan_kwargs["source"] = resource_path
+            if storage_options and read_settings.scan_mode == "directory":
+                schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
+            else:
+                schema = None
+
+            lf = pl.scan_csv(**scan_kwargs)
+
+            return cls(
+                lf,
+                number_of_records=6_666_666,  # Will be calculated lazily
+                optimize_memory=True,
+                streamable=True,
+                schema=schema
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to read CSV from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")
+
+    @classmethod
+    def _read_json_from_cloud(cls,
+                              resource_path: str,
+                              storage_options: Dict[str, Any],
+                              credential_provider: Optional[Callable],
+                              is_directory: bool) -> "FlowDataEngine":
+        """Read JSON file(s) from cloud storage."""
+        try:
+            scan_kwargs = {"source": resource_path}
+
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+
+            if is_directory:
+                resource_path = ensure_path_has_wildcard_pattern(resource_path, "json")
+            if storage_options and is_directory:
+                schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
+            else:
+                schema = None
+
+            lf = pl.scan_ndjson(**scan_kwargs)  # Using NDJSON for line-delimited JSON
+
+            return cls(
+                lf,
+                number_of_records=-1,
+                optimize_memory=True,
+                streamable=True,
+                schema=schema
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to read JSON from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read JSON from cloud storage: {str(e)}")
 
     def _handle_path_ref(self, path_ref: str, optimize_memory: bool):
         """Handle file path reference input."""
@@ -255,16 +685,20 @@
         _ = calculate_schema_stats
         self.name = name
         self._optimize_memory = optimize_memory
-        pl_schema = self.data_frame.collect_schema()
-        self._schema = self._handle_schema(schema, pl_schema)
-        self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()
+        if assert_if_flowfile_schema(schema):
+            self._schema = schema
+            self.columns = [c.column_name for c in self._schema]
+        else:
+            pl_schema = self.data_frame.collect_schema()
+            self._schema = self._handle_schema(schema, pl_schema)
+            self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()
 
     def __getitem__(self, item):
         """Access a specific column or item from the DataFrame."""
         return self.data_frame.select([item])
 
     @property
-    def data_frame(self) -> pl.LazyFrame | pl.DataFrame:
+    def data_frame(self) -> pl.LazyFrame | pl.DataFrame | None:
         """Get the underlying DataFrame with appropriate handling of different states."""
         if self._data_frame is not None and not self.is_future:
             return self._data_frame
@@ -289,6 +723,16 @@
             raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
         self._data_frame = df
 
+    @staticmethod
+    def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> List[Dict]:
+        return [
+            dict(column_name=k, pl_datatype=v, col_index=i)
+            for i, (k, v) in enumerate(pl_schema.items())
+        ]
+
+    def _add_schema_from_schema_stats(self, schema_stats: List[Dict]):
+        self._schema = convert_stats_to_column_info(schema_stats)
+
     @property
     def schema(self) -> List[FlowfileColumn]:
         """Get the schema of the DataFrame, calculating if necessary."""
@@ -299,11 +743,8 @@
             schema_stats = self._calculate_schema()
             self.ind_schema_calculated = True
         else:
-            schema_stats = [
-                dict(column_name=k, pl_datatype=v, col_index=i)
-                for i, (k, v) in enumerate(self.data_frame.collect_schema().items())
-            ]
-        self._schema = convert_stats_to_column_info(schema_stats)
+            schema_stats = self._create_schema_stats_from_pl_schema(self.data_frame.collect_schema())
+        self._add_schema_from_schema_stats(schema_stats)
         return self._schema
 
     @property
@@ -338,6 +779,7 @@
     def _collect_data(self, n_records: int = None) -> pl.DataFrame:
         """Internal method to handle data collection."""
         if n_records is None:
+
             self.collect_external()
             if self._streamable:
                 try:
@@ -353,7 +795,7 @@
             return self._collect_from_external_source(n_records)
 
         if self._streamable:
-            return self.data_frame.head(n_records).collect(engine="streaming", comm_subplan_elim=False)
+            return self.data_frame.head(n_records).collect(engine="streaming")
         return self.data_frame.head(n_records).collect()
 
     def _collect_from_external_source(self, n_records: int) -> pl.DataFrame:
@@ -480,8 +922,17 @@
             return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
         return self.data_frame.to_dicts()
 
+    def to_raw_data(self) -> input_schema.RawData:
+        """Convert the DataFrame to a list of values."""
+        columns = [c.get_minimal_field_info() for c in self.schema]
+        data = list(self.to_dict().values())
+        return input_schema.RawData(columns=columns, data=data)
+
     def to_dict(self) -> Dict[str, List]:
-        return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
+        if self.lazy:
+            return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
+        else:
+            return self.data_frame.to_dict(as_series=False)
 
     @classmethod
     def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
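
Example (not part of the diff): a small round trip through the new to_raw_data helper and the branching to_dict, assuming a FlowDataEngine built directly from a dict of columns, which the widened raw_data type hint above allows.

    from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine

    engine = FlowDataEngine({"id": [1, 2, 3], "name": ["a", "b", "c"]})
    as_dict = engine.to_dict()    # {"id": [1, 2, 3], "name": ["a", "b", "c"]}
    raw = engine.to_raw_data()    # input_schema.RawData with column info plus list-of-lists data
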
@@ -514,7 +965,6 @@
     def create_from_path(cls, received_table: input_schema.ReceivedTableBase) -> "FlowDataEngine":
         """Create a FlowDataEngine from a file path."""
         received_table.set_absolute_filepath()
-
         file_type_handlers = {
             'csv': create_funcs.create_from_path_csv,
             'parquet': create_funcs.create_from_path_parquet,
@@ -541,25 +991,26 @@
             length = 10_000_000
         return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
 
-    # Schema Handling Methods
-
-    def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema,
+    def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema | None,
                        pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
         """Handle schema processing and validation."""
-        if schema is None:
+        if schema is None and pl_schema is not None:
+            return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
+        elif schema is None and pl_schema is None:
             return None
-
-        if schema.__len__() != pl_schema.__len__():
-            raise Exception(
-                f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
-
-        if isinstance(schema, pl.Schema):
-            return self._handle_polars_schema(schema, pl_schema)
-        elif isinstance(schema, list) and len(schema) == 0:
-            return []
-        elif isinstance(schema[0], str):
-            return self._handle_string_schema(schema, pl_schema)
-        return schema
+        elif assert_if_flowfile_schema(schema) and pl_schema is None:
+            return schema
+        elif pl_schema is not None and schema is not None:
+            if schema.__len__() != pl_schema.__len__():
+                raise Exception(
+                    f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
+            if isinstance(schema, pl.Schema):
+                return self._handle_polars_schema(schema, pl_schema)
+            elif isinstance(schema, list) and len(schema) == 0:
+                return []
+            elif isinstance(schema[0], str):
+                return self._handle_string_schema(schema, pl_schema)
+            return schema
 
     def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
         """Handle Polars schema conversion."""
@@ -847,7 +1298,6 @@
         """
         n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=True))
         logging.info(f'Getting sample of {n_rows} rows')
-
        if random:
            if self.lazy and self.external_source is not None:
                self.collect_external()
@@ -1049,21 +1499,15 @@
        Raises:
            Exception: If join would result in too many records or is invalid
        """
-        # self.lazy = False if join_input.how == 'right' else True
-        # other.lazy = False if join_input.how == 'right' else True
-
+        ensure_right_unselect_for_semi_and_anti_joins(join_input)
         verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
         if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
             raise Exception('Join is not valid by the data fields')
+
         if auto_generate_selection:
             join_input.auto_rename()
-
-        right_select = [v.old_name for v in join_input.right_select.renames
-                        if (v.keep or v.join_key) and v.is_available]
-        left_select = [v.old_name for v in join_input.left_select.renames
-                       if (v.keep or v.join_key) and v.is_available]
-        left = self.data_frame.select(left_select).rename(join_input.left_select.rename_table)
-        right = other.data_frame.select(right_select).rename(join_input.right_select.rename_table)
+        left = self.data_frame.select(get_select_columns(join_input.left_select.renames)).rename(join_input.left_select.rename_table)
+        right = other.data_frame.select(get_select_columns(join_input.right_select.renames)).rename(join_input.right_select.rename_table)
 
         if verify_integrity and join_input.how != 'right':
             n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
@@ -1072,25 +1516,42 @@
                 raise Exception("Join will result in too many records, ending process")
         else:
             n_records = -1
+        left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_input)
+        left, right = rename_df_table_for_join(left, right, join_input.get_join_key_renames())
         if join_input.how == 'right':
-            # Default to left join since right join can give panic issues in execution plan downstream
-            joined_df = right.join(left, left_on=join_input.right_join_keys,
-                                   right_on=join_input.left_join_keys, how="left", suffix="")
+            joined_df = right.join(
+                other=left,
+                left_on=join_input.right_join_keys,
+                right_on=join_input.left_join_keys,
+                how="left",
+                suffix="").rename(reverse_join_key_mapping)
         else:
-            joined_df = left.join(right, left_on=join_input.left_join_keys,
-                                  right_on=join_input.right_join_keys,
-                                  how=join_input.how, suffix="")
-        cols_to_delete_after = [col.new_name for col in
-                                join_input.left_select.renames + join_input.left_select.renames
-                                if col.join_key and not col.keep and col.is_available]
-        if len(cols_to_delete_after) > 0:
-            joined_df = joined_df.drop(cols_to_delete_after)
+            joined_df = left.join(
+                other=right,
+                left_on=join_input.left_join_keys,
+                right_on=join_input.right_join_keys,
+                how=join_input.how,
+                suffix="").rename(reverse_join_key_mapping)
+        left_cols_to_delete_after = [get_col_name_to_delete(col, 'left') for col in join_input.left_select.renames
+                                     if not col.keep
+                                     and col.is_available and col.join_key
+                                     ]
+        right_cols_to_delete_after = [get_col_name_to_delete(col, 'right') for col in join_input.right_select.renames
+                                      if not col.keep
+                                      and col.is_available and col.join_key
+                                      and join_input.how in ("left", "right", "inner", "cross", "outer")
+                                      ]
+        if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
+            joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
+        undo_join_key_remapping = get_undo_rename_mapping_join(join_input)
+        joined_df = joined_df.rename(undo_join_key_remapping)
+
         if verify_integrity:
             return FlowDataEngine(joined_df, calculate_schema_stats=True,
-                                  number_of_records=n_records, streamable=False)
+                                   number_of_records=n_records, streamable=False)
         else:
             fl = FlowDataEngine(joined_df, calculate_schema_stats=False,
-                                number_of_records=0, streamable=False)
+                                 number_of_records=0, streamable=False)
             return fl
 
     # Graph Operations
@@ -1152,6 +1613,7 @@
         other.lazy = False
         self.number_of_records = -1
         other.number_of_records = -1
+        other = other.select_columns(self.columns)
 
         if self.get_number_of_records() != other.get_number_of_records():
             raise Exception('Number of records is not equal')
@@ -1556,3 +2018,4 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
     if isinstance(df, pl.DataFrame):
         logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
     return FlowDataEngine(df)
+