Flowfile 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic. Click here for more details.

Files changed (121)
  1. flowfile/__init__.py +3 -3
  2. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  3. flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
  4. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  5. flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
  6. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  7. flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
  8. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
  9. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
  11. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
  12. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
  13. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
  14. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
  15. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
  16. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
  17. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  18. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
  19. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
  20. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
  21. flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
  22. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  23. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
  24. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  25. flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
  26. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
  27. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
  28. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
  29. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
  30. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
  31. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
  32. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
  33. flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
  34. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
  35. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
  36. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
  37. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
  38. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
  39. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
  40. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
  41. flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
  42. flowfile/web/static/assets/api-fb67319c.js +80 -0
  43. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  44. flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
  45. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
  46. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
  47. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
  48. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
  49. flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
  50. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
  51. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
  52. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
  53. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
  54. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
  55. flowfile/web/static/index.html +1 -1
  56. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
  57. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/RECORD +108 -103
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
  59. flowfile_core/__init__.py +2 -0
  60. flowfile_core/configs/node_store/nodes.py +8 -6
  61. flowfile_core/database/connection.py +63 -15
  62. flowfile_core/database/init_db.py +0 -1
  63. flowfile_core/database/models.py +49 -2
  64. flowfile_core/flowfile/code_generator/code_generator.py +401 -17
  65. flowfile_core/flowfile/connection_manager/models.py +1 -1
  66. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  67. flowfile_core/flowfile/extensions.py +1 -1
  68. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  69. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  70. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
  71. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  72. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  73. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  74. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  75. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  76. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  77. flowfile_core/flowfile/flow_graph.py +119 -82
  78. flowfile_core/flowfile/flow_node/flow_node.py +68 -33
  79. flowfile_core/flowfile/flow_node/models.py +32 -3
  80. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  81. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  82. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  83. flowfile_core/flowfile/utils.py +1 -23
  84. flowfile_core/main.py +3 -2
  85. flowfile_core/routes/cloud_connections.py +81 -0
  86. flowfile_core/routes/logs.py +0 -1
  87. flowfile_core/routes/routes.py +3 -39
  88. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  89. flowfile_core/schemas/input_schema.py +37 -15
  90. flowfile_core/schemas/schemas.py +7 -2
  91. flowfile_core/schemas/transform_schema.py +97 -22
  92. flowfile_core/utils/utils.py +40 -1
  93. flowfile_core/utils/validate_setup.py +41 -0
  94. flowfile_frame/flow_frame.py +253 -102
  95. flowfile_frame/flow_frame_methods.py +13 -13
  96. flowfile_worker/external_sources/s3_source/main.py +216 -0
  97. flowfile_worker/external_sources/s3_source/models.py +142 -0
  98. flowfile_worker/funcs.py +51 -6
  99. flowfile_worker/models.py +22 -2
  100. flowfile_worker/routes.py +40 -38
  101. flowfile_worker/utils.py +1 -1
  102. test_utils/s3/commands.py +46 -0
  103. test_utils/s3/data_generator.py +291 -0
  104. test_utils/s3/fixtures.py +209 -0
  105. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  106. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  107. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  108. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  109. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  110. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  111. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  112. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  113. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  114. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  115. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  116. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  117. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  118. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
  119. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
  120. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  121. {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
@@ -1,9 +1,10 @@
1
+
1
2
  from dataclasses import dataclass
2
- from typing import Optional, Any, List, Dict, Literal
3
+ from typing import Optional, Any, List, Dict, Literal, Iterable
4
+
3
5
  from flowfile_core.schemas import input_schema
4
6
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
5
7
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
6
- from polars import datatypes
7
8
  import polars as pl
8
9
  # TODO: rename flow_file_column to flowfile_column
9
10
  DataTypeGroup = Literal['numeric', 'str', 'date']
@@ -175,3 +176,12 @@ def convert_stats_to_column_info(stats: List[Dict]) -> List[FlowfileColumn]:
175
176
  def convert_pl_schema_to_raw_data_format(pl_schema: pl.Schema) -> List[input_schema.MinimalFieldInfo]:
176
177
  return [FlowfileColumn.create_from_polars_type(PlType(column_name=k, pl_datatype=v)).get_minimal_field_info()
177
178
  for k, v in pl_schema.items()]
179
+
180
+
181
+ def assert_if_flowfile_schema(obj: Iterable) -> bool:
182
+ """
183
+ Assert that the object is a valid iterable of FlowfileColumn objects.
184
+ """
185
+ if isinstance(obj, (list, set, tuple)):
186
+ return all(isinstance(item, FlowfileColumn) for item in obj)
187
+ return False
@@ -32,7 +32,7 @@ def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
32
32
  output_schema.append(FlowfileColumn.from_input(column.new_name, column_schema.data_type,
33
33
  example_values=column_schema.example_values))
34
34
 
35
- for i, fm in enumerate(fm_input.join_mappings):
35
+ for i, fm in enumerate(fm_input.join_mapping):
36
36
  output_schema.append(FlowfileColumn.from_input(f'fuzzy_score_{i}', 'Float64'))
37
37
  return output_schema
38
38
 
@@ -1 +1,2 @@
1
- from flowfile_core.flowfile.flow_data_engine.join.verify_integrity import *
1
+ from flowfile_core.flowfile.flow_data_engine.join.verify_integrity import *
2
+ from flowfile_core.flowfile.flow_data_engine.join.utils import *
@@ -0,0 +1,25 @@
1
+ # Standard library imports
2
+ from typing import Dict, Tuple, TypeVar
3
+
4
+ # Third-party imports
5
+ import polars as pl
6
+
7
+ from flowfile_core.schemas import (
8
+ transform_schema as transform_schemas
9
+ )
10
+
11
+ T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
12
+
13
+
14
+ def rename_df_table_for_join(left_df: T, right_df: T, join_key_rename: transform_schemas.FullJoinKeyResponse) -> Tuple[T, T]:
15
+ return (left_df.rename({r[0]: r[1] for r in join_key_rename.left.join_key_renames}),
16
+ right_df.rename({r[0]: r[1] for r in join_key_rename.right.join_key_renames}))
17
+
18
+
19
+ def get_undo_rename_mapping_join(join_input: transform_schemas.JoinInput) -> Dict[str, str]:
20
+ join_key_rename = join_input.get_join_key_renames(True)
21
+ return {r[1]: r[0] for r in join_key_rename.right.join_key_renames + join_key_rename.left.join_key_renames}
22
+
23
+
24
+ def get_col_name_to_delete(col: transform_schemas.SelectInput, side: transform_schemas.SideLit):
25
+ return col.new_name if not col.join_key else transform_schemas.construct_join_key_name(side, col.new_name)
@@ -18,9 +18,9 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.models import
18
18
  PolarsOperation,
19
19
  Status
20
20
  )
21
- from flowfile_core.flowfile.sources.external_sources.airbyte_sources.models import AirbyteSettings
22
21
  from flowfile_core.flowfile.sources.external_sources.sql_source.models import (DatabaseExternalReadSettings,
23
22
  DatabaseExternalWriteSettings)
23
+ from flowfile_core.schemas.cloud_storage_schemas import CloudStorageWriteSettingsWorkerInterface
24
24
  from flowfile_core.schemas.input_schema import (
25
25
  ReceivedCsvTable,
26
26
  ReceivedExcelTable,
@@ -81,13 +81,6 @@ def trigger_create_operation(flow_id: int, node_id: int | str, received_table: R
81
81
  return Status(**f.json())
82
82
 
83
83
 
84
- def trigger_airbyte_collector(airbyte_settings: AirbyteSettings):
85
- f = requests.post(url=f'{WORKER_URL}/store_airbyte_result', data=airbyte_settings.model_dump_json())
86
- if not f.ok:
87
- raise Exception(f'Could not cache the data, {f.text}')
88
- return Status(**f.json())
89
-
90
-
91
84
  def trigger_database_read_collector(database_external_read_settings: DatabaseExternalReadSettings):
92
85
  f = requests.post(url=f'{WORKER_URL}/store_database_read_result',
93
86
  data=database_external_read_settings.model_dump_json())
@@ -104,6 +97,14 @@ def trigger_database_write(database_external_write_settings: DatabaseExternalWri
104
97
  return Status(**f.json())
105
98
 
106
99
 
100
+ def trigger_cloud_storage_write(database_external_write_settings: CloudStorageWriteSettingsWorkerInterface):
101
+ f = requests.post(url=f'{WORKER_URL}/write_data_to_cloud',
102
+ data=database_external_write_settings.model_dump_json())
103
+ if not f.ok:
104
+ raise Exception(f'Could not cache the data, {f.text}')
105
+ return Status(**f.json())
106
+
107
+
107
108
  def get_results(file_ref: str) -> Status | None:
108
109
  f = requests.get(f'{WORKER_URL}/status/{file_ref}')
109
110
  if f.status_code == 200:
@@ -113,11 +114,15 @@ def get_results(file_ref: str) -> Status | None:
113
114
 
114
115
 
115
116
  def results_exists(file_ref: str):
116
- f = requests.get(f'{WORKER_URL}/status/{file_ref}')
117
- if f.status_code == 200:
118
- if f.json()['status'] == 'Completed':
119
- return True
120
- return False
117
+ try:
118
+ f = requests.get(f'{WORKER_URL}/status/{file_ref}')
119
+ if f.status_code == 200:
120
+ if f.json()['status'] == 'Completed':
121
+ return True
122
+ return False
123
+ except requests.RequestException as e:
124
+ logger.error(f"Failed to check results existence: {str(e)}")
125
+ return False
121
126
 
122
127
 
123
128
  def get_df_result(encoded_df: str) -> pl.LazyFrame:
@@ -336,15 +341,6 @@ class ExternalCreateFetcher(BaseFetcher):
336
341
  _ = self.get_result()
337
342
 
338
343
 
339
- class ExternalAirbyteFetcher(BaseFetcher):
340
- def __init__(self, airbyte_settings: AirbyteSettings, wait_on_completion: bool = True):
341
- r = trigger_airbyte_collector(airbyte_settings)
342
- super().__init__(file_ref=r.background_task_id)
343
- self.running = r.status == 'Processing'
344
- if wait_on_completion:
345
- _ = self.get_result()
346
-
347
-
348
344
  class ExternalDatabaseFetcher(BaseFetcher):
349
345
  def __init__(self, database_external_read_settings: DatabaseExternalReadSettings,
350
346
  wait_on_completion: bool = True):
@@ -365,6 +361,17 @@ class ExternalDatabaseWriter(BaseFetcher):
365
361
  _ = self.get_result()
366
362
 
367
363
 
364
+ class ExternalCloudWriter(BaseFetcher):
365
+
366
+ def __init__(self, cloud_storage_write_settings: CloudStorageWriteSettingsWorkerInterface,
367
+ wait_on_completion: bool = True):
368
+ r = trigger_cloud_storage_write(database_external_write_settings=cloud_storage_write_settings)
369
+ super().__init__(file_ref=r.background_task_id)
370
+ self.running = r.status == 'Processing'
371
+ if wait_on_completion:
372
+ _ = self.get_result()
373
+
374
+
368
375
  class ExternalExecutorTracker:
369
376
  result: Optional[pl.LazyFrame]
370
377
  started: bool = False
@@ -3,30 +3,13 @@ from flowfile_core.configs.settings import AVAILABLE_RAM, WORKER_URL
3
3
  from flowfile_core.configs import logger
4
4
  from flowfile_core.flowfile.flow_data_engine.subprocess_operations import ExternalDfFetcher
5
5
  from flowfile_core.flowfile.flow_data_engine.subprocess_operations import Status
6
+ from flowfile_core.utils.utils import standardize_col_dtype
6
7
  import os
7
8
  from typing import List, Dict, Iterable, Callable, Any
8
- from itertools import chain
9
9
  import requests
10
10
  from base64 import encodebytes
11
11
 
12
12
 
13
- def convert_to_string(v):
14
- try:
15
- return str(v)
16
- except:
17
- return None
18
-
19
-
20
- def standardize_col_dtype(vals):
21
- types = set(type(val) for val in vals)
22
- if len(types) == 1:
23
- return vals
24
- elif int in types and float in types:
25
- return vals
26
- else:
27
- return [convert_to_string(v) for v in vals]
28
-
29
-
30
13
  def get_data_type(vals: Iterable[Any]):
31
14
  types = set(type(val) for val in vals)
32
15
  if len(types) == 1:
@@ -37,28 +20,6 @@ def get_data_type(vals: Iterable[Any]):
37
20
  return 'str'
38
21
 
39
22
 
40
- def ensure_similarity_dicts(datas: List[Dict], respect_order: bool = True):
41
- all_cols = (data.keys() for data in datas)
42
- if not respect_order:
43
- unique_cols = set(chain(*all_cols))
44
- else:
45
- col_store = set()
46
- unique_cols = list()
47
- for row in all_cols:
48
- for col in row:
49
- if col not in col_store:
50
- unique_cols.append(col)
51
- col_store.update((col,))
52
- output = []
53
- for data in datas:
54
- new_record = dict()
55
- for col in unique_cols:
56
- val = data.get(col)
57
- new_record[col] = val
58
- output.append(new_record)
59
- return output
60
-
61
-
62
23
  def calculate_schema(lf: pl.LazyFrame) -> List[Dict]:
63
24
  r = ExternalDfFetcher(lf=lf, operation_type='calculate_schema', wait_on_completion=False, flow_id=-1, node_id=-1)
64
25
  schema_stats: List[Dict] = r.get_result()
@@ -13,10 +13,10 @@ from pyarrow.parquet import ParquetFile
13
13
  from flowfile_core.configs import logger
14
14
  from flowfile_core.configs.flow_logger import FlowLogger
15
15
  from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
16
- from flowfile_core.flowfile.sources.external_sources.airbyte_sources.settings import airbyte_settings_from_config
17
16
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
18
17
  from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.settings_validator import (calculate_fuzzy_match_schema,
19
18
  pre_calculate_pivot_schema)
19
+ from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import CloudStorageReader
20
20
  from flowfile_core.utils.arrow_reader import get_read_top_n
21
21
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine, execute_polars_code
22
22
  from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_xlsx_datatypes, \
@@ -24,19 +24,22 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_x
24
24
  from flowfile_core.flowfile.sources import external_sources
25
25
  from flowfile_core.schemas import input_schema, schemas, transform_schema
26
26
  from flowfile_core.schemas.output_model import TableExample, NodeData, NodeResult, RunInformation
27
- from flowfile_core.flowfile.utils import snake_case_to_camel_case, _handle_raw_data
27
+ from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal, FullCloudStorageConnection,
28
+ get_cloud_storage_write_settings_worker_interface, AuthMethod)
29
+ from flowfile_core.flowfile.utils import snake_case_to_camel_case
28
30
  from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
29
31
  from flowfile_core.flowfile.flow_node.flow_node import FlowNode
30
32
  from flowfile_core.flowfile.util.execution_orderer import determine_execution_order
31
33
  from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
32
- from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalAirbyteFetcher,
33
- ExternalDatabaseFetcher,
34
+ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalDatabaseFetcher,
34
35
  ExternalDatabaseWriter,
35
- ExternalDfFetcher)
36
+ ExternalDfFetcher,
37
+ ExternalCloudWriter)
36
38
  from flowfile_core.secret_manager.secret_manager import get_encrypted_secret, decrypt_secret
37
39
  from flowfile_core.flowfile.sources.external_sources.sql_source import utils as sql_utils, models as sql_models
38
40
  from flowfile_core.flowfile.sources.external_sources.sql_source.sql_source import SqlSource, BaseSqlSource
39
- from flowfile_core.flowfile.database_connection_manager.db_connections import get_local_database_connection
41
+ from flowfile_core.flowfile.database_connection_manager.db_connections import (get_local_database_connection,
42
+ get_local_cloud_connection)
40
43
  from flowfile_core.flowfile.util.calculate_layout import calculate_layered_layout
41
44
 
42
45
 
@@ -80,6 +83,16 @@ def get_xlsx_schema_callback(engine: str, file_path: str, sheet_name: str, start
80
83
  start_column=start_column, end_row=end_row, end_column=end_column, has_headers=has_headers)
81
84
 
82
85
 
86
+ def get_cloud_connection_settings(connection_name: str, user_id: int, auth_mode: AuthMethod) -> FullCloudStorageConnection:
87
+ cloud_connection_settings = get_local_cloud_connection(connection_name, user_id)
88
+ if cloud_connection_settings is None and auth_mode == "aws-cli":
89
+ # If the auth mode is aws-cli, we do not need connection settings
90
+ cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="aws-cli")
91
+ if cloud_connection_settings is None:
92
+ raise HTTPException(status_code=400, detail="Cloud connection settings not found")
93
+ return cloud_connection_settings
94
+
95
+
83
96
  class FlowGraph:
84
97
  """
85
98
  FlowGraph is a class that enables Extract, Transform and Load (ETL) operations
@@ -656,7 +669,7 @@ class FlowGraph:
656
669
  setting_input: Any = None,
657
670
  cache_results: bool = None,
658
671
  schema_callback: Callable = None,
659
- input_node_ids: List[int] = None):
672
+ input_node_ids: List[int] = None) -> FlowNode:
660
673
  existing_node = self.get_node(node_id)
661
674
  if existing_node is not None:
662
675
  if existing_node.node_type != node_type:
@@ -668,14 +681,13 @@ class FlowGraph:
668
681
  input_nodes = [self.get_node(node_id) for node_id in input_node_ids]
669
682
  else:
670
683
  input_nodes = None
671
- if cache_results is None:
672
- if hasattr(setting_input, 'cache_results'):
673
- cache_results = getattr(setting_input, 'cache_results')
674
- cache_results = False if cache_results is None else cache_results
675
684
  if isinstance(input_columns, str):
676
685
  input_columns = [input_columns]
677
-
678
- if input_nodes is not None or function.__name__ in ('placeholder', 'analysis_preparation'):
686
+ if (
687
+ input_nodes is not None or
688
+ function.__name__ in ('placeholder', 'analysis_preparation') or
689
+ node_type == "cloud_storage_reader"
690
+ ):
679
691
 
680
692
  if not existing_node:
681
693
  node = FlowNode(node_id=node_id,
@@ -703,6 +715,7 @@ class FlowGraph:
703
715
  raise Exception("No data initialized")
704
716
  self._node_db[node_id] = node
705
717
  self._node_ids.append(node_id)
718
+ return node
706
719
 
707
720
  def add_include_cols(self, include_columns: List[str]):
708
721
  for column in include_columns:
@@ -854,80 +867,107 @@ class FlowGraph:
854
867
  self._flow_starts.append(node)
855
868
  self._node_ids.append(node_database_reader.node_id)
856
869
 
857
- def add_airbyte_reader(self, external_source_input: input_schema.NodeAirbyteReader):
858
- logger.info('Adding airbyte reader')
859
- node_type = 'airbyte_reader'
860
- source_settings: input_schema.AirbyteReader = external_source_input.source_settings
861
- airbyte_settings = airbyte_settings_from_config(source_settings, flow_id=self.flow_id,
862
- node_id=external_source_input.node_id)
870
+ def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
871
+ logger.info('Adding sql source')
872
+ self.add_external_source(external_source_input)
863
873
 
864
- logger.info("Airbyte settings created")
865
- airbyte_settings.fields = source_settings.fields
866
- external_source = data_source_factory(source_type='airbyte', airbyte_settings=airbyte_settings)
874
+ def add_cloud_storage_writer(self, node_cloud_storage_writer: input_schema.NodeCloudStorageWriter) -> None:
867
875
 
868
- def _func():
869
- logger.info('Calling external source')
870
- external_fetcher = ExternalAirbyteFetcher(airbyte_settings, wait_on_completion=False)
871
- node._fetch_cached_df = external_fetcher
872
- fl = FlowDataEngine(external_fetcher.get_result())
873
- external_source_input.source_settings.fields = [c.get_minimal_field_info() for c in fl.schema]
874
- return fl
876
+ node_type = "cloud_storage_writer"
877
+
878
+ def _func(df: FlowDataEngine):
879
+ df.lazy = True
880
+ cloud_connection_settings = get_cloud_connection_settings(
881
+ connection_name=node_cloud_storage_writer.cloud_storage_settings.connection_name,
882
+ user_id=node_cloud_storage_writer.user_id,
883
+ auth_mode=node_cloud_storage_writer.cloud_storage_settings.auth_mode
884
+ )
885
+ full_cloud_storage_connection = FullCloudStorageConnection(
886
+ storage_type=cloud_connection_settings.storage_type,
887
+ auth_method=cloud_connection_settings.auth_method,
888
+ aws_allow_unsafe_html=cloud_connection_settings.aws_allow_unsafe_html,
889
+ **CloudStorageReader.get_storage_options(cloud_connection_settings)
890
+ )
891
+ settings = get_cloud_storage_write_settings_worker_interface(
892
+ write_settings=node_cloud_storage_writer.cloud_storage_settings,
893
+ connection=full_cloud_storage_connection,
894
+ lf=df.data_frame,
895
+ flowfile_node_id=node_cloud_storage_writer.node_id,
896
+ flowfile_flow_id=self.flow_id)
897
+ external_database_writer = ExternalCloudWriter(settings, wait_on_completion=False)
898
+ node._fetch_cached_df = external_database_writer
899
+ external_database_writer.get_result()
900
+ return df
875
901
 
876
902
  def schema_callback():
877
- return [FlowfileColumn.from_input(f.name, f.data_type) for f in external_source.schema]
903
+ logger.info("Starting to run the schema callback for cloud storage writer")
904
+ if self.get_node(node_cloud_storage_writer.node_id).is_correct:
905
+ return self.get_node(node_cloud_storage_writer.node_id).node_inputs.main_inputs[0].schema
906
+ else:
907
+ return [FlowfileColumn.from_input(column_name="__error__", data_type="String")]
878
908
 
879
- node = self.get_node(external_source_input.node_id)
880
- if node:
881
- node.node_type = node_type
882
- node.name = node_type
883
- node.function = _func
884
- node.setting_input = external_source_input
885
- node.node_settings.cache_results = external_source_input.cache_results
886
- if external_source_input.node_id not in set(start_node.node_id for start_node in self._flow_starts):
887
- self._flow_starts.append(node)
888
- node.schema_callback = schema_callback
889
- else:
890
- node = FlowNode(external_source_input.node_id, function=_func,
891
- setting_input=external_source_input,
892
- name=node_type, node_type=node_type, parent_uuid=self.uuid,
893
- schema_callback=schema_callback)
894
- self._node_db[external_source_input.node_id] = node
895
- self._flow_starts.append(node)
896
- self._node_ids.append(external_source_input.node_id)
897
- if external_source_input.source_settings.fields and len(external_source_input.source_settings.fields) > 0:
898
- logger.info('Using provided schema in the node')
909
+ self.add_node_step(
910
+ node_id=node_cloud_storage_writer.node_id,
911
+ function=_func,
912
+ input_columns=[],
913
+ node_type=node_type,
914
+ setting_input=node_cloud_storage_writer,
915
+ schema_callback=schema_callback,
916
+ input_node_ids=[node_cloud_storage_writer.depending_on_id]
917
+ )
899
918
 
919
+ node = self.get_node(node_cloud_storage_writer.node_id)
900
920
 
901
- def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
902
- logger.info('Adding sql source')
903
- self.add_external_source(external_source_input)
921
+ def add_cloud_storage_reader(self, node_cloud_storage_reader: input_schema.NodeCloudStorageReader) -> None:
922
+ """
923
+ Adds a cloud storage read node to the flow graph.
924
+ Args:
925
+ node_cloud_storage_reader (input_schema.NodeCloudStorageReader):
926
+ The settings for the cloud storage read node.
927
+ Returns:
928
+ """
929
+ node_type = "cloud_storage_reader"
930
+ logger.info("Adding cloud storage reader")
931
+ cloud_storage_read_settings = node_cloud_storage_reader.cloud_storage_settings
932
+
933
+ def _func():
934
+ logger.info("Starting to run the schema callback for cloud storage reader")
935
+ self.flow_logger.info("Starting to run the schema callback for cloud storage reader")
936
+ settings = CloudStorageReadSettingsInternal(read_settings=cloud_storage_read_settings,
937
+ connection=get_cloud_connection_settings(
938
+ connection_name=cloud_storage_read_settings.connection_name,
939
+ user_id=node_cloud_storage_reader.user_id,
940
+ auth_mode=cloud_storage_read_settings.auth_mode
941
+ ))
942
+ fl = FlowDataEngine.from_cloud_storage_obj(settings)
943
+ return fl
944
+
945
+ node = self.add_node_step(node_id=node_cloud_storage_reader.node_id,
946
+ function=_func,
947
+ cache_results=node_cloud_storage_reader.cache_results,
948
+ setting_input=node_cloud_storage_reader,
949
+ node_type=node_type,
950
+ )
951
+ if node_cloud_storage_reader.node_id not in set(start_node.node_id for start_node in self._flow_starts):
952
+ self._flow_starts.append(node)
904
953
 
905
954
  def add_external_source(self,
906
- external_source_input: input_schema.NodeExternalSource | input_schema.NodeAirbyteReader):
907
-
908
- custom_source_type = external_source_input.identifier != 'airbyte'
909
- if custom_source_type:
910
- node_type = 'external_source'
911
- external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
912
- source_settings = (getattr(input_schema, snake_case_to_camel_case(external_source_input.identifier)).
913
- model_validate(external_source_input.source_settings))
914
- if hasattr(external_source_script, 'initial_getter'):
915
- initial_getter = getattr(external_source_script, 'initial_getter')(source_settings)
916
- else:
917
- initial_getter = None
918
- data_getter = external_source_script.getter(source_settings)
919
- external_source = data_source_factory(source_type='custom',
920
- data_getter=data_getter,
921
- initial_data_getter=initial_getter,
922
- orientation=external_source_input.source_settings.orientation,
923
- schema=None)
955
+ external_source_input: input_schema.NodeExternalSource):
956
+
957
+ node_type = 'external_source'
958
+ external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
959
+ source_settings = (getattr(input_schema, snake_case_to_camel_case(external_source_input.identifier)).
960
+ model_validate(external_source_input.source_settings))
961
+ if hasattr(external_source_script, 'initial_getter'):
962
+ initial_getter = getattr(external_source_script, 'initial_getter')(source_settings)
924
963
  else:
925
- node_type = 'airbyte_reader'
926
- source_settings: input_schema.AirbyteReader = external_source_input.source_settings
927
- airbyte_settings = airbyte_settings_from_config(source_settings, flow_id=self.flow_id,
928
- node_id=external_source_input.node_id)
929
- airbyte_settings.fields = source_settings.fields
930
- external_source = data_source_factory(source_type='airbyte', airbyte_settings=airbyte_settings)
964
+ initial_getter = None
965
+ data_getter = external_source_script.getter(source_settings)
966
+ external_source = data_source_factory(source_type='custom',
967
+ data_getter=data_getter,
968
+ initial_data_getter=initial_getter,
969
+ orientation=external_source_input.source_settings.orientation,
970
+ schema=None)
931
971
 
932
972
  def _func():
933
973
  logger.info('Calling external source')
@@ -984,8 +1024,8 @@ class FlowGraph:
984
1024
  input_data = FlowDataEngine.create_from_path(input_file.received_file)
985
1025
  else:
986
1026
  input_data = FlowDataEngine.create_from_path_worker(input_file.received_file,
987
- node_id=input_file.node_id,
988
- flow_id=self.flow_id)
1027
+ node_id=input_file.node_id,
1028
+ flow_id=self.flow_id)
989
1029
  input_data.name = input_file.received_file.name
990
1030
  return input_data
991
1031
 
@@ -1039,7 +1079,6 @@ class FlowGraph:
1039
1079
 
1040
1080
  def add_datasource(self, input_file: input_schema.NodeDatasource | input_schema.NodeManualInput):
1041
1081
  if isinstance(input_file, input_schema.NodeManualInput):
1042
- _handle_raw_data(input_file)
1043
1082
  input_data = FlowDataEngine(input_file.raw_data_format)
1044
1083
  ref = 'manual_input'
1045
1084
  else:
@@ -1051,10 +1090,8 @@ class FlowGraph:
1051
1090
  node.name = ref
1052
1091
  node.function = input_data
1053
1092
  node.setting_input = input_file
1054
-
1055
1093
  if not input_file.node_id in set(start_node.node_id for start_node in self._flow_starts):
1056
1094
  self._flow_starts.append(node)
1057
-
1058
1095
  else:
1059
1096
  input_data.collect()
1060
1097
  node = FlowNode(input_file.node_id, function=input_data,