Flowfile 0.3.4.1__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +3 -3
- flowfile/api.py +36 -15
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
- flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
- flowfile/web/static/assets/api-fb67319c.js +80 -0
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
- {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/RECORD +109 -104
- {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +2 -0
- flowfile_core/configs/node_store/nodes.py +8 -6
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +402 -18
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +119 -82
- flowfile_core/flowfile/flow_node/flow_node.py +68 -33
- flowfile_core/flowfile/flow_node/models.py +32 -3
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/utils.py +1 -23
- flowfile_core/main.py +3 -2
- flowfile_core/routes/cloud_connections.py +81 -0
- flowfile_core/routes/logs.py +0 -1
- flowfile_core/routes/routes.py +3 -39
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +37 -15
- flowfile_core/schemas/schemas.py +7 -2
- flowfile_core/schemas/transform_schema.py +97 -22
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/flow_frame.py +253 -102
- flowfile_frame/flow_frame_methods.py +13 -13
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +291 -0
- test_utils/s3/fixtures.py +209 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
- {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
|
|
1
2
|
from dataclasses import dataclass
|
|
2
|
-
from typing import Optional, Any, List, Dict, Literal
|
|
3
|
+
from typing import Optional, Any, List, Dict, Literal, Iterable
|
|
4
|
+
|
|
3
5
|
from flowfile_core.schemas import input_schema
|
|
4
6
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
|
|
5
7
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
|
|
6
|
-
from polars import datatypes
|
|
7
8
|
import polars as pl
|
|
8
9
|
# TODO: rename flow_file_column to flowfile_column
|
|
9
10
|
DataTypeGroup = Literal['numeric', 'str', 'date']
|
|
@@ -175,3 +176,12 @@ def convert_stats_to_column_info(stats: List[Dict]) -> List[FlowfileColumn]:
|
|
|
175
176
|
def convert_pl_schema_to_raw_data_format(pl_schema: pl.Schema) -> List[input_schema.MinimalFieldInfo]:
    """
    Describe every column of a polars schema as minimal field info.

    Args:
        pl_schema: The polars schema whose (name, dtype) pairs are converted.

    Returns:
        One entry per schema column, in the iteration order of ``pl_schema.items()``;
        each entry is the result of ``FlowfileColumn.get_minimal_field_info()``.
    """
    minimal_fields = []
    for column_name, pl_datatype in pl_schema.items():
        # Wrap the raw (name, dtype) pair so FlowfileColumn can be built from it.
        polars_type = PlType(column_name=column_name, pl_datatype=pl_datatype)
        flowfile_column = FlowfileColumn.create_from_polars_type(polars_type)
        minimal_fields.append(flowfile_column.get_minimal_field_info())
    return minimal_fields
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def assert_if_flowfile_schema(obj: Iterable) -> bool:
    """
    Tell whether ``obj`` is a list, set or tuple that holds only FlowfileColumn objects.

    Despite the name, nothing is asserted: the outcome is reported as a boolean.

    Args:
        obj: The candidate schema container.

    Returns:
        True when ``obj`` is a list, set or tuple and every item is a FlowfileColumn
        (an empty container therefore qualifies); False for any other kind of object.
    """
    if not isinstance(obj, (list, set, tuple)):
        return False
    for item in obj:
        if not isinstance(item, FlowfileColumn):
            return False
    return True
|
|
@@ -32,7 +32,7 @@ def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
|
|
|
32
32
|
output_schema.append(FlowfileColumn.from_input(column.new_name, column_schema.data_type,
|
|
33
33
|
example_values=column_schema.example_values))
|
|
34
34
|
|
|
35
|
-
for i, fm in enumerate(fm_input.
|
|
35
|
+
for i, fm in enumerate(fm_input.join_mapping):
|
|
36
36
|
output_schema.append(FlowfileColumn.from_input(f'fuzzy_score_{i}', 'Float64'))
|
|
37
37
|
return output_schema
|
|
38
38
|
|
|
@@ -1 +1,2 @@
|
|
|
1
|
-
from flowfile_core.flowfile.flow_data_engine.join.verify_integrity import *
|
|
1
|
+
from flowfile_core.flowfile.flow_data_engine.join.verify_integrity import *
|
|
2
|
+
from flowfile_core.flowfile.flow_data_engine.join.utils import *
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Standard library imports
|
|
2
|
+
from typing import Dict, Tuple, TypeVar
|
|
3
|
+
|
|
4
|
+
# Third-party imports
|
|
5
|
+
import polars as pl
|
|
6
|
+
|
|
7
|
+
from flowfile_core.schemas import (
|
|
8
|
+
transform_schema as transform_schemas
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def rename_df_table_for_join(left_df: T, right_df: T, join_key_rename: transform_schemas.FullJoinKeyResponse) -> Tuple[T, T]:
    """
    Rename the join key columns of both join inputs.

    Args:
        left_df: Left frame of the join.
        right_df: Right frame of the join.
        join_key_rename: Object whose ``left.join_key_renames`` and
            ``right.join_key_renames`` hold pairs; element 0 of each pair is used as
            the current column name and element 1 as the new name.

    Returns:
        A ``(left, right)`` tuple with the results of calling ``rename`` on each frame.
    """
    left_mapping = {}
    for pair in join_key_rename.left.join_key_renames:
        left_mapping[pair[0]] = pair[1]
    renamed_left = left_df.rename(left_mapping)

    right_mapping = {}
    for pair in join_key_rename.right.join_key_renames:
        right_mapping[pair[0]] = pair[1]
    renamed_right = right_df.rename(right_mapping)

    return renamed_left, renamed_right
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_undo_rename_mapping_join(join_input: transform_schemas.JoinInput) -> Dict[str, str]:
    """
    Build the mapping that reverses the join key renames of ``join_input``.

    Args:
        join_input: Join settings; ``get_join_key_renames(True)`` is called on it.

    Returns:
        A dict keyed by element 1 of every rename pair with element 0 as value.
        Right-side pairs are processed before left-side pairs, so a left-side pair
        wins when both sides share the same key.
    """
    renames = join_input.get_join_key_renames(True)
    undo_mapping = {}
    for pair in renames.right.join_key_renames + renames.left.join_key_renames:
        undo_mapping[pair[1]] = pair[0]
    return undo_mapping
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_col_name_to_delete(col: transform_schemas.SelectInput, side: transform_schemas.SideLit):
    """
    Return the column name under which ``col`` has to be dropped.

    Args:
        col: The select input describing the column.
        side: The join side handed to ``construct_join_key_name`` for join keys.

    Returns:
        ``col.new_name`` for ordinary columns; for join keys the name produced by
        ``transform_schemas.construct_join_key_name(side, col.new_name)``.
    """
    if col.join_key:
        return transform_schemas.construct_join_key_name(side, col.new_name)
    return col.new_name
|
|
@@ -18,9 +18,9 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.models import
|
|
|
18
18
|
PolarsOperation,
|
|
19
19
|
Status
|
|
20
20
|
)
|
|
21
|
-
from flowfile_core.flowfile.sources.external_sources.airbyte_sources.models import AirbyteSettings
|
|
22
21
|
from flowfile_core.flowfile.sources.external_sources.sql_source.models import (DatabaseExternalReadSettings,
|
|
23
22
|
DatabaseExternalWriteSettings)
|
|
23
|
+
from flowfile_core.schemas.cloud_storage_schemas import CloudStorageWriteSettingsWorkerInterface
|
|
24
24
|
from flowfile_core.schemas.input_schema import (
|
|
25
25
|
ReceivedCsvTable,
|
|
26
26
|
ReceivedExcelTable,
|
|
@@ -81,13 +81,6 @@ def trigger_create_operation(flow_id: int, node_id: int | str, received_table: R
|
|
|
81
81
|
return Status(**f.json())
|
|
82
82
|
|
|
83
83
|
|
|
84
|
-
def trigger_airbyte_collector(airbyte_settings: AirbyteSettings):
    """
    Post the Airbyte settings to the worker's ``/store_airbyte_result`` endpoint.

    Args:
        airbyte_settings: Settings sent as the JSON produced by ``model_dump_json()``.

    Returns:
        A ``Status`` built from the JSON body of the worker response.

    Raises:
        Exception: When the worker response is not ``ok``; the response text is
            included in the message.
    """
    response = requests.post(url=f'{WORKER_URL}/store_airbyte_result', data=airbyte_settings.model_dump_json())
    if response.ok:
        return Status(**response.json())
    raise Exception(f'Could not cache the data, {response.text}')
|
|
89
|
-
|
|
90
|
-
|
|
91
84
|
def trigger_database_read_collector(database_external_read_settings: DatabaseExternalReadSettings):
|
|
92
85
|
f = requests.post(url=f'{WORKER_URL}/store_database_read_result',
|
|
93
86
|
data=database_external_read_settings.model_dump_json())
|
|
@@ -104,6 +97,14 @@ def trigger_database_write(database_external_write_settings: DatabaseExternalWri
|
|
|
104
97
|
return Status(**f.json())
|
|
105
98
|
|
|
106
99
|
|
|
100
|
+
def trigger_cloud_storage_write(database_external_write_settings: CloudStorageWriteSettingsWorkerInterface):
    """
    Post cloud storage write settings to the worker's ``/write_data_to_cloud`` endpoint.

    Args:
        database_external_write_settings: The cloud storage write settings, sent as
            the JSON produced by ``model_dump_json()``. The parameter keeps its
            database-style name because callers pass it by keyword.

    Returns:
        A ``Status`` built from the JSON body of the worker response.

    Raises:
        Exception: When the worker response is not ``ok``; the response text is
            included in the message.
    """
    payload = database_external_write_settings.model_dump_json()
    response = requests.post(url=f'{WORKER_URL}/write_data_to_cloud', data=payload)
    if response.ok:
        return Status(**response.json())
    raise Exception(f'Could not cache the data, {response.text}')
|
|
106
|
+
|
|
107
|
+
|
|
107
108
|
def get_results(file_ref: str) -> Status | None:
|
|
108
109
|
f = requests.get(f'{WORKER_URL}/status/{file_ref}')
|
|
109
110
|
if f.status_code == 200:
|
|
@@ -113,11 +114,15 @@ def get_results(file_ref: str) -> Status | None:
|
|
|
113
114
|
|
|
114
115
|
|
|
115
116
|
def results_exists(file_ref: str):
    """
    Check with the worker whether the task behind ``file_ref`` has completed.

    Args:
        file_ref: Reference that is appended to the worker's ``/status/`` URL.

    Returns:
        True only when the worker answers with HTTP 200 and the ``status`` field of
        the JSON body equals ``'Completed'``. False otherwise, including when the
        request raises ``requests.RequestException`` (which is logged as an error).
    """
    try:
        response = requests.get(f'{WORKER_URL}/status/{file_ref}')
        # The JSON body is only inspected for a 200 answer (short-circuit).
        return response.status_code == 200 and response.json()['status'] == 'Completed'
    except requests.RequestException as e:
        logger.error(f"Failed to check results existence: {str(e)}")
        return False
|
|
121
126
|
|
|
122
127
|
|
|
123
128
|
def get_df_result(encoded_df: str) -> pl.LazyFrame:
|
|
@@ -336,15 +341,6 @@ class ExternalCreateFetcher(BaseFetcher):
|
|
|
336
341
|
_ = self.get_result()
|
|
337
342
|
|
|
338
343
|
|
|
339
|
-
class ExternalAirbyteFetcher(BaseFetcher):
    """Fetcher that starts an Airbyte collection through ``trigger_airbyte_collector``."""

    def __init__(self, airbyte_settings: AirbyteSettings, wait_on_completion: bool = True):
        """
        Trigger the collection and bind this fetcher to the returned background task.

        Args:
            airbyte_settings: Settings forwarded to ``trigger_airbyte_collector``.
            wait_on_completion: When true, ``get_result()`` is called before returning.
        """
        task_status = trigger_airbyte_collector(airbyte_settings)
        super().__init__(file_ref=task_status.background_task_id)
        # The task counts as running while the reported status is 'Processing'.
        self.running = task_status.status == 'Processing'
        if wait_on_completion:
            self.get_result()
|
|
346
|
-
|
|
347
|
-
|
|
348
344
|
class ExternalDatabaseFetcher(BaseFetcher):
|
|
349
345
|
def __init__(self, database_external_read_settings: DatabaseExternalReadSettings,
|
|
350
346
|
wait_on_completion: bool = True):
|
|
@@ -365,6 +361,17 @@ class ExternalDatabaseWriter(BaseFetcher):
|
|
|
365
361
|
_ = self.get_result()
|
|
366
362
|
|
|
367
363
|
|
|
364
|
+
class ExternalCloudWriter(BaseFetcher):
    """Fetcher that starts a cloud storage write through ``trigger_cloud_storage_write``."""

    def __init__(self, cloud_storage_write_settings: CloudStorageWriteSettingsWorkerInterface,
                 wait_on_completion: bool = True):
        """
        Trigger the write and bind this fetcher to the returned background task.

        Args:
            cloud_storage_write_settings: Settings forwarded to
                ``trigger_cloud_storage_write``.
            wait_on_completion: When true, ``get_result()`` is called before returning.
        """
        task_status = trigger_cloud_storage_write(database_external_write_settings=cloud_storage_write_settings)
        super().__init__(file_ref=task_status.background_task_id)
        # The task counts as running while the reported status is 'Processing'.
        self.running = task_status.status == 'Processing'
        if wait_on_completion:
            self.get_result()
|
|
373
|
+
|
|
374
|
+
|
|
368
375
|
class ExternalExecutorTracker:
|
|
369
376
|
result: Optional[pl.LazyFrame]
|
|
370
377
|
started: bool = False
|
|
@@ -3,30 +3,13 @@ from flowfile_core.configs.settings import AVAILABLE_RAM, WORKER_URL
|
|
|
3
3
|
from flowfile_core.configs import logger
|
|
4
4
|
from flowfile_core.flowfile.flow_data_engine.subprocess_operations import ExternalDfFetcher
|
|
5
5
|
from flowfile_core.flowfile.flow_data_engine.subprocess_operations import Status
|
|
6
|
+
from flowfile_core.utils.utils import standardize_col_dtype
|
|
6
7
|
import os
|
|
7
8
|
from typing import List, Dict, Iterable, Callable, Any
|
|
8
|
-
from itertools import chain
|
|
9
9
|
import requests
|
|
10
10
|
from base64 import encodebytes
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
def convert_to_string(v):
    """
    Convert a value to its string form without raising on conversion errors.

    Args:
        v: Any value.

    Returns:
        ``str(v)``, or ``None`` when the conversion raises an ordinary exception.
    """
    try:
        return str(v)
    except Exception:
        # Only ordinary errors are treated as "not convertible"; KeyboardInterrupt
        # and SystemExit must keep propagating instead of being turned into None.
        return None
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def standardize_col_dtype(vals):
    """
    Make the values of one column type-compatible.

    Args:
        vals: The column values. A one-shot iterator (e.g. a generator) is
            materialised into a list first, because the type scan below would
            otherwise exhaust it and leave nothing to return.

    Returns:
        ``vals`` unchanged (the materialised list for iterators) when all values
        share a single type or when both ``int`` and ``float`` occur among the
        types; otherwise a new list with every value passed through
        ``convert_to_string``.
    """
    if iter(vals) is vals:
        # Iterators return themselves from iter(); re-iterable containers do not.
        vals = list(vals)
    types = {type(val) for val in vals}
    if len(types) == 1 or (int in types and float in types):
        return vals
    return [convert_to_string(v) for v in vals]
|
|
28
|
-
|
|
29
|
-
|
|
30
13
|
def get_data_type(vals: Iterable[Any]):
|
|
31
14
|
types = set(type(val) for val in vals)
|
|
32
15
|
if len(types) == 1:
|
|
@@ -37,28 +20,6 @@ def get_data_type(vals: Iterable[Any]):
|
|
|
37
20
|
return 'str'
|
|
38
21
|
|
|
39
22
|
|
|
40
|
-
def ensure_similarity_dicts(datas: List[Dict], respect_order: bool = True):
    """
    Give every record the same set of keys, filling missing ones with ``None``.

    Args:
        datas: The records to align.
        respect_order: When true, keys appear in first-seen order across the
            records; when false, the keys are gathered in a set, so their order is
            whatever the set yields.

    Returns:
        A new list with one new dict per input record containing every key found
        in any record; values come from ``record.get(key)``.
    """
    every_key = chain.from_iterable(data.keys() for data in datas)
    if respect_order:
        # dict.fromkeys keeps the first occurrence of each key in insertion order.
        unique_cols = list(dict.fromkeys(every_key))
    else:
        unique_cols = set(every_key)
    return [{col: data.get(col) for col in unique_cols} for data in datas]
|
|
60
|
-
|
|
61
|
-
|
|
62
23
|
def calculate_schema(lf: pl.LazyFrame) -> List[Dict]:
|
|
63
24
|
r = ExternalDfFetcher(lf=lf, operation_type='calculate_schema', wait_on_completion=False, flow_id=-1, node_id=-1)
|
|
64
25
|
schema_stats: List[Dict] = r.get_result()
|
|
@@ -13,10 +13,10 @@ from pyarrow.parquet import ParquetFile
|
|
|
13
13
|
from flowfile_core.configs import logger
|
|
14
14
|
from flowfile_core.configs.flow_logger import FlowLogger
|
|
15
15
|
from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
|
|
16
|
-
from flowfile_core.flowfile.sources.external_sources.airbyte_sources.settings import airbyte_settings_from_config
|
|
17
16
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
|
|
18
17
|
from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.settings_validator import (calculate_fuzzy_match_schema,
|
|
19
18
|
pre_calculate_pivot_schema)
|
|
19
|
+
from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import CloudStorageReader
|
|
20
20
|
from flowfile_core.utils.arrow_reader import get_read_top_n
|
|
21
21
|
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine, execute_polars_code
|
|
22
22
|
from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_xlsx_datatypes, \
|
|
@@ -24,19 +24,22 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_x
|
|
|
24
24
|
from flowfile_core.flowfile.sources import external_sources
|
|
25
25
|
from flowfile_core.schemas import input_schema, schemas, transform_schema
|
|
26
26
|
from flowfile_core.schemas.output_model import TableExample, NodeData, NodeResult, RunInformation
|
|
27
|
-
from flowfile_core.
|
|
27
|
+
from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal, FullCloudStorageConnection,
|
|
28
|
+
get_cloud_storage_write_settings_worker_interface, AuthMethod)
|
|
29
|
+
from flowfile_core.flowfile.utils import snake_case_to_camel_case
|
|
28
30
|
from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
|
|
29
31
|
from flowfile_core.flowfile.flow_node.flow_node import FlowNode
|
|
30
32
|
from flowfile_core.flowfile.util.execution_orderer import determine_execution_order
|
|
31
33
|
from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
|
|
32
|
-
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (
|
|
33
|
-
ExternalDatabaseFetcher,
|
|
34
|
+
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalDatabaseFetcher,
|
|
34
35
|
ExternalDatabaseWriter,
|
|
35
|
-
ExternalDfFetcher
|
|
36
|
+
ExternalDfFetcher,
|
|
37
|
+
ExternalCloudWriter)
|
|
36
38
|
from flowfile_core.secret_manager.secret_manager import get_encrypted_secret, decrypt_secret
|
|
37
39
|
from flowfile_core.flowfile.sources.external_sources.sql_source import utils as sql_utils, models as sql_models
|
|
38
40
|
from flowfile_core.flowfile.sources.external_sources.sql_source.sql_source import SqlSource, BaseSqlSource
|
|
39
|
-
from flowfile_core.flowfile.database_connection_manager.db_connections import get_local_database_connection
|
|
41
|
+
from flowfile_core.flowfile.database_connection_manager.db_connections import (get_local_database_connection,
|
|
42
|
+
get_local_cloud_connection)
|
|
40
43
|
from flowfile_core.flowfile.util.calculate_layout import calculate_layered_layout
|
|
41
44
|
|
|
42
45
|
|
|
@@ -80,6 +83,16 @@ def get_xlsx_schema_callback(engine: str, file_path: str, sheet_name: str, start
|
|
|
80
83
|
start_column=start_column, end_row=end_row, end_column=end_column, has_headers=has_headers)
|
|
81
84
|
|
|
82
85
|
|
|
86
|
+
def get_cloud_connection_settings(connection_name: str, user_id: int, auth_mode: AuthMethod) -> FullCloudStorageConnection:
    """
    Resolve the cloud storage connection to use for a node.

    Args:
        connection_name: Name passed to ``get_local_cloud_connection``.
        user_id: User id passed to ``get_local_cloud_connection``.
        auth_mode: Authentication mode of the node; ``"aws-cli"`` needs no stored
            connection.

    Returns:
        The stored connection when one is found; otherwise, for ``"aws-cli"``, a
        fresh ``FullCloudStorageConnection`` with storage type ``"s3"``.

    Raises:
        HTTPException: With status 400 when no stored connection exists and the
            auth mode is not ``"aws-cli"``.
    """
    # NOTE(review): HTTPException is not among the imports visible in this hunk;
    # confirm it is imported elsewhere in the module.
    stored_connection = get_local_cloud_connection(connection_name, user_id)
    if stored_connection is not None:
        return stored_connection
    if auth_mode == "aws-cli":
        # If the auth mode is aws-cli, we do not need connection settings
        return FullCloudStorageConnection(storage_type="s3", auth_method="aws-cli")
    raise HTTPException(status_code=400, detail="Cloud connection settings not found")
|
|
94
|
+
|
|
95
|
+
|
|
83
96
|
class FlowGraph:
|
|
84
97
|
"""
|
|
85
98
|
FlowGraph is a class that enables Extract, Transform and Load (ETL) operations
|
|
@@ -656,7 +669,7 @@ class FlowGraph:
|
|
|
656
669
|
setting_input: Any = None,
|
|
657
670
|
cache_results: bool = None,
|
|
658
671
|
schema_callback: Callable = None,
|
|
659
|
-
input_node_ids: List[int] = None):
|
|
672
|
+
input_node_ids: List[int] = None) -> FlowNode:
|
|
660
673
|
existing_node = self.get_node(node_id)
|
|
661
674
|
if existing_node is not None:
|
|
662
675
|
if existing_node.node_type != node_type:
|
|
@@ -668,14 +681,13 @@ class FlowGraph:
|
|
|
668
681
|
input_nodes = [self.get_node(node_id) for node_id in input_node_ids]
|
|
669
682
|
else:
|
|
670
683
|
input_nodes = None
|
|
671
|
-
if cache_results is None:
|
|
672
|
-
if hasattr(setting_input, 'cache_results'):
|
|
673
|
-
cache_results = getattr(setting_input, 'cache_results')
|
|
674
|
-
cache_results = False if cache_results is None else cache_results
|
|
675
684
|
if isinstance(input_columns, str):
|
|
676
685
|
input_columns = [input_columns]
|
|
677
|
-
|
|
678
|
-
|
|
686
|
+
if (
|
|
687
|
+
input_nodes is not None or
|
|
688
|
+
function.__name__ in ('placeholder', 'analysis_preparation') or
|
|
689
|
+
node_type == "cloud_storage_reader"
|
|
690
|
+
):
|
|
679
691
|
|
|
680
692
|
if not existing_node:
|
|
681
693
|
node = FlowNode(node_id=node_id,
|
|
@@ -703,6 +715,7 @@ class FlowGraph:
|
|
|
703
715
|
raise Exception("No data initialized")
|
|
704
716
|
self._node_db[node_id] = node
|
|
705
717
|
self._node_ids.append(node_id)
|
|
718
|
+
return node
|
|
706
719
|
|
|
707
720
|
def add_include_cols(self, include_columns: List[str]):
|
|
708
721
|
for column in include_columns:
|
|
@@ -854,80 +867,107 @@ class FlowGraph:
|
|
|
854
867
|
self._flow_starts.append(node)
|
|
855
868
|
self._node_ids.append(node_database_reader.node_id)
|
|
856
869
|
|
|
857
|
-
def
|
|
858
|
-
logger.info('Adding
|
|
859
|
-
|
|
860
|
-
source_settings: input_schema.AirbyteReader = external_source_input.source_settings
|
|
861
|
-
airbyte_settings = airbyte_settings_from_config(source_settings, flow_id=self.flow_id,
|
|
862
|
-
node_id=external_source_input.node_id)
|
|
870
|
+
def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
    """
    Add a sql source node by delegating to ``add_external_source``.

    Args:
        external_source_input: The external source node settings, forwarded as is.
    """
    logger.info('Adding sql source')
    self.add_external_source(external_source_input=external_source_input)
|
|
863
873
|
|
|
864
|
-
|
|
865
|
-
airbyte_settings.fields = source_settings.fields
|
|
866
|
-
external_source = data_source_factory(source_type='airbyte', airbyte_settings=airbyte_settings)
|
|
874
|
+
def add_cloud_storage_writer(self, node_cloud_storage_writer: input_schema.NodeCloudStorageWriter) -> None:
    """
    Adds a cloud storage write node to the flow graph.

    The node's function resolves the connection, builds the worker write settings
    and hands the write to an ``ExternalCloudWriter``; it returns its input unchanged.

    Args:
        node_cloud_storage_writer (input_schema.NodeCloudStorageWriter):
            The settings for the cloud storage write node.
    """
    node_type = "cloud_storage_writer"

    def _func(df: FlowDataEngine):
        df.lazy = True
        write_settings = node_cloud_storage_writer.cloud_storage_settings
        stored_connection = get_cloud_connection_settings(
            connection_name=write_settings.connection_name,
            user_id=node_cloud_storage_writer.user_id,
            auth_mode=write_settings.auth_mode,
        )
        # Rebuild the connection with the storage options merged in as fields.
        worker_connection = FullCloudStorageConnection(
            storage_type=stored_connection.storage_type,
            auth_method=stored_connection.auth_method,
            aws_allow_unsafe_html=stored_connection.aws_allow_unsafe_html,
            **CloudStorageReader.get_storage_options(stored_connection)
        )
        worker_settings = get_cloud_storage_write_settings_worker_interface(
            write_settings=write_settings,
            connection=worker_connection,
            lf=df.data_frame,
            flowfile_node_id=node_cloud_storage_writer.node_id,
            flowfile_flow_id=self.flow_id)
        cloud_writer = ExternalCloudWriter(worker_settings, wait_on_completion=False)
        # `node` is bound at the end of the enclosing method, before this runs.
        node._fetch_cached_df = cloud_writer
        cloud_writer.get_result()
        return df

    def schema_callback():
        logger.info("Starting to run the schema callback for cloud storage writer")
        writer_node = self.get_node(node_cloud_storage_writer.node_id)
        if not writer_node.is_correct:
            return [FlowfileColumn.from_input(column_name="__error__", data_type="String")]
        # A writer passes data through, so it exposes the schema of its first main input.
        return writer_node.node_inputs.main_inputs[0].schema

    self.add_node_step(
        node_id=node_cloud_storage_writer.node_id,
        function=_func,
        input_columns=[],
        node_type=node_type,
        setting_input=node_cloud_storage_writer,
        schema_callback=schema_callback,
        input_node_ids=[node_cloud_storage_writer.depending_on_id]
    )

    node = self.get_node(node_cloud_storage_writer.node_id)
|
|
900
920
|
|
|
901
|
-
def
|
|
902
|
-
|
|
903
|
-
|
|
921
|
+
def add_cloud_storage_reader(self, node_cloud_storage_reader: input_schema.NodeCloudStorageReader) -> None:
    """
    Adds a cloud storage read node to the flow graph.

    The node is also appended to the flow starts unless a start node with the same
    node id is already registered.

    Args:
        node_cloud_storage_reader (input_schema.NodeCloudStorageReader):
            The settings for the cloud storage read node.
    """
    node_type = "cloud_storage_reader"
    logger.info("Adding cloud storage reader")
    read_settings = node_cloud_storage_reader.cloud_storage_settings

    def _func():
        # NOTE(review): both messages mention the schema callback although this is
        # the node's read function; confirm whether the wording is intended.
        logger.info("Starting to run the schema callback for cloud storage reader")
        self.flow_logger.info("Starting to run the schema callback for cloud storage reader")
        connection = get_cloud_connection_settings(
            connection_name=read_settings.connection_name,
            user_id=node_cloud_storage_reader.user_id,
            auth_mode=read_settings.auth_mode
        )
        internal_settings = CloudStorageReadSettingsInternal(read_settings=read_settings,
                                                             connection=connection)
        return FlowDataEngine.from_cloud_storage_obj(internal_settings)

    node = self.add_node_step(node_id=node_cloud_storage_reader.node_id,
                              function=_func,
                              cache_results=node_cloud_storage_reader.cache_results,
                              setting_input=node_cloud_storage_reader,
                              node_type=node_type,
                              )
    registered_start_ids = {start_node.node_id for start_node in self._flow_starts}
    if node_cloud_storage_reader.node_id not in registered_start_ids:
        self._flow_starts.append(node)
|
|
904
953
|
|
|
905
954
|
def add_external_source(self,
|
|
906
|
-
external_source_input: input_schema.NodeExternalSource
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
if hasattr(external_source_script, 'initial_getter'):
|
|
915
|
-
initial_getter = getattr(external_source_script, 'initial_getter')(source_settings)
|
|
916
|
-
else:
|
|
917
|
-
initial_getter = None
|
|
918
|
-
data_getter = external_source_script.getter(source_settings)
|
|
919
|
-
external_source = data_source_factory(source_type='custom',
|
|
920
|
-
data_getter=data_getter,
|
|
921
|
-
initial_data_getter=initial_getter,
|
|
922
|
-
orientation=external_source_input.source_settings.orientation,
|
|
923
|
-
schema=None)
|
|
955
|
+
external_source_input: input_schema.NodeExternalSource):
|
|
956
|
+
|
|
957
|
+
node_type = 'external_source'
|
|
958
|
+
external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
|
|
959
|
+
source_settings = (getattr(input_schema, snake_case_to_camel_case(external_source_input.identifier)).
|
|
960
|
+
model_validate(external_source_input.source_settings))
|
|
961
|
+
if hasattr(external_source_script, 'initial_getter'):
|
|
962
|
+
initial_getter = getattr(external_source_script, 'initial_getter')(source_settings)
|
|
924
963
|
else:
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
964
|
+
initial_getter = None
|
|
965
|
+
data_getter = external_source_script.getter(source_settings)
|
|
966
|
+
external_source = data_source_factory(source_type='custom',
|
|
967
|
+
data_getter=data_getter,
|
|
968
|
+
initial_data_getter=initial_getter,
|
|
969
|
+
orientation=external_source_input.source_settings.orientation,
|
|
970
|
+
schema=None)
|
|
931
971
|
|
|
932
972
|
def _func():
|
|
933
973
|
logger.info('Calling external source')
|
|
@@ -984,8 +1024,8 @@ class FlowGraph:
|
|
|
984
1024
|
input_data = FlowDataEngine.create_from_path(input_file.received_file)
|
|
985
1025
|
else:
|
|
986
1026
|
input_data = FlowDataEngine.create_from_path_worker(input_file.received_file,
|
|
987
|
-
|
|
988
|
-
|
|
1027
|
+
node_id=input_file.node_id,
|
|
1028
|
+
flow_id=self.flow_id)
|
|
989
1029
|
input_data.name = input_file.received_file.name
|
|
990
1030
|
return input_data
|
|
991
1031
|
|
|
@@ -1039,7 +1079,6 @@ class FlowGraph:
|
|
|
1039
1079
|
|
|
1040
1080
|
def add_datasource(self, input_file: input_schema.NodeDatasource | input_schema.NodeManualInput):
|
|
1041
1081
|
if isinstance(input_file, input_schema.NodeManualInput):
|
|
1042
|
-
_handle_raw_data(input_file)
|
|
1043
1082
|
input_data = FlowDataEngine(input_file.raw_data_format)
|
|
1044
1083
|
ref = 'manual_input'
|
|
1045
1084
|
else:
|
|
@@ -1051,10 +1090,8 @@ class FlowGraph:
|
|
|
1051
1090
|
node.name = ref
|
|
1052
1091
|
node.function = input_data
|
|
1053
1092
|
node.setting_input = input_file
|
|
1054
|
-
|
|
1055
1093
|
if not input_file.node_id in set(start_node.node_id for start_node in self._flow_starts):
|
|
1056
1094
|
self._flow_starts.append(node)
|
|
1057
|
-
|
|
1058
1095
|
else:
|
|
1059
1096
|
input_data.collect()
|
|
1060
1097
|
node = FlowNode(input_file.node_id, function=input_data,
|