Flowfile 0.3.9__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +8 -1
- flowfile/api.py +1 -3
- flowfile/web/static/assets/{CloudConnectionManager-c97c25f8.js → CloudConnectionManager-0dfba9f2.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-f1ff509e.js → CloudStorageReader-d5b1b6c9.js} +11 -78
- flowfile/web/static/assets/{CloudStorageWriter-034f8b78.js → CloudStorageWriter-00d87aad.js} +12 -79
- flowfile/web/static/assets/{CloudStorageWriter-49c9a4b2.css → CloudStorageWriter-b0ee067f.css} +24 -24
- flowfile/web/static/assets/ColumnSelector-4685e75d.js +83 -0
- flowfile/web/static/assets/ColumnSelector-47996a16.css +10 -0
- flowfile/web/static/assets/ContextMenu-23e909da.js +41 -0
- flowfile/web/static/assets/{SettingsSection-9c836ecc.css → ContextMenu-4c74eef1.css} +0 -21
- flowfile/web/static/assets/ContextMenu-63cfa99b.css +26 -0
- flowfile/web/static/assets/ContextMenu-70ae0c79.js +41 -0
- flowfile/web/static/assets/ContextMenu-c13f91d0.css +26 -0
- flowfile/web/static/assets/ContextMenu-f149cf7c.js +41 -0
- flowfile/web/static/assets/{CrossJoin-41efa4cb.css → CrossJoin-1119d18e.css} +18 -18
- flowfile/web/static/assets/{CrossJoin-9e156ebe.js → CrossJoin-702a3edd.js} +14 -84
- flowfile/web/static/assets/CustomNode-74a37f74.css +32 -0
- flowfile/web/static/assets/CustomNode-b1519993.js +211 -0
- flowfile/web/static/assets/{DatabaseConnectionSettings-d5c625b3.js → DatabaseConnectionSettings-6f3e4ea5.js} +3 -3
- flowfile/web/static/assets/{DatabaseManager-265adc5e.js → DatabaseManager-cf5ef661.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-f50c6558.css → DatabaseReader-ae61773c.css} +0 -27
- flowfile/web/static/assets/{DatabaseReader-0b10551e.js → DatabaseReader-d38c7295.js} +14 -114
- flowfile/web/static/assets/{DatabaseWriter-c17c6916.js → DatabaseWriter-b04ef46a.js} +13 -74
- flowfile/web/static/assets/{ExploreData-5bdae813.css → ExploreData-2d0cf4db.css} +8 -14
- flowfile/web/static/assets/ExploreData-5fa10ed8.js +192 -0
- flowfile/web/static/assets/{ExternalSource-3a66556c.js → ExternalSource-d39af878.js} +8 -79
- flowfile/web/static/assets/{Filter-91ad87e7.js → Filter-9b6d08db.js} +12 -85
- flowfile/web/static/assets/{Filter-a9d08ba1.css → Filter-f62091b3.css} +3 -3
- flowfile/web/static/assets/{Formula-3c395ab1.js → Formula-6b04fb1d.js} +20 -87
- flowfile/web/static/assets/{Formula-29f19d21.css → Formula-bb96803d.css} +4 -4
- flowfile/web/static/assets/{FuzzyMatch-6857de82.css → FuzzyMatch-1010f966.css} +42 -42
- flowfile/web/static/assets/{FuzzyMatch-2df0d230.js → FuzzyMatch-999521f4.js} +16 -87
- flowfile/web/static/assets/{GraphSolver-d285877f.js → GraphSolver-17dd2198.js} +13 -159
- flowfile/web/static/assets/GraphSolver-f0cb7bfb.css +22 -0
- flowfile/web/static/assets/{GroupBy-0bd1cc6b.js → GroupBy-6b039e18.js} +12 -75
- flowfile/web/static/assets/{Unique-b5615727.css → GroupBy-b9505323.css} +8 -8
- flowfile/web/static/assets/{Join-5a78a203.js → Join-24d0f113.js} +15 -85
- flowfile/web/static/assets/{Join-f45eff22.css → Join-fd79b451.css} +20 -20
- flowfile/web/static/assets/{ManualInput-a71b52c6.css → ManualInput-3246a08d.css} +20 -20
- flowfile/web/static/assets/{ManualInput-93aef9d6.js → ManualInput-34639209.js} +11 -82
- flowfile/web/static/assets/MultiSelect-0e8724a3.js +5 -0
- flowfile/web/static/assets/MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js +63 -0
- flowfile/web/static/assets/NumericInput-3d63a470.js +5 -0
- flowfile/web/static/assets/NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js +35 -0
- flowfile/web/static/assets/Output-283fe388.css +37 -0
- flowfile/web/static/assets/{Output-411ecaee.js → Output-edea9802.js} +62 -273
- flowfile/web/static/assets/{Pivot-89db4b04.js → Pivot-61d19301.js} +14 -138
- flowfile/web/static/assets/Pivot-cf333e3d.css +22 -0
- flowfile/web/static/assets/PivotValidation-891ddfb0.css +13 -0
- flowfile/web/static/assets/PivotValidation-c46cd420.css +13 -0
- flowfile/web/static/assets/PivotValidation-de9f43fe.js +61 -0
- flowfile/web/static/assets/PivotValidation-f97fec5b.js +61 -0
- flowfile/web/static/assets/{PolarsCode-a9f974f8.js → PolarsCode-bc3c9984.js} +13 -80
- flowfile/web/static/assets/Read-64a3f259.js +218 -0
- flowfile/web/static/assets/Read-e808b239.css +62 -0
- flowfile/web/static/assets/RecordCount-3d5039be.js +53 -0
- flowfile/web/static/assets/{RecordId-55ae7d36.js → RecordId-597510e0.js} +8 -80
- flowfile/web/static/assets/SQLQueryComponent-36cef432.css +27 -0
- flowfile/web/static/assets/SQLQueryComponent-df51adbe.js +38 -0
- flowfile/web/static/assets/{Sample-b4a18476.js → Sample-4be0a507.js} +8 -77
- flowfile/web/static/assets/{SecretManager-b066d13a.js → SecretManager-4839be57.js} +2 -2
- flowfile/web/static/assets/{Select-727688dc.js → Select-9b72f201.js} +11 -85
- flowfile/web/static/assets/SettingsSection-2e4d03c4.css +21 -0
- flowfile/web/static/assets/SettingsSection-5c696bee.css +20 -0
- flowfile/web/static/assets/SettingsSection-71e6b7e3.css +21 -0
- flowfile/web/static/assets/SettingsSection-7ded385d.js +45 -0
- flowfile/web/static/assets/{SettingsSection-695ac487.js → SettingsSection-e1e9c953.js} +2 -40
- flowfile/web/static/assets/SettingsSection-f0f75a42.js +53 -0
- flowfile/web/static/assets/SingleSelect-6c777aac.js +5 -0
- flowfile/web/static/assets/SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js +62 -0
- flowfile/web/static/assets/SliderInput-7cb93e62.js +40 -0
- flowfile/web/static/assets/SliderInput-b8fb6a8c.css +4 -0
- flowfile/web/static/assets/{GroupBy-ab1ea74b.css → Sort-3643d625.css} +8 -8
- flowfile/web/static/assets/{Sort-be3339a8.js → Sort-6cbde21a.js} +12 -97
- flowfile/web/static/assets/TextInput-d9a40c11.js +5 -0
- flowfile/web/static/assets/TextInput.vue_vue_type_script_setup_true_lang-5896c375.js +32 -0
- flowfile/web/static/assets/{TextToRows-c92d1ec2.css → TextToRows-5d2c1190.css} +9 -9
- flowfile/web/static/assets/{TextToRows-7b8998da.js → TextToRows-c4fcbf4d.js} +14 -83
- flowfile/web/static/assets/ToggleSwitch-4ef91d19.js +5 -0
- flowfile/web/static/assets/ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js +31 -0
- flowfile/web/static/assets/{UnavailableFields-8b0cb48e.js → UnavailableFields-a03f512c.js} +2 -2
- flowfile/web/static/assets/{Union-8d9ac7f9.css → Union-af6c3d9b.css} +6 -6
- flowfile/web/static/assets/Union-bfe9b996.js +77 -0
- flowfile/web/static/assets/{Unique-af5a80b4.js → Unique-5d023a27.js} +23 -104
- flowfile/web/static/assets/{Sort-7ccfa0fe.css → Unique-f9fb0809.css} +8 -8
- flowfile/web/static/assets/Unpivot-1e422df3.css +30 -0
- flowfile/web/static/assets/{Unpivot-5195d411.js → Unpivot-91cc5354.js} +12 -166
- flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +13 -0
- flowfile/web/static/assets/UnpivotValidation-7ee2de44.js +51 -0
- flowfile/web/static/assets/{ExploreData-18a4fe52.js → VueGraphicWalker-e51b9924.js} +4 -264
- flowfile/web/static/assets/VueGraphicWalker-ed5ab88b.css +6 -0
- flowfile/web/static/assets/{api-cb00cce6.js → api-c1bad5ca.js} +1 -1
- flowfile/web/static/assets/{api-023d1733.js → api-cf1221f0.js} +1 -1
- flowfile/web/static/assets/{designer-2197d782.css → designer-8da3ba3a.css} +859 -201
- flowfile/web/static/assets/{designer-6c322d8e.js → designer-9633482a.js} +2297 -733
- flowfile/web/static/assets/{documentation-4d1fafe1.js → documentation-ca400224.js} +1 -1
- flowfile/web/static/assets/{dropDown-0b46dd77.js → dropDown-614b998d.js} +1 -1
- flowfile/web/static/assets/{fullEditor-ec4e4f95.js → fullEditor-f7971590.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-def5879b.js → genericNodeSettings-4fe5f36b.js} +3 -3
- flowfile/web/static/assets/{index-681a3ed0.css → index-50508d4d.css} +8 -0
- flowfile/web/static/assets/{index-683fc198.js → index-5429bbf8.js} +208 -31
- flowfile/web/static/assets/nodeInput-5d0d6b79.js +41 -0
- flowfile/web/static/assets/outputCsv-076b85ab.js +86 -0
- flowfile/web/static/assets/{Output-48f81019.css → outputCsv-9cc59e0b.css} +0 -143
- flowfile/web/static/assets/outputExcel-0fd17dbe.js +56 -0
- flowfile/web/static/assets/outputExcel-b41305c0.css +102 -0
- flowfile/web/static/assets/outputParquet-b61e0847.js +31 -0
- flowfile/web/static/assets/outputParquet-cf8cf3f2.css +4 -0
- flowfile/web/static/assets/readCsv-a8bb8b61.js +179 -0
- flowfile/web/static/assets/readCsv-c767cb37.css +52 -0
- flowfile/web/static/assets/readExcel-67b4aee0.js +201 -0
- flowfile/web/static/assets/readExcel-806d2826.css +64 -0
- flowfile/web/static/assets/readParquet-48c81530.css +19 -0
- flowfile/web/static/assets/readParquet-92ce1dbc.js +23 -0
- flowfile/web/static/assets/{secretApi-baceb6f9.js → secretApi-68435402.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-de91449a.js → selectDynamic-92e25ee3.js} +7 -7
- flowfile/web/static/assets/{selectDynamic-b062bc9b.css → selectDynamic-aa913ff4.css} +16 -16
- flowfile/web/static/assets/user-defined-icon-0ae16c90.png +0 -0
- flowfile/web/static/assets/{vue-codemirror.esm-dc5e3348.js → vue-codemirror.esm-41b0e0d7.js} +65 -36
- flowfile/web/static/assets/{vue-content-loader.es-ba94b82f.js → vue-content-loader.es-2c8e608f.js} +1 -1
- flowfile/web/static/index.html +2 -2
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/METADATA +5 -3
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/RECORD +191 -121
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/WHEEL +1 -1
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/entry_points.txt +1 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/configs/flow_logger.py +5 -13
- flowfile_core/configs/node_store/__init__.py +30 -0
- flowfile_core/configs/node_store/nodes.py +383 -99
- flowfile_core/configs/node_store/user_defined_node_registry.py +193 -0
- flowfile_core/configs/settings.py +2 -1
- flowfile_core/database/connection.py +5 -21
- flowfile_core/fileExplorer/funcs.py +239 -121
- flowfile_core/flowfile/analytics/analytics_processor.py +1 -0
- flowfile_core/flowfile/code_generator/code_generator.py +62 -64
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +73 -56
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +77 -86
- flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +4 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +19 -34
- flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +23 -23
- flowfile_core/flowfile/flow_data_engine/join/utils.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +9 -4
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +212 -86
- flowfile_core/flowfile/flow_data_engine/utils.py +2 -0
- flowfile_core/flowfile/flow_graph.py +240 -54
- flowfile_core/flowfile/flow_node/flow_node.py +48 -13
- flowfile_core/flowfile/flow_node/models.py +2 -1
- flowfile_core/flowfile/handler.py +24 -5
- flowfile_core/flowfile/manage/compatibility_enhancements.py +404 -41
- flowfile_core/flowfile/manage/io_flowfile.py +394 -0
- flowfile_core/flowfile/node_designer/__init__.py +47 -0
- flowfile_core/flowfile/node_designer/_type_registry.py +197 -0
- flowfile_core/flowfile/node_designer/custom_node.py +371 -0
- flowfile_core/flowfile/node_designer/ui_components.py +277 -0
- flowfile_core/flowfile/schema_callbacks.py +17 -10
- flowfile_core/flowfile/setting_generator/settings.py +15 -10
- flowfile_core/main.py +5 -1
- flowfile_core/routes/routes.py +73 -30
- flowfile_core/routes/user_defined_components.py +55 -0
- flowfile_core/schemas/cloud_storage_schemas.py +0 -2
- flowfile_core/schemas/input_schema.py +228 -65
- flowfile_core/schemas/output_model.py +5 -2
- flowfile_core/schemas/schemas.py +153 -35
- flowfile_core/schemas/transform_schema.py +1083 -412
- flowfile_core/schemas/yaml_types.py +103 -0
- flowfile_core/types.py +156 -0
- flowfile_core/utils/validate_setup.py +3 -1
- flowfile_frame/__init__.py +3 -1
- flowfile_frame/flow_frame.py +31 -24
- flowfile_frame/flow_frame_methods.py +12 -9
- flowfile_worker/__init__.py +9 -35
- flowfile_worker/create/__init__.py +3 -21
- flowfile_worker/create/funcs.py +68 -56
- flowfile_worker/create/models.py +130 -62
- flowfile_worker/main.py +5 -2
- flowfile_worker/routes.py +52 -13
- shared/__init__.py +15 -0
- shared/storage_config.py +258 -0
- tools/migrate/README.md +56 -0
- tools/migrate/__init__.py +12 -0
- tools/migrate/__main__.py +131 -0
- tools/migrate/legacy_schemas.py +621 -0
- tools/migrate/migrate.py +598 -0
- tools/migrate/tests/__init__.py +0 -0
- tools/migrate/tests/conftest.py +23 -0
- tools/migrate/tests/test_migrate.py +627 -0
- tools/migrate/tests/test_migration_e2e.py +1010 -0
- tools/migrate/tests/test_node_migrations.py +813 -0
- flowfile/web/static/assets/GraphSolver-17fd26db.css +0 -68
- flowfile/web/static/assets/Pivot-f415e85f.css +0 -35
- flowfile/web/static/assets/Read-80dc1675.css +0 -197
- flowfile/web/static/assets/Read-c3b1929c.js +0 -701
- flowfile/web/static/assets/RecordCount-4e95f98e.js +0 -122
- flowfile/web/static/assets/Union-89fd73dc.js +0 -146
- flowfile/web/static/assets/Unpivot-246e9bbd.css +0 -77
- flowfile/web/static/assets/nodeTitle-a16db7c3.js +0 -227
- flowfile/web/static/assets/nodeTitle-f4b12bcb.css +0 -134
- flowfile_core/flowfile/manage/open_flowfile.py +0 -135
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info/licenses}/LICENSE +0 -0
- /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0
|
@@ -67,7 +67,7 @@ from flowfile_core.flowfile.sources.external_sources.base_class import ExternalD
|
|
|
67
67
|
T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
|
|
68
68
|
|
|
69
69
|
|
|
70
|
-
def _handle_duplication_join_keys(left_df: T, right_df: T,
|
|
70
|
+
def _handle_duplication_join_keys(left_df: T, right_df: T, join_manager: transform_schemas.JoinInputManager) -> Tuple[T, T, Dict[str, str]]:
|
|
71
71
|
"""Temporarily renames join keys to avoid conflicts during a join.
|
|
72
72
|
|
|
73
73
|
This helper function checks the join type and renames the join key columns
|
|
@@ -86,20 +86,22 @@ def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform
|
|
|
86
86
|
- The (potentially modified) right DataFrame.
|
|
87
87
|
- A dictionary mapping the temporary names back to their desired final names.
|
|
88
88
|
"""
|
|
89
|
+
|
|
89
90
|
def _construct_temp_name(column_name: str) -> str:
|
|
90
91
|
return "__FL_TEMP__"+column_name
|
|
91
|
-
|
|
92
|
+
|
|
93
|
+
if join_manager.how == 'right':
|
|
92
94
|
left_df = left_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
|
|
93
|
-
for jk in
|
|
95
|
+
for jk in join_manager.left_manager.get_join_key_selects())
|
|
94
96
|
reverse_actions = {
|
|
95
97
|
_construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
|
|
96
|
-
for jk in
|
|
97
|
-
elif
|
|
98
|
+
for jk in join_manager.left_manager.get_join_key_selects()}
|
|
99
|
+
elif join_manager.how in ('left', 'inner'):
|
|
98
100
|
right_df = right_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
|
|
99
|
-
|
|
101
|
+
for jk in join_manager.right_manager.get_join_key_selects())
|
|
100
102
|
reverse_actions = {
|
|
101
103
|
_construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
|
|
102
|
-
for jk in
|
|
104
|
+
for jk in join_manager.right_manager.get_join_key_selects()}
|
|
103
105
|
else:
|
|
104
106
|
reverse_actions = {}
|
|
105
107
|
return left_df, right_df, reverse_actions
|
|
@@ -193,7 +195,6 @@ class FlowDataEngine:
|
|
|
193
195
|
_number_of_records_callback: Callable = None
|
|
194
196
|
_data_callback: Callable = None
|
|
195
197
|
|
|
196
|
-
|
|
197
198
|
def __init__(self,
|
|
198
199
|
raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
|
|
199
200
|
path_ref: str = None,
|
|
@@ -1147,7 +1148,7 @@ class FlowDataEngine:
|
|
|
1147
1148
|
return cls(df, schema=schema, calculate_schema_stats=False, number_of_records=0)
|
|
1148
1149
|
|
|
1149
1150
|
@classmethod
|
|
1150
|
-
def create_from_path(cls, received_table: input_schema.
|
|
1151
|
+
def create_from_path(cls, received_table: input_schema.ReceivedTable) -> "FlowDataEngine":
|
|
1151
1152
|
"""Creates a FlowDataEngine from a local file path.
|
|
1152
1153
|
|
|
1153
1154
|
Supports various file types like CSV, Parquet, and Excel.
|
|
@@ -1579,7 +1580,6 @@ class FlowDataEngine:
|
|
|
1579
1580
|
A new `FlowDataEngine` instance containing the sampled data.
|
|
1580
1581
|
"""
|
|
1581
1582
|
logging.info(f'Getting sample of {n_rows} rows')
|
|
1582
|
-
|
|
1583
1583
|
if random:
|
|
1584
1584
|
if self.lazy and self.external_source is not None:
|
|
1585
1585
|
self.collect_external()
|
|
@@ -1657,9 +1657,12 @@ class FlowDataEngine:
|
|
|
1657
1657
|
An `ExternalFuzzyMatchFetcher` object that can be used to track the
|
|
1658
1658
|
progress and retrieve the result of the fuzzy join.
|
|
1659
1659
|
"""
|
|
1660
|
-
|
|
1660
|
+
fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
|
|
1661
|
+
left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
|
|
1662
|
+
fuzzy_match_input_manager=fuzzy_match_input_manager)
|
|
1663
|
+
|
|
1661
1664
|
return ExternalFuzzyMatchFetcher(left_df, right_df,
|
|
1662
|
-
fuzzy_maps=
|
|
1665
|
+
fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
|
|
1663
1666
|
file_ref=file_ref + '_fm',
|
|
1664
1667
|
wait_on_completion=False,
|
|
1665
1668
|
flow_id=flow_id,
|
|
@@ -1674,10 +1677,12 @@ class FlowDataEngine:
|
|
|
1674
1677
|
):
|
|
1675
1678
|
if file_ref is None:
|
|
1676
1679
|
file_ref = str(id(self)) + '_' + str(id(other))
|
|
1680
|
+
fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
|
|
1677
1681
|
|
|
1678
|
-
left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
|
|
1682
|
+
left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
|
|
1683
|
+
fuzzy_match_input_manager=fuzzy_match_input_manager)
|
|
1679
1684
|
external_tracker = ExternalFuzzyMatchFetcher(left_df, right_df,
|
|
1680
|
-
fuzzy_maps=
|
|
1685
|
+
fuzzy_maps=fuzzy_match_input_manager.fuzzy_maps,
|
|
1681
1686
|
file_ref=file_ref + '_fm',
|
|
1682
1687
|
wait_on_completion=False,
|
|
1683
1688
|
flow_id=flow_id,
|
|
@@ -1687,8 +1692,10 @@ class FlowDataEngine:
|
|
|
1687
1692
|
def fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
|
|
1688
1693
|
other: "FlowDataEngine",
|
|
1689
1694
|
node_logger: NodeLogger = None) -> "FlowDataEngine":
|
|
1690
|
-
|
|
1691
|
-
|
|
1695
|
+
fuzzy_match_input_manager = transform_schemas.FuzzyMatchInputManager(fuzzy_match_input)
|
|
1696
|
+
left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
|
|
1697
|
+
fuzzy_match_input_manager=fuzzy_match_input_manager)
|
|
1698
|
+
fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input_manager.fuzzy_maps]
|
|
1692
1699
|
return FlowDataEngine(fuzzy_match_dfs(left_df, right_df, fuzzy_maps=fuzzy_mappings,
|
|
1693
1700
|
logger=node_logger.logger if node_logger else logger)
|
|
1694
1701
|
.lazy())
|
|
@@ -1713,24 +1720,22 @@ class FlowDataEngine:
|
|
|
1713
1720
|
Exception: If `verify_integrity` is True and the join would result in
|
|
1714
1721
|
an excessively large number of records.
|
|
1715
1722
|
"""
|
|
1716
|
-
|
|
1717
1723
|
self.lazy = True
|
|
1718
|
-
|
|
1719
1724
|
other.lazy = True
|
|
1720
|
-
|
|
1721
|
-
verify_join_select_integrity(
|
|
1722
|
-
right_select = [v.old_name for v in
|
|
1725
|
+
cross_join_input_manager = transform_schemas.CrossJoinInputManager(cross_join_input)
|
|
1726
|
+
verify_join_select_integrity(cross_join_input_manager.input, left_columns=self.columns, right_columns=other.columns)
|
|
1727
|
+
right_select = [v.old_name for v in cross_join_input_manager.right_select.renames
|
|
1723
1728
|
if (v.keep or v.join_key) and v.is_available]
|
|
1724
|
-
left_select = [v.old_name for v in
|
|
1729
|
+
left_select = [v.old_name for v in cross_join_input_manager.left_select.renames
|
|
1725
1730
|
if (v.keep or v.join_key) and v.is_available]
|
|
1726
|
-
|
|
1727
|
-
left = self.data_frame.select(left_select).rename(
|
|
1728
|
-
right = other.data_frame.select(right_select).rename(
|
|
1731
|
+
cross_join_input_manager.auto_rename(rename_mode="suffix")
|
|
1732
|
+
left = self.data_frame.select(left_select).rename(cross_join_input_manager.left_select.rename_table)
|
|
1733
|
+
right = other.data_frame.select(right_select).rename(cross_join_input_manager.right_select.rename_table)
|
|
1729
1734
|
|
|
1730
1735
|
joined_df = left.join(right, how='cross')
|
|
1731
1736
|
|
|
1732
1737
|
cols_to_delete_after = [col.new_name for col in
|
|
1733
|
-
|
|
1738
|
+
cross_join_input_manager.left_select.renames + cross_join_input_manager.left_select.renames
|
|
1734
1739
|
if col.join_key and not col.keep and col.is_available]
|
|
1735
1740
|
|
|
1736
1741
|
fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
|
|
@@ -1738,76 +1743,60 @@ class FlowDataEngine:
|
|
|
1738
1743
|
|
|
1739
1744
|
def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
|
|
1740
1745
|
verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
|
|
1741
|
-
"""Performs a standard SQL-style join with another DataFrame.
|
|
1746
|
+
"""Performs a standard SQL-style join with another DataFrame."""
|
|
1747
|
+
# Create manager from input
|
|
1748
|
+
join_manager = transform_schemas.JoinInputManager(join_input)
|
|
1749
|
+
ensure_right_unselect_for_semi_and_anti_joins(join_manager.input)
|
|
1750
|
+
for jk in join_manager.join_mapping:
|
|
1751
|
+
if jk.left_col not in {c.old_name for c in join_manager.left_select.renames}:
|
|
1752
|
+
join_manager.left_select.append(transform_schemas.SelectInput(jk.left_col, keep=False))
|
|
1753
|
+
if jk.right_col not in {c.old_name for c in join_manager.right_select.renames}:
|
|
1754
|
+
join_manager.right_select.append(transform_schemas.SelectInput(jk.right_col, keep=False))
|
|
1755
|
+
verify_join_select_integrity(join_manager.input, left_columns=self.columns, right_columns=other.columns)
|
|
1756
|
+
if not verify_join_map_integrity(join_manager.input, left_columns=self.schema, right_columns=other.schema):
|
|
1757
|
+
raise Exception('Join is not valid by the data fields')
|
|
1742
1758
|
|
|
1743
|
-
|
|
1759
|
+
if auto_generate_selection:
|
|
1760
|
+
join_manager.auto_rename()
|
|
1744
1761
|
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
auto_generate_selection: If True, automatically handles column renaming.
|
|
1749
|
-
verify_integrity: If True, performs checks to prevent excessively large joins.
|
|
1750
|
-
other: The right `FlowDataEngine` to join with.
|
|
1762
|
+
# Use manager properties throughout
|
|
1763
|
+
left = self.data_frame.select(join_manager.left_manager.get_select_cols()).rename(join_manager.left_manager.get_rename_table())
|
|
1764
|
+
right = other.data_frame.select(join_manager.right_manager.get_select_cols()).rename(join_manager.right_manager.get_rename_table())
|
|
1751
1765
|
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
Raises:
|
|
1756
|
-
Exception: If the join configuration is invalid or if `verify_integrity`
|
|
1757
|
-
is True and the join is predicted to be too large.
|
|
1758
|
-
"""
|
|
1759
|
-
ensure_right_unselect_for_semi_and_anti_joins(join_input)
|
|
1760
|
-
verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
|
|
1761
|
-
if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
|
|
1762
|
-
raise Exception('Join is not valid by the data fields')
|
|
1763
|
-
if auto_generate_selection:
|
|
1764
|
-
join_input.auto_rename()
|
|
1765
|
-
left = self.data_frame.select(get_select_columns(join_input.left_select.renames)).rename(join_input.left_select.rename_table)
|
|
1766
|
-
right = other.data_frame.select(get_select_columns(join_input.right_select.renames)).rename(join_input.right_select.rename_table)
|
|
1767
|
-
if verify_integrity and join_input.how != 'right':
|
|
1768
|
-
n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
|
|
1769
|
-
right_on_keys=join_input.right_join_keys, how=join_input.how)
|
|
1770
|
-
if n_records > 1_000_000_000:
|
|
1771
|
-
raise Exception("Join will result in too many records, ending process")
|
|
1772
|
-
else:
|
|
1773
|
-
n_records = -1
|
|
1774
|
-
left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_input)
|
|
1775
|
-
left, right = rename_df_table_for_join(left, right, join_input.get_join_key_renames())
|
|
1776
|
-
if join_input.how == 'right':
|
|
1766
|
+
left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_manager)
|
|
1767
|
+
left, right = rename_df_table_for_join(left, right, join_manager.get_join_key_renames())
|
|
1768
|
+
if join_manager.how == 'right':
|
|
1777
1769
|
joined_df = right.join(
|
|
1778
1770
|
other=left,
|
|
1779
|
-
left_on=
|
|
1780
|
-
right_on=
|
|
1771
|
+
left_on=join_manager.right_join_keys,
|
|
1772
|
+
right_on=join_manager.left_join_keys,
|
|
1781
1773
|
how="left",
|
|
1782
1774
|
suffix="").rename(reverse_join_key_mapping)
|
|
1783
1775
|
else:
|
|
1784
1776
|
joined_df = left.join(
|
|
1785
1777
|
other=right,
|
|
1786
|
-
left_on=
|
|
1787
|
-
right_on=
|
|
1788
|
-
how=
|
|
1778
|
+
left_on=join_manager.left_join_keys,
|
|
1779
|
+
right_on=join_manager.right_join_keys,
|
|
1780
|
+
how=join_manager.how,
|
|
1789
1781
|
suffix="").rename(reverse_join_key_mapping)
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
]
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
]
|
|
1782
|
+
|
|
1783
|
+
left_cols_to_delete_after = [get_col_name_to_delete(col, 'left')
|
|
1784
|
+
for col in join_manager.input.left_select.renames
|
|
1785
|
+
if not col.keep and col.is_available and col.join_key]
|
|
1786
|
+
|
|
1787
|
+
right_cols_to_delete_after = [get_col_name_to_delete(col, 'right')
|
|
1788
|
+
for col in join_manager.input.right_select.renames
|
|
1789
|
+
if not col.keep and col.is_available and col.join_key
|
|
1790
|
+
and join_manager.how in ("left", "right", "inner", "cross", "outer")]
|
|
1791
|
+
|
|
1799
1792
|
if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
|
|
1800
1793
|
joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
|
|
1801
|
-
|
|
1794
|
+
|
|
1795
|
+
undo_join_key_remapping = get_undo_rename_mapping_join(join_manager)
|
|
1802
1796
|
joined_df = joined_df.rename(undo_join_key_remapping)
|
|
1803
1797
|
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
number_of_records=n_records, streamable=False)
|
|
1807
|
-
else:
|
|
1808
|
-
fl = FlowDataEngine(joined_df, calculate_schema_stats=False,
|
|
1809
|
-
number_of_records=0, streamable=False)
|
|
1810
|
-
return fl
|
|
1798
|
+
return FlowDataEngine(joined_df, calculate_schema_stats=False,
|
|
1799
|
+
number_of_records=0, streamable=False)
|
|
1811
1800
|
|
|
1812
1801
|
def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
|
|
1813
1802
|
"""Solves a graph problem represented by 'from' and 'to' columns.
|
|
@@ -2105,7 +2094,7 @@ class FlowDataEngine:
|
|
|
2105
2094
|
A new `FlowDataEngine` instance with the applied formula.
|
|
2106
2095
|
"""
|
|
2107
2096
|
expr = to_expr(func)
|
|
2108
|
-
if output_data_type not in (None,
|
|
2097
|
+
if output_data_type not in (None, transform_schemas.AUTO_DATA_TYPE):
|
|
2109
2098
|
df = self.data_frame.with_columns(expr.cast(output_data_type).alias(col_name))
|
|
2110
2099
|
else:
|
|
2111
2100
|
df = self.data_frame.with_columns(expr.alias(col_name))
|
|
@@ -2134,8 +2123,8 @@ class FlowDataEngine:
|
|
|
2134
2123
|
data_type=output_fs.file_type,
|
|
2135
2124
|
path=output_fs.abs_file_path,
|
|
2136
2125
|
write_mode=output_fs.write_mode,
|
|
2137
|
-
sheet_name=output_fs.
|
|
2138
|
-
delimiter=output_fs.
|
|
2126
|
+
sheet_name=output_fs.sheet_name,
|
|
2127
|
+
delimiter=output_fs.delimiter,
|
|
2139
2128
|
flow_id=flow_id,
|
|
2140
2129
|
node_id=node_id
|
|
2141
2130
|
)
|
|
@@ -2149,8 +2138,8 @@ class FlowDataEngine:
|
|
|
2149
2138
|
data_type=output_fs.file_type,
|
|
2150
2139
|
path=output_fs.abs_file_path,
|
|
2151
2140
|
write_mode=output_fs.write_mode,
|
|
2152
|
-
sheet_name=output_fs.
|
|
2153
|
-
delimiter=output_fs.
|
|
2141
|
+
sheet_name=output_fs.sheet_name,
|
|
2142
|
+
delimiter=output_fs.delimiter,
|
|
2154
2143
|
flow_id=flow_id,
|
|
2155
2144
|
node_id=node_id,
|
|
2156
2145
|
)
|
|
@@ -2239,6 +2228,7 @@ class FlowDataEngine:
|
|
|
2239
2228
|
def _calculate_schema(self) -> List[Dict]:
|
|
2240
2229
|
"""Calculates schema statistics."""
|
|
2241
2230
|
if self.external_source is not None:
|
|
2231
|
+
|
|
2242
2232
|
self.collect_external()
|
|
2243
2233
|
v = utils.calculate_schema(self.data_frame)
|
|
2244
2234
|
return v
|
|
@@ -2256,6 +2246,7 @@ class FlowDataEngine:
|
|
|
2256
2246
|
def create_from_path_worker(cls, received_table: input_schema.ReceivedTable, flow_id: int, node_id: int | str):
|
|
2257
2247
|
"""Creates a FlowDataEngine from a path in a worker process."""
|
|
2258
2248
|
received_table.set_absolute_filepath()
|
|
2249
|
+
|
|
2259
2250
|
external_fetcher = ExternalCreateFetcher(received_table=received_table,
|
|
2260
2251
|
file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
|
|
2261
2252
|
return cls(external_fetcher.get_result())
|
|
@@ -1,44 +1,13 @@
|
|
|
1
1
|
|
|
2
2
|
from dataclasses import dataclass
|
|
3
|
-
from typing import Optional, Any, List, Dict,
|
|
3
|
+
from typing import Optional, Any, List, Dict, Iterable
|
|
4
4
|
|
|
5
5
|
from flowfile_core.schemas import input_schema
|
|
6
6
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
|
|
7
7
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
|
|
8
|
+
from flowfile_core.flowfile.flow_data_engine.flow_file_column.interface import ReadableDataTypeGroup, DataTypeGroup
|
|
9
|
+
from flowfile_core.flowfile.flow_data_engine.flow_file_column.type_registry import convert_pl_type_to_string
|
|
8
10
|
import polars as pl
|
|
9
|
-
# TODO: rename flow_file_column to flowfile_column
|
|
10
|
-
DataTypeGroup = Literal['numeric', 'str', 'date']
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def convert_pl_type_to_string(pl_type: pl.DataType, inner: bool = False) -> str:
|
|
14
|
-
if isinstance(pl_type, pl.List):
|
|
15
|
-
inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
|
|
16
|
-
return f"pl.List({inner_str})"
|
|
17
|
-
elif isinstance(pl_type, pl.Array):
|
|
18
|
-
inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
|
|
19
|
-
return f"pl.Array({inner_str})"
|
|
20
|
-
elif isinstance(pl_type, pl.Decimal):
|
|
21
|
-
precision = pl_type.precision if hasattr(pl_type, 'precision') else None
|
|
22
|
-
scale = pl_type.scale if hasattr(pl_type, 'scale') else None
|
|
23
|
-
if precision is not None and scale is not None:
|
|
24
|
-
return f"pl.Decimal({precision}, {scale})"
|
|
25
|
-
elif precision is not None:
|
|
26
|
-
return f"pl.Decimal({precision})"
|
|
27
|
-
else:
|
|
28
|
-
return "pl.Decimal()"
|
|
29
|
-
elif isinstance(pl_type, pl.Struct):
|
|
30
|
-
# Handle Struct with field definitions
|
|
31
|
-
fields = []
|
|
32
|
-
if hasattr(pl_type, 'fields'):
|
|
33
|
-
for field in pl_type.fields:
|
|
34
|
-
field_name = field.name
|
|
35
|
-
field_type = convert_pl_type_to_string(field.dtype, inner=True)
|
|
36
|
-
fields.append(f'pl.Field("{field_name}", {field_type})')
|
|
37
|
-
field_str = ", ".join(fields)
|
|
38
|
-
return f"pl.Struct([{field_str}])"
|
|
39
|
-
else:
|
|
40
|
-
# For base types, we want the full pl.TypeName format
|
|
41
|
-
return str(pl_type.base_type()) if not inner else f"pl.{pl_type}"
|
|
42
11
|
|
|
43
12
|
|
|
44
13
|
@dataclass
|
|
@@ -52,6 +21,7 @@ class FlowfileColumn:
|
|
|
52
21
|
number_of_empty_values: int
|
|
53
22
|
number_of_unique_values: int
|
|
54
23
|
example_values: str
|
|
24
|
+
data_type_group: ReadableDataTypeGroup
|
|
55
25
|
__sql_type: Optional[Any]
|
|
56
26
|
__is_unique: Optional[bool]
|
|
57
27
|
__nullable: Optional[bool]
|
|
@@ -75,6 +45,7 @@ class FlowfileColumn:
|
|
|
75
45
|
self.__is_unique = None
|
|
76
46
|
self.__sql_type = None
|
|
77
47
|
self.__perc_unique = None
|
|
48
|
+
self.data_type_group = self.get_readable_datatype_group()
|
|
78
49
|
|
|
79
50
|
def __repr__(self):
|
|
80
51
|
"""
|
|
@@ -220,6 +191,20 @@ class FlowfileColumn:
|
|
|
220
191
|
return 'numeric'
|
|
221
192
|
elif self.data_type in ('datetime', 'date', 'Date', 'Datetime', 'Time'):
|
|
222
193
|
return 'date'
|
|
194
|
+
else:
|
|
195
|
+
return 'str'
|
|
196
|
+
|
|
197
|
+
def get_readable_datatype_group(self) -> ReadableDataTypeGroup:
|
|
198
|
+
if self.data_type in ('Utf8', 'VARCHAR', 'CHAR', 'NVARCHAR', 'String'):
|
|
199
|
+
return 'String'
|
|
200
|
+
elif self.data_type in ('fixed_decimal', 'decimal', 'float', 'integer', 'boolean', 'double', 'Int16', 'Int32',
|
|
201
|
+
'Int64', 'Float32', 'Float64', 'Decimal', 'Binary', 'Boolean', 'Uint8', 'Uint16',
|
|
202
|
+
'Uint32', 'Uint64'):
|
|
203
|
+
return 'Numeric'
|
|
204
|
+
elif self.data_type in ('datetime', 'date', 'Date', 'Datetime', 'Time'):
|
|
205
|
+
return 'Date'
|
|
206
|
+
else:
|
|
207
|
+
return 'Other'
|
|
223
208
|
|
|
224
209
|
def get_polars_type(self) -> PlType:
|
|
225
210
|
pl_datatype = cast_str_to_polars_type(self.data_type)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
|
|
2
|
+
from typing import Type, Literal, List, Dict, Union, Tuple
|
|
3
|
+
import polars as pl
|
|
4
|
+
DataTypeGroup = Literal['numeric', 'string', 'datetime', 'boolean', 'binary', 'complex', 'unknown']
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def convert_pl_type_to_string(pl_type: pl.DataType, inner: bool = False) -> str:
|
|
8
|
+
if isinstance(pl_type, pl.List):
|
|
9
|
+
inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
|
|
10
|
+
return f"pl.List({inner_str})"
|
|
11
|
+
elif isinstance(pl_type, pl.Array):
|
|
12
|
+
inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
|
|
13
|
+
return f"pl.Array({inner_str})"
|
|
14
|
+
elif isinstance(pl_type, pl.Decimal):
|
|
15
|
+
precision = pl_type.precision if hasattr(pl_type, 'precision') else None
|
|
16
|
+
scale = pl_type.scale if hasattr(pl_type, 'scale') else None
|
|
17
|
+
if precision is not None and scale is not None:
|
|
18
|
+
return f"pl.Decimal({precision}, {scale})"
|
|
19
|
+
elif precision is not None:
|
|
20
|
+
return f"pl.Decimal({precision})"
|
|
21
|
+
else:
|
|
22
|
+
return "pl.Decimal()"
|
|
23
|
+
elif isinstance(pl_type, pl.Struct):
|
|
24
|
+
# Handle Struct with field definitions
|
|
25
|
+
fields = []
|
|
26
|
+
if hasattr(pl_type, 'fields'):
|
|
27
|
+
for field in pl_type.fields:
|
|
28
|
+
field_name = field.name
|
|
29
|
+
field_type = convert_pl_type_to_string(field.dtype, inner=True)
|
|
30
|
+
fields.append(f'pl.Field("{field_name}", {field_type})')
|
|
31
|
+
field_str = ", ".join(fields)
|
|
32
|
+
return f"pl.Struct([{field_str}])"
|
|
33
|
+
else:
|
|
34
|
+
# For base types, we want the full pl.TypeName format
|
|
35
|
+
return str(pl_type.base_type()) if not inner else f"pl.{pl_type}"
|
|
36
|
+
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from flowfile_core.schemas.transform_schema import FuzzyMatchInput, SelectInput, JoinInputs
|
|
1
|
+
from flowfile_core.schemas.transform_schema import FuzzyMatchInput, SelectInput, JoinInputs, FuzzyMatchInputManager
|
|
2
2
|
from flowfile_core.flowfile.flow_data_engine.join import verify_join_select_integrity, verify_join_map_integrity
|
|
3
3
|
import polars as pl
|
|
4
4
|
from typing import TYPE_CHECKING, Tuple, List
|
|
@@ -15,37 +15,37 @@ def _order_join_inputs_based_on_col_order(col_order: List[str], join_inputs: Joi
|
|
|
15
15
|
Returns:
|
|
16
16
|
None
|
|
17
17
|
"""
|
|
18
|
-
select_map = {select.
|
|
18
|
+
select_map = {select.old_name: select for select in join_inputs.renames}
|
|
19
19
|
ordered_renames = [select_map[col] for col in col_order if col in select_map]
|
|
20
20
|
join_inputs.renames = ordered_renames
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
def _ensure_all_columns_have_select(left: "FlowDataEngine",
|
|
24
24
|
right: "FlowDataEngine",
|
|
25
|
-
|
|
25
|
+
fuzzy_match_input_manager: FuzzyMatchInputManager):
|
|
26
26
|
"""
|
|
27
27
|
Ensure that all columns in the left and right FlowDataEngines are included in the fuzzy match input's select
|
|
28
28
|
statements.
|
|
29
29
|
Args:
|
|
30
|
-
left (FlowDataEngine):
|
|
31
|
-
right (FlowDataEngine):
|
|
32
|
-
|
|
30
|
+
left (FlowDataEngine): Left FlowDataEngine
|
|
31
|
+
right (FlowDataEngine): Right FlowDataEngine
|
|
32
|
+
fuzzy_match_input_manager (FuzzyMatchInputManager): Fuzzy match input manager
|
|
33
33
|
|
|
34
34
|
Returns:
|
|
35
35
|
None
|
|
36
36
|
"""
|
|
37
|
-
right_cols_in_select = {c.old_name for c in
|
|
38
|
-
left_cols_in_select = {c.old_name for c in
|
|
37
|
+
right_cols_in_select = {c.old_name for c in fuzzy_match_input_manager.right_select.renames}
|
|
38
|
+
left_cols_in_select = {c.old_name for c in fuzzy_match_input_manager.left_select.renames}
|
|
39
39
|
|
|
40
|
-
|
|
40
|
+
fuzzy_match_input_manager.left_select.renames.extend(
|
|
41
41
|
[SelectInput(col) for col in left.columns if col not in left_cols_in_select])
|
|
42
|
-
|
|
42
|
+
fuzzy_match_input_manager.right_select.renames.extend(
|
|
43
43
|
[SelectInput(col) for col in right.columns if col not in right_cols_in_select]
|
|
44
44
|
)
|
|
45
45
|
|
|
46
46
|
|
|
47
47
|
def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
|
|
48
|
-
|
|
48
|
+
fuzzy_match_input_manager: FuzzyMatchInputManager) -> Tuple[pl.LazyFrame, pl.LazyFrame]:
|
|
49
49
|
"""
|
|
50
50
|
Prepare two FlowDataEngines for fuzzy matching.
|
|
51
51
|
|
|
@@ -58,22 +58,22 @@ def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
|
|
|
58
58
|
"""
|
|
59
59
|
left.lazy = True
|
|
60
60
|
right.lazy = True
|
|
61
|
-
_ensure_all_columns_have_select(left, right,
|
|
62
|
-
_order_join_inputs_based_on_col_order(left.columns,
|
|
63
|
-
_order_join_inputs_based_on_col_order(right.columns,
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
61
|
+
_ensure_all_columns_have_select(left, right, fuzzy_match_input_manager)
|
|
62
|
+
_order_join_inputs_based_on_col_order(left.columns, fuzzy_match_input_manager.left_select.join_inputs)
|
|
63
|
+
_order_join_inputs_based_on_col_order(right.columns, fuzzy_match_input_manager.right_select.join_inputs)
|
|
64
|
+
verify_join_select_integrity(fuzzy_match_input_manager.fuzzy_input, left_columns=left.columns, right_columns=right.columns)
|
|
65
|
+
if not verify_join_map_integrity(fuzzy_match_input_manager.fuzzy_input, left_columns=left.schema,
|
|
66
|
+
right_columns=right.schema):
|
|
67
67
|
raise Exception('Join is not valid by the data fields')
|
|
68
|
-
fuzzy_match_input = fuzzy_match_input
|
|
69
|
-
fuzzy_match_input.auto_rename()
|
|
70
68
|
|
|
71
|
-
|
|
69
|
+
fuzzy_match_input_manager.auto_rename()
|
|
70
|
+
|
|
71
|
+
right_select = [v.old_name for v in fuzzy_match_input_manager.right_select.renames if
|
|
72
72
|
(v.keep or v.join_key) and v.is_available]
|
|
73
|
-
left_select = [v.old_name for v in
|
|
73
|
+
left_select = [v.old_name for v in fuzzy_match_input_manager.left_select.renames if
|
|
74
74
|
(v.keep or v.join_key) and v.is_available]
|
|
75
75
|
left_df: pl.LazyFrame | pl.DataFrame = left.data_frame.select(left_select).rename(
|
|
76
|
-
|
|
76
|
+
fuzzy_match_input_manager.left_select.rename_table)
|
|
77
77
|
right_df: pl.LazyFrame | pl.DataFrame = right.data_frame.select(right_select).rename(
|
|
78
|
-
|
|
78
|
+
fuzzy_match_input_manager.right_select.rename_table)
|
|
79
79
|
return left_df, right_df
|
|
@@ -16,7 +16,7 @@ def rename_df_table_for_join(left_df: T, right_df: T, join_key_rename: transfor
|
|
|
16
16
|
right_df.rename({r[0]: r[1] for r in join_key_rename.right.join_key_renames}))
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
def get_undo_rename_mapping_join(join_input: transform_schemas.
|
|
19
|
+
def get_undo_rename_mapping_join(join_input: transform_schemas.JoinInputManager) -> Dict[str, str]:
|
|
20
20
|
join_key_rename = join_input.get_join_key_renames(True)
|
|
21
21
|
return {r[1]: r[0] for r in join_key_rename.right.join_key_renames + join_key_rename.left.join_key_renames}
|
|
22
22
|
|
|
@@ -4,9 +4,14 @@ from flowfile_core.schemas import transform_schema
|
|
|
4
4
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
def verify_join_select_integrity(
|
|
8
|
-
|
|
9
|
-
|
|
7
|
+
def verify_join_select_integrity(
|
|
8
|
+
join_input:
|
|
9
|
+
transform_schema.JoinInput |
|
|
10
|
+
transform_schema.CrossJoinInput |
|
|
11
|
+
transform_schema.FuzzyMatchInput |
|
|
12
|
+
transform_schema.JoinInputsManager,
|
|
13
|
+
left_columns: List[str],
|
|
14
|
+
right_columns: List[str]):
|
|
10
15
|
"""
|
|
11
16
|
Verify column availability for join selection and update availability flags.
|
|
12
17
|
|
|
@@ -27,7 +32,7 @@ def verify_join_select_integrity(join_input: transform_schema.JoinInput | transf
|
|
|
27
32
|
c.is_available = True
|
|
28
33
|
|
|
29
34
|
|
|
30
|
-
def verify_join_map_integrity(join_input: transform_schema.JoinInput,
|
|
35
|
+
def verify_join_map_integrity(join_input: transform_schema.JoinInput | transform_schema.FuzzyMatchInput | transform_schema.JoinInputManager,
|
|
31
36
|
left_columns: List[FlowfileColumn],
|
|
32
37
|
right_columns: List[FlowfileColumn]
|
|
33
38
|
):
|