Flowfile 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +4 -3
- flowfile/api.py +5 -2
- flowfile/web/__init__.py +2 -0
- flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
- flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
- flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
- flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
- flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
- flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
- flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
- flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
- flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
- flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
- flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
- flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
- flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
- flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
- flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
- flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
- flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
- flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
- flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
- flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
- flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
- flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
- flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
- flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
- flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
- flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
- flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
- flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
- flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/RECORD +88 -90
- flowfile_core/configs/settings.py +4 -2
- flowfile_core/configs/utils.py +5 -0
- flowfile_core/database/connection.py +1 -3
- flowfile_core/flowfile/code_generator/code_generator.py +36 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +0 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
- flowfile_core/flowfile/flow_graph.py +129 -88
- flowfile_core/flowfile/flow_node/flow_node.py +30 -15
- flowfile_core/flowfile/flow_node/models.py +0 -2
- flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
- flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
- flowfile_core/flowfile/graph_tree/models.py +15 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
- flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
- flowfile_core/flowfile/setting_generator/settings.py +2 -1
- flowfile_core/flowfile/util/execution_orderer.py +9 -0
- flowfile_core/flowfile/util/node_skipper.py +8 -0
- flowfile_core/schemas/schemas.py +46 -3
- flowfile_core/schemas/transform_schema.py +27 -38
- flowfile_core/utils/arrow_reader.py +8 -3
- flowfile_core/utils/validate_setup.py +0 -2
- flowfile_frame/__init__.py +1 -4
- flowfile_frame/expr.py +14 -0
- flowfile_frame/flow_frame.py +34 -5
- flowfile_frame/flow_frame.pyi +5 -6
- flowfile_worker/funcs.py +7 -3
- flowfile_worker/models.py +3 -1
- flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
- flowfile_worker/polars_fuzzy_match/models.py +0 -36
- flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
- flowfile_worker/polars_fuzzy_match/process.py +0 -86
- flowfile_worker/polars_fuzzy_match/utils.py +0 -50
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
- {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from typing import List, Dict, Optional, Set, Tuple
|
|
2
2
|
import polars as pl
|
|
3
3
|
|
|
4
|
+
from pl_fuzzy_frame_match.models import FuzzyMapping
|
|
5
|
+
|
|
4
6
|
from flowfile_core.flowfile.flow_graph import FlowGraph
|
|
5
7
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn, convert_pl_type_to_string
|
|
6
8
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
|
|
@@ -825,6 +827,40 @@ class FlowGraphToPolarsConverter:
|
|
|
825
827
|
self._add_code(f"{var_name} = {input_df}.head(n={settings.sample_size})")
|
|
826
828
|
self._add_code("")
|
|
827
829
|
|
|
830
|
+
@staticmethod
def _transform_fuzzy_mappings_to_string(fuzzy_mappings: List["FuzzyMapping"]) -> str:
    """Render fuzzy mappings as the source text of a Python list literal.

    The returned string is embedded verbatim in generated code, so every
    string-valued field is emitted through ``repr`` rather than wrapped in
    hand-written quotes. A column name containing a quote or backslash
    therefore still yields valid Python. For plain names the output is
    identical to simple single-quoting.

    Args:
        fuzzy_mappings: Objects exposing ``left_col``, ``right_col``,
            ``threshold_score`` and ``fuzzy_type``.

    Returns:
        ``"[FuzzyMapping(...),\\nFuzzyMapping(...)]"``; ``"[]"`` for an empty
        input.
    """
    rendered = [
        (f"FuzzyMapping(left_col={str(fuzzy_mapping.left_col)!r},"
         f" right_col={str(fuzzy_mapping.right_col)!r}, "
         f"threshold_score={fuzzy_mapping.threshold_score}, "
         f"fuzzy_type={str(fuzzy_mapping.fuzzy_type)!r})")
        for fuzzy_mapping in fuzzy_mappings
    ]
    # One mapping per line keeps the generated script readable.
    return "[" + ",\n".join(rendered) + "]"
|
|
843
|
+
|
|
844
|
+
def _handle_fuzzy_match(self, settings: input_schema.NodeFuzzyMatch, var_name: str, input_vars: Dict[str, str]) -> None:
    """Emit the generated code for a fuzzy match node.

    Registers the ``pl_fuzzy_frame_match`` import, resolves the variable
    names of the left and right inputs, emits ``drop`` statements for
    columns flagged to be dropped, and finally emits the
    ``fuzzy_match_dfs(...).lazy()`` assignment to ``var_name``.

    Args:
        settings: Node settings; ``settings.join_input`` supplies the
            left/right selects and the join mapping.
        var_name: Name of the variable the generated result is assigned to.
        input_vars: Mapping of input slot name to generated variable name.
    """
    self.imports.add("from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs")

    left_fallback = input_vars.get('main_0', 'df_left')
    right_fallback = input_vars.get('main_1', 'df_right')
    left_df = input_vars.get('main', left_fallback)
    right_df = input_vars.get('right', right_fallback)

    # Both inputs resolve to the same variable: alias it under a second name
    # so the two sides can be modified independently in the generated code.
    if left_df == right_df:
        self._add_code(f"df_right = {left_df}")
        right_df = "df_right"

    join_input = settings.join_input
    sides = ((left_df, join_input.left_select), (right_df, join_input.right_select))
    for frame_var, select in sides:
        if select.has_drop_cols():
            dropped = [c.old_name for c in select.non_jk_drop_columns]
            self._add_code(f"{frame_var} = {frame_var}.drop({dropped})")

    fuzzy_join_mapping_settings = self._transform_fuzzy_mappings_to_string(join_input.join_mapping)
    call_parts = (
        f"{var_name} = fuzzy_match_dfs(\n",
        f" left_df={left_df}, right_df={right_df},\n",
        f" fuzzy_maps={fuzzy_join_mapping_settings}\n",
        f" ).lazy()",
    )
    self._add_code("".join(call_parts))
|
|
863
|
+
|
|
828
864
|
def _handle_unique(self, settings: input_schema.NodeUnique, var_name: str, input_vars: Dict[str, str]) -> None:
|
|
829
865
|
"""Handle unique/distinct nodes."""
|
|
830
866
|
input_df = input_vars.get('main', 'df')
|
|
@@ -68,7 +68,6 @@ class CloudStorageReader:
|
|
|
68
68
|
def _get_s3_storage_options(connection: 'FullCloudStorageConnection') -> Dict[str, Any]:
|
|
69
69
|
"""Build S3-specific storage options."""
|
|
70
70
|
auth_method = connection.auth_method
|
|
71
|
-
print(f"Building S3 storage options for auth_method: '{auth_method}'")
|
|
72
71
|
if auth_method == "aws-cli":
|
|
73
72
|
return create_storage_options_from_boto_credentials(
|
|
74
73
|
profile_name=connection.connection_name,
|
|
@@ -6,6 +6,8 @@ from dataclasses import dataclass
|
|
|
6
6
|
from math import ceil
|
|
7
7
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator
|
|
8
8
|
|
|
9
|
+
from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
|
|
10
|
+
|
|
9
11
|
# Third-party imports
|
|
10
12
|
from loky import Future
|
|
11
13
|
import polars as pl
|
|
@@ -19,12 +21,12 @@ from pyarrow.parquet import ParquetFile
|
|
|
19
21
|
from flowfile_core.configs import logger
|
|
20
22
|
from flowfile_core.utils.utils import ensure_similarity_dicts
|
|
21
23
|
from flowfile_core.configs.flow_logger import NodeLogger
|
|
22
|
-
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
|
|
23
24
|
from flowfile_core.schemas import (
|
|
24
25
|
cloud_storage_schemas,
|
|
25
26
|
input_schema,
|
|
26
27
|
transform_schema as transform_schemas
|
|
27
28
|
)
|
|
29
|
+
from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
|
|
28
30
|
|
|
29
31
|
# Local imports - Flow File Components
|
|
30
32
|
from flowfile_core.flowfile.flow_data_engine import utils
|
|
@@ -64,6 +66,7 @@ from flowfile_core.flowfile.sources.external_sources.base_class import ExternalD
|
|
|
64
66
|
|
|
65
67
|
T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
|
|
66
68
|
|
|
69
|
+
|
|
67
70
|
def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
|
|
68
71
|
"""Temporarily renames join keys to avoid conflicts during a join.
|
|
69
72
|
|
|
@@ -1563,7 +1566,7 @@ class FlowDataEngine:
|
|
|
1563
1566
|
return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
|
|
1564
1567
|
|
|
1565
1568
|
def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
|
|
1566
|
-
seed: int = None) -> "FlowDataEngine":
|
|
1569
|
+
seed: int = None, execution_location: Optional[ExecutionLocationsLiteral] = None) -> "FlowDataEngine":
|
|
1567
1570
|
"""Gets a sample of rows from the DataFrame.
|
|
1568
1571
|
|
|
1569
1572
|
Args:
|
|
@@ -1571,11 +1574,10 @@ class FlowDataEngine:
|
|
|
1571
1574
|
random: If True, performs random sampling. If False, takes the first n_rows.
|
|
1572
1575
|
shuffle: If True (and `random` is True), shuffles the data before sampling.
|
|
1573
1576
|
seed: A random seed for reproducibility.
|
|
1574
|
-
|
|
1577
|
+
execution_location: Location which is used to calculate the size of the dataframe
|
|
1575
1578
|
Returns:
|
|
1576
1579
|
A new `FlowDataEngine` instance containing the sampled data.
|
|
1577
1580
|
"""
|
|
1578
|
-
n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=OFFLOAD_TO_WORKER))
|
|
1579
1581
|
logging.info(f'Getting sample of {n_rows} rows')
|
|
1580
1582
|
|
|
1581
1583
|
if random:
|
|
@@ -1583,12 +1585,17 @@ class FlowDataEngine:
|
|
|
1583
1585
|
self.collect_external()
|
|
1584
1586
|
|
|
1585
1587
|
if self.lazy and shuffle:
|
|
1586
|
-
sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto")
|
|
1587
|
-
|
|
1588
|
-
shuffle=shuffle)
|
|
1588
|
+
sample_df = (self.data_frame.collect(engine="streaming" if self._streamable else "auto")
|
|
1589
|
+
.sample(n_rows, seed=seed, shuffle=shuffle))
|
|
1589
1590
|
elif shuffle:
|
|
1590
1591
|
sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
|
|
1591
1592
|
else:
|
|
1593
|
+
if execution_location is None:
|
|
1594
|
+
execution_location = get_global_execution_location()
|
|
1595
|
+
n_rows = min(n_rows, self.get_number_of_records(
|
|
1596
|
+
calculate_in_worker_process=execution_location == "remote")
|
|
1597
|
+
)
|
|
1598
|
+
|
|
1592
1599
|
every_n_records = ceil(self.number_of_records / n_rows)
|
|
1593
1600
|
sample_df = self.data_frame.gather_every(every_n_records)
|
|
1594
1601
|
else:
|
|
@@ -1596,7 +1603,7 @@ class FlowDataEngine:
|
|
|
1596
1603
|
self.collect(n_rows)
|
|
1597
1604
|
sample_df = self.data_frame.head(n_rows)
|
|
1598
1605
|
|
|
1599
|
-
return FlowDataEngine(sample_df, schema=self.schema
|
|
1606
|
+
return FlowDataEngine(sample_df, schema=self.schema)
|
|
1600
1607
|
|
|
1601
1608
|
def get_subset(self, n_rows: int = 100) -> "FlowDataEngine":
|
|
1602
1609
|
"""Gets the first `n_rows` from the DataFrame.
|
|
@@ -1650,8 +1657,7 @@ class FlowDataEngine:
|
|
|
1650
1657
|
An `ExternalFuzzyMatchFetcher` object that can be used to track the
|
|
1651
1658
|
progress and retrieve the result of the fuzzy join.
|
|
1652
1659
|
"""
|
|
1653
|
-
left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
|
|
1654
|
-
fuzzy_match_input=fuzzy_match_input)
|
|
1660
|
+
left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
|
|
1655
1661
|
return ExternalFuzzyMatchFetcher(left_df, right_df,
|
|
1656
1662
|
fuzzy_maps=fuzzy_match_input.fuzzy_maps,
|
|
1657
1663
|
file_ref=file_ref + '_fm',
|
|
@@ -1659,59 +1665,33 @@ class FlowDataEngine:
|
|
|
1659
1665
|
flow_id=flow_id,
|
|
1660
1666
|
node_id=node_id)
|
|
1661
1667
|
|
|
1662
|
-
def
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
def fuzzy_match(self, right: "FlowDataEngine", left_on: str, right_on: str,
|
|
1690
|
-
fuzzy_method: str = 'levenshtein', threshold: float = 0.75) -> "FlowDataEngine":
|
|
1691
|
-
"""Performs a simple fuzzy match between two DataFrames on a single column pair.
|
|
1692
|
-
|
|
1693
|
-
This is a convenience method for a common fuzzy join scenario.
|
|
1694
|
-
|
|
1695
|
-
Args:
|
|
1696
|
-
right: The right `FlowDataEngine` to match against.
|
|
1697
|
-
left_on: The column name from the left DataFrame to match on.
|
|
1698
|
-
right_on: The column name from the right DataFrame to match on.
|
|
1699
|
-
fuzzy_method: The fuzzy matching algorithm to use (e.g., 'levenshtein').
|
|
1700
|
-
threshold: The similarity score threshold (0.0 to 1.0) for a match.
|
|
1701
|
-
|
|
1702
|
-
Returns:
|
|
1703
|
-
A new `FlowDataEngine` with the matched data.
|
|
1704
|
-
"""
|
|
1705
|
-
fuzzy_match_input = transform_schemas.FuzzyMatchInput(
|
|
1706
|
-
[transform_schemas.FuzzyMap(
|
|
1707
|
-
left_on, right_on,
|
|
1708
|
-
fuzzy_type=fuzzy_method,
|
|
1709
|
-
threshold_score=threshold
|
|
1710
|
-
)],
|
|
1711
|
-
left_select=self.columns,
|
|
1712
|
-
right_select=right.columns
|
|
1713
|
-
)
|
|
1714
|
-
return self.do_fuzzy_join(fuzzy_match_input, right, str(id(self)))
|
|
1668
|
+
def fuzzy_join_external(self,
                        fuzzy_match_input: transform_schemas.FuzzyMatchInput,
                        other: "FlowDataEngine",
                        file_ref: str = None,
                        flow_id: int = -1,
                        node_id: int = -1
                        ):
    """Run a fuzzy join through ``ExternalFuzzyMatchFetcher`` and wrap the result.

    Args:
        fuzzy_match_input: Fuzzy match settings, including ``fuzzy_maps``.
        other: The right-hand `FlowDataEngine`.
        file_ref: Reference used for the external job. When omitted, one is
            derived from the ``id()`` of both engines.
        flow_id: Flow identifier forwarded to the fetcher.
        node_id: Node identifier forwarded to the fetcher.

    Returns:
        A new `FlowDataEngine` built from the fetcher's ``get_result()``.
    """
    base_ref = file_ref if file_ref is not None else f"{id(self)}_{id(other)}"

    prepared_left, prepared_right = prepare_for_fuzzy_match(
        left=self, right=other, fuzzy_match_input=fuzzy_match_input
    )
    fetcher = ExternalFuzzyMatchFetcher(
        prepared_left,
        prepared_right,
        fuzzy_maps=fuzzy_match_input.fuzzy_maps,
        file_ref=base_ref + '_fm',
        wait_on_completion=False,
        flow_id=flow_id,
        node_id=node_id,
    )
    # NOTE(review): the fetcher is created with wait_on_completion=False and
    # the result is requested right away — presumably get_result() waits for
    # the job itself; confirm against ExternalFuzzyMatchFetcher.
    return FlowDataEngine(fetcher.get_result())
|
|
1686
|
+
|
|
1687
|
+
def fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
               other: "FlowDataEngine",
               node_logger: NodeLogger = None) -> "FlowDataEngine":
    """Perform a fuzzy join in the current process via ``fuzzy_match_dfs``.

    Args:
        fuzzy_match_input: Fuzzy match settings; each entry of ``fuzzy_maps``
            is converted to a ``FuzzyMapping`` from its instance attributes.
        other: The right-hand `FlowDataEngine`.
        node_logger: Optional node logger; when absent the module-level
            ``logger`` is passed to the matcher instead.

    Returns:
        A new `FlowDataEngine` wrapping the lazily converted match result.
    """
    prepared_left, prepared_right = prepare_for_fuzzy_match(
        left=self, right=other, fuzzy_match_input=fuzzy_match_input
    )
    mappings = [FuzzyMapping(**vars(fuzzy_map)) for fuzzy_map in fuzzy_match_input.fuzzy_maps]
    active_logger = node_logger.logger if node_logger else logger
    matched = fuzzy_match_dfs(prepared_left, prepared_right, fuzzy_maps=mappings, logger=active_logger)
    return FlowDataEngine(matched.lazy())
|
|
1715
1695
|
|
|
1716
1696
|
def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
|
|
1717
1697
|
auto_generate_selection: bool, verify_integrity: bool,
|
|
@@ -1733,11 +1713,12 @@ class FlowDataEngine:
|
|
|
1733
1713
|
Exception: If `verify_integrity` is True and the join would result in
|
|
1734
1714
|
an excessively large number of records.
|
|
1735
1715
|
"""
|
|
1716
|
+
|
|
1736
1717
|
self.lazy = True
|
|
1718
|
+
|
|
1737
1719
|
other.lazy = True
|
|
1738
1720
|
|
|
1739
1721
|
verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)
|
|
1740
|
-
|
|
1741
1722
|
right_select = [v.old_name for v in cross_join_input.right_select.renames
|
|
1742
1723
|
if (v.keep or v.join_key) and v.is_available]
|
|
1743
1724
|
left_select = [v.old_name for v in cross_join_input.left_select.renames
|
|
@@ -1746,26 +1727,14 @@ class FlowDataEngine:
|
|
|
1746
1727
|
left = self.data_frame.select(left_select).rename(cross_join_input.left_select.rename_table)
|
|
1747
1728
|
right = other.data_frame.select(right_select).rename(cross_join_input.right_select.rename_table)
|
|
1748
1729
|
|
|
1749
|
-
if verify_integrity:
|
|
1750
|
-
n_records = self.get_number_of_records() * other.get_number_of_records()
|
|
1751
|
-
if n_records > 1_000_000_000:
|
|
1752
|
-
raise Exception("Join will result in too many records, ending process")
|
|
1753
|
-
else:
|
|
1754
|
-
n_records = -1
|
|
1755
|
-
|
|
1756
1730
|
joined_df = left.join(right, how='cross')
|
|
1757
1731
|
|
|
1758
1732
|
cols_to_delete_after = [col.new_name for col in
|
|
1759
1733
|
cross_join_input.left_select.renames + cross_join_input.left_select.renames
|
|
1760
1734
|
if col.join_key and not col.keep and col.is_available]
|
|
1761
1735
|
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
number_of_records=n_records, streamable=False)
|
|
1765
|
-
else:
|
|
1766
|
-
fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False,
|
|
1767
|
-
number_of_records=0, streamable=False)
|
|
1768
|
-
return fl
|
|
1736
|
+
fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
|
|
1737
|
+
return fl
|
|
1769
1738
|
|
|
1770
1739
|
def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
|
|
1771
1740
|
verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
|
|
@@ -1901,7 +1870,7 @@ class FlowDataEngine:
|
|
|
1901
1870
|
other.number_of_records = -1
|
|
1902
1871
|
other = other.select_columns(self.columns)
|
|
1903
1872
|
|
|
1904
|
-
if self.
|
|
1873
|
+
if self.get_number_of_records_in_process() != other.get_number_of_records_in_process():
|
|
1905
1874
|
raise Exception('Number of records is not equal')
|
|
1906
1875
|
|
|
1907
1876
|
if self.columns != other.columns:
|
|
@@ -1937,6 +1906,18 @@ class FlowDataEngine:
|
|
|
1937
1906
|
).result
|
|
1938
1907
|
return number_of_records
|
|
1939
1908
|
|
|
1909
|
+
def get_number_of_records_in_process(self, force_calculate: bool = False):
    """Get the number of records without requesting worker-process calculation.

    Delegates to ``get_number_of_records`` and forwards only
    ``force_calculate``; every other option keeps its default.

    Args:
        force_calculate: If True, forces recalculation even if a value is cached.

    Returns:
        The total number of records.
    """
    record_count = self.get_number_of_records(force_calculate=force_calculate)
    return record_count
|
|
1920
|
+
|
|
1940
1921
|
def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
|
|
1941
1922
|
calculate_in_worker_process: bool = False) -> int:
|
|
1942
1923
|
"""Gets the total number of records in the DataFrame.
|
|
@@ -1956,7 +1937,6 @@ class FlowDataEngine:
|
|
|
1956
1937
|
"""
|
|
1957
1938
|
if self.is_future and not self.is_collected:
|
|
1958
1939
|
return -1
|
|
1959
|
-
calculate_in_worker_process = False if not OFFLOAD_TO_WORKER.value else calculate_in_worker_process
|
|
1960
1940
|
if self.number_of_records is None or self.number_of_records < 0 or force_calculate:
|
|
1961
1941
|
if self._number_of_records_callback is not None:
|
|
1962
1942
|
self._number_of_records_callback(self)
|
|
@@ -76,6 +76,67 @@ class FlowfileColumn:
|
|
|
76
76
|
self.__sql_type = None
|
|
77
77
|
self.__perc_unique = None
|
|
78
78
|
|
|
79
|
+
def __repr__(self):
    """Return a compact one-line representation for debugging.

    Shows the column name, data type, size and number of empty values.
    """
    fields = (
        f"name='{self.column_name}'",
        f"type={self.data_type}",
        f"size={self.size}",
        f"nulls={self.number_of_empty_values}",
    )
    return "FlowfileColumn(" + ", ".join(fields) + ")"
|
|
88
|
+
|
|
89
|
+
def __str__(self):
    """Return a multi-line, human-readable summary of the column metadata.

    The header line is always present. Every other line is emitted only when
    the attribute it describes is set, so unset metadata leaves no gaps.
    """
    body = []

    if self.data_type is not None:
        body.append(f" Type: {self.data_type}")

    if self.size is not None:
        body.append(f" Non-Nulls: {self.size}")
        # The null share needs both counts; size is treated as the
        # non-null count when computing the total.
        if self.number_of_empty_values is not None:
            total_entries = self.size + self.number_of_empty_values
            if total_entries > 0:
                null_perc = (self.number_of_empty_values / total_entries) * 100
                null_info = f"{self.number_of_empty_values} ({null_perc:.1f}%)"
            else:
                null_info = "0 (0.0%)"
            body.append(f" Nulls: {null_info}")

    if self.number_of_unique_values is not None:
        body.append(f" Unique: {self.number_of_unique_values}")

    # Stats section appears only if at least one statistic is known.
    labelled_stats = (
        ("Min", self.min_value),
        ("Max", self.max_value),
        ("Mean", self.average_value),
    )
    stat_lines = [f" {label}: {value}" for label, value in labelled_stats if value is not None]
    if stat_lines:
        body.append(" Stats:")
        body.extend(stat_lines)

    if self.example_values:
        example_str = str(self.example_values)
        # Cap the examples at 70 characters, ellipsis included.
        if len(example_str) > 70:
            example_str = example_str[:67] + '...'
        body.append(f" Examples: {example_str}")

    header = f"<FlowfileColumn: '{self.column_name}'>"
    return f"{header}\n" + "\n".join(body)
|
|
139
|
+
|
|
79
140
|
@classmethod
|
|
80
141
|
def create_from_polars_type(cls, polars_type: PlType, **kwargs) -> "FlowfileColumn":
|
|
81
142
|
for k, v in kwargs.items():
|
|
@@ -1,12 +1,49 @@
|
|
|
1
|
-
from flowfile_core.schemas.transform_schema import FuzzyMatchInput
|
|
1
|
+
from flowfile_core.schemas.transform_schema import FuzzyMatchInput, SelectInput, JoinInputs
|
|
2
2
|
from flowfile_core.flowfile.flow_data_engine.join import verify_join_select_integrity, verify_join_map_integrity
|
|
3
3
|
import polars as pl
|
|
4
|
-
from typing import TYPE_CHECKING, Tuple
|
|
4
|
+
from typing import TYPE_CHECKING, Tuple, List
|
|
5
5
|
|
|
6
6
|
if TYPE_CHECKING:
|
|
7
7
|
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
def _order_join_inputs_based_on_col_order(col_order: List[str], join_inputs: "JoinInputs") -> None:
    """
    Reorder ``join_inputs.renames`` in place to follow the incoming column order.

    Selects are matched to ``col_order`` on ``old_name``: the names in
    ``col_order`` are those of the incoming frame, i.e. the names before any
    rename is applied. Matching on ``new_name`` would fail to find every
    select that renames its column and silently drop it from the selection.

    Selects whose ``old_name`` does not occur in ``col_order`` are left out
    of the reordered list.

    Args:
        col_order: Column names of the incoming frame, in frame order.
        join_inputs: Object whose ``renames`` list is replaced.

    Returns:
        None
    """
    select_by_incoming_name = {select.old_name: select for select in join_inputs.renames}
    join_inputs.renames = [
        select_by_incoming_name[col] for col in col_order if col in select_by_incoming_name
    ]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _ensure_all_columns_have_select(left: "FlowDataEngine",
                                    right: "FlowDataEngine",
                                    fuzzy_match_input: FuzzyMatchInput):
    """
    Add a default select for every input column the fuzzy match input does not mention yet.

    Columns are compared against the ``old_name`` of the existing selects;
    each missing column gets a ``SelectInput(col)`` appended to the matching
    side. ``fuzzy_match_input`` is modified in place.

    Args:
        left (FlowDataEngine): Left input; its ``columns`` are checked against ``left_select``.
        right (FlowDataEngine): Right input; its ``columns`` are checked against ``right_select``.
        fuzzy_match_input (FuzzyMatchInput): Settings whose select lists are extended.

    Returns:
        None
    """
    right_select = fuzzy_match_input.right_select
    left_select = fuzzy_match_input.left_select

    # Snapshot the known names on both sides before extending either list.
    known_right = {select.old_name for select in right_select.renames}
    known_left = {select.old_name for select in left_select.renames}

    sides = ((left, left_select, known_left), (right, right_select, known_right))
    for engine, select, known in sides:
        missing = [SelectInput(col) for col in engine.columns if col not in known]
        select.renames.extend(missing)
|
|
45
|
+
|
|
46
|
+
|
|
10
47
|
def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
|
|
11
48
|
fuzzy_match_input: FuzzyMatchInput) -> Tuple[pl.LazyFrame, pl.LazyFrame]:
|
|
12
49
|
"""
|
|
@@ -19,14 +56,18 @@ def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
|
|
|
19
56
|
Returns:
|
|
20
57
|
Tuple[pl.LazyFrame, pl.LazyFrame]: Prepared left and right lazy frames
|
|
21
58
|
"""
|
|
22
|
-
|
|
23
59
|
left.lazy = True
|
|
24
60
|
right.lazy = True
|
|
61
|
+
_ensure_all_columns_have_select(left, right, fuzzy_match_input)
|
|
62
|
+
_order_join_inputs_based_on_col_order(left.columns, fuzzy_match_input.left_select)
|
|
63
|
+
_order_join_inputs_based_on_col_order(right.columns, fuzzy_match_input.right_select)
|
|
64
|
+
|
|
25
65
|
verify_join_select_integrity(fuzzy_match_input, left_columns=left.columns, right_columns=right.columns)
|
|
26
66
|
if not verify_join_map_integrity(fuzzy_match_input, left_columns=left.schema, right_columns=right.schema):
|
|
27
67
|
raise Exception('Join is not valid by the data fields')
|
|
28
68
|
fuzzy_match_input = fuzzy_match_input
|
|
29
69
|
fuzzy_match_input.auto_rename()
|
|
70
|
+
|
|
30
71
|
right_select = [v.old_name for v in fuzzy_match_input.right_select.renames if
|
|
31
72
|
(v.keep or v.join_key) and v.is_available]
|
|
32
73
|
left_select = [v.old_name for v in fuzzy_match_input.left_select.renames if
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from typing import Any, Optional, Literal
|
|
2
2
|
from pydantic import BaseModel
|
|
3
|
-
from
|
|
3
|
+
from pl_fuzzy_frame_match.models import FuzzyMapping
|
|
4
4
|
|
|
5
5
|
OperationType = Literal['store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'store_sample']
|
|
6
6
|
|
|
@@ -20,8 +20,8 @@ class FuzzyJoinInput(BaseModel):
|
|
|
20
20
|
cache_dir: Optional[str] = None
|
|
21
21
|
left_df_operation: PolarsOperation
|
|
22
22
|
right_df_operation: PolarsOperation
|
|
23
|
-
fuzzy_maps: list[
|
|
24
|
-
flowfile_node_id: int|str
|
|
23
|
+
fuzzy_maps: list[FuzzyMapping]
|
|
24
|
+
flowfile_node_id: int | str
|
|
25
25
|
flowfile_flow_id: int
|
|
26
26
|
|
|
27
27
|
|
|
@@ -9,11 +9,12 @@ from uuid import uuid4
|
|
|
9
9
|
import polars as pl
|
|
10
10
|
import requests
|
|
11
11
|
|
|
12
|
+
from pl_fuzzy_frame_match.models import FuzzyMapping
|
|
13
|
+
|
|
12
14
|
from flowfile_core.configs import logger
|
|
13
15
|
from flowfile_core.configs.settings import WORKER_URL
|
|
14
16
|
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.models import (
|
|
15
17
|
FuzzyJoinInput,
|
|
16
|
-
FuzzyMap,
|
|
17
18
|
OperationType,
|
|
18
19
|
PolarsOperation,
|
|
19
20
|
Status
|
|
@@ -53,7 +54,7 @@ def trigger_sample_operation(lf: pl.LazyFrame, file_ref: str, flow_id: int, node
|
|
|
53
54
|
|
|
54
55
|
|
|
55
56
|
def trigger_fuzzy_match_operation(left_df: pl.LazyFrame, right_df: pl.LazyFrame,
|
|
56
|
-
fuzzy_maps: List[
|
|
57
|
+
fuzzy_maps: List[FuzzyMapping],
|
|
57
58
|
file_ref: str,
|
|
58
59
|
flow_id: int,
|
|
59
60
|
node_id: int | str) -> Status:
|
|
@@ -122,6 +123,8 @@ def results_exists(file_ref: str):
|
|
|
122
123
|
return False
|
|
123
124
|
except requests.RequestException as e:
|
|
124
125
|
logger.error(f"Failed to check results existence: {str(e)}")
|
|
126
|
+
if "Connection refused" in str(e):
|
|
127
|
+
logger.info("")
|
|
125
128
|
return False
|
|
126
129
|
|
|
127
130
|
|