Flowfile 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +3 -3
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
- flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
- flowfile/web/static/assets/api-fb67319c.js +80 -0
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/RECORD +108 -103
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +2 -0
- flowfile_core/configs/node_store/nodes.py +8 -6
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +401 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +119 -82
- flowfile_core/flowfile/flow_node/flow_node.py +68 -33
- flowfile_core/flowfile/flow_node/models.py +32 -3
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/utils.py +1 -23
- flowfile_core/main.py +3 -2
- flowfile_core/routes/cloud_connections.py +81 -0
- flowfile_core/routes/logs.py +0 -1
- flowfile_core/routes/routes.py +3 -39
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +37 -15
- flowfile_core/schemas/schemas.py +7 -2
- flowfile_core/schemas/transform_schema.py +97 -22
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/flow_frame.py +253 -102
- flowfile_frame/flow_frame_methods.py +13 -13
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +291 -0
- test_utils/s3/fixtures.py +209 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
flowfile_frame/flow_frame.py
CHANGED
|
@@ -102,17 +102,17 @@ def _extract_expr_parts(expr_obj) -> tuple[str, str]:
|
|
|
102
102
|
def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr | None = None,
|
|
103
103
|
group_expr: pl.Expr | None = None) -> None:
|
|
104
104
|
if method_name is None:
|
|
105
|
-
raise
|
|
105
|
+
raise NotImplementedError("Cannot create a polars lambda expression without the method")
|
|
106
106
|
if polars_expr is None:
|
|
107
|
-
raise
|
|
107
|
+
raise NotImplementedError("Cannot create polars expressions with lambda function")
|
|
108
108
|
method_ref = getattr(pl.LazyFrame, method_name)
|
|
109
109
|
if method_ref is None:
|
|
110
110
|
raise ModuleNotFoundError(f"Could not find the method {method_name} in polars lazyframe")
|
|
111
111
|
if method_name == 'group_by':
|
|
112
112
|
if group_expr is None:
|
|
113
|
-
raise
|
|
113
|
+
raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
|
|
114
114
|
if not all(isinstance(ge, pl.Expr) for ge in group_expr):
|
|
115
|
-
raise
|
|
115
|
+
raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
|
|
116
116
|
|
|
117
117
|
|
|
118
118
|
def generate_node_id() -> int:
|
|
@@ -272,7 +272,6 @@ class FlowFrame:
|
|
|
272
272
|
data = pl.LazyFrame()
|
|
273
273
|
if not isinstance(data, pl.LazyFrame):
|
|
274
274
|
return
|
|
275
|
-
|
|
276
275
|
self.node_id = node_id or generate_node_id()
|
|
277
276
|
self.parent_node_id = parent_node_id
|
|
278
277
|
|
|
@@ -535,18 +534,18 @@ class FlowFrame:
|
|
|
535
534
|
self.flow_graph.add_polars_code(polars_code_settings)
|
|
536
535
|
|
|
537
536
|
def join(
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
537
|
+
self,
|
|
538
|
+
other,
|
|
539
|
+
on: List[str | Column] | str | Column = None,
|
|
540
|
+
how: str = "inner",
|
|
541
|
+
left_on: List[str | Column] | str | Column = None,
|
|
542
|
+
right_on: List[str | Column] | str | Column = None,
|
|
543
|
+
suffix: str = "_right",
|
|
544
|
+
validate: str = None,
|
|
545
|
+
nulls_equal: bool = False,
|
|
546
|
+
coalesce: bool = None,
|
|
547
|
+
maintain_order: Literal[None, "left", "right", "left_right", "right_left"] = None,
|
|
548
|
+
description: str = None,
|
|
550
549
|
):
|
|
551
550
|
"""
|
|
552
551
|
Add a join operation to the Logical Plan.
|
|
@@ -591,27 +590,90 @@ class FlowFrame:
|
|
|
591
590
|
FlowFrame
|
|
592
591
|
New FlowFrame with join operation applied.
|
|
593
592
|
"""
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
593
|
+
# Step 1: Determine if we need to use Polars code
|
|
594
|
+
use_polars_code = self._should_use_polars_code_for_join(
|
|
595
|
+
maintain_order, coalesce, nulls_equal, validate, suffix
|
|
596
|
+
)
|
|
597
|
+
|
|
598
|
+
# Step 2: Ensure both FlowFrames are in the same graph
|
|
599
|
+
self._ensure_same_graph(other)
|
|
600
|
+
|
|
601
|
+
# Step 3: Generate new node ID
|
|
602
|
+
new_node_id = generate_node_id()
|
|
603
|
+
|
|
604
|
+
# Step 4: Parse and validate join columns
|
|
605
|
+
left_columns, right_columns = self._parse_join_columns(
|
|
606
|
+
on, left_on, right_on, how
|
|
607
|
+
)
|
|
608
|
+
|
|
609
|
+
# Step 5: Validate column lists have same length (except for cross join)
|
|
610
|
+
if how != 'cross' and left_columns is not None and right_columns is not None:
|
|
611
|
+
if len(left_columns) != len(right_columns):
|
|
612
|
+
raise ValueError(
|
|
613
|
+
f"Length mismatch: left columns ({len(left_columns)}) != right columns ({len(right_columns)})"
|
|
614
|
+
)
|
|
599
615
|
|
|
616
|
+
# Step 6: Create join mappings if not using Polars code
|
|
600
617
|
join_mappings = None
|
|
618
|
+
if not use_polars_code and how != 'cross':
|
|
619
|
+
join_mappings, use_polars_code = _create_join_mappings(
|
|
620
|
+
left_columns or [], right_columns or []
|
|
621
|
+
)
|
|
622
|
+
|
|
623
|
+
# Step 7: Execute join based on approach
|
|
624
|
+
if use_polars_code or suffix != '_right':
|
|
625
|
+
return self._execute_polars_code_join(
|
|
626
|
+
other, new_node_id, on, left_on, right_on, left_columns, right_columns,
|
|
627
|
+
how, suffix, validate, nulls_equal, coalesce, maintain_order, description
|
|
628
|
+
)
|
|
629
|
+
elif join_mappings or how == 'cross':
|
|
630
|
+
return self._execute_native_join(
|
|
631
|
+
other, new_node_id, join_mappings, how, description
|
|
632
|
+
)
|
|
633
|
+
else:
|
|
634
|
+
raise ValueError("Could not execute join")
|
|
635
|
+
|
|
636
|
+
def _should_use_polars_code_for_join(
|
|
637
|
+
self, maintain_order, coalesce, nulls_equal, validate, suffix
|
|
638
|
+
) -> bool:
|
|
639
|
+
"""Determine if we should use Polars code instead of native join."""
|
|
640
|
+
return not (
|
|
641
|
+
maintain_order is None and
|
|
642
|
+
coalesce is None and
|
|
643
|
+
nulls_equal is False and
|
|
644
|
+
validate is None and
|
|
645
|
+
suffix == '_right'
|
|
646
|
+
)
|
|
647
|
+
|
|
648
|
+
def _ensure_same_graph(self, other: "FlowFrame") -> None:
|
|
649
|
+
"""Ensure both FlowFrames are in the same graph, combining if necessary."""
|
|
601
650
|
if self.flow_graph.flow_id != other.flow_graph.flow_id:
|
|
602
|
-
combined_graph, node_mappings = combine_flow_graphs_with_mapping(
|
|
651
|
+
combined_graph, node_mappings = combine_flow_graphs_with_mapping(
|
|
652
|
+
self.flow_graph, other.flow_graph
|
|
653
|
+
)
|
|
654
|
+
|
|
603
655
|
new_self_node_id = node_mappings.get((self.flow_graph.flow_id, self.node_id), None)
|
|
604
656
|
new_other_node_id = node_mappings.get((other.flow_graph.flow_id, other.node_id), None)
|
|
657
|
+
|
|
605
658
|
if new_other_node_id is None or new_self_node_id is None:
|
|
606
659
|
raise ValueError("Cannot remap the nodes")
|
|
660
|
+
|
|
607
661
|
self.node_id = new_self_node_id
|
|
608
662
|
other.node_id = new_other_node_id
|
|
609
663
|
self.flow_graph = combined_graph
|
|
610
664
|
other.flow_graph = combined_graph
|
|
665
|
+
|
|
611
666
|
global node_id_counter
|
|
612
667
|
node_id_counter += len(combined_graph.nodes)
|
|
613
|
-
new_node_id = generate_node_id()
|
|
614
668
|
|
|
669
|
+
def _parse_join_columns(
|
|
670
|
+
self,
|
|
671
|
+
on: List[str | Column] | str | Column,
|
|
672
|
+
left_on: List[str | Column] | str | Column,
|
|
673
|
+
right_on: List[str | Column] | str | Column,
|
|
674
|
+
how: str
|
|
675
|
+
) -> tuple[List[str] | None, List[str] | None]:
|
|
676
|
+
"""Parse and validate join column specifications."""
|
|
615
677
|
if on is not None:
|
|
616
678
|
left_columns = right_columns = _normalize_columns_to_list(on)
|
|
617
679
|
elif left_on is not None and right_on is not None:
|
|
@@ -623,93 +685,182 @@ class FlowFrame:
|
|
|
623
685
|
else:
|
|
624
686
|
raise ValueError("Must specify either 'on' or both 'left_on' and 'right_on'")
|
|
625
687
|
|
|
626
|
-
|
|
627
|
-
if how != 'cross' and len(left_columns) != len(right_columns):
|
|
628
|
-
raise ValueError(
|
|
629
|
-
f"Length mismatch: left columns ({len(left_columns)}) != right columns ({len(right_columns)})"
|
|
630
|
-
)
|
|
631
|
-
if not use_polars_code:
|
|
632
|
-
join_mappings, use_polars_code = _create_join_mappings(
|
|
633
|
-
left_columns or [], right_columns or []
|
|
634
|
-
)
|
|
688
|
+
return left_columns, right_columns
|
|
635
689
|
|
|
636
|
-
|
|
690
|
+
def _execute_polars_code_join(
|
|
691
|
+
self,
|
|
692
|
+
other: "FlowFrame",
|
|
693
|
+
new_node_id: int,
|
|
694
|
+
on: List[str | Column] | str | Column,
|
|
695
|
+
left_on: List[str | Column] | str | Column,
|
|
696
|
+
right_on: List[str | Column] | str | Column,
|
|
697
|
+
left_columns: List[str] | None,
|
|
698
|
+
right_columns: List[str] | None,
|
|
699
|
+
how: str,
|
|
700
|
+
suffix: str,
|
|
701
|
+
validate: str,
|
|
702
|
+
nulls_equal: bool,
|
|
703
|
+
coalesce: bool,
|
|
704
|
+
maintain_order: Literal[None, "left", "right", "left_right", "right_left"],
|
|
705
|
+
description: str,
|
|
706
|
+
) -> "FlowFrame":
|
|
707
|
+
"""Execute join using Polars code approach."""
|
|
708
|
+
# Build the code arguments
|
|
709
|
+
code_kwargs = self._build_polars_join_kwargs(
|
|
710
|
+
on, left_on, right_on, left_columns, right_columns,
|
|
711
|
+
how, suffix, validate, nulls_equal, coalesce, maintain_order
|
|
712
|
+
)
|
|
637
713
|
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
_right = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in right_columns) + "]" if right_on else None
|
|
641
|
-
code_kwargs = {"other": "input_df_2", "how": _to_string_val(how), "on": _on, "left_on": _left,
|
|
642
|
-
"right_on": _right, "suffix": _to_string_val(suffix), "validate": _to_string_val(validate),
|
|
643
|
-
"nulls_equal": nulls_equal, "coalesce": coalesce,
|
|
644
|
-
"maintain_order": _to_string_val(maintain_order)}
|
|
645
|
-
kwargs_str = ", ".join(f"{k}={v}" for k, v in code_kwargs.items() if v is not None)
|
|
646
|
-
code = f"input_df_1.join({kwargs_str})"
|
|
647
|
-
self._add_polars_code(new_node_id, code, description, depending_on_ids=[self.node_id, other.node_id])
|
|
648
|
-
self._add_connection(self.node_id, new_node_id, "main")
|
|
649
|
-
other._add_connection(other.node_id, new_node_id, "main")
|
|
650
|
-
result_frame = FlowFrame(
|
|
651
|
-
data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
|
|
652
|
-
flow_graph=self.flow_graph,
|
|
653
|
-
node_id=new_node_id,
|
|
654
|
-
parent_node_id=self.node_id,
|
|
655
|
-
)
|
|
714
|
+
kwargs_str = ", ".join(f"{k}={v}" for k, v in code_kwargs.items() if v is not None)
|
|
715
|
+
code = f"input_df_1.join({kwargs_str})"
|
|
656
716
|
|
|
657
|
-
|
|
717
|
+
# Add the Polars code node
|
|
718
|
+
self._add_polars_code(
|
|
719
|
+
new_node_id, code, description,
|
|
720
|
+
depending_on_ids=[self.node_id, other.node_id]
|
|
721
|
+
)
|
|
658
722
|
|
|
659
|
-
|
|
660
|
-
|
|
723
|
+
# Add connections
|
|
724
|
+
self._add_connection(self.node_id, new_node_id, "main")
|
|
725
|
+
other._add_connection(other.node_id, new_node_id, "main")
|
|
661
726
|
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
right_select=right_select.renames,
|
|
670
|
-
how=how,
|
|
671
|
-
)
|
|
672
|
-
|
|
673
|
-
join_input.auto_rename()
|
|
674
|
-
if how == 'cross':
|
|
675
|
-
cross_join_settings = input_schema.NodeCrossJoin(
|
|
676
|
-
flow_id=self.flow_graph.flow_id,
|
|
677
|
-
node_id=new_node_id,
|
|
678
|
-
cross_join_input=join_input,
|
|
679
|
-
is_setup=True,
|
|
680
|
-
depending_on_ids=[self.node_id, other.node_id],
|
|
681
|
-
description=description or f"Join with {how} strategy",
|
|
682
|
-
auto_generate_selection=True,
|
|
683
|
-
verify_integrity=True,
|
|
684
|
-
)
|
|
727
|
+
# Create and return result frame
|
|
728
|
+
return FlowFrame(
|
|
729
|
+
data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
|
|
730
|
+
flow_graph=self.flow_graph,
|
|
731
|
+
node_id=new_node_id,
|
|
732
|
+
parent_node_id=self.node_id,
|
|
733
|
+
)
|
|
685
734
|
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
735
|
+
def _build_polars_join_kwargs(
|
|
736
|
+
self,
|
|
737
|
+
on: List[str | Column] | str | Column,
|
|
738
|
+
left_on: List[str | Column] | str | Column,
|
|
739
|
+
right_on: List[str | Column] | str | Column,
|
|
740
|
+
left_columns: List[str] | None,
|
|
741
|
+
right_columns: List[str] | None,
|
|
742
|
+
how: str,
|
|
743
|
+
suffix: str,
|
|
744
|
+
validate: str,
|
|
745
|
+
nulls_equal: bool,
|
|
746
|
+
coalesce: bool,
|
|
747
|
+
maintain_order: Literal[None, "left", "right", "left_right", "right_left"],
|
|
748
|
+
) -> dict:
|
|
749
|
+
"""Build kwargs dictionary for Polars join code."""
|
|
750
|
+
|
|
751
|
+
def format_column_list(cols):
|
|
752
|
+
if cols is None:
|
|
753
|
+
return None
|
|
754
|
+
return "[" + ', '.join(
|
|
755
|
+
f"'{v}'" if isinstance(v, str) else str(v)
|
|
756
|
+
for v in _normalize_columns_to_list(cols)
|
|
757
|
+
) + "]"
|
|
758
|
+
|
|
759
|
+
return {
|
|
760
|
+
"other": "input_df_2",
|
|
761
|
+
"how": _to_string_val(how),
|
|
762
|
+
"on": format_column_list(on) if on else None,
|
|
763
|
+
"left_on": format_column_list(left_columns) if left_on else None,
|
|
764
|
+
"right_on": format_column_list(right_columns) if right_on else None,
|
|
765
|
+
"suffix": _to_string_val(suffix),
|
|
766
|
+
"validate": _to_string_val(validate),
|
|
767
|
+
"nulls_equal": nulls_equal,
|
|
768
|
+
"coalesce": coalesce,
|
|
769
|
+
"maintain_order": _to_string_val(maintain_order)
|
|
770
|
+
}
|
|
771
|
+
|
|
772
|
+
def _execute_native_join(
|
|
773
|
+
self,
|
|
774
|
+
other: "FlowFrame",
|
|
775
|
+
new_node_id: int,
|
|
776
|
+
join_mappings: List | None,
|
|
777
|
+
how: str,
|
|
778
|
+
description: str,
|
|
779
|
+
) -> "FlowFrame":
|
|
780
|
+
"""Execute join using native FlowFile join nodes."""
|
|
781
|
+
# Create select inputs for both frames
|
|
782
|
+
left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
|
|
783
|
+
right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
|
|
784
|
+
|
|
785
|
+
# Create appropriate join input based on join type
|
|
786
|
+
if how == 'cross':
|
|
787
|
+
join_input = transform_schema.CrossJoinInput(
|
|
788
|
+
left_select=left_select.renames,
|
|
789
|
+
right_select=right_select.renames,
|
|
708
790
|
)
|
|
709
791
|
else:
|
|
710
|
-
|
|
792
|
+
join_input = transform_schema.JoinInput(
|
|
793
|
+
join_mapping=join_mappings,
|
|
794
|
+
left_select=left_select.renames,
|
|
795
|
+
right_select=right_select.renames,
|
|
796
|
+
how=how,
|
|
797
|
+
)
|
|
798
|
+
|
|
799
|
+
# Configure join input
|
|
800
|
+
join_input.auto_rename()
|
|
801
|
+
for right_column in right_select.renames:
|
|
802
|
+
if right_column.join_key:
|
|
803
|
+
right_column.keep = False
|
|
804
|
+
|
|
805
|
+
# Create and add appropriate node
|
|
806
|
+
if how == 'cross':
|
|
807
|
+
self._add_cross_join_node(new_node_id, join_input, description, other)
|
|
808
|
+
else:
|
|
809
|
+
self._add_regular_join_node(new_node_id, join_input, description, other)
|
|
810
|
+
|
|
811
|
+
# Add connections
|
|
812
|
+
self._add_connection(self.node_id, new_node_id, "main")
|
|
813
|
+
other._add_connection(other.node_id, new_node_id, "right")
|
|
814
|
+
|
|
815
|
+
# Create and return result frame
|
|
816
|
+
return FlowFrame(
|
|
817
|
+
data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
|
|
818
|
+
flow_graph=self.flow_graph,
|
|
819
|
+
node_id=new_node_id,
|
|
820
|
+
parent_node_id=self.node_id,
|
|
821
|
+
)
|
|
711
822
|
|
|
712
|
-
|
|
823
|
+
def _add_cross_join_node(
|
|
824
|
+
self,
|
|
825
|
+
new_node_id: int,
|
|
826
|
+
join_input: "transform_schema.CrossJoinInput",
|
|
827
|
+
description: str,
|
|
828
|
+
other: "FlowFrame",
|
|
829
|
+
) -> None:
|
|
830
|
+
"""Add a cross join node to the graph."""
|
|
831
|
+
cross_join_settings = input_schema.NodeCrossJoin(
|
|
832
|
+
flow_id=self.flow_graph.flow_id,
|
|
833
|
+
node_id=new_node_id,
|
|
834
|
+
cross_join_input=join_input,
|
|
835
|
+
is_setup=True,
|
|
836
|
+
depending_on_ids=[self.node_id, other.node_id],
|
|
837
|
+
description=description or f"Join with cross strategy",
|
|
838
|
+
auto_generate_selection=True,
|
|
839
|
+
verify_integrity=True,
|
|
840
|
+
)
|
|
841
|
+
self.flow_graph.add_cross_join(cross_join_settings)
|
|
842
|
+
|
|
843
|
+
def _add_regular_join_node(
|
|
844
|
+
self,
|
|
845
|
+
new_node_id: int,
|
|
846
|
+
join_input: "transform_schema.JoinInput",
|
|
847
|
+
description: str,
|
|
848
|
+
other: "FlowFrame",
|
|
849
|
+
) -> None:
|
|
850
|
+
"""Add a regular join node to the graph."""
|
|
851
|
+
join_settings = input_schema.NodeJoin(
|
|
852
|
+
flow_id=self.flow_graph.flow_id,
|
|
853
|
+
node_id=new_node_id,
|
|
854
|
+
join_input=join_input,
|
|
855
|
+
auto_generate_selection=True,
|
|
856
|
+
verify_integrity=True,
|
|
857
|
+
pos_x=200,
|
|
858
|
+
pos_y=150,
|
|
859
|
+
is_setup=True,
|
|
860
|
+
depending_on_ids=[self.node_id, other.node_id],
|
|
861
|
+
description=description or f"Join with {join_input.how} strategy",
|
|
862
|
+
)
|
|
863
|
+
self.flow_graph.add_join(join_settings)
|
|
713
864
|
|
|
714
865
|
def _add_number_of_records(self, new_node_id: int, description: str = None) -> "FlowFrame":
|
|
715
866
|
node_number_of_records = input_schema.NodeRecordCount(
|
|
@@ -1,22 +1,20 @@
|
|
|
1
|
-
import
|
|
1
|
+
import io
|
|
2
2
|
import os
|
|
3
|
-
from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable
|
|
4
3
|
from pathlib import Path
|
|
4
|
+
from typing import Any, List, Optional, Union, Dict, Callable
|
|
5
5
|
|
|
6
|
-
import io
|
|
7
6
|
import polars as pl
|
|
8
|
-
from polars._typing import (SchemaDict, IO,PolarsDataType,
|
|
7
|
+
from polars._typing import (SchemaDict, IO, PolarsDataType,
|
|
9
8
|
Sequence, CsvEncoding)
|
|
10
9
|
|
|
11
|
-
from flowfile_core.flowfile.flow_graph import FlowGraph
|
|
12
10
|
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
|
|
11
|
+
from flowfile_core.flowfile.flow_graph import FlowGraph
|
|
13
12
|
from flowfile_core.schemas import input_schema, transform_schema
|
|
14
|
-
|
|
13
|
+
from flowfile_frame.config import logger
|
|
15
14
|
from flowfile_frame.expr import col
|
|
16
|
-
|
|
17
|
-
from flowfile_frame.utils import create_flow_graph
|
|
18
15
|
from flowfile_frame.flow_frame import generate_node_id, FlowFrame
|
|
19
|
-
from flowfile_frame.
|
|
16
|
+
from flowfile_frame.utils import create_flow_graph
|
|
17
|
+
|
|
20
18
|
|
|
21
19
|
def sum(expr):
|
|
22
20
|
"""Sum aggregation function."""
|
|
@@ -140,11 +138,10 @@ def read_csv(
|
|
|
140
138
|
Returns:
|
|
141
139
|
A FlowFrame with the CSV data.
|
|
142
140
|
"""
|
|
143
|
-
node_id = generate_node_id()
|
|
141
|
+
node_id = generate_node_id()
|
|
144
142
|
if flow_graph is None:
|
|
145
|
-
flow_graph = create_flow_graph()
|
|
143
|
+
flow_graph = create_flow_graph()
|
|
146
144
|
flow_id = flow_graph.flow_id
|
|
147
|
-
|
|
148
145
|
current_source_path_for_native = None
|
|
149
146
|
if isinstance(source, (str, os.PathLike)):
|
|
150
147
|
current_source_path_for_native = str(source)
|
|
@@ -216,11 +213,14 @@ def read_csv(
|
|
|
216
213
|
description=read_node_description
|
|
217
214
|
)
|
|
218
215
|
flow_graph.add_read(read_node)
|
|
216
|
+
flow_graph.get_node(1)
|
|
217
|
+
|
|
219
218
|
result_frame = FlowFrame(
|
|
220
219
|
data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
|
|
221
220
|
flow_graph=flow_graph,
|
|
222
221
|
node_id=node_id
|
|
223
222
|
)
|
|
223
|
+
flow_graph.get_node(1)
|
|
224
224
|
return result_frame
|
|
225
225
|
else:
|
|
226
226
|
polars_source_arg = source
|
|
@@ -449,7 +449,7 @@ def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) ->
|
|
|
449
449
|
input_node = input_schema.NodeManualInput(
|
|
450
450
|
flow_id=flow_id,
|
|
451
451
|
node_id=node_id,
|
|
452
|
-
|
|
452
|
+
raw_data_format=FlowDataEngine(data).to_raw_data(),
|
|
453
453
|
pos_x=100,
|
|
454
454
|
pos_y=100,
|
|
455
455
|
is_setup=True,
|