Flowfile 0.3.6-py3-none-any.whl → 0.3.7-py3-none-any.whl
This diff shows the contents of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Potentially problematic release: this version of Flowfile might be problematic.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
- flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
- flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
- flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
- flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
- flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
- flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/METADATA +2 -2
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/RECORD +96 -94
- flowfile_core/__init__.py +1 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +1 -0
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/flowfile/code_generator/code_generator.py +71 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +597 -309
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_graph.py +619 -191
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +500 -89
- flowfile_core/flowfile/flow_node/models.py +125 -20
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +36 -5
- flowfile_core/main.py +32 -13
- flowfile_core/routes/cloud_connections.py +7 -11
- flowfile_core/routes/logs.py +2 -6
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +127 -51
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/input_schema.py +92 -64
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +144 -11
- flowfile_core/schemas/transform_schema.py +82 -17
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/__init__.py +0 -0
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +232 -110
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +150 -12
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- test_utils/s3/data_generator.py +1 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +6 -1
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/models.py +0 -193
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,7 @@
 import io
 import os
 from pathlib import Path
-from typing import Any, List, Optional, Union, Dict, Callable
+from typing import Any, List, Optional, Union, Dict, Callable, Literal
 
 import polars as pl
 from polars._typing import (SchemaDict, IO, PolarsDataType,
@@ -9,12 +9,13 @@ from polars._typing import (SchemaDict, IO, PolarsDataType,
 
 from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
 from flowfile_core.flowfile.flow_graph import FlowGraph
-from flowfile_core.schemas import input_schema, transform_schema
+from flowfile_core.schemas import input_schema, transform_schema, cloud_storage_schemas
 from flowfile_frame.config import logger
 from flowfile_frame.expr import col
-from flowfile_frame.flow_frame import
+from flowfile_frame.flow_frame import FlowFrame
 from flowfile_frame.utils import create_flow_graph
-
+from flowfile_frame.cloud_storage.secret_manager import get_current_user_id
+from flowfile_frame.utils import generate_node_id
 
 def sum(expr):
     """Sum aggregation function."""
@@ -278,6 +279,7 @@ def read_csv(
         node_id=node_id,
     )
 
+
 def _build_polars_code_args(
         source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
         separator: str,
@@ -377,13 +379,13 @@ def _build_polars_code_args(
     return polars_code
 
 
-def read_parquet(
+def read_parquet(source, *, flow_graph: FlowGraph = None, description: str = None,
                  convert_to_absolute_path: bool = True, **options) -> FlowFrame:
     """
     Read a Parquet file into a FlowFrame.
 
     Args:
-
+        source: Path to Parquet file
         flow_graph: if you want to add it to an existing graph
        description: if you want to add a readable name in the frontend (advised)
         convert_to_absolute_path: If the path needs to be set to a fixed location
@@ -392,8 +394,8 @@ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str =
     Returns:
         A FlowFrame with the Parquet data
     """
-    if '~' in
-        file_path = os.path.expanduser(
+    if '~' in source:
+        file_path = os.path.expanduser(source)
     node_id = generate_node_id()
 
     if flow_graph is None:
@@ -403,8 +405,8 @@ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str =
 
     received_table = input_schema.ReceivedTable(
         file_type='parquet',
-        path=
-        name=Path(
+        path=source,
+        name=Path(source).name,
     )
     if convert_to_absolute_path:
         received_table.path = received_table.abs_file_path
@@ -592,7 +594,7 @@ def scan_csv(
 
 
 def scan_parquet(
-
+        source,
         *,
         flow_graph: FlowGraph = None,
         description: str = None,
@@ -608,10 +610,146 @@ def scan_parquet(
     See read_parquet for full documentation.
     """
     return read_parquet(
-
+        source=source,
         flow_graph=flow_graph,
         description=description,
         convert_to_absolute_path=convert_to_absolute_path,
         **options
     )
 
+
+def scan_parquet_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+        description: Optional[str] = None
+) -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              file_format="parquet"),
+        user_id=get_current_user_id(),
+        description=description)
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_csv_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+        delimiter: str = ";",
+        has_header: Optional[bool] = True,
+        encoding: Optional[CsvEncoding] = "utf8") -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              csv_delimiter=delimiter,
+                                                                              csv_encoding=encoding,
+                                                                              csv_has_header=has_header,
+                                                                              file_format="csv"),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_delta(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        version: int = None) -> FlowFrame:
+    node_id = generate_node_id()
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              connection_name=connection_name,
+                                                                              file_format="delta",
+                                                                              delta_version=version),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_json_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+) -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              file_format="json"),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
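The added scan_*_from_cloud_storage helpers all follow the same pattern: build a NodeCloudStorageReader, attach it to a FlowGraph, and wrap the node's result in a FlowFrame. A minimal usage sketch follows (not part of the diff; the bucket paths, the connection name "my-s3", and the import path are illustrative assumptions, since the re-export is not shown in this hunk):

import flowfile_frame as ff  # assumed import path for illustration only

# With scan_mode=None, a trailing "/" or "*" selects "directory", otherwise "single_file"
sales = ff.scan_parquet_from_cloud_storage(
    "s3://data-lake/sales/",            # illustrative path
    connection_name="my-s3",            # illustrative, previously registered connection
    description="partitioned sales data",
)

orders = ff.scan_csv_from_cloud_storage(
    "s3://raw-data/orders/orders.csv",  # illustrative path
    connection_name="my-s3",
    delimiter="|",
    has_header=True,
)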
flowfile_frame/group_frame.py CHANGED
@@ -91,6 +91,8 @@ class GroupByFrame:
             if isinstance(col_expr, str):
                 agg_cols.append(transform_schema.AggColl(old_name=col_expr, agg="groupby"))
             elif isinstance(col_expr, Expr):
+                if col_expr.is_complex:
+                    return False
                 agg_cols.append(transform_schema.AggColl(old_name=col_expr.column_name, agg="groupby"))
             elif isinstance(col_expr, Selector):
                 return False
@@ -151,6 +153,7 @@ class GroupByFrame:
     def _create_agg_node(self, node_id_to_use: int, can_be_converted: bool, agg_cols: list, agg_expressions,
                          named_agg_exprs, convertable_to_code: bool, description: str):
         """Create node for explicit aggregations via self.agg()."""
+
         if can_be_converted:
             group_by_settings = input_schema.NodeGroupBy(
                 flow_id=self.parent.flow_graph.flow_id,
flowfile_frame/utils.py CHANGED
@@ -88,14 +88,23 @@ def _generate_id() -> int:
     return int(uuid.uuid4().int % 100000)
 
 
-def create_flow_graph() -> FlowGraph:
-
+def create_flow_graph(flow_id: int = None) -> FlowGraph:
+    """
+    Create a new FlowGraph instance with a unique flow ID.
+    Parameters
+    - flow_id (int): Optional flow ID. If not provided, a new unique ID will be generated.
+    Returns
+    - FlowGraph: A new instance of FlowGraph with the specified or generated flow ID.
+
+    """
+    if flow_id is None:
+        flow_id = _generate_id()
     flow_settings = schemas.FlowSettings(
         flow_id=flow_id,
         name=f"Flow_{flow_id}",
         path=f"flow_{flow_id}"
     )
-    flow_graph = FlowGraph(
+    flow_graph = FlowGraph(flow_settings=flow_settings)
     flow_graph.flow_settings.execution_location = 'local'  # always create a local frame so that the run time does not attempt to use the flowfile_worker process
     return flow_graph
 
@@ -119,3 +128,16 @@ def stringify_values(v: Any) -> str:
     else:
         # Handle any other types
         return str(v)
+
+
+data = {"c": 0}
+
+
+def generate_node_id() -> int:
+    data["c"] += 1
+    return data["c"]
+
+
+def set_node_id(node_id):
+    """Set the node ID to a specific value."""
+    data["c"] = node_id
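Note that generate_node_id and set_node_id share a module-level counter, so node IDs are sequential per process rather than per graph. A short sketch of the behaviour implied by the hunk above:

from flowfile_frame.utils import generate_node_id, set_node_id

generate_node_id()  # 1
generate_node_id()  # 2
set_node_id(10)     # move the shared counter to 10
generate_node_id()  # 11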
test_utils/s3/data_generator.py CHANGED
@@ -24,6 +24,7 @@ MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
 MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
 MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
 
+
 def _create_single_csv_file(s3_client, df: pl.DataFrame, bucket_name: str):
     """Creates a single CSV file from a DataFrame and uploads it to S3."""
     logger.info("Writing single-file CSV...")
test_utils/s3/demo_data_generator.py ADDED
@@ -0,0 +1,186 @@
+import logging
+import io
+import os
+import tempfile
+import shutil
+import random
+from datetime import datetime, timedelta
+
+# Third-party libraries
+import boto3
+from botocore.client import Config
+import polars as pl
+import pyarrow as pa
+from pyarrow import parquet as pq
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# --- MinIO/S3 Configuration ---
+MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
+MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
+MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
+MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
+MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
+
+# --- Data Generation Functions ---
+
+def _create_sales_data(s3_client, df: pl.DataFrame, bucket_name: str):
+    """
+    Creates partitioned Parquet files for the sales data based on year and month.
+    s3://data-lake/sales/year=YYYY/month=MM/
+    """
+    logger.info("Writing partitioned sales data...")
+    # Use Polars' built-in partitioning
+    # A temporary local directory is needed to stage the partitioned files before uploading
+    with tempfile.TemporaryDirectory() as temp_dir:
+        df.write_parquet(
+            temp_dir,
+            use_pyarrow=True,
+            pyarrow_options={"partition_cols": ["year", "month"]}
+        )
+        # Walk through the local directory and upload files to S3
+        for root, _, files in os.walk(temp_dir):
+            for file in files:
+                if file.endswith(".parquet"):
+                    local_path = os.path.join(root, file)
+                    # Construct the S3 key to match the desired structure
+                    relative_path = os.path.relpath(local_path, temp_dir)
+                    s3_key = f"data-lake/sales/{relative_path.replace(os.path.sep, '/')}"
+                    s3_client.upload_file(local_path, bucket_name, s3_key)
+    logger.info(f"Finished writing sales data to s3://{bucket_name}/data-lake/sales/")
+
+def _create_customers_data(s3_client, df: pl.DataFrame, bucket_name: str):
+    """
+    Creates a Parquet file for the customers data.
+    s3://data-lake/customers/
+    """
+    logger.info("Writing customers Parquet data...")
+    parquet_buffer = io.BytesIO()
+    df.write_parquet(parquet_buffer)
+    parquet_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='data-lake/customers/customers.parquet',
+        Body=parquet_buffer.getvalue()
+    )
+    logger.info(f"Finished writing customers data to s3://{bucket_name}/data-lake/customers/")
+
+
+def _create_orders_data(s3_client, df: pl.DataFrame, bucket_name: str):
+    """
+    Creates a pipe-delimited CSV file for the orders data.
+    s3://raw-data/orders/
+    """
+    logger.info("Writing orders CSV data...")
+    csv_buffer = io.BytesIO()
+    # Write with pipe delimiter and header
+    df.write_csv(csv_buffer, separator="|")
+    csv_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='raw-data/orders/orders.csv',
+        Body=csv_buffer.getvalue()
+    )
+    logger.info(f"Finished writing orders data to s3://{bucket_name}/raw-data/orders/")
+
+def _create_products_data(df: pl.DataFrame):
+    """
+    Creates a local Parquet file for the products data.
+    """
+    logger.info("Writing local products Parquet data...")
+    # Create a directory for local data if it doesn't exist
+    local_data_dir = "local_data"
+    os.makedirs(local_data_dir, exist_ok=True)
+    file_path = os.path.join(local_data_dir, "local_products.parquet")
+    df.write_parquet(file_path)
+    logger.info(f"Finished writing products data to {file_path}")
+
+
+def create_demo_data(endpoint_url: str, access_key: str, secret_key: str, bucket_name: str):
+    """
+    Populates a MinIO bucket with test data matching the schemas from the examples.
+    """
+    logger.info("🚀 Starting data population for flowfile examples...")
+    s3_client = boto3.client(
+        's3',
+        endpoint_url=endpoint_url,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        config=Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+
+    # --- Generate Core DataFrames ---
+    DATA_SIZE = 15_000  # Increased data size for more variety
+    START_DATE = datetime(2022, 1, 1)
+    END_DATE = datetime(2024, 12, 31)
+    TOTAL_DAYS = (END_DATE - START_DATE).days
+
+    # States for region mapping
+    states = ["CA", "OR", "WA", "NY", "NJ", "PA", "TX", "FL", "GA", "IL", "OH", "MI"]
+
+    # Generate base sales data across multiple years
+    sales_data = {
+        "order_id": range(1, DATA_SIZE + 1),
+        "customer_id": [random.randint(100, 299) for _ in range(DATA_SIZE)],
+        "product_id": [random.randint(1, 100) for _ in range(DATA_SIZE)],
+        "order_date": [START_DATE + timedelta(days=random.randint(0, TOTAL_DAYS)) for _ in range(DATA_SIZE)],
+        "quantity": [random.randint(1, 5) for _ in range(DATA_SIZE)],
+        "unit_price": [round(random.uniform(10.0, 500.0), 2) for _ in range(DATA_SIZE)],
+        "discount_rate": [random.choice([0.0, 0.1, 0.15, 0.2, None]) for _ in range(DATA_SIZE)],
+        "status": [random.choice(["completed", "pending", "cancelled"]) for _ in range(DATA_SIZE)],
+        "customer_lifetime_value": [random.uniform(500, 20000) for _ in range(DATA_SIZE)],
+        "state": [random.choice(states) for _ in range(DATA_SIZE)],
+    }
+    sales_df = pl.from_dict(sales_data).with_columns([
+        pl.col("order_date").dt.year().alias("year"),
+        pl.col("order_date").dt.month().alias("month"),
+        # The 'amount' column in the example seems to be the price before discount
+        pl.col("unit_price").alias("amount")
+    ])
+
+    # Generate customers DataFrame
+    unique_customer_ids = sales_df["customer_id"].unique().to_list()
+    customers_df = pl.DataFrame({
+        "customer_id": unique_customer_ids,
+        "customer_segment": [random.choice(["VIP", "Regular", "New"]) for _ in unique_customer_ids]
+    })
+
+    # Generate products DataFrame
+    unique_product_ids = sales_df["product_id"].unique().to_list()
+    # Create a map of product_id to unit_price from the first occurrence in sales_df
+    product_price_map = sales_df.group_by("product_id").agg(pl.first("unit_price")).to_dict(as_series=False)
+    price_dict = dict(zip(product_price_map['product_id'], product_price_map['unit_price']))
+
+    products_df = pl.DataFrame({
+        "product_id": unique_product_ids,
+        "product_category": [random.choice(["Electronics", "Books", "Clothing", "Home Goods"]) for _ in unique_product_ids],
+        "unit_price": [price_dict.get(pid) for pid in unique_product_ids]
+    })
+
+    # Generate orders DataFrame for the CSV file (subset of sales)
+    orders_df = sales_df.select(["customer_id", "product_id", "quantity", "discount_rate"])
+
+    logger.info(f"Generated {len(sales_df)} sales records across {sales_df['year'].n_unique()} years, for {len(customers_df)} customers, and {len(products_df)} products.")
+
+    # --- Write Data to S3 and Local Filesystem ---
+    _create_sales_data(s3_client, sales_df, bucket_name)
+    _create_customers_data(s3_client, customers_df, bucket_name)
+    _create_orders_data(s3_client, orders_df, bucket_name)
+    _create_products_data(products_df)
+
+    logger.info("✅ All test data populated successfully.")
+
+
+if __name__ == '__main__':
+    # The bucket that will be created and populated
+    BUCKET = "flowfile-demo-data"
+
+    create_demo_data(
+        endpoint_url=MINIO_ENDPOINT_URL,
+        access_key=MINIO_ACCESS_KEY,
+        secret_key=MINIO_SECRET_KEY,
+        bucket_name=BUCKET
+    )
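The new demo data generator can also be run by hand against a local MinIO instance; a sketch, assuming MinIO is reachable with the default test credentials used above and that the target bucket already exists (the fixtures below create "demo-bucket" before calling it):

from test_utils.s3.demo_data_generator import create_demo_data

create_demo_data(
    endpoint_url="http://localhost:9000",  # default MINIO_ENDPOINT_URL
    access_key="minioadmin",
    secret_key="minioadmin",
    bucket_name="demo-bucket",             # any existing bucket works
)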
test_utils/s3/fixtures.py CHANGED
@@ -8,6 +8,7 @@ import shutil
 import boto3
 from botocore.client import Config
 from test_utils.s3.data_generator import populate_test_data
+from test_utils.s3.demo_data_generator import create_demo_data
 
 logger = logging.getLogger("s3_fixture")
 
@@ -102,7 +103,7 @@ def create_test_buckets():
     client = get_minio_client()
 
     # Create test buckets
-    buckets = ['test-bucket', 'flowfile-test', 'sample-data', 'worker-test-bucket']
+    buckets = ['test-bucket', 'flowfile-test', 'sample-data', 'worker-test-bucket', 'demo-bucket']
     for bucket in buckets:
         try:
             client.create_bucket(Bucket=bucket)
@@ -176,6 +177,10 @@ def start_minio_container() -> bool:
                            access_key=MINIO_ACCESS_KEY,
                            secret_key=MINIO_SECRET_KEY,
                            bucket_name="test-bucket")
+        create_demo_data(endpoint_url=MINIO_ENDPOINT_URL,
+                         access_key=MINIO_ACCESS_KEY,
+                         secret_key=MINIO_SECRET_KEY,
+                         bucket_name="demo-bucket")
         return True
     return False
 