Flowfile 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (98)
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +1 -0
  3. flowfile/web/__init__.py +2 -2
  4. flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
  5. flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
  6. flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
  7. flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
  8. flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
  11. flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
  12. flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
  13. flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
  14. flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
  15. flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
  16. flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
  17. flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
  18. flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
  19. flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
  20. flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
  21. flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
  22. flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
  23. flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
  24. flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
  26. flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
  27. flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
  28. flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
  29. flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
  31. flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
  33. flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
  34. flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
  35. flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
  37. flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
  38. flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
  39. flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
  40. flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
  41. flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
  42. flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
  43. flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
  44. flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
  45. flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
  49. flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/METADATA +2 -2
  52. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/RECORD +96 -94
  53. flowfile_core/__init__.py +1 -0
  54. flowfile_core/auth/jwt.py +39 -0
  55. flowfile_core/configs/node_store/nodes.py +1 -0
  56. flowfile_core/configs/settings.py +6 -5
  57. flowfile_core/flowfile/code_generator/code_generator.py +71 -0
  58. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +597 -309
  60. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  61. flowfile_core/flowfile/flow_graph.py +619 -191
  62. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  63. flowfile_core/flowfile/flow_node/flow_node.py +500 -89
  64. flowfile_core/flowfile/flow_node/models.py +125 -20
  65. flowfile_core/flowfile/handler.py +2 -33
  66. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  67. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  68. flowfile_core/flowfile/utils.py +36 -5
  69. flowfile_core/main.py +32 -13
  70. flowfile_core/routes/cloud_connections.py +7 -11
  71. flowfile_core/routes/logs.py +2 -6
  72. flowfile_core/routes/public.py +1 -0
  73. flowfile_core/routes/routes.py +127 -51
  74. flowfile_core/routes/secrets.py +72 -14
  75. flowfile_core/schemas/__init__.py +8 -0
  76. flowfile_core/schemas/input_schema.py +92 -64
  77. flowfile_core/schemas/output_model.py +19 -3
  78. flowfile_core/schemas/schemas.py +144 -11
  79. flowfile_core/schemas/transform_schema.py +82 -17
  80. flowfile_frame/__init__.py +9 -1
  81. flowfile_frame/cloud_storage/__init__.py +0 -0
  82. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  83. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  84. flowfile_frame/expr.py +28 -1
  85. flowfile_frame/expr.pyi +76 -61
  86. flowfile_frame/flow_frame.py +232 -110
  87. flowfile_frame/flow_frame.pyi +140 -91
  88. flowfile_frame/flow_frame_methods.py +150 -12
  89. flowfile_frame/group_frame.py +3 -0
  90. flowfile_frame/utils.py +25 -3
  91. test_utils/s3/data_generator.py +1 -0
  92. test_utils/s3/demo_data_generator.py +186 -0
  93. test_utils/s3/fixtures.py +6 -1
  94. flowfile_core/schemas/defaults.py +0 -9
  95. flowfile_core/schemas/models.py +0 -193
  96. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
  97. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
  98. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,7 @@
 import io
 import os
 from pathlib import Path
-from typing import Any, List, Optional, Union, Dict, Callable
+from typing import Any, List, Optional, Union, Dict, Callable, Literal
 
 import polars as pl
 from polars._typing import (SchemaDict, IO, PolarsDataType,
@@ -9,12 +9,13 @@ from polars._typing import (SchemaDict, IO, PolarsDataType,
 
 from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
 from flowfile_core.flowfile.flow_graph import FlowGraph
-from flowfile_core.schemas import input_schema, transform_schema
+from flowfile_core.schemas import input_schema, transform_schema, cloud_storage_schemas
 from flowfile_frame.config import logger
 from flowfile_frame.expr import col
-from flowfile_frame.flow_frame import generate_node_id, FlowFrame
+from flowfile_frame.flow_frame import FlowFrame
 from flowfile_frame.utils import create_flow_graph
-
+from flowfile_frame.cloud_storage.secret_manager import get_current_user_id
+from flowfile_frame.utils import generate_node_id
 
 def sum(expr):
     """Sum aggregation function."""
@@ -278,6 +279,7 @@ def read_csv(
         node_id=node_id,
     )
 
+
 def _build_polars_code_args(
     source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
     separator: str,
@@ -377,13 +379,13 @@ def _build_polars_code_args(
     return polars_code
 
 
-def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
+def read_parquet(source, *, flow_graph: FlowGraph = None, description: str = None,
                  convert_to_absolute_path: bool = True, **options) -> FlowFrame:
     """
     Read a Parquet file into a FlowFrame.
 
     Args:
-        file_path: Path to Parquet file
+        source: Path to Parquet file
         flow_graph: if you want to add it to an existing graph
         description: if you want to add a readable name in the frontend (advised)
         convert_to_absolute_path: If the path needs to be set to a fixed location
@@ -392,8 +394,8 @@ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str =
     Returns:
         A FlowFrame with the Parquet data
     """
-    if '~' in file_path:
-        file_path = os.path.expanduser(file_path)
+    if '~' in source:
+        file_path = os.path.expanduser(source)
     node_id = generate_node_id()
 
     if flow_graph is None:
@@ -403,8 +405,8 @@ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str =
 
     received_table = input_schema.ReceivedTable(
         file_type='parquet',
-        path=file_path,
-        name=Path(file_path).name,
+        path=source,
+        name=Path(source).name,
     )
     if convert_to_absolute_path:
         received_table.path = received_table.abs_file_path
@@ -592,7 +594,7 @@ def scan_csv(
 
 
 def scan_parquet(
-    file_path,
+    source,
     *,
     flow_graph: FlowGraph = None,
     description: str = None,
@@ -608,10 +610,146 @@
     See read_parquet for full documentation.
     """
     return read_parquet(
-        file_path=file_path,
+        source=source,
         flow_graph=flow_graph,
         description=description,
         convert_to_absolute_path=convert_to_absolute_path,
         **options
     )
 
+
+def scan_parquet_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+        description: Optional[str] = None
+) -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              file_format="parquet"),
+        user_id=get_current_user_id(),
+        description=description)
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_csv_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+        delimiter: str = ";",
+        has_header: Optional[bool] = True,
+        encoding: Optional[CsvEncoding] = "utf8") -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              csv_delimiter=delimiter,
+                                                                              csv_encoding=encoding,
+                                                                              csv_has_header=has_header,
+                                                                              file_format="csv"),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_delta(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        version: int = None) -> FlowFrame:
+    node_id = generate_node_id()
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              connection_name=connection_name,
+                                                                              file_format="delta",
+                                                                              delta_version=version),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_json_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+) -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              file_format="json"),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
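The four new scan_*_from_cloud_storage helpers all follow the same pattern: infer scan_mode from the trailing character of the path ("*" or "/" means a directory scan), build a NodeCloudStorageReader with CloudStorageReadSettings, register it on the flow graph, and return the result wrapped in a FlowFrame. A minimal usage sketch follows; the top-level flowfile_frame import path and the "my-s3-conn" connection name are illustrative assumptions, not values taken from this diff.

    # Hedged sketch: assumes a cloud storage connection named "my-s3-conn" has been
    # registered beforehand and that these helpers are re-exported by flowfile_frame.
    import flowfile_frame as ff

    # A trailing "/" makes scan_mode default to "directory".
    sales = ff.scan_parquet_from_cloud_storage(
        "s3://data-lake/sales/",
        connection_name="my-s3-conn",
    )

    # Explicit single-file CSV scan with a pipe delimiter.
    orders = ff.scan_csv_from_cloud_storage(
        "s3://raw-data/orders/orders.csv",
        connection_name="my-s3-conn",
        scan_mode="single_file",
        delimiter="|",
    )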
@@ -91,6 +91,8 @@ class GroupByFrame:
             if isinstance(col_expr, str):
                 agg_cols.append(transform_schema.AggColl(old_name=col_expr, agg="groupby"))
             elif isinstance(col_expr, Expr):
+                if col_expr.is_complex:
+                    return False
                 agg_cols.append(transform_schema.AggColl(old_name=col_expr.column_name, agg="groupby"))
             elif isinstance(col_expr, Selector):
                 return False
@@ -151,6 +153,7 @@
     def _create_agg_node(self, node_id_to_use: int, can_be_converted: bool, agg_cols: list, agg_expressions,
                          named_agg_exprs, convertable_to_code: bool, description: str):
         """Create node for explicit aggregations via self.agg()."""
+
         if can_be_converted:
             group_by_settings = input_schema.NodeGroupBy(
                 flow_id=self.parent.flow_graph.flow_id,
flowfile_frame/utils.py CHANGED
@@ -88,14 +88,23 @@ def _generate_id() -> int:
     return int(uuid.uuid4().int % 100000)
 
 
-def create_flow_graph() -> FlowGraph:
-    flow_id = _generate_id()
+def create_flow_graph(flow_id: int = None) -> FlowGraph:
+    """
+    Create a new FlowGraph instance with a unique flow ID.
+    Parameters
+    - flow_id (int): Optional flow ID. If not provided, a new unique ID will be generated.
+    Returns
+    - FlowGraph: A new instance of FlowGraph with the specified or generated flow ID.
+
+    """
+    if flow_id is None:
+        flow_id = _generate_id()
     flow_settings = schemas.FlowSettings(
         flow_id=flow_id,
         name=f"Flow_{flow_id}",
         path=f"flow_{flow_id}"
     )
-    flow_graph = FlowGraph(flow_id=flow_id, flow_settings=flow_settings)
+    flow_graph = FlowGraph(flow_settings=flow_settings)
     flow_graph.flow_settings.execution_location = 'local'  # always create a local frame so that the run time does not attempt to use the flowfile_worker process
     return flow_graph
 
@@ -119,3 +128,16 @@ def stringify_values(v: Any) -> str:
     else:
         # Handle any other types
         return str(v)
+
+
+data = {"c": 0}
+
+
+def generate_node_id() -> int:
+    data["c"] += 1
+    return data["c"]
+
+
+def set_node_id(node_id):
+    """Set the node ID to a specific value."""
+    data["c"] = node_id
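Two things change in flowfile_frame/utils.py: create_flow_graph now accepts an optional flow_id and no longer passes flow_id directly to the FlowGraph constructor, and node IDs come from a simple module-level counter (generate_node_id / set_node_id) instead of being imported from flow_frame. A short sketch of the counter behaviour implied by this diff (illustrative only):

    from flowfile_frame.utils import create_flow_graph, generate_node_id, set_node_id

    graph = create_flow_graph(flow_id=42)  # reuse a known flow id instead of a generated one
    first = generate_node_id()             # 1
    second = generate_node_id()            # 2
    set_node_id(10)                        # move the counter
    next_id = generate_node_id()           # 11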
@@ -24,6 +24,7 @@ MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
 MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
 MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
 
+
 def _create_single_csv_file(s3_client, df: pl.DataFrame, bucket_name: str):
     """Creates a single CSV file from a DataFrame and uploads it to S3."""
     logger.info("Writing single-file CSV...")
@@ -0,0 +1,186 @@
+import logging
+import io
+import os
+import tempfile
+import shutil
+import random
+from datetime import datetime, timedelta
+
+# Third-party libraries
+import boto3
+from botocore.client import Config
+import polars as pl
+import pyarrow as pa
+from pyarrow import parquet as pq
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# --- MinIO/S3 Configuration ---
+MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
+MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
+MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
+MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
+MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
+
+# --- Data Generation Functions ---
+
+def _create_sales_data(s3_client, df: pl.DataFrame, bucket_name: str):
+    """
+    Creates partitioned Parquet files for the sales data based on year and month.
+    s3://data-lake/sales/year=YYYY/month=MM/
+    """
+    logger.info("Writing partitioned sales data...")
+    # Use Polars' built-in partitioning
+    # A temporary local directory is needed to stage the partitioned files before uploading
+    with tempfile.TemporaryDirectory() as temp_dir:
+        df.write_parquet(
+            temp_dir,
+            use_pyarrow=True,
+            pyarrow_options={"partition_cols": ["year", "month"]}
+        )
+        # Walk through the local directory and upload files to S3
+        for root, _, files in os.walk(temp_dir):
+            for file in files:
+                if file.endswith(".parquet"):
+                    local_path = os.path.join(root, file)
+                    # Construct the S3 key to match the desired structure
+                    relative_path = os.path.relpath(local_path, temp_dir)
+                    s3_key = f"data-lake/sales/{relative_path.replace(os.path.sep, '/')}"
+                    s3_client.upload_file(local_path, bucket_name, s3_key)
+    logger.info(f"Finished writing sales data to s3://{bucket_name}/data-lake/sales/")
+
+def _create_customers_data(s3_client, df: pl.DataFrame, bucket_name: str):
+    """
+    Creates a Parquet file for the customers data.
+    s3://data-lake/customers/
+    """
+    logger.info("Writing customers Parquet data...")
+    parquet_buffer = io.BytesIO()
+    df.write_parquet(parquet_buffer)
+    parquet_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='data-lake/customers/customers.parquet',
+        Body=parquet_buffer.getvalue()
+    )
+    logger.info(f"Finished writing customers data to s3://{bucket_name}/data-lake/customers/")
+
+
+def _create_orders_data(s3_client, df: pl.DataFrame, bucket_name: str):
+    """
+    Creates a pipe-delimited CSV file for the orders data.
+    s3://raw-data/orders/
+    """
+    logger.info("Writing orders CSV data...")
+    csv_buffer = io.BytesIO()
+    # Write with pipe delimiter and header
+    df.write_csv(csv_buffer, separator="|")
+    csv_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='raw-data/orders/orders.csv',
+        Body=csv_buffer.getvalue()
+    )
+    logger.info(f"Finished writing orders data to s3://{bucket_name}/raw-data/orders/")
+
+def _create_products_data(df: pl.DataFrame):
+    """
+    Creates a local Parquet file for the products data.
+    """
+    logger.info("Writing local products Parquet data...")
+    # Create a directory for local data if it doesn't exist
+    local_data_dir = "local_data"
+    os.makedirs(local_data_dir, exist_ok=True)
+    file_path = os.path.join(local_data_dir, "local_products.parquet")
+    df.write_parquet(file_path)
+    logger.info(f"Finished writing products data to {file_path}")
+
+
+def create_demo_data(endpoint_url: str, access_key: str, secret_key: str, bucket_name: str):
+    """
+    Populates a MinIO bucket with test data matching the schemas from the examples.
+    """
+    logger.info("🚀 Starting data population for flowfile examples...")
+    s3_client = boto3.client(
+        's3',
+        endpoint_url=endpoint_url,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        config=Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+
+    # --- Generate Core DataFrames ---
+    DATA_SIZE = 15_000  # Increased data size for more variety
+    START_DATE = datetime(2022, 1, 1)
+    END_DATE = datetime(2024, 12, 31)
+    TOTAL_DAYS = (END_DATE - START_DATE).days
+
+    # States for region mapping
+    states = ["CA", "OR", "WA", "NY", "NJ", "PA", "TX", "FL", "GA", "IL", "OH", "MI"]
+
+    # Generate base sales data across multiple years
+    sales_data = {
+        "order_id": range(1, DATA_SIZE + 1),
+        "customer_id": [random.randint(100, 299) for _ in range(DATA_SIZE)],
+        "product_id": [random.randint(1, 100) for _ in range(DATA_SIZE)],
+        "order_date": [START_DATE + timedelta(days=random.randint(0, TOTAL_DAYS)) for _ in range(DATA_SIZE)],
+        "quantity": [random.randint(1, 5) for _ in range(DATA_SIZE)],
+        "unit_price": [round(random.uniform(10.0, 500.0), 2) for _ in range(DATA_SIZE)],
+        "discount_rate": [random.choice([0.0, 0.1, 0.15, 0.2, None]) for _ in range(DATA_SIZE)],
+        "status": [random.choice(["completed", "pending", "cancelled"]) for _ in range(DATA_SIZE)],
+        "customer_lifetime_value": [random.uniform(500, 20000) for _ in range(DATA_SIZE)],
+        "state": [random.choice(states) for _ in range(DATA_SIZE)],
+    }
+    sales_df = pl.from_dict(sales_data).with_columns([
+        pl.col("order_date").dt.year().alias("year"),
+        pl.col("order_date").dt.month().alias("month"),
+        # The 'amount' column in the example seems to be the price before discount
+        pl.col("unit_price").alias("amount")
+    ])
+
+    # Generate customers DataFrame
+    unique_customer_ids = sales_df["customer_id"].unique().to_list()
+    customers_df = pl.DataFrame({
+        "customer_id": unique_customer_ids,
+        "customer_segment": [random.choice(["VIP", "Regular", "New"]) for _ in unique_customer_ids]
+    })
+
+    # Generate products DataFrame
+    unique_product_ids = sales_df["product_id"].unique().to_list()
+    # Create a map of product_id to unit_price from the first occurrence in sales_df
+    product_price_map = sales_df.group_by("product_id").agg(pl.first("unit_price")).to_dict(as_series=False)
+    price_dict = dict(zip(product_price_map['product_id'], product_price_map['unit_price']))
+
+    products_df = pl.DataFrame({
+        "product_id": unique_product_ids,
+        "product_category": [random.choice(["Electronics", "Books", "Clothing", "Home Goods"]) for _ in unique_product_ids],
+        "unit_price": [price_dict.get(pid) for pid in unique_product_ids]
+    })
+
+    # Generate orders DataFrame for the CSV file (subset of sales)
+    orders_df = sales_df.select(["customer_id", "product_id", "quantity", "discount_rate"])
+
+    logger.info(f"Generated {len(sales_df)} sales records across {sales_df['year'].n_unique()} years, for {len(customers_df)} customers, and {len(products_df)} products.")
+
+    # --- Write Data to S3 and Local Filesystem ---
+    _create_sales_data(s3_client, sales_df, bucket_name)
+    _create_customers_data(s3_client, customers_df, bucket_name)
+    _create_orders_data(s3_client, orders_df, bucket_name)
+    _create_products_data(products_df)
+
+    logger.info("✅ All test data populated successfully.")
+
+
+if __name__ == '__main__':
+    # The bucket that will be created and populated
+    BUCKET = "flowfile-demo-data"
+
+    create_demo_data(
+        endpoint_url=MINIO_ENDPOINT_URL,
+        access_key=MINIO_ACCESS_KEY,
+        secret_key=MINIO_SECRET_KEY,
+        bucket_name=BUCKET
+    )
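The generator above writes partitioned sales Parquet under data-lake/sales/year=YYYY/month=MM/, a customers Parquet file, a pipe-delimited orders CSV, and a local local_products.parquet. A hedged way to spot-check the uploaded sales partitions with plain Polars, assuming MinIO is running with the default credentials above and the data went to the flowfile-demo-data bucket used by the __main__ block (the storage_options keys follow common Polars/object_store S3 conventions and may vary by Polars version):

    import polars as pl

    # Assumed MinIO defaults from this module; adjust if the env vars were overridden.
    storage_options = {
        "aws_access_key_id": "minioadmin",
        "aws_secret_access_key": "minioadmin",
        "aws_endpoint_url": "http://localhost:9000",
        "aws_region": "us-east-1",
        "aws_allow_http": "true",
    }
    sales = pl.scan_parquet(
        "s3://flowfile-demo-data/data-lake/sales/**/*.parquet",
        storage_options=storage_options,
    ).collect()
    print(sales.shape)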
test_utils/s3/fixtures.py CHANGED
@@ -8,6 +8,7 @@ import shutil
 import boto3
 from botocore.client import Config
 from test_utils.s3.data_generator import populate_test_data
+from test_utils.s3.demo_data_generator import create_demo_data
 
 logger = logging.getLogger("s3_fixture")
 
@@ -102,7 +103,7 @@ def create_test_buckets():
     client = get_minio_client()
 
     # Create test buckets
-    buckets = ['test-bucket', 'flowfile-test', 'sample-data', 'worker-test-bucket']
+    buckets = ['test-bucket', 'flowfile-test', 'sample-data', 'worker-test-bucket', 'demo-bucket']
     for bucket in buckets:
         try:
             client.create_bucket(Bucket=bucket)
@@ -176,6 +177,10 @@ def start_minio_container() -> bool:
                            access_key=MINIO_ACCESS_KEY,
                            secret_key=MINIO_SECRET_KEY,
                            bucket_name="test-bucket")
+        create_demo_data(endpoint_url=MINIO_ENDPOINT_URL,
+                         access_key=MINIO_ACCESS_KEY,
+                         secret_key=MINIO_SECRET_KEY,
+                         bucket_name="demo-bucket")
         return True
     return False
 
@@ -1,9 +0,0 @@
-from schemas import transform_schema
-from pydantic import Field, BaseModel
-
-
-default_union_input = transform_schema.UnionInput
-
-
-class F(BaseModel):
-    f: transform_schema.UnionInput = Field(default_factory=default_union_input)