Flowfile 0.3.5-py3-none-any.whl → 0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (145)
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +1 -0
  3. flowfile/web/__init__.py +2 -2
  4. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  5. flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
  6. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  7. flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
  8. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  9. flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
  10. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
  11. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  12. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
  13. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
  14. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
  15. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
  16. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
  17. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
  18. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  19. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
  20. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
  21. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
  22. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
  23. flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
  24. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  25. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
  26. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  27. flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
  28. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
  29. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
  30. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
  31. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
  32. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
  33. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
  34. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
  35. flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
  36. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
  37. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
  38. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
  39. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
  40. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
  41. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
  42. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
  43. flowfile/web/static/assets/api-6ef0dcef.js +80 -0
  44. flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
  45. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  46. flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
  47. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
  48. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
  49. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
  50. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
  51. flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
  52. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
  53. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
  54. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
  55. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
  56. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
  57. flowfile/web/static/index.html +1 -1
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
  59. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
  60. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
  61. flowfile_core/__init__.py +3 -0
  62. flowfile_core/auth/jwt.py +39 -0
  63. flowfile_core/configs/node_store/nodes.py +9 -6
  64. flowfile_core/configs/settings.py +6 -5
  65. flowfile_core/database/connection.py +63 -15
  66. flowfile_core/database/init_db.py +0 -1
  67. flowfile_core/database/models.py +49 -2
  68. flowfile_core/flowfile/code_generator/code_generator.py +472 -17
  69. flowfile_core/flowfile/connection_manager/models.py +1 -1
  70. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  71. flowfile_core/flowfile/extensions.py +1 -1
  72. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  73. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  74. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
  75. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  76. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  77. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  78. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  79. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  80. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  81. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  82. flowfile_core/flowfile/flow_graph.py +718 -253
  83. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  84. flowfile_core/flowfile/flow_node/flow_node.py +563 -117
  85. flowfile_core/flowfile/flow_node/models.py +154 -20
  86. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  87. flowfile_core/flowfile/handler.py +2 -33
  88. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  89. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  90. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  91. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  92. flowfile_core/flowfile/utils.py +35 -26
  93. flowfile_core/main.py +35 -15
  94. flowfile_core/routes/cloud_connections.py +77 -0
  95. flowfile_core/routes/logs.py +2 -7
  96. flowfile_core/routes/public.py +1 -0
  97. flowfile_core/routes/routes.py +130 -90
  98. flowfile_core/routes/secrets.py +72 -14
  99. flowfile_core/schemas/__init__.py +8 -0
  100. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  101. flowfile_core/schemas/input_schema.py +121 -71
  102. flowfile_core/schemas/output_model.py +19 -3
  103. flowfile_core/schemas/schemas.py +150 -12
  104. flowfile_core/schemas/transform_schema.py +175 -35
  105. flowfile_core/utils/utils.py +40 -1
  106. flowfile_core/utils/validate_setup.py +41 -0
  107. flowfile_frame/__init__.py +9 -1
  108. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  109. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  110. flowfile_frame/expr.py +28 -1
  111. flowfile_frame/expr.pyi +76 -61
  112. flowfile_frame/flow_frame.py +481 -208
  113. flowfile_frame/flow_frame.pyi +140 -91
  114. flowfile_frame/flow_frame_methods.py +160 -22
  115. flowfile_frame/group_frame.py +3 -0
  116. flowfile_frame/utils.py +25 -3
  117. flowfile_worker/external_sources/s3_source/main.py +216 -0
  118. flowfile_worker/external_sources/s3_source/models.py +142 -0
  119. flowfile_worker/funcs.py +51 -6
  120. flowfile_worker/models.py +22 -2
  121. flowfile_worker/routes.py +40 -38
  122. flowfile_worker/utils.py +1 -1
  123. test_utils/s3/commands.py +46 -0
  124. test_utils/s3/data_generator.py +292 -0
  125. test_utils/s3/demo_data_generator.py +186 -0
  126. test_utils/s3/fixtures.py +214 -0
  127. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  128. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  129. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  130. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  131. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  132. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  133. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  134. flowfile_core/schemas/defaults.py +0 -9
  135. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  136. flowfile_core/schemas/models.py +0 -193
  137. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  138. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  139. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  140. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  141. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
  142. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
  143. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
  144. {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  145. {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
flowfile_frame/flow_frame_methods.py CHANGED
@@ -1,22 +1,21 @@
-import logging
+import io
 import os
-from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable
 from pathlib import Path
+from typing import Any, List, Optional, Union, Dict, Callable, Literal
 
-import io
 import polars as pl
-from polars._typing import (SchemaDict, IO,PolarsDataType,
+from polars._typing import (SchemaDict, IO, PolarsDataType,
                             Sequence, CsvEncoding)
 
-from flowfile_core.flowfile.flow_graph import FlowGraph
 from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
-from flowfile_core.schemas import input_schema, transform_schema
-
+from flowfile_core.flowfile.flow_graph import FlowGraph
+from flowfile_core.schemas import input_schema, transform_schema, cloud_storage_schemas
+from flowfile_frame.config import logger
 from flowfile_frame.expr import col
-
+from flowfile_frame.flow_frame import FlowFrame
 from flowfile_frame.utils import create_flow_graph
-from flowfile_frame.flow_frame import generate_node_id, FlowFrame
-from flowfile_frame.config import logger
+from flowfile_frame.cloud_storage.secret_manager import get_current_user_id
+from flowfile_frame.utils import generate_node_id
 
 def sum(expr):
     """Sum aggregation function."""
@@ -140,11 +139,10 @@ def read_csv(
     Returns:
         A FlowFrame with the CSV data.
     """
-    node_id = generate_node_id()  # Assuming generate_node_id is defined
+    node_id = generate_node_id()
     if flow_graph is None:
-        flow_graph = create_flow_graph()  # Assuming create_flow_graph is defined
+        flow_graph = create_flow_graph()
     flow_id = flow_graph.flow_id
-
     current_source_path_for_native = None
     if isinstance(source, (str, os.PathLike)):
         current_source_path_for_native = str(source)
@@ -216,11 +214,14 @@ def read_csv(
             description=read_node_description
         )
         flow_graph.add_read(read_node)
+        flow_graph.get_node(1)
+
         result_frame = FlowFrame(
             data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
             flow_graph=flow_graph,
             node_id=node_id
         )
+        flow_graph.get_node(1)
         return result_frame
     else:
         polars_source_arg = source
@@ -278,6 +279,7 @@ def read_csv(
         node_id=node_id,
     )
 
+
 def _build_polars_code_args(
         source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
         separator: str,
@@ -377,13 +379,13 @@ def _build_polars_code_args(
     return polars_code
 
 
-def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
+def read_parquet(source, *, flow_graph: FlowGraph = None, description: str = None,
                  convert_to_absolute_path: bool = True, **options) -> FlowFrame:
     """
     Read a Parquet file into a FlowFrame.
 
     Args:
-        file_path: Path to Parquet file
+        source: Path to Parquet file
         flow_graph: if you want to add it to an existing graph
         description: if you want to add a readable name in the frontend (advised)
         convert_to_absolute_path: If the path needs to be set to a fixed location
@@ -392,8 +394,8 @@ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str =
     Returns:
         A FlowFrame with the Parquet data
     """
-    if '~' in file_path:
-        file_path = os.path.expanduser(file_path)
+    if '~' in source:
+        file_path = os.path.expanduser(source)
     node_id = generate_node_id()
 
     if flow_graph is None:
@@ -403,8 +405,8 @@ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str =
 
     received_table = input_schema.ReceivedTable(
         file_type='parquet',
-        path=file_path,
-        name=Path(file_path).name,
+        path=source,
+        name=Path(source).name,
     )
     if convert_to_absolute_path:
         received_table.path = received_table.abs_file_path
@@ -449,7 +451,7 @@ def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) ->
     input_node = input_schema.NodeManualInput(
         flow_id=flow_id,
         node_id=node_id,
-        raw_data=FlowDataEngine(data).to_pylist(),
+        raw_data_format=FlowDataEngine(data).to_raw_data(),
        pos_x=100,
        pos_y=100,
        is_setup=True,
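
The manual-input payload changes from raw_data (a list of dicts via to_pylist()) to raw_data_format (a richer object via to_raw_data()). The public from_dict API looks unchanged; a minimal sketch, assuming from_dict is exported at the flowfile_frame package root:

import flowfile_frame as ff

# The dict is wrapped in a FlowDataEngine and serialized with to_raw_data()
# behind the scenes; the returned FlowFrame is expected to behave as before.
frame = ff.from_dict({"id": [1, 2, 3], "name": ["a", "b", "c"]})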
@@ -592,7 +594,7 @@ def scan_csv(
 
 
 def scan_parquet(
-        file_path,
+        source,
         *,
         flow_graph: FlowGraph = None,
         description: str = None,
@@ -608,10 +610,146 @@
     See read_parquet for full documentation.
     """
     return read_parquet(
-        file_path=file_path,
+        source=source,
         flow_graph=flow_graph,
         description=description,
         convert_to_absolute_path=convert_to_absolute_path,
         **options
     )
 
+
+def scan_parquet_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+        description: Optional[str] = None
+) -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              file_format="parquet"),
+        user_id=get_current_user_id(),
+        description=description)
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_csv_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+        delimiter: str = ";",
+        has_header: Optional[bool] = True,
+        encoding: Optional[CsvEncoding] = "utf8") -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              csv_delimiter=delimiter,
+                                                                              csv_encoding=encoding,
+                                                                              csv_has_header=has_header,
+                                                                              file_format="csv"),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_delta(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        version: int = None) -> FlowFrame:
+    node_id = generate_node_id()
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              connection_name=connection_name,
+                                                                              file_format="delta",
+                                                                              delta_version=version),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_json_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+) -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              file_format="json"),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
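Each of these readers builds a NodeCloudStorageReader, registers it on the graph with add_cloud_storage_reader, and returns a FlowFrame over the node's result. A minimal usage sketch, assuming the functions are exported from the flowfile_frame package root and that "my-s3-conn" names a connection previously stored through the new cloud connection manager:

from flowfile_frame import scan_parquet_from_cloud_storage, scan_csv_from_cloud_storage

# No trailing "/" or "*", so scan_mode defaults to "single_file"
sales = scan_parquet_from_cloud_storage(
    "s3://my-bucket/sales/2024.parquet",  # hypothetical bucket and key
    connection_name="my-s3-conn",
)

# A trailing "/" (or "*") makes scan_mode default to "directory"
events = scan_csv_from_cloud_storage(
    "s3://my-bucket/events/",
    connection_name="my-s3-conn",
    delimiter=";",
)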
flowfile_frame/group_frame.py CHANGED
@@ -91,6 +91,8 @@ class GroupByFrame:
         if isinstance(col_expr, str):
             agg_cols.append(transform_schema.AggColl(old_name=col_expr, agg="groupby"))
         elif isinstance(col_expr, Expr):
+            if col_expr.is_complex:
+                return False
             agg_cols.append(transform_schema.AggColl(old_name=col_expr.column_name, agg="groupby"))
         elif isinstance(col_expr, Selector):
             return False
@@ -151,6 +153,7 @@ class GroupByFrame:
     def _create_agg_node(self, node_id_to_use: int, can_be_converted: bool, agg_cols: list, agg_expressions,
                          named_agg_exprs, convertable_to_code: bool, description: str):
         """Create node for explicit aggregations via self.agg()."""
+
         if can_be_converted:
             group_by_settings = input_schema.NodeGroupBy(
                 flow_id=self.parent.flow_graph.flow_id,
flowfile_frame/utils.py CHANGED
@@ -88,14 +88,23 @@ def _generate_id() -> int:
     return int(uuid.uuid4().int % 100000)
 
 
-def create_flow_graph() -> FlowGraph:
-    flow_id = _generate_id()
+def create_flow_graph(flow_id: int = None) -> FlowGraph:
+    """
+    Create a new FlowGraph instance with a unique flow ID.
+    Parameters
+    - flow_id (int): Optional flow ID. If not provided, a new unique ID will be generated.
+    Returns
+    - FlowGraph: A new instance of FlowGraph with the specified or generated flow ID.
+
+    """
+    if flow_id is None:
+        flow_id = _generate_id()
     flow_settings = schemas.FlowSettings(
         flow_id=flow_id,
         name=f"Flow_{flow_id}",
         path=f"flow_{flow_id}"
     )
-    flow_graph = FlowGraph(flow_id=flow_id, flow_settings=flow_settings)
+    flow_graph = FlowGraph(flow_settings=flow_settings)
     flow_graph.flow_settings.execution_location = 'local'  # always create a local frame so that the run time does not attempt to use the flowfile_worker process
     return flow_graph
 
@@ -119,3 +128,16 @@ def stringify_values(v: Any) -> str:
     else:
         # Handle any other types
         return str(v)
+
+
+data = {"c": 0}
+
+
+def generate_node_id() -> int:
+    data["c"] += 1
+    return data["c"]
+
+
+def set_node_id(node_id):
+    """Set the node ID to a specific value."""
+    data["c"] = node_id
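generate_node_id moves out of flowfile_frame.flow_frame and becomes a module-level counter in flowfile_frame.utils. From the code above, its behavior is simply:

from flowfile_frame.utils import generate_node_id, set_node_id

print(generate_node_id())  # 1 (pre-increments the shared counter, which starts at 0)
print(generate_node_id())  # 2
set_node_id(10)            # moves the counter forward (or back) to a chosen value
print(generate_node_id())  # 11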
flowfile_worker/external_sources/s3_source/main.py ADDED
@@ -0,0 +1,216 @@
+"""Cloud storage writer module for FlowFile Worker.
+
+This module provides functionality to write Polars LazyFrames to various cloud storage
+services (S3, Azure ADLS, Google Cloud Storage) in different file formats.
+"""
+
+import polars as pl
+from typing import Dict, Any
+from logging import Logger
+
+from flowfile_worker.external_sources.s3_source.models import (
+    CloudStorageWriteSettings,
+    WriteSettings
+)
+from flowfile_worker.utils import collect_lazy_frame
+
+
+def _write_parquet_to_cloud(
+        df: pl.LazyFrame,
+        resource_path: str,
+        storage_options: Dict[str, Any],
+        write_settings: WriteSettings,
+        logger: Logger
+) -> None:
+    """Write LazyFrame to a Parquet file in cloud storage.
+
+    Args:
+        df: Polars LazyFrame to write.
+        resource_path: Cloud storage path where the file will be written.
+        storage_options: Storage-specific options for authentication and configuration.
+        write_settings: Write configuration including compression settings.
+        logger: Logger instance for logging operations.
+
+    Raises:
+        Exception: If writing fails, wrapped with a descriptive error message.
+    """
+    try:
+        sink_kwargs = {
+            "path": resource_path,
+            "compression": write_settings.parquet_compression,
+        }
+        if storage_options:
+            sink_kwargs["storage_options"] = storage_options
+
+        try:
+            # Try to use sink_parquet for lazy execution
+            df.sink_parquet(**sink_kwargs)
+        except Exception as e:
+            # Fall back to collecting and writing if sink fails
+            logger.warning(f"Failed to use sink_parquet, falling back to collect and write: {str(e)}")
+            pl_df = collect_lazy_frame(df)
+            sink_kwargs['file'] = sink_kwargs.pop("path")
+            pl_df.write_parquet(**sink_kwargs)
+
+    except Exception as e:
+        logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
+        raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")
+
+
+def _write_delta_to_cloud(
+        df: pl.LazyFrame,
+        resource_path: str,
+        storage_options: Dict[str, Any],
+        write_settings: WriteSettings,
+        logger: Logger
+) -> None:
+    """Write LazyFrame to Delta Lake format in cloud storage.
+
+    Args:
+        df: Polars LazyFrame to write.
+        resource_path: Cloud storage path where the Delta table will be written.
+        storage_options: Storage-specific options for authentication and configuration.
+        write_settings: Write configuration including write mode.
+        logger: Logger instance for logging operations.
+    """
+    sink_kwargs = {
+        "target": resource_path,
+        "mode": write_settings.write_mode,
+    }
+    if storage_options:
+        sink_kwargs["storage_options"] = storage_options
+
+    # Delta format requires collecting the LazyFrame first
+    collect_lazy_frame(df).write_delta(**sink_kwargs)
+
+
+def _write_csv_to_cloud(
+        df: pl.LazyFrame,
+        resource_path: str,
+        storage_options: Dict[str, Any],
+        write_settings: WriteSettings,
+        logger: Logger
+) -> None:
+    """Write LazyFrame to a CSV file in cloud storage.
+
+    Args:
+        df: Polars LazyFrame to write.
+        resource_path: Cloud storage path where the CSV file will be written.
+        storage_options: Storage-specific options for authentication and configuration.
+        write_settings: Write configuration including delimiter settings.
+        logger: Logger instance for logging operations.
+
+    Raises:
+        Exception: If writing fails, wrapped with a descriptive error message.
+    """
+    try:
+        sink_kwargs = {
+            "path": resource_path,
+            "separator": write_settings.csv_delimiter,
+        }
+        if storage_options:
+            sink_kwargs["storage_options"] = storage_options
+
+        # sink_csv executes the lazy query and writes the result
+        df.sink_csv(**sink_kwargs)
+
+    except Exception as e:
+        logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
+        raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")
+
+
+def _write_json_to_cloud(
+        df: pl.LazyFrame,
+        resource_path: str,
+        storage_options: Dict[str, Any],
+        write_settings: WriteSettings,
+        logger: Logger
+) -> None:
+    """Write LazyFrame to a line-delimited JSON (NDJSON) file in cloud storage.
+
+    Args:
+        df: Polars LazyFrame to write.
+        resource_path: Cloud storage path where the NDJSON file will be written.
+        storage_options: Storage-specific options for authentication and configuration.
+        write_settings: Write configuration settings.
+        logger: Logger instance for logging operations.
+
+    Raises:
+        Exception: If writing fails, wrapped with a descriptive error message.
+    """
+    try:
+        sink_kwargs = {"path": resource_path}
+        if storage_options:
+            sink_kwargs["storage_options"] = storage_options
+
+        try:
+            # Try to use sink_ndjson for lazy execution
+            df.sink_ndjson(**sink_kwargs)
+        except Exception as e:
+            # Fall back to collecting and writing if sink fails
+            pl_df = collect_lazy_frame(df)
+            sink_kwargs['file'] = sink_kwargs.pop("path")
+            pl_df.write_ndjson(**sink_kwargs)
+            logger.error(f"Failed to use sink_ndjson, falling back to collect and write: {str(e)}")
+
+    except Exception as e:
+        logger.error(f"Failed to write JSON to {resource_path}: {str(e)}")
+        raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")
+
+writers = {
+    "parquet": _write_parquet_to_cloud,
+    "delta": _write_delta_to_cloud,
+    "csv": _write_csv_to_cloud,
+    "json": _write_json_to_cloud,
+}
+
+
+def write_df_to_cloud(
+        df: pl.LazyFrame,
+        settings: CloudStorageWriteSettings,
+        logger: Logger
+) -> None:
+    """Write a Polars LazyFrame to an object in cloud storage.
+
+    Supports writing to S3, Azure ADLS, and Google Cloud Storage. Currently supports
+    'overwrite' write mode. The 'append' mode is not yet implemented for most formats.
+
+    Args:
+        df: Polars LazyFrame to write to cloud storage.
+        settings: Cloud storage write settings containing connection details and write options.
+        logger: Logger instance for logging operations.
+
+    Raises:
+        ValueError: If the specified file format is not supported.
+        NotImplementedError: If 'append' write mode is used for non-delta formats.
+        Exception: If writing to cloud storage fails.
+    """
+    connection = settings.connection
+    write_settings = settings.write_settings
+    logger.info(
+        f"Writing to {connection.storage_type} storage: {write_settings.resource_path}"
+    )
+    # Validate write mode
+    if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
+        raise NotImplementedError(
+            "The 'append' write mode is not yet supported for this destination."
+        )
+
+    storage_options = connection.get_storage_options()
+
+    # Dispatch to the appropriate writer
+    writer_func = writers.get(write_settings.file_format)
+    if not writer_func:
+        raise ValueError(
+            f"Unsupported file format for writing: {write_settings.file_format}"
+        )
+
+    writer_func(
+        df,
+        write_settings.resource_path,
+        storage_options,
+        write_settings,
+        logger
+    )
+
+    logger.info(f"Successfully wrote data to {write_settings.resource_path}")
flowfile_worker/external_sources/s3_source/models.py ADDED
@@ -0,0 +1,142 @@
+"""Cloud storage connection schemas for S3, ADLS, and other cloud providers."""
+
+from typing import Optional, Literal, Dict, Any
+import boto3
+from pydantic import BaseModel, SecretStr
+from flowfile_worker.secrets import decrypt_secret
+
+CloudStorageType = Literal["s3", "adls", "gcs"]
+AuthMethod = Literal["access_key", "iam_role", "service_principal", "managed_identity", "sas_token", "aws-cli", "env_vars"]
+
+
+def create_storage_options_from_boto_credentials(profile_name: Optional[str],
+                                                 region_name: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Create a storage options dictionary from AWS credentials using a boto3 profile.
+    This is the most robust way to handle profile-based authentication as it
+    bypasses Polars' internal credential provider chain, avoiding conflicts.
+
+    Parameters
+    ----------
+    profile_name
+        The name of the AWS profile in ~/.aws/credentials.
+    region_name
+        The AWS region to use.
+
+    Returns
+    -------
+    Dict[str, Any]
+        A storage options dictionary for Polars with explicit credentials.
+    """
+    session = boto3.Session(profile_name=profile_name, region_name=region_name)
+    credentials = session.get_credentials()
+    frozen_creds = credentials.get_frozen_credentials()
+
+    storage_options = {
+        "aws_access_key_id": frozen_creds.access_key,
+        "aws_secret_access_key": frozen_creds.secret_key,
+        "aws_session_token": frozen_creds.token,
+    }
+    # Use the session's region if one was resolved, otherwise use the provided one
+    if session.region_name:
+        storage_options["aws_region"] = session.region_name
+
+    print("Boto3: Successfully created storage options with explicit credentials.")
+    return storage_options
+
+
+class FullCloudStorageConnection(BaseModel):
+    """Internal model with decrypted secrets"""
+    storage_type: CloudStorageType
+    auth_method: AuthMethod
+    connection_name: Optional[str] = "None"  # This is the reference to the item we will fetch that contains the data
+
+    # AWS S3
+    aws_region: Optional[str] = None
+    aws_access_key_id: Optional[str] = None
+    aws_secret_access_key: Optional[SecretStr] = None
+    aws_role_arn: Optional[str] = None
+    aws_allow_unsafe_html: Optional[bool] = None
+
+    # Azure ADLS
+    azure_account_name: Optional[str] = None
+    azure_account_key: Optional[SecretStr] = None
+    azure_tenant_id: Optional[str] = None
+    azure_client_id: Optional[str] = None
+    azure_client_secret: Optional[SecretStr] = None
+
+    # Common
+    endpoint_url: Optional[str] = None
+    verify_ssl: bool = True
+
+    def get_storage_options(self) -> Dict[str, Any]:
+        """
+        Build storage options dict based on the connection type and auth method.
+
+        Returns:
+            Dict containing appropriate storage options for the provider
+        """
+        if self.storage_type == "s3":
+            return self._get_s3_storage_options()
+
+    def _get_s3_storage_options(self) -> Dict[str, Any]:
+        """Build S3-specific storage options."""
+        auth_method = self.auth_method
+        print(f"Building S3 storage options for auth_method: '{auth_method}'")
+
+        if auth_method == "aws-cli":
+            return create_storage_options_from_boto_credentials(
+                profile_name=self.connection_name,
+                region_name=self.aws_region
+            )
+
+        storage_options = {}
+        if self.aws_region:
+            storage_options["aws_region"] = self.aws_region
+        if self.endpoint_url:
+            storage_options["endpoint_url"] = self.endpoint_url
+        if not self.verify_ssl:
+            storage_options["verify"] = "False"
+        if self.aws_allow_unsafe_html:  # Note: Polars uses aws_allow_http
+            storage_options["aws_allow_http"] = "true"
+
+        if auth_method == "access_key":
+            storage_options["aws_access_key_id"] = self.aws_access_key_id
+            storage_options["aws_secret_access_key"] = decrypt_secret(
+                self.aws_secret_access_key.get_secret_value()).get_secret_value()
+            # Explicitly clear any session token from the environment
+            storage_options["aws_session_token"] = ""
+
+        elif auth_method == "iam_role":
+            # Correctly implement IAM role assumption using boto3 STS client.
+            sts_client = boto3.client('sts', region_name=self.aws_region)
+            assumed_role_object = sts_client.assume_role(
+                RoleArn=self.aws_role_arn,
+                RoleSessionName="PolarsCloudStorageReaderSession"  # A descriptive session name
+            )
+            credentials = assumed_role_object['Credentials']
+            storage_options["aws_access_key_id"] = credentials['AccessKeyId']
+            storage_options["aws_secret_access_key"] = decrypt_secret(credentials['SecretAccessKey']).get_secret_value()
+            storage_options["aws_session_token"] = decrypt_secret(credentials['SessionToken']).get_secret_value()
+
+        return storage_options
+
+
+class WriteSettings(BaseModel):
+    """Settings for writing to cloud storage"""
+    resource_path: str  # s3://bucket/path/to/file.csv
+
+    write_mode: Literal["overwrite", "append"] = "overwrite"
+    file_format: Literal["csv", "parquet", "json", "delta"] = "parquet"
+
+    parquet_compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy"
+
+    csv_delimiter: str = ","
+    csv_encoding: str = "utf8"
+
+
+class CloudStorageWriteSettings(BaseModel):
+    write_settings: WriteSettings
+    connection: FullCloudStorageConnection
+    flowfile_flow_id: int = 1
+    flowfile_node_id: int | str = -1
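
For explicit-key connections, get_storage_options() returns a dict that can be handed directly to Polars. A minimal sketch with placeholder values; note that aws_secret_access_key is expected to hold the encrypted secret that decrypt_secret() unwraps inside _get_s3_storage_options():

import polars as pl

from flowfile_worker.external_sources.s3_source.models import FullCloudStorageConnection

conn = FullCloudStorageConnection(
    storage_type="s3",
    auth_method="access_key",
    aws_region="eu-west-1",
    aws_access_key_id="AKIA...",                 # placeholder
    aws_secret_access_key="<encrypted-secret>",  # stored encrypted, decrypted on use
)

lf = pl.scan_parquet(
    "s3://my-bucket/data/*.parquet",             # hypothetical bucket
    storage_options=conn.get_storage_options(),
)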