Flowfile 0.3.5-py3-none-any.whl → 0.3.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic.

Files changed (121)
  1. flowfile/__init__.py +3 -3
  2. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  3. flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
  4. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  5. flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
  6. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  7. flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
  8. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
  9. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
  11. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
  12. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
  13. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
  14. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
  15. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
  16. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
  17. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  18. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
  19. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
  20. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
  21. flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
  22. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  23. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
  24. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  25. flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
  26. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
  27. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
  28. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
  29. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
  30. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
  31. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
  32. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
  33. flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
  34. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
  35. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
  36. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
  37. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
  38. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
  39. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
  40. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
  41. flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
  42. flowfile/web/static/assets/api-fb67319c.js +80 -0
  43. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  44. flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
  45. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
  46. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
  47. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
  48. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
  49. flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
  50. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
  51. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
  52. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
  53. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
  54. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
  55. flowfile/web/static/index.html +1 -1
  56. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
  57. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/RECORD +108 -103
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
  59. flowfile_core/__init__.py +2 -0
  60. flowfile_core/configs/node_store/nodes.py +8 -6
  61. flowfile_core/database/connection.py +63 -15
  62. flowfile_core/database/init_db.py +0 -1
  63. flowfile_core/database/models.py +49 -2
  64. flowfile_core/flowfile/code_generator/code_generator.py +401 -17
  65. flowfile_core/flowfile/connection_manager/models.py +1 -1
  66. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  67. flowfile_core/flowfile/extensions.py +1 -1
  68. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  69. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  70. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
  71. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  72. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  73. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  74. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  75. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  76. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  77. flowfile_core/flowfile/flow_graph.py +119 -82
  78. flowfile_core/flowfile/flow_node/flow_node.py +68 -33
  79. flowfile_core/flowfile/flow_node/models.py +32 -3
  80. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  81. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  82. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  83. flowfile_core/flowfile/utils.py +1 -23
  84. flowfile_core/main.py +3 -2
  85. flowfile_core/routes/cloud_connections.py +81 -0
  86. flowfile_core/routes/logs.py +0 -1
  87. flowfile_core/routes/routes.py +3 -39
  88. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  89. flowfile_core/schemas/input_schema.py +37 -15
  90. flowfile_core/schemas/schemas.py +7 -2
  91. flowfile_core/schemas/transform_schema.py +97 -22
  92. flowfile_core/utils/utils.py +40 -1
  93. flowfile_core/utils/validate_setup.py +41 -0
  94. flowfile_frame/flow_frame.py +253 -102
  95. flowfile_frame/flow_frame_methods.py +13 -13
  96. flowfile_worker/external_sources/s3_source/main.py +216 -0
  97. flowfile_worker/external_sources/s3_source/models.py +142 -0
  98. flowfile_worker/funcs.py +51 -6
  99. flowfile_worker/models.py +22 -2
  100. flowfile_worker/routes.py +40 -38
  101. flowfile_worker/utils.py +1 -1
  102. test_utils/s3/commands.py +46 -0
  103. test_utils/s3/data_generator.py +291 -0
  104. test_utils/s3/fixtures.py +209 -0
  105. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  106. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  107. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  108. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  109. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  110. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  111. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  112. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  113. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  114. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  115. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  116. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  117. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  118. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
  119. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
  120. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  121. {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
flowfile_worker/external_sources/s3_source/main.py ADDED
@@ -0,0 +1,216 @@
+ """Cloud storage writer module for FlowFile Worker.
+
+ This module provides functionality to write Polars LazyFrames to various cloud storage
+ services (S3, Azure ADLS, Google Cloud Storage) in different file formats.
+ """
+
+ import polars as pl
+ from typing import Dict, Any
+ from logging import Logger
+
+ from flowfile_worker.external_sources.s3_source.models import (
+     CloudStorageWriteSettings,
+     WriteSettings
+ )
+ from flowfile_worker.utils import collect_lazy_frame
+
+
+ def _write_parquet_to_cloud(
+         df: pl.LazyFrame,
+         resource_path: str,
+         storage_options: Dict[str, Any],
+         write_settings: WriteSettings,
+         logger: Logger
+ ) -> None:
+     """Write LazyFrame to a Parquet file in cloud storage.
+
+     Args:
+         df: Polars LazyFrame to write.
+         resource_path: Cloud storage path where the file will be written.
+         storage_options: Storage-specific options for authentication and configuration.
+         write_settings: Write configuration including compression settings.
+         logger: Logger instance for logging operations.
+
+     Raises:
+         Exception: If writing fails, wrapped with a descriptive error message.
+     """
+     try:
+         sink_kwargs = {
+             "path": resource_path,
+             "compression": write_settings.parquet_compression,
+         }
+         if storage_options:
+             sink_kwargs["storage_options"] = storage_options
+
+         try:
+             # Try to use sink_parquet for lazy execution
+             df.sink_parquet(**sink_kwargs)
+         except Exception as e:
+             # Fall back to collecting and writing if sink fails
+             logger.warning(f"Failed to use sink_parquet, falling back to collect and write: {str(e)}")
+             pl_df = collect_lazy_frame(df)
+             sink_kwargs['file'] = sink_kwargs.pop("path")
+             pl_df.write_parquet(**sink_kwargs)
+
+     except Exception as e:
+         logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
+         raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")
+
+
+ def _write_delta_to_cloud(
+         df: pl.LazyFrame,
+         resource_path: str,
+         storage_options: Dict[str, Any],
+         write_settings: WriteSettings,
+         logger: Logger
+ ) -> None:
+     """Write LazyFrame to Delta Lake format in cloud storage.
+
+     Args:
+         df: Polars LazyFrame to write.
+         resource_path: Cloud storage path where the Delta table will be written.
+         storage_options: Storage-specific options for authentication and configuration.
+         write_settings: Write configuration including write mode.
+         logger: Logger instance for logging operations.
+     """
+     sink_kwargs = {
+         "target": resource_path,
+         "mode": write_settings.write_mode,
+     }
+     if storage_options:
+         sink_kwargs["storage_options"] = storage_options
+
+     # Delta format requires collecting the LazyFrame first
+     collect_lazy_frame(df).write_delta(**sink_kwargs)
+
+
+ def _write_csv_to_cloud(
+         df: pl.LazyFrame,
+         resource_path: str,
+         storage_options: Dict[str, Any],
+         write_settings: WriteSettings,
+         logger: Logger
+ ) -> None:
+     """Write LazyFrame to a CSV file in cloud storage.
+
+     Args:
+         df: Polars LazyFrame to write.
+         resource_path: Cloud storage path where the CSV file will be written.
+         storage_options: Storage-specific options for authentication and configuration.
+         write_settings: Write configuration including delimiter settings.
+         logger: Logger instance for logging operations.
+
+     Raises:
+         Exception: If writing fails, wrapped with a descriptive error message.
+     """
+     try:
+         sink_kwargs = {
+             "path": resource_path,
+             "separator": write_settings.csv_delimiter,
+         }
+         if storage_options:
+             sink_kwargs["storage_options"] = storage_options
+
+         # sink_csv executes the lazy query and writes the result
+         df.sink_csv(**sink_kwargs)
+
+     except Exception as e:
+         logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
+         raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")
+
+
+ def _write_json_to_cloud(
+         df: pl.LazyFrame,
+         resource_path: str,
+         storage_options: Dict[str, Any],
+         write_settings: WriteSettings,
+         logger: Logger
+ ) -> None:
+     """Write LazyFrame to a line-delimited JSON (NDJSON) file in cloud storage.
+
+     Args:
+         df: Polars LazyFrame to write.
+         resource_path: Cloud storage path where the NDJSON file will be written.
+         storage_options: Storage-specific options for authentication and configuration.
+         write_settings: Write configuration settings.
+         logger: Logger instance for logging operations.
+
+     Raises:
+         Exception: If writing fails, wrapped with a descriptive error message.
+     """
+     try:
+         sink_kwargs = {"path": resource_path}
+         if storage_options:
+             sink_kwargs["storage_options"] = storage_options
+
+         try:
+             # Try to use sink_ndjson for lazy execution
+             df.sink_ndjson(**sink_kwargs)
+         except Exception as e:
+             # Fall back to collecting and writing if sink fails
+             pl_df = collect_lazy_frame(df)
+             sink_kwargs['file'] = sink_kwargs.pop("path")
+             pl_df.write_ndjson(**sink_kwargs)
+             logger.error(f"Failed to use sink_ndjson, falling back to collect and write: {str(e)}")
+
+     except Exception as e:
+         logger.error(f"Failed to write JSON to {resource_path}: {str(e)}")
+         raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")
+
+ writers = {
+     "parquet": _write_parquet_to_cloud,
+     "delta": _write_delta_to_cloud,
+     "csv": _write_csv_to_cloud,
+     "json": _write_json_to_cloud,
+ }
+
+
+ def write_df_to_cloud(
+         df: pl.LazyFrame,
+         settings: CloudStorageWriteSettings,
+         logger: Logger
+ ) -> None:
+     """Write a Polars LazyFrame to an object in cloud storage.
+
+     Supports writing to S3, Azure ADLS, and Google Cloud Storage. Currently supports
+     'overwrite' write mode. The 'append' mode is not yet implemented for most formats.
+
+     Args:
+         df: Polars LazyFrame to write to cloud storage.
+         settings: Cloud storage write settings containing connection details and write options.
+         logger: Logger instance for logging operations.
+
+     Raises:
+         ValueError: If the specified file format is not supported.
+         NotImplementedError: If 'append' write mode is used for non-delta formats.
+         Exception: If writing to cloud storage fails.
+     """
+     connection = settings.connection
+     write_settings = settings.write_settings
+     logger.info(
+         f"Writing to {connection.storage_type} storage: {write_settings.resource_path}"
+     )
+     # Validate write mode
+     if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
+         raise NotImplementedError(
+             "The 'append' write mode is not yet supported for this destination."
+         )
+
+     storage_options = connection.get_storage_options()
+
+     # Dispatch to the appropriate writer
+     writer_func = writers.get(write_settings.file_format)
+     if not writer_func:
+         raise ValueError(
+             f"Unsupported file format for writing: {write_settings.file_format}"
+         )
+
+     writer_func(
+         df,
+         write_settings.resource_path,
+         storage_options,
+         write_settings,
+         logger
+     )
+
+     logger.info(f"Successfully wrote data to {write_settings.resource_path}")
flowfile_worker/external_sources/s3_source/models.py ADDED
@@ -0,0 +1,142 @@
+ """Cloud storage connection schemas for S3, ADLS, and other cloud providers."""
+
+ from typing import Optional, Literal, Dict, Any
+ import boto3
+ from pydantic import BaseModel, SecretStr
+ from flowfile_worker.secrets import decrypt_secret
+
+ CloudStorageType = Literal["s3", "adls", "gcs"]
+ AuthMethod = Literal["access_key", "iam_role", "service_principal", "managed_identity", "sas_token", "aws-cli", "env_vars"]
+
+
+ def create_storage_options_from_boto_credentials(profile_name: Optional[str],
+                                                  region_name: Optional[str] = None) -> Dict[str, Any]:
+     """
+     Create a storage options dictionary from AWS credentials using a boto3 profile.
+     This is the most robust way to handle profile-based authentication as it
+     bypasses Polars' internal credential provider chain, avoiding conflicts.
+
+     Parameters
+     ----------
+     profile_name
+         The name of the AWS profile in ~/.aws/credentials.
+     region_name
+         The AWS region to use.
+
+     Returns
+     -------
+     Dict[str, Any]
+         A storage options dictionary for Polars with explicit credentials.
+     """
+     session = boto3.Session(profile_name=profile_name, region_name=region_name)
+     credentials = session.get_credentials()
+     frozen_creds = credentials.get_frozen_credentials()
+
+     storage_options = {
+         "aws_access_key_id": frozen_creds.access_key,
+         "aws_secret_access_key": frozen_creds.secret_key,
+         "aws_session_token": frozen_creds.token,
+     }
+     # Use the session's region if one was resolved, otherwise use the provided one
+     if session.region_name:
+         storage_options["aws_region"] = session.region_name
+
+     print("Boto3: Successfully created storage options with explicit credentials.")
+     return storage_options
+
+
+ class FullCloudStorageConnection(BaseModel):
+     """Internal model with decrypted secrets"""
+     storage_type: CloudStorageType
+     auth_method: AuthMethod
+     connection_name: Optional[str] = "None"  # This is the reference to the item we will fetch that contains the data
+
+     # AWS S3
+     aws_region: Optional[str] = None
+     aws_access_key_id: Optional[str] = None
+     aws_secret_access_key: Optional[SecretStr] = None
+     aws_role_arn: Optional[str] = None
+     aws_allow_unsafe_html: Optional[bool] = None
+
+     # Azure ADLS
+     azure_account_name: Optional[str] = None
+     azure_account_key: Optional[SecretStr] = None
+     azure_tenant_id: Optional[str] = None
+     azure_client_id: Optional[str] = None
+     azure_client_secret: Optional[SecretStr] = None
+
+     # Common
+     endpoint_url: Optional[str] = None
+     verify_ssl: bool = True
+
+     def get_storage_options(self) -> Dict[str, Any]:
+         """
+         Build storage options dict based on the connection type and auth method.
+
+         Returns:
+             Dict containing appropriate storage options for the provider
+         """
+         if self.storage_type == "s3":
+             return self._get_s3_storage_options()
+
+     def _get_s3_storage_options(self) -> Dict[str, Any]:
+         """Build S3-specific storage options."""
+         auth_method = self.auth_method
+         print(f"Building S3 storage options for auth_method: '{auth_method}'")
+
+         if auth_method == "aws-cli":
+             return create_storage_options_from_boto_credentials(
+                 profile_name=self.connection_name,
+                 region_name=self.aws_region
+             )
+
+         storage_options = {}
+         if self.aws_region:
+             storage_options["aws_region"] = self.aws_region
+         if self.endpoint_url:
+             storage_options["endpoint_url"] = self.endpoint_url
+         if not self.verify_ssl:
+             storage_options["verify"] = "False"
+         if self.aws_allow_unsafe_html:  # Note: Polars uses aws_allow_http
+             storage_options["aws_allow_http"] = "true"
+
+         if auth_method == "access_key":
+             storage_options["aws_access_key_id"] = self.aws_access_key_id
+             storage_options["aws_secret_access_key"] = decrypt_secret(
+                 self.aws_secret_access_key.get_secret_value()).get_secret_value()
+             # Explicitly clear any session token from the environment
+             storage_options["aws_session_token"] = ""
+
+         elif auth_method == "iam_role":
+             # Correctly implement IAM role assumption using boto3 STS client.
+             sts_client = boto3.client('sts', region_name=self.aws_region)
+             assumed_role_object = sts_client.assume_role(
+                 RoleArn=self.aws_role_arn,
+                 RoleSessionName="PolarsCloudStorageReaderSession"  # A descriptive session name
+             )
+             credentials = assumed_role_object['Credentials']
+             storage_options["aws_access_key_id"] = credentials['AccessKeyId']
+             storage_options["aws_secret_access_key"] = decrypt_secret(credentials['SecretAccessKey']).get_secret_value()
+             storage_options["aws_session_token"] = decrypt_secret(credentials['SessionToken']).get_secret_value()
+
+         return storage_options
+
+
+ class WriteSettings(BaseModel):
+     """Settings for writing to cloud storage"""
+     resource_path: str  # s3://bucket/path/to/file.csv
+
+     write_mode: Literal["overwrite", "append"] = "overwrite"
+     file_format: Literal["csv", "parquet", "json", "delta"] = "parquet"
+
+     parquet_compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy"
+
+     csv_delimiter: str = ","
+     csv_encoding: str = "utf8"
+
+
+ class CloudStorageWriteSettings(BaseModel):
+     write_settings: WriteSettings
+     connection: FullCloudStorageConnection
+     flowfile_flow_id: int = 1
+     flowfile_node_id: int | str = -1
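A rough illustration of how the connection model resolves credentials and how the resulting dict is handed to Polars. The profile name and bucket are made up; only the `aws-cli` path is exercised here, which goes through boto3 rather than Polars' own credential chain.

    import polars as pl

    from flowfile_worker.external_sources.s3_source.models import FullCloudStorageConnection

    # Assumes an AWS CLI profile named "analytics" exists in ~/.aws/credentials.
    connection = FullCloudStorageConnection(
        storage_type="s3",
        auth_method="aws-cli",
        connection_name="analytics",  # re-used as the boto3 profile name
        aws_region="us-east-1",
    )

    # Returns explicit aws_access_key_id / aws_secret_access_key / aws_session_token
    # entries plus the resolved region.
    storage_options = connection.get_storage_options()

    # The same dict works for Polars cloud readers as well as the writers above.
    lf = pl.scan_parquet("s3://example-bucket/raw/*.parquet", storage_options=storage_options)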
flowfile_worker/funcs.py CHANGED
@@ -6,7 +6,9 @@ from flowfile_worker.polars_fuzzy_match.matcher import fuzzy_match_dfs
  from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
  from flowfile_worker.flow_logger import get_worker_logger
  from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
- from flowfile_worker.external_sources.sql_source.main import write_serialized_df_to_database, write_df_to_database
+ from flowfile_worker.external_sources.sql_source.main import write_df_to_database
+ from flowfile_worker.external_sources.s3_source.main import write_df_to_cloud
+ from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings
  from base64 import encodebytes
  from logging import Logger
  import logging
@@ -205,9 +207,9 @@ def execute_write_method(write_method: Callable, path: str, data_type: str = Non
          logger.info('Writing as csv file')
          if write_mode == 'append':
              with open(path, 'ab') as f:
-                 write_method(file=f, separator=delimiter, quote_style='always')
+                 write_method(f, separator=delimiter, quote_style='always')
          else:
-             write_method(file=path, separator=delimiter, quote_style='always')
+             write_method(path, separator=delimiter, quote_style='always')
      elif data_type == 'parquet':
          logger.info('Writing as parquet file')
          write_method(path)
@@ -243,6 +245,49 @@ def write_to_database(polars_serializable_object: bytes,
              progress.value = -1


+ def write_to_cloud_storage(polars_serializable_object: bytes,
+                            progress: Value,
+                            error_message: Array,
+                            queue: Queue,
+                            file_path: str,
+                            cloud_write_settings: CloudStorageWriteSettings,
+                            flowfile_flow_id: int = -1,
+                            flowfile_node_id: int | str = -1
+                            ) -> None:
+     """
+     Writes a Polars DataFrame to cloud storage using the provided settings.
+     Args:
+         polars_serializable_object (): # Serialized Polars DataFrame object
+         progress (): Multiprocessing Value to track progress
+         error_message (): Array to store error messages
+         queue (): Queue to send results back
+         file_path (): Path to the file where the DataFrame will be written
+         cloud_write_settings (): CloudStorageWriteSettings object containing write settings and connection details
+         flowfile_flow_id (): Flowfile flow ID for logging
+         flowfile_node_id (): Flowfile node ID for logging
+
+     Returns:
+         None
+     """
+     flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+     flowfile_logger.info(f"Starting write operation to: {cloud_write_settings.write_settings.resource_path}")
+     df = pl.LazyFrame.deserialize(io.BytesIO(polars_serializable_object))
+     flowfile_logger.info(f"Starting to sync the data to cloud, execution plan: \n"
+                          f"{df.explain(format='plain')}")
+     try:
+         write_df_to_cloud(df, cloud_write_settings, flowfile_logger)
+         flowfile_logger.info("Write operation completed successfully")
+         with progress.get_lock():
+             progress.value = 100
+     except Exception as e:
+         error_msg = str(e).encode()[:1024]
+         flowfile_logger.error(f'Error during write operation: {str(e)}')
+         with error_message.get_lock():
+             error_message[:len(error_msg)] = error_msg
+         with progress.get_lock():
+             progress.value = -1
+
+
  def write_output(polars_serializable_object: bytes,
                   progress: Value,
                   error_message: Array,
@@ -263,16 +308,16 @@ def write_output(polars_serializable_object: bytes,
      if isinstance(df, pl.LazyFrame):
          flowfile_logger.info(f'Execution plan explanation:\n{df.explain(format="plain")}')
      flowfile_logger.info("Successfully deserialized dataframe")
-     is_lazy = False
      sink_method_str = 'sink_'+data_type
      write_method_str = 'write_'+data_type
      has_sink_method = hasattr(df, sink_method_str)
      write_method = None
      if os.path.exists(path) and write_mode == 'create':
          raise Exception('File already exists')
-     if has_sink_method and is_lazy:
+     if has_sink_method and write_method != 'append':
+         flowfile_logger.info(f'Using sink method: {sink_method_str}')
          write_method = getattr(df, 'sink_' + data_type)
-     elif not is_lazy or not has_sink_method:
+     elif not has_sink_method:
          if isinstance(df, pl.LazyFrame):
              df = collect_lazy_frame(df)
          write_method = getattr(df, write_method_str)
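A sketch of exercising the new `write_to_cloud_storage` worker target directly with the multiprocessing primitives it expects; in the package it is normally launched through the spawner's `start_process`. The settings values are placeholders, and this assumes `LazyFrame.serialize()` yields the bytes that `pl.LazyFrame.deserialize` later reads back.

    import multiprocessing as mp

    import polars as pl

    from flowfile_worker.funcs import write_to_cloud_storage
    from flowfile_worker.external_sources.s3_source.models import (
        CloudStorageWriteSettings, FullCloudStorageConnection, WriteSettings,
    )

    cloud_write_settings = CloudStorageWriteSettings(
        connection=FullCloudStorageConnection(storage_type="s3", auth_method="env_vars"),
        write_settings=WriteSettings(resource_path="s3://example-bucket/out.parquet"),
    )

    lf = pl.LazyFrame({"a": [1, 2, 3]})
    serialized = lf.serialize()          # bytes shipped between processes

    progress = mp.Value('i', 0)          # 100 = done, -1 = failed
    error_message = mp.Array('c', 1024)  # fixed-size buffer for the error text
    queue = mp.Queue()

    write_to_cloud_storage(
        polars_serializable_object=serialized,
        progress=progress,
        error_message=error_message,
        queue=queue,
        file_path="",                    # unused for cloud writes
        cloud_write_settings=cloud_write_settings,
        flowfile_flow_id=1,
        flowfile_node_id=1,
    )
    print(progress.value)                # 100 on success, -1 on failure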
flowfile_worker/models.py CHANGED
@@ -3,11 +3,12 @@ from typing import Optional, Literal, Any
  from base64 import decodebytes
  from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
  from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
+ from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings


  OperationType = Literal[
      'store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'fuzzy', 'store_sample',
-     'write_to_database']
+     'write_to_database', "write_to_cloud_storage",]
  ResultType = Literal['polars', 'other']


@@ -55,7 +56,6 @@ class DatabaseScriptWrite(DatabaseWriteSettings):
          Returns:
              DatabaseWriteSettings: The corresponding DatabaseWriteSettings object.
          """
-
          return DatabaseWriteSettings(
              connection=self.connection,
              table_name=self.table_name,
@@ -65,6 +65,26 @@ class DatabaseScriptWrite(DatabaseWriteSettings):
          )


+ class CloudStorageScriptWrite(CloudStorageWriteSettings):
+     operation: bytes
+
+     def polars_serializable_object(self):
+         return decodebytes(self.operation)
+
+     def get_cloud_storage_write_settings(self) -> CloudStorageWriteSettings:
+         """
+         Converts the current instance to a DatabaseWriteSettings object.
+         Returns:
+             DatabaseWriteSettings: The corresponding DatabaseWriteSettings object.
+         """
+         return CloudStorageWriteSettings(
+             write_settings=self.write_settings,
+             connection=self.connection,
+             flowfile_flow_id=self.flowfile_flow_id,
+             flowfile_node_id=self.flowfile_node_id
+         )
+
+
  class FuzzyJoinInput(BaseModel):
      task_id: Optional[str] = None
      cache_dir: Optional[str] = None
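The new `CloudStorageScriptWrite` model is the request body the core service sends to the worker: the serialized LazyFrame travels base64-encoded in `operation`, alongside the connection and write settings. A sketch of building one, with a placeholder bucket and auth method, assuming `LazyFrame.serialize()` returns bytes:

    from base64 import encodebytes

    import polars as pl

    from flowfile_worker.models import CloudStorageScriptWrite
    from flowfile_worker.external_sources.s3_source.models import (
        FullCloudStorageConnection, WriteSettings,
    )

    lf = pl.LazyFrame({"a": [1, 2, 3]})

    payload = CloudStorageScriptWrite(
        operation=encodebytes(lf.serialize()),  # base64-encoded serialized LazyFrame
        write_settings=WriteSettings(resource_path="s3://example-bucket/out.csv", file_format="csv"),
        connection=FullCloudStorageConnection(storage_type="s3", auth_method="env_vars"),
        flowfile_flow_id=1,
        flowfile_node_id=1,
    )

    # polars_serializable_object() undoes the base64 step; get_cloud_storage_write_settings()
    # strips the payload back down to the plain CloudStorageWriteSettings the writer needs.
    raw_bytes = payload.polars_serializable_object()
    settings = payload.get_cloud_storage_write_settings()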
flowfile_worker/routes.py CHANGED
@@ -10,10 +10,8 @@ from flowfile_worker import models
  from flowfile_worker.spawner import start_process, start_fuzzy_process, start_generic_process, process_manager
  from flowfile_worker.create import table_creator_factory_method, received_table_parser, FileType
  from flowfile_worker.configs import logger
- from flowfile_worker.external_sources.airbyte_sources.models import AirbyteSettings
  from flowfile_worker.external_sources.sql_source.models import DatabaseReadSettings
  from flowfile_worker.external_sources.sql_source.main import read_sql_source, write_serialized_df_to_database
- from flowfile_worker.external_sources.airbyte_sources.main import read_airbyte_source


  router = APIRouter()
@@ -74,6 +72,44 @@ def store_sample(polars_script: models.PolarsScriptSample, background_tasks: Bac
          raise HTTPException(status_code=500, detail=str(e))


+ @router.post("/write_data_to_cloud/")
+ def write_data_to_cloud(cloud_storage_script_write: models.CloudStorageScriptWrite,
+                         background_tasks: BackgroundTasks) -> models.Status:
+     """
+     Write polars dataframe to a file in cloud storage.
+     Args:
+         cloud_storage_script_write (): Contains dataframe and write options for cloud storage
+         background_tasks (): FastAPI background tasks handler
+
+     Returns:
+         models.Status: Status object tracking the write operation
+     """
+     try:
+         logger.info("Starting write operation to: cloud storage")
+         task_id = str(uuid.uuid4())
+         polars_serializable_object = cloud_storage_script_write.polars_serializable_object()
+         status = models.Status(background_task_id=task_id, status="Starting", file_ref='',
+                                result_type="other")
+         status_dict[task_id] = status
+         background_tasks.add_task(
+             start_process,
+             polars_serializable_object=polars_serializable_object,
+             task_id=task_id,
+             operation="write_to_cloud_storage",
+             file_ref='',
+             flowfile_flow_id=cloud_storage_script_write.flowfile_flow_id,
+             flowfile_node_id=cloud_storage_script_write.flowfile_node_id,
+             kwargs=dict(cloud_write_settings=cloud_storage_script_write.get_cloud_storage_write_settings()),
+         )
+         logger.info(
+             f"Started write task: {task_id} to database"
+         )
+         return status
+     except Exception as e:
+         logger.error(f"Error in write operation: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
  @router.post('/store_database_write_result/')
  def store_in_database(database_script_write: models.DatabaseScriptWrite, background_tasks: BackgroundTasks) -> models.Status:
      """
@@ -158,44 +194,10 @@ def write_results(polars_script_write: models.PolarsScriptWrite, background_task
          raise HTTPException(status_code=500, detail=str(e))


- @router.post('/store_airbyte_result')
- def store_airbyte_result(airbyte_settings: AirbyteSettings, background_tasks: BackgroundTasks) -> models.Status:
-     """
-     Store the result of an Airbyte source operation.
-
-     Args:
-         airbyte_settings (AirbyteSettings): Settings for the Airbyte source operation
-         background_tasks (BackgroundTasks): FastAPI background tasks handler
-
-     Returns:
-         models.Status: Status object tracking the Airbyte source operation
-     """
-     logger.info("Processing Airbyte source operation")
-
-     try:
-         task_id = str(uuid.uuid4())
-         file_path = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
-         status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
-                                result_type="polars")
-         status_dict[task_id] = status
-         logger.info(f"Starting Airbyte source task: {task_id}")
-         background_tasks.add_task(start_generic_process, func_ref=read_airbyte_source, file_ref=file_path,
-                                   flowfile_flow_id=airbyte_settings.flowfile_flow_id,
-                                   flowfile_node_id=airbyte_settings.flowfile_node_id,
-                                   task_id=task_id, kwargs=dict(airbyte_settings=airbyte_settings))
-         logger.info(f"Started Airbyte source task: {task_id}")
-
-         return status
-
-     except Exception as e:
-         logger.error(f"Error processing Airbyte source: {str(e)}", exc_info=True)
-         raise HTTPException(status_code=500, detail=str(e))
-
-
  @router.post('/store_database_read_result')
  def store_sql_db_result(database_read_settings: DatabaseReadSettings, background_tasks: BackgroundTasks) -> models.Status:
      """
-     Store the result of an Airbyte source operation.
+     Store the result of an sql source operation.

      Args:
          database_read_settings (SQLSourceSettings): Settings for the SQL source operation
@@ -204,7 +206,7 @@ def store_sql_db_result(database_read_settings: DatabaseReadSettings, background
      Returns:
          models.Status: Status object tracking the Sql operation
      """
-     logger.info("Processing Airbyte source operation")
+     logger.info("Processing Sql source operation")

      try:
          task_id = str(uuid.uuid4())
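To round out the routes change, a sketch of calling the new endpoint over HTTP, reusing the `payload` object from the worker-models sketch above. The worker address, the use of `requests`, and pydantic v2 serialization are assumptions for illustration, not package requirements.

    import requests

    WORKER_URL = "http://localhost:8000"  # placeholder; the real host/port comes from the deployment

    body = payload.model_dump(mode="json")  # pydantic v2 JSON-safe dump of CloudStorageScriptWrite
    resp = requests.post(f"{WORKER_URL}/write_data_to_cloud/", json=body)
    resp.raise_for_status()

    status = resp.json()
    print("started cloud write task:", status["background_task_id"])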
flowfile_worker/utils.py CHANGED
@@ -7,7 +7,7 @@ def collect_lazy_frame(lf: pl.LazyFrame) -> pl.DataFrame:
      try:
          return lf.collect(engine="streaming")
      except PanicException:
-         return lf.collect(engine="auto")
+         return lf.collect(engine="in-memory")


  @dataclass
test_utils/s3/commands.py ADDED
@@ -0,0 +1,46 @@
+ import logging
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S'
+ )
+ logger = logging.getLogger("postgres_commands")
+
+
+ def start_minio():
+     """Start MinIO container for S3 testing"""
+     from . import fixtures
+     if not fixtures.is_docker_available():
+         logger.warning("Docker is not available. Cannot start PostgreSQL container.")
+         print("\n" + "=" * 50)
+         print("SKIPPING: Docker is not available on this system")
+         print("Tests requiring Docker will need to be skipped")
+         print("=" * 50 + "\n")
+         return 0  # Return success to allow pipeline to continue
+
+
+     if fixtures.start_minio_container():
+         print(f"MinIO started at http://localhost:{fixtures.MINIO_PORT}")
+         print(f"Access Key: {fixtures.MINIO_ACCESS_KEY}")
+         return 0
+     return 1
+
+
+ def stop_minio():
+     """Stop MinIO container"""
+     from . import fixtures
+
+     if not fixtures.is_docker_available():
+         logger.warning("Docker is not available. Cannot stop MinIO container.")
+         print("\n" + "=" * 50)
+         print("SKIPPING: Docker is not available on this system")
+         print("Tests requiring Docker will need to be skipped")
+         print("=" * 50 + "\n")
+         return 0
+
+     if fixtures.stop_minio_container():
+         print("MinIO stopped successfully")
+         return 0
+     return 1
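These helpers are meant to be composed with the new test_utils/s3/fixtures.py module (also added in this release). A sketch of wiring them into a pytest session, assuming this fixture arrangement rather than the package's own test configuration:

    import pytest

    from test_utils.s3 import commands, fixtures


    @pytest.fixture(scope="session", autouse=True)
    def minio_server():
        # Skip the MinIO-backed S3 suite instead of failing when Docker is absent.
        if not fixtures.is_docker_available():
            pytest.skip("Docker not available; skipping MinIO-backed S3 tests")
        assert commands.start_minio() == 0
        yield f"http://localhost:{fixtures.MINIO_PORT}"
        commands.stop_minio()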