Flowfile 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile might be problematic.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/api-6ef0dcef.js +80 -0
- flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +9 -6
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +472 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +718 -253
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +563 -117
- flowfile_core/flowfile/flow_node/models.py +154 -20
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +35 -26
- flowfile_core/main.py +35 -15
- flowfile_core/routes/cloud_connections.py +77 -0
- flowfile_core/routes/logs.py +2 -7
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +130 -90
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +121 -71
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +150 -12
- flowfile_core/schemas/transform_schema.py +175 -35
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +481 -208
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +160 -22
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +292 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +214 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_core/schemas/models.py +0 -193
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
flowfile_worker/funcs.py
CHANGED
@@ -6,7 +6,9 @@ from flowfile_worker.polars_fuzzy_match.matcher import fuzzy_match_dfs
 from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
 from flowfile_worker.flow_logger import get_worker_logger
 from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
-from flowfile_worker.external_sources.sql_source.main import
+from flowfile_worker.external_sources.sql_source.main import write_df_to_database
+from flowfile_worker.external_sources.s3_source.main import write_df_to_cloud
+from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings
 from base64 import encodebytes
 from logging import Logger
 import logging
@@ -205,9 +207,9 @@ def execute_write_method(write_method: Callable, path: str, data_type: str = Non
         logger.info('Writing as csv file')
         if write_mode == 'append':
             with open(path, 'ab') as f:
-                write_method(
+                write_method(f, separator=delimiter, quote_style='always')
         else:
-            write_method(
+            write_method(path, separator=delimiter, quote_style='always')
     elif data_type == 'parquet':
         logger.info('Writing as parquet file')
         write_method(path)
@@ -243,6 +245,49 @@ def write_to_database(polars_serializable_object: bytes,
             progress.value = -1


+def write_to_cloud_storage(polars_serializable_object: bytes,
+                           progress: Value,
+                           error_message: Array,
+                           queue: Queue,
+                           file_path: str,
+                           cloud_write_settings: CloudStorageWriteSettings,
+                           flowfile_flow_id: int = -1,
+                           flowfile_node_id: int | str = -1
+                           ) -> None:
+    """
+    Writes a Polars DataFrame to cloud storage using the provided settings.
+    Args:
+        polars_serializable_object (): # Serialized Polars DataFrame object
+        progress (): Multiprocessing Value to track progress
+        error_message (): Array to store error messages
+        queue (): Queue to send results back
+        file_path (): Path to the file where the DataFrame will be written
+        cloud_write_settings (): CloudStorageWriteSettings object containing write settings and connection details
+        flowfile_flow_id (): Flowfile flow ID for logging
+        flowfile_node_id (): Flowfile node ID for logging
+
+    Returns:
+        None
+    """
+    flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+    flowfile_logger.info(f"Starting write operation to: {cloud_write_settings.write_settings.resource_path}")
+    df = pl.LazyFrame.deserialize(io.BytesIO(polars_serializable_object))
+    flowfile_logger.info(f"Starting to sync the data to cloud, execution plan: \n"
+                         f"{df.explain(format='plain')}")
+    try:
+        write_df_to_cloud(df, cloud_write_settings, flowfile_logger)
+        flowfile_logger.info("Write operation completed successfully")
+        with progress.get_lock():
+            progress.value = 100
+    except Exception as e:
+        error_msg = str(e).encode()[:1024]
+        flowfile_logger.error(f'Error during write operation: {str(e)}')
+        with error_message.get_lock():
+            error_message[:len(error_msg)] = error_msg
+        with progress.get_lock():
+            progress.value = -1
+
+
 def write_output(polars_serializable_object: bytes,
                  progress: Value,
                  error_message: Array,
@@ -263,16 +308,16 @@ def write_output(polars_serializable_object: bytes,
     if isinstance(df, pl.LazyFrame):
         flowfile_logger.info(f'Execution plan explanation:\n{df.explain(format="plain")}')
     flowfile_logger.info("Successfully deserialized dataframe")
-    is_lazy = False
     sink_method_str = 'sink_'+data_type
     write_method_str = 'write_'+data_type
     has_sink_method = hasattr(df, sink_method_str)
     write_method = None
     if os.path.exists(path) and write_mode == 'create':
         raise Exception('File already exists')
-    if has_sink_method and
+    if has_sink_method and write_method != 'append':
+        flowfile_logger.info(f'Using sink method: {sink_method_str}')
         write_method = getattr(df, 'sink_' + data_type)
-    elif not
+    elif not has_sink_method:
         if isinstance(df, pl.LazyFrame):
             df = collect_lazy_frame(df)
         write_method = getattr(df, write_method_str)
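Note on the new write_to_cloud_storage above: it follows the same progress and error protocol as the existing worker functions, a shared multiprocessing Value set to 100 on success or -1 on failure, plus a fixed-size Array that receives the error text truncated to 1024 bytes. A standalone sketch of that convention (the task body and names here are illustrative, not taken from the package):

from multiprocessing import Array, Process, Value


def risky_task(progress, error_message):
    # Same convention as the worker functions: -1 plus the error text on failure.
    try:
        raise RuntimeError("simulated failure")  # stand-in for the real write
    except Exception as e:
        msg = str(e).encode()[:1024]
        with error_message.get_lock():
            error_message[:len(msg)] = msg
        with progress.get_lock():
            progress.value = -1


if __name__ == "__main__":
    progress = Value("i", 0)          # shared progress flag, as in the worker
    error_message = Array("c", 1024)  # shared byte buffer for the error text
    p = Process(target=risky_task, args=(progress, error_message))
    p.start()
    p.join()
    print(progress.value, error_message[:].split(b"\x00", 1)[0].decode())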
flowfile_worker/models.py
CHANGED
@@ -3,11 +3,12 @@ from typing import Optional, Literal, Any
 from base64 import decodebytes
 from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
 from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
+from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings


 OperationType = Literal[
     'store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'fuzzy', 'store_sample',
-    'write_to_database']
+    'write_to_database', "write_to_cloud_storage",]
 ResultType = Literal['polars', 'other']


@@ -55,7 +56,6 @@ class DatabaseScriptWrite(DatabaseWriteSettings):
         Returns:
             DatabaseWriteSettings: The corresponding DatabaseWriteSettings object.
         """
-
         return DatabaseWriteSettings(
             connection=self.connection,
             table_name=self.table_name,
@@ -65,6 +65,26 @@ class DatabaseScriptWrite(DatabaseWriteSettings):
         )


+class CloudStorageScriptWrite(CloudStorageWriteSettings):
+    operation: bytes
+
+    def polars_serializable_object(self):
+        return decodebytes(self.operation)
+
+    def get_cloud_storage_write_settings(self) -> CloudStorageWriteSettings:
+        """
+        Converts the current instance to a DatabaseWriteSettings object.
+        Returns:
+            DatabaseWriteSettings: The corresponding DatabaseWriteSettings object.
+        """
+        return CloudStorageWriteSettings(
+            write_settings=self.write_settings,
+            connection=self.connection,
+            flowfile_flow_id=self.flowfile_flow_id,
+            flowfile_node_id=self.flowfile_node_id
+        )
+
+
 class FuzzyJoinInput(BaseModel):
     task_id: Optional[str] = None
     cache_dir: Optional[str] = None
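The base64 round-trip implied by CloudStorageScriptWrite.operation (decoded here with decodebytes, encoded with encodebytes on the funcs.py side) pairs with Polars' LazyFrame serialization. A minimal sketch of the full encode/decode path, assuming Polars 1.x where LazyFrame.serialize() returns binary bytes by default:

import io
from base64 import decodebytes, encodebytes

import polars as pl

# Sender side: serialize a lazy plan and base64-encode it for transport in a JSON body.
lf = pl.LazyFrame({"id": [1, 2, 3], "value": [10.0, 20.0, 30.0]}).filter(pl.col("value") > 15)
operation = encodebytes(lf.serialize())

# Receiver side, mirroring CloudStorageScriptWrite.polars_serializable_object()
# followed by the deserialization done in the worker functions.
raw = decodebytes(operation)
restored = pl.LazyFrame.deserialize(io.BytesIO(raw))
print(restored.explain(format="plain"))
print(restored.collect())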
flowfile_worker/routes.py
CHANGED
@@ -10,10 +10,8 @@ from flowfile_worker import models
 from flowfile_worker.spawner import start_process, start_fuzzy_process, start_generic_process, process_manager
 from flowfile_worker.create import table_creator_factory_method, received_table_parser, FileType
 from flowfile_worker.configs import logger
-from flowfile_worker.external_sources.airbyte_sources.models import AirbyteSettings
 from flowfile_worker.external_sources.sql_source.models import DatabaseReadSettings
 from flowfile_worker.external_sources.sql_source.main import read_sql_source, write_serialized_df_to_database
-from flowfile_worker.external_sources.airbyte_sources.main import read_airbyte_source


 router = APIRouter()
@@ -74,6 +72,44 @@ def store_sample(polars_script: models.PolarsScriptSample, background_tasks: Bac
         raise HTTPException(status_code=500, detail=str(e))


+@router.post("/write_data_to_cloud/")
+def write_data_to_cloud(cloud_storage_script_write: models.CloudStorageScriptWrite,
+                        background_tasks: BackgroundTasks) -> models.Status:
+    """
+    Write polars dataframe to a file in cloud storage.
+    Args:
+        cloud_storage_script_write (): Contains dataframe and write options for cloud storage
+        background_tasks (): FastAPI background tasks handler
+
+    Returns:
+        models.Status: Status object tracking the write operation
+    """
+    try:
+        logger.info("Starting write operation to: cloud storage")
+        task_id = str(uuid.uuid4())
+        polars_serializable_object = cloud_storage_script_write.polars_serializable_object()
+        status = models.Status(background_task_id=task_id, status="Starting", file_ref='',
+                               result_type="other")
+        status_dict[task_id] = status
+        background_tasks.add_task(
+            start_process,
+            polars_serializable_object=polars_serializable_object,
+            task_id=task_id,
+            operation="write_to_cloud_storage",
+            file_ref='',
+            flowfile_flow_id=cloud_storage_script_write.flowfile_flow_id,
+            flowfile_node_id=cloud_storage_script_write.flowfile_node_id,
+            kwargs=dict(cloud_write_settings=cloud_storage_script_write.get_cloud_storage_write_settings()),
+        )
+        logger.info(
+            f"Started write task: {task_id} to database"
+        )
+        return status
+    except Exception as e:
+        logger.error(f"Error in write operation: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.post('/store_database_write_result/')
 def store_in_database(database_script_write: models.DatabaseScriptWrite, background_tasks: BackgroundTasks) -> models.Status:
     """
@@ -158,44 +194,10 @@ def write_results(polars_script_write: models.PolarsScriptWrite, background_task
         raise HTTPException(status_code=500, detail=str(e))


-@router.post('/store_airbyte_result')
-def store_airbyte_result(airbyte_settings: AirbyteSettings, background_tasks: BackgroundTasks) -> models.Status:
-    """
-    Store the result of an Airbyte source operation.
-
-    Args:
-        airbyte_settings (AirbyteSettings): Settings for the Airbyte source operation
-        background_tasks (BackgroundTasks): FastAPI background tasks handler
-
-    Returns:
-        models.Status: Status object tracking the Airbyte source operation
-    """
-    logger.info("Processing Airbyte source operation")
-
-    try:
-        task_id = str(uuid.uuid4())
-        file_path = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
-        status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
-                               result_type="polars")
-        status_dict[task_id] = status
-        logger.info(f"Starting Airbyte source task: {task_id}")
-        background_tasks.add_task(start_generic_process, func_ref=read_airbyte_source, file_ref=file_path,
-                                  flowfile_flow_id=airbyte_settings.flowfile_flow_id,
-                                  flowfile_node_id=airbyte_settings.flowfile_node_id,
-                                  task_id=task_id, kwargs=dict(airbyte_settings=airbyte_settings))
-        logger.info(f"Started Airbyte source task: {task_id}")
-
-        return status
-
-    except Exception as e:
-        logger.error(f"Error processing Airbyte source: {str(e)}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
-
-
 @router.post('/store_database_read_result')
 def store_sql_db_result(database_read_settings: DatabaseReadSettings, background_tasks: BackgroundTasks) -> models.Status:
     """
-    Store the result of an
+    Store the result of an sql source operation.

     Args:
         database_read_settings (SQLSourceSettings): Settings for the SQL source operation
@@ -204,7 +206,7 @@ def store_sql_db_result(database_read_settings: DatabaseReadSettings, background
     Returns:
         models.Status: Status object tracking the Sql operation
     """
-    logger.info("Processing
+    logger.info("Processing Sql source operation")

     try:
         task_id = str(uuid.uuid4())
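For context, a client of the new /write_data_to_cloud/ endpoint would POST a JSON body shaped like models.CloudStorageScriptWrite and then poll the returned Status. The sketch below is hypothetical: the worker's address, the router prefix, and the exact shape of the connection and write_settings sub-objects are assumptions, not taken from this diff.

import base64

import polars as pl
import requests

WORKER_URL = "http://localhost:8000"  # hypothetical worker address and prefix

lf = pl.LazyFrame({"id": [1, 2, 3]})
payload = {
    # Mirrors CloudStorageScriptWrite.operation: base64 of the serialized lazy plan.
    "operation": base64.encodebytes(lf.serialize()).decode(),
    # The nested settings mirror CloudStorageWriteSettings; their fields are assumed here.
    "connection": {},
    "write_settings": {"resource_path": "s3://my-bucket/out.parquet"},
    "flowfile_flow_id": 1,
    "flowfile_node_id": 1,
}

resp = requests.post(f"{WORKER_URL}/write_data_to_cloud/", json=payload, timeout=30)
resp.raise_for_status()
status = resp.json()
# models.Status fields seen in the diff: background_task_id, status, file_ref, result_type.
print(status["background_task_id"], status["status"])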
test_utils/s3/commands.py
ADDED
@@ -0,0 +1,46 @@
+import logging
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger("postgres_commands")
+
+
+def start_minio():
+    """Start MinIO container for S3 testing"""
+    from . import fixtures
+    if not fixtures.is_docker_available():
+        logger.warning("Docker is not available. Cannot start PostgreSQL container.")
+        print("\n" + "=" * 50)
+        print("SKIPPING: Docker is not available on this system")
+        print("Tests requiring Docker will need to be skipped")
+        print("=" * 50 + "\n")
+        return 0  # Return success to allow pipeline to continue
+
+
+    if fixtures.start_minio_container():
+        print(f"MinIO started at http://localhost:{fixtures.MINIO_PORT}")
+        print(f"Access Key: {fixtures.MINIO_ACCESS_KEY}")
+        return 0
+    return 1
+
+
+def stop_minio():
+    """Stop MinIO container"""
+    from . import fixtures
+
+    if not fixtures.is_docker_available():
+        logger.warning("Docker is not available. Cannot stop MinIO container.")
+        print("\n" + "=" * 50)
+        print("SKIPPING: Docker is not available on this system")
+        print("Tests requiring Docker will need to be skipped")
+        print("=" * 50 + "\n")
+        return 0
+
+    if fixtures.stop_minio_container():
+        print("MinIO stopped successfully")
+        return 0
+    return 1
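Given the package layout in the file listing (test_utils/s3/commands.py next to test_utils/s3/fixtures.py), the helpers above could be driven from a local test session roughly as follows; a sketch only, with the import path and return-code handling inferred from the code above:

# Hypothetical driver for the MinIO helpers above.
from test_utils.s3 import commands

exit_code = commands.start_minio()   # 0 on success (or when Docker is unavailable), 1 on failure
try:
    if exit_code == 0:
        print("MinIO is up; S3-backed tests can run against it now.")
finally:
    commands.stop_minio()            # tear the container back down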
test_utils/s3/data_generator.py
ADDED
@@ -0,0 +1,292 @@
+
+import logging
+import io
+import os
+
+# Third-party libraries
+import boto3
+from botocore.client import Config
+import polars as pl
+import pyarrow as pa
+from deltalake import write_deltalake
+from pyiceberg.catalog import load_catalog
+
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
+MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
+MINIO_CONSOLE_PORT = int(os.environ.get("TEST_MINIO_CONSOLE_PORT", 9001))
+MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
+MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
+MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
+MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
+
+
+def _create_single_csv_file(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single CSV file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file CSV...")
+    csv_buffer = io.BytesIO()
+    df.write_csv(csv_buffer)
+    csv_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-csv/data.csv',
+        Body=csv_buffer.getvalue()
+    )
+
+
+def _create_multi_file_csv(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple CSV files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} CSV files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        csv_buffer = io.BytesIO()
+        sub_df.write_csv(csv_buffer)
+        csv_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-csv/part_{i:02d}.csv',
+            Body=csv_buffer.getvalue()
+        )
+
+
+def _create_single_file_json(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single JSON file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file JSON...")
+    json_buffer = io.BytesIO()
+    df.write_ndjson(json_buffer)
+    json_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-json/data.json',
+        Body=json_buffer.getvalue()
+    )
+
+
+def _create_multi_file_json(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple JSON files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} JSON files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        json_buffer = io.BytesIO()
+        sub_df.write_ndjson(json_buffer)
+        json_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-json/part_{i:02d}.json',
+            Body=json_buffer.getvalue()
+        )
+
+
+def _create_single_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single Parquet file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file Parquet...")
+    parquet_buffer = io.BytesIO()
+    df.write_parquet(parquet_buffer)
+    parquet_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-parquet/data.parquet',
+        Body=parquet_buffer.getvalue()
+    )
+
+
+def _create_multi_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple Parquet files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} Parquet files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        parquet_buffer = io.BytesIO()
+        sub_df.write_parquet(parquet_buffer)
+        parquet_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-parquet/part_{i:02d}.parquet',
+            Body=parquet_buffer.getvalue()
+        )
+
+
+def _create_delta_lake_table(arrow_table: pa.Table, bucket_name: str, storage_options: dict):
+    """Creates a Delta Lake table from a PyArrow table in S3."""
+    logger.info("Writing Delta Lake table...")
+    delta_table_path = f"s3://{bucket_name}/delta-lake-table"
+    write_deltalake(
+        delta_table_path,
+        arrow_table,
+        mode='overwrite',
+        storage_options=storage_options
+    )
+
+
+def _create_iceberg_table(df: pl.DataFrame, bucket_name: str, endpoint_url: str, access_key: str, secret_key: str,
+                          s3_client):
+    """Creates an Apache Iceberg table and FORCES sane metadata pointers."""
+    logger.info("Writing Apache Iceberg table with SANE metadata access...")
+    # Configure the catalog properties for S3 access
+    catalog_props = {
+        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
+        "s3.endpoint": endpoint_url,
+        "s3.access-key-id": access_key,
+        "s3.secret-access-key": secret_key,
+    }
+    # Use the SQL catalog with an in-memory SQLite database for storing metadata pointers
+    catalog = load_catalog(
+        "default",
+        **{
+            "type": "sql",
+            "uri": "sqlite:///:memory:",  # Use an in-memory SQL DB for the catalog
+            "warehouse": f"s3a://{bucket_name}/iceberg_warehouse",
+            **catalog_props,
+        }
+    )
+    table_identifier = ("default_db", "iceberg_table")
+    # Create a namespace (like a schema or database) for the table
+    try:
+        catalog.drop_namespace("default_db")
+    except Exception:
+        pass  # Ignore if namespace doesn't exist
+    catalog.create_namespace("default_db")
+    try:
+        catalog.load_table(table_identifier)
+        catalog.drop_table(table_identifier)
+    except:
+        pass
+
+    # Create the table schema and object first
+    schema = df.to_arrow().schema
+    table = catalog.create_table(identifier=table_identifier, schema=schema)
+
+    # Use the simplified write_iceberg method from Polars
+    df.write_iceberg(table, mode='overwrite')
+
+    # NOW CREATE WHAT SHOULD EXIST BY DEFAULT - SANE METADATA POINTERS
+    # Get the current metadata location from the table
+    current_metadata = table.metadata_location
+    logger.info(f"Original metadata location: {current_metadata}")
+
+    # Extract just the path part
+    if current_metadata.startswith("s3a://"):
+        current_metadata_key = current_metadata.replace(f"s3a://{bucket_name}/", "")
+    else:
+        current_metadata_key = current_metadata.replace(f"s3://{bucket_name}/", "")
+
+    # Read the current metadata
+    response = s3_client.get_object(Bucket=bucket_name, Key=current_metadata_key)
+    metadata_content = response['Body'].read()
+
+    # Get the metadata directory
+    metadata_dir = "/".join(current_metadata_key.split("/")[:-1])
+
+    # Write it to standardized locations
+    # 1. metadata.json in the metadata folder (this is what pl.scan_iceberg expects)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/metadata.json",
+        Body=metadata_content
+    )
+    logger.info(f"Created stable metadata.json at: s3://{bucket_name}/{metadata_dir}/metadata.json")
+
+    # 2. current.json as an additional pointer
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/current.json",
+        Body=metadata_content
+    )
+
+    # 3. VERSION file that contains the current metadata filename
+    current_metadata_filename = current_metadata_key.split("/")[-1]
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/VERSION",
+        Body=current_metadata_filename.encode()
+    )
+
+    # 4. version-hint.text (some Iceberg readers look for this)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/version-hint.text",
+        Body=current_metadata_filename.encode()
+    )
+
+    table_base = "iceberg_warehouse/default_db.db/my_iceberg_table"
+    logger.info(f"""
+    ✅ Iceberg table created with SANE access patterns:
+       - Versioned metadata: s3://{bucket_name}/{current_metadata_key}
+       - Latest metadata: s3://{bucket_name}/{table_base}/metadata/metadata.json
+       - Current pointer: s3://{bucket_name}/{table_base}/metadata/current.json
+       - Version hint: s3://{bucket_name}/{table_base}/metadata/version-hint.text
+
+    Read with: pl.scan_iceberg('s3://{bucket_name}/{table_base}/metadata/metadata.json').collect()
+    """)
+
+
+def populate_test_data(endpoint_url: str, access_key: str, secret_key: str, bucket_name: str):
+    """
+    Populates a MinIO bucket with a variety of large-scale test data formats.
+
+    Args:
+        endpoint_url (str): The S3 endpoint URL for the MinIO instance.
+        access_key (str): The access key for MinIO.
+        secret_key (str): The secret key for MinIO.
+        bucket_name (str): The name of the bucket to populate.
+    """
+    logger.info("🚀 Starting data population...")
+    # --- S3 Client and Storage Options ---
+    s3_client = boto3.client(
+        's3',
+        endpoint_url=endpoint_url,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        config=Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+    storage_options = {
+        "AWS_ENDPOINT_URL": endpoint_url,
+        "AWS_ACCESS_KEY_ID": access_key,
+        "AWS_SECRET_ACCESS_KEY": secret_key,
+        "AWS_REGION": "us-east-1",
+        "AWS_ALLOW_HTTP": "true",
+        "AWS_S3_ALLOW_UNSAFE_RENAME": "true"
+    }
+
+    # --- Data Generation ---
+    data_size = 100_000
+    df = pl.DataFrame({
+        "id": range(1, data_size + 1),
+        "name": [f"user_{i}" for i in range(1, data_size + 1)],
+        "value": [i * 10.5 for i in range(1, data_size + 1)],
+        "category": ["A", "B", "C", "D", "E"] * (data_size // 5)
+    })
+    logger.info(f"Generated a Polars DataFrame with {data_size} rows.")
+    #
+    # # --- Execute Data Population Scenarios ---
+    _create_single_csv_file(s3_client, df, bucket_name)
+    _create_multi_file_csv(s3_client, df, bucket_name)
+    _create_single_file_json(s3_client, df, bucket_name)
+    _create_multi_file_json(s3_client, df, bucket_name)
+    _create_single_parquet_file(s3_client, df, bucket_name)
+    _create_multi_parquet_file(s3_client, df, bucket_name)
+
+    # Convert to PyArrow table once for Delta and Iceberg
+    arrow_table = df.to_arrow()
+
+    _create_delta_lake_table(arrow_table, bucket_name, storage_options)
+    _create_iceberg_table(df, bucket_name, endpoint_url, access_key, secret_key, s3_client)
+
+    logger.info("✅ All test data populated successfully.")
+
+
+if __name__ == '__main__':
+    populate_test_data(endpoint_url=MINIO_ENDPOINT_URL,
+                       access_key=MINIO_ACCESS_KEY,
+                       secret_key=MINIO_SECRET_KEY,
+                       bucket_name="test-bucket")
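Once populated, the bucket can be read back with Polars against the same MinIO endpoint. A minimal sketch, assuming the defaults above (test-bucket, minioadmin credentials, port 9000) and a Polars version whose storage_options forwards these object_store keys; the key names may need adjusting for other versions:

import polars as pl

# Matches the TEST_MINIO_* defaults used by the generator above.
storage_options = {
    "aws_access_key_id": "minioadmin",
    "aws_secret_access_key": "minioadmin",
    "aws_region": "us-east-1",
    "aws_endpoint_url": "http://localhost:9000",
    "aws_allow_http": "true",
}

# Single Parquet file written by _create_single_parquet_file.
single = pl.scan_parquet(
    "s3://test-bucket/single-file-parquet/data.parquet",
    storage_options=storage_options,
).collect()

# Partitioned Parquet written by _create_multi_parquet_file (glob over the parts).
multi = pl.scan_parquet(
    "s3://test-bucket/multi-file-parquet/*.parquet",
    storage_options=storage_options,
).collect()

print(single.shape, multi.shape)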