flowfile-0.2.2-py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile might be problematic.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
flowfile_worker/polars_fuzzy_match/process.py
@@ -0,0 +1,86 @@
+import polars as pl
+import polars_distance as pld
+from flowfile_worker.polars_fuzzy_match.utils import cache_polars_frame_to_temp
+from flowfile_worker.utils import collect_lazy_frame
+from flowfile_worker.polars_fuzzy_match.models import FuzzyTypeLiteral
+
+
+def calculate_fuzzy_score(mapping_table: pl.LazyFrame, left_col_name: str, right_col_name: str,
+                          fuzzy_method: FuzzyTypeLiteral, th_score: float) -> pl.LazyFrame:
+    """
+    Calculate fuzzy matching scores between columns in a LazyFrame.
+
+    Args:
+        mapping_table: The DataFrame containing columns to compare
+        left_col_name: Name of the left column for comparison
+        right_col_name: Name of the right column for comparison
+        fuzzy_method: Type of fuzzy matching algorithm to use
+        th_score: The threshold score for fuzzy matching
+
+    Returns:
+        A LazyFrame with fuzzy matching scores
+    """
+    mapping_table = mapping_table.with_columns(pl.col(left_col_name).str.to_lowercase().alias('left'),
+                                               pl.col(right_col_name).str.to_lowercase().alias('right'))
+    dist_col = pld.DistancePairWiseString(pl.col('left'))
+    if fuzzy_method in ("jaro_winkler"):
+        fm_method = getattr(dist_col, fuzzy_method)(pl.col('right')).alias('s')
+    else:
+        fm_method = getattr(dist_col, fuzzy_method)(pl.col('right'), normalized=True).alias('s')
+    return (mapping_table.with_columns(fm_method).drop(['left', 'right']).filter(pl.col('s') <= th_score).
+            with_columns((1-pl.col('s')).alias('s')))
+
+
+def process_fuzzy_frames(left_df: pl.LazyFrame, right_df: pl.LazyFrame, left_col_name: str, right_col_name: str,
+                         temp_dir_ref: str):
+    """
+    Process left and right data frames to create fuzzy frames,
+    cache them temporarily, and adjust based on their lengths.
+
+    Args:
+    - left_df (pl.DataFrame): The left data frame.
+    - right_df (pl.DataFrame): The right data frame.
+    - fm (object): An object containing configuration such as the left column name.
+    - temp_dir_ref (str): A reference to the temporary directory for caching frames.
+
+    Returns:
+    - Tuple[pl.DataFrame, pl.DataFrame, str, str]: Processed left and right fuzzy frames and their respective column names.
+    """
+
+    # Process left and right data frames
+    left_fuzzy_frame = cache_polars_frame_to_temp(left_df.group_by(left_col_name).agg('__left_index').
+                                                  filter(pl.col(left_col_name).is_not_null()), temp_dir_ref)
+    right_fuzzy_frame = cache_polars_frame_to_temp(right_df.group_by(right_col_name).agg('__right_index').
+                                                   filter(pl.col(right_col_name).is_not_null()), temp_dir_ref)
+    # Calculate lengths of fuzzy frames
+    len_left_df = collect_lazy_frame(left_fuzzy_frame.select(pl.len()))[0, 0]
+    len_right_df = collect_lazy_frame(right_fuzzy_frame.select(pl.len()))[0, 0]
+
+    # Decide which frame to use as left or right based on their lengths
+    if len_left_df < len_right_df:
+        # Swap the frames and column names if right frame is larger
+        left_fuzzy_frame, right_fuzzy_frame = right_fuzzy_frame, left_fuzzy_frame
+        left_col_name, right_col_name = right_col_name, left_col_name
+
+    # Return the processed frames and column names
+    return left_fuzzy_frame, right_fuzzy_frame, left_col_name, right_col_name, len_left_df, len_right_df
+
+
+def calculate_and_parse_fuzzy(mapping_table: pl.LazyFrame, left_col_name: str, right_col_name: str,
+                              fuzzy_method: FuzzyTypeLiteral, th_score: float) -> pl.LazyFrame:
+    """
+    Calculate fuzzy scores and parse/explode the results for further processing.
+
+    Args:
+        mapping_table: The DataFrame containing columns to compare
+        left_col_name: Name of the left column for comparison
+        right_col_name: Name of the right column for comparison
+        fuzzy_method: Type of fuzzy matching algorithm to use
+        th_score: Minimum similarity score threshold (0-1)
+
+    Returns:
+        A LazyFrame with exploded indices and fuzzy scores
+    """
+    return calculate_fuzzy_score(mapping_table, left_col_name, right_col_name, fuzzy_method, th_score).select(
+        pl.col('s'), pl.col('__left_index'), pl.col('__right_index')).explode(pl.col('__left_index')).explode(
+        pl.col('__right_index'))
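Note: the following is an illustrative sketch, not part of the wheel. It shows how calculate_fuzzy_score might be called directly, assuming "levenshtein" is one of the accepted FuzzyTypeLiteral values; the column names and data are made up.

    import polars as pl
    from flowfile_worker.polars_fuzzy_match.process import calculate_fuzzy_score

    mapping = pl.LazyFrame({
        "name_left": ["Acme Corp", "Globex"],
        "name_right": ["ACME Corporation", "Globex LLC"],
    })
    # th_score filters on the normalized distance; surviving rows get 's' flipped to 1 - distance.
    matches = calculate_fuzzy_score(mapping, "name_left", "name_right",
                                    fuzzy_method="levenshtein", th_score=0.25).collect()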
flowfile_worker/polars_fuzzy_match/utils.py
@@ -0,0 +1,50 @@
+import polars as pl
+from flowfile_worker.configs import logger
+from flowfile_worker.utils import collect_lazy_frame
+import os
+import uuid
+
+
+def write_polars_frame(_df: pl.LazyFrame | pl.DataFrame, path: str,
+                       estimated_size: int = 0):
+    is_lazy = isinstance(_df, pl.LazyFrame)
+    logger.info('Caching data frame')
+    if is_lazy:
+        if estimated_size > 0:
+            fit_memory = estimated_size / 1024 / 1000 / 1000 < 8
+            if fit_memory:
+                _df = _df.collect()
+                is_lazy = False
+
+    if is_lazy:
+        logger.info("Writing in memory efficient mode")
+        write_method = getattr(_df, 'sink_ipc')
+        try:
+            write_method(path)
+            return True
+        except Exception as e:
+            pass
+        try:
+            write_method(path)
+            return True
+        except Exception as e:
+            pass
+    if is_lazy:
+        _df = collect_lazy_frame(_df)
+    try:
+        write_method = getattr(_df, 'write_ipc')
+        write_method(path)
+        return True
+    except Exception as e:
+        print('error', e)
+        return False
+
+
+def cache_polars_frame_to_temp(_df: pl.LazyFrame | pl.DataFrame, tempdir: str = None) -> pl.LazyFrame:
+    path = f'{tempdir}{os.sep}{uuid.uuid4()}'
+    result = write_polars_frame(_df, path)
+    if result:
+        df = pl.read_ipc(path)
+        return df.lazy()
+    else:
+        raise Exception('Could not cache the data')
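Note: an illustrative sketch (not part of the wheel) of the caching helper above; a frame is written to an IPC file in a temporary directory and handed back as a LazyFrame. The directory and data here are made up.

    import tempfile
    import polars as pl
    from flowfile_worker.polars_fuzzy_match.utils import cache_polars_frame_to_temp

    with tempfile.TemporaryDirectory() as tmp_dir:
        lf = pl.LazyFrame({"customer": ["a", "b", "c"]})
        cached = cache_polars_frame_to_temp(lf, tmp_dir)  # writes <tmp_dir>/<uuid> via IPC
        print(cached.collect())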
flowfile_worker/process_manager.py
@@ -0,0 +1,36 @@
+from threading import Lock
+from multiprocessing import Process
+from typing import Dict
+
+
+class ProcessManager:
+    def __init__(self):
+        self.process_dict: Dict[str, Process] = {}
+        self.lock = Lock()
+
+    def add_process(self, task_id: str, process: Process):
+        """Add a process to the manager."""
+        with self.lock:
+            self.process_dict[task_id] = process
+
+    def get_process(self, task_id: str) -> Process:
+        """Retrieve a process by its task ID."""
+        with self.lock:
+            return self.process_dict.get(task_id)
+
+    def remove_process(self, task_id: str):
+        """Remove a process from the manager by its task ID."""
+        with self.lock:
+            self.process_dict.pop(task_id, None)
+
+    def cancel_process(self, task_id: str):
+        """Cancel a running process by its task ID."""
+        with self.lock:
+            process = self.process_dict.get(task_id)
+            if process:
+                # Terminate and remove the process
+                process.terminate()
+                process.join()
+                self.process_dict.pop(task_id, None)
+                return True
+            return False
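Note: an illustrative sketch (not part of the wheel) of how ProcessManager is meant to be used: each spawned multiprocessing.Process is registered under its task ID so it can later be cancelled. The task ID and target are made up.

    import time
    from multiprocessing import Process
    from flowfile_worker.process_manager import ProcessManager

    manager = ProcessManager()
    worker = Process(target=time.sleep, args=(60,))
    worker.start()
    manager.add_process("task-123", worker)
    # e.g. triggered later by POST /cancel_task/task-123
    manager.cancel_process("task-123")  # terminates, joins, and drops the entry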
flowfile_worker/routes.py
@@ -0,0 +1,440 @@
+import polars as pl
+import uuid
+import os
+from fastapi import APIRouter, HTTPException, Response, BackgroundTasks
+from typing import Dict
+from base64 import encodebytes
+
+from flowfile_worker import status_dict, CACHE_DIR, PROCESS_MEMORY_USAGE, status_dict_lock
+from flowfile_worker import models
+from flowfile_worker.spawner import start_process, start_fuzzy_process, start_generic_process, process_manager
+from flowfile_worker.create import table_creator_factory_method, received_table_parser, FileType
+from flowfile_worker.configs import logger
+from flowfile_worker.external_sources.airbyte_sources.models import AirbyteSettings
+from flowfile_worker.external_sources.sql_source.models import DatabaseReadSettings
+from flowfile_worker.external_sources.sql_source.main import read_sql_source, write_serialized_df_to_database
+from flowfile_worker.external_sources.airbyte_sources.main import read_airbyte_source
+
+
+router = APIRouter()
+
+
+@router.post("/submit_query/")
+def submit_query(polars_script: models.PolarsScript, background_tasks: BackgroundTasks) -> models.Status:
+    logger.info(f"Processing query with operation: {polars_script.operation_type}")
+
+    try:
+        polars_script.task_id = str(uuid.uuid4()) if polars_script.task_id is None else polars_script.task_id
+        polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else CACHE_DIR.name
+        polars_serializable_object = polars_script.polars_serializable_object()
+        file_path = os.path.join(polars_script.cache_dir, f"{polars_script.task_id}.arrow")
+        result_type = "polars" if polars_script.operation_type == "store" else "other"
+        status = models.Status(background_task_id=polars_script.task_id, status="Starting", file_ref=file_path,
+                               result_type=result_type)
+        status_dict[polars_script.task_id] = status
+        background_tasks.add_task(start_process, polars_serializable_object=polars_serializable_object,
+                                  task_id=polars_script.task_id, operation=polars_script.operation_type,
+                                  file_ref=file_path, flowfile_flow_id=polars_script.flowfile_flow_id,
+                                  flowfile_node_id=polars_script.flowfile_node_id,
+                                  kwargs={}
+                                  )
+        logger.info(f"Started background task: {polars_script.task_id}")
+        return status
+
+    except Exception as e:
+        logger.error(f"Error processing query: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post('/store_sample/')
+def store_sample(polars_script: models.PolarsScriptSample, background_tasks: BackgroundTasks) -> models.Status:
+    logger.info(f"Processing sample storage with size: {polars_script.sample_size}")
+
+    try:
+        polars_script.task_id = str(uuid.uuid4()) if polars_script.task_id is None else polars_script.task_id
+        polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else CACHE_DIR.name
+        polars_serializable_object = polars_script.polars_serializable_object()
+
+        file_path = os.path.join(polars_script.cache_dir, f"{polars_script.task_id}.arrow")
+        status = models.Status(background_task_id=polars_script.task_id, status="Starting", file_ref=file_path,
+                               result_type="other")
+        status_dict[polars_script.task_id] = status
+
+        background_tasks.add_task(start_process, polars_serializable_object=polars_serializable_object,
+                                  task_id=polars_script.task_id, operation=polars_script.operation_type,
+                                  file_ref=file_path, flowfile_flow_id=polars_script.flowfile_flow_id,
+                                  flowfile_node_id=polars_script.flowfile_node_id,
+                                  kwargs={'sample_size': polars_script.sample_size})
+        logger.info(f"Started sample storage task: {polars_script.task_id}")
+
+        return status
+
+    except Exception as e:
+        logger.error(f"Error storing sample: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post('/store_database_write_result/')
+def store_in_database(database_script_write: models.DatabaseScriptWrite, background_tasks: BackgroundTasks) -> models.Status:
+    """
+    Write polars dataframe to a file in specified format.
+
+    Args:
+        database_script_write (models.DatabaseScriptWrite): Contains dataframe and write options for database
+        background_tasks (BackgroundTasks): FastAPI background tasks handler
+
+    Returns:
+        models.Status: Status object tracking the write operation
+    """
+    logger.info("Starting write operation to: database")
+    try:
+        task_id = str(uuid.uuid4())
+        polars_serializable_object = database_script_write.polars_serializable_object()
+        status = models.Status(background_task_id=task_id, status="Starting", file_ref='',
+                               result_type="other")
+        status_dict[task_id] = status
+        background_tasks.add_task(
+            start_process,
+            polars_serializable_object=polars_serializable_object,
+            task_id=task_id,
+            operation="write_to_database",
+            file_ref='',
+            flowfile_flow_id=database_script_write.flowfile_flow_id,
+            flowfile_node_id=database_script_write.flowfile_node_id,
+            kwargs=dict(database_write_settings=database_script_write.get_database_write_settings()),
+        )
+
+        logger.info(
+            f"Started write task: {task_id} to database"
+        )
+
+        return status
+
+    except Exception as e:
+        logger.error(f"Error in write operation: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post('/write_results/')
+def write_results(polars_script_write: models.PolarsScriptWrite, background_tasks: BackgroundTasks) -> models.Status:
+    """
+    Write polars dataframe to a file in specified format.
+
+    Args:
+        polars_script_write (models.PolarsScriptWrite): Contains dataframe and write options
+        background_tasks (BackgroundTasks): FastAPI background tasks handler
+
+    Returns:
+        models.Status: Status object tracking the write operation
+    """
+    logger.info(f"Starting write operation to: {polars_script_write.path}")
+    try:
+        task_id = str(uuid.uuid4())
+        file_path = polars_script_write.path
+        polars_serializable_object = polars_script_write.polars_serializable_object()
+        result_type = "other"
+        status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
+                               result_type=result_type)
+        status_dict[task_id] = status
+        background_tasks.add_task(start_process,
+                                  polars_serializable_object=polars_serializable_object, task_id=task_id,
+                                  operation="write_output",
+                                  file_ref=file_path,
+                                  flowfile_flow_id=polars_script_write.flowfile_flow_id,
+                                  flowfile_node_id=polars_script_write.flowfile_node_id,
+                                  kwargs=dict(
+                                      data_type=polars_script_write.data_type,
+                                      path=polars_script_write.path,
+                                      write_mode=polars_script_write.write_mode,
+                                      sheet_name=polars_script_write.sheet_name,
+                                      delimiter=polars_script_write.delimiter)
+                                  )
+        logger.info(f"Started write task: {task_id} with type: {polars_script_write.data_type}")
+
+        return status
+
+    except Exception as e:
+        logger.error(f"Error in write operation: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post('/store_airbyte_result')
+def store_airbyte_result(airbyte_settings: AirbyteSettings, background_tasks: BackgroundTasks) -> models.Status:
+    """
+    Store the result of an Airbyte source operation.
+
+    Args:
+        airbyte_settings (AirbyteSettings): Settings for the Airbyte source operation
+        background_tasks (BackgroundTasks): FastAPI background tasks handler
+
+    Returns:
+        models.Status: Status object tracking the Airbyte source operation
+    """
+    logger.info("Processing Airbyte source operation")
+
+    try:
+        task_id = str(uuid.uuid4())
+        file_path = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
+        status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
+                               result_type="polars")
+        status_dict[task_id] = status
+        logger.info(f"Starting Airbyte source task: {task_id}")
+        background_tasks.add_task(start_generic_process, func_ref=read_airbyte_source, file_ref=file_path,
+                                  flowfile_flow_id=airbyte_settings.flowfile_flow_id,
+                                  flowfile_node_id=airbyte_settings.flowfile_node_id,
+                                  task_id=task_id, kwargs=dict(airbyte_settings=airbyte_settings))
+        logger.info(f"Started Airbyte source task: {task_id}")
+
+        return status
+
+    except Exception as e:
+        logger.error(f"Error processing Airbyte source: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post('/store_database_read_result')
+def store_sql_db_result(database_read_settings: DatabaseReadSettings, background_tasks: BackgroundTasks) -> models.Status:
+    """
+    Store the result of an Airbyte source operation.
+
+    Args:
+        database_read_settings (SQLSourceSettings): Settings for the SQL source operation
+        background_tasks (BackgroundTasks): FastAPI background tasks handler
+
+    Returns:
+        models.Status: Status object tracking the Sql operation
+    """
+    logger.info("Processing Airbyte source operation")
+
+    try:
+        task_id = str(uuid.uuid4())
+        file_path = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
+        status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
+                               result_type="polars")
+        status_dict[task_id] = status
+        logger.info(f"Starting reading from database source task: {task_id}")
+        background_tasks.add_task(start_generic_process, func_ref=read_sql_source, file_ref=file_path,
+                                  flowfile_flow_id=database_read_settings.flowfile_flow_id,
+                                  flowfile_node_id=database_read_settings.flowfile_node_id,
+                                  task_id=task_id, kwargs=dict(database_read_settings=database_read_settings))
+        return status
+
+    except Exception as e:
+        logger.error(f"Error processing sql source: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post('/create_table/{file_type}')
+def create_table(file_type: FileType, received_table: Dict, background_tasks: BackgroundTasks,
+                 flowfile_flow_id: int = 1, flowfile_node_id: int | str = -1) -> models.Status:
+    """
+    Create a Polars table from received dictionary data based on specified file type.
+
+    Args:
+        file_type (FileType): Type of file/format for table creation
+        received_table (Dict): Raw table data as dictionary
+        background_tasks (BackgroundTasks): FastAPI background tasks handler
+        flowfile_flow_id: Flowfile ID
+        flowfile_node_id: Node ID
+
+    Returns:
+        models.Status: Status object tracking the table creation
+    """
+    logger.info(f"Creating table of type: {file_type}")
+
+    try:
+        task_id = str(uuid.uuid4())
+        file_ref = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
+
+        status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_ref,
+                               result_type="polars")
+        status_dict[task_id] = status
+        func_ref = table_creator_factory_method(file_type)
+        received_table_parsed = received_table_parser(received_table, file_type)
+        background_tasks.add_task(start_generic_process, func_ref=func_ref, file_ref=file_ref,
+                                  task_id=task_id, kwargs={'received_table': received_table_parsed},
+                                  flowfile_flow_id=flowfile_flow_id,
+                                  flowfile_node_id=flowfile_node_id)
+        logger.info(f"Started table creation task: {task_id}")
+
+        return status
+
+    except Exception as e:
+        logger.error(f"Error creating table: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+def validate_result(task_id: str) -> bool | None:
+    """
+    Validate the result of a completed task by checking the IPC file.
+
+    Args:
+        task_id (str): ID of the task to validate
+
+    Returns:
+        bool | None: True if valid, False if error, None if not applicable
+    """
+    logger.debug(f"Validating result for task: {task_id}")
+    status = status_dict.get(task_id)
+    if status.status == 'Completed' and status.result_type == 'polars':
+        try:
+            pl.scan_ipc(status.file_ref)
+            logger.debug(f"Validation successful for task: {task_id}")
+            return True
+        except Exception as e:
+            logger.error(f"Validation failed for task {task_id}: {str(e)}")
+            return False
+    return True
+
+
+@router.get('/status/{task_id}', response_model=models.Status)
+def get_status(task_id: str) -> models.Status:
+    """Get status of a task by ID and validate its result if completed.
+
+    Args:
+        task_id: Unique identifier of the task
+
+    Returns:
+        models.Status: Current status of the task
+
+    Raises:
+        HTTPException: If task not found or invalid result
+    """
+    logger.debug(f"Getting status for task: {task_id}")
+    status = status_dict.get(task_id)
+    if status is None:
+        logger.warning(f"Task not found: {task_id}")
+        raise HTTPException(status_code=404, detail="Task not found")
+    result_valid = validate_result(task_id)
+    if not result_valid:
+        logger.error(f"Invalid result for task: {task_id}")
+        raise HTTPException(status_code=404, detail="Task not found")
+    return status
+
+
+@router.get("/fetch_results/{task_id}")
+async def fetch_results(task_id: str):
+    """Fetch results for a completed task.
+
+    Args:
+        task_id: Unique identifier of the task
+
+    Returns:
+        dict: Task ID and serialized result data
+
+    Raises:
+        HTTPException: If result not found or error occurred
+    """
+    logger.debug(f"Fetching results for task: {task_id}")
+    status = status_dict.get(task_id)
+    if not status:
+        logger.warning(f"Result not found: {task_id}")
+        raise HTTPException(status_code=404, detail="Result not found")
+    if status.status == "Processing":
+        return Response(status_code=202, content="Result not ready yet")
+    if status.status == "Error":
+        logger.error(f"Task error: {status.error_message}")
+        raise HTTPException(status_code=404, detail=f"An error occurred during processing: {status.error_message}")
+    try:
+        lf = pl.scan_parquet(status.file_ref)
+        return {"task_id": task_id, "result": encodebytes(lf.serialize()).decode()}
+    except Exception as e:
+        logger.error(f"Error reading results: {str(e)}")
+        raise HTTPException(status_code=500, detail="Error reading results")
+
+
+@router.get("/memory_usage/{task_id}")
+async def memory_usage(task_id: str):
+    """Get memory usage for a specific task.
+
+    Args:
+        task_id: Unique identifier of the task
+
+    Returns:
+        dict: Task ID and memory usage data
+
+    Raises:
+        HTTPException: If memory usage data not found
+    """
+    logger.debug(f"Getting memory usage for task: {task_id}")
+    memory_usage = PROCESS_MEMORY_USAGE.get(task_id)
+    if memory_usage is None:
+        logger.warning(f"Memory usage not found: {task_id}")
+        raise HTTPException(status_code=404, detail="Memory usage data not found for this task ID")
+    return {"task_id": task_id, "memory_usage": memory_usage}
+
+
+@router.post("/add_fuzzy_join")
+async def add_fuzzy_join(polars_script: models.FuzzyJoinInput, background_tasks: BackgroundTasks) -> models.Status:
+    """Start a fuzzy join operation between two dataframes.
+
+    Args:
+        polars_script: Input containing left and right dataframes and fuzzy mapping config
+        background_tasks: FastAPI background tasks handler
+
+    Returns:
+        models.Status: Status object for the fuzzy join task
+
+    Raises:
+        HTTPException: If error occurs during setup
+    """
+    logger.info("Starting fuzzy join operation")
+    try:
+        polars_script.task_id = str(uuid.uuid4()) if polars_script.task_id is None else polars_script.task_id
+        polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else CACHE_DIR.name
+        left_serializable_object = polars_script.left_df_operation.polars_serializable_object()
+        right_serializable_object = polars_script.right_df_operation.polars_serializable_object()
+
+        file_path = os.path.join(polars_script.cache_dir, f"{polars_script.task_id}.arrow")
+        status = models.Status(background_task_id=polars_script.task_id, status="Starting", file_ref=file_path,
+                               result_type="polars")
+        status_dict[polars_script.task_id] = status
+        background_tasks.add_task(start_fuzzy_process, left_serializable_object=left_serializable_object,
+                                  right_serializable_object=right_serializable_object,
+                                  file_ref=file_path,
+                                  fuzzy_maps=polars_script.fuzzy_maps,
+                                  task_id=polars_script.task_id,
+                                  flowfile_flow_id=polars_script.flowfile_flow_id,
+                                  flowfile_node_id=polars_script.flowfile_node_id)
+        logger.info(f"Started fuzzy join task: {polars_script.task_id}")
+        return status
+    except Exception as e:
+        logger.error(f"Error in fuzzy join: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/cancel_task/{task_id}")
+def cancel_task(task_id: str):
+    """Cancel a running task by ID.
+
+    Args:
+        task_id: Unique identifier of the task to cancel
+
+    Returns:
+        dict: Success message
+
+    Raises:
+        HTTPException: If task cannot be cancelled
+    """
+    logger.info(f"Attempting to cancel task: {task_id}")
+    if not process_manager.cancel_process(task_id):
+        logger.warning(f"Cannot cancel task: {task_id}")
+        raise HTTPException(status_code=404, detail="Task not found or already completed")
+    with status_dict_lock:
+        if task_id in status_dict:
+            status_dict[task_id].status = "Cancelled"
+    logger.info(f"Successfully cancelled task: {task_id}")
+    return {"message": f"Task {task_id} has been cancelled."}
+
+
+@router.get('/ids')
+async def get_all_ids():
+    """Get list of all task IDs in the system.
+
+    Returns:
+        list: List of all task IDs currently tracked
+    """
+    logger.debug("Fetching all task IDs")
+    ids = [k for k in status_dict.keys()]
+    logger.debug(f"Found {len(ids)} tasks")
+    return ids