flowfile-0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
flowfile_worker/polars_fuzzy_match/process.py
@@ -0,0 +1,86 @@
+ import polars as pl
+ import polars_distance as pld
+ from flowfile_worker.polars_fuzzy_match.utils import cache_polars_frame_to_temp
+ from flowfile_worker.utils import collect_lazy_frame
+ from flowfile_worker.polars_fuzzy_match.models import FuzzyTypeLiteral
+
+
+ def calculate_fuzzy_score(mapping_table: pl.LazyFrame, left_col_name: str, right_col_name: str,
+                           fuzzy_method: FuzzyTypeLiteral, th_score: float) -> pl.LazyFrame:
+     """
+     Calculate fuzzy matching scores between columns in a LazyFrame.
+
+     Args:
+         mapping_table: The LazyFrame containing the columns to compare
+         left_col_name: Name of the left column for comparison
+         right_col_name: Name of the right column for comparison
+         fuzzy_method: Type of fuzzy matching algorithm to use
+         th_score: Upper bound on the normalized distance; rows above it are dropped
+             before the distance is converted to a similarity (1 - distance)
+
+     Returns:
+         A LazyFrame with fuzzy matching scores in column 's'
+     """
+     mapping_table = mapping_table.with_columns(pl.col(left_col_name).str.to_lowercase().alias('left'),
+                                                pl.col(right_col_name).str.to_lowercase().alias('right'))
+     dist_col = pld.DistancePairWiseString(pl.col('left'))
+     if fuzzy_method in ("jaro_winkler",):  # one-element tuple; a bare string here would test substring membership
+         fm_method = getattr(dist_col, fuzzy_method)(pl.col('right')).alias('s')
+     else:
+         fm_method = getattr(dist_col, fuzzy_method)(pl.col('right'), normalized=True).alias('s')
+     return (mapping_table.with_columns(fm_method)
+             .drop(['left', 'right'])
+             .filter(pl.col('s') <= th_score)
+             .with_columns((1 - pl.col('s')).alias('s')))
+
+
+ def process_fuzzy_frames(left_df: pl.LazyFrame, right_df: pl.LazyFrame, left_col_name: str, right_col_name: str,
+                          temp_dir_ref: str):
+     """
+     Process the left and right data frames into fuzzy frames, cache them temporarily,
+     and swap them based on their lengths.
+
+     Args:
+         left_df (pl.LazyFrame): The left data frame.
+         right_df (pl.LazyFrame): The right data frame.
+         left_col_name (str): Name of the join column in the left frame.
+         right_col_name (str): Name of the join column in the right frame.
+         temp_dir_ref (str): A reference to the temporary directory for caching frames.
+
+     Returns:
+         Tuple[pl.LazyFrame, pl.LazyFrame, str, str, int, int]: The processed left and right fuzzy
+         frames, their respective column names, and their lengths.
+     """
+     # Group by the join column so each unique value carries the list of row indices it maps to
+     left_fuzzy_frame = cache_polars_frame_to_temp(left_df.group_by(left_col_name).agg('__left_index')
+                                                   .filter(pl.col(left_col_name).is_not_null()), temp_dir_ref)
+     right_fuzzy_frame = cache_polars_frame_to_temp(right_df.group_by(right_col_name).agg('__right_index')
+                                                    .filter(pl.col(right_col_name).is_not_null()), temp_dir_ref)
+     # Calculate lengths of the fuzzy frames
+     len_left_df = collect_lazy_frame(left_fuzzy_frame.select(pl.len()))[0, 0]
+     len_right_df = collect_lazy_frame(right_fuzzy_frame.select(pl.len()))[0, 0]
+
+     # Swap the frames and column names if the right frame is larger
+     if len_left_df < len_right_df:
+         left_fuzzy_frame, right_fuzzy_frame = right_fuzzy_frame, left_fuzzy_frame
+         left_col_name, right_col_name = right_col_name, left_col_name
+
+     # Return the processed frames, column names and lengths
+     return left_fuzzy_frame, right_fuzzy_frame, left_col_name, right_col_name, len_left_df, len_right_df
+
+
+ def calculate_and_parse_fuzzy(mapping_table: pl.LazyFrame, left_col_name: str, right_col_name: str,
+                               fuzzy_method: FuzzyTypeLiteral, th_score: float) -> pl.LazyFrame:
+     """
+     Calculate fuzzy scores and explode the index lists for further processing.
+
+     Args:
+         mapping_table: The LazyFrame containing the columns to compare
+         left_col_name: Name of the left column for comparison
+         right_col_name: Name of the right column for comparison
+         fuzzy_method: Type of fuzzy matching algorithm to use
+         th_score: Distance threshold passed through to calculate_fuzzy_score
+
+     Returns:
+         A LazyFrame with exploded indices and fuzzy scores
+     """
+     return (calculate_fuzzy_score(mapping_table, left_col_name, right_col_name, fuzzy_method, th_score)
+             .select(pl.col('s'), pl.col('__left_index'), pl.col('__right_index'))
+             .explode(pl.col('__left_index'))
+             .explode(pl.col('__right_index')))
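
A minimal usage sketch for calculate_fuzzy_score follows. The toy mapping table, the column names, and the choice of 'levenshtein' as the fuzzy_method are illustrative assumptions, not taken from the package; the '__left_index'/'__right_index' list columns mirror what process_fuzzy_frames produces.

# Hypothetical sketch, assuming polars and polars-distance are installed and
# 'levenshtein' is a valid FuzzyTypeLiteral value.
import polars as pl
from flowfile_worker.polars_fuzzy_match.process import calculate_fuzzy_score

mapping_table = pl.LazyFrame({
    "name_left": ["Jon Smith", "Ann Lee"],    # toy data
    "name_right": ["John Smith", "Anna Li"],
    "__left_index": [[0], [1]],               # index lists as produced upstream
    "__right_index": [[0], [1]],
})

# Keep pairs whose normalized distance is at most 0.4; 's' then becomes 1 - distance
scored = calculate_fuzzy_score(mapping_table, "name_left", "name_right",
                               fuzzy_method="levenshtein", th_score=0.4)
print(scored.collect())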
flowfile_worker/polars_fuzzy_match/utils.py
@@ -0,0 +1,50 @@
+ import polars as pl
+ from flowfile_worker.configs import logger
+ from flowfile_worker.utils import collect_lazy_frame
+ import os
+ import uuid
+
+
+ def write_polars_frame(_df: pl.LazyFrame | pl.DataFrame, path: str,
+                        estimated_size: int = 0) -> bool:
+     is_lazy = isinstance(_df, pl.LazyFrame)
+     logger.info('Caching data frame')
+     if is_lazy and estimated_size > 0:
+         # Collect eagerly when the estimated size is below roughly 8 GB
+         fit_memory = estimated_size / 1024 / 1000 / 1000 < 8
+         if fit_memory:
+             _df = _df.collect()
+             is_lazy = False
+
+     if is_lazy:
+         logger.info("Writing in memory efficient mode")
+         try:
+             _df.sink_ipc(path)
+             return True
+         except Exception:
+             # Streaming write failed; fall back to collecting the frame below
+             pass
+         _df = collect_lazy_frame(_df)
+     try:
+         _df.write_ipc(path)
+         return True
+     except Exception as e:
+         logger.error(f'Error while caching the data frame: {e}')
+         return False
+
+
+ def cache_polars_frame_to_temp(_df: pl.LazyFrame | pl.DataFrame, tempdir: str | None = None) -> pl.LazyFrame:
+     path = os.path.join(tempdir, str(uuid.uuid4()))
+     result = write_polars_frame(_df, path)
+     if result:
+         df = pl.read_ipc(path)
+         return df.lazy()
+     else:
+         raise Exception('Could not cache the data')
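
For orientation, a small sketch of how cache_polars_frame_to_temp might be used; the temporary-directory handling here is an assumption for illustration.

# Hypothetical sketch: cache a frame to an Arrow IPC file and get it back lazily.
import tempfile
import polars as pl
from flowfile_worker.polars_fuzzy_match.utils import cache_polars_frame_to_temp

lf = pl.LazyFrame({"a": [1, 2, 3]})
with tempfile.TemporaryDirectory() as tmp:
    # write_polars_frame materializes the frame under tmp; the result is read
    # back into memory and returned as a LazyFrame
    cached = cache_polars_frame_to_temp(lf, tmp)
    print(cached.collect())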
flowfile_worker/process_manager.py
@@ -0,0 +1,36 @@
+ from threading import Lock
+ from multiprocessing import Process
+ from typing import Dict
+
+
+ class ProcessManager:
+     def __init__(self):
+         self.process_dict: Dict[str, Process] = {}
+         self.lock = Lock()
+
+     def add_process(self, task_id: str, process: Process):
+         """Add a process to the manager."""
+         with self.lock:
+             self.process_dict[task_id] = process
+
+     def get_process(self, task_id: str) -> Process | None:
+         """Retrieve a process by its task ID, or None if it is not tracked."""
+         with self.lock:
+             return self.process_dict.get(task_id)
+
+     def remove_process(self, task_id: str):
+         """Remove a process from the manager by its task ID."""
+         with self.lock:
+             self.process_dict.pop(task_id, None)
+
+     def cancel_process(self, task_id: str) -> bool:
+         """Cancel a running process by its task ID."""
+         with self.lock:
+             process = self.process_dict.get(task_id)
+             if process:
+                 # Terminate the process, wait for it to exit, then untrack it
+                 process.terminate()
+                 process.join()
+                 self.process_dict.pop(task_id, None)
+                 return True
+             return False
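
A short sketch of the ProcessManager lifecycle; the task function and task ID are made up for illustration.

# Hypothetical sketch of the ProcessManager lifecycle.
import time
from multiprocessing import Process
from flowfile_worker.process_manager import ProcessManager

def long_running_task():  # stand-in for a worker computation
    time.sleep(60)

if __name__ == "__main__":
    manager = ProcessManager()
    proc = Process(target=long_running_task)
    proc.start()
    manager.add_process("task-123", proc)  # register under a task ID

    # Later (e.g. from the /cancel_task/{task_id} route): terminate, join, untrack
    print(manager.cancel_process("task-123"))  # True if the task was tracked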
flowfile_worker/routes.py
@@ -0,0 +1,440 @@
+ import polars as pl
+ import uuid
+ import os
+ from fastapi import APIRouter, HTTPException, Response, BackgroundTasks
+ from typing import Dict
+ from base64 import encodebytes
+
+ from flowfile_worker import status_dict, CACHE_DIR, PROCESS_MEMORY_USAGE, status_dict_lock
+ from flowfile_worker import models
+ from flowfile_worker.spawner import start_process, start_fuzzy_process, start_generic_process, process_manager
+ from flowfile_worker.create import table_creator_factory_method, received_table_parser, FileType
+ from flowfile_worker.configs import logger
+ from flowfile_worker.external_sources.airbyte_sources.models import AirbyteSettings
+ from flowfile_worker.external_sources.sql_source.models import DatabaseReadSettings
+ from flowfile_worker.external_sources.sql_source.main import read_sql_source, write_serialized_df_to_database
+ from flowfile_worker.external_sources.airbyte_sources.main import read_airbyte_source
+
+
+ router = APIRouter()
+
+
+ @router.post("/submit_query/")
+ def submit_query(polars_script: models.PolarsScript, background_tasks: BackgroundTasks) -> models.Status:
+     logger.info(f"Processing query with operation: {polars_script.operation_type}")
+
+     try:
+         polars_script.task_id = str(uuid.uuid4()) if polars_script.task_id is None else polars_script.task_id
+         polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else CACHE_DIR.name
+         polars_serializable_object = polars_script.polars_serializable_object()
+         file_path = os.path.join(polars_script.cache_dir, f"{polars_script.task_id}.arrow")
+         result_type = "polars" if polars_script.operation_type == "store" else "other"
+         status = models.Status(background_task_id=polars_script.task_id, status="Starting", file_ref=file_path,
+                                result_type=result_type)
+         status_dict[polars_script.task_id] = status
+         background_tasks.add_task(start_process, polars_serializable_object=polars_serializable_object,
+                                   task_id=polars_script.task_id, operation=polars_script.operation_type,
+                                   file_ref=file_path, flowfile_flow_id=polars_script.flowfile_flow_id,
+                                   flowfile_node_id=polars_script.flowfile_node_id,
+                                   kwargs={})
+         logger.info(f"Started background task: {polars_script.task_id}")
+         return status
+
+     except Exception as e:
+         logger.error(f"Error processing query: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post('/store_sample/')
+ def store_sample(polars_script: models.PolarsScriptSample, background_tasks: BackgroundTasks) -> models.Status:
+     logger.info(f"Processing sample storage with size: {polars_script.sample_size}")
+
+     try:
+         polars_script.task_id = str(uuid.uuid4()) if polars_script.task_id is None else polars_script.task_id
+         polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else CACHE_DIR.name
+         polars_serializable_object = polars_script.polars_serializable_object()
+
+         file_path = os.path.join(polars_script.cache_dir, f"{polars_script.task_id}.arrow")
+         status = models.Status(background_task_id=polars_script.task_id, status="Starting", file_ref=file_path,
+                                result_type="other")
+         status_dict[polars_script.task_id] = status
+
+         background_tasks.add_task(start_process, polars_serializable_object=polars_serializable_object,
+                                   task_id=polars_script.task_id, operation=polars_script.operation_type,
+                                   file_ref=file_path, flowfile_flow_id=polars_script.flowfile_flow_id,
+                                   flowfile_node_id=polars_script.flowfile_node_id,
+                                   kwargs={'sample_size': polars_script.sample_size})
+         logger.info(f"Started sample storage task: {polars_script.task_id}")
+
+         return status
+
+     except Exception as e:
+         logger.error(f"Error storing sample: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post('/store_database_write_result/')
+ def store_in_database(database_script_write: models.DatabaseScriptWrite, background_tasks: BackgroundTasks) -> models.Status:
+     """
+     Write a polars dataframe to a database.
+
+     Args:
+         database_script_write (models.DatabaseScriptWrite): Contains the dataframe and database write options
+         background_tasks (BackgroundTasks): FastAPI background tasks handler
+
+     Returns:
+         models.Status: Status object tracking the write operation
+     """
+     logger.info("Starting write operation to: database")
+     try:
+         task_id = str(uuid.uuid4())
+         polars_serializable_object = database_script_write.polars_serializable_object()
+         status = models.Status(background_task_id=task_id, status="Starting", file_ref='',
+                                result_type="other")
+         status_dict[task_id] = status
+         background_tasks.add_task(
+             start_process,
+             polars_serializable_object=polars_serializable_object,
+             task_id=task_id,
+             operation="write_to_database",
+             file_ref='',
+             flowfile_flow_id=database_script_write.flowfile_flow_id,
+             flowfile_node_id=database_script_write.flowfile_node_id,
+             kwargs=dict(database_write_settings=database_script_write.get_database_write_settings()),
+         )
+
+         logger.info(f"Started write task: {task_id} to database")
+
+         return status
+
+     except Exception as e:
+         logger.error(f"Error in write operation: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post('/write_results/')
+ def write_results(polars_script_write: models.PolarsScriptWrite, background_tasks: BackgroundTasks) -> models.Status:
+     """
+     Write a polars dataframe to a file in the specified format.
+
+     Args:
+         polars_script_write (models.PolarsScriptWrite): Contains the dataframe and write options
+         background_tasks (BackgroundTasks): FastAPI background tasks handler
+
+     Returns:
+         models.Status: Status object tracking the write operation
+     """
+     logger.info(f"Starting write operation to: {polars_script_write.path}")
+     try:
+         task_id = str(uuid.uuid4())
+         file_path = polars_script_write.path
+         polars_serializable_object = polars_script_write.polars_serializable_object()
+         result_type = "other"
+         status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
+                                result_type=result_type)
+         status_dict[task_id] = status
+         background_tasks.add_task(start_process,
+                                   polars_serializable_object=polars_serializable_object, task_id=task_id,
+                                   operation="write_output",
+                                   file_ref=file_path,
+                                   flowfile_flow_id=polars_script_write.flowfile_flow_id,
+                                   flowfile_node_id=polars_script_write.flowfile_node_id,
+                                   kwargs=dict(
+                                       data_type=polars_script_write.data_type,
+                                       path=polars_script_write.path,
+                                       write_mode=polars_script_write.write_mode,
+                                       sheet_name=polars_script_write.sheet_name,
+                                       delimiter=polars_script_write.delimiter)
+                                   )
+         logger.info(f"Started write task: {task_id} with type: {polars_script_write.data_type}")
+
+         return status
+
+     except Exception as e:
+         logger.error(f"Error in write operation: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post('/store_airbyte_result')
+ def store_airbyte_result(airbyte_settings: AirbyteSettings, background_tasks: BackgroundTasks) -> models.Status:
+     """
+     Store the result of an Airbyte source operation.
+
+     Args:
+         airbyte_settings (AirbyteSettings): Settings for the Airbyte source operation
+         background_tasks (BackgroundTasks): FastAPI background tasks handler
+
+     Returns:
+         models.Status: Status object tracking the Airbyte source operation
+     """
+     logger.info("Processing Airbyte source operation")
+
+     try:
+         task_id = str(uuid.uuid4())
+         file_path = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
+         status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
+                                result_type="polars")
+         status_dict[task_id] = status
+         logger.info(f"Starting Airbyte source task: {task_id}")
+         background_tasks.add_task(start_generic_process, func_ref=read_airbyte_source, file_ref=file_path,
+                                   flowfile_flow_id=airbyte_settings.flowfile_flow_id,
+                                   flowfile_node_id=airbyte_settings.flowfile_node_id,
+                                   task_id=task_id, kwargs=dict(airbyte_settings=airbyte_settings))
+         logger.info(f"Started Airbyte source task: {task_id}")
+
+         return status
+
+     except Exception as e:
+         logger.error(f"Error processing Airbyte source: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post('/store_database_read_result')
+ def store_sql_db_result(database_read_settings: DatabaseReadSettings, background_tasks: BackgroundTasks) -> models.Status:
+     """
+     Store the result of a SQL database read operation.
+
+     Args:
+         database_read_settings (DatabaseReadSettings): Settings for the SQL source operation
+         background_tasks (BackgroundTasks): FastAPI background tasks handler
+
+     Returns:
+         models.Status: Status object tracking the SQL operation
+     """
+     logger.info("Processing database source operation")
+
+     try:
+         task_id = str(uuid.uuid4())
+         file_path = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
+         status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
+                                result_type="polars")
+         status_dict[task_id] = status
+         logger.info(f"Starting reading from database source task: {task_id}")
+         background_tasks.add_task(start_generic_process, func_ref=read_sql_source, file_ref=file_path,
+                                   flowfile_flow_id=database_read_settings.flowfile_flow_id,
+                                   flowfile_node_id=database_read_settings.flowfile_node_id,
+                                   task_id=task_id, kwargs=dict(database_read_settings=database_read_settings))
+         return status
+
+     except Exception as e:
+         logger.error(f"Error processing sql source: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post('/create_table/{file_type}')
+ def create_table(file_type: FileType, received_table: Dict, background_tasks: BackgroundTasks,
+                  flowfile_flow_id: int = 1, flowfile_node_id: int | str = -1) -> models.Status:
+     """
+     Create a Polars table from received dictionary data based on the specified file type.
+
+     Args:
+         file_type (FileType): Type of file/format for table creation
+         received_table (Dict): Raw table data as dictionary
+         background_tasks (BackgroundTasks): FastAPI background tasks handler
+         flowfile_flow_id: Flowfile ID
+         flowfile_node_id: Node ID
+
+     Returns:
+         models.Status: Status object tracking the table creation
+     """
+     logger.info(f"Creating table of type: {file_type}")
+
+     try:
+         task_id = str(uuid.uuid4())
+         file_ref = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
+
+         status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_ref,
+                                result_type="polars")
+         status_dict[task_id] = status
+         func_ref = table_creator_factory_method(file_type)
+         received_table_parsed = received_table_parser(received_table, file_type)
+         background_tasks.add_task(start_generic_process, func_ref=func_ref, file_ref=file_ref,
+                                   task_id=task_id, kwargs={'received_table': received_table_parsed},
+                                   flowfile_flow_id=flowfile_flow_id,
+                                   flowfile_node_id=flowfile_node_id)
+         logger.info(f"Started table creation task: {task_id}")
+
+         return status
+
+     except Exception as e:
+         logger.error(f"Error creating table: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ def validate_result(task_id: str) -> bool | None:
+     """
+     Validate the result of a completed task by checking the IPC file.
+
+     Args:
+         task_id (str): ID of the task to validate
+
+     Returns:
+         bool | None: True if valid, False if error, None if not applicable
+     """
+     logger.debug(f"Validating result for task: {task_id}")
+     status = status_dict.get(task_id)
+     if status is None:
+         return None
+     if status.status == 'Completed' and status.result_type == 'polars':
+         try:
+             pl.scan_ipc(status.file_ref)
+             logger.debug(f"Validation successful for task: {task_id}")
+             return True
+         except Exception as e:
+             logger.error(f"Validation failed for task {task_id}: {str(e)}")
+             return False
+     return True
+
+
+ @router.get('/status/{task_id}', response_model=models.Status)
+ def get_status(task_id: str) -> models.Status:
+     """Get the status of a task by ID and validate its result if completed.
+
+     Args:
+         task_id: Unique identifier of the task
+
+     Returns:
+         models.Status: Current status of the task
+
+     Raises:
+         HTTPException: If the task is not found or its result is invalid
+     """
+     logger.debug(f"Getting status for task: {task_id}")
+     status = status_dict.get(task_id)
+     if status is None:
+         logger.warning(f"Task not found: {task_id}")
+         raise HTTPException(status_code=404, detail="Task not found")
+     result_valid = validate_result(task_id)
+     if not result_valid:
+         logger.error(f"Invalid result for task: {task_id}")
+         raise HTTPException(status_code=404, detail="Task not found")
+     return status
+
+
+ @router.get("/fetch_results/{task_id}")
+ async def fetch_results(task_id: str):
+     """Fetch results for a completed task.
+
+     Args:
+         task_id: Unique identifier of the task
+
+     Returns:
+         dict: Task ID and serialized result data
+
+     Raises:
+         HTTPException: If the result is not found or an error occurred
+     """
+     logger.debug(f"Fetching results for task: {task_id}")
+     status = status_dict.get(task_id)
+     if not status:
+         logger.warning(f"Result not found: {task_id}")
+         raise HTTPException(status_code=404, detail="Result not found")
+     if status.status == "Processing":
+         return Response(status_code=202, content="Result not ready yet")
+     if status.status == "Error":
+         logger.error(f"Task error: {status.error_message}")
+         raise HTTPException(status_code=404, detail=f"An error occurred during processing: {status.error_message}")
+     try:
+         lf = pl.scan_ipc(status.file_ref)  # results are cached as Arrow IPC (see validate_result)
+         return {"task_id": task_id, "result": encodebytes(lf.serialize()).decode()}
+     except Exception as e:
+         logger.error(f"Error reading results: {str(e)}")
+         raise HTTPException(status_code=500, detail="Error reading results")
+
+
+ @router.get("/memory_usage/{task_id}")
+ async def memory_usage(task_id: str):
+     """Get memory usage for a specific task.
+
+     Args:
+         task_id: Unique identifier of the task
+
+     Returns:
+         dict: Task ID and memory usage data
+
+     Raises:
+         HTTPException: If memory usage data is not found
+     """
+     logger.debug(f"Getting memory usage for task: {task_id}")
+     memory_usage = PROCESS_MEMORY_USAGE.get(task_id)
+     if memory_usage is None:
+         logger.warning(f"Memory usage not found: {task_id}")
+         raise HTTPException(status_code=404, detail="Memory usage data not found for this task ID")
+     return {"task_id": task_id, "memory_usage": memory_usage}
+
+
+ @router.post("/add_fuzzy_join")
+ async def add_fuzzy_join(polars_script: models.FuzzyJoinInput, background_tasks: BackgroundTasks) -> models.Status:
+     """Start a fuzzy join operation between two dataframes.
+
+     Args:
+         polars_script: Input containing left and right dataframes and fuzzy mapping config
+         background_tasks: FastAPI background tasks handler
+
+     Returns:
+         models.Status: Status object for the fuzzy join task
+
+     Raises:
+         HTTPException: If an error occurs during setup
+     """
+     logger.info("Starting fuzzy join operation")
+     try:
+         polars_script.task_id = str(uuid.uuid4()) if polars_script.task_id is None else polars_script.task_id
+         polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else CACHE_DIR.name
+         left_serializable_object = polars_script.left_df_operation.polars_serializable_object()
+         right_serializable_object = polars_script.right_df_operation.polars_serializable_object()
+
+         file_path = os.path.join(polars_script.cache_dir, f"{polars_script.task_id}.arrow")
+         status = models.Status(background_task_id=polars_script.task_id, status="Starting", file_ref=file_path,
+                                result_type="polars")
+         status_dict[polars_script.task_id] = status
+         background_tasks.add_task(start_fuzzy_process, left_serializable_object=left_serializable_object,
+                                   right_serializable_object=right_serializable_object,
+                                   file_ref=file_path,
+                                   fuzzy_maps=polars_script.fuzzy_maps,
+                                   task_id=polars_script.task_id,
+                                   flowfile_flow_id=polars_script.flowfile_flow_id,
+                                   flowfile_node_id=polars_script.flowfile_node_id)
+         logger.info(f"Started fuzzy join task: {polars_script.task_id}")
+         return status
+     except Exception as e:
+         logger.error(f"Error in fuzzy join: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post("/cancel_task/{task_id}")
+ def cancel_task(task_id: str):
+     """Cancel a running task by ID.
+
+     Args:
+         task_id: Unique identifier of the task to cancel
+
+     Returns:
+         dict: Success message
+
+     Raises:
+         HTTPException: If the task cannot be cancelled
+     """
+     logger.info(f"Attempting to cancel task: {task_id}")
+     if not process_manager.cancel_process(task_id):
+         logger.warning(f"Cannot cancel task: {task_id}")
+         raise HTTPException(status_code=404, detail="Task not found or already completed")
+     with status_dict_lock:
+         if task_id in status_dict:
+             status_dict[task_id].status = "Cancelled"
+     logger.info(f"Successfully cancelled task: {task_id}")
+     return {"message": f"Task {task_id} has been cancelled."}
+
+
+ @router.get('/ids')
+ async def get_all_ids():
+     """Get a list of all task IDs in the system.
+
+     Returns:
+         list: List of all task IDs currently tracked
+     """
+     logger.debug("Fetching all task IDs")
+     ids = list(status_dict.keys())
+     logger.debug(f"Found {len(ids)} tasks")
+     return ids
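
To close, a hedged client-side sketch of the task lifecycle these routes implement: poll /status/{task_id}, then fetch and decode the result. The worker address and the placeholder task ID are assumptions, and the decode path mirrors fetch_results, which base64-encodes LazyFrame.serialize() output.

# Hypothetical client sketch; BASE_URL and task_id are assumptions, not package defaults.
import io
import time
from base64 import decodebytes

import polars as pl
import requests

BASE_URL = "http://localhost:8000"  # assumed worker address
task_id = "..."                     # a background_task_id returned by one of the submit routes

# Poll until the background process reports a terminal state
while requests.get(f"{BASE_URL}/status/{task_id}").json()["status"] not in ("Completed", "Error", "Cancelled"):
    time.sleep(0.5)

payload = requests.get(f"{BASE_URL}/fetch_results/{task_id}").json()
# Rebuild the LazyFrame from the base64-encoded serialized plan
lf = pl.LazyFrame.deserialize(io.BytesIO(decodebytes(payload["result"].encode())))
print(lf.collect())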