Flowfile 0.2.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
flowfile_worker/external_sources/sql_source/models.py
@@ -0,0 +1,72 @@
+ from typing import Optional, Literal
+ from pydantic import BaseModel, SecretStr
+ from flowfile_worker.secrets import decrypt_secret
+
+
+ class DataBaseConnection(BaseModel):
+     """Database connection configuration with secure password handling."""
+     username: Optional[str] = None
+     password: Optional[SecretStr] = None  # Encrypted password
+     host: Optional[str] = None
+     port: Optional[int] = None
+     database: Optional[str] = None  # The database name
+     database_type: str = "postgresql"  # Database type (postgresql, mysql, etc.)
+     url: Optional[str] = None
+
+     def get_decrypted_secret(self) -> SecretStr:
+         return decrypt_secret(self.password.get_secret_value())
+
+     def create_uri(self) -> str:
+         """
+         Creates a database URI based on the connection details.
+         If url is provided, it returns that directly.
+         Otherwise, it constructs a URI from the individual components.
+
+         Returns:
+             str: The database URI
+         """
+         # If URL is already provided, use it
+         if self.url:
+             return self.url
+
+         # Validate that required fields are present
+         if not all([self.host, self.database_type]):
+             raise ValueError("Host and database type are required to create a URI")
+
+         # Create credential part if username is provided
+         credentials = ""
+         if self.username:
+             credentials = self.username
+             if self.password:
+                 # Get the raw password string from SecretStr
+                 password_value = decrypt_secret(self.password.get_secret_value()).get_secret_value()
+                 credentials += f":{password_value}"
+             credentials += "@"
+
+         # Create port part if port is provided
+         port_section = ""
+         if self.port:
+             port_section = f":{self.port}"
+         if self.database:
+
+             base_uri = f"{self.database_type}://{credentials}{self.host}{port_section}/{self.database}"
+         else:
+             base_uri = f"{self.database_type}://{credentials}{self.host}{port_section}"
+         return base_uri
+
+
+ class DatabaseReadSettings(BaseModel):
+     """Settings for SQL source."""
+     connection: DataBaseConnection
+     query: str
+     flowfile_flow_id: int = 1
+     flowfile_node_id: int | str = -1
+
+
+ class DatabaseWriteSettings(BaseModel):
+     """Settings for SQL sink."""
+     connection: DataBaseConnection
+     table_name: str
+     if_exists: Literal['append', 'replace', 'fail'] = 'append'
+     flowfile_flow_id: int = 1
+     flowfile_node_id: int | str = -1
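
For reference, a minimal usage sketch of the DataBaseConnection model above; the connection values are illustrative (not taken from the package), and the password path assumes a secret that was encrypted beforehand by flowfile_worker.secrets.

# Hypothetical usage of the model defined above; all values are illustrative.
conn = DataBaseConnection(
    username="analyst",
    host="localhost",
    port=5432,
    database="analytics",
    database_type="postgresql",
)
# Without a password the credential section is just "analyst@":
assert conn.create_uri() == "postgresql://analyst@localhost:5432/analytics"

# A pre-built url short-circuits every other field:
conn_from_url = DataBaseConnection(url="postgresql://ro_user@db-host/reporting")
assert conn_from_url.create_uri() == "postgresql://ro_user@db-host/reporting"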
flowfile_worker/flow_logger.py
@@ -0,0 +1,58 @@
+ import logging
+ import requests
+ from flowfile_worker.models import RawLogInput
+ from flowfile_worker.configs import FLOWFILE_CORE_URI
+
+ LOGGING_URL = FLOWFILE_CORE_URI + "/raw_logs"
+
+
+ class FlowfileLogHandler(logging.Handler):
+     def __init__(self, flowfile_flow_id: int = 1, flowfile_node_id: int | str = -1):
+         super().__init__()
+         self.flowfile_flow_id = flowfile_flow_id
+         self.flowfile_node_id = flowfile_node_id
+
+     def emit(self, record):
+         try:
+             log_message = self.format(record)
+
+             extra = {"Node Id": self.flowfile_node_id}
+             for k, v in extra.items():
+                 log_message = f"{k}: {v} - {log_message}"
+             raw_log_input = RawLogInput(
+                 flowfile_flow_id=self.flowfile_flow_id,
+                 log_message=log_message,
+                 log_type=record.levelname.upper(),
+                 extra={
+                 }
+             )
+             if self.flowfile_flow_id != -1 and self.flowfile_node_id != -1:
+                 response = requests.post(LOGGING_URL, json=raw_log_input.__dict__,
+                                          headers={"Content-Type": "application/json"})
+                 if response.status_code != 200:
+                     raise Exception(f"Failed to send log: {response.text}")
+         except Exception as e:
+             print(f"Error sending log to {LOGGING_URL}: {e}")
+
+
+ def get_worker_logger(flowfile_flow_id: int, flowfile_node_id: int | str) -> logging.Logger:
+     logger_name = f"NodeLog: {flowfile_node_id}"
+     logger = logging.getLogger(logger_name)
+     logger.propagate = False  # Prevent propagation to parent loggers
+     logger.setLevel(logging.DEBUG)
+
+     # Only add handlers if they don't already exist to avoid duplicates
+     if not logger.handlers:
+         stream_handler = logging.StreamHandler()
+         stream_handler.setLevel(logging.DEBUG)
+         stream_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+         stream_handler.setFormatter(stream_formatter)
+         logger.addHandler(stream_handler)
+
+         http_handler = FlowfileLogHandler(flowfile_flow_id=flowfile_flow_id, flowfile_node_id=flowfile_node_id)
+         http_handler.setLevel(logging.INFO)
+         http_formatter = logging.Formatter('%(message)s')
+         http_handler.setFormatter(http_formatter)
+         logger.addHandler(http_handler)
+
+     return logger
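
A short sketch of how get_worker_logger is presumably used from task code; the flow and node ids below are placeholders. DEBUG records stay on the stream handler, while INFO and above are also forwarded to the core service at /raw_logs through FlowfileLogHandler, and only when both ids differ from -1.

# Hypothetical usage; flow and node ids are placeholders.
worker_logger = get_worker_logger(flowfile_flow_id=1, flowfile_node_id=42)
worker_logger.info("Starting task")      # printed locally and POSTed to FLOWFILE_CORE_URI + "/raw_logs"
worker_logger.debug("verbose details")   # stream handler only; the HTTP handler is set to INFO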
flowfile_worker/funcs.py
@@ -0,0 +1,327 @@
+ import polars as pl
+ import io
+ from typing import List, Dict, Callable
+ from multiprocessing import Array, Value, Queue
+ from flowfile_worker.polars_fuzzy_match.matcher import fuzzy_match_dfs
+ from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
+ from flowfile_worker.flow_logger import get_worker_logger
+ from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
+ from flowfile_worker.external_sources.sql_source.main import write_serialized_df_to_database, write_df_to_database
+ from base64 import encodebytes
+ from logging import Logger
+ import logging
+ import os
+ from flowfile_worker.utils import collect_lazy_frame, collect_lazy_frame_and_get_streaming_info
+
+
+ # 'store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'fuzzy', 'store_sample']
+
+ logging.basicConfig(format='%(asctime)s: %(message)s')
+ logger = logging.getLogger('Spawner')
+ logger.setLevel(logging.INFO)
+
+
+ def fuzzy_join_task(left_serializable_object: bytes, right_serializable_object: bytes,
+                     fuzzy_maps: List[FuzzyMapping], error_message: Array, file_path: str,
+                     progress: Value,
+                     queue: Queue, flowfile_flow_id: int, flowfile_node_id: int | str,
+                     ):
+     flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+     try:
+         flowfile_logger.info("Starting fuzzy join operation")
+         left_df = pl.LazyFrame.deserialize(io.BytesIO(left_serializable_object))
+         right_df = pl.LazyFrame.deserialize(io.BytesIO(right_serializable_object))
+         fuzzy_match_result = fuzzy_match_dfs(left_df, right_df, fuzzy_maps, flowfile_logger)
+         flowfile_logger.info("Fuzzy join operation completed successfully")
+         fuzzy_match_result.write_ipc(file_path)
+         with progress.get_lock():
+             progress.value = 100
+     except Exception as e:
+         error_msg = str(e).encode()[:256]
+         with error_message.get_lock():
+             error_message[:len(error_msg)] = error_msg
+         with progress.get_lock():
+             progress.value = -1
+         flowfile_logger.error(f'Error during fuzzy join operation: {str(e)}')
+     lf = pl.scan_ipc(file_path)
+     number_of_records = collect_lazy_frame(lf.select(pl.len()))[0, 0]
+     flowfile_logger.info(f'Number of records after fuzzy match: {number_of_records}')
+     queue.put(encodebytes(lf.serialize()))
+
+
+ def process_and_cache(polars_serializable_object: io.BytesIO, progress: Value, error_message: Array,
+                       file_path: str, flowfile_logger: Logger) -> bytes:
+     try:
+         lf = pl.LazyFrame.deserialize(polars_serializable_object)
+         collect_lazy_frame(lf).write_ipc(file_path)
+         flowfile_logger.info("Process operation completed successfully")
+         with progress.get_lock():
+             progress.value = 100
+     except Exception as e:
+         error_msg = str(e).encode()[:1024]  # Limit error message length
+         flowfile_logger.error(f'Error during process and cache operation: {str(e)}')
+         with error_message.get_lock():
+             error_message[:len(error_msg)] = error_msg
+         with progress.get_lock():
+             progress.value = -1  # Indicate error
+         return error_msg
+
+
+ def store_sample(polars_serializable_object: bytes,
+                  progress: Value,
+                  error_message: Array,
+                  queue: Queue,
+                  file_path: str,
+                  sample_size: int,
+                  flowfile_flow_id: int,
+                  flowfile_node_id: int | str
+                  ):
+     flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+     flowfile_logger.info("Starting store sample operation")
+     try:
+         lf = pl.LazyFrame.deserialize(io.BytesIO(polars_serializable_object))
+         collect_lazy_frame(lf.limit(sample_size)).write_ipc(file_path)
+         flowfile_logger.info("Store sample operation completed successfully")
+         with progress.get_lock():
+             progress.value = 100
+     except Exception as e:
+         flowfile_logger.error(f'Error during store sample operation: {str(e)}')
+         error_msg = str(e).encode()[:1024]  # Limit error message length
+         with error_message.get_lock():
+             error_message[:len(error_msg)] = error_msg
+         with progress.get_lock():
+             progress.value = -1  # Indicate error
+         return error_msg
+
+
+ def store(polars_serializable_object: bytes, progress: Value, error_message: Array, queue: Queue, file_path: str,
+           flowfile_flow_id: int, flowfile_node_id: int | str):
+     flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+     flowfile_logger.info("Starting store operation")
+     polars_serializable_object_io = io.BytesIO(polars_serializable_object)
+     process_and_cache(polars_serializable_object_io, progress, error_message, file_path, flowfile_logger)
+     lf = pl.scan_ipc(file_path)
+     number_of_records = collect_lazy_frame(lf.select(pl.len()))[0, 0]
+     flowfile_logger.info(f'Number of records processed: {number_of_records}')
+     queue.put(encodebytes(lf.serialize()))
+
+
+ def calculate_schema_logic(df: pl.LazyFrame, optimize_memory: bool = True, flowfile_logger: Logger = None) -> List[Dict]:
+     if flowfile_logger is None:
+         raise ValueError('flowfile_logger is required')
+     schema = df.collect_schema()
+     schema_stats = [dict(column_name=k, pl_datatype=str(v), col_index=i) for i, (k, v) in
+                     enumerate(schema.items())]
+     flowfile_logger.info('Starting to calculate the number of records')
+     collected_streaming_info = collect_lazy_frame_and_get_streaming_info(df.select(pl.len()))
+     n_records = collected_streaming_info.df[0, 0]
+     if n_records < 10_000:
+         flowfile_logger.info('Collecting the whole dataset')
+         df = collect_lazy_frame(df).lazy()
+     if optimize_memory and n_records > 1_000_000:
+         df = df.head(1_000_000)
+     null_cols = [col for col, data_type in schema.items() if data_type is pl.Null]
+     if not (n_records == 0 and df.width == 0):
+         if len(null_cols) == 0:
+             pl_stats = df.describe()
+         else:
+             df = df.drop(null_cols)
+             pl_stats = df.describe()
+         n_unique_per_cols = list(df.select(pl.all().approx_n_unique()).collect(
+             engine="streaming" if collected_streaming_info.streaming_collect_available else "auto").to_dicts()[0].values()
+         )
+         stats_headers = pl_stats.drop_in_place('statistic').to_list()
+         stats = {v['column_name']: v for v in pl_stats.transpose(include_header=True, header_name='column_name',
+                                                                  column_names=stats_headers).to_dicts()}
+         for i, (col_stat, n_unique_values) in enumerate(zip(stats.values(), n_unique_per_cols)):
+             col_stat['n_unique'] = n_unique_values
+             col_stat['examples'] = ', '.join({str(col_stat['min']), str(col_stat['max'])})
+             col_stat['null_count'] = int(float(col_stat['null_count']))
+             col_stat['count'] = int(float(col_stat['count']))
+
+         for schema_stat in schema_stats:
+             deep_stat = stats.get(schema_stat['column_name'])
+             if deep_stat:
+                 schema_stat.update(deep_stat)
+         del df
+     else:
+         schema_stats = []
+     return schema_stats
+
+
+ def calculate_schema(polars_serializable_object: bytes, progress: Value, error_message: Array, queue: Queue,
+                      flowfile_flow_id: int, flowfile_node_id: int | str, *args, **kwargs):
+     polars_serializable_object_io = io.BytesIO(polars_serializable_object)
+     flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+     flowfile_logger.info("Starting schema calculation")
+     try:
+         lf = pl.LazyFrame.deserialize(polars_serializable_object_io)
+         schema_stats = calculate_schema_logic(lf, flowfile_logger=flowfile_logger)
+         flowfile_logger.info('schema_stats', schema_stats)
+         queue.put(schema_stats)
+         flowfile_logger.info("Schema calculation completed successfully")
+         with progress.get_lock():
+             progress.value = 100
+     except Exception as e:
+         error_msg = str(e).encode()[:256]  # Limit error message length
+         flowfile_logger.error('error', e)
+         with error_message.get_lock():
+             error_message[:len(error_msg)] = error_msg
+         with progress.get_lock():
+             progress.value = -1  # Indicate error
+
+
+ def calculate_number_of_records(polars_serializable_object: bytes, progress: Value, error_message: Array,
+                                 queue: Queue, flowfile_flow_id: int, *args, **kwargs):
+     flowfile_logger = get_worker_logger(flowfile_flow_id, -1)
+     flowfile_logger.info("Starting number of records calculation")
+     polars_serializable_object_io = io.BytesIO(polars_serializable_object)
+     try:
+         lf = pl.LazyFrame.deserialize(polars_serializable_object_io)
+         n_records = collect_lazy_frame(lf.select(pl.len()))[0, 0]
+         queue.put(n_records)
+         flowfile_logger.debug("Number of records calculation completed successfully")
+         flowfile_logger.debug(f'n_records {n_records}')
+         with progress.get_lock():
+             progress.value = 100
+     except Exception as e:
+         flowfile_logger.error('error', e)
+         error_msg = str(e).encode()[:256]  # Limit error message length
+         with error_message.get_lock():
+             error_message[:len(error_msg)] = error_msg
+         with progress.get_lock():
+             progress.value = -1  # Indicate error
+         return b'error'
+
+
+ def execute_write_method(write_method: Callable, path: str, data_type: str = None, sheet_name: str = None,
+                          delimiter: str = None,
+                          write_mode: str = 'create', flowfile_logger: Logger = None):
+     flowfile_logger.info('executing write method')
+     if data_type == 'excel':
+         logger.info('Writing as excel file')
+         write_method(path, worksheet=sheet_name)
+     elif data_type == 'csv':
+         logger.info('Writing as csv file')
+         if write_mode == 'append':
+             with open(path, 'ab') as f:
+                 write_method(file=f, separator=delimiter, quote_style='always')
+         else:
+             write_method(file=path, separator=delimiter, quote_style='always')
+     elif data_type == 'parquet':
+         logger.info('Writing as parquet file')
+         write_method(path)
+
+
+ def write_to_database(polars_serializable_object: bytes,
+                       progress: Value,
+                       error_message: Array,
+                       queue: Queue,
+                       file_path: str,
+                       database_write_settings: DatabaseWriteSettings,
+                       flowfile_flow_id: int = -1,
+                       flowfile_node_id: int | str = -1
+                       ):
+     """
+     Writes a Polars DataFrame to a SQL database.
+     """
+     flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+     flowfile_logger.info(f"Starting write operation to: {database_write_settings.table_name}")
+     df = collect_lazy_frame(pl.LazyFrame.deserialize(io.BytesIO(polars_serializable_object)))
+     flowfile_logger.info(f"Starting to write {len(df)} records")
+     try:
+         write_df_to_database(df, database_write_settings)
+         flowfile_logger.info("Write operation completed successfully")
+         with progress.get_lock():
+             progress.value = 100
+     except Exception as e:
+         error_msg = str(e).encode()[:1024]
+         flowfile_logger.error(f'Error during write operation: {str(e)}')
+         with error_message.get_lock():
+             error_message[:len(error_msg)] = error_msg
+         with progress.get_lock():
+             progress.value = -1
+
+
+ def write_output(polars_serializable_object: bytes,
+                  progress: Value,
+                  error_message: Array,
+                  queue: Queue,
+                  file_path: str,
+                  data_type: str,
+                  path: str,
+                  write_mode: str,
+                  sheet_name: str = None,
+                  delimiter: str = None,
+                  flowfile_flow_id: int = -1,
+                  flowfile_node_id: int | str = -1
+                  ):
+     flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+     flowfile_logger.info(f"Starting write operation to: {path}")
+     try:
+         df = pl.LazyFrame.deserialize(io.BytesIO(polars_serializable_object))
+         if isinstance(df, pl.LazyFrame):
+             flowfile_logger.info(f'Execution plan explanation:\n{df.explain(format="plain")}')
+         flowfile_logger.info("Successfully deserialized dataframe")
+         is_lazy = False
+         sink_method_str = 'sink_' + data_type
+         write_method_str = 'write_' + data_type
+         has_sink_method = hasattr(df, sink_method_str)
+         write_method = None
+         if os.path.exists(path) and write_mode == 'create':
+             raise Exception('File already exists')
+         if has_sink_method and is_lazy:
+             write_method = getattr(df, 'sink_' + data_type)
+         elif not is_lazy or not has_sink_method:
+             if isinstance(df, pl.LazyFrame):
+                 df = collect_lazy_frame(df)
+             write_method = getattr(df, write_method_str)
+         if write_method is not None:
+             execute_write_method(write_method, path=path, data_type=data_type, sheet_name=sheet_name,
+                                  delimiter=delimiter, write_mode=write_mode, flowfile_logger=flowfile_logger)
+             number_of_records_written = (collect_lazy_frame(df.select(pl.len()))[0, 0]
+                                          if isinstance(df, pl.LazyFrame) else df.height)
+             flowfile_logger.info(f'Number of records written: {number_of_records_written}')
+         else:
+             raise Exception('Write method not found')
+         with progress.get_lock():
+             progress.value = 100
+     except Exception as e:
+         logger.info(f'Error during write operation: {str(e)}')
+         error_message[:len(str(e))] = str(e).encode()
+
+
+ def generic_task(func: Callable,
+                  progress: Value,
+                  error_message: Array,
+                  queue: Queue,
+                  file_path: str,
+                  flowfile_flow_id: int,
+                  flowfile_node_id: int | str,
+                  *args, **kwargs):
+     print(kwargs)
+     flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+     flowfile_logger.info("Starting generic task")
+     try:
+         df = func(*args, **kwargs)
+         if isinstance(df, pl.LazyFrame):
+             collect_lazy_frame(df).write_ipc(file_path)
+         elif isinstance(df, pl.DataFrame):
+             df.write_ipc(file_path)
+         else:
+             raise Exception('Returned object is not a DataFrame or LazyFrame')
+         with progress.get_lock():
+             progress.value = 100
+         flowfile_logger.info("Task completed successfully")
+     except Exception as e:
+         flowfile_logger.error(f'Error during task execution: {str(e)}')
+         error_msg = str(e).encode()[:1024]
+         with error_message.get_lock():
+             error_message[:len(error_msg)] = error_msg
+         with progress.get_lock():
+             progress.value = -1
+
+     lf = pl.scan_ipc(file_path)
+     number_of_records = collect_lazy_frame(lf.select(pl.len()))[0, 0]
+     flowfile_logger.info(f'Number of records processed: {number_of_records}')
+     queue.put(encodebytes(lf.serialize()))
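
Each of these functions is meant to run in a child process and report back through shared-memory primitives and a queue; the actual orchestration lives in flowfile_worker/spawner.py and routes.py, which are not shown in this diff. A minimal standalone sketch of driving store, under the assumption that LazyFrame.serialize() returns bytes in the Polars version in use:

# Hypothetical driver; real orchestration happens in spawner.py / routes.py.
import io
from base64 import decodebytes
from multiprocessing import Array, Value, Queue

import polars as pl

from flowfile_worker.funcs import store

progress = Value('i', 0)           # the task sets this to 100 on success, -1 on error
error_message = Array('c', 1024)   # the task copies the (truncated) error text here
queue = Queue()                    # the task puts a base64-encoded serialized LazyFrame here

serialized_plan = pl.LazyFrame({"a": [1, 2, 3]}).serialize()
store(serialized_plan, progress, error_message, queue, "/tmp/example.ipc",
      flowfile_flow_id=1, flowfile_node_id=1)

if progress.value == 100:
    result_lf = pl.LazyFrame.deserialize(io.BytesIO(decodebytes(queue.get())))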
flowfile_worker/main.py
@@ -0,0 +1,108 @@
+ import asyncio
+ import uvicorn
+ import signal
+
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI
+ from flowfile_worker.routes import router
+ from flowfile_worker import mp_context, CACHE_DIR
+ from flowfile_worker.configs import logger, FLOWFILE_CORE_URI, SERVICE_HOST, SERVICE_PORT
+
+
+ should_exit = False
+ server_instance = None
+
+
+ @asynccontextmanager
+ async def shutdown_handler(app: FastAPI):
+     """Handle application startup and shutdown"""
+     logger.info('Starting application...')
+     try:
+         yield
+     finally:
+         logger.info('Shutting down application...')
+         logger.info("Cleaning up worker resources...")
+         for p in mp_context.active_children():
+             try:
+                 p.terminate()
+                 p.join()
+             except Exception as e:
+                 logger.error(f"Error cleaning up process: {e}")
+
+         try:
+             CACHE_DIR.cleanup()
+         except Exception as e:
+             print(f"Error cleaning up cache directory: {e}")
+
+         await asyncio.sleep(0.1)
+
+
+ app = FastAPI(lifespan=shutdown_handler)
+ app.include_router(router)
+
+
+ @app.post("/shutdown")
+ async def shutdown():
+     """Endpoint to handle graceful shutdown"""
+     if server_instance:
+         # Schedule the shutdown
+         await asyncio.create_task(trigger_shutdown())
+     return {"message": "Shutting down"}
+
+
+ async def trigger_shutdown():
+     """Trigger the actual shutdown after responding to the client"""
+     await asyncio.sleep(1)  # Give time for the response to be sent
+     if server_instance:
+         server_instance.should_exit = True
+
+
+ def signal_handler(signum, frame):
+     """Handle shutdown signals"""
+     logger.info(f"Received signal {signum}")
+     if server_instance:
+         server_instance.should_exit = True
+
+
+ def run(host: str = None, port: int = None):
+     """Run the FastAPI app with graceful shutdown"""
+     global server_instance
+
+     # Use values from settings if not explicitly provided
+     if host is None:
+         host = SERVICE_HOST
+     if port is None:
+         port = SERVICE_PORT
+
+     # Log service configuration
+     logger.info(f"Starting worker service on {host}:{port}")
+     logger.info(f"Core service configured at {FLOWFILE_CORE_URI}")
+
+     signal.signal(signal.SIGTERM, signal_handler)
+     signal.signal(signal.SIGINT, signal_handler)
+
+     config = uvicorn.Config(
+         app,
+         host=host,
+         port=port,
+         loop="asyncio"
+     )
+     server = uvicorn.Server(config)
+     server_instance = server  # Store server instance globally
+
+     logger.info('Starting server...')
+     logger.info('Server started')
+
+     try:
+         server.run()
+     except KeyboardInterrupt:
+         logger.info("Received interrupt signal, shutting down...")
+     finally:
+         server_instance = None
+         logger.info("Server shutdown complete")
+
+
+ if __name__ == "__main__":
+     import multiprocessing
+     multiprocessing.freeze_support()
+     run()
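
The worker can presumably be started programmatically through run() above (or via the console scripts declared in entry_points.txt) and stopped over HTTP with the /shutdown endpoint; a small sketch with illustrative host and port values:

# Hypothetical launch and shutdown; host/port values are illustrative.
from flowfile_worker.main import run

run(host="127.0.0.1", port=8000)   # falls back to SERVICE_HOST / SERVICE_PORT when omitted

# From another process, request a graceful shutdown:
# import requests
# requests.post("http://127.0.0.1:8000/shutdown")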
flowfile_worker/models.py
@@ -0,0 +1,95 @@
+ from pydantic import BaseModel
+ from typing import Optional, Literal, Any
+ from base64 import decodebytes
+ from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
+ from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
+
+
+ OperationType = Literal[
+     'store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'fuzzy', 'store_sample',
+     'write_to_database']
+ ResultType = Literal['polars', 'other']
+
+
+ class PolarsOperation(BaseModel):
+     operation: bytes
+     flowfile_flow_id: Optional[int] = 1
+     flowfile_node_id: Optional[int | str] = -1
+     def polars_serializable_object(self):
+         return decodebytes(self.operation)
+
+
+ class PolarsScript(PolarsOperation):
+     task_id: Optional[str] = None
+     cache_dir: Optional[str] = None
+     operation_type: OperationType
+
+
+ class PolarsScriptSample(PolarsScript):
+     sample_size: Optional[int] = 100
+
+
+ class PolarsScriptWrite(BaseModel):
+     operation: bytes
+     data_type: str
+     path: str
+     write_mode: str
+     sheet_name: Optional[str] = None
+     delimiter: Optional[str] = None
+     flowfile_flow_id: Optional[int] = -1
+     flowfile_node_id: Optional[int | str] = -1
+
+     def polars_serializable_object(self):
+         return decodebytes(self.operation)
+
+
+ class DatabaseScriptWrite(DatabaseWriteSettings):
+     operation: bytes
+
+     def polars_serializable_object(self):
+         return decodebytes(self.operation)
+
+     def get_database_write_settings(self) -> DatabaseWriteSettings:
+         """
+         Converts the current instance to a DatabaseWriteSettings object.
+         Returns:
+             DatabaseWriteSettings: The corresponding DatabaseWriteSettings object.
+         """
+
+         return DatabaseWriteSettings(
+             connection=self.connection,
+             table_name=self.table_name,
+             if_exists=self.if_exists,
+             flowfile_flow_id=self.flowfile_flow_id,
+             flowfile_node_id=self.flowfile_node_id
+         )
+
+
+ class FuzzyJoinInput(BaseModel):
+     task_id: Optional[str] = None
+     cache_dir: Optional[str] = None
+     left_df_operation: PolarsOperation
+     right_df_operation: PolarsOperation
+     fuzzy_maps: list[FuzzyMapping]
+     flowfile_flow_id: Optional[int] = 1
+     flowfile_node_id: Optional[int | str] = -1
+
+
+ class Status(BaseModel):
+     background_task_id: str
+     status: Literal['Processing', 'Completed', 'Error', 'Unknown Error', 'Starting']  # Type alias for status
+     file_ref: str
+     progress: Optional[int] = 0
+     error_message: Optional[str] = None  # Add error_message field
+     results: Optional[Any] = None
+     result_type: Optional[ResultType] = 'polars'
+
+     def __hash__(self):
+         return hash(self.file_ref)
+
+
+ class RawLogInput(BaseModel):
+     flowfile_flow_id: int
+     log_message: str
+     log_type: Literal["INFO", "ERROR"]
+     extra: Optional[dict] = None
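
A sketch of how these models carry a serialized Polars plan to the task functions in funcs.py; the base64 round-trip mirrors polars_serializable_object(), the concrete values are illustrative, and it assumes a Polars version where LazyFrame.serialize() returns bytes.

# Hypothetical payload construction; values are illustrative.
from base64 import encodebytes

import polars as pl

from flowfile_worker.models import PolarsScript

plan = pl.LazyFrame({"a": [1, 2, 3]})
script = PolarsScript(
    operation=encodebytes(plan.serialize()),  # base64 payload
    operation_type="store",
    flowfile_flow_id=1,
    flowfile_node_id=1,
)
raw_plan = script.polars_serializable_object()  # bytes that funcs.store() can deserialize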