flowfile-0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
flowfile_frame/utils.py
@@ -0,0 +1,184 @@
+ import uuid
+ import time
+ import os
+ import requests
+ import subprocess
+ from pathlib import Path
+ from typing import Iterable, Any, List, Optional
+ from flowfile_core.flowfile.FlowfileFlow import FlowGraph
+ from flowfile_core.schemas import schemas
+ from tempfile import TemporaryDirectory
+
+
+ def _is_iterable(obj: Any) -> bool:
+     # Avoid treating strings as iterables in this context
+     return isinstance(obj, Iterable) and not isinstance(obj, (str, bytes))
+
+
+ def _parse_inputs_as_iterable(
+     inputs: tuple[Any, ...] | tuple[Iterable[Any]],
+ ) -> List[Any]:
+     if not inputs:
+         return []
+
+     # Treat elements of a single iterable as separate inputs
+     if len(inputs) == 1 and _is_iterable(inputs[0]):
+         return list(inputs[0])
+
+     return list(inputs)
+
+
+ def _generate_id() -> int:
+     """Generate a simple unique ID for nodes."""
+     return int(uuid.uuid4().int % 100000)
+
+
+ def create_etl_graph() -> FlowGraph:
+     flow_id = _generate_id()
+     flow_settings = schemas.FlowSettings(
+         flow_id=flow_id,
+         name=f"Flow_{flow_id}",
+         path=f"flow_{flow_id}"
+     )
+     flow_graph = FlowGraph(flow_id=flow_id, flow_settings=flow_settings)
+     # Always create a local frame so that the runtime does not attempt to use the flowfile_worker process
+     flow_graph.flow_settings.execution_location = 'local'
+     return flow_graph
+
+
+ def is_flowfile_running() -> bool:
+     """Check if the Flowfile application is running by testing its API endpoint."""
+     try:
+         response = requests.get("http://0.0.0.0:63578/docs", timeout=2)
+         return response.status_code == 200
+     except (requests.ConnectionError, requests.Timeout):
+         return False
+
+
+ def start_flowfile_application() -> bool:
+     """Start the Flowfile application on macOS."""
+     try:
+         # Attempt to start the Flowfile application
+         subprocess.Popen(['open', '-a', 'Flowfile'],
+                          stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE)
+
+         # Wait for the application to start up (max 10 seconds)
+         start_time = time.time()
+         while time.time() - start_time < 10:
+             if is_flowfile_running():
+                 return True
+             time.sleep(0.5)  # Check every half second
+
+         # If we get here, the app didn't start in time
+         return False
+     except Exception as e:
+         print(f"Error starting Flowfile application: {e}")
+         return False
+
+
+ def get_auth_token() -> Optional[str]:
+     """Get an authentication token from the Flowfile API."""
+     try:
+         response = requests.post(
+             "http://0.0.0.0:63578/auth/token",
+             json={},  # Empty body as specified
+             timeout=5
+         )
+
+         if response.status_code == 200:
+             token_data = response.json()
+             return token_data.get("access_token")
+         else:
+             print(f"Failed to get auth token: {response.status_code} - {response.text}")
+             return None
+     except Exception as e:
+         print(f"Error getting auth token: {e}")
+         return None
+
+
+ def import_flow_to_editor(flow_path: str, auth_token: str) -> Optional[int]:
+     """Import the flow into the Flowfile editor using the API endpoint."""
+     try:
+         flow_path = Path(flow_path).resolve()  # Get absolute path
+         if not flow_path.exists():
+             print(f"Flow file not found: {flow_path}")
+             return None
+
+         # Set authorization header with the token
+         headers = {"Authorization": f"Bearer {auth_token}"}
+
+         # Make a GET request to the import endpoint
+         response = requests.get(
+             "http://0.0.0.0:63578/import_flow/",
+             params={"flow_path": str(flow_path)},
+             headers=headers,
+             timeout=10
+         )
+
+         if response.status_code == 200:
+             flow_id = response.json()
+             print(f"Flow imported successfully with ID: {flow_id}")
+             return flow_id
+         else:
+             print(f"Failed to import flow: {response.status_code} - {response.text}")
+             return None
+     except Exception as e:
+         print(f"Error importing flow: {e}")
+         return None
+
+
+ def open_graph_in_editor(etl_graph: FlowGraph, storage_location: Optional[str] = None) -> bool:
+     """
+     Save the ETL graph and open it in the Flowfile editor.
+
+     Parameters
+     ----------
+     etl_graph : FlowGraph
+         The graph to save and open
+     storage_location : str, optional
+         Where to save the flowfile. If None, a default name is used.
+
+     Returns
+     -------
+     bool
+         True if the graph was successfully opened in the editor, False otherwise
+     """
+     # Create a temporary directory if needed
+     temp_dir = None
+     if storage_location is None:
+         temp_dir = TemporaryDirectory()
+         storage_location = os.path.join(temp_dir.name, 'temp_flow.flowfile')
+     else:
+         # Ensure path is absolute
+         storage_location = os.path.abspath(storage_location)
+
+     etl_graph.apply_layout()
+     etl_graph.save_flow(storage_location)
+     print(f"Flow saved to: {storage_location}")
+
+     # Check if Flowfile is running, and start it if not
+     if not is_flowfile_running():
+         print("Flowfile application is not running. Starting it...")
+         if not start_flowfile_application():
+             print("Failed to start Flowfile application")
+             if temp_dir:
+                 temp_dir.cleanup()
+             return False
+         print("Flowfile application started successfully")
+
+     # Get authentication token
+     auth_token = get_auth_token()
+     if not auth_token:
+         print("Failed to authenticate with Flowfile API")
+         if temp_dir:
+             temp_dir.cleanup()
+         return False
+
+     # Import the flow into the editor
+     flow_id = import_flow_to_editor(storage_location, auth_token)
+
+     # Clean up temporary directory if we created one
+     if temp_dir:
+         temp_dir.cleanup()
+
+     return flow_id is not None
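
For orientation, a minimal usage sketch of the helpers above (the import path is inferred from the file list; in practice nodes would be added through the FlowGraph API before opening):

from flowfile_frame.utils import create_etl_graph, open_graph_in_editor  # assumed module path

graph = create_etl_graph()        # local FlowGraph with a generated flow ID
# ... add nodes/edges via the FlowGraph API here ...
if open_graph_in_editor(graph):   # saves to a temp .flowfile, then imports it over the local API
    print("Graph is now open in the Flowfile editor")

Because open_graph_in_editor defaults to a TemporaryDirectory, the saved .flowfile is removed again once the import call returns.
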
flowfile_worker/__init__.py
@@ -0,0 +1,55 @@
+ from typing import Dict
+ import tempfile
+ import threading
+ import multiprocessing
+ import os
+ import shutil
+ multiprocessing.set_start_method('spawn', force=True)
+
+
+ from multiprocessing import get_context
+ from flowfile_worker.models import Status
+ mp_context = get_context("spawn")
+ status_dict: Dict[str, Status] = dict()
+ process_dict = dict()
+
+ status_dict_lock = threading.Lock()
+ process_dict_lock = threading.Lock()
+
+
+ class SharedTempDirectory:
+     """A class that mimics tempfile.TemporaryDirectory but uses a fixed directory"""
+     def __init__(self, dir_path):
+         self._path = dir_path
+         os.makedirs(self._path, exist_ok=True)
+
+     @property
+     def name(self):
+         return self._path
+
+     def cleanup(self):
+         """Remove all contents of the temp directory"""
+         try:
+             shutil.rmtree(self._path)
+             os.makedirs(self._path, exist_ok=True)
+             print(f"Cleaned up temporary directory: {self._path}")
+         except Exception as e:
+             print(f"Error during cleanup: {e}")
+
+     def __enter__(self):
+         return self.name
+
+     def __exit__(self, exc, value, tb):
+         self.cleanup()
+
+
+ CACHE_EXPIRATION_TIME = 24 * 60 * 60
+
+
+ TEMP_DIR = os.getenv('TEMP_DIR')
+ if TEMP_DIR:
+     CACHE_DIR = SharedTempDirectory(TEMP_DIR)
+ else:
+     CACHE_DIR = tempfile.TemporaryDirectory()
+
+ PROCESS_MEMORY_USAGE: Dict[str, float] = dict()
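
One design choice worth noting: unlike tempfile.TemporaryDirectory, cleanup() empties the fixed directory but leaves the path itself in place, so several worker processes can share one TEMP_DIR. A hedged sketch (the path is hypothetical):

import os
from flowfile_worker import SharedTempDirectory  # assumed import path

with SharedTempDirectory('/tmp/flowfile_cache') as cache_path:
    # created if missing, then used like a normal temp dir
    with open(os.path.join(cache_path, 'payload.bin'), 'wb') as fh:
        fh.write(b'cached bytes')
# on exit, cleanup() deletes the contents and recreates the empty directory
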
flowfile_worker/configs.py
@@ -0,0 +1,95 @@
+ # flowfile_worker.configs
+
+ import logging
+ import platform
+ import argparse
+ import os
+ from connectorx import __version__
+
+
+ # Configure logging
+ logging.basicConfig(format='%(asctime)s: %(message)s')
+ logger = logging.getLogger('FlowfileWorker')
+ logger.setLevel(logging.INFO)
+
+ # Constants for worker and core configuration
+ DEFAULT_SERVICE_HOST = "0.0.0.0" if platform.system() != "Windows" else "127.0.0.1"
+ DEFAULT_SERVICE_PORT = 63579
+ DEFAULT_CORE_HOST = "0.0.0.0" if platform.system() != "Windows" else "127.0.0.1"
+ DEFAULT_CORE_PORT = 63578
+ TEST_MODE = True if 'TEST_MODE' in os.environ else False
+
+
+ def parse_args():
+     """Parse command line arguments"""
+     parser = argparse.ArgumentParser(description="Flowfile Worker Server")
+     parser.add_argument(
+         "--host", type=str, default=DEFAULT_SERVICE_HOST, help="Host to bind worker to"
+     )
+     parser.add_argument(
+         "--port", type=int, default=DEFAULT_SERVICE_PORT, help="Port to bind worker to"
+     )
+     parser.add_argument(
+         "--core-host",
+         type=str,
+         default=DEFAULT_CORE_HOST,
+         help="Host of the core service",
+     )
+     parser.add_argument(
+         "--core-port",
+         type=int,
+         default=DEFAULT_CORE_PORT,
+         help="Port of the core service",
+     )
+
+     # Use known_args to handle PyInstaller's extra args
+     args = parser.parse_known_args()[0]
+
+     # Validate arguments
+     if args.port < 1 or args.port > 65535:
+         raise ValueError(
+             f"Invalid port number: {args.port}. Port must be between 1 and 65535."
+         )
+
+     if args.core_port < 1 or args.core_port > 65535:
+         raise ValueError(
+             f"Invalid core port number: {args.core_port}. Port must be between 1 and 65535."
+         )
+
+     # Check if hosts are valid (basic check)
+     if not args.host:
+         raise ValueError("Worker host cannot be empty")
+
+     if not args.core_host:
+         raise ValueError("Core host cannot be empty")
+
+     return args
+
+
+ def get_core_url(host, port):
+     """
+     Get the core URL based on provided host and port
+
+     Args:
+         host: Core service host
+         port: Core service port
+     """
+     return f"http://{host}:{port}"
+
+
+ # Parse arguments - defaults are already set in the argument parser
+ args = parse_args()
+
+ # These variables will already use defaults from argparse if not provided
+ SERVICE_HOST = args.host
+ SERVICE_PORT = args.port
+ CORE_HOST = args.core_host
+ CORE_PORT = args.core_port
+
+ # Generate the core URI
+ FLOWFILE_CORE_URI = get_core_url(CORE_HOST, CORE_PORT)
+
+ logger.info(f"ConnectorX version: {__version__}")
+ # Log configuration
+ logger.info(f"Worker configured at {SERVICE_HOST}:{SERVICE_PORT}")
+ logger.info(f"Core service configured at {FLOWFILE_CORE_URI}")
flowfile_worker/create/__init__.py
@@ -0,0 +1,37 @@
+ from flowfile_worker.create.models import (ReceivedCsvTable, ReceivedParquetTable, ReceivedExcelTable,
+                                            ReceivedJsonTable)
+ from flowfile_worker.create.funcs import (create_from_path_csv, create_from_path_parquet, create_from_path_excel,
+                                           create_from_path_json)
+ from typing import Dict, Literal
+
+ ReceivedTableCollection = ReceivedCsvTable | ReceivedParquetTable | ReceivedJsonTable | ReceivedExcelTable
+ FileType = Literal['csv', 'parquet', 'json', 'excel']
+
+
+ def received_table_parser(received_table_raw: Dict, file_type: FileType) -> ReceivedTableCollection:
+     match file_type:
+         case 'csv':
+             received_table = ReceivedCsvTable.model_validate(received_table_raw)
+         case 'parquet':
+             received_table = ReceivedParquetTable.model_validate(received_table_raw)
+         case 'excel':
+             received_table = ReceivedExcelTable.model_validate(received_table_raw)
+         case 'json':
+             received_table = ReceivedJsonTable.model_validate(received_table_raw)
+         case _:
+             raise ValueError(f'Unsupported file type: {file_type}')
+     return received_table
+
+
+ def table_creator_factory_method(file_type: FileType) -> callable:
+     match file_type:
+         case 'csv':
+             return create_from_path_csv
+         case 'parquet':
+             return create_from_path_parquet
+         case 'excel':
+             return create_from_path_excel
+         case 'json':
+             return create_from_path_json
+         case _:
+             raise ValueError(f'Unsupported file type: {file_type}')
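
A short example of how the two helpers compose for a CSV payload (field values are made up; the full schema is in the models file shown further down):

raw = {'name': 'orders.csv', 'path': '/data/orders.csv', 'delimiter': ';'}

table = received_table_parser(raw, 'csv')      # validated ReceivedCsvTable
reader = table_creator_factory_method('csv')   # create_from_path_csv
df = reader(table)                             # polars (Lazy)Frame for the file
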
flowfile_worker/create/funcs.py
@@ -0,0 +1,146 @@
+ import polars as pl
+ import os
+
+ from flowfile_worker.create.models import ReceivedCsvTable, ReceivedParquetTable, ReceivedExcelTable
+ from flowfile_worker.create.utils import create_fake_data
+ from flowfile_worker.create.read_excel_tables import df_from_openpyxl, df_from_calamine_xlsx
+
+
+ def create_from_path_json(received_table: ReceivedCsvTable):
+     # Note: JSON input currently reuses the CSV reading logic below
+     # (ReceivedJsonTable subclasses ReceivedCsvTable).
+     f = received_table.abs_file_path
+     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000  # approximate size in GB
+     low_mem = gbs_to_load > 10
+     if received_table.encoding.upper() in ('UTF8', 'UTF-8'):
+         try:
+             df = pl.scan_csv(f,
+                              low_memory=low_mem,
+                              try_parse_dates=True,
+                              separator=received_table.delimiter,
+                              has_header=received_table.has_headers,
+                              skip_rows=received_table.starting_from_line,
+                              encoding='utf8',
+                              infer_schema_length=received_table.infer_schema_length)
+             df.head(1).collect()  # force a tiny eager read to validate the scan
+             return df
+         except Exception:
+             try:
+                 df = pl.scan_csv(f, low_memory=low_mem,
+                                  separator=received_table.delimiter,
+                                  has_header=received_table.has_headers,
+                                  skip_rows=received_table.starting_from_line,
+                                  encoding='utf8-lossy',
+                                  ignore_errors=True)
+                 return df
+             except Exception:
+                 df = pl.scan_csv(f, low_memory=low_mem,
+                                  separator=received_table.delimiter,
+                                  has_header=received_table.has_headers,
+                                  skip_rows=received_table.starting_from_line,
+                                  encoding='utf8',
+                                  ignore_errors=True)
+                 return df
+     else:
+         df = pl.read_csv(f, low_memory=low_mem,
+                          separator=received_table.delimiter,
+                          has_header=received_table.has_headers,
+                          skip_rows=received_table.starting_from_line,
+                          encoding=received_table.encoding,
+                          ignore_errors=True)
+         return df
+
+
+ def create_from_path_csv(received_table: ReceivedCsvTable) -> pl.DataFrame:
+     f = received_table.abs_file_path
+     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000  # approximate size in GB
+     low_mem = gbs_to_load > 10
+     if received_table.encoding.upper() in ('UTF8', 'UTF-8'):
+         try:
+             df = pl.scan_csv(f,
+                              low_memory=low_mem,
+                              try_parse_dates=True,
+                              separator=received_table.delimiter,
+                              has_header=received_table.has_headers,
+                              skip_rows=received_table.starting_from_line,
+                              encoding='utf8',
+                              infer_schema_length=received_table.infer_schema_length)
+             df.head(1).collect()  # force a tiny eager read to validate the scan
+             return df
+         except Exception:
+             try:
+                 df = pl.scan_csv(f, low_memory=low_mem,
+                                  separator=received_table.delimiter,
+                                  has_header=received_table.has_headers,
+                                  skip_rows=received_table.starting_from_line,
+                                  encoding='utf8-lossy',
+                                  ignore_errors=True)
+                 return df
+             except Exception:
+                 df = pl.scan_csv(f, low_memory=low_mem,
+                                  separator=received_table.delimiter,
+                                  has_header=received_table.has_headers,
+                                  skip_rows=received_table.starting_from_line,
+                                  encoding='utf8',
+                                  ignore_errors=True)
+                 return df
+     else:
+         df = pl.read_csv(f,
+                          low_memory=low_mem,
+                          separator=received_table.delimiter,
+                          has_header=received_table.has_headers,
+                          skip_rows=received_table.starting_from_line,
+                          encoding=received_table.encoding,
+                          ignore_errors=True)
+         return df
+
+
+ def create_random(number_of_records: int = 1000) -> pl.LazyFrame:
+     return create_fake_data(number_of_records).lazy()
+
+
+ def create_from_path_parquet(received_table: ReceivedParquetTable):
+     low_mem = (os.path.getsize(received_table.abs_file_path) / 1024 / 1000 / 1000) > 2
+     return pl.scan_parquet(source=received_table.abs_file_path, low_memory=low_mem)
+
+
+ def create_from_path_excel(received_table: ReceivedExcelTable):
+     # Choose the Excel engine based on the requested read options
+     if received_table.type_inference:
+         engine = 'openpyxl'
+     elif received_table.start_row > 0 and received_table.start_column == 0:
+         engine = 'calamine' if received_table.has_headers else 'xlsx2csv'
+     elif received_table.start_column > 0 or received_table.start_row > 0:
+         engine = 'openpyxl'
+     else:
+         engine = 'calamine'
+
+     sheet_name = received_table.sheet_name
+
+     if engine == 'calamine':
+         df = df_from_calamine_xlsx(file_path=received_table.abs_file_path, sheet_name=sheet_name,
+                                    start_row=received_table.start_row, end_row=received_table.end_row)
+         if received_table.end_column > 0:
+             end_col_index = received_table.end_column
+             cols_to_select = [df.columns[i] for i in range(received_table.start_column, end_col_index)]
+             df = df.select(cols_to_select)
+
+     elif engine == 'xlsx2csv':
+         csv_options = {'has_header': received_table.has_headers, 'skip_rows': received_table.start_row}
+         df = pl.read_excel(source=received_table.abs_file_path,
+                            read_options=csv_options,
+                            engine='xlsx2csv',
+                            sheet_name=received_table.sheet_name)
+         end_col_index = received_table.end_column if received_table.end_column > 0 else len(df.columns)
+         cols_to_select = [df.columns[i] for i in range(received_table.start_column, end_col_index)]
+         df = df.select(cols_to_select)
+         if 0 < received_table.end_row < len(df):
+             df = df.head(received_table.end_row)
+
+     else:
+         max_col = received_table.end_column if received_table.end_column > 0 else None
+         max_row = received_table.end_row + 1 if received_table.end_row > 0 else None
+         df = df_from_openpyxl(file_path=received_table.abs_file_path,
+                               sheet_name=received_table.sheet_name,
+                               min_row=received_table.start_row + 1,
+                               min_col=received_table.start_column + 1,
+                               max_row=max_row,
+                               max_col=max_col, has_headers=received_table.has_headers)
+     return df
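
One detail worth calling out: the size check divides the byte count by 1024 * 1000 * 1000, a mixed binary/decimal approximation of gigabytes, so the CSV readers switch to low_memory above roughly 10 GB and the parquet scan above roughly 2 GB:

size_bytes = 12 * 1024 * 1000 * 1000            # a hypothetical ~12 GB file
gbs_to_load = size_bytes / 1024 / 1000 / 1000   # -> 12.0
assert gbs_to_load > 10                         # -> low_memory=True for CSV
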
flowfile_worker/create/models.py
@@ -0,0 +1,86 @@
+ from pydantic import BaseModel, Field, model_validator
+ from typing import List, Optional
+ import os
+ from pathlib import Path
+
+
+ class MinimalFieldInfo(BaseModel):
+     name: str
+     data_type: str
+
+
+ class ReceivedTableBase(BaseModel):
+     id: Optional[int] = None
+     name: str
+     path: str
+     directory: Optional[str] = None
+     analysis_file_available: Optional[bool] = False
+     status: Optional[str] = None
+     file_type: Optional[str] = None
+     fields: List[MinimalFieldInfo] = Field(default_factory=list)
+     abs_file_path: Optional[str] = None
+
+     @classmethod
+     def create_from_path(cls, path: str):
+         filename = os.path.basename(path)
+         return cls(name=filename, path=path)
+
+     @property
+     def file_path(self) -> str:
+         if self.name not in self.path:
+             return os.path.join(self.path, self.name)
+         return self.path
+
+     @model_validator(mode="after")
+     def set_abs_file_path(cls, values):
+         abs_file_path = getattr(values, "abs_file_path", None)
+         if abs_file_path is None:
+             path = getattr(values, "path", None)
+             if not path:
+                 raise ValueError("Field 'path' is required to compute abs_file_path")
+             setattr(values, "abs_file_path", str(Path(path).absolute()))
+         return values
+
+
+ class ReceivedCsvTable(ReceivedTableBase):
+     file_type: Optional[str] = 'csv'
+     reference: Optional[str] = ''
+     starting_from_line: Optional[int] = 0
+     delimiter: Optional[str] = ','
+     has_headers: Optional[bool] = True
+     encoding: Optional[str] = 'utf-8'
+     parquet_ref: Optional[str] = None
+     row_delimiter: Optional[str] = '\n'
+     quote_char: Optional[str] = '"'
+     infer_schema_length: Optional[int] = 10_000
+     truncate_ragged_lines: Optional[bool] = False
+     ignore_errors: Optional[bool] = False
+
+
+ class ReceivedJsonTable(ReceivedCsvTable):
+     pass
+
+
+ class ReceivedParquetTable(ReceivedTableBase):
+     file_type: Optional[str] = 'parquet'
+
+
+ class ReceivedExcelTable(ReceivedTableBase):
+     sheet_name: Optional[str] = None
+     start_row: Optional[int] = 0
+     start_column: Optional[int] = 0
+     end_row: Optional[int] = 0
+     end_column: Optional[int] = 0
+     has_headers: Optional[bool] = True
+     type_inference: Optional[bool] = False
+
+     def validate_range_values(self):
+         # Validate that start and end rows/columns are non-negative integers
+         for attribute in [self.start_row, self.start_column, self.end_row, self.end_column]:
+             if not isinstance(attribute, int) or attribute < 0:
+                 raise ValueError("Row and column indices must be non-negative integers")
+
+         # Validate that start is before end if end is specified (non-zero)
+         if (0 < self.end_row < self.start_row) or \
+            (0 < self.end_column < self.start_column):
+             raise ValueError("Start row/column must not be greater than end row/column if specified")
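
Note that validate_range_values is a plain method rather than a pydantic validator, so nothing runs unless a caller invokes it explicitly. A sketch with made-up values:

tbl = ReceivedExcelTable(name='report.xlsx', path='/data/report.xlsx',
                         start_row=5, end_row=2)   # constructs without error
tbl.validate_range_values()                        # raises ValueError: start after end
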
flowfile_worker/create/pl_types.py
@@ -0,0 +1,35 @@
+ import polars as pl
+
+
+ dtype_to_pl = {
+     'int': pl.Int64,
+     'integer': pl.Int64,
+     'char': pl.String,
+     'fixed decimal': pl.Float32,
+     'double': pl.Float64,
+     'float': pl.Float64,
+     'bool': pl.Boolean,
+     'byte': pl.UInt8,
+     'bit': pl.Binary,
+     'date': pl.Date,
+     'datetime': pl.Datetime,
+     'string': pl.String,
+     'str': pl.String,
+     'time': pl.Time,
+ }
+
+ dtype_to_pl_str = {k: v.__name__ for k, v in dtype_to_pl.items()}
+
+
+ def type_to_polars(dtype: str):
+     pl_datetype = dtype_to_pl.get(dtype.lower())
+     if pl_datetype is not None:
+         return pl_datetype
+     elif hasattr(pl, dtype):
+         return getattr(pl, dtype)
+     else:
+         return pl.String
+
+
+ def type_to_polars_str(dtype: str) -> pl.DataType:
+     return type_to_polars(dtype)()
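
To make the fallback order concrete (exact lowercase match in dtype_to_pl first, then any polars attribute with the original casing, then pl.String), here are a few checks that should hold on recent polars versions:

import polars as pl

assert type_to_polars('double') is pl.Float64    # mapped via dtype_to_pl
assert type_to_polars('Int32') is pl.Int32       # falls through to getattr(pl, 'Int32')
assert type_to_polars('mystery') is pl.String    # default for unknown names
assert isinstance(type_to_polars_str('date'), pl.Date)   # instantiated dtype
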