satif-ai 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- satif_ai/__init__.py +19 -0
- satif_ai/adapters/tidy.py +21 -40
- satif_ai/standardize.py +112 -0
- satif_ai/standardizers/ai.py +481 -0
- satif_ai/standardizers/ai_csv.py +5 -2
- satif_ai/standardizers/ai_xlsx.py +372 -0
- satif_ai/transform.py +155 -0
- satif_ai/{code_builders/transformation.py → transformation_builders/syncpulse.py} +22 -29
- satif_ai/utils/__init__.py +5 -0
- satif_ai/utils/merge_sdif.py +535 -0
- satif_ai/utils/openai_mcp.py +97 -0
- satif_ai/utils/zip.py +120 -0
- {satif_ai-0.2.8.dist-info → satif_ai-0.2.10.dist-info}/METADATA +6 -4
- satif_ai-0.2.10.dist-info/RECORD +20 -0
- satif_ai/code_builders/adaptation.py +0 -9
- satif_ai-0.2.8.dist-info/RECORD +0 -13
- /satif_ai/{code_builders → transformation_builders}/__init__.py +0 -0
- {satif_ai-0.2.8.dist-info → satif_ai-0.2.10.dist-info}/LICENSE +0 -0
- {satif_ai-0.2.8.dist-info → satif_ai-0.2.10.dist-info}/WHEEL +0 -0
- {satif_ai-0.2.8.dist-info → satif_ai-0.2.10.dist-info}/entry_points.txt +0 -0
satif_ai/standardizers/ai_xlsx.py
ADDED
@@ -0,0 +1,372 @@
```python
import logging
import shutil
import tempfile
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    from xlsx_to_sdif.graph import graph as xlsx_graph
    from xlsx_to_sdif.state import State as XLSXState
except ImportError:
    xlsx_graph = None  # type: ignore
    XLSXState = None  # type: ignore
    logging.getLogger(__name__).warning(
        "Failed to import xlsx_to_sdif. AIXLSXStandardizer will not be functional."
    )


from satif_core.standardizers.base import AsyncStandardizer
from satif_core.types import Datasource, SDIFPath, StandardizationResult

from satif_ai.utils.merge_sdif import merge_sdif_files

logger = logging.getLogger(__name__)


class AIXLSXStandardizer(AsyncStandardizer):
    """
    An asynchronous standardizer for XLSX files that leverages the `xlsx-to-sdif` library.

    This standardizer processes one or more XLSX files, converts each to an
    intermediate SDIF (Standardized Data Interchange Format) file using the
    `xlsx-to-sdif` processing graph, and then consolidates these intermediate
    files into a single final SDIF file.
    """

    def __init__(self, *args: Any, **kwargs: Any):
        """
        Initializes the AIXLSXStandardizer.

        Args:
            ...
        """

    async def _invoke_xlsx_graph(
        self, input_file_path: Path, graph_config: Dict[str, Any]
    ) -> Path:
        """
        Invokes the `xlsx-to-sdif` graph for a single XLSX file.

        Args:
            input_file_path: Path to the input XLSX file.
            graph_config: Configuration for the `xlsx-to-sdif` graph invocation,
                including a unique `thread_id`.

        Returns:
            Path to the SDIF file produced by the graph.

        Raises:
            RuntimeError: If the `xlsx-to-sdif` graph is not available, fails to
                return a final state, or does not produce an output path.
            FileNotFoundError: If the graph reports an output file that doesn't exist.
        """
        if not xlsx_graph or not XLSXState:
            raise RuntimeError(
                "xlsx_to_sdif is not available. "
                "Please ensure 'xlsx-to-sdif' library is installed correctly."
            )

        initial_state: XLSXState = {"spreadsheet_path": str(input_file_path)}  # type: ignore

        thread_id = graph_config.get("configurable", {}).get(
            "thread_id", "unknown_thread"
        )
        logger.info(
            f"Invoking xlsx_to_sdif graph for: {input_file_path.name} with thread_id: {thread_id}"
        )

        # Stream events for logging or potential progress updates
        async for event in xlsx_graph.astream_events(
            initial_state, graph_config, version="v1"
        ):
            event_type = event["event"]
            event_name = event.get("name", "")
            if event_type in ["on_tool_start", "on_chain_start"]:
                logger.debug(
                    f"Graph event for {input_file_path.name} (Thread: {thread_id}): {event_type} - {event_name}"
                )
            elif event_type in ["on_tool_error", "on_chain_error", "on_llm_error"]:
                logger.warning(
                    f"Graph error event for {input_file_path.name} (Thread: {thread_id}): {event_type} - {event_name}. Data: {event.get('data')}"
                )

        final_snapshot = await xlsx_graph.aget_state(graph_config)
        if not final_snapshot or not final_snapshot.values:
            raise RuntimeError(
                f"xlsx_to_sdif graph did not return a final state for {input_file_path.name} (Thread: {thread_id})."
            )

        output_sdif_path_str = final_snapshot.values.get("output_sdif_path")
        if not output_sdif_path_str:
            raise RuntimeError(
                f"xlsx_to_sdif graph for {input_file_path.name} (Thread: {thread_id}) "
                f"did not produce an 'output_sdif_path' in its final state. State: {final_snapshot.values}"
            )

        output_sdif_path = Path(output_sdif_path_str)
        if not output_sdif_path.is_file():
            raise FileNotFoundError(
                f"xlsx_to_sdif graph for {input_file_path.name} (Thread: {thread_id}) "
                f"reported output file '{output_sdif_path}', but it does not exist or is not a file."
            )

        logger.info(
            f"xlsx_to_sdif graph successfully processed {input_file_path.name} (Thread: {thread_id}). Output at {output_sdif_path}"
        )
        return output_sdif_path

    async def _resolve_and_filter_input_files(
        self, datasource: Datasource
    ) -> List[Path]:
        """Resolves and validates datasource, returning a list of XLSX file paths."""
        input_files: List[Path]
        if isinstance(datasource, (str, Path)):
            input_files = [Path(datasource)]
        elif isinstance(datasource, list) and all(
            isinstance(p, (str, Path)) for p in datasource
        ):
            input_files = [Path(p) for p in datasource]
        else:
            raise ValueError(
                "Datasource must be a file path (str or Path) or a list of such paths."
            )

        if not input_files:
            raise ValueError("No input XLSX files provided in the datasource.")

        xlsx_input_files = []
        for f_path in input_files:
            if not f_path.is_file():
                raise FileNotFoundError(f"Input file not found: {f_path}")
            if f_path.suffix.lower() not in (
                ".xlsx",
                ".xlsm",
                ".xlsb",
                ".xls",
            ):  # Common Excel extensions
                logger.warning(
                    f"File {f_path.name} is not a typical XLSX file extension, but will be attempted."
                )
            xlsx_input_files.append(f_path)

        if not xlsx_input_files:
            raise ValueError(
                "No processable XLSX files found in the datasource after filtering."
            )
        return xlsx_input_files

    def _prepare_final_output_path(
        self, output_path: SDIFPath, overwrite: bool
    ) -> Path:
        """Prepares the final output path, handling overwrites and directory creation."""
        final_output_path = Path(output_path)
        if final_output_path.exists() and not overwrite:
            raise FileExistsError(
                f"Output file {final_output_path} already exists and overwrite is False."
            )
        elif final_output_path.exists() and overwrite:
            logger.info(
                f"Overwrite active: Deleting existing output file {final_output_path}"
            )
            try:
                if (
                    final_output_path.is_dir()
                ):  # Should not happen if SDIFPath is file path
                    raise IsADirectoryError(
                        f"Output path {final_output_path} is a directory."
                    )
                final_output_path.unlink()
            except OSError as e:
                raise RuntimeError(
                    f"Failed to delete existing output file {final_output_path}: {e}"
                ) from e

        final_output_path.parent.mkdir(parents=True, exist_ok=True)
        return final_output_path

    def _setup_temp_directories(self) -> Tuple[Path, Path, Path]:
        """Creates and returns paths for temporary working directories."""
        run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aixlsx_run_"))
        intermediate_sdif_dir = run_temp_dir / "intermediate_sdifs"
        intermediate_sdif_dir.mkdir()
        temp_input_copies_dir = (
            run_temp_dir / "temp_input_copies"
        )  # Directory for temporary input copies
        temp_input_copies_dir.mkdir()
        return run_temp_dir, intermediate_sdif_dir, temp_input_copies_dir

    async def _process_single_file_to_intermediate_sdif(
        self,
        input_xlsx_file: Path,
        final_output_path_stem: str,
        temp_input_copies_dir: Path,
        intermediate_sdif_dir: Path,
    ) -> Path:
        """Processes a single XLSX file to an intermediate SDIF in a controlled location."""
        logger.info(f"Processing file: {input_xlsx_file.name}")
        graph_thread_id = f"satif_aixlsx_{final_output_path_stem}_{input_xlsx_file.stem}_{uuid.uuid4().hex[:8]}"

        temp_input_file_for_graph = (
            temp_input_copies_dir
            / f"{input_xlsx_file.stem}_{graph_thread_id}{input_xlsx_file.suffix}"
        )
        shutil.copy2(input_xlsx_file, temp_input_file_for_graph)
        logger.debug(
            f"Created temporary copy of {input_xlsx_file.name} at {temp_input_file_for_graph}"
        )

        graph_config_for_file = {
            "configurable": {"thread_id": graph_thread_id},
            "recursion_limit": 50,  # Default, make configurable if needed
        }

        try:
            graph_produced_sdif_path = await self._invoke_xlsx_graph(
                temp_input_file_for_graph, graph_config_for_file
            )

            target_intermediate_sdif_path = (
                intermediate_sdif_dir
                / f"intermediate_{input_xlsx_file.stem}_{graph_thread_id}.sdif"
            )
            shutil.move(
                str(graph_produced_sdif_path),
                str(target_intermediate_sdif_path),
            )
            logger.info(
                f"Moved graph output for {input_xlsx_file.name} to {target_intermediate_sdif_path}"
            )
            return target_intermediate_sdif_path
        except Exception as e:
            error_msg = f"Failed to process file {input_xlsx_file.name} (using copy {temp_input_file_for_graph.name}) with xlsx-to-sdif graph: {e}"
            logger.error(error_msg, exc_info=True)
            # Re-raise to be caught by the main standardize method's loop or error handling
            raise RuntimeError(
                f"Error processing {input_xlsx_file.name}. Halting batch."
            ) from e

    async def _consolidate_intermediate_sdifs(
        self, intermediate_sdif_paths: List[Path], final_output_path: Path
    ) -> None:
        """Consolidates intermediate SDIF files into the final output path."""
        if not intermediate_sdif_paths:
            # This case should ideally be handled before calling, but as a safeguard:
            raise RuntimeError(
                "No intermediate SDIF files were provided for consolidation."
            )

        if len(intermediate_sdif_paths) == 1:
            logger.info(
                f"Only one intermediate SDIF generated. Moving {intermediate_sdif_paths[0]} to {final_output_path}"
            )
            shutil.move(str(intermediate_sdif_paths[0]), str(final_output_path))
        else:
            logger.info(
                f"Merging {len(intermediate_sdif_paths)} intermediate SDIF files into {final_output_path}"
            )
            merge_sdif_files(
                source_db_paths=intermediate_sdif_paths,
                target_db_path=final_output_path,
            )

    async def standardize(
        self,
        datasource: Datasource,
        output_path: SDIFPath,
        *,
        overwrite: bool = False,
        config: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> StandardizationResult:
        """
        Standardizes one or more XLSX files into a single SDIF file.

        Args:
            datasource: A single file path (str or Path) or a list of file paths
                to XLSX files.
            output_path: The path where the final consolidated SDIF file will be saved.
            overwrite: If True, overwrite the output_path if it already exists.
                Defaults to False.
            config: General configuration options (currently not used by this standardizer
                for graph interaction but preserved for API consistency).
            **kwargs: Additional keyword arguments (currently ignored).

        Returns:
            A StandardizationResult object containing the path to the final SDIF file.

        Raises:
            ValueError: If the datasource is invalid or no XLSX files are found.
            RuntimeError: If critical errors occur during processing, such as the
                `xlsx-to-sdif` graph not being available or failing.
            FileNotFoundError: If input files are not found or graph outputs are invalid.
            FileExistsError: If output_path exists and overwrite is False.
        """
        if not xlsx_graph or not XLSXState:
            raise RuntimeError(
                "AIXLSXStandardizer cannot operate because `xlsx_to_sdif.graph` or `xlsx_to_sdif.state` is not available. "
                "Please ensure the 'xlsx-to-sdif' library is installed and accessible."
            )

        xlsx_input_files = await self._resolve_and_filter_input_files(datasource)
        final_output_path = self._prepare_final_output_path(output_path, overwrite)
        run_temp_dir, intermediate_sdif_dir, temp_input_copies_dir = (
            self._setup_temp_directories()
        )

        intermediate_sdif_paths: List[Path] = []
        processing_errors: List[str] = []

        try:
            # Process each file sequentially. Consider asyncio.gather for parallel if graph supports it well for many files.
            for i, input_xlsx_file in enumerate(xlsx_input_files):
                try:
                    logger.info(
                        f"Starting processing for file {i + 1}/{len(xlsx_input_files)}: {input_xlsx_file.name}"
                    )
                    intermediate_sdif_path = (
                        await self._process_single_file_to_intermediate_sdif(
                            input_xlsx_file,
                            final_output_path.stem,  # Pass stem for unique naming
                            temp_input_copies_dir,
                            intermediate_sdif_dir,
                        )
                    )
                    intermediate_sdif_paths.append(intermediate_sdif_path)
                except Exception:
                    logger.error(
                        f"Halting standardization due to error processing {input_xlsx_file.name}."
                    )
                    raise  # Re-raise the exception to be caught by the outer try/finally

            if not intermediate_sdif_paths:
                # This condition might be redundant if _process_single_file_to_intermediate_sdif always raises on failure
                # and we re-raise immediately.
                if processing_errors:  # This list would be empty if we fail fast
                    raise RuntimeError(
                        f"No XLSX files were successfully processed. Errors: {'; '.join(processing_errors)}"
                    )
                else:
                    raise RuntimeError(
                        "No intermediate SDIF files were generated, though no specific errors were caught."
                    )

            await self._consolidate_intermediate_sdifs(
                intermediate_sdif_paths, final_output_path
            )

            logger.info(f"Successfully created final SDIF: {final_output_path}")
            return StandardizationResult(
                output_path=final_output_path, file_configs=None
            )  # file_configs not available from this process

        finally:
            if run_temp_dir.exists():
                try:
                    shutil.rmtree(run_temp_dir)
                    logger.debug(f"Cleaned up temporary directory: {run_temp_dir}")
                except Exception as e_clean:
                    logger.error(
                        f"Error cleaning up temporary directory {run_temp_dir}: {e_clean}",
                        exc_info=True,
                    )
```
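For context, a minimal usage sketch of the new standardizer, following the `standardize` signature shown above. The file names are hypothetical, and the sketch assumes `xlsx-to-sdif` and its graph dependencies are installed:

```python
# Minimal usage sketch (hypothetical paths; assumes xlsx-to-sdif is installed).
import asyncio

from satif_ai.standardizers.ai_xlsx import AIXLSXStandardizer


async def main() -> None:
    standardizer = AIXLSXStandardizer()
    # Each input is converted to an intermediate SDIF, then merged into one file.
    result = await standardizer.standardize(
        datasource=["q1_sales.xlsx", "q2_sales.xlsx"],  # hypothetical inputs
        output_path="sales.sdif",
        overwrite=True,
    )
    print(result.output_path)


asyncio.run(main())
```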
satif_ai/transform.py
ADDED
@@ -0,0 +1,155 @@
```python
from pathlib import Path
from typing import Any, Dict, List, Optional

from fastmcp import Client, FastMCP
from fastmcp.client.transports import FastMCPTransport
from satif_core.code_executors.base import CodeExecutor
from satif_core.transformation_builders.base import AsyncTransformationBuilder
from satif_core.types import (
    FilePath,
    SDIFPath,
    TransformationResult,
)
from satif_sdk.code_executors.local_executor import LocalCodeExecutor
from satif_sdk.transformers.code import CodeTransformer
from sdif_mcp.server import mcp

from satif_ai.transformation_builders.syncpulse import SyncpulseTransformationBuilder
from satif_ai.utils.openai_mcp import OpenAICompatibleMCP


async def atransform(
    sdif: SDIFPath,
    output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
    instructions: Optional[str] = None,
    output_path: FilePath = Path("."),
    *,
    transformation_code: Optional[str] = None,
    transformation_builder: Optional[AsyncTransformationBuilder] = None,
    code_executor: Optional[CodeExecutor] = None,
    mcp_server: Optional[FastMCP] = None,
    mcp_client: Optional[Client] = None,
    llm_model: str = "o4-mini",
    schema_only: bool = False,
    representer_kwargs: Optional[Dict[str, Any]] = None,
) -> TransformationResult:
    """
    Asynchronously transforms an SDIF (Standard Data Interchange Format) input using
    an AI-generated or provided transformation code.

    This function orchestrates the process of:
    1. Optionally generating transformation code using an AI model via a
       `TransformationBuilder` if `transformation_code` is not explicitly provided.
    2. Executing the transformation code using a `CodeTransformer` and a `CodeExecutor`.
    3. Exporting the results to the specified output.

    Args:
        sdif: Path to the input SDIF file or an `SDIFDatabase` object.
        output_target_files: A dictionary mapping original example file paths (or string identifiers)
            to their desired agent-facing filenames, or a list of output example
            file paths, or a single output file path. These are used by the AI to understand the target
            format and structure, and also by the `CodeTransformer` to determine
            output filenames if the transformation result keys match.
        instructions: Optional. Natural language instructions for the AI to generate
            the transformation code. Used if `transformation_code` is None.
        output_path: Path to the directory where transformation outputs will be saved.
        transformation_code: Optional. Pre-existing Python code for the transformation.
            If None, code will be generated by the `transformation_builder`.
        transformation_builder: Optional. An `AsyncTransformationBuilder` instance responsible for generating
            the transformation code if `transformation_code` is not provided.
            If None, a `SyncpulseTransformationBuilder` is instantiated.
        code_executor: Optional. A `CodeExecutor` instance for running the transformation
            code. If None, a `LocalCodeExecutor` is used.
        mcp_server: Optional. A `FastMCP` server instance for the AI code builder.
            Defaults to the global `mcp` instance if `transformation_builder` is None and
            a new `SyncpulseTransformationBuilder` is being created.
        mcp_client: Optional. A user-provided `Client` instance. If provided when
            `transformation_builder` is None, it will be used by the internally
            created `SyncpulseTransformationBuilder`. The caller is responsible for
            managing the lifecycle of a provided client.
        llm_model: The language model to use for code generation (e.g., "o4-mini").
            Used if `transformation_builder` is None.
        schema_only: If True, the transformation aims to match only the schema (headers)
            of the `output_target_files`, and input samples may be omitted or marked
            as empty for the AI. This is useful for structural transformations
            without processing actual data rows.
        representer_kwargs: Optional dictionary of keyword arguments to pass to the
            representer when analyzing `output_target_files`.

    Returns:
        A `TransformationResult` object containing the path to the output
        and the transformation code used.
    """
    current_transformation_code: Optional[str] = transformation_code
    active_builder: Optional[AsyncTransformationBuilder] = transformation_builder

    _openai_mcp_instance: Optional[OpenAICompatibleMCP] = None
    openai_mcp_managed_locally = False

    # If code isn't provided, we need a builder. If a builder isn't provided, we create one.
    if current_transformation_code is None:
        if active_builder is None:
            # Create SyncpulseTransformationBuilder
            _effective_mcp_server = mcp_server if mcp_server is not None else mcp

            _openai_mcp_instance = OpenAICompatibleMCP(mcp=_effective_mcp_server)
            await _openai_mcp_instance.connect()
            openai_mcp_managed_locally = True

            if mcp_client is None:  # No user-provided client, create and manage one
                mcp_transport = FastMCPTransport(mcp=_effective_mcp_server)
                async with Client(mcp_transport) as new_client:
                    active_builder = SyncpulseTransformationBuilder(
                        mcp_server=_openai_mcp_instance,
                        mcp_session=new_client.session,
                        llm_model=llm_model,
                    )
                    current_transformation_code = await active_builder.build(
                        sdif=sdif,
                        output_target_files=output_target_files,
                        instructions=instructions,
                        schema_only=schema_only,
                        representer_kwargs=representer_kwargs,
                    )
            else:
                active_builder = SyncpulseTransformationBuilder(
                    mcp_server=_openai_mcp_instance,
                    mcp_session=mcp_client,  # Use the provided client
                    llm_model=llm_model,
                )
                current_transformation_code = await active_builder.build(
                    sdif=sdif,
                    output_target_files=output_target_files,
                    instructions=instructions,
                    schema_only=schema_only,
                    representer_kwargs=representer_kwargs,
                )

    # Disconnect OpenAICompatibleMCP if it was created and connected locally
    if (
        openai_mcp_managed_locally
        and _openai_mcp_instance
        and _openai_mcp_instance._is_connected
    ):
        await _openai_mcp_instance.cleanup()

    if current_transformation_code is None:
        raise ValueError("Transformation code could not be obtained or generated.")

    # Code Executor and Transformation
    _code_executor = code_executor if code_executor is not None else LocalCodeExecutor()

    transformer = CodeTransformer(
        function=current_transformation_code,
        code_executor=_code_executor,
    )

    exported_artifact_path = transformer.export(
        sdif=sdif,
        output_path=output_path,
    )

    return TransformationResult(
        output_path=exported_artifact_path, function_code=current_transformation_code
    )
```
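A minimal usage sketch of `atransform`, relying on the defaults wired up above (a `SyncpulseTransformationBuilder` for code generation, a `LocalCodeExecutor` for execution). The paths and instructions are hypothetical:

```python
# Minimal usage sketch (hypothetical paths and instructions).
import asyncio

from satif_ai.transform import atransform


async def main() -> None:
    result = await atransform(
        sdif="sales.sdif",  # hypothetical input SDIF
        output_target_files=["target_report.csv"],  # hypothetical output example
        instructions="Aggregate revenue by region and month.",
        output_path="output/",
    )
    print(result.output_path)     # where the transformed files were exported
    print(result.function_code)   # the generated transformation code


asyncio.run(main())
```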
satif_ai/{code_builders/transformation.py → transformation_builders/syncpulse.py}
RENAMED
```diff
@@ -8,7 +8,9 @@ from typing import Any, Dict, List, Optional, Union
 from agents import Agent, Runner, function_tool
 from agents.mcp.server import MCPServer
 from mcp import ClientSession
-from satif_core import
+from satif_core import AsyncTransformationBuilder
+from satif_core.types import FilePath
+from satif_sdk.code_executors.local_executor import LocalCodeExecutor
 from satif_sdk.comparators import get_comparator
 from satif_sdk.representers import get_representer
 from satif_sdk.transformers import CodeTransformer
@@ -61,7 +63,10 @@ async def execute_transformation(code: str) -> str:
     if INPUT_SDIF_PATH is None or OUTPUT_TARGET_FILES is None:
         return "Error: Transformation context not initialized"
 
-    code_transformer = CodeTransformer(
+    code_transformer = CodeTransformer(
+        function=code,
+        code_executor=LocalCodeExecutor(disable_security_warning=True),
+    )
     generated_output_path = code_transformer.export(INPUT_SDIF_PATH)
 
     comparisons = []
@@ -120,19 +125,7 @@ async def execute_transformation(code: str) -> str:
     return "\n".join(comparisons)
 
 
-class
-    def __init__(self, output_example: Path | List[Path] | Dict[str, Path]):
-        self.output_example = output_example
-
-    def build(
-        self,
-        sdif: Path | SDIFDatabase,
-        instructions: Optional[str] = None,
-    ) -> str:
-        pass
-
-
-class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
+class SyncpulseTransformationBuilder(AsyncTransformationBuilder):
     """This class is used to build a transformation code that will be used to transform a SDIF database into a set of files following the format of the given output files."""
 
     def __init__(
@@ -147,23 +140,18 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
 
     async def build(
         self,
-        sdif: Path,
-        output_target_files: Dict[
-        output_sdif: Optional[Path] = None,
+        sdif: Path,
+        output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+        output_sdif: Optional[Path] = None,
         instructions: str = "",
         schema_only: bool = False,
-
+        representer_kwargs: Optional[Dict[str, Any]] = None,
     ) -> str:
         global INPUT_SDIF_PATH, OUTPUT_TARGET_FILES, SCHEMA_ONLY
-
-        # If execute_transformation runs in the same process as the builder, absolute path is fine.
-        # If it were a separate context, this might need adjustment.
-        # For now, assume execute_transformation can access absolute paths if needed for its *input SDIF*.
-        # However, the sdif for MCP URIs must be relative.
+
         INPUT_SDIF_PATH = Path(sdif).resolve()
         SCHEMA_ONLY = schema_only
-        #
-        # So, use them directly as strings.
+        # We must encode the path because special characters are not allowed in mcp read_resource()
         input_sdif_mcp_uri_path = base64.b64encode(str(sdif).encode()).decode()
         output_sdif_mcp_uri_path = (
             base64.b64encode(str(output_sdif).encode()).decode()
@@ -205,9 +193,14 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
 
         # OUTPUT_TARGET_FILES keys are absolute paths to original example files for local reading by representers/comparators.
         # Values are agent-facing filenames.
-        if isinstance(output_target_files,
+        if isinstance(output_target_files, FilePath):
+            OUTPUT_TARGET_FILES = {
+                Path(output_target_files).resolve(): Path(output_target_files).name
+            }
+        elif isinstance(output_target_files, list):
             OUTPUT_TARGET_FILES = {
-                file_path.resolve(): file_path.name
+                Path(file_path).resolve(): Path(file_path).name
+                for file_path in output_target_files
             }
         elif isinstance(output_target_files, dict):
             temp_map = {}
@@ -229,7 +222,7 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
             # Representer uses the absolute path (file_key_abs_path) to read the example file.
             representer = get_representer(file_key_abs_path)
             representation, used_params = representer.represent(
-                file_key_abs_path, **(
+                file_key_abs_path, **(representer_kwargs or {})
             )
             output_representation[agent_facing_name] = {
                 "representation": representation,
```