satif-ai 0.2.8__tar.gz → 0.2.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {satif_ai-0.2.8 → satif_ai-0.2.9}/PKG-INFO +4 -3
- {satif_ai-0.2.8 → satif_ai-0.2.9}/pyproject.toml +4 -1
- satif_ai-0.2.9/satif_ai/__init__.py +19 -0
- {satif_ai-0.2.8 → satif_ai-0.2.9}/satif_ai/adapters/tidy.py +19 -38
- satif_ai-0.2.9/satif_ai/standardize.py +112 -0
- satif_ai-0.2.9/satif_ai/standardizers/ai.py +485 -0
- {satif_ai-0.2.8 → satif_ai-0.2.9}/satif_ai/standardizers/ai_csv.py +1 -1
- satif_ai-0.2.9/satif_ai/transform.py +121 -0
- satif_ai-0.2.8/satif_ai/code_builders/transformation.py → satif_ai-0.2.9/satif_ai/transformation_builders/syncpulse.py +22 -29
- satif_ai-0.2.9/satif_ai/utils/__init__.py +5 -0
- satif_ai-0.2.9/satif_ai/utils/merge_sdif.py +22 -0
- satif_ai-0.2.9/satif_ai/utils/openai_mcp.py +97 -0
- satif_ai-0.2.9/satif_ai/utils/zip.py +120 -0
- satif_ai-0.2.8/satif_ai/code_builders/adaptation.py +0 -9
- satif_ai-0.2.8/satif_ai/standardizers/__init__.py +0 -0
- {satif_ai-0.2.8 → satif_ai-0.2.9}/LICENSE +0 -0
- {satif_ai-0.2.8 → satif_ai-0.2.9}/README.md +0 -0
- {satif_ai-0.2.8/satif_ai → satif_ai-0.2.9/satif_ai/adapters}/__init__.py +0 -0
- {satif_ai-0.2.8/satif_ai/adapters → satif_ai-0.2.9/satif_ai/standardizers}/__init__.py +0 -0
- {satif_ai-0.2.8/satif_ai/code_builders → satif_ai-0.2.9/satif_ai/transformation_builders}/__init__.py +0 -0
{satif_ai-0.2.8 → satif_ai-0.2.9}/PKG-INFO
@@ -1,10 +1,11 @@
 Metadata-Version: 2.3
 Name: satif-ai
-Version: 0.2.8
+Version: 0.2.9
 Summary: AI Agents for Satif
 License: MIT
-Author:
-
+Author: Syncpulse
+Maintainer: Bryan Djafer
+Maintainer-email: bryan.djafer@syncpulse.fr
 Requires-Python: >=3.10,<4.0
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
satif_ai-0.2.9/satif_ai/__init__.py
@@ -0,0 +1,19 @@
+from .adapters.tidy import TidyAdapter
+from .standardize import astandardize
+from .standardizers.ai import AIStandardizer
+from .standardizers.ai_csv import AICSVStandardizer
+from .transform import atransform
+from .transformation_builders.syncpulse import SyncpulseTransformationBuilder
+from .utils import OpenAICompatibleMCP, extract_zip_archive_async, merge_sdif_files
+
+__all__ = [
+    "astandardize",
+    "atransform",
+    "TidyAdapter",
+    "AICSVStandardizer",
+    "AIStandardizer",
+    "SyncpulseTransformationBuilder",
+    "OpenAICompatibleMCP",
+    "extract_zip_archive_async",
+    "merge_sdif_files",
+]
{satif_ai-0.2.8 → satif_ai-0.2.9}/satif_ai/adapters/tidy.py
@@ -6,23 +6,19 @@ import shutil
 import sqlite3
 import tempfile
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 
-# MCP and Agent imports
 from agents import Agent, Runner, function_tool
 from agents.mcp.server import MCPServerStdio
 from mcp import ClientSession
-
-# SATIF imports
 from satif_core.adapters.base import Adapter
-from satif_core.types import Datasource
+from satif_core.types import Datasource, SDIFPath
 from satif_sdk import SDIFDatabase
 from satif_sdk.adapters.code import AdapterError, CodeAdapter
 
 logger = logging.getLogger(__name__)
 
 
-# --- Tidy Transformation Prompt ---
 TIDY_TRANSFORMATION_PROMPT = """
 You are an expert Data Tidying Agent for SDIF databases.
 Your task is to write Python code to transform tables within a given SDIF database into a 'tidy' format, modifying the database *in place*.
@@ -130,12 +126,11 @@ def adapt_sdif(db: SDIFDatabase) -> None:
 - Ensure pandas and other necessary libraries (like `typing`, `AdapterError`) are imported within the code string if you use them.
 """
 
-
-# These will be set within the TidyAdapter instance when adapt is called
+
 TOOL_CONTEXT = {
     "copied_input_sdif_path": None,
     "temp_dir": None,
-    "current_output_sdif_path": None,
+    "current_output_sdif_path": None,
 }
 
 
@@ -167,13 +162,10 @@ async def execute_tidy_adaptation(code: str) -> str:
         )
 
     try:
-        # 1. Instantiate CodeAdapter with the provided code
-        # It will operate on a *copy* specified by copied_input_path
-        # and write to a *new* file (_adapted suffix by default).
        adapter = CodeAdapter(
            function=code,
-           function_name="adapt_sdif",
-           output_suffix="_adapted_tool_run",
+           function_name="adapt_sdif",
+           output_suffix="_adapted_tool_run",
        )
        # Run the adaptation. It copies `copied_input_path` and modifies the copy.
        # The returned path is the newly created, adapted file.
@@ -232,9 +224,9 @@ class TidyAdapter(Adapter):
 
     def __init__(
         self,
-        mcp_server: MCPServerStdio,
-        mcp_session: ClientSession,
-        llm_model: str = "o4-mini",
+        mcp_server: MCPServerStdio,
+        mcp_session: ClientSession,
+        llm_model: str = "o4-mini",
         max_iterations: int = 5,
     ):
         """
@@ -339,12 +331,12 @@ class TidyAdapter(Adapter):
             return code_text.strip()
         return None  # Indicate no valid code found
 
-    async def adapt(self,
+    async def adapt(self, sdif: Union[SDIFPath, SDIFDatabase]) -> Datasource:
         """
         Transforms the data in the input SDIF to be tidy using an AI agent.
 
         Args:
-
+            sdif: The input SDIF database instance. Connection will be closed.
 
         Returns:
             Path to the new SDIF file containing the tidied data.
@@ -354,13 +346,16 @@ class TidyAdapter(Adapter):
             RuntimeError: If the agent fails to produce valid tidy code.
             Exception: For unexpected errors during the process.
         """
-
+        if isinstance(sdif, SDIFDatabase):
+            input_path = Path(sdif.path)
+        else:
+            input_path = sdif
         if not input_path.exists():
             raise FileNotFoundError(f"Input SDIF file not found: {input_path}")
 
         # Ensure the input DB connection is closed before copying
         try:
-
+            sdif.close()
         except Exception:
             pass
 
@@ -372,17 +367,14 @@ class TidyAdapter(Adapter):
             input_schema_dict = db.get_schema()
             input_sample_dict = db.get_sample_analysis()
 
-            # Get SDIFDatabase method signatures
             sdif_methods_str = self._get_sdif_methods()
 
-            # Prepare context for the prompt
             initial_context = {
                 "input_schema": json.dumps(input_schema_dict, indent=2),
                 "input_sample": json.dumps(input_sample_dict, indent=2),
                 "sdif_database_methods": sdif_methods_str,
             }
 
-            # Instantiate the Agent
             agent = Agent(
                 name="Tidy SDIF Adapter Agent",
                 mcp_servers=[self.mcp_server],
@@ -390,8 +382,6 @@ class TidyAdapter(Adapter):
                 model=self.llm_model,
             )
 
-            # Run the agent using the Runner
-            # Pass the prompt and initial context
             logger.info(f"Running Tidy Agent with model {self.llm_model}...")
             result = await Runner.run(
                 agent,
@@ -409,7 +399,6 @@ class TidyAdapter(Adapter):
                 f"Agent finished. Final output message:\n{result.final_output[:500]}..."
             )
 
-            # Parse the final code from the agent's response
             final_code = self.parse_code(result.final_output)
 
             if not final_code:
@@ -421,20 +410,16 @@ class TidyAdapter(Adapter):
             logger.info(
                 "Successfully parsed final adaptation code from agent response."
             )
-            # print(f"--- Final Code ---\n{final_code}\n------------------") # Debugging
 
-            # Execute the *final* code using CodeAdapter directly to create the definitive output
             logger.info("Executing final adaptation code...")
             final_adapter = CodeAdapter(
                 function=final_code,
                 function_name="adapt_sdif",
-                output_suffix="_tidy_final",
+                output_suffix="_tidy_final",
             )
-
+
             final_adapted_path = final_adapter.adapt(copied_input_path)
 
-            # Move the final successful output SDIF to a persistent location
-            # Example: place it next to the original input file
             persistent_output_path = (
                 input_path.parent / final_adapted_path.name
             ).resolve()
@@ -444,9 +429,7 @@ class TidyAdapter(Adapter):
                 )
                 persistent_output_path.unlink()
 
-            shutil.move(
-                str(final_adapted_path), persistent_output_path
-            )  # Move needs strings sometimes
+            shutil.move(str(final_adapted_path), persistent_output_path)
             logger.info(
                 f"Successfully generated final tidy SDIF: {persistent_output_path}"
             )
@@ -455,8 +438,6 @@ class TidyAdapter(Adapter):
 
         except Exception as e:
             logger.exception(f"Error during TidyAdapter adapt process: {e}")
-            # Re-raise or handle as appropriate
             raise
         finally:
-            # Always clean up temporary files
             self._cleanup_temp_env()
satif_ai-0.2.9/satif_ai/standardize.py
@@ -0,0 +1,112 @@
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+from satif_core.standardizers.base import AsyncStandardizer
+from satif_core.types import Datasource, FilePath, SDIFPath, StandardizationResult
+
+from satif_ai.adapters.tidy import TidyAdapter
+from satif_ai.standardizers.ai import AIStandardizer
+
+
+async def astandardize(
+    datasource: Datasource,
+    output_path: SDIFPath,
+    *,
+    overwrite: bool = False,
+    sdif_schema: Optional[Union[FilePath, Dict[str, Any]]] = None,
+    tidy_adapter: Union[bool, TidyAdapter] = False,
+    config: Optional[Dict[str, Any]] = None,
+    standardizer: Optional[AsyncStandardizer] = None,
+    mcp_server: Optional[Any] = None,
+    mcp_session: Optional[Any] = None,
+    llm_model: Optional[str] = None,
+) -> StandardizationResult:
+    """
+    Asynchronously standardizes a datasource into a single, canonical SDIF SQLite file.
+
+    This function serves as the primary entry point for the SATIF standardization layer.
+    It orchestrates the conversion of various input file formats (e.g., CSV, Excel, PDF)
+    from the provided datasource into a unified SDIF (Standard Data Interchange Format)
+    SQLite file. The process may involve AI-driven parsing, schema adaptation, and
+    data tidying, depending on the configuration and the capabilities of the
+    underlying standardizer.
+
+    Args:
+        datasource: The source of the data to be standardized. This can be a
+            single file path (str or Path), a list of file paths, or other
+            datasource types supported by the chosen standardizer.
+        output_path: The path (str or Path) where the output SDIF SQLite database file
+            will be created (e.g., "./output/my_data.sdif").
+        overwrite: If True, an existing SDIF file at `output_path` will be
+            overwritten. Defaults to False.
+        sdif_schema: Optional. Path to an SDIF schema definition file (e.g., a JSON file)
+            or a dictionary representing the schema. If provided, the
+            standardization process (specifically if using the default
+            `AIStandardizer`) may attempt to adapt the data to this
+            target schema.
+        tidy_adapter: Optional. If True, a default `TidyAdapter` may be used.
+            Alternatively, a specific `TidyAdapter` instance can be provided
+            to perform data tidying processes (e.g., cleaning, normalization,
+            restructuring tables). If False (default), no explicit tidying
+            step is initiated by this top-level function, though underlying
+            standardizers might perform their own internal tidying.
+            The specifics depend on the standardizer's capabilities.
+        config: Optional. A dictionary for advanced or standardizer-specific
+            configurations. This config is passed directly to the
+            `standardize` method of the chosen standardizer.
+        standardizer: Optional. An instance of an `AsyncStandardizer` subclass.
+            If provided, this instance will be used for standardization.
+            This allows for using pre-configured or custom standardizers.
+            If None, a default `AIStandardizer` is instantiated using
+            `mcp_server`, `mcp_session`, `llm_model`, `sdif_schema`,
+            and `tidy_adapter`.
+        mcp_server: Optional. The MCP (Model Coordination Platform) server instance.
+            Used if `standardizer` is None for the default `AIStandardizer`.
+        mcp_session: Optional. The MCP session or transport object.
+            Used if `standardizer` is None for the default `AIStandardizer`.
+        llm_model: Optional. The language model to be used by the default `AIStandardizer`
+            if no `standardizer` instance is provided (e.g., "gpt-4o").
+            Each standardizer may have its own default model.
+
+    Returns:
+        A `StandardizationResult` object containing:
+            - `output_path`: The absolute `Path` to the created or updated SDIF database file.
+            - `file_configs`: An optional dictionary detailing configurations used for
+              each processed input file, if applicable and returned by
+              the standardizer.
+
+    Raises:
+        FileNotFoundError: If the `datasource` (or parts of it) does not exist.
+        FileExistsError: If `output_path` exists and `overwrite` is False.
+        ValueError: If input arguments are invalid (e.g., unsupported datasource type).
+        RuntimeError: For general errors during the standardization process.
+            Specific exceptions may also be raised by the underlying
+            standardizer implementation.
+    """
+    if standardizer is None:
+        standardizer = AIStandardizer(
+            mcp_server=mcp_server,
+            mcp_session=mcp_session,
+            llm_model=llm_model,
+            sdif_schema=sdif_schema,
+            tidy_adapter=tidy_adapter
+            if isinstance(tidy_adapter, TidyAdapter)
+            else (TidyAdapter() if tidy_adapter else None),
+        )
+
+    result = await standardizer.standardize(
+        datasource=datasource,
+        output_path=output_path,
+        overwrite=overwrite,
+        config=config,
+    )
+
+    output_sdif_path = (
+        Path(result.output_path)
+        if isinstance(result.output_path, str)
+        else result.output_path
+    )
+
+    return StandardizationResult(
+        output_path=output_sdif_path, file_configs=result.file_configs
+    )
satif_ai-0.2.9/satif_ai/standardizers/ai.py
@@ -0,0 +1,485 @@
+import asyncio
+import logging
+import shutil
+import tempfile
+import uuid
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+from satif_core.standardizers.base import AsyncStandardizer
+from satif_core.types import Datasource, FilePath, SDIFPath, StandardizationResult
+
+from satif_ai.adapters.tidy import TidyAdapter
+from satif_ai.utils.merge_sdif import merge_sdif_files
+from satif_ai.utils.zip import extract_zip_archive_async
+
+from .ai_csv import AICSVStandardizer
+
+logger = logging.getLogger(__name__)
+
+
+class AIStandardizer(AsyncStandardizer):
+    """
+    Orchestrates the standardization of various file types using specialized AI standardizers.
+    It processes a datasource, which can include individual files or ZIP archives.
+    Files are dispatched to appropriate AI agents (e.g., AICSVStandardizer),
+    and their SDIF outputs are merged into a single, final SDIF.
+    """
+
+    def __init__(
+        self,
+        mcp_server: Optional[Any] = None,
+        mcp_session: Optional[Any] = None,
+        llm_model: Optional[str] = None,
+        sdif_schema: Optional[Union[FilePath, Dict[str, Any]]] = None,
+        tidy_adapter: Optional[TidyAdapter] = None,
+    ):
+        self.mcp_server = mcp_server
+        self.mcp_session = mcp_session
+        self.llm_model = llm_model
+        self.sdif_schema = sdif_schema  # TODO: Implement schema adaptation logic
+        self.tidy_adapter = tidy_adapter  # TODO: Implement tidying logic
+
+        self.ai_standardizer_map: Dict[str, Type[AsyncStandardizer]] = {
+            ".csv": AICSVStandardizer,
+            # Future standardizers:
+            # ".xlsx": AIXLSXStandardizer,
+            # ".pdf": AIPDFStandardizer,
+            # ".json": AIJSONStandardizer,
+            # ".xml": AIXMLStandardizer,
+        }
+        for ext, standardizer_class in self.ai_standardizer_map.items():
+            if not issubclass(standardizer_class, AsyncStandardizer):
+                raise TypeError(
+                    f"Standardizer for '{ext}' ({standardizer_class.__name__}) "
+                    "must inherit from AsyncStandardizer."
+                )
+
+    def _get_ai_standardizer_class(
+        self, extension: str
+    ) -> Optional[Type[AsyncStandardizer]]:
+        return self.ai_standardizer_map.get(extension.lower())
+
+    async def _resolve_input_files(
+        self, datasource: Datasource, temp_processing_dir: Path
+    ) -> List[Path]:
+        """
+        Resolves the input datasource to a list of individual file paths.
+        Handles single files, lists of files, and extracts ZIP archives.
+        """
+        input_file_paths: List[Path] = []
+        raw_paths_to_check: List[Union[str, Path]] = []
+
+        if isinstance(datasource, (str, Path)):
+            raw_paths_to_check = [datasource]
+        elif isinstance(datasource, list) and all(
+            isinstance(p, (str, Path)) for p in datasource
+        ):
+            raw_paths_to_check = datasource
+        else:
+            # This also catches the case where datasource is an empty list initially
+            raise ValueError(
+                "Datasource must be a non-empty file path (string or Path) or a non-empty list of such paths."
+            )
+
+        if not raw_paths_to_check:  # Should be caught by above, but defensive
+            raise ValueError("No input datasource paths provided.")
+
+        for raw_path_item in raw_paths_to_check:
+            raw_path = Path(raw_path_item).resolve()
+            if not raw_path.exists():
+                raise FileNotFoundError(f"Input path not found: {raw_path}")
+
+            if raw_path.is_file():
+                if raw_path.suffix.lower() == ".zip":
+                    zip_extract_target = (
+                        temp_processing_dir
+                        / f"extracted_{raw_path.stem}_{uuid.uuid4().hex[:8]}"
+                    )
+                    try:
+                        extracted_from_zip = await extract_zip_archive_async(
+                            raw_path, zip_extract_target
+                        )
+                        input_file_paths.extend(extracted_from_zip)
+                    except Exception as e_zip:
+                        logger.error(
+                            f"Failed to extract ZIP archive '{raw_path}': {e_zip}",
+                            exc_info=True,
+                        )
+                        # Decide if one failed zip should stop all, or just be skipped.
+                        # For now, skipping problematic zips.
+                        continue
+                else:
+                    input_file_paths.append(raw_path)
+            elif raw_path.is_dir():
+                logger.info(f"Processing directory datasource: {raw_path}")
+                for child_item in raw_path.iterdir():
+                    if child_item.is_file():
+                        input_file_paths.append(child_item)
+                # Deeper recursion to be implemeted.
+            else:
+                logger.warning(
+                    f"Input path '{raw_path}' is not a file or directory and will be ignored."
+                )
+
+        if not input_file_paths:
+            # This means all inputs were invalid, unresolvable, or zips failed etc.
+            logger.error("No processable files found after resolving datasource.")
+            raise ValueError("Datasource resolution resulted in no processable files.")
+        return input_file_paths
+
+    def _group_files_by_standardizer(
+        self, file_paths: List[Path]
+    ) -> Tuple[Dict[Type[AsyncStandardizer], List[Path]], List[Path]]:
+        """Groups files by the AI standardizer responsible for them based on extension."""
+        grouped: Dict[Type[AsyncStandardizer], List[Path]] = defaultdict(list)
+        unsupported_files: List[Path] = []
+        for file_path in file_paths:
+            standardizer_class = self._get_ai_standardizer_class(file_path.suffix)
+            if standardizer_class:
+                grouped[standardizer_class].append(file_path)
+            else:
+                unsupported_files.append(file_path)
+        if unsupported_files:
+            logger.warning(
+                f"Unsupported files found and will be ignored: "
+                f"{[str(f.name) for f in unsupported_files]}"
+            )
+        return grouped, unsupported_files
+
+    async def _process_file_groups(
+        self,
+        grouped_files: Dict[Type[AsyncStandardizer], List[Path]],
+        temp_sdif_dir: Path,
+        config: Optional[Dict[str, Any]],
+        **kwargs,
+    ) -> Tuple[List[Path], List[Dict[str, Any]]]:
+        """
+        Processes groups of files using their respective AI standardizers.
+        Child standardizers are expected to produce a single SDIF SQLite file.
+
+        Returns:
+            A tuple containing:
+            - List of Paths to successfully created intermediate SDIF SQLite files.
+            - List of aggregated file configurations from child standardizers.
+        """
+        processing_tasks = []
+        standardizer_instances_info = []
+
+        for standardizer_class, files_in_group in grouped_files.items():
+            if not files_in_group:
+                continue
+
+            standardizer_init_kwargs = {}
+            # TODO: Pass standardizer-specific config from main 'config' if available for this standardizer_class
+
+            try:
+                ai_child_standardizer = standardizer_class(
+                    mcp_server=self.mcp_server,
+                    mcp_session=self.mcp_session,
+                    llm_model=self.llm_model,
+                    **standardizer_init_kwargs,
+                )
+            except Exception as e:
+                logger.error(
+                    f"Failed to initialize standardizer {standardizer_class.__name__} for '{files_in_group[0].name}': {e}",
+                    exc_info=True,
+                )
+                raise RuntimeError(
+                    f"Initialization failed for {standardizer_class.__name__}: {e}"
+                )
+
+            # Generate a unique filename for the intermediate SDIF SQLite file
+            intermediate_sdif_filename = f"intermediate_{standardizer_class.__name__}_{uuid.uuid4().hex[:12]}.sdif"
+            intermediate_sdif_file_path = temp_sdif_dir / intermediate_sdif_filename
+
+            logger.info(
+                f"Queueing standardization for {len(files_in_group)} file(s) "
+                f"with {standardizer_class.__name__} (output file: {intermediate_sdif_file_path})"
+            )
+
+            task = ai_child_standardizer.standardize(
+                datasource=files_in_group,
+                output_path=intermediate_sdif_file_path,
+                overwrite=True,  # Temporary intermediate files are always new/overwritten
+                config=config,
+                **kwargs,
+            )
+            processing_tasks.append(task)
+            standardizer_instances_info.append(
+                {
+                    "class_name": standardizer_class.__name__,
+                    "output_file": intermediate_sdif_file_path,
+                }
+            )
+
+        gathered_outputs = await asyncio.gather(
+            *processing_tasks, return_exceptions=True
+        )
+
+        successful_intermediate_sdif_files: List[Path] = []
+        aggregated_file_configs: List[Dict[str, Any]] = []
+
+        for i, result_or_exc in enumerate(gathered_outputs):
+            info = standardizer_instances_info[i]
+            expected_output_file: Path = info["output_file"]
+
+            if isinstance(result_or_exc, StandardizationResult):
+                # Child standardizer's output_path should be a file path.
+                child_reported_output_file = Path(result_or_exc.output_path)
+
+                if not child_reported_output_file.is_file():
+                    logger.error(
+                        f"Standardizer {info['class_name']} reported success, but its output path "
+                        f"'{child_reported_output_file}' is not a file or does not exist. Skipping."
+                    )
+                    continue  # Skip this problematic result
+
+                if (
+                    child_reported_output_file.resolve()
+                    != expected_output_file.resolve()
+                ):
+                    logger.warning(
+                        f"Standardizer {info['class_name']} reported output file '{child_reported_output_file}' "
+                        f"which differs from expected '{expected_output_file}'. Using reported path."
+                    )
+
+                logger.info(
+                    f"Successfully standardized group with {info['class_name']}. "
+                    f"Intermediate SDIF file: {child_reported_output_file}"
+                )
+                successful_intermediate_sdif_files.append(child_reported_output_file)
+                if result_or_exc.file_configs:
+                    aggregated_file_configs.extend(result_or_exc.file_configs)
+
+            elif isinstance(result_or_exc, Exception):
+                logger.error(
+                    f"Standardization by {info['class_name']} for target '{expected_output_file}' failed: {result_or_exc}",
+                    exc_info=result_or_exc,
+                )
+                # Optionally, try to clean up the expected_output_file if it was created before erroring
+                if expected_output_file.exists():
+                    try:
+                        expected_output_file.unlink()
+                    except OSError:
+                        pass
+
+        return successful_intermediate_sdif_files, aggregated_file_configs
+
+    async def _consolidate_results(
+        self,
+        intermediate_sdif_files: List[Path],
+        aggregated_file_configs: Optional[List[Dict[str, Any]]],
+        final_sdif_file_target: Path,
+        overwrite: bool,
+    ) -> StandardizationResult:
+        """
+        Merges or moves intermediate SDIF SQLite files to the final target SDIF SQLite file.
+        Cleans up intermediate files.
+        """
+        if not intermediate_sdif_files:
+            raise RuntimeError(
+                "No intermediate SDIF files were successfully generated to consolidate."
+            )
+
+        final_sdif_file_target.parent.mkdir(parents=True, exist_ok=True)
+
+        if final_sdif_file_target.exists():
+            if not overwrite:
+                raise FileExistsError(
+                    f"Final output file {final_sdif_file_target} already exists and overwrite is False."
+                )
+            logger.info(
+                f"Overwriting existing final output file: {final_sdif_file_target}"
+            )
+            try:
+                final_sdif_file_target.unlink()
+            except OSError as e_unlink:
+                logger.error(
+                    f"Could not delete existing file {final_sdif_file_target}: {e_unlink}"
+                )
+                raise  # Re-raise as this is critical for overwrite
+
+        final_sdif_path_str: str
+        if len(intermediate_sdif_files) == 1:
+            source_sqlite_file = intermediate_sdif_files[0]
+            logger.info(
+                f"Moving single intermediate SDIF SQLite file '{source_sqlite_file}' to final output '{final_sdif_file_target}'."
+            )
+            try:
+                shutil.move(str(source_sqlite_file), str(final_sdif_file_target))
+                final_sdif_path_str = str(final_sdif_file_target)
+            except Exception as e_move:
+                logger.error(
+                    f"Failed to move {source_sqlite_file} to {final_sdif_file_target}: {e_move}"
+                )
+                # Attempt to copy as a fallback, then try to remove source
+                try:
+                    shutil.copy2(str(source_sqlite_file), str(final_sdif_file_target))
+                    final_sdif_path_str = str(final_sdif_file_target)
+                    source_sqlite_file.unlink(
+                        missing_ok=True
+                    )  # Try to clean up source after copy
+                except Exception as e_copy_fallback:
+                    logger.error(
+                        f"Fallback copy also failed for {source_sqlite_file}: {e_copy_fallback}"
+                    )
+                    raise RuntimeError(
+                        f"Could not place intermediate file into final location: {e_copy_fallback}"
+                    ) from e_copy_fallback
+        else:
+            logger.info(
+                f"Merging {len(intermediate_sdif_files)} intermediate SDIF SQLite files into '{final_sdif_file_target}'."
+            )
+            # merge_sdif_files must accept a list of source SQLite file paths and a target SQLite file path.
+            merged_target_path = await merge_sdif_files(
+                intermediate_sdif_files,
+                final_sdif_file_target,
+                overwrite=False,  # We handled overwrite for final_sdif_file_target
+            )
+            final_sdif_path_str = str(merged_target_path)
+
+        # Clean up original intermediate files (they have been moved or their content merged)
+        for temp_file in intermediate_sdif_files:
+            if (
+                temp_file.exists()
+                and temp_file.resolve() != Path(final_sdif_path_str).resolve()
+            ):  # Don't delete the final file if it was one of the intermediates (single file case)
+                try:
+                    temp_file.unlink()
+                    logger.debug(f"Cleaned up intermediate file: {temp_file}")
+                except Exception as e_clean_file:
+                    logger.warning(
+                        f"Error cleaning up intermediate file {temp_file}: {e_clean_file}"
+                    )
+
+        logger.info(
+            f"Consolidation complete. Final SDIF SQLite file: {final_sdif_path_str}"
+        )
+        return StandardizationResult(
+            output_path=Path(final_sdif_path_str),
+            file_configs=aggregated_file_configs if aggregated_file_configs else None,
+        )
+
+    async def standardize(
+        self,
+        datasource: Datasource,
+        output_path: SDIFPath,  # Expected to be the path to the target *SDIF file*
+        *,
+        overwrite: bool = False,
+        config: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> StandardizationResult:
+        """
+        Standardizes datasource to a single SDIF SQLite file.
+
+        Args:
+            datasource: Source data (file path, list of paths, or directory path).
+            output_path: Path to the target output SDIF SQLite file (e.g., "./output/data.sdif").
+            overwrite: If True, overwrite existing output file. Defaults to False.
+            config: Optional configuration dictionary for standardizers.
+            **kwargs: Additional arguments passed to child standardizers.
+
+        Returns:
+            StandardizationResult with the path to the created SDIF SQLite file.
+        """
+        logger.info(
+            f"AIStandardizer starting process for output SDIF file: {output_path}"
+        )
+        final_sdif_file_target = Path(output_path).resolve()
+
+        if final_sdif_file_target.is_dir():
+            raise ValueError(
+                f"Target output_path '{final_sdif_file_target}' is a directory. "
+                "It must be a full file path for the target SDIF SQLite database (e.g., data.sqlite or data.sdif)."
+            )
+        if not final_sdif_file_target.suffix:
+            logger.warning(
+                f"Target output_path '{final_sdif_file_target}' has no file extension. "
+                "It should be a path to an SDIF SQLite database file (e.g., data.sqlite or data.sdif)."
+            )
+        elif final_sdif_file_target.suffix.lower() not in (".sdif", ".sqlite", ".db"):
+            logger.warning(
+                f"Target output_path '{final_sdif_file_target}' does not have a common SQLite extension. "
+                "Ensure this is the intended SQLite file path."
+            )
+
+        # Create a unique temporary directory for this standardization run
+        # This directory will hold intermediate files and ZIP extractions.
+        run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aistd_run_"))
+        intermediate_sdif_files_dir = run_temp_dir / "intermediate_sdif_files"
+        intermediate_sdif_files_dir.mkdir(parents=True, exist_ok=True)
+        file_processing_temp_dir = run_temp_dir / "file_processing_temp"
+        file_processing_temp_dir.mkdir(parents=True, exist_ok=True)
+
+        try:
+            # 1. Resolve input datasource to a list of processable file paths
+            resolved_files = await self._resolve_input_files(
+                datasource, file_processing_temp_dir
+            )
+            logger.info(f"Resolved {len(resolved_files)} file(s) for standardization.")
+
+            # 2. Group files by the AI standardizer responsible for them
+            grouped_by_std, unsupported = self._group_files_by_standardizer(
+                resolved_files
+            )
+            if not grouped_by_std:
+                user_message = (
+                    "No files found that can be handled by configured AI standardizers."
+                )
+                if unsupported:
+                    user_message += (
+                        f" Unsupported files: {[str(f.name) for f in unsupported]}"
+                    )
+                raise ValueError(user_message)
+
+            logger.debug(
+                f"File groups for standardization: { {cls.__name__: [f.name for f in paths] for cls, paths in grouped_by_std.items()} }"
+            )
+
+            # 3. Process each group of files, generating intermediate SDIF SQLite files
+            (
+                intermediate_sdif_files,
+                aggregated_file_configs,
+            ) = await self._process_file_groups(
+                grouped_by_std, intermediate_sdif_files_dir, config, **kwargs
+            )
+
+            if not intermediate_sdif_files:
+                raise RuntimeError(
+                    "No intermediate SDIF SQLite files were successfully generated."
+                )
+            logger.info(
+                f"Successfully generated {len(intermediate_sdif_files)} intermediate SDIF SQLite file(s)."
+            )
+
+            # 4. Consolidate intermediate SDIF files into the final target file
+            final_result = await self._consolidate_results(
+                intermediate_sdif_files,
+                aggregated_file_configs,
+                final_sdif_file_target,
+                overwrite,
+            )
+
+            logger.info(
+                f"AIStandardizer process completed. Final SDIF file at: {final_result.output_path}"
+            )
+            return final_result
+
+        except Exception as e:
+            logger.error(f"AIStandardizer failed: {e}", exc_info=True)
+            if isinstance(e, (ValueError, FileNotFoundError, FileExistsError)):
+                raise
+            raise RuntimeError(f"AIStandardizer processing error: {e}") from e
+        finally:
+            # Clean up the entire temporary directory for this run
+            if run_temp_dir.exists():
+                try:
+                    shutil.rmtree(run_temp_dir)
+                    logger.info(f"Cleaned up temporary run directory: {run_temp_dir}")
+                except Exception as e_clean:
+                    logger.error(
+                        f"Error cleaning up temporary run directory {run_temp_dir}: {e_clean}",
+                        exc_info=True,
+                    )
{satif_ai-0.2.8 → satif_ai-0.2.9}/satif_ai/standardizers/ai_csv.py
@@ -37,7 +37,7 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g
 - Encoding: {initial_encoding}
 - Delimiter: '{initial_delimiter}'
 
-**Your
+**Your Task:**
 
 1. **Core Parsing Parameters:**
     * Determine the correct file `encoding` (string, e.g., "utf-8", "latin-1").
satif_ai-0.2.9/satif_ai/transform.py
@@ -0,0 +1,121 @@
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from fastmcp import FastMCP
+from fastmcp.client.transports import FastMCPTransport
+from satif_core.code_executors.base import CodeExecutor
+from satif_core.transformation_builders.base import AsyncTransformationBuilder
+from satif_core.types import (
+    FilePath,
+    SDIFPath,
+    TransformationResult,
+)
+from satif_sdk.code_executors.local_executor import LocalCodeExecutor
+from satif_sdk.transformers.code import CodeTransformer
+from sdif_mcp.server import mcp
+
+from satif_ai.transformation_builders.syncpulse import SyncpulseTransformationBuilder
+from satif_ai.utils.openai_mcp import OpenAICompatibleMCP
+
+
+async def atransform(
+    sdif: SDIFPath,
+    output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+    instructions: Optional[str] = None,
+    output_path: FilePath = Path("."),
+    *,
+    transformation_code: Optional[str] = None,
+    transformation_builder: Optional[AsyncTransformationBuilder] = None,
+    code_executor: Optional[CodeExecutor] = None,
+    mcp_server: Optional[FastMCP] = None,
+    mcp_transport: Optional[FastMCPTransport] = None,
+    llm_model: str = "o4-mini",
+    schema_only: bool = False,
+    representer_kwargs: Optional[Dict[str, Any]] = None,
+) -> TransformationResult:
+    """
+    Asynchronously transforms an SDIF (Standard Data Interchange Format) input using
+    an AI-generated or provided transformation code.
+
+    This function orchestrates the process of:
+    1. Optionally generating transformation code using an AI model via a `CodeBuilder`
+       if `transformation_code` is not provided.
+       explicitly passed.
+    2. Executing the transformation code using a `CodeTransformer` and a `CodeExecutor`.
+    3. Exporting the results to the specified output.
+
+    Args:
+        sdif: Path to the input SDIF file or an `SDIFDatabase` object.
+        output_target_files: A dictionary mapping original example file paths (or string identifiers)
+            to their desired agent-facing filenames, or a list of output example
+            file paths, or a single output file path. These are used by the AI to understand the target
+            format and structure, and also by the `CodeTransformer` to determine
+            output filenames if the transformation result keys match.
+        instructions: Optional. Natural language instructions for the AI to generate
+            the transformation code. Used if `transformation_code` is None.
+        transformation_code: Optional. Pre-existing Python code for the transformation.
+            If None, code will be generated by the `transformation_builder`.
+        transformation_builder: Optional. An `AsyncTransformationBuilder` instance responsible for generating
+            the transformation code if `transformation_code` is not provided.
+            If None, a `TransformationAsyncCodeBuilder` is instantiated.
+        code_executor: Optional. A `CodeExecutor` instance for running the transformation
+            code. If None, a `LocalCodeExecutor` is used.
+        mcp_server: Optional. A `FastMCP` server instance for the AI code builder.
+            Defaults to the global `mcp` instance if `transformation_builder` is None.
+        mcp_transport: Optional. A `FastMCPTransport` instance for communication with
+            the `mcp_server`. Defaults to a new transport using `mcp_server`
+            if `transformation_builder` is None.
+        llm_model: The language model to use for code generation (e.g., "o4-mini").
+            Used if `transformation_builder` is None.
+        schema_only: If True, the transformation aims to match only the schema (headers)
+            of the `output_target_files`, and input samples may be omitted or marked
+            as empty for the AI. This is useful for structural transformations
+            without processing actual data rows.
+        representer_kwargs: Optional dictionary of keyword arguments to pass to the
+            representer when analyzing `output_target_files`.
+
+    Returns:
+        A `TransformationResult` object containing the path to the output
+        and the transformation code used.
+    """
+    if transformation_builder is None:
+        if mcp_server is None:
+            mcp_server = mcp
+
+        if mcp_transport is None:
+            mcp_transport = FastMCPTransport(mcp=mcp_server)
+
+        openai_compatible_mcp = OpenAICompatibleMCP(mcp=mcp_server)
+        await openai_compatible_mcp.connect()
+
+        transformation_builder = SyncpulseTransformationBuilder(
+            mcp_server=openai_compatible_mcp,
+            mcp_session=mcp_transport,
+            llm_model=llm_model,
+        )
+
+    if transformation_code is None:
+        function_code = await transformation_builder.build(
+            sdif=sdif,
+            output_target_files=output_target_files,
+            instructions=instructions,
+            schema_only=schema_only,
+            representer_kwargs=representer_kwargs,
+        )
+    else:
+        function_code = transformation_code
+
+    if code_executor is None:
+        code_executor = LocalCodeExecutor()
+
+    transformer = CodeTransformer(
+        function=function_code,
+        code_executor=code_executor,
+    )
+
+    output_path = transformer.export(
+        sdif=sdif,
+        output_path=output_path,
+    )
+
+    return TransformationResult(output_path=output_path, function_code=function_code)
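A companion sketch for the new `atransform` helper added above, again derived only from its signature and return type; the SDIF path, example output file, and instructions are hypothetical, and code generation requires the MCP tooling to be available at runtime.

import asyncio

from satif_ai import atransform

async def main() -> None:
    # Generate (or reuse) transformation code, run it, and export the result files.
    result = await atransform(
        sdif="./output/orders.sdif",                     # hypothetical SDIF input
        output_target_files=["./examples/report.xlsx"],  # example of the desired output format
        instructions="One row per customer with total order value.",
        output_path="./output",
    )
    print(result.output_path)
    print(result.function_code[:200])  # the transformation code that was used

asyncio.run(main())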
satif_ai-0.2.8/satif_ai/code_builders/transformation.py → satif_ai-0.2.9/satif_ai/transformation_builders/syncpulse.py
@@ -8,7 +8,9 @@ from typing import Any, Dict, List, Optional, Union
 from agents import Agent, Runner, function_tool
 from agents.mcp.server import MCPServer
 from mcp import ClientSession
-from satif_core import
+from satif_core import AsyncTransformationBuilder
+from satif_core.types import FilePath
+from satif_sdk.code_executors.local_executor import LocalCodeExecutor
 from satif_sdk.comparators import get_comparator
 from satif_sdk.representers import get_representer
 from satif_sdk.transformers import CodeTransformer
@@ -61,7 +63,10 @@ async def execute_transformation(code: str) -> str:
     if INPUT_SDIF_PATH is None or OUTPUT_TARGET_FILES is None:
         return "Error: Transformation context not initialized"
 
-    code_transformer = CodeTransformer(
+    code_transformer = CodeTransformer(
+        function=code,
+        code_executor=LocalCodeExecutor(disable_security_warning=True),
+    )
     generated_output_path = code_transformer.export(INPUT_SDIF_PATH)
 
     comparisons = []
@@ -120,19 +125,7 @@ async def execute_transformation(code: str) -> str:
     return "\n".join(comparisons)
 
 
-class
-    def __init__(self, output_example: Path | List[Path] | Dict[str, Path]):
-        self.output_example = output_example
-
-    def build(
-        self,
-        sdif: Path | SDIFDatabase,
-        instructions: Optional[str] = None,
-    ) -> str:
-        pass
-
-
-class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
+class SyncpulseTransformationBuilder(AsyncTransformationBuilder):
     """This class is used to build a transformation code that will be used to transform a SDIF database into a set of files following the format of the given output files."""
 
     def __init__(
@@ -147,23 +140,18 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
 
     async def build(
         self,
-        sdif: Path,
-        output_target_files: Dict[
-        output_sdif: Optional[Path] = None,
+        sdif: Path,
+        output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+        output_sdif: Optional[Path] = None,
         instructions: str = "",
         schema_only: bool = False,
-
+        representer_kwargs: Optional[Dict[str, Any]] = None,
     ) -> str:
         global INPUT_SDIF_PATH, OUTPUT_TARGET_FILES, SCHEMA_ONLY
-
-        # If execute_transformation runs in the same process as the builder, absolute path is fine.
-        # If it were a separate context, this might need adjustment.
-        # For now, assume execute_transformation can access absolute paths if needed for its *input SDIF*.
-        # However, the sdif for MCP URIs must be relative.
+
         INPUT_SDIF_PATH = Path(sdif).resolve()
         SCHEMA_ONLY = schema_only
-        #
-        # So, use them directly as strings.
+        # We must encode the path because special characters are not allowed in mcp read_resource()
         input_sdif_mcp_uri_path = base64.b64encode(str(sdif).encode()).decode()
         output_sdif_mcp_uri_path = (
             base64.b64encode(str(output_sdif).encode()).decode()
@@ -205,9 +193,14 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
 
         # OUTPUT_TARGET_FILES keys are absolute paths to original example files for local reading by representers/comparators.
         # Values are agent-facing filenames.
-        if isinstance(output_target_files,
+        if isinstance(output_target_files, FilePath):
+            OUTPUT_TARGET_FILES = {
+                Path(output_target_files).resolve(): Path(output_target_files).name
+            }
+        elif isinstance(output_target_files, list):
             OUTPUT_TARGET_FILES = {
-                file_path.resolve(): file_path.name
+                Path(file_path).resolve(): Path(file_path).name
+                for file_path in output_target_files
             }
         elif isinstance(output_target_files, dict):
             temp_map = {}
@@ -229,7 +222,7 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
             # Representer uses the absolute path (file_key_abs_path) to read the example file.
             representer = get_representer(file_key_abs_path)
             representation, used_params = representer.represent(
-                file_key_abs_path, **(
+                file_key_abs_path, **(representer_kwargs or {})
             )
             output_representation[agent_facing_name] = {
                 "representation": representation,
satif_ai-0.2.9/satif_ai/utils/merge_sdif.py
@@ -0,0 +1,22 @@
+from pathlib import Path
+from typing import List
+
+
+async def merge_sdif_files(sdif_paths: List[Path], output_dir: Path) -> Path:
+    """Placeholder function to merge multiple SDIF files into one.
+
+    Args:
+        sdif_paths: A list of paths to the SDIF files to merge.
+        output_dir: The directory where the merged file should be saved.
+
+    Returns:
+        Path to the merged SDIF file.
+    """
+    if not sdif_paths:
+        raise ValueError("No SDIF files provided for merging.")
+
+    if len(sdif_paths) == 1:
+        return sdif_paths[0]  # No merge needed
+
+    # TODO: Implement SDIF merge
+    raise NotImplementedError("Merge not implemented yet.")
satif_ai-0.2.9/satif_ai/utils/openai_mcp.py
@@ -0,0 +1,97 @@
+import logging
+from typing import Any
+
+from agents.mcp.server import CallToolResult, MCPServer, MCPTool
+from fastmcp import FastMCP
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAICompatibleMCP(MCPServer):
+    def __init__(self, mcp: FastMCP):
+        self.mcp = mcp
+        self._is_connected = False  # Track connection state
+
+    async def connect(self):
+        """Connect to the server.
+        For FastMCP, connection is managed externally when the server is run.
+        This method marks the wrapper as connected.
+        """
+        # Assuming FastMCP instance is already running and configured.
+        # No specific connect action required for the FastMCP instance itself here,
+        # as its lifecycle (run, stop) is managed outside this wrapper.
+        logger.info(
+            f"OpenAICompatibleMCP: Simulating connection to FastMCP server '{self.mcp.name}'."
+        )
+        self._is_connected = True
+
+    @property
+    def name(self) -> str:
+        """A readable name for the server."""
+        return self.mcp.name
+
+    async def cleanup(self):
+        """Cleanup the server.
+        For FastMCP, cleanup is managed externally. This method marks the wrapper as disconnected.
+        """
+        # Similar to connect, actual server cleanup is external.
+        logger.info(
+            f"OpenAICompatibleMCP: Simulating cleanup for FastMCP server '{self.mcp.name}'."
+        )
+        self._is_connected = False
+
+    async def list_tools(self) -> list[MCPTool]:
+        """List the tools available on the server."""
+        if not self._is_connected:
+            # Or raise an error, depending on desired behavior for disconnected state
+            raise RuntimeError(
+                "OpenAICompatibleMCP.list_tools called while not connected."
+            )
+
+        # FastMCP's get_tools() returns a dict[str, fastmcp.tools.tool.Tool]
+        # Each fastmcp.tools.tool.Tool has a to_mcp_tool(name=key) method
+        # MCPTool is an alias for mcp.types.Tool
+        try:
+            fastmcp_tools = await self.mcp.get_tools()
+            mcp_tools_list = [
+                tool.to_mcp_tool(name=key) for key, tool in fastmcp_tools.items()
+            ]
+            return mcp_tools_list
+        except Exception as e:
+            logger.error(
+                f"Error listing tools from FastMCP server '{self.mcp.name}': {e}",
+                exc_info=True,
+            )
+            raise e
+
+    async def call_tool(
+        self, tool_name: str, arguments: dict[str, Any] | None
+    ) -> CallToolResult:
+        """Invoke a tool on the server."""
+        if not self._is_connected:
+            logger.warning(
+                f"OpenAICompatibleMCP.call_tool '{tool_name}' called while not connected."
+            )
+            # Return an error CallToolResult
+            return CallToolResult(
+                content=[{"type": "text", "text": "Server not connected"}], isError=True
+            )
+
+        try:
+            # FastMCP's _mcp_call_tool is a protected member, but seems to be what we need.
+            # It returns: list[TextContent | ImageContent | EmbeddedResource]
+            # This matches the 'content' part of CallToolResult.
+            # We need to handle potential errors and wrap the result.
+            content = await self.mcp._mcp_call_tool(tool_name, arguments or {})
+            return CallToolResult(content=content, isError=False)
+        except Exception as e:
+            logger.error(
+                f"Error calling tool '{tool_name}' on FastMCP server '{self.mcp.name}': {e}",
+                exc_info=True,
+            )
+            error_message = f"Error calling tool '{tool_name}': {type(e).__name__}: {e}"
+            # Ensure content is a list of valid MCP content items, even for errors.
+            # A TextContent is a safe choice.
+            return CallToolResult(
+                content=[{"type": "text", "text": error_message}], isError=True
+            )
satif_ai-0.2.9/satif_ai/utils/zip.py
@@ -0,0 +1,120 @@
+import asyncio
+import logging
+import zipfile
+from pathlib import Path
+from typing import List, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Constants for ZIP file processing, kept local to this utility or passed as args if needed
+_IGNORED_ZIP_MEMBER_PREFIXES = ("__MACOSX/",)
+_IGNORED_ZIP_FILENAME_PREFIXES = ("._",)
+_IGNORED_ZIP_FILENAMES = (".DS_Store",)
+
+
+async def extract_zip_archive_async(
+    zip_path: Path,
+    extract_to: Path,
+    ignored_member_prefixes: Tuple[str, ...] = _IGNORED_ZIP_MEMBER_PREFIXES,
+    ignored_filename_prefixes: Tuple[str, ...] = _IGNORED_ZIP_FILENAME_PREFIXES,
+    ignored_filenames: Tuple[str, ...] = _IGNORED_ZIP_FILENAMES,
+) -> List[Path]:
+    """
+    Asynchronously extracts a ZIP archive to a specified directory, filtering out ignored files.
+
+    Args:
+        zip_path: Path to the ZIP archive.
+        extract_to: Directory where the contents will be extracted.
+        ignored_member_prefixes: Tuple of member path prefixes to ignore.
+        ignored_filename_prefixes: Tuple of filename prefixes to ignore.
+        ignored_filenames: Tuple of exact filenames to ignore.
+
+    Returns:
+        A list of paths to the successfully extracted files.
+
+    Raises:
+        ValueError: If the zip_path is invalid or corrupted.
+        RuntimeError: If any other error occurs during extraction.
+    """
+
+    def blocking_extract() -> List[Path]:
+        extracted_file_paths = []
+        logger.info(f"Extracting ZIP archive '{zip_path.name}' to '{extract_to}'...")
+        try:
+            extract_to.mkdir(
+                parents=True, exist_ok=True
+            )  # Ensure extract_to directory exists
+
+            with zipfile.ZipFile(zip_path, "r") as zip_ref:
+                # Security: Preliminary check for unsafe paths before extraction
+                for member_name in zip_ref.namelist():
+                    if member_name.startswith(("/", "..")):
+                        logger.error(
+                            f"Skipping potentially unsafe path in ZIP: {member_name}"
+                        )
+                        # Depending on security policy, might raise an error here
+                        continue
+
+                # Extract all members
+                zip_ref.extractall(extract_to)
+
+            # After extractall, collect all *file* paths, applying filters
+            # This second pass of filtering ensures that even if extractall creates them,
+            # we don't return paths to ignored files.
+            for root, _, files in extract_to.walk():
+                for filename in files:
+                    full_path = root / filename
+                    # Create a path relative to 'extract_to' to check against member prefixes
+                    # This ensures that '__MACOSX/file.txt' is correctly ignored,
+                    # not just a top-level '__MACOSX' directory.
+                    try:
+                        relative_path_to_check = full_path.relative_to(extract_to)
+                    except ValueError:
+                        # This can happen if full_path is not under extract_to,
+                        # which ideally shouldn't occur if zip_ref.extractall worked as expected
+                        # and target_path checks were effective.
+                        logger.warning(
+                            f"File {full_path} seems to be outside extraction root {extract_to}. Skipping."
+                        )
+                        continue
+
+                    path_str_to_check_prefixes = str(relative_path_to_check)
+
+                    if not (
+                        any(
+                            path_str_to_check_prefixes.startswith(p)
+                            for p in ignored_member_prefixes
+                        )
+                        or any(
+                            full_path.name.startswith(p)
+                            for p in ignored_filename_prefixes
+                        )
+                        or full_path.name in ignored_filenames
+                    ):
+                        extracted_file_paths.append(full_path)
+                    else:
+                        logger.debug(f"Ignoring file post-extraction: {full_path}")
+
+            if not extracted_file_paths:
+                logger.warning(
+                    f"ZIP archive '{zip_path.name}' is empty or contains no processable files after filtering."
+                )
+            else:
+                logger.info(
+                    f"Successfully extracted {len(extracted_file_paths)} file(s) from '{zip_path.name}'."
+                )
+            return extracted_file_paths
+        except zipfile.BadZipFile as e:
+            logger.error(
+                f"Invalid or corrupted ZIP file: {zip_path.name}", exc_info=True
+            )
+            raise ValueError(f"Invalid or corrupted ZIP file: {zip_path.name}") from e
+        except Exception as e:
+            logger.error(
+                f"Failed to extract ZIP archive '{zip_path.name}': {e}", exc_info=True
+            )
+            raise RuntimeError(
+                f"Unexpected error during ZIP extraction for '{zip_path.name}'"
+            ) from e
+
+    return await asyncio.to_thread(blocking_extract)
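And a small sketch of the `extract_zip_archive_async` utility defined above, with a hypothetical archive path; the default ignore lists (`__MACOSX/`, `._*`, `.DS_Store`) apply unless overridden.

import asyncio
from pathlib import Path

from satif_ai.utils.zip import extract_zip_archive_async

async def main() -> None:
    files = await extract_zip_archive_async(
        Path("./input/archive.zip"),   # hypothetical ZIP archive
        Path("./tmp/extracted"),
    )
    print(f"Extracted {len(files)} file(s)")

asyncio.run(main())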