satif-ai 0.2.8__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- satif_ai/__init__.py +19 -0
- satif_ai/adapters/tidy.py +19 -38
- satif_ai/standardize.py +112 -0
- satif_ai/standardizers/ai.py +485 -0
- satif_ai/standardizers/ai_csv.py +1 -1
- satif_ai/transform.py +121 -0
- satif_ai/{code_builders/transformation.py → transformation_builders/syncpulse.py} +22 -29
- satif_ai/utils/__init__.py +5 -0
- satif_ai/utils/merge_sdif.py +22 -0
- satif_ai/utils/openai_mcp.py +97 -0
- satif_ai/utils/zip.py +120 -0
- {satif_ai-0.2.8.dist-info → satif_ai-0.2.9.dist-info}/METADATA +4 -3
- satif_ai-0.2.9.dist-info/RECORD +19 -0
- satif_ai/code_builders/adaptation.py +0 -9
- satif_ai-0.2.8.dist-info/RECORD +0 -13
- /satif_ai/{code_builders → transformation_builders}/__init__.py +0 -0
- {satif_ai-0.2.8.dist-info → satif_ai-0.2.9.dist-info}/LICENSE +0 -0
- {satif_ai-0.2.8.dist-info → satif_ai-0.2.9.dist-info}/WHEEL +0 -0
- {satif_ai-0.2.8.dist-info → satif_ai-0.2.9.dist-info}/entry_points.txt +0 -0
satif_ai/__init__.py
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
from .adapters.tidy import TidyAdapter
|
2
|
+
from .standardize import astandardize
|
3
|
+
from .standardizers.ai import AIStandardizer
|
4
|
+
from .standardizers.ai_csv import AICSVStandardizer
|
5
|
+
from .transform import atransform
|
6
|
+
from .transformation_builders.syncpulse import SyncpulseTransformationBuilder
|
7
|
+
from .utils import OpenAICompatibleMCP, extract_zip_archive_async, merge_sdif_files
|
8
|
+
|
9
|
+
__all__ = [
|
10
|
+
"astandardize",
|
11
|
+
"atransform",
|
12
|
+
"TidyAdapter",
|
13
|
+
"AICSVStandardizer",
|
14
|
+
"AIStandardizer",
|
15
|
+
"SyncpulseTransformationBuilder",
|
16
|
+
"OpenAICompatibleMCP",
|
17
|
+
"extract_zip_archive_async",
|
18
|
+
"merge_sdif_files",
|
19
|
+
]
|
satif_ai/adapters/tidy.py
CHANGED
@@ -6,23 +6,19 @@ import shutil
|
|
6
6
|
import sqlite3
|
7
7
|
import tempfile
|
8
8
|
from pathlib import Path
|
9
|
-
from typing import Optional
|
9
|
+
from typing import Optional, Union
|
10
10
|
|
11
|
-
# MCP and Agent imports
|
12
11
|
from agents import Agent, Runner, function_tool
|
13
12
|
from agents.mcp.server import MCPServerStdio
|
14
13
|
from mcp import ClientSession
|
15
|
-
|
16
|
-
# SATIF imports
|
17
14
|
from satif_core.adapters.base import Adapter
|
18
|
-
from satif_core.types import Datasource
|
15
|
+
from satif_core.types import Datasource, SDIFPath
|
19
16
|
from satif_sdk import SDIFDatabase
|
20
17
|
from satif_sdk.adapters.code import AdapterError, CodeAdapter
|
21
18
|
|
22
19
|
logger = logging.getLogger(__name__)
|
23
20
|
|
24
21
|
|
25
|
-
# --- Tidy Transformation Prompt ---
|
26
22
|
TIDY_TRANSFORMATION_PROMPT = """
|
27
23
|
You are an expert Data Tidying Agent for SDIF databases.
|
28
24
|
Your task is to write Python code to transform tables within a given SDIF database into a 'tidy' format, modifying the database *in place*.
|
@@ -130,12 +126,11 @@ def adapt_sdif(db: SDIFDatabase) -> None:
|
|
130
126
|
- Ensure pandas and other necessary libraries (like `typing`, `AdapterError`) are imported within the code string if you use them.
|
131
127
|
"""
|
132
128
|
|
133
|
-
|
134
|
-
# These will be set within the TidyAdapter instance when adapt is called
|
129
|
+
|
135
130
|
TOOL_CONTEXT = {
|
136
131
|
"copied_input_sdif_path": None,
|
137
132
|
"temp_dir": None,
|
138
|
-
"current_output_sdif_path": None,
|
133
|
+
"current_output_sdif_path": None,
|
139
134
|
}
|
140
135
|
|
141
136
|
|
@@ -167,13 +162,10 @@ async def execute_tidy_adaptation(code: str) -> str:
|
|
167
162
|
)
|
168
163
|
|
169
164
|
try:
|
170
|
-
# 1. Instantiate CodeAdapter with the provided code
|
171
|
-
# It will operate on a *copy* specified by copied_input_path
|
172
|
-
# and write to a *new* file (_adapted suffix by default).
|
173
165
|
adapter = CodeAdapter(
|
174
166
|
function=code,
|
175
|
-
function_name="adapt_sdif",
|
176
|
-
output_suffix="_adapted_tool_run",
|
167
|
+
function_name="adapt_sdif",
|
168
|
+
output_suffix="_adapted_tool_run",
|
177
169
|
)
|
178
170
|
# Run the adaptation. It copies `copied_input_path` and modifies the copy.
|
179
171
|
# The returned path is the newly created, adapted file.
|
@@ -232,9 +224,9 @@ class TidyAdapter(Adapter):
|
|
232
224
|
|
233
225
|
def __init__(
|
234
226
|
self,
|
235
|
-
mcp_server: MCPServerStdio,
|
236
|
-
mcp_session: ClientSession,
|
237
|
-
llm_model: str = "o4-mini",
|
227
|
+
mcp_server: MCPServerStdio,
|
228
|
+
mcp_session: ClientSession,
|
229
|
+
llm_model: str = "o4-mini",
|
238
230
|
max_iterations: int = 5,
|
239
231
|
):
|
240
232
|
"""
|
@@ -339,12 +331,12 @@ class TidyAdapter(Adapter):
|
|
339
331
|
return code_text.strip()
|
340
332
|
return None # Indicate no valid code found
|
341
333
|
|
342
|
-
async def adapt(self,
|
334
|
+
async def adapt(self, sdif: Union[SDIFPath, SDIFDatabase]) -> Datasource:
|
343
335
|
"""
|
344
336
|
Transforms the data in the input SDIF to be tidy using an AI agent.
|
345
337
|
|
346
338
|
Args:
|
347
|
-
|
339
|
+
sdif: The input SDIF database instance. Connection will be closed.
|
348
340
|
|
349
341
|
Returns:
|
350
342
|
Path to the new SDIF file containing the tidied data.
|
@@ -354,13 +346,16 @@ class TidyAdapter(Adapter):
|
|
354
346
|
RuntimeError: If the agent fails to produce valid tidy code.
|
355
347
|
Exception: For unexpected errors during the process.
|
356
348
|
"""
|
357
|
-
|
349
|
+
if isinstance(sdif, SDIFDatabase):
|
350
|
+
input_path = Path(sdif.path)
|
351
|
+
else:
|
352
|
+
input_path = sdif
|
358
353
|
if not input_path.exists():
|
359
354
|
raise FileNotFoundError(f"Input SDIF file not found: {input_path}")
|
360
355
|
|
361
356
|
# Ensure the input DB connection is closed before copying
|
362
357
|
try:
|
363
|
-
|
358
|
+
sdif.close()
|
364
359
|
except Exception:
|
365
360
|
pass
|
366
361
|
|
@@ -372,17 +367,14 @@ class TidyAdapter(Adapter):
|
|
372
367
|
input_schema_dict = db.get_schema()
|
373
368
|
input_sample_dict = db.get_sample_analysis()
|
374
369
|
|
375
|
-
# Get SDIFDatabase method signatures
|
376
370
|
sdif_methods_str = self._get_sdif_methods()
|
377
371
|
|
378
|
-
# Prepare context for the prompt
|
379
372
|
initial_context = {
|
380
373
|
"input_schema": json.dumps(input_schema_dict, indent=2),
|
381
374
|
"input_sample": json.dumps(input_sample_dict, indent=2),
|
382
375
|
"sdif_database_methods": sdif_methods_str,
|
383
376
|
}
|
384
377
|
|
385
|
-
# Instantiate the Agent
|
386
378
|
agent = Agent(
|
387
379
|
name="Tidy SDIF Adapter Agent",
|
388
380
|
mcp_servers=[self.mcp_server],
|
@@ -390,8 +382,6 @@ class TidyAdapter(Adapter):
|
|
390
382
|
model=self.llm_model,
|
391
383
|
)
|
392
384
|
|
393
|
-
# Run the agent using the Runner
|
394
|
-
# Pass the prompt and initial context
|
395
385
|
logger.info(f"Running Tidy Agent with model {self.llm_model}...")
|
396
386
|
result = await Runner.run(
|
397
387
|
agent,
|
@@ -409,7 +399,6 @@ class TidyAdapter(Adapter):
|
|
409
399
|
f"Agent finished. Final output message:\n{result.final_output[:500]}..."
|
410
400
|
)
|
411
401
|
|
412
|
-
# Parse the final code from the agent's response
|
413
402
|
final_code = self.parse_code(result.final_output)
|
414
403
|
|
415
404
|
if not final_code:
|
@@ -421,20 +410,16 @@ class TidyAdapter(Adapter):
|
|
421
410
|
logger.info(
|
422
411
|
"Successfully parsed final adaptation code from agent response."
|
423
412
|
)
|
424
|
-
# print(f"--- Final Code ---\n{final_code}\n------------------") # Debugging
|
425
413
|
|
426
|
-
# Execute the *final* code using CodeAdapter directly to create the definitive output
|
427
414
|
logger.info("Executing final adaptation code...")
|
428
415
|
final_adapter = CodeAdapter(
|
429
416
|
function=final_code,
|
430
417
|
function_name="adapt_sdif",
|
431
|
-
output_suffix="_tidy_final",
|
418
|
+
output_suffix="_tidy_final",
|
432
419
|
)
|
433
|
-
|
420
|
+
|
434
421
|
final_adapted_path = final_adapter.adapt(copied_input_path)
|
435
422
|
|
436
|
-
# Move the final successful output SDIF to a persistent location
|
437
|
-
# Example: place it next to the original input file
|
438
423
|
persistent_output_path = (
|
439
424
|
input_path.parent / final_adapted_path.name
|
440
425
|
).resolve()
|
@@ -444,9 +429,7 @@ class TidyAdapter(Adapter):
|
|
444
429
|
)
|
445
430
|
persistent_output_path.unlink()
|
446
431
|
|
447
|
-
shutil.move(
|
448
|
-
str(final_adapted_path), persistent_output_path
|
449
|
-
) # Move needs strings sometimes
|
432
|
+
shutil.move(str(final_adapted_path), persistent_output_path)
|
450
433
|
logger.info(
|
451
434
|
f"Successfully generated final tidy SDIF: {persistent_output_path}"
|
452
435
|
)
|
@@ -455,8 +438,6 @@ class TidyAdapter(Adapter):
|
|
455
438
|
|
456
439
|
except Exception as e:
|
457
440
|
logger.exception(f"Error during TidyAdapter adapt process: {e}")
|
458
|
-
# Re-raise or handle as appropriate
|
459
441
|
raise
|
460
442
|
finally:
|
461
|
-
# Always clean up temporary files
|
462
443
|
self._cleanup_temp_env()
|
satif_ai/standardize.py
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Any, Dict, Optional, Union
|
3
|
+
|
4
|
+
from satif_core.standardizers.base import AsyncStandardizer
|
5
|
+
from satif_core.types import Datasource, FilePath, SDIFPath, StandardizationResult
|
6
|
+
|
7
|
+
from satif_ai.adapters.tidy import TidyAdapter
|
8
|
+
from satif_ai.standardizers.ai import AIStandardizer
|
9
|
+
|
10
|
+
|
11
|
+
async def astandardize(
    datasource: Datasource,
    output_path: SDIFPath,
    *,
    overwrite: bool = False,
    sdif_schema: Optional[Union[FilePath, Dict[str, Any]]] = None,
    tidy_adapter: Union[bool, TidyAdapter] = False,
    config: Optional[Dict[str, Any]] = None,
    standardizer: Optional[AsyncStandardizer] = None,
    mcp_server: Optional[Any] = None,
    mcp_session: Optional[Any] = None,
    llm_model: Optional[str] = None,
) -> StandardizationResult:
    """
    Asynchronously standardizes a datasource into a single, canonical SDIF SQLite file.

    This function serves as the primary entry point for the SATIF standardization layer.
    It orchestrates the conversion of various input file formats (e.g., CSV, Excel, PDF)
    from the provided datasource into a unified SDIF (Standard Data Interchange Format)
    SQLite file. The process may involve AI-driven parsing, schema adaptation, and
    data tidying, depending on the configuration and the capabilities of the
    underlying standardizer.

    Args:
        datasource: The source of the data to be standardized. This can be a
                    single file path (str or Path), a list of file paths, or other
                    datasource types supported by the chosen standardizer.
        output_path: The path (str or Path) where the output SDIF SQLite database file
                     will be created (e.g., "./output/my_data.sdif").
        overwrite: If True, an existing SDIF file at `output_path` will be
                   overwritten. Defaults to False.
        sdif_schema: Optional. Path to an SDIF schema definition file (e.g., a JSON file)
                     or a dictionary representing the schema. Only forwarded to the
                     default `AIStandardizer`; ignored when `standardizer` is given.
        tidy_adapter: Optional. If a `TidyAdapter` instance is given it is used as-is.
                      If True, a default `TidyAdapter` is built from `mcp_server`,
                      `mcp_session` and (when set) `llm_model`. If False (default),
                      no tidy adapter is handed to the default standardizer.
                      Ignored when `standardizer` is given.
        config: Optional. A dictionary for advanced or standardizer-specific
                configurations. This config is passed directly to the
                `standardize` method of the chosen standardizer.
        standardizer: Optional. An instance of an `AsyncStandardizer` subclass.
                      If provided, this instance will be used for standardization.
                      If None, a default `AIStandardizer` is instantiated using
                      `mcp_server`, `mcp_session`, `llm_model`, `sdif_schema`,
                      and `tidy_adapter`.
        mcp_server: Optional. The MCP (Model Context Protocol) server instance.
                    Used if `standardizer` is None for the default `AIStandardizer`
                    (and for the default `TidyAdapter` when `tidy_adapter` is True).
        mcp_session: Optional. The MCP client session object.
                     Used if `standardizer` is None for the default `AIStandardizer`
                     (and for the default `TidyAdapter` when `tidy_adapter` is True).
        llm_model: Optional. The language model to be used by the default `AIStandardizer`
                   if no `standardizer` instance is provided (e.g., "gpt-4o").
                   When None, the default `TidyAdapter` keeps its own default model.

    Returns:
        A `StandardizationResult` object containing:
            - `output_path`: The `Path` to the SDIF database file as reported by
                             the standardizer (string results are wrapped in `Path`;
                             the path is not resolved here).
            - `file_configs`: The `file_configs` value returned by the standardizer,
                              passed through unchanged.

    Raises:
        Exceptions are not raised by this function directly; whatever the
        standardizer (or the `AIStandardizer` / `TidyAdapter` constructors) raises
        propagates to the caller, e.g. FileNotFoundError, FileExistsError,
        ValueError or RuntimeError depending on the implementation.
    """
    if standardizer is None:
        if isinstance(tidy_adapter, TidyAdapter):
            # Caller supplied a ready-made adapter: use it untouched.
            resolved_tidy_adapter: Optional[TidyAdapter] = tidy_adapter
        elif tidy_adapter:
            # TidyAdapter.__init__ requires mcp_server and mcp_session, so a bare
            # TidyAdapter() call raises TypeError; reuse the handles given here.
            tidy_kwargs: Dict[str, Any] = {
                "mcp_server": mcp_server,
                "mcp_session": mcp_session,
            }
            if llm_model is not None:
                # Only override the adapter's own default model when asked to.
                tidy_kwargs["llm_model"] = llm_model
            resolved_tidy_adapter = TidyAdapter(**tidy_kwargs)
        else:
            resolved_tidy_adapter = None

        standardizer = AIStandardizer(
            mcp_server=mcp_server,
            mcp_session=mcp_session,
            llm_model=llm_model,
            sdif_schema=sdif_schema,
            tidy_adapter=resolved_tidy_adapter,
        )

    result = await standardizer.standardize(
        datasource=datasource,
        output_path=output_path,
        overwrite=overwrite,
        config=config,
    )

    # Normalise a string path to a Path so callers always get the same type.
    output_sdif_path = (
        Path(result.output_path)
        if isinstance(result.output_path, str)
        else result.output_path
    )

    return StandardizationResult(
        output_path=output_sdif_path, file_configs=result.file_configs
    )
|