satif-ai 0.2.8__tar.gz → 0.2.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,19 @@
 Metadata-Version: 2.3
 Name: satif-ai
-Version: 0.2.8
+Version: 0.2.10
 Summary: AI Agents for Satif
 License: MIT
-Author: Bryan Djafer
-Author-email: bryan.djafer@syncpulse.fr
-Requires-Python: >=3.10,<4.0
+Author: Syncpulse
+Maintainer: Bryan Djafer
+Maintainer-email: bryan.djafer@syncpulse.fr
+Requires-Python: >=3.10,<3.14
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: xlsx
 Requires-Dist: openai-agents (>=0.0.9,<0.0.10)
 Requires-Dist: satif-sdk (>=0.1.0,<1.0.0)
 Requires-Dist: sdif-mcp (>=0.1.0,<1.0.0)
@@ -1,14 +1,17 @@
 [project]
 name = "satif-ai"
-version = "0.2.8"
+version = "0.2.10"
 description = "AI Agents for Satif"
 authors = [
+    {name = "Syncpulse"}
+]
+maintainers = [
     {name = "Bryan Djafer", email = "bryan.djafer@syncpulse.fr"}
 ]
 license = "MIT"
 readme = "README.md"

-requires-python = ">=3.10,<4.0"
+requires-python = ">=3.10,<3.14"

 [tool.poetry.dependencies]
 openai-agents = ">=0.0.9,<0.0.10"
@@ -19,6 +22,9 @@ sdif-mcp = ">=0.1.0,<1.0.0"
 requires = ["poetry-core>=2.0.0,<3.0.0"]
 build-backend = "poetry.core.masonry.api"

+[tool.poetry.extras]
+xlsx = ["xlsx-to-sdif"]
+
 [project.scripts]
 satif-ai = "satif.cli:main"

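Packaging note (an inference from the metadata above, not text from the diff): with `Provides-Extra: xlsx` declared and `xlsx = ["xlsx-to-sdif"]` registered under `[tool.poetry.extras]`, downstream users would opt in to the spreadsheet support with something like `pip install "satif-ai[xlsx]"` (or `poetry add satif-ai -E xlsx`), which pulls in the `xlsx-to-sdif` dependency.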
@@ -28,6 +34,7 @@ satif-core = {path = "../core", develop = true}
 satif-sdk = {path = "../sdk", develop = true}
 sdif-mcp = {path = "../mcp", develop = true}
 sdif-db = {path = "../sdif", develop = true}
+xlsx-to-sdif = {path = "../xlsx-to-sdif", develop = true}
 ipykernel = "^6.29.5"


@@ -0,0 +1,19 @@
+from .adapters.tidy import TidyAdapter
+from .standardize import astandardize
+from .standardizers.ai import AIStandardizer
+from .standardizers.ai_csv import AICSVStandardizer
+from .transform import atransform
+from .transformation_builders.syncpulse import SyncpulseTransformationBuilder
+from .utils import OpenAICompatibleMCP, extract_zip_archive_async, merge_sdif_files
+
+__all__ = [
+    "astandardize",
+    "atransform",
+    "TidyAdapter",
+    "AICSVStandardizer",
+    "AIStandardizer",
+    "SyncpulseTransformationBuilder",
+    "OpenAICompatibleMCP",
+    "extract_zip_archive_async",
+    "merge_sdif_files",
+]
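For orientation, a minimal sketch (not part of the released files) of what this new top-level module, which appears to be the package's `__init__`, enables: the flattened public API lets consumers import the main entry points directly from the package root.

from satif_ai import TidyAdapter, astandardize, atransform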
@@ -6,23 +6,19 @@ import shutil
 import sqlite3
 import tempfile
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union

-# MCP and Agent imports
 from agents import Agent, Runner, function_tool
-from agents.mcp.server import MCPServerStdio
+from agents.mcp.server import MCPServer
 from mcp import ClientSession
-
-# SATIF imports
 from satif_core.adapters.base import Adapter
-from satif_core.types import Datasource
+from satif_core.types import Datasource, SDIFPath
 from satif_sdk import SDIFDatabase
 from satif_sdk.adapters.code import AdapterError, CodeAdapter

 logger = logging.getLogger(__name__)


-# --- Tidy Transformation Prompt ---
 TIDY_TRANSFORMATION_PROMPT = """
 You are an expert Data Tidying Agent for SDIF databases.
 Your task is to write Python code to transform tables within a given SDIF database into a 'tidy' format, modifying the database *in place*.
@@ -130,12 +126,11 @@ def adapt_sdif(db: SDIFDatabase) -> None:
 - Ensure pandas and other necessary libraries (like `typing`, `AdapterError`) are imported within the code string if you use them.
 """

-# --- Global context for tools ---
-# These will be set within the TidyAdapter instance when adapt is called
+
 TOOL_CONTEXT = {
     "copied_input_sdif_path": None,
     "temp_dir": None,
-    "current_output_sdif_path": None,  # Path generated by the tool
+    "current_output_sdif_path": None,
 }


@@ -167,13 +162,10 @@ async def execute_tidy_adaptation(code: str) -> str:
         )

    try:
-        # 1. Instantiate CodeAdapter with the provided code
-        #    It will operate on a *copy* specified by copied_input_path
-        #    and write to a *new* file (_adapted suffix by default).
        adapter = CodeAdapter(
            function=code,
-            function_name="adapt_sdif",  # As specified in prompt
-            output_suffix="_adapted_tool_run",  # Give tool runs a distinct suffix
+            function_name="adapt_sdif",
+            output_suffix="_adapted_tool_run",
        )
        # Run the adaptation. It copies `copied_input_path` and modifies the copy.
        # The returned path is the newly created, adapted file.
@@ -232,16 +224,16 @@ class TidyAdapter(Adapter):

    def __init__(
        self,
-        mcp_server: MCPServerStdio,  # Use the server instance
-        mcp_session: ClientSession,  # Use the client session
-        llm_model: str = "o4-mini",  # Specify the LLM model
+        mcp_server: MCPServer,
+        mcp_session: ClientSession,
+        llm_model: str = "o4-mini",
        max_iterations: int = 5,
    ):
        """
        Initialize the TidyAdapter.

        Args:
-            mcp_server: An instance of MCPServerStdio for agent communication.
+            mcp_server: An instance of MCPServer for agent communication.
            mcp_session: An instance of ClientSession for resource/prompt fetching.
            llm_model: Name of the language model to use for the agent.
            max_iterations: Maximum number of attempts the agent gets to refine the code.
@@ -339,12 +331,12 @@ class TidyAdapter(Adapter):
            return code_text.strip()
        return None  # Indicate no valid code found

-    async def adapt(self, sdif_database: SDIFDatabase) -> Datasource:
+    async def adapt(self, sdif: Union[SDIFPath, SDIFDatabase]) -> Datasource:
        """
        Transforms the data in the input SDIF to be tidy using an AI agent.

        Args:
-            sdif_database: The input SDIF database instance. Connection will be closed.
+            sdif: The input SDIF database instance. Connection will be closed.

        Returns:
            Path to the new SDIF file containing the tidied data.
@@ -354,13 +346,16 @@
            RuntimeError: If the agent fails to produce valid tidy code.
            Exception: For unexpected errors during the process.
        """
-        input_path = Path(sdif_database.path)
+        if isinstance(sdif, SDIFDatabase):
+            input_path = Path(sdif.path)
+        else:
+            input_path = Path(sdif)
        if not input_path.exists():
            raise FileNotFoundError(f"Input SDIF file not found: {input_path}")

        # Ensure the input DB connection is closed before copying
        try:
-            sdif_database.close()
+            sdif.close()
        except Exception:
            pass

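Illustrative sketch of the widened adapt() signature (assumptions: `server` is a connected MCP server and `session` an `mcp.ClientSession`, and `./input.sdif` exists; the file name and helper function are hypothetical, only the TidyAdapter API comes from the diff):

from satif_ai.adapters.tidy import TidyAdapter

async def tidy_example(server, session):
    adapter = TidyAdapter(mcp_server=server, mcp_session=session)
    # The adapter now accepts a plain SDIF path...
    tidy_path = await adapter.adapt("./input.sdif")
    # ...or, as before, an already-open satif_sdk SDIFDatabase instance.
    return tidy_path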
@@ -372,17 +367,14 @@
            input_schema_dict = db.get_schema()
            input_sample_dict = db.get_sample_analysis()

-            # Get SDIFDatabase method signatures
            sdif_methods_str = self._get_sdif_methods()

-            # Prepare context for the prompt
            initial_context = {
                "input_schema": json.dumps(input_schema_dict, indent=2),
                "input_sample": json.dumps(input_sample_dict, indent=2),
                "sdif_database_methods": sdif_methods_str,
            }

-            # Instantiate the Agent
            agent = Agent(
                name="Tidy SDIF Adapter Agent",
                mcp_servers=[self.mcp_server],
@@ -390,8 +382,6 @@
                model=self.llm_model,
            )

-            # Run the agent using the Runner
-            # Pass the prompt and initial context
            logger.info(f"Running Tidy Agent with model {self.llm_model}...")
            result = await Runner.run(
                agent,
@@ -409,7 +399,6 @@
                f"Agent finished. Final output message:\n{result.final_output[:500]}..."
            )

-            # Parse the final code from the agent's response
            final_code = self.parse_code(result.final_output)

            if not final_code:
@@ -421,20 +410,16 @@
            logger.info(
                "Successfully parsed final adaptation code from agent response."
            )
-            # print(f"--- Final Code ---\n{final_code}\n------------------") # Debugging

-            # Execute the *final* code using CodeAdapter directly to create the definitive output
            logger.info("Executing final adaptation code...")
            final_adapter = CodeAdapter(
                function=final_code,
                function_name="adapt_sdif",
-                output_suffix="_tidy_final",  # Use a distinct suffix for the final output
+                output_suffix="_tidy_final",
            )
-            # Adapt the *original* copied input path
+
            final_adapted_path = final_adapter.adapt(copied_input_path)

-            # Move the final successful output SDIF to a persistent location
-            # Example: place it next to the original input file
            persistent_output_path = (
                input_path.parent / final_adapted_path.name
            ).resolve()
@@ -444,9 +429,7 @@
            )
            persistent_output_path.unlink()

-            shutil.move(
-                str(final_adapted_path), persistent_output_path
-            )  # Move needs strings sometimes
+            shutil.move(str(final_adapted_path), persistent_output_path)
            logger.info(
                f"Successfully generated final tidy SDIF: {persistent_output_path}"
            )
@@ -455,8 +438,6 @@

        except Exception as e:
            logger.exception(f"Error during TidyAdapter adapt process: {e}")
-            # Re-raise or handle as appropriate
            raise
        finally:
-            # Always clean up temporary files
            self._cleanup_temp_env()
@@ -0,0 +1,112 @@
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+from satif_core.standardizers.base import AsyncStandardizer
+from satif_core.types import Datasource, FilePath, SDIFPath, StandardizationResult
+
+from satif_ai.adapters.tidy import TidyAdapter
+from satif_ai.standardizers.ai import AIStandardizer
+
+
+async def astandardize(
+    datasource: Datasource,
+    output_path: SDIFPath,
+    *,
+    overwrite: bool = False,
+    sdif_schema: Optional[Union[FilePath, Dict[str, Any]]] = None,
+    tidy_adapter: Union[bool, TidyAdapter] = False,
+    config: Optional[Dict[str, Any]] = None,
+    standardizer: Optional[AsyncStandardizer] = None,
+    mcp_server: Optional[Any] = None,
+    mcp_session: Optional[Any] = None,
+    llm_model: Optional[str] = None,
+) -> StandardizationResult:
+    """
+    Asynchronously standardizes a datasource into a single, canonical SDIF SQLite file.
+
+    This function serves as the primary entry point for the SATIF standardization layer.
+    It orchestrates the conversion of various input file formats (e.g., CSV, Excel, PDF)
+    from the provided datasource into a unified SDIF (Standard Data Interchange Format)
+    SQLite file. The process may involve AI-driven parsing, schema adaptation, and
+    data tidying, depending on the configuration and the capabilities of the
+    underlying standardizer.
+
+    Args:
+        datasource: The source of the data to be standardized. This can be a
+            single file path (str or Path), a list of file paths, or other
+            datasource types supported by the chosen standardizer.
+        output_path: The path (str or Path) where the output SDIF SQLite database file
+            will be created (e.g., "./output/my_data.sdif").
+        overwrite: If True, an existing SDIF file at `output_path` will be
+            overwritten. Defaults to False.
+        sdif_schema: Optional. Path to an SDIF schema definition file (e.g., a JSON file)
+            or a dictionary representing the schema. If provided, the
+            standardization process (specifically if using the default
+            `AIStandardizer`) may attempt to adapt the data to this
+            target schema.
+        tidy_adapter: Optional. If True, a default `TidyAdapter` may be used.
+            Alternatively, a specific `TidyAdapter` instance can be provided
+            to perform data tidying processes (e.g., cleaning, normalization,
+            restructuring tables). If False (default), no explicit tidying
+            step is initiated by this top-level function, though underlying
+            standardizers might perform their own internal tidying.
+            The specifics depend on the standardizer's capabilities.
+        config: Optional. A dictionary for advanced or standardizer-specific
+            configurations. This config is passed directly to the
+            `standardize` method of the chosen standardizer.
+        standardizer: Optional. An instance of an `AsyncStandardizer` subclass.
+            If provided, this instance will be used for standardization.
+            This allows for using pre-configured or custom standardizers.
+            If None, a default `AIStandardizer` is instantiated using
+            `mcp_server`, `mcp_session`, `llm_model`, `sdif_schema`,
+            and `tidy_adapter`.
+        mcp_server: Optional. The MCP (Model Coordination Platform) server instance.
+            Used if `standardizer` is None for the default `AIStandardizer`.
+        mcp_session: Optional. The MCP session or transport object.
+            Used if `standardizer` is None for the default `AIStandardizer`.
+        llm_model: Optional. The language model to be used by the default `AIStandardizer`
+            if no `standardizer` instance is provided (e.g., "gpt-4o").
+            Each standardizer may have its own default model.
+
+    Returns:
+        A `StandardizationResult` object containing:
+            - `output_path`: The absolute `Path` to the created or updated SDIF database file.
+            - `file_configs`: An optional dictionary detailing configurations used for
+                each processed input file, if applicable and returned by
+                the standardizer.
+
+    Raises:
+        FileNotFoundError: If the `datasource` (or parts of it) does not exist.
+        FileExistsError: If `output_path` exists and `overwrite` is False.
+        ValueError: If input arguments are invalid (e.g., unsupported datasource type).
+        RuntimeError: For general errors during the standardization process.
+            Specific exceptions may also be raised by the underlying
+            standardizer implementation.
+    """
+    if standardizer is None:
+        standardizer = AIStandardizer(
+            mcp_server=mcp_server,
+            mcp_session=mcp_session,
+            llm_model=llm_model,
+            sdif_schema=sdif_schema,
+            tidy_adapter=tidy_adapter
+            if isinstance(tidy_adapter, TidyAdapter)
+            else (TidyAdapter() if tidy_adapter else None),
+        )
+
+    result = await standardizer.standardize(
+        datasource=datasource,
+        output_path=output_path,
+        overwrite=overwrite,
+        config=config,
+    )
+
+    output_sdif_path = (
+        Path(result.output_path)
+        if isinstance(result.output_path, str)
+        else result.output_path
+    )
+
+    return StandardizationResult(
+        output_path=output_sdif_path, file_configs=result.file_configs
+    )
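A minimal usage sketch for the new astandardize entry point (assumptions: `mcp_server` and `mcp_session` are already-initialized MCP objects, `./invoices.csv` exists, and the model name is only the example given in the docstring; this snippet does not ship with the package):

from satif_ai import astandardize

async def standardize_example(mcp_server, mcp_session):
    result = await astandardize(
        datasource=["./invoices.csv"],          # one or more input files
        output_path="./output/invoices.sdif",   # SDIF SQLite file to create
        overwrite=True,
        mcp_server=mcp_server,
        mcp_session=mcp_session,
        llm_model="gpt-4o",                     # example model name from the docstring
    )
    print(result.output_path)                   # absolute Path to the SDIF database
    return result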