satif-ai 0.2.8__tar.gz → 0.2.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {satif_ai-0.2.8 → satif_ai-0.2.9}/PKG-INFO +4 -3
- {satif_ai-0.2.8 → satif_ai-0.2.9}/pyproject.toml +4 -1
- satif_ai-0.2.9/satif_ai/__init__.py +19 -0
- {satif_ai-0.2.8 → satif_ai-0.2.9}/satif_ai/adapters/tidy.py +19 -38
- satif_ai-0.2.9/satif_ai/standardize.py +112 -0
- satif_ai-0.2.9/satif_ai/standardizers/ai.py +485 -0
- {satif_ai-0.2.8 → satif_ai-0.2.9}/satif_ai/standardizers/ai_csv.py +1 -1
- satif_ai-0.2.9/satif_ai/transform.py +121 -0
- satif_ai-0.2.8/satif_ai/code_builders/transformation.py → satif_ai-0.2.9/satif_ai/transformation_builders/syncpulse.py +22 -29
- satif_ai-0.2.9/satif_ai/utils/__init__.py +5 -0
- satif_ai-0.2.9/satif_ai/utils/merge_sdif.py +22 -0
- satif_ai-0.2.9/satif_ai/utils/openai_mcp.py +97 -0
- satif_ai-0.2.9/satif_ai/utils/zip.py +120 -0
- satif_ai-0.2.8/satif_ai/code_builders/adaptation.py +0 -9
- satif_ai-0.2.8/satif_ai/standardizers/__init__.py +0 -0
- {satif_ai-0.2.8 → satif_ai-0.2.9}/LICENSE +0 -0
- {satif_ai-0.2.8 → satif_ai-0.2.9}/README.md +0 -0
- {satif_ai-0.2.8/satif_ai → satif_ai-0.2.9/satif_ai/adapters}/__init__.py +0 -0
- {satif_ai-0.2.8/satif_ai/adapters → satif_ai-0.2.9/satif_ai/standardizers}/__init__.py +0 -0
- {satif_ai-0.2.8/satif_ai/code_builders → satif_ai-0.2.9/satif_ai/transformation_builders}/__init__.py +0 -0
{satif_ai-0.2.8 → satif_ai-0.2.9}/PKG-INFO
@@ -1,10 +1,11 @@
 Metadata-Version: 2.3
 Name: satif-ai
-Version: 0.2.8
+Version: 0.2.9
 Summary: AI Agents for Satif
 License: MIT
-Author:
-
+Author: Syncpulse
+Maintainer: Bryan Djafer
+Maintainer-email: bryan.djafer@syncpulse.fr
 Requires-Python: >=3.10,<4.0
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
satif_ai-0.2.9/satif_ai/__init__.py
@@ -0,0 +1,19 @@
+from .adapters.tidy import TidyAdapter
+from .standardize import astandardize
+from .standardizers.ai import AIStandardizer
+from .standardizers.ai_csv import AICSVStandardizer
+from .transform import atransform
+from .transformation_builders.syncpulse import SyncpulseTransformationBuilder
+from .utils import OpenAICompatibleMCP, extract_zip_archive_async, merge_sdif_files
+
+__all__ = [
+    "astandardize",
+    "atransform",
+    "TidyAdapter",
+    "AICSVStandardizer",
+    "AIStandardizer",
+    "SyncpulseTransformationBuilder",
+    "OpenAICompatibleMCP",
+    "extract_zip_archive_async",
+    "merge_sdif_files",
+]
{satif_ai-0.2.8 → satif_ai-0.2.9}/satif_ai/adapters/tidy.py
@@ -6,23 +6,19 @@ import shutil
 import sqlite3
 import tempfile
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 
-# MCP and Agent imports
 from agents import Agent, Runner, function_tool
 from agents.mcp.server import MCPServerStdio
 from mcp import ClientSession
-
-# SATIF imports
 from satif_core.adapters.base import Adapter
-from satif_core.types import Datasource
+from satif_core.types import Datasource, SDIFPath
 from satif_sdk import SDIFDatabase
 from satif_sdk.adapters.code import AdapterError, CodeAdapter
 
 logger = logging.getLogger(__name__)
 
 
-# --- Tidy Transformation Prompt ---
 TIDY_TRANSFORMATION_PROMPT = """
 You are an expert Data Tidying Agent for SDIF databases.
 Your task is to write Python code to transform tables within a given SDIF database into a 'tidy' format, modifying the database *in place*.
@@ -130,12 +126,11 @@ def adapt_sdif(db: SDIFDatabase) -> None:
 - Ensure pandas and other necessary libraries (like `typing`, `AdapterError`) are imported within the code string if you use them.
 """
 
-
-# These will be set within the TidyAdapter instance when adapt is called
+
 TOOL_CONTEXT = {
     "copied_input_sdif_path": None,
     "temp_dir": None,
-    "current_output_sdif_path": None,
+    "current_output_sdif_path": None,
 }
 
 
@@ -167,13 +162,10 @@ async def execute_tidy_adaptation(code: str) -> str:
         )
 
     try:
-        # 1. Instantiate CodeAdapter with the provided code
-        # It will operate on a *copy* specified by copied_input_path
-        # and write to a *new* file (_adapted suffix by default).
        adapter = CodeAdapter(
            function=code,
-           function_name="adapt_sdif",
-           output_suffix="_adapted_tool_run",
+           function_name="adapt_sdif",
+           output_suffix="_adapted_tool_run",
        )
        # Run the adaptation. It copies `copied_input_path` and modifies the copy.
        # The returned path is the newly created, adapted file.
@@ -232,9 +224,9 @@ class TidyAdapter(Adapter):
 
     def __init__(
         self,
-        mcp_server: MCPServerStdio,
-        mcp_session: ClientSession,
-        llm_model: str = "o4-mini",
+        mcp_server: MCPServerStdio,
+        mcp_session: ClientSession,
+        llm_model: str = "o4-mini",
         max_iterations: int = 5,
     ):
         """
@@ -339,12 +331,12 @@ class TidyAdapter(Adapter):
             return code_text.strip()
         return None  # Indicate no valid code found
 
-    async def adapt(self,
+    async def adapt(self, sdif: Union[SDIFPath, SDIFDatabase]) -> Datasource:
         """
         Transforms the data in the input SDIF to be tidy using an AI agent.
 
         Args:
-
+            sdif: The input SDIF database instance. Connection will be closed.
 
         Returns:
             Path to the new SDIF file containing the tidied data.
@@ -354,13 +346,16 @@ class TidyAdapter(Adapter):
             RuntimeError: If the agent fails to produce valid tidy code.
             Exception: For unexpected errors during the process.
         """
-
+        if isinstance(sdif, SDIFDatabase):
+            input_path = Path(sdif.path)
+        else:
+            input_path = sdif
         if not input_path.exists():
             raise FileNotFoundError(f"Input SDIF file not found: {input_path}")
 
         # Ensure the input DB connection is closed before copying
         try:
-
+            sdif.close()
         except Exception:
             pass
 
@@ -372,17 +367,14 @@ class TidyAdapter(Adapter):
             input_schema_dict = db.get_schema()
             input_sample_dict = db.get_sample_analysis()
 
-            # Get SDIFDatabase method signatures
             sdif_methods_str = self._get_sdif_methods()
 
-            # Prepare context for the prompt
             initial_context = {
                 "input_schema": json.dumps(input_schema_dict, indent=2),
                 "input_sample": json.dumps(input_sample_dict, indent=2),
                 "sdif_database_methods": sdif_methods_str,
             }
 
-            # Instantiate the Agent
             agent = Agent(
                 name="Tidy SDIF Adapter Agent",
                 mcp_servers=[self.mcp_server],
@@ -390,8 +382,6 @@ class TidyAdapter(Adapter):
                 model=self.llm_model,
             )
 
-            # Run the agent using the Runner
-            # Pass the prompt and initial context
             logger.info(f"Running Tidy Agent with model {self.llm_model}...")
             result = await Runner.run(
                 agent,
@@ -409,7 +399,6 @@ class TidyAdapter(Adapter):
                 f"Agent finished. Final output message:\n{result.final_output[:500]}..."
             )
 
-            # Parse the final code from the agent's response
             final_code = self.parse_code(result.final_output)
 
             if not final_code:
@@ -421,20 +410,16 @@ class TidyAdapter(Adapter):
             logger.info(
                 "Successfully parsed final adaptation code from agent response."
             )
-            # print(f"--- Final Code ---\n{final_code}\n------------------") # Debugging
 
-            # Execute the *final* code using CodeAdapter directly to create the definitive output
             logger.info("Executing final adaptation code...")
             final_adapter = CodeAdapter(
                 function=final_code,
                 function_name="adapt_sdif",
-                output_suffix="_tidy_final",
+                output_suffix="_tidy_final",
             )
-
+
             final_adapted_path = final_adapter.adapt(copied_input_path)
 
-            # Move the final successful output SDIF to a persistent location
-            # Example: place it next to the original input file
             persistent_output_path = (
                 input_path.parent / final_adapted_path.name
             ).resolve()
@@ -444,9 +429,7 @@ class TidyAdapter(Adapter):
                 )
                 persistent_output_path.unlink()
 
-            shutil.move(
-                str(final_adapted_path), persistent_output_path
-            )  # Move needs strings sometimes
+            shutil.move(str(final_adapted_path), persistent_output_path)
             logger.info(
                 f"Successfully generated final tidy SDIF: {persistent_output_path}"
             )
@@ -455,8 +438,6 @@ class TidyAdapter(Adapter):
 
         except Exception as e:
             logger.exception(f"Error during TidyAdapter adapt process: {e}")
-            # Re-raise or handle as appropriate
             raise
         finally:
-            # Always clean up temporary files
             self._cleanup_temp_env()
satif_ai-0.2.9/satif_ai/standardize.py
@@ -0,0 +1,112 @@
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+from satif_core.standardizers.base import AsyncStandardizer
+from satif_core.types import Datasource, FilePath, SDIFPath, StandardizationResult
+
+from satif_ai.adapters.tidy import TidyAdapter
+from satif_ai.standardizers.ai import AIStandardizer
+
+
+async def astandardize(
+    datasource: Datasource,
+    output_path: SDIFPath,
+    *,
+    overwrite: bool = False,
+    sdif_schema: Optional[Union[FilePath, Dict[str, Any]]] = None,
+    tidy_adapter: Union[bool, TidyAdapter] = False,
+    config: Optional[Dict[str, Any]] = None,
+    standardizer: Optional[AsyncStandardizer] = None,
+    mcp_server: Optional[Any] = None,
+    mcp_session: Optional[Any] = None,
+    llm_model: Optional[str] = None,
+) -> StandardizationResult:
+    """
+    Asynchronously standardizes a datasource into a single, canonical SDIF SQLite file.
+
+    This function serves as the primary entry point for the SATIF standardization layer.
+    It orchestrates the conversion of various input file formats (e.g., CSV, Excel, PDF)
+    from the provided datasource into a unified SDIF (Standard Data Interchange Format)
+    SQLite file. The process may involve AI-driven parsing, schema adaptation, and
+    data tidying, depending on the configuration and the capabilities of the
+    underlying standardizer.
+
+    Args:
+        datasource: The source of the data to be standardized. This can be a
+            single file path (str or Path), a list of file paths, or other
+            datasource types supported by the chosen standardizer.
+        output_path: The path (str or Path) where the output SDIF SQLite database file
+            will be created (e.g., "./output/my_data.sdif").
+        overwrite: If True, an existing SDIF file at `output_path` will be
+            overwritten. Defaults to False.
+        sdif_schema: Optional. Path to an SDIF schema definition file (e.g., a JSON file)
+            or a dictionary representing the schema. If provided, the
+            standardization process (specifically if using the default
+            `AIStandardizer`) may attempt to adapt the data to this
+            target schema.
+        tidy_adapter: Optional. If True, a default `TidyAdapter` may be used.
+            Alternatively, a specific `TidyAdapter` instance can be provided
+            to perform data tidying processes (e.g., cleaning, normalization,
+            restructuring tables). If False (default), no explicit tidying
+            step is initiated by this top-level function, though underlying
+            standardizers might perform their own internal tidying.
+            The specifics depend on the standardizer's capabilities.
+        config: Optional. A dictionary for advanced or standardizer-specific
+            configurations. This config is passed directly to the
+            `standardize` method of the chosen standardizer.
+        standardizer: Optional. An instance of an `AsyncStandardizer` subclass.
+            If provided, this instance will be used for standardization.
+            This allows for using pre-configured or custom standardizers.
+            If None, a default `AIStandardizer` is instantiated using
+            `mcp_server`, `mcp_session`, `llm_model`, `sdif_schema`,
+            and `tidy_adapter`.
+        mcp_server: Optional. The MCP (Model Coordination Platform) server instance.
+            Used if `standardizer` is None for the default `AIStandardizer`.
+        mcp_session: Optional. The MCP session or transport object.
+            Used if `standardizer` is None for the default `AIStandardizer`.
+        llm_model: Optional. The language model to be used by the default `AIStandardizer`
+            if no `standardizer` instance is provided (e.g., "gpt-4o").
+            Each standardizer may have its own default model.
+
+    Returns:
+        A `StandardizationResult` object containing:
+            - `output_path`: The absolute `Path` to the created or updated SDIF database file.
+            - `file_configs`: An optional dictionary detailing configurations used for
+              each processed input file, if applicable and returned by
+              the standardizer.
+
+    Raises:
+        FileNotFoundError: If the `datasource` (or parts of it) does not exist.
+        FileExistsError: If `output_path` exists and `overwrite` is False.
+        ValueError: If input arguments are invalid (e.g., unsupported datasource type).
+        RuntimeError: For general errors during the standardization process.
+            Specific exceptions may also be raised by the underlying
+            standardizer implementation.
+    """
+    if standardizer is None:
+        standardizer = AIStandardizer(
+            mcp_server=mcp_server,
+            mcp_session=mcp_session,
+            llm_model=llm_model,
+            sdif_schema=sdif_schema,
+            tidy_adapter=tidy_adapter
+            if isinstance(tidy_adapter, TidyAdapter)
+            else (TidyAdapter() if tidy_adapter else None),
+        )
+
+    result = await standardizer.standardize(
+        datasource=datasource,
+        output_path=output_path,
+        overwrite=overwrite,
+        config=config,
+    )
+
+    output_sdif_path = (
+        Path(result.output_path)
+        if isinstance(result.output_path, str)
+        else result.output_path
+    )
+
+    return StandardizationResult(
+        output_path=output_sdif_path, file_configs=result.file_configs
+    )
satif_ai-0.2.9/satif_ai/standardizers/ai.py
@@ -0,0 +1,485 @@
+import asyncio
+import logging
+import shutil
+import tempfile
+import uuid
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+from satif_core.standardizers.base import AsyncStandardizer
+from satif_core.types import Datasource, FilePath, SDIFPath, StandardizationResult
+
+from satif_ai.adapters.tidy import TidyAdapter
+from satif_ai.utils.merge_sdif import merge_sdif_files
+from satif_ai.utils.zip import extract_zip_archive_async
+
+from .ai_csv import AICSVStandardizer
+
+logger = logging.getLogger(__name__)
+
+
+class AIStandardizer(AsyncStandardizer):
+    """
+    Orchestrates the standardization of various file types using specialized AI standardizers.
+    It processes a datasource, which can include individual files or ZIP archives.
+    Files are dispatched to appropriate AI agents (e.g., AICSVStandardizer),
+    and their SDIF outputs are merged into a single, final SDIF.
+    """
+
+    def __init__(
+        self,
+        mcp_server: Optional[Any] = None,
+        mcp_session: Optional[Any] = None,
+        llm_model: Optional[str] = None,
+        sdif_schema: Optional[Union[FilePath, Dict[str, Any]]] = None,
+        tidy_adapter: Optional[TidyAdapter] = None,
+    ):
+        self.mcp_server = mcp_server
+        self.mcp_session = mcp_session
+        self.llm_model = llm_model
+        self.sdif_schema = sdif_schema  # TODO: Implement schema adaptation logic
+        self.tidy_adapter = tidy_adapter  # TODO: Implement tidying logic
+
+        self.ai_standardizer_map: Dict[str, Type[AsyncStandardizer]] = {
+            ".csv": AICSVStandardizer,
+            # Future standardizers:
+            # ".xlsx": AIXLSXStandardizer,
+            # ".pdf": AIPDFStandardizer,
+            # ".json": AIJSONStandardizer,
+            # ".xml": AIXMLStandardizer,
+        }
+        for ext, standardizer_class in self.ai_standardizer_map.items():
+            if not issubclass(standardizer_class, AsyncStandardizer):
+                raise TypeError(
+                    f"Standardizer for '{ext}' ({standardizer_class.__name__}) "
+                    "must inherit from AsyncStandardizer."
+                )
+
+    def _get_ai_standardizer_class(
+        self, extension: str
+    ) -> Optional[Type[AsyncStandardizer]]:
+        return self.ai_standardizer_map.get(extension.lower())
+
+    async def _resolve_input_files(
+        self, datasource: Datasource, temp_processing_dir: Path
+    ) -> List[Path]:
+        """
+        Resolves the input datasource to a list of individual file paths.
+        Handles single files, lists of files, and extracts ZIP archives.
+        """
+        input_file_paths: List[Path] = []
+        raw_paths_to_check: List[Union[str, Path]] = []
+
+        if isinstance(datasource, (str, Path)):
+            raw_paths_to_check = [datasource]
+        elif isinstance(datasource, list) and all(
+            isinstance(p, (str, Path)) for p in datasource
+        ):
+            raw_paths_to_check = datasource
+        else:
+            # This also catches the case where datasource is an empty list initially
+            raise ValueError(
+                "Datasource must be a non-empty file path (string or Path) or a non-empty list of such paths."
+            )
+
+        if not raw_paths_to_check:  # Should be caught by above, but defensive
+            raise ValueError("No input datasource paths provided.")
+
+        for raw_path_item in raw_paths_to_check:
+            raw_path = Path(raw_path_item).resolve()
+            if not raw_path.exists():
+                raise FileNotFoundError(f"Input path not found: {raw_path}")
+
+            if raw_path.is_file():
+                if raw_path.suffix.lower() == ".zip":
+                    zip_extract_target = (
+                        temp_processing_dir
+                        / f"extracted_{raw_path.stem}_{uuid.uuid4().hex[:8]}"
+                    )
+                    try:
+                        extracted_from_zip = await extract_zip_archive_async(
+                            raw_path, zip_extract_target
+                        )
+                        input_file_paths.extend(extracted_from_zip)
+                    except Exception as e_zip:
+                        logger.error(
+                            f"Failed to extract ZIP archive '{raw_path}': {e_zip}",
+                            exc_info=True,
+                        )
+                        # Decide if one failed zip should stop all, or just be skipped.
+                        # For now, skipping problematic zips.
+                        continue
+                else:
+                    input_file_paths.append(raw_path)
+            elif raw_path.is_dir():
+                logger.info(f"Processing directory datasource: {raw_path}")
+                for child_item in raw_path.iterdir():
+                    if child_item.is_file():
+                        input_file_paths.append(child_item)
+                # Deeper recursion to be implemeted.
+            else:
+                logger.warning(
+                    f"Input path '{raw_path}' is not a file or directory and will be ignored."
+                )
+
+        if not input_file_paths:
+            # This means all inputs were invalid, unresolvable, or zips failed etc.
+            logger.error("No processable files found after resolving datasource.")
+            raise ValueError("Datasource resolution resulted in no processable files.")
+        return input_file_paths
+
+    def _group_files_by_standardizer(
+        self, file_paths: List[Path]
+    ) -> Tuple[Dict[Type[AsyncStandardizer], List[Path]], List[Path]]:
+        """Groups files by the AI standardizer responsible for them based on extension."""
+        grouped: Dict[Type[AsyncStandardizer], List[Path]] = defaultdict(list)
+        unsupported_files: List[Path] = []
+        for file_path in file_paths:
+            standardizer_class = self._get_ai_standardizer_class(file_path.suffix)
+            if standardizer_class:
+                grouped[standardizer_class].append(file_path)
+            else:
+                unsupported_files.append(file_path)
+        if unsupported_files:
+            logger.warning(
+                f"Unsupported files found and will be ignored: "
+                f"{[str(f.name) for f in unsupported_files]}"
+            )
+        return grouped, unsupported_files
+
+    async def _process_file_groups(
+        self,
+        grouped_files: Dict[Type[AsyncStandardizer], List[Path]],
+        temp_sdif_dir: Path,
+        config: Optional[Dict[str, Any]],
+        **kwargs,
+    ) -> Tuple[List[Path], List[Dict[str, Any]]]:
+        """
+        Processes groups of files using their respective AI standardizers.
+        Child standardizers are expected to produce a single SDIF SQLite file.
+
+        Returns:
+            A tuple containing:
+            - List of Paths to successfully created intermediate SDIF SQLite files.
+            - List of aggregated file configurations from child standardizers.
+        """
+        processing_tasks = []
+        standardizer_instances_info = []
+
+        for standardizer_class, files_in_group in grouped_files.items():
+            if not files_in_group:
+                continue
+
+            standardizer_init_kwargs = {}
+            # TODO: Pass standardizer-specific config from main 'config' if available for this standardizer_class
+
+            try:
+                ai_child_standardizer = standardizer_class(
+                    mcp_server=self.mcp_server,
+                    mcp_session=self.mcp_session,
+                    llm_model=self.llm_model,
+                    **standardizer_init_kwargs,
+                )
+            except Exception as e:
+                logger.error(
+                    f"Failed to initialize standardizer {standardizer_class.__name__} for '{files_in_group[0].name}': {e}",
+                    exc_info=True,
+                )
+                raise RuntimeError(
+                    f"Initialization failed for {standardizer_class.__name__}: {e}"
+                )
+
+            # Generate a unique filename for the intermediate SDIF SQLite file
+            intermediate_sdif_filename = f"intermediate_{standardizer_class.__name__}_{uuid.uuid4().hex[:12]}.sdif"
+            intermediate_sdif_file_path = temp_sdif_dir / intermediate_sdif_filename
+
+            logger.info(
+                f"Queueing standardization for {len(files_in_group)} file(s) "
+                f"with {standardizer_class.__name__} (output file: {intermediate_sdif_file_path})"
+            )
+
+            task = ai_child_standardizer.standardize(
+                datasource=files_in_group,
+                output_path=intermediate_sdif_file_path,
+                overwrite=True,  # Temporary intermediate files are always new/overwritten
+                config=config,
+                **kwargs,
+            )
+            processing_tasks.append(task)
+            standardizer_instances_info.append(
+                {
+                    "class_name": standardizer_class.__name__,
+                    "output_file": intermediate_sdif_file_path,
+                }
+            )
+
+        gathered_outputs = await asyncio.gather(
+            *processing_tasks, return_exceptions=True
+        )
+
+        successful_intermediate_sdif_files: List[Path] = []
+        aggregated_file_configs: List[Dict[str, Any]] = []
+
+        for i, result_or_exc in enumerate(gathered_outputs):
+            info = standardizer_instances_info[i]
+            expected_output_file: Path = info["output_file"]
+
+            if isinstance(result_or_exc, StandardizationResult):
+                # Child standardizer's output_path should be a file path.
+                child_reported_output_file = Path(result_or_exc.output_path)
+
+                if not child_reported_output_file.is_file():
+                    logger.error(
+                        f"Standardizer {info['class_name']} reported success, but its output path "
+                        f"'{child_reported_output_file}' is not a file or does not exist. Skipping."
+                    )
+                    continue  # Skip this problematic result
+
+                if (
+                    child_reported_output_file.resolve()
+                    != expected_output_file.resolve()
+                ):
+                    logger.warning(
+                        f"Standardizer {info['class_name']} reported output file '{child_reported_output_file}' "
+                        f"which differs from expected '{expected_output_file}'. Using reported path."
+                    )
+
+                logger.info(
+                    f"Successfully standardized group with {info['class_name']}. "
+                    f"Intermediate SDIF file: {child_reported_output_file}"
+                )
+                successful_intermediate_sdif_files.append(child_reported_output_file)
+                if result_or_exc.file_configs:
+                    aggregated_file_configs.extend(result_or_exc.file_configs)
+
+            elif isinstance(result_or_exc, Exception):
+                logger.error(
+                    f"Standardization by {info['class_name']} for target '{expected_output_file}' failed: {result_or_exc}",
+                    exc_info=result_or_exc,
+                )
+                # Optionally, try to clean up the expected_output_file if it was created before erroring
+                if expected_output_file.exists():
+                    try:
+                        expected_output_file.unlink()
+                    except OSError:
+                        pass
+
+        return successful_intermediate_sdif_files, aggregated_file_configs
+
+    async def _consolidate_results(
+        self,
+        intermediate_sdif_files: List[Path],
+        aggregated_file_configs: Optional[List[Dict[str, Any]]],
+        final_sdif_file_target: Path,
+        overwrite: bool,
+    ) -> StandardizationResult:
+        """
+        Merges or moves intermediate SDIF SQLite files to the final target SDIF SQLite file.
+        Cleans up intermediate files.
+        """
+        if not intermediate_sdif_files:
+            raise RuntimeError(
+                "No intermediate SDIF files were successfully generated to consolidate."
+            )
+
+        final_sdif_file_target.parent.mkdir(parents=True, exist_ok=True)
+
+        if final_sdif_file_target.exists():
+            if not overwrite:
+                raise FileExistsError(
+                    f"Final output file {final_sdif_file_target} already exists and overwrite is False."
+                )
+            logger.info(
+                f"Overwriting existing final output file: {final_sdif_file_target}"
+            )
+            try:
+                final_sdif_file_target.unlink()
+            except OSError as e_unlink:
+                logger.error(
+                    f"Could not delete existing file {final_sdif_file_target}: {e_unlink}"
+                )
+                raise  # Re-raise as this is critical for overwrite
+
+        final_sdif_path_str: str
+        if len(intermediate_sdif_files) == 1:
+            source_sqlite_file = intermediate_sdif_files[0]
+            logger.info(
+                f"Moving single intermediate SDIF SQLite file '{source_sqlite_file}' to final output '{final_sdif_file_target}'."
+            )
+            try:
+                shutil.move(str(source_sqlite_file), str(final_sdif_file_target))
+                final_sdif_path_str = str(final_sdif_file_target)
+            except Exception as e_move:
+                logger.error(
+                    f"Failed to move {source_sqlite_file} to {final_sdif_file_target}: {e_move}"
+                )
+                # Attempt to copy as a fallback, then try to remove source
+                try:
+                    shutil.copy2(str(source_sqlite_file), str(final_sdif_file_target))
+                    final_sdif_path_str = str(final_sdif_file_target)
+                    source_sqlite_file.unlink(
+                        missing_ok=True
+                    )  # Try to clean up source after copy
+                except Exception as e_copy_fallback:
+                    logger.error(
+                        f"Fallback copy also failed for {source_sqlite_file}: {e_copy_fallback}"
+                    )
+                    raise RuntimeError(
+                        f"Could not place intermediate file into final location: {e_copy_fallback}"
+                    ) from e_copy_fallback
+        else:
+            logger.info(
+                f"Merging {len(intermediate_sdif_files)} intermediate SDIF SQLite files into '{final_sdif_file_target}'."
+            )
+            # merge_sdif_files must accept a list of source SQLite file paths and a target SQLite file path.
+            merged_target_path = await merge_sdif_files(
+                intermediate_sdif_files,
+                final_sdif_file_target,
+                overwrite=False,  # We handled overwrite for final_sdif_file_target
+            )
+            final_sdif_path_str = str(merged_target_path)
+
+        # Clean up original intermediate files (they have been moved or their content merged)
+        for temp_file in intermediate_sdif_files:
+            if (
+                temp_file.exists()
+                and temp_file.resolve() != Path(final_sdif_path_str).resolve()
+            ):  # Don't delete the final file if it was one of the intermediates (single file case)
+                try:
+                    temp_file.unlink()
+                    logger.debug(f"Cleaned up intermediate file: {temp_file}")
+                except Exception as e_clean_file:
+                    logger.warning(
+                        f"Error cleaning up intermediate file {temp_file}: {e_clean_file}"
+                    )
+
+        logger.info(
+            f"Consolidation complete. Final SDIF SQLite file: {final_sdif_path_str}"
+        )
+        return StandardizationResult(
+            output_path=Path(final_sdif_path_str),
+            file_configs=aggregated_file_configs if aggregated_file_configs else None,
+        )
+
+    async def standardize(
+        self,
+        datasource: Datasource,
+        output_path: SDIFPath,  # Expected to be the path to the target *SDIF file*
+        *,
+        overwrite: bool = False,
+        config: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> StandardizationResult:
+        """
+        Standardizes datasource to a single SDIF SQLite file.
+
+        Args:
+            datasource: Source data (file path, list of paths, or directory path).
+            output_path: Path to the target output SDIF SQLite file (e.g., "./output/data.sdif").
+            overwrite: If True, overwrite existing output file. Defaults to False.
+            config: Optional configuration dictionary for standardizers.
+            **kwargs: Additional arguments passed to child standardizers.
+
+        Returns:
+            StandardizationResult with the path to the created SDIF SQLite file.
+        """
+        logger.info(
+            f"AIStandardizer starting process for output SDIF file: {output_path}"
+        )
+        final_sdif_file_target = Path(output_path).resolve()
+
+        if final_sdif_file_target.is_dir():
+            raise ValueError(
+                f"Target output_path '{final_sdif_file_target}' is a directory. "
+                "It must be a full file path for the target SDIF SQLite database (e.g., data.sqlite or data.sdif)."
+            )
+        if not final_sdif_file_target.suffix:
+            logger.warning(
+                f"Target output_path '{final_sdif_file_target}' has no file extension. "
+                "It should be a path to an SDIF SQLite database file (e.g., data.sqlite or data.sdif)."
+            )
+        elif final_sdif_file_target.suffix.lower() not in (".sdif", ".sqlite", ".db"):
+            logger.warning(
+                f"Target output_path '{final_sdif_file_target}' does not have a common SQLite extension. "
+                "Ensure this is the intended SQLite file path."
+            )
+
+        # Create a unique temporary directory for this standardization run
+        # This directory will hold intermediate files and ZIP extractions.
+        run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aistd_run_"))
+        intermediate_sdif_files_dir = run_temp_dir / "intermediate_sdif_files"
+        intermediate_sdif_files_dir.mkdir(parents=True, exist_ok=True)
+        file_processing_temp_dir = run_temp_dir / "file_processing_temp"
+        file_processing_temp_dir.mkdir(parents=True, exist_ok=True)
+
+        try:
+            # 1. Resolve input datasource to a list of processable file paths
+            resolved_files = await self._resolve_input_files(
+                datasource, file_processing_temp_dir
+            )
+            logger.info(f"Resolved {len(resolved_files)} file(s) for standardization.")
+
+            # 2. Group files by the AI standardizer responsible for them
+            grouped_by_std, unsupported = self._group_files_by_standardizer(
+                resolved_files
+            )
+            if not grouped_by_std:
+                user_message = (
+                    "No files found that can be handled by configured AI standardizers."
+                )
+                if unsupported:
+                    user_message += (
+                        f" Unsupported files: {[str(f.name) for f in unsupported]}"
+                    )
+                raise ValueError(user_message)
+
+            logger.debug(
+                f"File groups for standardization: { {cls.__name__: [f.name for f in paths] for cls, paths in grouped_by_std.items()} }"
+            )
+
+            # 3. Process each group of files, generating intermediate SDIF SQLite files
+            (
+                intermediate_sdif_files,
+                aggregated_file_configs,
+            ) = await self._process_file_groups(
+                grouped_by_std, intermediate_sdif_files_dir, config, **kwargs
+            )
+
+            if not intermediate_sdif_files:
+                raise RuntimeError(
+                    "No intermediate SDIF SQLite files were successfully generated."
+                )
+            logger.info(
+                f"Successfully generated {len(intermediate_sdif_files)} intermediate SDIF SQLite file(s)."
+            )
+
+            # 4. Consolidate intermediate SDIF files into the final target file
+            final_result = await self._consolidate_results(
+                intermediate_sdif_files,
+                aggregated_file_configs,
+                final_sdif_file_target,
+                overwrite,
+            )
+
+            logger.info(
+                f"AIStandardizer process completed. Final SDIF file at: {final_result.output_path}"
+            )
+            return final_result
+
+        except Exception as e:
+            logger.error(f"AIStandardizer failed: {e}", exc_info=True)
+            if isinstance(e, (ValueError, FileNotFoundError, FileExistsError)):
+                raise
+            raise RuntimeError(f"AIStandardizer processing error: {e}") from e
+        finally:
+            # Clean up the entire temporary directory for this run
+            if run_temp_dir.exists():
+                try:
+                    shutil.rmtree(run_temp_dir)
+                    logger.info(f"Cleaned up temporary run directory: {run_temp_dir}")
+                except Exception as e_clean:
+                    logger.error(
+                        f"Error cleaning up temporary run directory {run_temp_dir}: {e_clean}",
+                        exc_info=True,
+                    )
{satif_ai-0.2.8 → satif_ai-0.2.9}/satif_ai/standardizers/ai_csv.py
@@ -37,7 +37,7 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g
 - Encoding: {initial_encoding}
 - Delimiter: '{initial_delimiter}'
 
-**Your
+**Your Task:**
 
 1. **Core Parsing Parameters:**
     * Determine the correct file `encoding` (string, e.g., "utf-8", "latin-1").
satif_ai-0.2.9/satif_ai/transform.py
@@ -0,0 +1,121 @@
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from fastmcp import FastMCP
+from fastmcp.client.transports import FastMCPTransport
+from satif_core.code_executors.base import CodeExecutor
+from satif_core.transformation_builders.base import AsyncTransformationBuilder
+from satif_core.types import (
+    FilePath,
+    SDIFPath,
+    TransformationResult,
+)
+from satif_sdk.code_executors.local_executor import LocalCodeExecutor
+from satif_sdk.transformers.code import CodeTransformer
+from sdif_mcp.server import mcp
+
+from satif_ai.transformation_builders.syncpulse import SyncpulseTransformationBuilder
+from satif_ai.utils.openai_mcp import OpenAICompatibleMCP
+
+
+async def atransform(
+    sdif: SDIFPath,
+    output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+    instructions: Optional[str] = None,
+    output_path: FilePath = Path("."),
+    *,
+    transformation_code: Optional[str] = None,
+    transformation_builder: Optional[AsyncTransformationBuilder] = None,
+    code_executor: Optional[CodeExecutor] = None,
+    mcp_server: Optional[FastMCP] = None,
+    mcp_transport: Optional[FastMCPTransport] = None,
+    llm_model: str = "o4-mini",
+    schema_only: bool = False,
+    representer_kwargs: Optional[Dict[str, Any]] = None,
+) -> TransformationResult:
+    """
+    Asynchronously transforms an SDIF (Standard Data Interchange Format) input using
+    an AI-generated or provided transformation code.
+
+    This function orchestrates the process of:
+    1. Optionally generating transformation code using an AI model via a `CodeBuilder`
+       if `transformation_code` is not provided.
+       explicitly passed.
+    2. Executing the transformation code using a `CodeTransformer` and a `CodeExecutor`.
+    3. Exporting the results to the specified output.
+
+    Args:
+        sdif: Path to the input SDIF file or an `SDIFDatabase` object.
+        output_target_files: A dictionary mapping original example file paths (or string identifiers)
+            to their desired agent-facing filenames, or a list of output example
+            file paths, or a single output file path. These are used by the AI to understand the target
+            format and structure, and also by the `CodeTransformer` to determine
+            output filenames if the transformation result keys match.
+        instructions: Optional. Natural language instructions for the AI to generate
+            the transformation code. Used if `transformation_code` is None.
+        transformation_code: Optional. Pre-existing Python code for the transformation.
+            If None, code will be generated by the `transformation_builder`.
+        transformation_builder: Optional. An `AsyncTransformationBuilder` instance responsible for generating
+            the transformation code if `transformation_code` is not provided.
+            If None, a `TransformationAsyncCodeBuilder` is instantiated.
+        code_executor: Optional. A `CodeExecutor` instance for running the transformation
+            code. If None, a `LocalCodeExecutor` is used.
+        mcp_server: Optional. A `FastMCP` server instance for the AI code builder.
+            Defaults to the global `mcp` instance if `transformation_builder` is None.
+        mcp_transport: Optional. A `FastMCPTransport` instance for communication with
+            the `mcp_server`. Defaults to a new transport using `mcp_server`
+            if `transformation_builder` is None.
+        llm_model: The language model to use for code generation (e.g., "o4-mini").
+            Used if `transformation_builder` is None.
+        schema_only: If True, the transformation aims to match only the schema (headers)
+            of the `output_target_files`, and input samples may be omitted or marked
+            as empty for the AI. This is useful for structural transformations
+            without processing actual data rows.
+        representer_kwargs: Optional dictionary of keyword arguments to pass to the
+            representer when analyzing `output_target_files`.
+
+    Returns:
+        A `TransformationResult` object containing the path to the output
+        and the transformation code used.
+    """
+    if transformation_builder is None:
+        if mcp_server is None:
+            mcp_server = mcp
+
+        if mcp_transport is None:
+            mcp_transport = FastMCPTransport(mcp=mcp_server)
+
+        openai_compatible_mcp = OpenAICompatibleMCP(mcp=mcp_server)
+        await openai_compatible_mcp.connect()
+
+        transformation_builder = SyncpulseTransformationBuilder(
+            mcp_server=openai_compatible_mcp,
+            mcp_session=mcp_transport,
+            llm_model=llm_model,
+        )
+
+    if transformation_code is None:
+        function_code = await transformation_builder.build(
+            sdif=sdif,
+            output_target_files=output_target_files,
+            instructions=instructions,
+            schema_only=schema_only,
+            representer_kwargs=representer_kwargs,
+        )
+    else:
+        function_code = transformation_code
+
+    if code_executor is None:
+        code_executor = LocalCodeExecutor()
+
+    transformer = CodeTransformer(
+        function=function_code,
+        code_executor=code_executor,
+    )
+
+    output_path = transformer.export(
+        sdif=sdif,
+        output_path=output_path,
+    )
+
+    return TransformationResult(output_path=output_path, function_code=function_code)
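A companion sketch for the new `atransform` helper added above, again derived only from its signature and return type; the SDIF path, example output file, and instructions are hypothetical, and code generation requires the MCP tooling to be available at runtime.

import asyncio

from satif_ai import atransform

async def main() -> None:
    # Generate (or reuse) transformation code, run it, and export the result files.
    result = await atransform(
        sdif="./output/orders.sdif",                     # hypothetical SDIF input
        output_target_files=["./examples/report.xlsx"],  # example of the desired output format
        instructions="One row per customer with total order value.",
        output_path="./output",
    )
    print(result.output_path)
    print(result.function_code[:200])  # the transformation code that was used

asyncio.run(main())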
satif_ai-0.2.8/satif_ai/code_builders/transformation.py → satif_ai-0.2.9/satif_ai/transformation_builders/syncpulse.py
@@ -8,7 +8,9 @@ from typing import Any, Dict, List, Optional, Union
 from agents import Agent, Runner, function_tool
 from agents.mcp.server import MCPServer
 from mcp import ClientSession
-from satif_core import
+from satif_core import AsyncTransformationBuilder
+from satif_core.types import FilePath
+from satif_sdk.code_executors.local_executor import LocalCodeExecutor
 from satif_sdk.comparators import get_comparator
 from satif_sdk.representers import get_representer
 from satif_sdk.transformers import CodeTransformer
@@ -61,7 +63,10 @@ async def execute_transformation(code: str) -> str:
     if INPUT_SDIF_PATH is None or OUTPUT_TARGET_FILES is None:
         return "Error: Transformation context not initialized"
 
-    code_transformer = CodeTransformer(
+    code_transformer = CodeTransformer(
+        function=code,
+        code_executor=LocalCodeExecutor(disable_security_warning=True),
+    )
     generated_output_path = code_transformer.export(INPUT_SDIF_PATH)
 
     comparisons = []
@@ -120,19 +125,7 @@ async def execute_transformation(code: str) -> str:
     return "\n".join(comparisons)
 
 
-class
-    def __init__(self, output_example: Path | List[Path] | Dict[str, Path]):
-        self.output_example = output_example
-
-    def build(
-        self,
-        sdif: Path | SDIFDatabase,
-        instructions: Optional[str] = None,
-    ) -> str:
-        pass
-
-
-class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
+class SyncpulseTransformationBuilder(AsyncTransformationBuilder):
     """This class is used to build a transformation code that will be used to transform a SDIF database into a set of files following the format of the given output files."""
 
     def __init__(
@@ -147,23 +140,18 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
 
     async def build(
         self,
-        sdif: Path,
-        output_target_files: Dict[
-        output_sdif: Optional[Path] = None,
+        sdif: Path,
+        output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+        output_sdif: Optional[Path] = None,
         instructions: str = "",
         schema_only: bool = False,
-
+        representer_kwargs: Optional[Dict[str, Any]] = None,
     ) -> str:
         global INPUT_SDIF_PATH, OUTPUT_TARGET_FILES, SCHEMA_ONLY
-
-        # If execute_transformation runs in the same process as the builder, absolute path is fine.
-        # If it were a separate context, this might need adjustment.
-        # For now, assume execute_transformation can access absolute paths if needed for its *input SDIF*.
-        # However, the sdif for MCP URIs must be relative.
+
         INPUT_SDIF_PATH = Path(sdif).resolve()
         SCHEMA_ONLY = schema_only
-        #
-        # So, use them directly as strings.
+        # We must encode the path because special characters are not allowed in mcp read_resource()
         input_sdif_mcp_uri_path = base64.b64encode(str(sdif).encode()).decode()
         output_sdif_mcp_uri_path = (
             base64.b64encode(str(output_sdif).encode()).decode()
@@ -205,9 +193,14 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
 
         # OUTPUT_TARGET_FILES keys are absolute paths to original example files for local reading by representers/comparators.
         # Values are agent-facing filenames.
-        if isinstance(output_target_files,
+        if isinstance(output_target_files, FilePath):
+            OUTPUT_TARGET_FILES = {
+                Path(output_target_files).resolve(): Path(output_target_files).name
+            }
+        elif isinstance(output_target_files, list):
             OUTPUT_TARGET_FILES = {
-                file_path.resolve(): file_path.name
+                Path(file_path).resolve(): Path(file_path).name
+                for file_path in output_target_files
             }
         elif isinstance(output_target_files, dict):
             temp_map = {}
@@ -229,7 +222,7 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
             # Representer uses the absolute path (file_key_abs_path) to read the example file.
             representer = get_representer(file_key_abs_path)
             representation, used_params = representer.represent(
-                file_key_abs_path, **(
+                file_key_abs_path, **(representer_kwargs or {})
             )
             output_representation[agent_facing_name] = {
                 "representation": representation,
satif_ai-0.2.9/satif_ai/utils/merge_sdif.py
@@ -0,0 +1,22 @@
+from pathlib import Path
+from typing import List
+
+
+async def merge_sdif_files(sdif_paths: List[Path], output_dir: Path) -> Path:
+    """Placeholder function to merge multiple SDIF files into one.
+
+    Args:
+        sdif_paths: A list of paths to the SDIF files to merge.
+        output_dir: The directory where the merged file should be saved.
+
+    Returns:
+        Path to the merged SDIF file.
+    """
+    if not sdif_paths:
+        raise ValueError("No SDIF files provided for merging.")
+
+    if len(sdif_paths) == 1:
+        return sdif_paths[0]  # No merge needed
+
+    # TODO: Implement SDIF merge
+    raise NotImplementedError("Merge not implemented yet.")
satif_ai-0.2.9/satif_ai/utils/openai_mcp.py
@@ -0,0 +1,97 @@
+import logging
+from typing import Any
+
+from agents.mcp.server import CallToolResult, MCPServer, MCPTool
+from fastmcp import FastMCP
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAICompatibleMCP(MCPServer):
+    def __init__(self, mcp: FastMCP):
+        self.mcp = mcp
+        self._is_connected = False  # Track connection state
+
+    async def connect(self):
+        """Connect to the server.
+        For FastMCP, connection is managed externally when the server is run.
+        This method marks the wrapper as connected.
+        """
+        # Assuming FastMCP instance is already running and configured.
+        # No specific connect action required for the FastMCP instance itself here,
+        # as its lifecycle (run, stop) is managed outside this wrapper.
+        logger.info(
+            f"OpenAICompatibleMCP: Simulating connection to FastMCP server '{self.mcp.name}'."
+        )
+        self._is_connected = True
+
+    @property
+    def name(self) -> str:
+        """A readable name for the server."""
+        return self.mcp.name
+
+    async def cleanup(self):
+        """Cleanup the server.
+        For FastMCP, cleanup is managed externally. This method marks the wrapper as disconnected.
+        """
+        # Similar to connect, actual server cleanup is external.
+        logger.info(
+            f"OpenAICompatibleMCP: Simulating cleanup for FastMCP server '{self.mcp.name}'."
+        )
+        self._is_connected = False
+
+    async def list_tools(self) -> list[MCPTool]:
+        """List the tools available on the server."""
+        if not self._is_connected:
+            # Or raise an error, depending on desired behavior for disconnected state
+            raise RuntimeError(
+                "OpenAICompatibleMCP.list_tools called while not connected."
+            )
+
+        # FastMCP's get_tools() returns a dict[str, fastmcp.tools.tool.Tool]
+        # Each fastmcp.tools.tool.Tool has a to_mcp_tool(name=key) method
+        # MCPTool is an alias for mcp.types.Tool
+        try:
+            fastmcp_tools = await self.mcp.get_tools()
+            mcp_tools_list = [
+                tool.to_mcp_tool(name=key) for key, tool in fastmcp_tools.items()
+            ]
+            return mcp_tools_list
+        except Exception as e:
+            logger.error(
+                f"Error listing tools from FastMCP server '{self.mcp.name}': {e}",
+                exc_info=True,
+            )
+            raise e
+
+    async def call_tool(
+        self, tool_name: str, arguments: dict[str, Any] | None
+    ) -> CallToolResult:
+        """Invoke a tool on the server."""
+        if not self._is_connected:
+            logger.warning(
+                f"OpenAICompatibleMCP.call_tool '{tool_name}' called while not connected."
+            )
+            # Return an error CallToolResult
+            return CallToolResult(
+                content=[{"type": "text", "text": "Server not connected"}], isError=True
+            )
+
+        try:
+            # FastMCP's _mcp_call_tool is a protected member, but seems to be what we need.
+            # It returns: list[TextContent | ImageContent | EmbeddedResource]
+            # This matches the 'content' part of CallToolResult.
+            # We need to handle potential errors and wrap the result.
+            content = await self.mcp._mcp_call_tool(tool_name, arguments or {})
+            return CallToolResult(content=content, isError=False)
+        except Exception as e:
+            logger.error(
+                f"Error calling tool '{tool_name}' on FastMCP server '{self.mcp.name}': {e}",
+                exc_info=True,
+            )
+            error_message = f"Error calling tool '{tool_name}': {type(e).__name__}: {e}"
+            # Ensure content is a list of valid MCP content items, even for errors.
+            # A TextContent is a safe choice.
+            return CallToolResult(
+                content=[{"type": "text", "text": error_message}], isError=True
+            )
satif_ai-0.2.9/satif_ai/utils/zip.py
@@ -0,0 +1,120 @@
+import asyncio
+import logging
+import zipfile
+from pathlib import Path
+from typing import List, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Constants for ZIP file processing, kept local to this utility or passed as args if needed
+_IGNORED_ZIP_MEMBER_PREFIXES = ("__MACOSX/",)
+_IGNORED_ZIP_FILENAME_PREFIXES = ("._",)
+_IGNORED_ZIP_FILENAMES = (".DS_Store",)
+
+
+async def extract_zip_archive_async(
+    zip_path: Path,
+    extract_to: Path,
+    ignored_member_prefixes: Tuple[str, ...] = _IGNORED_ZIP_MEMBER_PREFIXES,
+    ignored_filename_prefixes: Tuple[str, ...] = _IGNORED_ZIP_FILENAME_PREFIXES,
+    ignored_filenames: Tuple[str, ...] = _IGNORED_ZIP_FILENAMES,
+) -> List[Path]:
+    """
+    Asynchronously extracts a ZIP archive to a specified directory, filtering out ignored files.
+
+    Args:
+        zip_path: Path to the ZIP archive.
+        extract_to: Directory where the contents will be extracted.
+        ignored_member_prefixes: Tuple of member path prefixes to ignore.
+        ignored_filename_prefixes: Tuple of filename prefixes to ignore.
+        ignored_filenames: Tuple of exact filenames to ignore.
+
+    Returns:
+        A list of paths to the successfully extracted files.
+
+    Raises:
+        ValueError: If the zip_path is invalid or corrupted.
+        RuntimeError: If any other error occurs during extraction.
+    """
+
+    def blocking_extract() -> List[Path]:
+        extracted_file_paths = []
+        logger.info(f"Extracting ZIP archive '{zip_path.name}' to '{extract_to}'...")
+        try:
+            extract_to.mkdir(
+                parents=True, exist_ok=True
+            )  # Ensure extract_to directory exists
+
+            with zipfile.ZipFile(zip_path, "r") as zip_ref:
+                # Security: Preliminary check for unsafe paths before extraction
+                for member_name in zip_ref.namelist():
+                    if member_name.startswith(("/", "..")):
+                        logger.error(
+                            f"Skipping potentially unsafe path in ZIP: {member_name}"
+                        )
+                        # Depending on security policy, might raise an error here
+                        continue
+
+                # Extract all members
+                zip_ref.extractall(extract_to)
+
+            # After extractall, collect all *file* paths, applying filters
+            # This second pass of filtering ensures that even if extractall creates them,
+            # we don't return paths to ignored files.
+            for root, _, files in extract_to.walk():
+                for filename in files:
+                    full_path = root / filename
+                    # Create a path relative to 'extract_to' to check against member prefixes
+                    # This ensures that '__MACOSX/file.txt' is correctly ignored,
+                    # not just a top-level '__MACOSX' directory.
+                    try:
+                        relative_path_to_check = full_path.relative_to(extract_to)
+                    except ValueError:
+                        # This can happen if full_path is not under extract_to,
+                        # which ideally shouldn't occur if zip_ref.extractall worked as expected
+                        # and target_path checks were effective.
+                        logger.warning(
+                            f"File {full_path} seems to be outside extraction root {extract_to}. Skipping."
+                        )
+                        continue
+
+                    path_str_to_check_prefixes = str(relative_path_to_check)
+
+                    if not (
+                        any(
+                            path_str_to_check_prefixes.startswith(p)
+                            for p in ignored_member_prefixes
+                        )
+                        or any(
+                            full_path.name.startswith(p)
+                            for p in ignored_filename_prefixes
+                        )
+                        or full_path.name in ignored_filenames
+                    ):
+                        extracted_file_paths.append(full_path)
+                    else:
+                        logger.debug(f"Ignoring file post-extraction: {full_path}")
+
+            if not extracted_file_paths:
+                logger.warning(
+                    f"ZIP archive '{zip_path.name}' is empty or contains no processable files after filtering."
+                )
+            else:
+                logger.info(
+                    f"Successfully extracted {len(extracted_file_paths)} file(s) from '{zip_path.name}'."
+                )
+            return extracted_file_paths
+        except zipfile.BadZipFile as e:
+            logger.error(
+                f"Invalid or corrupted ZIP file: {zip_path.name}", exc_info=True
+            )
+            raise ValueError(f"Invalid or corrupted ZIP file: {zip_path.name}") from e
+        except Exception as e:
+            logger.error(
+                f"Failed to extract ZIP archive '{zip_path.name}': {e}", exc_info=True
+            )
+            raise RuntimeError(
+                f"Unexpected error during ZIP extraction for '{zip_path.name}'"
+            ) from e
+
+    return await asyncio.to_thread(blocking_extract)
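And a small sketch of the `extract_zip_archive_async` utility defined above, with a hypothetical archive path; the default ignore lists (`__MACOSX/`, `._*`, `.DS_Store`) apply unless overridden.

import asyncio
from pathlib import Path

from satif_ai.utils.zip import extract_zip_archive_async

async def main() -> None:
    files = await extract_zip_archive_async(
        Path("./input/archive.zip"),   # hypothetical ZIP archive
        Path("./tmp/extracted"),
    )
    print(f"Extracted {len(files)} file(s)")

asyncio.run(main())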