satif-ai 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- satif_ai/__init__.py +19 -0
- satif_ai/adapters/tidy.py +19 -38
- satif_ai/standardize.py +112 -0
- satif_ai/standardizers/ai.py +485 -0
- satif_ai/standardizers/ai_csv.py +47 -129
- satif_ai/transform.py +121 -0
- satif_ai/{code_builders/transformation.py → transformation_builders/syncpulse.py} +28 -36
- satif_ai/utils/__init__.py +5 -0
- satif_ai/utils/merge_sdif.py +22 -0
- satif_ai/utils/openai_mcp.py +97 -0
- satif_ai/utils/zip.py +120 -0
- {satif_ai-0.2.7.dist-info → satif_ai-0.2.9.dist-info}/METADATA +4 -3
- satif_ai-0.2.9.dist-info/RECORD +19 -0
- satif_ai/code_builders/adaptation.py +0 -9
- satif_ai/plot_builders/__init__.py +0 -0
- satif_ai/plot_builders/agent.py +0 -204
- satif_ai/plot_builders/prompt.py +0 -92
- satif_ai/plot_builders/tool.py +0 -146
- satif_ai-0.2.7.dist-info/RECORD +0 -17
- /satif_ai/{code_builders → transformation_builders}/__init__.py +0 -0
- {satif_ai-0.2.7.dist-info → satif_ai-0.2.9.dist-info}/LICENSE +0 -0
- {satif_ai-0.2.7.dist-info → satif_ai-0.2.9.dist-info}/WHEEL +0 -0
- {satif_ai-0.2.7.dist-info → satif_ai-0.2.9.dist-info}/entry_points.txt +0 -0
satif_ai/standardizers/ai_csv.py
CHANGED
@@ -12,12 +12,13 @@ from agents import Agent, Runner, function_tool
 from agents.mcp.server import MCPServerStdio
 from charset_normalizer import detect
 from mcp import ClientSession
-from satif_core.types import Datasource, SDIFPath
+from satif_core.types import Datasource, SDIFPath, StandardizationResult
 from satif_sdk.standardizers.csv import (
+    CSVStandardizer,
+)
+from satif_sdk.utils import (
     DELIMITER_SAMPLE_SIZE,
     ENCODING_SAMPLE_SIZE,
-    CSVStandardizer,
-    SkipColumnsConfig,
 )

 logger = logging.getLogger(__name__)
@@ -36,7 +37,7 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g
 - Encoding: {initial_encoding}
 - Delimiter: '{initial_delimiter}'

-**Your
+**Your Task:**

 1. **Core Parsing Parameters:**
     * Determine the correct file `encoding` (string, e.g., "utf-8", "latin-1").
@@ -50,7 +51,7 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g

 3. **Column Analysis and Definition:**
     * For **each column** you identify that should be included in the final table:
-        * `
+        * `original_identifier` (string): This is how the column is found in the *raw CSV data*.
             * If `has_header` is true, this is the **exact original header name** from the CSV.
             * If `has_header` is false, this is a **string representation of the 0-based column index** (e.g., "0", "1", "2").
         * `final_column_name` (string): This is the desired name for the column in the SDIF database table. It **MUST** be:
@@ -76,7 +77,7 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g
   "skip_rows": 0, // Integer for initial N, or sorted list of 0-based indices e.g. [0, 1, 5]
   "columns": [
     {{
-      "
+      "original_identifier": "original_header_or_index_string",
       "final_column_name": "sanitized_snake_case_name",
       "description": null // Or string value. Null or omit if not generated.
     }}
@@ -88,19 +89,18 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g
 **Tools Available:**
 - `read_csv_sample(encoding: str, delimiter: str, skip_initial_rows: int = 0, row_limit: int = 20, include_row_indices: bool = False)`: Reads a sample from the *beginning* of the file. Crucial for header and initial structure.
 - `read_raw_lines(encoding: str, line_limit: int = 50, start_line: int = 0)`: Reads raw lines. Useful for finding specific rows to skip (empty, repeated headers, footers) by their 0-based index.
-- `get_file_chunk(encoding: str, start_byte: int = 0, end_byte: int = 4096)`: Reads a raw chunk. Good for diagnosing encoding/delimiter issues if `read_csv_sample` returns garbled data or errors.

 **General Workflow Guidance:**
-1. **Initial Probe & Core Params:** Use `read_csv_sample` with initial hints (and `include_row_indices=True`) to examine the first few rows. Verify/correct `encoding` and `delimiter`. If `read_csv_sample` reports errors or shows garbled data
+1. **Initial Probe & Core Params:** Use `read_csv_sample` with initial hints (and `include_row_indices=True`) to examine the first few rows. Verify/correct `encoding` and `delimiter`. If `read_csv_sample` reports errors or shows garbled data. Determine `has_header` by looking at the first non-skipped row.
 2. **Identify Skip Rows:**
     * If there's metadata/comments at the top, determine how many initial rows to skip and use that for `skip_rows` (integer value).
     * Use `read_raw_lines` to scan for other rows to skip (e.g., empty lines, comment lines, repeated headers mid-file, summary footers). Collect all 0-based indices of such rows. If you have specific indices, `skip_rows` should be a sorted list of these indices. If you only skip initial N rows, it's an integer.
 3. **Column Identification & Definition:**
     * After settling `skip_rows` and `has_header`, call `read_csv_sample` again with `skip_initial_rows` set appropriately (if `skip_rows` is an int) to see the clean data rows and the header (if present).
-    * If `has_header` is true, the first row from this clean sample gives you the `
-    * If `has_header` is false, the `
+    * If `has_header` is true, the first row from this clean sample gives you the `original_identifier` values (original header names).
+    * If `has_header` is false, the `original_identifier` for each column will be its 0-based index as a string (e.g., "0", "1", "2", ... for as many columns as you see in the first data row).
     * For each column you decide to include:
-        * Determine its `
+        * Determine its `original_identifier`.
         * Create a clean, descriptive `final_column_name` (snake_case).
         * If (and ONLY IF) necessary, write a `description` for that column.
 4. **Table Naming & Description:** Based on the clean data and column names, formulate a `table_name` and, if valuable, a `table_description`.
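Taken together, the prompt above asks the agent to return one JSON parameter object per file. Purely for illustration (this object is not part of the package; all values are invented, only the field names come from the hunks above), such an object could look like this, expressed as a Python dict:

ai_params_example = {
    "table_name": "monthly_sales",
    "table_description": None,       # or a short string, only if genuinely valuable
    "encoding": "utf-8",
    "delimiter": ";",
    "has_header": True,
    "skip_rows": [0, 1],              # or an integer meaning "skip the first N rows"
    "columns": [
        {
            "original_identifier": "Sale Amount (EUR)",  # header name or index-as-string
            "final_column_name": "sale_amount_eur",      # sanitized snake_case
            "description": None,
        },
    ],
}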
@@ -273,60 +273,6 @@ async def read_raw_lines(
     )


-@function_tool
-async def get_file_chunk(
-    encoding: str, start_byte: int | None, end_byte: int | None
-) -> str:
-    if start_byte is None:
-        start_byte = 0
-    if end_byte is None:
-        end_byte = 4096
-    context = _CURRENT_AI_CSV_TOOL_CONTEXT.get()
-    if not context or not context.file_path or not context.file_path.exists():
-        return json.dumps({"error": "File path not found in tool context."})
-    if start_byte < 0 or end_byte < start_byte:
-        return json.dumps({"error": "Invalid byte range specified."})
-
-    chunk_text = ""
-    error_message = None
-    bytes_read = 0
-    try:
-        with open(context.file_path, "rb") as fb:
-            file_size = context.file_path.stat().st_size
-            effective_start_byte = min(start_byte, file_size)
-            fb.seek(effective_start_byte)
-            bytes_to_read = max(0, min(end_byte, file_size) - effective_start_byte)
-            if bytes_to_read > 0:
-                chunk_bytes = fb.read(bytes_to_read)
-                bytes_read = len(chunk_bytes)
-                chunk_text = chunk_bytes.decode(encoding, errors="replace")
-            else:
-                chunk_text = ""
-        return json.dumps(
-            {
-                "chunk": chunk_text,
-                "bytes_read": bytes_read,
-                "requested_range": [start_byte, end_byte],
-                "error": None,
-            }
-        )
-    except (UnicodeDecodeError, ValueError) as e:
-        error_message = f"Failed to decode file chunk: {e}. Used encoding '{encoding}'."
-    except OSError as e:
-        error_message = f"File read error: {e}."
-    except Exception as e:
-        logger.error(f"Unexpected error in get_file_chunk tool: {e}", exc_info=True)
-        error_message = f"Unexpected error reading file chunk: {str(e)}"
-    return json.dumps(
-        {
-            "error": error_message,
-            "chunk": chunk_text,
-            "bytes_read": bytes_read,
-            "requested_range": [start_byte, end_byte],
-        }
-    )
-
-
 # --- AICSVStandardizer Class ---
 class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStandardizer
     def __init__(
@@ -337,19 +283,18 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
         # --- Initial Hints (Optional) ---
         initial_delimiter: Optional[str] = None,
         initial_encoding: Optional[str] = None,
-        # --- Base Class Args Passthrough (some will be overridden by AI) ---
-        default_skip_columns: SkipColumnsConfig = None,  # Keep for base if AI doesn't define cols
     ):
+        # AI will determine the file_configs
         super().__init__(
-            delimiter=None,
-            encoding=None,
-            has_header=True,
-            skip_rows=0,
-            skip_columns=
-            descriptions=None,
-            table_names=None,
-            file_configs=None,
-            column_definitions=None,
+            delimiter=None,
+            encoding=None,
+            has_header=True,
+            skip_rows=0,
+            skip_columns=None,
+            descriptions=None,
+            table_names=None,
+            file_configs=None,
+            column_definitions=None,
         )

         self.mcp_servers = [mcp_server] if mcp_server else []
@@ -357,7 +302,6 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
         self.llm_model = llm_model
         self._initial_delimiter_hint = initial_delimiter
         self._initial_encoding_hint = initial_encoding
-        # self.generate_description from prompt structure (table_description, column descriptions)

     async def _get_initial_guesses(self, file_path: Path) -> Tuple[str, str]:
         """Helper to get initial encoding and delimiter guesses for a single file."""
@@ -419,7 +363,7 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
         agent = Agent(
             name="CSV Detail Analyzer Agent",
             mcp_servers=self.mcp_servers,
-            tools=[read_csv_sample, read_raw_lines
+            tools=[read_csv_sample, read_raw_lines],
             model=self.llm_model,
         )
         logger.info(f"Running CSV Detail Analyzer Agent for {file_path.name}...")
@@ -469,7 +413,7 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
                 raise ValueError(
                     f"Each item in 'columns' list must be a dictionary. Found: {type(col_spec)}"
                 )
-            req_col_keys = {"
+            req_col_keys = {"original_identifier", "final_column_name"}
             if not req_col_keys.issubset(col_spec.keys()):
                 missing_col_keys = req_col_keys - col_spec.keys()
                 raise ValueError(
@@ -520,7 +464,7 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
         overwrite: bool = False,
         config: Optional[Dict[str, Any]] = None,
         **kwargs,
-    ) ->
+    ) -> StandardizationResult:
         output_path_obj = Path(output_path)

         input_paths: List[Path]
@@ -545,8 +489,6 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
                     f"Input CSV file not found or is not a file: {input_file_path}"
                 )

-            # Create a task for each file's analysis
-            # Need to wrap _get_initial_guesses and _run_analysis_agent in a single async co-routine for gather
             async def analyze_file_task(file_path_for_task: Path):
                 logger.info(
                     f"--- Starting AI Analysis for file: {file_path_for_task.name} ---"
@@ -554,86 +496,62 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
                 enc_guess, delim_guess = await self._get_initial_guesses(
                     file_path_for_task
                 )
-
+                # Store the raw AI output for this file, potentially to add to StandardizationResult later
+                # This requires _run_analysis_agent to return the raw JSON string or parsed dict
+                ai_params_for_file = await self._run_analysis_agent(
                     file_path_for_task, enc_guess, delim_guess
                 )
+                return file_path_for_task, ai_params_for_file  # Return path with params

-            ai_analysis_tasks.append(
-                analyze_file_task(input_file_path)
-            )  # Pass the path to the task
+            ai_analysis_tasks.append(analyze_file_task(input_file_path))

         logger.info(f"Starting AI analysis for {len(ai_analysis_tasks)} CSV file(s)...")
+        all_ai_params_results_with_paths: List[Tuple[Path, Dict[str, Any]]] = []
         try:
-
+            all_ai_params_results_with_paths = await asyncio.gather(*ai_analysis_tasks)
         except Exception as e:
             logger.exception(f"Critical error during concurrent AI analysis phase: {e}")
             raise RuntimeError("AI analysis phase failed.") from e

         logger.info(
-            f"AI analysis complete for all {len(
+            f"AI analysis complete for all {len(all_ai_params_results_with_paths)} file(s)."
         )

-        # Aggregate parameters for the base CSVStandardizer
-        all_ai_table_names: List[str] = []
-        all_ai_table_descriptions: List[Optional[str]] = []
         all_ai_file_configs: List[Dict[str, Any]] = []
-        all_ai_column_definitions: List[
-            List[Dict[str, Any]]
-        ] = []  # List of lists of col_specs
-
-        for i, ai_params in enumerate(all_ai_params_results):
-            current_file_path = input_paths[i]  # Get corresponding input path
-            logger.info(f"Aggregating AI parameters for: {current_file_path.name}")
-            logger.info(f" AI Table Name: {ai_params['table_name']}")
-            logger.info(f" AI Encoding: {ai_params['encoding']}")
-            logger.info(f" AI Delimiter: '{ai_params['delimiter']}'")
-            logger.info(f" AI Has Header: {ai_params['has_header']}")
-            logger.info(f" AI Skip Rows: {ai_params['skip_rows']}")
-            logger.info(
-                f" AI Table Description: {ai_params.get('table_description') if ai_params.get('table_description') is not None else 'N/A'}"
-            )
-            # logger.info(f" AI Column Definitions ({len(ai_params['columns'])} cols): {ai_params['columns'][:2]}...")  # Log a sample

-
-
+        for file_path, ai_params in all_ai_params_results_with_paths:
+            logger.info(f"Aggregating AI parameters for: {file_path.name}")

-
+            file_conf_for_base = {
+                "table_name": ai_params["table_name"],
+                "description": ai_params.get("table_description"),
                 "encoding": ai_params["encoding"],
                 "delimiter": ai_params["delimiter"],
                 "has_header": ai_params["has_header"],
                 "skip_rows": ai_params["skip_rows"],
-                "
+                "column_definitions": ai_params["columns"],
             }
-            all_ai_file_configs.append(
-            all_ai_column_definitions.append(
-                ai_params["columns"]
-            )  # This is List[Dict], so we append it directly
+            all_ai_file_configs.append(file_conf_for_base)

-
-
-            "Initializing final CSVStandardizer with aggregated AI parameters..."
-        )
-        final_processor = CSVStandardizer(
-            table_names=all_ai_table_names,
-            descriptions=all_ai_table_descriptions,
-            file_configs=all_ai_file_configs,
-            column_definitions=all_ai_column_definitions,
-            skip_columns=self.default_skip_columns,  # Fallback, though ideally not used if AI defines all columns
+        logger.debug(
+            f"Initializing final CSVStandardizer with aggregated AI parameters: {all_ai_file_configs}"
         )
+        final_processor = CSVStandardizer(file_configs=all_ai_file_configs)

         try:
             logger.info(
                 f"Executing batch standardization for {len(input_paths)} file(s)..."
             )
-
-                datasource=input_paths,
+            standardization_result = final_processor.standardize(
+                datasource=input_paths,
                 output_path=output_path_obj,
                 overwrite=overwrite,
             )
             logger.info(
-                f"AI CSV Standardization complete
+                f"AI CSV Standardization complete. Output: {standardization_result.output_path}"
             )
-
+
+            return standardization_result
         except Exception as e:
             logger.exception(
                 f"Error during final batch standardization step using AI parameters: {e}"
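With these changes the AI standardizer drives the base CSVStandardizer entirely through per-file file_configs and returns a StandardizationResult. A rough usage sketch follows; the argument names are inferred from this diff, the MCP/agent wiring is elided, and the actual constructor may differ:

import asyncio
from pathlib import Path

from satif_ai.standardizers.ai_csv import AICSVStandardizer

async def main():
    standardizer = AICSVStandardizer(
        mcp_server=None,             # the diff tolerates no server: self.mcp_servers = []
        llm_model="o4-mini",         # assumed; matches the model name used elsewhere in this release
        initial_encoding="latin-1",  # optional hint passed to the analysis prompt
        initial_delimiter=";",       # optional hint passed to the analysis prompt
    )
    result = await standardizer.standardize(
        datasource=[Path("orders.csv")],   # hypothetical input CSV
        output_path=Path("orders.sdif"),
        overwrite=True,
    )
    print(result.output_path)

asyncio.run(main())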
satif_ai/transform.py
ADDED
@@ -0,0 +1,121 @@
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from fastmcp import FastMCP
+from fastmcp.client.transports import FastMCPTransport
+from satif_core.code_executors.base import CodeExecutor
+from satif_core.transformation_builders.base import AsyncTransformationBuilder
+from satif_core.types import (
+    FilePath,
+    SDIFPath,
+    TransformationResult,
+)
+from satif_sdk.code_executors.local_executor import LocalCodeExecutor
+from satif_sdk.transformers.code import CodeTransformer
+from sdif_mcp.server import mcp
+
+from satif_ai.transformation_builders.syncpulse import SyncpulseTransformationBuilder
+from satif_ai.utils.openai_mcp import OpenAICompatibleMCP
+
+
+async def atransform(
+    sdif: SDIFPath,
+    output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+    instructions: Optional[str] = None,
+    output_path: FilePath = Path("."),
+    *,
+    transformation_code: Optional[str] = None,
+    transformation_builder: Optional[AsyncTransformationBuilder] = None,
+    code_executor: Optional[CodeExecutor] = None,
+    mcp_server: Optional[FastMCP] = None,
+    mcp_transport: Optional[FastMCPTransport] = None,
+    llm_model: str = "o4-mini",
+    schema_only: bool = False,
+    representer_kwargs: Optional[Dict[str, Any]] = None,
+) -> TransformationResult:
+    """
+    Asynchronously transforms an SDIF (Standard Data Interchange Format) input using
+    an AI-generated or provided transformation code.
+
+    This function orchestrates the process of:
+    1. Optionally generating transformation code using an AI model via a `CodeBuilder`
+       if `transformation_code` is not provided.
+       explicitly passed.
+    2. Executing the transformation code using a `CodeTransformer` and a `CodeExecutor`.
+    3. Exporting the results to the specified output.
+
+    Args:
+        sdif: Path to the input SDIF file or an `SDIFDatabase` object.
+        output_target_files: A dictionary mapping original example file paths (or string identifiers)
+                             to their desired agent-facing filenames, or a list of output example
+                             file paths, or a single output file path. These are used by the AI to understand the target
+                             format and structure, and also by the `CodeTransformer` to determine
+                             output filenames if the transformation result keys match.
+        instructions: Optional. Natural language instructions for the AI to generate
+                      the transformation code. Used if `transformation_code` is None.
+        transformation_code: Optional. Pre-existing Python code for the transformation.
+                             If None, code will be generated by the `transformation_builder`.
+        transformation_builder: Optional. An `AsyncTransformationBuilder` instance responsible for generating
+                                the transformation code if `transformation_code` is not provided.
+                                If None, a `TransformationAsyncCodeBuilder` is instantiated.
+        code_executor: Optional. A `CodeExecutor` instance for running the transformation
+                       code. If None, a `LocalCodeExecutor` is used.
+        mcp_server: Optional. A `FastMCP` server instance for the AI code builder.
+                    Defaults to the global `mcp` instance if `transformation_builder` is None.
+        mcp_transport: Optional. A `FastMCPTransport` instance for communication with
+                       the `mcp_server`. Defaults to a new transport using `mcp_server`
+                       if `transformation_builder` is None.
+        llm_model: The language model to use for code generation (e.g., "o4-mini").
+                   Used if `transformation_builder` is None.
+        schema_only: If True, the transformation aims to match only the schema (headers)
+                     of the `output_target_files`, and input samples may be omitted or marked
+                     as empty for the AI. This is useful for structural transformations
+                     without processing actual data rows.
+        representer_kwargs: Optional dictionary of keyword arguments to pass to the
+                            representer when analyzing `output_target_files`.
+
+    Returns:
+        A `TransformationResult` object containing the path to the output
+        and the transformation code used.
+    """
+    if transformation_builder is None:
+        if mcp_server is None:
+            mcp_server = mcp
+
+        if mcp_transport is None:
+            mcp_transport = FastMCPTransport(mcp=mcp_server)
+
+        openai_compatible_mcp = OpenAICompatibleMCP(mcp=mcp_server)
+        await openai_compatible_mcp.connect()
+
+        transformation_builder = SyncpulseTransformationBuilder(
+            mcp_server=openai_compatible_mcp,
+            mcp_session=mcp_transport,
+            llm_model=llm_model,
+        )
+
+    if transformation_code is None:
+        function_code = await transformation_builder.build(
+            sdif=sdif,
+            output_target_files=output_target_files,
+            instructions=instructions,
+            schema_only=schema_only,
+            representer_kwargs=representer_kwargs,
+        )
+    else:
+        function_code = transformation_code
+
+    if code_executor is None:
+        code_executor = LocalCodeExecutor()
+
+    transformer = CodeTransformer(
+        function=function_code,
+        code_executor=code_executor,
+    )
+
+    output_path = transformer.export(
+        sdif=sdif,
+        output_path=output_path,
+    )
+
+    return TransformationResult(output_path=output_path, function_code=function_code)
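For orientation, a minimal sketch of calling the new atransform entry point, based only on the signature and docstring above; the SDIF path, output example file, and instructions are invented for illustration, and the defaults (global MCP server, local executor) are used:

import asyncio
from pathlib import Path

from satif_ai.transform import atransform

async def main():
    result = await atransform(
        sdif=Path("input.sdif"),                    # hypothetical SDIF produced by a standardizer
        output_target_files=[Path("report.xlsx")],  # hypothetical output example file
        instructions="One row per order, with a total per customer.",
        output_path=Path("./out"),
        llm_model="o4-mini",
    )
    print(result.output_path)

asyncio.run(main())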
satif_ai/{code_builders/transformation.py → transformation_builders/syncpulse.py}
RENAMED
@@ -8,7 +8,9 @@ from typing import Any, Dict, List, Optional, Union
 from agents import Agent, Runner, function_tool
 from agents.mcp.server import MCPServer
 from mcp import ClientSession
-from satif_core import
+from satif_core import AsyncTransformationBuilder
+from satif_core.types import FilePath
+from satif_sdk.code_executors.local_executor import LocalCodeExecutor
 from satif_sdk.comparators import get_comparator
 from satif_sdk.representers import get_representer
 from satif_sdk.transformers import CodeTransformer
@@ -61,11 +63,14 @@ async def execute_transformation(code: str) -> str:
     if INPUT_SDIF_PATH is None or OUTPUT_TARGET_FILES is None:
         return "Error: Transformation context not initialized"

-    code_transformer = CodeTransformer(
+    code_transformer = CodeTransformer(
+        function=code,
+        code_executor=LocalCodeExecutor(disable_security_warning=True),
+    )
     generated_output_path = code_transformer.export(INPUT_SDIF_PATH)

     comparisons = []
-    comparator_kwargs = {
+    comparator_kwargs = {}
     if SCHEMA_ONLY:
         comparator_kwargs["check_structure_only"] = True

@@ -120,19 +125,7 @@ async def execute_transformation(code: str) -> str:
     return "\n".join(comparisons)


-class
-    def __init__(self, output_example: Path | List[Path] | Dict[str, Path]):
-        self.output_example = output_example
-
-    def build(
-        self,
-        sdif: Path | SDIFDatabase,
-        instructions: Optional[str] = None,
-    ) -> str:
-        pass
-
-
-class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
+class SyncpulseTransformationBuilder(AsyncTransformationBuilder):
     """This class is used to build a transformation code that will be used to transform a SDIF database into a set of files following the format of the given output files."""

     def __init__(
@@ -147,23 +140,18 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):

     async def build(
         self,
-        sdif: Path,
-        output_target_files: Dict[
-        output_sdif: Optional[Path] = None,
-        instructions:
+        sdif: Path,
+        output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+        output_sdif: Optional[Path] = None,
+        instructions: str = "",
         schema_only: bool = False,
-
+        representer_kwargs: Optional[Dict[str, Any]] = None,
     ) -> str:
         global INPUT_SDIF_PATH, OUTPUT_TARGET_FILES, SCHEMA_ONLY
-
-        # If execute_transformation runs in the same process as the builder, absolute path is fine.
-        # If it were a separate context, this might need adjustment.
-        # For now, assume execute_transformation can access absolute paths if needed for its *input SDIF*.
-        # However, the sdif for MCP URIs must be relative.
+
         INPUT_SDIF_PATH = Path(sdif).resolve()
         SCHEMA_ONLY = schema_only
-        #
-        # So, use them directly as strings.
+        # We must encode the path because special characters are not allowed in mcp read_resource()
         input_sdif_mcp_uri_path = base64.b64encode(str(sdif).encode()).decode()
         output_sdif_mcp_uri_path = (
             base64.b64encode(str(output_sdif).encode()).decode()
@@ -205,9 +193,14 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):

         # OUTPUT_TARGET_FILES keys are absolute paths to original example files for local reading by representers/comparators.
         # Values are agent-facing filenames.
-        if isinstance(output_target_files,
+        if isinstance(output_target_files, FilePath):
+            OUTPUT_TARGET_FILES = {
+                Path(output_target_files).resolve(): Path(output_target_files).name
+            }
+        elif isinstance(output_target_files, list):
             OUTPUT_TARGET_FILES = {
-                file_path.resolve(): file_path.name
+                Path(file_path).resolve(): Path(file_path).name
+                for file_path in output_target_files
             }
         elif isinstance(output_target_files, dict):
             temp_map = {}
@@ -229,7 +222,7 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
             # Representer uses the absolute path (file_key_abs_path) to read the example file.
             representer = get_representer(file_key_abs_path)
             representation, used_params = representer.represent(
-                file_key_abs_path, **(
+                file_key_abs_path, **(representer_kwargs or {})
             )
             output_representation[agent_facing_name] = {
                 "representation": representation,
@@ -259,11 +252,10 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
                 "output_schema": output_schema_text,
                 "output_sample": output_sample_text
                 if not SCHEMA_ONLY
-                else "Sample not available.",
-                "output_representation": str(
-
-
-                "instructions": instructions,
+                else "Sample not available. File is empty (no data).",
+                "output_representation": str(output_representation),
+                "instructions": instructions
+                or "No instructions provided. Use the output example.",
             },
         )
         agent = Agent(
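The renamed builder can also be driven directly, mirroring how atransform wires it up in transform.py above. A sketch under the assumption that an OpenAICompatibleMCP server and a FastMCPTransport session are already connected; constructor argument names are taken from the atransform call, and the paths and instructions are invented:

from pathlib import Path

from satif_ai.transformation_builders.syncpulse import SyncpulseTransformationBuilder

async def build_transformation_code(mcp_server, mcp_session) -> str:
    # mcp_server / mcp_session are assumed to be the already-connected objects
    # that atransform() prepares via OpenAICompatibleMCP and FastMCPTransport.
    builder = SyncpulseTransformationBuilder(
        mcp_server=mcp_server,
        mcp_session=mcp_session,
        llm_model="o4-mini",
    )
    return await builder.build(
        sdif=Path("input.sdif"),                   # hypothetical input SDIF
        output_target_files=[Path("report.csv")],  # hypothetical output example
        instructions="Flatten the orders table into a single CSV.",
        schema_only=False,
    )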
satif_ai/utils/merge_sdif.py
ADDED
@@ -0,0 +1,22 @@
+from pathlib import Path
+from typing import List
+
+
+async def merge_sdif_files(sdif_paths: List[Path], output_dir: Path) -> Path:
+    """Placeholder function to merge multiple SDIF files into one.
+
+    Args:
+        sdif_paths: A list of paths to the SDIF files to merge.
+        output_dir: The directory where the merged file should be saved.
+
+    Returns:
+        Path to the merged SDIF file.
+    """
+    if not sdif_paths:
+        raise ValueError("No SDIF files provided for merging.")
+
+    if len(sdif_paths) == 1:
+        return sdif_paths[0]  # No merge needed
+
+    # TODO: Implement SDIF merge
+    raise NotImplementedError("Merge not implemented yet.")