satif-ai 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,12 +12,13 @@ from agents import Agent, Runner, function_tool
  from agents.mcp.server import MCPServerStdio
  from charset_normalizer import detect
  from mcp import ClientSession
- from satif_core.types import Datasource, SDIFPath
+ from satif_core.types import Datasource, SDIFPath, StandardizationResult
  from satif_sdk.standardizers.csv import (
+     CSVStandardizer,
+ )
+ from satif_sdk.utils import (
      DELIMITER_SAMPLE_SIZE,
      ENCODING_SAMPLE_SIZE,
-     CSVStandardizer,
-     SkipColumnsConfig,
  )

  logger = logging.getLogger(__name__)
@@ -36,7 +37,7 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g
  - Encoding: {initial_encoding}
  - Delimiter: '{initial_delimiter}'

- **Your Comprehensive Task:**
+ **Your Task:**

  1. **Core Parsing Parameters:**
      * Determine the correct file `encoding` (string, e.g., "utf-8", "latin-1").
@@ -50,7 +51,7 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g

  3. **Column Analysis and Definition:**
      * For **each column** you identify that should be included in the final table:
-         * `identifier_in_csv` (string): This is how the column is found in the *raw CSV data*.
+         * `original_identifier` (string): This is how the column is found in the *raw CSV data*.
              * If `has_header` is true, this is the **exact original header name** from the CSV.
              * If `has_header` is false, this is a **string representation of the 0-based column index** (e.g., "0", "1", "2").
          * `final_column_name` (string): This is the desired name for the column in the SDIF database table. It **MUST** be:
@@ -76,7 +77,7 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g
      "skip_rows": 0, // Integer for initial N, or sorted list of 0-based indices e.g. [0, 1, 5]
      "columns": [
          {{
-             "identifier_in_csv": "original_header_or_index_string",
+             "original_identifier": "original_header_or_index_string",
              "final_column_name": "sanitized_snake_case_name",
              "description": null // Or string value. Null or omit if not generated.
          }}
@@ -88,19 +89,18 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g
  **Tools Available:**
  - `read_csv_sample(encoding: str, delimiter: str, skip_initial_rows: int = 0, row_limit: int = 20, include_row_indices: bool = False)`: Reads a sample from the *beginning* of the file. Crucial for header and initial structure.
  - `read_raw_lines(encoding: str, line_limit: int = 50, start_line: int = 0)`: Reads raw lines. Useful for finding specific rows to skip (empty, repeated headers, footers) by their 0-based index.
- - `get_file_chunk(encoding: str, start_byte: int = 0, end_byte: int = 4096)`: Reads a raw chunk. Good for diagnosing encoding/delimiter issues if `read_csv_sample` returns garbled data or errors.

  **General Workflow Guidance:**
- 1. **Initial Probe & Core Params:** Use `read_csv_sample` with initial hints (and `include_row_indices=True`) to examine the first few rows. Verify/correct `encoding` and `delimiter`. If `read_csv_sample` reports errors or shows garbled data, use `get_file_chunk` with different encodings to diagnose. Determine `has_header` by looking at the first non-skipped row.
+ 1. **Initial Probe & Core Params:** Use `read_csv_sample` with initial hints (and `include_row_indices=True`) to examine the first few rows. Verify/correct `encoding` and `delimiter`; if `read_csv_sample` reports errors or shows garbled data, retry it with adjusted values before proceeding. Determine `has_header` by looking at the first non-skipped row.
  2. **Identify Skip Rows:**
      * If there's metadata/comments at the top, determine how many initial rows to skip and use that for `skip_rows` (integer value).
      * Use `read_raw_lines` to scan for other rows to skip (e.g., empty lines, comment lines, repeated headers mid-file, summary footers). Collect all 0-based indices of such rows. If you have specific indices, `skip_rows` should be a sorted list of these indices. If you only skip initial N rows, it's an integer.
  3. **Column Identification & Definition:**
      * After settling `skip_rows` and `has_header`, call `read_csv_sample` again with `skip_initial_rows` set appropriately (if `skip_rows` is an int) to see the clean data rows and the header (if present).
-     * If `has_header` is true, the first row from this clean sample gives you the `identifier_in_csv` values (original header names).
-     * If `has_header` is false, the `identifier_in_csv` for each column will be its 0-based index as a string (e.g., "0", "1", "2", ... for as many columns as you see in the first data row).
+     * If `has_header` is true, the first row from this clean sample gives you the `original_identifier` values (original header names).
+     * If `has_header` is false, the `original_identifier` for each column will be its 0-based index as a string (e.g., "0", "1", "2", ... for as many columns as you see in the first data row).
      * For each column you decide to include:
-         * Determine its `identifier_in_csv`.
+         * Determine its `original_identifier`.
          * Create a clean, descriptive `final_column_name` (snake_case).
          * If (and ONLY IF) necessary, write a `description` for that column.
  4. **Table Naming & Description:** Based on the clean data and column names, formulate a `table_name` and, if valuable, a `table_description`.
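For orientation, a parsed analysis result matching the JSON format and workflow described in the prompt above might look like the following Python dict. The values are hypothetical and purely illustrative of the documented shape (note the `original_identifier` key, renamed from `identifier_in_csv`):

    # Illustrative only -- hypothetical agent output, not taken from the package.
    ai_params = {
        "table_name": "monthly_sales",
        "table_description": None,
        "encoding": "latin-1",
        "delimiter": ";",
        "has_header": False,
        "skip_rows": [0, 1],  # two metadata lines before the data
        "columns": [
            {"original_identifier": "0", "final_column_name": "order_id", "description": None},
            {"original_identifier": "1", "final_column_name": "order_date", "description": "Date the order was placed."},
        ],
    }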
@@ -273,60 +273,6 @@ async def read_raw_lines(
      )


- @function_tool
- async def get_file_chunk(
-     encoding: str, start_byte: int | None, end_byte: int | None
- ) -> str:
-     if start_byte is None:
-         start_byte = 0
-     if end_byte is None:
-         end_byte = 4096
-     context = _CURRENT_AI_CSV_TOOL_CONTEXT.get()
-     if not context or not context.file_path or not context.file_path.exists():
-         return json.dumps({"error": "File path not found in tool context."})
-     if start_byte < 0 or end_byte < start_byte:
-         return json.dumps({"error": "Invalid byte range specified."})
-
-     chunk_text = ""
-     error_message = None
-     bytes_read = 0
-     try:
-         with open(context.file_path, "rb") as fb:
-             file_size = context.file_path.stat().st_size
-             effective_start_byte = min(start_byte, file_size)
-             fb.seek(effective_start_byte)
-             bytes_to_read = max(0, min(end_byte, file_size) - effective_start_byte)
-             if bytes_to_read > 0:
-                 chunk_bytes = fb.read(bytes_to_read)
-                 bytes_read = len(chunk_bytes)
-                 chunk_text = chunk_bytes.decode(encoding, errors="replace")
-             else:
-                 chunk_text = ""
-         return json.dumps(
-             {
-                 "chunk": chunk_text,
-                 "bytes_read": bytes_read,
-                 "requested_range": [start_byte, end_byte],
-                 "error": None,
-             }
-         )
-     except (UnicodeDecodeError, ValueError) as e:
-         error_message = f"Failed to decode file chunk: {e}. Used encoding '{encoding}'."
-     except OSError as e:
-         error_message = f"File read error: {e}."
-     except Exception as e:
-         logger.error(f"Unexpected error in get_file_chunk tool: {e}", exc_info=True)
-         error_message = f"Unexpected error reading file chunk: {str(e)}"
-     return json.dumps(
-         {
-             "error": error_message,
-             "chunk": chunk_text,
-             "bytes_read": bytes_read,
-             "requested_range": [start_byte, end_byte],
-         }
-     )
-
-
  # --- AICSVStandardizer Class ---
  class AICSVStandardizer(CSVStandardizer): # Inherits from the enhanced CSVStandardizer
      def __init__(
@@ -337,19 +283,18 @@ class AICSVStandardizer(CSVStandardizer): # Inherits from the enhanced CSVStand
          # --- Initial Hints (Optional) ---
          initial_delimiter: Optional[str] = None,
          initial_encoding: Optional[str] = None,
-         # --- Base Class Args Passthrough (some will be overridden by AI) ---
-         default_skip_columns: SkipColumnsConfig = None, # Keep for base if AI doesn't define cols
      ):
+         # AI will determine the file_configs
          super().__init__(
-             delimiter=None, # AI will determine
-             encoding=None, # AI will determine
-             has_header=True, # AI will determine
-             skip_rows=0, # AI will determine
-             skip_columns=default_skip_columns, # Can still be a fallback
-             descriptions=None, # AI will generate table_description
-             table_names=None, # AI will generate table_name
-             file_configs=None, # AI provides all config for the one file
-             column_definitions=None, # AI will generate column definitions
+             delimiter=None,
+             encoding=None,
+             has_header=True,
+             skip_rows=0,
+             skip_columns=None,
+             descriptions=None,
+             table_names=None,
+             file_configs=None,
+             column_definitions=None,
          )

          self.mcp_servers = [mcp_server] if mcp_server else []
@@ -357,7 +302,6 @@ class AICSVStandardizer(CSVStandardizer): # Inherits from the enhanced CSVStand
          self.llm_model = llm_model
          self._initial_delimiter_hint = initial_delimiter
          self._initial_encoding_hint = initial_encoding
-         # self.generate_description from prompt structure (table_description, column descriptions)

      async def _get_initial_guesses(self, file_path: Path) -> Tuple[str, str]:
          """Helper to get initial encoding and delimiter guesses for a single file."""
@@ -419,7 +363,7 @@ class AICSVStandardizer(CSVStandardizer): # Inherits from the enhanced CSVStand
          agent = Agent(
              name="CSV Detail Analyzer Agent",
              mcp_servers=self.mcp_servers,
-             tools=[read_csv_sample, read_raw_lines, get_file_chunk],
+             tools=[read_csv_sample, read_raw_lines],
              model=self.llm_model,
          )
          logger.info(f"Running CSV Detail Analyzer Agent for {file_path.name}...")
@@ -469,7 +413,7 @@ class AICSVStandardizer(CSVStandardizer): # Inherits from the enhanced CSVStand
                  raise ValueError(
                      f"Each item in 'columns' list must be a dictionary. Found: {type(col_spec)}"
                  )
-             req_col_keys = {"identifier_in_csv", "final_column_name"}
+             req_col_keys = {"original_identifier", "final_column_name"}
              if not req_col_keys.issubset(col_spec.keys()):
                  missing_col_keys = req_col_keys - col_spec.keys()
                  raise ValueError(
@@ -520,7 +464,7 @@ class AICSVStandardizer(CSVStandardizer): # Inherits from the enhanced CSVStand
          overwrite: bool = False,
          config: Optional[Dict[str, Any]] = None,
          **kwargs,
-     ) -> Path:
+     ) -> StandardizationResult:
          output_path_obj = Path(output_path)

          input_paths: List[Path]
@@ -545,8 +489,6 @@ class AICSVStandardizer(CSVStandardizer): # Inherits from the enhanced CSVStand
                      f"Input CSV file not found or is not a file: {input_file_path}"
                  )

-             # Create a task for each file's analysis
-             # Need to wrap _get_initial_guesses and _run_analysis_agent in a single async co-routine for gather
              async def analyze_file_task(file_path_for_task: Path):
                  logger.info(
                      f"--- Starting AI Analysis for file: {file_path_for_task.name} ---"
@@ -554,86 +496,62 @@ class AICSVStandardizer(CSVStandardizer): # Inherits from the enhanced CSVStand
                  enc_guess, delim_guess = await self._get_initial_guesses(
                      file_path_for_task
                  )
-                 return await self._run_analysis_agent(
+                 # Store the raw AI output for this file, potentially to add to StandardizationResult later
+                 # This requires _run_analysis_agent to return the raw JSON string or parsed dict
+                 ai_params_for_file = await self._run_analysis_agent(
                      file_path_for_task, enc_guess, delim_guess
                  )
+                 return file_path_for_task, ai_params_for_file # Return path with params

-             ai_analysis_tasks.append(
-                 analyze_file_task(input_file_path)
-             ) # Pass the path to the task
+             ai_analysis_tasks.append(analyze_file_task(input_file_path))

          logger.info(f"Starting AI analysis for {len(ai_analysis_tasks)} CSV file(s)...")
+         all_ai_params_results_with_paths: List[Tuple[Path, Dict[str, Any]]] = []
          try:
-             all_ai_params_results = await asyncio.gather(*ai_analysis_tasks)
+             all_ai_params_results_with_paths = await asyncio.gather(*ai_analysis_tasks)
          except Exception as e:
              logger.exception(f"Critical error during concurrent AI analysis phase: {e}")
              raise RuntimeError("AI analysis phase failed.") from e

          logger.info(
-             f"AI analysis complete for all {len(all_ai_params_results)} file(s)."
+             f"AI analysis complete for all {len(all_ai_params_results_with_paths)} file(s)."
          )

-         # Aggregate parameters for the base CSVStandardizer
-         all_ai_table_names: List[str] = []
-         all_ai_table_descriptions: List[Optional[str]] = []
          all_ai_file_configs: List[Dict[str, Any]] = []
-         all_ai_column_definitions: List[
-             List[Dict[str, Any]]
-         ] = [] # List of lists of col_specs
-
-         for i, ai_params in enumerate(all_ai_params_results):
-             current_file_path = input_paths[i] # Get corresponding input path
-             logger.info(f"Aggregating AI parameters for: {current_file_path.name}")
-             logger.info(f" AI Table Name: {ai_params['table_name']}")
-             logger.info(f" AI Encoding: {ai_params['encoding']}")
-             logger.info(f" AI Delimiter: '{ai_params['delimiter']}'")
-             logger.info(f" AI Has Header: {ai_params['has_header']}")
-             logger.info(f" AI Skip Rows: {ai_params['skip_rows']}")
-             logger.info(
-                 f" AI Table Description: {ai_params.get('table_description') if ai_params.get('table_description') is not None else 'N/A'}"
-             )
-             # logger.info(f" AI Column Definitions ({len(ai_params['columns'])} cols): {ai_params['columns'][:2]}...") # Log a sample

-             all_ai_table_names.append(ai_params["table_name"])
-             all_ai_table_descriptions.append(ai_params.get("table_description"))
+         for file_path, ai_params in all_ai_params_results_with_paths:
+             logger.info(f"Aggregating AI parameters for: {file_path.name}")

-             file_conf = {
+             file_conf_for_base = {
+                 "table_name": ai_params["table_name"],
+                 "description": ai_params.get("table_description"),
                  "encoding": ai_params["encoding"],
                  "delimiter": ai_params["delimiter"],
                  "has_header": ai_params["has_header"],
                  "skip_rows": ai_params["skip_rows"],
-                 "skip_columns": None, # Column selection is handled by column_definitions
+                 "column_definitions": ai_params["columns"],
              }
-             all_ai_file_configs.append(file_conf)
-             all_ai_column_definitions.append(
-                 ai_params["columns"]
-             ) # This is List[Dict], so we append it directly
+             all_ai_file_configs.append(file_conf_for_base)

-         # Instantiate the base CSVStandardizer with aggregated AI-derived parameters
-         logger.info(
-             "Initializing final CSVStandardizer with aggregated AI parameters..."
-         )
-         final_processor = CSVStandardizer(
-             table_names=all_ai_table_names,
-             descriptions=all_ai_table_descriptions,
-             file_configs=all_ai_file_configs,
-             column_definitions=all_ai_column_definitions,
-             skip_columns=self.default_skip_columns, # Fallback, though ideally not used if AI defines all columns
+         logger.debug(
+             f"Initializing final CSVStandardizer with aggregated AI parameters: {all_ai_file_configs}"
          )
+         final_processor = CSVStandardizer(file_configs=all_ai_file_configs)

          try:
              logger.info(
                  f"Executing batch standardization for {len(input_paths)} file(s)..."
              )
-             result_path = final_processor.standardize(
-                 datasource=input_paths, # Pass the original list of Path objects
+             standardization_result = final_processor.standardize(
+                 datasource=input_paths,
                  output_path=output_path_obj,
                  overwrite=overwrite,
              )
              logger.info(
-                 f"AI CSV Standardization complete for all files. Output: {result_path}"
+                 f"AI CSV Standardization complete. Output: {standardization_result.output_path}"
              )
-             return result_path
+
+             return standardization_result
          except Exception as e:
              logger.exception(
                  f"Error during final batch standardization step using AI parameters: {e}"
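For context, a minimal usage sketch of the updated standardizer API. This is illustrative only: the import path, file names, and keyword values are assumptions, not part of this diff.

    # Illustrative sketch -- not part of the diff.
    import asyncio
    from pathlib import Path

    from satif_ai.standardizers.ai_csv import AICSVStandardizer  # assumed module path

    async def main() -> None:
        standardizer = AICSVStandardizer(
            mcp_server=None,             # or a configured MCP server instance
            llm_model="o4-mini",         # assumed model name
            initial_encoding="utf-8",    # optional hint; the agent may override it
            initial_delimiter=",",       # optional hint; the agent may override it
        )
        # standardize() now returns a StandardizationResult instead of a bare Path.
        result = await standardizer.standardize(
            datasource=[Path("orders.csv")],   # hypothetical input file
            output_path=Path("orders.sdif"),
            overwrite=True,
        )
        print(result.output_path)

    asyncio.run(main())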
satif_ai/transform.py ADDED
@@ -0,0 +1,121 @@
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from fastmcp import FastMCP
+ from fastmcp.client.transports import FastMCPTransport
+ from satif_core.code_executors.base import CodeExecutor
+ from satif_core.transformation_builders.base import AsyncTransformationBuilder
+ from satif_core.types import (
+     FilePath,
+     SDIFPath,
+     TransformationResult,
+ )
+ from satif_sdk.code_executors.local_executor import LocalCodeExecutor
+ from satif_sdk.transformers.code import CodeTransformer
+ from sdif_mcp.server import mcp
+
+ from satif_ai.transformation_builders.syncpulse import SyncpulseTransformationBuilder
+ from satif_ai.utils.openai_mcp import OpenAICompatibleMCP
+
+
+ async def atransform(
+     sdif: SDIFPath,
+     output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+     instructions: Optional[str] = None,
+     output_path: FilePath = Path("."),
+     *,
+     transformation_code: Optional[str] = None,
+     transformation_builder: Optional[AsyncTransformationBuilder] = None,
+     code_executor: Optional[CodeExecutor] = None,
+     mcp_server: Optional[FastMCP] = None,
+     mcp_transport: Optional[FastMCPTransport] = None,
+     llm_model: str = "o4-mini",
+     schema_only: bool = False,
+     representer_kwargs: Optional[Dict[str, Any]] = None,
+ ) -> TransformationResult:
+     """
+     Asynchronously transforms an SDIF (Standard Data Interchange Format) input using
+     an AI-generated or provided transformation code.
+
+     This function orchestrates the process of:
+     1. Optionally generating transformation code using an AI model via an
+        `AsyncTransformationBuilder` if `transformation_code` is not explicitly provided.
+     2. Executing the transformation code using a `CodeTransformer` and a `CodeExecutor`.
+     3. Exporting the results to the specified output.
+
+     Args:
+         sdif: Path to the input SDIF file or an `SDIFDatabase` object.
+         output_target_files: A dictionary mapping original example file paths (or string identifiers)
+             to their desired agent-facing filenames, or a list of output example
+             file paths, or a single output file path. These are used by the AI to understand the target
+             format and structure, and also by the `CodeTransformer` to determine
+             output filenames if the transformation result keys match.
+         instructions: Optional. Natural language instructions for the AI to generate
+             the transformation code. Used if `transformation_code` is None.
+         transformation_code: Optional. Pre-existing Python code for the transformation.
+             If None, code will be generated by the `transformation_builder`.
+         transformation_builder: Optional. An `AsyncTransformationBuilder` instance responsible for generating
+             the transformation code if `transformation_code` is not provided.
+             If None, a `SyncpulseTransformationBuilder` is instantiated.
+         code_executor: Optional. A `CodeExecutor` instance for running the transformation
+             code. If None, a `LocalCodeExecutor` is used.
+         mcp_server: Optional. A `FastMCP` server instance for the AI code builder.
+             Defaults to the global `mcp` instance if `transformation_builder` is None.
+         mcp_transport: Optional. A `FastMCPTransport` instance for communication with
+             the `mcp_server`. Defaults to a new transport using `mcp_server`
+             if `transformation_builder` is None.
+         llm_model: The language model to use for code generation (e.g., "o4-mini").
+             Used if `transformation_builder` is None.
+         schema_only: If True, the transformation aims to match only the schema (headers)
+             of the `output_target_files`, and input samples may be omitted or marked
+             as empty for the AI. This is useful for structural transformations
+             without processing actual data rows.
+         representer_kwargs: Optional dictionary of keyword arguments to pass to the
+             representer when analyzing `output_target_files`.
+
+     Returns:
+         A `TransformationResult` object containing the path to the output
+         and the transformation code used.
+     """
+     if transformation_builder is None:
+         if mcp_server is None:
+             mcp_server = mcp
+
+         if mcp_transport is None:
+             mcp_transport = FastMCPTransport(mcp=mcp_server)
+
+         openai_compatible_mcp = OpenAICompatibleMCP(mcp=mcp_server)
+         await openai_compatible_mcp.connect()
+
+         transformation_builder = SyncpulseTransformationBuilder(
+             mcp_server=openai_compatible_mcp,
+             mcp_session=mcp_transport,
+             llm_model=llm_model,
+         )
+
+     if transformation_code is None:
+         function_code = await transformation_builder.build(
+             sdif=sdif,
+             output_target_files=output_target_files,
+             instructions=instructions,
+             schema_only=schema_only,
+             representer_kwargs=representer_kwargs,
+         )
+     else:
+         function_code = transformation_code
+
+     if code_executor is None:
+         code_executor = LocalCodeExecutor()
+
+     transformer = CodeTransformer(
+         function=function_code,
+         code_executor=code_executor,
+     )
+
+     output_path = transformer.export(
+         sdif=sdif,
+         output_path=output_path,
+     )
+
+     return TransformationResult(output_path=output_path, function_code=function_code)
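A minimal usage sketch of the new `atransform` entry point follows. File names and instructions are hypothetical; the call shape follows the signature and docstring above.

    # Illustrative sketch -- not part of the diff.
    import asyncio
    from pathlib import Path

    from satif_ai.transform import atransform

    async def main() -> None:
        result = await atransform(
            sdif=Path("orders.sdif"),                                   # hypothetical input SDIF
            output_target_files={Path("examples/report.xlsx"): "report.xlsx"},
            instructions="Aggregate orders by month, one row per month.",
            output_path=Path("out/"),
        )
        print(result.output_path)      # where the transformed file(s) were written
        print(result.function_code)    # the transformation code that was used

    asyncio.run(main())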
@@ -8,7 +8,9 @@ from typing import Any, Dict, List, Optional, Union
  from agents import Agent, Runner, function_tool
  from agents.mcp.server import MCPServer
  from mcp import ClientSession
- from satif_core import AsyncCodeBuilder, CodeBuilder, SDIFDatabase
+ from satif_core import AsyncTransformationBuilder
+ from satif_core.types import FilePath
+ from satif_sdk.code_executors.local_executor import LocalCodeExecutor
  from satif_sdk.comparators import get_comparator
  from satif_sdk.representers import get_representer
  from satif_sdk.transformers import CodeTransformer
@@ -61,11 +63,14 @@ async def execute_transformation(code: str) -> str:
      if INPUT_SDIF_PATH is None or OUTPUT_TARGET_FILES is None:
          return "Error: Transformation context not initialized"

-     code_transformer = CodeTransformer(function=code)
+     code_transformer = CodeTransformer(
+         function=code,
+         code_executor=LocalCodeExecutor(disable_security_warning=True),
+     )
      generated_output_path = code_transformer.export(INPUT_SDIF_PATH)

      comparisons = []
-     comparator_kwargs = {"check_structure_only": True}
+     comparator_kwargs = {}
      if SCHEMA_ONLY:
          comparator_kwargs["check_structure_only"] = True

@@ -120,19 +125,7 @@ async def execute_transformation(code: str) -> str:
      return "\n".join(comparisons)


- class TransformationCodeBuilder(CodeBuilder):
-     def __init__(self, output_example: Path | List[Path] | Dict[str, Path]):
-         self.output_example = output_example
-
-     def build(
-         self,
-         sdif: Path | SDIFDatabase,
-         instructions: Optional[str] = None,
-     ) -> str:
-         pass
-
-
- class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
+ class SyncpulseTransformationBuilder(AsyncTransformationBuilder):
      """This class is used to build a transformation code that will be used to transform a SDIF database into a set of files following the format of the given output files."""

      def __init__(
@@ -147,23 +140,18 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):

      async def build(
          self,
-         sdif: Path, # This will now be relative to project root (MCP server CWD)
-         output_target_files: Dict[Union[str, Path], str] | List[Path],
-         output_sdif: Optional[Path] = None, # This will now be relative or None
-         instructions: Optional[str] = None,
+         sdif: Path,
+         output_target_files: Dict[FilePath, str] | List[FilePath] | FilePath,
+         output_sdif: Optional[Path] = None,
+         instructions: str = "",
          schema_only: bool = False,
-         representer_options_for_build: Optional[Dict[str, Any]] = None,
+         representer_kwargs: Optional[Dict[str, Any]] = None,
      ) -> str:
          global INPUT_SDIF_PATH, OUTPUT_TARGET_FILES, SCHEMA_ONLY
-         # INPUT_SDIF_PATH is used by execute_transformation tool, needs to be accessible from where that tool runs.
-         # If execute_transformation runs in the same process as the builder, absolute path is fine.
-         # If it were a separate context, this might need adjustment.
-         # For now, assume execute_transformation can access absolute paths if needed for its *input SDIF*.
-         # However, the sdif for MCP URIs must be relative.
+
          INPUT_SDIF_PATH = Path(sdif).resolve()
          SCHEMA_ONLY = schema_only
-         # Paths for MCP URIs are now expected to be relative to MCP server CWD (project root)
-         # So, use them directly as strings.
+         # We must encode the path because special characters are not allowed in mcp read_resource()
          input_sdif_mcp_uri_path = base64.b64encode(str(sdif).encode()).decode()
          output_sdif_mcp_uri_path = (
              base64.b64encode(str(output_sdif).encode()).decode()
@@ -205,9 +193,14 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):

          # OUTPUT_TARGET_FILES keys are absolute paths to original example files for local reading by representers/comparators.
          # Values are agent-facing filenames.
-         if isinstance(output_target_files, list):
+         if isinstance(output_target_files, FilePath):
+             OUTPUT_TARGET_FILES = {
+                 Path(output_target_files).resolve(): Path(output_target_files).name
+             }
+         elif isinstance(output_target_files, list):
              OUTPUT_TARGET_FILES = {
-                 file_path.resolve(): file_path.name for file_path in output_target_files
+                 Path(file_path).resolve(): Path(file_path).name
+                 for file_path in output_target_files
              }
          elif isinstance(output_target_files, dict):
              temp_map = {}
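For reference, all three accepted `output_target_files` forms normalize to a `{resolved_path: agent_facing_name}` mapping, as the branches above show. A small sketch with hypothetical paths:

    # Hypothetical inputs -- each form ends up as {resolved_path: agent_facing_name}.
    from pathlib import Path

    single  = Path("report.xlsx")                             # single path
    as_list = [Path("report.xlsx"), Path("summary.csv")]      # list of paths
    as_dict = {Path("report.xlsx"): "monthly_report.xlsx"}    # explicit agent-facing names

    # e.g. the list form becomes:
    normalized = {Path(p).resolve(): Path(p).name for p in as_list}
    # -> {.../report.xlsx: 'report.xlsx', .../summary.csv: 'summary.csv'}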
@@ -229,7 +222,7 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
              # Representer uses the absolute path (file_key_abs_path) to read the example file.
              representer = get_representer(file_key_abs_path)
              representation, used_params = representer.represent(
-                 file_key_abs_path, **(representer_options_for_build or {})
+                 file_key_abs_path, **(representer_kwargs or {})
              )
              output_representation[agent_facing_name] = {
                  "representation": representation,
@@ -259,11 +252,10 @@ class TransformationAsyncCodeBuilder(AsyncCodeBuilder):
                  "output_schema": output_schema_text,
                  "output_sample": output_sample_text
                  if not SCHEMA_ONLY
-                 else "Sample not available.",
-                 "output_representation": str(
-                     output_representation
-                 ), # Representation keyed by agent-facing name
-                 "instructions": instructions,
+                 else "Sample not available. File is empty (no data).",
+                 "output_representation": str(output_representation),
+                 "instructions": instructions
+                 or "No instructions provided. Use the output example.",
              },
          )
          agent = Agent(
@@ -0,0 +1,5 @@
+ from .merge_sdif import merge_sdif_files
+ from .openai_mcp import OpenAICompatibleMCP
+ from .zip import extract_zip_archive_async
+
+ __all__ = ["merge_sdif_files", "extract_zip_archive_async", "OpenAICompatibleMCP"]
@@ -0,0 +1,22 @@
+ from pathlib import Path
+ from typing import List
+
+
+ async def merge_sdif_files(sdif_paths: List[Path], output_dir: Path) -> Path:
+     """Placeholder function to merge multiple SDIF files into one.
+
+     Args:
+         sdif_paths: A list of paths to the SDIF files to merge.
+         output_dir: The directory where the merged file should be saved.
+
+     Returns:
+         Path to the merged SDIF file.
+     """
+     if not sdif_paths:
+         raise ValueError("No SDIF files provided for merging.")
+
+     if len(sdif_paths) == 1:
+         return sdif_paths[0] # No merge needed
+
+     # TODO: Implement SDIF merge
+     raise NotImplementedError("Merge not implemented yet.")