satif-ai 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
satif_ai/adapters/tidy.py CHANGED
@@ -9,7 +9,7 @@ from pathlib import Path
9
9
  from typing import Optional, Union
10
10
 
11
11
  from agents import Agent, Runner, function_tool
12
- from agents.mcp.server import MCPServerStdio
12
+ from agents.mcp.server import MCPServer
13
13
  from mcp import ClientSession
14
14
  from satif_core.adapters.base import Adapter
15
15
  from satif_core.types import Datasource, SDIFPath
@@ -224,7 +224,7 @@ class TidyAdapter(Adapter):
224
224
 
225
225
  def __init__(
226
226
  self,
227
- mcp_server: MCPServerStdio,
227
+ mcp_server: MCPServer,
228
228
  mcp_session: ClientSession,
229
229
  llm_model: str = "o4-mini",
230
230
  max_iterations: int = 5,
@@ -233,7 +233,7 @@ class TidyAdapter(Adapter):
233
233
  Initialize the TidyAdapter.
234
234
 
235
235
  Args:
236
- mcp_server: An instance of MCPServerStdio for agent communication.
236
+ mcp_server: An instance of MCPServer for agent communication.
237
237
  mcp_session: An instance of ClientSession for resource/prompt fetching.
238
238
  llm_model: Name of the language model to use for the agent.
239
239
  max_iterations: Maximum number of attempts the agent gets to refine the code.
@@ -349,7 +349,7 @@ class TidyAdapter(Adapter):
349
349
  if isinstance(sdif, SDIFDatabase):
350
350
  input_path = Path(sdif.path)
351
351
  else:
352
- input_path = sdif
352
+ input_path = Path(sdif)
353
353
  if not input_path.exists():
354
354
  raise FileNotFoundError(f"Input SDIF file not found: {input_path}")
355
355
 
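For reference, a minimal usage sketch of the widened constructor: TidyAdapter now accepts any `agents.mcp.server.MCPServer` implementation rather than only `MCPServerStdio`. The stdio command, module name, and session wiring below are illustrative assumptions, not part of the package.

# Hedged sketch: TidyAdapter with a generic MCPServer (MCPServerStdio still qualifies).
from agents.mcp.server import MCPServerStdio  # any MCPServer subclass is accepted now
from mcp import ClientSession
from satif_ai.adapters.tidy import TidyAdapter

async def make_tidy_adapter(mcp_session: ClientSession) -> TidyAdapter:
    # Placeholder server command; point this at the MCP server you actually run.
    server = MCPServerStdio(params={"command": "python", "args": ["-m", "my_mcp_server"]})
    await server.connect()
    return TidyAdapter(
        mcp_server=server,        # typed as MCPServer after this release
        mcp_session=mcp_session,
        llm_model="o4-mini",
        max_iterations=5,
    )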
satif_ai/standardizers/ai.py CHANGED
@@ -11,6 +11,7 @@ from satif_core.standardizers.base import AsyncStandardizer
11
11
  from satif_core.types import Datasource, FilePath, SDIFPath, StandardizationResult
12
12
 
13
13
  from satif_ai.adapters.tidy import TidyAdapter
14
+ from satif_ai.standardizers.ai_xlsx import AIXLSXStandardizer
14
15
  from satif_ai.utils.merge_sdif import merge_sdif_files
15
16
  from satif_ai.utils.zip import extract_zip_archive_async
16
17
 
@@ -43,8 +44,9 @@ class AIStandardizer(AsyncStandardizer):
43
44
 
44
45
  self.ai_standardizer_map: Dict[str, Type[AsyncStandardizer]] = {
45
46
  ".csv": AICSVStandardizer,
46
- # Future standardizers:
47
- # ".xlsx": AIXLSXStandardizer,
47
+ ".xlsx": AIXLSXStandardizer,
48
+ ".xls": AIXLSXStandardizer,
49
+ ".xlsm": AIXLSXStandardizer,
48
50
  # ".pdf": AIPDFStandardizer,
49
51
  # ".json": AIJSONStandardizer,
50
52
  # ".xml": AIXMLStandardizer,
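The map above now routes .xlsx, .xls, and .xlsm files to the new AIXLSXStandardizer. As a rough sketch of the suffix dispatch this enables (the real grouping lives in `_group_files_by_standardizer`; the helper name below is illustrative):

from pathlib import Path
from typing import Dict, Type
from satif_core.standardizers.base import AsyncStandardizer

def pick_standardizer(
    path: Path, standardizer_map: Dict[str, Type[AsyncStandardizer]]
) -> Type[AsyncStandardizer]:
    # Look up the AI standardizer class registered for this file extension.
    cls = standardizer_map.get(path.suffix.lower())
    if cls is None:
        raise ValueError(f"No AI standardizer registered for '{path.suffix}' files")
    return cls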
@@ -332,11 +334,9 @@ class AIStandardizer(AsyncStandardizer):
332
334
  logger.info(
333
335
  f"Merging {len(intermediate_sdif_files)} intermediate SDIF SQLite files into '{final_sdif_file_target}'."
334
336
  )
335
- # merge_sdif_files must accept a list of source SQLite file paths and a target SQLite file path.
336
- merged_target_path = await merge_sdif_files(
337
+ merged_target_path = merge_sdif_files(
337
338
  intermediate_sdif_files,
338
339
  final_sdif_file_target,
339
- overwrite=False, # We handled overwrite for final_sdif_file_target
340
340
  )
341
341
  final_sdif_path_str = str(merged_target_path)
342
342
 
@@ -414,13 +414,11 @@ class AIStandardizer(AsyncStandardizer):
414
414
  file_processing_temp_dir.mkdir(parents=True, exist_ok=True)
415
415
 
416
416
  try:
417
- # 1. Resolve input datasource to a list of processable file paths
418
417
  resolved_files = await self._resolve_input_files(
419
418
  datasource, file_processing_temp_dir
420
419
  )
421
420
  logger.info(f"Resolved {len(resolved_files)} file(s) for standardization.")
422
421
 
423
- # 2. Group files by the AI standardizer responsible for them
424
422
  grouped_by_std, unsupported = self._group_files_by_standardizer(
425
423
  resolved_files
426
424
  )
@@ -438,7 +436,6 @@ class AIStandardizer(AsyncStandardizer):
438
436
  f"File groups for standardization: { {cls.__name__: [f.name for f in paths] for cls, paths in grouped_by_std.items()} }"
439
437
  )
440
438
 
441
- # 3. Process each group of files, generating intermediate SDIF SQLite files
442
439
  (
443
440
  intermediate_sdif_files,
444
441
  aggregated_file_configs,
@@ -454,7 +451,6 @@ class AIStandardizer(AsyncStandardizer):
454
451
  f"Successfully generated {len(intermediate_sdif_files)} intermediate SDIF SQLite file(s)."
455
452
  )
456
453
 
457
- # 4. Consolidate intermediate SDIF files into the final target file
458
454
  final_result = await self._consolidate_results(
459
455
  intermediate_sdif_files,
460
456
  aggregated_file_configs,
satif_ai/standardizers/ai_csv.py CHANGED
@@ -12,6 +12,7 @@ from agents import Agent, Runner, function_tool
12
12
  from agents.mcp.server import MCPServerStdio
13
13
  from charset_normalizer import detect
14
14
  from mcp import ClientSession
15
+ from satif_core import AsyncStandardizer
15
16
  from satif_core.types import Datasource, SDIFPath, StandardizationResult
16
17
  from satif_sdk.standardizers.csv import (
17
18
  CSVStandardizer,
@@ -274,7 +275,9 @@ async def read_raw_lines(
274
275
 
275
276
 
276
277
  # --- AICSVStandardizer Class ---
277
- class AICSVStandardizer(CSVStandardizer): # Inherits from the enhanced CSVStandardizer
278
+ class AICSVStandardizer(
279
+ CSVStandardizer, AsyncStandardizer
280
+ ): # Inherits from the enhanced CSVStandardizer
278
281
  def __init__(
279
282
  self,
280
283
  mcp_server: Optional[MCPServerStdio] = None,
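With the added `AsyncStandardizer` base, the CSV standardizer can now be passed anywhere the generic async protocol is expected. A trivial sanity check, as a sketch:

from satif_core import AsyncStandardizer
from satif_ai.standardizers.ai_csv import AICSVStandardizer

# The class now satisfies both the CSV-specific and the generic async interfaces.
assert issubclass(AICSVStandardizer, AsyncStandardizer)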
satif_ai/standardizers/ai_xlsx.py ADDED
@@ -0,0 +1,372 @@
1
+ import logging
2
+ import shutil
3
+ import tempfile
4
+ import uuid
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ try:
9
+ from xlsx_to_sdif.graph import graph as xlsx_graph
10
+ from xlsx_to_sdif.state import State as XLSXState
11
+ except ImportError:
12
+ xlsx_graph = None # type: ignore
13
+ XLSXState = None # type: ignore
14
+ logging.getLogger(__name__).warning(
15
+ "Failed to import xlsx_to_sdif. AIXLSXStandardizer will not be functional."
16
+ )
17
+
18
+
19
+ from satif_core.standardizers.base import AsyncStandardizer
20
+ from satif_core.types import Datasource, SDIFPath, StandardizationResult
21
+
22
+ from satif_ai.utils.merge_sdif import merge_sdif_files
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class AIXLSXStandardizer(AsyncStandardizer):
28
+ """
29
+ An asynchronous standardizer for XLSX files that leverages the `xlsx-to-sdif` library.
30
+
31
+ This standardizer processes one or more XLSX files, converts each to an
32
+ intermediate SDIF (Standardized Data Interchange Format) file using the
33
+ `xlsx-to-sdif` processing graph, and then consolidates these intermediate
34
+ files into a single final SDIF file.
35
+ """
36
+
37
+ def __init__(self, *args: Any, **kwargs: Any):
38
+ """
39
+ Initializes the AIXLSXStandardizer.
40
+
41
+ Args:
42
+ ...
43
+ """
44
+
45
+ async def _invoke_xlsx_graph(
46
+ self, input_file_path: Path, graph_config: Dict[str, Any]
47
+ ) -> Path:
48
+ """
49
+ Invokes the `xlsx-to-sdif` graph for a single XLSX file.
50
+
51
+ Args:
52
+ input_file_path: Path to the input XLSX file.
53
+ graph_config: Configuration for the `xlsx-to-sdif` graph invocation,
54
+ including a unique `thread_id`.
55
+
56
+ Returns:
57
+ Path to the SDIF file produced by the graph.
58
+
59
+ Raises:
60
+ RuntimeError: If the `xlsx-to-sdif` graph is not available, fails to
61
+ return a final state, or does not produce an output path.
62
+ FileNotFoundError: If the graph reports an output file that doesn't exist.
63
+ """
64
+ if not xlsx_graph or not XLSXState:
65
+ raise RuntimeError(
66
+ "xlsx_to_sdif is not available. "
67
+ "Please ensure 'xlsx-to-sdif' library is installed correctly."
68
+ )
69
+
70
+ initial_state: XLSXState = {"spreadsheet_path": str(input_file_path)} # type: ignore
71
+
72
+ thread_id = graph_config.get("configurable", {}).get(
73
+ "thread_id", "unknown_thread"
74
+ )
75
+ logger.info(
76
+ f"Invoking xlsx_to_sdif graph for: {input_file_path.name} with thread_id: {thread_id}"
77
+ )
78
+
79
+ # Stream events for logging or potential progress updates
80
+ async for event in xlsx_graph.astream_events(
81
+ initial_state, graph_config, version="v1"
82
+ ):
83
+ event_type = event["event"]
84
+ event_name = event.get("name", "")
85
+ if event_type in ["on_tool_start", "on_chain_start"]:
86
+ logger.debug(
87
+ f"Graph event for {input_file_path.name} (Thread: {thread_id}): {event_type} - {event_name}"
88
+ )
89
+ elif event_type in ["on_tool_error", "on_chain_error", "on_llm_error"]:
90
+ logger.warning(
91
+ f"Graph error event for {input_file_path.name} (Thread: {thread_id}): {event_type} - {event_name}. Data: {event.get('data')}"
92
+ )
93
+
94
+ final_snapshot = await xlsx_graph.aget_state(graph_config)
95
+ if not final_snapshot or not final_snapshot.values:
96
+ raise RuntimeError(
97
+ f"xlsx_to_sdif graph did not return a final state for {input_file_path.name} (Thread: {thread_id})."
98
+ )
99
+
100
+ output_sdif_path_str = final_snapshot.values.get("output_sdif_path")
101
+ if not output_sdif_path_str:
102
+ raise RuntimeError(
103
+ f"xlsx_to_sdif graph for {input_file_path.name} (Thread: {thread_id}) "
104
+ f"did not produce an 'output_sdif_path' in its final state. State: {final_snapshot.values}"
105
+ )
106
+
107
+ output_sdif_path = Path(output_sdif_path_str)
108
+ if not output_sdif_path.is_file():
109
+ raise FileNotFoundError(
110
+ f"xlsx_to_sdif graph for {input_file_path.name} (Thread: {thread_id}) "
111
+ f"reported output file '{output_sdif_path}', but it does not exist or is not a file."
112
+ )
113
+
114
+ logger.info(
115
+ f"xlsx_to_sdif graph successfully processed {input_file_path.name} (Thread: {thread_id}). Output at {output_sdif_path}"
116
+ )
117
+ return output_sdif_path
118
+
119
+ async def _resolve_and_filter_input_files(
120
+ self, datasource: Datasource
121
+ ) -> List[Path]:
122
+ """Resolves and validates datasource, returning a list of XLSX file paths."""
123
+ input_files: List[Path]
124
+ if isinstance(datasource, (str, Path)):
125
+ input_files = [Path(datasource)]
126
+ elif isinstance(datasource, list) and all(
127
+ isinstance(p, (str, Path)) for p in datasource
128
+ ):
129
+ input_files = [Path(p) for p in datasource]
130
+ else:
131
+ raise ValueError(
132
+ "Datasource must be a file path (str or Path) or a list of such paths."
133
+ )
134
+
135
+ if not input_files:
136
+ raise ValueError("No input XLSX files provided in the datasource.")
137
+
138
+ xlsx_input_files = []
139
+ for f_path in input_files:
140
+ if not f_path.is_file():
141
+ raise FileNotFoundError(f"Input file not found: {f_path}")
142
+ if f_path.suffix.lower() not in (
143
+ ".xlsx",
144
+ ".xlsm",
145
+ ".xlsb",
146
+ ".xls",
147
+ ): # Common Excel extensions
148
+ logger.warning(
149
+ f"File {f_path.name} is not a typical XLSX file extension, but will be attempted."
150
+ )
151
+ xlsx_input_files.append(f_path)
152
+
153
+ if not xlsx_input_files:
154
+ raise ValueError(
155
+ "No processable XLSX files found in the datasource after filtering."
156
+ )
157
+ return xlsx_input_files
158
+
159
+ def _prepare_final_output_path(
160
+ self, output_path: SDIFPath, overwrite: bool
161
+ ) -> Path:
162
+ """Prepares the final output path, handling overwrites and directory creation."""
163
+ final_output_path = Path(output_path)
164
+ if final_output_path.exists() and not overwrite:
165
+ raise FileExistsError(
166
+ f"Output file {final_output_path} already exists and overwrite is False."
167
+ )
168
+ elif final_output_path.exists() and overwrite:
169
+ logger.info(
170
+ f"Overwrite active: Deleting existing output file {final_output_path}"
171
+ )
172
+ try:
173
+ if (
174
+ final_output_path.is_dir()
175
+ ): # Should not happen if SDIFPath is file path
176
+ raise IsADirectoryError(
177
+ f"Output path {final_output_path} is a directory."
178
+ )
179
+ final_output_path.unlink()
180
+ except OSError as e:
181
+ raise RuntimeError(
182
+ f"Failed to delete existing output file {final_output_path}: {e}"
183
+ ) from e
184
+
185
+ final_output_path.parent.mkdir(parents=True, exist_ok=True)
186
+ return final_output_path
187
+
188
+ def _setup_temp_directories(self) -> Tuple[Path, Path, Path]:
189
+ """Creates and returns paths for temporary working directories."""
190
+ run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aixlsx_run_"))
191
+ intermediate_sdif_dir = run_temp_dir / "intermediate_sdifs"
192
+ intermediate_sdif_dir.mkdir()
193
+ temp_input_copies_dir = (
194
+ run_temp_dir / "temp_input_copies"
195
+ ) # Directory for temporary input copies
196
+ temp_input_copies_dir.mkdir()
197
+ return run_temp_dir, intermediate_sdif_dir, temp_input_copies_dir
198
+
199
+ async def _process_single_file_to_intermediate_sdif(
200
+ self,
201
+ input_xlsx_file: Path,
202
+ final_output_path_stem: str,
203
+ temp_input_copies_dir: Path,
204
+ intermediate_sdif_dir: Path,
205
+ ) -> Path:
206
+ """Processes a single XLSX file to an intermediate SDIF in a controlled location."""
207
+ logger.info(f"Processing file: {input_xlsx_file.name}")
208
+ graph_thread_id = f"satif_aixlsx_{final_output_path_stem}_{input_xlsx_file.stem}_{uuid.uuid4().hex[:8]}"
209
+
210
+ temp_input_file_for_graph = (
211
+ temp_input_copies_dir
212
+ / f"{input_xlsx_file.stem}_{graph_thread_id}{input_xlsx_file.suffix}"
213
+ )
214
+ shutil.copy2(input_xlsx_file, temp_input_file_for_graph)
215
+ logger.debug(
216
+ f"Created temporary copy of {input_xlsx_file.name} at {temp_input_file_for_graph}"
217
+ )
218
+
219
+ graph_config_for_file = {
220
+ "configurable": {"thread_id": graph_thread_id},
221
+ "recursion_limit": 50, # Default, make configurable if needed
222
+ }
223
+
224
+ try:
225
+ graph_produced_sdif_path = await self._invoke_xlsx_graph(
226
+ temp_input_file_for_graph, graph_config_for_file
227
+ )
228
+
229
+ target_intermediate_sdif_path = (
230
+ intermediate_sdif_dir
231
+ / f"intermediate_{input_xlsx_file.stem}_{graph_thread_id}.sdif"
232
+ )
233
+ shutil.move(
234
+ str(graph_produced_sdif_path),
235
+ str(target_intermediate_sdif_path),
236
+ )
237
+ logger.info(
238
+ f"Moved graph output for {input_xlsx_file.name} to {target_intermediate_sdif_path}"
239
+ )
240
+ return target_intermediate_sdif_path
241
+ except Exception as e:
242
+ error_msg = f"Failed to process file {input_xlsx_file.name} (using copy {temp_input_file_for_graph.name}) with xlsx-to-sdif graph: {e}"
243
+ logger.error(error_msg, exc_info=True)
244
+ # Re-raise to be caught by the main standardize method's loop or error handling
245
+ raise RuntimeError(
246
+ f"Error processing {input_xlsx_file.name}. Halting batch."
247
+ ) from e
248
+
249
+ async def _consolidate_intermediate_sdifs(
250
+ self, intermediate_sdif_paths: List[Path], final_output_path: Path
251
+ ) -> None:
252
+ """Consolidates intermediate SDIF files into the final output path."""
253
+ if not intermediate_sdif_paths:
254
+ # This case should ideally be handled before calling, but as a safeguard:
255
+ raise RuntimeError(
256
+ "No intermediate SDIF files were provided for consolidation."
257
+ )
258
+
259
+ if len(intermediate_sdif_paths) == 1:
260
+ logger.info(
261
+ f"Only one intermediate SDIF generated. Moving {intermediate_sdif_paths[0]} to {final_output_path}"
262
+ )
263
+ shutil.move(str(intermediate_sdif_paths[0]), str(final_output_path))
264
+ else:
265
+ logger.info(
266
+ f"Merging {len(intermediate_sdif_paths)} intermediate SDIF files into {final_output_path}"
267
+ )
268
+ merge_sdif_files(
269
+ source_db_paths=intermediate_sdif_paths,
270
+ target_db_path=final_output_path,
271
+ )
272
+
273
+ async def standardize(
274
+ self,
275
+ datasource: Datasource,
276
+ output_path: SDIFPath,
277
+ *,
278
+ overwrite: bool = False,
279
+ config: Optional[Dict[str, Any]] = None,
280
+ **kwargs: Any,
281
+ ) -> StandardizationResult:
282
+ """
283
+ Standardizes one or more XLSX files into a single SDIF file.
284
+
285
+ Args:
286
+ datasource: A single file path (str or Path) or a list of file paths
287
+ to XLSX files.
288
+ output_path: The path where the final consolidated SDIF file will be saved.
289
+ overwrite: If True, overwrite the output_path if it already exists.
290
+ Defaults to False.
291
+ config: General configuration options (currently not used by this standardizer
292
+ for graph interaction but preserved for API consistency).
293
+ **kwargs: Additional keyword arguments (currently ignored).
294
+
295
+ Returns:
296
+ A StandardizationResult object containing the path to the final SDIF file.
297
+
298
+ Raises:
299
+ ValueError: If the datasource is invalid or no XLSX files are found.
300
+ RuntimeError: If critical errors occur during processing, such as the
301
+ `xlsx-to-sdif` graph not being available or failing.
302
+ FileNotFoundError: If input files are not found or graph outputs are invalid.
303
+ FileExistsError: If output_path exists and overwrite is False.
304
+ """
305
+ if not xlsx_graph or not XLSXState:
306
+ raise RuntimeError(
307
+ "AIXLSXStandardizer cannot operate because `xlsx_to_sdif.graph` or `xlsx_to_sdif.state` is not available. "
308
+ "Please ensure the 'xlsx-to-sdif' library is installed and accessible."
309
+ )
310
+
311
+ xlsx_input_files = await self._resolve_and_filter_input_files(datasource)
312
+ final_output_path = self._prepare_final_output_path(output_path, overwrite)
313
+ run_temp_dir, intermediate_sdif_dir, temp_input_copies_dir = (
314
+ self._setup_temp_directories()
315
+ )
316
+
317
+ intermediate_sdif_paths: List[Path] = []
318
+ processing_errors: List[str] = []
319
+
320
+ try:
321
+ # Process each file sequentially. Consider asyncio.gather for parallel if graph supports it well for many files.
322
+ for i, input_xlsx_file in enumerate(xlsx_input_files):
323
+ try:
324
+ logger.info(
325
+ f"Starting processing for file {i + 1}/{len(xlsx_input_files)}: {input_xlsx_file.name}"
326
+ )
327
+ intermediate_sdif_path = (
328
+ await self._process_single_file_to_intermediate_sdif(
329
+ input_xlsx_file,
330
+ final_output_path.stem, # Pass stem for unique naming
331
+ temp_input_copies_dir,
332
+ intermediate_sdif_dir,
333
+ )
334
+ )
335
+ intermediate_sdif_paths.append(intermediate_sdif_path)
336
+ except Exception:
337
+ logger.error(
338
+ f"Halting standardization due to error processing {input_xlsx_file.name}."
339
+ )
340
+ raise # Re-raise the exception to be caught by the outer try/finally
341
+
342
+ if not intermediate_sdif_paths:
343
+ # This condition might be redundant if _process_single_file_to_intermediate_sdif always raises on failure
344
+ # and we re-raise immediately.
345
+ if processing_errors: # This list would be empty if we fail fast
346
+ raise RuntimeError(
347
+ f"No XLSX files were successfully processed. Errors: {'; '.join(processing_errors)}"
348
+ )
349
+ else:
350
+ raise RuntimeError(
351
+ "No intermediate SDIF files were generated, though no specific errors were caught."
352
+ )
353
+
354
+ await self._consolidate_intermediate_sdifs(
355
+ intermediate_sdif_paths, final_output_path
356
+ )
357
+
358
+ logger.info(f"Successfully created final SDIF: {final_output_path}")
359
+ return StandardizationResult(
360
+ output_path=final_output_path, file_configs=None
361
+ ) # file_configs not available from this process
362
+
363
+ finally:
364
+ if run_temp_dir.exists():
365
+ try:
366
+ shutil.rmtree(run_temp_dir)
367
+ logger.debug(f"Cleaned up temporary directory: {run_temp_dir}")
368
+ except Exception as e_clean:
369
+ logger.error(
370
+ f"Error cleaning up temporary directory {run_temp_dir}: {e_clean}",
371
+ exc_info=True,
372
+ )
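A minimal end-to-end sketch of the new standardizer, assuming the `xlsx-to-sdif` dependency is installed; the file names are placeholders:

import asyncio
from satif_ai.standardizers.ai_xlsx import AIXLSXStandardizer

async def main() -> None:
    standardizer = AIXLSXStandardizer()
    result = await standardizer.standardize(
        datasource=["report_q1.xlsx", "report_q2.xlsx"],  # placeholder inputs
        output_path="consolidated.sdif",
        overwrite=True,
    )
    print(f"Final SDIF written to {result.output_path}")

if __name__ == "__main__":
    asyncio.run(main())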
satif_ai/transform.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from pathlib import Path
2
2
  from typing import Any, Dict, List, Optional
3
3
 
4
- from fastmcp import FastMCP
4
+ from fastmcp import Client, FastMCP
5
5
  from fastmcp.client.transports import FastMCPTransport
6
6
  from satif_core.code_executors.base import CodeExecutor
7
7
  from satif_core.transformation_builders.base import AsyncTransformationBuilder
@@ -28,7 +28,7 @@ async def atransform(
28
28
  transformation_builder: Optional[AsyncTransformationBuilder] = None,
29
29
  code_executor: Optional[CodeExecutor] = None,
30
30
  mcp_server: Optional[FastMCP] = None,
31
- mcp_transport: Optional[FastMCPTransport] = None,
31
+ mcp_client: Optional[Client] = None,
32
32
  llm_model: str = "o4-mini",
33
33
  schema_only: bool = False,
34
34
  representer_kwargs: Optional[Dict[str, Any]] = None,
@@ -38,7 +38,7 @@ async def atransform(
38
38
  an AI-generated or provided transformation code.
39
39
 
40
40
  This function orchestrates the process of:
41
- 1. Optionally generating transformation code using an AI model via a `CodeBuilder`
41
+ 1. Optionally generating transformation code using an AI model via a `TransformationBuilder`
42
42
  if `transformation_code` is not provided.
43
43
  explicitly passed.
44
44
  2. Executing the transformation code using a `CodeTransformer` and a `CodeExecutor`.
@@ -53,18 +53,21 @@ async def atransform(
53
53
  output filenames if the transformation result keys match.
54
54
  instructions: Optional. Natural language instructions for the AI to generate
55
55
  the transformation code. Used if `transformation_code` is None.
56
+ output_path: Path to the directory where transformation outputs will be saved.
56
57
  transformation_code: Optional. Pre-existing Python code for the transformation.
57
58
  If None, code will be generated by the `transformation_builder`.
58
59
  transformation_builder: Optional. An `AsyncTransformationBuilder` instance responsible for generating
59
60
  the transformation code if `transformation_code` is not provided.
60
- If None, a `TransformationAsyncCodeBuilder` is instantiated.
61
+ If None, a `SyncpulseTransformationBuilder` is instantiated.
61
62
  code_executor: Optional. A `CodeExecutor` instance for running the transformation
62
63
  code. If None, a `LocalCodeExecutor` is used.
63
64
  mcp_server: Optional. A `FastMCP` server instance for the AI code builder.
64
- Defaults to the global `mcp` instance if `transformation_builder` is None.
65
- mcp_transport: Optional. A `FastMCPTransport` instance for communication with
66
- the `mcp_server`. Defaults to a new transport using `mcp_server`
67
- if `transformation_builder` is None.
65
+ Defaults to the global `mcp` instance if `transformation_builder` is None and
66
+ a new `SyncpulseTransformationBuilder` is being created.
67
+ mcp_client: Optional. A user-provided `Client` instance. If provided when
68
+ `transformation_builder` is None, it will be used by the internally
69
+ created `SyncpulseTransformationBuilder`. The caller is responsible for
70
+ managing the lifecycle of a provided client.
68
71
  llm_model: The language model to use for code generation (e.g., "o4-mini").
69
72
  Used if `transformation_builder` is None.
70
73
  schema_only: If True, the transformation aims to match only the schema (headers)
@@ -78,44 +81,75 @@ async def atransform(
78
81
  A `TransformationResult` object containing the path to the output
79
82
  and the transformation code used.
80
83
  """
81
- if transformation_builder is None:
82
- if mcp_server is None:
83
- mcp_server = mcp
84
-
85
- if mcp_transport is None:
86
- mcp_transport = FastMCPTransport(mcp=mcp_server)
87
-
88
- openai_compatible_mcp = OpenAICompatibleMCP(mcp=mcp_server)
89
- await openai_compatible_mcp.connect()
90
-
91
- transformation_builder = SyncpulseTransformationBuilder(
92
- mcp_server=openai_compatible_mcp,
93
- mcp_session=mcp_transport,
94
- llm_model=llm_model,
95
- )
96
-
97
- if transformation_code is None:
98
- function_code = await transformation_builder.build(
99
- sdif=sdif,
100
- output_target_files=output_target_files,
101
- instructions=instructions,
102
- schema_only=schema_only,
103
- representer_kwargs=representer_kwargs,
104
- )
105
- else:
106
- function_code = transformation_code
107
-
108
- if code_executor is None:
109
- code_executor = LocalCodeExecutor()
84
+ current_transformation_code: Optional[str] = transformation_code
85
+ active_builder: Optional[AsyncTransformationBuilder] = transformation_builder
86
+
87
+ _openai_mcp_instance: Optional[OpenAICompatibleMCP] = None
88
+ openai_mcp_managed_locally = False
89
+
90
+ # If code isn't provided, we need a builder. If a builder isn't provided, we create one.
91
+ if current_transformation_code is None:
92
+ if active_builder is None:
93
+ # Create SyncpulseTransformationBuilder
94
+ _effective_mcp_server = mcp_server if mcp_server is not None else mcp
95
+
96
+ _openai_mcp_instance = OpenAICompatibleMCP(mcp=_effective_mcp_server)
97
+ await _openai_mcp_instance.connect()
98
+ openai_mcp_managed_locally = True
99
+
100
+ if mcp_client is None: # No user-provided client, create and manage one
101
+ mcp_transport = FastMCPTransport(mcp=_effective_mcp_server)
102
+ async with Client(mcp_transport) as new_client:
103
+ active_builder = SyncpulseTransformationBuilder(
104
+ mcp_server=_openai_mcp_instance,
105
+ mcp_session=new_client.session,
106
+ llm_model=llm_model,
107
+ )
108
+ current_transformation_code = await active_builder.build(
109
+ sdif=sdif,
110
+ output_target_files=output_target_files,
111
+ instructions=instructions,
112
+ schema_only=schema_only,
113
+ representer_kwargs=representer_kwargs,
114
+ )
115
+ else:
116
+ active_builder = SyncpulseTransformationBuilder(
117
+ mcp_server=_openai_mcp_instance,
118
+ mcp_session=mcp_client, # Use the provided client
119
+ llm_model=llm_model,
120
+ )
121
+ current_transformation_code = await active_builder.build(
122
+ sdif=sdif,
123
+ output_target_files=output_target_files,
124
+ instructions=instructions,
125
+ schema_only=schema_only,
126
+ representer_kwargs=representer_kwargs,
127
+ )
128
+
129
+ # Disconnect OpenAICompatibleMCP if it was created and connected locally
130
+ if (
131
+ openai_mcp_managed_locally
132
+ and _openai_mcp_instance
133
+ and _openai_mcp_instance._is_connected
134
+ ):
135
+ await _openai_mcp_instance.cleanup()
136
+
137
+ if current_transformation_code is None:
138
+ raise ValueError("Transformation code could not be obtained or generated.")
139
+
140
+ # Code Executor and Transformation
141
+ _code_executor = code_executor if code_executor is not None else LocalCodeExecutor()
110
142
 
111
143
  transformer = CodeTransformer(
112
- function=function_code,
113
- code_executor=code_executor,
144
+ function=current_transformation_code,
145
+ code_executor=_code_executor,
114
146
  )
115
147
 
116
- output_path = transformer.export(
148
+ exported_artifact_path = transformer.export(
117
149
  sdif=sdif,
118
150
  output_path=output_path,
119
151
  )
120
152
 
121
- return TransformationResult(output_path=output_path, function_code=function_code)
153
+ return TransformationResult(
154
+ output_path=exported_artifact_path, function_code=current_transformation_code
155
+ )
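A sketch of the new caller-managed client path in `atransform`. The SDIF path, output mapping, and instructions are illustrative, and the exact shape of `output_target_files` is an assumption here:

from fastmcp import Client, FastMCP
from fastmcp.client.transports import FastMCPTransport
from satif_ai.transform import atransform

async def run_transform(mcp_server: FastMCP) -> None:
    transport = FastMCPTransport(mcp=mcp_server)
    async with Client(transport) as client:  # the caller owns this client's lifecycle
        result = await atransform(
            sdif="input.sdif",                                  # placeholder SDIF file
            output_target_files={"orders.csv": "orders.csv"},   # assumed mapping shape
            instructions="Export the orders table as a CSV file.",
            output_path="output",            # assumed to be a directory, per the docstring
            mcp_server=mcp_server,
            mcp_client=client,
            llm_model="o4-mini",
        )
    print(result.output_path)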
satif_ai/utils/merge_sdif.py CHANGED
@@ -1,22 +1,535 @@
1
+ import json
2
+ import logging
3
+ import shutil
4
+ import sqlite3
5
+ from datetime import datetime
1
6
  from pathlib import Path
2
- from typing import List
7
+ from typing import Any, Dict, List
3
8
 
9
+ from satif_core.types import SDIFPath
10
+ from sdif_db.database import (
11
+ SDIFDatabase, # Assuming this is the conventional import path
12
+ )
4
13
 
5
- async def merge_sdif_files(sdif_paths: List[Path], output_dir: Path) -> Path:
6
- """Placeholder function to merge multiple SDIF files into one.
14
+ log = logging.getLogger(__name__)
15
+
16
+
17
+ class _SDIFMerger:
18
+ def __init__(self, target_sdif_path: Path):
19
+ self.target_db = SDIFDatabase(target_sdif_path, overwrite=True)
20
+ # Mappings per source_db_idx:
21
+ self.source_id_map: Dict[int, Dict[int, int]] = {}
22
+ self.table_name_map: Dict[int, Dict[str, str]] = {}
23
+ self.object_name_map: Dict[int, Dict[str, str]] = {}
24
+ self.media_name_map: Dict[int, Dict[str, str]] = {}
25
+
26
+ def _get_new_source_id(self, source_db_idx: int, old_source_id: int) -> int:
27
+ return self.source_id_map[source_db_idx][old_source_id]
28
+
29
+ def _get_new_table_name(self, source_db_idx: int, old_table_name: str) -> str:
30
+ return self.table_name_map[source_db_idx].get(old_table_name, old_table_name)
31
+
32
+ def _get_new_object_name(self, source_db_idx: int, old_object_name: str) -> str:
33
+ return self.object_name_map[source_db_idx].get(old_object_name, old_object_name)
34
+
35
+ def _get_new_media_name(self, source_db_idx: int, old_media_name: str) -> str:
36
+ return self.media_name_map[source_db_idx].get(old_media_name, old_media_name)
37
+
38
+ def _generate_unique_name_in_target(self, base_name: str, list_func) -> str:
39
+ """Generates a unique name for the target DB by appending a suffix if base_name exists."""
40
+ if base_name not in list_func():
41
+ return base_name
42
+ i = 1
43
+ while True:
44
+ new_name = f"{base_name}_{i}"
45
+ if new_name not in list_func():
46
+ return new_name
47
+ i += 1
48
+ if i > 1000: # Safety break
49
+ raise RuntimeError(
50
+ f"Could not generate a unique name for base '{base_name}' after 1000 attempts."
51
+ )
52
+
53
+ def _merge_properties(self, source_db: SDIFDatabase, source_db_idx: int):
54
+ source_props = source_db.get_properties()
55
+ if not source_props:
56
+ log.warning(
57
+ f"Source database {source_db.path} has no properties. Skipping properties merge for this source."
58
+ )
59
+ return
60
+
61
+ if source_props.get("sdif_version") != "1.0":
62
+ # Or allow a configurable expected version
63
+ raise ValueError(
64
+ f"Source database {source_db.path} has unsupported SDIF version: {source_props.get('sdif_version')}. Expected '1.0'."
65
+ )
66
+
67
+ if source_db_idx == 0: # First database sets the version for the target
68
+ try:
69
+ self.target_db.conn.execute(
70
+ "UPDATE sdif_properties SET sdif_version = ?",
71
+ (
72
+ source_props.get("sdif_version", "1.0"),
73
+ ), # Default to 1.0 if somehow missing
74
+ )
75
+ self.target_db.conn.commit()
76
+ except sqlite3.Error as e:
77
+ log.error(
78
+ f"Failed to set sdif_version in target DB from {source_db.path}: {e}"
79
+ )
80
+ raise
81
+ # creation_timestamp will be set at the end of the entire merge process.
82
+
83
+ def _merge_sources(self, source_db: SDIFDatabase, source_db_idx: int):
84
+ self.source_id_map[source_db_idx] = {}
85
+ source_sources = source_db.list_sources()
86
+ for old_source_entry in source_sources:
87
+ old_source_id = old_source_entry["source_id"]
88
+ new_source_id = self.target_db.add_source(
89
+ file_name=old_source_entry["original_file_name"],
90
+ file_type=old_source_entry["original_file_type"],
91
+ description=old_source_entry.get("source_description"),
92
+ )
93
+ # original processing_timestamp is not directly carried over, new one is set by add_source
94
+ self.source_id_map[source_db_idx][old_source_id] = new_source_id
95
+
96
+ def _merge_tables(self, source_db: SDIFDatabase, source_db_idx: int):
97
+ self.table_name_map[source_db_idx] = {}
98
+ source_schema = source_db.get_schema()
99
+ source_tables_schema = source_schema.get("tables", {})
100
+
101
+ # Pass 1: Determine new table names for all tables from this source DB
102
+ # This is to ensure FKs can be remapped correctly to tables from the *same* source db.
103
+ temp_name_map_for_this_source = {}
104
+ for old_table_name in source_tables_schema.keys():
105
+ # Use create_table with if_exists='add' to get a unique name, but only for name generation.
106
+ # This is a bit of a workaround. A dedicated _generate_unique_target_table_name might be cleaner.
107
+ # The SDIFDatabase.create_table(if_exists='add') will actually create metadata entries.
108
+ # This might be acceptable if we're careful.
109
+ # Let's use the simpler approach of generating unique name first.
110
+ effective_new_name = self._generate_unique_name_in_target(
111
+ old_table_name, self.target_db.list_tables
112
+ )
113
+ temp_name_map_for_this_source[old_table_name] = effective_new_name
114
+ self.table_name_map[source_db_idx] = temp_name_map_for_this_source
115
+
116
+ # Pass 2: Create tables with remapped FKs and copy data
117
+ for old_table_name, table_detail_from_schema in source_tables_schema.items():
118
+ new_table_name = self.table_name_map[source_db_idx][old_table_name]
119
+
120
+ columns_for_create: Dict[str, Dict[str, Any]] = {}
121
+ original_columns_detail = table_detail_from_schema.get("columns", [])
122
+
123
+ for col_detail in original_columns_detail:
124
+ col_name = col_detail["name"]
125
+ col_props = {
126
+ "type": col_detail["sqlite_type"],
127
+ "not_null": col_detail["not_null"],
128
+ "primary_key": col_detail[
129
+ "primary_key"
130
+ ], # Assumes single col PK flag
131
+ "description": col_detail.get("description"),
132
+ "original_column_name": col_detail.get("original_column_name"),
133
+ # 'unique' constraint not in get_schema output, assumed not used or handled by primary_key
134
+ }
135
+
136
+ # Remap foreign keys defined for this column
137
+ table_fks_detail = table_detail_from_schema.get("foreign_keys", [])
138
+ for fk_info in table_fks_detail:
139
+ if fk_info["from_column"] == col_name:
140
+ original_fk_target_table = fk_info["target_table"]
141
+ # FKs are assumed to target tables within the same source SDIF file.
142
+ remapped_fk_target_table = self.table_name_map[
143
+ source_db_idx
144
+ ].get(original_fk_target_table)
145
+ if not remapped_fk_target_table:
146
+ log.warning(
147
+ f"Could not remap FK target table '{original_fk_target_table}' for column '{col_name}' in table '{old_table_name}'. FK might be dropped or invalid."
148
+ )
149
+ # Decide: skip FK, or raise error, or create FK pointing to original name (which might conflict or be wrong)
150
+ # For now, we'll proceed without this FK if target not found in map (shouldn't happen if all tables from source are processed)
151
+ continue
152
+
153
+ col_props["foreign_key"] = {
154
+ "table": remapped_fk_target_table,
155
+ "column": fk_info["target_column"],
156
+ "on_delete": fk_info[
157
+ "on_delete"
158
+ ].upper(), # Ensure standard casing
159
+ "on_update": fk_info[
160
+ "on_update"
161
+ ].upper(), # Ensure standard casing
162
+ }
163
+ break # Assuming one FK per 'from_column' for this col_props structure
164
+ columns_for_create[col_name] = col_props
165
+
166
+ source_table_metadata = table_detail_from_schema.get("metadata", {})
167
+ old_source_id_for_table = source_table_metadata.get("source_id")
168
+ if old_source_id_for_table is None:
169
+ raise ValueError(
170
+ f"Table '{old_table_name}' from {source_db.path} is missing source_id in its metadata."
171
+ )
172
+
173
+ new_source_id_for_table = self._get_new_source_id(
174
+ source_db_idx, old_source_id_for_table
175
+ )
176
+
177
+ # Create the table structure in the target database
178
+ # Using if_exists="fail" because new_table_name should already be unique.
179
+ # SDIFDatabase.create_table handles complex PKs via table_constraints reconstruction.
180
+ actual_created_name = self.target_db.create_table(
181
+ table_name=new_table_name,
182
+ columns=columns_for_create,
183
+ source_id=new_source_id_for_table,
184
+ description=source_table_metadata.get("description"),
185
+ original_identifier=source_table_metadata.get("original_identifier"),
186
+ if_exists="fail",
187
+ )
188
+ if actual_created_name != new_table_name:
189
+ # This case should ideally not happen if _generate_unique_target_table_name was correct
190
+ # and create_table used if_exists='fail'. If create_table internally changes name even with 'fail',
191
+ # this is an issue. For now, assume 'fail' means it uses the name or errors.
192
+ log.warning(
193
+ f"Table name discrepancy: expected {new_table_name}, created as {actual_created_name}. Using created name."
194
+ )
195
+ self.table_name_map[source_db_idx][old_table_name] = (
196
+ actual_created_name # Update map
197
+ )
198
+
199
+ # Copy data
200
+ try:
201
+ data_df = source_db.read_table(old_table_name)
202
+ if not data_df.empty:
203
+ # SDIFDatabase.insert_data expects List[Dict].
204
+ # SDIFDatabase.write_dataframe is higher level but might re-create table.
205
+ # Let's use insert_data.
206
+
207
+ # Handle data type conversions that pandas might do, to align with SQLite expectations
208
+ # For example, pandas bools to int 0/1, datetimes to ISO strings.
209
+ # The SDIFDatabase.write_dataframe has logic for this.
210
+ # We can replicate parts or simplify if read_table and insert_data are robust.
211
+ # For now, assume read_table gives compatible data for insert_data
212
+ # or insert_data can handle common pandas types.
213
+ # A quick check: SDIFDatabase.insert_data does not do type conversion.
214
+ # SDIFDatabase.write_dataframe does. So it's safer to go df -> records -> insert
215
+ # after manual conversion like in write_dataframe.
216
+
217
+ df_copy = data_df.copy()
218
+ for col_name_str in df_copy.columns:
219
+ col_name = str(col_name_str) # Ensure string
220
+ if pd.api.types.is_bool_dtype(df_copy[col_name].dtype):
221
+ df_copy[col_name] = df_copy[col_name].astype(int)
222
+ elif pd.api.types.is_datetime64_any_dtype(
223
+ df_copy[col_name].dtype
224
+ ):
225
+ df_copy[col_name] = df_copy[col_name].apply(
226
+ lambda x: x.isoformat() if pd.notnull(x) else None
227
+ )
228
+ elif pd.api.types.is_timedelta64_dtype(df_copy[col_name].dtype):
229
+ df_copy[col_name] = df_copy[col_name].astype(str)
230
+ # Handle potential np.nan to None for JSON compatibility if objects were stored as text
231
+ if df_copy[col_name].dtype == object:
232
+ df_copy[col_name] = df_copy[col_name].replace(
233
+ {np.nan: None}
234
+ )
235
+
236
+ data_records = df_copy.to_dict("records")
237
+ if data_records: # Ensure there are records to insert
238
+ self.target_db.insert_data(actual_created_name, data_records)
239
+ except Exception as e:
240
+ log.error(
241
+ f"Failed to copy data for table {old_table_name} to {actual_created_name}: {e}"
242
+ )
243
+ # Decide: continue with other tables or raise? For robustness, log and continue.
244
+ # Or add a strict mode flag. For now, log and continue.
245
+
246
+ def _merge_objects(self, source_db: SDIFDatabase, source_db_idx: int):
247
+ self.object_name_map[source_db_idx] = {}
248
+ for old_object_name in source_db.list_objects():
249
+ obj_data = source_db.get_object(
250
+ old_object_name, parse_json=False
251
+ ) # Get raw JSON strings
252
+ if not obj_data:
253
+ log.warning(
254
+ f"Could not retrieve object '{old_object_name}' from {source_db.path}. Skipping."
255
+ )
256
+ continue
257
+
258
+ new_object_name = self._generate_unique_name_in_target(
259
+ old_object_name, self.target_db.list_objects
260
+ )
261
+ self.object_name_map[source_db_idx][old_object_name] = new_object_name
262
+
263
+ new_source_id = self._get_new_source_id(
264
+ source_db_idx, obj_data["source_id"]
265
+ )
266
+
267
+ # Data is already string from parse_json=False. Schema hint also string.
268
+ # SDIFDatabase.add_object expects data to be Any (serializable) and schema_hint Dict.
269
+ # So we need to parse them back if they are strings.
270
+ parsed_json_data = json.loads(obj_data["json_data"])
271
+ parsed_schema_hint = (
272
+ json.loads(obj_data["schema_hint"])
273
+ if obj_data.get("schema_hint")
274
+ else None
275
+ )
276
+
277
+ self.target_db.add_object(
278
+ object_name=new_object_name,
279
+ json_data=parsed_json_data,
280
+ source_id=new_source_id,
281
+ description=obj_data.get("description"),
282
+ schema_hint=parsed_schema_hint,
283
+ )
284
+
285
+ def _merge_media(self, source_db: SDIFDatabase, source_db_idx: int):
286
+ self.media_name_map[source_db_idx] = {}
287
+ for old_media_name in source_db.list_media():
288
+ media_entry = source_db.get_media(
289
+ old_media_name, parse_json=False
290
+ ) # Get raw JSON for tech_metadata
291
+ if not media_entry:
292
+ log.warning(
293
+ f"Could not retrieve media '{old_media_name}' from {source_db.path}. Skipping."
294
+ )
295
+ continue
296
+
297
+ new_media_name = self._generate_unique_name_in_target(
298
+ old_media_name, self.target_db.list_media
299
+ )
300
+ self.media_name_map[source_db_idx][old_media_name] = new_media_name
301
+
302
+ new_source_id = self._get_new_source_id(
303
+ source_db_idx, media_entry["source_id"]
304
+ )
305
+
306
+ parsed_tech_metadata = (
307
+ json.loads(media_entry["technical_metadata"])
308
+ if media_entry.get("technical_metadata")
309
+ else None
310
+ )
311
+
312
+ self.target_db.add_media(
313
+ media_name=new_media_name,
314
+ media_data=media_entry["media_data"], # Should be bytes
315
+ media_type=media_entry["media_type"],
316
+ source_id=new_source_id,
317
+ description=media_entry.get("description"),
318
+ original_format=media_entry.get("original_format"),
319
+ technical_metadata=parsed_tech_metadata,
320
+ )
321
+
322
+ def _remap_element_spec(
323
+ self, element_type: str, element_spec_json: str, source_db_idx: int
324
+ ) -> str:
325
+ if not element_spec_json:
326
+ return element_spec_json
327
+
328
+ try:
329
+ spec_dict = json.loads(element_spec_json)
330
+ except json.JSONDecodeError:
331
+ log.warning(
332
+ f"Invalid JSON in element_spec: {element_spec_json}. Returning as is."
333
+ )
334
+ return element_spec_json
335
+
336
+ new_spec_dict = spec_dict.copy()
337
+
338
+ # Remap source_id if present (relevant for 'source' element_type in annotations, not directly in semantic_links spec)
339
+ # Semantic links link to other entities which carry their own source_id.
340
+ # But if spec itself contains a source_id key (e.g. for target_element_type='source' in annotations)
341
+ if "source_id" in new_spec_dict and isinstance(new_spec_dict["source_id"], int):
342
+ new_spec_dict["source_id"] = self._get_new_source_id(
343
+ source_db_idx, new_spec_dict["source_id"]
344
+ )
345
+
346
+ # Remap names based on element_type
347
+ if element_type in ["table", "column"]:
348
+ if "table_name" in new_spec_dict:
349
+ new_spec_dict["table_name"] = self._get_new_table_name(
350
+ source_db_idx, new_spec_dict["table_name"]
351
+ )
352
+ elif element_type == "object": # Direct object reference
353
+ if "object_name" in new_spec_dict:
354
+ new_spec_dict["object_name"] = self._get_new_object_name(
355
+ source_db_idx, new_spec_dict["object_name"]
356
+ )
357
+ elif element_type == "json_path": # JSONPath typically refers to an object
358
+ if (
359
+ "object_name" in new_spec_dict
360
+ ): # If the spec identifies the object container
361
+ new_spec_dict["object_name"] = self._get_new_object_name(
362
+ source_db_idx, new_spec_dict["object_name"]
363
+ )
364
+ elif element_type == "media":
365
+ if "media_name" in new_spec_dict:
366
+ new_spec_dict["media_name"] = self._get_new_media_name(
367
+ source_db_idx, new_spec_dict["media_name"]
368
+ )
369
+ # 'file' type needs no remapping of spec content.
370
+ # 'source' type: primary key is 'source_id', remapped above.
371
+
372
+ return json.dumps(new_spec_dict)
373
+
374
+ def _merge_semantic_links(self, source_db: SDIFDatabase, source_db_idx: int):
375
+ # SDIFDatabase.list_semantic_links default parses JSON spec. We need this.
376
+ source_links = source_db.list_semantic_links(parse_json=True)
377
+
378
+ for link in source_links:
379
+ # The specs are already dicts because parse_json=True was used.
380
+ try:
381
+ from_spec_dict = link["from_element_spec"]
382
+ to_spec_dict = link["to_element_spec"]
383
+
384
+ # Remap the dicts directly
385
+ new_from_spec_dict = self._remap_element_spec_dict(
386
+ link["from_element_type"], from_spec_dict, source_db_idx
387
+ )
388
+ new_to_spec_dict = self._remap_element_spec_dict(
389
+ link["to_element_type"], to_spec_dict, source_db_idx
390
+ )
391
+
392
+ self.target_db.add_semantic_link(
393
+ link_type=link["link_type"],
394
+ from_element_type=link["from_element_type"],
395
+ from_element_spec=new_from_spec_dict, # add_semantic_link takes dict
396
+ to_element_type=link["to_element_type"],
397
+ to_element_spec=new_to_spec_dict, # add_semantic_link takes dict
398
+ description=link.get("description"),
399
+ )
400
+ except Exception as e:
401
+ link_id = link.get("link_id", "Unknown")
402
+ log.error(
403
+ f"Failed to merge semantic link ID {link_id} from {source_db.path}: {e}. Skipping link."
404
+ )
405
+
406
+ def _remap_element_spec_dict(
407
+ self, element_type: str, spec_dict: Dict, source_db_idx: int
408
+ ) -> Dict:
409
+ # Helper for _merge_semantic_links that works with dicts directly
410
+ new_spec_dict = spec_dict.copy()
411
+
412
+ if "source_id" in new_spec_dict and isinstance(new_spec_dict["source_id"], int):
413
+ new_spec_dict["source_id"] = self._get_new_source_id(
414
+ source_db_idx, new_spec_dict["source_id"]
415
+ )
416
+
417
+ if element_type in ["table", "column"]:
418
+ if "table_name" in new_spec_dict:
419
+ new_spec_dict["table_name"] = self._get_new_table_name(
420
+ source_db_idx, new_spec_dict["table_name"]
421
+ )
422
+ elif element_type == "object" or (
423
+ element_type == "json_path" and "object_name" in new_spec_dict
424
+ ):
425
+ if "object_name" in new_spec_dict:
426
+ new_spec_dict["object_name"] = self._get_new_object_name(
427
+ source_db_idx, new_spec_dict["object_name"]
428
+ )
429
+ elif element_type == "media":
430
+ if "media_name" in new_spec_dict:
431
+ new_spec_dict["media_name"] = self._get_new_media_name(
432
+ source_db_idx, new_spec_dict["media_name"]
433
+ )
434
+ return new_spec_dict
435
+
436
+ def merge_all(self, source_sdif_paths: List[SDIFPath]):
437
+ # Import pandas and numpy here to avoid making them a hard dependency of the module if not used.
438
+ # However, SDIFDatabase itself uses them. So they are effectively dependencies.
439
+ global pd, np
440
+ import numpy as np
441
+ import pandas as pd
442
+
443
+ for idx, source_path_item in enumerate(source_sdif_paths):
444
+ source_path = Path(source_path_item) # Ensure Path object
445
+ log.info(
446
+ f"Processing source SDIF ({idx + 1}/{len(source_sdif_paths)}): {source_path}"
447
+ )
448
+ source_db = SDIFDatabase(source_path, read_only=True)
449
+ try: # Ensure source_db is closed
450
+ self._merge_properties(source_db, idx)
451
+ self._merge_sources(source_db, idx)
452
+ self._merge_tables(source_db, idx) # This needs pandas for data reading
453
+ self._merge_objects(source_db, idx)
454
+ self._merge_media(source_db, idx)
455
+ self._merge_semantic_links(source_db, idx)
456
+ # Not merging sdif_annotations in this version.
457
+ finally:
458
+ source_db.close()
459
+
460
+ # Finalize target DB properties
461
+ try:
462
+ current_timestamp_utc_z = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
463
+ self.target_db.conn.execute(
464
+ "UPDATE sdif_properties SET creation_timestamp = ?",
465
+ (current_timestamp_utc_z,),
466
+ )
467
+ self.target_db.conn.commit()
468
+ except sqlite3.Error as e:
469
+ log.error(f"Failed to update final creation_timestamp in target DB: {e}")
470
+ # Non-fatal, proceed.
471
+
472
+ self.target_db.close()
473
+ log.info(
474
+ f"Successfully merged {len(source_sdif_paths)} SDIF files into {self.target_db.path}"
475
+ )
476
+ return self.target_db.path
477
+
478
+
479
+ def merge_sdif_files(sdif_paths: List[SDIFPath], output_path: Path) -> Path:
480
+ """
481
+ Merges multiple SDIF files into a single new SDIF file.
7
482
 
8
483
  Args:
9
484
  sdif_paths: A list of paths to the SDIF files to merge.
10
- output_dir: The directory where the merged file should be saved.
485
+ output_path: The full path where the merged SDIF file should be saved.
486
+ Its parent directory will be created if it doesn't exist.
487
+ If output_path is an existing file, it will be overwritten.
488
+ If output_path is an existing directory, a ValueError is raised.
11
489
 
12
490
  Returns:
13
- Path to the merged SDIF file.
491
+ Path to the newly created merged SDIF file (same as output_path).
492
+
493
+ Raises:
494
+ ValueError: If no SDIF files are provided, or output_path is invalid (e.g., an existing directory).
495
+ FileNotFoundError: If a source SDIF file does not exist.
496
+ sqlite3.Error: For database-related errors during merging.
497
+ RuntimeError: For critical errors like inability to generate unique names.
14
498
  """
15
499
  if not sdif_paths:
16
500
  raise ValueError("No SDIF files provided for merging.")
17
501
 
18
- if len(sdif_paths) == 1:
19
- return sdif_paths[0] # No merge needed
502
+ output_path = Path(output_path).resolve()
503
+
504
+ if output_path.is_dir():
505
+ raise ValueError(
506
+ f"Output path '{output_path}' is an existing directory. Please provide a full file path."
507
+ )
508
+
509
+ # Ensure parent directory of output_path exists
510
+ output_path.parent.mkdir(parents=True, exist_ok=True)
511
+
512
+ # Ensure all source paths are Path objects and exist
513
+ processed_sdif_paths: List[Path] = []
514
+ for p in sdif_paths:
515
+ path_obj = Path(p).resolve()
516
+ if not path_obj.exists():
517
+ raise FileNotFoundError(f"Source SDIF file not found: {path_obj}")
518
+ if not path_obj.is_file():
519
+ raise ValueError(f"Source SDIF path is not a file: {path_obj}")
520
+ processed_sdif_paths.append(path_obj)
521
+
522
+ if len(processed_sdif_paths) == 1:
523
+ source_file = processed_sdif_paths[0]
524
+
525
+ # If the source and target are the same file, no copy is needed.
526
+ if source_file == output_path:
527
+ return source_file
528
+
529
+ shutil.copy(source_file, output_path)
530
+ log.info(f"Copied single SDIF file to '{output_path}' as no merge was needed.")
531
+ return output_path
20
532
 
21
- # TODO: Implement SDIF merge
22
- raise NotImplementedError("Merge not implemented yet.")
533
+ # For multiple files, merge them into the output_path
534
+ merger = _SDIFMerger(output_path)
535
+ return merger.merge_all(processed_sdif_paths)
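A short sketch of the rewritten helper, now synchronous and taking an explicit output file path; the file names are placeholders:

from pathlib import Path
from satif_ai.utils.merge_sdif import merge_sdif_files

merged_path = merge_sdif_files(
    [Path("part_a.sdif"), Path("part_b.sdif")],  # placeholder source SDIF files
    Path("merged/combined.sdif"),                # parent directory is created if missing
)
print(f"Merged SDIF available at {merged_path}")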
@@ -1,18 +1,19 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: satif-ai
3
- Version: 0.2.9
3
+ Version: 0.2.10
4
4
  Summary: AI Agents for Satif
5
5
  License: MIT
6
6
  Author: Syncpulse
7
7
  Maintainer: Bryan Djafer
8
8
  Maintainer-email: bryan.djafer@syncpulse.fr
9
- Requires-Python: >=3.10,<4.0
9
+ Requires-Python: >=3.10,<3.14
10
10
  Classifier: License :: OSI Approved :: MIT License
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Classifier: Programming Language :: Python :: 3.10
13
13
  Classifier: Programming Language :: Python :: 3.11
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
+ Provides-Extra: xlsx
16
17
  Requires-Dist: openai-agents (>=0.0.9,<0.0.10)
17
18
  Requires-Dist: satif-sdk (>=0.1.0,<1.0.0)
18
19
  Requires-Dist: sdif-mcp (>=0.1.0,<1.0.0)
@@ -0,0 +1,20 @@
1
+ satif_ai/__init__.py,sha256=cqJ6Kd9IolVodPi9yOBPnfhYQXH5a1JgRB3HfLOtP_4,611
2
+ satif_ai/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ satif_ai/adapters/tidy.py,sha256=1g7Wcq8agAZhaAqQDhhD8yh3iO5gZ4mwdKHsiNN3hHY,18540
4
+ satif_ai/standardize.py,sha256=TgAB_nhcHY8zqlfT1PpgfgSswqdE-ly-dheQz-7NC7Q,5674
5
+ satif_ai/standardizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ satif_ai/standardizers/ai.py,sha256=jtYM-ChjLtkpFaubz980CTCNAoC33iYxB3pq0_hn2lU,21045
7
+ satif_ai/standardizers/ai_csv.py,sha256=LbCRaLleujQRgSRRyt9ujbED-PIGRq1J8zRnejGM5nc,25437
8
+ satif_ai/standardizers/ai_xlsx.py,sha256=558Bzfy8WGuk5mdnjMvvtakQXcU3rmwK3ykPjpXKwmQ,15863
9
+ satif_ai/transform.py,sha256=g5XNeVCIKUgDW3UIhf02MN9xkXnWF3EJXS0Eig_hfD8,7677
10
+ satif_ai/transformation_builders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ satif_ai/transformation_builders/syncpulse.py,sha256=c59BZicNnqs3NDKpflBAPqw42pGb6nYB2Zps0ChGyaM,11368
12
+ satif_ai/utils/__init__.py,sha256=F-usaCt_vX872mXvtukuZdNMPnkVqDb8RaDgox2uow4,212
13
+ satif_ai/utils/merge_sdif.py,sha256=y4C6pgkdyer0QugroFKUck4Eud4Ap-tJzM-eclMo3Rw,25629
14
+ satif_ai/utils/openai_mcp.py,sha256=duCQZXG0mBs9DOOFIUvzraJhxD2IDzegWO9iOiLfFwY,3938
15
+ satif_ai/utils/zip.py,sha256=G_GK8629Iw0TLFCQJfnqOscv7MoKF5zdzxvEAbL7Gss,5186
16
+ satif_ai-0.2.10.dist-info/LICENSE,sha256=kS8EN6yAaGZd7V5z6GKSn_x3ozcZltrfRky4vMPRCw8,1072
17
+ satif_ai-0.2.10.dist-info/METADATA,sha256=O5QWv8YJFtB5AIniv0LRgmSgpEaRLVdlz8WHZAru1X8,719
18
+ satif_ai-0.2.10.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
19
+ satif_ai-0.2.10.dist-info/entry_points.txt,sha256=Mz2SwYALjktap1bF-Q3EWBgiZVNT6QJCVsCs_fCV33Y,43
20
+ satif_ai-0.2.10.dist-info/RECORD,,
@@ -1,19 +0,0 @@
1
- satif_ai/__init__.py,sha256=cqJ6Kd9IolVodPi9yOBPnfhYQXH5a1JgRB3HfLOtP_4,611
2
- satif_ai/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- satif_ai/adapters/tidy.py,sha256=lcJXFmzEgCFy1W57kgbMOkoFTPLOkrvHC6NHVRKn-04,18549
4
- satif_ai/standardize.py,sha256=TgAB_nhcHY8zqlfT1PpgfgSswqdE-ly-dheQz-7NC7Q,5674
5
- satif_ai/standardizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- satif_ai/standardizers/ai.py,sha256=5vv-Rs6s_9FA21uM2iepTsbv6f3adZ8wFteOcW53z_s,21458
7
- satif_ai/standardizers/ai_csv.py,sha256=tMibsTp55sHJ56r7cYKjb5b0Hm6rdnV3TeA0EppIWJg,25371
8
- satif_ai/transform.py,sha256=iy9prkBCknRcsSXWOY_NwtNojQVcRW_luYFwkcjOnPw,5600
9
- satif_ai/transformation_builders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- satif_ai/transformation_builders/syncpulse.py,sha256=c59BZicNnqs3NDKpflBAPqw42pGb6nYB2Zps0ChGyaM,11368
11
- satif_ai/utils/__init__.py,sha256=F-usaCt_vX872mXvtukuZdNMPnkVqDb8RaDgox2uow4,212
12
- satif_ai/utils/merge_sdif.py,sha256=-BXsCaLDHEtKOQRWOKyVCNefFwkyVygFQs8NeeFONFA,663
13
- satif_ai/utils/openai_mcp.py,sha256=duCQZXG0mBs9DOOFIUvzraJhxD2IDzegWO9iOiLfFwY,3938
14
- satif_ai/utils/zip.py,sha256=G_GK8629Iw0TLFCQJfnqOscv7MoKF5zdzxvEAbL7Gss,5186
15
- satif_ai-0.2.9.dist-info/LICENSE,sha256=kS8EN6yAaGZd7V5z6GKSn_x3ozcZltrfRky4vMPRCw8,1072
16
- satif_ai-0.2.9.dist-info/METADATA,sha256=Vq62i6fUx8sKaM2mYVqRfGReHCTcFG_P6mW1otnx8GY,696
17
- satif_ai-0.2.9.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
18
- satif_ai-0.2.9.dist-info/entry_points.txt,sha256=Mz2SwYALjktap1bF-Q3EWBgiZVNT6QJCVsCs_fCV33Y,43
19
- satif_ai-0.2.9.dist-info/RECORD,,