satif-ai 0.2.8__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,481 @@
+ import asyncio
+ import logging
+ import shutil
+ import tempfile
+ import uuid
+ from collections import defaultdict
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+ from satif_core.standardizers.base import AsyncStandardizer
+ from satif_core.types import Datasource, FilePath, SDIFPath, StandardizationResult
+
+ from satif_ai.adapters.tidy import TidyAdapter
+ from satif_ai.standardizers.ai_xlsx import AIXLSXStandardizer
+ from satif_ai.utils.merge_sdif import merge_sdif_files
+ from satif_ai.utils.zip import extract_zip_archive_async
+
+ from .ai_csv import AICSVStandardizer
+
+ logger = logging.getLogger(__name__)
+
+
+ class AIStandardizer(AsyncStandardizer):
+     """
+     Orchestrates the standardization of various file types using specialized AI standardizers.
+     It processes a datasource, which can include individual files or ZIP archives.
+     Files are dispatched to appropriate AI agents (e.g., AICSVStandardizer),
+     and their SDIF outputs are merged into a single, final SDIF.
+     """
+
+     def __init__(
+         self,
+         mcp_server: Optional[Any] = None,
+         mcp_session: Optional[Any] = None,
+         llm_model: Optional[str] = None,
+         sdif_schema: Optional[Union[FilePath, Dict[str, Any]]] = None,
+         tidy_adapter: Optional[TidyAdapter] = None,
+     ):
+         self.mcp_server = mcp_server
+         self.mcp_session = mcp_session
+         self.llm_model = llm_model
+         self.sdif_schema = sdif_schema  # TODO: Implement schema adaptation logic
+         self.tidy_adapter = tidy_adapter  # TODO: Implement tidying logic
+
+         self.ai_standardizer_map: Dict[str, Type[AsyncStandardizer]] = {
+             ".csv": AICSVStandardizer,
+             ".xlsx": AIXLSXStandardizer,
+             ".xls": AIXLSXStandardizer,
+             ".xlsm": AIXLSXStandardizer,
+             # ".pdf": AIPDFStandardizer,
+             # ".json": AIJSONStandardizer,
+             # ".xml": AIXMLStandardizer,
+         }
+         for ext, standardizer_class in self.ai_standardizer_map.items():
+             if not issubclass(standardizer_class, AsyncStandardizer):
+                 raise TypeError(
+                     f"Standardizer for '{ext}' ({standardizer_class.__name__}) "
+                     "must inherit from AsyncStandardizer."
+                 )
+
+     def _get_ai_standardizer_class(
+         self, extension: str
+     ) -> Optional[Type[AsyncStandardizer]]:
+         return self.ai_standardizer_map.get(extension.lower())
+
+     async def _resolve_input_files(
+         self, datasource: Datasource, temp_processing_dir: Path
+     ) -> List[Path]:
+         """
+         Resolves the input datasource to a list of individual file paths.
+         Handles single files, lists of files, and extracts ZIP archives.
+         """
+         input_file_paths: List[Path] = []
+         raw_paths_to_check: List[Union[str, Path]] = []
+
+         if isinstance(datasource, (str, Path)):
+             raw_paths_to_check = [datasource]
+         elif isinstance(datasource, list) and all(
+             isinstance(p, (str, Path)) for p in datasource
+         ):
+             raw_paths_to_check = datasource
+         else:
+             # This also catches the case where datasource is an empty list initially
+             raise ValueError(
+                 "Datasource must be a non-empty file path (string or Path) or a non-empty list of such paths."
+             )
+
+         if not raw_paths_to_check:  # Should be caught by above, but defensive
+             raise ValueError("No input datasource paths provided.")
+
+         for raw_path_item in raw_paths_to_check:
+             raw_path = Path(raw_path_item).resolve()
+             if not raw_path.exists():
+                 raise FileNotFoundError(f"Input path not found: {raw_path}")
+
+             if raw_path.is_file():
+                 if raw_path.suffix.lower() == ".zip":
+                     zip_extract_target = (
+                         temp_processing_dir
+                         / f"extracted_{raw_path.stem}_{uuid.uuid4().hex[:8]}"
+                     )
+                     try:
+                         extracted_from_zip = await extract_zip_archive_async(
+                             raw_path, zip_extract_target
+                         )
+                         input_file_paths.extend(extracted_from_zip)
+                     except Exception as e_zip:
+                         logger.error(
+                             f"Failed to extract ZIP archive '{raw_path}': {e_zip}",
+                             exc_info=True,
+                         )
+                         # Decide if one failed zip should stop all, or just be skipped.
+                         # For now, skipping problematic zips.
+                         continue
+                 else:
+                     input_file_paths.append(raw_path)
+             elif raw_path.is_dir():
+                 logger.info(f"Processing directory datasource: {raw_path}")
+                 for child_item in raw_path.iterdir():
+                     if child_item.is_file():
+                         input_file_paths.append(child_item)
+                 # Deeper recursion to be implemented.
+             else:
+                 logger.warning(
+                     f"Input path '{raw_path}' is not a file or directory and will be ignored."
+                 )
+
+         if not input_file_paths:
+             # This means all inputs were invalid, unresolvable, or zips failed etc.
+             logger.error("No processable files found after resolving datasource.")
+             raise ValueError("Datasource resolution resulted in no processable files.")
+         return input_file_paths
+
+     def _group_files_by_standardizer(
+         self, file_paths: List[Path]
+     ) -> Tuple[Dict[Type[AsyncStandardizer], List[Path]], List[Path]]:
+         """Groups files by the AI standardizer responsible for them based on extension."""
+         grouped: Dict[Type[AsyncStandardizer], List[Path]] = defaultdict(list)
+         unsupported_files: List[Path] = []
+         for file_path in file_paths:
+             standardizer_class = self._get_ai_standardizer_class(file_path.suffix)
+             if standardizer_class:
+                 grouped[standardizer_class].append(file_path)
+             else:
+                 unsupported_files.append(file_path)
+         if unsupported_files:
+             logger.warning(
+                 f"Unsupported files found and will be ignored: "
+                 f"{[str(f.name) for f in unsupported_files]}"
+             )
+         return grouped, unsupported_files
+
+     async def _process_file_groups(
+         self,
+         grouped_files: Dict[Type[AsyncStandardizer], List[Path]],
+         temp_sdif_dir: Path,
+         config: Optional[Dict[str, Any]],
+         **kwargs,
+     ) -> Tuple[List[Path], List[Dict[str, Any]]]:
+         """
+         Processes groups of files using their respective AI standardizers.
+         Child standardizers are expected to produce a single SDIF SQLite file.
+
+         Returns:
+             A tuple containing:
+             - List of Paths to successfully created intermediate SDIF SQLite files.
+             - List of aggregated file configurations from child standardizers.
+         """
+         processing_tasks = []
+         standardizer_instances_info = []
+
+         for standardizer_class, files_in_group in grouped_files.items():
+             if not files_in_group:
+                 continue
+
+             standardizer_init_kwargs = {}
+             # TODO: Pass standardizer-specific config from main 'config' if available for this standardizer_class
+
+             try:
+                 ai_child_standardizer = standardizer_class(
+                     mcp_server=self.mcp_server,
+                     mcp_session=self.mcp_session,
+                     llm_model=self.llm_model,
+                     **standardizer_init_kwargs,
+                 )
+             except Exception as e:
+                 logger.error(
+                     f"Failed to initialize standardizer {standardizer_class.__name__} for '{files_in_group[0].name}': {e}",
+                     exc_info=True,
+                 )
+                 raise RuntimeError(
+                     f"Initialization failed for {standardizer_class.__name__}: {e}"
+                 )
+
+             # Generate a unique filename for the intermediate SDIF SQLite file
+             intermediate_sdif_filename = f"intermediate_{standardizer_class.__name__}_{uuid.uuid4().hex[:12]}.sdif"
+             intermediate_sdif_file_path = temp_sdif_dir / intermediate_sdif_filename
+
+             logger.info(
+                 f"Queueing standardization for {len(files_in_group)} file(s) "
+                 f"with {standardizer_class.__name__} (output file: {intermediate_sdif_file_path})"
+             )
+
+             task = ai_child_standardizer.standardize(
+                 datasource=files_in_group,
+                 output_path=intermediate_sdif_file_path,
+                 overwrite=True,  # Temporary intermediate files are always new/overwritten
+                 config=config,
+                 **kwargs,
+             )
+             processing_tasks.append(task)
+             standardizer_instances_info.append(
+                 {
+                     "class_name": standardizer_class.__name__,
+                     "output_file": intermediate_sdif_file_path,
+                 }
+             )
+
+         gathered_outputs = await asyncio.gather(
+             *processing_tasks, return_exceptions=True
+         )
+
+         successful_intermediate_sdif_files: List[Path] = []
+         aggregated_file_configs: List[Dict[str, Any]] = []
+
+         for i, result_or_exc in enumerate(gathered_outputs):
+             info = standardizer_instances_info[i]
+             expected_output_file: Path = info["output_file"]
+
+             if isinstance(result_or_exc, StandardizationResult):
+                 # Child standardizer's output_path should be a file path.
+                 child_reported_output_file = Path(result_or_exc.output_path)
+
+                 if not child_reported_output_file.is_file():
+                     logger.error(
+                         f"Standardizer {info['class_name']} reported success, but its output path "
+                         f"'{child_reported_output_file}' is not a file or does not exist. Skipping."
+                     )
+                     continue  # Skip this problematic result
+
+                 if (
+                     child_reported_output_file.resolve()
+                     != expected_output_file.resolve()
+                 ):
+                     logger.warning(
+                         f"Standardizer {info['class_name']} reported output file '{child_reported_output_file}' "
+                         f"which differs from expected '{expected_output_file}'. Using reported path."
+                     )
+
+                 logger.info(
+                     f"Successfully standardized group with {info['class_name']}. "
+                     f"Intermediate SDIF file: {child_reported_output_file}"
+                 )
+                 successful_intermediate_sdif_files.append(child_reported_output_file)
+                 if result_or_exc.file_configs:
+                     aggregated_file_configs.extend(result_or_exc.file_configs)
+
+             elif isinstance(result_or_exc, Exception):
+                 logger.error(
+                     f"Standardization by {info['class_name']} for target '{expected_output_file}' failed: {result_or_exc}",
+                     exc_info=result_or_exc,
+                 )
+                 # Optionally, try to clean up the expected_output_file if it was created before erroring
+                 if expected_output_file.exists():
+                     try:
+                         expected_output_file.unlink()
+                     except OSError:
+                         pass
+
+         return successful_intermediate_sdif_files, aggregated_file_configs
+
+     async def _consolidate_results(
+         self,
+         intermediate_sdif_files: List[Path],
+         aggregated_file_configs: Optional[List[Dict[str, Any]]],
+         final_sdif_file_target: Path,
+         overwrite: bool,
+     ) -> StandardizationResult:
+         """
+         Merges or moves intermediate SDIF SQLite files to the final target SDIF SQLite file.
+         Cleans up intermediate files.
+         """
+         if not intermediate_sdif_files:
+             raise RuntimeError(
+                 "No intermediate SDIF files were successfully generated to consolidate."
+             )
+
+         final_sdif_file_target.parent.mkdir(parents=True, exist_ok=True)
+
+         if final_sdif_file_target.exists():
+             if not overwrite:
+                 raise FileExistsError(
+                     f"Final output file {final_sdif_file_target} already exists and overwrite is False."
+                 )
+             logger.info(
+                 f"Overwriting existing final output file: {final_sdif_file_target}"
+             )
+             try:
+                 final_sdif_file_target.unlink()
+             except OSError as e_unlink:
+                 logger.error(
+                     f"Could not delete existing file {final_sdif_file_target}: {e_unlink}"
+                 )
+                 raise  # Re-raise as this is critical for overwrite
+
+         final_sdif_path_str: str
+         if len(intermediate_sdif_files) == 1:
+             source_sqlite_file = intermediate_sdif_files[0]
+             logger.info(
+                 f"Moving single intermediate SDIF SQLite file '{source_sqlite_file}' to final output '{final_sdif_file_target}'."
+             )
+             try:
+                 shutil.move(str(source_sqlite_file), str(final_sdif_file_target))
+                 final_sdif_path_str = str(final_sdif_file_target)
+             except Exception as e_move:
+                 logger.error(
+                     f"Failed to move {source_sqlite_file} to {final_sdif_file_target}: {e_move}"
+                 )
+                 # Attempt to copy as a fallback, then try to remove source
+                 try:
+                     shutil.copy2(str(source_sqlite_file), str(final_sdif_file_target))
+                     final_sdif_path_str = str(final_sdif_file_target)
+                     source_sqlite_file.unlink(
+                         missing_ok=True
+                     )  # Try to clean up source after copy
+                 except Exception as e_copy_fallback:
+                     logger.error(
+                         f"Fallback copy also failed for {source_sqlite_file}: {e_copy_fallback}"
+                     )
+                     raise RuntimeError(
+                         f"Could not place intermediate file into final location: {e_copy_fallback}"
+                     ) from e_copy_fallback
+         else:
+             logger.info(
+                 f"Merging {len(intermediate_sdif_files)} intermediate SDIF SQLite files into '{final_sdif_file_target}'."
+             )
+             merged_target_path = merge_sdif_files(
+                 intermediate_sdif_files,
+                 final_sdif_file_target,
+             )
+             final_sdif_path_str = str(merged_target_path)
+
+         # Clean up original intermediate files (they have been moved or their content merged)
+         for temp_file in intermediate_sdif_files:
+             if (
+                 temp_file.exists()
+                 and temp_file.resolve() != Path(final_sdif_path_str).resolve()
+             ):  # Don't delete the final file if it was one of the intermediates (single file case)
+                 try:
+                     temp_file.unlink()
+                     logger.debug(f"Cleaned up intermediate file: {temp_file}")
+                 except Exception as e_clean_file:
+                     logger.warning(
+                         f"Error cleaning up intermediate file {temp_file}: {e_clean_file}"
+                     )
+
+         logger.info(
+             f"Consolidation complete. Final SDIF SQLite file: {final_sdif_path_str}"
+         )
+         return StandardizationResult(
+             output_path=Path(final_sdif_path_str),
+             file_configs=aggregated_file_configs if aggregated_file_configs else None,
+         )
+
+     async def standardize(
+         self,
+         datasource: Datasource,
+         output_path: SDIFPath,  # Expected to be the path to the target *SDIF file*
+         *,
+         overwrite: bool = False,
+         config: Optional[Dict[str, Any]] = None,
+         **kwargs,
+     ) -> StandardizationResult:
+         """
+         Standardizes datasource to a single SDIF SQLite file.
+
+         Args:
+             datasource: Source data (file path, list of paths, or directory path).
+             output_path: Path to the target output SDIF SQLite file (e.g., "./output/data.sdif").
+             overwrite: If True, overwrite existing output file. Defaults to False.
+             config: Optional configuration dictionary for standardizers.
+             **kwargs: Additional arguments passed to child standardizers.
+
+         Returns:
+             StandardizationResult with the path to the created SDIF SQLite file.
+         """
+         logger.info(
+             f"AIStandardizer starting process for output SDIF file: {output_path}"
+         )
+         final_sdif_file_target = Path(output_path).resolve()
+
+         if final_sdif_file_target.is_dir():
+             raise ValueError(
+                 f"Target output_path '{final_sdif_file_target}' is a directory. "
+                 "It must be a full file path for the target SDIF SQLite database (e.g., data.sqlite or data.sdif)."
+             )
+         if not final_sdif_file_target.suffix:
+             logger.warning(
+                 f"Target output_path '{final_sdif_file_target}' has no file extension. "
+                 "It should be a path to an SDIF SQLite database file (e.g., data.sqlite or data.sdif)."
+             )
+         elif final_sdif_file_target.suffix.lower() not in (".sdif", ".sqlite", ".db"):
+             logger.warning(
+                 f"Target output_path '{final_sdif_file_target}' does not have a common SQLite extension. "
+                 "Ensure this is the intended SQLite file path."
+             )
+
+         # Create a unique temporary directory for this standardization run
+         # This directory will hold intermediate files and ZIP extractions.
+         run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aistd_run_"))
+         intermediate_sdif_files_dir = run_temp_dir / "intermediate_sdif_files"
+         intermediate_sdif_files_dir.mkdir(parents=True, exist_ok=True)
+         file_processing_temp_dir = run_temp_dir / "file_processing_temp"
+         file_processing_temp_dir.mkdir(parents=True, exist_ok=True)
+
+         try:
+             resolved_files = await self._resolve_input_files(
+                 datasource, file_processing_temp_dir
+             )
+             logger.info(f"Resolved {len(resolved_files)} file(s) for standardization.")
+
+             grouped_by_std, unsupported = self._group_files_by_standardizer(
+                 resolved_files
+             )
+             if not grouped_by_std:
+                 user_message = (
+                     "No files found that can be handled by configured AI standardizers."
+                 )
+                 if unsupported:
+                     user_message += (
+                         f" Unsupported files: {[str(f.name) for f in unsupported]}"
+                     )
+                 raise ValueError(user_message)
+
+             logger.debug(
+                 f"File groups for standardization: { {cls.__name__: [f.name for f in paths] for cls, paths in grouped_by_std.items()} }"
+             )
+
+             (
+                 intermediate_sdif_files,
+                 aggregated_file_configs,
+             ) = await self._process_file_groups(
+                 grouped_by_std, intermediate_sdif_files_dir, config, **kwargs
+             )
+
+             if not intermediate_sdif_files:
+                 raise RuntimeError(
+                     "No intermediate SDIF SQLite files were successfully generated."
+                 )
+             logger.info(
+                 f"Successfully generated {len(intermediate_sdif_files)} intermediate SDIF SQLite file(s)."
+             )
+
+             final_result = await self._consolidate_results(
+                 intermediate_sdif_files,
+                 aggregated_file_configs,
+                 final_sdif_file_target,
+                 overwrite,
+             )
+
+             logger.info(
+                 f"AIStandardizer process completed. Final SDIF file at: {final_result.output_path}"
+             )
+             return final_result
+
+         except Exception as e:
+             logger.error(f"AIStandardizer failed: {e}", exc_info=True)
+             if isinstance(e, (ValueError, FileNotFoundError, FileExistsError)):
+                 raise
+             raise RuntimeError(f"AIStandardizer processing error: {e}") from e
+         finally:
+             # Clean up the entire temporary directory for this run
+             if run_temp_dir.exists():
+                 try:
+                     shutil.rmtree(run_temp_dir)
+                     logger.info(f"Cleaned up temporary run directory: {run_temp_dir}")
+                 except Exception as e_clean:
+                     logger.error(
+                         f"Error cleaning up temporary run directory {run_temp_dir}: {e_clean}",
+                         exc_info=True,
+                     )
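
For orientation, here is a minimal usage sketch of the `AIStandardizer` added in this release. The example is not part of the package diff; the import path `satif_ai.standardizers.ai`, the model name, and running without an explicit MCP server/session are assumptions to adjust for your environment.

```python
# Minimal usage sketch (illustrative only, not part of the package diff).
import asyncio

from satif_ai.standardizers.ai import AIStandardizer  # assumed import path


async def main() -> None:
    # Model name is illustrative; mcp_server/mcp_session default to None here,
    # which may or may not be sufficient depending on your setup.
    standardizer = AIStandardizer(llm_model="gpt-4.1-mini")

    # CSV/XLSX inputs (or ZIP archives containing them) are dispatched to the
    # matching child standardizer; intermediate SDIF outputs are merged into
    # a single SDIF SQLite file at output_path.
    result = await standardizer.standardize(
        datasource=["./input/orders.csv", "./input/reports.zip"],
        output_path="./output/data.sdif",
        overwrite=True,
    )
    print(f"SDIF written to: {result.output_path}")


if __name__ == "__main__":
    asyncio.run(main())
```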
@@ -12,6 +12,7 @@ from agents import Agent, Runner, function_tool
  from agents.mcp.server import MCPServerStdio
  from charset_normalizer import detect
  from mcp import ClientSession
+ from satif_core import AsyncStandardizer
  from satif_core.types import Datasource, SDIFPath, StandardizationResult
  from satif_sdk.standardizers.csv import (
      CSVStandardizer,
@@ -37,7 +38,7 @@ You are an expert CSV Data Standardization Agent. Your mission is to analyze a g
  - Encoding: {initial_encoding}
  - Delimiter: '{initial_delimiter}'

- **Your Comprehensive Task:**
+ **Your Task:**

  1. **Core Parsing Parameters:**
      * Determine the correct file `encoding` (string, e.g., "utf-8", "latin-1").
@@ -274,7 +275,9 @@ async def read_raw_lines(


  # --- AICSVStandardizer Class ---
- class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStandardizer
+ class AICSVStandardizer(
+     CSVStandardizer, AsyncStandardizer
+ ):  # Inherits from the enhanced CSVStandardizer
      def __init__(
          self,
          mcp_server: Optional[MCPServerStdio] = None,