satif-ai 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,485 @@
+ import asyncio
+ import logging
+ import shutil
+ import tempfile
+ import uuid
+ from collections import defaultdict
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+ from satif_core.standardizers.base import AsyncStandardizer
+ from satif_core.types import Datasource, FilePath, SDIFPath, StandardizationResult
+
+ from satif_ai.adapters.tidy import TidyAdapter
+ from satif_ai.utils.merge_sdif import merge_sdif_files
+ from satif_ai.utils.zip import extract_zip_archive_async
+
+ from .ai_csv import AICSVStandardizer
+
+ logger = logging.getLogger(__name__)
+
+
+ class AIStandardizer(AsyncStandardizer):
+     """
+     Orchestrates the standardization of various file types using specialized AI standardizers.
+     It processes a datasource, which can include individual files or ZIP archives.
+     Files are dispatched to appropriate AI agents (e.g., AICSVStandardizer),
+     and their SDIF outputs are merged into a single, final SDIF.
+     """
+
+     def __init__(
+         self,
+         mcp_server: Optional[Any] = None,
+         mcp_session: Optional[Any] = None,
+         llm_model: Optional[str] = None,
+         sdif_schema: Optional[Union[FilePath, Dict[str, Any]]] = None,
+         tidy_adapter: Optional[TidyAdapter] = None,
+     ):
+         self.mcp_server = mcp_server
+         self.mcp_session = mcp_session
+         self.llm_model = llm_model
+         self.sdif_schema = sdif_schema  # TODO: Implement schema adaptation logic
+         self.tidy_adapter = tidy_adapter  # TODO: Implement tidying logic
+
+         self.ai_standardizer_map: Dict[str, Type[AsyncStandardizer]] = {
+             ".csv": AICSVStandardizer,
+             # Future standardizers:
+             # ".xlsx": AIXLSXStandardizer,
+             # ".pdf": AIPDFStandardizer,
+             # ".json": AIJSONStandardizer,
+             # ".xml": AIXMLStandardizer,
+         }
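+         # New file types can be supported by mapping their extension to an
+         # AsyncStandardizer subclass above; the loop below validates each
+         # entry at construction time.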
+         for ext, standardizer_class in self.ai_standardizer_map.items():
+             if not issubclass(standardizer_class, AsyncStandardizer):
+                 raise TypeError(
+                     f"Standardizer for '{ext}' ({standardizer_class.__name__}) "
+                     "must inherit from AsyncStandardizer."
+                 )
+
+     def _get_ai_standardizer_class(
+         self, extension: str
+     ) -> Optional[Type[AsyncStandardizer]]:
+         return self.ai_standardizer_map.get(extension.lower())
+
+     async def _resolve_input_files(
+         self, datasource: Datasource, temp_processing_dir: Path
+     ) -> List[Path]:
+         """
+         Resolves the input datasource to a list of individual file paths.
+         Handles single files, lists of files, and extracts ZIP archives.
+         """
+         input_file_paths: List[Path] = []
+         raw_paths_to_check: List[Union[str, Path]] = []
+
+         if isinstance(datasource, (str, Path)):
+             raw_paths_to_check = [datasource]
+         elif isinstance(datasource, list) and all(
+             isinstance(p, (str, Path)) for p in datasource
+         ):
+             raw_paths_to_check = datasource
+         else:
+             # This also catches the case where datasource is an empty list initially
+             raise ValueError(
+                 "Datasource must be a non-empty file path (string or Path) or a non-empty list of such paths."
+             )
+
+         if not raw_paths_to_check:  # Should be caught by above, but defensive
+             raise ValueError("No input datasource paths provided.")
+
+         for raw_path_item in raw_paths_to_check:
+             raw_path = Path(raw_path_item).resolve()
+             if not raw_path.exists():
+                 raise FileNotFoundError(f"Input path not found: {raw_path}")
+
+             if raw_path.is_file():
+                 if raw_path.suffix.lower() == ".zip":
+                     zip_extract_target = (
+                         temp_processing_dir
+                         / f"extracted_{raw_path.stem}_{uuid.uuid4().hex[:8]}"
+                     )
+                     try:
+                         extracted_from_zip = await extract_zip_archive_async(
+                             raw_path, zip_extract_target
+                         )
+                         input_file_paths.extend(extracted_from_zip)
+                     except Exception as e_zip:
+                         logger.error(
+                             f"Failed to extract ZIP archive '{raw_path}': {e_zip}",
+                             exc_info=True,
+                         )
+                         # Decide if one failed zip should stop all, or just be skipped.
+                         # For now, skipping problematic zips.
+                         continue
+                 else:
+                     input_file_paths.append(raw_path)
+             elif raw_path.is_dir():
+                 logger.info(f"Processing directory datasource: {raw_path}")
+                 for child_item in raw_path.iterdir():
+                     if child_item.is_file():
+                         input_file_paths.append(child_item)
+                 # Deeper recursion to be implemented.
+             else:
+                 logger.warning(
+                     f"Input path '{raw_path}' is not a file or directory and will be ignored."
+                 )
+
+         if not input_file_paths:
+             # This means all inputs were invalid, unresolvable, or ZIP extraction failed.
+             logger.error("No processable files found after resolving datasource.")
+             raise ValueError("Datasource resolution resulted in no processable files.")
+         return input_file_paths
+
+     def _group_files_by_standardizer(
+         self, file_paths: List[Path]
+     ) -> Tuple[Dict[Type[AsyncStandardizer], List[Path]], List[Path]]:
+         """Groups files by the AI standardizer responsible for them based on extension."""
+         grouped: Dict[Type[AsyncStandardizer], List[Path]] = defaultdict(list)
+         unsupported_files: List[Path] = []
+         for file_path in file_paths:
+             standardizer_class = self._get_ai_standardizer_class(file_path.suffix)
+             if standardizer_class:
+                 grouped[standardizer_class].append(file_path)
+             else:
+                 unsupported_files.append(file_path)
+         if unsupported_files:
+             logger.warning(
+                 f"Unsupported files found and will be ignored: "
+                 f"{[str(f.name) for f in unsupported_files]}"
+             )
+         return grouped, unsupported_files
+
+     async def _process_file_groups(
+         self,
+         grouped_files: Dict[Type[AsyncStandardizer], List[Path]],
+         temp_sdif_dir: Path,
+         config: Optional[Dict[str, Any]],
+         **kwargs,
+     ) -> Tuple[List[Path], List[Dict[str, Any]]]:
+         """
+         Processes groups of files using their respective AI standardizers.
+         Child standardizers are expected to produce a single SDIF SQLite file.
+
+         Returns:
+             A tuple containing:
+             - List of Paths to successfully created intermediate SDIF SQLite files.
+             - List of aggregated file configurations from child standardizers.
+         """
+         processing_tasks = []
+         standardizer_instances_info = []
+
+         for standardizer_class, files_in_group in grouped_files.items():
+             if not files_in_group:
+                 continue
+
+             standardizer_init_kwargs = {}
+             # TODO: Pass standardizer-specific config from main 'config' if available for this standardizer_class
+
+             try:
+                 ai_child_standardizer = standardizer_class(
+                     mcp_server=self.mcp_server,
+                     mcp_session=self.mcp_session,
+                     llm_model=self.llm_model,
+                     **standardizer_init_kwargs,
+                 )
+             except Exception as e:
+                 logger.error(
+                     f"Failed to initialize standardizer {standardizer_class.__name__} for '{files_in_group[0].name}': {e}",
+                     exc_info=True,
+                 )
+                 raise RuntimeError(
+                     f"Initialization failed for {standardizer_class.__name__}: {e}"
+                 )
+
+             # Generate a unique filename for the intermediate SDIF SQLite file
+             intermediate_sdif_filename = f"intermediate_{standardizer_class.__name__}_{uuid.uuid4().hex[:12]}.sdif"
+             intermediate_sdif_file_path = temp_sdif_dir / intermediate_sdif_filename
+
+             logger.info(
+                 f"Queueing standardization for {len(files_in_group)} file(s) "
+                 f"with {standardizer_class.__name__} (output file: {intermediate_sdif_file_path})"
+             )
+
+             task = ai_child_standardizer.standardize(
+                 datasource=files_in_group,
+                 output_path=intermediate_sdif_file_path,
+                 overwrite=True,  # Temporary intermediate files are always new/overwritten
+                 config=config,
+                 **kwargs,
+             )
+             processing_tasks.append(task)
+             standardizer_instances_info.append(
+                 {
+                     "class_name": standardizer_class.__name__,
+                     "output_file": intermediate_sdif_file_path,
+                 }
+             )
+
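+         # All queued groups are standardized concurrently; with
+         # return_exceptions=True, a failing group surfaces as an exception
+         # object in the results and is handled in the loop below.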
+         gathered_outputs = await asyncio.gather(
+             *processing_tasks, return_exceptions=True
+         )
+
+         successful_intermediate_sdif_files: List[Path] = []
+         aggregated_file_configs: List[Dict[str, Any]] = []
+
+         for i, result_or_exc in enumerate(gathered_outputs):
+             info = standardizer_instances_info[i]
+             expected_output_file: Path = info["output_file"]
+
+             if isinstance(result_or_exc, StandardizationResult):
+                 # Child standardizer's output_path should be a file path.
+                 child_reported_output_file = Path(result_or_exc.output_path)
+
+                 if not child_reported_output_file.is_file():
+                     logger.error(
+                         f"Standardizer {info['class_name']} reported success, but its output path "
+                         f"'{child_reported_output_file}' is not a file or does not exist. Skipping."
+                     )
+                     continue  # Skip this problematic result
+
+                 if (
+                     child_reported_output_file.resolve()
+                     != expected_output_file.resolve()
+                 ):
+                     logger.warning(
+                         f"Standardizer {info['class_name']} reported output file '{child_reported_output_file}' "
+                         f"which differs from expected '{expected_output_file}'. Using reported path."
+                     )
+
+                 logger.info(
+                     f"Successfully standardized group with {info['class_name']}. "
+                     f"Intermediate SDIF file: {child_reported_output_file}"
+                 )
+                 successful_intermediate_sdif_files.append(child_reported_output_file)
+                 if result_or_exc.file_configs:
+                     aggregated_file_configs.extend(result_or_exc.file_configs)
+
+             elif isinstance(result_or_exc, Exception):
+                 logger.error(
+                     f"Standardization by {info['class_name']} for target '{expected_output_file}' failed: {result_or_exc}",
+                     exc_info=result_or_exc,
+                 )
+                 # Optionally, try to clean up the expected_output_file if it was created before erroring
+                 if expected_output_file.exists():
+                     try:
+                         expected_output_file.unlink()
+                     except OSError:
+                         pass
+
+         return successful_intermediate_sdif_files, aggregated_file_configs
+
+     async def _consolidate_results(
+         self,
+         intermediate_sdif_files: List[Path],
+         aggregated_file_configs: Optional[List[Dict[str, Any]]],
+         final_sdif_file_target: Path,
+         overwrite: bool,
+     ) -> StandardizationResult:
+         """
+         Merges or moves intermediate SDIF SQLite files to the final target SDIF SQLite file.
+         Cleans up intermediate files.
+         """
+         if not intermediate_sdif_files:
+             raise RuntimeError(
+                 "No intermediate SDIF files were successfully generated to consolidate."
+             )
+
+         final_sdif_file_target.parent.mkdir(parents=True, exist_ok=True)
+
+         if final_sdif_file_target.exists():
+             if not overwrite:
+                 raise FileExistsError(
+                     f"Final output file {final_sdif_file_target} already exists and overwrite is False."
+                 )
+             logger.info(
+                 f"Overwriting existing final output file: {final_sdif_file_target}"
+             )
+             try:
+                 final_sdif_file_target.unlink()
+             except OSError as e_unlink:
+                 logger.error(
+                     f"Could not delete existing file {final_sdif_file_target}: {e_unlink}"
+                 )
+                 raise  # Re-raise as this is critical for overwrite
+
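+         # A single intermediate file is simply moved into place; multiple
+         # intermediate files are merged into the final target.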
+         final_sdif_path_str: str
+         if len(intermediate_sdif_files) == 1:
+             source_sqlite_file = intermediate_sdif_files[0]
+             logger.info(
+                 f"Moving single intermediate SDIF SQLite file '{source_sqlite_file}' to final output '{final_sdif_file_target}'."
+             )
+             try:
+                 shutil.move(str(source_sqlite_file), str(final_sdif_file_target))
+                 final_sdif_path_str = str(final_sdif_file_target)
+             except Exception as e_move:
+                 logger.error(
+                     f"Failed to move {source_sqlite_file} to {final_sdif_file_target}: {e_move}"
+                 )
+                 # Attempt to copy as a fallback, then try to remove source
+                 try:
+                     shutil.copy2(str(source_sqlite_file), str(final_sdif_file_target))
+                     final_sdif_path_str = str(final_sdif_file_target)
+                     source_sqlite_file.unlink(
+                         missing_ok=True
+                     )  # Try to clean up source after copy
+                 except Exception as e_copy_fallback:
+                     logger.error(
+                         f"Fallback copy also failed for {source_sqlite_file}: {e_copy_fallback}"
+                     )
+                     raise RuntimeError(
+                         f"Could not place intermediate file into final location: {e_copy_fallback}"
+                     ) from e_copy_fallback
+         else:
+             logger.info(
+                 f"Merging {len(intermediate_sdif_files)} intermediate SDIF SQLite files into '{final_sdif_file_target}'."
+             )
+             # merge_sdif_files must accept a list of source SQLite file paths and a target SQLite file path.
+             merged_target_path = await merge_sdif_files(
+                 intermediate_sdif_files,
+                 final_sdif_file_target,
+                 overwrite=False,  # We handled overwrite for final_sdif_file_target
+             )
+             final_sdif_path_str = str(merged_target_path)
+
+         # Clean up original intermediate files (they have been moved or their content merged)
+         for temp_file in intermediate_sdif_files:
+             if (
+                 temp_file.exists()
+                 and temp_file.resolve() != Path(final_sdif_path_str).resolve()
+             ):  # Don't delete the final file if it was one of the intermediates (single file case)
+                 try:
+                     temp_file.unlink()
+                     logger.debug(f"Cleaned up intermediate file: {temp_file}")
+                 except Exception as e_clean_file:
+                     logger.warning(
+                         f"Error cleaning up intermediate file {temp_file}: {e_clean_file}"
+                     )
+
+         logger.info(
+             f"Consolidation complete. Final SDIF SQLite file: {final_sdif_path_str}"
+         )
+         return StandardizationResult(
+             output_path=Path(final_sdif_path_str),
+             file_configs=aggregated_file_configs if aggregated_file_configs else None,
+         )
+
+     async def standardize(
+         self,
+         datasource: Datasource,
+         output_path: SDIFPath,  # Expected to be the path to the target *SDIF file*
+         *,
+         overwrite: bool = False,
+         config: Optional[Dict[str, Any]] = None,
+         **kwargs,
+     ) -> StandardizationResult:
+         """
+         Standardizes a datasource into a single SDIF SQLite file.
+
+         Args:
+             datasource: Source data (file path, list of paths, or directory path).
+             output_path: Path to the target output SDIF SQLite file (e.g., "./output/data.sdif").
+             overwrite: If True, overwrite existing output file. Defaults to False.
+             config: Optional configuration dictionary for standardizers.
+             **kwargs: Additional arguments passed to child standardizers.
+
+         Returns:
+             StandardizationResult with the path to the created SDIF SQLite file.
+         """
+         logger.info(
+             f"AIStandardizer starting process for output SDIF file: {output_path}"
+         )
+         final_sdif_file_target = Path(output_path).resolve()
+
+         if final_sdif_file_target.is_dir():
+             raise ValueError(
+                 f"Target output_path '{final_sdif_file_target}' is a directory. "
+                 "It must be a full file path for the target SDIF SQLite database (e.g., data.sqlite or data.sdif)."
+             )
+         if not final_sdif_file_target.suffix:
+             logger.warning(
+                 f"Target output_path '{final_sdif_file_target}' has no file extension. "
+                 "It should be a path to an SDIF SQLite database file (e.g., data.sqlite or data.sdif)."
+             )
+         elif final_sdif_file_target.suffix.lower() not in (".sdif", ".sqlite", ".db"):
+             logger.warning(
+                 f"Target output_path '{final_sdif_file_target}' does not have a common SQLite extension. "
+                 "Ensure this is the intended SQLite file path."
+             )
+
+         # Create a unique temporary directory for this standardization run.
+         # This directory will hold intermediate files and ZIP extractions.
+         run_temp_dir = Path(tempfile.mkdtemp(prefix="satif_aistd_run_"))
+         intermediate_sdif_files_dir = run_temp_dir / "intermediate_sdif_files"
+         intermediate_sdif_files_dir.mkdir(parents=True, exist_ok=True)
+         file_processing_temp_dir = run_temp_dir / "file_processing_temp"
+         file_processing_temp_dir.mkdir(parents=True, exist_ok=True)
+
+         try:
+             # 1. Resolve input datasource to a list of processable file paths
+             resolved_files = await self._resolve_input_files(
+                 datasource, file_processing_temp_dir
+             )
+             logger.info(f"Resolved {len(resolved_files)} file(s) for standardization.")
+
+             # 2. Group files by the AI standardizer responsible for them
+             grouped_by_std, unsupported = self._group_files_by_standardizer(
+                 resolved_files
+             )
+             if not grouped_by_std:
+                 user_message = (
+                     "No files found that can be handled by configured AI standardizers."
+                 )
+                 if unsupported:
+                     user_message += (
+                         f" Unsupported files: {[str(f.name) for f in unsupported]}"
+                     )
+                 raise ValueError(user_message)
+
+             logger.debug(
+                 f"File groups for standardization: { {cls.__name__: [f.name for f in paths] for cls, paths in grouped_by_std.items()} }"
+             )
+
+             # 3. Process each group of files, generating intermediate SDIF SQLite files
+             (
+                 intermediate_sdif_files,
+                 aggregated_file_configs,
+             ) = await self._process_file_groups(
+                 grouped_by_std, intermediate_sdif_files_dir, config, **kwargs
+             )
+
+             if not intermediate_sdif_files:
+                 raise RuntimeError(
+                     "No intermediate SDIF SQLite files were successfully generated."
+                 )
+             logger.info(
+                 f"Successfully generated {len(intermediate_sdif_files)} intermediate SDIF SQLite file(s)."
+             )
+
+             # 4. Consolidate intermediate SDIF files into the final target file
+             final_result = await self._consolidate_results(
+                 intermediate_sdif_files,
+                 aggregated_file_configs,
+                 final_sdif_file_target,
+                 overwrite,
+             )
+
+             logger.info(
+                 f"AIStandardizer process completed. Final SDIF file at: {final_result.output_path}"
+             )
+             return final_result
+
+         except Exception as e:
+             logger.error(f"AIStandardizer failed: {e}", exc_info=True)
+             if isinstance(e, (ValueError, FileNotFoundError, FileExistsError)):
+                 raise
+             raise RuntimeError(f"AIStandardizer processing error: {e}") from e
+         finally:
+             # Clean up the entire temporary directory for this run
+             if run_temp_dir.exists():
+                 try:
+                     shutil.rmtree(run_temp_dir)
+                     logger.info(f"Cleaned up temporary run directory: {run_temp_dir}")
+                 except Exception as e_clean:
+                     logger.error(
+                         f"Error cleaning up temporary run directory {run_temp_dir}: {e_clean}",
+                         exc_info=True,
+                     )
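
A minimal usage sketch for the new AIStandardizer class. The import path, model name, and input file names below are assumptions for illustration only; the constructor parameters, the standardize() signature, and the StandardizationResult fields are taken from the code above.

    import asyncio

    from satif_ai import AIStandardizer  # assumed import path

    async def main() -> None:
        # Model name and input paths are illustrative; any mix of files,
        # directories, or ZIP archives can be passed as the datasource.
        standardizer = AIStandardizer(llm_model="gpt-4o")
        result = await standardizer.standardize(
            datasource=["./invoices.csv", "./archive.zip"],
            output_path="./output/data.sdif",
            overwrite=True,
        )
        print(result.output_path, result.file_configs)

    asyncio.run(main())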