satif-ai 0.1.2__tar.gz → 0.2.1__tar.gz

This diff compares two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: satif-ai
-Version: 0.1.2
+Version: 0.2.1
 Summary: AI Agents for Satif
 License: MIT
 Author: Bryan Djafer
@@ -1,6 +1,6 @@
 [project]
 name = "satif-ai"
-version = "0.1.2"
+version = "0.2.1"
 description = "AI Agents for Satif"
 authors = [
     {name = "Bryan Djafer", email = "bryan.djafer@syncpulse.fr"}
@@ -1,10 +1,11 @@
+import asyncio
 import contextvars
 import csv
 import json
 import logging
 import re
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import clevercsv
 from agents import Agent, Runner, function_tool
@@ -358,6 +359,51 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
         self._initial_encoding_hint = initial_encoding
         # self.generate_description from prompt structure (table_description, column descriptions)
 
+    async def _get_initial_guesses(self, file_path: Path) -> Tuple[str, str]:
+        """Helper to get initial encoding and delimiter guesses for a single file."""
+        encoding_guess = self._initial_encoding_hint
+        if not encoding_guess:
+            try:
+                with open(file_path, "rb") as fb_enc:
+                    enc_sample = fb_enc.read(ENCODING_SAMPLE_SIZE)
+                detected_enc_info = detect(enc_sample) if enc_sample else None
+                encoding_guess = (
+                    detected_enc_info["encoding"]
+                    if detected_enc_info and detected_enc_info["encoding"]
+                    else "utf-8"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Initial encoding detection for {file_path.name} failed: {e}. Using utf-8."
+                )
+                encoding_guess = "utf-8"
+
+        delimiter_guess = self._initial_delimiter_hint
+        if not delimiter_guess:
+            try:
+                with open(
+                    file_path, encoding=encoding_guess, errors="ignore"
+                ) as f_delim_sample:
+                    delim_sample_text = f_delim_sample.read(DELIMITER_SAMPLE_SIZE)
+                if delim_sample_text:
+                    sniffer = clevercsv.Sniffer()
+                    dialect = sniffer.sniff(delim_sample_text)
+                    delimiter_guess = (
+                        dialect.delimiter if dialect and dialect.delimiter else ","
+                    )
+                else:
+                    delimiter_guess = ","
+            except Exception as e:
+                logger.warning(
+                    f"Initial delimiter detection for {file_path.name} failed ({e}). Using ','."
+                )
+                delimiter_guess = ","
+
+        logger.info(
+            f"Initial guesses for {file_path.name} - Encoding: {encoding_guess}, Delimiter: '{delimiter_guess}'"
+        )
+        return encoding_guess, delimiter_guess
+
     async def _run_analysis_agent(
         self,
         file_path: Path,
@@ -469,24 +515,21 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
     async def standardize(
         self,
         datasource: Datasource,
-        output_path: SDIFPath,  # Corrected name from output_sdif
+        output_path: SDIFPath,
         *,
         overwrite: bool = False,
         config: Optional[Dict[str, Any]] = None,
         **kwargs,
     ) -> Path:
         output_path_obj = Path(output_path)
+
+        input_paths: List[Path]
         if isinstance(datasource, (str, Path)):
             input_paths = [Path(datasource)]
         elif isinstance(datasource, list) and all(
             isinstance(p, (str, Path)) for p in datasource
         ):
             input_paths = [Path(p) for p in datasource]
-            if len(input_paths) > 1:
-                logger.warning(
-                    "AICSVStandardizer currently processes one CSV file at a time for detailed AI analysis. Using the first file only."
-                )
-                input_paths = [input_paths[0]]  # Process one file if multiple given
         else:
             raise TypeError(
                 "datasource must be a file path string/Path object or a list of such paths."
@@ -495,168 +538,104 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStand
         if not input_paths:
             raise ValueError("No input datasource provided.")
 
-        input_path = input_paths[
-            0
-        ]  # We focus on a single file for this AI standardizer
-
-        if not input_path.exists() or not input_path.is_file():
-            raise FileNotFoundError(
-                f"Input CSV file not found or is not a file: {input_path}"
-            )
-
-        logger.info(f"--- AI Analysis for file: {input_path.name} ---")
-
-        # 1. Initial Guesses for AI
-        initial_encoding_guess = self._initial_encoding_hint
-        if not initial_encoding_guess:
-            try:
-                # Use base class's _detect_encoding, need an instance or make it static/helper
-                # For simplicity, re-implement or call a static version if available.
-                # Here, we simulate it for now or assume base standardizer's helper is callable.
-                with open(input_path, "rb") as fb_enc:
-                    enc_sample = fb_enc.read(ENCODING_SAMPLE_SIZE)
-                detected_enc_info = detect(enc_sample) if enc_sample else None
-                initial_encoding_guess = (
-                    detected_enc_info["encoding"]
-                    if detected_enc_info and detected_enc_info["encoding"]
-                    else "utf-8"
+        ai_analysis_tasks = []
+        for input_file_path in input_paths:
+            if not input_file_path.exists() or not input_file_path.is_file():
+                raise FileNotFoundError(
+                    f"Input CSV file not found or is not a file: {input_file_path}"
                 )
+
+            # Create a task for each file's analysis
+            # Need to wrap _get_initial_guesses and _run_analysis_agent in a single async co-routine for gather
+            async def analyze_file_task(file_path_for_task: Path):
                 logger.info(
-                    f"Initial encoding guess (detected): {initial_encoding_guess}"
+                    f"--- Starting AI Analysis for file: {file_path_for_task.name} ---"
                 )
-            except Exception as e:
-                logger.warning(
-                    f"Initial encoding detection failed: {e}. Using utf-8 as fallback guess."
+                enc_guess, delim_guess = await self._get_initial_guesses(
+                    file_path_for_task
                 )
-                initial_encoding_guess = "utf-8"
-        else:
-            logger.info(
-                f"Using provided initial encoding hint: {initial_encoding_guess}"
-            )
-
-        initial_delimiter_guess = self._initial_delimiter_hint
-        if not initial_delimiter_guess:
-            try:
-                with open(
-                    input_path, encoding=initial_encoding_guess, errors="ignore"
-                ) as f_delim_sample:
-                    delim_sample_text = f_delim_sample.read(DELIMITER_SAMPLE_SIZE)
-                    if delim_sample_text:
-                        # Simulate base class's _detect_delimiter
-                        sniffer = clevercsv.Sniffer()
-                        dialect = sniffer.sniff(delim_sample_text)
-                        initial_delimiter_guess = dialect.delimiter if dialect else ","
-                        logger.info(
-                            f"Initial delimiter guess (detected): '{initial_delimiter_guess}'"
-                        )
-                    else:
-                        initial_delimiter_guess = ","  # Fallback
-                        logger.warning(
-                            f"File empty/small, defaulting delimiter guess to ',' for {input_path.name}"
-                        )
-            except Exception as e:
-                logger.warning(
-                    f"Initial delimiter detection failed ({e}). Using ',' as fallback guess for {input_path.name}."
+                return await self._run_analysis_agent(
+                    file_path_for_task, enc_guess, delim_guess
                 )
-                initial_delimiter_guess = ","
-        else:
-            logger.info(
-                f"Using provided initial delimiter hint: '{initial_delimiter_guess}'"
-            )
 
-        # 2. Run AI Agent Analysis
+            ai_analysis_tasks.append(
+                analyze_file_task(input_file_path)
+            )  # Pass the path to the task
+
+        logger.info(f"Starting AI analysis for {len(ai_analysis_tasks)} CSV file(s)...")
         try:
-            ai_params = await self._run_analysis_agent(
-                input_path,
-                initial_encoding_guess,
-                initial_delimiter_guess,
-            )
+            all_ai_params_results = await asyncio.gather(*ai_analysis_tasks)
         except Exception as e:
-            logger.exception(
-                f"AI Agent analysis failed critically for {input_path.name}. Aborting."
+            logger.exception(f"Critical error during concurrent AI analysis phase: {e}")
+            raise RuntimeError("AI analysis phase failed.") from e
+
+        logger.info(
+            f"AI analysis complete for all {len(all_ai_params_results)} file(s)."
+        )
+
+        # Aggregate parameters for the base CSVStandardizer
+        all_ai_table_names: List[str] = []
+        all_ai_table_descriptions: List[Optional[str]] = []
+        all_ai_file_configs: List[Dict[str, Any]] = []
+        all_ai_column_definitions: List[
+            List[Dict[str, Any]]
+        ] = []  # List of lists of col_specs
+
+        for i, ai_params in enumerate(all_ai_params_results):
+            current_file_path = input_paths[i]  # Get corresponding input path
+            logger.info(f"Aggregating AI parameters for: {current_file_path.name}")
+            logger.info(f"  AI Table Name: {ai_params['table_name']}")
+            logger.info(f"  AI Encoding: {ai_params['encoding']}")
+            logger.info(f"  AI Delimiter: '{ai_params['delimiter']}'")
+            logger.info(f"  AI Has Header: {ai_params['has_header']}")
+            logger.info(f"  AI Skip Rows: {ai_params['skip_rows']}")
+            logger.info(
+                f"  AI Table Description: {ai_params.get('table_description') if ai_params.get('table_description') is not None else 'N/A'}"
             )
-            raise RuntimeError(f"AI analysis failed for {input_path.name}") from e
-
-        # 3. Prepare parameters for the base CSVStandardizer
-        # The AI provides parameters for a single file processing scenario.
-
-        # Column definitions for the base standardizer:
-        # Base class expects: List[Optional[Dict[str, List[Dict[str, Any]]]]]
-        # For a single file, it's List containing one Dict: [{table_name: [col_specs...]}]
-        # Or, if base class is adapted, List containing one List: [[col_specs...]]
-
-        # The AI output `ai_params["columns"]` is already in the format:
-        # [{"identifier_in_csv": ..., "final_column_name": ..., "description": ...}, ...]
-        # This is exactly what the enhanced CSVStandardizer's `_setup_columns` expects for `defined_columns_spec`
-        # when `column_definitions` is a list containing this list of specs.
-
-        ai_column_definitions = [
-            ai_params["columns"]
-        ]  # Wrap the list of col specs for the single file/table
-
-        # The base CSVStandardizer will use its own _sanitize_name for the table name from AI.
-        # We provide it via table_names list.
-        ai_table_name = [ai_params["table_name"]]
-        ai_table_description = [
-            ai_params.get("table_description")
-        ]  # List of one description
-
-        # File-specific config for the base standardizer
-        # For a single file, this will be a list containing one dictionary.
-        file_specific_config = [
-            {
+            # logger.info(f"  AI Column Definitions ({len(ai_params['columns'])} cols): {ai_params['columns'][:2]}...")  # Log a sample
+
+            all_ai_table_names.append(ai_params["table_name"])
+            all_ai_table_descriptions.append(ai_params.get("table_description"))
+
+            file_conf = {
                 "encoding": ai_params["encoding"],
                 "delimiter": ai_params["delimiter"],
                 "has_header": ai_params["has_header"],
                 "skip_rows": ai_params["skip_rows"],
-                # skip_columns is not used if column_definitions are provided,
-                # as column selection is implicit in the provided definitions.
-                "skip_columns": None,  # Explicitly set to None
+                "skip_columns": None,  # Column selection is handled by column_definitions
             }
-        ]
+            all_ai_file_configs.append(file_conf)
+            all_ai_column_definitions.append(
                ai_params["columns"]
+            )  # This is List[Dict], so we append it directly
 
-        logger.info(f"AI determined parameters for {input_path.name}:")
-        logger.info(f"  Table Name: {ai_table_name[0]}")
-        logger.info(f"  Encoding: {file_specific_config[0]['encoding']}")
-        logger.info(f"  Delimiter: '{file_specific_config[0]['delimiter']}'")
-        logger.info(f"  Has Header: {file_specific_config[0]['has_header']}")
-        logger.info(f"  Skip Rows: {file_specific_config[0]['skip_rows']}")
+        # Instantiate the base CSVStandardizer with aggregated AI-derived parameters
         logger.info(
-            f"  Table Description: {ai_table_description[0] if ai_table_description and ai_table_description[0] is not None else 'N/A'}"
+            "Initializing final CSVStandardizer with aggregated AI parameters..."
         )
-        logger.info(f"  Column Definitions ({len(ai_column_definitions[0])} cols):")
-        for i, c_def in enumerate(ai_column_definitions[0]):
-            logger.info(
-                f"    {i + 1}. ID in CSV: '{c_def['identifier_in_csv']}', Final Name: '{c_def['final_column_name']}', Desc: '{c_def.get('description', 'N/A')}'"
-            )
-
-        # 4. Call Base Class Standardizer Logic with AI-derived parameters
-        # We instantiate a new CSVStandardizer configured by the AI for this specific file.
         final_processor = CSVStandardizer(
-            # These are now single-element lists because we process one file
-            table_names=ai_table_name,
-            descriptions=ai_table_description,
-            file_configs=file_specific_config,
-            column_definitions=ai_column_definitions,  # Pass the AI-generated column specs
-            # default_skip_columns from __init__ can remain as a very deep fallback if AI somehow fails for columns
-            skip_columns=self.default_skip_columns,
+            table_names=all_ai_table_names,
+            descriptions=all_ai_table_descriptions,
+            file_configs=all_ai_file_configs,
+            column_definitions=all_ai_column_definitions,
+            skip_columns=self.default_skip_columns,  # Fallback, though ideally not used if AI defines all columns
         )
 
         try:
-            # The datasource for the base standardizer is the single input_path
+            logger.info(
+                f"Executing batch standardization for {len(input_paths)} file(s)..."
+            )
             result_path = final_processor.standardize(
-                datasource=[input_path],  # Pass as a list of one
+                datasource=input_paths,  # Pass the original list of Path objects
                 output_path=output_path_obj,
                 overwrite=overwrite,
             )
             logger.info(
-                f"AI CSV Standardization complete for {input_path.name}. Output: {result_path}"
+                f"AI CSV Standardization complete for all files. Output: {result_path}"
            )
             return result_path
         except Exception as e:
             logger.exception(
-                f"Error during final standardization step using AI parameters for {input_path.name}: {e}"
+                f"Error during final batch standardization step using AI parameters: {e}"
             )
-            raise RuntimeError(
-                f"Final standardization step failed for {input_path.name}."
-            ) from e
+            raise RuntimeError("Final batch standardization step failed.") from e
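Net effect of the Python changes above: AICSVStandardizer.standardize no longer discards all but the first file of a multi-file datasource. Each CSV path gets its own encoding/delimiter guess and AI analysis, the analyses run concurrently via asyncio.gather, and the aggregated per-file parameters drive a single batch run of the base CSVStandardizer. A minimal usage sketch follows; the import path and the bare constructor call are assumptions for illustration (only the class, method, and parameter names are visible in this diff).

import asyncio

# Hypothetical import path; only the class name AICSVStandardizer appears in this diff.
from satif_ai.standardizers.ai_csv import AICSVStandardizer


async def main() -> None:
    # Assumed default-constructible for this sketch; real usage may pass hints
    # such as initial_encoding or initial_delimiter.
    standardizer = AICSVStandardizer()

    # As of 0.2.1 a list of CSV paths is accepted: every file is analyzed
    # concurrently, then standardized into one SDIF output in a single batch.
    output = await standardizer.standardize(
        datasource=["orders.csv", "customers.csv"],
        output_path="standardized.sdif",
        overwrite=True,
    )
    print(f"SDIF written to {output}")


asyncio.run(main())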