satif-ai 0.1.2__tar.gz → 0.2.1__tar.gz
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their public registry.
- {satif_ai-0.1.2 → satif_ai-0.2.1}/PKG-INFO +1 -1
- {satif_ai-0.1.2 → satif_ai-0.2.1}/pyproject.toml +1 -1
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/standardizers/ai_csv.py +120 -141
- {satif_ai-0.1.2 → satif_ai-0.2.1}/LICENSE +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/README.md +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/__init__.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/adapters/__init__.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/adapters/tidy.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/code_builders/__init__.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/code_builders/adaptation.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/code_builders/transformation.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/plot_builders/__init__.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/plot_builders/agent.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/plot_builders/prompt.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/plot_builders/tool.py +0 -0
- {satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/standardizers/__init__.py +0 -0
{satif_ai-0.1.2 → satif_ai-0.2.1}/satif_ai/standardizers/ai_csv.py

@@ -1,10 +1,11 @@
+import asyncio
 import contextvars
 import csv
 import json
 import logging
 import re
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import clevercsv
 from agents import Agent, Runner, function_tool
@@ -358,6 +359,51 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStandardizer
         self._initial_encoding_hint = initial_encoding
         # self.generate_description from prompt structure (table_description, column descriptions)
 
+    async def _get_initial_guesses(self, file_path: Path) -> Tuple[str, str]:
+        """Helper to get initial encoding and delimiter guesses for a single file."""
+        encoding_guess = self._initial_encoding_hint
+        if not encoding_guess:
+            try:
+                with open(file_path, "rb") as fb_enc:
+                    enc_sample = fb_enc.read(ENCODING_SAMPLE_SIZE)
+                    detected_enc_info = detect(enc_sample) if enc_sample else None
+                    encoding_guess = (
+                        detected_enc_info["encoding"]
+                        if detected_enc_info and detected_enc_info["encoding"]
+                        else "utf-8"
+                    )
+            except Exception as e:
+                logger.warning(
+                    f"Initial encoding detection for {file_path.name} failed: {e}. Using utf-8."
+                )
+                encoding_guess = "utf-8"
+
+        delimiter_guess = self._initial_delimiter_hint
+        if not delimiter_guess:
+            try:
+                with open(
+                    file_path, encoding=encoding_guess, errors="ignore"
+                ) as f_delim_sample:
+                    delim_sample_text = f_delim_sample.read(DELIMITER_SAMPLE_SIZE)
+                    if delim_sample_text:
+                        sniffer = clevercsv.Sniffer()
+                        dialect = sniffer.sniff(delim_sample_text)
+                        delimiter_guess = (
+                            dialect.delimiter if dialect and dialect.delimiter else ","
+                        )
+                    else:
+                        delimiter_guess = ","
+            except Exception as e:
+                logger.warning(
+                    f"Initial delimiter detection for {file_path.name} failed ({e}). Using ','."
+                )
+                delimiter_guess = ","
+
+        logger.info(
+            f"Initial guesses for {file_path.name} - Encoding: {encoding_guess}, Delimiter: '{delimiter_guess}'"
+        )
+        return encoding_guess, delimiter_guess
+
     async def _run_analysis_agent(
         self,
         file_path: Path,
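Note on the new helper: it layers two independent guesses, a byte-level encoding sniff (via `detect`, whose import sits outside this hunk) followed by a delimiter sniff on the decoded text via `clevercsv`. A minimal standalone sketch of the same two-stage pattern, assuming `detect` comes from `charset-normalizer` (chardet's `detect` has the same shape) and using hypothetical sample sizes in place of the module's `ENCODING_SAMPLE_SIZE`/`DELIMITER_SAMPLE_SIZE` constants:

```python
from pathlib import Path
from typing import Tuple

import clevercsv
from charset_normalizer import detect  # assumption: the module may import chardet's detect instead

ENCODING_SAMPLE_SIZE = 64 * 1024   # hypothetical sizes; the real constants
DELIMITER_SAMPLE_SIZE = 16 * 1024  # are defined elsewhere in ai_csv.py

def guess_encoding_and_delimiter(path: Path) -> Tuple[str, str]:
    # Stage 1: guess the encoding from a raw byte sample.
    raw = path.read_bytes()[:ENCODING_SAMPLE_SIZE]
    info = detect(raw) if raw else None
    encoding = (info or {}).get("encoding") or "utf-8"

    # Stage 2: sniff the delimiter from a decoded text sample.
    text = path.read_text(encoding=encoding, errors="ignore")[:DELIMITER_SAMPLE_SIZE]
    try:
        dialect = clevercsv.Sniffer().sniff(text)
        delimiter = (dialect.delimiter if dialect else None) or ","
    except Exception:
        delimiter = ","
    return encoding, delimiter

print(guess_encoding_and_delimiter(Path("sample.csv")))  # hypothetical file
```

Decoding with `errors="ignore"` before sniffing mirrors the helper above: a wrong encoding guess degrades the delimiter sniff instead of raising.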
@@ -469,24 +515,21 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStandardizer
     async def standardize(
         self,
         datasource: Datasource,
-        output_path: SDIFPath,
+        output_path: SDIFPath,
         *,
         overwrite: bool = False,
         config: Optional[Dict[str, Any]] = None,
         **kwargs,
     ) -> Path:
         output_path_obj = Path(output_path)
+
+        input_paths: List[Path]
         if isinstance(datasource, (str, Path)):
             input_paths = [Path(datasource)]
         elif isinstance(datasource, list) and all(
             isinstance(p, (str, Path)) for p in datasource
         ):
             input_paths = [Path(p) for p in datasource]
-            if len(input_paths) > 1:
-                logger.warning(
-                    "AICSVStandardizer currently processes one CSV file at a time for detailed AI analysis. Using the first file only."
-                )
-                input_paths = [input_paths[0]]  # Process one file if multiple given
         else:
             raise TypeError(
                 "datasource must be a file path string/Path object or a list of such paths."
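The deleted guard is the behavioral headline of this release: `standardize` no longer truncates a multi-file datasource to its first entry. A usage sketch, assuming the constructor's arguments are all optional; the file names are hypothetical:

```python
import asyncio
from pathlib import Path

from satif_ai.standardizers.ai_csv import AICSVStandardizer

async def main() -> None:
    standardizer = AICSVStandardizer()  # encoding/delimiter hints are optional
    # As of 0.2.1, every path in the list is analyzed and standardized.
    out = await standardizer.standardize(
        datasource=[Path("orders.csv"), Path("customers.csv")],  # hypothetical files
        output_path=Path("output.sdif"),
        overwrite=True,
    )
    print(f"SDIF database written to {out}")

asyncio.run(main())
```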
@@ -495,168 +538,104 @@ class AICSVStandardizer(CSVStandardizer):  # Inherits from the enhanced CSVStandardizer
         if not input_paths:
             raise ValueError("No input datasource provided.")
 
-
-
-
-
-
-                raise FileNotFoundError(
-                    f"Input CSV file not found or is not a file: {input_path}"
-                )
-
-        logger.info(f"--- AI Analysis for file: {input_path.name} ---")
-
-        # 1. Initial Guesses for AI
-        initial_encoding_guess = self._initial_encoding_hint
-        if not initial_encoding_guess:
-            try:
-                # Use base class's _detect_encoding, need an instance or make it static/helper
-                # For simplicity, re-implement or call a static version if available.
-                # Here, we simulate it for now or assume base standardizer's helper is callable.
-                with open(input_path, "rb") as fb_enc:
-                    enc_sample = fb_enc.read(ENCODING_SAMPLE_SIZE)
-                    detected_enc_info = detect(enc_sample) if enc_sample else None
-                    initial_encoding_guess = (
-                        detected_enc_info["encoding"]
-                        if detected_enc_info and detected_enc_info["encoding"]
-                        else "utf-8"
+        ai_analysis_tasks = []
+        for input_file_path in input_paths:
+            if not input_file_path.exists() or not input_file_path.is_file():
+                raise FileNotFoundError(
+                    f"Input CSV file not found or is not a file: {input_file_path}"
                 )
+
+            # Create a task for each file's analysis
+            # Need to wrap _get_initial_guesses and _run_analysis_agent in a single async co-routine for gather
+            async def analyze_file_task(file_path_for_task: Path):
                 logger.info(
-                    f"
+                    f"--- Starting AI Analysis for file: {file_path_for_task.name} ---"
                 )
-
-
-                    f"Initial encoding detection failed: {e}. Using utf-8 as fallback guess."
+                enc_guess, delim_guess = await self._get_initial_guesses(
+                    file_path_for_task
                 )
-
-
-            logger.info(
-                f"Using provided initial encoding hint: {initial_encoding_guess}"
-            )
-
-        initial_delimiter_guess = self._initial_delimiter_hint
-        if not initial_delimiter_guess:
-            try:
-                with open(
-                    input_path, encoding=initial_encoding_guess, errors="ignore"
-                ) as f_delim_sample:
-                    delim_sample_text = f_delim_sample.read(DELIMITER_SAMPLE_SIZE)
-                    if delim_sample_text:
-                        # Simulate base class's _detect_delimiter
-                        sniffer = clevercsv.Sniffer()
-                        dialect = sniffer.sniff(delim_sample_text)
-                        initial_delimiter_guess = dialect.delimiter if dialect else ","
-                        logger.info(
-                            f"Initial delimiter guess (detected): '{initial_delimiter_guess}'"
-                        )
-                    else:
-                        initial_delimiter_guess = ","  # Fallback
-                        logger.warning(
-                            f"File empty/small, defaulting delimiter guess to ',' for {input_path.name}"
-                        )
-            except Exception as e:
-                logger.warning(
-                    f"Initial delimiter detection failed ({e}). Using ',' as fallback guess for {input_path.name}."
+                return await self._run_analysis_agent(
+                    file_path_for_task, enc_guess, delim_guess
                 )
-                initial_delimiter_guess = ","
-        else:
-            logger.info(
-                f"Using provided initial delimiter hint: '{initial_delimiter_guess}'"
-            )
 
-
+            ai_analysis_tasks.append(
+                analyze_file_task(input_file_path)
+            )  # Pass the path to the task
+
+        logger.info(f"Starting AI analysis for {len(ai_analysis_tasks)} CSV file(s)...")
         try:
-
-                input_path,
-                initial_encoding_guess,
-                initial_delimiter_guess,
-            )
+            all_ai_params_results = await asyncio.gather(*ai_analysis_tasks)
         except Exception as e:
-            logger.exception(
-
+            logger.exception(f"Critical error during concurrent AI analysis phase: {e}")
+            raise RuntimeError("AI analysis phase failed.") from e
+
+        logger.info(
+            f"AI analysis complete for all {len(all_ai_params_results)} file(s)."
+        )
+
+        # Aggregate parameters for the base CSVStandardizer
+        all_ai_table_names: List[str] = []
+        all_ai_table_descriptions: List[Optional[str]] = []
+        all_ai_file_configs: List[Dict[str, Any]] = []
+        all_ai_column_definitions: List[
+            List[Dict[str, Any]]
+        ] = []  # List of lists of col_specs
+
+        for i, ai_params in enumerate(all_ai_params_results):
+            current_file_path = input_paths[i]  # Get corresponding input path
+            logger.info(f"Aggregating AI parameters for: {current_file_path.name}")
+            logger.info(f"  AI Table Name: {ai_params['table_name']}")
+            logger.info(f"  AI Encoding: {ai_params['encoding']}")
+            logger.info(f"  AI Delimiter: '{ai_params['delimiter']}'")
+            logger.info(f"  AI Has Header: {ai_params['has_header']}")
+            logger.info(f"  AI Skip Rows: {ai_params['skip_rows']}")
+            logger.info(
+                f"  AI Table Description: {ai_params.get('table_description') if ai_params.get('table_description') is not None else 'N/A'}"
             )
-
-
-
-
-
-
-            # Base class expects: List[Optional[Dict[str, List[Dict[str, Any]]]]]
-            # For a single file, it's List containing one Dict: [{table_name: [col_specs...]}]
-            # Or, if base class is adapted, List containing one List: [[col_specs...]]
-
-        # The AI output `ai_params["columns"]` is already in the format:
-        # [{"identifier_in_csv": ..., "final_column_name": ..., "description": ...}, ...]
-        # This is exactly what the enhanced CSVStandardizer's `_setup_columns` expects for `defined_columns_spec`
-        # when `column_definitions` is a list containing this list of specs.
-
-        ai_column_definitions = [
-            ai_params["columns"]
-        ]  # Wrap the list of col specs for the single file/table
-
-        # The base CSVStandardizer will use its own _sanitize_name for the table name from AI.
-        # We provide it via table_names list.
-        ai_table_name = [ai_params["table_name"]]
-        ai_table_description = [
-            ai_params.get("table_description")
-        ]  # List of one description
-
-        # File-specific config for the base standardizer
-        # For a single file, this will be a list containing one dictionary.
-        file_specific_config = [
-            {
+            # logger.info(f"  AI Column Definitions ({len(ai_params['columns'])} cols): {ai_params['columns'][:2]}...")  # Log a sample
+
+            all_ai_table_names.append(ai_params["table_name"])
+            all_ai_table_descriptions.append(ai_params.get("table_description"))
+
+            file_conf = {
                 "encoding": ai_params["encoding"],
                 "delimiter": ai_params["delimiter"],
                 "has_header": ai_params["has_header"],
                 "skip_rows": ai_params["skip_rows"],
-                #
-                # as column selection is implicit in the provided definitions.
-                "skip_columns": None,  # Explicitly set to None
+                "skip_columns": None,  # Column selection is handled by column_definitions
             }
-
+            all_ai_file_configs.append(file_conf)
+            all_ai_column_definitions.append(
+                ai_params["columns"]
+            )  # This is List[Dict], so we append it directly
 
-
-        logger.info(f"  Table Name: {ai_table_name[0]}")
-        logger.info(f"  Encoding: {file_specific_config[0]['encoding']}")
-        logger.info(f"  Delimiter: '{file_specific_config[0]['delimiter']}'")
-        logger.info(f"  Has Header: {file_specific_config[0]['has_header']}")
-        logger.info(f"  Skip Rows: {file_specific_config[0]['skip_rows']}")
+        # Instantiate the base CSVStandardizer with aggregated AI-derived parameters
         logger.info(
-
+            "Initializing final CSVStandardizer with aggregated AI parameters..."
         )
-        logger.info(f"  Column Definitions ({len(ai_column_definitions[0])} cols):")
-        for i, c_def in enumerate(ai_column_definitions[0]):
-            logger.info(
-                f"    {i + 1}. ID in CSV: '{c_def['identifier_in_csv']}', Final Name: '{c_def['final_column_name']}', Desc: '{c_def.get('description', 'N/A')}'"
-            )
-
-        # 4. Call Base Class Standardizer Logic with AI-derived parameters
-        # We instantiate a new CSVStandardizer configured by the AI for this specific file.
         final_processor = CSVStandardizer(
-
-
-
-
-
-            # default_skip_columns from __init__ can remain as a very deep fallback if AI somehow fails for columns
-            skip_columns=self.default_skip_columns,
+            table_names=all_ai_table_names,
+            descriptions=all_ai_table_descriptions,
+            file_configs=all_ai_file_configs,
+            column_definitions=all_ai_column_definitions,
+            skip_columns=self.default_skip_columns,  # Fallback, though ideally not used if AI defines all columns
         )
 
         try:
-
+            logger.info(
+                f"Executing batch standardization for {len(input_paths)} file(s)..."
+            )
             result_path = final_processor.standardize(
-                datasource=
+                datasource=input_paths,  # Pass the original list of Path objects
                 output_path=output_path_obj,
                 overwrite=overwrite,
             )
             logger.info(
-                f"AI CSV Standardization complete for
+                f"AI CSV Standardization complete for all files. Output: {result_path}"
             )
             return result_path
         except Exception as e:
             logger.exception(
-                f"Error during final standardization step using AI parameters
+                f"Error during final batch standardization step using AI parameters: {e}"
             )
-            raise RuntimeError(
-                f"Final standardization step failed for {input_path.name}."
-            ) from e
+            raise RuntimeError("Final batch standardization step failed.") from e
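The aggregation loop above depends on an ordering guarantee: `asyncio.gather` returns results in the order its awaitables were passed, so `all_ai_params_results[i]` always corresponds to `input_paths[i]` even though the per-file analyses run concurrently. Note also that the diff defines `analyze_file_task` inside the loop and calls it immediately with `input_file_path` rather than closing over the loop variable, which sidesteps Python's late-binding closure pitfall. A self-contained sketch of that fan-out/aggregate shape, with a stub in place of the real `_get_initial_guesses` + `_run_analysis_agent` pipeline:

```python
import asyncio
from pathlib import Path
from typing import Any, Dict, List

async def analyze_file(path: Path) -> Dict[str, Any]:
    # Stub for the per-file AI analysis (initial guesses + agent run).
    await asyncio.sleep(0.1)  # simulate I/O-bound agent work
    return {"table_name": path.stem, "delimiter": ","}

async def main() -> None:
    input_paths = [Path("a.csv"), Path("b.csv"), Path("c.csv")]  # hypothetical
    tasks = [analyze_file(p) for p in input_paths]
    # gather preserves argument order, so results line up with input_paths.
    results: List[Dict[str, Any]] = await asyncio.gather(*tasks)
    for path, params in zip(input_paths, results):
        print(f"{path.name}: table={params['table_name']}")

asyncio.run(main())
```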