local-deep-research 0.5.7__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
  3. local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
  4. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +33 -8
  5. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
  6. local_deep_research/api/__init__.py +2 -0
  7. local_deep_research/api/research_functions.py +177 -3
  8. local_deep_research/benchmarks/graders.py +150 -5
  9. local_deep_research/benchmarks/models/__init__.py +19 -0
  10. local_deep_research/benchmarks/models/benchmark_models.py +283 -0
  11. local_deep_research/benchmarks/ui/__init__.py +1 -0
  12. local_deep_research/benchmarks/web_api/__init__.py +6 -0
  13. local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
  14. local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
  15. local_deep_research/config/llm_config.py +106 -21
  16. local_deep_research/defaults/default_settings.json +448 -3
  17. local_deep_research/error_handling/report_generator.py +10 -0
  18. local_deep_research/llm/__init__.py +19 -0
  19. local_deep_research/llm/llm_registry.py +155 -0
  20. local_deep_research/metrics/db_models.py +3 -7
  21. local_deep_research/metrics/search_tracker.py +25 -11
  22. local_deep_research/report_generator.py +3 -2
  23. local_deep_research/search_system.py +12 -9
  24. local_deep_research/utilities/log_utils.py +23 -10
  25. local_deep_research/utilities/thread_context.py +99 -0
  26. local_deep_research/web/app_factory.py +32 -8
  27. local_deep_research/web/database/benchmark_schema.py +230 -0
  28. local_deep_research/web/database/convert_research_id_to_string.py +161 -0
  29. local_deep_research/web/database/models.py +55 -1
  30. local_deep_research/web/database/schema_upgrade.py +397 -2
  31. local_deep_research/web/database/uuid_migration.py +265 -0
  32. local_deep_research/web/routes/api_routes.py +62 -31
  33. local_deep_research/web/routes/history_routes.py +13 -6
  34. local_deep_research/web/routes/metrics_routes.py +264 -4
  35. local_deep_research/web/routes/research_routes.py +45 -18
  36. local_deep_research/web/routes/route_registry.py +352 -0
  37. local_deep_research/web/routes/settings_routes.py +382 -22
  38. local_deep_research/web/services/research_service.py +22 -29
  39. local_deep_research/web/services/settings_manager.py +53 -0
  40. local_deep_research/web/services/settings_service.py +2 -0
  41. local_deep_research/web/static/css/styles.css +8 -0
  42. local_deep_research/web/static/js/components/detail.js +7 -14
  43. local_deep_research/web/static/js/components/details.js +8 -10
  44. local_deep_research/web/static/js/components/fallback/ui.js +4 -4
  45. local_deep_research/web/static/js/components/history.js +6 -6
  46. local_deep_research/web/static/js/components/logpanel.js +14 -11
  47. local_deep_research/web/static/js/components/progress.js +51 -46
  48. local_deep_research/web/static/js/components/research.js +250 -89
  49. local_deep_research/web/static/js/components/results.js +5 -7
  50. local_deep_research/web/static/js/components/settings.js +32 -26
  51. local_deep_research/web/static/js/components/settings_sync.js +24 -23
  52. local_deep_research/web/static/js/config/urls.js +285 -0
  53. local_deep_research/web/static/js/main.js +8 -8
  54. local_deep_research/web/static/js/research_form.js +267 -12
  55. local_deep_research/web/static/js/services/api.js +18 -18
  56. local_deep_research/web/static/js/services/keyboard.js +8 -8
  57. local_deep_research/web/static/js/services/socket.js +53 -35
  58. local_deep_research/web/static/js/services/ui.js +1 -1
  59. local_deep_research/web/templates/base.html +4 -1
  60. local_deep_research/web/templates/components/custom_dropdown.html +5 -3
  61. local_deep_research/web/templates/components/mobile_nav.html +3 -3
  62. local_deep_research/web/templates/components/sidebar.html +9 -3
  63. local_deep_research/web/templates/pages/benchmark.html +2697 -0
  64. local_deep_research/web/templates/pages/benchmark_results.html +1136 -0
  65. local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
  66. local_deep_research/web/templates/pages/cost_analytics.html +1 -1
  67. local_deep_research/web/templates/pages/metrics.html +212 -39
  68. local_deep_research/web/templates/pages/research.html +8 -6
  69. local_deep_research/web/templates/pages/star_reviews.html +1 -1
  70. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
  71. local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
  72. local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
  73. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
  74. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
  75. local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
  76. local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
  77. local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
  78. local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
  79. local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
  80. local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
  81. local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
  82. local_deep_research/web_search_engines/retriever_registry.py +108 -0
  83. local_deep_research/web_search_engines/search_engine_base.py +161 -43
  84. local_deep_research/web_search_engines/search_engine_factory.py +14 -0
  85. local_deep_research/web_search_engines/search_engines_config.py +20 -0
  86. local_deep_research-0.6.0.dist-info/METADATA +374 -0
  87. {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/RECORD +90 -65
  88. local_deep_research-0.5.7.dist-info/METADATA +0 -420
  89. {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/WHEEL +0 -0
  90. {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/entry_points.txt +0 -0
  91. {local_deep_research-0.5.7.dist-info → local_deep_research-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -65,15 +65,16 @@ def get_evaluation_llm(custom_config: Optional[Dict[str, Any]] = None):
65
65
 
66
66
  # Check if we're using openai_endpoint but don't have an API key configured
67
67
  if filtered_config.get("provider") == "openai_endpoint":
68
- # Try to get API key from environment or config
69
- import os
68
+ # Try to get API key from database settings first, then environment
69
+ from ..utilities.db_utils import get_db_setting
70
+
71
+ api_key = get_db_setting("llm.openai_endpoint.api_key")
70
72
 
71
- api_key = os.getenv("OPENAI_ENDPOINT_API_KEY")
72
73
  if not api_key:
73
74
  logger.warning(
74
75
  "Using openai_endpoint provider but no API key found. "
75
- "Set the OPENAI_ENDPOINT_API_KEY environment variable or "
76
- "specify api_key in the evaluation_config."
76
+ "Set the llm.openai_endpoint.api_key setting in the database or "
77
+ "LDR_LLM_OPENAI_ENDPOINT_API_KEY environment variable."
77
78
  )
78
79
  # Try to fall back to LDR's config if API key not explicitly provided
79
80
  # The get_llm function will handle this case
@@ -117,6 +118,150 @@ def extract_answer_from_response(
117
118
  }
118
119
 
119
120
 
121
def _extract_grading_fields(
    grading_response: str, dataset_type: str
) -> Dict[str, Any]:
    """Parse the grader LLM's free-form text response into structured fields.

    The BrowseComp and SimpleQA grader templates use different field labels
    (lowercase ``extracted_final_answer:``/``correct:`` vs. capitalized
    ``Extracted Answer:``/``Correct:``), but the extraction logic is
    otherwise identical, so the label-specific patterns are selected up
    front and the matching is shared.

    Args:
        grading_response: Raw text returned by the evaluation LLM.
        dataset_type: "browsecomp" selects the BrowseComp labels; anything
            else falls back to the SimpleQA labels.

    Returns:
        Dict with keys: extracted_by_grader, reasoning, is_correct,
        graded_confidence.
    """
    is_browsecomp = dataset_type.lower() == "browsecomp"
    if is_browsecomp:
        answer_pattern = r"extracted_final_answer:\s*(.*?)(?:\n|$)"
        reasoning_pattern = r"reasoning:\s*(.*?)(?:\n\n|\ncorrect:|\Z)"
        correct_pattern = r"correct:\s*(yes|no)"
    else:
        answer_pattern = r"Extracted Answer:\s*(.*?)(?:\n|$)"
        reasoning_pattern = r"Reasoning:\s*(.*?)(?:\nCorrect:|\Z)"
        correct_pattern = r"Correct:\s*(yes|no)"

    answer_match = re.search(answer_pattern, grading_response)
    extracted_answer = (
        answer_match.group(1).strip() if answer_match else "None"
    )

    reasoning_match = re.search(
        reasoning_pattern, grading_response, re.DOTALL
    )
    reasoning = reasoning_match.group(1).strip() if reasoning_match else ""

    correct_match = re.search(
        correct_pattern, grading_response, re.IGNORECASE
    )
    is_correct = (
        correct_match.group(1).lower() == "yes" if correct_match else False
    )

    if is_browsecomp:
        confidence_match = re.search(r"confidence:\s*(\d+)", grading_response)
        confidence = confidence_match.group(1) if confidence_match else "100"
    else:
        confidence = "100"  # SimpleQA graders do not report confidence

    return {
        "extracted_by_grader": extracted_answer,
        "reasoning": reasoning,
        "is_correct": is_correct,
        "graded_confidence": confidence,
    }


def grade_single_result(
    result_data: Dict[str, Any],
    dataset_type: str = "simpleqa",
    evaluation_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Grade a single benchmark result using LLM.

    Args:
        result_data: Dictionary containing result data with keys: id, problem, correct_answer, response, extracted_answer
        dataset_type: Type of dataset
        evaluation_config: Optional custom config for evaluation LLM

    Returns:
        Dictionary with grading results: extracted_by_grader, reasoning,
        is_correct, graded_confidence, grader_response. On failure, a
        dictionary with a "grading_error" key and is_correct=False.
    """
    # Get evaluation LLM
    evaluation_llm = get_evaluation_llm(evaluation_config)

    # Select appropriate template
    template = (
        BROWSECOMP_GRADER_TEMPLATE
        if dataset_type.lower() == "browsecomp"
        else SIMPLEQA_GRADER_TEMPLATE
    )

    question = result_data.get("problem", "")
    correct_answer = result_data.get("correct_answer", "")
    response = result_data.get("response", "")

    logger.info(f"Grading single result: {question[:50]}...")

    # Format grading prompt
    grading_prompt = template.format(
        question=question, correct_answer=correct_answer, response=response
    )

    try:
        # Grade using LLM — try the LangChain-style .invoke() interface
        # first, then fall back to a plain callable.
        if hasattr(evaluation_llm, "invoke") and callable(
            evaluation_llm.invoke
        ):
            if hasattr(evaluation_llm, "chat_messages"):
                # Handle ChatOpenAI and similar models that use messages
                grading_response = evaluation_llm.invoke(
                    [HumanMessage(content=grading_prompt)]
                ).content
            else:
                # Handle other LLM types
                grading_response = evaluation_llm.invoke(grading_prompt)
                if hasattr(grading_response, "content"):
                    grading_response = grading_response.content
        else:
            # Fallback for other LLM interfaces
            grading_response = str(evaluation_llm(grading_prompt))

        # Extract grading information (shared regex logic, per-dataset labels)
        graded_result = _extract_grading_fields(grading_response, dataset_type)
        graded_result["grader_response"] = grading_response
        return graded_result

    except Exception as e:
        # logger.exception captures the traceback, unlike logger.error(str(e))
        logger.exception("Error grading single result")
        return {
            "grading_error": str(e),
            "is_correct": False,
            "graded_confidence": "0",
            "grader_response": f"Grading failed: {str(e)}",
        }
+
264
+
120
265
  def grade_results(
121
266
  results_file: str,
122
267
  output_file: str,
@@ -0,0 +1,19 @@
1
+ """Benchmark database models for ORM."""
2
+
3
+ from .benchmark_models import (
4
+ BenchmarkConfig,
5
+ BenchmarkProgress,
6
+ BenchmarkResult,
7
+ BenchmarkRun,
8
+ BenchmarkStatus,
9
+ DatasetType,
10
+ )
11
+
12
+ __all__ = [
13
+ "BenchmarkRun",
14
+ "BenchmarkResult",
15
+ "BenchmarkConfig",
16
+ "BenchmarkProgress",
17
+ "BenchmarkStatus",
18
+ "DatasetType",
19
+ ]
@@ -0,0 +1,283 @@
1
+ """Database models for benchmark system."""
2
+
3
+ import enum
4
+
5
+ from sqlalchemy import (
6
+ JSON,
7
+ Boolean,
8
+ Column,
9
+ DateTime,
10
+ Enum,
11
+ Float,
12
+ ForeignKey,
13
+ Integer,
14
+ String,
15
+ Text,
16
+ UniqueConstraint,
17
+ Index,
18
+ )
19
+ from sqlalchemy.ext.declarative import declarative_base
20
+ from sqlalchemy.orm import relationship
21
+ from sqlalchemy.sql import func
22
+
23
# Use the same base as the main app so these tables register in the web
# app's shared declarative metadata.
try:
    from ...web.database.models import Base
except ImportError:
    # Fallback for different import contexts (e.g. the benchmarks package
    # imported without the web app on the path).
    # NOTE(review): a standalone Base has its own MetaData, so tables
    # defined here would not be created alongside the app's tables in that
    # case — confirm this fallback is only hit in tests/tools.
    from sqlalchemy.ext.declarative import declarative_base

    Base = declarative_base()
31
+
32
+
33
class BenchmarkStatus(enum.Enum):
    """Status of a benchmark run.

    Persisted via SQLAlchemy's ``Enum`` type in ``benchmark_runs.status``
    and ``BenchmarkProgress``/queries that filter on run state.
    """

    PENDING = "pending"  # created, not yet started
    IN_PROGRESS = "in_progress"  # currently processing examples
    COMPLETED = "completed"  # run finished
    FAILED = "failed"  # run aborted with an error
    CANCELLED = "cancelled"  # run was cancelled
    PAUSED = "paused"  # run suspended; presumably resumable — confirm
+
43
+
44
class DatasetType(enum.Enum):
    """Supported dataset types.

    Matches the ``dataset_type`` strings used by the graders
    ("simpleqa", "browsecomp") plus a custom/user-supplied option.
    """

    SIMPLEQA = "simpleqa"  # SimpleQA-style Q/A benchmark
    BROWSECOMP = "browsecomp"  # BrowseComp-style benchmark
    CUSTOM = "custom"  # user-provided dataset
+
51
+
52
class BenchmarkRun(Base):
    """Main benchmark run metadata.

    One row per benchmark execution. Per-question outcomes hang off it via
    ``BenchmarkResult`` and live progress snapshots via
    ``BenchmarkProgress`` (both cascade-deleted with the run).
    """

    __tablename__ = "benchmark_runs"

    id = Column(Integer, primary_key=True, index=True)

    # Run identification
    run_name = Column(String(255), nullable=True)  # User-friendly name
    config_hash = Column(
        String(16), nullable=False, index=True
    )  # For compatibility matching
    query_hash_list = Column(
        JSON, nullable=False
    )  # List of query hashes to avoid duplication

    # Configuration (stored as JSON blobs, not normalized)
    search_config = Column(
        JSON, nullable=False
    )  # Complete search configuration
    evaluation_config = Column(JSON, nullable=False)  # Evaluation settings
    datasets_config = Column(
        JSON, nullable=False
    )  # Dataset selection and quantities

    # Status and timing (timestamps are set server-side via func.now())
    status = Column(
        Enum(BenchmarkStatus), default=BenchmarkStatus.PENDING, nullable=False
    )
    created_at = Column(DateTime, server_default=func.now(), nullable=False)
    updated_at = Column(
        DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
    )
    start_time = Column(DateTime, nullable=True)
    end_time = Column(DateTime, nullable=True)

    # Progress tracking
    total_examples = Column(Integer, default=0, nullable=False)
    completed_examples = Column(Integer, default=0, nullable=False)
    failed_examples = Column(Integer, default=0, nullable=False)

    # Results summary
    overall_accuracy = Column(Float, nullable=True)
    processing_rate = Column(Float, nullable=True)  # Examples per minute

    # Error handling
    error_message = Column(Text, nullable=True)

    # Relationships. lazy="dynamic" yields a query object instead of a
    # loaded list, so large result sets can be filtered/paginated.
    results = relationship(
        "BenchmarkResult",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )
    progress_updates = relationship(
        "BenchmarkProgress",
        back_populates="benchmark_run",
        cascade="all, delete-orphan",
        lazy="dynamic",
    )

    # Indexes for performance; extend_existing tolerates the table being
    # registered more than once (e.g. repeated module import).
    __table_args__ = (
        Index("idx_benchmark_runs_config_hash", "config_hash"),
        Index("idx_benchmark_runs_status_created", "status", "created_at"),
        {"extend_existing": True},
    )
120
+
121
+
122
class BenchmarkResult(Base):
    """Individual benchmark result for a single question.

    Deduplicated per run via the (benchmark_run_id, query_hash) unique
    constraint; deleted with its parent run (ON DELETE CASCADE).
    """

    __tablename__ = "benchmark_results"

    id = Column(Integer, primary_key=True, index=True)

    # Foreign key to the owning run
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Question identification
    example_id = Column(String(255), nullable=False)  # Original dataset ID
    query_hash = Column(
        String(32), nullable=False, index=True
    )  # For deduplication
    dataset_type = Column(Enum(DatasetType), nullable=False)
    research_id = Column(
        String(36), nullable=True, index=True
    )  # UUID string or converted integer

    # Question and answer (ground truth)
    question = Column(Text, nullable=False)
    correct_answer = Column(Text, nullable=False)

    # Research results (nullable: filled in as the run progresses)
    response = Column(Text, nullable=True)
    extracted_answer = Column(Text, nullable=True)
    confidence = Column(String(10), nullable=True)
    processing_time = Column(Float, nullable=True)
    sources = Column(JSON, nullable=True)

    # Evaluation results (filled in by the grader)
    is_correct = Column(Boolean, nullable=True)
    graded_confidence = Column(String(10), nullable=True)
    grader_response = Column(Text, nullable=True)

    # Timestamps
    created_at = Column(DateTime, server_default=func.now(), nullable=False)
    completed_at = Column(DateTime, nullable=True)

    # Error handling (research vs. grading failures tracked separately)
    research_error = Column(Text, nullable=True)
    evaluation_error = Column(Text, nullable=True)

    # Additional metadata
    task_index = Column(Integer, nullable=True)  # Order in processing
    result_metadata = Column(JSON, nullable=True)  # Additional data

    # Relationships
    benchmark_run = relationship("BenchmarkRun", back_populates="results")

    # Indexes for performance plus the per-run dedup constraint
    __table_args__ = (
        Index(
            "idx_benchmark_results_run_dataset",
            "benchmark_run_id",
            "dataset_type",
        ),
        Index("idx_benchmark_results_query_hash", "query_hash"),
        Index("idx_benchmark_results_completed", "completed_at"),
        UniqueConstraint(
            "benchmark_run_id", "query_hash", name="uix_run_query"
        ),
        {"extend_existing": True},
    )
192
+
193
+
194
class BenchmarkConfig(Base):
    """Saved benchmark configurations for reuse.

    Standalone presets (no FK to runs); linked to runs only loosely via
    the shared ``config_hash``.
    """

    __tablename__ = "benchmark_configs"

    id = Column(Integer, primary_key=True, index=True)

    # Configuration details
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    config_hash = Column(String(16), nullable=False, index=True)

    # Configuration data (same JSON shapes as BenchmarkRun)
    search_config = Column(JSON, nullable=False)
    evaluation_config = Column(JSON, nullable=False)
    datasets_config = Column(JSON, nullable=False)

    # Metadata
    created_at = Column(DateTime, server_default=func.now(), nullable=False)
    updated_at = Column(
        DateTime, server_default=func.now(), onupdate=func.now(), nullable=False
    )
    is_default = Column(Boolean, default=False, nullable=False)
    is_public = Column(Boolean, default=True, nullable=False)

    # Usage tracking
    usage_count = Column(Integer, default=0, nullable=False)
    last_used = Column(DateTime, nullable=True)

    # Performance data (if available)
    best_accuracy = Column(Float, nullable=True)
    avg_processing_rate = Column(Float, nullable=True)

    # Indexes
    __table_args__ = (
        Index("idx_benchmark_configs_name", "name"),
        Index("idx_benchmark_configs_hash", "config_hash"),
        Index("idx_benchmark_configs_default", "is_default"),
        {"extend_existing": True},
    )
234
+
235
+
236
class BenchmarkProgress(Base):
    """Real-time progress tracking for benchmark runs.

    Append-only snapshots; each row is one progress update for a run,
    deleted with its parent run (ON DELETE CASCADE).
    """

    __tablename__ = "benchmark_progress"

    id = Column(Integer, primary_key=True, index=True)

    # Foreign key to the owning run
    benchmark_run_id = Column(
        Integer,
        ForeignKey("benchmark_runs.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Progress data (timestamp set server-side)
    timestamp = Column(DateTime, server_default=func.now(), nullable=False)
    completed_examples = Column(Integer, nullable=False)
    total_examples = Column(Integer, nullable=False)

    # Accuracy tracking
    overall_accuracy = Column(Float, nullable=True)
    dataset_accuracies = Column(JSON, nullable=True)  # Per-dataset accuracy

    # Performance metrics
    processing_rate = Column(Float, nullable=True)  # Examples per minute
    estimated_completion = Column(DateTime, nullable=True)

    # Current status
    current_dataset = Column(Enum(DatasetType), nullable=True)
    current_example_id = Column(String(255), nullable=True)

    # Additional metrics
    memory_usage = Column(Float, nullable=True)  # MB
    cpu_usage = Column(Float, nullable=True)  # Percentage

    # Relationships
    benchmark_run = relationship(
        "BenchmarkRun", back_populates="progress_updates"
    )

    # Indexes for real-time queries (latest snapshot per run)
    __table_args__ = (
        Index(
            "idx_benchmark_progress_run_time", "benchmark_run_id", "timestamp"
        ),
        {"extend_existing": True},
    )
@@ -0,0 +1 @@
1
+ """Benchmark UI components package."""
@@ -0,0 +1,6 @@
1
+ """Benchmark web API package."""
2
+
3
+ from .benchmark_service import BenchmarkService
4
+ from .benchmark_routes import benchmark_bp
5
+
6
+ __all__ = ["BenchmarkService", "benchmark_bp"]