local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
- local_deep_research/__init__.py +1 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +15 -10
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +58 -73
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +30 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
- local_deep_research/app.py +0 -8
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
local_deep_research/benchmarks/runners.py (new file)

@@ -0,0 +1,434 @@
+"""
+Benchmark runners for Local Deep Research.
+
+This module provides the main functions for running benchmarks using LDR.
+"""
+
+import json
+import logging
+import os
+import time
+from typing import Any, Callable, Dict, Optional
+
+from ..api import quick_summary
+from .datasets import DEFAULT_DATASET_URLS, load_dataset
+from .datasets.base import DatasetRegistry
+from .graders import extract_answer_from_response, grade_results
+from .metrics import calculate_metrics, generate_report
+from .templates import BROWSECOMP_QUERY_TEMPLATE
+
+logger = logging.getLogger(__name__)
+
+
+def format_query(question: str, dataset_type: str = "simpleqa") -> str:
+    """
+    Format query based on dataset type.
+
+    Args:
+        question: Original question
+        dataset_type: Type of dataset
+
+    Returns:
+        Formatted query for LDR
+    """
+    if dataset_type.lower() == "browsecomp":
+        # BrowseComp requires specific formatting
+        return BROWSECOMP_QUERY_TEMPLATE.format(question=question)
+
+    # Simple format for SimpleQA
+    return question
+
+
+def run_benchmark(
+    dataset_type: str,
+    dataset_path: Optional[str] = None,
+    num_examples: Optional[int] = None,
+    output_dir: str = "benchmark_results",
+    run_evaluation: bool = True,
+    evaluation_config: Optional[Dict[str, Any]] = None,
+    search_config: Optional[Dict[str, Any]] = None,
+    human_evaluation: bool = False,
+    progress_callback: Optional[Callable[[str, int, Dict], None]] = None,
+    seed: int = 42,
+) -> Dict[str, Any]:
+    """
+    Run a benchmark on the specified dataset.
+
+    Args:
+        dataset_type: Type of dataset ("simpleqa" or "browsecomp")
+        dataset_path: Optional custom dataset path
+        num_examples: Number of examples to use
+        output_dir: Directory to save results
+        run_evaluation: Whether to evaluate results
+        evaluation_config: Custom LLM config for evaluation
+        search_config: Custom search parameters
+        human_evaluation: Whether to use human evaluation
+        progress_callback: Optional callback for progress updates
+        seed: Random seed for reproducibility
+
+    Returns:
+        Dictionary with benchmark results and metrics
+    """
+    # Ensure output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Default search configuration
+    if not search_config:
+        search_config = {
+            "iterations": 3,
+            "questions_per_iteration": 3,
+            "search_tool": "searxng",
+        }
+
+    # Load dataset using the class-based approach
+    try:
+        # Create the dataset instance from registry
+        dataset_instance = DatasetRegistry.create_dataset(
+            dataset_id=dataset_type.lower(),
+            dataset_path=dataset_path,
+            num_examples=num_examples,
+            seed=seed,
+        )
+        # Load the examples
+        dataset = dataset_instance.load()
+
+        logger.info(f"Loaded {len(dataset)} examples using dataset class {type(dataset_instance).__name__}")
+    except Exception as e:
+        # Fallback to legacy function if there's any issue
+        logger.warning(f"Error using dataset class: {e}. Falling back to legacy function.")
+        dataset = load_dataset(
+            dataset_type=dataset_type,
+            dataset_path=dataset_path,
+            num_examples=num_examples,
+            seed=seed,
+        )
+
+    # Set up output files
+    timestamp = time.strftime("%Y%m%d_%H%M%S")
+    results_file = os.path.join(output_dir, f"{dataset_type}_{timestamp}_results.jsonl")
+    evaluation_file = os.path.join(
+        output_dir, f"{dataset_type}_{timestamp}_evaluation.jsonl"
+    )
+    report_file = os.path.join(output_dir, f"{dataset_type}_{timestamp}_report.md")
+
+    # Make sure output files don't exist
+    for file in [results_file, evaluation_file, report_file]:
+        if os.path.exists(file):
+            os.remove(file)
+
+    # Progress tracking
+    total_examples = len(dataset)
+
+    if progress_callback:
+        progress_callback(
+            "Starting benchmark",
+            0,
+            {
+                "status": "started",
+                "dataset_type": dataset_type,
+                "total_examples": total_examples,
+            },
+        )
+
+    # Process each example
+    results = []
+
+    for i, example in enumerate(dataset):
+        # Extract question and answer in a way that uses the dataset class when available
+        if 'dataset_instance' in locals() and isinstance(dataset_instance, DatasetRegistry.get_dataset_class(dataset_type.lower())):
+            # Use the dataset class methods to extract question and answer
+            question = dataset_instance.get_question(example)
+            correct_answer = dataset_instance.get_answer(example)
+            logger.debug(f"Using dataset class methods to extract question and answer")
+        else:
+            # Fallback to the legacy approach
+            if dataset_type.lower() == "simpleqa":
+                question = example.get("problem", "")
+                correct_answer = example.get("answer", "")
+            else:  # browsecomp
+                question = example.get("problem", "")
+                # For BrowseComp, the answer should be in "correct_answer" after decryption
+                correct_answer = example.get("correct_answer", "")
+                if not correct_answer and "answer" in example:
+                    # Fallback to "answer" field if "correct_answer" is not available
+                    correct_answer = example.get("answer", "")
+
+        # Update progress
+        if progress_callback:
+            progress_callback(
+                f"Processing example {i + 1}/{total_examples}",
+                int(i / total_examples * 50),
+                {
+                    "status": "processing",
+                    "current": i + 1,
+                    "total": total_examples,
+                    "question": (
+                        question[:50] + "..." if len(question) > 50 else question
+                    ),
+                },
+            )
+
+        logger.info(f"Processing {i + 1}/{total_examples}: {question[:50]}...")
+
+        try:
+            # Format query based on dataset type
+            formatted_query = format_query(question, dataset_type)
+
+            # Time the search
+            start_time = time.time()
+
+            # Get response from LDR
+            search_result = quick_summary(
+                query=formatted_query,
+                iterations=search_config.get("iterations", 3),
+                questions_per_iteration=search_config.get("questions_per_iteration", 3),
+                search_tool=search_config.get("search_tool", "searxng"),
+            )
+
+            end_time = time.time()
+            processing_time = end_time - start_time
+
+            # Extract response and search info
+            response = search_result.get("summary", "")
+
+            # Extract structured information
+            extracted = extract_answer_from_response(response, dataset_type)
+
+            # Format result
+            result = {
+                "id": example.get("id", f"example_{i}"),
+                "problem": question,
+                "correct_answer": correct_answer,
+                "response": response,
+                "extracted_answer": extracted["extracted_answer"],
+                "confidence": extracted["confidence"],
+                "processing_time": processing_time,
+                "sources": search_result.get("sources", []),
+                "search_config": search_config,
+            }
+
+            # Add to results list
+            results.append(result)
+
+            # Write result to file
+            with open(results_file, "a") as f:
+                f.write(json.dumps(result) + "\n")
+
+            # Update progress
+            if progress_callback:
+                progress_callback(
+                    f"Completed example {i + 1}/{total_examples}",
+                    int((i + 0.5) / total_examples * 50),
+                    {
+                        "status": "completed_example",
+                        "current": i + 1,
+                        "total": total_examples,
+                        "result": result,
+                    },
+                )
+
+        except Exception as e:
+            logger.error(f"Error processing example {i + 1}: {str(e)}")
+
+            # Create error result
+            error_result = {
+                "id": example.get("id", f"example_{i}"),
+                "problem": question,
+                "correct_answer": correct_answer,
+                "error": str(e),
+                "processing_time": (
+                    time.time() - start_time if "start_time" in locals() else 0
+                ),
+            }
+
+            # Add to results list
+            results.append(error_result)
+
+            # Write error result to file
+            with open(results_file, "a") as f:
+                f.write(json.dumps(error_result) + "\n")
+
+            # Update progress
+            if progress_callback:
+                progress_callback(
+                    f"Error processing example {i + 1}/{total_examples}",
+                    int((i + 0.5) / total_examples * 50),
+                    {
+                        "status": "error",
+                        "current": i + 1,
+                        "total": total_examples,
+                        "error": str(e),
+                        "result": error_result,
+                    },
+                )
+
+    logger.info(f"Completed processing {total_examples} examples")
+
+    # Run evaluation if requested
+    if run_evaluation:
+        if progress_callback:
+            progress_callback(
+                "Starting evaluation",
+                50,
+                {"status": "evaluating", "results_file": results_file},
+            )
+
+        if human_evaluation:
+            from .graders import human_evaluation as evaluate
+
+            logger.info("Running human evaluation...")
+            evaluation_results = evaluate(
+                results_file=results_file, output_file=evaluation_file, interactive=True
+            )
+        else:
+            logger.info("Running automated evaluation...")
+            try:
+                evaluation_results = grade_results(
+                    results_file=results_file,
+                    output_file=evaluation_file,
+                    dataset_type=dataset_type,
+                    evaluation_config=evaluation_config,
+                    progress_callback=lambda current, total, meta: (
+                        progress_callback(
+                            f"Evaluating {current + 1}/{total}",
+                            50 + int((current + 0.5) / total * 40),
+                            {**meta, "status": "evaluating"},
+                        )
+                        if progress_callback
+                        else None
+                    ),
+                )
+            except Exception as e:
+                logger.error(f"Automated evaluation failed: {str(e)}")
+
+                if progress_callback:
+                    progress_callback(
+                        "Automated evaluation failed. Falling back to human evaluation.",
+                        60,
+                        {"status": "evaluation_fallback", "error": str(e)},
+                    )
+
+                # Ask if user wants to fall back to human evaluation
+                fallback_to_human = False
+                print("\nAutomated evaluation failed with error:", str(e))
+                response = input(
+                    "Do you want to fall back to human evaluation? (y/n): "
+                )
+                fallback_to_human = response.strip().lower() == "y"
+
+                if fallback_to_human:
+                    logger.info("Falling back to human evaluation...")
+                    from .graders import human_evaluation as evaluate
+
+                    evaluation_results = evaluate(
+                        results_file=results_file,
+                        output_file=evaluation_file,
+                        interactive=True,
+                    )
+                else:
+                    logger.info("Skipping evaluation due to error.")
+                    # Create an empty evaluation file to prevent issues
+                    with open(evaluation_file, "w") as f:
+                        f.write("")
+
+                    return {
+                        "status": "evaluation_error",
+                        "dataset_type": dataset_type,
+                        "results_path": results_file,
+                        "evaluation_error": str(e),
+                        "total_examples": total_examples,
+                    }
+
+        # Calculate metrics
+        if progress_callback:
+            progress_callback(
+                "Calculating metrics", 90, {"status": "calculating_metrics"}
+            )
+
+        metrics = calculate_metrics(evaluation_file)
+
+        # Generate report
+        if progress_callback:
+            progress_callback("Generating report", 95, {"status": "generating_report"})
+
+        dataset_name = dataset_type.capitalize()
+        report_path = generate_report(
+            metrics=metrics,
+            results_file=evaluation_file,
+            output_file=report_file,
+            dataset_name=dataset_name,
+            config_info={
+                "Dataset": dataset_path
+                or DEFAULT_DATASET_URLS.get(dataset_type, "Unknown"),
+                "Examples": total_examples,
+                "Iterations": search_config.get("iterations", 3),
+                "Questions per iteration": search_config.get(
+                    "questions_per_iteration", 3
+                ),
+                "Search tool": search_config.get("search_tool", "searxng"),
+                "Evaluation method": "Human" if human_evaluation else "Automated",
+            },
+        )
+
+        # Mark as complete
+        if progress_callback:
+            progress_callback(
+                "Benchmark complete",
+                100,
+                {"status": "complete", "metrics": metrics, "report_path": report_path},
+            )
+
+        return {
+            "status": "complete",
+            "dataset_type": dataset_type,
+            "results_path": results_file,
+            "evaluation_path": evaluation_file,
+            "report_path": report_path,
+            "metrics": metrics,
+            "total_examples": total_examples,
+            "accuracy": metrics.get("accuracy", 0),
+        }
+
+    else:
+        # No evaluation, just return results
+        if progress_callback:
+            progress_callback(
+                "Benchmark complete (no evaluation)",
+                100,
+                {"status": "complete_no_eval", "results_path": results_file},
+            )
+
+        return {
+            "status": "complete_no_eval",
+            "dataset_type": dataset_type,
+            "results_path": results_file,
+            "total_examples": total_examples,
+        }
+
+
+def run_simpleqa_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
+    """
+    Run SimpleQA benchmark with default settings.
+
+    Args:
+        num_examples: Number of examples to process
+        **kwargs: Additional arguments to pass to run_benchmark
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    return run_benchmark(dataset_type="simpleqa", num_examples=num_examples, **kwargs)
+
+
+def run_browsecomp_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
+    """
+    Run BrowseComp benchmark with default settings.
+
+    Args:
+        num_examples: Number of examples to process
+        **kwargs: Additional arguments to pass to run_benchmark
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    return run_benchmark(dataset_type="browsecomp", num_examples=num_examples, **kwargs)
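The new runners expose a small calling surface: run_benchmark plus the run_simpleqa_benchmark and run_browsecomp_benchmark convenience wrappers. Below is a minimal usage sketch, assuming the import path local_deep_research.benchmarks.runners from the file list above and the keyword arguments defined in run_benchmark; the callback name and the reduced example and iteration counts are illustrative only.

```python
# Hypothetical usage sketch for the new benchmark runners (not part of the diff).
from local_deep_research.benchmarks.runners import run_simpleqa_benchmark


def print_progress(message: str, percent: int, metadata: dict) -> None:
    # Progress callbacks receive (message, percent, metadata), matching the
    # Callable[[str, int, Dict], None] signature declared in run_benchmark.
    print(f"[{percent:3d}%] {message}")


results = run_simpleqa_benchmark(
    num_examples=10,                 # illustrative; default is 100
    output_dir="benchmark_results",
    search_config={
        "iterations": 2,             # illustrative; default is 3
        "questions_per_iteration": 2,
        "search_tool": "searxng",
    },
    progress_callback=print_progress,
)

# "status" is present in every return path; "accuracy" only when evaluation ran.
print(results["status"], results.get("accuracy"))
```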
local_deep_research/benchmarks/templates.py (new file)

@@ -0,0 +1,65 @@
+"""
+Templates for benchmark evaluation.
+
+This module contains prompt templates used for evaluating model outputs
+against reference answers.
+"""
+
+# Template for SimpleQA grading
+SIMPLEQA_GRADER_TEMPLATE = """
+You are evaluating a question-answering system. Given a question, the correct answer,
+and the model's response, determine if the model's answer is correct.
+
+Question: {question}
+
+Correct Answer: {correct_answer}
+
+Model Response: {response}
+
+First, extract the model's final answer, ignoring explanations or reasoning.
+Then determine if the model's answer is correct by comparing it to the correct answer.
+Allow for minor variations in wording but be strict about factual accuracy.
+
+Provide your evaluation in this format:
+Extracted Answer: [The model's final answer, extracted from the full response]
+Reasoning: [Your explanation for why the answer is correct or incorrect]
+Correct: [yes/no]
+"""
+
+# Template for BrowseComp grading
+BROWSECOMP_GRADER_TEMPLATE = """
+Judge whether the following [response] to [question] is correct or not based on
+the precise and unambiguous [correct_answer] below.
+
+[question]: {question}
+
+[response]: {response}
+
+Your judgement must be in the format and criteria specified below:
+
+extracted_final_answer: The final exact answer extracted from the [response].
+Put the extracted answer as 'None' if there is no exact, final answer to extract
+from the response.
+
+[correct_answer]: {correct_answer}
+
+reasoning: Explain why the extracted_final_answer is correct or incorrect based
+on [correct_answer], focusing only on if there are meaningful differences between
+[correct_answer] and the extracted_final_answer.
+
+correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given
+above, or is within a small margin of error for numerical problems. Answer 'no' otherwise.
+
+confidence: The extracted confidence score between 0% and 100% from [response].
+Put 100 if there is no confidence score available.
+"""
+
+# Template for formatted BrowseComp queries
+BROWSECOMP_QUERY_TEMPLATE = """
+{question}
+
+Your response should be in the following format:
+Explanation: {{your explanation for your final answer}}
+Exact Answer: {{your succinct, final answer}}
+Confidence: {{your confidence score between 0% and 100% for your answer}}
+"""
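These grader templates are plain str.format templates with {question}, {correct_answer}, and {response} placeholders; the doubled braces in BROWSECOMP_QUERY_TEMPLATE survive formatting as literal braces. A rough sketch of how a grading prompt might be assembled is shown below, with made-up example strings; the actual grading call lives in benchmarks/graders.py, which is not shown in this diff.

```python
# Sketch of assembling a SimpleQA grading prompt from the template above.
# The question, answer, and response strings are illustrative placeholders.
from local_deep_research.benchmarks.templates import SIMPLEQA_GRADER_TEMPLATE

grading_prompt = SIMPLEQA_GRADER_TEMPLATE.format(
    question="In what year did the Apollo 11 mission land on the Moon?",
    correct_answer="1969",
    response="Apollo 11 landed on the Moon in July 1969.",
)

# The filled-in prompt is then sent to the evaluation LLM, whose reply is
# parsed for the "Extracted Answer:", "Reasoning:", and "Correct:" fields.
print(grading_prompt)
```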
local_deep_research/config/llm_config.py

@@ -1,4 +1,3 @@
-import logging
 import os
 
 from langchain_anthropic import ChatAnthropic
@@ -6,14 +5,12 @@ from langchain_community.llms import VLLM
 from langchain_core.language_models import FakeListChatModel
 from langchain_ollama import ChatOllama
 from langchain_openai import ChatOpenAI
+from loguru import logger
 
 from ..utilities.db_utils import get_db_setting
 from ..utilities.search_utilities import remove_think_tags
 from ..utilities.url_utils import normalize_url
 
-# Setup logging
-logger = logging.getLogger(__name__)
-
 # Valid provider options
 VALID_PROVIDERS = [
     "ollama",
@@ -67,7 +64,7 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
         raise ValueError(
             f"Invalid provider: {provider}. Must be one of: {VALID_PROVIDERS}"
         )
-
+    logger.info(
         f"Getting LLM with model: {model_name}, temperature: {temperature}, provider: {provider}"
     )
 
@@ -75,8 +72,16 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
     common_params = {
         "temperature": temperature,
     }
+
+    # Get context window size from settings
+    context_window_size = get_db_setting("llm.context_window_size", 32000)
+
     if get_db_setting("llm.supports_max_tokens", True):
-
+        # Use 80% of context window to leave room for prompts
+        max_tokens = min(
+            get_db_setting("llm.max_tokens", 30000), int(context_window_size * 0.8)
+        )
+        common_params["max_tokens"] = max_tokens
 
     # Handle different providers
     if provider == "anthropic":
@@ -134,9 +139,8 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
                 temperature=temperature,
             )
             return wrap_llm_without_think_tags(llm)
-        except Exception
-            logger.
-            logger.warning("Falling back.")
+        except Exception:
+            logger.exception("Error loading VLLM model")
             return get_fallback_model(temperature)
 
     elif provider == "ollama":
@@ -184,10 +188,8 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
                     f"Model '{model_name}' not found in Ollama. Available models: {', '.join(model_names[:5])}"
                 )
                 return get_fallback_model(temperature)
-        except Exception
-            logger.
-                f"Error checking for model '{model_name}' in Ollama: {str(model_check_error)}"
-            )
+        except Exception:
+            logger.exception(f"Error checking for model '{model_name}' in Ollama")
             # Continue anyway, let ChatOllama handle potential errors
 
         logger.info(
@@ -202,11 +204,11 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
                     f"Ollama test successful. Response type: {type(test_result)}"
                 )
                 return wrap_llm_without_think_tags(llm)
-            except Exception
-                logger.
+            except Exception:
+                logger.exception("Error creating or testing ChatOllama")
                 return get_fallback_model(temperature)
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error in Ollama provider section")
             return get_fallback_model(temperature)
 
     elif provider == "lmstudio":
@@ -218,7 +220,7 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
             api_key="lm-studio",  # LM Studio doesn't require a real API key
             base_url=f"{lmstudio_url}/v1",  # Use the configured URL with /v1 endpoint
             temperature=temperature,
-            max_tokens=
+            max_tokens=max_tokens,  # Use calculated max_tokens based on context size
         )
         return wrap_llm_without_think_tags(llm)
 
@@ -260,10 +262,11 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
         llm = LlamaCpp(
             model_path=model_path,
             temperature=temperature,
-            max_tokens=
+            max_tokens=max_tokens,  # Use calculated max_tokens
             n_gpu_layers=n_gpu_layers,
             n_batch=n_batch,
             f16_kv=f16_kv,
+            n_ctx=context_window_size,  # Set context window size directly
             verbose=True,
         )
 
@@ -398,11 +401,11 @@ def is_ollama_available():
         except requests.exceptions.RequestException as req_error:
             logger.error(f"Request error when checking Ollama: {str(req_error)}")
             return False
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Unexpected error when checking Ollama")
             return False
-    except Exception
-        logger.
+    except Exception:
+        logger.exception("Error in is_ollama_available")
         return False
 
 
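The llm_config.py changes above read a llm.context_window_size setting and cap max_tokens at 80% of it before handing the value to providers such as LM Studio and LlamaCpp. A standalone sketch of that calculation follows, with literal values standing in for the get_db_setting lookups.

```python
# Illustrative recreation of the max_tokens capping added to get_llm();
# the constants stand in for the "llm.context_window_size" and "llm.max_tokens"
# settings that are normally read via get_db_setting().
context_window_size = 32000    # default shown in the diff
requested_max_tokens = 30000   # default shown in the diff

# Reserve roughly 20% of the context window for the prompt itself.
max_tokens = min(requested_max_tokens, int(context_window_size * 0.8))
print(max_tokens)  # 25600
```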
local_deep_research/config/search_config.py

@@ -1,14 +1,10 @@
 # local_deep_research/config.py
-import logging
+from loguru import logger
 
 from ..utilities.db_utils import get_db_setting
 from ..web_search_engines.search_engine_factory import get_search as factory_get_search
 from .llm_config import get_llm
 
-# Setup logging
-logger = logging.getLogger(__name__)
-
-
 # Whether to check the quality search results using the LLM.
 QUALITY_CHECK_DDG_URLS = True
 # Whether to only retrieve snippets instead of full search results.