PyPI - local-deep-research - Versions diffs - 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

local-deep-research 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90)

local_deep_research/__version__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.5.9"
1	+ __version__ = "0.6.0"

local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py CHANGED Viewed

@@ -236,6 +236,11 @@ class ProgressiveExplorer:
         """Execute searches in parallel and return results."""
         results = []
+        # Import context preservation utility
+        from ...utilities.thread_context import (
+            create_context_preserving_wrapper,
+        )
         def search_query(query):
             try:
                 search_results = self.search_engine.run(query)
@@ -244,11 +249,16 @@ class ProgressiveExplorer:
                 logger.error(f"Error searching '{query}': {str(e)}")
                 return (query, [])
+        # Create context-preserving wrapper for the search function
+        context_aware_search = create_context_preserving_wrapper(search_query)
         # Run searches in parallel
         with concurrent.futures.ThreadPoolExecutor(
             max_workers=max_workers
         ) as executor:
-            futures = [executor.submit(search_query, q) for q in queries]
+            futures = [
+                executor.submit(context_aware_search, q) for q in queries
+            ]
             for future in concurrent.futures.as_completed(futures):
                 results.append(future.result())

local_deep_research/advanced_search_system/questions/browsecomp_question.py CHANGED Viewed

@@ -144,15 +144,25 @@ DESCRIPTORS: [entity1], [entity2], ...
         # 1. Original query (always include)
         searches.append(query)
+        # If only 1 question requested, return just the original query
+        if num_questions <= 1:
+            return searches[:1]
         # 2. Domain exploration searches (combine key entities)
-        if entities["names"]:
+        if entities["names"] and len(searches) < num_questions:
             for name in entities["names"][:2]:  # Top 2 names
+                if len(searches) >= num_questions:
+                    break
                 searches.append(f"{name}")
-                if entities["descriptors"]:
+                if entities["descriptors"] and len(searches) < num_questions:
                     searches.append(f"{name} {entities['descriptors'][0]}")
         # 3. Temporal searches if years are important
-        if entities["temporal"] and len(entities["temporal"]) <= 10:
+        if (
+            entities["temporal"]
+            and len(entities["temporal"]) <= 10
+            and len(searches) < num_questions
+        ):
             # For small year ranges, search each year with a key term
             key_term = (
                 entities["names"][0]
@@ -162,14 +172,18 @@ DESCRIPTORS: [entity1], [entity2], ...
                 else ""
             )
             for year in entities["temporal"][:5]:  # Limit to 5 years initially
+                if len(searches) >= num_questions:
+                    break
                 if key_term:
                     searches.append(f"{key_term} {year}")
         # 4. Location-based searches
-        if entities["locations"]:
+        if entities["locations"] and len(searches) < num_questions:
             for location in entities["locations"][:2]:
+                if len(searches) >= num_questions:
+                    break
                 searches.append(f"{location}")
-                if entities["descriptors"]:
+                if entities["descriptors"] and len(searches) < num_questions:
                     searches.append(f"{location} {entities['descriptors'][0]}")
         # Remove duplicates and limit to requested number
@@ -179,6 +193,8 @@ DESCRIPTORS: [entity1], [entity2], ...
             if s.lower() not in seen:
                 seen.add(s.lower())
                 unique_searches.append(s)
+                if len(unique_searches) >= num_questions:
+                    break
         return unique_searches[:num_questions]
@@ -238,21 +254,26 @@ Format: One search per line
                 if line:
                     searches.append(line)
-        # Ensure we have enough searches
+        # Ensure we have enough searches, but respect the limit
         while len(searches) < num_questions:
             # Generate combinations programmatically
             if iteration <= 5 and entities["temporal"]:
                 # Continue with year-based searches
+                added_any = False
                 for year in entities["temporal"]:
                     if not self._was_searched(year, questions_by_iteration):
                         base_term = (
                             entities["names"][0] if entities["names"] else ""
                         )
                         searches.append(f"{base_term} {year}".strip())
+                        added_any = True
                         if len(searches) >= num_questions:
                             break
+                if not added_any:
+                    break  # No more year searches to add
             else:
                 # Combine multiple constraints
+                added_any = False
                 if entities["names"] and entities["descriptors"]:
                     for name in entities["names"]:
                         for desc in entities["descriptors"]:
@@ -261,8 +282,13 @@ Format: One search per line
                                 combo, questions_by_iteration
                             ):
                                 searches.append(combo)
+                                added_any = True
                                 if len(searches) >= num_questions:
                                     break
+                        if len(searches) >= num_questions:
+                            break
+                if not added_any:
+                    break  # No more combinations to add
         return searches[:num_questions]

local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py CHANGED Viewed

@@ -53,9 +53,9 @@ class FocusedIterationStrategy(BaseSearchStrategy):
         search=None,
         citation_handler=None,
         all_links_of_system=None,
-        max_iterations: int = 8,  # OPTIMAL FOR SIMPLEQA: 96.51% accuracy achieved
+        max_iterations: int = 8,  # OPTIMAL FOR SIMPLEQA: 90%+ accuracy achieved
         questions_per_iteration: int = 5,  # OPTIMAL FOR SIMPLEQA: proven config
-        use_browsecomp_optimization: bool = True,  # Can be False for pure SimpleQA
+        use_browsecomp_optimization: bool = True,  # True for 90%+ accuracy with forced_answer handler
     ):
         """Initialize with components optimized for focused iteration."""
         super().__init__(all_links_of_system)
@@ -63,9 +63,9 @@ class FocusedIterationStrategy(BaseSearchStrategy):
         self.model = model or get_llm()
         self.progress_callback = None
-        # Configuration
-        self.max_iterations = max_iterations
-        self.questions_per_iteration = questions_per_iteration
+        # Configuration - ensure these are integers
+        self.max_iterations = int(max_iterations)
+        self.questions_per_iteration = int(questions_per_iteration)
         self.use_browsecomp_optimization = use_browsecomp_optimization
         # Initialize specialized components
@@ -158,9 +158,11 @@ class FocusedIterationStrategy(BaseSearchStrategy):
                         questions_by_iteration=self.questions_by_iteration,
                     )
-                # Always include original query in first iteration
+                # Always include original query in first iteration, but respect question limit
                 if iteration == 1 and query not in questions:
                     questions = [query] + questions
+                    # Trim to respect questions_per_iteration limit
+                    questions = questions[: self.questions_per_iteration]
                 self.questions_by_iteration[iteration] = questions
                 logger.info(f"Iteration {iteration} questions: {questions}")
@@ -357,6 +359,11 @@ class FocusedIterationStrategy(BaseSearchStrategy):
         """Execute searches in parallel (like source-based strategy)."""
         all_results = []
+        # Import context preservation utility
+        from ...utilities.thread_context import (
+            create_context_preserving_wrapper,
+        )
         def search_question(q):
             try:
                 result = self.search.run(q)
@@ -365,11 +372,18 @@ class FocusedIterationStrategy(BaseSearchStrategy):
                 logger.error(f"Error searching '{q}': {str(e)}")
                 return {"question": q, "results": [], "error": str(e)}
+        # Create context-preserving wrapper for the search function
+        context_aware_search = create_context_preserving_wrapper(
+            search_question
+        )
         # Run searches in parallel
         with concurrent.futures.ThreadPoolExecutor(
             max_workers=len(queries)
         ) as executor:
-            futures = [executor.submit(search_question, q) for q in queries]
+            futures = [
+                executor.submit(context_aware_search, q) for q in queries
+            ]
             for future in concurrent.futures.as_completed(futures):
                 result_dict = future.result()
@@ -385,6 +399,11 @@ class FocusedIterationStrategy(BaseSearchStrategy):
         completed_searches = 0
         total_searches = len(queries)
+        # Import context preservation utility
+        from ...utilities.thread_context import (
+            create_context_preserving_wrapper,
+        )
         def search_question_with_progress(q):
             nonlocal completed_searches
             try:
@@ -440,12 +459,17 @@ class FocusedIterationStrategy(BaseSearchStrategy):
                     "result_count": 0,
                 }
+        # Create context-preserving wrapper for the search function
+        context_aware_search_with_progress = create_context_preserving_wrapper(
+            search_question_with_progress
+        )
         # Run searches in parallel
         with concurrent.futures.ThreadPoolExecutor(
             max_workers=min(len(queries), 5)
         ) as executor:
             futures = [
-                executor.submit(search_question_with_progress, q)
+                executor.submit(context_aware_search_with_progress, q)
                 for q in queries
             ]

local_deep_research/advanced_search_system/strategies/source_based_strategy.py CHANGED Viewed

@@ -8,6 +8,7 @@ from ...config.llm_config import get_llm
 from ...config.search_config import get_search
 from ...utilities.db_utils import get_db_setting
 from ...utilities.threading_utils import thread_context, thread_with_app_context
+from ...utilities.thread_context import preserve_research_context
 from ..filters.cross_engine_filter import CrossEngineFilter
 from ..findings.repository import FindingsRepository
 from ..questions.atomic_fact_question import AtomicFactQuestionGenerator
@@ -211,6 +212,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                 # Function for thread pool
                 @thread_with_app_context
+                @preserve_research_context
                 def search_question(q):
                     try:
                         result = self.search.run(q)

local_deep_research/api/__init__.py CHANGED Viewed

@@ -5,12 +5,14 @@ API module for programmatic access to Local Deep Research functionality.
 from .research_functions import (
     analyze_documents,
+    detailed_research,
     generate_report,
     quick_summary,
 )
 __all__ = [
     "quick_summary",
+    "detailed_research",
     "generate_report",
     "analyze_documents",
 ]

local_deep_research/api/research_functions.py CHANGED Viewed

@@ -3,7 +3,8 @@ API module for Local Deep Research.
 Provides programmatic access to search and research capabilities.
 """
-from typing import Any, Callable, Dict, Optional
+from datetime import datetime
+from typing import Any, Callable, Dict, Optional, Union
 from loguru import logger
@@ -24,6 +25,8 @@ def _init_search_system(
     search_strategy: str = "source_based",
     iterations: int = 1,
     questions_per_iteration: int = 1,
+    retrievers: Optional[Dict[str, Any]] = None,
+    llms: Optional[Dict[str, Any]] = None,
 ) -> AdvancedSearchSystem:
     """
     Initializes the advanced search system with specified parameters. This function sets up
@@ -43,11 +46,30 @@ def _init_search_system(
         iterations: Number of research cycles to perform
         questions_per_iteration: Number of questions to generate per cycle
         search_strategy: The name of the search strategy to use.
+        retrievers: Optional dictionary of {name: retriever} pairs to use as search engines
+        llms: Optional dictionary of {name: llm} pairs to use as language models
     Returns:
         AdvancedSearchSystem: An instance of the configured AdvancedSearchSystem.
     """
+    # Register retrievers if provided
+    if retrievers:
+        from ..web_search_engines.retriever_registry import retriever_registry
+        retriever_registry.register_multiple(retrievers)
+        logger.info(
+            f"Registered {len(retrievers)} retrievers: {list(retrievers.keys())}"
+        )
+    # Register LLMs if provided
+    if llms:
+        from ..llm import register_llm
+        for name, llm_instance in llms.items():
+            register_llm(name, llm_instance)
+        logger.info(f"Registered {len(llms)} LLMs: {list(llms.keys())}")
     # Get language model with custom temperature
     llm = get_llm(
         temperature=temperature,
@@ -84,6 +106,9 @@ def _init_search_system(
 def quick_summary(
     query: str,
+    research_id: Optional[Union[int, str]] = None,
+    retrievers: Optional[Dict[str, Any]] = None,
+    llms: Optional[Dict[str, Any]] = None,
     **kwargs: Any,
 ) -> Dict[str, Any]:
     """
@@ -91,6 +116,9 @@ def quick_summary(
     Args:
         query: The research query to analyze
+        research_id: Optional research ID (int or UUID string) for tracking metrics
+        retrievers: Optional dictionary of {name: retriever} pairs to use as search engines
+        llms: Optional dictionary of {name: llm} pairs to use as language models
         **kwargs: Configuration for the search system. Will be forwarded to
             `_init_search_system()`.
@@ -103,7 +131,46 @@ def quick_summary(
     """
     logger.info("Generating quick summary for query: %s", query)
-    system = _init_search_system(**kwargs)
+    # Generate a research_id if none provided
+    if research_id is None:
+        import uuid
+        research_id = str(uuid.uuid4())
+        logger.debug(f"Generated research_id: {research_id}")
+    # Register retrievers if provided
+    if retrievers:
+        from ..web_search_engines.retriever_registry import retriever_registry
+        retriever_registry.register_multiple(retrievers)
+        logger.info(
+            f"Registered {len(retrievers)} retrievers: {list(retrievers.keys())}"
+        )
+    # Register LLMs if provided
+    if llms:
+        from ..llm import register_llm
+        for name, llm_instance in llms.items():
+            register_llm(name, llm_instance)
+        logger.info(f"Registered {len(llms)} LLMs: {list(llms.keys())}")
+    # Set search context with research_id
+    from ..metrics.search_tracker import set_search_context
+    search_context = {
+        "research_id": research_id,  # Pass UUID or integer directly
+        "research_query": query,
+        "research_mode": kwargs.get("research_mode", "quick"),
+        "research_phase": "init",
+        "search_iteration": 0,
+        "search_engine_selected": kwargs.get("search_tool"),
+    }
+    set_search_context(search_context)
+    # Remove research_mode from kwargs before passing to _init_search_system
+    init_kwargs = {k: v for k, v in kwargs.items() if k != "research_mode"}
+    system = _init_search_system(llms=llms, **init_kwargs)
     # Perform the search and analysis
     results = system.analyze_topic(query)
@@ -130,6 +197,8 @@ def generate_report(
     output_file: Optional[str] = None,
     progress_callback: Optional[Callable] = None,
     searches_per_section: int = 2,
+    retrievers: Optional[Dict[str, Any]] = None,
+    llms: Optional[Dict[str, Any]] = None,
     **kwargs: Any,
 ) -> Dict[str, Any]:
     """
@@ -141,6 +210,8 @@ def generate_report(
         progress_callback: Optional callback function to receive progress updates
         searches_per_section: The number of searches to perform for each
             section in the report.
+        retrievers: Optional dictionary of {name: retriever} pairs to use as search engines
+        llms: Optional dictionary of {name: llm} pairs to use as language models
     Returns:
         Dictionary containing the research report with keys:
@@ -149,7 +220,24 @@ def generate_report(
     """
     logger.info("Generating comprehensive research report for query: %s", query)
-    system = _init_search_system(**kwargs)
+    # Register retrievers if provided
+    if retrievers:
+        from ..web_search_engines.retriever_registry import retriever_registry
+        retriever_registry.register_multiple(retrievers)
+        logger.info(
+            f"Registered {len(retrievers)} retrievers: {list(retrievers.keys())}"
+        )
+    # Register LLMs if provided
+    if llms:
+        from ..llm import register_llm
+        for name, llm_instance in llms.items():
+            register_llm(name, llm_instance)
+        logger.info(f"Registered {len(llms)} LLMs: {list(llms.keys())}")
+    system = _init_search_system(retrievers=retrievers, llms=llms, **kwargs)
     # Set progress callback if provided
     if progress_callback:
@@ -175,6 +263,92 @@ def generate_report(
     return report
+def detailed_research(
+    query: str,
+    research_id: Optional[Union[int, str]] = None,
+    retrievers: Optional[Dict[str, Any]] = None,
+    llms: Optional[Dict[str, Any]] = None,
+    **kwargs: Any,
+) -> Dict[str, Any]:
+    """
+    Perform detailed research with comprehensive analysis.
+    Similar to generate_report but returns structured data instead of markdown.
+    Args:
+        query: The research query to analyze
+        research_id: Optional research ID (int or UUID string) for tracking metrics
+        retrievers: Optional dictionary of {name: retriever} pairs to use as search engines
+        llms: Optional dictionary of {name: llm} pairs to use as language models
+        **kwargs: Configuration for the search system
+    Returns:
+        Dictionary containing detailed research results
+    """
+    logger.info("Performing detailed research for query: %s", query)
+    # Generate a research_id if none provided
+    if research_id is None:
+        import uuid
+        research_id = str(uuid.uuid4())
+        logger.debug(f"Generated research_id: {research_id}")
+    # Register retrievers if provided
+    if retrievers:
+        from ..web_search_engines.retriever_registry import retriever_registry
+        retriever_registry.register_multiple(retrievers)
+        logger.info(
+            f"Registered {len(retrievers)} retrievers: {list(retrievers.keys())}"
+        )
+    # Register LLMs if provided
+    if llms:
+        from ..llm import register_llm
+        for name, llm_instance in llms.items():
+            register_llm(name, llm_instance)
+        logger.info(f"Registered {len(llms)} LLMs: {list(llms.keys())}")
+    # Set search context
+    from ..metrics.search_tracker import set_search_context
+    search_context = {
+        "research_id": research_id,
+        "research_query": query,
+        "research_mode": "detailed",
+        "research_phase": "init",
+        "search_iteration": 0,
+        "search_engine_selected": kwargs.get("search_tool"),
+    }
+    set_search_context(search_context)
+    # Initialize system
+    system = _init_search_system(retrievers=retrievers, llms=llms, **kwargs)
+    # Perform detailed research
+    results = system.analyze_topic(query)
+    # Return comprehensive results
+    return {
+        "query": query,
+        "research_id": research_id,
+        "summary": results.get("current_knowledge", ""),
+        "findings": results.get("findings", []),
+        "iterations": results.get("iterations", 0),
+        "questions": results.get("questions", {}),
+        "formatted_findings": results.get("formatted_findings", ""),
+        "sources": results.get("all_links_of_system", []),
+        "metadata": {
+            "timestamp": datetime.now().isoformat(),
+            "search_tool": kwargs.get("search_tool", "auto"),
+            "iterations_requested": kwargs.get("iterations", 1),
+            "strategy": kwargs.get("search_strategy", "source_based"),
+        },
+    }
 def analyze_documents(
     query: str,
     collection_name: str,

local_deep_research/benchmarks/graders.py CHANGED Viewed

@@ -65,15 +65,16 @@ def get_evaluation_llm(custom_config: Optional[Dict[str, Any]] = None):
     # Check if we're using openai_endpoint but don't have an API key configured
     if filtered_config.get("provider") == "openai_endpoint":
-        # Try to get API key from environment or config
-        import os
+        # Try to get API key from database settings first, then environment
+        from ..utilities.db_utils import get_db_setting
+        api_key = get_db_setting("llm.openai_endpoint.api_key")
-        api_key = os.getenv("OPENAI_ENDPOINT_API_KEY")
         if not api_key:
             logger.warning(
                 "Using openai_endpoint provider but no API key found. "
-                "Set the OPENAI_ENDPOINT_API_KEY environment variable or "
-                "specify api_key in the evaluation_config."
+                "Set the llm.openai_endpoint.api_key setting in the database or "
+                "LDR_LLM_OPENAI_ENDPOINT_API_KEY environment variable."
             )
             # Try to fall back to LDR's config if API key not explicitly provided
             # The get_llm function will handle this case
@@ -117,6 +118,150 @@ def extract_answer_from_response(
     }
+def grade_single_result(
+    result_data: Dict[str, Any],
+    dataset_type: str = "simpleqa",
+    evaluation_config: Optional[Dict[str, Any]] = None,
+) -> Dict[str, Any]:
+    """
+    Grade a single benchmark result using LLM.
+    Args:
+        result_data: Dictionary containing result data with keys: id, problem, correct_answer, response, extracted_answer
+        dataset_type: Type of dataset
+        evaluation_config: Optional custom config for evaluation LLM
+    Returns:
+        Dictionary with grading results
+    """
+    # Get evaluation LLM
+    evaluation_llm = get_evaluation_llm(evaluation_config)
+    # Select appropriate template
+    template = (
+        BROWSECOMP_GRADER_TEMPLATE
+        if dataset_type.lower() == "browsecomp"
+        else SIMPLEQA_GRADER_TEMPLATE
+    )
+    question = result_data.get("problem", "")
+    correct_answer = result_data.get("correct_answer", "")
+    response = result_data.get("response", "")
+    logger.info(f"Grading single result: {question[:50]}...")
+    # Format grading prompt
+    grading_prompt = template.format(
+        question=question, correct_answer=correct_answer, response=response
+    )
+    try:
+        # Grade using LLM
+        if hasattr(evaluation_llm, "invoke") and callable(
+            evaluation_llm.invoke
+        ):
+            if hasattr(evaluation_llm, "chat_messages"):
+                # Handle ChatOpenAI and similar models that use messages
+                grading_response = evaluation_llm.invoke(
+                    [HumanMessage(content=grading_prompt)]
+                ).content
+            else:
+                # Handle other LLM types
+                grading_response = evaluation_llm.invoke(grading_prompt)
+                if hasattr(grading_response, "content"):
+                    grading_response = grading_response.content
+        else:
+            # Fallback for other LLM interfaces
+            grading_response = str(evaluation_llm(grading_prompt))
+        # Extract grading information using regex
+        if dataset_type.lower() == "browsecomp":
+            # BrowseComp-specific extraction
+            extracted_answer_match = re.search(
+                r"extracted_final_answer:\s*(.*?)(?:\n|$)", grading_response
+            )
+            extracted_answer = (
+                extracted_answer_match.group(1).strip()
+                if extracted_answer_match
+                else "None"
+            )
+            reasoning_match = re.search(
+                r"reasoning:\s*(.*?)(?:\n\n|\ncorrect:|\Z)",
+                grading_response,
+                re.DOTALL,
+            )
+            reasoning = (
+                reasoning_match.group(1).strip() if reasoning_match else ""
+            )
+            correct_match = re.search(
+                r"correct:\s*(yes|no)", grading_response, re.IGNORECASE
+            )
+            is_correct = (
+                (correct_match.group(1).lower() == "yes")
+                if correct_match
+                else False
+            )
+            confidence_match = re.search(
+                r"confidence:\s*(\d+)", grading_response
+            )
+            confidence = (
+                confidence_match.group(1) if confidence_match else "100"
+            )
+        else:
+            # SimpleQA extraction
+            extracted_answer_match = re.search(
+                r"Extracted Answer:\s*(.*?)(?:\n|$)", grading_response
+            )
+            extracted_answer = (
+                extracted_answer_match.group(1).strip()
+                if extracted_answer_match
+                else "None"
+            )
+            reasoning_match = re.search(
+                r"Reasoning:\s*(.*?)(?:\nCorrect:|\Z)",
+                grading_response,
+                re.DOTALL,
+            )
+            reasoning = (
+                reasoning_match.group(1).strip() if reasoning_match else ""
+            )
+            correct_match = re.search(
+                r"Correct:\s*(yes|no)", grading_response, re.IGNORECASE
+            )
+            is_correct = (
+                (correct_match.group(1).lower() == "yes")
+                if correct_match
+                else False
+            )
+            confidence = "100"  # SimpleQA doesn't have confidence
+        # Format graded result
+        graded_result = {
+            "extracted_by_grader": extracted_answer,
+            "reasoning": reasoning,
+            "is_correct": is_correct,
+            "graded_confidence": confidence,
+            "grader_response": grading_response,
+        }
+        return graded_result
+    except Exception as e:
+        logger.error(f"Error grading single result: {str(e)}")
+        return {
+            "grading_error": str(e),
+            "is_correct": False,
+            "graded_confidence": "0",
+            "grader_response": f"Grading failed: {str(e)}",
+        }
 def grade_results(
     results_file: str,
     output_file: str,

local-deep-research 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl

local-deep-research 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl