PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220)

local_deep_research/citation_handler.py CHANGED Viewed

@@ -1,73 +1,80 @@
 # citation_handler.py
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union
-from langchain_core.documents import Document
+from loguru import logger
 from .utilities.db_utils import get_db_setting
 class CitationHandler:
-    def __init__(self, llm):
+    """
+    Configurable citation handler that delegates to specific implementations.
+    Maintains backward compatibility while allowing strategy-specific handlers.
+    """
+    def __init__(self, llm, handler_type: Optional[str] = None):
         self.llm = llm
-    def _create_documents(
-        self, search_results: Union[str, List[Dict]], nr_of_links: int = 0
-    ) -> List[Document]:
-        """
-        Convert search results to LangChain documents format and add index
-        to original search results.
-        """
-        documents = []
-        if isinstance(search_results, str):
-            return documents
-        for i, result in enumerate(search_results):
-            if isinstance(result, dict):
-                # Add index to the original search result dictionary
-                result["index"] = str(i + nr_of_links + 1)
-                content = result.get("full_content", result.get("snippet", ""))
-                documents.append(
-                    Document(
-                        page_content=content,
-                        metadata={
-                            "source": result.get("link", f"source_{i + 1}"),
-                            "title": result.get("title", f"Source {i + 1}"),
-                            "index": i + nr_of_links + 1,
-                        },
-                    )
-                )
-        return documents
-    def _format_sources(self, documents: List[Document]) -> str:
-        """Format sources with numbers for citation."""
-        sources = []
-        for doc in documents:
-            source_id = doc.metadata["index"]
-            sources.append(f"[{source_id}] {doc.page_content}")
-        return "\n\n".join(sources)
+        # Determine which handler to use
+        if handler_type is None:
+            # Try to get from settings, default to standard
+            handler_type = get_db_setting("citation.handler_type", "standard")
-    def analyze_initial(
-        self, query: str, search_results: Union[str, List[Dict]]
-    ) -> Dict[str, Any]:
+        # Import and instantiate the appropriate handler
+        self._handler = self._create_handler(handler_type)
+        # For backward compatibility, expose internal methods
+        self._create_documents = self._handler._create_documents
+        self._format_sources = self._handler._format_sources
+    def _create_handler(self, handler_type: str):
+        """Create the appropriate citation handler based on type."""
+        handler_type = handler_type.lower()
-        documents = self._create_documents(search_results)
-        formatted_sources = self._format_sources(documents)
-        prompt = f"""Analyze the following information concerning the question and include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source.
+        if handler_type == "standard":
+            from .citation_handlers.standard_citation_handler import (
+                StandardCitationHandler,
+            )
-Question: {query}
+            logger.info("Using StandardCitationHandler")
+            return StandardCitationHandler(self.llm)
-Sources:
-{formatted_sources}
+        elif handler_type in ["forced", "forced_answer", "browsecomp"]:
+            from .citation_handlers.forced_answer_citation_handler import (
+                ForcedAnswerCitationHandler,
+            )
+            logger.info(
+                "Using ForcedAnswerCitationHandler for better benchmark performance"
+            )
+            return ForcedAnswerCitationHandler(self.llm)
+        elif handler_type in ["precision", "precision_extraction", "simpleqa"]:
+            from .citation_handlers.precision_extraction_handler import (
+                PrecisionExtractionHandler,
+            )
+            logger.info(
+                "Using PrecisionExtractionHandler for precise answer extraction"
+            )
+            return PrecisionExtractionHandler(self.llm)
+        else:
+            logger.warning(
+                f"Unknown citation handler type: {handler_type}, falling back to standard"
+            )
+            from .citation_handlers.standard_citation_handler import (
+                StandardCitationHandler,
+            )
-Provide a detailed analysis with citations. Do not create the bibliography, it will be provided automatically.  Never make up sources. Never write or create urls. Only write text relevant to the question. Example format: "According to the research [1], ..."
-"""
+            return StandardCitationHandler(self.llm)
-        response = self.llm.invoke(prompt)
-        if not isinstance(response, str):
-            response = response.content
-        return {"content": response, "documents": documents}
+    def analyze_initial(
+        self, query: str, search_results: Union[str, List[Dict]]
+    ) -> Dict[str, Any]:
+        """Delegate to the configured handler."""
+        return self._handler.analyze_initial(query, search_results)
     def analyze_followup(
         self,
@@ -76,41 +83,7 @@ Provide a detailed analysis with citations. Do not create the bibliography, it w
         previous_knowledge: str,
         nr_of_links: int,
     ) -> Dict[str, Any]:
-        """Process follow-up analysis with citations."""
-        documents = self._create_documents(search_results, nr_of_links=nr_of_links)
-        formatted_sources = self._format_sources(documents)
-        # Add fact-checking step
-        fact_check_prompt = f"""Analyze these sources for factual consistency:
-1. Cross-reference major claims between sources
-2. Identify and flag any contradictions
-3. Verify basic facts (dates, company names, ownership)
-4. Note when sources disagree
-Previous Knowledge:
-{previous_knowledge}
-New Sources:
-{formatted_sources}
-        Return any inconsistencies or conflicts found."""
-        if get_db_setting("general.enable_fact_checking", True):
-            fact_check_response = self.llm.invoke(fact_check_prompt).content
-        else:
-            fact_check_response = ""
-        prompt = f"""Using the previous knowledge and new sources, answer the question. Include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source. Reflect information from sources critically.
-Previous Knowledge:
-{previous_knowledge}
-Question: {question}
-New Sources:
-{formatted_sources}
-Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
-Provide a detailed answer with citations.  Example format: "According to [1], ..." """
-        response = self.llm.invoke(prompt)
-        return {"content": response.content, "documents": documents}
+        """Delegate to the configured handler."""
+        return self._handler.analyze_followup(
+            question, search_results, previous_knowledge, nr_of_links
+        )

local_deep_research/citation_handlers/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""
+Citation handlers for different search strategies.
+"""
+from .base_citation_handler import BaseCitationHandler
+from .forced_answer_citation_handler import ForcedAnswerCitationHandler
+from .precision_extraction_handler import PrecisionExtractionHandler
+from .standard_citation_handler import StandardCitationHandler
+__all__ = [
+    "BaseCitationHandler",
+    "StandardCitationHandler",
+    "ForcedAnswerCitationHandler",
+    "PrecisionExtractionHandler",
+]

local_deep_research/citation_handlers/base_citation_handler.py ADDED Viewed

@@ -0,0 +1,70 @@
+"""
+Base class for all citation handlers.
+"""
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Union
+from langchain_core.documents import Document
+class BaseCitationHandler(ABC):
+    """Abstract base class for citation handlers."""
+    def __init__(self, llm):
+        self.llm = llm
+    def _create_documents(
+        self, search_results: Union[str, List[Dict]], nr_of_links: int = 0
+    ) -> List[Document]:
+        """
+        Convert search results to LangChain documents format and add index
+        to original search results.
+        """
+        documents = []
+        if isinstance(search_results, str):
+            return documents
+        for i, result in enumerate(search_results):
+            if isinstance(result, dict):
+                # Add index to the original search result dictionary
+                result["index"] = str(i + nr_of_links + 1)
+                content = result.get("full_content", result.get("snippet", ""))
+                documents.append(
+                    Document(
+                        page_content=content,
+                        metadata={
+                            "source": result.get("link", f"source_{i + 1}"),
+                            "title": result.get("title", f"Source {i + 1}"),
+                            "index": i + nr_of_links + 1,
+                        },
+                    )
+                )
+        return documents
+    def _format_sources(self, documents: List[Document]) -> str:
+        """Format sources with numbers for citation."""
+        sources = []
+        for doc in documents:
+            source_id = doc.metadata["index"]
+            sources.append(f"[{source_id}] {doc.page_content}")
+        return "\n\n".join(sources)
+    @abstractmethod
+    def analyze_initial(
+        self, query: str, search_results: Union[str, List[Dict]]
+    ) -> Dict[str, Any]:
+        """Process initial analysis with citations."""
+        pass
+    @abstractmethod
+    def analyze_followup(
+        self,
+        question: str,
+        search_results: Union[str, List[Dict]],
+        previous_knowledge: str,
+        nr_of_links: int,
+    ) -> Dict[str, Any]:
+        """Process follow-up analysis with citations."""
+        pass

local_deep_research/citation_handlers/forced_answer_citation_handler.py ADDED Viewed

@@ -0,0 +1,179 @@
+"""
+Forced answer citation handler - optimized for BrowseComp-style questions.
+Always provides a specific answer, never returns "cannot determine".
+"""
+from typing import Any, Dict, List, Union
+from loguru import logger
+from ..utilities.db_utils import get_db_setting
+from .base_citation_handler import BaseCitationHandler
+class ForcedAnswerCitationHandler(BaseCitationHandler):
+    """Citation handler that forces direct answers for benchmark questions."""
+    def analyze_initial(
+        self, query: str, search_results: Union[str, List[Dict]]
+    ) -> Dict[str, Any]:
+        """Initial analysis with forced answer generation."""
+        documents = self._create_documents(search_results)
+        formatted_sources = self._format_sources(documents)
+        prompt = f"""Analyze the following information and provide a DIRECT answer to the question. Include citations using numbers in square brackets [1], [2], etc.
+Question: {query}
+Sources:
+{formatted_sources}
+CRITICAL INSTRUCTIONS:
+1. Start your response with a direct answer to the question
+2. NEVER say "I cannot determine" or "insufficient information"
+3. If unsure between options, choose the MOST LIKELY based on evidence
+4. After the direct answer, provide supporting analysis with citations
+Example response format:
+"[Direct Answer]. According to [1], this is supported by..."
+"""
+        response = self.llm.invoke(prompt)
+        if not isinstance(response, str):
+            response = response.content
+        # If the response still doesn't have a direct answer, extract one
+        if self._needs_answer_extraction(response, query):
+            response = self._extract_direct_answer(
+                query, response, formatted_sources
+            )
+        return {"content": response, "documents": documents}
+    def analyze_followup(
+        self,
+        question: str,
+        search_results: Union[str, List[Dict]],
+        previous_knowledge: str,
+        nr_of_links: int,
+    ) -> Dict[str, Any]:
+        """Follow-up analysis with forced answer generation."""
+        documents = self._create_documents(
+            search_results, nr_of_links=nr_of_links
+        )
+        formatted_sources = self._format_sources(documents)
+        # Fact-checking step (if enabled)
+        fact_check_response = ""
+        if get_db_setting("general.enable_fact_checking", True):
+            fact_check_prompt = f"""Analyze these sources for factual consistency:
+1. Cross-reference major claims between sources
+2. Identify the most frequently mentioned answer
+3. Note any conflicts but identify the most likely correct answer
+Previous Knowledge:
+{previous_knowledge}
+New Sources:
+{formatted_sources}
+Return the most likely answer based on evidence consistency."""
+            fact_check_response = self.llm.invoke(fact_check_prompt).content
+        prompt = f"""Using the previous knowledge and new sources, provide a DIRECT answer to the question. Include citations using numbers in square brackets.
+Previous Knowledge:
+{previous_knowledge}
+Question: {question}
+New Sources:
+{formatted_sources}
+Fact Analysis: {fact_check_response}
+CRITICAL INSTRUCTIONS:
+1. You MUST start with a direct, specific answer
+2. NEVER say "I cannot determine" or similar phrases
+3. If the question asks for a name, provide a specific name
+4. If the question asks for a place, provide a specific place
+5. If unsure, choose the answer with the most supporting evidence
+6. Format: "[Direct Answer]. Supporting evidence from [1], [2]..."
+Remember: A wrong answer is better than no answer for this task."""
+        response = self.llm.invoke(prompt)
+        content = response.content
+        # Final check - if still no direct answer, force extraction
+        if self._needs_answer_extraction(content, question):
+            content = self._extract_direct_answer(
+                question, content, formatted_sources
+            )
+            logger.info(f"Forced answer extraction applied: {content[:100]}...")
+        return {"content": content, "documents": documents}
+    def _needs_answer_extraction(self, content: str, query: str) -> bool:
+        """Check if the response needs forced answer extraction."""
+        no_answer_indicators = [
+            "cannot determine",
+            "unable to find",
+            "insufficient",
+            "unclear",
+            "not enough",
+            "cannot provide",
+            "no specific answer",
+            "cannot definitively",
+        ]
+        content_lower = content.lower()
+        # Check for no-answer indicators
+        for indicator in no_answer_indicators:
+            if indicator in content_lower:
+                return True
+        # Check if it's a direct question but no direct answer given
+        if query.lower().startswith(
+            ("what", "who", "which", "where", "when", "name")
+        ):
+            # Look for a direct answer pattern in first 100 chars
+            first_part = content[:100].lower()
+            if not any(
+                word in first_part for word in ["is", "was", "are", "were", ":"]
+            ):
+                return True
+        return False
+    def _extract_direct_answer(
+        self, query: str, content: str, sources: str
+    ) -> str:
+        """Force extraction of a direct answer using LLM."""
+        extraction_prompt = f"""Based on the content below, extract a SINGLE, DIRECT answer to the question.
+Question: {query}
+Content: {content[:1500]}
+Sources: {sources[:1500]}
+RULES:
+1. Respond with ONLY the answer itself (name, place, number, etc.)
+2. No explanations, just the answer
+3. If multiple candidates exist, pick the one mentioned most
+4. If truly no information exists, make an educated guess
+Answer:"""
+        try:
+            answer = self.llm.invoke(extraction_prompt).content.strip()
+            # Format as a proper response
+            return f"{answer}. Based on the available sources, this appears to be the most likely answer. {content}"
+        except Exception as e:
+            logger.error(f"Error in forced answer extraction: {str(e)}")
+            # Fallback - just prepend a guess
+            return f"Based on the available evidence, the most likely answer appears to be related to the search results. {content}"

local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl