local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +96 -84
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +72 -44
- local_deep_research/search_system.py +147 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1592 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +211 -159
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/METADATA +177 -97
- local_deep_research-0.2.0.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,452 @@
|
|
1
|
+
"""
|
2
|
+
Findings repository for managing research findings.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from typing import Dict, List, Union
|
7
|
+
|
8
|
+
from langchain_core.documents import Document
|
9
|
+
from langchain_core.language_models import BaseLLM
|
10
|
+
|
11
|
+
from ...utilities.search_utilities import format_findings
|
12
|
+
from .base_findings import BaseFindingsRepository
|
13
|
+
|
14
|
+
logger = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
def format_links(links: List[Dict]) -> str:
    """Render link dictionaries as a numbered, human-readable list.

    Args:
        links: Dictionaries that each provide 'title' and 'url' keys.

    Returns:
        str: One numbered line per link, each followed by its URL.
    """
    rendered = []
    for position, link in enumerate(links, start=1):
        rendered.append(f"{position}. {link['title']}\n URL: {link['url']}")
    return "\n".join(rendered)
|
30
|
+
|
31
|
+
|
32
|
+
class FindingsRepository(BaseFindingsRepository):
    """Repository for managing research findings.

    Stores findings keyed by query, accumulates source documents, and
    synthesizes accumulated knowledge into a final answer via the LLM.
    """

    def __init__(self, model: BaseLLM):
        """Initialize the repository.

        Args:
            model: The LLM model to use for synthesis
        """
        super().__init__(model)
        # Findings per query; a "<query>_synthesis" key additionally mirrors
        # the final synthesis content (see add_finding).
        self.findings: Dict[str, List[Dict]] = {}
        # All source documents accumulated across searches.
        self.documents: List[Document] = []
        # Questions generated per research iteration.
        self.questions_by_iteration: Dict[int, List[str]] = {}

    def add_finding(self, query: str, finding: Dict | str) -> None:
        """Add a finding for a query.

        Args:
            query: The query the finding belongs to.
            finding: Either a finding dictionary or a bare content string;
                strings are wrapped in a synthesis-style dictionary.
        """
        self.findings.setdefault(query, [])

        if isinstance(finding, str):
            # Normalize bare strings into the standard finding structure.
            self.findings[query].append(
                {
                    "phase": "Synthesis",
                    "content": finding,
                    "question": query,
                    "search_results": [],
                    "documents": [],
                }
            )
        else:
            # It's already a dictionary.
            self.findings[query].append(finding)

        # Mirror the final synthesis under a dedicated key so callers can
        # retrieve it without scanning the full findings list.
        if isinstance(finding, dict) and finding.get("phase") == "Final synthesis":
            self.findings[query + "_synthesis"] = [
                {
                    "phase": "Synthesis",
                    "content": finding.get("content", ""),
                    "question": query,
                    "search_results": [],
                    "documents": [],
                }
            ]

        logger.info(
            f"Added finding for query: {query}. Total findings: {len(self.findings[query])}"
        )

    def get_findings(self, query: str) -> List[Dict]:
        """Get findings for a query.

        Args:
            query: The query to get findings for

        Returns:
            List of findings for the query (empty list when none exist)
        """
        return self.findings.get(query, [])

    def clear_findings(self, query: str) -> None:
        """Clear findings for a query.

        Args:
            query: The query to clear findings for
        """
        if query in self.findings:
            del self.findings[query]
            logger.info(f"Cleared findings for query: {query}")

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the repository.

        Args:
            documents: List of documents to add
        """
        self.documents.extend(documents)
        logger.info(f"Added {len(documents)} documents to repository")

    def set_questions_by_iteration(
        self, questions_by_iteration: Dict[int, List[str]]
    ) -> None:
        """Set the questions by iteration.

        Args:
            questions_by_iteration: Dictionary mapping iteration numbers to
                lists of questions. A shallow copy is stored so later caller
                mutations of the mapping itself don't leak in.
        """
        self.questions_by_iteration = questions_by_iteration.copy()
        logger.info(f"Set questions for {len(questions_by_iteration)} iterations")

    def format_findings_to_text(
        self, findings_list: List[Dict], synthesized_content: str
    ) -> str:
        """Format findings into a detailed text output using the utility function.

        Args:
            findings_list: List of finding dictionaries from the strategy execution.
            synthesized_content: The final synthesized content generated by the LLM.

        Returns:
            str: Formatted text output, or the raw synthesized content prefixed
            with an error notice when formatting fails.
        """
        logger.info(
            f"Formatting final report. Number of detailed findings: {len(findings_list)}. Synthesized content length: {len(synthesized_content)}. Number of question iterations: {len(self.questions_by_iteration)}"
        )
        # Log details about the inputs to help debug malformed findings.
        logger.debug(
            f"Detailed findings list structure (first item type if exists): {type(findings_list[0]) if findings_list else 'Empty'}"
        )
        logger.debug(
            f"Questions by iteration keys: {list(self.questions_by_iteration.keys())}"
        )
        if findings_list:
            logger.debug(f"First finding item keys: {list(findings_list[0].keys())}")

        try:
            # format_findings receives the synthesized content as its
            # 'current_knowledge' parameter.
            formatted_report = format_findings(
                findings_list,
                synthesized_content,
                self.questions_by_iteration,
            )
            logger.info("Successfully formatted final report.")
            return formatted_report
        except Exception as e:
            logger.error(
                f"Error occurred during final report formatting: {str(e)}",
                exc_info=True,
            )
            # Fallback: return just the synthesized content if formatting fails
            return f"Error during final formatting. Raw Synthesized Content:\n\n{synthesized_content}"

    @staticmethod
    def _collect_finding_texts(findings: List[Union[Dict, str]]) -> List[str]:
        """Extract text content from a mixed list of findings.

        Dictionary findings contribute their "content" value; plain strings
        are used as-is; anything else is skipped.
        """
        texts = []
        for item in findings:
            if isinstance(item, dict) and "content" in item:
                texts.append(item["content"])
            elif isinstance(item, str):
                texts.append(item)
        return texts

    def _invoke_model_with_timeout(self, prompt: str, timeout_seconds: int = 120):
        """Invoke the LLM in a worker thread, raising TimeoutError on overrun.

        A thread-based timeout works on every platform and inside non-main
        threads (SIGALRM-based timeouts do not: signal handlers may only be
        installed in the main thread), which matters because synthesis can
        run inside web-server worker threads.

        Args:
            prompt: The prompt to send to the model.
            timeout_seconds: Maximum seconds to wait for the invocation.

        Returns:
            The raw model response (string or message object).

        Raises:
            TimeoutError: If the invocation does not finish in time.
            Exception: Whatever the model invocation itself raised.
        """
        import threading

        result = None
        exception = None
        completed = False

        def target():
            nonlocal result, exception, completed
            try:
                result = self.model.invoke(prompt)
                completed = True
            except Exception as e:
                exception = e

        # Daemon thread so a hung LLM call cannot keep the process alive.
        worker = threading.Thread(target=target, daemon=True)
        worker.start()
        worker.join(timeout_seconds)

        if not completed and worker.is_alive():
            raise TimeoutError(
                f"Operation timed out after {timeout_seconds} seconds"
            )
        if exception:
            raise exception
        return result

    @staticmethod
    def _describe_invoke_error(query: str, invoke_error: Exception) -> str:
        """Map an LLM invocation failure to a user-facing error message.

        Classifies the error by substring matching on its message; falls back
        to a generic message with the original error text.
        """
        logger.error(
            f"LLM invocation failed during synthesis for query '{query}': {invoke_error}",
            exc_info=True,
        )

        error_message = str(invoke_error).lower()

        if "timeout" in error_message or "timed out" in error_message:
            return "Error: Failed to synthesize final answer due to LLM timeout. Please check your connection or try again later."
        if (
            "too many tokens" in error_message
            or "context length" in error_message
            or "token limit" in error_message
        ):
            return "Error: Failed to synthesize final answer due to token limit exceeded. Try reducing the scope of your query."
        if "rate limit" in error_message or "rate_limit" in error_message:
            return "Error: Failed to synthesize final answer due to LLM rate limit. Please try again in a few minutes."
        if "connection" in error_message or "network" in error_message:
            return "Error: Failed to synthesize final answer due to connection issues. Please check your internet connection and LLM service status."
        if "api key" in error_message or "authentication" in error_message:
            return "Error: Failed to synthesize final answer due to authentication issues. Please check your API keys."
        # Generic error with details
        return f"Error: Failed to synthesize final answer. LLM error: {str(invoke_error)}"

    def synthesize_findings(
        self,
        query: str,
        sub_queries: List[str],
        findings: List[Union[Dict, str]],
        accumulated_knowledge: str | None = None,
        old_formatting: bool = False,
    ) -> str:
        """
        Synthesize accumulated knowledge into a final answer.

        Args:
            query: The original query
            sub_queries: List of sub-queries (for context)
            findings: List of findings strings or dictionaries from previous steps
            accumulated_knowledge: Optional pre-existing knowledge to incorporate
            old_formatting: Whether to use the old formatting approach

        Returns:
            str: Synthesized final answer content, or an "Error: ..." string
            describing why synthesis failed.
        """
        logger.info(f"synthesize_findings called with query: '{query}'")
        logger.info(
            f"sub_queries type: {type(sub_queries)}, length: {len(sub_queries)}"
        )
        logger.info(f"findings type: {type(findings)}, length: {len(findings)}")

        # Use provided accumulated_knowledge or derive it from the findings.
        if accumulated_knowledge is None:
            accumulated_knowledge = "\n\n".join(
                self._collect_finding_texts(findings)
            )

        if findings:
            logger.info(f"first finding type: {type(findings[0])}")
            if isinstance(findings[0], dict):
                logger.info(
                    f"first finding keys: {list(findings[0].keys()) if hasattr(findings[0], 'keys') else 'No keys'}"
                )
                if "content" in findings[0]:
                    logger.info(
                        f"first finding content type: {type(findings[0]['content'])}"
                    )
            elif isinstance(findings[0], str):
                logger.info(f"first finding string length: {len(findings[0])}")
                logger.info(f"first finding string preview: {findings[0][:100]}...")

        if old_formatting:
            # Normalize bare strings into finding dictionaries first.
            findings_list = []
            for i, item in enumerate(findings):
                if isinstance(item, str):
                    findings_list.append({"phase": f"Finding {i + 1}", "content": item})
                elif isinstance(item, dict):
                    findings_list.append(item)

            return format_findings(
                findings_list=findings_list,
                synthesized_content=accumulated_knowledge,
                questions_by_iteration=self.questions_by_iteration,
            )

        try:
            # Build the prompt knowledge from the finding contents.
            current_knowledge = "\n\n".join(self._collect_finding_texts(findings))

            # Rough estimate: 1 token ≈ 4 characters in English.
            estimated_tokens = len(current_knowledge) / 4
            max_safe_tokens = 12000  # Adjust based on your model's context window

            if estimated_tokens > max_safe_tokens:
                logger.warning(
                    f"Knowledge size may exceed model's capacity: ~{int(estimated_tokens)} tokens"
                )
                # Truncate if needed, keeping the beginning and end which are
                # often the most important parts. A more sophisticated
                # chunking strategy could replace this.
                if len(current_knowledge) > 24000:  # ~6000 tokens
                    first_part = current_knowledge[:12000]  # ~3000 tokens from start
                    last_part = current_knowledge[-12000:]  # ~3000 tokens from end
                    current_knowledge = f"{first_part}\n\n[...content truncated due to length...]\n\n{last_part}"
                    logger.info("Knowledge truncated to fit within token limits")

            prompt = f"""Use IEEE style citations [1], [2], etc. Never make up your own citations. Synthesize the following accumulated knowledge into a comprehensive answer for the original query.
Format the response with clear sections, citations, and a concise summary.

Original Query: {query}

Accumulated Knowledge:
{current_knowledge}

Sub-questions asked (for context):
{chr(10).join(f"- {sq}" for sq in sub_queries)}

Generate a well-structured, concise answer that:
1. Starts with a clear explanation of the most important points
2. Organizes information into logical sections with headers if needed
3. Maintains logical flow and prioritizes important information over minor details
4. Avoids repetition and unnecessary detail

Use IEEE style citations [1], [2], etc. Never make up your own citations.
"""

            logger.info(
                f"Synthesizing final answer. Query: '{query}'. Knowledge length: {len(current_knowledge)}. Prompt length: {len(prompt)}"
            )
            # Log first 500 chars of prompt for debugging context length issues
            logger.debug(f"Synthesis prompt (first 500 chars): {prompt[:500]}...")

            try:
                response = self._invoke_model_with_timeout(prompt, 120)
            except TimeoutError as timeout_error:
                logger.error(
                    f"LLM invocation timed out during synthesis for query '{query}': {timeout_error}",
                    exc_info=True,
                )
                return "Error: Final answer synthesis failed due to LLM timeout. Please check your LLM service or try with a smaller query scope."
            except Exception as invoke_error:
                return self._describe_invoke_error(query, invoke_error)

            # Handle different response types (string or object with content attribute)
            if hasattr(response, "content"):
                synthesized_content = response.content
            else:
                synthesized_content = str(response)

            logger.info(
                f"Successfully synthesized final answer for query: '{query}'"
            )
            # Return only the synthesized content from the LLM
            return synthesized_content

        except Exception as e:
            # Catch potential errors during prompt construction or logging itself
            logger.error(
                f"Error preparing or executing synthesis for query '{query}': {str(e)}",
                exc_info=True,
            )
            # Return a specific error message for synthesis failure
            return f"Error: Failed to synthesize final answer from knowledge. Details: {str(e)}"
|
@@ -0,0 +1 @@
|
|
1
|
+
# Search System Knowledge Package
|
@@ -0,0 +1,151 @@
|
|
1
|
+
"""
|
2
|
+
Base class for knowledge extraction and generation.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from abc import ABC, abstractmethod
|
7
|
+
from typing import List
|
8
|
+
|
9
|
+
from langchain_core.language_models.chat_models import BaseChatModel
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
class BaseKnowledgeGenerator(ABC):
|
15
|
+
"""Base class for generating knowledge from text."""
|
16
|
+
|
17
|
+
def __init__(self, model: BaseChatModel):
|
18
|
+
"""
|
19
|
+
Initialize the knowledge generator.
|
20
|
+
|
21
|
+
Args:
|
22
|
+
model: The language model to use
|
23
|
+
"""
|
24
|
+
self.model = model
|
25
|
+
|
26
|
+
@abstractmethod
|
27
|
+
def generate(self, query: str, context: str) -> str:
|
28
|
+
"""
|
29
|
+
Generate knowledge from the given query and context.
|
30
|
+
|
31
|
+
Args:
|
32
|
+
query: The query to generate knowledge for
|
33
|
+
context: Additional context for knowledge generation
|
34
|
+
|
35
|
+
Returns:
|
36
|
+
"""
|
37
|
+
pass
|
38
|
+
|
39
|
+
@abstractmethod
|
40
|
+
def generate_knowledge(
|
41
|
+
self,
|
42
|
+
query: str,
|
43
|
+
context: str = "",
|
44
|
+
current_knowledge: str = "",
|
45
|
+
questions: List[str] = None,
|
46
|
+
) -> str:
|
47
|
+
"""
|
48
|
+
Generate knowledge based on query and context.
|
49
|
+
|
50
|
+
Args:
|
51
|
+
query: The query to generate knowledge for
|
52
|
+
context: Additional context for knowledge generation
|
53
|
+
current_knowledge: Current accumulated knowledge
|
54
|
+
questions: List of questions to address
|
55
|
+
|
56
|
+
Returns:
|
57
|
+
str: Generated knowledge
|
58
|
+
"""
|
59
|
+
pass
|
60
|
+
|
61
|
+
@abstractmethod
|
62
|
+
def generate_sub_knowledge(self, sub_query: str, context: str = "") -> str:
|
63
|
+
"""
|
64
|
+
Generate knowledge for a sub-question.
|
65
|
+
|
66
|
+
Args:
|
67
|
+
sub_query: The sub-question to generate knowledge for
|
68
|
+
context: Additional context for knowledge generation
|
69
|
+
|
70
|
+
Returns:
|
71
|
+
str: Generated knowledge for the sub-question
|
72
|
+
"""
|
73
|
+
pass
|
74
|
+
|
75
|
+
@abstractmethod
|
76
|
+
def compress_knowledge(
|
77
|
+
self, current_knowledge: str, query: str, section_links: list, **kwargs
|
78
|
+
) -> str:
|
79
|
+
"""
|
80
|
+
Compress and summarize accumulated knowledge.
|
81
|
+
|
82
|
+
Args:
|
83
|
+
current_knowledge: The accumulated knowledge to compress
|
84
|
+
query: The original research query
|
85
|
+
section_links: List of source links
|
86
|
+
**kwargs: Additional arguments
|
87
|
+
|
88
|
+
Returns:
|
89
|
+
str: Compressed knowledge
|
90
|
+
"""
|
91
|
+
pass
|
92
|
+
|
93
|
+
@abstractmethod
|
94
|
+
def format_citations(self, links: List[str]) -> str:
|
95
|
+
"""
|
96
|
+
Format source links into citations.
|
97
|
+
|
98
|
+
Args:
|
99
|
+
links: List of source links
|
100
|
+
|
101
|
+
Returns:
|
102
|
+
str: Formatted citations
|
103
|
+
"""
|
104
|
+
pass
|
105
|
+
|
106
|
+
def _validate_knowledge(self, knowledge: str) -> bool:
|
107
|
+
"""
|
108
|
+
Validate the knowledge input.
|
109
|
+
|
110
|
+
Args:
|
111
|
+
knowledge: The knowledge to validate
|
112
|
+
|
113
|
+
Returns:
|
114
|
+
bool: True if knowledge is valid, False otherwise
|
115
|
+
"""
|
116
|
+
if not knowledge or not isinstance(knowledge, str):
|
117
|
+
logger.error("Invalid knowledge provided")
|
118
|
+
return False
|
119
|
+
return True
|
120
|
+
|
121
|
+
def _validate_links(self, links: List[str]) -> bool:
|
122
|
+
"""
|
123
|
+
Validate the source links.
|
124
|
+
|
125
|
+
Args:
|
126
|
+
links: List of source links to validate
|
127
|
+
|
128
|
+
Returns:
|
129
|
+
bool: True if links are valid, False otherwise
|
130
|
+
"""
|
131
|
+
if not isinstance(links, list):
|
132
|
+
logger.error("Invalid links format")
|
133
|
+
return False
|
134
|
+
if not all(isinstance(link, str) for link in links):
|
135
|
+
logger.error("Invalid link type in links list")
|
136
|
+
return False
|
137
|
+
return True
|
138
|
+
|
139
|
+
def _extract_key_points(self, knowledge: str) -> List[str]:
|
140
|
+
"""
|
141
|
+
Extract key points from knowledge.
|
142
|
+
|
143
|
+
Args:
|
144
|
+
knowledge: The knowledge to analyze
|
145
|
+
|
146
|
+
Returns:
|
147
|
+
List[str]: List of key points
|
148
|
+
"""
|
149
|
+
# This is a placeholder implementation
|
150
|
+
# Specific implementations should override this method
|
151
|
+
return knowledge.split("\n")
|