PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

local-deep-research 0.4.4py3-none-any.whl → 0.5.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py CHANGED Viewed

@@ -23,7 +23,11 @@ class RapidSearchStrategy(BaseSearchStrategy):
     """
     def __init__(
-        self, search=None, model=None, citation_handler=None, all_links_of_system=None
+        self,
+        search=None,
+        model=None,
+        citation_handler=None,
+        all_links_of_system=None,
     ):
         """Initialize with optional dependency injection for testing."""
         super().__init__(all_links_of_system=all_links_of_system)
@@ -90,7 +94,10 @@ class RapidSearchStrategy(BaseSearchStrategy):
                 self._update_progress(
                     f"Found {len(initial_results)} initial results",
                     15,
-                    {"phase": "search_complete", "result_count": len(initial_results)},
+                    {
+                        "phase": "search_complete",
+                        "result_count": len(initial_results),
+                    },
                 )
             # Extract snippets and links
@@ -123,7 +130,9 @@ class RapidSearchStrategy(BaseSearchStrategy):
         # Step 2: Generate a few follow-up questions (optional, can be skipped for ultimate speed)
         self._update_progress(
-            "Generating follow-up questions", 25, {"phase": "question_generation"}
+            "Generating follow-up questions",
+            25,
+            {"phase": "question_generation"},
         )
         questions = self.question_generator.generate_questions(
@@ -161,7 +170,10 @@ class RapidSearchStrategy(BaseSearchStrategy):
                 self._update_progress(
                     f"Found {len(search_results)} results for question: {question}",
                     int(question_progress + 5),
-                    {"phase": "search_complete", "result_count": len(search_results)},
+                    {
+                        "phase": "search_complete",
+                        "result_count": len(search_results),
+                    },
                 )
                 # Extract snippets only
@@ -195,7 +207,9 @@ class RapidSearchStrategy(BaseSearchStrategy):
         # Step 4: Perform a single final synthesis with all collected snippets using the citation handler
         self._update_progress(
-            "Synthesizing all collected information", 80, {"phase": "final_synthesis"}
+            "Synthesizing all collected information",
+            80,
+            {"phase": "final_synthesis"},
         )
         try:
@@ -203,7 +217,9 @@ class RapidSearchStrategy(BaseSearchStrategy):
             # First, we need a stub of current knowledge
             # Use the citation handler to analyze the results
-            result = self.citation_handler.analyze_initial(query, all_search_results)
+            result = self.citation_handler.analyze_initial(
+                query, all_search_results
+            )
             if result:
                 synthesized_content = result["content"]
@@ -224,12 +240,16 @@ class RapidSearchStrategy(BaseSearchStrategy):
                 )
                 # Format the findings with search questions and sources
-                formatted_findings = self.findings_repository.format_findings_to_text(
-                    findings, synthesized_content
+                formatted_findings = (
+                    self.findings_repository.format_findings_to_text(
+                        findings, synthesized_content
+                    )
                 )
                 # Also add to the repository
-                self.findings_repository.add_documents(result.get("documents", []))
+                self.findings_repository.add_documents(
+                    result.get("documents", [])
+                )
             else:
                 # Fallback if citation handler fails
                 synthesized_content = (

local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py ADDED Viewed

@@ -0,0 +1,492 @@
+"""
+Recursive Decomposition Strategy for hierarchical question answering.
+This strategy decomposes complex questions into subtasks, recursively solving
+each subtask and aggregating results to answer the original question.
+"""
+from typing import Any, Dict, List
+from langchain_core.language_models import BaseChatModel
+from loguru import logger
+from ...utilities.search_utilities import format_findings, remove_think_tags
+from ..findings.repository import FindingsRepository
+from ..questions.standard_question import StandardQuestionGenerator
+from .base_strategy import BaseSearchStrategy
+from .source_based_strategy import SourceBasedSearchStrategy
+class RecursiveDecompositionStrategy(BaseSearchStrategy):
+    """
+    A strategy that recursively decomposes complex questions into subtasks.
+    Each subtask is either solved directly via search or further decomposed,
+    creating a hierarchical problem-solving approach.
+    """
+    def __init__(
+        self,
+        model: BaseChatModel,
+        search: Any,
+        all_links_of_system: List[str],
+        max_recursion_depth: int = 5,
+        source_search_iterations: int = 2,
+        source_questions_per_iteration: int = 20,
+    ):
+        """Initialize the recursive decomposition strategy.
+        Args:
+            model: The language model to use
+            search: The search engine instance
+            all_links_of_system: List to store all encountered links
+            max_recursion_depth: Maximum recursion depth to prevent infinite loops
+            source_search_iterations: Iterations for source-based searches
+            source_questions_per_iteration: Questions per iteration for source-based searches
+        """
+        super().__init__(all_links_of_system)
+        self.model = model
+        self.search = search
+        self.max_recursion_depth = max_recursion_depth
+        self.source_search_iterations = source_search_iterations
+        self.source_questions_per_iteration = source_questions_per_iteration
+        self.question_generator = StandardQuestionGenerator(model)
+        self.findings_repository = FindingsRepository(model)
+        self.current_depth = 0
+        self.original_query = None  # Store the original query for context
+    def analyze_topic(self, query: str, recursion_depth: int = 0) -> Dict:
+        """Analyze a topic using recursive decomposition.
+        Args:
+            query: The research query to analyze
+            recursion_depth: Current recursion depth
+        Returns:
+            Dictionary containing analysis results
+        """
+        if recursion_depth >= self.max_recursion_depth:
+            logger.warning(
+                f"Max recursion depth {self.max_recursion_depth} reached"
+            )
+            return self._use_source_based_strategy(query)
+        # Initialize tracking at top level
+        if recursion_depth == 0:
+            self.all_links_of_system.clear()
+            self.questions_by_iteration = []
+            self.findings = []
+            self.original_query = query  # Store the original query
+            # Progress callback for UI
+            if self.progress_callback:
+                self.progress_callback(
+                    "Starting recursive decomposition analysis",
+                    1,
+                    {"phase": "init", "strategy": "recursive_decomposition"},
+                )
+        logger.info(f"Analyzing query at depth {recursion_depth}: {query}")
+        # Decide whether to decompose or search directly
+        decomposition_decision = self._decide_decomposition(query)
+        if decomposition_decision["should_decompose"]:
+            # Add decomposition decision to findings for UI visibility
+            self.findings.append(
+                {
+                    "phase": f"Decomposition Decision (Depth {recursion_depth})",
+                    "content": decomposition_decision["reasoning"],
+                    "subtasks": decomposition_decision["subtasks"],
+                    "timestamp": self._get_timestamp(),
+                }
+            )
+            if self.progress_callback:
+                self.progress_callback(
+                    f"Decomposing query into {len(decomposition_decision['subtasks'])} subtasks",
+                    10 + (recursion_depth * 10),
+                    {
+                        "phase": "decomposition",
+                        "depth": recursion_depth,
+                        "subtask_count": len(
+                            decomposition_decision["subtasks"]
+                        ),
+                    },
+                )
+            return self._handle_decomposition(
+                query, decomposition_decision, recursion_depth
+            )
+        else:
+            # Add search decision to findings
+            self.findings.append(
+                {
+                    "phase": f"Direct Search Decision (Depth {recursion_depth})",
+                    "content": f"Searching directly for: {query}",
+                    "reasoning": decomposition_decision["reasoning"],
+                    "timestamp": self._get_timestamp(),
+                }
+            )
+            return self._use_source_based_strategy(query)
+    def _decide_decomposition(self, query: str) -> Dict:
+        """Decide whether to decompose the query or search directly.
+        Args:
+            query: The query to analyze
+        Returns:
+            Dictionary with decomposition decision and subtasks if applicable
+        """
+        # Include original query context when needed
+        context_info = ""
+        if self.original_query and query != self.original_query:
+            context_info = f"\nOriginal research topic: {self.original_query}"
+        prompt = f"""Analyze this research query and decide whether to decompose it into subtasks or search directly.
+Query: {query}{context_info}
+Consider:
+1. Is this a compound question with multiple distinct parts?
+2. Does it require finding specific information that builds on other information?
+3. Would breaking it down lead to more focused, answerable questions?
+4. Can this be answered with a straightforward web search?
+Respond in this format:
+DECISION: [DECOMPOSE or SEARCH_DIRECTLY]
+REASONING: [Your reasoning in 1-2 sentences]
+If DECOMPOSE, provide:
+SUBTASKS:
+1. [First subtask - make it specific and searchable]
+2. [Second subtask - make it specific and searchable]
+...
+DEPENDENCIES: [Explain which subtasks depend on others, if any]
+When creating subtasks, ensure they maintain relevance to the original topic."""
+        response = self.model.invoke(prompt)
+        content = remove_think_tags(response.content)
+        # Log the decision for debugging
+        logger.info(f"Decomposition decision for '{query}': {content[:200]}...")
+        # Parse the response
+        lines = content.strip().split("\n")
+        decision = "SEARCH_DIRECTLY"
+        subtasks = []
+        dependencies = []
+        reasoning = ""
+        parsing_subtasks = False
+        for line in lines:
+            if line.startswith("DECISION:"):
+                decision = line.split(":", 1)[1].strip()
+            elif line.startswith("REASONING:"):
+                reasoning = line.split(":", 1)[1].strip()
+            elif line.startswith("SUBTASKS:"):
+                parsing_subtasks = True
+            elif line.startswith("DEPENDENCIES:"):
+                parsing_subtasks = False
+                dependencies_text = line.split(":", 1)[1].strip()
+                if dependencies_text:
+                    dependencies = [dependencies_text]
+            elif parsing_subtasks and line.strip():
+                # Handle numbered subtasks
+                if line[0].isdigit() and "." in line:
+                    subtasks.append(line.split(".", 1)[1].strip())
+                elif line.strip().startswith("-"):
+                    subtasks.append(line.strip()[1:].strip())
+        return {
+            "should_decompose": decision == "DECOMPOSE",
+            "reasoning": reasoning,
+            "subtasks": subtasks,
+            "dependencies": dependencies,
+        }
+    def _handle_decomposition(
+        self, query: str, decomposition: Dict, recursion_depth: int
+    ) -> Dict:
+        """Handle the decomposition of a query into subtasks.
+        Args:
+            query: The original query
+            decomposition: The decomposition decision with subtasks
+            recursion_depth: Current recursion depth
+        Returns:
+            Aggregated results from all subtasks
+        """
+        subtasks = decomposition["subtasks"]
+        subtask_results = []
+        # Process each subtask
+        for i, subtask in enumerate(subtasks):
+            logger.info(
+                f"Processing subtask {i + 1}/{len(subtasks)} at depth {recursion_depth}: {subtask}"
+            )
+            # Update progress for UI
+            progress = (
+                20 + (recursion_depth * 10) + ((i + 1) / len(subtasks) * 40)
+            )
+            if self.progress_callback:
+                self.progress_callback(
+                    f"Processing subtask {i + 1}/{len(subtasks)}: {subtask[:50]}...",
+                    int(progress),
+                    {
+                        "phase": "subtask_processing",
+                        "depth": recursion_depth,
+                        "current_subtask": i + 1,
+                        "total_subtasks": len(subtasks),
+                        "subtask_text": subtask,
+                    },
+                )
+            # Recursively analyze subtask
+            result = self.analyze_topic(subtask, recursion_depth + 1)
+            # Store subtask result with metadata
+            subtask_results.append(
+                {
+                    "subtask": subtask,
+                    "result": result,
+                    "depth": recursion_depth + 1,
+                    "index": i + 1,
+                }
+            )
+            # Add subtask completion to findings
+            self.findings.append(
+                {
+                    "phase": f"Subtask {i + 1} Complete (Depth {recursion_depth})",
+                    "content": f"Completed: {subtask}",
+                    "result_summary": result.get("current_knowledge", "")[:500],
+                    "timestamp": self._get_timestamp(),
+                }
+            )
+        # Aggregate results
+        aggregated_result = self._aggregate_subtask_results(
+            query, subtask_results, recursion_depth
+        )
+        return aggregated_result
+    def _aggregate_subtask_results(
+        self,
+        original_query: str,
+        subtask_results: List[Dict],
+        recursion_depth: int,
+    ) -> Dict:
+        """Aggregate results from multiple subtasks to answer the original query.
+        Args:
+            original_query: The original query
+            subtask_results: Results from all subtasks
+            recursion_depth: Current recursion depth
+        Returns:
+            Aggregated result answering the original query
+        """
+        # Update progress
+        if self.progress_callback:
+            self.progress_callback(
+                f"Synthesizing results from {len(subtask_results)} subtasks",
+                80 + (recursion_depth * 5),
+                {"phase": "synthesis", "depth": recursion_depth},
+            )
+        # Prepare context from subtask results
+        context_parts = []
+        all_links = []
+        all_findings = []
+        all_questions = []
+        for idx, result in enumerate(subtask_results):
+            subtask = result["subtask"]
+            subtask_result = result["result"]
+            # Extract key information
+            if "current_knowledge" in subtask_result:
+                context_parts.append(
+                    f"### Subtask {idx + 1}: {subtask}\n"
+                    f"Result: {subtask_result['current_knowledge']}\n"
+                )
+            if "all_links_of_system" in subtask_result:
+                all_links.extend(subtask_result["all_links_of_system"])
+            if "findings" in subtask_result:
+                all_findings.extend(subtask_result["findings"])
+            if "questions_by_iteration" in subtask_result:
+                all_questions.extend(subtask_result["questions_by_iteration"])
+        context = "\n".join(context_parts)
+        # Include master context for better synthesis
+        master_context_info = ""
+        if self.original_query and original_query != self.original_query:
+            master_context_info = (
+                f"\nMaster Research Topic: {self.original_query}"
+            )
+        # Use LLM to synthesize final answer
+        synthesis_prompt = f"""Based on the following subtask results, provide a comprehensive answer to the original query.
+Original Query: {original_query}{master_context_info}
+Subtask Results:
+{context}
+Synthesize the information to directly answer the original query. Be specific and reference the relevant information from the subtasks. Provide a clear, well-structured answer.
+"""
+        response = self.model.invoke(synthesis_prompt)
+        synthesized_answer = remove_think_tags(response.content)
+        # Add synthesis to findings
+        synthesis_finding = {
+            "phase": f"Final Synthesis (Depth {recursion_depth})",
+            "content": synthesized_answer,
+            "question": original_query,
+            "timestamp": self._get_timestamp(),
+        }
+        self.findings.append(synthesis_finding)
+        all_findings.append(synthesis_finding)
+        # Format findings for UI using the standard utility
+        # Remove duplicate links (can't use set() since links might be dicts)
+        unique_links = []
+        seen_links = set()
+        for link in all_links:
+            # If link is a dict, use its URL/link as the unique identifier
+            if isinstance(link, dict):
+                link_id = link.get("link") or link.get("url") or str(link)
+            else:
+                link_id = str(link)
+            if link_id not in seen_links:
+                seen_links.add(link_id)
+                unique_links.append(link)
+        # Convert questions list to the expected dictionary format
+        questions_dict = {}
+        for i, questions in enumerate(all_questions):
+            if isinstance(questions, list):
+                questions_dict[i + 1] = questions
+            elif isinstance(questions, dict):
+                # If it's already a dict, just merge it
+                questions_dict.update(questions)
+            elif questions is not None:
+                # For any other type, make it a single-item list
+                questions_dict[i + 1] = [str(questions)]
+        formatted_findings = format_findings(
+            all_findings,
+            synthesized_answer,  # Pass the synthesized answer as the second argument
+            questions_dict,  # Pass the questions dictionary as the third argument
+        )
+        # Compile final result matching source-based strategy format
+        result = {
+            "current_knowledge": synthesized_answer,
+            "formatted_findings": formatted_findings,
+            "findings": all_findings,
+            "iterations": 0,  # Set to 0 since we don't track iterations
+            "questions_by_iteration": questions_dict,
+            "all_links_of_system": unique_links,
+            "sources": unique_links,
+            "subtask_results": subtask_results,
+            "strategy": "recursive_decomposition",
+            "recursion_depth": recursion_depth,
+            "questions": {
+                "total": sum(
+                    len(q) if isinstance(q, (list, dict)) else 1
+                    for q in all_questions
+                    if q is not None
+                ),
+                "by_iteration": questions_dict,
+            },
+        }
+        # Final progress update
+        if self.progress_callback and recursion_depth == 0:
+            self.progress_callback(
+                "Analysis complete",
+                100,
+                {"phase": "complete", "strategy": "recursive_decomposition"},
+            )
+        return result
+    def _use_source_based_strategy(self, query: str) -> Dict:
+        """Fall back to source-based strategy for direct search.
+        Args:
+            query: The query to search
+        Returns:
+            Search results from source-based strategy
+        """
+        # If we have original query context and it's different from current query,
+        # create an enhanced query that includes the context
+        enhanced_query = query
+        if self.original_query and query != self.original_query:
+            enhanced_query = (
+                f"{query} (in the context of: {self.original_query})"
+            )
+            logger.info(
+                f"Enhanced query for source-based search: {enhanced_query}"
+            )
+        else:
+            logger.info(
+                f"Using source-based strategy for direct search: {query}"
+            )
+        # Create a source-based strategy instance with specified parameters
+        source_strategy = SourceBasedSearchStrategy(
+            model=self.model,
+            search=self.search,
+            all_links_of_system=self.all_links_of_system,
+            include_text_content=True,
+            use_cross_engine_filter=True,
+            use_atomic_facts=True,
+        )
+        # Configure with our parameters
+        source_strategy.max_iterations = self.source_search_iterations
+        source_strategy.questions_per_iteration = (
+            self.source_questions_per_iteration
+        )
+        # Copy our callback to maintain UI integration
+        if self.progress_callback:
+            source_strategy.set_progress_callback(self.progress_callback)
+        # Run the search with the enhanced query
+        results = source_strategy.analyze_topic(enhanced_query)
+        # Update our tracking to maintain consistency with UI
+        if hasattr(source_strategy, "questions_by_iteration"):
+            self.questions_by_iteration.extend(
+                source_strategy.questions_by_iteration
+            )
+        # Ensure results have the fields the UI expects
+        if "findings" not in results:
+            results["findings"] = []
+        if "questions_by_iteration" not in results:
+            results["questions_by_iteration"] = self.questions_by_iteration
+        return results
+    def _get_timestamp(self) -> str:
+        """Get current timestamp for findings."""
+        from datetime import datetime
+        return datetime.utcnow().isoformat()

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

local-deep-research 0.4.4py3-none-any.whl → 0.5.0py3-none-any.whl