local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
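For code that imports these modules directly, the listing above implies several breaking path changes: the single `config.py` module became a `config` package (`config_files.py`, plus new `llm_config.py` and `search_config.py`), the misspelled `utilties` package was renamed to `utilities`, and `defaults/llm_config.py` was deleted. A minimal before/after import sketch follows; the 0.1.26 paths are inferred from the deleted files and may not match the old public API exactly:

```python
# Import-path changes implied by this diff (sketch only; the pre-0.2.2
# locations are inferred from the deleted files, not verified).

# 0.1.26 -- old locations, now renamed or removed:
# from local_deep_research.defaults.llm_config import get_llm    # assumed old home of LLM config
# from local_deep_research.utilties.search_utilities import ...  # note the "utilties" typo

# 0.2.2 -- new locations introduced by this release:
from local_deep_research.config.llm_config import get_llm
from local_deep_research.config.search_config import get_search
from local_deep_research.utilities.search_utilities import (
    extract_links_from_search_results,
)
```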
--- /dev/null
+++ b/local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py
@@ -0,0 +1,312 @@
+"""
+Parallel search strategy implementation for maximum search speed.
+"""
+
+import concurrent.futures
+import logging
+from typing import Dict
+
+from ...citation_handler import CitationHandler
+from ...config.llm_config import get_llm
+from ...config.search_config import get_search
+from ...utilities.db_utils import get_db_setting
+from ...utilities.search_utilities import extract_links_from_search_results
+from ..filters.cross_engine_filter import CrossEngineFilter
+from ..findings.repository import FindingsRepository
+from ..questions.standard_question import StandardQuestionGenerator
+from .base_strategy import BaseSearchStrategy
+
+logger = logging.getLogger(__name__)
+
+
+class ParallelSearchStrategy(BaseSearchStrategy):
+    """
+    Parallel search strategy that generates questions and runs all searches
+    simultaneously for maximum speed.
+    """
+
+    def __init__(
+        self,
+        search=None,
+        model=None,
+        citation_handler=None,
+        include_text_content: bool = True,
+        use_cross_engine_filter: bool = True,
+        filter_reorder: bool = True,
+        filter_reindex: bool = True,
+        filter_max_results: int = 20,
+    ):
+        """Initialize with optional dependency injection for testing.
+
+        Args:
+            search: Optional search engine instance
+            model: Optional LLM model instance
+            citation_handler: Optional citation handler instance
+            include_text_content: If False, only includes metadata and links in search results
+            use_cross_engine_filter: If True, filter search results across engines
+            filter_reorder: Whether to reorder results by relevance
+            filter_reindex: Whether to update result indices after filtering
+            filter_max_results: Maximum number of results to keep after filtering
+        """
+        super().__init__()
+        self.search = search or get_search()
+        self.model = model or get_llm()
+        self.progress_callback = None
+        self.all_links_of_system = list()
+        self.questions_by_iteration = {}
+        self.include_text_content = include_text_content
+        self.use_cross_engine_filter = use_cross_engine_filter
+        self.filter_reorder = filter_reorder
+        self.filter_reindex = filter_reindex
+
+        # Initialize the cross-engine filter
+        self.cross_engine_filter = CrossEngineFilter(
+            model=self.model,
+            max_results=filter_max_results,
+            default_reorder=filter_reorder,
+            default_reindex=filter_reindex,
+        )
+
+        # Set include_full_content on the search engine if it supports it
+        if hasattr(self.search, "include_full_content"):
+            self.search.include_full_content = include_text_content
+
+        # Use provided citation_handler or create one
+        self.citation_handler = citation_handler or CitationHandler(self.model)
+
+        # Initialize components
+        self.question_generator = StandardQuestionGenerator(self.model)
+        self.findings_repository = FindingsRepository(self.model)
+
+    def analyze_topic(self, query: str) -> Dict:
+        """
+        Parallel implementation that generates questions and searches all at once.
+
+        Args:
+            query: The research query to analyze
+        """
+        logger.info(f"Starting parallel research on topic: {query}")
+
+        findings = []
+        all_search_results = []
+
+        self._update_progress(
+            "Initializing parallel research",
+            5,
+            {
+                "phase": "init",
+                "strategy": "parallel",
+                "include_text_content": self.include_text_content,
+            },
+        )
+
+        # Check search engine
+        if not self._validate_search_engine():
+            return {
+                "findings": [],
+                "iterations": 0,
+                "questions": {},
+                "formatted_findings": "Error: Unable to conduct research without a search engine.",
+                "current_knowledge": "",
+                "error": "No search engine available",
+            }
+
+        try:
+            # Step 1: Generate questions first
+            self._update_progress(
+                "Generating search questions", 10, {"phase": "question_generation"}
+            )
+
+            # Generate 3 additional questions (plus the main query = 4 total)
+            questions = self.question_generator.generate_questions(
+                current_knowledge="",  # No knowledge accumulation
+                query=query,
+                questions_per_iteration=int(
+                    get_db_setting("search.questions_per_iteration")
+                ),  # 3 additional questions
+                questions_by_iteration={},
+            )
+
+            # Add the original query as the first question
+            all_questions = [query] + questions
+
+            # Store in questions_by_iteration
+            self.questions_by_iteration[0] = questions
+            logger.info(f"Generated questions: {questions}")
+
+            # Step 2: Run all searches in parallel
+            self._update_progress(
+                "Running parallel searches for all questions",
+                20,
+                {"phase": "parallel_search"},
+            )
+
+            # Function for thread pool
+            def search_question(q):
+                try:
+                    result = self.search.run(q)
+                    return {"question": q, "results": result or []}
+                except Exception as e:
+                    logger.error(f"Error searching for '{q}': {str(e)}")
+                    return {"question": q, "results": [], "error": str(e)}
+
+            # Run searches in parallel
+            with concurrent.futures.ThreadPoolExecutor(
+                max_workers=len(all_questions)
+            ) as executor:
+                futures = [executor.submit(search_question, q) for q in all_questions]
+                all_search_dict = {}
+
+                # Process results as they complete
+                for i, future in enumerate(concurrent.futures.as_completed(futures)):
+                    result_dict = future.result()
+                    question = result_dict["question"]
+                    search_results = result_dict["results"]
+                    all_search_dict[question] = search_results
+
+                    self._update_progress(
+                        f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                        20 + ((i + 1) / len(all_questions) * 40),
+                        {
+                            "phase": "search_complete",
+                            "result_count": len(search_results),
+                            "question": question,
+                        },
+                    )
+
+                    # Extract and save links
+                    if not self.use_cross_engine_filter:
+                        links = extract_links_from_search_results(search_results)
+                        self.all_links_of_system.extend(links)
+                    all_search_results.extend(search_results)
+
+            # Step 3: Analysis of collected search results
+            self._update_progress(
+                "Analyzing all collected search results",
+                70,
+                {"phase": "final_analysis"},
+            )
+            if self.use_cross_engine_filter:
+                self._update_progress(
+                    "Filtering search results across engines",
+                    65,
+                    {"phase": "cross_engine_filtering"},
+                )
+
+                # Get the current link count (for indexing)
+                existing_link_count = len(self.all_links_of_system)
+
+                # Filter the search results
+                filtered_search_results = self.cross_engine_filter.filter_results(
+                    all_search_results,
+                    query,
+                    reorder=self.filter_reorder,
+                    reindex=self.filter_reindex,
+                    start_index=existing_link_count,  # Start indexing after existing links
+                )
+
+                links = extract_links_from_search_results(filtered_search_results)
+                self.all_links_of_system.extend(links)
+
+                self._update_progress(
+                    f"Filtered from {len(all_search_results)} to {len(filtered_search_results)} results",
+                    70,
+                    {
+                        "phase": "filtering_complete",
+                        "links_count": len(self.all_links_of_system),
+                    },
+                )
+
+                # Use filtered results for analysis
+                all_search_results = filtered_search_results
+
+            # Now when we use the citation handler, ensure we're using all_search_results:
+            if self.include_text_content:
+                # Use citation handler for analysis of all results together
+                citation_result = self.citation_handler.analyze_initial(
+                    query, all_search_results
+                )
+
+                if citation_result:
+                    synthesized_content = citation_result["content"]
+                    finding = {
+                        "phase": "Final synthesis",
+                        "content": synthesized_content,
+                        "question": query,
+                        "search_results": all_search_results,
+                        "documents": citation_result.get("documents", []),
+                    }
+                    findings.append(finding)
+
+                    # Transfer questions to repository
+                    self.findings_repository.set_questions_by_iteration(
+                        self.questions_by_iteration
+                    )
+
+                    # Format findings
+                    formatted_findings = self.findings_repository.format_findings_to_text(
+                        findings, synthesized_content
+                    )
+
+                    # Add documents to repository
+                    if "documents" in citation_result:
+                        self.findings_repository.add_documents(citation_result["documents"])
+                else:
+                    synthesized_content = "No relevant results found."
+                    formatted_findings = synthesized_content
+                    finding = {
+                        "phase": "Error",
+                        "content": "No relevant results found.",
+                        "question": query,
+                        "search_results": all_search_results,
+                        "documents": [],
+                    }
+                    findings.append(finding)
+            else:
+                # Skip LLM analysis, just format the raw search results
+                synthesized_content = "LLM analysis skipped"
+                finding = {
+                    "phase": "Raw search results",
+                    "content": "LLM analysis was skipped. Displaying raw search results with links.",
+                    "question": query,
+                    "search_results": all_search_results,
+                    "documents": [],
+                }
+                findings.append(finding)
+
+                # Transfer questions to repository
+                self.findings_repository.set_questions_by_iteration(
+                    self.questions_by_iteration
+                )
+
+                # Format findings without synthesis
+                formatted_findings = self.findings_repository.format_findings_to_text(
+                    findings, "Raw search results (LLM analysis skipped)"
+                )
+
+        except Exception as e:
+            import traceback
+
+            error_msg = f"Error in research process: {str(e)}"
+            logger.error(error_msg)
+            logger.error(traceback.format_exc())
+            synthesized_content = f"Error: {str(e)}"
+            formatted_findings = f"Error: {str(e)}"
+            finding = {
+                "phase": "Error",
+                "content": synthesized_content,
+                "question": query,
+                "search_results": [],
+                "documents": [],
+            }
+            findings.append(finding)
+
+        self._update_progress("Research complete", 100, {"phase": "complete"})
+
+        return {
+            "findings": findings,
+            "iterations": 1,
+            "questions": self.questions_by_iteration,
+            "formatted_findings": formatted_findings,
+            "current_knowledge": synthesized_content,
+        }
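The core of `analyze_topic` above is a thread-pool fan-out: the main query and every generated question are submitted at once, and results are consumed with `as_completed` so progress can be reported as each search finishes. A self-contained sketch of that pattern, with a hypothetical `stub_search` standing in for `self.search.run()`:

```python
# Isolated sketch of the fan-out at the heart of ParallelSearchStrategy.
# stub_search is a stand-in for the real search engine's run() method.
import concurrent.futures


def stub_search(q: str) -> list[dict]:
    # Shape mirrors the result dicts the strategy expects (title/link/snippet).
    return [{"title": f"Result for {q!r}", "link": "https://example.com", "snippet": "..."}]


def search_all(questions: list[str]) -> dict[str, list[dict]]:
    results: dict[str, list[dict]] = {}
    # One worker per question, exactly as the strategy sizes its pool.
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(questions)) as executor:
        future_to_q = {executor.submit(stub_search, q): q for q in questions}
        for i, future in enumerate(concurrent.futures.as_completed(future_to_q)):
            q = future_to_q[future]
            results[q] = future.result()
            print(f"Completed search {i + 1} of {len(questions)}: {q[:30]}")
    return results


if __name__ == "__main__":
    search_all(["main query", "follow-up question 1", "follow-up question 2"])
```

Note that sizing the pool to `len(all_questions)` means every search runs concurrently; against a rate-limited engine, a fixed cap on workers would be the more conservative choice.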
--- /dev/null
+++ b/local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py
@@ -0,0 +1,270 @@
+"""
+RapidSearch strategy implementation.
+"""
+
+import logging
+from typing import Dict
+
+from ...citation_handler import CitationHandler
+from ...config.llm_config import get_llm
+from ...config.search_config import get_search
+from ...utilities.search_utilities import extract_links_from_search_results
+from ..findings.repository import FindingsRepository
+from ..knowledge.standard_knowledge import StandardKnowledge
+from ..questions.standard_question import StandardQuestionGenerator
+from .base_strategy import BaseSearchStrategy
+
+logger = logging.getLogger(__name__)
+
+
+class RapidSearchStrategy(BaseSearchStrategy):
+    """
+    Rapid search strategy that only analyzes snippets and performs
+    a single synthesis step at the end, optimized for speed.
+    """
+
+    def __init__(self, search=None, model=None, citation_handler=None):
+        """Initialize with optional dependency injection for testing."""
+        super().__init__()
+        self.search = search or get_search()
+        self.model = model or get_llm()
+        self.progress_callback = None
+        self.all_links_of_system = list()
+        self.questions_by_iteration = {}
+
+        # Use provided citation_handler or create one
+        self.citation_handler = citation_handler or CitationHandler(self.model)
+
+        # Initialize components
+        self.question_generator = StandardQuestionGenerator(self.model)
+        self.knowledge_generator = StandardKnowledge(self.model)
+        self.findings_repository = FindingsRepository(self.model)
+
+    def analyze_topic(self, query: str) -> Dict:
+        """
+        RapidSearch implementation that collects snippets, avoids intermediate
+        synthesis, and only performs a final synthesis at the end.
+        """
+        logger.info(f"Starting rapid research on topic: {query}")
+
+        findings = []
+        all_search_results = []
+        collected_snippets = []
+        section_links = list()
+
+        self._update_progress(
+            "Initializing rapid research system",
+            5,
+            {"phase": "init", "strategy": "rapid"},
+        )
+
+        # Check if search engine is available
+        if not self._validate_search_engine():
+            return {
+                "findings": [],
+                "iterations": 0,
+                "questions": {},
+                "formatted_findings": "Error: Unable to conduct research without a search engine.",
+                "current_knowledge": "",
+                "error": "No search engine available",
+            }
+
+        # Step 1: Initial search for the main query
+        self._update_progress(
+            "Performing initial search for main query",
+            10,
+            {"phase": "search", "iteration": 1},
+        )
+
+        try:
+            initial_results = self.search.run(query)
+
+            if not initial_results:
+                self._update_progress(
+                    "No initial results found",
+                    15,
+                    {"phase": "search_complete", "result_count": 0},
+                )
+                initial_results = []
+            else:
+                self._update_progress(
+                    f"Found {len(initial_results)} initial results",
+                    15,
+                    {"phase": "search_complete", "result_count": len(initial_results)},
+                )
+
+            # Extract snippets and links
+            for result in initial_results:
+                if "snippet" in result:
+                    collected_snippets.append(
+                        {
+                            "text": result["snippet"],
+                            "source": result.get("title", "Unknown Source"),
+                            "link": result.get("link", ""),
+                            "query": query,
+                        }
+                    )
+
+            # Extract and save links
+            initial_links = extract_links_from_search_results(initial_results)
+            self.all_links_of_system.extend(initial_links)
+            section_links.extend(initial_links)
+            all_search_results.extend(initial_results)
+
+            # No findings added here - just collecting data
+
+        except Exception as e:
+            error_msg = f"Error during initial search: {str(e)}"
+            logger.error(f"SEARCH ERROR: {error_msg}")
+            self._update_progress(
+                error_msg, 15, {"phase": "search_error", "error": str(e)}
+            )
+            initial_results = []
+
+        # Step 2: Generate a few follow-up questions (optional, can be skipped for ultimate speed)
+        self._update_progress(
+            "Generating follow-up questions", 25, {"phase": "question_generation"}
+        )
+
+        questions = self.question_generator.generate_questions(
+            current_knowledge="",  # No knowledge accumulation in rapid mode
+            query=query,
+            questions_per_iteration=3,  # Fewer questions for speed
+            questions_by_iteration={},
+        )
+
+        self.questions_by_iteration[0] = questions
+        logger.info(f"Generated questions: {questions}")
+
+        # Step 3: Process follow-up questions
+        question_count = len(questions)
+        for q_idx, question in enumerate(questions):
+            question_progress = 30 + ((q_idx + 1) / question_count * 40)
+
+            self._update_progress(
+                f"Searching for: {question}",
+                int(question_progress),
+                {"phase": "search", "question_index": q_idx + 1},
+            )
+
+            try:
+                search_results = self.search.run(question)
+
+                if not search_results:
+                    self._update_progress(
+                        f"No results found for question: {question}",
+                        int(question_progress + 2),
+                        {"phase": "search_complete", "result_count": 0},
+                    )
+                    continue
+
+                self._update_progress(
+                    f"Found {len(search_results)} results for question: {question}",
+                    int(question_progress + 5),
+                    {"phase": "search_complete", "result_count": len(search_results)},
+                )
+
+                # Extract snippets only
+                for result in search_results:
+                    if "snippet" in result:
+                        collected_snippets.append(
+                            {
+                                "text": result["snippet"],
+                                "source": result.get("title", "Unknown Source"),
+                                "link": result.get("link", ""),
+                                "query": question,
+                            }
+                        )
+
+                # Extract and save links
+                links = extract_links_from_search_results(search_results)
+                self.all_links_of_system.extend(links)
+                section_links.extend(links)
+                all_search_results.extend(search_results)
+
+                # No findings added here - just collecting data
+
+            except Exception as e:
+                error_msg = f"Error during search: {str(e)}"
+                logger.error(f"SEARCH ERROR: {error_msg}")
+                self._update_progress(
+                    error_msg,
+                    int(question_progress + 2),
+                    {"phase": "search_error", "error": str(e)},
+                )
+
+        # Step 4: Perform a single final synthesis with all collected snippets using the citation handler
+        self._update_progress(
+            "Synthesizing all collected information", 80, {"phase": "final_synthesis"}
+        )
+
+        try:
+            # Use citation handler for the final analysis
+            # First, we need a stub of current knowledge
+
+            # Use the citation handler to analyze the results
+            result = self.citation_handler.analyze_initial(query, all_search_results)
+
+            if result:
+                synthesized_content = result["content"]
+
+                # Create a synthesis finding
+                finding = {
+                    "phase": "Final synthesis",
+                    "content": synthesized_content,
+                    "question": query,
+                    "search_results": all_search_results,
+                    "documents": result.get("documents", []),
+                }
+                findings.append(finding)
+
+                # Transfer questions to the repository
+                self.findings_repository.set_questions_by_iteration(
+                    self.questions_by_iteration
+                )
+
+                # Format the findings with search questions and sources
+                formatted_findings = self.findings_repository.format_findings_to_text(
+                    findings, synthesized_content
+                )
+
+                # Also add to the repository
+                self.findings_repository.add_documents(result.get("documents", []))
+            else:
+                # Fallback if citation handler fails
+                synthesized_content = (
+                    "Error: Failed to synthesize results with citation handler."
+                )
+                formatted_findings = synthesized_content
+                finding = {
+                    "phase": "Error",
+                    "content": synthesized_content,
+                    "question": query,
+                    "search_results": all_search_results,
+                    "documents": [],
+                }
+                findings.append(finding)
+
+        except Exception as e:
+            error_msg = f"Error synthesizing final answer: {str(e)}"
+            logger.error(error_msg)
+            synthesized_content = f"Error generating synthesis: {str(e)}"
+            formatted_findings = f"Error: {str(e)}"
+            finding = {
+                "phase": "Error",
+                "content": synthesized_content,
+                "question": query,
+                "search_results": all_search_results,
+                "documents": [],
+            }
+            findings.append(finding)
+
+        self._update_progress("Research complete", 100, {"phase": "complete"})
+
+        return {
+            "findings": findings,
+            "iterations": 1,  # Always 1 iteration in rapid mode
+            "questions": self.questions_by_iteration,
+            "formatted_findings": formatted_findings,
+            "current_knowledge": synthesized_content,
+        }
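`RapidSearchStrategy` gets its speed from two choices visible above: it keeps only snippet-level data (`text`, `source`, `link`, `query`) for every result, and it defers all LLM work to a single `analyze_initial` call at the end. A minimal sketch of the snippet-collection step, isolated from the class (the sample results are fabricated for illustration):

```python
# Isolated sketch of RapidSearchStrategy's snippet collection: keep only the
# snippet text plus minimal provenance, tagged with the query that produced it.
def collect_snippets(results: list[dict], query: str) -> list[dict]:
    return [
        {
            "text": r["snippet"],
            "source": r.get("title", "Unknown Source"),
            "link": r.get("link", ""),
            "query": query,
        }
        for r in results
        if "snippet" in r  # results without a snippet are skipped entirely
    ]


raw = [
    {"title": "A", "link": "https://example.com/a", "snippet": "alpha"},
    {"title": "B", "link": "https://example.com/b"},  # no snippet -> dropped
]
print(collect_snippets(raw, "example query"))
```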