local-deep-research 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
- local_deep_research/__init__.py +1 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +5 -1
- local_deep_research/advanced_search_system/strategies/base_strategy.py +5 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +23 -16
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +273 -144
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +4 -3
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +402 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +8 -4
- local_deep_research/api/research_functions.py +0 -46
- local_deep_research/citation_handler.py +16 -20
- local_deep_research/config/llm_config.py +25 -68
- local_deep_research/config/search_config.py +8 -21
- local_deep_research/defaults/default_settings.json +3814 -0
- local_deep_research/search_system.py +46 -32
- local_deep_research/utilities/db_utils.py +22 -3
- local_deep_research/utilities/search_utilities.py +10 -7
- local_deep_research/web/app.py +3 -23
- local_deep_research/web/app_factory.py +1 -25
- local_deep_research/web/database/migrations.py +20 -418
- local_deep_research/web/routes/settings_routes.py +75 -364
- local_deep_research/web/services/research_service.py +47 -43
- local_deep_research/web/services/settings_manager.py +108 -315
- local_deep_research/web/services/settings_service.py +3 -56
- local_deep_research/web/static/js/components/research.js +1 -1
- local_deep_research/web/static/js/components/settings.js +16 -4
- local_deep_research/web/static/js/research_form.js +106 -0
- local_deep_research/web/templates/pages/research.html +3 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +13 -18
- local_deep_research/web_search_engines/engines/search_engine_local.py +11 -2
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +7 -11
- local_deep_research/web_search_engines/search_engine_factory.py +12 -64
- local_deep_research/web_search_engines/search_engines_config.py +123 -64
- {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/METADATA +16 -1
- {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/RECORD +37 -38
- local_deep_research/config/config_files.py +0 -245
- local_deep_research/defaults/local_collections.toml +0 -53
- local_deep_research/defaults/main.toml +0 -80
- local_deep_research/defaults/search_engines.toml +0 -291
- {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/__init__.py
CHANGED
local_deep_research/advanced_search_system/filters/cross_engine_filter.py
CHANGED
@@ -6,6 +6,7 @@ import json
 import logging
 from typing import Dict, List
 
+from ...utilities.db_utils import get_db_setting
 from ...utilities.search_utilities import remove_think_tags
 from .base_filter import BaseFilter
 
@@ -16,7 +17,7 @@ class CrossEngineFilter(BaseFilter):
     """Filter that ranks and filters results from multiple search engines."""
 
     def __init__(
-        self, model, max_results=
+        self, model, max_results=None, default_reorder=True, default_reindex=True
     ):
         """
         Initialize the cross-engine filter.
@@ -28,6 +29,9 @@ class CrossEngineFilter(BaseFilter):
             default_reindex: Default setting for reindexing results after filtering
         """
         super().__init__(model)
+        # Get max_results from database settings if not provided
+        if max_results is None:
+            max_results = get_db_setting("search.cross_engine_max_results", 100)
         self.max_results = max_results
         self.default_reorder = default_reorder
         self.default_reindex = default_reindex
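The new constructor treats `None` as a sentinel meaning "defer to the user's saved setting", so an explicit argument still wins over the database value. A minimal sketch of the pattern, assuming only that `get_db_setting(key, default)` returns the stored value or the supplied default (the in-memory `_FAKE_DB` stub below stands in for the real helper in `local_deep_research.utilities.db_utils`):

_FAKE_DB = {"search.cross_engine_max_results": 50}

def get_db_setting(key, default=None):
    # Stand-in: the real helper reads from the settings database.
    return _FAKE_DB.get(key, default)

class CrossEngineFilter:
    def __init__(self, model, max_results=None):
        # None means "look up the saved setting"; 100 is the last-resort default.
        if max_results is None:
            max_results = get_db_setting("search.cross_engine_max_results", 100)
        self.model = model
        self.max_results = max_results

print(CrossEngineFilter(model=None).max_results)                  # 50, from settings
print(CrossEngineFilter(model=None, max_results=10).max_results)  # 10, explicit wins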
local_deep_research/advanced_search_system/strategies/base_strategy.py
CHANGED
@@ -13,11 +13,14 @@ logger = logging.getLogger(__name__)
 class BaseSearchStrategy(ABC):
     """Abstract base class for all search strategies."""
 
-    def __init__(self):
+    def __init__(self, all_links_of_system=None):
         """Initialize the base strategy with common attributes."""
         self.progress_callback = None
         self.questions_by_iteration = {}
-
+        # Create a new list if None is provided (avoiding mutable default argument)
+        self.all_links_of_system = (
+            all_links_of_system if all_links_of_system is not None else []
+        )
 
     def set_progress_callback(self, callback: Callable[[str, int, dict], None]) -> None:
         """Set a callback function to receive progress updates."""
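The comment in this hunk names the pitfall directly: a mutable default such as `def __init__(self, links=[])` is evaluated once, when the function is defined, so every instance that omits the argument shares the same list. A short demonstration of the bug the `None` sentinel avoids:

class Broken:
    def __init__(self, links=[]):  # one list object, shared by all instances
        self.links = links

a, b = Broken(), Broken()
a.links.append("https://example.com")
print(b.links)  # ['https://example.com'] -- b was never touched

class Fixed:
    def __init__(self, links=None):
        # A fresh list per instance unless the caller supplies one.
        self.links = links if links is not None else []

c, d = Fixed(), Fixed()
c.links.append("https://example.com")
print(d.links)  # []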
local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py
CHANGED
@@ -7,10 +7,7 @@ import logging
 from datetime import datetime
 from typing import Dict, List
 
-from langchain_core.language_models import BaseLLM
-
 from ...citation_handler import CitationHandler
-from ...config.config_files import settings
 from ...config.llm_config import get_llm
 from ...config.search_config import get_search
 from ...utilities.db_utils import get_db_setting
@@ -27,18 +24,34 @@ class IterDRAGStrategy(BaseSearchStrategy):
     """IterDRAG strategy that breaks queries into sub-queries."""
 
     def __init__(
-        self,
+        self,
+        search=None,
+        model=None,
+        max_iterations=3,
+        subqueries_per_iteration=2,
+        all_links_of_system=None,
     ):
-        """Initialize the strategy with
-
-
+        """Initialize the IterDRAG strategy with search and LLM.
+
+        Args:
+            search: Search engine to use for web queries
+            model: LLM to use for text generation and reasoning
+            max_iterations: Maximum number of iterations to run
+            subqueries_per_iteration: Number of sub-queries to generate per iteration
+            all_links_of_system: Optional list of links to initialize with
+        """
+        super().__init__(all_links_of_system=all_links_of_system)
         self.search = search or get_search()
+        self.model = model or get_llm()
+        self.max_iterations = max_iterations
+        self.subqueries_per_iteration = subqueries_per_iteration
+
+        # Initialize progress callback
         self.progress_callback = None
-        self.all_links_of_system = list()
         self.questions_by_iteration = {}
 
         # Use provided citation_handler or create one
-        self.citation_handler =
+        self.citation_handler = CitationHandler(self.model)
 
         # Initialize components
         self.question_generator = DecompositionQuestionGenerator(self.model)
@@ -396,13 +409,7 @@ Please try again with a different query or contact support.
         """
 
         # Compress knowledge if needed
-        if (
-            get_db_setting(
-                "general.knowledge_accumulation",
-                settings.general.knowledge_accumulation,
-            )
-            == "ITERATION"
-        ):
+        if get_db_setting("general.knowledge_accumulation", "ITERATION") == "ITERATION":
             try:
                 self._update_progress(
                     "Compressing knowledge", 90, {"phase": "knowledge_compression"}
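Two things stand out here. First, the constructor now follows the same `search or get_search()` dependency-injection idiom as the other strategies: production code resolves collaborators from config, while tests can pass fakes and never touch the database or network. Second, the knowledge-accumulation check now falls back to the literal default "ITERATION" instead of reading the removed TOML-backed `settings` object. A self-contained sketch of the injection idiom (`Strategy`, `FakeSearch`, and the stub factories are illustrative, not package API):

def get_search():
    raise RuntimeError("would resolve a real engine from config")

def get_llm():
    raise RuntimeError("would resolve a real LLM from config")

class Strategy:
    def __init__(self, search=None, model=None):
        # `or` is safe here: real engine/LLM objects are always truthy.
        self.search = search or get_search()
        self.model = model or get_llm()

class FakeSearch:
    def run(self, query):
        return [{"title": "stub", "link": "https://example.com"}]

s = Strategy(search=FakeSearch(), model=object())  # no config or DB needed
print(s.search.run("anything"))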
local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py
CHANGED
@@ -34,7 +34,8 @@ class ParallelSearchStrategy(BaseSearchStrategy):
         use_cross_engine_filter: bool = True,
         filter_reorder: bool = True,
         filter_reindex: bool = True,
-
+        cross_engine_max_results: int = None,
+        all_links_of_system=None,
     ):
         """Initialize with optional dependency injection for testing.
 
@@ -46,23 +47,29 @@ class ParallelSearchStrategy(BaseSearchStrategy):
             use_cross_engine_filter: If True, filter search results across engines
             filter_reorder: Whether to reorder results by relevance
             filter_reindex: Whether to update result indices after filtering
-
+            cross_engine_max_results: Maximum number of results to keep after cross-engine filtering
+            all_links_of_system: Optional list of links to initialize with
         """
-        super().__init__()
+        super().__init__(all_links_of_system=all_links_of_system)
         self.search = search or get_search()
         self.model = model or get_llm()
         self.progress_callback = None
-        self.all_links_of_system = list()
         self.questions_by_iteration = {}
         self.include_text_content = include_text_content
         self.use_cross_engine_filter = use_cross_engine_filter
         self.filter_reorder = filter_reorder
         self.filter_reindex = filter_reindex
 
+        # Get max_filtered_results from database if not provided
+        if cross_engine_max_results is None:
+            cross_engine_max_results = get_db_setting(
+                "search.cross_engine_max_results", 100
+            )
+
         # Initialize the cross-engine filter
         self.cross_engine_filter = CrossEngineFilter(
             model=self.model,
-            max_results=
+            max_results=cross_engine_max_results,
             default_reorder=filter_reorder,
             default_reindex=filter_reindex,
         )
@@ -80,7 +87,7 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
     def analyze_topic(self, query: str) -> Dict:
         """
-
+        Analyze a topic using parallel search, supporting multiple iterations.
 
         Args:
             query: The research query to analyze
@@ -89,6 +96,11 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
         findings = []
         all_search_results = []
+        current_knowledge = ""
+
+        # Track all search results across iterations
+        self.all_links_of_system = list()
+        self.questions_by_iteration = {}
 
         self._update_progress(
             "Initializing parallel research",
@@ -105,184 +117,301 @@ class ParallelSearchStrategy(BaseSearchStrategy):
             return {
                 "findings": [],
                 "iterations": 0,
-                "
+                "questions_by_iteration": {},
                 "formatted_findings": "Error: Unable to conduct research without a search engine.",
                 "current_knowledge": "",
                 "error": "No search engine available",
             }
 
+        # Determine number of iterations to run
+        iterations_to_run = get_db_setting("search.iterations")
+        logger.debug("Selected amount of iterations: " + str(iterations_to_run))
+        iterations_to_run = int(iterations_to_run)
         try:
-            #
-
-
-            )
+            # Run each iteration
+            for iteration in range(1, iterations_to_run + 1):
+                iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
 
-
-
-
-
-
-
-
-
-
+                self._update_progress(
+                    f"Starting iteration {iteration}/{iterations_to_run}",
+                    iteration_progress_base,
+                    {"phase": f"iteration_{iteration}", "iteration": iteration},
+                )
+
+                # Step 1: Generate questions
+                self._update_progress(
+                    f"Generating search questions for iteration {iteration}",
+                    iteration_progress_base + 5,
+                    {"phase": "question_generation", "iteration": iteration},
+                )
 
-
-
+                # For first iteration, generate initial questions
+                # For subsequent iterations, generate follow-up questions
+                logger.info("Starting to generate questions")
+                if iteration == 1:
+                    # Generate additional questions (plus the main query)
+                    if iterations_to_run > 1:
+                        context = f"""Iteration: {1} of {iterations_to_run}"""
+                    else:
+                        context = ""
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
 
-
-
-            logger.info(f"Generated questions: {questions}")
+                    # Add the original query as the first question
+                    all_questions = [query] + questions
 
-
-
-
-
-
-
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
+                    )
+                else:
+                    # Get past questions from all previous iterations
+                    past_questions = []
+                    for prev_iter in range(1, iteration):
+                        if prev_iter in self.questions_by_iteration:
+                            past_questions.extend(
+                                self.questions_by_iteration[prev_iter]
+                            )
+
+                    # Generate follow-up questions based on accumulated knowledge if iterations > 2
+                    use_knowledge = iterations_to_run > 2
+                    knowledge_for_questions = current_knowledge if use_knowledge else ""
+                    context = f"""Current Knowledge: {knowledge_for_questions}
+Iteration: {iteration} of {iterations_to_run}"""
+
+                    # Generate questions
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
 
-
-
-                try:
-                    result = self.search.run(q)
-                    return {"question": q, "results": result or []}
-                except Exception as e:
-                    logger.error(f"Error searching for '{q}': {str(e)}")
-                    return {"question": q, "results": [], "error": str(e)}
-
-            # Run searches in parallel
-            with concurrent.futures.ThreadPoolExecutor(
-                max_workers=len(all_questions)
-            ) as executor:
-                futures = [executor.submit(search_question, q) for q in all_questions]
-                all_search_dict = {}
-
-                # Process results as they complete
-                for i, future in enumerate(concurrent.futures.as_completed(futures)):
-                    result_dict = future.result()
-                    question = result_dict["question"]
-                    search_results = result_dict["results"]
-                    all_search_dict[question] = search_results
+                    # Use only the new questions for this iteration's searches
+                    all_questions = questions
 
-
-
-
-                        {
-                            "phase": "search_complete",
-                            "result_count": len(search_results),
-                            "question": question,
-                        },
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
                    )
 
-
-
-
-
-
+                # Step 2: Run all searches in parallel for this iteration
+                self._update_progress(
+                    f"Running parallel searches for iteration {iteration}",
+                    iteration_progress_base + 10,
+                    {"phase": "parallel_search", "iteration": iteration},
+                )
 
-
-
-
-
-
-
-
+                # Function for thread pool
+                def search_question(q):
+                    try:
+                        result = self.search.run(q)
+                        return {"question": q, "results": result or []}
+                    except Exception as e:
+                        logger.error(f"Error searching for '{q}': {str(e)}")
+                        return {"question": q, "results": [], "error": str(e)}
+
+                # Run searches in parallel
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=len(all_questions)
+                ) as executor:
+                    futures = [
+                        executor.submit(search_question, q) for q in all_questions
+                    ]
+                    iteration_search_dict = {}
+                    iteration_search_results = []
+
+                    # Process results as they complete
+                    for i, future in enumerate(
+                        concurrent.futures.as_completed(futures)
+                    ):
+                        result_dict = future.result()
+                        question = result_dict["question"]
+                        search_results = result_dict["results"]
+                        iteration_search_dict[question] = search_results
+
+                        self._update_progress(
+                            f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                            iteration_progress_base
+                            + 10
+                            + ((i + 1) / len(all_questions) * 30),
+                            {
+                                "phase": "search_complete",
+                                "iteration": iteration,
+                                "result_count": len(search_results),
+                                "question": question,
+                            },
+                        )
+
+                        # Collect all search results for this iteration
+                        iteration_search_results.extend(search_results)
+
+                # Step 3: Filter and analyze results for this iteration
                 self._update_progress(
-                    "
-
-                    {"phase": "
+                    f"Analyzing results for iteration {iteration}",
+                    iteration_progress_base + 45,
+                    {"phase": "iteration_analysis", "iteration": iteration},
                 )
 
-            #
-
+                # Apply cross-engine filtering if enabled
+                if self.use_cross_engine_filter:
+                    self._update_progress(
+                        f"Filtering search results for iteration {iteration}",
+                        iteration_progress_base + 45,
+                        {"phase": "cross_engine_filtering", "iteration": iteration},
+                    )
+
+                    # Get the current link count (for indexing)
+                    existing_link_count = len(self.all_links_of_system)
 
-
-
-
-
-
-
-
-
+                    # Filter the search results
+                    filtered_search_results = self.cross_engine_filter.filter_results(
+                        iteration_search_results,
+                        query,
+                        reorder=self.filter_reorder,
+                        reindex=self.filter_reindex,
+                        start_index=existing_link_count,  # Start indexing after existing links
+                    )
 
-
-
+                    links = extract_links_from_search_results(filtered_search_results)
+                    self.all_links_of_system.extend(links)
+
+                    self._update_progress(
+                        f"Filtered from {len(iteration_search_results)} to {len(filtered_search_results)} results",
+                        iteration_progress_base + 50,
+                        {
+                            "phase": "filtering_complete",
+                            "iteration": iteration,
+                            "links_count": len(self.all_links_of_system),
+                        },
+                    )
 
+                    # Use filtered results for analysis
+                    iteration_search_results = filtered_search_results
+                else:
+                    # Just extract links without filtering
+                    links = extract_links_from_search_results(iteration_search_results)
+                    self.all_links_of_system.extend(links)
+
+                # Add to all search results
+                all_search_results.extend(iteration_search_results)
+
+                # Create a finding for this iteration's results
+                if self.include_text_content and iteration_search_results:
+                    # For iteration > 1 with knowledge accumulation, use follow-up analysis
+                    if iteration > 1 and iterations_to_run > 2:
+                        citation_result = self.citation_handler.analyze_followup(
+                            query,
+                            iteration_search_results,
+                            current_knowledge,
+                            len(self.all_links_of_system) - len(links),
+                        )
+                    else:
+                        # For first iteration or without knowledge accumulation, use initial analysis
+                        citation_result = self.citation_handler.analyze_initial(
+                            query, iteration_search_results
+                        )
+
+                    if citation_result:
+                        # Create a finding for this iteration
+                        iteration_content = citation_result["content"]
+
+                        # Update current knowledge if iterations > 2
+                        if iterations_to_run > 2:
+                            if current_knowledge:
+                                current_knowledge = f"{current_knowledge}\n\n## FINDINGS FROM ITERATION {iteration}:\n\n{iteration_content}"
+                            else:
+                                current_knowledge = iteration_content
+
+                        finding = {
+                            "phase": f"Iteration {iteration}",
+                            "content": iteration_content,
+                            "question": query,
+                            "search_results": iteration_search_results,
+                            "documents": citation_result.get("documents", []),
+                        }
+                        findings.append(finding)
+
+                        # Add documents to repository
+                        if "documents" in citation_result:
+                            self.findings_repository.add_documents(
+                                citation_result["documents"]
+                            )
+
+                # Mark iteration as complete
+                iteration_progress = 5 + iteration * (70 / iterations_to_run)
                 self._update_progress(
-                    f"
-
-                    {
-                        "phase": "filtering_complete",
-                        "links_count": len(self.all_links_of_system),
-                    },
+                    f"Completed iteration {iteration}/{iterations_to_run}",
+                    iteration_progress,
+                    {"phase": "iteration_complete", "iteration": iteration},
                 )
 
-
-
+            # Final synthesis after all iterations
+            self._update_progress(
+                "Generating final synthesis", 80, {"phase": "synthesis"}
+            )
 
-            #
+            # Handle final synthesis based on include_text_content flag
             if self.include_text_content:
-                #
-
-
-
-
-
-
-
+                # Generate a final synthesis from all search results
+                if iterations_to_run > 1:
+                    final_citation_result = self.citation_handler.analyze_initial(
+                        query, all_search_results
+                    )
+                    # Add null check for final_citation_result
+                    if final_citation_result:
+                        synthesized_content = final_citation_result["content"]
+                    else:
+                        synthesized_content = (
+                            "No relevant results found in final synthesis."
+                        )
+                else:
+                    # For single iteration, use the content from findings
+                    synthesized_content = (
+                        findings[0]["content"]
+                        if findings
+                        else "No relevant results found."
+                    )
+                # Add a final synthesis finding
+                final_finding = {
                     "phase": "Final synthesis",
                     "content": synthesized_content,
                     "question": query,
                     "search_results": all_search_results,
-                    "documents":
+                    "documents": [],
                 }
-                findings.append(
-
-                # Transfer questions to repository
-                self.findings_repository.set_questions_by_iteration(
-                    self.questions_by_iteration
-                )
-
-                # Format findings
-                formatted_findings = self.findings_repository.format_findings_to_text(
-                    findings, synthesized_content
-                )
-
-                # Add documents to repository
-                if "documents" in citation_result:
-                    self.findings_repository.add_documents(citation_result["documents"])
-            else:
-                synthesized_content = "No relevant results found."
-                formatted_findings = synthesized_content
-                finding = {
-                    "phase": "Error",
-                    "content": "No relevant results found.",
-                    "question": query,
-                    "search_results": all_search_results,
-                    "documents": [],
-                }
-                findings.append(finding)
+                findings.append(final_finding)
             else:
                 # Skip LLM analysis, just format the raw search results
                 synthesized_content = "LLM analysis skipped"
-
+                final_finding = {
                     "phase": "Raw search results",
                     "content": "LLM analysis was skipped. Displaying raw search results with links.",
                     "question": query,
                     "search_results": all_search_results,
                     "documents": [],
                 }
-                findings.append(
+                findings.append(final_finding)
 
-
-
-
-
+            # Transfer questions to repository
+            self.findings_repository.set_questions_by_iteration(
+                self.questions_by_iteration
+            )
 
-
-
-
-
+            # Format findings
+            formatted_findings = self.findings_repository.format_findings_to_text(
+                findings, synthesized_content
+            )
 
         except Exception as e:
             import traceback
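The search fan-out inside the new per-iteration loop is a standard `ThreadPoolExecutor` plus `as_completed` pattern: one worker per question, results consumed in completion order (not submission order) so progress can be reported as each search finishes, and per-question exceptions captured in the result dict rather than raised. A runnable sketch of just that skeleton, with `search_question` standing in for the real `self.search.run` call:

import concurrent.futures

def search_question(q):
    try:
        return {"question": q, "results": [f"result for {q}"]}  # stand-in search
    except Exception as e:
        return {"question": q, "results": [], "error": str(e)}

questions = ["what is X?", "how does Y work?", "why Z?"]

with concurrent.futures.ThreadPoolExecutor(max_workers=len(questions)) as executor:
    futures = [executor.submit(search_question, q) for q in questions]
    for i, future in enumerate(concurrent.futures.as_completed(futures)):
        r = future.result()  # never raises: errors were caught inside the worker
        print(f"{i + 1}/{len(questions)} done: {r['question']!r} "
              f"({len(r['results'])} results)")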
@@ -305,8 +434,8 @@ class ParallelSearchStrategy(BaseSearchStrategy):
 
         return {
             "findings": findings,
-            "iterations":
-            "
+            "iterations": iterations_to_run,
+            "questions_by_iteration": self.questions_by_iteration,
             "formatted_findings": formatted_findings,
             "current_knowledge": synthesized_content,
         }
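For callers, the visible change is the return shape: `iterations` now reports the configured iteration count, alongside the full `questions_by_iteration` map. A hypothetical consumer, assuming `strategy` is an already-constructed `ParallelSearchStrategy`:

result = strategy.analyze_topic("impact of local LLMs on research workflows")

print(result["iterations"])  # e.g. 2, as configured in search.iterations
for iteration, questions in result["questions_by_iteration"].items():
    print(f"iteration {iteration}: {len(questions)} generated questions")
print(result["formatted_findings"][:200])  # rendered report text
print(result["current_knowledge"][:200])   # final synthesized content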
local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py
CHANGED
@@ -23,13 +23,14 @@ class RapidSearchStrategy(BaseSearchStrategy):
     a single synthesis step at the end, optimized for speed.
     """
 
-    def __init__(
+    def __init__(
+        self, search=None, model=None, citation_handler=None, all_links_of_system=None
+    ):
         """Initialize with optional dependency injection for testing."""
-        super().__init__()
+        super().__init__(all_links_of_system=all_links_of_system)
         self.search = search or get_search()
         self.model = model or get_llm()
         self.progress_callback = None
-        self.all_links_of_system = list()
         self.questions_by_iteration = {}
 
         # Use provided citation_handler or create one
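A plausible reading of why `all_links_of_system` is now threaded through every strategy constructor (the diff itself does not state the rationale): a caller can hand several strategies one shared link registry so that result indices keep counting up across strategies instead of each one restarting at zero. Sketch with a stand-in base class mirroring the `BaseSearchStrategy` change:

class StrategyBase:
    def __init__(self, all_links_of_system=None):
        # Shared when provided, fresh otherwise (no mutable default).
        self.all_links_of_system = (
            all_links_of_system if all_links_of_system is not None else []
        )

shared_links = []
first = StrategyBase(all_links_of_system=shared_links)
second = StrategyBase(all_links_of_system=shared_links)

first.all_links_of_system.append({"index": 1, "url": "https://example.com/a"})
print(len(second.all_links_of_system))  # 1 -- both see the same registry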
|