PyPI - local-deep-research - Versions diffs - 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

local-deep-research 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

local_deep_research/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@
 Local Deep Research - A tool for conducting deep research using AI.
 """
-__version__ = "0.1.0"
+__version__ = "0.2.0"
 __author__ = "Your Name"
 __description__ = "A tool for conducting deep research using AI"

local_deep_research/advanced_search_system/filters/cross_engine_filter.py CHANGED Viewed

@@ -6,6 +6,7 @@ import json
 import logging
 from typing import Dict, List
+from ...utilities.db_utils import get_db_setting
 from ...utilities.search_utilities import remove_think_tags
 from .base_filter import BaseFilter
@@ -16,7 +17,7 @@ class CrossEngineFilter(BaseFilter):
     """Filter that ranks and filters results from multiple search engines."""
     def __init__(
-        self, model, max_results=20, default_reorder=True, default_reindex=True
+        self, model, max_results=None, default_reorder=True, default_reindex=True
     ):
         """
         Initialize the cross-engine filter.
@@ -28,6 +29,9 @@ class CrossEngineFilter(BaseFilter):
             default_reindex: Default setting for reindexing results after filtering
         """
         super().__init__(model)
+        # Get max_results from database settings if not provided
+        if max_results is None:
+            max_results = get_db_setting("search.cross_engine_max_results", 100)
         self.max_results = max_results
         self.default_reorder = default_reorder
         self.default_reindex = default_reindex

local_deep_research/advanced_search_system/strategies/base_strategy.py CHANGED Viewed

@@ -13,11 +13,14 @@ logger = logging.getLogger(__name__)
 class BaseSearchStrategy(ABC):
     """Abstract base class for all search strategies."""
-    def __init__(self):
+    def __init__(self, all_links_of_system=None):
         """Initialize the base strategy with common attributes."""
         self.progress_callback = None
         self.questions_by_iteration = {}
-        self.all_links_of_system = []
+        # Create a new list if None is provided (avoiding mutable default argument)
+        self.all_links_of_system = (
+            all_links_of_system if all_links_of_system is not None else []
+        )
     def set_progress_callback(self, callback: Callable[[str, int, dict], None]) -> None:
         """Set a callback function to receive progress updates."""

local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py CHANGED Viewed

@@ -7,10 +7,7 @@ import logging
 from datetime import datetime
 from typing import Dict, List
-from langchain_core.language_models import BaseLLM
 from ...citation_handler import CitationHandler
-from ...config.config_files import settings
 from ...config.llm_config import get_llm
 from ...config.search_config import get_search
 from ...utilities.db_utils import get_db_setting
@@ -27,18 +24,34 @@ class IterDRAGStrategy(BaseSearchStrategy):
     """IterDRAG strategy that breaks queries into sub-queries."""
     def __init__(
-        self, model: BaseLLM | None = None, search=None, citation_handler=None
+        self,
+        search=None,
+        model=None,
+        max_iterations=3,
+        subqueries_per_iteration=2,
+        all_links_of_system=None,
     ):
-        """Initialize the strategy with optional dependency injection for testing."""
-        super().__init__()
-        self.model = model or get_llm()
+        """Initialize the IterDRAG strategy with search and LLM.
+        Args:
+            search: Search engine to use for web queries
+            model: LLM to use for text generation and reasoning
+            max_iterations: Maximum number of iterations to run
+            subqueries_per_iteration: Number of sub-queries to generate per iteration
+            all_links_of_system: Optional list of links to initialize with
+        """
+        super().__init__(all_links_of_system=all_links_of_system)
         self.search = search or get_search()
+        self.model = model or get_llm()
+        self.max_iterations = max_iterations
+        self.subqueries_per_iteration = subqueries_per_iteration
+        # Initialize progress callback
         self.progress_callback = None
-        self.all_links_of_system = list()
         self.questions_by_iteration = {}
         # Use provided citation_handler or create one
-        self.citation_handler = citation_handler or CitationHandler(self.model)
+        self.citation_handler = CitationHandler(self.model)
         # Initialize components
         self.question_generator = DecompositionQuestionGenerator(self.model)
@@ -396,13 +409,7 @@ Please try again with a different query or contact support.
                     """
         # Compress knowledge if needed
-        if (
-            get_db_setting(
-                "general.knowledge_accumulation",
-                settings.general.knowledge_accumulation,
-            )
-            == "ITERATION"
-        ):
+        if get_db_setting("general.knowledge_accumulation", "ITERATION") == "ITERATION":
             try:
                 self._update_progress(
                     "Compressing knowledge", 90, {"phase": "knowledge_compression"}

local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py CHANGED Viewed

@@ -34,7 +34,8 @@ class ParallelSearchStrategy(BaseSearchStrategy):
         use_cross_engine_filter: bool = True,
         filter_reorder: bool = True,
         filter_reindex: bool = True,
-        filter_max_results: int = 20,
+        cross_engine_max_results: int = None,
+        all_links_of_system=None,
     ):
         """Initialize with optional dependency injection for testing.
@@ -46,23 +47,29 @@ class ParallelSearchStrategy(BaseSearchStrategy):
             use_cross_engine_filter: If True, filter search results across engines
             filter_reorder: Whether to reorder results by relevance
             filter_reindex: Whether to update result indices after filtering
-            filter_max_results: Maximum number of results to keep after filtering
+            cross_engine_max_results: Maximum number of results to keep after cross-engine filtering
+            all_links_of_system: Optional list of links to initialize with
         """
-        super().__init__()
+        super().__init__(all_links_of_system=all_links_of_system)
         self.search = search or get_search()
         self.model = model or get_llm()
         self.progress_callback = None
-        self.all_links_of_system = list()
         self.questions_by_iteration = {}
         self.include_text_content = include_text_content
         self.use_cross_engine_filter = use_cross_engine_filter
         self.filter_reorder = filter_reorder
         self.filter_reindex = filter_reindex
+        # Get max_filtered_results from database if not provided
+        if cross_engine_max_results is None:
+            cross_engine_max_results = get_db_setting(
+                "search.cross_engine_max_results", 100
+            )
         # Initialize the cross-engine filter
         self.cross_engine_filter = CrossEngineFilter(
             model=self.model,
-            max_results=filter_max_results,
+            max_results=cross_engine_max_results,
             default_reorder=filter_reorder,
             default_reindex=filter_reindex,
         )
@@ -118,7 +125,7 @@ class ParallelSearchStrategy(BaseSearchStrategy):
         # Determine number of iterations to run
         iterations_to_run = get_db_setting("search.iterations")
-        logger.debug("Selected amount of iterations: " + iterations_to_run)
+        logger.debug("Selected amount of iterations: " + str(iterations_to_run))
         iterations_to_run = int(iterations_to_run)
         try:
             # Run each iteration

local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py CHANGED Viewed

@@ -23,13 +23,14 @@ class RapidSearchStrategy(BaseSearchStrategy):
     a single synthesis step at the end, optimized for speed.
     """
-    def __init__(self, search=None, model=None, citation_handler=None):
+    def __init__(
+        self, search=None, model=None, citation_handler=None, all_links_of_system=None
+    ):
         """Initialize with optional dependency injection for testing."""
-        super().__init__()
+        super().__init__(all_links_of_system=all_links_of_system)
         self.search = search or get_search()
         self.model = model or get_llm()
         self.progress_callback = None
-        self.all_links_of_system = list()
         self.questions_by_iteration = {}
         # Use provided citation_handler or create one

local_deep_research/advanced_search_system/strategies/source_based_strategy.py CHANGED Viewed

@@ -6,7 +6,6 @@ from ...citation_handler import CitationHandler
 from ...config.llm_config import get_llm
 from ...config.search_config import get_search
 from ...utilities.db_utils import get_db_setting
-from ...utilities.search_utilities import extract_links_from_search_results
 from ..filters.cross_engine_filter import CrossEngineFilter
 from ..findings.repository import FindingsRepository
 from ..questions.standard_question import StandardQuestionGenerator
@@ -30,25 +29,32 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
         use_cross_engine_filter: bool = True,
         filter_reorder: bool = True,
         filter_reindex: bool = True,
-        filter_max_results: int = 20,
+        cross_engine_max_results: int = None,
+        all_links_of_system=None,
     ):
         """Initialize with optional dependency injection for testing."""
-        super().__init__()
+        # Pass the links list to the parent class
+        super().__init__(all_links_of_system=all_links_of_system)
         self.search = search or get_search()
         self.model = model or get_llm()
         self.progress_callback = None
-        self.all_links_of_system = list()
-        self.all_search_results = []
         self.questions_by_iteration = {}
         self.include_text_content = include_text_content
         self.use_cross_engine_filter = use_cross_engine_filter
         self.filter_reorder = filter_reorder
         self.filter_reindex = filter_reindex
+        # Get cross_engine_max_results from database if not provided
+        if cross_engine_max_results is None:
+            cross_engine_max_results = get_db_setting(
+                "search.cross_engine_max_results", 100
+            )
         # Initialize the cross-engine filter
         self.cross_engine_filter = CrossEngineFilter(
             model=self.model,
-            max_results=filter_max_results,
+            max_results=cross_engine_max_results,
             default_reorder=filter_reorder,
             default_reindex=filter_reindex,
         )
@@ -87,13 +93,11 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
         Analyze a topic using source-based search strategy.
         """
         logger.info(f"Starting source-based research on topic: {query}")
+        accumulated_search_results_across_all_iterations = (
+            []
+        )  # tracking links across iterations but not global
         findings = []
-        self.all_search_results = []
-        # Track all search results across iterations
-        self.all_links_of_system = list()
-        self.questions_by_iteration = {}
+        total_citation_count_before_this_search = len(self.all_links_of_system)
         self._update_progress(
             "Initializing source-based research",
@@ -121,6 +125,8 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
         logger.debug("Selected amount of iterations: " + str(iterations_to_run))
         iterations_to_run = int(iterations_to_run)
         try:
+            filtered_search_results = []
+            total_citation_count_before_this_search = len(self.all_links_of_system)
             # Run each iteration
             for iteration in range(1, iterations_to_run + 1):
                 iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
@@ -141,9 +147,6 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                 # For first iteration, use initial query
                 if iteration == 1:
                     # Generate questions for first iteration
-                    source_context = self._format_search_results_as_context(
-                        self.all_search_results
-                    )
                     context = f"""Iteration: {iteration} of {iterations_to_run}"""
                     questions = self.question_generator.generate_questions(
                         current_knowledge=context,
@@ -167,7 +170,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                 else:
                     # For subsequent iterations, generate questions based on previous search results
                     source_context = self._format_search_results_as_context(
-                        self.all_search_results
+                        filtered_search_results
                     )
                     if iteration != 1:
                         context = f"""Previous search results:\n{source_context}\n\nIteration: {iteration} of {iterations_to_run}"""
@@ -242,32 +245,25 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                             },
                         )
-                        # Collect all search results for this iteration
                         iteration_search_results.extend(search_results)
-                # Step 3: Apply cross-engine filtering if enabled
-                if self.use_cross_engine_filter:
+                if False and self.use_cross_engine_filter:
                     self._update_progress(
                         f"Filtering search results for iteration {iteration}",
                         iteration_progress_base + 45,
                         {"phase": "cross_engine_filtering", "iteration": iteration},
                     )
-                    # Get the current link count (for indexing)
                     existing_link_count = len(self.all_links_of_system)
-                    # Filter the search results
+                    logger.info(f"Existing link count: {existing_link_count}")
                     filtered_search_results = self.cross_engine_filter.filter_results(
                         iteration_search_results,
                         query,
-                        reorder=self.filter_reorder,
-                        reindex=self.filter_reindex,
+                        reorder=True,
+                        reindex=True,
                         start_index=existing_link_count,  # Start indexing after existing links
                     )
-                    links = extract_links_from_search_results(filtered_search_results)
-                    self.all_links_of_system.extend(links)
                     self._update_progress(
                         f"Filtered from {len(iteration_search_results)} to {len(filtered_search_results)} results",
                         iteration_progress_base + 50,
@@ -277,23 +273,20 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                             "links_count": len(self.all_links_of_system),
                         },
                     )
-                    # Use filtered results
-                    iteration_search_results = filtered_search_results
                 else:
-                    # Just extract links without filtering
-                    links = extract_links_from_search_results(iteration_search_results)
-                    self.all_links_of_system.extend(links)
+                    # Use the search results as they are
+                    filtered_search_results = iteration_search_results
-                # Add to all search results
-                self.all_search_results.extend(iteration_search_results)
+                    # Use filtered results
+                accumulated_search_results_across_all_iterations.extend(
+                    filtered_search_results
+                )
                 # Create a lightweight finding for this iteration's search metadata (no text content)
                 finding = {
                     "phase": f"Iteration {iteration}",
-                    "content": f"Searched with {len(all_questions)} questions, found {len(iteration_search_results)} results.",
+                    "content": f"Searched with {len(all_questions)} questions, found {len(filtered_search_results)} results.",
                     "question": query,
-                    "search_results": iteration_search_results,
                     "documents": [],
                 }
                 findings.append(finding)
@@ -306,46 +299,47 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                     {"phase": "iteration_complete", "iteration": iteration},
                 )
-            # Final filtering of all accumulated search results
-            self._update_progress(
-                "Performing final filtering of all results",
-                80,
-                {"phase": "final_filtering"},
-            )
-            # Apply final cross-engine filtering to all accumulated results if enabled
+            # Do we need this filter?
             if self.use_cross_engine_filter:
+                # Final filtering of all accumulated search results
+                self._update_progress(
+                    "Performing final filtering of all results",
+                    80,
+                    {"phase": "final_filtering"},
+                )
                 final_filtered_results = self.cross_engine_filter.filter_results(
-                    self.all_search_results,
+                    accumulated_search_results_across_all_iterations,
                     query,
                     reorder=True,  # Always reorder in final filtering
-                    reindex=False,  # Always reindex in final filtering
-                    max_results=int(get_db_setting("search.final_max_results") or 30),
+                    reindex=True,  # Always reindex in final filtering
+                    max_results=int(get_db_setting("search.final_max_results") or 100),
+                    start_index=len(self.all_links_of_system),
+                )
+                self._update_progress(
+                    f"Filtered from {len(accumulated_search_results_across_all_iterations)} to {len(final_filtered_results)} results",
+                    iteration_progress_base + 85,
+                    {
+                        "phase": "filtering_complete",
+                        "iteration": iteration,
+                        "links_count": len(self.all_links_of_system),
+                    },
                 )
             else:
-                final_filtered_results = self.all_search_results
-            self._update_progress(
-                f"Filtered from {len(self.all_search_results)} to {len(final_filtered_results)} results",
-                iteration_progress_base + 85,
-                {
-                    "phase": "filtering_complete",
-                    "iteration": iteration,
-                    "links_count": len(self.all_links_of_system),
-                },
-            )
+                final_filtered_results = filtered_search_results
+                # links = extract_links_from_search_results()
+            self.all_links_of_system.extend(final_filtered_results)
             # Final synthesis after all iterations
             self._update_progress(
                 "Generating final synthesis", 90, {"phase": "synthesis"}
             )
-            total_citation_count = len(self.all_links_of_system)
             # Final synthesis
             final_citation_result = self.citation_handler.analyze_followup(
                 query,
                 final_filtered_results,
                 previous_knowledge="",  # Empty string as we don't need previous knowledge here
-                nr_of_links=total_citation_count,
+                nr_of_links=total_citation_count_before_this_search,
             )
             # Add null check for final_citation_result
@@ -361,7 +355,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                 "phase": "Final synthesis",
                 "content": synthesized_content,
                 "question": query,
-                "search_results": final_filtered_results,
+                "search_results": self.all_links_of_system,
                 "documents": documents,
             }
             findings.append(final_finding)
@@ -404,4 +398,5 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
             "questions_by_iteration": self.questions_by_iteration,
             "formatted_findings": formatted_findings,
             "current_knowledge": synthesized_content,
+            "all_links_of_system": self.all_links_of_system,
         }

local_deep_research/advanced_search_system/strategies/standard_strategy.py CHANGED Viewed

@@ -3,7 +3,6 @@ import logging
 from typing import Dict
 from ...citation_handler import CitationHandler
-from ...config.config_files import settings
 from ...config.llm_config import get_llm
 from ...config.search_config import get_search
 from ...utilities.db_utils import get_db_setting
@@ -20,11 +19,17 @@ logger = logging.getLogger(__name__)
 class StandardSearchStrategy(BaseSearchStrategy):
     """Standard iterative search strategy that generates follow-up questions."""
-    def __init__(self, search=None, model=None, citation_handler=None):
+    def __init__(
+        self, search=None, model=None, citation_handler=None, all_links_of_system=None
+    ):
         """Initialize with optional dependency injection for testing."""
+        super().__init__(all_links_of_system=all_links_of_system)
         self.search = search or get_search()
         self.model = model or get_llm()
+        # Get iterations setting
         self.max_iterations = int(get_db_setting("search.iterations"))
         self.questions_per_iteration = int(
             get_db_setting("search.questions_per_iteration")
         )
@@ -43,7 +48,6 @@ class StandardSearchStrategy(BaseSearchStrategy):
         # Initialize other attributes
         self.progress_callback = None
-        self.all_links_of_system = list()
     def _update_progress(
         self, message: str, progress_percent: int = None, metadata: dict = None
@@ -117,7 +121,7 @@ Iteration: {iteration + 1} of {total_iterations}"""
             question_count = len(questions)
             knowledge_accumulation = get_db_setting(
                 "general.knowledge_accumulation",
-                settings.general.knowledge_accumulation,
+                "ITERATION",
             )
             for q_idx, question in enumerate(questions):
                 question_progress_base = iteration_progress_base + (

local_deep_research/api/research_functions.py CHANGED Viewed

@@ -4,11 +4,8 @@ Provides programmatic access to search and research capabilities.
 """
 import logging
-import os
 from typing import Any, Callable, Dict, Optional
-import toml
 from ..config.llm_config import get_llm
 from ..config.search_config import get_search
 from ..report_generator import IntegratedReportGenerator
@@ -279,46 +276,3 @@ def analyze_documents(
         logger.info(f"Analysis saved to {output_file}")
     return analysis_result
-def get_available_search_engines() -> Dict[str, str]:
-    """
-    Get a dictionary of available search engines.
-    Returns:
-        Dictionary mapping engine names to descriptions
-    """
-    from ..web_search_engines.search_engine_factory import get_available_engines
-    engines = get_available_engines()
-    # Add some descriptions for common engines
-    descriptions = {
-        "auto": "Automatic selection based on query type",
-        "wikipedia": "Wikipedia articles and general knowledge",
-        "arxiv": "Scientific papers and research",
-        "pubmed": "Medical and biomedical literature",
-        "semantic_scholar": "Academic papers across all fields",
-        "github": "Code repositories and technical documentation",
-        "local_all": "All local document collections",
-    }
-    return {engine: descriptions.get(engine, "Search engine") for engine in engines}
-def get_available_collections() -> Dict[str, Dict[str, Any]]:
-    """
-    Get a dictionary of available local document collections.
-    Returns:
-        Dictionary mapping collection names to their configuration
-    """
-    from ..config.config_files import LOCAL_COLLECTIONS_FILE
-    if os.path.exists(LOCAL_COLLECTIONS_FILE):
-        collections = toml.load(LOCAL_COLLECTIONS_FILE)
-        return collections
-    return {}

local_deep_research/citation_handler.py CHANGED Viewed

@@ -4,7 +4,6 @@ from typing import Any, Dict, List, Union
 from langchain_core.documents import Document
-from .config.config_files import settings
 from .utilities.db_utils import get_db_setting
@@ -93,10 +92,8 @@ Previous Knowledge:
 New Sources:
 {formatted_sources}
-Return any inconsistencies or conflicts found."""
-        if get_db_setting(
-            "general.enable_fact_checking", settings.general.enable_fact_checking
-        ):
+        Return any inconsistencies or conflicts found."""
+        if get_db_setting("general.enable_fact_checking", True):
             fact_check_response = self.llm.invoke(fact_check_prompt).content
         else:

local-deep-research 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

local-deep-research 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl