local-deep-research 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. local_deep_research/__init__.py +1 -1
  2. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +5 -1
  3. local_deep_research/advanced_search_system/strategies/base_strategy.py +5 -2
  4. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +23 -16
  5. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +273 -144
  6. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +4 -3
  7. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +402 -0
  8. local_deep_research/advanced_search_system/strategies/standard_strategy.py +8 -4
  9. local_deep_research/api/research_functions.py +0 -46
  10. local_deep_research/citation_handler.py +16 -20
  11. local_deep_research/config/llm_config.py +25 -68
  12. local_deep_research/config/search_config.py +8 -21
  13. local_deep_research/defaults/default_settings.json +3814 -0
  14. local_deep_research/search_system.py +46 -32
  15. local_deep_research/utilities/db_utils.py +22 -3
  16. local_deep_research/utilities/search_utilities.py +10 -7
  17. local_deep_research/web/app.py +3 -23
  18. local_deep_research/web/app_factory.py +1 -25
  19. local_deep_research/web/database/migrations.py +20 -418
  20. local_deep_research/web/routes/settings_routes.py +75 -364
  21. local_deep_research/web/services/research_service.py +47 -43
  22. local_deep_research/web/services/settings_manager.py +108 -315
  23. local_deep_research/web/services/settings_service.py +3 -56
  24. local_deep_research/web/static/js/components/research.js +1 -1
  25. local_deep_research/web/static/js/components/settings.js +16 -4
  26. local_deep_research/web/static/js/research_form.js +106 -0
  27. local_deep_research/web/templates/pages/research.html +3 -2
  28. local_deep_research/web_search_engines/engines/meta_search_engine.py +13 -18
  29. local_deep_research/web_search_engines/engines/search_engine_local.py +11 -2
  30. local_deep_research/web_search_engines/engines/search_engine_local_all.py +7 -11
  31. local_deep_research/web_search_engines/search_engine_factory.py +12 -64
  32. local_deep_research/web_search_engines/search_engines_config.py +123 -64
  33. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/METADATA +16 -1
  34. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/RECORD +37 -38
  35. local_deep_research/config/config_files.py +0 -245
  36. local_deep_research/defaults/local_collections.toml +0 -53
  37. local_deep_research/defaults/main.toml +0 -80
  38. local_deep_research/defaults/search_engines.toml +0 -291
  39. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/WHEEL +0 -0
  40. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/entry_points.txt +0 -0
  41. {local_deep_research-0.2.2.dist-info → local_deep_research-0.3.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/advanced_search_system/strategies/source_based_strategy.py
@@ -0,0 +1,402 @@
+ import concurrent.futures
+ import logging
+ from typing import Dict
+
+ from ...citation_handler import CitationHandler
+ from ...config.llm_config import get_llm
+ from ...config.search_config import get_search
+ from ...utilities.db_utils import get_db_setting
+ from ..filters.cross_engine_filter import CrossEngineFilter
+ from ..findings.repository import FindingsRepository
+ from ..questions.standard_question import StandardQuestionGenerator
+ from .base_strategy import BaseSearchStrategy
+
+ logger = logging.getLogger(__name__)
+
+
+ class SourceBasedSearchStrategy(BaseSearchStrategy):
+     """
+     Source-based search strategy that generates questions based on search results and
+     defers content analysis until final synthesis.
+     """
+
+     def __init__(
+         self,
+         search=None,
+         model=None,
+         citation_handler=None,
+         include_text_content: bool = True,
+         use_cross_engine_filter: bool = True,
+         filter_reorder: bool = True,
+         filter_reindex: bool = True,
+         cross_engine_max_results: int = None,
+         all_links_of_system=None,
+     ):
+         """Initialize with optional dependency injection for testing."""
+         # Pass the links list to the parent class
+         super().__init__(all_links_of_system=all_links_of_system)
+         self.search = search or get_search()
+         self.model = model or get_llm()
+         self.progress_callback = None
+
+         self.questions_by_iteration = {}
+         self.include_text_content = include_text_content
+         self.use_cross_engine_filter = use_cross_engine_filter
+         self.filter_reorder = filter_reorder
+         self.filter_reindex = filter_reindex
+
+         # Get cross_engine_max_results from database if not provided
+         if cross_engine_max_results is None:
+             cross_engine_max_results = get_db_setting(
+                 "search.cross_engine_max_results", 100
+             )
+
+         # Initialize the cross-engine filter
+         self.cross_engine_filter = CrossEngineFilter(
+             model=self.model,
+             max_results=cross_engine_max_results,
+             default_reorder=filter_reorder,
+             default_reindex=filter_reindex,
+         )
+
+         # Set include_full_content on the search engine if it supports it
+         if hasattr(self.search, "include_full_content"):
+             self.search.include_full_content = include_text_content
+
+         # Use provided citation_handler or create one
+         self.citation_handler = citation_handler or CitationHandler(self.model)
+
+         # Initialize components
+         self.question_generator = StandardQuestionGenerator(self.model)
+         self.findings_repository = FindingsRepository(self.model)
+
+     def _format_search_results_as_context(self, search_results):
+         """Format search results into context for question generation."""
+         context_snippets = []
+
+         for i, result in enumerate(
+             search_results[:10]
+         ):  # Limit to prevent context overflow
+             title = result.get("title", "Untitled")
+             snippet = result.get("snippet", "")
+             url = result.get("link", "")
+
+             if snippet:
+                 context_snippets.append(
+                     f"Source {i + 1}: {title}\nURL: {url}\nSnippet: {snippet}"
+                 )
+
+         return "\n\n".join(context_snippets)
+
+     def analyze_topic(self, query: str) -> Dict:
+         """
+         Analyze a topic using source-based search strategy.
+         """
+         logger.info(f"Starting source-based research on topic: {query}")
+         accumulated_search_results_across_all_iterations = (
+             []
+         )  # tracking links across iterations but not global
+         findings = []
+         total_citation_count_before_this_search = len(self.all_links_of_system)
+
+         self._update_progress(
+             "Initializing source-based research",
+             5,
+             {
+                 "phase": "init",
+                 "strategy": "source-based",
+                 "include_text_content": self.include_text_content,
+             },
+         )
+
+         # Check search engine
+         if not self._validate_search_engine():
+             return {
+                 "findings": [],
+                 "iterations": 0,
+                 "questions_by_iteration": {},
+                 "formatted_findings": "Error: Unable to conduct research without a search engine.",
+                 "current_knowledge": "",
+                 "error": "No search engine available",
+             }
+
+         # Determine number of iterations to run
+         iterations_to_run = get_db_setting("search.iterations")
+         logger.debug("Selected amount of iterations: " + str(iterations_to_run))
+         iterations_to_run = int(iterations_to_run)
+         try:
+             filtered_search_results = []
+             total_citation_count_before_this_search = len(self.all_links_of_system)
+             # Run each iteration
+             for iteration in range(1, iterations_to_run + 1):
+                 iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
+
+                 self._update_progress(
+                     f"Starting iteration {iteration}/{iterations_to_run}",
+                     iteration_progress_base,
+                     {"phase": f"iteration_{iteration}", "iteration": iteration},
+                 )
+
+                 # Step 1: Generate or use questions
+                 self._update_progress(
+                     f"Generating search questions for iteration {iteration}",
+                     iteration_progress_base + 5,
+                     {"phase": "question_generation", "iteration": iteration},
+                 )
+
+                 # For first iteration, use initial query
+                 if iteration == 1:
+                     # Generate questions for first iteration
+                     context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                     questions = self.question_generator.generate_questions(
+                         current_knowledge=context,
+                         query=query,
+                         questions_per_iteration=int(
+                             get_db_setting("search.questions_per_iteration")
+                         ),
+                         questions_by_iteration=self.questions_by_iteration,
+                     )
+
+                     # Always include the original query for the first iteration
+                     if query not in questions:
+                         all_questions = [query] + questions
+                     else:
+                         all_questions = questions
+
+                     self.questions_by_iteration[iteration] = all_questions
+                     logger.info(
+                         f"Using questions for iteration {iteration}: {all_questions}"
+                     )
+                 else:
+                     # For subsequent iterations, generate questions based on previous search results
+                     source_context = self._format_search_results_as_context(
+                         filtered_search_results
+                     )
+                     if iteration != 1:
+                         context = f"""Previous search results:\n{source_context}\n\nIteration: {iteration} of {iterations_to_run}"""
+                     elif iterations_to_run == 1:
+                         context = ""
+                     else:
+                         context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                     # Use standard question generator with search results as context
+                     questions = self.question_generator.generate_questions(
+                         current_knowledge=context,
+                         query=query,
+                         questions_per_iteration=int(
+                             get_db_setting("search.questions_per_iteration")
+                         ),
+                         questions_by_iteration=self.questions_by_iteration,
+                     )
+
+                     # Use only the new questions for this iteration's searches
+                     all_questions = questions
+
+                     # Store in questions_by_iteration
+                     self.questions_by_iteration[iteration] = questions
+                     logger.info(
+                         f"Generated questions for iteration {iteration}: {questions}"
+                     )
+
+                 # Step 2: Run all searches in parallel for this iteration
+                 self._update_progress(
+                     f"Running parallel searches for iteration {iteration}",
+                     iteration_progress_base + 10,
+                     {"phase": "parallel_search", "iteration": iteration},
+                 )
+
+                 # Function for thread pool
+                 def search_question(q):
+                     try:
+                         result = self.search.run(q)
+                         return {"question": q, "results": result or []}
+                     except Exception as e:
+                         logger.error(f"Error searching for '{q}': {str(e)}")
+                         return {"question": q, "results": [], "error": str(e)}
+
+                 # Run searches in parallel
+                 with concurrent.futures.ThreadPoolExecutor(
+                     max_workers=len(all_questions)
+                 ) as executor:
+                     futures = [
+                         executor.submit(search_question, q) for q in all_questions
+                     ]
+                     iteration_search_dict = {}
+                     iteration_search_results = []
+
+                     # Process results as they complete
+                     for i, future in enumerate(
+                         concurrent.futures.as_completed(futures)
+                     ):
+                         result_dict = future.result()
+                         question = result_dict["question"]
+                         search_results = result_dict["results"]
+                         iteration_search_dict[question] = search_results
+
+                         self._update_progress(
+                             f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                             iteration_progress_base
+                             + 10
+                             + ((i + 1) / len(all_questions) * 30),
+                             {
+                                 "phase": "search_complete",
+                                 "iteration": iteration,
+                                 "result_count": len(search_results),
+                                 "question": question,
+                             },
+                         )
+
+                         iteration_search_results.extend(search_results)
+
+                 if False and self.use_cross_engine_filter:
+                     self._update_progress(
+                         f"Filtering search results for iteration {iteration}",
+                         iteration_progress_base + 45,
+                         {"phase": "cross_engine_filtering", "iteration": iteration},
+                     )
+
+                     existing_link_count = len(self.all_links_of_system)
+                     logger.info(f"Existing link count: {existing_link_count}")
+                     filtered_search_results = self.cross_engine_filter.filter_results(
+                         iteration_search_results,
+                         query,
+                         reorder=True,
+                         reindex=True,
+                         start_index=existing_link_count,  # Start indexing after existing links
+                     )
+
+                     self._update_progress(
+                         f"Filtered from {len(iteration_search_results)} to {len(filtered_search_results)} results",
+                         iteration_progress_base + 50,
+                         {
+                             "phase": "filtering_complete",
+                             "iteration": iteration,
+                             "links_count": len(self.all_links_of_system),
+                         },
+                     )
+                 else:
+                     # Use the search results as they are
+                     filtered_search_results = iteration_search_results
+
+                 # Use filtered results
+                 accumulated_search_results_across_all_iterations.extend(
+                     filtered_search_results
+                 )
+
+                 # Create a lightweight finding for this iteration's search metadata (no text content)
+                 finding = {
+                     "phase": f"Iteration {iteration}",
+                     "content": f"Searched with {len(all_questions)} questions, found {len(filtered_search_results)} results.",
+                     "question": query,
+                     "documents": [],
+                 }
+                 findings.append(finding)
+
+                 # Mark iteration as complete
+                 iteration_progress = 5 + iteration * (70 / iterations_to_run)
+                 self._update_progress(
+                     f"Completed iteration {iteration}/{iterations_to_run}",
+                     iteration_progress,
+                     {"phase": "iteration_complete", "iteration": iteration},
+                 )
+
+             # Do we need this filter?
+             if self.use_cross_engine_filter:
+                 # Final filtering of all accumulated search results
+                 self._update_progress(
+                     "Performing final filtering of all results",
+                     80,
+                     {"phase": "final_filtering"},
+                 )
+                 final_filtered_results = self.cross_engine_filter.filter_results(
+                     accumulated_search_results_across_all_iterations,
+                     query,
+                     reorder=True,  # Always reorder in final filtering
+                     reindex=True,  # Always reindex in final filtering
+                     max_results=int(get_db_setting("search.final_max_results") or 100),
+                     start_index=len(self.all_links_of_system),
+                 )
+                 self._update_progress(
+                     f"Filtered from {len(accumulated_search_results_across_all_iterations)} to {len(final_filtered_results)} results",
+                     iteration_progress_base + 85,
+                     {
+                         "phase": "filtering_complete",
+                         "iteration": iteration,
+                         "links_count": len(self.all_links_of_system),
+                     },
+                 )
+             else:
+                 final_filtered_results = filtered_search_results
+             # links = extract_links_from_search_results()
+             self.all_links_of_system.extend(final_filtered_results)
+
+             # Final synthesis after all iterations
+             self._update_progress(
+                 "Generating final synthesis", 90, {"phase": "synthesis"}
+             )
+
+             # Final synthesis
+             final_citation_result = self.citation_handler.analyze_followup(
+                 query,
+                 final_filtered_results,
+                 previous_knowledge="",  # Empty string as we don't need previous knowledge here
+                 nr_of_links=total_citation_count_before_this_search,
+             )
+
+             # Add null check for final_citation_result
+             if final_citation_result:
+                 synthesized_content = final_citation_result["content"]
+                 documents = final_citation_result.get("documents", [])
+             else:
+                 synthesized_content = "No relevant results found in final synthesis."
+                 documents = []
+
+             # Add a final synthesis finding
+             final_finding = {
+                 "phase": "Final synthesis",
+                 "content": synthesized_content,
+                 "question": query,
+                 "search_results": self.all_links_of_system,
+                 "documents": documents,
+             }
+             findings.append(final_finding)
+
+             # Add documents to repository
+             self.findings_repository.add_documents(documents)
+
+             # Transfer questions to repository
+             self.findings_repository.set_questions_by_iteration(
+                 self.questions_by_iteration
+             )
+
+             # Format findings
+             formatted_findings = self.findings_repository.format_findings_to_text(
+                 findings, synthesized_content
+             )
+
+         except Exception as e:
+             import traceback
+
+             error_msg = f"Error in research process: {str(e)}"
+             logger.error(error_msg)
+             logger.error(traceback.format_exc())
+             synthesized_content = f"Error: {str(e)}"
+             formatted_findings = f"Error: {str(e)}"
+             finding = {
+                 "phase": "Error",
+                 "content": synthesized_content,
+                 "question": query,
+                 "search_results": [],
+                 "documents": [],
+             }
+             findings.append(finding)
+
+         self._update_progress("Research complete", 100, {"phase": "complete"})
+
+         return {
+             "findings": findings,
+             "iterations": iterations_to_run,
+             "questions_by_iteration": self.questions_by_iteration,
+             "formatted_findings": formatted_findings,
+             "current_knowledge": synthesized_content,
+             "all_links_of_system": self.all_links_of_system,
+         }
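
The new SourceBasedSearchStrategy is the headline addition of 0.3.0. Based only on the constructor and analyze_topic shown above, a minimal usage sketch might look like the following; the query string and printed fields are illustrative, and the LLM and search engine are assumed to come from the package's get_llm()/get_search() defaults:

    from local_deep_research.advanced_search_system.strategies.source_based_strategy import (
        SourceBasedSearchStrategy,
    )

    # Construct with defaults: get_search()/get_llm() supply the engine and model.
    strategy = SourceBasedSearchStrategy(
        include_text_content=True,     # forwarded to the engine's include_full_content
        use_cross_engine_filter=True,  # enables the final cross-engine filtering pass
    )

    result = strategy.analyze_topic("example research question")

    # analyze_topic() returns a plain dict; these keys appear in the diff above.
    print(result["formatted_findings"])
    print(len(result["all_links_of_system"]), "links collected")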
local_deep_research/advanced_search_system/strategies/standard_strategy.py
@@ -3,7 +3,6 @@ import logging
  from typing import Dict
 
  from ...citation_handler import CitationHandler
- from ...config.config_files import settings
  from ...config.llm_config import get_llm
  from ...config.search_config import get_search
  from ...utilities.db_utils import get_db_setting
@@ -20,11 +19,17 @@ logger = logging.getLogger(__name__)
  class StandardSearchStrategy(BaseSearchStrategy):
      """Standard iterative search strategy that generates follow-up questions."""
 
-     def __init__(self, search=None, model=None, citation_handler=None):
+     def __init__(
+         self, search=None, model=None, citation_handler=None, all_links_of_system=None
+     ):
          """Initialize with optional dependency injection for testing."""
+         super().__init__(all_links_of_system=all_links_of_system)
          self.search = search or get_search()
          self.model = model or get_llm()
+
+         # Get iterations setting
          self.max_iterations = int(get_db_setting("search.iterations"))
+
          self.questions_per_iteration = int(
              get_db_setting("search.questions_per_iteration")
          )
@@ -43,7 +48,6 @@ class StandardSearchStrategy(BaseSearchStrategy):
 
          # Initialize other attributes
          self.progress_callback = None
-         self.all_links_of_system = list()
 
      def _update_progress(
          self, message: str, progress_percent: int = None, metadata: dict = None
@@ -117,7 +121,7 @@ Iteration: {iteration + 1} of {total_iterations}"""
             question_count = len(questions)
             knowledge_accumulation = get_db_setting(
                 "general.knowledge_accumulation",
-                settings.general.knowledge_accumulation,
+                "ITERATION",
             )
             for q_idx, question in enumerate(questions):
                 question_progress_base = iteration_progress_base + (
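
The "ITERATION" literal above is one instance of a pattern that runs through this release: defaults that previously lived in config_files.py and the removed TOML files (main.toml, search_engines.toml, local_collections.toml) now sit inline as fallbacks to database-backed lookups. A minimal sketch of the pattern, assuming get_db_setting(key, default) returns the stored value when present and the fallback otherwise:

    from local_deep_research.utilities.db_utils import get_db_setting

    # The stored setting wins; the second argument replaces the old TOML default.
    knowledge_accumulation = get_db_setting("general.knowledge_accumulation", "ITERATION")

    # Calls without a fallback (as in the strategies above) assume the key is set.
    iterations = int(get_db_setting("search.iterations"))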
local_deep_research/api/research_functions.py
@@ -4,11 +4,8 @@ Provides programmatic access to search and research capabilities.
  """
 
  import logging
- import os
  from typing import Any, Callable, Dict, Optional
 
- import toml
-
  from ..config.llm_config import get_llm
  from ..config.search_config import get_search
  from ..report_generator import IntegratedReportGenerator
@@ -279,46 +276,3 @@ def analyze_documents(
      logger.info(f"Analysis saved to {output_file}")
 
      return analysis_result
-
-
- def get_available_search_engines() -> Dict[str, str]:
-     """
-     Get a dictionary of available search engines.
-
-     Returns:
-         Dictionary mapping engine names to descriptions
-     """
-
-     from ..web_search_engines.search_engine_factory import get_available_engines
-
-     engines = get_available_engines()
-
-     # Add some descriptions for common engines
-     descriptions = {
-         "auto": "Automatic selection based on query type",
-         "wikipedia": "Wikipedia articles and general knowledge",
-         "arxiv": "Scientific papers and research",
-         "pubmed": "Medical and biomedical literature",
-         "semantic_scholar": "Academic papers across all fields",
-         "github": "Code repositories and technical documentation",
-         "local_all": "All local document collections",
-     }
-
-     return {engine: descriptions.get(engine, "Search engine") for engine in engines}
-
-
- def get_available_collections() -> Dict[str, Dict[str, Any]]:
-     """
-     Get a dictionary of available local document collections.
-
-     Returns:
-         Dictionary mapping collection names to their configuration
-     """
-
-     from ..config.config_files import LOCAL_COLLECTIONS_FILE
-
-     if os.path.exists(LOCAL_COLLECTIONS_FILE):
-         collections = toml.load(LOCAL_COLLECTIONS_FILE)
-         return collections
-
-     return {}
local_deep_research/citation_handler.py
@@ -4,7 +4,6 @@ from typing import Any, Dict, List, Union
 
  from langchain_core.documents import Document
 
- from .config.config_files import settings
  from .utilities.db_utils import get_db_setting
 
 
@@ -82,21 +81,19 @@ Provide a detailed analysis with citations. Do not create the bibliography, it w
          formatted_sources = self._format_sources(documents)
          # Add fact-checking step
          fact_check_prompt = f"""Analyze these sources for factual consistency:
-         1. Cross-reference major claims between sources
-         2. Identify and flag any contradictions
-         3. Verify basic facts (dates, company names, ownership)
-         4. Note when sources disagree
+ 1. Cross-reference major claims between sources
+ 2. Identify and flag any contradictions
+ 3. Verify basic facts (dates, company names, ownership)
+ 4. Note when sources disagree
 
-         Previous Knowledge:
-         {previous_knowledge}
+ Previous Knowledge:
+ {previous_knowledge}
 
-         New Sources:
-         {formatted_sources}
+ New Sources:
+ {formatted_sources}
 
  Return any inconsistencies or conflicts found."""
-         if get_db_setting(
-             "general.enable_fact_checking", settings.general.enable_fact_checking
-         ):
+         if get_db_setting("general.enable_fact_checking", True):
              fact_check_response = self.llm.invoke(fact_check_prompt).content
 
          else:
@@ -104,16 +101,15 @@ Provide a detailed analysis with citations. Do not create the bibliography, it w
 
          prompt = f"""Using the previous knowledge and new sources, answer the question. Include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source. Reflect information from sources critically.
 
-         Previous Knowledge:
-         {previous_knowledge}
+ Previous Knowledge:
+ {previous_knowledge}
 
-         Question: {question}
+ Question: {question}
 
-         New Sources:
-         {formatted_sources}
-         Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
-         Provide a detailed answer with citations. Example format: "According to [1], ..."
-         """
+ New Sources:
+ {formatted_sources}
+ Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
+ Provide a detailed answer with citations. Example format: "According to [1], ..." """
 
          response = self.llm.invoke(prompt)
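
The prompt reflow above leaves the handler's interface unchanged. Going by the call site in source_based_strategy.py earlier in this diff, analyze_followup takes the query, a list of result dicts, prior knowledge, and a citation-number offset, and the null check there implies a dict result with "content" and "documents" keys. A hedged sketch (the sample search results below are hypothetical):

    from local_deep_research.citation_handler import CitationHandler
    from local_deep_research.config.llm_config import get_llm

    handler = CitationHandler(get_llm())

    # Hypothetical results in the title/link/snippet shape the strategies consume.
    search_results = [
        {"title": "Example Source", "link": "https://example.org", "snippet": "..."}
    ]

    citation_result = handler.analyze_followup(
        "example query",
        search_results,
        previous_knowledge="",  # the source-based strategy passes an empty string
        nr_of_links=0,          # offset so citation numbers continue across searches
    )
    if citation_result:
        answer = citation_result["content"]              # cited synthesis
        documents = citation_result.get("documents", [])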