PyPI - local-deep-research - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

local-deep-research 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

local_deep_research/advanced_search_system/strategies/source_based_strategy.py ADDED Viewed

@@ -0,0 +1,407 @@
+import concurrent.futures
+import logging
+from typing import Dict
+from ...citation_handler import CitationHandler
+from ...config.llm_config import get_llm
+from ...config.search_config import get_search
+from ...utilities.db_utils import get_db_setting
+from ...utilities.search_utilities import extract_links_from_search_results
+from ..filters.cross_engine_filter import CrossEngineFilter
+from ..findings.repository import FindingsRepository
+from ..questions.standard_question import StandardQuestionGenerator
+from .base_strategy import BaseSearchStrategy
+logger = logging.getLogger(__name__)
+class SourceBasedSearchStrategy(BaseSearchStrategy):
+    """
+    Source-based search strategy that generates questions based on search results and
+    defers content analysis until final synthesis.
+    """
+    def __init__(
+        self,
+        search=None,
+        model=None,
+        citation_handler=None,
+        include_text_content: bool = True,
+        use_cross_engine_filter: bool = True,
+        filter_reorder: bool = True,
+        filter_reindex: bool = True,
+        filter_max_results: int = 20,
+    ):
+        """Initialize with optional dependency injection for testing."""
+        super().__init__()
+        self.search = search or get_search()
+        self.model = model or get_llm()
+        self.progress_callback = None
+        self.all_links_of_system = list()
+        self.all_search_results = []
+        self.questions_by_iteration = {}
+        self.include_text_content = include_text_content
+        self.use_cross_engine_filter = use_cross_engine_filter
+        self.filter_reorder = filter_reorder
+        self.filter_reindex = filter_reindex
+        # Initialize the cross-engine filter
+        self.cross_engine_filter = CrossEngineFilter(
+            model=self.model,
+            max_results=filter_max_results,
+            default_reorder=filter_reorder,
+            default_reindex=filter_reindex,
+        )
+        # Set include_full_content on the search engine if it supports it
+        if hasattr(self.search, "include_full_content"):
+            self.search.include_full_content = include_text_content
+        # Use provided citation_handler or create one
+        self.citation_handler = citation_handler or CitationHandler(self.model)
+        # Initialize components
+        self.question_generator = StandardQuestionGenerator(self.model)
+        self.findings_repository = FindingsRepository(self.model)
+    def _format_search_results_as_context(self, search_results):
+        """Format search results into context for question generation."""
+        context_snippets = []
+        for i, result in enumerate(
+            search_results[:10]
+        ):  # Limit to prevent context overflow
+            title = result.get("title", "Untitled")
+            snippet = result.get("snippet", "")
+            url = result.get("link", "")
+            if snippet:
+                context_snippets.append(
+                    f"Source {i + 1}: {title}\nURL: {url}\nSnippet: {snippet}"
+                )
+        return "\n\n".join(context_snippets)
+    def analyze_topic(self, query: str) -> Dict:
+        """
+        Analyze a topic using source-based search strategy.
+        """
+        logger.info(f"Starting source-based research on topic: {query}")
+        findings = []
+        self.all_search_results = []
+        # Track all search results across iterations
+        self.all_links_of_system = list()
+        self.questions_by_iteration = {}
+        self._update_progress(
+            "Initializing source-based research",
+            5,
+            {
+                "phase": "init",
+                "strategy": "source-based",
+                "include_text_content": self.include_text_content,
+            },
+        )
+        # Check search engine
+        if not self._validate_search_engine():
+            return {
+                "findings": [],
+                "iterations": 0,
+                "questions_by_iteration": {},
+                "formatted_findings": "Error: Unable to conduct research without a search engine.",
+                "current_knowledge": "",
+                "error": "No search engine available",
+            }
+        # Determine number of iterations to run
+        iterations_to_run = get_db_setting("search.iterations")
+        logger.debug("Selected amount of iterations: " + str(iterations_to_run))
+        iterations_to_run = int(iterations_to_run)
+        try:
+            # Run each iteration
+            for iteration in range(1, iterations_to_run + 1):
+                iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
+                self._update_progress(
+                    f"Starting iteration {iteration}/{iterations_to_run}",
+                    iteration_progress_base,
+                    {"phase": f"iteration_{iteration}", "iteration": iteration},
+                )
+                # Step 1: Generate or use questions
+                self._update_progress(
+                    f"Generating search questions for iteration {iteration}",
+                    iteration_progress_base + 5,
+                    {"phase": "question_generation", "iteration": iteration},
+                )
+                # For first iteration, use initial query
+                if iteration == 1:
+                    # Generate questions for first iteration
+                    source_context = self._format_search_results_as_context(
+                        self.all_search_results
+                    )
+                    context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
+                    # Always include the original query for the first iteration
+                    if query not in questions:
+                        all_questions = [query] + questions
+                    else:
+                        all_questions = questions
+                    self.questions_by_iteration[iteration] = all_questions
+                    logger.info(
+                        f"Using questions for iteration {iteration}: {all_questions}"
+                    )
+                else:
+                    # For subsequent iterations, generate questions based on previous search results
+                    source_context = self._format_search_results_as_context(
+                        self.all_search_results
+                    )
+                    if iteration != 1:
+                        context = f"""Previous search results:\n{source_context}\n\nIteration: {iteration} of {iterations_to_run}"""
+                    elif iterations_to_run == 1:
+                        context = ""
+                    else:
+                        context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                    # Use standard question generator with search results as context
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=context,
+                        query=query,
+                        questions_per_iteration=int(
+                            get_db_setting("search.questions_per_iteration")
+                        ),
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
+                    # Use only the new questions for this iteration's searches
+                    all_questions = questions
+                    # Store in questions_by_iteration
+                    self.questions_by_iteration[iteration] = questions
+                    logger.info(
+                        f"Generated questions for iteration {iteration}: {questions}"
+                    )
+                # Step 2: Run all searches in parallel for this iteration
+                self._update_progress(
+                    f"Running parallel searches for iteration {iteration}",
+                    iteration_progress_base + 10,
+                    {"phase": "parallel_search", "iteration": iteration},
+                )
+                # Function for thread pool
+                def search_question(q):
+                    try:
+                        result = self.search.run(q)
+                        return {"question": q, "results": result or []}
+                    except Exception as e:
+                        logger.error(f"Error searching for '{q}': {str(e)}")
+                        return {"question": q, "results": [], "error": str(e)}
+                # Run searches in parallel
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=len(all_questions)
+                ) as executor:
+                    futures = [
+                        executor.submit(search_question, q) for q in all_questions
+                    ]
+                    iteration_search_dict = {}
+                    iteration_search_results = []
+                    # Process results as they complete
+                    for i, future in enumerate(
+                        concurrent.futures.as_completed(futures)
+                    ):
+                        result_dict = future.result()
+                        question = result_dict["question"]
+                        search_results = result_dict["results"]
+                        iteration_search_dict[question] = search_results
+                        self._update_progress(
+                            f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                            iteration_progress_base
+                            + 10
+                            + ((i + 1) / len(all_questions) * 30),
+                            {
+                                "phase": "search_complete",
+                                "iteration": iteration,
+                                "result_count": len(search_results),
+                                "question": question,
+                            },
+                        )
+                        # Collect all search results for this iteration
+                        iteration_search_results.extend(search_results)
+                # Step 3: Apply cross-engine filtering if enabled
+                if self.use_cross_engine_filter:
+                    self._update_progress(
+                        f"Filtering search results for iteration {iteration}",
+                        iteration_progress_base + 45,
+                        {"phase": "cross_engine_filtering", "iteration": iteration},
+                    )
+                    # Get the current link count (for indexing)
+                    existing_link_count = len(self.all_links_of_system)
+                    # Filter the search results
+                    filtered_search_results = self.cross_engine_filter.filter_results(
+                        iteration_search_results,
+                        query,
+                        reorder=self.filter_reorder,
+                        reindex=self.filter_reindex,
+                        start_index=existing_link_count,  # Start indexing after existing links
+                    )
+                    links = extract_links_from_search_results(filtered_search_results)
+                    self.all_links_of_system.extend(links)
+                    self._update_progress(
+                        f"Filtered from {len(iteration_search_results)} to {len(filtered_search_results)} results",
+                        iteration_progress_base + 50,
+                        {
+                            "phase": "filtering_complete",
+                            "iteration": iteration,
+                            "links_count": len(self.all_links_of_system),
+                        },
+                    )
+                    # Use filtered results
+                    iteration_search_results = filtered_search_results
+                else:
+                    # Just extract links without filtering
+                    links = extract_links_from_search_results(iteration_search_results)
+                    self.all_links_of_system.extend(links)
+                # Add to all search results
+                self.all_search_results.extend(iteration_search_results)
+                # Create a lightweight finding for this iteration's search metadata (no text content)
+                finding = {
+                    "phase": f"Iteration {iteration}",
+                    "content": f"Searched with {len(all_questions)} questions, found {len(iteration_search_results)} results.",
+                    "question": query,
+                    "search_results": iteration_search_results,
+                    "documents": [],
+                }
+                findings.append(finding)
+                # Mark iteration as complete
+                iteration_progress = 5 + iteration * (70 / iterations_to_run)
+                self._update_progress(
+                    f"Completed iteration {iteration}/{iterations_to_run}",
+                    iteration_progress,
+                    {"phase": "iteration_complete", "iteration": iteration},
+                )
+            # Final filtering of all accumulated search results
+            self._update_progress(
+                "Performing final filtering of all results",
+                80,
+                {"phase": "final_filtering"},
+            )
+            # Apply final cross-engine filtering to all accumulated results if enabled
+            if self.use_cross_engine_filter:
+                final_filtered_results = self.cross_engine_filter.filter_results(
+                    self.all_search_results,
+                    query,
+                    reorder=True,  # Always reorder in final filtering
+                    reindex=False,  # Always reindex in final filtering
+                    max_results=int(get_db_setting("search.final_max_results") or 30),
+                )
+            else:
+                final_filtered_results = self.all_search_results
+            self._update_progress(
+                f"Filtered from {len(self.all_search_results)} to {len(final_filtered_results)} results",
+                iteration_progress_base + 85,
+                {
+                    "phase": "filtering_complete",
+                    "iteration": iteration,
+                    "links_count": len(self.all_links_of_system),
+                },
+            )
+            # Final synthesis after all iterations
+            self._update_progress(
+                "Generating final synthesis", 90, {"phase": "synthesis"}
+            )
+            total_citation_count = len(self.all_links_of_system)
+            # Final synthesis
+            final_citation_result = self.citation_handler.analyze_followup(
+                query,
+                final_filtered_results,
+                previous_knowledge="",  # Empty string as we don't need previous knowledge here
+                nr_of_links=total_citation_count,
+            )
+            # Add null check for final_citation_result
+            if final_citation_result:
+                synthesized_content = final_citation_result["content"]
+                documents = final_citation_result.get("documents", [])
+            else:
+                synthesized_content = "No relevant results found in final synthesis."
+                documents = []
+            # Add a final synthesis finding
+            final_finding = {
+                "phase": "Final synthesis",
+                "content": synthesized_content,
+                "question": query,
+                "search_results": final_filtered_results,
+                "documents": documents,
+            }
+            findings.append(final_finding)
+            # Add documents to repository
+            self.findings_repository.add_documents(documents)
+            # Transfer questions to repository
+            self.findings_repository.set_questions_by_iteration(
+                self.questions_by_iteration
+            )
+            # Format findings
+            formatted_findings = self.findings_repository.format_findings_to_text(
+                findings, synthesized_content
+            )
+        except Exception as e:
+            import traceback
+            error_msg = f"Error in research process: {str(e)}"
+            logger.error(error_msg)
+            logger.error(traceback.format_exc())
+            synthesized_content = f"Error: {str(e)}"
+            formatted_findings = f"Error: {str(e)}"
+            finding = {
+                "phase": "Error",
+                "content": synthesized_content,
+                "question": query,
+                "search_results": [],
+                "documents": [],
+            }
+            findings.append(finding)
+        self._update_progress("Research complete", 100, {"phase": "complete"})
+        return {
+            "findings": findings,
+            "iterations": iterations_to_run,
+            "questions_by_iteration": self.questions_by_iteration,
+            "formatted_findings": formatted_findings,
+            "current_knowledge": synthesized_content,
+        }

local_deep_research/api/research_functions.py CHANGED Viewed

@@ -9,78 +9,100 @@ from typing import Any, Callable, Dict, Optional
 import toml
-from .. import get_report_generator  # Use the lazy import function
 from ..config.llm_config import get_llm
 from ..config.search_config import get_search
+from ..report_generator import IntegratedReportGenerator
 from ..search_system import AdvancedSearchSystem
 from ..utilities.search_utilities import remove_think_tags
 logger = logging.getLogger(__name__)
-def quick_summary(
-    query: str,
+def _init_search_system(
+    model_name: str | None = None,
+    temperature: float = 0.7,
+    provider: str | None = None,
+    openai_endpoint_url: str | None = None,
+    progress_callback: Callable[[str, int, dict], None] | None = None,
     search_tool: Optional[str] = None,
     iterations: int = 1,
     questions_per_iteration: int = 1,
-    max_results: int = 20,
-    max_filtered_results: int = 5,
-    region: str = "us",
-    time_period: str = "y",
-    safe_search: bool = True,
-    temperature: float = 0.7,
-    progress_callback: Optional[Callable] = None,
-) -> Dict[str, Any]:
+) -> AdvancedSearchSystem:
     """
-    Generate a quick research summary for a given query.
+    Initializes the advanced search system with specified parameters. This function sets up
+    and returns an instance of the AdvancedSearchSystem using the provided configuration
+    options such as model name, temperature for randomness in responses, provider service
+    details, endpoint URL, and an optional search tool.
     Args:
-        query: The research query to analyze
+        model_name: Name of the model to use (if None, uses database setting)
+        temperature: LLM temperature for generation
+        provider: Provider to use (if None, uses database setting)
+        openai_endpoint_url: Custom endpoint URL to use (if None, uses database
+            setting)
+        progress_callback: Optional callback function to receive progress updates
         search_tool: Search engine to use (auto, wikipedia, arxiv, etc.). If None, uses default
         iterations: Number of research cycles to perform
         questions_per_iteration: Number of questions to generate per cycle
-        max_results: Maximum number of search results to consider
-        max_filtered_results: Maximum results after relevance filtering
-        region: Search region/locale
-        time_period: Time period for search results (d=day, w=week, m=month, y=year)
-        safe_search: Whether to enable safe search
-        temperature: LLM temperature for generation
-        progress_callback: Optional callback function to receive progress updates
     Returns:
-        Dictionary containing the research results with keys:
-        - 'summary': The generated summary text
-        - 'findings': List of detailed findings from each search
-        - 'iterations': Number of iterations performed
-        - 'questions': Questions generated during research
-    """
-    logger.info("Generating quick summary for query: %s", query)
+        AdvancedSearchSystem: An instance of the configured AdvancedSearchSystem.
+    """
     # Get language model with custom temperature
-    llm = get_llm(temperature=temperature)
-    # Create search system with custom parameters
-    system = AdvancedSearchSystem()
-    # Override default settings with user-provided values
-    system.max_iterations = iterations
-    system.questions_per_iteration = questions_per_iteration
-    system.model = llm  # Ensure the model is directly attached to the system
+    llm = get_llm(
+        temperature=temperature,
+        openai_endpoint_url=openai_endpoint_url,
+        model_name=model_name,
+        provider=provider,
+    )
     # Set the search engine if specified
+    search_engine = None
     if search_tool:
-        search_engine = get_search(search_tool)
-        if search_engine:
-            system.search = search_engine
-        else:
+        search_engine = get_search(search_tool, llm_instance=llm)
+        if search_engine is None:
             logger.warning(
                 f"Could not create search engine '{search_tool}', using default."
             )
+    # Create search system with custom parameters
+    system = AdvancedSearchSystem(llm=llm, search=search_engine)
+    # Override default settings with user-provided values
+    system.max_iterations = iterations
+    system.questions_per_iteration = questions_per_iteration
     # Set progress callback if provided
     if progress_callback:
         system.set_progress_callback(progress_callback)
+    return system
+def quick_summary(
+    query: str,
+    **kwargs: Any,
+) -> Dict[str, Any]:
+    """
+    Generate a quick research summary for a given query.
+    Args:
+        query: The research query to analyze
+        **kwargs: Configuration for the search system. Will be forwarded to
+            `_init_search_system()`.
+    Returns:
+        Dictionary containing the research results with keys:
+        - 'summary': The generated summary text
+        - 'findings': List of detailed findings from each search
+        - 'iterations': Number of iterations performed
+        - 'questions': Questions generated during research
+    """
+    logger.info("Generating quick summary for query: %s", query)
+    system = _init_search_system(**kwargs)
     # Perform the search and analysis
     results = system.analyze_topic(query)
@@ -103,36 +125,20 @@ def quick_summary(
 def generate_report(
     query: str,
-    search_tool: Optional[str] = None,
-    iterations: int = 2,
-    questions_per_iteration: int = 2,
-    searches_per_section: int = 2,
-    max_results: int = 50,
-    max_filtered_results: int = 5,
-    region: str = "us",
-    time_period: str = "y",
-    safe_search: bool = True,
-    temperature: float = 0.7,
     output_file: Optional[str] = None,
     progress_callback: Optional[Callable] = None,
+    searches_per_section: int = 2,
+    **kwargs: Any,
 ) -> Dict[str, Any]:
     """
     Generate a comprehensive, structured research report for a given query.
     Args:
         query: The research query to analyze
-        search_tool: Search engine to use (auto, wikipedia, arxiv, etc.). If None, uses default
-        iterations: Number of research cycles to perform
-        questions_per_iteration: Number of questions to generate per cycle
-        searches_per_section: Number of searches to perform per report section
-        max_results: Maximum number of search results to consider
-        max_filtered_results: Maximum results after relevance filtering
-        region: Search region/locale
-        time_period: Time period for search results (d=day, w=week, m=month, y=year)
-        safe_search: Whether to enable safe search
-        temperature: LLM temperature for generation
         output_file: Optional path to save report markdown file
         progress_callback: Optional callback function to receive progress updates
+        searches_per_section: The number of searches to perform for each
+            section in the report.
     Returns:
         Dictionary containing the research report with keys:
@@ -141,34 +147,7 @@ def generate_report(
     """
     logger.info("Generating comprehensive research report for query: %s", query)
-    # Get language model with custom temperature
-    llm = get_llm(temperature=temperature)
-    # Create search system with custom parameters
-    system = AdvancedSearchSystem()
-    # Override default settings with user-provided values
-    system.max_iterations = iterations
-    system.questions_per_iteration = questions_per_iteration
-    system.model = llm  # Ensure the model is directly attached to the system
-    # Set the search engine if specified
-    if search_tool:
-        search_engine = get_search(
-            search_tool,
-            llm_instance=llm,
-            max_results=max_results,
-            max_filtered_results=max_filtered_results,
-            region=region,
-            time_period=time_period,
-            safe_search=safe_search,
-        )
-        if search_engine:
-            system.search = search_engine
-        else:
-            logger.warning(
-                f"Could not create search engine '{search_tool}', using default."
-            )
+    system = _init_search_system(**kwargs)
     # Set progress callback if provided
     if progress_callback:
@@ -178,8 +157,11 @@ def generate_report(
     initial_findings = system.analyze_topic(query)
     # Generate the structured report
-    report_generator = get_report_generator(searches_per_section=searches_per_section)
-    report_generator.model = llm  # Ensure the model is set on the report generator too
+    report_generator = IntegratedReportGenerator(
+        search_system=system,
+        llm=system.model,
+        searches_per_section=searches_per_section,
+    )
     report = report_generator.generate_report(initial_findings, query)
     # Save report to file if path is provided

local_deep_research/citation_handler.py CHANGED Viewed

@@ -82,18 +82,18 @@ Provide a detailed analysis with citations. Do not create the bibliography, it w
         formatted_sources = self._format_sources(documents)
         # Add fact-checking step
         fact_check_prompt = f"""Analyze these sources for factual consistency:
-        1. Cross-reference major claims between sources
-        2. Identify and flag any contradictions
-        3. Verify basic facts (dates, company names, ownership)
-        4. Note when sources disagree
+1. Cross-reference major claims between sources
+2. Identify and flag any contradictions
+3. Verify basic facts (dates, company names, ownership)
+4. Note when sources disagree
-        Previous Knowledge:
-        {previous_knowledge}
+Previous Knowledge:
+{previous_knowledge}
-        New Sources:
-        {formatted_sources}
+New Sources:
+{formatted_sources}
-        Return any inconsistencies or conflicts found."""
+Return any inconsistencies or conflicts found."""
         if get_db_setting(
             "general.enable_fact_checking", settings.general.enable_fact_checking
         ):
@@ -104,16 +104,15 @@ Provide a detailed analysis with citations. Do not create the bibliography, it w
         prompt = f"""Using the previous knowledge and new sources, answer the question. Include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source. Reflect information from sources critically.
-            Previous Knowledge:
-            {previous_knowledge}
+Previous Knowledge:
+{previous_knowledge}
-            Question: {question}
+Question: {question}
-            New Sources:
-            {formatted_sources}
-            Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
-            Provide a detailed answer with citations.  Example format: "According to [1], ..."
-            """
+New Sources:
+{formatted_sources}
+Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
+Provide a detailed answer with citations.  Example format: "According to [1], ..." """
         response = self.llm.invoke(prompt)

local-deep-research 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl

local-deep-research 0.2.0__py3-none-any.whl → 0.2.3__py3-none-any.whl