PyPI - local-deep-research - Versions diffs - 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (140) hide show

local_deep_research/report_generator.py CHANGED Viewed

@@ -1,16 +1,28 @@
-from typing import Dict, List, Optional
-from .config import get_llm
-import re
-from datetime import datetime
+import importlib
+from typing import Dict, List
+# Fix circular import by importing directly from source modules
+from .config.llm_config import get_llm
 from .search_system import AdvancedSearchSystem
-from local_deep_research import config
-from . import utilties
-from .utilties import search_utilities
+# from . import utilities
+from .utilities import search_utilities
+def get_report_generator(search_system=None):
+    """Return an instance of the report generator with default settings.
+    Args:
+        search_system: Optional existing AdvancedSearchSystem to use
+    """
+    return IntegratedReportGenerator(search_system=search_system)
 class IntegratedReportGenerator:
-    def __init__(self, searches_per_section: int = 2):
+    def __init__(self, searches_per_section: int = 2, search_system=None):
         self.model = get_llm()
-        self.search_system = AdvancedSearchSystem()
+        # Use provided search_system or create a new one
+        self.search_system = search_system or AdvancedSearchSystem()
         self.searches_per_section = (
             searches_per_section  # Control search depth per section
         )
@@ -22,17 +34,16 @@ class IntegratedReportGenerator:
         structure = self._determine_report_structure(initial_findings, query)
         # Step 2: Research and generate content for each section in one step
-        sections = self._research_and_generate_sections(initial_findings, structure, query)
+        sections = self._research_and_generate_sections(
+            initial_findings, structure, query
+        )
         # Step 3: Format final report
         report = self._format_final_report(sections, structure, query)
         return report
-    def _determine_report_structure(
-        self, findings: Dict, query: str
-    ) -> List[Dict]:
+    def _determine_report_structure(self, findings: Dict, query: str) -> List[Dict]:
         """Analyze content and determine optimal report structure."""
         combined_content = findings["current_knowledge"]
         prompt = f"""
@@ -92,44 +103,51 @@ class IntegratedReportGenerator:
     ) -> Dict[str, str]:
         """Research and generate content for each section in one step."""
         sections = {}
         for section in structure:
             print(f"Processing section: {section['name']}")
             section_content = []
             section_content.append(f"# {section['name']}\n")
             # Process each subsection by directly researching it
             for subsection in section["subsections"]:
                 # Add subsection header
                 section_content.append(f"## {subsection['name']}\n")
                 section_content.append(f"_{subsection['purpose']}_\n\n")
                 # Generate a specific search query for this subsection
                 subsection_query = f"{query} {section['name']} {subsection['name']} {subsection['purpose']}"
-                print(f"Researching subsection: {subsection['name']} with query: {subsection_query}")
+                print(
+                    f"Researching subsection: {subsection['name']} with query: {subsection_query}"
+                )
                 # Configure search system for focused search
                 original_max_iterations = self.search_system.max_iterations
                 self.search_system.max_iterations = 1  # Keep search focused
                 # Perform search for this subsection
                 subsection_results = self.search_system.analyze_topic(subsection_query)
                 # Restore original iterations setting
                 self.search_system.max_iterations = original_max_iterations
                 # Add the researched content for this subsection
-                if "current_knowledge" in subsection_results and subsection_results["current_knowledge"]:
+                if (
+                    "current_knowledge" in subsection_results
+                    and subsection_results["current_knowledge"]
+                ):
                     section_content.append(subsection_results["current_knowledge"])
                 else:
-                    section_content.append("*Limited information was found for this subsection.*\n")
+                    section_content.append(
+                        "*Limited information was found for this subsection.*\n"
+                    )
                 section_content.append("\n\n")
             # Combine all content for this section
             sections[section["name"]] = "\n".join(section_content)
         return sections
     def _generate_sections(
@@ -157,15 +175,21 @@ class IntegratedReportGenerator:
         for i, section in enumerate(structure, 1):
             toc.append(f"{i}. **{section['name']}**")
             for j, subsection in enumerate(section["subsections"], 1):
-                toc.append(f"   {i}.{j} {subsection['name']} | _{subsection['purpose']}_")
+                toc.append(
+                    f"   {i}.{j} {subsection['name']} | _{subsection['purpose']}_"
+                )
         # Combine TOC and sections
         report_parts = ["\n".join(toc), ""]
         # Add a summary of the research
         report_parts.append("# Research Summary")
-        report_parts.append(f"This report was researched using an advanced search system.")
-        report_parts.append(f"Research included targeted searches for each section and subsection.")
+        report_parts.append(
+            "This report was researched using an advanced search system."
+        )
+        report_parts.append(
+            "Research included targeted searches for each section and subsection."
+        )
         report_parts.append("\n---\n")
         # Add each section's content
@@ -173,30 +197,34 @@ class IntegratedReportGenerator:
             if section["name"] in sections:
                 report_parts.append(sections[section["name"]])
                 report_parts.append("")
         # Format links from search system
-        formatted_all_links = utilties.search_utilities.format_links(links=self.search_system.all_links_of_system)
+        # Get utilities module dynamically to avoid circular imports
+        utilities = importlib.import_module("local_deep_research.utilities")
+        formatted_all_links = utilities.search_utilities.format_links_to_markdown(
+            all_links=self.search_system.all_links_of_system
+        )
         # Create final report with all parts
         final_report_content = "\n\n".join(report_parts)
-        final_report_content = final_report_content + "\n\n## Sources\n\n" + formatted_all_links
+        final_report_content = (
+            final_report_content + "\n\n## Sources\n\n" + formatted_all_links
+        )
         # Create metadata dictionary
         from datetime import datetime
         metadata = {
             "generated_at": datetime.utcnow().isoformat(),
             "initial_sources": len(self.search_system.all_links_of_system),
             "sections_researched": len(structure),
             "searches_per_section": self.searches_per_section,
-            "query": query
+            "query": query,
         }
         # Return both content and metadata
-        return {
-            "content": final_report_content,
-            "metadata": metadata
-        }
+        return {"content": final_report_content, "metadata": metadata}
     def _generate_error_report(self, query: str, error_msg: str) -> str:
         error_report = f"=== ERROR REPORT ===\nQuery: {query}\nError: {error_msg}"
-        return error_report
+        return error_report

local_deep_research/search_system.py CHANGED Viewed

@@ -1,306 +1,170 @@
-from typing import Dict, List, Optional, Callable
-from datetime import datetime
-from .utilties.search_utilities import remove_think_tags, format_findings_to_text, format_links
-import os
-from .utilties.enums import KnowledgeAccumulationApproach
-from .config import settings, get_llm, get_search
-from .citation_handler import CitationHandler
-from datetime import datetime
-from .utilties.search_utilities import extract_links_from_search_results
+# src/local_deep_research/search_system/search_system.py
 import logging
-logger = logging.getLogger(__name__)
-class AdvancedSearchSystem:
-    def __init__(self):
+from typing import Callable, Dict
+from langchain_core.language_models import BaseChatModel
+from .advanced_search_system.findings.repository import FindingsRepository
+from .advanced_search_system.questions.standard_question import (
+    StandardQuestionGenerator,
+)
+from .advanced_search_system.strategies.iterdrag_strategy import IterDRAGStrategy
+from .advanced_search_system.strategies.parallel_search_strategy import (
+    ParallelSearchStrategy,
+)
+from .advanced_search_system.strategies.rapid_search_strategy import RapidSearchStrategy
+from .advanced_search_system.strategies.standard_strategy import StandardSearchStrategy
+from .citation_handler import CitationHandler
+from .config.config_files import settings
+from .config.llm_config import get_llm
+from .config.search_config import get_search
+from .utilities.db_utils import get_db_setting
-        # Get fresh configuration
+logger = logging.getLogger(__name__)
-        self.search = get_search()
-        self.model = get_llm()
-        self.max_iterations = settings.search.iterations
-        self.questions_per_iteration = settings.search.questions_per_iteration
-        self.context_limit = settings.general.knowledge_accumulation_context_limit
-        self.questions_by_iteration = {}
-        self.citation_handler = CitationHandler(self.model)
-        self.progress_callback = None
-        self.all_links_of_system = list()
-        # Check if search is available, log warning if not
-        if self.search is None:
-            logger.info("WARNING: Search system initialized with no search engine! Research will not be effective.")
-            self._update_progress("WARNING: No search engine available", None, {"error": "No search engine configured properly"})
+class AdvancedSearchSystem:
+    """
+    Advanced search system that coordinates different search strategies.
+    """
+    def __init__(
+        self,
+        strategy_name: str = "parallel",
+        include_text_content: bool = True,
+        use_cross_engine_filter: bool = True,
+        llm: BaseChatModel | None = None,
+    ):
+        """Initialize the advanced search system.
-    def set_progress_callback(self, callback: Callable[[str, int, dict], None]) -> None:
-        """Set a callback function to receive progress updates.
         Args:
-            callback: Function that takes (message, progress_percent, metadata)
+            strategy_name: The name of the search strategy to use ("standard" or "iterdrag")
+            include_text_content: If False, only includes metadata and links in search results
+            use_cross_engine_filter: Whether to filter results across search
+                engines.
+            llm: LLM to use. If not provided, it will use the default one.
         """
-        self.progress_callback = callback
+        # Get configuration
+        self.search = get_search()
+        self.model = llm
+        if llm is None:
+            self.model = get_llm()
+        self.max_iterations = get_db_setting(
+            "search.iterations", settings.search.iterations
+        )
+        self.questions_per_iteration = get_db_setting(
+            "search.questions_per_iteration", settings.search.questions_per_iteration
+        )
-    def _update_progress(self, message: str, progress_percent: int = None, metadata: dict = None) -> None:
-        """Send a progress update via the callback if available.
-        Args:
-            message: Description of the current progress state
-            progress_percent: Progress percentage (0-100), if applicable
-            metadata: Additional data about the progress state
-        """
-        if self.progress_callback:
-            self.progress_callback(message, progress_percent, metadata or {})
+        # Log the strategy name that's being used
+        logger.info(
+            f"Initializing AdvancedSearchSystem with strategy_name='{strategy_name}'"
+        )
-    def _get_follow_up_questions(self, current_knowledge: str, query: str) -> List[str]:
-        now = datetime.now()
-        current_time = now.strftime("%Y-%m-%d")
-        self._update_progress("Generating follow-up questions...", None, {"iteration": len(self.questions_by_iteration)})
-        if self.questions_by_iteration:
-            prompt = f"""Critically reflect current knowledge (e.g., timeliness), what {self.questions_per_iteration} high-quality internet search questions remain unanswered to exactly answer the query?
-            Query: {query}
-            Today: {current_time}
-            Past questions: {str(self.questions_by_iteration)}
-            Knowledge: {current_knowledge}
-            Include questions that critically reflect current knowledge.
-            \n\n\nFormat: One question per line, e.g. \n Q: question1 \n Q: question2\n\n"""
+        # Initialize components
+        self.citation_handler = CitationHandler(self.model)
+        self.question_generator = StandardQuestionGenerator(self.model)
+        self.findings_repository = FindingsRepository(self.model)
+        # Initialize strategy based on name
+        if strategy_name.lower() == "iterdrag":
+            logger.info("Creating IterDRAGStrategy instance")
+            self.strategy = IterDRAGStrategy(model=self.model, search=self.search)
+        elif strategy_name.lower() == "parallel":
+            logger.info("Creating ParallelSearchStrategy instance")
+            self.strategy = ParallelSearchStrategy(
+                model=self.model,
+                search=self.search,
+                include_text_content=include_text_content,
+                use_cross_engine_filter=use_cross_engine_filter,
+            )
+        elif strategy_name.lower() == "rapid":
+            logger.info("Creating RapidSearchStrategy instance")
+            self.strategy = RapidSearchStrategy(model=self.model, search=self.search)
         else:
-            prompt = f" You will have follow up questions. First, identify if your knowledge is outdated (high chance). Today: {current_time}. Generate {self.questions_per_iteration} high-quality internet search questions to exactly answer: {query}\n\n\nFormat: One question per line, e.g. \n Q: question1 \n Q: question2\n\n"
+            logger.info("Creating StandardSearchStrategy instance")
+            self.strategy = StandardSearchStrategy(model=self.model, search=self.search)
-        response = self.model.invoke(prompt)
-        questions = [
-            q.replace("Q:", "").strip()
-            for q in remove_think_tags(response.content).split("\n")
-            if q.strip().startswith("Q:")
-        ][: self.questions_per_iteration]
-        self._update_progress(
-            f"Generated {len(questions)} follow-up questions",
-            None,
-            {"questions": questions}
-        )
-        return questions
+        # Log the actual strategy class
+        logger.info(f"Created strategy of type: {type(self.strategy).__name__}")
-    def _compress_knowledge(self, current_knowledge: str, query: str, section_links) -> List[str]:
-        self._update_progress("Compressing and summarizing knowledge...", None)
+        # For backward compatibility
+        self.questions_by_iteration = {}
+        self.progress_callback = None
+        self.all_links_of_system = list()
-        now = datetime.now()
-        current_time = now.strftime("%Y-%m-%d")
-        formatted_links = format_links(links=section_links)
-        if self.questions_by_iteration:
-            prompt = f"""First provide a high-quality 1 page explanation with IEEE Referencing Style e.g. [1,2]. Never make up sources. Than provide a exact high-quality one sentence-long answer to the query.
+        # Configure the strategy with our attributes
+        if hasattr(self, "progress_callback") and self.progress_callback:
+            self.strategy.set_progress_callback(self.progress_callback)
-            Knowledge: {current_knowledge}
-            Query: {query}
-            I will append following text to your output for the sources (dont repeat it):\n\n {formatted_links}"""
-        response = self.model.invoke(prompt)
-        self._update_progress("Knowledge compression complete", None)
-        response = remove_think_tags(response.content)
-        response = str(response) #+ "\n\n" + str(formatted_links)
+    def _progress_callback(self, message: str, progress: int, metadata: dict) -> None:
+        """Handle progress updates from the strategy."""
+        logger.info(f"Progress: {progress}% - {message}")
+        if hasattr(self, "progress_callback"):
+            self.progress_callback(message, progress, metadata)
-        return response
+    def set_progress_callback(self, callback: Callable[[str, int, dict], None]) -> None:
+        """Set a callback function to receive progress updates."""
+        self.progress_callback = callback
+        if hasattr(self, "strategy"):
+            self.strategy.set_progress_callback(callback)
     def analyze_topic(self, query: str) -> Dict:
-        logger.info(f"Starting research on topic: {query}")
-        findings = []
-        current_knowledge = ""
-        iteration = 0
-        total_iterations = self.max_iterations
-        section_links = list()
-        self._update_progress("Initializing research system", 5, {
-            "phase": "init",
-            "iterations_planned": total_iterations
-        })
-        # Check if search engine is available
-        if self.search is None:
-            error_msg = "Error: No search engine available. Please check your configuration."
-            self._update_progress(error_msg, 100, {
-                "phase": "error",
-                "error": "No search engine available",
-                "status": "failed"
-            })
-            return {
-                "findings": [],
-                "iterations": 0,
-                "questions": {},
-                "formatted_findings": "Error: Unable to conduct research without a search engine.",
-                "current_knowledge": "",
-                "error": error_msg
-            }
-        while iteration < self.max_iterations:
-            iteration_progress_base = (iteration / total_iterations) * 100
-            self._update_progress(f"Starting iteration {iteration + 1} of {total_iterations}",
-                                 int(iteration_progress_base),
-                                 {"phase": "iteration_start", "iteration": iteration + 1})
-            # Generate questions for this iteration
-            questions = self._get_follow_up_questions(current_knowledge, query)
-            self.questions_by_iteration[iteration] = questions
-            logger.info(f"Generated questions: {questions}")
-            question_count = len(questions)
-            for q_idx, question in enumerate(questions):
-                question_progress_base = iteration_progress_base + (((q_idx+1) / question_count) * (100/total_iterations) * 0.5)
-                self._update_progress(f"Searching for: {question}",
-                                     int(question_progress_base),
-                                     {"phase": "search", "iteration": iteration + 1, "question_index": q_idx + 1})
-                try:
-                    if self.search is None:
-                        self._update_progress(f"Search engine unavailable, skipping search for: {question}",
-                                            int(question_progress_base + 2),
-                                            {"phase": "search_error", "error": "No search engine available"})
-                        search_results = []
-                    else:
-                        search_results = self.search.run(question)
-                except Exception as e:
-                    error_msg = f"Error during search: {str(e)}"
-                    logger.info(f"SEARCH ERROR: {error_msg}")
-                    self._update_progress(error_msg,
-                                        int(question_progress_base + 2),
-                                        {"phase": "search_error", "error": str(e)})
-                    search_results = []
-                if search_results is None:
-                    self._update_progress(f"No search results found for question: {question}",
-                                        int(question_progress_base + 2),
-                                        {"phase": "search_complete", "result_count": 0})
-                    search_results = []  # Initialize to empty list instead of None
-                    continue
-                self._update_progress(f"Found {len(search_results)} results for question: {question}",
-                                    int(question_progress_base + 2),
-                                    {"phase": "search_complete", "result_count": len(search_results)})
-                logger.info(f"len search: {len(search_results)}")
-                if len(search_results) == 0:
-                    continue
+        """Analyze a topic using the current strategy.
-                self._update_progress(f"Analyzing results for: {question}",
-                                     int(question_progress_base + 5),
-                                     {"phase": "analysis"})
+        Args:
+            query: The research query to analyze
+        """
+        # Send progress message with LLM info
+        self.progress_callback(
+            f"Using {get_db_setting('llm.provider')} model: {get_db_setting('llm.model')}",
+            1,  # Low percentage to show this as an early step
+            {
+                "phase": "setup",
+                "llm_info": {
+                    "name": get_db_setting("llm.model"),
+                    "provider": get_db_setting("llm.provider"),
+                },
+            },
+        )
+        # Send progress message with search strategy info
+        search_tool = get_db_setting("search.tool")
+        self.progress_callback(
+            f"Using search tool: {search_tool}",
+            1.5,  # Between setup and processing steps
+            {
+                "phase": "setup",
+                "search_info": {
+                    "tool": search_tool,
+                },
+            },
+        )
-                try:
-                    result = self.citation_handler.analyze_followup(
-                        question, search_results, current_knowledge, nr_of_links=len(self.all_links_of_system)
-                    )
-                    links = extract_links_from_search_results(search_results)
-                    self.all_links_of_system.extend(links)
-                    section_links.extend(links)
-                    formatted_links = ""
-                    if links:
-                        formatted_links=format_links(links=links)
-                    logger.info(f"Generated questions: {formatted_links}")
-                    if result is not None:
-                        results_with_links = str(result["content"])
-                        findings.append(
-                            {
-                                "phase": f"Follow-up {iteration}.{questions.index(question) + 1}",
-                                "content": results_with_links,
-                                "question": question,
-                                "search_results": search_results,
-                                "documents": result["documents"],
-                            }
+        # Use the strategy to analyze the topic
+        result = self.strategy.analyze_topic(query)
+        # Update our attributes for backward compatibility
+        if hasattr(self.strategy, "questions_by_iteration"):
+            self.questions_by_iteration = self.strategy.questions_by_iteration
+            # Send progress message with search info
+            self.progress_callback(
+                f"Processed questions: {self.strategy.questions_by_iteration}",
+                2,  # Low percentage to show this as an early step
+                {
+                    "phase": "setup",
+                    "search_info": {
+                        "questions_by_iteration": len(
+                            self.strategy.questions_by_iteration
                         )
+                    },
+                },
+            )
+        if hasattr(self.strategy, "all_links_of_system"):
+            self.all_links_of_system = self.strategy.all_links_of_system
-                        if settings.general.knowledge_accumulation != str(KnowledgeAccumulationApproach.NO_KNOWLEDGE.value):
-                            current_knowledge = current_knowledge + "\n\n\n New: \n" + results_with_links
-                        if settings.general.knowledge_accumulation == str(KnowledgeAccumulationApproach.QUESTION.value):
-                            logger.info("Compressing knowledge")
-                            self._update_progress(f"Compress Knowledge for: {question}",
-                                        int(question_progress_base + 0),
-                                        {"phase": "analysis"})
-                            current_knowledge = self._compress_knowledge(current_knowledge , query, section_links)
-                        self._update_progress(f"Analysis complete for question: {question}",
-                                            int(question_progress_base + 10),
-                                            {"phase": "analysis_complete"})
-                except Exception as e:
-                    error_msg = f"Error analyzing results: {str(e)}"
-                    logger.info(f"ANALYSIS ERROR: {error_msg}")
-                    self._update_progress(error_msg,
-                                        int(question_progress_base + 10),
-                                        {"phase": "analysis_error", "error": str(e)})
-            iteration += 1
-            self._update_progress(f"Compressing knowledge after iteration {iteration}",
-                                 int((iteration / total_iterations) * 100 - 5),
-                                 {"phase": "knowledge_compression"})
-            logger.info(str(iteration))
-            logger.info(settings.general.knowledge_accumulation)
-            logger.info(str(KnowledgeAccumulationApproach.ITERATION.value))
-            if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.ITERATION.value:
-                try:
-                    logger.info("ITERATION - Compressing Knowledge")
-                    current_knowledge = self._compress_knowledge(current_knowledge , query, section_links)
-                    logger.info("FINISHED ITERATION - Compressing Knowledge")
-                except Exception as e:
-                    error_msg = f"Error compressing knowledge: {str(e)}"
-                    logger.info(f"COMPRESSION ERROR: {error_msg}")
-                    self._update_progress(error_msg,
-                                        int((iteration / total_iterations) * 100 - 3),
-                                        {"phase": "compression_error", "error": str(e)})
-            self._update_progress(f"Iteration {iteration} complete",
-                                 int((iteration / total_iterations) * 100),
-                                 {"phase": "iteration_complete", "iteration": iteration})
-            try:
-                formatted_findings = self._save_findings(findings, current_knowledge, query)
-            except Exception as e:
-                error_msg = f"Error saving findings: {str(e)}"
-                logger.info(f"SAVE ERROR: {error_msg}")
-                self._update_progress(error_msg,
-                                    int((iteration / total_iterations) * 100),
-                                    {"phase": "save_error", "error": str(e)})
-                formatted_findings = "Error: Could not format findings due to an error."
-        self._update_progress("Research complete", 95, {"phase": "complete"})
-        return {
-            "findings": findings,
-            "iterations": iteration,
-            "questions": self.questions_by_iteration,
-            "formatted_findings": formatted_findings,
-            "current_knowledge": current_knowledge
-        }
-    def _save_findings(self, findings: List[Dict], current_knowledge: str, query: str):
-        logger.info("Saving findings ...")
-        self._update_progress("Saving research findings...", None)
-        formatted_findings = format_findings_to_text(
-            findings, current_knowledge, self.questions_by_iteration
-        )
-        safe_query = "".join(x for x in query if x.isalnum() or x in [" ", "-", "_"])[
-            :50
-        ]
-        safe_query = safe_query.replace(" ", "_").lower()
-        import local_deep_research.config as conf
-        output_dir = f"{conf.get_config_dir()}/research_outputs"
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-        filename = os.path.join(output_dir, f"formatted_output_{safe_query}.txt")
+        # Include the search system instance for access to citations
+        result["search_system"] = self
-        with open(filename, "w", encoding="utf-8") as text_file:
-            text_file.write(formatted_findings)
-        logger.info("Saved findings")
-        self._update_progress("Research findings saved", None, {"filename": filename})
-        return formatted_findings
+        return result

local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl

local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl