local-deep-research 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +24 -0
- local_deep_research/citation_handler.py +113 -0
- local_deep_research/config.py +166 -0
- local_deep_research/defaults/__init__.py +44 -0
- local_deep_research/defaults/llm_config.py +269 -0
- local_deep_research/defaults/local_collections.toml +47 -0
- local_deep_research/defaults/main.toml +57 -0
- local_deep_research/defaults/search_engines.toml +244 -0
- local_deep_research/local_collections.py +141 -0
- local_deep_research/main.py +113 -0
- local_deep_research/report_generator.py +206 -0
- local_deep_research/search_system.py +241 -0
- local_deep_research/utilties/__init__.py +0 -0
- local_deep_research/utilties/enums.py +9 -0
- local_deep_research/utilties/llm_utils.py +116 -0
- local_deep_research/utilties/search_utilities.py +115 -0
- local_deep_research/utilties/setup_utils.py +6 -0
- local_deep_research/web/__init__.py +2 -0
- local_deep_research/web/app.py +1209 -0
- local_deep_research/web/static/css/styles.css +1008 -0
- local_deep_research/web/static/js/app.js +2078 -0
- local_deep_research/web/templates/api_keys_config.html +82 -0
- local_deep_research/web/templates/collections_config.html +90 -0
- local_deep_research/web/templates/index.html +312 -0
- local_deep_research/web/templates/llm_config.html +120 -0
- local_deep_research/web/templates/main_config.html +89 -0
- local_deep_research/web/templates/search_engines_config.html +154 -0
- local_deep_research/web/templates/settings.html +519 -0
- local_deep_research/web/templates/settings_dashboard.html +207 -0
- local_deep_research/web_search_engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/full_search.py +128 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
- local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
- local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
- local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
- local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
- local_deep_research/web_search_engines/full_search.py +254 -0
- local_deep_research/web_search_engines/search_engine_base.py +197 -0
- local_deep_research/web_search_engines/search_engine_factory.py +233 -0
- local_deep_research/web_search_engines/search_engines_config.py +54 -0
- local_deep_research-0.1.0.dist-info/LICENSE +21 -0
- local_deep_research-0.1.0.dist-info/METADATA +328 -0
- local_deep_research-0.1.0.dist-info/RECORD +56 -0
- local_deep_research-0.1.0.dist-info/WHEEL +5 -0
- local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
- local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
local_deep_research/report_generator.py
@@ -0,0 +1,206 @@
from typing import Dict, List, Optional
from .config import get_llm
import re
from datetime import datetime
from .search_system import AdvancedSearchSystem
from local_deep_research import config
from . import utilties
from .utilties import search_utilities


class IntegratedReportGenerator:
    def __init__(self, searches_per_section: int = 2):
        self.model = get_llm()
        self.search_system = AdvancedSearchSystem()
        self.searches_per_section = (
            searches_per_section  # Control search depth per section
        )

    def _remove_think_tags(self, text: str) -> str:
        print(text)
        return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    def generate_report(self, initial_findings: Dict, query: str) -> Dict:
        """Generate a complete research report with section-specific research."""

        # Step 1: Determine structure
        structure = self._determine_report_structure(initial_findings, query)

        # Step 2: Research and generate content for each section in one step
        sections = self._research_and_generate_sections(initial_findings, structure, query)

        # Step 3: Format final report
        report = self._format_final_report(sections, structure, query)

        return report

    def _determine_report_structure(
        self, findings: Dict, query: str
    ) -> List[Dict]:
        """Analyze content and determine optimal report structure."""
        combined_content = findings["current_knowledge"]
        prompt = f"""
        Analyze this research content about: {query}

        Content Summary:
        {combined_content[:1000]}... [truncated]

        Determine the most appropriate report structure by:
        1. Analyzing the type of content (technical, business, academic, etc.)
        2. Identifying main themes and logical groupings
        3. Considering the depth and breadth of the research

        Return a table of contents structure in this exact format:
        STRUCTURE
        1. [Section Name]
        - [Subsection] | [purpose]
        2. [Section Name]
        - [Subsection] | [purpose]
        ...
        END_STRUCTURE

        Make the structure specific to the content, not generic.
        Each subsection must include its purpose after the | symbol.
        """

        response = self._remove_think_tags(self.model.invoke(prompt).content)

        # Parse the structure
        structure = []
        current_section = None

        for line in response.split("\n"):
            if line.strip() in ["STRUCTURE", "END_STRUCTURE"]:
                continue

            if line.strip().startswith(tuple("123456789")):
                # Main section
                section_name = line.split(".")[1].strip()
                current_section = {"name": section_name, "subsections": []}
                structure.append(current_section)
            elif line.strip().startswith("-") and current_section:
                # Subsection with purpose
                parts = line.strip("- ").split("|")
                if len(parts) == 2:
                    current_section["subsections"].append(
                        {"name": parts[0].strip(), "purpose": parts[1].strip()}
                    )

        return structure

    def _research_and_generate_sections(
        self,
        initial_findings: Dict,
        structure: List[Dict],
        query: str,
    ) -> Dict[str, str]:
        """Research and generate content for each section in one step."""
        sections = {}

        for section in structure:
            print(f"Processing section: {section['name']}")
            section_content = []
            section_content.append(f"# {section['name']}\n")

            # Process each subsection by directly researching it
            for subsection in section["subsections"]:
                # Add subsection header
                section_content.append(f"## {subsection['name']}\n")
                section_content.append(f"_{subsection['purpose']}_\n\n")

                # Generate a specific search query for this subsection
                subsection_query = f"{query} {section['name']} {subsection['name']} {subsection['purpose']}"

                print(f"Researching subsection: {subsection['name']} with query: {subsection_query}")

                # Configure search system for focused search
                original_max_iterations = self.search_system.max_iterations
                self.search_system.max_iterations = 1  # Keep search focused

                # Perform search for this subsection
                subsection_results = self.search_system.analyze_topic(subsection_query)

                # Restore original iterations setting
                self.search_system.max_iterations = original_max_iterations

                # Add the researched content for this subsection
                if "current_knowledge" in subsection_results and subsection_results["current_knowledge"]:
                    section_content.append(subsection_results["current_knowledge"])
                else:
                    section_content.append("*Limited information was found for this subsection.*\n")

                section_content.append("\n\n")

            # Combine all content for this section
            sections[section["name"]] = "\n".join(section_content)

        return sections

    def _generate_sections(
        self,
        initial_findings: Dict,
        section_research: Dict[str, List[Dict]],
        structure: List[Dict],
        query: str,
    ) -> Dict[str, str]:
        """
        This method is kept for compatibility but no longer used.
        The functionality has been moved to _research_and_generate_sections.
        """
        return {}

    def _format_final_report(
        self,
        sections: Dict[str, str],
        structure: List[Dict],
        query: str,
    ) -> Dict:
        """Format the final report with table of contents and sections."""
        # Generate TOC
        toc = ["# Table of Contents\n"]
        for i, section in enumerate(structure, 1):
            toc.append(f"{i}. **{section['name']}**")
            for j, subsection in enumerate(section["subsections"], 1):
                toc.append(f"   {i}.{j} {subsection['name']} | _{subsection['purpose']}_")

        # Combine TOC and sections
        report_parts = ["\n".join(toc), ""]

        # Add a summary of the research
        report_parts.append("# Research Summary")
        report_parts.append("This report was researched using an advanced search system.")
        report_parts.append("Research included targeted searches for each section and subsection.")
        report_parts.append("\n---\n")

        # Add each section's content
        for section in structure:
            if section["name"] in sections:
                report_parts.append(sections[section["name"]])
                report_parts.append("")

        # Format links from search system
        formatted_all_links = utilties.search_utilities.format_links(links=self.search_system.all_links_of_system)

        # Create final report with all parts
        final_report_content = "\n\n".join(report_parts)
        final_report_content = final_report_content + "\n\n## Sources\n\n" + formatted_all_links

        # Create metadata dictionary
        metadata = {
            "generated_at": datetime.utcnow().isoformat(),
            "initial_sources": len(self.search_system.all_links_of_system),
            "sections_researched": len(structure),
            "searches_per_section": self.searches_per_section,
            "query": query
        }

        # Return both content and metadata
        return {
            "content": final_report_content,
            "metadata": metadata
        }

    def _generate_error_report(self, query: str, error_msg: str) -> str:
        error_report = f"=== ERROR REPORT ===\nQuery: {query}\nError: {error_msg}"
        return error_report
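
For reference, the parser in `_determine_report_structure` only keeps numbered lines and dash lines containing exactly one `|`. A hypothetical model response it would accept (invented here for illustration; real output depends on the configured LLM) looks like:

    STRUCTURE
    1. Background
    - Key Concepts | define the core terminology
    - Current Landscape | survey existing approaches
    2. Findings
    - Main Results | present the evidence gathered
    END_STRUCTURE

Note that a section title containing a period would confuse `line.split(".")[1]`, so single-phrase section names are the safe case.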
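
Taken together, the two modules in this diff suggest the intended end-to-end flow. A minimal sketch, assuming the package is installed and an LLM plus a search backend are already configured for `get_llm()`/`get_search()`; the query string is invented:

    from local_deep_research.search_system import AdvancedSearchSystem
    from local_deep_research.report_generator import IntegratedReportGenerator

    query = "state of local LLM tooling"  # hypothetical query

    # analyze_topic returns a dict whose "current_knowledge" key is what
    # generate_report reads from its initial_findings argument.
    system = AdvancedSearchSystem()
    initial_findings = system.analyze_topic(query)

    generator = IntegratedReportGenerator(searches_per_section=2)
    report = generator.generate_report(initial_findings, query)
    print(report["content"])   # Markdown report ending in a "## Sources" list
    print(report["metadata"])  # generated_at, initial_sources, sections_researched, ...
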
local_deep_research/search_system.py
@@ -0,0 +1,241 @@
from typing import Dict, List, Optional, Callable
from datetime import datetime
from .utilties.search_utilities import (
    remove_think_tags,
    format_findings_to_text,
    print_search_results,
    format_links,
    extract_links_from_search_results,
)
import os
from .utilties.enums import KnowledgeAccumulationApproach
from .config import settings, get_llm, get_search
from .citation_handler import CitationHandler
import logging

logger = logging.getLogger(__name__)


class AdvancedSearchSystem:
    def __init__(self):
        # Get fresh configuration
        self.search = get_search()
        self.model = get_llm()
        self.max_iterations = settings.search.iterations
        self.questions_per_iteration = settings.search.questions_per_iteration

        self.context_limit = settings.general.knowledge_accumulation_context_limit
        self.questions_by_iteration = {}
        self.citation_handler = CitationHandler(self.model)
        self.progress_callback = None
        self.all_links_of_system = list()

    def set_progress_callback(self, callback: Callable[[str, int, dict], None]) -> None:
        """Set a callback function to receive progress updates.

        Args:
            callback: Function that takes (message, progress_percent, metadata)
        """
        self.progress_callback = callback

    def _update_progress(self, message: str, progress_percent: int = None, metadata: dict = None) -> None:
        """Send a progress update via the callback if available.

        Args:
            message: Description of the current progress state
            progress_percent: Progress percentage (0-100), if applicable
            metadata: Additional data about the progress state
        """
        if self.progress_callback:
            self.progress_callback(message, progress_percent, metadata or {})

    def _get_follow_up_questions(self, current_knowledge: str, query: str) -> List[str]:
        now = datetime.now()
        current_time = now.strftime("%Y-%m-%d")

        self._update_progress("Generating follow-up questions...", None, {"iteration": len(self.questions_by_iteration)})

        if self.questions_by_iteration:
            prompt = f"""Critically reflect on the current knowledge (e.g., its timeliness): what {self.questions_per_iteration} high-quality internet search questions remain unanswered that would exactly answer the query?
Query: {query}
Today: {current_time}
Past questions: {str(self.questions_by_iteration)}
Knowledge: {current_knowledge}
Include questions that critically reflect the current knowledge.
\n\n\nFormat: One question per line, e.g. \n Q: question1 \n Q: question2\n\n"""
        else:
            prompt = f"You will have follow-up questions. First, identify whether your knowledge is outdated (high chance). Today: {current_time}. Generate {self.questions_per_iteration} high-quality internet search questions to exactly answer: {query}\n\n\nFormat: One question per line, e.g. \n Q: question1 \n Q: question2\n\n"

        response = self.model.invoke(prompt)
        questions = [
            q.replace("Q:", "").strip()
            for q in remove_think_tags(response.content).split("\n")
            if q.strip().startswith("Q:")
        ][: self.questions_per_iteration]

        self._update_progress(
            f"Generated {len(questions)} follow-up questions",
            None,
            {"questions": questions}
        )

        return questions

    def _compress_knowledge(self, current_knowledge: str, query: str, section_links) -> str:
        self._update_progress("Compressing and summarizing knowledge...", None)

        now = datetime.now()
        current_time = now.strftime("%Y-%m-%d")
        formatted_links = format_links(links=section_links)
        if self.questions_by_iteration:
            prompt = f"""First provide a high-quality one-page explanation with IEEE referencing style, e.g. [1,2]. Never make up sources. Then provide an exact, high-quality, one-sentence answer to the query.

Knowledge: {current_knowledge}
Query: {query}
I will append the following text to your output for the sources (don't repeat it):\n\n {formatted_links}"""
            response = self.model.invoke(prompt)

            self._update_progress("Knowledge compression complete", None)
            response = remove_think_tags(response.content)
            response = str(response)  # + "\n\n" + str(formatted_links)
            print(response)
            return response
        # No questions have been asked yet, so there is nothing to compress;
        # return the knowledge unchanged rather than reference an undefined response.
        return current_knowledge

    def analyze_topic(self, query: str) -> Dict:
        logger.info(f"Starting research on topic: {query}")

        findings = []
        current_knowledge = ""
        iteration = 0
        total_iterations = self.max_iterations
        section_links = list()

        self._update_progress("Initializing research system", 5, {
            "phase": "init",
            "iterations_planned": total_iterations
        })

        while iteration < self.max_iterations:
            iteration_progress_base = (iteration / total_iterations) * 100
            self._update_progress(f"Starting iteration {iteration + 1} of {total_iterations}",
                                  int(iteration_progress_base),
                                  {"phase": "iteration_start", "iteration": iteration + 1})

            # Generate questions for this iteration
            questions = self._get_follow_up_questions(current_knowledge, query)
            self.questions_by_iteration[iteration] = questions
            logger.info(f"Generated questions: {questions}")
            question_count = len(questions)
            for q_idx, question in enumerate(questions):
                question_progress_base = iteration_progress_base + (((q_idx + 1) / question_count) * (100 / total_iterations) * 0.5)

                self._update_progress(f"Searching for: {question}",
                                      int(question_progress_base),
                                      {"phase": "search", "iteration": iteration + 1, "question_index": q_idx + 1})

                search_results = self.search.run(question)

                if search_results is None:
                    self._update_progress(f"No search results found for question: {question}",
                                          int(question_progress_base + 2),
                                          {"phase": "search_complete", "result_count": 0})
                    search_results = []  # Initialize to empty list instead of None
                    continue

                self._update_progress(f"Found {len(search_results)} results for question: {question}",
                                      int(question_progress_base + 2),
                                      {"phase": "search_complete", "result_count": len(search_results)})

                logger.info("len search: %s", len(search_results))

                if len(search_results) == 0:
                    continue

                self._update_progress(f"Analyzing results for: {question}",
                                      int(question_progress_base + 5),
                                      {"phase": "analysis"})
                print("NR OF SOURCES: ", len(self.all_links_of_system))
                result = self.citation_handler.analyze_followup(
                    question, search_results, current_knowledge, nr_of_links=len(self.all_links_of_system)
                )
                links = extract_links_from_search_results(search_results)
                self.all_links_of_system.extend(links)
                section_links.extend(links)
                formatted_links = ""
                if links:
                    formatted_links = format_links(links=links)

                logger.debug(f"Formatted links: {formatted_links}")
                if result is not None:
                    results_with_links = str(result["content"])
                    findings.append(
                        {
                            "phase": f"Follow-up {iteration}.{q_idx + 1}",
                            "content": results_with_links,
                            "question": question,
                            "search_results": search_results,
                            "documents": result["documents"],
                        }
                    )

                    if settings.general.knowledge_accumulation != KnowledgeAccumulationApproach.NO_KNOWLEDGE:
                        current_knowledge = current_knowledge + "\n\n\n New: \n" + results_with_links

                    print(current_knowledge)
                    if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.QUESTION:
                        self._update_progress(f"Compress knowledge for: {question}",
                                              int(question_progress_base),
                                              {"phase": "analysis"})
                        current_knowledge = self._compress_knowledge(current_knowledge, query, section_links)

                self._update_progress(f"Analysis complete for question: {question}",
                                      int(question_progress_base + 10),
                                      {"phase": "analysis_complete"})

            iteration += 1

            self._update_progress(f"Compressing knowledge after iteration {iteration}",
                                  int((iteration / total_iterations) * 100 - 5),
                                  {"phase": "knowledge_compression"})
            if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.ITERATION:
                current_knowledge = self._compress_knowledge(current_knowledge, query, section_links)

            self._update_progress(f"Iteration {iteration} complete",
                                  int((iteration / total_iterations) * 100),
                                  {"phase": "iteration_complete", "iteration": iteration})

        formatted_findings = self._save_findings(findings, current_knowledge, query)

        self._update_progress("Research complete", 95, {"phase": "complete"})

        return {
            "findings": findings,
            "iterations": iteration,
            "questions": self.questions_by_iteration,
            "formatted_findings": formatted_findings,
            "current_knowledge": current_knowledge
        }

    def _save_findings(self, findings: List[Dict], current_knowledge: str, query: str):
        self._update_progress("Saving research findings...", None)

        formatted_findings = format_findings_to_text(
            findings, current_knowledge, self.questions_by_iteration
        )
        safe_query = "".join(x for x in query if x.isalnum() or x in [" ", "-", "_"])[
            :50
        ]
        safe_query = safe_query.replace(" ", "_").lower()

        output_dir = "research_outputs"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        filename = os.path.join(output_dir, f"formatted_output_{safe_query}.txt")

        with open(filename, "w", encoding="utf-8") as text_file:
            text_file.write(formatted_findings)

        self._update_progress("Research findings saved", None, {"filename": filename})
        return formatted_findings
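
The progress hook above is the natural integration point for the bundled web app. A minimal sketch of wiring it up; the handler body is invented for illustration:

    from local_deep_research.search_system import AdvancedSearchSystem

    def on_progress(message: str, progress_percent: int, metadata: dict) -> None:
        # _update_progress passes None for steps without a meaningful percentage
        pct = "--" if progress_percent is None else f"{progress_percent}%"
        print(f"[{pct}] {message} (phase: {metadata.get('phase', '?')})")

    system = AdvancedSearchSystem()
    system.set_progress_callback(on_progress)
    system.analyze_topic("example query")  # hypothetical query
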
local_deep_research/utilties/llm_utils.py
@@ -0,0 +1,116 @@
# utilties/llm_utils.py
"""
LLM utilities for Local Deep Research.

This module provides utility functions for working with language models
when the user's llm_config.py is missing or incomplete.
"""

import os
import logging
from typing import Dict, Any, Optional

# Setup logging
logger = logging.getLogger(__name__)

def get_model(
    model_name: Optional[str] = None,
    model_type: Optional[str] = None,
    temperature: Optional[float] = None,
    **kwargs
) -> Any:
    """
    Get a language model instance as a fallback when llm_config.get_llm is not available.

    Args:
        model_name: Name of the model to use
        model_type: Type of the model provider
        temperature: Model temperature
        **kwargs: Additional parameters

    Returns:
        LangChain language model instance
    """
    # Get default values from kwargs or use reasonable defaults
    model_name = model_name or kwargs.get('DEFAULT_MODEL', 'mistral')
    model_type = model_type or kwargs.get('DEFAULT_MODEL_TYPE', 'ollama')
    # Compare against None so an explicit temperature of 0.0 is preserved
    temperature = temperature if temperature is not None else kwargs.get('DEFAULT_TEMPERATURE', 0.7)
    max_tokens = kwargs.get('max_tokens', kwargs.get('MAX_TOKENS', 30000))

    # Common parameters
    common_params = {
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    # Add additional kwargs
    for key, value in kwargs.items():
        if key not in ['DEFAULT_MODEL', 'DEFAULT_MODEL_TYPE', 'DEFAULT_TEMPERATURE', 'MAX_TOKENS']:
            common_params[key] = value

    # Try to load the model based on type
    if model_type == "ollama":
        try:
            from langchain_ollama import ChatOllama
            return ChatOllama(model=model_name, **common_params)
        except ImportError:
            try:
                from langchain_community.llms import Ollama
                return Ollama(model=model_name, **common_params)
            except ImportError:
                logger.error("Neither langchain_ollama nor langchain_community.llms.Ollama available")
                raise

    elif model_type == "openai":
        try:
            from langchain_openai import ChatOpenAI
            api_key = os.getenv("OPENAI_API_KEY")
            if not api_key:
                raise ValueError("OPENAI_API_KEY environment variable not set")
            return ChatOpenAI(model=model_name, api_key=api_key, **common_params)
        except ImportError:
            logger.error("langchain_openai not available")
            raise

    elif model_type == "anthropic":
        try:
            from langchain_anthropic import ChatAnthropic
            api_key = os.getenv("ANTHROPIC_API_KEY")
            if not api_key:
                raise ValueError("ANTHROPIC_API_KEY environment variable not set")
            return ChatAnthropic(model=model_name, anthropic_api_key=api_key, **common_params)
        except ImportError:
            logger.error("langchain_anthropic not available")
            raise

    elif model_type == "openai_endpoint":
        try:
            from langchain_openai import ChatOpenAI
            api_key = os.getenv("OPENAI_ENDPOINT_API_KEY")
            if not api_key:
                raise ValueError("OPENAI_ENDPOINT_API_KEY environment variable not set")

            endpoint_url = kwargs.get("OPENAI_ENDPOINT_URL", "https://openrouter.ai/api/v1")

            if model_name is None and not kwargs.get("OPENAI_ENDPOINT_REQUIRES_MODEL", True):
                return ChatOpenAI(api_key=api_key, openai_api_base=endpoint_url, **common_params)
            else:
                return ChatOpenAI(model=model_name, api_key=api_key, openai_api_base=endpoint_url, **common_params)
        except ImportError:
            logger.error("langchain_openai not available")
            raise

    # Default fallback
    try:
        from langchain_ollama import ChatOllama
        logger.warning(f"Unknown model type '{model_type}', defaulting to Ollama")
        return ChatOllama(model=model_name, **common_params)
    except Exception as e:  # ImportError is an Exception, so a single clause suffices
        logger.error(f"Failed to load any model: {e}")

        # Last resort: create a dummy model
        try:
            from langchain_community.llms.fake import FakeListLLM
            return FakeListLLM(responses=["No language models are available. Please install Ollama or set up API keys."])
        except ImportError:
            raise ValueError("No language models available and could not create dummy model")
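
A minimal sketch of the fallback helper in use, assuming a local Ollama server with the `mistral` model pulled; the prompt is invented:

    from local_deep_research.utilties.llm_utils import get_model

    model = get_model(model_name="mistral", model_type="ollama", temperature=0.7)
    reply = model.invoke("Summarize retrieval-augmented generation in one sentence.")
    # Chat models return a message object; the legacy Ollama fallback returns a plain string.
    print(getattr(reply, "content", reply))

For the hosted providers the pattern is the same once the matching environment variable is set (OPENAI_API_KEY, ANTHROPIC_API_KEY, or OPENAI_ENDPOINT_API_KEY).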