local-deep-research 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +24 -0
- local_deep_research/citation_handler.py +113 -0
- local_deep_research/config.py +166 -0
- local_deep_research/defaults/__init__.py +44 -0
- local_deep_research/defaults/llm_config.py +269 -0
- local_deep_research/defaults/local_collections.toml +47 -0
- local_deep_research/defaults/main.toml +57 -0
- local_deep_research/defaults/search_engines.toml +244 -0
- local_deep_research/local_collections.py +141 -0
- local_deep_research/main.py +113 -0
- local_deep_research/report_generator.py +206 -0
- local_deep_research/search_system.py +241 -0
- local_deep_research/utilties/__init__.py +0 -0
- local_deep_research/utilties/enums.py +9 -0
- local_deep_research/utilties/llm_utils.py +116 -0
- local_deep_research/utilties/search_utilities.py +115 -0
- local_deep_research/utilties/setup_utils.py +6 -0
- local_deep_research/web/__init__.py +2 -0
- local_deep_research/web/app.py +1209 -0
- local_deep_research/web/static/css/styles.css +1008 -0
- local_deep_research/web/static/js/app.js +2078 -0
- local_deep_research/web/templates/api_keys_config.html +82 -0
- local_deep_research/web/templates/collections_config.html +90 -0
- local_deep_research/web/templates/index.html +312 -0
- local_deep_research/web/templates/llm_config.html +120 -0
- local_deep_research/web/templates/main_config.html +89 -0
- local_deep_research/web/templates/search_engines_config.html +154 -0
- local_deep_research/web/templates/settings.html +519 -0
- local_deep_research/web/templates/settings_dashboard.html +207 -0
- local_deep_research/web_search_engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/full_search.py +128 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
- local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
- local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
- local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
- local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
- local_deep_research/web_search_engines/full_search.py +254 -0
- local_deep_research/web_search_engines/search_engine_base.py +197 -0
- local_deep_research/web_search_engines/search_engine_factory.py +233 -0
- local_deep_research/web_search_engines/search_engines_config.py +54 -0
- local_deep_research-0.1.0.dist-info/LICENSE +21 -0
- local_deep_research-0.1.0.dist-info/METADATA +328 -0
- local_deep_research-0.1.0.dist-info/RECORD +56 -0
- local_deep_research-0.1.0.dist-info/WHEEL +5 -0
- local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
- local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
|
2
|
+
# Main configuration for Local Deep Research
|
3
|
+
|
4
|
+
[general]
|
5
|
+
# Directory for research outputs (relative to user data directory)
|
6
|
+
output_dir = "research_outputs"
|
7
|
+
|
8
|
+
# Knowledge accumulation approach (NONE, QUESTION, or ITERATION)
|
9
|
+
knowledge_accumulation = "ITERATION"
|
10
|
+
|
11
|
+
# Maximum context size for knowledge accumulation
|
12
|
+
knowledge_accumulation_context_limit = 2000000
|
13
|
+
|
14
|
+
# Enable fact checking (experimental, works better with large LLMs)
|
15
|
+
enable_fact_checking = false
|
16
|
+
|
17
|
+
[search]
|
18
|
+
# Search tool to use (auto, wikipedia, arxiv, duckduckgo, serpapi, google_pse, etc.)
|
19
|
+
# "auto" intelligently selects based on query content (recommended)
|
20
|
+
# "local_all" searches only local document collections
|
21
|
+
tool = "auto"
|
22
|
+
|
23
|
+
# Number of research cycles
|
24
|
+
iterations = 3
|
25
|
+
|
26
|
+
# Questions generated per cycle
|
27
|
+
questions_per_iteration = 3
|
28
|
+
|
29
|
+
# Searches per report section
|
30
|
+
searches_per_section = 3
|
31
|
+
|
32
|
+
# Results per search query
|
33
|
+
max_results = 50
|
34
|
+
|
35
|
+
# Results after relevance filtering
|
36
|
+
max_filtered_results = 5
|
37
|
+
|
38
|
+
# Search region
|
39
|
+
region = "us"
|
40
|
+
|
41
|
+
# Time period (d=day, w=week, m=month, y=year)
|
42
|
+
time_period = "y"
|
43
|
+
|
44
|
+
# Enable safe search
|
45
|
+
safe_search = true
|
46
|
+
|
47
|
+
# Search language
|
48
|
+
search_language = "English"
|
49
|
+
|
50
|
+
# Return only snippets, not full content (faster but less detailed)
|
51
|
+
snippets_only = false
|
52
|
+
|
53
|
+
# Skip relevance filtering (return all results)
|
54
|
+
skip_relevance_filter = false
|
55
|
+
|
56
|
+
# Check URL quality
|
57
|
+
quality_check_urls = true
|
@@ -0,0 +1,244 @@
|
|
1
|
+
# Search Engines Configuration for Local Deep Research
|
2
|
+
# This file defines all available search engines and their properties
|
3
|
+
|
4
|
+
[wikipedia]
|
5
|
+
module_path = "local_deep_research.web_search_engines.engines.search_engine_wikipedia"
|
6
|
+
class_name = "WikipediaSearchEngine"
|
7
|
+
requires_api_key = false
|
8
|
+
reliability = 0.95
|
9
|
+
strengths = [
|
10
|
+
"factual information", "general knowledge", "definitions",
|
11
|
+
"historical facts", "biographies", "overview information"
|
12
|
+
]
|
13
|
+
weaknesses = ["recent events", "specialized academic topics", "product comparisons"]
|
14
|
+
|
15
|
+
[wikipedia.default_params]
|
16
|
+
max_results = 20
|
17
|
+
include_content = true
|
18
|
+
|
19
|
+
[arxiv]
|
20
|
+
module_path = "local_deep_research.web_search_engines.engines.search_engine_arxiv"
|
21
|
+
class_name = "ArXivSearchEngine"
|
22
|
+
requires_api_key = false
|
23
|
+
reliability = 0.9
|
24
|
+
strengths = [
|
25
|
+
"scientific papers", "academic research", "physics", "computer science",
|
26
|
+
"mathematics", "statistics", "machine learning", "preprints"
|
27
|
+
]
|
28
|
+
weaknesses = ["non-academic topics", "consumer products", "news", "general information"]
|
29
|
+
|
30
|
+
[arxiv.default_params]
|
31
|
+
max_results = 20
|
32
|
+
sort_by = "relevance"
|
33
|
+
sort_order = "descending"
|
34
|
+
|
35
|
+
[pubmed]
|
36
|
+
module_path = "local_deep_research.web_search_engines.engines.search_engine_pubmed"
|
37
|
+
class_name = "PubMedSearchEngine"
|
38
|
+
requires_api_key = false
|
39
|
+
api_key_env = "NCBI_API_KEY"
|
40
|
+
reliability = 0.95
|
41
|
+
strengths = [
|
42
|
+
"biomedical literature", "medical research", "clinical studies",
|
43
|
+
"life sciences", "health information", "scientific papers"
|
44
|
+
]
|
45
|
+
weaknesses = [
|
46
|
+
"non-medical topics", "very recent papers may be missing",
|
47
|
+
"limited to published research"
|
48
|
+
]
|
49
|
+
requires_llm = true
|
50
|
+
|
51
|
+
[pubmed.default_params]
|
52
|
+
max_results = 20
|
53
|
+
get_abstracts = true
|
54
|
+
get_full_text = false
|
55
|
+
full_text_limit = 3
|
56
|
+
days_limit = 0
|
57
|
+
optimize_queries = true
|
58
|
+
|
59
|
+
[github]
|
60
|
+
module_path = "local_deep_research.web_search_engines.engines.search_engine_github"
|
61
|
+
class_name = "GitHubSearchEngine"
|
62
|
+
requires_api_key = false
|
63
|
+
reliability = 0.99
|
64
|
+
strengths = [
|
65
|
+
"code repositories", "software documentation", "open source projects",
|
66
|
+
"programming issues", "developer information", "technical documentation"
|
67
|
+
]
|
68
|
+
weaknesses = ["non-technical content", "content outside GitHub", "rate limits without API key"]
|
69
|
+
supports_full_search = true
|
70
|
+
|
71
|
+
[github.default_params]
|
72
|
+
max_results = 15
|
73
|
+
search_type = "repositories"
|
74
|
+
include_readme = true
|
75
|
+
include_issues = false
|
76
|
+
|
77
|
+
[serpapi]
|
78
|
+
module_path = "local_deep_research.web_search_engines.engines.search_engine_serpapi"
|
79
|
+
class_name = "SerpAPISearchEngine"
|
80
|
+
requires_api_key = true
|
81
|
+
api_key_env = "SERP_API_KEY"
|
82
|
+
reliability = 0.6
|
83
|
+
strengths = [
|
84
|
+
"comprehensive web search", "product information", "reviews",
|
85
|
+
"recent content", "news", "broad coverage"
|
86
|
+
]
|
87
|
+
weaknesses = ["requires API key with usage limits", "not specialized for academic content"]
|
88
|
+
supports_full_search = true
|
89
|
+
full_search_module = "local_deep_research.web_search_engines.engines.full_serp_search_results_old"
|
90
|
+
full_search_class = "FullSerpAPISearchResults"
|
91
|
+
|
92
|
+
[serpapi.default_params]
|
93
|
+
region = "us"
|
94
|
+
time_period = "y"
|
95
|
+
safe_search = true
|
96
|
+
search_language = "English"
|
97
|
+
|
98
|
+
[google_pse]
|
99
|
+
module_path = "local_deep_research.web_search_engines.engines.search_engine_google_pse"
|
100
|
+
class_name = "GooglePSESearchEngine"
|
101
|
+
requires_api_key = true
|
102
|
+
api_key_env = "GOOGLE_PSE_API_KEY"
|
103
|
+
reliability = 0.9
|
104
|
+
strengths = [
|
105
|
+
"custom search scope", "high-quality results", "domain-specific search",
|
106
|
+
"configurable search experience", "control over search index"
|
107
|
+
]
|
108
|
+
weaknesses = [
|
109
|
+
"requires API key with usage limits",
|
110
|
+
"limited to 10,000 queries/day on free tier",
|
111
|
+
"requires search engine configuration in Google Control Panel"
|
112
|
+
]
|
113
|
+
supports_full_search = true
|
114
|
+
full_search_module = "local_deep_research.web_search_engines.engines.full_search"
|
115
|
+
full_search_class = "FullSearchResults"
|
116
|
+
|
117
|
+
[google_pse.default_params]
|
118
|
+
region = "us"
|
119
|
+
safe_search = true
|
120
|
+
search_language = "English"
|
121
|
+
|
122
|
+
[brave]
|
123
|
+
module_path = "local_deep_research.web_search_engines.engines.search_engine_brave"
|
124
|
+
class_name = "BraveSearchEngine"
|
125
|
+
requires_api_key = true
|
126
|
+
api_key_env = "BRAVE_API_KEY"
|
127
|
+
reliability = 0.7
|
128
|
+
strengths = [
|
129
|
+
"privacy-focused web search", "product information", "reviews",
|
130
|
+
"recent content", "news", "broad coverage"
|
131
|
+
]
|
132
|
+
weaknesses = ["requires API key with usage limits", "smaller index than Google"]
|
133
|
+
supports_full_search = true
|
134
|
+
full_search_module = "local_deep_research.web_search_engines.engines.full_search"
|
135
|
+
full_search_class = "FullSearchResults"
|
136
|
+
|
137
|
+
[brave.default_params]
|
138
|
+
region = "US"
|
139
|
+
time_period = "y"
|
140
|
+
safe_search = true
|
141
|
+
search_language = "English"
|
142
|
+
|
143
|
+
[wayback]
|
144
|
+
module_path = "local_deep_research.web_search_engines.engines.search_engine_wayback"
|
145
|
+
class_name = "WaybackSearchEngine"
|
146
|
+
requires_api_key = false
|
147
|
+
reliability = 0.5
|
148
|
+
strengths = [
|
149
|
+
"historical web content", "archived websites", "content verification",
|
150
|
+
"deleted or changed web pages", "website evolution tracking"
|
151
|
+
]
|
152
|
+
weaknesses = [
|
153
|
+
"limited to previously archived content", "may miss recent changes",
|
154
|
+
"archiving quality varies"
|
155
|
+
]
|
156
|
+
supports_full_search = true
|
157
|
+
|
158
|
+
[wayback.default_params]
|
159
|
+
max_results = 15
|
160
|
+
max_snapshots_per_url = 3
|
161
|
+
closest_only = false
|
162
|
+
language = "English"
|
163
|
+
|
164
|
+
[auto]
|
165
|
+
module_path = "local_deep_research.web_search_engines.engines.meta_search_engine"
|
166
|
+
class_name = "MetaSearchEngine"
|
167
|
+
requires_api_key = false
|
168
|
+
reliability = 0.85
|
169
|
+
strengths = [
|
170
|
+
"intelligent engine selection", "adaptable to query type",
|
171
|
+
"fallback capabilities"
|
172
|
+
]
|
173
|
+
weaknesses = ["slightly slower due to LLM analysis"]
|
174
|
+
requires_llm = true
|
175
|
+
|
176
|
+
[auto.default_params]
|
177
|
+
use_api_key_services = true
|
178
|
+
max_engines_to_try = 3
|
179
|
+
|
180
|
+
[local_all]
|
181
|
+
module_path = "local_deep_research.web_search_engines.engines.search_engine_local_all"
|
182
|
+
class_name = "LocalAllSearchEngine"
|
183
|
+
requires_api_key = false
|
184
|
+
reliability = 0.85
|
185
|
+
strengths = ["searches all local collections", "personal documents", "offline access"]
|
186
|
+
weaknesses = ["may return too many results", "requires indexing"]
|
187
|
+
requires_llm = true
|
188
|
+
|
189
|
+
# Default search engine to use if none specified
|
190
|
+
DEFAULT_SEARCH_ENGINE = "wikipedia"
|
191
|
+
|
192
|
+
# Additional search engines can be added below
|
193
|
+
# Uncomment and modify these templates as needed
|
194
|
+
|
195
|
+
# [duckduckgo]
|
196
|
+
# module_path = "local_deep_research.web_search_engines.engines.search_engine_ddg"
|
197
|
+
# class_name = "DuckDuckGoSearchEngine"
|
198
|
+
# requires_api_key = false
|
199
|
+
# reliability = 0.4
|
200
|
+
# strengths = [
|
201
|
+
# "web search", "product information", "reviews", "recent information",
|
202
|
+
# "news", "general queries", "broad coverage"
|
203
|
+
# ]
|
204
|
+
# weaknesses = ["inconsistent due to rate limits", "not specialized for academic content"]
|
205
|
+
# supports_full_search = true
|
206
|
+
# full_search_module = "local_deep_research.web_search_engines.engines.full_search"
|
207
|
+
# full_search_class = "FullSearchResults"
|
208
|
+
#
|
209
|
+
# [duckduckgo.default_params]
|
210
|
+
# region = "us"
|
211
|
+
# safe_search = true
|
212
|
+
|
213
|
+
# [guardian]
|
214
|
+
# module_path = "local_deep_research.web_search_engines.engines.search_engine_guardian"
|
215
|
+
# class_name = "GuardianSearchEngine"
|
216
|
+
# requires_api_key = true
|
217
|
+
# api_key_env = "GUARDIAN_API_KEY"
|
218
|
+
# reliability = 0.5
|
219
|
+
# strengths = [
|
220
|
+
# "news articles", "current events", "opinion pieces", "journalism",
|
221
|
+
# "UK and global news", "political analysis"
|
222
|
+
# ]
|
223
|
+
# weaknesses = ["primarily focused on news", "limited historical content pre-1999"]
|
224
|
+
#
|
225
|
+
# [guardian.default_params]
|
226
|
+
# order_by = "relevance"
|
227
|
+
|
228
|
+
# [medrxiv]
|
229
|
+
# module_path = "local_deep_research.web_search_engines.engines.search_engine_medrxiv"
|
230
|
+
# class_name = "MedRxivSearchEngine"
|
231
|
+
# requires_api_key = false
|
232
|
+
# reliability = 0.85
|
233
|
+
# strengths = [
|
234
|
+
# "medical preprints", "health research", "covid-19 research",
|
235
|
+
# "clinical studies", "medical sciences", "preliminary results"
|
236
|
+
# ]
|
237
|
+
# weaknesses = ["not peer-reviewed", "preliminary findings", "limited to medical research"]
|
238
|
+
# requires_llm = true
|
239
|
+
#
|
240
|
+
# [medrxiv.default_params]
|
241
|
+
# sort_by = "relevance_score"
|
242
|
+
# sort_order = "desc"
|
243
|
+
# include_full_text = false
|
244
|
+
# optimize_queries = true
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# local_collections.py
|
2
|
+
"""
|
3
|
+
Configuration file for local document collections.
|
4
|
+
Each collection functions as an independent search engine.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import os
|
8
|
+
from typing import Dict, Any
|
9
|
+
|
10
|
+
# Registry of local document collections
# Each collection appears as a separate search engine in the main configuration.
# Per-collection keys:
#   name / description                 - human-readable labels for the collection
#   paths                              - folders to index (made absolute at import time)
#   enabled                            - set False to skip registration without deleting the entry
#   embedding_model / _device / _type  - embedding backend used to index the documents
#   max_results / max_filtered_results - result counts before/after relevance filtering
#   chunk_size / chunk_overlap         - text-splitting parameters for indexing
#   cache_dir                          - where the built index is cached
LOCAL_COLLECTIONS = {
    # Project Documents Collection
    "project_docs": {
        "name": "Project Documents",
        "description": "Project documentation and specifications",
        "paths": [os.path.abspath("./local_search_files/project_documents")],
        "enabled": True,
        "embedding_model": "all-MiniLM-L6-v2",
        "embedding_device": "cpu",
        "embedding_model_type": "sentence_transformers",
        "max_results": 20,
        "max_filtered_results": 5,
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "cache_dir": ".cache/local_search/project_docs"
    },

    # Research Papers Collection
    "research_papers": {
        "name": "Research Papers",
        "description": "Academic research papers and articles",
        # NOTE(review): path lacks the "./" prefix the other collections use;
        # os.path.abspath resolves both forms identically, so behavior matches.
        "paths": [os.path.abspath("local_search_files/research_papers")],
        "enabled": True,
        "embedding_model": "all-MiniLM-L6-v2",
        "embedding_device": "cpu",
        "embedding_model_type": "sentence_transformers",
        "max_results": 20,
        "max_filtered_results": 5,
        "chunk_size": 800,  # Smaller chunks for academic content
        "chunk_overlap": 150,
        "cache_dir": ".cache/local_search/research_papers"
    },

    # Personal Notes Collection
    "personal_notes": {
        "name": "Personal Notes",
        "description": "Personal notes and documents",
        "paths": [os.path.abspath("./local_search_files/personal_notes")],
        "enabled": True,
        "embedding_model": "all-MiniLM-L6-v2",
        "embedding_device": "cpu",
        "embedding_model_type": "sentence_transformers",
        "max_results": 30,
        "max_filtered_results": 10,
        "chunk_size": 500,  # Smaller chunks for notes
        "chunk_overlap": 100,
        "cache_dir": ".cache/local_search/personal_notes"
    }
}

# Configuration for local search integration; per-collection settings above
# fall back to these defaults when a key is omitted.
LOCAL_SEARCH_CONFIG = {
    # General embedding options
    "DEFAULT_EMBEDDING_MODEL": "all-MiniLM-L6-v2",
    "DEFAULT_EMBEDDING_DEVICE": "cpu",  # "cpu" or "cuda" for GPU acceleration
    "DEFAULT_EMBEDDING_MODEL_TYPE": "sentence_transformers",  # or "ollama"

    # Ollama settings (only used if model type is "ollama")
    # Note: You must run 'ollama pull nomic-embed-text' first if using Ollama for embeddings
    "OLLAMA_BASE_URL": "http://localhost:11434",
    "OLLAMA_EMBEDDING_MODEL": "nomic-embed-text",

    # Default indexing options
    "FORCE_REINDEX": True,  # Force reindexing on startup
    "CACHE_DIR": ".cache/local_search",  # Base directory for cache
}
|
78
|
+
|
79
|
+
def register_local_collections(search_engines_dict: Dict[str, Any]) -> None:
    """
    Register all enabled local collections as search engines.

    Each enabled entry in LOCAL_COLLECTIONS becomes a search-engine
    definition backed by LocalSearchEngine. Existing entries in
    search_engines_dict are never overwritten, so user/project
    configuration always wins over the defaults defined here.

    Args:
        search_engines_dict: The main search engines dictionary to update
            (mutated in place).
    """
    for collection_id, collection in LOCAL_COLLECTIONS.items():
        if not collection.get("enabled", True):
            continue

        # Skip if already defined (don't override)
        if collection_id in search_engines_dict:
            continue

        # Validate paths exist; warn about each missing folder so the user
        # can fix the configuration, but still register the engine.
        paths = collection.get("paths", [])
        valid_paths = []
        for path in paths:
            # os.path.isdir already implies existence, so a separate
            # os.path.exists check is redundant.
            if os.path.isdir(path):
                valid_paths.append(path)
            else:
                print(f"Warning: Collection '{collection_id}' contains non-existent folder: {path}")

        # Log warning if no valid paths
        if not valid_paths and paths:
            print(f"Warning: Collection '{collection_id}' has no valid folders. It will be registered but won't return results.")

        # Create a search engine entry for this collection
        search_engines_dict[collection_id] = {
            "module_path": "local_deep_research.web_search_engines.engines.search_engine_local",
            "class_name": "LocalSearchEngine",
            "requires_api_key": False,
            "reliability": 0.9,  # High reliability for local documents
            "strengths": [
                "personal documents",
                "offline access",
                collection.get("description", "local documents"),
            ],
            "weaknesses": ["requires indexing", "limited to specific folders"],
            "default_params": {
                "folder_paths": paths,
                "embedding_model": collection.get(
                    "embedding_model",
                    LOCAL_SEARCH_CONFIG["DEFAULT_EMBEDDING_MODEL"],
                ),
                "embedding_device": collection.get(
                    "embedding_device",
                    LOCAL_SEARCH_CONFIG["DEFAULT_EMBEDDING_DEVICE"],
                ),
                "embedding_model_type": collection.get(
                    "embedding_model_type",
                    LOCAL_SEARCH_CONFIG["DEFAULT_EMBEDDING_MODEL_TYPE"],
                ),
                "chunk_size": collection.get("chunk_size", 1000),
                "chunk_overlap": collection.get("chunk_overlap", 200),
                "cache_dir": collection.get(
                    "cache_dir",
                    f"{LOCAL_SEARCH_CONFIG['CACHE_DIR']}/{collection_id}",
                ),
                "max_results": collection.get("max_results", 20),
                "max_filtered_results": collection.get("max_filtered_results", 5),
                "collection_name": collection.get("name", collection_id),
                "collection_description": collection.get("description", ""),
            },
            "requires_llm": True,
        }
|
@@ -0,0 +1,113 @@
|
|
1
|
+
from .search_system import AdvancedSearchSystem
|
2
|
+
from typing import Dict
|
3
|
+
from .config import settings
|
4
|
+
|
5
|
+
def print_report(report: Dict):
    """Print the report to the console and save it to ``report.md``.

    Args:
        report: Report dict with a "content" key (markdown body) and a
            "metadata" dict containing at least a "query" key.
    """
    # Print to console in readable format
    print("\n=== GENERATED REPORT ===\n")

    # Print content
    print(report["content"])

    # Save to file in markdown format
    with open("report.md", "w", encoding="utf-8") as markdown_file:
        # Write content
        markdown_file.write(report["content"])

        # Write metadata at the end of the file so report.md is
        # self-describing when shared on its own.
        markdown_file.write("\n\n---\n\n")
        markdown_file.write("## Report Metadata\n")
        markdown_file.write(f"- Query: {report['metadata']['query']}\n")

    # No placeholders here, so a plain string (not an f-string) is correct.
    print("\nReport has been saved to report.md")
|
28
|
+
|
29
|
+
|
30
|
+
from .report_generator import IntegratedReportGenerator
|
31
|
+
|
32
|
+
report_generator = IntegratedReportGenerator()
|
33
|
+
|
34
|
+
|
35
|
+
|
36
|
+
def _print_output_menu() -> None:
    """Print the two output-type options shown to the user."""
    print("1) Quick Summary (Generated in a few minutes)")
    print(
        "2) Detailed Research Report (Recommended for deeper analysis - may take several hours)"
    )


def main():
    """Run the interactive research loop.

    Repeatedly prompts for an output type and a research query, runs the
    search system, and prints either a quick summary or a full generated
    report. Type 'quit' at the query prompt to exit.
    """
    import logging
    from .utilties.setup_utils import setup_user_directories

    # Configure logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info(f"Starting with settings: iterations={settings.search.iterations}, "
                f"questions_per_iteration={settings.search.questions_per_iteration}")

    # Explicitly run setup
    logger.info("Initializing configuration...")
    setup_user_directories()

    system = AdvancedSearchSystem()

    print("Welcome to the Advanced Research System")
    print("Type 'quit' to exit")

    while True:
        print("\nSelect output type:")
        _print_output_menu()
        choice = input("Enter number (1 or 2): ").strip()

        while choice not in ["1", "2"]:
            print("\nInvalid input. Please enter 1 or 2:")
            _print_output_menu()
            choice = input("Enter number (1 or 2): ").strip()

        query = input("\nEnter your research query: ").strip()

        if query.lower() == "quit":
            break

        # System will automatically use updated configuration
        # through the automatic reloading in get_llm() and get_search()

        if choice == "1":
            print("\nResearching... This may take a few minutes.\n")
        else:
            print(
                "\nGenerating detailed report... This may take several hours. Please be patient as this enables deeper analysis.\n"
            )

        results = system.analyze_topic(query)
        if results:
            if choice == "1":
                # Quick Summary
                print("\n=== QUICK SUMMARY ===")
                if results["findings"] and len(results["findings"]) > 0:
                    initial_analysis = [
                        finding["content"] for finding in results["findings"]
                    ]
                    # Join findings so the user sees readable text instead
                    # of a Python list repr.
                    print("\n".join(initial_analysis))

            else:
                # Full Report
                final_report = report_generator.generate_report(
                    results, query
                )
                print("\n=== RESEARCH REPORT ===")
                print_report(final_report)

            print("\n=== RESEARCH METRICS ===")
            print(f"Search Iterations: {results['iterations']}")

        else:
            print("Research failed. Please try again.")

if __name__ == "__main__":
    main()
|