local-deep-research 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. local_deep_research/__init__.py +24 -0
  2. local_deep_research/citation_handler.py +113 -0
  3. local_deep_research/config.py +166 -0
  4. local_deep_research/defaults/__init__.py +44 -0
  5. local_deep_research/defaults/llm_config.py +269 -0
  6. local_deep_research/defaults/local_collections.toml +47 -0
  7. local_deep_research/defaults/main.toml +57 -0
  8. local_deep_research/defaults/search_engines.toml +244 -0
  9. local_deep_research/local_collections.py +141 -0
  10. local_deep_research/main.py +113 -0
  11. local_deep_research/report_generator.py +206 -0
  12. local_deep_research/search_system.py +241 -0
  13. local_deep_research/utilties/__init__.py +0 -0
  14. local_deep_research/utilties/enums.py +9 -0
  15. local_deep_research/utilties/llm_utils.py +116 -0
  16. local_deep_research/utilties/search_utilities.py +115 -0
  17. local_deep_research/utilties/setup_utils.py +6 -0
  18. local_deep_research/web/__init__.py +2 -0
  19. local_deep_research/web/app.py +1209 -0
  20. local_deep_research/web/static/css/styles.css +1008 -0
  21. local_deep_research/web/static/js/app.js +2078 -0
  22. local_deep_research/web/templates/api_keys_config.html +82 -0
  23. local_deep_research/web/templates/collections_config.html +90 -0
  24. local_deep_research/web/templates/index.html +312 -0
  25. local_deep_research/web/templates/llm_config.html +120 -0
  26. local_deep_research/web/templates/main_config.html +89 -0
  27. local_deep_research/web/templates/search_engines_config.html +154 -0
  28. local_deep_research/web/templates/settings.html +519 -0
  29. local_deep_research/web/templates/settings_dashboard.html +207 -0
  30. local_deep_research/web_search_engines/__init__.py +0 -0
  31. local_deep_research/web_search_engines/engines/__init__.py +0 -0
  32. local_deep_research/web_search_engines/engines/full_search.py +128 -0
  33. local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
  34. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
  35. local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
  36. local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
  37. local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
  38. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
  39. local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
  40. local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
  41. local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
  42. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
  43. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
  44. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
  45. local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
  46. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
  47. local_deep_research/web_search_engines/full_search.py +254 -0
  48. local_deep_research/web_search_engines/search_engine_base.py +197 -0
  49. local_deep_research/web_search_engines/search_engine_factory.py +233 -0
  50. local_deep_research/web_search_engines/search_engines_config.py +54 -0
  51. local_deep_research-0.1.0.dist-info/LICENSE +21 -0
  52. local_deep_research-0.1.0.dist-info/METADATA +328 -0
  53. local_deep_research-0.1.0.dist-info/RECORD +56 -0
  54. local_deep_research-0.1.0.dist-info/WHEEL +5 -0
  55. local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
  56. local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
local_deep_research/web_search_engines/engines/search_engine_wayback.py
@@ -0,0 +1,474 @@
+ import requests
+ import logging
+ from typing import Dict, List, Any, Optional, Tuple
+ from langchain_core.language_models import BaseLLM
+ import re
+ import time
+ from datetime import datetime
+
+ from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
+ from local_deep_research import config
+ import justext
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class WaybackSearchEngine(BaseSearchEngine):
+     """
+     Internet Archive Wayback Machine search engine implementation
+     Provides access to historical versions of web pages
+     """
+
+     def __init__(self,
+                  max_results: int = 10,
+                  max_snapshots_per_url: int = 3,
+                  llm: Optional[BaseLLM] = None,
+                  language: str = "English",
+                  max_filtered_results: Optional[int] = None,
+                  closest_only: bool = False):
+         """
+         Initialize the Wayback Machine search engine.
+
+         Args:
+             max_results: Maximum number of search results
+             max_snapshots_per_url: Maximum snapshots to retrieve per URL
+             llm: Language model for relevance filtering
+             language: Language for content processing
+             max_filtered_results: Maximum number of results to keep after filtering
+             closest_only: If True, only retrieves the closest snapshot for each URL
+         """
+         # Initialize the BaseSearchEngine with the LLM and max_filtered_results
+         super().__init__(llm=llm, max_filtered_results=max_filtered_results)
+
+         self.max_results = max_results
+         self.max_snapshots_per_url = max_snapshots_per_url
+         self.language = language
+         self.closest_only = closest_only
+
+         # API endpoints
+         self.available_api = "https://archive.org/wayback/available"
+         self.cdx_api = "https://web.archive.org/cdx/search/cdx"
+
+     def _extract_urls_from_query(self, query: str) -> List[str]:
+         """
+         Extract URLs from a query string or interpret as an URL if possible.
+         For non-URL queries, use a DuckDuckGo search to find relevant URLs.
+
+         Args:
+             query: The search query or URL
+
+         Returns:
+             List of URLs to search in the Wayback Machine
+         """
+         # Check if the query is already a URL
+         url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
+         urls = url_pattern.findall(query)
+
+         if urls:
+             logger.info(f"Found {len(urls)} URLs in query")
+             return urls
+
+         # Check if query is a domain without http prefix
+         domain_pattern = re.compile(r'^(?:[-\w.]|(?:%[\da-fA-F]{2}))+\.\w+$')
+         if domain_pattern.match(query):
+             logger.info(f"Query appears to be a domain: {query}")
+             return [f"http://{query}"]
+
+         # For non-URL queries, use DuckDuckGo to find relevant URLs
+         logger.info(f"Query is not a URL, using DuckDuckGo to find relevant URLs")
+         try:
+             # Import DuckDuckGo search engine
+             from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
+             ddg = DuckDuckGoSearchAPIWrapper(max_results=5)
+             results = ddg.results(query)
+
+             # Extract URLs from results
+             ddg_urls = [result.get("link") for result in results if result.get("link")]
+             if ddg_urls:
+                 logger.info(f"Found {len(ddg_urls)} URLs from DuckDuckGo search")
+                 return ddg_urls
+         except Exception as e:
+             logger.error(f"Error using DuckDuckGo for URL discovery: {e}")
+
+         # Fallback: treat the query as a potential domain or path
+         if "/" in query and "." in query:
+             logger.info(f"Treating query as a partial URL: {query}")
+             return [f"http://{query}"]
+         elif "." in query:
+             logger.info(f"Treating query as a domain: {query}")
+             return [f"http://{query}"]
+
+         # Return empty list if nothing worked
+         logger.warning(f"Could not extract any URLs from query: {query}")
+         return []
+
+     def _format_timestamp(self, timestamp: str) -> str:
+         """Format Wayback Machine timestamp into readable date"""
+         if len(timestamp) < 14:
+             return timestamp
+
+         try:
+             year = timestamp[0:4]
+             month = timestamp[4:6]
+             day = timestamp[6:8]
+             hour = timestamp[8:10]
+             minute = timestamp[10:12]
+             second = timestamp[12:14]
+             return f"{year}-{month}-{day} {hour}:{minute}:{second}"
+         except:
+             return timestamp
+
+     def _get_wayback_snapshots(self, url: str) -> List[Dict[str, Any]]:
+         """
+         Get snapshots from the Wayback Machine for a specific URL.
+
+         Args:
+             url: URL to get snapshots for
+
+         Returns:
+             List of snapshot dictionaries
+         """
+         snapshots = []
+
+         try:
+             if self.closest_only:
+                 # Get only the closest snapshot
+                 response = requests.get(
+                     self.available_api,
+                     params={"url": url}
+                 )
+                 data = response.json()
+
+                 if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
+                     snapshot = data["archived_snapshots"]["closest"]
+                     snapshot_url = snapshot["url"]
+                     timestamp = snapshot["timestamp"]
+
+                     snapshots.append({
+                         "timestamp": timestamp,
+                         "formatted_date": self._format_timestamp(timestamp),
+                         "url": snapshot_url,
+                         "original_url": url,
+                         "available": snapshot.get("available", True),
+                         "status": snapshot.get("status", "200")
+                     })
+             else:
+                 # Get multiple snapshots using CDX API
+                 response = requests.get(
+                     self.cdx_api,
+                     params={
+                         "url": url,
+                         "output": "json",
+                         "fl": "timestamp,original,statuscode,mimetype",
+                         "collapse": "timestamp:4", # Group by year
+                         "limit": self.max_snapshots_per_url
+                     }
+                 )
+
+                 # Check if response is valid JSON
+                 data = response.json()
+
+                 # First item is the header
+                 if len(data) > 1:
+                     headers = data[0]
+                     for item in data[1:]:
+                         snapshot = dict(zip(headers, item))
+                         timestamp = snapshot.get("timestamp", "")
+
+                         wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
+
+                         snapshots.append({
+                             "timestamp": timestamp,
+                             "formatted_date": self._format_timestamp(timestamp),
+                             "url": wayback_url,
+                             "original_url": url,
+                             "available": True,
+                             "status": snapshot.get("statuscode", "200")
+                         })
+
+                     # Limit to max snapshots per URL
+                     snapshots = snapshots[:self.max_snapshots_per_url]
+
+         except Exception as e:
+             logger.error(f"Error getting Wayback snapshots for {url}: {e}")
+
+         return snapshots
+
+     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Get preview information for Wayback Machine snapshots.
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of preview dictionaries
+         """
+         logger.info(f"Getting Wayback Machine previews for query: {query}")
+
+         # Extract URLs from query
+         urls = self._extract_urls_from_query(query)
+
+         if not urls:
+             logger.warning(f"No URLs found in query: {query}")
+             return []
+
+         # Get snapshots for each URL
+         all_snapshots = []
+         for url in urls:
+             snapshots = self._get_wayback_snapshots(url)
+             all_snapshots.extend(snapshots)
+
+             # Respect rate limits
+             if len(urls) > 1:
+                 time.sleep(0.5)
+
+         # Format as previews
+         previews = []
+         for snapshot in all_snapshots:
+             preview = {
+                 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}",
+                 "title": f"Archive of {snapshot['original_url']} ({snapshot['formatted_date']})",
+                 "link": snapshot["url"],
+                 "snippet": f"Archived version from {snapshot['formatted_date']}",
+                 "original_url": snapshot["original_url"],
+                 "timestamp": snapshot["timestamp"],
+                 "formatted_date": snapshot["formatted_date"]
+             }
+             previews.append(preview)
+
+         logger.info(f"Found {len(previews)} Wayback Machine snapshots")
+         return previews
+
+     def _remove_boilerplate(self, html: str) -> str:
+         """
+         Remove boilerplate content from HTML.
+
+         Args:
+             html: HTML content
+
+         Returns:
+             Cleaned text content
+         """
+         if not html or not html.strip():
+             return ""
+         try:
+             paragraphs = justext.justext(html, justext.get_stoplist(self.language))
+             cleaned = "\n".join([p.text for p in paragraphs if not p.is_boilerplate])
+             return cleaned
+         except Exception as e:
+             logger.error(f"Error removing boilerplate: {e}")
+             return html
+
+     def _get_wayback_content(self, url: str) -> Tuple[str, str]:
+         """
+         Retrieve content from a Wayback Machine URL.
+
+         Args:
+             url: Wayback Machine URL
+
+         Returns:
+             Tuple of (raw_html, cleaned_text)
+         """
+         try:
+             headers = {
+                 "User-Agent": "Mozilla/5.0 (Local Deep Research Bot; research project)"
+             }
+             response = requests.get(url, headers=headers, timeout=10)
+             raw_html = response.text
+
+             # Clean the HTML
+             cleaned_text = self._remove_boilerplate(raw_html)
+
+             return raw_html, cleaned_text
+         except Exception as e:
+             logger.error(f"Error retrieving content from {url}: {e}")
+             return "", f"Error retrieving content: {str(e)}"
+
+     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         Get full content for the relevant Wayback Machine snapshots.
+
+         Args:
+             relevant_items: List of relevant preview dictionaries
+
+         Returns:
+             List of result dictionaries with full content
+         """
+         # Check if we should add full content
+         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+             logger.info("Snippet-only mode, skipping full content retrieval")
+             return relevant_items
+
+         logger.info(f"Getting full content for {len(relevant_items)} Wayback Machine snapshots")
+
+         results = []
+         for item in relevant_items:
+             wayback_url = item.get("link")
+             if not wayback_url:
+                 results.append(item)
+                 continue
+
+             logger.info(f"Retrieving content from {wayback_url}")
+
+             try:
+                 # Retrieve content
+                 raw_html, full_content = self._get_wayback_content(wayback_url)
+
+                 # Add full content to the result
+                 result = item.copy()
+                 result["raw_html"] = raw_html
+                 result["full_content"] = full_content
+
+                 results.append(result)
+
+                 # Brief pause for rate limiting
+                 time.sleep(0.5)
+             except Exception as e:
+                 logger.error(f"Error processing {wayback_url}: {e}")
+                 results.append(item)
+
+         return results
+
+     def search_by_url(self, url: str, max_snapshots: int = None) -> List[Dict[str, Any]]:
+         """
+         Search for archived versions of a specific URL.
+
+         Args:
+             url: The URL to search for archives
+             max_snapshots: Maximum number of snapshots to return
+
+         Returns:
+             List of snapshot dictionaries
+         """
+         max_snapshots = max_snapshots or self.max_snapshots_per_url
+
+         snapshots = self._get_wayback_snapshots(url)
+         previews = []
+
+         for snapshot in snapshots[:max_snapshots]:
+             preview = {
+                 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}",
+                 "title": f"Archive of {snapshot['original_url']} ({snapshot['formatted_date']})",
+                 "link": snapshot["url"],
+                 "snippet": f"Archived version from {snapshot['formatted_date']}",
+                 "original_url": snapshot["original_url"],
+                 "timestamp": snapshot["timestamp"],
+                 "formatted_date": snapshot["formatted_date"]
+             }
+             previews.append(preview)
+
+         # Get full content if not in snippets-only mode
+         if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
+             return self._get_full_content(previews)
+
+         return previews
+
+     def search_by_date_range(self, url: str, start_date: str, end_date: str) -> List[Dict[str, Any]]:
+         """
+         Search for archived versions of a URL within a date range.
+
+         Args:
+             url: The URL to search for archives
+             start_date: Start date in format YYYYMMDD
+             end_date: End date in format YYYYMMDD
+
+         Returns:
+             List of snapshot dictionaries
+         """
+         try:
+             # Use CDX API with date range
+             response = requests.get(
+                 self.cdx_api,
+                 params={
+                     "url": url,
+                     "output": "json",
+                     "fl": "timestamp,original,statuscode,mimetype",
+                     "from": start_date,
+                     "to": end_date,
+                     "limit": self.max_snapshots_per_url
+                 }
+             )
+
+             # Process response
+             data = response.json()
+
+             # First item is the header
+             if len(data) <= 1:
+                 return []
+
+             headers = data[0]
+             snapshots = []
+
+             for item in data[1:]:
+                 snapshot = dict(zip(headers, item))
+                 timestamp = snapshot.get("timestamp", "")
+
+                 wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
+
+                 snapshots.append({
+                     "id": f"{timestamp}_{url}",
+                     "title": f"Archive of {url} ({self._format_timestamp(timestamp)})",
+                     "link": wayback_url,
+                     "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
+                     "original_url": url,
+                     "timestamp": timestamp,
+                     "formatted_date": self._format_timestamp(timestamp)
+                 })
+
+             # Get full content if not in snippets-only mode
+             if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
+                 return self._get_full_content(snapshots)
+
+             return snapshots
+
+         except Exception as e:
+             logger.error(f"Error searching date range for {url}: {e}")
+             return []
+
+     def get_latest_snapshot(self, url: str) -> Optional[Dict[str, Any]]:
+         """
+         Get the most recent snapshot of a URL.
+
+         Args:
+             url: The URL to get the latest snapshot for
+
+         Returns:
+             Dictionary with snapshot information or None if not found
+         """
+         try:
+             response = requests.get(
+                 self.available_api,
+                 params={"url": url}
+             )
+             data = response.json()
+
+             if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
+                 snapshot = data["archived_snapshots"]["closest"]
+                 timestamp = snapshot["timestamp"]
+                 wayback_url = snapshot["url"]
+
+                 result = {
+                     "id": f"{timestamp}_{url}",
+                     "title": f"Latest archive of {url} ({self._format_timestamp(timestamp)})",
+                     "link": wayback_url,
+                     "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
+                     "original_url": url,
+                     "timestamp": timestamp,
+                     "formatted_date": self._format_timestamp(timestamp)
+                 }
+
+                 # Get full content if not in snippets-only mode
+                 if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
+                     raw_html, full_content = self._get_wayback_content(wayback_url)
+                     result["raw_html"] = raw_html
+                     result["full_content"] = full_content
+
+                 return result
+
+             return None
+
+         except Exception as e:
+             logger.error(f"Error getting latest snapshot for {url}: {e}")
+             return None
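
For reference, a minimal usage sketch of the engine added in this file, assuming the wheel above is installed and archive.org is reachable; the llm relevance filter and max_filtered_results are left at their defaults, and example.com is only an illustrative target URL:

    from local_deep_research.web_search_engines.engines.search_engine_wayback import WaybackSearchEngine

    # Engine that keeps at most 3 snapshots per URL, no LLM filtering
    engine = WaybackSearchEngine(max_results=10, max_snapshots_per_url=3)

    # Archived copies of a specific URL (full content is fetched unless
    # config.SEARCH_SNIPPETS_ONLY is set)
    for s in engine.search_by_url("http://example.com"):
        print(s["formatted_date"], s["link"])

    # Snapshots within a date range (YYYYMMDD, per search_by_date_range)
    older = engine.search_by_date_range("http://example.com", "20150101", "20161231")

    # Most recent snapshot, or None if the URL was never archived
    latest = engine.get_latest_snapshot("http://example.com")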