local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
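The most disruptive change for downstream imports is the reorganization of the configuration and utility modules: the old top-level config module is split into a config package (config_files, llm_config, search_config), and the misspelled utilties package is renamed to utilities. A minimal sketch of the resulting import changes, assuming only the module paths listed above (the enums import is illustrative):

# 0.1.26
from local_deep_research import config
from local_deep_research.utilties import enums

# 0.2.2
from local_deep_research.config import config_files, llm_config, search_config
from local_deep_research.utilities import enums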
local_deep_research/web_search_engines/engines/search_engine_wayback.py

@@ -1,35 +1,38 @@
-import requests
 import logging
-from typing import Dict, List, Any, Optional, Tuple
-from langchain_core.language_models import BaseLLM
 import re
 import time
-from
+from typing import Any, Dict, List, Optional, Tuple
 
-from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
-from local_deep_research import config
 import justext
+import requests
+from langchain_core.language_models import BaseLLM
+
+from ...config import search_config
+from ..search_engine_base import BaseSearchEngine
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 class WaybackSearchEngine(BaseSearchEngine):
     """
     Internet Archive Wayback Machine search engine implementation
     Provides access to historical versions of web pages
     """
-
-    def __init__(
-
-
-
-
-
-
+
+    def __init__(
+        self,
+        max_results: int = 10,
+        max_snapshots_per_url: int = 3,
+        llm: Optional[BaseLLM] = None,
+        language: str = "English",
+        max_filtered_results: Optional[int] = None,
+        closest_only: bool = False,
+    ):
         """
         Initialize the Wayback Machine search engine.
-
+
         Args:
             max_results: Maximum number of search results
             max_snapshots_per_url: Maximum snapshots to retrieve per URL
@@ -39,48 +42,54 @@ class WaybackSearchEngine(BaseSearchEngine):
             closest_only: If True, only retrieves the closest snapshot for each URL
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-        super().__init__(
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
         self.max_snapshots_per_url = max_snapshots_per_url
         self.language = language
         self.closest_only = closest_only
-
+
         # API endpoints
         self.available_api = "https://archive.org/wayback/available"
         self.cdx_api = "https://web.archive.org/cdx/search/cdx"
-
+
     def _extract_urls_from_query(self, query: str) -> List[str]:
         """
         Extract URLs from a query string or interpret as an URL if possible.
         For non-URL queries, use a DuckDuckGo search to find relevant URLs.
-
+
         Args:
             query: The search query or URL
-
+
         Returns:
             List of URLs to search in the Wayback Machine
         """
         # Check if the query is already a URL
-        url_pattern = re.compile(r
+        url_pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")
         urls = url_pattern.findall(query)
-
+
         if urls:
             logger.info(f"Found {len(urls)} URLs in query")
             return urls
-
+
         # Check if query is a domain without http prefix
-        domain_pattern = re.compile(r
+        domain_pattern = re.compile(r"^(?:[-\w.]|(?:%[\da-fA-F]{2}))+\.\w+$")
         if domain_pattern.match(query):
             logger.info(f"Query appears to be a domain: {query}")
             return [f"http://{query}"]
-
+
         # For non-URL queries, use DuckDuckGo to find relevant URLs
-        logger.info(
+        logger.info("Query is not a URL, using DuckDuckGo to find relevant URLs")
         try:
             # Import DuckDuckGo search engine
             from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
-
-
-
+
+            # Use max_results from parent class, but limit to 5 for URL discovery
+            url_search_limit = min(5, self.max_results)
+            ddg = DuckDuckGoSearchAPIWrapper(max_results=url_search_limit)
+            # Pass max_results as a positional argument
+            results = ddg.results(query, url_search_limit)
+
             # Extract URLs from results
             ddg_urls = [result.get("link") for result in results if result.get("link")]
             if ddg_urls:
@@ -88,7 +97,7 @@ class WaybackSearchEngine(BaseSearchEngine):
                 return ddg_urls
         except Exception as e:
             logger.error(f"Error using DuckDuckGo for URL discovery: {e}")
-
+
         # Fallback: treat the query as a potential domain or path
         if "/" in query and "." in query:
             logger.info(f"Treating query as a partial URL: {query}")
@@ -96,16 +105,16 @@ class WaybackSearchEngine(BaseSearchEngine):
         elif "." in query:
             logger.info(f"Treating query as a domain: {query}")
             return [f"http://{query}"]
-
+
         # Return empty list if nothing worked
         logger.warning(f"Could not extract any URLs from query: {query}")
         return []
-
+
     def _format_timestamp(self, timestamp: str) -> str:
         """Format Wayback Machine timestamp into readable date"""
         if len(timestamp) < 14:
             return timestamp
-
+
         try:
             year = timestamp[0:4]
             month = timestamp[4:6]
@@ -114,43 +123,45 @@ class WaybackSearchEngine(BaseSearchEngine):
             minute = timestamp[10:12]
             second = timestamp[12:14]
             return f"{year}-{month}-{day} {hour}:{minute}:{second}"
-        except:
+        except Exception:
             return timestamp
-
+
     def _get_wayback_snapshots(self, url: str) -> List[Dict[str, Any]]:
         """
         Get snapshots from the Wayback Machine for a specific URL.
-
+
         Args:
             url: URL to get snapshots for
-
+
         Returns:
             List of snapshot dictionaries
         """
         snapshots = []
-
+
         try:
             if self.closest_only:
                 # Get only the closest snapshot
-                response = requests.get(
-                    self.available_api,
-                    params={"url": url}
-                )
+                response = requests.get(self.available_api, params={"url": url})
                 data = response.json()
-
-                if
+
+                if (
+                    "archived_snapshots" in data
+                    and "closest" in data["archived_snapshots"]
+                ):
                     snapshot = data["archived_snapshots"]["closest"]
                     snapshot_url = snapshot["url"]
                     timestamp = snapshot["timestamp"]
-
-                    snapshots.append(
-
-
-
-
-
-
-
+
+                    snapshots.append(
+                        {
+                            "timestamp": timestamp,
+                            "formatted_date": self._format_timestamp(timestamp),
+                            "url": snapshot_url,
+                            "original_url": url,
+                            "available": snapshot.get("available", True),
+                            "status": snapshot.get("status", "200"),
+                        }
+                    )
             else:
                 # Get multiple snapshots using CDX API
                 response = requests.get(
@@ -160,68 +171,70 @@ class WaybackSearchEngine(BaseSearchEngine):
                         "output": "json",
                         "fl": "timestamp,original,statuscode,mimetype",
                         "collapse": "timestamp:4",  # Group by year
-                        "limit": self.max_snapshots_per_url
-                    }
+                        "limit": self.max_snapshots_per_url,
+                    },
                 )
-
+
                 # Check if response is valid JSON
                 data = response.json()
-
+
                 # First item is the header
                 if len(data) > 1:
                     headers = data[0]
                     for item in data[1:]:
                         snapshot = dict(zip(headers, item))
                         timestamp = snapshot.get("timestamp", "")
-
+
                         wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
-
-                        snapshots.append(
-
-
-
-
-
-
-
-
+
+                        snapshots.append(
+                            {
+                                "timestamp": timestamp,
+                                "formatted_date": self._format_timestamp(timestamp),
+                                "url": wayback_url,
+                                "original_url": url,
+                                "available": True,
+                                "status": snapshot.get("statuscode", "200"),
+                            }
+                        )
+
             # Limit to max snapshots per URL
-            snapshots = snapshots[:self.max_snapshots_per_url]
-
+            snapshots = snapshots[: self.max_snapshots_per_url]
+
         except Exception as e:
             logger.error(f"Error getting Wayback snapshots for {url}: {e}")
-
+
         return snapshots
-
+
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information for Wayback Machine snapshots.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of preview dictionaries
         """
         logger.info(f"Getting Wayback Machine previews for query: {query}")
-
+
         # Extract URLs from query
         urls = self._extract_urls_from_query(query)
-
+
         if not urls:
             logger.warning(f"No URLs found in query: {query}")
             return []
-
+
         # Get snapshots for each URL
         all_snapshots = []
         for url in urls:
             snapshots = self._get_wayback_snapshots(url)
             all_snapshots.extend(snapshots)
-
+
             # Respect rate limits
             if len(urls) > 1:
                 time.sleep(0.5)
-
+
         # Format as previews
         previews = []
         for snapshot in all_snapshots:
@@ -232,20 +245,20 @@ class WaybackSearchEngine(BaseSearchEngine):
                 "snippet": f"Archived version from {snapshot['formatted_date']}",
                 "original_url": snapshot["original_url"],
                 "timestamp": snapshot["timestamp"],
-                "formatted_date": snapshot["formatted_date"]
+                "formatted_date": snapshot["formatted_date"],
             }
             previews.append(preview)
-
+
         logger.info(f"Found {len(previews)} Wayback Machine snapshots")
         return previews
-
+
     def _remove_boilerplate(self, html: str) -> str:
         """
         Remove boilerplate content from HTML.
-
+
         Args:
             html: HTML content
-
+
         Returns:
             Cleaned text content
         """
@@ -258,14 +271,14 @@ class WaybackSearchEngine(BaseSearchEngine):
         except Exception as e:
             logger.error(f"Error removing boilerplate: {e}")
             return html
-
+
     def _get_wayback_content(self, url: str) -> Tuple[str, str]:
         """
         Retrieve content from a Wayback Machine URL.
-
+
         Args:
             url: Wayback Machine URL
-
+
         Returns:
             Tuple of (raw_html, cleaned_text)
         """
@@ -275,76 +288,85 @@ class WaybackSearchEngine(BaseSearchEngine):
             }
             response = requests.get(url, headers=headers, timeout=10)
             raw_html = response.text
-
+
             # Clean the HTML
             cleaned_text = self._remove_boilerplate(raw_html)
-
+
             return raw_html, cleaned_text
         except Exception as e:
             logger.error(f"Error retrieving content from {url}: {e}")
             return "", f"Error retrieving content: {str(e)}"
-
-    def _get_full_content(
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant Wayback Machine snapshots.
-
+
         Args:
             relevant_items: List of relevant preview dictionaries
-
+
         Returns:
             List of result dictionaries with full content
         """
         # Check if we should add full content
-        if
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
-
-        logger.info(
-
+
+        logger.info(
+            f"Getting full content for {len(relevant_items)} Wayback Machine snapshots"
+        )
+
         results = []
         for item in relevant_items:
             wayback_url = item.get("link")
             if not wayback_url:
                 results.append(item)
                 continue
-
+
             logger.info(f"Retrieving content from {wayback_url}")
-
+
             try:
                 # Retrieve content
                 raw_html, full_content = self._get_wayback_content(wayback_url)
-
+
                 # Add full content to the result
                 result = item.copy()
                 result["raw_html"] = raw_html
                 result["full_content"] = full_content
-
+
                 results.append(result)
-
+
                 # Brief pause for rate limiting
                 time.sleep(0.5)
             except Exception as e:
                 logger.error(f"Error processing {wayback_url}: {e}")
                 results.append(item)
-
+
         return results
-
-    def search_by_url(
+
+    def search_by_url(
+        self, url: str, max_snapshots: int = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for archived versions of a specific URL.
-
+
         Args:
             url: The URL to search for archives
             max_snapshots: Maximum number of snapshots to return
-
+
         Returns:
             List of snapshot dictionaries
         """
         max_snapshots = max_snapshots or self.max_snapshots_per_url
-
+
         snapshots = self._get_wayback_snapshots(url)
         previews = []
-
+
         for snapshot in snapshots[:max_snapshots]:
             preview = {
                 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}",
@@ -353,25 +375,30 @@ class WaybackSearchEngine(BaseSearchEngine):
                 "snippet": f"Archived version from {snapshot['formatted_date']}",
                 "original_url": snapshot["original_url"],
                 "timestamp": snapshot["timestamp"],
-                "formatted_date": snapshot["formatted_date"]
+                "formatted_date": snapshot["formatted_date"],
             }
             previews.append(preview)
-
+
         # Get full content if not in snippets-only mode
-        if
+        if (
+            not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            or not search_config.SEARCH_SNIPPETS_ONLY
+        ):
             return self._get_full_content(previews)
-
+
         return previews
-
-    def search_by_date_range(
+
+    def search_by_date_range(
+        self, url: str, start_date: str, end_date: str
+    ) -> List[Dict[str, Any]]:
         """
         Search for archived versions of a URL within a date range.
-
+
         Args:
             url: The URL to search for archives
             start_date: Start date in format YYYYMMDD
             end_date: End date in format YYYYMMDD
-
+
         Returns:
             List of snapshot dictionaries
         """
@@ -385,68 +412,70 @@ class WaybackSearchEngine(BaseSearchEngine):
                     "fl": "timestamp,original,statuscode,mimetype",
                     "from": start_date,
                     "to": end_date,
-                    "limit": self.max_snapshots_per_url
-                }
+                    "limit": self.max_snapshots_per_url,
+                },
             )
-
+
             # Process response
             data = response.json()
-
+
            # First item is the header
             if len(data) <= 1:
                 return []
-
+
             headers = data[0]
             snapshots = []
-
+
             for item in data[1:]:
                 snapshot = dict(zip(headers, item))
                 timestamp = snapshot.get("timestamp", "")
-
+
                 wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
-
-                snapshots.append(
-
-
-
-
-
-
-
-
-
+
+                snapshots.append(
+                    {
+                        "id": f"{timestamp}_{url}",
+                        "title": f"Archive of {url} ({self._format_timestamp(timestamp)})",
+                        "link": wayback_url,
+                        "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
+                        "original_url": url,
+                        "timestamp": timestamp,
+                        "formatted_date": self._format_timestamp(timestamp),
+                    }
+                )
+
             # Get full content if not in snippets-only mode
-            if
+            if (
+                not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+                or not search_config.SEARCH_SNIPPETS_ONLY
+            ):
                 return self._get_full_content(snapshots)
-
+
             return snapshots
-
+
         except Exception as e:
             logger.error(f"Error searching date range for {url}: {e}")
             return []
-
+
     def get_latest_snapshot(self, url: str) -> Optional[Dict[str, Any]]:
         """
         Get the most recent snapshot of a URL.
-
+
         Args:
             url: The URL to get the latest snapshot for
-
+
         Returns:
             Dictionary with snapshot information or None if not found
         """
         try:
-            response = requests.get(
-                self.available_api,
-                params={"url": url}
-            )
+            response = requests.get(self.available_api, params={"url": url})
             data = response.json()
-
+
             if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
                 snapshot = data["archived_snapshots"]["closest"]
                 timestamp = snapshot["timestamp"]
                 wayback_url = snapshot["url"]
-
+
                 result = {
                     "id": f"{timestamp}_{url}",
                     "title": f"Latest archive of {url} ({self._format_timestamp(timestamp)})",
@@ -454,19 +483,22 @@ class WaybackSearchEngine(BaseSearchEngine):
                     "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
                     "original_url": url,
                     "timestamp": timestamp,
-                    "formatted_date": self._format_timestamp(timestamp)
+                    "formatted_date": self._format_timestamp(timestamp),
                 }
-
+
                 # Get full content if not in snippets-only mode
-                if
+                if (
+                    not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+                    or not search_config.SEARCH_SNIPPETS_ONLY
+                ):
                     raw_html, full_content = self._get_wayback_content(wayback_url)
                     result["raw_html"] = raw_html
                     result["full_content"] = full_content
-
+
                 return result
-
+
             return None
-
+
         except Exception as e:
             logger.error(f"Error getting latest snapshot for {url}: {e}")
-            return None
+            return None