PyPI - local-deep-research - Versions diffs - 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl - Mend

local-deep-research 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

local_deep_research/web_search_engines/engines/search_engine_guardian.py CHANGED Viewed

@@ -1,15 +1,20 @@
 import requests
-from typing import Dict, List, Any, Optional
+import logging
+from typing import Dict, List, Any, Optional, Tuple
 import os
 from datetime import datetime, timedelta
 from langchain_core.language_models import BaseLLM
 from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
 from local_deep_research import config
+from local_deep_research.utilties.search_utilities import remove_think_tags
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 class GuardianSearchEngine(BaseSearchEngine):
-    """The Guardian API search engine implementation"""
+    """Enhanced Guardian API search engine implementation with LLM query optimization"""
     def __init__(self,
                 max_results: int = 10,
@@ -18,9 +23,12 @@ class GuardianSearchEngine(BaseSearchEngine):
                 to_date: Optional[str] = None,
                 section: Optional[str] = None,
                 order_by: str = "relevance",
-                llm: Optional[BaseLLM] = None):
+                llm: Optional[BaseLLM] = None,
+                max_filtered_results: Optional[int] = None,
+                optimize_queries: bool = True,
+                adaptive_search: bool = True):
         """
-        Initialize The Guardian search engine.
+        Initialize The Guardian search engine with enhanced features.
         Args:
             max_results: Maximum number of search results
@@ -29,13 +37,16 @@ class GuardianSearchEngine(BaseSearchEngine):
             to_date: End date for search (YYYY-MM-DD format, default today)
             section: Filter by section (e.g., "politics", "technology", "sport")
             order_by: Sort order ("relevance", "newest", "oldest")
-            llm: Language model for relevance filtering
+            llm: Language model for relevance filtering and query optimization
+            max_filtered_results: Maximum number of results to keep after filtering
+            optimize_queries: Whether to optimize queries using LLM
+            adaptive_search: Whether to use adaptive search (adjusting date ranges)
         """
-        # Initialize the BaseSearchEngine with the LLM
-        super().__init__(llm=llm)
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         self.api_key = api_key or os.getenv("GUARDIAN_API_KEY")
+        self.optimize_queries = optimize_queries
+        self.adaptive_search = adaptive_search
         if not self.api_key:
             raise ValueError("Guardian API key not found. Please provide api_key or set the GUARDIAN_API_KEY environment variable.")
@@ -56,10 +67,203 @@ class GuardianSearchEngine(BaseSearchEngine):
         self.section = section
         self.order_by = order_by
+        self._original_date_params = {
+            "from_date": self.from_date,
+            "to_date": self.to_date
+        }
         # API base URL
         self.api_url = "https://content.guardianapis.com/search"
+    def _optimize_query_for_guardian(self, query: str) -> str:
+        """
+        Optimize a natural language query for Guardian search.
+        Uses LLM to transform questions into effective news search queries.
+        Args:
+            query: Natural language query
+        Returns:
+            Optimized query string for Guardian
+        """
+        # Handle extremely long queries by truncating first
+        if len(query) > 150:
+            simple_query = " ".join(query.split()[:10])
+            logger.info(f"Query too long ({len(query)} chars), truncating to: {simple_query}")
+            query = simple_query
+        if not self.llm or not self.optimize_queries:
+            # Return original query if no LLM available or optimization disabled
+            return query
+        try:
+            # Prompt for query optimization
+            prompt = f"""Transform this natural language question into a very short Guardian news search query.
+Original query: "{query}"
+CRITICAL RULES:
+1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
+2. Keep it EXTREMELY BRIEF - MAXIMUM 3-4 words total
+3. Focus only on the main topic/person/event
+4. Include proper names when relevant
+5. Remove ALL unnecessary words
+6. DO NOT use Boolean operators (no AND/OR)
+7. DO NOT use quotes
+EXAMPLE CONVERSIONS:
+✓ "What's the impact of rising interest rates on UK housing market?" → "UK housing rates"
+✓ "Latest developments in the Ukraine-Russia peace negotiations" → "Ukraine Russia negotiations"
+✓ "How are tech companies responding to AI regulation?" → "tech AI regulation"
+✓ "What is Donald Trump's current political activity?" → "Trump political activity"
+Return ONLY the extremely brief search query.
+"""
+            # Get response from LLM
+            response = self.llm.invoke(prompt)
+            optimized_query = remove_think_tags(response.content).strip()
+            # Clean up the query - remove any explanations
+            lines = optimized_query.split('\n')
+            for line in lines:
+                line = line.strip()
+                if line and not line.lower().startswith(('here', 'i would', 'the best', 'this query')):
+                    optimized_query = line
+                    break
+            # Remove any quotes that wrap the entire query
+            if optimized_query.startswith('"') and optimized_query.endswith('"') and optimized_query.count('"') == 2:
+                optimized_query = optimized_query[1:-1]
+            logger.info(f"Original query: '{query}'")
+            logger.info(f"Optimized for Guardian: '{optimized_query}'")
+            return optimized_query
+        except Exception as e:
+            logger.error(f"Error optimizing query: {e}")
+            return query  # Fall back to original query on error
+    def _adapt_dates_for_query_type(self, query: str) -> None:
+        """
+        Adapt date range based on query type (historical vs current).
+        Args:
+            query: The search query
+        """
+        # Fast path - for very short queries, default to recent news
+        if len(query.split()) <= 4:
+            logger.info("Short query detected, defaulting to recent news")
+            # Default to 60 days for short queries
+            recent = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
+            self.from_date = recent
+            self.order_by = "newest"
+            return
+        if not self.llm or not self.adaptive_search:
+            return
+        try:
+            prompt = f"""Is this query asking about HISTORICAL events or CURRENT events?
+Query: "{query}"
+ONE WORD ANSWER ONLY:
+- "HISTORICAL" if about past events (older than 1 year)
+- "CURRENT" if about recent events (within past year)
+- "UNCLEAR" if can't determine
+ONE WORD ONLY:"""
+            response = self.llm.invoke(prompt)
+            answer = remove_think_tags(response.content).strip().upper()
+            # Reset to original parameters first
+            self.from_date = self._original_date_params["from_date"]
+            self.to_date = self._original_date_params["to_date"]
+            if "HISTORICAL" in answer:
+                # For historical queries, go back 10 years
+                logger.info("Query classified as HISTORICAL - extending search timeframe")
+                ten_years_ago = (datetime.now() - timedelta(days=3650)).strftime("%Y-%m-%d")
+                self.from_date = ten_years_ago
+            elif "CURRENT" in answer:
+                # For current events, focus on recent content
+                logger.info("Query classified as CURRENT - focusing on recent content")
+                recent = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
+                self.from_date = recent
+                self.order_by = "newest"  # Prioritize newest for current events
+        except Exception as e:
+            logger.error(f"Error adapting dates for query type: {e}")
+            # Keep original date parameters on error
+    def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
+        """
+        Perform adaptive search that progressively adjusts parameters based on results.
+        Args:
+            query: The search query
+        Returns:
+            Tuple of (list of articles, search strategy used)
+        """
+        # Try with current parameters
+        articles = self._get_all_data(query)
+        strategy = "initial"
+        # If no results or too few, try different strategies
+        if len(articles) < 3 and self.adaptive_search:
+            logger.info(f"Initial search found only {len(articles)} results, trying alternative strategies")
+            # Try with expanded date range
+            original_from_date = self.from_date
+            original_order_by = self.order_by
+            # Strategy 1: Expand to 6 months
+            logger.info("Strategy 1: Expanding time range to 6 months")
+            six_months_ago = (datetime.now() - timedelta(days=180)).strftime("%Y-%m-%d")
+            self.from_date = six_months_ago
+            articles1 = self._get_all_data(query)
+            if len(articles1) > len(articles):
+                articles = articles1
+                strategy = "expanded_6mo"
+            # Strategy 2: Expand to all time and try relevance order
+            if len(articles) < 3:
+                logger.info("Strategy 2: Expanding to all time with relevance ordering")
+                self.from_date = "2000-01-01"  # Effectively "all time"
+                self.order_by = "relevance"
+                articles2 = self._get_all_data(query)
+                if len(articles2) > len(articles):
+                    articles = articles2
+                    strategy = "all_time_relevance"
+            # Strategy 3: Try removing section constraints
+            if len(articles) < 3 and self.section:
+                logger.info("Strategy 3: Removing section constraint")
+                original_section = self.section
+                self.section = None
+                articles3 = self._get_all_data(query)
+                if len(articles3) > len(articles):
+                    articles = articles3
+                    strategy = "no_section"
+                # Restore section setting
+                self.section = original_section
+            # Restore original settings
+            self.from_date = original_from_date
+            self.order_by = original_order_by
+        logger.info(f"Adaptive search using strategy '{strategy}' found {len(articles)} results")
+        return articles, strategy
     def _get_all_data(self, query: str) -> List[Dict[str, Any]]:
         """
         Get all article data from The Guardian API in a single call.
@@ -72,14 +276,31 @@ class GuardianSearchEngine(BaseSearchEngine):
             List of articles with all data
         """
         try:
+            # Ensure query is not empty
+            if not query or query.strip() == "":
+                query = "news"
+                logger.warning("Empty query provided, using 'news' as default")
+            # Ensure query is not too long for API
+            if len(query) > 100:
+                logger.warning(f"Query too long for Guardian API ({len(query)} chars), truncating")
+                query = query[:100]
             # Always request all fields for simplicity
+            # Ensure max_results is an integer to avoid comparison errors
+            page_size = min(int(self.max_results) if self.max_results is not None else 10, 50)
+            # Log full parameters for debugging
+            logger.info(f"Guardian API search query: '{query}'")
+            logger.info(f"Guardian API date range: {self.from_date} to {self.to_date}")
             params = {
                 "q": query,
                 "api-key": self.api_key,
                 "from-date": self.from_date,
                 "to-date": self.to_date,
                 "order-by": self.order_by,
-                "page-size": min(self.max_results, 50),  # API maximum is 50
+                "page-size": page_size,  # API maximum is 50
                 "show-fields": "headline,trailText,byline,body,publication",
                 "show-tags": "keyword"
             }
@@ -88,6 +309,11 @@ class GuardianSearchEngine(BaseSearchEngine):
             if self.section:
                 params["section"] = self.section
+            # Log the complete request parameters (except API key)
+            log_params = params.copy()
+            log_params["api-key"] = "REDACTED"
+            logger.info(f"Guardian API request parameters: {log_params}")
             # Execute the API request
             response = requests.get(self.api_url, params=params)
             response.raise_for_status()
@@ -96,6 +322,7 @@ class GuardianSearchEngine(BaseSearchEngine):
             # Extract results from the response
             articles = data.get("response", {}).get("results", [])
+            logger.info(f"Guardian API returned {len(articles)} articles")
             # Format results to include all data
             formatted_articles = []
@@ -127,13 +354,12 @@ class GuardianSearchEngine(BaseSearchEngine):
             return formatted_articles
         except Exception as e:
-            print(f"Error getting data from The Guardian API: {e}")
+            logger.error(f"Error getting data from The Guardian API: {e}")
             return []
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
-        Get preview information for Guardian articles.
-        Actually gets all data but returns only preview fields.
+        Get preview information for Guardian articles with enhanced optimization.
         Args:
             query: The search query
@@ -141,12 +367,29 @@ class GuardianSearchEngine(BaseSearchEngine):
         Returns:
             List of preview dictionaries
         """
-        print("Getting articles from The Guardian API")
+        logger.info(f"Getting articles from The Guardian API for query: {query}")
-        # Get all article data
-        articles = self._get_all_data(query)
+        # Step 1: Optimize the query using LLM
+        optimized_query = self._optimize_query_for_guardian(query)
+        # Step 2: Adapt date parameters based on query type
+        self._adapt_dates_for_query_type(optimized_query)
+        # Step 3: Perform adaptive search
+        articles, strategy = self._adaptive_search(optimized_query)
-        # Store full articles for later use (implementation detail)
+        # Store search metadata for debugging
+        self._search_metadata = {
+            "original_query": query,
+            "optimized_query": optimized_query,
+            "strategy": strategy,
+            "from_date": self.from_date,
+            "to_date": self.to_date,
+            "section": self.section,
+            "order_by": self.order_by
+        }
+        # Store full articles for later use
         self._full_articles = {a["id"]: a for a in articles}
         # Return only preview fields for each article
@@ -177,7 +420,7 @@ class GuardianSearchEngine(BaseSearchEngine):
         Returns:
             List of result dictionaries with full content
         """
-        print("Adding full content to relevant Guardian articles")
+        logger.info(f"Adding full content to {len(relevant_items)} relevant Guardian articles")
         # Check if we should add full content
         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
@@ -199,7 +442,7 @@ class GuardianSearchEngine(BaseSearchEngine):
     def run(self, query: str) -> List[Dict[str, Any]]:
         """
-        Execute a search using The Guardian API with the two-phase approach.
+        Execute a search using The Guardian API with the enhanced approach.
         Args:
             query: The search query
@@ -207,75 +450,77 @@ class GuardianSearchEngine(BaseSearchEngine):
         Returns:
             List of search results
         """
-        print("---Execute a search using The Guardian---")
+        logger.info(f"---Execute a search using The Guardian (enhanced)---")
-        # Use the implementation from the parent class which handles all phases
-        results = super().run(query)
+        # Additional safety check for None query
+        if query is None:
+            logger.error("None query passed to Guardian search engine")
+            query = "news"
-        # Clean up the cache after use
-        if hasattr(self, '_full_articles'):
-            del self._full_articles
+        try:
+            # Get previews with our enhanced method
+            previews = self._get_previews(query)
-        return results
-    def get_article_by_id(self, article_id: str) -> Dict[str, Any]:
-        """
-        Get a specific article by its ID.
-        Args:
-            article_id: The Guardian article ID
+            # If no results, try one more time with a simplified query
+            if not previews:
+                simple_query = " ".join([w for w in query.split() if len(w) > 3][:3])
+                logger.warning(f"No Guardian articles found, trying simplified query: {simple_query}")
+                previews = self._get_previews(simple_query)
+                # If still no results, try with a very generic query as last resort
+                if not previews and "trump" in query.lower():
+                    logger.warning("Trying last resort query: 'Donald Trump'")
+                    previews = self._get_previews("Donald Trump")
+                elif not previews:
+                    logger.warning("Trying last resort query: 'news'")
+                    previews = self._get_previews("news")
-        Returns:
-            Dictionary with article information
-        """
-        try:
-            # Guardian article API URL
-            url = f"https://content.guardianapis.com/{article_id}"
-            # Always request all fields
-            response = requests.get(
-                url,
-                params={
-                    "api-key": self.api_key,
-                    "show-fields": "headline,trailText,body,byline,publication",
-                    "show-tags": "keyword"
-                }
-            )
-            response.raise_for_status()
+            # If still no results after all attempts, return empty list
+            if not previews:
+                logger.warning(f"No Guardian articles found after multiple attempts")
+                return []
-            data = response.json()
-            article = data.get("response", {}).get("content", {})
+            # Filter for relevance if we have an LLM
+            if self.llm and hasattr(self, 'max_filtered_results') and self.max_filtered_results:
+                filtered_items = self._filter_for_relevance(previews, query)
+                if not filtered_items:
+                    # Fall back to unfiltered results if everything was filtered out
+                    logger.warning("All articles filtered out, using unfiltered results")
+                    filtered_items = previews[:self.max_filtered_results]
+            else:
+                filtered_items = previews
-            if not article:
-                return {}
-            fields = article.get("fields", {})
-            # Format the article with all fields
-            result = {
-                "id": article_id,
-                "title": fields.get("headline", article.get("webTitle", "")),
-                "link": article.get("webUrl", ""),
-                "snippet": fields.get("trailText", ""),
-                "publication_date": article.get("webPublicationDate", ""),
-                "section": article.get("sectionName", ""),
-                "author": fields.get("byline", "")
-            }
+            # Get full content for relevant items
+            results = self._get_full_content(filtered_items)
-            # Only include full content if not in snippet-only mode
-            if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
-                result["content"] = fields.get("body", "")
-                result["full_content"] = fields.get("body", "")
+            # Add source information to make it clear these are from The Guardian
+            for result in results:
+                if "source" not in result:
+                    result["source"] = "The Guardian"
-            # Extract tags/keywords
-            tags = article.get("tags", [])
-            result["keywords"] = [tag.get("webTitle", "") for tag in tags if tag.get("type") == "keyword"]
+            # Clean up the cache after use
+            if hasattr(self, '_full_articles'):
+                del self._full_articles
+            # Restore original date parameters
+            self.from_date = self._original_date_params["from_date"]
+            self.to_date = self._original_date_params["to_date"]
-            return result
+            # Log search metadata if available
+            if hasattr(self, '_search_metadata'):
+                logger.info(f"Search metadata: {self._search_metadata}")
+                del self._search_metadata
+            return results
         except Exception as e:
-            print(f"Error getting article details: {e}")
-            return {}
+            logger.error(f"Error in Guardian search: {e}")
+            # Restore original date parameters on error
+            self.from_date = self._original_date_params["from_date"]
+            self.to_date = self._original_date_params["to_date"]
+            return []
     def search_by_section(self, section: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
         """
@@ -283,7 +528,7 @@ class GuardianSearchEngine(BaseSearchEngine):
         Args:
             section: The Guardian section name (e.g., "politics", "technology")
-            max_results: Maximum number of search results (defaults to self.max_results)
+            max_results: Maximum number of results (defaults to self.max_results)
         Returns:
             List of articles in the section

local_deep_research/web_search_engines/engines/search_engine_local_all.py CHANGED Viewed

@@ -35,11 +35,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
             max_filtered_results: Maximum results after filtering
             **kwargs: Additional parameters passed to LocalSearchEngine instances
         """
-        # Initialize the base search engine
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         # Find all local collection search engines
         self.local_engines = {}
         try:

local_deep_research/web_search_engines/engines/search_engine_pubmed.py CHANGED Viewed

@@ -44,10 +44,9 @@ class PubMedSearchEngine(BaseSearchEngine):
             max_filtered_results: Maximum number of results to keep after filtering
             optimize_queries: Whether to optimize natural language queries for PubMed
         """
-        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
+        self.max_results=max(self.max_results,25)
         self.api_key = api_key
         self.days_limit = days_limit
         self.get_abstracts = get_abstracts

local_deep_research/web_search_engines/engines/search_engine_searxng.py CHANGED Viewed

@@ -51,8 +51,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
             include_full_content: Whether to include full webpage content in results
             api_key: Alternative way to provide instance URL (takes precedence over instance_url)
         """
-        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         # Get instance URL from various sources in priority order:
         # 1. api_key parameter (which is actually the instance URL)

local-deep-research 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

local-deep-research 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl