PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220)

local_deep_research/advanced_search_system/questions/browsecomp_question.py ADDED Viewed

@@ -0,0 +1,287 @@
+"""
+BrowseComp-specific question generation that creates progressive, entity-focused searches.
+"""
+import logging
+import re
+from typing import Dict, List
+from .base_question import BaseQuestionGenerator
+logger = logging.getLogger(__name__)
+class BrowseCompQuestionGenerator(BaseQuestionGenerator):
+    """
+    Question generator optimized for BrowseComp-style queries.
+    Key features:
+    1. Extract concrete entities (dates, numbers, names, places)
+    2. Generate progressive search combinations
+    3. Start broad, then narrow systematically
+    4. Focus on verifiable facts
+    """
+    def __init__(self, model):
+        super().__init__(model)
+        self.extracted_entities = {}
+        self.search_progression = []
+    def generate_questions(
+        self,
+        current_knowledge: str,
+        query: str,
+        questions_per_iteration: int = 5,
+        questions_by_iteration: dict = None,
+        iteration: int = 1,
+    ) -> List[str]:
+        """Generate progressive search queries for BrowseComp problems."""
+        questions_by_iteration = questions_by_iteration or {}
+        # First iteration: Extract entities and create initial searches
+        if iteration == 1 or not self.extracted_entities:
+            self.extracted_entities = self._extract_entities(query)
+            return self._generate_initial_searches(
+                query, self.extracted_entities, questions_per_iteration
+            )
+        # Subsequent iterations: Progressive refinement
+        return self._generate_progressive_searches(
+            query,
+            current_knowledge,
+            self.extracted_entities,
+            questions_by_iteration,
+            questions_per_iteration,
+            iteration,
+        )
+    def _extract_entities(self, query: str) -> Dict[str, List[str]]:
+        """Extract concrete entities from the query."""
+        prompt = f"""Extract ALL concrete, searchable entities from this query:
+Query: {query}
+Extract:
+1. TEMPORAL: All years, dates, time periods (e.g., "2018", "between 1995 and 2006", "2023")
+2. NUMERICAL: All numbers, statistics, counts (e.g., "300", "more than 3", "4-3", "84.5%")
+3. NAMES: Partial names, name hints, proper nouns (e.g., "Dartmouth", "EMNLP", "Plastic Man")
+4. LOCATIONS: Places, institutions, geographic features (e.g., "Pennsylvania", "Grand Canyon")
+5. DESCRIPTORS: Key descriptive terms (e.g., "fourth wall", "ascetics", "decider game")
+For TEMPORAL entities, if there's a range (e.g., "between 2018-2023"), list EACH individual year.
+Format your response as:
+TEMPORAL: [entity1], [entity2], ...
+NUMERICAL: [entity1], [entity2], ...
+NAMES: [entity1], [entity2], ...
+LOCATIONS: [entity1], [entity2], ...
+DESCRIPTORS: [entity1], [entity2], ...
+"""
+        response = self.model.invoke(prompt)
+        content = (
+            response.content if hasattr(response, "content") else str(response)
+        )
+        entities = {
+            "temporal": [],
+            "numerical": [],
+            "names": [],
+            "locations": [],
+            "descriptors": [],
+        }
+        # current_category = None  # Not currently used
+        for line in content.strip().split("\n"):
+            line = line.strip()
+            if ":" in line:
+                category, values = line.split(":", 1)
+                category = category.strip().lower()
+                if category in entities:
+                    # Parse comma-separated values
+                    values = [v.strip() for v in values.split(",") if v.strip()]
+                    entities[category].extend(values)
+        # Expand temporal ranges
+        entities["temporal"] = self._expand_temporal_ranges(
+            entities["temporal"]
+        )
+        logger.info(f"Extracted entities: {entities}")
+        return entities
+    def _expand_temporal_ranges(
+        self, temporal_entities: List[str]
+    ) -> List[str]:
+        """Expand year ranges into individual years."""
+        expanded = []
+        for entity in temporal_entities:
+            # Check for range patterns like "2018-2023" or "between 1995 and 2006"
+            range_match = re.search(
+                r"(\d{4})[-\s]+(?:to|and)?\s*(\d{4})", entity
+            )
+            if range_match:
+                start_year = int(range_match.group(1))
+                end_year = int(range_match.group(2))
+                for year in range(start_year, end_year + 1):
+                    expanded.append(str(year))
+            else:
+                # Single year or other temporal entity
+                year_match = re.search(r"\d{4}", entity)
+                if year_match:
+                    expanded.append(year_match.group())
+                else:
+                    expanded.append(entity)
+        return list(set(expanded))  # Remove duplicates
+    def _generate_initial_searches(
+        self, query: str, entities: Dict[str, List[str]], num_questions: int
+    ) -> List[str]:
+        """Generate initial broad searches."""
+        searches = []
+        # 1. Original query (always include)
+        searches.append(query)
+        # 2. Domain exploration searches (combine key entities)
+        if entities["names"]:
+            for name in entities["names"][:2]:  # Top 2 names
+                searches.append(f"{name}")
+                if entities["descriptors"]:
+                    searches.append(f"{name} {entities['descriptors'][0]}")
+        # 3. Temporal searches if years are important
+        if entities["temporal"] and len(entities["temporal"]) <= 10:
+            # For small year ranges, search each year with a key term
+            key_term = (
+                entities["names"][0]
+                if entities["names"]
+                else entities["descriptors"][0]
+                if entities["descriptors"]
+                else ""
+            )
+            for year in entities["temporal"][:5]:  # Limit to 5 years initially
+                if key_term:
+                    searches.append(f"{key_term} {year}")
+        # 4. Location-based searches
+        if entities["locations"]:
+            for location in entities["locations"][:2]:
+                searches.append(f"{location}")
+                if entities["descriptors"]:
+                    searches.append(f"{location} {entities['descriptors'][0]}")
+        # Remove duplicates and limit to requested number
+        seen = set()
+        unique_searches = []
+        for s in searches:
+            if s.lower() not in seen:
+                seen.add(s.lower())
+                unique_searches.append(s)
+        return unique_searches[:num_questions]
+    def _generate_progressive_searches(
+        self,
+        query: str,
+        current_knowledge: str,
+        entities: Dict[str, List[str]],
+        questions_by_iteration: dict,
+        num_questions: int,
+        iteration: int,
+    ) -> List[str]:
+        """Generate progressively more specific searches based on findings."""
+        # Analyze what we've found so far
+        prompt = f"""Based on our search progress, generate targeted follow-up searches.
+Original Query: {query}
+Entities Found:
+- Names/Terms: {", ".join(entities["names"][:5])}
+- Years: {", ".join(entities["temporal"][:5])}
+- Locations: {", ".join(entities["locations"][:3])}
+- Key Features: {", ".join(entities["descriptors"][:3])}
+Current Knowledge Summary:
+{current_knowledge[:1500]}
+Previous Searches:
+{self._format_previous_searches(questions_by_iteration)}
+Generate {num_questions} NEW search queries that:
+1. Combine 2-3 entities we haven't tried together
+2. If we found candidate names, search for them with other constraints
+3. For year ranges, systematically cover years we haven't searched
+4. Use quotes for exact phrases when beneficial
+Focus on finding the specific answer, not general information.
+Format: One search per line
+"""
+        response = self.model.invoke(prompt)
+        content = (
+            response.content if hasattr(response, "content") else str(response)
+        )
+        # Extract searches from response
+        searches = []
+        for line in content.strip().split("\n"):
+            line = line.strip()
+            if line and not line.endswith(":") and len(line) > 5:
+                # Clean up common prefixes
+                for prefix in ["Q:", "Search:", "-", "*", "•"]:
+                    if line.startswith(prefix):
+                        line = line[len(prefix) :].strip()
+                if line:
+                    searches.append(line)
+        # Ensure we have enough searches
+        while len(searches) < num_questions:
+            # Generate combinations programmatically
+            if iteration <= 5 and entities["temporal"]:
+                # Continue with year-based searches
+                for year in entities["temporal"]:
+                    if not self._was_searched(year, questions_by_iteration):
+                        base_term = (
+                            entities["names"][0] if entities["names"] else ""
+                        )
+                        searches.append(f"{base_term} {year}".strip())
+                        if len(searches) >= num_questions:
+                            break
+            else:
+                # Combine multiple constraints
+                if entities["names"] and entities["descriptors"]:
+                    for name in entities["names"]:
+                        for desc in entities["descriptors"]:
+                            combo = f"{name} {desc}"
+                            if not self._was_searched(
+                                combo, questions_by_iteration
+                            ):
+                                searches.append(combo)
+                                if len(searches) >= num_questions:
+                                    break
+        return searches[:num_questions]
+    def _format_previous_searches(self, questions_by_iteration: dict) -> str:
+        """Format previous searches for context."""
+        formatted = []
+        for iteration, questions in questions_by_iteration.items():
+            if isinstance(questions, list):
+                formatted.extend(
+                    [f"Iteration {iteration}: {q}" for q in questions[:3]]
+                )
+        return "\n".join(formatted[-10:])  # Last 10 searches
+    def _was_searched(self, term: str, questions_by_iteration: dict) -> bool:
+        """Check if a term was already searched."""
+        term_lower = term.lower()
+        for questions in questions_by_iteration.values():
+            if isinstance(questions, list):
+                for q in questions:
+                    if term_lower in q.lower():
+                        return True
+        return False

local_deep_research/advanced_search_system/questions/decomposition_question.py CHANGED Viewed

@@ -101,7 +101,9 @@ class DecompositionQuestionGenerator(BaseQuestionGenerator):
                 if subject.lower().startswith(article):
                     subject = subject[len(article) :].strip()
-        logger.info(f"Original query: '{query}', Extracted subject: '{subject}'")
+        logger.info(
+            f"Original query: '{query}', Extracted subject: '{subject}'"
+        )
         # Create a prompt to decompose the query into sub-questions
         prompt = f"""Decompose the main research topic into 3-5 specific sub-queries that can be answered independently.
@@ -223,7 +225,9 @@ What are the security implications of X?
                     for conjunction in conjunctions:
                         if conjunction in topic_text.lower():
                             # Take only the part before the conjunction
-                            topic_text = topic_text.split(conjunction)[0].strip()
+                            topic_text = topic_text.split(conjunction)[
+                                0
+                            ].strip()
                             logger.info(
                                 f"Simplified prompt: Split compound query at '{conjunction}', extracted: '{topic_text}'"
                             )
@@ -288,7 +292,9 @@ Sub-questions:
                 )
                 return self._generate_default_questions(query)
-            logger.info(f"Generated {len(sub_queries)} sub-questions: {sub_queries}")
+            logger.info(
+                f"Generated {len(sub_queries)} sub-questions: {sub_queries}"
+            )
             return sub_queries[: self.max_subqueries]  # Limit to max_subqueries
         except Exception as e:
@@ -380,7 +386,10 @@ Sub-questions:
         )
         # Special case for CSRF - if we've extracted just "csrf" from a longer query
-        if subject.lower() == "csrf" or subject.lower() == "cross-site request forgery":
+        if (
+            subject.lower() == "csrf"
+            or subject.lower() == "cross-site request forgery"
+        ):
             # CSRF-specific questions
             default_questions = [
                 "What is Cross-Site Request Forgery (CSRF)?",

local_deep_research/advanced_search_system/questions/entity_aware_question.py ADDED Viewed

@@ -0,0 +1,184 @@
+"""
+Entity-aware question generation for improved entity identification.
+"""
+import logging
+from datetime import datetime
+from typing import List
+from .base_question import BaseQuestionGenerator
+logger = logging.getLogger(__name__)
+class EntityAwareQuestionGenerator(BaseQuestionGenerator):
+    """Question generator that creates more targeted searches for entity identification."""
+    def generate_questions(
+        self,
+        current_knowledge: str,
+        query: str,
+        questions_per_iteration: int = 2,
+        questions_by_iteration: dict = None,
+    ) -> List[str]:
+        """Generate questions with entity-aware search patterns."""
+        now = datetime.now()
+        current_time = now.strftime("%Y-%m-%d")
+        questions_by_iteration = questions_by_iteration or {}
+        logger.info("Generating entity-aware follow-up questions...")
+        # Detect if this is likely an entity identification query
+        entity_keywords = [
+            "who",
+            "what",
+            "which",
+            "identify",
+            "name",
+            "character",
+            "person",
+            "place",
+            "organization",
+            "company",
+            "author",
+            "scientist",
+            "inventor",
+            "city",
+            "country",
+            "book",
+            "movie",
+        ]
+        is_entity_query = any(
+            keyword in query.lower() for keyword in entity_keywords
+        )
+        if is_entity_query:
+            # Use more direct entity-focused prompt
+            if questions_by_iteration:
+                prompt = f"""Generate {questions_per_iteration} targeted search queries to identify the specific entity in the query.
+Query: {query}
+Today: {current_time}
+Past questions: {str(questions_by_iteration)}
+Current knowledge: {current_knowledge}
+Create direct search queries that combine the key identifying features to find the specific name/entity.
+Focus on:
+1. Combining multiple constraints in a single search
+2. Using quotation marks for exact phrases
+3. Including specific details that narrow down results
+Format: One question per line, e.g.
+Q: "fictional character" "breaks fourth wall" "TV show" 1960s 1980s
+Q: character name ascetics humor television fewer than 50 episodes
+"""
+            else:
+                prompt = f"""Generate {questions_per_iteration} direct search queries to identify the specific entity in: {query}
+Today: {current_time}
+Create search queries that:
+1. Combine multiple identifying features
+2. Target the specific entity name/identification
+3. Use variations of key terms
+Format: One question per line, e.g.
+Q: question1
+Q: question2
+"""
+        else:
+            # Fall back to standard question generation for non-entity queries
+            return super().generate_questions(
+                current_knowledge,
+                query,
+                questions_per_iteration,
+                questions_by_iteration,
+            )
+        response = self.model.invoke(prompt)
+        # Handle both string responses and responses with .content attribute
+        response_text = ""
+        if hasattr(response, "content"):
+            response_text = response.content
+        else:
+            response_text = str(response)
+        questions = [
+            q.replace("Q:", "").strip()
+            for q in response_text.split("\n")
+            if q.strip().startswith("Q:")
+        ][:questions_per_iteration]
+        logger.info(f"Generated {len(questions)} entity-aware questions")
+        return questions
+    def generate_sub_questions(
+        self, query: str, context: str = ""
+    ) -> List[str]:
+        """Generate sub-questions with entity focus when appropriate."""
+        # Check if this is an entity identification query
+        entity_keywords = [
+            "who",
+            "what",
+            "which",
+            "identify",
+            "name",
+            "character",
+            "person",
+            "place",
+            "organization",
+            "company",
+        ]
+        is_entity_query = any(
+            keyword in query.lower() for keyword in entity_keywords
+        )
+        if is_entity_query:
+            prompt = f"""Break down this entity identification query into targeted sub-questions.
+Original Question: {query}
+{context}
+Generate 2-5 sub-questions that will help identify the specific entity.
+Focus on:
+1. Combining constraints to narrow down results
+2. Finding the actual name/identity
+3. Verifying the entity matches all criteria
+Format your response as:
+1. First sub-question
+2. Second sub-question
+...
+Only provide the numbered sub-questions."""
+        else:
+            return super().generate_sub_questions(query, context)
+        try:
+            response = self.model.invoke(prompt)
+            content = ""
+            if hasattr(response, "content"):
+                content = response.content
+            else:
+                content = str(response)
+            # Extract numbered questions
+            questions = []
+            for line in content.strip().split("\n"):
+                line = line.strip()
+                if line and (line[0].isdigit() or line.startswith("-")):
+                    # Remove the number/bullet and clean up
+                    question = line.split(".", 1)[-1].strip()
+                    question = question.lstrip("- ").strip()
+                    if question:
+                        questions.append(question)
+            return questions
+        except Exception as e:
+            logger.error(f"Error generating sub-questions: {str(e)}")
+            return []

local_deep_research/advanced_search_system/questions/standard_question.py CHANGED Viewed

@@ -50,7 +50,7 @@ class StandardQuestionGenerator(BaseQuestionGenerator):
             response_text = str(response)
         questions = [
-            q.replace("Q:", "").strip()
+            q.replace("Q:", "").strip().strip("\"'")
             for q in response_text.split("\n")
             if q.strip().startswith("Q:")
         ][:questions_per_iteration]
@@ -59,7 +59,9 @@ class StandardQuestionGenerator(BaseQuestionGenerator):
         return questions
-    def generate_sub_questions(self, query: str, context: str = "") -> List[str]:
+    def generate_sub_questions(
+        self, query: str, context: str = ""
+    ) -> List[str]:
         """
         Generate sub-questions from a main query.
@@ -107,7 +109,11 @@ Only provide the numbered sub-questions, nothing else."""
                 line = line.strip()
                 if line and (line[0].isdigit() or line.startswith("-")):
                     # Extract sub-question from numbered or bulleted list
-                    parts = line.split(".", 1) if "." in line else line.split(" ", 1)
+                    parts = (
+                        line.split(".", 1)
+                        if "." in line
+                        else line.split(" ", 1)
+                    )
                     if len(parts) > 1:
                         sub_question = parts[1].strip()
                         sub_questions.append(sub_question)

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl