PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

local-deep-research 0.4.4py3-none-any.whl → 0.5.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py ADDED Viewed

@@ -0,0 +1,405 @@
+"""
+Adaptive query generation system for improved search performance.
+"""
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set, Tuple
+from langchain_core.language_models import BaseChatModel
+from ...utilities.search_utilities import remove_think_tags
+from ..constraints.base_constraint import Constraint, ConstraintType
+@dataclass
+class QueryPattern:
+    """Represents a successful query pattern."""
+    template: str
+    constraint_types: List[ConstraintType]
+    success_rate: float
+    example_queries: List[str]
+    discovered_entities: Set[str]
+class AdaptiveQueryGenerator:
+    """
+    Generates search queries that adapt based on past performance.
+    Features:
+    1. Pattern learning from successful queries
+    2. Semantic expansion for broader coverage
+    3. Constraint combination optimization
+    4. Failure recovery strategies
+    """
+    def __init__(self, model: BaseChatModel):
+        """Initialize the adaptive query generator."""
+        self.model = model
+        self.successful_patterns: List[QueryPattern] = []
+        self.failed_queries: Set[str] = set()
+        self.semantic_expansions: Dict[str, List[str]] = {}
+        self.constraint_combinations: Dict[
+            Tuple[ConstraintType, ...], float
+        ] = defaultdict(float)
+        # Initialize default patterns
+        self._initialize_default_patterns()
+    def _initialize_default_patterns(self):
+        """Initialize with proven query patterns."""
+        default_patterns = [
+            QueryPattern(
+                template='"{entity}" {property} {location}',
+                constraint_types=[
+                    ConstraintType.NAME_PATTERN,
+                    ConstraintType.PROPERTY,
+                    ConstraintType.LOCATION,
+                ],
+                success_rate=0.7,
+                example_queries=['"mountain" formed ice age Colorado'],
+                discovered_entities=set(),
+            ),
+            QueryPattern(
+                template="{event} {temporal} {statistic}",
+                constraint_types=[
+                    ConstraintType.EVENT,
+                    ConstraintType.TEMPORAL,
+                    ConstraintType.STATISTIC,
+                ],
+                success_rate=0.6,
+                example_queries=["accident 2000-2021 statistics"],
+                discovered_entities=set(),
+            ),
+            QueryPattern(
+                template='"{name_pattern}" AND {comparison} {value}',
+                constraint_types=[
+                    ConstraintType.NAME_PATTERN,
+                    ConstraintType.COMPARISON,
+                ],
+                success_rate=0.65,
+                example_queries=['"body part" AND "84.5 times" ratio'],
+                discovered_entities=set(),
+            ),
+        ]
+        self.successful_patterns.extend(default_patterns)
+    def generate_query(
+        self, constraints: List[Constraint], context: Optional[Dict] = None
+    ) -> str:
+        """Generate an adaptive query based on constraints and context."""
+        # Try pattern-based generation first
+        pattern_query = self._generate_from_patterns(constraints)
+        if pattern_query and pattern_query not in self.failed_queries:
+            return pattern_query
+        # Try semantic expansion
+        expanded_query = self._generate_with_expansion(constraints)
+        if expanded_query and expanded_query not in self.failed_queries:
+            return expanded_query
+        # Fall back to LLM-based generation
+        return self._generate_with_llm(constraints, context)
+    def _generate_from_patterns(
+        self, constraints: List[Constraint]
+    ) -> Optional[str]:
+        """Generate query using learned patterns."""
+        constraint_types = [c.type for c in constraints]
+        # Find matching patterns
+        matching_patterns = []
+        for pattern in self.successful_patterns:
+            if all(t in constraint_types for t in pattern.constraint_types):
+                matching_patterns.append(pattern)
+        if not matching_patterns:
+            return None
+        # Use the highest success rate pattern
+        best_pattern = max(matching_patterns, key=lambda p: p.success_rate)
+        # Fill in the template
+        template_vars = {}
+        for constraint in constraints:
+            if constraint.type == ConstraintType.NAME_PATTERN:
+                template_vars["name_pattern"] = constraint.value
+                template_vars["entity"] = constraint.value
+            elif constraint.type == ConstraintType.PROPERTY:
+                template_vars["property"] = constraint.value
+            elif constraint.type == ConstraintType.LOCATION:
+                template_vars["location"] = constraint.value
+            elif constraint.type == ConstraintType.EVENT:
+                template_vars["event"] = constraint.value
+            elif constraint.type == ConstraintType.TEMPORAL:
+                template_vars["temporal"] = constraint.value
+            elif constraint.type == ConstraintType.STATISTIC:
+                template_vars["statistic"] = constraint.value
+            elif constraint.type == ConstraintType.COMPARISON:
+                template_vars["comparison"] = f'"{constraint.value}"'
+                template_vars["value"] = constraint.value
+        try:
+            query = best_pattern.template.format(**template_vars)
+            return query
+        except KeyError:
+            return None
+    def _generate_with_expansion(
+        self, constraints: List[Constraint]
+    ) -> Optional[str]:
+        """Generate query with semantic expansion."""
+        expanded_terms = []
+        for constraint in constraints:
+            # Get expansions for this value
+            if constraint.value not in self.semantic_expansions:
+                self.semantic_expansions[constraint.value] = (
+                    self._get_semantic_expansions(
+                        constraint.value, constraint.type
+                    )
+                )
+            expansions = self.semantic_expansions[constraint.value]
+            if expansions:
+                # Use OR to include expansions
+                expanded = (
+                    f"({constraint.value} OR {' OR '.join(expansions[:2])})"
+                )
+                expanded_terms.append(expanded)
+            else:
+                expanded_terms.append(f'"{constraint.value}"')
+        return " AND ".join(expanded_terms)
+    def _get_semantic_expansions(
+        self, term: str, constraint_type: ConstraintType
+    ) -> List[str]:
+        """Get semantic expansions for a term."""
+        prompt = f"""
+Generate 3 alternative phrases or related terms for "{term}" in the context of {constraint_type.value}.
+These should be:
+1. Synonyms or near-synonyms
+2. Related concepts
+3. Alternative phrasings
+Return only the terms, one per line.
+"""
+        response = self.model.invoke(prompt)
+        expansions = [
+            line.strip()
+            for line in remove_think_tags(response.content).strip().split("\n")
+            if line.strip()
+        ]
+        return [f'"{exp}"' for exp in expansions[:3]]
+    def _generate_with_llm(
+        self, constraints: List[Constraint], context: Optional[Dict] = None
+    ) -> str:
+        """Generate query using LLM with context awareness."""
+        constraint_desc = self._format_constraints(constraints)
+        context_info = ""
+        if context:
+            if "failed_queries" in context:
+                context_info += "\nFailed queries to avoid:\n" + "\n".join(
+                    context["failed_queries"][:3]
+                )
+            if "successful_queries" in context:
+                context_info += "\nSuccessful query patterns:\n" + "\n".join(
+                    context["successful_queries"][:3]
+                )
+        prompt = f"""
+Create an effective search query for these constraints:
+{constraint_desc}
+{context_info}
+Guidelines:
+1. Focus on finding specific named entities
+2. Use operators (AND, OR, quotes) effectively
+3. Combine constraints strategically
+4. Make the query neither too broad nor too narrow
+Return only the search query.
+"""
+        response = self.model.invoke(prompt)
+        return remove_think_tags(response.content).strip()
+    def update_patterns(
+        self,
+        query: str,
+        constraints: List[Constraint],
+        success: bool,
+        entities_found: List[str],
+    ):
+        """Update patterns based on query performance."""
+        if success and entities_found:
+            # Extract pattern from successful query
+            pattern = self._extract_pattern(query, constraints)
+            if pattern:
+                # Update or add pattern
+                existing = next(
+                    (
+                        p
+                        for p in self.successful_patterns
+                        if p.template == pattern.template
+                    ),
+                    None,
+                )
+                if existing:
+                    existing.success_rate = (existing.success_rate + 1.0) / 2
+                    existing.example_queries.append(query)
+                    existing.discovered_entities.update(entities_found)
+                else:
+                    self.successful_patterns.append(pattern)
+            # Update constraint combinations
+            constraint_types = tuple(sorted(c.type for c in constraints))
+            self.constraint_combinations[constraint_types] += 1
+        else:
+            self.failed_queries.add(query)
+    def _extract_pattern(
+        self, query: str, constraints: List[Constraint]
+    ) -> Optional[QueryPattern]:
+        """Extract a reusable pattern from a successful query."""
+        # Simple pattern extraction - could be made more sophisticated
+        pattern = query
+        # Replace specific values with placeholders
+        for constraint in constraints:
+            if constraint.value in query:
+                placeholder = f"{{{constraint.type.value}}}"
+                pattern = pattern.replace(constraint.value, placeholder)
+        # Only create pattern if it has placeholders
+        if "{" in pattern:
+            return QueryPattern(
+                template=pattern,
+                constraint_types=[c.type for c in constraints],
+                success_rate=1.0,
+                example_queries=[query],
+                discovered_entities=set(),
+            )
+        return None
+    def _format_constraints(self, constraints: List[Constraint]) -> str:
+        """Format constraints for prompts."""
+        formatted = []
+        for c in constraints:
+            formatted.append(
+                f"- {c.type.value}: {c.description} [value: {c.value}]"
+            )
+        return "\n".join(formatted)
+    def generate_fallback_queries(
+        self, original_query: str, constraints: List[Constraint]
+    ) -> List[str]:
+        """Generate fallback queries when the original fails."""
+        fallback_queries = []
+        # 1. Simplified query (fewer constraints)
+        if len(constraints) > 2:
+            priority_constraints = sorted(
+                constraints, key=lambda c: c.weight, reverse=True
+            )[:2]
+            simplified = self.generate_query(priority_constraints)
+            fallback_queries.append(simplified)
+        # 2. Broadened query (with OR instead of AND)
+        terms = [f'"{c.value}"' for c in constraints]
+        broadened = " OR ".join(terms)
+        fallback_queries.append(broadened)
+        # 3. Decomposed queries (one constraint at a time)
+        for constraint in constraints[:3]:
+            single_query = self._generate_single_constraint_query(constraint)
+            fallback_queries.append(single_query)
+        # 4. Alternative phrasing
+        alt_prompt = f"""
+The query "{original_query}" failed. Create 2 alternative queries with different phrasing.
+Constraints to satisfy:
+{self._format_constraints(constraints)}
+Return only the queries, one per line.
+"""
+        response = self.model.invoke(alt_prompt)
+        alt_queries = [
+            line.strip()
+            for line in remove_think_tags(response.content).strip().split("\n")
+            if line.strip()
+        ]
+        fallback_queries.extend(alt_queries)
+        # Remove duplicates and failed queries
+        unique_fallbacks = []
+        for q in fallback_queries:
+            if q and q not in self.failed_queries and q not in unique_fallbacks:
+                unique_fallbacks.append(q)
+        return unique_fallbacks[:5]
+    def _generate_single_constraint_query(self, constraint: Constraint) -> str:
+        """Generate a query for a single constraint."""
+        type_specific_templates = {
+            ConstraintType.NAME_PATTERN: '"{value}" names list',
+            ConstraintType.LOCATION: '"{value}" places locations',
+            ConstraintType.EVENT: '"{value}" incidents accidents',
+            ConstraintType.PROPERTY: 'things with "{value}" property',
+            ConstraintType.STATISTIC: '"{value}" statistics data',
+            ConstraintType.TEMPORAL: "events in {value}",
+            ConstraintType.COMPARISON: '"{value}" comparison ratio',
+            ConstraintType.EXISTENCE: 'has "{value}" feature',
+        }
+        template = type_specific_templates.get(constraint.type, '"{value}"')
+        return template.format(value=constraint.value)
+    def optimize_constraint_combinations(
+        self, constraints: List[Constraint]
+    ) -> List[List[Constraint]]:
+        """Optimize constraint combinations based on past success."""
+        combinations = []
+        # Sort constraint combinations by success rate
+        sorted_combos = sorted(
+            self.constraint_combinations.items(),
+            key=lambda x: x[1],
+            reverse=True,
+        )
+        # Try successful combinations first
+        for combo_types, _ in sorted_combos:
+            matching_constraints = []
+            for ctype in combo_types:
+                matching = [c for c in constraints if c.type == ctype]
+                if matching:
+                    matching_constraints.append(matching[0])
+            if len(matching_constraints) == len(combo_types):
+                combinations.append(matching_constraints)
+        # Add individual constraints
+        combinations.extend([[c] for c in constraints])
+        # Add pairs not yet tried
+        for i in range(len(constraints)):
+            for j in range(i + 1, len(constraints)):
+                pair = [constraints[i], constraints[j]]
+                if pair not in combinations:
+                    combinations.append(pair)
+        return combinations[:10]  # Limit to top 10

local_deep_research/advanced_search_system/questions/__init__.py CHANGED Viewed

@@ -1 +1,17 @@
 # Search System Questions Package
+from .atomic_fact_question import AtomicFactQuestionGenerator
+from .base_question import BaseQuestionGenerator
+from .browsecomp_question import BrowseCompQuestionGenerator
+from .decomposition_question import DecompositionQuestionGenerator
+from .entity_aware_question import EntityAwareQuestionGenerator
+from .standard_question import StandardQuestionGenerator
+__all__ = [
+    "BaseQuestionGenerator",
+    "StandardQuestionGenerator",
+    "DecompositionQuestionGenerator",
+    "AtomicFactQuestionGenerator",
+    "EntityAwareQuestionGenerator",
+    "BrowseCompQuestionGenerator",
+]

local_deep_research/advanced_search_system/questions/atomic_fact_question.py ADDED Viewed

@@ -0,0 +1,171 @@
+"""
+Atomic fact question generator for complex queries.
+Decomposes complex queries into atomic, independently searchable facts.
+"""
+import logging
+from typing import Dict, List
+from .base_question import BaseQuestionGenerator
+logger = logging.getLogger(__name__)
+class AtomicFactQuestionGenerator(BaseQuestionGenerator):
+    """
+    Generates questions by decomposing complex queries into atomic facts.
+    This approach prevents the system from searching for documents that match
+    ALL criteria at once, instead finding facts independently and then reasoning
+    about connections.
+    """
+    def generate_questions(
+        self,
+        current_knowledge: str,
+        query: str,
+        questions_per_iteration: int = 5,
+        questions_by_iteration: Dict[int, List[str]] = None,
+    ) -> List[str]:
+        """
+        Generate atomic fact questions from a complex query.
+        Args:
+            current_knowledge: The accumulated knowledge so far
+            query: The original research query
+            questions_per_iteration: Number of questions to generate
+            questions_by_iteration: Questions generated in previous iterations
+        Returns:
+            List of atomic fact questions
+        """
+        questions_by_iteration = questions_by_iteration or {}
+        # On first iteration, decompose the query
+        if not questions_by_iteration:
+            return self._decompose_to_atomic_facts(query)
+        # On subsequent iterations, fill knowledge gaps or explore connections
+        return self._generate_gap_filling_questions(
+            query,
+            current_knowledge,
+            questions_by_iteration,
+            questions_per_iteration,
+        )
+    def _decompose_to_atomic_facts(self, query: str) -> List[str]:
+        """Decompose complex query into atomic, searchable facts."""
+        prompt = f"""Decompose this complex query into simple, atomic facts that can be searched independently.
+Query: {query}
+Break this down into individual facts that can be searched separately. Each fact should:
+1. Be about ONE thing only
+2. Be searchable on its own
+3. Not depend on other facts
+4. Use general terms (e.g., "body parts" not specific ones)
+For example, if the query is about a location with multiple criteria, create separate questions for:
+- The geographical/geological aspect
+- The naming aspect
+- The historical events
+- The statistical comparisons
+Return ONLY the questions, one per line.
+Example format:
+What locations were formed by glaciers?
+What geographic features are named after body parts?
+Where did falls occur between specific dates?
+"""
+        response = self.model.invoke(prompt)
+        # Extract response text
+        response_text = ""
+        if hasattr(response, "content"):
+            response_text = response.content
+        else:
+            response_text = str(response)
+        # Parse questions
+        questions = []
+        for line in response_text.strip().split("\n"):
+            line = line.strip()
+            if line and not line.startswith("#") and len(line) > 10:
+                # Clean up any numbering or bullets
+                for prefix in ["1.", "2.", "3.", "4.", "5.", "-", "*", "•"]:
+                    if line.startswith(prefix):
+                        line = line[len(prefix) :].strip()
+                questions.append(line)
+        logger.info(f"Decomposed query into {len(questions)} atomic facts")
+        return questions[:5]  # Limit to 5 atomic facts
+    def _generate_gap_filling_questions(
+        self,
+        original_query: str,
+        current_knowledge: str,
+        questions_by_iteration: Dict[int, List[str]],
+        questions_per_iteration: int,
+    ) -> List[str]:
+        """Generate questions to fill knowledge gaps or make connections."""
+        # Check if we have enough information to start reasoning
+        if len(questions_by_iteration) >= 3:
+            prompt = f"""Based on the accumulated knowledge, generate questions that help connect the facts or fill remaining gaps.
+Original Query: {original_query}
+Current Knowledge:
+{current_knowledge}
+Previous Questions:
+{self._format_previous_questions(questions_by_iteration)}
+Generate {questions_per_iteration} questions that:
+1. Connect different facts you've found
+2. Fill specific gaps in knowledge
+3. Search for locations that match multiple criteria
+4. Verify specific details
+Return ONLY the questions, one per line.
+"""
+        else:
+            # Still gathering basic facts
+            prompt = f"""Continue gathering atomic facts for this query.
+Original Query: {original_query}
+Previous Questions:
+{self._format_previous_questions(questions_by_iteration)}
+Current Knowledge:
+{current_knowledge}
+Generate {questions_per_iteration} more atomic fact questions that help build a complete picture.
+Focus on facts not yet explored.
+Return ONLY the questions, one per line.
+"""
+        response = self.model.invoke(prompt)
+        # Extract response text
+        response_text = ""
+        if hasattr(response, "content"):
+            response_text = response.content
+        else:
+            response_text = str(response)
+        # Parse questions
+        questions = []
+        for line in response_text.strip().split("\n"):
+            line = line.strip()
+            if line and not line.startswith("#") and len(line) > 10:
+                # Clean up any numbering or bullets
+                for prefix in ["1.", "2.", "3.", "4.", "5.", "-", "*", "•"]:
+                    if line.startswith(prefix):
+                        line = line[len(prefix) :].strip()
+                questions.append(line)
+        return questions[:questions_per_iteration]

local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

local-deep-research 0.4.4py3-none-any.whl → 0.5.2py3-none-any.whl