PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/advanced_search_system/strategies/smart_query_strategy.py ADDED Viewed

@@ -0,0 +1,515 @@
+"""
+Smart query generation strategy that works for any type of search target.
+"""
+import concurrent.futures
+from typing import Dict, List
+from loguru import logger
+from ..constraints.base_constraint import Constraint
+from ..constraints.constraint_analyzer import ConstraintType
+from .early_stop_constrained_strategy import EarlyStopConstrainedStrategy
+class SmartQueryStrategy(EarlyStopConstrainedStrategy):
+    """
+    Enhanced strategy with intelligent query generation that:
+    1. Analyzes constraints to identify key search terms
+    2. Uses LLM to suggest search queries based on constraint meaning
+    3. Generates multiple query variations for better coverage
+    """
+    def __init__(
+        self,
+        *args,
+        use_llm_query_generation: bool = True,
+        queries_per_combination: int = 3,
+        use_entity_seeding: bool = True,
+        use_direct_property_search: bool = True,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.use_llm_query_generation = use_llm_query_generation
+        self.queries_per_combination = queries_per_combination
+        self.use_entity_seeding = use_entity_seeding
+        self.use_direct_property_search = use_direct_property_search
+        # Track queries to avoid duplicates
+        self.searched_queries = set()
+        self.query_variations = set()
+        # Store entity seeds for targeted search
+        self.entity_seeds = []
+    def _build_query(self, constraints: List[Constraint]) -> str:
+        """Build intelligent queries using constraint analysis."""
+        if self.use_llm_query_generation:
+            # Use LLM to generate smart queries
+            return self._generate_smart_query(constraints)
+        else:
+            # Fallback to improved standard approach
+            return self._build_standard_query(constraints)
+    def _generate_smart_query(self, constraints: List[Constraint]) -> str:
+        """Use LLM to generate optimal search queries."""
+        constraint_text = "\n".join(
+            [
+                f"- {c.type.value}: {c.value} (weight: {c.weight})"
+                for c in constraints
+            ]
+        )
+        # Build a list of already searched queries to avoid duplication
+        searched_list = list(self.searched_queries)[:10]  # Show last 10 to LLM
+        already_searched = (
+            "\n".join([f"- {q}" for q in searched_list])
+            if searched_list
+            else "None"
+        )
+        prompt = f"""
+Analyze these search constraints and generate an optimal web search query:
+Constraints:
+{constraint_text}
+Target type: {getattr(self, "entity_type", "unknown")}
+Already searched queries (avoid these):
+{already_searched}
+Generate a single search query that would most effectively find results matching these constraints.
+The query should:
+1. Include the most identifying/unique terms
+2. Use appropriate search operators (quotes, AND, OR)
+3. Be specific enough to find relevant results but not too narrow
+4. Focus on the highest weighted constraints
+5. Be different from already searched queries
+Return only the search query, nothing else.
+"""
+        try:
+            query = self.model.invoke(prompt).content.strip()
+            # Check if this query is too similar to existing ones
+            normalized_query = query.strip().lower()
+            if normalized_query in self.searched_queries:
+                logger.info(
+                    f"LLM generated duplicate query, using fallback: {query}"
+                )
+                return self._build_standard_query(constraints)
+            logger.info(f"LLM generated query: {query}")
+            return query
+        except Exception as e:
+            logger.error(f"Failed to generate smart query: {e}")
+            return self._build_standard_query(constraints)
+    def _build_standard_query(self, constraints: List[Constraint]) -> str:
+        """Improved standard query building."""
+        # Group constraints by importance
+        critical_terms = []
+        supplementary_terms = []
+        for c in constraints:
+            term = c.value
+            # Quote multi-word terms
+            if " " in term and not term.startswith('"'):
+                term = f'"{term}"'
+            if c.weight > 0.7:
+                critical_terms.append(term)
+            else:
+                supplementary_terms.append(term)
+        # Build query with critical terms required, supplementary optional
+        query_parts = []
+        # Add entity type if known
+        entity_type = getattr(self, "entity_type", None)
+        if entity_type and entity_type != "unknown entity":
+            query_parts.append(entity_type)
+        # Add critical terms
+        if critical_terms:
+            query_parts.extend(critical_terms)
+        # Add some supplementary terms
+        if supplementary_terms:
+            query_parts.extend(
+                supplementary_terms[:2]
+            )  # Limit to avoid overly specific queries
+        return " ".join(query_parts)
+    def _execute_combination_search(self, combo) -> List:
+        """Override to generate multiple query variations per combination."""
+        all_candidates = []
+        if self.use_llm_query_generation:
+            # Generate multiple query variations
+            queries = self._generate_query_variations(combo.constraints)
+            # Execute searches in parallel
+            with concurrent.futures.ThreadPoolExecutor(
+                max_workers=self.queries_per_combination
+            ) as executor:
+                futures = []
+                for query in queries:
+                    # Check if we've already searched this query
+                    normalized_query = query.strip().lower()
+                    if normalized_query in self.searched_queries:
+                        logger.info(f"Skipping duplicate query: '{query}'")
+                        continue
+                    self.searched_queries.add(normalized_query)
+                    future = executor.submit(self._execute_search, query)
+                    futures.append((query, future))
+                for query, future in futures:
+                    try:
+                        results = future.result()
+                        candidates = self._extract_candidates_from_results(
+                            results
+                        )
+                        all_candidates.extend(candidates)
+                        logger.info(
+                            f"Query '{query}' found {len(candidates)} candidates"
+                        )
+                    except Exception as e:
+                        logger.error(f"Search failed for query '{query}': {e}")
+        else:
+            # Use single query from parent implementation
+            candidates = super()._execute_combination_search(combo)
+            all_candidates.extend(candidates)
+        return all_candidates
+    def _generate_query_variations(
+        self, constraints: List[Constraint]
+    ) -> List[str]:
+        """Generate multiple query variations for better coverage."""
+        # Handle single constraint case
+        if isinstance(constraints, Constraint):
+            constraints = [constraints]
+        constraint_text = "\n".join(
+            [f"- {c.type.value}: {c.value}" for c in constraints]
+        )
+        # Build a list of already searched queries to avoid duplication
+        searched_list = list(self.searched_queries)[:20]  # Show last 20 to LLM
+        already_searched = (
+            "\n".join([f"- {q}" for q in searched_list])
+            if searched_list
+            else "None"
+        )
+        prompt = f"""
+Generate {self.queries_per_combination} different search queries for these constraints:
+{constraint_text}
+Already searched queries (avoid these):
+{already_searched}
+Each query should:
+- Approach the search from a different angle
+- Use different search terms or operators
+- Target different aspects of the constraints
+- Be distinctly different from already searched queries
+Provide each query on a separate line.
+"""
+        try:
+            response = self.model.invoke(prompt).content
+            queries = [q.strip() for q in response.split("\n") if q.strip()]
+            # Filter out duplicates
+            unique_queries = []
+            for query in queries:
+                normalized = query.strip().lower()
+                if (
+                    normalized not in self.searched_queries
+                    and normalized not in self.query_variations
+                ):
+                    unique_queries.append(query)
+                    self.query_variations.add(normalized)
+                else:
+                    logger.info(
+                        f"Filtering out duplicate query variation: {query}"
+                    )
+            # If all queries were duplicates, generate a fallback
+            if not unique_queries:
+                fallback = self._build_standard_query(constraints)
+                if fallback.strip().lower() not in self.searched_queries:
+                    unique_queries = [fallback]
+            return unique_queries[: self.queries_per_combination]
+        except Exception as e:
+            logger.error(f"Failed to generate query variations: {e}")
+            # Fallback to single query
+            return [self._build_standard_query(constraints)]
+    def _extract_candidates_from_results(self, results: Dict) -> List:
+        """Improved candidate extraction that's more generic."""
+        candidates = []
+        content = results.get("current_knowledge", "")
+        if not content:
+            return candidates
+        # Use LLM to extract relevant entities/topics from the content
+        prompt = f"""
+From the following search results, extract all relevant entities, topics, or answers that match our search target type: {getattr(self, "entity_type", "unknown")}
+Content:
+{content}
+List each potential match on a separate line.
+Include only names/titles/identifiers, not descriptions.
+"""
+        try:
+            response = self.model.invoke(prompt).content
+            entity_names = [
+                name.strip() for name in response.split("\n") if name.strip()
+            ]
+            # Create candidates from extracted names
+            from ..candidates.base_candidate import Candidate
+            for name in entity_names:
+                if name and len(name) < 100:  # Basic validation
+                    candidate = Candidate(name=name)
+                    candidates.append(candidate)
+            logger.info(f"Extracted {len(candidates)} candidates from results")
+        except Exception as e:
+            logger.error(f"Error extracting candidates: {e}")
+        return candidates
+    def _should_use_entity_seeding(self) -> bool:
+        """Determine if entity seeding would be beneficial."""
+        entity_type = getattr(self, "entity_type", "").lower()
+        return (
+            "character" in entity_type
+            or "person" in entity_type
+            or "hero" in entity_type
+        )
+    def _perform_entity_seeding(self):
+        """Use LLM to suggest specific entity names based on constraints."""
+        logger.info("Performing entity seeding based on constraints")
+        # Extract key properties from constraints
+        key_properties = []
+        for constraint in self.constraint_ranking:
+            if constraint.weight > 0.7:  # High-weight constraints
+                key_properties.append(constraint.value)
+        if not key_properties:
+            return
+        properties_text = "\n".join([f"- {prop}" for prop in key_properties])
+        prompt = f"""
+Based on these properties, suggest 5-10 specific {self.entity_type} names that might match:
+Properties:
+{properties_text}
+For example, if looking for a scientist from the 19th century, you might suggest:
+- Charles Darwin
+- Marie Curie
+- Louis Pasteur
+- Thomas Edison
+Provide one name per line. Be specific with actual character/entity names.
+"""
+        try:
+            response = self.model.invoke(prompt).content
+            self.entity_seeds = [
+                name.strip() for name in response.split("\n") if name.strip()
+            ]
+            logger.info(f"Generated entity seeds: {self.entity_seeds}")
+            # Immediately search for these seeds
+            self._search_entity_seeds()
+        except Exception as e:
+            logger.error(f"Error generating entity seeds: {e}")
+    def _search_entity_seeds(self):
+        """Search for the entity seeds directly."""
+        if not self.entity_seeds:
+            return
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            futures = []
+            for seed in self.entity_seeds[:5]:  # Limit to top 5
+                query = f'"{seed}"'
+                if query.lower() not in self.searched_queries:
+                    self.searched_queries.add(query.lower())
+                    future = executor.submit(self._execute_search, query)
+                    futures.append((seed, future))
+            for seed, future in futures:
+                try:
+                    results = future.result()
+                    candidates = self._extract_candidates_from_results(results)
+                    # Look for exact matches
+                    for candidate in candidates:
+                        if seed.lower() in candidate.name.lower():
+                            logger.info(
+                                f"Found seeded entity: {candidate.name}"
+                            )
+                            # Evaluate immediately
+                            if hasattr(self, "_evaluate_candidate_immediately"):
+                                self._evaluate_candidate_immediately(candidate)
+                            else:
+                                # Add to candidates list
+                                if not hasattr(self, "candidates"):
+                                    self.candidates = []
+                                self.candidates.append(candidate)
+                except Exception as e:
+                    logger.error(f"Error searching for seed {seed}: {e}")
+    def _try_direct_property_search(self):
+        """Try direct searches for high-weight property constraints."""
+        property_queries = []
+        for constraint in self.constraint_ranking:
+            if (
+                constraint.weight > 0.7
+                and constraint.type == ConstraintType.PROPERTY
+            ):
+                # Create specific property-based queries
+                if (
+                    "elastic" in constraint.value.lower()
+                    or "stretch" in constraint.value.lower()
+                ):
+                    property_queries.extend(
+                        [
+                            f'"{constraint.value}" superhero character',
+                            f'characters with "{constraint.value}"',
+                            f"list of {self.entity_type} {constraint.value}",
+                        ]
+                    )
+                elif (
+                    "voice" in constraint.value.lower()
+                    or "actor" in constraint.value.lower()
+                ):
+                    property_queries.append(
+                        f"{constraint.value} {self.entity_type}"
+                    )
+        # Execute property searches
+        if property_queries:
+            logger.info(
+                f"Executing direct property searches: {property_queries}"
+            )
+            with concurrent.futures.ThreadPoolExecutor(
+                max_workers=3
+            ) as executor:
+                futures = []
+                for query in property_queries[
+                    :3
+                ]:  # Limit to avoid too many searches
+                    if query.lower() not in self.searched_queries:
+                        self.searched_queries.add(query.lower())
+                        future = executor.submit(self._execute_search, query)
+                        futures.append(future)
+                for future in futures:
+                    try:
+                        results = future.result()
+                        candidates = self._extract_candidates_from_results(
+                            results
+                        )
+                        for candidate in candidates:
+                            if hasattr(self, "_evaluate_candidate_immediately"):
+                                self._evaluate_candidate_immediately(candidate)
+                    except Exception as e:
+                        logger.error(f"Property search error: {e}")
+    def _perform_entity_name_search(self):
+        """Last resort: search for entity names directly with constraints."""
+        logger.info("Performing entity name search fallback")
+        for entity_name in self.entity_seeds[:3]:  # Top 3 seeds
+            # Combine entity name with key constraints
+            constraint_terms = []
+            for constraint in self.constraint_ranking[:2]:  # Top 2 constraints
+                if constraint.weight > 0.5:
+                    constraint_terms.append(constraint.value)
+            if constraint_terms:
+                query = f'"{entity_name}" {" ".join(constraint_terms)}'
+                if query.lower() not in self.searched_queries:
+                    logger.info(f"Trying targeted entity search: {query}")
+                    self.searched_queries.add(query.lower())
+                    try:
+                        results = self._execute_search(query)
+                        candidates = self._extract_candidates_from_results(
+                            results
+                        )
+                        for candidate in candidates:
+                            if entity_name.lower() in candidate.name.lower():
+                                logger.info(
+                                    f"Found target entity in fallback: {candidate.name}"
+                                )
+                                if hasattr(
+                                    self, "_evaluate_candidate_immediately"
+                                ):
+                                    self._evaluate_candidate_immediately(
+                                        candidate
+                                    )
+                                    # Check for early stop
+                                    if (
+                                        hasattr(self, "best_score")
+                                        and self.best_score >= 0.99
+                                    ):
+                                        return
+                    except Exception as e:
+                        logger.error(f"Entity name search error: {e}")
+    def _progressive_constraint_search(self):
+        """Override to add entity seeding and property search."""
+        # Detect entity type first
+        self.entity_type = self._detect_entity_type()
+        logger.info(f"Detected entity type: {self.entity_type}")
+        # Perform entity seeding if enabled and entity type suggests specific entities
+        if self.use_entity_seeding and self._should_use_entity_seeding():
+            self._perform_entity_seeding()
+        # Try direct property search for high-weight properties
+        if self.use_direct_property_search:
+            self._try_direct_property_search()
+        # Continue with normal progressive search
+        super()._progressive_constraint_search()
+        # If still no good results, try name-based fallback
+        if (
+            hasattr(self, "best_score")
+            and self.best_score < 0.9
+            and self.entity_seeds
+        ):
+            self._perform_entity_name_search()

local_deep_research/advanced_search_system/strategies/source_based_strategy.py CHANGED Viewed

@@ -1,18 +1,19 @@
 import concurrent.futures
-import logging
 from typing import Dict
+from loguru import logger
 from ...citation_handler import CitationHandler
 from ...config.llm_config import get_llm
 from ...config.search_config import get_search
 from ...utilities.db_utils import get_db_setting
+from ...utilities.threading_utils import thread_context, thread_with_app_context
 from ..filters.cross_engine_filter import CrossEngineFilter
 from ..findings.repository import FindingsRepository
+from ..questions.atomic_fact_question import AtomicFactQuestionGenerator
 from ..questions.standard_question import StandardQuestionGenerator
 from .base_strategy import BaseSearchStrategy
-logger = logging.getLogger(__name__)
 class SourceBasedSearchStrategy(BaseSearchStrategy):
     """
@@ -31,6 +32,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
         filter_reindex: bool = True,
         cross_engine_max_results: int = None,
         all_links_of_system=None,
+        use_atomic_facts: bool = False,
     ):
         """Initialize with optional dependency injection for testing."""
         # Pass the links list to the parent class
@@ -61,7 +63,10 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
         self.citation_handler = citation_handler or CitationHandler(self.model)
         # Initialize components
-        self.question_generator = StandardQuestionGenerator(self.model)
+        if use_atomic_facts:
+            self.question_generator = AtomicFactQuestionGenerator(self.model)
+        else:
+            self.question_generator = StandardQuestionGenerator(self.model)
         self.findings_repository = FindingsRepository(self.model)
     def _format_search_results_as_context(self, search_results):
@@ -87,9 +92,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
         Analyze a topic using source-based search strategy.
         """
         logger.info(f"Starting source-based research on topic: {query}")
-        accumulated_search_results_across_all_iterations = (
-            []
-        )  # tracking links across iterations but not global
+        accumulated_search_results_across_all_iterations = []  # tracking links across iterations but not global
         findings = []
         total_citation_count_before_this_search = len(self.all_links_of_system)
@@ -120,10 +123,14 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
         iterations_to_run = int(iterations_to_run)
         try:
             filtered_search_results = []
-            total_citation_count_before_this_search = len(self.all_links_of_system)
+            total_citation_count_before_this_search = len(
+                self.all_links_of_system
+            )
             # Run each iteration
             for iteration in range(1, iterations_to_run + 1):
-                iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
+                iteration_progress_base = 5 + (iteration - 1) * (
+                    70 / iterations_to_run
+                )
                 self._update_progress(
                     f"Starting iteration {iteration}/{iterations_to_run}",
@@ -141,7 +148,9 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                 # For first iteration, use initial query
                 if iteration == 1:
                     # Generate questions for first iteration
-                    context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                    context = (
+                        f"""Iteration: {iteration} of {iterations_to_run}"""
+                    )
                     questions = self.question_generator.generate_questions(
                         current_knowledge=context,
                         query=query,
@@ -171,7 +180,9 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                     elif iterations_to_run == 1:
                         context = ""
                     else:
-                        context = f"""Iteration: {iteration} of {iterations_to_run}"""
+                        context = (
+                            f"""Iteration: {iteration} of {iterations_to_run}"""
+                        )
                     # Use standard question generator with search results as context
                     questions = self.question_generator.generate_questions(
                         current_knowledge=context,
@@ -199,6 +210,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                 )
                 # Function for thread pool
+                @thread_with_app_context
                 def search_question(q):
                     try:
                         result = self.search.run(q)
@@ -212,7 +224,8 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                     max_workers=len(all_questions)
                 ) as executor:
                     futures = [
-                        executor.submit(search_question, q) for q in all_questions
+                        executor.submit(search_question, thread_context(), q)
+                        for q in all_questions
                     ]
                     iteration_search_dict = {}
                     iteration_search_results = []
@@ -227,7 +240,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                         iteration_search_dict[question] = search_results
                         self._update_progress(
-                            f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                            f"Completed search {i + 1} of {len(all_questions)}: {question[:3000]}",
                             iteration_progress_base
                             + 10
                             + ((i + 1) / len(all_questions) * 30),
@@ -245,7 +258,10 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                     self._update_progress(
                         f"Filtering search results for iteration {iteration}",
                         iteration_progress_base + 45,
-                        {"phase": "cross_engine_filtering", "iteration": iteration},
+                        {
+                            "phase": "cross_engine_filtering",
+                            "iteration": iteration,
+                        },
                     )
                     existing_link_count = len(self.all_links_of_system)
@@ -301,13 +317,17 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                     80,
                     {"phase": "final_filtering"},
                 )
-                final_filtered_results = self.cross_engine_filter.filter_results(
-                    accumulated_search_results_across_all_iterations,
-                    query,
-                    reorder=True,  # Always reorder in final filtering
-                    reindex=True,  # Always reindex in final filtering
-                    max_results=int(get_db_setting("search.final_max_results") or 100),
-                    start_index=len(self.all_links_of_system),
+                final_filtered_results = (
+                    self.cross_engine_filter.filter_results(
+                        accumulated_search_results_across_all_iterations,
+                        query,
+                        reorder=True,  # Always reorder in final filtering
+                        reindex=True,  # Always reindex in final filtering
+                        max_results=int(
+                            get_db_setting("search.final_max_results") or 100
+                        ),
+                        start_index=len(self.all_links_of_system),
+                    )
                 )
                 self._update_progress(
                     f"Filtered from {len(accumulated_search_results_across_all_iterations)} to {len(final_filtered_results)} results",
@@ -341,7 +361,9 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
                 synthesized_content = final_citation_result["content"]
                 documents = final_citation_result.get("documents", [])
             else:
-                synthesized_content = "No relevant results found in final synthesis."
+                synthesized_content = (
+                    "No relevant results found in final synthesis."
+                )
                 documents = []
             # Add a final synthesis finding
@@ -363,8 +385,10 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
             )
             # Format findings
-            formatted_findings = self.findings_repository.format_findings_to_text(
-                findings, synthesized_content
+            formatted_findings = (
+                self.findings_repository.format_findings_to_text(
+                    findings, synthesized_content
+                )
             )
         except Exception as e:

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl