PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

local-deep-research 0.4.4py3-none-any.whl → 0.5.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py ADDED Viewed

@@ -0,0 +1,522 @@
+"""
+Constraint Parallel Strategy for search optimization.
+Key features:
+1. Runs separate searches for each constraint in parallel
+2. Uses entity type detection to focus all searches
+3. Collects candidates from all constraint-specific searches
+4. Evaluates candidates using the existing evaluation system
+5. Early rejection of poor candidates for efficiency
+"""
+import concurrent.futures
+import threading
+import time
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+from loguru import logger
+from ..candidates.base_candidate import Candidate
+from ..constraints.base_constraint import Constraint
+from .dual_confidence_with_rejection import DualConfidenceWithRejectionStrategy
+@dataclass
+class ConstraintSearchState:
+    """Tracks the state of parallel constraint-specific searches."""
+    all_candidates: List[Candidate] = field(default_factory=list)
+    evaluated_candidates: List[Tuple[Candidate, float]] = field(
+        default_factory=list
+    )
+    total_evaluated: int = 0
+    start_time: float = field(default_factory=time.time)
+    constraint_searches: Dict[str, List[Candidate]] = field(
+        default_factory=dict
+    )
+    candidates_lock: threading.Lock = field(default_factory=threading.Lock)
+    stop_search: threading.Event = field(default_factory=threading.Event)
+    evaluation_futures: List[concurrent.futures.Future] = field(
+        default_factory=list
+    )
+    entity_type: str = "unknown entity"
+class ConstraintParallelStrategy(DualConfidenceWithRejectionStrategy):
+    """
+    Strategy that runs parallel searches for each constraint independently.
+    Rather than combining constraints in queries, this approach:
+    1. Runs a separate search for each constraint in parallel
+    2. Uses entity type to focus all searches
+    3. Collects candidates from all constraint searches
+    4. Evaluates candidates as they're found
+    5. Identifies candidates that match multiple constraints
+    """
+    def __init__(
+        self,
+        *args,
+        # Concurrent execution settings
+        max_workers: int = 10,
+        # Candidate targets
+        min_good_candidates: int = 3,
+        target_candidates: int = 5,
+        max_candidates: int = 10,
+        # Quality thresholds
+        min_score_threshold: float = 0.65,
+        exceptional_score: float = 0.95,
+        quality_plateau_threshold: float = 0.1,
+        # Time and resource limits
+        max_search_time: float = 30.0,
+        max_evaluations: int = 30,
+        # Search behavior
+        initial_search_timeout: float = 5.0,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        # Thread pool for concurrent operations
+        self.search_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=max_workers
+        )
+        self.evaluation_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=max_workers
+        )
+        # Candidate thresholds
+        self.min_good_candidates = min_good_candidates
+        self.target_candidates = target_candidates
+        self.max_candidates = max_candidates
+        # Quality settings
+        self.min_score_threshold = min_score_threshold
+        self.exceptional_score = exceptional_score
+        self.quality_plateau_threshold = quality_plateau_threshold
+        # Resource limits
+        self.max_search_time = max_search_time
+        self.max_evaluations = max_evaluations
+        self.initial_search_timeout = initial_search_timeout
+        # Search state
+        self.state: Optional[ConstraintSearchState] = None
+    def find_relevant_information(self):
+        """Override to use parallel constraint-specific searches."""
+        # Initialize state
+        self.state = ConstraintSearchState(start_time=time.time())
+        # Detect entity type first to guide all searches
+        self.state.entity_type = self._detect_entity_type()
+        logger.info(f"Detected entity type: {self.state.entity_type}")
+        if self.progress_callback:
+            self.progress_callback(
+                f"Starting parallel searches for {self.state.entity_type}",
+                10,
+                {
+                    "phase": "entity_detection",
+                    "entity_type": self.state.entity_type,
+                },
+            )
+        # Start parallel constraint searches with concurrent evaluation
+        try:
+            self._run_parallel_constraint_searches()
+        finally:
+            # Clean up thread pools
+            self.search_executor.shutdown(wait=False)
+            self.evaluation_executor.shutdown(wait=False)
+        # Return best candidates
+        self.candidates = [
+            c
+            for c, _ in sorted(
+                self.state.evaluated_candidates,
+                key=lambda x: x[1],
+                reverse=True,
+            )[: self.max_candidates]
+        ]
+        logger.info(
+            f"Found {len(self.candidates)} candidates after parallel constraint searches"
+        )
+    def _detect_entity_type(self) -> str:
+        """Use LLM to detect what type of entity we're searching for."""
+        # Build context from constraints
+        constraint_text = "\n".join(
+            [f"- {c.value}" for c in self.constraint_ranking]
+        )
+        prompt = f"""
+        Analyze these search constraints and determine what type of entity is being searched for:
+        Constraints:
+        {constraint_text}
+        What is the primary entity type being searched for? Be specific.
+        Examples of entity types (but you can choose any appropriate type):
+        - fictional character
+        - TV show
+        - movie
+        - actor/actress
+        - historical figure
+        - company
+        - product
+        - location
+        - event
+        Respond with just the entity type.
+        """
+        try:
+            entity_type = self.model.invoke(prompt).content.strip()
+            logger.info(f"LLM determined entity type: {entity_type}")
+            return entity_type
+        except Exception as e:
+            logger.error(f"Failed to detect entity type: {e}")
+            return "unknown entity"
+    def _run_parallel_constraint_searches(self):
+        """Run separate searches for each constraint in parallel."""
+        # Submit a search for each constraint
+        search_futures = {}
+        for i, constraint in enumerate(self.constraint_ranking):
+            if self.state.stop_search.is_set():
+                break
+            logger.info(
+                f"Scheduling search for constraint {i + 1}/{len(self.constraint_ranking)}: {constraint.value}"
+            )
+            # Submit search task
+            future = self.search_executor.submit(
+                self._run_constraint_search,
+                constraint,
+                i,
+                len(self.constraint_ranking),
+            )
+            search_futures[future] = constraint
+        # Process results as they complete
+        for future in concurrent.futures.as_completed(search_futures):
+            if self.state.stop_search.is_set():
+                break
+            constraint = search_futures[future]
+            try:
+                candidates = future.result()
+                # Store results by constraint
+                with self.state.candidates_lock:
+                    self.state.constraint_searches[constraint.id] = candidates
+                    # Add to overall candidate pool
+                    self.state.all_candidates.extend(candidates)
+                logger.info(
+                    f"Constraint '{constraint.value[:30]}...' found {len(candidates)} candidates"
+                )
+                # Submit candidates for evaluation
+                self._submit_candidates_for_evaluation(candidates)
+            except Exception as e:
+                logger.error(
+                    f"Search failed for constraint {constraint.value}: {e}"
+                )
+        # Wait for evaluations to complete
+        self._finalize_evaluations()
+    def _run_constraint_search(
+        self, constraint: Constraint, index: int, total: int
+    ) -> List[Candidate]:
+        """Execute search for a specific constraint."""
+        try:
+            # Build a query combining entity type and constraint
+            query = self._build_constraint_query(constraint)
+            if self.progress_callback:
+                self.progress_callback(
+                    f"Searching for constraint {index + 1}/{total}: {constraint.value[:30]}...",
+                    20 + int(30 * (index / total)),
+                    {
+                        "phase": "constraint_search",
+                        "constraint_index": index,
+                        "constraint_total": total,
+                        "constraint_value": constraint.value,
+                    },
+                )
+            # Execute search
+            search_results = self._execute_search(query)
+            # Extract candidates
+            candidates = self._extract_relevant_candidates(
+                search_results, constraint
+            )
+            logger.info(
+                f"Found {len(candidates)} candidates for constraint: {constraint.value[:30]}..."
+            )
+            return candidates
+        except Exception as e:
+            logger.error(f"Error in constraint search: {e}", exc_info=True)
+            return []
+    def _build_constraint_query(self, constraint: Constraint) -> str:
+        """Build a query combining entity type and constraint."""
+        # Get entity type
+        entity_type = self.state.entity_type
+        query_parts = []
+        # Always include entity type if known
+        if entity_type and entity_type != "unknown entity":
+            # Add entity type as a search term
+            if " " in entity_type and not entity_type.startswith('"'):
+                query_parts.append(f'"{entity_type}"')
+            else:
+                query_parts.append(entity_type)
+        # Add constraint
+        value = constraint.value
+        if " " in value and not value.startswith('"'):
+            query_parts.append(f'"{value}"')
+        else:
+            query_parts.append(value)
+        # Convert constraint to search-friendly terms based on type
+        search_terms = constraint.to_search_terms()
+        if search_terms and search_terms != value:
+            query_parts.append(search_terms)
+        return " ".join(query_parts)
+    def _submit_candidates_for_evaluation(self, candidates: List[Candidate]):
+        """Submit candidates for concurrent evaluation."""
+        for candidate in candidates:
+            if self.state.stop_search.is_set():
+                break
+            if self.state.total_evaluated >= self.max_evaluations:
+                logger.info("Reached maximum evaluations limit")
+                self.state.stop_search.set()
+                break
+            # Check if we already evaluated this candidate
+            if self._is_candidate_evaluated(candidate):
+                continue
+            # Submit for evaluation
+            future = self.evaluation_executor.submit(
+                self._evaluate_candidate_thread, candidate
+            )
+            self.state.evaluation_futures.append(future)
+            self.state.total_evaluated += 1
+    def _evaluate_candidate_thread(
+        self, candidate: Candidate
+    ) -> Tuple[Candidate, float]:
+        """Evaluate a candidate in a separate thread."""
+        try:
+            thread_name = threading.current_thread().name
+            logger.info(
+                f"[{thread_name}] Starting evaluation of {candidate.name}"
+            )
+            # FIRST CHECK: Verify that candidate matches the expected entity type
+            entity_match_score = self._verify_entity_type_match(candidate)
+            if entity_match_score < 0.5:  # Threshold for entity type match
+                logger.info(
+                    f"[{thread_name}] ❌ {candidate.name} rejected - Not a {self.state.entity_type} (score: {entity_match_score:.3f})"
+                )
+                return (candidate, 0.0)
+            # Continue with parent's evaluation with early rejection
+            score = self._evaluate_candidate_immediately(candidate)
+            # Log result
+            if score >= self.min_score_threshold:
+                logger.info(
+                    f"[{thread_name}] ✓ {candidate.name} passed (score: {score:.3f})"
+                )
+                # Add to good candidates
+                with self.state.candidates_lock:
+                    self.state.evaluated_candidates.append((candidate, score))
+                    # Check if we should stop
+                    if self._should_stop_search():
+                        logger.info("Stopping criteria met after evaluation")
+                        self.state.stop_search.set()
+            else:
+                logger.info(
+                    f"[{thread_name}] ❌ {candidate.name} rejected (score: {score:.3f})"
+                )
+            return (candidate, score)
+        except Exception as e:
+            logger.error(
+                f"Error evaluating {candidate.name}: {e}", exc_info=True
+            )
+            return (candidate, 0.0)
+    def _verify_entity_type_match(self, candidate: Candidate) -> float:
+        """Verify that the candidate matches the expected entity type.
+        Returns:
+            float: Score between 0.0 and 1.0 indicating confidence that candidate matches the entity type
+        """
+        entity_type = self.state.entity_type
+        candidate_name = candidate.name
+        # Skip check if entity type is unknown
+        if not entity_type or entity_type == "unknown entity":
+            return 1.0
+        # Use LLM to verify entity type match
+        try:
+            prompt = f"""
+            Determine whether "{candidate_name}" is a specific {entity_type} or a general category/collection.
+            Rules:
+            1. A specific {entity_type} refers to a single, identifiable instance (e.g., "Mount Rainier" is a specific mountain)
+            2. A general category refers to a group or collection (e.g., "U.S. national parks" is a category, not a specific location)
+            3. Be strict - answer must be a single, concrete {entity_type}
+            Return ONLY a score from 0.0 to 1.0 where:
+            - 1.0 = Definitely a specific {entity_type}
+            - 0.5 = Unclear or partially matches
+            - 0.0 = Definitely NOT a specific {entity_type} (too general or wrong type)
+            Score:
+            """
+            response = self.model.invoke(prompt).content.strip()
+            # Extract numeric score from response
+            try:
+                score = float(response.split()[0].strip())
+                # Ensure score is in valid range
+                score = max(0.0, min(score, 1.0))
+                logger.info(
+                    f"Entity type check for {candidate_name}: {score:.2f} (entity type: {entity_type})"
+                )
+                return score
+            except (ValueError, IndexError):
+                logger.warning(
+                    f"Could not parse entity type score from: {response}"
+                )
+                return 0.5  # Default to middle value on parsing error
+        except Exception as e:
+            logger.error(
+                f"Error verifying entity type for {candidate_name}: {e}"
+            )
+            return 0.5  # Default to middle value on error
+    def _is_candidate_evaluated(self, candidate: Candidate) -> bool:
+        """Check if we already evaluated this candidate."""
+        with self.state.candidates_lock:
+            return any(
+                c.name == candidate.name
+                for c, _ in self.state.evaluated_candidates
+            )
+    def _should_stop_search(self) -> bool:
+        """Determine if we should stop searching based on multiple criteria."""
+        # Always respect the stop flag
+        if self.state.stop_search.is_set():
+            return True
+        num_good = len(self.state.evaluated_candidates)
+        # 1. Maximum candidates reached
+        if num_good >= self.max_candidates:
+            logger.info(f"Maximum candidates reached ({self.max_candidates})")
+            return True
+        # 2. Target reached with good quality
+        if num_good >= self.target_candidates:
+            avg_score = (
+                sum(s for _, s in self.state.evaluated_candidates) / num_good
+            )
+            if avg_score >= 0.8:
+                logger.info(
+                    f"Target reached with high quality (avg: {avg_score:.3f})"
+                )
+                return True
+        # 3. Minimum satisfied with exceptional candidates
+        if num_good >= self.min_good_candidates:
+            top_score = max(s for _, s in self.state.evaluated_candidates)
+            if top_score >= self.exceptional_score:
+                logger.info(
+                    f"Exceptional candidate found (score: {top_score:.3f})"
+                )
+                return True
+        # 4. Time limit reached
+        elapsed = time.time() - self.state.start_time
+        if elapsed > self.max_search_time:
+            logger.info(f"Time limit reached ({elapsed:.1f}s)")
+            return True
+        # 5. Too many evaluations
+        if self.state.total_evaluated >= self.max_evaluations:
+            logger.info(f"Evaluation limit reached ({self.max_evaluations})")
+            return True
+        # 6. Quality plateau detection
+        if num_good >= 5:
+            recent_scores = [s for _, s in self.state.evaluated_candidates[-5:]]
+            score_range = max(recent_scores) - min(recent_scores)
+            if score_range < self.quality_plateau_threshold:
+                logger.info(
+                    f"Quality plateau detected (range: {score_range:.3f})"
+                )
+                return True
+        return False
+    def _finalize_evaluations(self):
+        """Wait for or cancel remaining evaluations."""
+        if self.state.evaluation_futures:
+            logger.info(
+                f"Finalizing {len(self.state.evaluation_futures)} remaining evaluations"
+            )
+            # Give them a short time to complete
+            wait_time = min(
+                5.0,
+                self.max_search_time - (time.time() - self.state.start_time),
+            )
+            if wait_time > 0:
+                concurrent.futures.wait(
+                    self.state.evaluation_futures,
+                    timeout=wait_time,
+                    return_when=concurrent.futures.FIRST_COMPLETED,
+                )
+            # Cancel any still running
+            for future in self.state.evaluation_futures:
+                if not future.done():
+                    future.cancel()
+        # Final report
+        logger.info(
+            f"""
+Search completed:
+- Total evaluated: {self.state.total_evaluated}
+- Good candidates found: {len(self.state.evaluated_candidates)}
+- Time taken: {time.time() - self.state.start_time:.1f}s
+- Constraint searches: {len(self.state.constraint_searches)}
+"""
+        )

local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

local-deep-research 0.4.4py3-none-any.whl → 0.5.2py3-none-any.whl