local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +5 -3
  149. local_deep_research/web/database/models.py +51 -2
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +51 -61
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +227 -41
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +310 -103
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.0.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1248 @@
1
+ """
2
+ Evidence-based search strategy for complex query resolution.
3
+
4
+ This strategy decomposes queries into constraints, finds candidates,
5
+ and systematically gathers evidence to score each candidate.
6
+ """
7
+
8
+ from datetime import datetime
9
+ from typing import Any, Dict, List
10
+
11
+ from langchain_core.language_models import BaseChatModel
12
+ from loguru import logger
13
+
14
+ from ...utilities.search_utilities import format_findings, remove_think_tags
15
+ from ..candidates.base_candidate import Candidate
16
+ from ..constraints.base_constraint import Constraint, ConstraintType
17
+ from ..constraints.constraint_analyzer import ConstraintAnalyzer
18
+ from ..evidence.evaluator import EvidenceEvaluator
19
+ from ..findings.repository import FindingsRepository
20
+ from .base_strategy import BaseSearchStrategy
21
+ from .source_based_strategy import SourceBasedSearchStrategy
22
+
23
+
24
+ class EvidenceBasedStrategy(BaseSearchStrategy):
25
+ """
26
+ Evidence-based strategy for solving complex queries.
27
+
28
+ Key features:
29
+ 1. Decomposes queries into verifiable constraints
30
+ 2. Finds candidates that might satisfy constraints
31
+ 3. Gathers specific evidence for each candidate-constraint pair
32
+ 4. Scores candidates based on evidence quality
33
+ 5. Progressively refines the search
34
+ """
35
+
36
def __init__(
    self,
    model: BaseChatModel,
    search: Any,
    all_links_of_system: List[str],
    max_iterations: int = 20,
    confidence_threshold: float = 0.85,
    candidate_limit: int = 10,
    evidence_threshold: float = 0.6,
    max_search_iterations: int = 2,  # For source-based sub-searches
    questions_per_iteration: int = 3,
):
    """Initialize the evidence-based strategy.

    Args:
        model: Chat model used for constraint extraction, query generation
            and evidence evaluation.
        search: Search engine instance used to execute queries.
        all_links_of_system: Shared list collecting every link the system sees.
        max_iterations: Upper bound on evidence-gathering rounds.
        confidence_threshold: Candidate score treated as a sufficient answer.
        candidate_limit: Maximum number of candidates kept in the pool.
        evidence_threshold: Minimum evidence confidence for a constraint to
            count as satisfied.
        max_search_iterations: Iteration cap for source-based sub-searches.
        questions_per_iteration: Questions generated per search iteration.
    """
    super().__init__(all_links_of_system)

    # Core collaborators.
    self.model = model
    self.search = search

    # Tuning knobs.
    self.max_iterations = max_iterations
    self.confidence_threshold = confidence_threshold
    self.candidate_limit = candidate_limit
    self.evidence_threshold = evidence_threshold
    self.max_search_iterations = max_search_iterations
    self.questions_per_iteration = questions_per_iteration

    # Enable direct search by default for performance
    self.use_direct_search = True
    logger.info(
        f"EvidenceBasedStrategy init: use_direct_search={self.use_direct_search}"
    )

    # Helper components, all sharing the same model.
    self.constraint_analyzer = ConstraintAnalyzer(model)
    self.evidence_evaluator = EvidenceEvaluator(model)
    self.findings_repository = FindingsRepository(model)

    # Mutable state tracked across the run.
    self.constraints: List[Constraint] = []
    self.candidates: List[Candidate] = []
    self.search_history: List[Dict] = []
    self.iteration: int = 0
75
+
76
def analyze_topic(self, query: str) -> Dict:
    """Analyze a topic using the evidence-based approach.

    Pipeline: extract constraints from the query, find initial candidates,
    iteratively gather evidence and score/prune candidates, then run a
    final verification pass and synthesize the answer.

    Args:
        query: The user's research question.

    Returns:
        The synthesized final result dict.
    """
    # Reset per-run state so repeated calls start clean.
    self.all_links_of_system.clear()
    self.questions_by_iteration = []
    self.findings = []
    self.iteration = 0

    # Step 1: Extract constraints from query
    if self.progress_callback:
        self.progress_callback(
            "Analyzing query to extract constraints...",
            2,
            {
                "phase": "constraint_analysis",
                "status": "starting",
                "query_length": len(query),
            },
        )

    self.constraints = self.constraint_analyzer.extract_constraints(query)

    if self.progress_callback:
        # Tally constraints per type; dict.get() replaces the manual
        # "if key not in dict" initialization dance.
        constraint_summary: Dict[str, int] = {}
        for c in self.constraints:
            type_name = c.type.name
            constraint_summary[type_name] = (
                constraint_summary.get(type_name, 0) + 1
            )

        # Computed once and reused for both the message and the payload
        # (the original evaluated this filter twice).
        high_priority = len(
            [c for c in self.constraints if c.weight >= 0.9]
        )

        self.progress_callback(
            f"Extracted {len(self.constraints)} constraints ({high_priority} critical)",
            5,
            {
                "phase": "constraint_extraction",
                "constraints_count": len(self.constraints),
                "constraint_types": constraint_summary,
                "high_priority": high_priority,
            },
        )

    # Record the initial analysis as a finding.
    initial_finding = {
        "phase": "Initial Analysis",
        "content": self._format_initial_analysis(query),
        "timestamp": self._get_timestamp(),
    }
    self.findings.append(initial_finding)

    # Step 2: Find initial candidates
    self._find_initial_candidates()

    # Step 3: Main evidence-gathering loop
    while (
        self.iteration < self.max_iterations
        and not self._has_sufficient_answer()
    ):
        self.iteration += 1

        if self.progress_callback:
            progress = 15 + int((self.iteration / self.max_iterations) * 70)

            # Detailed metrics for the progress UI.
            evidence_coverage = self._calculate_evidence_coverage()
            top_score = self.candidates[0].score if self.candidates else 0

            # How many constraints the current best candidate satisfies.
            satisfied_constraints = 0
            if self.candidates:
                top_candidate = self.candidates[0]
                satisfied_constraints = len(
                    [
                        c
                        for c in self.constraints
                        if c.id in top_candidate.evidence
                        and top_candidate.evidence[c.id].confidence
                        >= self.evidence_threshold
                    ]
                )

            confidence_level = (
                "HIGH"
                if top_score >= self.confidence_threshold
                else "MEDIUM"
                if top_score >= 0.6
                else "LOW"
            )
            self.progress_callback(
                f"Iteration {self.iteration}/{self.max_iterations} - {self._get_iteration_status()} [{confidence_level}]",
                progress,
                {
                    "phase": "iteration_start",
                    "iteration": self.iteration,
                    "max_iterations": self.max_iterations,
                    "candidates_count": len(self.candidates),
                    "evidence_coverage": f"{evidence_coverage:.0%}",
                    "top_score": f"{top_score:.0%}",
                    "status": self._get_iteration_status(),
                    "constraints_satisfied": f"{satisfied_constraints}/{len(self.constraints)}",
                    "search_count": len(self.search_history),
                    "confidence_level": confidence_level,
                },
            )

        # Gather evidence for each candidate
        self._gather_evidence_round()

        # Score and prune candidates
        self._score_and_prune_candidates()

        # Record a per-iteration summary finding.
        iteration_finding = {
            "phase": f"Iteration {self.iteration}",
            "content": self._format_iteration_summary(),
            "timestamp": self._get_timestamp(),
            "metadata": {
                "candidates": len(self.candidates),
                "evidence_collected": sum(
                    len(c.evidence) for c in self.candidates
                ),
                "top_score": self.candidates[0].score
                if self.candidates
                else 0,
            },
        }
        self.findings.append(iteration_finding)

        # Replenish the candidate pool when it runs low.
        if len(self.candidates) < 3:
            if self.iteration <= 2:
                # Early iterations - try different search strategies
                self._find_initial_candidates()
            elif self.iteration < self.max_iterations / 2:
                # Mid iterations - look for additional candidates
                self._find_additional_candidates()

    # Step 4: Final verification of top candidates
    self._final_verification()

    # Step 5: Generate final answer
    final_result = self._synthesize_final_answer(query)

    if self.progress_callback:
        self.progress_callback(
            f"Analysis complete - evaluated {len(self.candidates)} candidates",
            100,
            {
                "phase": "complete",
                "strategy": "evidence_based",
                "total_iterations": self.iteration,
                "final_candidates": len(self.candidates),
            },
        )

    return final_result
233
+
234
def _find_initial_candidates(self):
    """Find initial candidates based on key constraints.

    Fix vs. original: this method is re-invoked by analyze_topic when the
    candidate pool runs low; previously new results were not deduplicated
    against candidates already in ``self.candidates``, and the pool could
    grow by up to ``candidate_limit`` on *every* call. Now the dedup set is
    seeded with existing names and the pool is capped at
    ``self.candidate_limit`` overall.
    """
    # Try multiple search strategies to find candidates
    all_candidates = []

    # Strategy 1: Use the most distinctive constraints
    distinctive_constraints = self._get_distinctive_constraints()

    if self.progress_callback:
        self.progress_callback(
            f"Prioritizing {len(distinctive_constraints)} key constraints from {len(self.constraints)} total",
            7,
            {
                "phase": "constraint_prioritization",
                "total_constraints": len(self.constraints),
                "selected_constraints": len(distinctive_constraints),
                "key_constraint_types": [
                    c.type.value for c in distinctive_constraints
                ],
            },
        )

    # Try first query with distinctive constraints
    search_query = self._create_candidate_search_query(
        distinctive_constraints
    )

    if self.progress_callback:
        self.progress_callback(
            f"Creating search query: {search_query[:50]}...",
            8,
            {
                "phase": "query_generation",
                "search_query": (
                    search_query[:100] + "..."
                    if len(search_query) > 100
                    else search_query
                ),
                "query_length": len(search_query),
                "constraint_count": len(distinctive_constraints),
            },
        )

    if self.progress_callback:
        self.progress_callback(
            f"Searching for candidates using {type(self.search).__name__ if self.search else 'Unknown'}",
            9,
            {
                "phase": "candidate_search",
                "status": "searching",
                "search_engine": (
                    type(self.search).__name__ if self.search else "Unknown"
                ),
            },
        )

    results = self._execute_search(search_query)
    candidates = self._extract_candidates_from_results(
        results, search_query
    )
    all_candidates.extend(candidates)

    # If no candidates found, try a different approach
    if not all_candidates:
        if self.progress_callback:
            self.progress_callback(
                "Primary search found 0 candidates - trying alternative search strategies",
                10,
                {"phase": "alternative_search", "status": "searching"},
            )

        # Strategy 2: Focus on name pattern constraints if available
        pattern_constraints = [
            c
            for c in self.constraints
            if c.type == ConstraintType.NAME_PATTERN
        ]
        location_constraints = [
            c for c in self.constraints if c.type == ConstraintType.LOCATION
        ]

        if pattern_constraints or location_constraints:
            combined_constraints = (
                pattern_constraints + location_constraints
            )[:3]
            search_query = self._create_candidate_search_query(
                combined_constraints
            )
            results = self._execute_search(search_query)
            candidates = self._extract_candidates_from_results(
                results, search_query
            )
            all_candidates.extend(candidates)

    # Deduplicate case-insensitively, against both this batch and any
    # candidates already in the pool from a previous call.
    seen = {c.name.lower() for c in self.candidates}
    unique_candidates = []
    for c in all_candidates:
        key = c.name.lower()
        if key not in seen:
            seen.add(key)
            unique_candidates.append(c)

    # Cap the overall pool size instead of growing it on each call.
    remaining_slots = max(0, self.candidate_limit - len(self.candidates))
    self.candidates.extend(unique_candidates[:remaining_slots])

    if self.progress_callback:
        status_msg = (
            f"Found {len(self.candidates)} candidates"
            if self.candidates
            else "No candidates found - will retry in next iteration"
        )
        self.progress_callback(
            status_msg,
            15,
            {
                "phase": "candidates_found",
                "count": len(self.candidates),
                "candidates": [
                    {"name": c.name, "initial_score": 0}
                    for c in self.candidates[:5]
                ],
                "status": (
                    "ready_for_evidence_gathering"
                    if self.candidates
                    else "no_candidates_found"
                ),
            },
        )

    logger.info(f"Found {len(self.candidates)} initial candidates")
363
+
364
def _gather_evidence_round(self):
    """Run one round of evidence gathering over the top candidates.

    For each of (up to) the top five candidates, picks the highest-weight
    constraint that still lacks evidence, asks the LLM for a targeted
    verification query, executes the search, and attaches the extracted
    evidence to the candidate. Progress callbacks report each step.
    """
    collected = 0
    batch_size = min(5, len(self.candidates))

    if self.progress_callback:
        if batch_size == 0:
            start_msg = (
                "No candidates to process - skipping evidence gathering"
            )
        else:
            start_msg = f"Gathering evidence for {batch_size} candidates x {len(self.constraints)} constraints"
        self.progress_callback(
            start_msg,
            None,
            {
                "phase": "evidence_round_start",
                "candidates_to_process": batch_size,
                "iteration": self.iteration,
                "total_evidence_needed": batch_size * len(self.constraints),
            },
        )

    # Focus effort on the current top candidates only.
    for rank, candidate in enumerate(self.candidates[:batch_size]):
        pending = candidate.get_unverified_constraints(self.constraints)

        if not pending:
            if self.progress_callback:
                self.progress_callback(
                    f"All constraints verified for {candidate.name}",
                    None,
                    {
                        "phase": "candidate_complete",
                        "candidate": candidate.name,
                        "evidence_count": len(candidate.evidence),
                    },
                )
            continue

        # Verify the heaviest-weighted unverified constraint first.
        constraint = max(pending, key=lambda c: c.weight)

        if self.progress_callback:
            have = sum(
                len(c.evidence) for c in self.candidates[:batch_size]
            )
            want = batch_size * len(self.constraints)
            pct = (have / want * 100) if want > 0 else 0

            self.progress_callback(
                f"Processing {candidate.name} [{rank + 1}/{batch_size}] - verifying {constraint.type.value}",
                None,
                {
                    "phase": "evidence_search",
                    "candidate": candidate.name,
                    "constraint": constraint.description,
                    "constraint_type": constraint.type.value,
                    "constraint_weight": constraint.weight,
                    "candidate_rank": rank + 1,
                    "evidence_progress": f"{pct:.0f}%",
                    "unverified_count": len(pending),
                },
            )

        # Ask the model for a targeted verification query.
        evidence_query_prompt = f"""Create a search query to verify if "{candidate.name}" satisfies this constraint:
Constraint: {constraint.description}
Type: {constraint.type.value}

Create a specific search query that would find evidence about whether this candidate meets this constraint.
Return only the search query, no explanation."""

        query_response = self.model.invoke(evidence_query_prompt)
        search_query = remove_think_tags(query_response.content).strip()

        # Fall back to a simple concatenated query when the LLM output is
        # empty or implausibly short.
        if not search_query or len(search_query) < 5:
            search_query = (
                f"{candidate.name} {constraint.to_search_terms()}"
            )

        results = self._execute_search(search_query)

        # Turn the raw search output into structured evidence.
        evidence = self.evidence_evaluator.extract_evidence(
            results.get("current_knowledge", ""), candidate.name, constraint
        )

        candidate.add_evidence(constraint.id, evidence)
        collected += 1

        if self.progress_callback:
            self.progress_callback(
                f"Evidence found: {evidence.confidence:.0%} confidence",
                None,
                {
                    "phase": "evidence_found",
                    "candidate": candidate.name,
                    "constraint": constraint.description,
                    "confidence": evidence.confidence,
                    "evidence_type": evidence.type.value,
                    "evidence_claim": (
                        evidence.claim[:100] + "..."
                        if len(evidence.claim) > 100
                        else evidence.claim
                    ),
                    "constraint_satisfied": evidence.confidence
                    >= self.evidence_threshold,
                },
            )

        logger.info(
            f"Added evidence for {candidate.name} - {constraint.id}: "
            f"{evidence.confidence:.2f} confidence"
        )

    if self.progress_callback and collected > 0:
        self.progress_callback(
            f"Gathered {collected} pieces of evidence",
            None,
            {
                "phase": "evidence_round_complete",
                "evidence_count": collected,
            },
        )
496
+ def _score_and_prune_candidates(self):
497
+ """Score candidates and remove low-scoring ones."""
498
+ if self.progress_callback:
499
+ self.progress_callback(
500
+ "Scoring candidates based on evidence",
501
+ None,
502
+ {
503
+ "phase": "scoring_start",
504
+ "candidate_count": len(self.candidates),
505
+ },
506
+ )
507
+
508
+ for i, candidate in enumerate(self.candidates):
509
+ old_score = candidate.score
510
+ candidate.calculate_score(self.constraints)
511
+
512
+ if self.progress_callback and i < 5: # Report top 5
513
+ self.progress_callback(
514
+ f"Scored {candidate.name}",
515
+ None,
516
+ {
517
+ "phase": "candidate_scored",
518
+ "candidate": candidate.name,
519
+ "old_score": old_score,
520
+ "new_score": candidate.score,
521
+ "evidence_count": len(candidate.evidence),
522
+ "satisfied_constraints": len(
523
+ [
524
+ c
525
+ for c in self.constraints
526
+ if c.id in candidate.evidence
527
+ ]
528
+ ),
529
+ "score_change": candidate.score - old_score,
530
+ },
531
+ )
532
+
533
+ # Sort by score
534
+ self.candidates.sort(key=lambda c: c.score, reverse=True)
535
+
536
+ # Prune low-scoring candidates
537
+ old_count = len(self.candidates)
538
+ min_score = (
539
+ max(0.2, self.candidates[0].score * 0.3) if self.candidates else 0.2
540
+ )
541
+ self.candidates = [c for c in self.candidates if c.score >= min_score]
542
+
543
+ # Keep only top candidates
544
+ self.candidates = self.candidates[: self.candidate_limit]
545
+
546
+ if self.progress_callback:
547
+ removed = old_count - len(self.candidates)
548
+ self.progress_callback(
549
+ f"Pruned {removed} low-scoring candidates - keeping top {len(self.candidates)}",
550
+ None,
551
+ {
552
+ "phase": "pruning_complete",
553
+ "candidates_removed": removed,
554
+ "min_score_threshold": min_score,
555
+ "top_score": self.candidates[0].score
556
+ if self.candidates
557
+ else 0,
558
+ "remaining_candidates": [
559
+ {"name": c.name, "score": c.score, "rank": i + 1}
560
+ for i, c in enumerate(self.candidates[:5])
561
+ ],
562
+ },
563
+ )
564
+
565
+ def _has_sufficient_answer(self) -> bool:
566
+ """Check if we have a sufficiently confident answer."""
567
+ if not self.candidates:
568
+ return False
569
+
570
+ top_candidate = self.candidates[0]
571
+
572
+ # Check if top candidate has high score
573
+ if top_candidate.score >= self.confidence_threshold:
574
+ # Verify it has evidence for all critical constraints
575
+ critical_constraints = [
576
+ c for c in self.constraints if c.weight >= 0.8
577
+ ]
578
+ critical_evidence = [
579
+ c.id
580
+ for c in critical_constraints
581
+ if c.id in top_candidate.evidence
582
+ and top_candidate.evidence[c.id].confidence
583
+ >= self.evidence_threshold
584
+ ]
585
+
586
+ if len(critical_evidence) == len(critical_constraints):
587
+ return True
588
+
589
+ return False
590
+
591
    def _final_verification(self):
        """Perform final verification on top candidates.

        For each of the top 3 candidates, re-search up to two weak-evidence
        constraints and up to two critical (weight >= 0.8) constraints with
        no evidence at all, keeping a new piece of evidence only when it is
        more confident than the existing one. Finishes by re-scoring and
        pruning the whole pool.
        """
        if not self.candidates:
            return

        if self.progress_callback:
            self.progress_callback(
                "Starting final verification of top candidates",
                85,
                {
                    "phase": "final_verification_start",
                    "top_candidates": [c.name for c in self.candidates[:3]],
                },
            )

        # Get top 3 candidates
        top_candidates = self.candidates[:3]

        for candidate in top_candidates:
            # Find weak evidence or missing critical constraints
            # NOTE(review): assumes get_weak_evidence returns a list of
            # constraint ids (it is concatenated with ids below) — confirm
            # against the Candidate class.
            weak_evidence = candidate.get_weak_evidence(self.evidence_threshold)
            critical_missing = [
                c
                for c in self.constraints
                if c.weight >= 0.8 and c.id not in candidate.evidence
            ]

            # Search for better evidence: at most 2 weak + 2 missing-critical
            # constraints per candidate to bound the number of searches.
            for constraint_id in weak_evidence[:2] + [
                c.id for c in critical_missing[:2]
            ]:
                constraint = next(
                    (c for c in self.constraints if c.id == constraint_id), None
                )
                if constraint:
                    search_query = f"{candidate.name} {constraint.value} exact verification"
                    results = self._execute_search(search_query)

                    evidence = self.evidence_evaluator.extract_evidence(
                        results.get("current_knowledge", ""),
                        candidate.name,
                        constraint,
                    )

                    # Update if better evidence (strictly higher confidence,
                    # or no prior evidence for this constraint at all).
                    if (
                        constraint_id not in candidate.evidence
                        or evidence.confidence
                        > candidate.evidence[constraint_id].confidence
                    ):
                        candidate.add_evidence(constraint.id, evidence)

        # Final scoring
        self._score_and_prune_candidates()
    def _execute_search(self, search_query: str) -> Dict:
        """Execute a search - optimized for direct queries.

        Records the query in ``search_history``, then either:

        - runs a single direct engine query when ``use_direct_search`` is
          set, synthesizing ``current_knowledge`` from up to 10 result
          snippets; or
        - delegates to a full ``SourceBasedSearchStrategy`` run, forwarding
          progress events through a message-shortening wrapper.

        Returns a dict with at least ``current_knowledge``, ``findings``,
        ``iterations``, ``questions_by_iteration`` and
        ``all_links_of_system``.
        """
        self.search_history.append(
            {
                "query": search_query,
                "timestamp": self._get_timestamp(),
                "iteration": self.iteration,
            }
        )

        # For candidate searches and verification queries, use direct search
        # This avoids the overhead of full source-based strategy
        logger.info(
            f"_execute_search called with use_direct_search={self.use_direct_search} for query: {search_query[:50]}..."
        )
        if self.use_direct_search:  # Always use direct search when flag is True
            # Direct search without question generation or iterations
            search_results = self.search.run(search_query)

            # Simple synthesis for knowledge extraction
            if search_results:
                content = "\n\n".join(
                    [
                        f"Result {i + 1}:\n{result.get('snippet', '')}"
                        for i, result in enumerate(search_results[:10])
                    ]
                )
            else:
                content = "No results found."

            # Shape mirrors SourceBasedSearchStrategy.analyze_topic() output
            # so callers can treat both paths identically.
            return {
                "current_knowledge": content,
                "findings": [],
                "iterations": 1,
                "questions_by_iteration": [[search_query]],
                "all_links_of_system": search_results or [],
            }

        # Use source-based strategy for complex searches if needed
        source_strategy = SourceBasedSearchStrategy(
            model=self.model,
            search=self.search,  # Pass the existing search instance
            all_links_of_system=self.all_links_of_system,
            include_text_content=True,
            use_cross_engine_filter=True,
            use_atomic_facts=True,
        )

        source_strategy.max_iterations = self.max_search_iterations
        source_strategy.questions_per_iteration = self.questions_per_iteration

        if self.progress_callback:

            def wrapped_callback(message, progress, data):
                # Add parent context
                data["parent_iteration"] = self.iteration
                data["parent_strategy"] = "evidence_based"
                data["search_query"] = search_query[:100]

                # Don't override parent progress percentage
                parent_progress = None

                # Make messages very short for frontend visibility, but keep query visible
                clean_message = message
                if "Generating questions" in message:
                    clean_message = f"Q: {search_query[:500]}"
                elif "Searching" in message:
                    clean_message = f"S: {search_query[:500]}"
                elif "Processing" in message:
                    clean_message = f"P: {search_query[:500]}"
                elif "Completed search" in message:
                    # Extract the actual search query from the message
                    parts = message.split(":", 2)
                    if len(parts) > 2:
                        query_part = parts[2].strip()
                        clean_message = f"✓ {query_part}"  # Show full query
                    else:
                        clean_message = f"✓ {search_query[:500]}"
                elif "iteration" in message.lower():
                    clean_message = (
                        f"I{data.get('iteration', '?')}: {search_query[:500]}"
                    )
                elif "Filtered" in message:
                    results = data.get("result_count", "?")
                    # NOTE(review): len() here assumes data["links_count"] is
                    # a sized collection; if the child strategy reports it as
                    # an int this raises TypeError — confirm the event schema.
                    filtered = (
                        len(data.get("links_count", []))
                        if "links_count" in data
                        else "?"
                    )
                    clean_message = f"Filter: {results}→{filtered}"

                self.progress_callback(
                    clean_message,
                    parent_progress,  # Let parent manage overall progress
                    data,
                )

            source_strategy.set_progress_callback(wrapped_callback)

        results = source_strategy.analyze_topic(search_query)

        # Fold the child strategy's question history into our own.
        if "questions_by_iteration" in results:
            self.questions_by_iteration.extend(
                results["questions_by_iteration"]
            )

        return results
+ def _create_candidate_search_query(
755
+ self, constraints: List[Constraint]
756
+ ) -> str:
757
+ """Create a search query to find candidates."""
758
+ # Use an LLM to create effective search queries from constraints
759
+ constraint_descriptions = []
760
+ for c in constraints[:5]: # Limit to top 5 constraints
761
+ constraint_descriptions.append(
762
+ f"- {c.type.value}: {c.value} (weight: {c.weight})"
763
+ )
764
+
765
+ prompt = f"""Given these constraints for finding a specific answer, create an effective search query.
766
+
767
+ Constraints:
768
+ {chr(10).join(constraint_descriptions)}
769
+
770
+ Your task is to create a search query that finds SPECIFIC NAMED ENTITIES that satisfy these constraints.
771
+
772
+ Key principle: Focus on finding actual names of things, not general information.
773
+
774
+ Guidelines:
775
+ 1. If the constraints describe properties, search for entities that have those properties
776
+ 2. If the constraints describe patterns, search for entities whose names match those patterns
777
+ 3. Combine the most distinctive constraints to narrow the search
778
+ 4. Use query operators (AND, OR, quotes) effectively
779
+
780
+ Important: The query should be designed to surface specific names/entities in the search results, not explanations or general information about the constraints.
781
+
782
+ Return only the search query, no explanation."""
783
+
784
+ response = self.model.invoke(prompt)
785
+ search_query = remove_think_tags(response.content).strip()
786
+
787
+ # Fallback if the query is too generic
788
+ if len(search_query.split()) < 3:
789
+ # Combine the most important constraint values
790
+ key_terms = []
791
+ for c in sorted(constraints, key=lambda x: x.weight, reverse=True)[
792
+ :3
793
+ ]:
794
+ key_terms.append(f'"{c.value}"')
795
+ search_query = " AND ".join(key_terms)
796
+
797
+ return search_query
798
+
799
+ def _get_distinctive_constraints(self) -> List[Constraint]:
800
+ """Get the most distinctive constraints for initial search."""
801
+ # Prioritize constraints that are most likely to identify specific entities
802
+ priority_order = [
803
+ ConstraintType.NAME_PATTERN,
804
+ ConstraintType.LOCATION,
805
+ ConstraintType.EVENT,
806
+ ConstraintType.STATISTIC,
807
+ ConstraintType.COMPARISON,
808
+ ConstraintType.PROPERTY,
809
+ ConstraintType.TEMPORAL,
810
+ ConstraintType.EXISTENCE,
811
+ ]
812
+
813
+ # Sort constraints by priority and weight
814
+ sorted_constraints = sorted(
815
+ self.constraints,
816
+ key=lambda c: (priority_order.index(c.type), -c.weight),
817
+ )
818
+
819
+ # Take top 3 constraints, ensuring we have at least one name/location constraint if available
820
+ distinctive = sorted_constraints[:3]
821
+
822
+ # Ensure we have at least one naming constraint if available
823
+ has_naming = any(
824
+ c.type in [ConstraintType.NAME_PATTERN, ConstraintType.LOCATION]
825
+ for c in distinctive
826
+ )
827
+ if not has_naming:
828
+ for c in sorted_constraints[3:]:
829
+ if c.type in [
830
+ ConstraintType.NAME_PATTERN,
831
+ ConstraintType.LOCATION,
832
+ ]:
833
+ distinctive[-1] = c # Replace the least important
834
+ break
835
+
836
+ return distinctive
837
+
838
    def _extract_candidates_from_results(
        self, results: Dict, search_query: str
    ) -> List[Candidate]:
        """Extract potential candidates from search results.

        Two-step LLM extraction: first classify what kind of entity the
        query is after, then pull specific named entities of that kind out
        of the combined knowledge + findings text. Lines in the model reply
        of the form ``CANDIDATE_n: name`` become Candidate objects, lightly
        validated, capped at ``candidate_limit``.
        """
        knowledge = results.get("current_knowledge", "")

        # Also check the raw findings
        findings = results.get("findings", [])
        all_content = knowledge

        for finding in findings:
            if isinstance(finding, dict) and "content" in finding:
                all_content += "\n" + finding["content"]

        # First, understand what type of entity we're looking for
        type_prompt = f"""Based on this search query, what type of entity are we looking for?
Search Query: {search_query}

Common types include: location, person, organization, product, concept, event, etc.
Answer with just the entity type, no explanation."""

        type_response = self.model.invoke(type_prompt)
        entity_type = remove_think_tags(type_response.content).strip().lower()

        # Now extract candidates based on the entity type
        # (content truncated to 4000 chars to stay inside the context window)
        prompt = f"""Extract potential {entity_type} candidates from these search results.

Search Query: {search_query}

Search Results:
{all_content[:4000]}

CRITICAL: Extract ONLY specific named entities that could answer the query.

For example:
- If looking for a character: extract "Sherlock Holmes", NOT "detective" or "fictional character"
- If looking for a place: extract "Mount Everest", NOT "mountain" or "high peak"
- If looking for a person: extract "Albert Einstein", NOT "scientist" or "physicist"

Extract ONLY:
1. Proper nouns and specific names
2. Individual entities that could be the answer
3. Concrete names mentioned in the text

DO NOT extract:
- Sources or websites (e.g., "Wikipedia", "IMDb")
- General descriptions (e.g., "TV show", "fictional character")
- Categories or types (e.g., "mountain", "scientist")
- Meta-information about search results

If the text mentions a work (book, movie, TV show) when looking for a character,
extract CHARACTER NAMES from that work, not the work's title.

Format each candidate as:
CANDIDATE_1: [specific name]
CANDIDATE_2: [specific name]
...

Limit to the 10 most relevant candidates."""

        response = self.model.invoke(prompt)
        content = remove_think_tags(response.content)

        candidates = []
        for line in content.strip().split("\n"):
            if line.startswith("CANDIDATE_"):
                name = line.split(":", 1)[1].strip().strip("\"'")
                # Basic validation - must be non-empty and reasonable length
                if name and 2 < len(name) < 100:
                    # Additional validation based on entity type
                    if entity_type in [
                        "location",
                        "place",
                        "person",
                        "organization",
                    ]:
                        # Should start with capital letter for these types
                        if name[0].isupper():
                            candidate = Candidate(name=name)
                            candidates.append(candidate)
                    else:
                        # Other types might not need capital letters
                        candidate = Candidate(name=name)
                        candidates.append(candidate)

        return candidates[: self.candidate_limit]
+ def _find_additional_candidates(self):
926
+ """Find additional candidates if we don't have enough."""
927
+ # Use different constraint combinations
928
+ unused_constraints = [
929
+ c
930
+ for c in self.constraints
931
+ if not any(c.id in cand.evidence for cand in self.candidates)
932
+ ]
933
+
934
+ if unused_constraints:
935
+ search_query = self._create_candidate_search_query(
936
+ unused_constraints[:3]
937
+ )
938
+ results = self._execute_search(search_query)
939
+ new_candidates = self._extract_candidates_from_results(
940
+ results, search_query
941
+ )
942
+
943
+ # Add only truly new candidates
944
+ existing_names = {c.name.lower() for c in self.candidates}
945
+ for candidate in new_candidates:
946
+ if candidate.name.lower() not in existing_names:
947
+ self.candidates.append(candidate)
948
+ existing_names.add(candidate.name.lower())
949
+
950
+ def _format_initial_analysis(self, query: str) -> str:
951
+ """Format initial analysis summary."""
952
+ # Group constraints by type
953
+ constraint_groups = {}
954
+ for c in self.constraints:
955
+ type_name = c.type.name.replace("_", " ").title()
956
+ if type_name not in constraint_groups:
957
+ constraint_groups[type_name] = []
958
+ constraint_groups[type_name].append(c)
959
+
960
+ constraints_formatted = ""
961
+ for type_name, constraints in constraint_groups.items():
962
+ constraints_formatted += f"\n**{type_name}**:\n"
963
+ for c in constraints:
964
+ constraints_formatted += (
965
+ f" • {c.description} (importance: {c.weight:.0%})\n"
966
+ )
967
+
968
+ return f"""
969
+ **Research Query**: {query}
970
+
971
+ **Strategy**: Evidence-Based Search
972
+ **Approach**: Systematically identify candidates and verify each constraint
973
+
974
+ **Identified Constraints**: {len(self.constraints)}
975
+ {constraints_formatted}
976
+
977
+ **Next Steps**:
978
+ 1. Search for candidates matching key constraints
979
+ 2. Gather evidence for each candidate-constraint pair
980
+ 3. Score candidates based on evidence quality
981
+ 4. Progressively refine until confident answer found
982
+
983
+ **Starting Research...**
984
+ """.strip()
985
+
986
    def _format_iteration_summary(self) -> str:
        """Format iteration summary.

        Builds a markdown progress report: top-3 candidate scores, overall
        evidence-collection counters, per-candidate constraint status for
        the top two, overall iteration progress, and the last 3 searches.
        """
        top_candidates = self.candidates[:3]

        # Calculate overall progress: evidence is only tracked for the
        # top-5 candidates, so "needed" is top-5 x constraint count.
        total_evidence_needed = (
            len(self.candidates[:5]) * len(self.constraints)
            if self.candidates
            else 0
        )
        evidence_collected = sum(len(c.evidence) for c in self.candidates[:5])
        evidence_progress = (
            evidence_collected / total_evidence_needed
            if total_evidence_needed > 0
            else 0
        )

        summary = f"""
**Iteration {self.iteration}**

**Top Candidates**:
{chr(10).join(f"{i + 1}. {c.name} (score: {c.score:.0%})" for i, c in enumerate(top_candidates)) if top_candidates else "No candidates found yet"}

**Evidence Collection Progress**:
- Total candidates: {len(self.candidates)}
- Evidence gathered: {evidence_collected}/{total_evidence_needed} ({evidence_progress:.0%})
- Constraints verified: {len([c for c in self.constraints if any(cand.evidence.get(c.id) for cand in self.candidates[:3])])}/{len(self.constraints)}

**Current Search Focus**:
"""

        if top_candidates:
            # Detail constraint satisfaction for the two leading candidates.
            for candidate in top_candidates[:2]:
                satisfied = len(
                    [
                        c
                        for c in self.constraints
                        if c.id in candidate.evidence
                        and candidate.evidence[c.id].confidence
                        >= self.evidence_threshold
                    ]
                )
                summary += f"\n{candidate.name} - {satisfied}/{len(self.constraints)} constraints satisfied:\n"

                for constraint in self.constraints[:3]:
                    evidence = candidate.evidence.get(constraint.id)
                    if evidence:
                        summary += f"  ✓ {constraint.description}: {evidence.confidence:.0%} confidence\n"
                    else:
                        summary += (
                            f"  ? {constraint.description}: Searching...\n"
                        )
        else:
            summary += "\nSearching for initial candidates..."
            summary += "\nFocus: " + ", ".join(
                [c.type.value for c in self.constraints[:3]]
            )

        summary += f"\n**Overall Progress**: {self.iteration}/{self.max_iterations} iterations ({self.iteration / self.max_iterations:.0%})"

        # Add recent searches
        if self.search_history:
            recent_searches = self.search_history[-3:]
            summary += "\n\n**Recent Searches**:\n"
            summary += chr(10).join(
                f"- {s['query'][:60]}..." for s in recent_searches
            )

        return summary.strip()
    def _synthesize_final_answer(self, original_query: str) -> Dict:
        """Generate final answer based on evidence.

        Takes the top-scoring candidate as the answer (or "Unable to
        determine"), asks the LLM for a justified write-up, records a
        final-synthesis finding, and returns the strategy result dict
        (answer text, formatted findings, candidates with their evidence,
        constraints, sources).
        """
        if not self.candidates:
            answer = "Unable to determine"
            confidence = 0
        else:
            top_candidate = self.candidates[0]
            answer = top_candidate.name
            # Candidate score is 0..1; report it as a percentage.
            confidence = int(top_candidate.score * 100)

        # Generate detailed explanation
        prompt = f"""
Based on our evidence-based research, provide a final answer to:
{original_query}

Top Answer: {answer}
Confidence: {confidence}%

Evidence Summary:
{self._format_evidence_summary()}

Provide a clear, concise answer with justification based on the evidence.
Include which constraints were satisfied and which weren't.
"""

        response = self.model.invoke(prompt)
        final_answer = remove_think_tags(response.content)

        # Add final synthesis finding
        synthesis_finding = {
            "phase": "Final Synthesis",
            "content": self._format_final_synthesis(answer, confidence),
            "timestamp": self._get_timestamp(),
        }
        self.findings.append(synthesis_finding)

        # Compile questions: iteration history may hold plain lists
        # (1-indexed here) or ready-made dicts (merged as-is).
        questions_dict = {}
        for i, questions in enumerate(self.questions_by_iteration):
            if isinstance(questions, list):
                questions_dict[i + 1] = questions
            elif isinstance(questions, dict):
                questions_dict.update(questions)

        # Format findings
        formatted_findings = format_findings(
            self.findings, final_answer, questions_dict
        )

        return {
            "current_knowledge": final_answer,
            "formatted_findings": formatted_findings,
            "findings": self.findings,
            "iterations": self.iteration,
            "questions_by_iteration": questions_dict,
            "all_links_of_system": self.all_links_of_system,
            "sources": self.all_links_of_system,
            # Serialize the top-5 candidates with their evidence for the UI.
            "candidates": [
                {
                    "name": c.name,
                    "score": c.score,
                    "evidence": {
                        k: {
                            "claim": e.claim,
                            "confidence": e.confidence,
                            "type": e.type.value,
                        }
                        for k, e in c.evidence.items()
                    },
                }
                for c in self.candidates[:5]
            ],
            "constraints": [
                {
                    "id": c.id,
                    "description": c.description,
                    "weight": c.weight,
                    "type": c.type.value,
                }
                for c in self.constraints
            ],
            "strategy": "evidence_based",
        }
+ def _format_evidence_summary(self) -> str:
1141
+ """Format evidence summary for top candidates."""
1142
+ if not self.candidates:
1143
+ return "No candidates found"
1144
+
1145
+ summary = ""
1146
+ for candidate in self.candidates[:2]:
1147
+ summary += f"\n{candidate.name} (score: {candidate.score:.2f}):\n"
1148
+ for constraint in self.constraints:
1149
+ evidence = candidate.evidence.get(constraint.id)
1150
+ if evidence:
1151
+ summary += (
1152
+ f" - {constraint.description}: {evidence.claim} "
1153
+ )
1154
+ summary += f"(confidence: {evidence.confidence:.2f}, type: {evidence.type.value})\n"
1155
+ else:
1156
+ summary += f" - {constraint.description}: No evidence\n"
1157
+
1158
+ return summary
1159
+
1160
    def _format_final_synthesis(self, answer: str, confidence: int) -> str:
        """Format final synthesis summary.

        Produces the markdown block stored as the "Final Synthesis"
        finding: answer + confidence, run statistics, per-constraint
        satisfaction breakdown for the top candidate (✓ / ⚠ below
        threshold / ✗ missing), evidence details, and search statistics.
        """
        if not self.candidates:
            evidence_summary = "No candidates found"
            constraint_breakdown = "Unable to verify constraints"
        else:
            top_candidate = self.candidates[0]
            # Count constraints whose evidence clears the threshold.
            satisfied = len(
                [
                    c
                    for c in self.constraints
                    if c.id in top_candidate.evidence
                    and top_candidate.evidence[c.id].confidence
                    >= self.evidence_threshold
                ]
            )
            evidence_summary = (
                f"Satisfied {satisfied}/{len(self.constraints)} constraints"
            )

            # Create constraint satisfaction breakdown
            constraint_breakdown = "\n**Constraint Satisfaction**:\n"
            for constraint in self.constraints:
                evidence = top_candidate.evidence.get(constraint.id)
                if evidence and evidence.confidence >= self.evidence_threshold:
                    constraint_breakdown += f"✓ {constraint.description} - {evidence.confidence:.0%} confidence\n"
                elif evidence:
                    constraint_breakdown += f"⚠ {constraint.description} - {evidence.confidence:.0%} confidence (below threshold)\n"
                else:
                    constraint_breakdown += (
                        f"✗ {constraint.description} - No evidence found\n"
                    )

        return f"""
**Final Answer**: {answer} ({confidence}% confidence)

**Research Summary**:
- Strategy: Evidence-Based Search
- Iterations completed: {self.iteration}/{self.max_iterations}
- Candidates evaluated: {len(self.candidates)}
- Evidence pieces collected: {sum(len(c.evidence) for c in self.candidates)}
- {evidence_summary}

**Top Candidates**:
{chr(10).join(f"{i + 1}. {c.name} (score: {c.score:.0%})" for i, c in enumerate(self.candidates[:3]))}

{constraint_breakdown}

**Evidence Details**:
{self._format_evidence_summary()}

**Search Strategy**:
- Total searches performed: {len(self.search_history)}
- Constraint-focused searches: {len([s for s in self.search_history if any(c.description in s["query"] for c in self.constraints)])}
- Candidate verification searches: {len([s for s in self.search_history if any(c.name in s["query"] for c in self.candidates)])}

**Recent Search Queries**:
{chr(10).join(f"{i + 1}. {s['query'][:80]}..." for i, s in enumerate(self.search_history[-5:]))}
""".strip()
+ def _get_timestamp(self) -> str:
1221
+ """Get current timestamp for findings."""
1222
+ return datetime.utcnow().isoformat()
1223
+
1224
+ def _calculate_evidence_coverage(self) -> float:
1225
+ """Calculate how much evidence we've collected across all candidates."""
1226
+ if not self.candidates or not self.constraints:
1227
+ return 0.0
1228
+
1229
+ total_possible = len(self.candidates[:5]) * len(self.constraints)
1230
+ total_collected = sum(len(c.evidence) for c in self.candidates[:5])
1231
+
1232
+ return total_collected / total_possible if total_possible > 0 else 0.0
1233
+
1234
+ def _get_iteration_status(self) -> str:
1235
+ """Get a human-readable status for the current iteration."""
1236
+ if not self.candidates:
1237
+ return "Searching for initial candidates"
1238
+
1239
+ top_score = self.candidates[0].score if self.candidates else 0
1240
+
1241
+ if top_score >= self.confidence_threshold:
1242
+ return "Verifying top candidate"
1243
+ elif top_score >= 0.7:
1244
+ return "Gathering final evidence"
1245
+ elif top_score >= 0.5:
1246
+ return "Refining candidate scores"
1247
+ else:
1248
+ return "Exploring candidate evidence"