PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

local-deep-research 0.4.4py3-none-any.whl → 0.5.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py ADDED Viewed

@@ -0,0 +1,537 @@
+"""
+Focused Iteration Strategy - **PROVEN HIGH-PERFORMANCE STRATEGY FOR SIMPLEQA**
+**PERFORMANCE RECORD:**
+- SimpleQA Accuracy: 96.51% (CONFIRMED HIGH PERFORMER)
+- Optimal Configuration: 8 iterations, 5 questions/iteration, GPT-4.1 Mini
+- Status: PRESERVE THIS STRATEGY - Core SimpleQA implementation
+This strategy achieves excellent SimpleQA performance by:
+1. Using simple, direct search execution (like source-based)
+2. Progressive entity-focused exploration
+3. No early filtering or complex constraint checking
+4. Trusting the LLM for final synthesis
+IMPORTANT: This strategy works exceptionally well for SimpleQA. Any modifications
+should preserve the core approach that achieves 96.51% accuracy.
+**BrowseComp Enhancement:** Also includes BrowseComp-specific optimizations
+when use_browsecomp_optimization=True, but SimpleQA performance is the priority.
+"""
+import concurrent.futures
+import logging
+from typing import Dict, List
+from ...citation_handler import CitationHandler
+from ...config.llm_config import get_llm
+from ...config.search_config import get_search
+from ..candidate_exploration import ProgressiveExplorer
+from ..findings.repository import FindingsRepository
+from ..questions import BrowseCompQuestionGenerator
+from .base_strategy import BaseSearchStrategy
+logger = logging.getLogger(__name__)
+class FocusedIterationStrategy(BaseSearchStrategy):
+    """
+    A hybrid strategy that combines the simplicity of source-based search
+    with BrowseComp-optimized progressive exploration.
+    Key principles:
+    1. Start broad, then narrow progressively
+    2. Extract and systematically search entities
+    3. Keep all results without filtering
+    4. Trust LLM for final constraint matching
+    5. Use more iterations for thorough exploration
+    """
+    def __init__(
+        self,
+        model=None,
+        search=None,
+        citation_handler=None,
+        all_links_of_system=None,
+        max_iterations: int = 8,  # OPTIMAL FOR SIMPLEQA: 96.51% accuracy achieved
+        questions_per_iteration: int = 5,  # OPTIMAL FOR SIMPLEQA: proven config
+        use_browsecomp_optimization: bool = True,  # Can be False for pure SimpleQA
+    ):
+        """Initialize with components optimized for focused iteration."""
+        super().__init__(all_links_of_system)
+        self.search = search or get_search()
+        self.model = model or get_llm()
+        self.progress_callback = None
+        # Configuration
+        self.max_iterations = max_iterations
+        self.questions_per_iteration = questions_per_iteration
+        self.use_browsecomp_optimization = use_browsecomp_optimization
+        # Initialize specialized components
+        if use_browsecomp_optimization:
+            self.question_generator = BrowseCompQuestionGenerator(self.model)
+            self.explorer = ProgressiveExplorer(self.search, self.model)
+        else:
+            # Fall back to standard components
+            from ..questions import StandardQuestionGenerator
+            self.question_generator = StandardQuestionGenerator(self.model)
+            self.explorer = None
+        # Use forced answer handler for BrowseComp optimization
+        handler_type = (
+            "forced_answer" if use_browsecomp_optimization else "standard"
+        )
+        self.citation_handler = citation_handler or CitationHandler(
+            self.model, handler_type=handler_type
+        )
+        self.findings_repository = FindingsRepository(self.model)
+        # Track all search results
+        self.all_search_results = []
+        self.questions_by_iteration = {}
+    def analyze_topic(self, query: str) -> Dict:
+        """
+        Analyze topic using focused iteration approach.
+        Combines simplicity of source-based with progressive BrowseComp optimization.
+        """
+        logger.info(f"Starting focused iteration search: {query}")
+        self._update_progress(
+            "Initializing focused iteration search",
+            5,
+            {
+                "phase": "init",
+                "strategy": "focused_iteration",
+                "max_iterations": self.max_iterations,
+                "browsecomp_optimized": self.use_browsecomp_optimization,
+            },
+        )
+        # Validate search engine
+        if not self._validate_search_engine():
+            return self._create_error_response("No search engine available")
+        findings = []
+        extracted_entities = {}
+        try:
+            # Main iteration loop
+            for iteration in range(1, self.max_iterations + 1):
+                iteration_progress = 10 + (iteration - 1) * (
+                    80 / self.max_iterations
+                )
+                self._update_progress(
+                    f"Iteration {iteration}/{self.max_iterations}",
+                    iteration_progress,
+                    {"phase": f"iteration_{iteration}", "iteration": iteration},
+                )
+                # Generate questions for this iteration
+                if self.use_browsecomp_optimization:
+                    # Use BrowseComp-aware question generation
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=self._get_current_knowledge_summary(),
+                        query=query,
+                        questions_per_iteration=self.questions_per_iteration,
+                        questions_by_iteration=self.questions_by_iteration,
+                        iteration=iteration,
+                    )
+                    # Extract entities on first iteration
+                    if iteration == 1 and hasattr(
+                        self.question_generator, "extracted_entities"
+                    ):
+                        extracted_entities = (
+                            self.question_generator.extracted_entities
+                        )
+                else:
+                    # Standard question generation
+                    questions = self.question_generator.generate_questions(
+                        current_knowledge=self._get_current_knowledge_summary(),
+                        query=query,
+                        questions_per_iteration=self.questions_per_iteration,
+                        questions_by_iteration=self.questions_by_iteration,
+                    )
+                # Always include original query in first iteration
+                if iteration == 1 and query not in questions:
+                    questions = [query] + questions
+                self.questions_by_iteration[iteration] = questions
+                logger.info(f"Iteration {iteration} questions: {questions}")
+                # Report starting searches for this iteration
+                self._update_progress(
+                    f"Executing {len(questions)} searches in iteration {iteration}",
+                    iteration_progress - (80 / self.max_iterations / 4),
+                    {
+                        "phase": f"iteration_{iteration}_searching",
+                        "queries": questions[:3],  # Show first 3 queries
+                        "total_queries": len(questions),
+                    },
+                )
+                # Execute searches
+                if self.explorer and self.use_browsecomp_optimization:
+                    # Use progressive explorer for better tracking
+                    iteration_results, search_progress = self.explorer.explore(
+                        queries=questions,
+                        max_workers=len(questions),
+                        extracted_entities=extracted_entities,
+                    )
+                    # Report detailed search progress
+                    # Convert sets to lists for JSON serialization
+                    serializable_entity_coverage = {
+                        k: list(v)
+                        for k, v in list(
+                            search_progress.entity_coverage.items()
+                        )[:3]
+                    }
+                    self._update_progress(
+                        f"Found {len(search_progress.found_candidates)} candidates, covered {sum(len(v) for v in search_progress.entity_coverage.values())} entities",
+                        iteration_progress,
+                        {
+                            "phase": f"iteration_{iteration}_results",
+                            "candidates_found": len(
+                                search_progress.found_candidates
+                            ),
+                            "entities_covered": sum(
+                                len(v)
+                                for v in search_progress.entity_coverage.values()
+                            ),
+                            "entity_coverage": serializable_entity_coverage,  # JSON-serializable version
+                        },
+                    )
+                    # Check if we should generate verification searches
+                    if iteration > 3 and search_progress.found_candidates:
+                        verification_searches = (
+                            self.explorer.suggest_next_searches(
+                                extracted_entities, max_suggestions=2
+                            )
+                        )
+                        if verification_searches:
+                            logger.info(
+                                f"Adding verification searches: {verification_searches}"
+                            )
+                            self._update_progress(
+                                f"Running {len(verification_searches)} verification searches",
+                                iteration_progress
+                                + (80 / self.max_iterations / 8),
+                                {
+                                    "phase": f"iteration_{iteration}_verification",
+                                    "verification_queries": verification_searches,
+                                },
+                            )
+                            questions.extend(verification_searches)
+                            # Re-run with verification searches
+                            verification_results, _ = self.explorer.explore(
+                                queries=verification_searches,
+                                max_workers=len(verification_searches),
+                            )
+                            iteration_results.extend(verification_results)
+                else:
+                    # Simple parallel search (like source-based) with detailed reporting
+                    iteration_results = (
+                        self._execute_parallel_searches_with_progress(
+                            questions, iteration
+                        )
+                    )
+                # Accumulate all results (no filtering!)
+                self.all_search_results.extend(iteration_results)
+                # Update progress
+                self._update_progress(
+                    f"Completed iteration {iteration} - {len(iteration_results)} results",
+                    iteration_progress + (80 / self.max_iterations / 2),
+                    {
+                        "phase": f"iteration_{iteration}_complete",
+                        "results_count": len(iteration_results),
+                        "total_results": len(self.all_search_results),
+                    },
+                )
+                # Add iteration finding
+                finding = {
+                    "phase": f"Iteration {iteration}",
+                    "content": f"Searched with {len(questions)} questions, found {len(iteration_results)} results.",
+                    "question": query,
+                    "documents": [],
+                }
+                findings.append(finding)
+                # Early termination check for BrowseComp
+                if self._should_terminate_early(iteration):
+                    logger.info(f"Early termination at iteration {iteration}")
+                    break
+            # Final synthesis (like source-based - trust the LLM!)
+            self._update_progress(
+                "Generating final synthesis",
+                90,
+                {"phase": "synthesis"},
+            )
+            # Use citation handler for final synthesis
+            final_result = self.citation_handler.analyze_followup(
+                query,
+                self.all_search_results,
+                previous_knowledge="",
+                nr_of_links=len(self.all_links_of_system),
+            )
+            synthesized_content = final_result.get(
+                "content", "No relevant results found."
+            )
+            documents = final_result.get("documents", [])
+            # Add final synthesis finding
+            final_finding = {
+                "phase": "Final synthesis",
+                "content": synthesized_content,
+                "question": query,
+                "search_results": self.all_search_results,
+                "documents": documents,
+            }
+            findings.append(final_finding)
+            # Add documents to repository
+            self.findings_repository.add_documents(documents)
+            self.findings_repository.set_questions_by_iteration(
+                self.questions_by_iteration
+            )
+            # Format findings
+            formatted_findings = (
+                self.findings_repository.format_findings_to_text(
+                    findings, synthesized_content
+                )
+            )
+            self._update_progress(
+                "Search complete",
+                100,
+                {"phase": "complete"},
+            )
+            # Return results
+            result = {
+                "findings": findings,
+                "iterations": len(self.questions_by_iteration),
+                "questions_by_iteration": self.questions_by_iteration,
+                "formatted_findings": formatted_findings,
+                "current_knowledge": synthesized_content,
+                "all_links_of_system": self.all_links_of_system,
+                "sources": self.all_links_of_system,
+            }
+            # Add BrowseComp-specific data if available
+            if self.explorer and hasattr(self.explorer, "progress"):
+                result["candidates"] = dict(
+                    self.explorer.progress.found_candidates
+                )
+                result["entity_coverage"] = {
+                    k: list(v)
+                    for k, v in self.explorer.progress.entity_coverage.items()
+                }
+            return result
+        except Exception as e:
+            logger.error(f"Error in focused iteration search: {str(e)}")
+            import traceback
+            logger.error(traceback.format_exc())
+            return self._create_error_response(str(e))
+    def _execute_parallel_searches(self, queries: List[str]) -> List[Dict]:
+        """Execute searches in parallel (like source-based strategy)."""
+        all_results = []
+        def search_question(q):
+            try:
+                result = self.search.run(q)
+                return {"question": q, "results": result or []}
+            except Exception as e:
+                logger.error(f"Error searching '{q}': {str(e)}")
+                return {"question": q, "results": [], "error": str(e)}
+        # Run searches in parallel
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=len(queries)
+        ) as executor:
+            futures = [executor.submit(search_question, q) for q in queries]
+            for future in concurrent.futures.as_completed(futures):
+                result_dict = future.result()
+                all_results.extend(result_dict.get("results", []))
+        return all_results
+    def _execute_parallel_searches_with_progress(
+        self, queries: List[str], iteration: int
+    ) -> List[Dict]:
+        """Execute searches in parallel with detailed progress reporting."""
+        all_results = []
+        completed_searches = 0
+        total_searches = len(queries)
+        def search_question_with_progress(q):
+            nonlocal completed_searches
+            try:
+                # Report starting this search
+                self._update_progress(
+                    f"Searching: {q[:50]}{'...' if len(q) > 50 else ''}",
+                    None,  # Don't update overall progress for individual searches
+                    {
+                        "phase": f"iteration_{iteration}_individual_search",
+                        "current_query": q,
+                        "search_progress": f"{completed_searches + 1}/{total_searches}",
+                    },
+                )
+                result = self.search.run(q)
+                completed_searches += 1
+                # Report completion of this search
+                result_count = len(result) if result else 0
+                self._update_progress(
+                    f"Completed search for '{q[:30]}{'...' if len(q) > 30 else ''}' - {result_count} results",
+                    None,
+                    {
+                        "phase": f"iteration_{iteration}_search_complete",
+                        "completed_query": q,
+                        "results_found": result_count,
+                        "search_progress": f"{completed_searches}/{total_searches}",
+                    },
+                )
+                return {
+                    "question": q,
+                    "results": result or [],
+                    "result_count": result_count,
+                }
+            except Exception as e:
+                completed_searches += 1
+                logger.error(f"Error searching '{q}': {str(e)}")
+                self._update_progress(
+                    f"Search failed for '{q[:30]}{'...' if len(q) > 30 else ''}': {str(e)[:50]}",
+                    None,
+                    {
+                        "phase": f"iteration_{iteration}_search_error",
+                        "failed_query": q,
+                        "error": str(e)[:100],
+                        "search_progress": f"{completed_searches}/{total_searches}",
+                    },
+                )
+                return {
+                    "question": q,
+                    "results": [],
+                    "error": str(e),
+                    "result_count": 0,
+                }
+        # Run searches in parallel
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=min(len(queries), 5)
+        ) as executor:
+            futures = [
+                executor.submit(search_question_with_progress, q)
+                for q in queries
+            ]
+            total_results_found = 0
+            for future in concurrent.futures.as_completed(futures):
+                result_dict = future.result()
+                results = result_dict.get("results", [])
+                all_results.extend(results)
+                total_results_found += result_dict.get("result_count", 0)
+        # Report final iteration summary
+        self._update_progress(
+            f"Iteration {iteration} complete: {total_results_found} total results from {total_searches} searches",
+            None,
+            {
+                "phase": f"iteration_{iteration}_summary",
+                "total_searches": total_searches,
+                "total_results": total_results_found,
+                "average_results": (
+                    round(total_results_found / total_searches, 1)
+                    if total_searches > 0
+                    else 0
+                ),
+            },
+        )
+        return all_results
+    def _get_current_knowledge_summary(self) -> str:
+        """Get summary of current knowledge for question generation."""
+        if not self.all_search_results:
+            return ""
+        # Simple summary of top results
+        summary_parts = []
+        for i, result in enumerate(self.all_search_results[:10]):
+            title = result.get("title", "")
+            snippet = result.get("snippet", "")
+            if title or snippet:
+                summary_parts.append(f"{i + 1}. {title}: {snippet[:200]}...")
+        return "\n".join(summary_parts)
+    def _should_terminate_early(self, iteration: int) -> bool:
+        """Check if we should terminate early based on findings."""
+        # For BrowseComp, continue if we're making progress
+        if self.explorer and hasattr(self.explorer, "progress"):
+            progress = self.explorer.progress
+            # Continue if we're still finding new candidates
+            if iteration > 3 and len(progress.found_candidates) > 0:
+                # Check if top candidate has very high confidence
+                if progress.found_candidates:
+                    top_confidence = max(progress.found_candidates.values())
+                    if top_confidence > 0.9:
+                        return True
+            # Continue if we haven't covered all entities
+            if extracted_entities := getattr(
+                self.question_generator, "extracted_entities", {}
+            ):
+                total_entities = sum(
+                    len(v) for v in extracted_entities.values()
+                )
+                covered_entities = sum(
+                    len(v) for v in progress.entity_coverage.values()
+                )
+                coverage_ratio = (
+                    covered_entities / total_entities
+                    if total_entities > 0
+                    else 0
+                )
+                # Continue if coverage is low
+                if coverage_ratio < 0.8 and iteration < 6:
+                    return False
+        # Default: continue to max iterations for thoroughness
+        return False
+    def _create_error_response(self, error_msg: str) -> Dict:
+        """Create standardized error response."""
+        return {
+            "findings": [],
+            "iterations": 0,
+            "questions_by_iteration": {},
+            "formatted_findings": f"Error: {error_msg}",
+            "current_knowledge": "",
+            "error": error_msg,
+        }

local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

local-deep-research 0.4.4py3-none-any.whl → 0.5.2py3-none-any.whl