PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

local-deep-research 0.4.4py3-none-any.whl → 0.5.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py ADDED Viewed

@@ -0,0 +1,457 @@
+"""
+Diversity-focused candidate explorer implementation.
+This explorer prioritizes finding diverse candidates across different
+categories, types, and characteristics.
+"""
+import time
+from collections import defaultdict
+from typing import List, Optional
+from loguru import logger
+from ..candidates.base_candidate import Candidate
+from ..constraints.base_constraint import Constraint
+from .base_explorer import (
+    BaseCandidateExplorer,
+    ExplorationResult,
+    ExplorationStrategy,
+)
+class DiversityExplorer(BaseCandidateExplorer):
+    """
+    Diversity-focused candidate explorer.
+    This explorer:
+    1. Seeks candidates from different categories/types
+    2. Avoids clustering around similar candidates
+    3. Uses diversity metrics to guide exploration
+    4. Balances breadth over depth
+    """
+    def __init__(
+        self,
+        *args,
+        diversity_threshold: float = 0.7,  # Minimum diversity score
+        category_limit: int = 10,  # Max candidates per category
+        similarity_threshold: float = 0.8,  # Similarity threshold for deduplication
+        **kwargs,
+    ):
+        """
+        Initialize diversity explorer.
+        Args:
+            diversity_threshold: Minimum diversity score to maintain
+            category_limit: Maximum candidates per category
+            similarity_threshold: Threshold for considering candidates similar
+        """
+        super().__init__(*args, **kwargs)
+        self.diversity_threshold = diversity_threshold
+        self.category_limit = category_limit
+        self.similarity_threshold = similarity_threshold
+        # Track diversity
+        self.category_counts = defaultdict(int)
+        self.diversity_categories = set()
+    def explore(
+        self,
+        initial_query: str,
+        constraints: Optional[List[Constraint]] = None,
+        entity_type: Optional[str] = None,
+    ) -> ExplorationResult:
+        """Explore candidates using diversity-focused strategy."""
+        start_time = time.time()
+        logger.info(
+            f"Starting diversity-focused exploration for: {initial_query}"
+        )
+        all_candidates = []
+        exploration_paths = []
+        total_searched = 0
+        # Initial broad search
+        initial_results = self._execute_search(initial_query)
+        initial_candidates = self._extract_candidates_from_results(
+            initial_results, entity_type
+        )
+        all_candidates.extend(initial_candidates)
+        total_searched += 1
+        exploration_paths.append(
+            f"Initial search: {initial_query} -> {len(initial_candidates)} candidates"
+        )
+        # Categorize initial candidates
+        self._categorize_candidates(initial_candidates)
+        # Generate diverse exploration paths
+        while self._should_continue_exploration(
+            start_time, len(all_candidates)
+        ):
+            # Calculate current diversity
+            diversity_score = self._calculate_diversity_score(all_candidates)
+            if (
+                diversity_score >= self.diversity_threshold
+                and len(all_candidates) >= 10
+            ):
+                logger.info(f"Diversity threshold met ({diversity_score:.2f})")
+                break
+            # Find underrepresented categories
+            underrepresented_categories = (
+                self._find_underrepresented_categories()
+            )
+            if not underrepresented_categories:
+                # Generate new category exploration
+                new_queries = self._generate_diversity_queries(
+                    initial_query, all_candidates, entity_type
+                )
+            else:
+                # Focus on underrepresented categories
+                new_queries = self._generate_category_queries(
+                    underrepresented_categories, initial_query, entity_type
+                )
+            if not new_queries:
+                break
+            # Execute diverse searches
+            for query in new_queries[:3]:  # Limit concurrent searches
+                if query.lower() in self.explored_queries:
+                    continue
+                results = self._execute_search(query)
+                candidates = self._extract_candidates_from_results(
+                    results, entity_type
+                )
+                # Filter for diversity
+                diverse_candidates = self._filter_for_diversity(
+                    candidates, all_candidates
+                )
+                all_candidates.extend(diverse_candidates)
+                total_searched += 1
+                # Update categories
+                self._categorize_candidates(diverse_candidates)
+                exploration_paths.append(
+                    f"Diversity search: {query} -> {len(diverse_candidates)} diverse candidates"
+                )
+                if not self._should_continue_exploration(
+                    start_time, len(all_candidates)
+                ):
+                    break
+        # Final diversity filtering and ranking
+        diverse_candidates = self._final_diversity_selection(all_candidates)
+        ranked_candidates = self._rank_by_diversity(
+            diverse_candidates, initial_query
+        )
+        final_candidates = ranked_candidates[: self.max_candidates]
+        elapsed_time = time.time() - start_time
+        final_diversity = self._calculate_diversity_score(final_candidates)
+        logger.info(
+            f"Diversity exploration completed: {len(final_candidates)} candidates, diversity: {final_diversity:.2f}"
+        )
+        return ExplorationResult(
+            candidates=final_candidates,
+            total_searched=total_searched,
+            unique_candidates=len(diverse_candidates),
+            exploration_paths=exploration_paths,
+            metadata={
+                "strategy": "diversity_focused",
+                "final_diversity_score": final_diversity,
+                "categories_found": len(self.diversity_categories),
+                "category_distribution": dict(self.category_counts),
+                "entity_type": entity_type,
+            },
+            elapsed_time=elapsed_time,
+            strategy_used=ExplorationStrategy.DIVERSITY_FOCUSED,
+        )
+    def generate_exploration_queries(
+        self,
+        base_query: str,
+        found_candidates: List[Candidate],
+        constraints: Optional[List[Constraint]] = None,
+    ) -> List[str]:
+        """Generate diversity-focused exploration queries."""
+        return self._generate_diversity_queries(base_query, found_candidates)
+    def _categorize_candidates(self, candidates: List[Candidate]):
+        """Categorize candidates for diversity tracking."""
+        for candidate in candidates:
+            category = self._determine_category(candidate)
+            self.category_counts[category] += 1
+            self.diversity_categories.add(category)
+            # Store category in candidate metadata
+            if not candidate.metadata:
+                candidate.metadata = {}
+            candidate.metadata["diversity_category"] = category
+    def _determine_category(self, candidate: Candidate) -> str:
+        """Determine the category of a candidate."""
+        name = candidate.name.lower()
+        # Simple categorization based on common patterns
+        if any(word in name for word in ["mountain", "peak", "summit", "hill"]):
+            return "mountain"
+        elif any(
+            word in name
+            for word in ["lake", "river", "creek", "stream", "pond"]
+        ):
+            return "water"
+        elif any(
+            word in name for word in ["park", "forest", "reserve", "wilderness"]
+        ):
+            return "park"
+        elif any(word in name for word in ["trail", "path", "route", "way"]):
+            return "trail"
+        elif any(word in name for word in ["canyon", "gorge", "valley", "gap"]):
+            return "canyon"
+        elif any(
+            word in name for word in ["cliff", "bluff", "overlook", "viewpoint"]
+        ):
+            return "viewpoint"
+        elif any(
+            word in name for word in ["island", "beach", "coast", "shore"]
+        ):
+            return "coastal"
+        elif any(word in name for word in ["city", "town", "county", "state"]):
+            return "place"
+        else:
+            return "other"
+    def _calculate_diversity_score(self, candidates: List[Candidate]) -> float:
+        """Calculate diversity score for a set of candidates."""
+        if not candidates:
+            return 0.0
+        # Count categories
+        category_counts = defaultdict(int)
+        for candidate in candidates:
+            category = candidate.metadata.get("diversity_category", "other")
+            category_counts[category] += 1
+        # Calculate diversity using Shannon entropy
+        total = len(candidates)
+        entropy = 0.0
+        for count in category_counts.values():
+            if count > 0:
+                p = count / total
+                entropy -= p * (p.bit_length() - 1) if p > 0 else 0
+        # Normalize to 0-1 scale
+        max_entropy = (
+            (len(category_counts).bit_length() - 1)
+            if len(category_counts) > 1
+            else 1
+        )
+        return entropy / max_entropy if max_entropy > 0 else 0.0
+    def _find_underrepresented_categories(self) -> List[str]:
+        """Find categories that are underrepresented."""
+        if not self.category_counts:
+            return []
+        avg_count = sum(self.category_counts.values()) / len(
+            self.category_counts
+        )
+        threshold = avg_count * 0.5  # Categories with less than 50% of average
+        underrepresented = [
+            category
+            for category, count in self.category_counts.items()
+            if count < threshold and count < self.category_limit
+        ]
+        return underrepresented
+    def _generate_diversity_queries(
+        self,
+        base_query: str,
+        found_candidates: List[Candidate],
+        entity_type: Optional[str] = None,
+    ) -> List[str]:
+        """Generate queries to increase diversity."""
+        queries = []
+        # Analyze existing categories
+        existing_categories = set()
+        for candidate in found_candidates:
+            if (
+                candidate.metadata
+                and "diversity_category" in candidate.metadata
+            ):
+                existing_categories.add(
+                    candidate.metadata["diversity_category"]
+                )
+        # Generate queries for missing categories
+        all_categories = [
+            "mountain",
+            "water",
+            "park",
+            "trail",
+            "canyon",
+            "viewpoint",
+            "coastal",
+            "place",
+        ]
+        missing_categories = [
+            cat for cat in all_categories if cat not in existing_categories
+        ]
+        base = entity_type or base_query
+        for category in missing_categories[:3]:  # Limit to 3 new categories
+            if category == "mountain":
+                queries.append(f"{base} mountain peak summit")
+            elif category == "water":
+                queries.append(f"{base} lake river creek")
+            elif category == "park":
+                queries.append(f"{base} park forest reserve")
+            elif category == "trail":
+                queries.append(f"{base} trail path route")
+            elif category == "canyon":
+                queries.append(f"{base} canyon gorge valley")
+            elif category == "viewpoint":
+                queries.append(f"{base} overlook viewpoint cliff")
+            elif category == "coastal":
+                queries.append(f"{base} beach coast island")
+            elif category == "place":
+                queries.append(f"{base} location place area")
+        return queries
+    def _generate_category_queries(
+        self, categories: List[str], base_query: str, entity_type: Optional[str]
+    ) -> List[str]:
+        """Generate queries for specific underrepresented categories."""
+        queries = []
+        base = entity_type or base_query
+        for category in categories[:3]:
+            queries.append(f"{base} {category}")
+            queries.append(f"{category} examples {base}")
+        return queries
+    def _filter_for_diversity(
+        self,
+        new_candidates: List[Candidate],
+        existing_candidates: List[Candidate],
+    ) -> List[Candidate]:
+        """Filter new candidates to maintain diversity."""
+        filtered = []
+        for candidate in new_candidates:
+            category = self._determine_category(candidate)
+            # Check if this category is already well-represented
+            if self.category_counts[category] >= self.category_limit:
+                continue
+            # Check for similarity with existing candidates
+            if not self._is_sufficiently_different(
+                candidate, existing_candidates
+            ):
+                continue
+            filtered.append(candidate)
+        return filtered
+    def _is_sufficiently_different(
+        self, candidate: Candidate, existing_candidates: List[Candidate]
+    ) -> bool:
+        """Check if candidate is sufficiently different from existing ones."""
+        candidate_words = set(candidate.name.lower().split())
+        for existing in existing_candidates[
+            -10:
+        ]:  # Check against recent candidates
+            existing_words = set(existing.name.lower().split())
+            # Calculate Jaccard similarity
+            intersection = len(candidate_words.intersection(existing_words))
+            union = len(candidate_words.union(existing_words))
+            if union > 0:
+                similarity = intersection / union
+                if similarity > self.similarity_threshold:
+                    return False
+        return True
+    def _final_diversity_selection(
+        self, candidates: List[Candidate]
+    ) -> List[Candidate]:
+        """Final selection to maximize diversity."""
+        if not candidates:
+            return candidates
+        # Group by category
+        category_groups = defaultdict(list)
+        for candidate in candidates:
+            category = candidate.metadata.get("diversity_category", "other")
+            category_groups[category].append(candidate)
+        # Select balanced representation from each category
+        selected = []
+        max_per_category = max(1, self.max_candidates // len(category_groups))
+        for category, group in category_groups.items():
+            # Sort by relevance score if available
+            sorted_group = sorted(
+                group,
+                key=lambda c: getattr(c, "relevance_score", 0.0),
+                reverse=True,
+            )
+            selected.extend(sorted_group[:max_per_category])
+        return selected
+    def _rank_by_diversity(
+        self, candidates: List[Candidate], base_query: str
+    ) -> List[Candidate]:
+        """Rank candidates considering both relevance and diversity contribution."""
+        # First rank by relevance
+        relevance_ranked = self._rank_candidates_by_relevance(
+            candidates, base_query
+        )
+        # Then adjust based on diversity contribution
+        for i, candidate in enumerate(relevance_ranked):
+            category = candidate.metadata.get("diversity_category", "other")
+            # Boost score for underrepresented categories
+            category_count = self.category_counts[category]
+            avg_count = (
+                sum(self.category_counts.values()) / len(self.category_counts)
+                if self.category_counts
+                else 1
+            )
+            diversity_boost = max(0, (avg_count - category_count) / avg_count)
+            relevance_score = getattr(candidate, "relevance_score", 0.0)
+            candidate.final_score = relevance_score + (diversity_boost * 0.2)
+        return sorted(
+            relevance_ranked,
+            key=lambda c: getattr(c, "final_score", 0.0),
+            reverse=True,
+        )

local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py ADDED Viewed

@@ -0,0 +1,250 @@
+"""
+Parallel candidate explorer implementation.
+This explorer runs multiple search queries in parallel to quickly discover
+a wide range of candidates.
+"""
+import concurrent.futures
+import time
+from typing import List, Optional
+from loguru import logger
+from ..candidates.base_candidate import Candidate
+from ..constraints.base_constraint import Constraint
+from .base_explorer import (
+    BaseCandidateExplorer,
+    ExplorationResult,
+    ExplorationStrategy,
+)
+class ParallelExplorer(BaseCandidateExplorer):
+    """
+    Parallel candidate explorer that runs multiple searches concurrently.
+    This explorer:
+    1. Generates multiple search queries from the initial query
+    2. Runs searches in parallel for speed
+    3. Collects and deduplicates candidates
+    4. Focuses on breadth-first exploration
+    """
+    def __init__(
+        self,
+        *args,
+        max_workers: int = 5,
+        queries_per_round: int = 8,
+        max_rounds: int = 3,
+        **kwargs,
+    ):
+        """
+        Initialize parallel explorer.
+        Args:
+            max_workers: Maximum number of parallel search threads
+            queries_per_round: Number of queries to generate per round
+            max_rounds: Maximum exploration rounds
+        """
+        super().__init__(*args, **kwargs)
+        self.max_workers = max_workers
+        self.queries_per_round = queries_per_round
+        self.max_rounds = max_rounds
+    def explore(
+        self,
+        initial_query: str,
+        constraints: Optional[List[Constraint]] = None,
+        entity_type: Optional[str] = None,
+    ) -> ExplorationResult:
+        """Explore candidates using parallel search strategy."""
+        start_time = time.time()
+        logger.info(f"Starting parallel exploration for: {initial_query}")
+        all_candidates = []
+        exploration_paths = []
+        total_searched = 0
+        # Initial search
+        current_queries = [initial_query]
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=self.max_workers
+        ) as executor:
+            for round_num in range(self.max_rounds):
+                if not self._should_continue_exploration(
+                    start_time, len(all_candidates)
+                ):
+                    break
+                logger.info(
+                    f"Exploration round {round_num + 1}: {len(current_queries)} queries"
+                )
+                # Submit all queries for parallel execution
+                future_to_query = {
+                    executor.submit(self._execute_search, query): query
+                    for query in current_queries
+                }
+                round_candidates = []
+                # Collect results as they complete
+                for future in concurrent.futures.as_completed(future_to_query):
+                    query = future_to_query[future]
+                    total_searched += 1
+                    try:
+                        results = future.result()
+                        candidates = self._extract_candidates_from_results(
+                            results, entity_type
+                        )
+                        round_candidates.extend(candidates)
+                        exploration_paths.append(
+                            f"Round {round_num + 1}: {query} -> {len(candidates)} candidates"
+                        )
+                    except Exception as e:
+                        logger.error(f"Error processing query '{query}': {e}")
+                # Add new candidates
+                all_candidates.extend(round_candidates)
+                # Generate queries for next round
+                if round_num < self.max_rounds - 1:
+                    current_queries = self.generate_exploration_queries(
+                        initial_query, all_candidates, constraints
+                    )[: self.queries_per_round]
+                    if not current_queries:
+                        logger.info("No more queries to explore")
+                        break
+        # Deduplicate and rank
+        unique_candidates = self._deduplicate_candidates(all_candidates)
+        ranked_candidates = self._rank_candidates_by_relevance(
+            unique_candidates, initial_query
+        )
+        # Limit to max candidates
+        final_candidates = ranked_candidates[: self.max_candidates]
+        elapsed_time = time.time() - start_time
+        logger.info(
+            f"Parallel exploration completed: {len(final_candidates)} unique candidates in {elapsed_time:.1f}s"
+        )
+        return ExplorationResult(
+            candidates=final_candidates,
+            total_searched=total_searched,
+            unique_candidates=len(unique_candidates),
+            exploration_paths=exploration_paths,
+            metadata={
+                "strategy": "parallel",
+                "rounds": min(round_num + 1, self.max_rounds),
+                "max_workers": self.max_workers,
+                "entity_type": entity_type,
+            },
+            elapsed_time=elapsed_time,
+            strategy_used=ExplorationStrategy.BREADTH_FIRST,
+        )
+    def generate_exploration_queries(
+        self,
+        base_query: str,
+        found_candidates: List[Candidate],
+        constraints: Optional[List[Constraint]] = None,
+    ) -> List[str]:
+        """Generate queries for parallel exploration."""
+        queries = []
+        # Query variations based on base query
+        base_variations = self._generate_query_variations(base_query)
+        queries.extend(base_variations)
+        # Queries based on found candidates
+        if found_candidates:
+            candidate_queries = self._generate_candidate_based_queries(
+                found_candidates, base_query
+            )
+            queries.extend(candidate_queries)
+        # Constraint-based queries
+        if constraints:
+            constraint_queries = self._generate_constraint_queries(
+                constraints, base_query
+            )
+            queries.extend(constraint_queries)
+        # Remove already explored queries
+        new_queries = [
+            q for q in queries if q.lower() not in self.explored_queries
+        ]
+        return new_queries[: self.queries_per_round]
+    def _generate_query_variations(self, base_query: str) -> List[str]:
+        """Generate variations of the base query."""
+        try:
+            prompt = f"""
+Generate 4 search query variations for: "{base_query}"
+Each variation should:
+1. Use different keywords but same intent
+2. Be specific and searchable
+3. Focus on finding concrete examples or instances
+Format as numbered list:
+1. [query]
+2. [query]
+3. [query]
+4. [query]
+"""
+            response = self.model.invoke(prompt).content.strip()
+            # Parse numbered list
+            queries = []
+            for line in response.split("\n"):
+                line = line.strip()
+                if line and any(line.startswith(f"{i}.") for i in range(1, 10)):
+                    # Remove number prefix
+                    query = line.split(".", 1)[1].strip()
+                    if query:
+                        queries.append(query)
+            return queries[:4]
+        except Exception as e:
+            logger.error(f"Error generating query variations: {e}")
+            return []
+    def _generate_candidate_based_queries(
+        self, candidates: List[Candidate], base_query: str
+    ) -> List[str]:
+        """Generate queries based on found candidates."""
+        queries = []
+        # Sample a few candidates to avoid too many queries
+        sample_candidates = candidates[:3]
+        for candidate in sample_candidates:
+            # Query for similar entities
+            queries.append(f'similar to "{candidate.name}"')
+            queries.append(f'like "{candidate.name}" examples')
+        return queries
+    def _generate_constraint_queries(
+        self, constraints: List[Constraint], base_query: str
+    ) -> List[str]:
+        """Generate queries focusing on specific constraints."""
+        queries = []
+        # Sample constraints to avoid too many queries
+        for constraint in constraints[:2]:
+            queries.append(f"{constraint.value} examples")
+            queries.append(f'"{constraint.value}" instances')
+        return queries

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

local-deep-research 0.4.4py3-none-any.whl → 0.5.0py3-none-any.whl