local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +5 -3
- local_deep_research/web/database/models.py +51 -2
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +51 -61
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +227 -41
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +310 -103
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
- local_deep_research-0.5.0.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,613 @@
|
|
1
|
+
"""
|
2
|
+
Source diversity management for improved evidence quality.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import re
|
6
|
+
from collections import defaultdict
|
7
|
+
from dataclasses import dataclass, field
|
8
|
+
from datetime import datetime
|
9
|
+
from typing import Dict, List, Optional, Set, Tuple, Any
|
10
|
+
|
11
|
+
from langchain_core.language_models import BaseChatModel
|
12
|
+
|
13
|
+
from ..constraints.base_constraint import Constraint, ConstraintType
|
14
|
+
|
15
|
+
|
16
|
+
@dataclass
class SourceProfile:
    """Profile of a source for diversity tracking."""

    # Full URL of the source document; also the cache key in
    # SourceDiversityManager.source_profiles.
    url: str
    # Hostname portion of the URL (leading "www." stripped).
    domain: str
    source_type: str  # 'academic', 'news', 'government', 'wiki', 'blog', etc.
    # Heuristic credibility in [0.0, 1.0] (type prior plus small bonuses).
    credibility_score: float
    # Up to five topic areas extracted from the URL path and/or content.
    specialties: List[str] = field(default_factory=list)
    # (earliest, latest) dates inferred from years mentioned in the content.
    temporal_coverage: Optional[Tuple[datetime, datetime]] = None
    # Country/region the source appears to focus on, when detectable.
    geographic_focus: Optional[str] = None
    # How many times evidence has been drawn from this source.
    evidence_count: int = 0
    # Timestamp of the most recent access (naive UTC via datetime.utcnow()).
    last_accessed: Optional[datetime] = None
|
29
|
+
|
30
|
+
|
31
|
+
@dataclass
class DiversityMetrics:
    """Metrics for source diversity assessment."""

    type_diversity: float  # 0.0 to 1.0
    # 0-1; spread of covered years (a 20-year span maps to 1.0).
    temporal_diversity: float
    # 0-1; distinct geographic focuses (3+ regions maps to 1.0).
    geographic_diversity: float
    # Counts of sources bucketed as "high" / "medium" / "low" credibility.
    credibility_distribution: Dict[str, float]
    # Topic area -> number of sources covering it.
    specialty_coverage: Dict[str, int]
    # Weighted combination of the component scores, in [0, 1].
    overall_score: float
|
41
|
+
|
42
|
+
|
43
|
+
class SourceDiversityManager:
    """
    Manages source diversity to ensure comprehensive evidence collection.

    Key features:
    1. Tracks source types and characteristics (per-URL SourceProfile cache)
    2. Ensures diverse source selection
    3. Prioritizes high-credibility sources
    4. Manages geographic and temporal diversity

    Classification falls back to the injected chat model when URL/domain
    heuristics are inconclusive.
    """
|
53
|
+
|
54
|
+
def __init__(self, model: BaseChatModel):
    """Initialize the source diversity manager.

    Args:
        model: Chat model used for LLM-based content classification.
    """
    self.model = model
    # url -> SourceProfile for every source analyzed so far (acts as a cache).
    self.source_profiles: Dict[str, SourceProfile] = {}
    # source type -> set of URLs classified under that type.
    self.source_types: Dict[str, Set[str]] = defaultdict(set)
    # Base credibility prior per source type; also serves as the set of
    # recognized source-type labels for LLM answers.
    self.type_priorities: Dict[str, float] = {
        "academic": 0.9,
        "government": 0.85,
        "news": 0.7,
        "wiki": 0.75,
        "blog": 0.5,
        "forum": 0.4,
        "social": 0.3,
    }
    # Distinct source types desired before type diversity scores 1.0.
    self.minimum_source_types: int = 3
    # NOTE(review): not referenced by the visible methods — presumably a
    # threshold for callers; confirm before removing.
    self.credibility_threshold: float = 0.6
|
70
|
+
|
71
|
+
def analyze_source(
    self, url: str, content: Optional[str] = None
) -> SourceProfile:
    """Analyze a source and create (or refresh) its diversity profile.

    Args:
        url: Source URL.
        content: Optional page text used for deeper classification.

    Returns:
        The cached or newly created SourceProfile for ``url``.
    """
    cached = self.source_profiles.get(url)
    if cached is not None:
        # Already profiled: just bump the usage statistics.
        cached.evidence_count += 1
        cached.last_accessed = datetime.utcnow()
        return cached

    # Derive every profile attribute from the URL (and content when given).
    domain = self._extract_domain(url)
    kind = self._determine_source_type(url, domain, content)
    profile = SourceProfile(
        url=url,
        domain=domain,
        source_type=kind,
        credibility_score=self._calculate_credibility(
            url, domain, kind, content
        ),
        specialties=self._extract_specialties(url, content),
        temporal_coverage=self._extract_temporal_coverage(content),
        geographic_focus=self._extract_geographic_focus(url, content),
        evidence_count=1,
        last_accessed=datetime.utcnow(),
    )

    # Register the new profile in both indexes.
    self.source_profiles[url] = profile
    self.source_types[kind].add(url)

    return profile
|
115
|
+
|
116
|
+
def _extract_domain(self, url: str) -> str:
|
117
|
+
"""Extract domain from URL."""
|
118
|
+
import re
|
119
|
+
|
120
|
+
pattern = r"https?://(?:www\.)?([^/]+)"
|
121
|
+
match = re.match(pattern, url)
|
122
|
+
if match:
|
123
|
+
return match.group(1)
|
124
|
+
return url
|
125
|
+
|
126
|
+
def _determine_source_type(
|
127
|
+
self, url: str, domain: str, content: Optional[str]
|
128
|
+
) -> str:
|
129
|
+
"""Determine the type of source."""
|
130
|
+
# Check known patterns
|
131
|
+
academic_domains = [
|
132
|
+
".edu",
|
133
|
+
".ac.",
|
134
|
+
"scholar",
|
135
|
+
"pubmed",
|
136
|
+
"arxiv",
|
137
|
+
"jstor",
|
138
|
+
]
|
139
|
+
government_domains = [".gov", ".mil"]
|
140
|
+
news_domains = [
|
141
|
+
"news",
|
142
|
+
"times",
|
143
|
+
"post",
|
144
|
+
"guardian",
|
145
|
+
"bbc",
|
146
|
+
"cnn",
|
147
|
+
"reuters",
|
148
|
+
]
|
149
|
+
wiki_domains = ["wikipedia", "wiki"]
|
150
|
+
|
151
|
+
lower_domain = domain.lower()
|
152
|
+
lower_url = url.lower()
|
153
|
+
|
154
|
+
# Check patterns
|
155
|
+
for pattern in academic_domains:
|
156
|
+
if pattern in lower_domain or pattern in lower_url:
|
157
|
+
return "academic"
|
158
|
+
|
159
|
+
for pattern in government_domains:
|
160
|
+
if pattern in lower_domain:
|
161
|
+
return "government"
|
162
|
+
|
163
|
+
for pattern in wiki_domains:
|
164
|
+
if pattern in lower_domain:
|
165
|
+
return "wiki"
|
166
|
+
|
167
|
+
for pattern in news_domains:
|
168
|
+
if pattern in lower_domain:
|
169
|
+
return "news"
|
170
|
+
|
171
|
+
# Use content analysis as fallback
|
172
|
+
if content:
|
173
|
+
return self._analyze_content_type(content)
|
174
|
+
|
175
|
+
return "general"
|
176
|
+
|
177
|
+
def _analyze_content_type(self, content: str) -> str:
    """Ask the LLM to classify a content excerpt into a source type.

    Only the first 500 characters are sent to the model.  The reply is
    accepted only when it matches a known label in ``self.type_priorities``;
    any other reply falls back to "general".
    """
    prompt = f"""
Analyze this content excerpt and determine the source type:

{content[:500]}

Choose from: academic, government, news, wiki, blog, forum, social, general

Return only the source type.
"""

    response = self.model.invoke(prompt)
    # Normalize the reply so "Academic\n" etc. still matches.
    source_type = response.content.strip().lower()

    # Guard against free-form model replies that aren't valid labels.
    if source_type in self.type_priorities:
        return source_type
    return "general"
|
195
|
+
|
196
|
+
def _calculate_credibility(
|
197
|
+
self, url: str, domain: str, source_type: str, content: Optional[str]
|
198
|
+
) -> float:
|
199
|
+
"""Calculate credibility score for a source."""
|
200
|
+
# Base score from source type
|
201
|
+
base_score = self.type_priorities.get(source_type, 0.5)
|
202
|
+
|
203
|
+
# Adjust based on domain characteristics
|
204
|
+
if ".edu" in domain or ".gov" in domain:
|
205
|
+
base_score = min(base_score + 0.1, 1.0)
|
206
|
+
|
207
|
+
# Check for HTTPS
|
208
|
+
if url.startswith("https://"):
|
209
|
+
base_score = min(base_score + 0.05, 1.0)
|
210
|
+
|
211
|
+
# Additional analysis if content provided
|
212
|
+
if content:
|
213
|
+
# Check for citations/references
|
214
|
+
if re.search(r"\[\d+\]|\(\d{4}\)", content):
|
215
|
+
base_score = min(base_score + 0.1, 1.0)
|
216
|
+
|
217
|
+
# Check for author information
|
218
|
+
if re.search(
|
219
|
+
r"[Aa]uthor:|[Bb]y\s+[A-Z][a-z]+\s+[A-Z][a-z]+", content
|
220
|
+
):
|
221
|
+
base_score = min(base_score + 0.05, 1.0)
|
222
|
+
|
223
|
+
return base_score
|
224
|
+
|
225
|
+
def _extract_specialties(
    self, url: str, content: Optional[str]
) -> List[str]:
    """Extract up to five topic specialties for a source.

    Combines lowercase URL path segments (longer than 3 chars, first 3)
    with up to three topics suggested by the LLM when page content is
    available.  Order is not stable because results are de-duplicated
    through a set.
    """
    specialties = []

    # URL-based extraction: alphabetic path segments like "/science/".
    url_keywords = re.findall(r"/([a-z]+)/", url.lower())
    specialties.extend([kw for kw in url_keywords if len(kw) > 3][:3])

    # Content-based extraction if available
    if content:
        prompt = f"""
Identify the main topic areas or specialties covered in this content:

{content[:500]}

Return up to 3 topic areas, one per line.
"""

        response = self.model.invoke(prompt)
        # One topic per non-empty line of the model reply.
        topics = [
            line.strip()
            for line in response.content.strip().split("\n")
            if line.strip()
        ]
        specialties.extend(topics[:3])

    # De-duplicate and cap at five entries.
    return list(set(specialties))[:5]
|
254
|
+
|
255
|
+
def _extract_temporal_coverage(
|
256
|
+
self, content: Optional[str]
|
257
|
+
) -> Optional[Tuple[datetime, datetime]]:
|
258
|
+
"""Extract temporal coverage from content."""
|
259
|
+
if not content:
|
260
|
+
return None
|
261
|
+
|
262
|
+
# Look for year patterns
|
263
|
+
years = re.findall(r"\b(19\d{2}|20\d{2})\b", content)
|
264
|
+
|
265
|
+
if years:
|
266
|
+
years = [int(year) for year in years]
|
267
|
+
min_year = min(years)
|
268
|
+
max_year = max(years)
|
269
|
+
|
270
|
+
try:
|
271
|
+
return (datetime(min_year, 1, 1), datetime(max_year, 12, 31))
|
272
|
+
except ValueError:
|
273
|
+
return None
|
274
|
+
|
275
|
+
return None
|
276
|
+
|
277
|
+
def _extract_geographic_focus(
|
278
|
+
self, url: str, content: Optional[str]
|
279
|
+
) -> Optional[str]:
|
280
|
+
"""Extract geographic focus from source."""
|
281
|
+
# Check URL for geographic indicators
|
282
|
+
geo_patterns = {
|
283
|
+
"us": "United States",
|
284
|
+
"uk": "United Kingdom",
|
285
|
+
"ca": "Canada",
|
286
|
+
"au": "Australia",
|
287
|
+
"eu": "Europe",
|
288
|
+
}
|
289
|
+
|
290
|
+
for pattern, location in geo_patterns.items():
|
291
|
+
if f".{pattern}" in url or f"/{pattern}/" in url:
|
292
|
+
return location
|
293
|
+
|
294
|
+
# Content-based extraction
|
295
|
+
if content:
|
296
|
+
# Look for country/region mentions
|
297
|
+
locations = re.findall(
|
298
|
+
r"\b(?:United States|UK|Canada|Australia|Europe|Asia|Africa|Americas)\b",
|
299
|
+
content[:1000],
|
300
|
+
re.IGNORECASE,
|
301
|
+
)
|
302
|
+
|
303
|
+
if locations:
|
304
|
+
# Return most frequent
|
305
|
+
from collections import Counter
|
306
|
+
|
307
|
+
location_counts = Counter(locations)
|
308
|
+
return location_counts.most_common(1)[0][0]
|
309
|
+
|
310
|
+
return None
|
311
|
+
|
312
|
+
def calculate_diversity_metrics(
    self, sources: List[str]
) -> DiversityMetrics:
    """Compute diversity metrics over a set of source URLs.

    Sources not yet profiled are analyzed on the fly.  The overall score
    weights type diversity and the high-credibility share at 0.3 each,
    and temporal/geographic diversity at 0.2 each.
    """
    if not sources:
        # Degenerate case: nothing to measure.
        return DiversityMetrics(
            type_diversity=0.0,
            temporal_diversity=0.0,
            geographic_diversity=0.0,
            credibility_distribution={},
            specialty_coverage={},
            overall_score=0.0,
        )

    # Look up cached profiles, analyzing any URL seen for the first time.
    profiles = [
        self.source_profiles.get(url) or self.analyze_source(url)
        for url in sources
    ]

    # Type diversity: fraction of the minimum desired distinct types.
    distinct_types = {p.source_type for p in profiles}
    type_diversity = min(
        len(distinct_types) / self.minimum_source_types, 1.0
    )

    # Temporal diversity from the profiles that carry a date range.
    temporal_diversity = self._calculate_temporal_diversity(
        [p.temporal_coverage for p in profiles if p.temporal_coverage]
    )

    # Geographic diversity: up to three distinct regions count fully.
    regions = [p.geographic_focus for p in profiles if p.geographic_focus]
    geographic_diversity = (
        min(len(set(regions)) / 3, 1.0) if regions else 0.0
    )

    # Bucket credibility scores into high/medium/low counts.
    credibility_distribution: Dict[str, float] = {}
    for p in profiles:
        if p.credibility_score >= 0.8:
            bucket = "high"
        elif p.credibility_score >= 0.6:
            bucket = "medium"
        else:
            bucket = "low"
        credibility_distribution[bucket] = (
            credibility_distribution.get(bucket, 0) + 1
        )

    # Count how many profiles cover each specialty.
    specialty_coverage: Dict[str, int] = {}
    for p in profiles:
        for specialty in p.specialties:
            specialty_coverage[specialty] = (
                specialty_coverage.get(specialty, 0) + 1
            )

    # Weighted overall score.
    overall_score = (
        type_diversity * 0.3
        + temporal_diversity * 0.2
        + geographic_diversity * 0.2
        + (credibility_distribution.get("high", 0) / len(profiles)) * 0.3
    )

    return DiversityMetrics(
        type_diversity=type_diversity,
        temporal_diversity=temporal_diversity,
        geographic_diversity=geographic_diversity,
        credibility_distribution=credibility_distribution,
        specialty_coverage=specialty_coverage,
        overall_score=overall_score,
    )
|
388
|
+
|
389
|
+
def _calculate_temporal_diversity(
|
390
|
+
self, ranges: List[Tuple[datetime, datetime]]
|
391
|
+
) -> float:
|
392
|
+
"""Calculate temporal diversity from date ranges."""
|
393
|
+
if not ranges:
|
394
|
+
return 0.0
|
395
|
+
|
396
|
+
# Calculate span coverage
|
397
|
+
all_years = set()
|
398
|
+
for start, end in ranges:
|
399
|
+
for year in range(start.year, end.year + 1):
|
400
|
+
all_years.add(year)
|
401
|
+
|
402
|
+
# Diversity based on year span
|
403
|
+
if len(all_years) > 1:
|
404
|
+
year_span = max(all_years) - min(all_years)
|
405
|
+
# Normalize to 0-1 (20 years = max diversity)
|
406
|
+
return min(year_span / 20, 1.0)
|
407
|
+
|
408
|
+
return 0.0
|
409
|
+
|
410
|
+
def recommend_additional_sources(
    self, current_sources: List[str], constraints: List[Constraint]
) -> List[Dict[str, Any]]:
    """Suggest (at most five) search adjustments that improve diversity.

    Each recommendation carries the gap kind, a target, a query modifier
    usable in follow-up searches, and a human-readable reason.
    """
    metrics = self.calculate_diversity_metrics(current_sources)
    gaps = self._identify_diversity_gaps(metrics, constraints)

    recommendations: List[Dict[str, Any]] = []
    for gap_type, gap_details in gaps.items():
        if gap_type == "source_type":
            # One recommendation per missing source type.
            recommendations.extend(
                {
                    "type": "source_type",
                    "target": missing_type,
                    "query_modifier": self._get_source_type_modifier(
                        missing_type
                    ),
                    "reason": f"Add {missing_type} sources for better perspective",
                }
                for missing_type in gap_details
            )
        elif gap_type == "temporal":
            # A single recommendation covering the missing year range.
            recommendations.append(
                {
                    "type": "temporal",
                    "target": gap_details,
                    "query_modifier": f'"{gap_details}" historical archive',
                    "reason": f"Add sources covering {gap_details}",
                }
            )
        elif gap_type == "geographic":
            # One recommendation per missing region.
            recommendations.extend(
                {
                    "type": "geographic",
                    "target": region,
                    "query_modifier": f"site:{self._get_region_domain(region)}",
                    "reason": f"Add sources from {region}",
                }
                for region in gap_details
            )
        elif gap_type == "credibility":
            # Push toward more authoritative domains.
            recommendations.append(
                {
                    "type": "credibility",
                    "target": "high_credibility",
                    "query_modifier": "site:.edu OR site:.gov OR peer-reviewed",
                    "reason": "Add more authoritative sources",
                }
            )

    return recommendations[:5]  # Limit recommendations
|
466
|
+
|
467
|
+
def _identify_diversity_gaps(
    self, metrics: DiversityMetrics, constraints: List[Constraint]
) -> Dict[str, Any]:
    """Find the dimensions along which the current source pool lacks diversity.

    Four dimensions are checked against heuristic thresholds: source type,
    temporal coverage, geographic coverage, and credibility. The result maps
    each deficient dimension to details about what is missing.
    """
    gaps: Dict[str, Any] = {}

    # Source-type gap: variety of source categories is below threshold.
    if metrics.type_diversity < 0.7:
        covered = {p.source_type for p in self.source_profiles.values()}
        absent = {"academic", "government", "news", "wiki"} - covered
        if absent:
            gaps["source_type"] = list(absent)

    # Temporal gap: constraints mention years the pool does not cover well.
    time_constraints = [
        c for c in constraints if c.type == ConstraintType.TEMPORAL
    ]
    if time_constraints and metrics.temporal_diversity < 0.5:
        years = []
        for constraint in time_constraints:
            match = re.search(r"\b(19\d{2}|20\d{2})\b", constraint.value)
            if match:
                years.append(match.group(1))
        if years:
            # Four-digit year strings compare correctly lexicographically.
            gaps["temporal"] = f"{min(years)}-{max(years)}"

    # Geographic gap: location constraints exist but regional spread is low.
    geo_constraints = [
        c for c in constraints if c.type == ConstraintType.LOCATION
    ]
    if geo_constraints and metrics.geographic_diversity < 0.5:
        gaps["geographic"] = [c.value for c in geo_constraints]

    # Credibility gap: fewer than 30% of sources are rated "high".
    total = max(sum(metrics.credibility_distribution.values()), 1)
    if metrics.credibility_distribution.get("high", 0) / total < 0.3:
        gaps["credibility"] = True

    return gaps
|
514
|
+
|
515
|
+
def _get_source_type_modifier(self, source_type: str) -> str:
|
516
|
+
"""Get search modifier for specific source type."""
|
517
|
+
modifiers = {
|
518
|
+
"academic": "site:.edu OR site:scholar.google.com OR site:pubmed.gov",
|
519
|
+
"government": "site:.gov OR site:.mil",
|
520
|
+
"news": 'news OR "press release" OR journalism',
|
521
|
+
"wiki": "site:wikipedia.org OR wiki",
|
522
|
+
"blog": 'blog OR "posted by" OR comments',
|
523
|
+
}
|
524
|
+
return modifiers.get(source_type, "")
|
525
|
+
|
526
|
+
def _get_region_domain(self, region: str) -> str:
|
527
|
+
"""Get domain suffix for a region."""
|
528
|
+
region_domains = {
|
529
|
+
"United States": ".us OR .com",
|
530
|
+
"United Kingdom": ".uk",
|
531
|
+
"Canada": ".ca",
|
532
|
+
"Australia": ".au",
|
533
|
+
"Europe": ".eu OR .de OR .fr",
|
534
|
+
}
|
535
|
+
return region_domains.get(region, ".com")
|
536
|
+
|
537
|
+
def select_diverse_sources(
    self, available_sources: List[str], target_count: int
) -> List[str]:
    """Pick up to ``target_count`` sources, favouring variety over raw score.

    Sources are ranked by a weighted blend of type priority, credibility,
    and coverage breadth, then greedily selected so that unseen source
    types and geographic regions are preferred over repeats.
    """
    # Nothing to trim if we already have few enough sources.
    if len(available_sources) <= target_count:
        return available_sources

    def diversity_score(profile) -> float:
        # Weights: type priority 40%, credibility 30%,
        # specialties 15%, temporal coverage 15%.
        return (
            0.4 * self.type_priorities.get(profile.source_type, 0.5)
            + 0.3 * profile.credibility_score
            + 0.15 * (1.0 if profile.specialties else 0.5)
            + 0.15 * (1.0 if profile.temporal_coverage else 0.5)
        )

    scored = []
    for url in available_sources:
        # Analyze on the fly if the source has no cached profile yet.
        prof = self.source_profiles.get(url) or self.analyze_source(url)
        scored.append((url, diversity_score(prof), prof))

    # Greedy pass, highest-scoring first (stable sort preserves input order on ties).
    picked: List[str] = []
    seen_types = set()
    seen_regions = set()

    for url, _, prof in sorted(scored, key=lambda item: item[1], reverse=True):
        novel_type = prof.source_type not in seen_types
        novel_region = bool(
            prof.geographic_focus
            and prof.geographic_focus not in seen_regions
        )
        # Always admit the first half of the quota regardless of novelty.
        if novel_type or novel_region or len(picked) < target_count // 2:
            picked.append(url)
            seen_types.add(prof.source_type)
            if prof.geographic_focus:
                seen_regions.add(prof.geographic_focus)

        if len(picked) >= target_count:
            break

    return picked
|
587
|
+
|
588
|
+
def track_source_effectiveness(
    self, source: str, evidence_quality: float, constraint_satisfied: bool
):
    """Record how well a source performed during evidence gathering.

    Args:
        source: Identifier/URL of the source. Unknown sources (no profile
            in ``self.source_profiles``) are silently ignored.
        evidence_quality: Quality score of the evidence the source produced.
        constraint_satisfied: Whether the source helped satisfy a constraint.

    Side effects:
        On success, nudges the profile's credibility up by 5% (capped at
        1.0) and appends a timestamped effectiveness record to the profile.
    """
    # Local import: the module top level only brings in the datetime class.
    from datetime import timezone

    profile = self.source_profiles.get(source)
    if not profile:
        # No profile yet -- nothing to update for an unanalyzed source.
        return

    if constraint_satisfied:
        # Small multiplicative boost, capped so credibility stays in [0, 1].
        profile.credibility_score = min(
            profile.credibility_score * 1.05, 1.0
        )

    # Lazily attach the history list on first use (instance dict only).
    if "effectiveness" not in vars(profile):
        profile.effectiveness = []

    profile.effectiveness.append(
        {
            # Timezone-aware UTC; datetime.utcnow() is deprecated and naive.
            "timestamp": datetime.now(timezone.utc),
            "evidence_quality": evidence_quality,
            "constraint_satisfied": constraint_satisfied,
        }
    )
|
@@ -1 +1,43 @@
|
|
1
1
|
# Search System Strategies Package
"""Public registry of the concrete search strategy implementations.

Importing this package re-exports every strategy class so callers can write
``from ...strategies import EvidenceBasedStrategy`` instead of reaching into
the individual modules; ``__all__`` declares the supported public API.
"""

from .adaptive_decomposition_strategy import AdaptiveDecompositionStrategy
from .base_strategy import BaseSearchStrategy
from .browsecomp_entity_strategy import BrowseCompEntityStrategy
from .browsecomp_optimized_strategy import BrowseCompOptimizedStrategy
from .constraint_parallel_strategy import ConstraintParallelStrategy
from .dual_confidence_strategy import DualConfidenceStrategy
from .dual_confidence_with_rejection import DualConfidenceWithRejectionStrategy
from .evidence_based_strategy import EvidenceBasedStrategy
from .focused_iteration_strategy import FocusedIterationStrategy
from .iterative_reasoning_strategy import IterativeReasoningStrategy
from .iterdrag_strategy import IterDRAGStrategy
from .modular_strategy import ModularStrategy
from .parallel_constrained_strategy import ParallelConstrainedStrategy
from .parallel_search_strategy import ParallelSearchStrategy
from .rapid_search_strategy import RapidSearchStrategy
from .recursive_decomposition_strategy import RecursiveDecompositionStrategy
from .smart_decomposition_strategy import SmartDecompositionStrategy
from .source_based_strategy import SourceBasedSearchStrategy
from .standard_strategy import StandardSearchStrategy

__all__ = [
    "BaseSearchStrategy",
    "StandardSearchStrategy",
    "ParallelSearchStrategy",
    "ParallelConstrainedStrategy",
    "SourceBasedSearchStrategy",
    "RapidSearchStrategy",
    "IterDRAGStrategy",
    "RecursiveDecompositionStrategy",
    "AdaptiveDecompositionStrategy",
    "SmartDecompositionStrategy",
    "IterativeReasoningStrategy",
    "BrowseCompOptimizedStrategy",
    "BrowseCompEntityStrategy",
    "EvidenceBasedStrategy",
    "DualConfidenceStrategy",
    "DualConfidenceWithRejectionStrategy",
    "ConstraintParallelStrategy",
    "ModularStrategy",
    "FocusedIterationStrategy",
]
|