PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/__init__.py CHANGED Viewed

@@ -5,12 +5,17 @@ Local Deep Research - A tool for conducting deep research using AI.
 __author__ = "Your Name"
 __description__ = "A tool for conducting deep research using AI"
+from loguru import logger
 from .__version__ import __version__
 from .config.llm_config import get_llm
 from .config.search_config import get_search
 from .report_generator import get_report_generator
 from .web.app import main
+# Disable logging by default to not interfere with user setup.
+logger.disable("local_deep_research")
 def get_advanced_search_system(strategy_name: str = "iterdrag"):
     """
@@ -32,4 +37,6 @@ __all__ = [
     "get_search",
     "get_report_generator",
     "get_advanced_search_system",
+    "main",
+    "__version__",
 ]

local_deep_research/__version__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.4.4"
1	+ __version__ = "0.5.0"

local_deep_research/advanced_search_system/answer_decoding/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Answer decoding module for BrowseComp."""
+from .browsecomp_answer_decoder import BrowseCompAnswerDecoder
+__all__ = ["BrowseCompAnswerDecoder"]

local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py ADDED Viewed

@@ -0,0 +1,421 @@
+"""
+BrowseComp Answer Decoding Pipeline
+This module handles encoded answers found in BrowseComp datasets.
+Some BrowseComp answers appear to be encoded (e.g., "Y00Qh+ep") and need
+decoding to extract the actual answer.
+Based on BROWSECOMP_IMPROVEMENT_STRATEGY.md recommendations.
+"""
+import base64
+import logging
+import re
+import urllib.parse
+from typing import Optional, Tuple
# NOTE(review): the package __init__ sets up loguru, while this module logs
# through the stdlib ``logging`` module -- confirm that this is intended.
logger = logging.getLogger(__name__)


class BrowseCompAnswerDecoder:
    """
    Handle encoded BrowseComp answers with multiple decoding schemes.

    Features:
    1. Automatic encoding detection
    2. Multiple decoding scheme support
    3. Answer validation
    4. Fallback to original if decoding fails

    Attributes:
        encoding_schemes: Scheme names tried, in order, by ``decode_answer``.
        encoded_patterns: Regexes whose match suggests an encoded answer.
    """

    # Labels reported by ``analyze_answer_encoding``; positionally paired with
    # the default ``encoded_patterns`` built in ``__init__``.
    _PATTERN_LABELS = ("base64", "hex", "url", "random")

    # Whole words that mark an answer as ordinary English text.
    _ENGLISH_INDICATORS = frozenset(
        {
            "the",
            "and",
            "or",
            "of",
            "in",
            "to",
            "a",
            "an",
            "company",
            "group",
            "inc",
            "ltd",
            "corp",
            "corporation",
            "person",
            "people",
            "event",
            "year",
            "years",
            "million",
            "billion",
            "thousand",
        }
    )

    # Whole words that earn a bonus in ``_english_score``.
    _COMMON_WORDS = frozenset(
        {"the", "and", "of", "to", "a", "in", "is", "it", "you", "that"}
    )

    # Shapes of typical short, unencoded answers.
    _PLAINTEXT_PATTERNS = (
        r"^\d{4}$",  # Year
        r"^\$?\d+\.?\d*[KMB]?$",  # Number/money
        r"^[A-Z][a-z]+ [A-Z][a-z]+$",  # Name format
        r"^\d+%$",  # Percentage
    )

    # Punctuation trimmed from a token before comparing it to a word list.
    _TOKEN_TRIM = ".,;:!?\"'()[]{}"

    # Letters ordered from most to least common in English (approximate).
    _LETTERS_BY_FREQUENCY = "etaoinshrdlcumwfgypbvkjxqz"

    def __init__(self):
        self.encoding_schemes = [
            "base64",
            "hex",
            "url_encoding",
            "rot13",
            "caesar_cipher",
        ]
        # Patterns that suggest encoded content
        self.encoded_patterns = [
            r"^[A-Za-z0-9+/]+=*$",  # Base64 pattern
            r"^[0-9A-Fa-f]+$",  # Hex pattern
            r"%[0-9A-Fa-f]{2}",  # URL encoded
            r"^[A-Za-z0-9]{8,}$",  # Random string pattern
        ]

    def decode_answer(self, raw_answer: str) -> Tuple[str, Optional[str]]:
        """
        Attempt to decode a potentially encoded answer.

        Args:
            raw_answer: The raw answer string that may be encoded

        Returns:
            Tuple of (decoded_answer, encoding_scheme_used).
            If no decoding works, returns (original_answer, None); the
            original is returned stripped unless it was empty/blank, in which
            case it is returned untouched.
        """
        if not raw_answer or len(raw_answer.strip()) == 0:
            return raw_answer, None

        clean_answer = raw_answer.strip()

        if self.is_likely_direct_answer(clean_answer):
            logger.debug("Answer appears to be plaintext: %s", clean_answer)
            return clean_answer, None

        logger.info(
            "Attempting to decode potentially encoded answer: %s", clean_answer
        )

        for scheme in self.encoding_schemes:
            try:
                decoded = self.apply_decoding_scheme(clean_answer, scheme)
                accepted = bool(decoded) and self.validate_decoded_answer(
                    decoded
                )
            except Exception as exc:
                logger.debug("Failed to decode with %s: %s", scheme, exc)
                continue

            if not accepted:
                continue

            if decoded == clean_answer:
                # A scheme that hands the text back unchanged (URL-unquoting a
                # string without escapes) decoded nothing, so it must not be
                # reported as the scheme used. The text already validates as a
                # readable answer, so stop here rather than pushing it through
                # the substitution ciphers, whose heuristics would scramble it.
                break

            logger.info(
                "Successfully decoded using %s: %s -> %s",
                scheme,
                clean_answer,
                decoded,
            )
            return decoded, scheme

        logger.warning(
            "Could not decode answer, returning original: %s", clean_answer
        )
        return clean_answer, None

    def is_likely_direct_answer(self, answer: str) -> bool:
        """
        Check if answer looks like plaintext rather than encoded.

        Args:
            answer: The answer string to check

        Returns:
            True if answer appears to be plaintext
        """
        # Very short answers are likely plaintext
        if len(answer) < 4:
            return True

        # Compare whole whitespace-separated tokens. Substring matching made
        # every answer containing the letter "a" (or "in", "or", "to", ...)
        # count as plaintext, so such encoded answers were never decoded.
        tokens = {
            token.strip(self._TOKEN_TRIM) for token in answer.lower().split()
        }
        if tokens & self._ENGLISH_INDICATORS:
            return True

        # Has spaces and multiple words - likely plaintext
        if " " in answer and len(answer.split()) > 1:
            return True

        if any(re.match(p, answer) for p in self._PLAINTEXT_PATTERNS):
            return True

        # Low character diversity is treated as a sign of encoded/repetitive
        # content. len(answer) >= 4 here, so the division is safe.
        if len(set(answer)) / len(answer) < 0.3:
            return False

        # If none of the encoded patterns match, probably plaintext
        return not any(re.search(p, answer) for p in self.encoded_patterns)

    def apply_decoding_scheme(self, text: str, scheme: str) -> Optional[str]:
        """
        Apply a specific decoding scheme to text.

        Args:
            text: Text to decode
            scheme: Decoding scheme to use

        Returns:
            Decoded text or None if decoding fails or the scheme is unknown
        """
        decoders = {
            "base64": self._decode_base64,
            "hex": self._decode_hex,
            "url_encoding": self._decode_url,
            "rot13": self._decode_rot13,
            "caesar_cipher": self._decode_caesar,
        }
        decoder = decoders.get(scheme)
        if decoder is None:
            logger.warning("Unknown decoding scheme: %s", scheme)
            return None
        try:
            return decoder(text)
        except Exception as exc:
            # Boundary for the best-effort decoders: never let one escape.
            logger.debug("Failed to apply %s decoding: %s", scheme, exc)
            return None

    def _decode_base64(self, text: str) -> Optional[str]:
        """Decode base64 text to UTF-8, or return None if it is not base64."""
        try:
            compact = "".join(text.split())
            remainder = len(compact) % 4
            if remainder == 1:
                # No amount of padding makes this a valid base64 length.
                return None
            if remainder:
                compact += "=" * (4 - remainder)
            # validate=True: reject characters outside the alphabet instead of
            # silently dropping them and decoding whatever is left.
            return base64.b64decode(compact, validate=True).decode("utf-8")
        except (ValueError, TypeError, AttributeError):
            return None

    def _decode_hex(self, text: str) -> Optional[str]:
        """Decode hexadecimal text to UTF-8, or return None if it is not hex."""
        try:
            digits = "".join(text.split())
            # Only whitespace is tolerated. Stripping arbitrary characters
            # (as before) turned strings such as "x41y42" into "AB".
            if len(digits) % 2 != 0 or not re.fullmatch(
                r"[0-9A-Fa-f]+", digits
            ):
                return None
            return bytes.fromhex(digits).decode("utf-8")
        except (ValueError, TypeError, AttributeError):
            return None

    def _decode_url(self, text: str) -> Optional[str]:
        """
        Decode URL (percent) encoded text.

        Text without escapes comes back unchanged; ``decode_answer`` treats
        that as "nothing decoded".
        """
        try:
            return urllib.parse.unquote(text)
        except (ValueError, TypeError, AttributeError):
            return None

    def _decode_rot13(self, text: str) -> Optional[str]:
        """Decode ROT13 encoded text."""
        try:
            import codecs

            return codecs.decode(text, "rot13")
        except (ValueError, TypeError, LookupError):
            return None

    def _decode_caesar(self, text: str) -> Optional[str]:
        """
        Try every Caesar shift (1-25) and return the most English-like result.

        Returns None when even the best candidate scores 0.3 or lower.

        NOTE(review): the score is a letter-frequency heuristic and cannot
        reliably tell a real decoding from shifted plaintext.
        """
        best_result = None
        best_score = 0.0
        for shift in range(1, 26):
            candidate = self._caesar_shift(text, shift)
            score = self._english_score(candidate)
            # Strict ">" keeps the smallest shift among equal scores.
            if score > best_score:
                best_score = score
                best_result = candidate
        return best_result if best_score > 0.3 else None

    def _caesar_shift(self, text: str, shift: int) -> str:
        """
        Shift ASCII letters forward by ``shift`` places, wrapping within a-z/A-Z.

        Everything else is copied through unchanged, including non-ASCII
        letters, which have no position in the 26-letter alphabet.
        """
        shifted = []
        for char in text:
            if "a" <= char <= "z":
                base = ord("a")
            elif "A" <= char <= "Z":
                base = ord("A")
            else:
                shifted.append(char)
                continue
            shifted.append(chr((ord(char) - base + shift) % 26 + base))
        return "".join(shifted)

    def _english_score(self, text: str) -> float:
        """
        Score how English-like a text appears, in the range 0.0-1.0.

        Each letter earns 2 points if it is among the 10 most common English
        letters and 1 point if among the next 10; the total is divided by the
        maximum possible (2 per letter). Each common word present as a whole
        word adds 0.1. Text without letters scores 0.0.
        """
        if not text:
            return 0.0

        text_lower = text.lower()
        top_ten = self._LETTERS_BY_FREQUENCY[:10]
        top_twenty = self._LETTERS_BY_FREQUENCY[:20]

        letter_count = 0
        letter_score = 0
        for char in text_lower:
            if not char.isalpha():
                continue
            letter_count += 1
            if char in top_ten:
                letter_score += 2
            elif char in top_twenty:
                letter_score += 1

        if letter_count == 0:
            return 0.0

        # Normalise by the maximum attainable score. Without this the value
        # ranged 0-2 and the final clamp flattened most text to exactly 1.0.
        base_score = letter_score / (2 * letter_count)

        words = set(re.findall(r"[a-z]+", text_lower))
        word_bonus = len(words & self._COMMON_WORDS)
        return min(1.0, base_score + word_bonus * 0.1)

    def validate_decoded_answer(self, decoded: str) -> bool:
        """
        Validate that decoded text looks like a reasonable answer.

        Args:
            decoded: The decoded text to validate

        Returns:
            True if decoded text appears valid: non-blank, at most 1000
            characters after stripping, at least 80% printable, free of
            control characters other than tab/newline/carriage return, at
            least 30% letters and at most 50% punctuation.
        """
        if not decoded or len(decoded.strip()) == 0:
            return False

        decoded = decoded.strip()
        total_chars = len(decoded)
        if total_chars > 1000:
            return False

        printable_count = sum(1 for c in decoded if c.isprintable())
        if printable_count / total_chars < 0.8:
            return False

        # Control characters are a bad sign
        if any(ord(c) < 32 and c not in "\t\n\r" for c in decoded):
            return False

        alpha_count = sum(1 for c in decoded if c.isalpha())
        if alpha_count / total_chars < 0.3:
            return False

        punct_count = sum(
            1 for c in decoded if not c.isalnum() and not c.isspace()
        )
        return punct_count / total_chars <= 0.5

    def analyze_answer_encoding(self, answer: str) -> dict:
        """
        Analyze an answer to determine likely encoding type.

        Args:
            answer: The answer string to analyze

        Returns:
            Dict with keys "original", "length", "likely_plaintext",
            "pattern_matches" (matching patterns with a type label) and
            "attempted_decodings" (per-scheme result, validity and length, or
            an "error" message). Intended for debugging/logging.
        """
        analysis = {
            "original": answer,
            "length": len(answer),
            "likely_plaintext": self.is_likely_direct_answer(answer),
            "pattern_matches": [],
            "attempted_decodings": {},
        }

        labels = self._PATTERN_LABELS
        for index, pattern in enumerate(self.encoded_patterns):
            if not re.search(pattern, answer):
                continue
            # Patterns appended to encoded_patterns have no predefined label.
            label = labels[index] if index < len(labels) else "unknown"
            analysis["pattern_matches"].append(
                {"pattern": pattern, "type": label}
            )

        for scheme in self.encoding_schemes:
            try:
                decoded = self.apply_decoding_scheme(answer, scheme)
                is_valid = (
                    self.validate_decoded_answer(decoded) if decoded else False
                )
                analysis["attempted_decodings"][scheme] = {
                    "decoded": decoded,
                    "valid": is_valid,
                    "length": len(decoded) if decoded else 0,
                }
            except Exception as exc:
                analysis["attempted_decodings"][scheme] = {"error": str(exc)}

        return analysis

local_deep_research/advanced_search_system/candidate_exploration/README.md ADDED Viewed

@@ -0,0 +1,219 @@
+# Candidate Exploration System
+This module provides an inheritance-based candidate exploration system for discovering and collecting candidates in the Local Deep Research framework.
+## Architecture
+The system is built around inheritance and provides multiple exploration strategies:
+### Base Class
+- **`BaseCandidateExplorer`**: Abstract base class defining the exploration interface
+### Concrete Implementations
+- **`ParallelExplorer`**: Runs multiple searches in parallel for speed and breadth
+- **`AdaptiveExplorer`**: Learns which search strategies work best and adapts
+- **`ConstraintGuidedExplorer`**: Uses constraints to guide the exploration process
+- **`DiversityExplorer`**: Prioritizes finding diverse candidates across categories
+### Supporting Components
+- **`ExplorationResult`**: Data class containing exploration results and metadata
+- **`ExplorationStrategy`**: Enum defining different exploration approaches
+## Usage Examples
+### Using ParallelExplorer
+```python
+from candidate_exploration import ParallelExplorer
+explorer = ParallelExplorer(
+    model=llm,
+    search_engine=search,
+    max_workers=5,           # Parallel search threads
+    queries_per_round=8,     # Queries generated per round
+    max_rounds=3             # Maximum exploration rounds
+)
+result = explorer.explore(
+    initial_query="hiking locations",
+    constraints=constraints,
+    entity_type="location"
+)
+```
+### Using AdaptiveExplorer
+```python
+from candidate_exploration import AdaptiveExplorer
+explorer = AdaptiveExplorer(
+    model=llm,
+    search_engine=search,
+    initial_strategies=["direct_search", "synonym_expansion", "category_exploration"],
+    adaptation_threshold=5   # Adapt after 5 searches
+)
+result = explorer.explore("scenic viewpoints", constraints, "viewpoint")
+```
+### Using ConstraintGuidedExplorer
+```python
+from candidate_exploration import ConstraintGuidedExplorer
+explorer = ConstraintGuidedExplorer(
+    model=llm,
+    search_engine=search,
+    constraint_weight_threshold=0.7,  # Focus on high-weight constraints
+    early_validation=True              # Validate during exploration
+)
+result = explorer.explore("mountain peaks", constraints, "mountain")
+```
+### Using DiversityExplorer
+```python
+from candidate_exploration import DiversityExplorer
+explorer = DiversityExplorer(
+    model=llm,
+    search_engine=search,
+    diversity_threshold=0.7,    # Minimum diversity score
+    category_limit=10,          # Max per category
+    similarity_threshold=0.8    # Similarity threshold
+)
+result = explorer.explore("natural landmarks", constraints, "landmark")
+```
+## Creating Custom Variants
+To create your own exploration strategy:
+1. **Inherit from BaseCandidateExplorer**:
+```python
+from .base_explorer import BaseCandidateExplorer, ExplorationResult
+class MyCustomExplorer(BaseCandidateExplorer):
+    def __init__(self, *args, my_param=0.5, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.my_param = my_param
+```
+2. **Implement required methods**:
+```python
+    def explore(self, initial_query, constraints=None, entity_type=None):
+        # Your exploration implementation
+        return ExplorationResult(...)
+    def generate_exploration_queries(self, base_query, found_candidates, constraints=None):
+        # Your query generation logic
+        return ["query1", "query2", "query3"]
+```
+3. **Add custom exploration logic**:
+```python
+    def _my_custom_search_strategy(self, query, context):
+        # Your custom search approach
+        pass
+```
+## Integration with Strategies
+Use in your strategy by initializing the explorer:
+```python
+class MyStrategy(BaseStrategy):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Choose your explorer
+        self.explorer = AdaptiveExplorer(
+            model=self.model,
+            search_engine=self.search,
+            max_candidates=50,
+            max_search_time=120.0
+        )
+    def find_candidates(self, query, constraints):
+        result = self.explorer.explore(
+            initial_query=query,
+            constraints=constraints,
+            entity_type=self._detect_entity_type(query)
+        )
+        return result.candidates
+```
+## Available Explorers
+### ParallelExplorer
+- **Best for**: Fast, broad candidate discovery
+- **Strategy**: Breadth-first parallel search
+- **Parameters**: `max_workers`, `queries_per_round`, `max_rounds`
+- **Output**: Many candidates found quickly
+### AdaptiveExplorer
+- **Best for**: Learning optimal search approaches
+- **Strategy**: Adapts based on search success
+- **Parameters**: `initial_strategies`, `adaptation_threshold`
+- **Output**: Candidates found using best-performing strategies
+### ConstraintGuidedExplorer
+- **Best for**: Constraint-driven discovery
+- **Strategy**: Constraint-guided search prioritization
+- **Parameters**: `constraint_weight_threshold`, `early_validation`
+- **Output**: Candidates likely to satisfy constraints
+### DiversityExplorer
+- **Best for**: Diverse candidate sets
+- **Strategy**: Diversity-focused exploration
+- **Parameters**: `diversity_threshold`, `category_limit`, `similarity_threshold`
+- **Output**: Diverse candidates across categories
+## ExplorationResult Structure
+```python
+@dataclass
+class ExplorationResult:
+    candidates: List[Candidate]           # Found candidates
+    total_searched: int                   # Number of searches performed
+    unique_candidates: int                # Number of unique candidates
+    exploration_paths: List[str]          # Search path descriptions
+    metadata: Dict                        # Strategy-specific metadata
+    elapsed_time: float                   # Time taken for exploration
+    strategy_used: ExplorationStrategy    # Strategy that was used
+```
+## Performance Considerations
+### Speed vs. Quality Trade-offs
+- **ParallelExplorer**: Fastest, good breadth
+- **AdaptiveExplorer**: Medium speed, learns over time
+- **ConstraintGuidedExplorer**: Medium speed, higher constraint satisfaction
+- **DiversityExplorer**: Slower, but most diverse results
+### Memory Usage
+- All explorers track found candidates to avoid duplicates
+- Large candidate sets may use significant memory
+- Consider using `max_candidates` parameter to limit memory usage
+### Search Engine Load
+- Parallel explorers generate more concurrent search requests
+- Consider rate limiting or using fewer `max_workers`
+- Monitor search engine response times
+## Extending the System
+The inheritance-based design makes it easy to:
+1. **Create domain-specific explorers** (e.g., GeoExplorer, PersonExplorer)
+2. **Combine exploration strategies** (e.g., parallel + adaptive)
+3. **Add new search patterns** and query generation methods
+4. **Implement caching strategies** for discovered candidates
+5. **Add quality scoring** for candidate ranking
+## Best Practices
+1. **Choose the right explorer** for your use case
+2. **Set appropriate limits** (`max_candidates`, `max_search_time`)
+3. **Provide good constraints** when using ConstraintGuidedExplorer
+4. **Monitor diversity scores** when using DiversityExplorer
+5. **Let AdaptiveExplorer learn** over multiple runs for best results

local_deep_research/advanced_search_system/candidate_exploration/__init__.py ADDED Viewed

@@ -0,0 +1,25 @@
+"""
+Candidate exploration system for discovering and refining candidates.
+This module provides inheritance-based components for exploring and discovering
+candidates through different search strategies and approaches.
+"""
+from .adaptive_explorer import AdaptiveExplorer
+from .base_explorer import BaseCandidateExplorer, ExplorationResult
+from .constraint_guided_explorer import ConstraintGuidedExplorer
+from .diversity_explorer import DiversityExplorer
+from .parallel_explorer import ParallelExplorer
+from .progressive_explorer import ProgressiveExplorer
+__all__ = [
+    # Base classes
+    "BaseCandidateExplorer",
+    "ExplorationResult",
+    # Concrete implementations
+    "ParallelExplorer",
+    "AdaptiveExplorer",
+    "ConstraintGuidedExplorer",
+    "DiversityExplorer",
+    "ProgressiveExplorer",
+]

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl