PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

local-deep-research 0.4.4py3-none-any.whl → 0.5.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py ADDED Viewed

@@ -0,0 +1,506 @@
+"""
+Parallel constrained search strategy with progressive constraint relaxation.
+Key improvements:
+1. Combines multiple constraints in initial searches
+2. Runs searches in parallel for efficiency
+3. Progressively loosens constraints if needed
+4. Compact design to minimize context usage
+"""
+import concurrent.futures
+from dataclasses import dataclass
+from typing import List
+from loguru import logger
+from ..candidates.base_candidate import Candidate
+from ..constraints.base_constraint import Constraint, ConstraintType
+from .constrained_search_strategy import ConstrainedSearchStrategy
+@dataclass
+class SearchCombination:
+    """Represents a combination of constraints for searching."""
+    constraints: List[Constraint]
+    query: str
+    priority: int
+    def __hash__(self):
+        return hash(self.query)
+class ParallelConstrainedStrategy(ConstrainedSearchStrategy):
+    """
+    Enhanced constrained strategy with parallel search and smart constraint combination.
+    """
+    def __init__(
+        self,
+        *args,
+        parallel_workers: int = 100,
+        min_results_threshold: int = 10,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.parallel_workers = parallel_workers
+        self.min_results_threshold = min_results_threshold
+        # Define hard constraints that must be satisfied
+        self.hard_constraint_keywords = [
+            "aired between",
+            "aired during",
+            "air date",
+            "broadcast",
+            "episodes",
+            "season",
+            "year",
+            "decade",
+            "male",
+            "female",
+            "gender",
+            "tv show",
+            "series",
+            "program",
+        ]
+    def _classify_constraints(self):
+        """Classify constraints into hard (must satisfy) and soft (scoring) categories."""
+        self.hard_constraints = []
+        self.soft_constraints = []
+        for constraint in self.constraint_ranking:
+            # Check if constraint is hard based on keywords and type
+            is_hard = False
+            # Temporal and statistic constraints are usually hard
+            if constraint.type in [
+                ConstraintType.TEMPORAL,
+                ConstraintType.STATISTIC,
+            ]:
+                is_hard = True
+            # Check for hard constraint keywords
+            constraint_text = constraint.value.lower()
+            for keyword in self.hard_constraint_keywords:
+                if keyword in constraint_text:
+                    is_hard = True
+                    break
+            if is_hard:
+                self.hard_constraints.append(constraint)
+            else:
+                self.soft_constraints.append(constraint)
+        logger.info(
+            f"Classified {len(self.hard_constraints)} hard constraints and {len(self.soft_constraints)} soft constraints"
+        )
+    def _progressive_constraint_search(self):
+        """Override parent method with parallel, combined constraint search."""
+        current_candidates = []
+        search_iterations = 0
+        max_search_iterations = 3
+        # Check if constraint_ranking is available
+        if (
+            not hasattr(self, "constraint_ranking")
+            or not self.constraint_ranking
+        ):
+            logger.error(
+                "No constraint ranking available - calling parent method"
+            )
+            return super()._progressive_constraint_search()
+        # Detect what type of entity we're looking for
+        self.entity_type = self._detect_entity_type()
+        logger.info(f"Detected entity type: {self.entity_type}")
+        logger.info(
+            f"Starting parallel constraint search with {len(self.constraint_ranking)} constraints"
+        )
+        logger.info(
+            f"Constraint ranking: {[c.value for c in self.constraint_ranking[:5]]}"
+        )
+        while search_iterations < max_search_iterations:
+            search_iterations += 1
+            # Phase 1: Combined constraints (strict)
+            if search_iterations == 1:
+                combinations = self._create_strict_combinations()
+                strictness = "strict"
+            # Phase 2: Relaxed combinations
+            elif search_iterations == 2:
+                combinations = self._create_relaxed_combinations()
+                strictness = "relaxed"
+            # Phase 3: Individual constraints (fallback)
+            else:
+                combinations = self._create_individual_combinations()
+                strictness = "individual"
+            logger.info(
+                f"Iteration {search_iterations}: {strictness} mode with {len(combinations)} combinations"
+            )
+            # Log the actual combinations
+            for i, combo in enumerate(combinations):
+                logger.info(
+                    f"  Combination {i + 1}: query='{combo.query[:60]}...', constraints={len(combo.constraints)}"
+                )
+            if self.progress_callback:
+                self.progress_callback(
+                    f"Search iteration {search_iterations}: {strictness} mode ({len(combinations)} combinations)",
+                    15 + (search_iterations * 25),
+                    {
+                        "phase": "parallel_search",
+                        "iteration": search_iterations,
+                        "combinations": len(combinations),
+                        "mode": strictness,
+                    },
+                )
+            # Run searches in parallel
+            new_candidates = self._parallel_search(combinations)
+            current_candidates.extend(new_candidates)
+            # Check if we have enough results
+            unique_candidates = self._deduplicate_candidates(current_candidates)
+            if len(unique_candidates) >= self.min_results_threshold:
+                if self.progress_callback:
+                    self.progress_callback(
+                        f"Found {len(unique_candidates)} candidates - stopping search",
+                        90,
+                        {
+                            "phase": "search_complete",
+                            "candidates": len(unique_candidates),
+                        },
+                    )
+                break
+            if self.progress_callback:
+                self.progress_callback(
+                    f"Found {len(unique_candidates)} candidates - continuing search",
+                    None,
+                    {
+                        "phase": "search_continue",
+                        "candidates": len(unique_candidates),
+                    },
+                )
+        self.candidates = unique_candidates[: self.candidate_limit]
+        # Add stage tracking for parent class compatibility
+        self.stage_candidates = {
+            0: self.candidates,  # Final results as last stage
+        }
+    def _create_strict_combinations(self) -> List[SearchCombination]:
+        """Create initial strict constraint combinations."""
+        combinations = []
+        # Group constraints by type for better combination
+        by_type = {}
+        for c in self.constraint_ranking[:6]:  # Top 6 constraints
+            if c.type not in by_type:
+                by_type[c.type] = []
+            by_type[c.type].append(c)
+        # Strategy 1: Combine most restrictive constraints
+        if len(self.constraint_ranking) >= 2:
+            top_two = self.constraint_ranking[:2]
+            query = self._build_query(top_two)
+            combinations.append(SearchCombination(top_two, query, 1))
+        # Strategy 2: Combine temporal + property constraints
+        temporal = [
+            c
+            for c in self.constraint_ranking
+            if c.type in [ConstraintType.EVENT, ConstraintType.TEMPORAL]
+        ]
+        properties = [
+            c
+            for c in self.constraint_ranking
+            if c.type == ConstraintType.PROPERTY
+        ]
+        if temporal and properties:
+            combined = temporal[:1] + properties[:1]
+            query = self._build_query(combined)
+            combinations.append(SearchCombination(combined, query, 2))
+        # Strategy 3: Combine statistic + property
+        stats = [
+            c
+            for c in self.constraint_ranking
+            if c.type == ConstraintType.STATISTIC
+        ]
+        if stats and properties:
+            combined = stats[:1] + properties[:2]
+            query = self._build_query(combined)
+            combinations.append(SearchCombination(combined, query, 3))
+        return combinations[:5]  # Limit to 5 combinations
+    def _create_relaxed_combinations(self) -> List[SearchCombination]:
+        """Create relaxed constraint combinations."""
+        combinations = []
+        # Use single most restrictive constraints
+        for i, constraint in enumerate(self.constraint_ranking[:3]):
+            query = self._build_query([constraint])
+            combinations.append(SearchCombination([constraint], query, i + 10))
+        # Combine weaker constraints
+        if len(self.constraint_ranking) > 3:
+            weaker = self.constraint_ranking[3:6]
+            query = self._build_query(weaker)
+            combinations.append(SearchCombination(weaker, query, 20))
+        return combinations
+    def _create_individual_combinations(self) -> List[SearchCombination]:
+        """Create individual constraint searches as fallback."""
+        combinations = []
+        for i, constraint in enumerate(self.constraint_ranking[:5]):
+            # Create multiple query variations
+            queries = self._generate_query_variations(constraint)
+            for j, query in enumerate(
+                queries[:2]
+            ):  # 2 variations per constraint
+                combinations.append(
+                    SearchCombination([constraint], query, i * 10 + j + 30)
+                )
+        return combinations
+    def _build_query(self, constraints: List[Constraint]) -> str:
+        """Build an optimized query from constraints."""
+        terms = []
+        # Use entity type to add context
+        entity_type = getattr(self, "entity_type", None)
+        if entity_type and entity_type != "unknown entity":
+            # Add entity type as a search term
+            terms.append(f'"{entity_type}"')
+        for c in constraints:
+            # Add quotes for multi-word values
+            value = c.value
+            if " " in value and not value.startswith('"'):
+                value = f'"{value}"'
+            terms.append(value)
+        # Join with AND for strict matching
+        return " AND ".join(terms)
+    def _generate_query_variations(self, constraint: Constraint) -> List[str]:
+        """Generate query variations for a single constraint."""
+        base = constraint.value
+        variations = [base]
+        # Add type-specific variations
+        if constraint.type == ConstraintType.STATISTIC:
+            variations.extend(
+                [f"list {base}", f"complete {base}", f"all {base}"]
+            )
+        elif constraint.type == ConstraintType.PROPERTY:
+            variations.extend(
+                [f"with {base}", f"featuring {base}", f"known for {base}"]
+            )
+        return variations[:3]  # Limit variations
+    def _parallel_search(
+        self, combinations: List[SearchCombination]
+    ) -> List[Candidate]:
+        """Execute searches in parallel."""
+        all_candidates = []
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=self.parallel_workers
+        ) as executor:
+            # Submit all searches
+            future_to_combo = {
+                executor.submit(self._execute_combination_search, combo): combo
+                for combo in combinations
+            }
+            # Process results as they complete
+            for i, future in enumerate(
+                concurrent.futures.as_completed(future_to_combo)
+            ):
+                combo = future_to_combo[future]
+                try:
+                    candidates = future.result()
+                    all_candidates.extend(candidates)
+                    if self.progress_callback:
+                        self.progress_callback(
+                            f"Completed search {i + 1}/{len(combinations)}: {len(candidates)} results",
+                            None,
+                            {
+                                "phase": "parallel_result",
+                                "query": combo.query[:50],
+                                "candidates": len(candidates),
+                                "total_so_far": len(all_candidates),
+                            },
+                        )
+                except Exception as e:
+                    logger.error(f"Search failed for {combo.query}: {e}")
+        return all_candidates
+    def _execute_combination_search(
+        self, combination: SearchCombination
+    ) -> List[Candidate]:
+        """Execute a single combination search."""
+        try:
+            results = self._execute_search(combination.query)
+            # Extract candidates using LLM
+            candidates = []
+            content = results.get("current_knowledge", "")
+            logger.info(
+                f"Search '{combination.query[:50]}...' returned {len(content)} chars of content"
+            )
+            if content and len(content) > 50:
+                # Always use LLM extraction for accuracy
+                extracted = self._extract_relevant_candidates(
+                    {"current_knowledge": content},
+                    combination.constraints[0]
+                    if combination.constraints
+                    else None,
+                )
+                candidates.extend(extracted)
+            logger.info(
+                f"Search '{combination.query[:30]}' found {len(candidates)} candidates"
+            )
+            return candidates
+        except Exception as e:
+            logger.error(f"Error in combination search: {e}", exc_info=True)
+            return []
+    def _quick_extract_candidates(
+        self, content: str, constraints: List[Constraint]
+    ) -> List[Candidate]:
+        """Extract candidates using LLM with entity type awareness."""
+        # Use the detected entity type if available
+        entity_type = getattr(self, "entity_type", "entity")
+        extraction_prompt = f"""
+From the following search result, extract {entity_type} names that might match the given constraints.
+Search result:
+{content}
+Constraints to consider:
+{chr(10).join(f"- {c.value}" for c in constraints)}
+Important:
+- Extract ONLY {entity_type} names
+- Do NOT include other types of entities
+- Focus on entities that could potentially match the constraints
+Return the {entity_type} names, one per line.
+"""
+        try:
+            response = self.model.invoke(extraction_prompt).content
+            candidates = []
+            for line in response.split("\n"):
+                name = line.strip()
+                if name and len(name) > 2:
+                    candidates.append(Candidate(name=name))
+            return candidates[:15]
+        except Exception as e:
+            logger.error(f"Entity extraction failed: {e}")
+            return []
+    def _validate_hard_constraints(
+        self, candidates: List[Candidate]
+    ) -> List[Candidate]:
+        """Filter candidates that don't meet hard constraints."""
+        if not self.hard_constraints or not candidates:
+            return candidates
+        entity_type = getattr(self, "entity_type", "entity")
+        validation_prompt = f"""
+Validate {entity_type} candidates against hard constraints.
+Hard constraints that MUST be satisfied:
+{chr(10).join(f"- {c.value}" for c in self.hard_constraints)}
+{entity_type} candidates to evaluate:
+{chr(10).join(f"- {c.name}" for c in candidates[:20])}
+Return ONLY the {entity_type} names that satisfy ALL hard constraints, one per line.
+Reject any candidates that:
+1. Are not actually a {entity_type}
+2. Do not satisfy ALL the hard constraints listed above
+Be strict - if there's doubt about a constraint being satisfied, reject the candidate."""
+        try:
+            response = self.model.invoke(validation_prompt).content
+            valid_names = [
+                line.strip() for line in response.split("\n") if line.strip()
+            ]
+            # Keep only candidates that passed validation
+            filtered = [c for c in candidates if c.name in valid_names]
+            logger.info(
+                f"Hard constraint validation: {len(candidates)} -> {len(filtered)} candidates"
+            )
+            return filtered
+        except Exception as e:
+            logger.error(f"Hard constraint validation failed: {e}")
+            return candidates[:10]  # Return top candidates if validation fails
+    def _detect_entity_type(self) -> str:
+        """Use LLM to detect what type of entity we're searching for."""
+        # Build context from constraints
+        constraint_text = "\n".join(
+            [f"- {c.value}" for c in self.constraint_ranking]
+        )
+        prompt = f"""
+Analyze these search constraints and determine what type of entity is being searched for:
+Constraints:
+{constraint_text}
+What is the primary entity type being searched for? Be specific.
+Examples of entity types (but you can choose any appropriate type):
+- fictional character
+- TV show
+- movie
+- actor/actress
+- historical figure
+- company
+- product
+- location
+- event
+Respond with just the entity type.
+"""
+        try:
+            entity_type = self.model.invoke(prompt).content.strip()
+            logger.info(f"LLM determined entity type: {entity_type}")
+            return entity_type
+        except Exception as e:
+            logger.error(f"Failed to detect entity type: {e}")
+            return "unknown entity"

local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py CHANGED Viewed

@@ -123,7 +123,9 @@ class ParallelSearchStrategy(BaseSearchStrategy):
         try:
             # Run each iteration
             for iteration in range(1, iterations_to_run + 1):
-                iteration_progress_base = 5 + (iteration - 1) * (70 / iterations_to_run)
+                iteration_progress_base = 5 + (iteration - 1) * (
+                    70 / iterations_to_run
+                )
                 self._update_progress(
                     f"Starting iteration {iteration}/{iterations_to_run}",
@@ -175,7 +177,9 @@ class ParallelSearchStrategy(BaseSearchStrategy):
                     # Generate follow-up questions based on accumulated knowledge if iterations > 2
                     use_knowledge = iterations_to_run > 2
-                    knowledge_for_questions = current_knowledge if use_knowledge else ""
+                    knowledge_for_questions = (
+                        current_knowledge if use_knowledge else ""
+                    )
                     context = f"""Current Knowledge: {knowledge_for_questions}
                     Iteration: {iteration} of {iterations_to_run}"""
@@ -219,7 +223,8 @@ class ParallelSearchStrategy(BaseSearchStrategy):
                     max_workers=len(all_questions)
                 ) as executor:
                     futures = [
-                        executor.submit(search_question, q) for q in all_questions
+                        executor.submit(search_question, q)
+                        for q in all_questions
                     ]
                     iteration_search_dict = {}
                     iteration_search_results = []
@@ -234,7 +239,7 @@ class ParallelSearchStrategy(BaseSearchStrategy):
                         iteration_search_dict[question] = search_results
                         self._update_progress(
-                            f"Completed search {i + 1} of {len(all_questions)}: {question[:30]}...",
+                            f"Completed search {i + 1} of {len(all_questions)}: {question[:500]}",
                             iteration_progress_base
                             + 10
                             + ((i + 1) / len(all_questions) * 30),
@@ -261,7 +266,10 @@ class ParallelSearchStrategy(BaseSearchStrategy):
                     self._update_progress(
                         f"Filtering search results for iteration {iteration}",
                         iteration_progress_base + 45,
-                        {"phase": "cross_engine_filtering", "iteration": iteration},
+                        {
+                            "phase": "cross_engine_filtering",
+                            "iteration": iteration,
+                        },
                     )
                     # Get the current link count (for indexing)
@@ -276,7 +284,9 @@ class ParallelSearchStrategy(BaseSearchStrategy):
                         start_index=existing_link_count,  # Start indexing after existing links
                     )
-                    links = extract_links_from_search_results(filtered_search_results)
+                    links = extract_links_from_search_results(
+                        filtered_search_results
+                    )
                     self.all_links_of_system.extend(links)
                     self._update_progress(
@@ -293,7 +303,9 @@ class ParallelSearchStrategy(BaseSearchStrategy):
                     iteration_search_results = filtered_search_results
                 else:
                     # Just extract links without filtering
-                    links = extract_links_from_search_results(iteration_search_results)
+                    links = extract_links_from_search_results(
+                        iteration_search_results
+                    )
                     self.all_links_of_system.extend(links)
                 # Add to all search results
@@ -303,11 +315,13 @@ class ParallelSearchStrategy(BaseSearchStrategy):
                 if self.include_text_content and iteration_search_results:
                     # For iteration > 1 with knowledge accumulation, use follow-up analysis
                     if iteration > 1 and iterations_to_run > 2:
-                        citation_result = self.citation_handler.analyze_followup(
-                            query,
-                            iteration_search_results,
-                            current_knowledge,
-                            len(self.all_links_of_system) - len(links),
+                        citation_result = (
+                            self.citation_handler.analyze_followup(
+                                query,
+                                iteration_search_results,
+                                current_knowledge,
+                                len(self.all_links_of_system) - len(links),
+                            )
                         )
                     else:
                         # For first iteration or without knowledge accumulation, use initial analysis
@@ -358,8 +372,10 @@ class ParallelSearchStrategy(BaseSearchStrategy):
             if self.include_text_content:
                 # Generate a final synthesis from all search results
                 if iterations_to_run > 1:
-                    final_citation_result = self.citation_handler.analyze_initial(
-                        query, all_search_results
+                    final_citation_result = (
+                        self.citation_handler.analyze_initial(
+                            query, all_search_results
+                        )
                     )
                     # Add null check for final_citation_result
                     if final_citation_result:
@@ -402,8 +418,10 @@ class ParallelSearchStrategy(BaseSearchStrategy):
             )
             # Format findings
-            formatted_findings = self.findings_repository.format_findings_to_text(
-                findings, synthesized_content
+            formatted_findings = (
+                self.findings_repository.format_findings_to_text(
+                    findings, synthesized_content
+                )
             )
         except Exception as e:

local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

local-deep-research 0.4.4py3-none-any.whl → 0.5.0py3-none-any.whl