local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +5 -3
- local_deep_research/web/database/models.py +51 -2
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +51 -61
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +227 -41
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +310 -103
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
- local_deep_research-0.5.0.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,550 @@
|
|
1
|
+
"""
|
2
|
+
Precision Extraction Citation Handler
|
3
|
+
|
4
|
+
This handler focuses on extracting precise, complete answers for SimpleQA-style questions.
|
5
|
+
It includes specialized extractors for:
|
6
|
+
- Full names (including middle names)
|
7
|
+
- Single answers when only one is requested
|
8
|
+
- Dimension-aware measurements
|
9
|
+
- Specific entities without extra information
|
10
|
+
"""
|
11
|
+
|
12
|
+
import re
|
13
|
+
from typing import Any, Dict, List, Union
|
14
|
+
|
15
|
+
from loguru import logger
|
16
|
+
|
17
|
+
from .base_citation_handler import BaseCitationHandler
|
18
|
+
|
19
|
+
|
20
|
+
class PrecisionExtractionHandler(BaseCitationHandler):
    """Citation handler optimized for precise answer extraction."""

    def __init__(self, *args, **kwargs):
        """Initialize the handler and precompile the answer-type regexes."""
        super().__init__(*args, **kwargs)

        # Raw pattern text and flags, keyed by the kind of answer each
        # pattern is meant to capture.
        pattern_specs = {
            "full_name": (r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})\b", 0),
            "year": (r"\b(19\d{2}|20\d{2})\b", 0),
            "number": (r"\b(\d+(?:\.\d+)?)\b", 0),
            "dimension": (
                r"(\d+(?:\.\d+)?)\s*(meters?|feet|inches|cm|km|miles?|m|ft|kg|pounds?|lbs?)",
                re.I,
            ),
            "score": (r"(\d+)\s*[-–]\s*(\d+)", 0),
            "percentage": (r"(\d+(?:\.\d+)?)\s*%", 0),
            "location": (r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b", 0),
        }
        # Compile once at construction; the extractors reuse these.
        self.answer_patterns = {
            kind: re.compile(pattern, flags)
            for kind, (pattern, flags) in pattern_specs.items()
        }
|
41
|
+
|
42
|
+
def analyze_initial(
|
43
|
+
self, query: str, search_results: Union[str, List[Dict]]
|
44
|
+
) -> Dict[str, Any]:
|
45
|
+
"""Initial analysis with precision extraction."""
|
46
|
+
documents = self._create_documents(search_results)
|
47
|
+
formatted_sources = self._format_sources(documents)
|
48
|
+
|
49
|
+
# Determine question type for targeted extraction
|
50
|
+
question_type = self._identify_question_type(query)
|
51
|
+
|
52
|
+
prompt = f"""Analyze the following information and provide a PRECISE answer to the question. Include citations using numbers in square brackets [1], [2], etc.
|
53
|
+
|
54
|
+
Question: {query}
|
55
|
+
Question Type: {question_type}
|
56
|
+
|
57
|
+
Sources:
|
58
|
+
{formatted_sources}
|
59
|
+
|
60
|
+
PRECISION INSTRUCTIONS:
|
61
|
+
1. Extract the EXACT answer as it appears in the sources
|
62
|
+
2. For names: Include FULL names with all parts (first, middle, last)
|
63
|
+
3. For numbers: Include exact values with units if present
|
64
|
+
4. For single-answer questions: Provide ONLY ONE answer, not multiple options
|
65
|
+
5. For dimensions: Specify the exact measurement type (height, length, width)
|
66
|
+
6. Citations should support the specific answer given
|
67
|
+
|
68
|
+
Format: Start with the direct, precise answer, then explain with citations."""
|
69
|
+
|
70
|
+
response = self.llm.invoke(prompt)
|
71
|
+
if not isinstance(response, str):
|
72
|
+
response = response.content
|
73
|
+
|
74
|
+
# Apply precision extraction if needed
|
75
|
+
response = self._apply_precision_extraction(
|
76
|
+
response, query, question_type, formatted_sources
|
77
|
+
)
|
78
|
+
|
79
|
+
return {"content": response, "documents": documents}
|
80
|
+
|
81
|
+
def analyze_followup(
|
82
|
+
self,
|
83
|
+
question: str,
|
84
|
+
search_results: Union[str, List[Dict]],
|
85
|
+
previous_knowledge: str,
|
86
|
+
nr_of_links: int,
|
87
|
+
) -> Dict[str, Any]:
|
88
|
+
"""Follow-up analysis with precision extraction."""
|
89
|
+
documents = self._create_documents(
|
90
|
+
search_results, nr_of_links=nr_of_links
|
91
|
+
)
|
92
|
+
formatted_sources = self._format_sources(documents)
|
93
|
+
|
94
|
+
question_type = self._identify_question_type(question)
|
95
|
+
|
96
|
+
# Extract key facts from previous knowledge
|
97
|
+
key_facts = self._extract_key_facts(previous_knowledge, question_type)
|
98
|
+
|
99
|
+
prompt = f"""Using the previous knowledge and new sources, provide a PRECISE answer to the question.
|
100
|
+
|
101
|
+
Previous Key Facts:
|
102
|
+
{key_facts}
|
103
|
+
|
104
|
+
Question: {question}
|
105
|
+
Question Type: {question_type}
|
106
|
+
|
107
|
+
New Sources:
|
108
|
+
{formatted_sources}
|
109
|
+
|
110
|
+
PRECISION REQUIREMENTS:
|
111
|
+
1. Build on previous knowledge to provide the MOST COMPLETE answer
|
112
|
+
2. If a full name was partially found before, complete it now
|
113
|
+
3. If multiple candidates exist, select the one with the MOST evidence
|
114
|
+
4. For measurements, ensure units and dimension types match the question
|
115
|
+
5. Reconcile any conflicts by choosing the most frequently cited answer
|
116
|
+
|
117
|
+
Provide the precise answer with citations."""
|
118
|
+
|
119
|
+
response = self.llm.invoke(prompt)
|
120
|
+
content = response.content
|
121
|
+
|
122
|
+
# Apply precision extraction
|
123
|
+
content = self._apply_precision_extraction(
|
124
|
+
content, question, question_type, formatted_sources
|
125
|
+
)
|
126
|
+
|
127
|
+
return {"content": content, "documents": documents}
|
128
|
+
|
129
|
+
def _identify_question_type(self, query: str) -> str:
|
130
|
+
"""Identify the type of question for targeted extraction."""
|
131
|
+
query_lower = query.lower()
|
132
|
+
|
133
|
+
# Name questions
|
134
|
+
if any(
|
135
|
+
phrase in query_lower
|
136
|
+
for phrase in ["full name", "name of", "who was", "who is"]
|
137
|
+
):
|
138
|
+
if "full name" in query_lower:
|
139
|
+
return "full_name"
|
140
|
+
return "name"
|
141
|
+
|
142
|
+
# Location questions
|
143
|
+
if any(
|
144
|
+
phrase in query_lower
|
145
|
+
for phrase in ["where", "location", "city", "country", "place"]
|
146
|
+
):
|
147
|
+
return "location"
|
148
|
+
|
149
|
+
# Temporal questions
|
150
|
+
if any(phrase in query_lower for phrase in ["when", "year", "date"]):
|
151
|
+
return "temporal"
|
152
|
+
|
153
|
+
# Numerical questions
|
154
|
+
if any(
|
155
|
+
phrase in query_lower
|
156
|
+
for phrase in ["how many", "how much", "number", "count"]
|
157
|
+
):
|
158
|
+
return "number"
|
159
|
+
|
160
|
+
# Score/result questions
|
161
|
+
if any(
|
162
|
+
phrase in query_lower
|
163
|
+
for phrase in ["score", "result", "final", "outcome"]
|
164
|
+
):
|
165
|
+
return "score"
|
166
|
+
|
167
|
+
# Dimension questions
|
168
|
+
if any(
|
169
|
+
phrase in query_lower
|
170
|
+
for phrase in [
|
171
|
+
"height",
|
172
|
+
"length",
|
173
|
+
"width",
|
174
|
+
"size",
|
175
|
+
"tall",
|
176
|
+
"long",
|
177
|
+
"wide",
|
178
|
+
]
|
179
|
+
):
|
180
|
+
return "dimension"
|
181
|
+
|
182
|
+
# Single answer questions
|
183
|
+
if query_lower.startswith("which") and "one" in query_lower:
|
184
|
+
return "single_choice"
|
185
|
+
|
186
|
+
return "general"
|
187
|
+
|
188
|
+
def _apply_precision_extraction(
|
189
|
+
self, content: str, query: str, question_type: str, sources: str
|
190
|
+
) -> str:
|
191
|
+
"""Apply precision extraction based on question type."""
|
192
|
+
|
193
|
+
# Check if content already has a good answer in the first line
|
194
|
+
# first_line = content.split(".")[0].strip() # Not currently used
|
195
|
+
|
196
|
+
if question_type == "full_name":
|
197
|
+
return self._extract_full_name(content, query, sources)
|
198
|
+
elif question_type == "name":
|
199
|
+
return self._extract_best_name(content, query, sources)
|
200
|
+
elif question_type == "single_choice":
|
201
|
+
return self._extract_single_answer(content, query, sources)
|
202
|
+
elif question_type == "dimension":
|
203
|
+
return self._extract_dimension(content, query, sources)
|
204
|
+
elif question_type == "score":
|
205
|
+
return self._extract_score(content, query, sources)
|
206
|
+
elif question_type == "temporal":
|
207
|
+
return self._extract_temporal(content, query, sources)
|
208
|
+
elif question_type == "number":
|
209
|
+
return self._extract_number(content, query, sources)
|
210
|
+
|
211
|
+
return content
|
212
|
+
|
213
|
+
def _extract_full_name(self, content: str, query: str, sources: str) -> str:
|
214
|
+
"""Extract complete full names."""
|
215
|
+
# First, use LLM to identify all name variations
|
216
|
+
extraction_prompt = f"""Find ALL variations of the person's name mentioned in the sources.
|
217
|
+
|
218
|
+
Question: {query}
|
219
|
+
|
220
|
+
Content: {content[:2000]}
|
221
|
+
Sources: {sources[:2000]}
|
222
|
+
|
223
|
+
List all name variations found:
|
224
|
+
1. Shortest version:
|
225
|
+
2. Longest/most complete version:
|
226
|
+
3. Most frequently mentioned version:
|
227
|
+
|
228
|
+
Which is the FULL name (including middle name if present)?"""
|
229
|
+
|
230
|
+
try:
|
231
|
+
extraction = self.llm.invoke(extraction_prompt).content
|
232
|
+
|
233
|
+
# Extract the identified full name
|
234
|
+
if "full name" in extraction.lower():
|
235
|
+
lines = extraction.split("\n")
|
236
|
+
for line in lines:
|
237
|
+
if "full name" in line.lower() or "longest" in line.lower():
|
238
|
+
# Extract name from this line
|
239
|
+
matches = self.answer_patterns["full_name"].findall(
|
240
|
+
line
|
241
|
+
)
|
242
|
+
if matches:
|
243
|
+
# Choose the longest match
|
244
|
+
full_name = max(
|
245
|
+
matches, key=lambda x: len(x.split())
|
246
|
+
)
|
247
|
+
return f"{full_name}. {content}"
|
248
|
+
|
249
|
+
# Fallback: find all names and pick the longest
|
250
|
+
all_names = self.answer_patterns["full_name"].findall(
|
251
|
+
content + " " + sources
|
252
|
+
)
|
253
|
+
if all_names:
|
254
|
+
# Group similar names and pick the longest variant
|
255
|
+
name_groups = {}
|
256
|
+
for name in all_names:
|
257
|
+
last_word = name.split()[-1]
|
258
|
+
if last_word not in name_groups:
|
259
|
+
name_groups[last_word] = []
|
260
|
+
name_groups[last_word].append(name)
|
261
|
+
|
262
|
+
# Find the group with the most complete name
|
263
|
+
best_name = ""
|
264
|
+
for group in name_groups.values():
|
265
|
+
longest_in_group = max(group, key=lambda x: len(x.split()))
|
266
|
+
if len(longest_in_group.split()) > len(best_name.split()):
|
267
|
+
best_name = longest_in_group
|
268
|
+
|
269
|
+
if best_name:
|
270
|
+
return f"{best_name}. {content}"
|
271
|
+
|
272
|
+
except Exception as e:
|
273
|
+
logger.error(f"Error in full name extraction: {e}")
|
274
|
+
|
275
|
+
return content
|
276
|
+
|
277
|
+
def _extract_single_answer(
|
278
|
+
self, content: str, query: str, sources: str
|
279
|
+
) -> str:
|
280
|
+
"""Extract a single answer when multiple options might be present."""
|
281
|
+
extraction_prompt = f"""The question asks for ONE specific answer. Extract ONLY that answer.
|
282
|
+
|
283
|
+
Question: {query}
|
284
|
+
Content: {content[:1500]}
|
285
|
+
|
286
|
+
Rules:
|
287
|
+
1. If multiple items are listed, identify which ONE actually answers the question
|
288
|
+
2. Look for the PRIMARY or FIRST mentioned item
|
289
|
+
3. Do not include alternatives or additional options
|
290
|
+
|
291
|
+
The single answer is:"""
|
292
|
+
|
293
|
+
try:
|
294
|
+
answer = self.llm.invoke(extraction_prompt).content.strip()
|
295
|
+
|
296
|
+
# Clean up the answer
|
297
|
+
answer = answer.split(",")[
|
298
|
+
0
|
299
|
+
].strip() # Take only first if comma-separated
|
300
|
+
answer = answer.split(" and ")[
|
301
|
+
0
|
302
|
+
].strip() # Take only first if "and"-separated
|
303
|
+
answer = answer.split(" or ")[
|
304
|
+
0
|
305
|
+
].strip() # Take only first if "or"-separated
|
306
|
+
|
307
|
+
return f"{answer}. {content}"
|
308
|
+
|
309
|
+
except Exception as e:
|
310
|
+
logger.error(f"Error in single answer extraction: {e}")
|
311
|
+
|
312
|
+
return content
|
313
|
+
|
314
|
+
def _extract_dimension(self, content: str, query: str, sources: str) -> str:
|
315
|
+
"""Extract specific dimensions with correct units and context awareness."""
|
316
|
+
# Enhanced dimension type detection
|
317
|
+
dimension_types = {
|
318
|
+
"height": ["height", "tall", "high", "elevation", "altitude"],
|
319
|
+
"length": ["length", "long", "distance", "reach", "span"],
|
320
|
+
"width": ["width", "wide", "breadth", "diameter"],
|
321
|
+
"depth": ["depth", "deep", "thickness"],
|
322
|
+
"weight": ["weight", "weigh", "heavy", "mass"],
|
323
|
+
"speed": ["speed", "fast", "velocity", "mph", "kmh"],
|
324
|
+
"area": ["area", "square"],
|
325
|
+
"volume": ["volume", "cubic"],
|
326
|
+
}
|
327
|
+
|
328
|
+
query_lower = query.lower()
|
329
|
+
dimension_type = None
|
330
|
+
dimension_keywords = []
|
331
|
+
|
332
|
+
# Find the most specific dimension type
|
333
|
+
for dim_type, keywords in dimension_types.items():
|
334
|
+
matching_keywords = [kw for kw in keywords if kw in query_lower]
|
335
|
+
if matching_keywords:
|
336
|
+
dimension_type = dim_type
|
337
|
+
dimension_keywords = matching_keywords
|
338
|
+
break
|
339
|
+
|
340
|
+
extraction_prompt = f"""Extract the EXACT measurement that answers this question.
|
341
|
+
|
342
|
+
Question: {query}
|
343
|
+
Content: {content[:1500]}
|
344
|
+
|
345
|
+
Rules:
|
346
|
+
1. Find the specific {dimension_type or "dimension"} measurement
|
347
|
+
2. Return ONLY the number and unit (e.g., "20 meters", "5.5 feet")
|
348
|
+
3. Distinguish between different types of measurements:
|
349
|
+
- Height/tall: vertical measurements
|
350
|
+
- Length/long: horizontal distance
|
351
|
+
- Width/wide: horizontal breadth
|
352
|
+
4. Look for context clues near the measurement
|
353
|
+
5. If multiple measurements, choose the one that matches the question type
|
354
|
+
|
355
|
+
The exact {dimension_type or "dimension"} is:"""
|
356
|
+
|
357
|
+
try:
|
358
|
+
answer = self.llm.invoke(extraction_prompt).content.strip()
|
359
|
+
|
360
|
+
# Clean and validate the answer
|
361
|
+
import re
|
362
|
+
|
363
|
+
measurement_match = re.search(
|
364
|
+
r"(\d+(?:\.\d+)?)\s*([a-zA-Z/°]+)", answer
|
365
|
+
)
|
366
|
+
if measurement_match:
|
367
|
+
number, unit = measurement_match.groups()
|
368
|
+
clean_answer = f"{number} {unit}"
|
369
|
+
return f"{clean_answer}. {content}"
|
370
|
+
|
371
|
+
# Fallback: intelligent pattern matching
|
372
|
+
all_dimensions = self.answer_patterns["dimension"].findall(
|
373
|
+
content + " " + sources
|
374
|
+
)
|
375
|
+
if all_dimensions:
|
376
|
+
# Score dimensions based on context and dimension type
|
377
|
+
scored_dimensions = []
|
378
|
+
|
379
|
+
for dim in all_dimensions:
|
380
|
+
number, unit = dim
|
381
|
+
dim_str = f"{number} {unit}"
|
382
|
+
score = 0
|
383
|
+
|
384
|
+
# Find the dimension in content
|
385
|
+
pos = content.find(dim_str)
|
386
|
+
if pos >= 0:
|
387
|
+
# Get context around this measurement
|
388
|
+
context = content[max(0, pos - 100) : pos + 100].lower()
|
389
|
+
|
390
|
+
# Score based on dimension keywords in context
|
391
|
+
for keyword in dimension_keywords:
|
392
|
+
if keyword in context:
|
393
|
+
score += 10
|
394
|
+
|
395
|
+
# Score based on unit appropriateness
|
396
|
+
unit_lower = unit.lower()
|
397
|
+
if dimension_type == "height" and any(
|
398
|
+
u in unit_lower
|
399
|
+
for u in ["m", "meter", "ft", "feet", "cm"]
|
400
|
+
):
|
401
|
+
score += 5
|
402
|
+
elif dimension_type == "length" and any(
|
403
|
+
u in unit_lower
|
404
|
+
for u in ["m", "meter", "km", "mile", "ft"]
|
405
|
+
):
|
406
|
+
score += 5
|
407
|
+
elif dimension_type == "weight" and any(
|
408
|
+
u in unit_lower
|
409
|
+
for u in ["kg", "lb", "pound", "gram", "ton"]
|
410
|
+
):
|
411
|
+
score += 5
|
412
|
+
elif dimension_type == "speed" and any(
|
413
|
+
u in unit_lower
|
414
|
+
for u in ["mph", "kmh", "km/h", "m/s"]
|
415
|
+
):
|
416
|
+
score += 5
|
417
|
+
|
418
|
+
# Prefer measurements closer to the beginning (more likely to be primary)
|
419
|
+
score += max(0, 5 - (pos / 100))
|
420
|
+
|
421
|
+
scored_dimensions.append((score, dim_str))
|
422
|
+
|
423
|
+
# Return the highest scoring dimension
|
424
|
+
if scored_dimensions:
|
425
|
+
scored_dimensions.sort(key=lambda x: x[0], reverse=True)
|
426
|
+
best_dimension = scored_dimensions[0][1]
|
427
|
+
return f"{best_dimension}. {content}"
|
428
|
+
|
429
|
+
# Final fallback: first dimension
|
430
|
+
return (
|
431
|
+
f"{all_dimensions[0][0]} {all_dimensions[0][1]}. {content}"
|
432
|
+
)
|
433
|
+
|
434
|
+
except Exception as e:
|
435
|
+
logger.error(f"Error in dimension extraction: {e}")
|
436
|
+
|
437
|
+
return content
|
438
|
+
|
439
|
+
def _extract_score(self, content: str, query: str, sources: str) -> str:
|
440
|
+
"""Extract game scores or results."""
|
441
|
+
# Find all score patterns
|
442
|
+
scores = self.answer_patterns["score"].findall(content + " " + sources)
|
443
|
+
|
444
|
+
if scores:
|
445
|
+
# Use LLM to identify the correct score
|
446
|
+
extraction_prompt = f"""Which score/result answers this question?
|
447
|
+
|
448
|
+
Question: {query}
|
449
|
+
Found scores: {scores}
|
450
|
+
Context: {content[:1000]}
|
451
|
+
|
452
|
+
The answer is:"""
|
453
|
+
|
454
|
+
try:
|
455
|
+
answer = self.llm.invoke(extraction_prompt).content.strip()
|
456
|
+
return f"{answer}. {content}"
|
457
|
+
except Exception:
|
458
|
+
# Return first score found if LLM extraction fails
|
459
|
+
return f"{scores[0][0]}-{scores[0][1]}. {content}"
|
460
|
+
|
461
|
+
return content
|
462
|
+
|
463
|
+
def _extract_temporal(self, content: str, query: str, sources: str) -> str:
|
464
|
+
"""Extract dates or years."""
|
465
|
+
# Find all year patterns
|
466
|
+
years = self.answer_patterns["year"].findall(content + " " + sources)
|
467
|
+
|
468
|
+
if years:
|
469
|
+
# Use LLM to pick the right one
|
470
|
+
extraction_prompt = f"""Which date/year specifically answers this question?
|
471
|
+
|
472
|
+
Question: {query}
|
473
|
+
Found years: {set(years)}
|
474
|
+
Context: {content[:1000]}
|
475
|
+
|
476
|
+
The answer is:"""
|
477
|
+
|
478
|
+
try:
|
479
|
+
answer = self.llm.invoke(extraction_prompt).content.strip()
|
480
|
+
# Clean to just the year/date
|
481
|
+
year_match = self.answer_patterns["year"].search(answer)
|
482
|
+
if year_match:
|
483
|
+
return f"{year_match.group()}. {content}"
|
484
|
+
return f"{answer}. {content}"
|
485
|
+
except Exception:
|
486
|
+
# Fallback to first found year if LLM extraction fails
|
487
|
+
return f"{years[0]}. {content}"
|
488
|
+
|
489
|
+
return content
|
490
|
+
|
491
|
+
def _extract_number(self, content: str, query: str, sources: str) -> str:
|
492
|
+
"""Extract specific numbers."""
|
493
|
+
# Find all numbers
|
494
|
+
numbers = self.answer_patterns["number"].findall(
|
495
|
+
content + " " + sources
|
496
|
+
)
|
497
|
+
|
498
|
+
if numbers:
|
499
|
+
extraction_prompt = f"""Which number specifically answers this question?
|
500
|
+
|
501
|
+
Question: {query}
|
502
|
+
Found numbers: {numbers[:10]}
|
503
|
+
Context: {content[:1000]}
|
504
|
+
|
505
|
+
The answer is:"""
|
506
|
+
|
507
|
+
try:
|
508
|
+
answer = self.llm.invoke(extraction_prompt).content.strip()
|
509
|
+
return f"{answer}. {content}"
|
510
|
+
except Exception:
|
511
|
+
# Fallback to first found number if LLM extraction fails
|
512
|
+
return f"{numbers[0]}. {content}"
|
513
|
+
|
514
|
+
return content
|
515
|
+
|
516
|
+
def _extract_best_name(self, content: str, query: str, sources: str) -> str:
|
517
|
+
"""Extract the best matching name (not necessarily full)."""
|
518
|
+
# Find all potential names
|
519
|
+
names = self.answer_patterns["full_name"].findall(
|
520
|
+
content + " " + sources
|
521
|
+
)
|
522
|
+
|
523
|
+
if names:
|
524
|
+
# Count frequency
|
525
|
+
name_counts = {}
|
526
|
+
for name in names:
|
527
|
+
name_counts[name] = name_counts.get(name, 0) + 1
|
528
|
+
|
529
|
+
# Get most frequent
|
530
|
+
best_name = max(name_counts.items(), key=lambda x: x[1])[0]
|
531
|
+
return f"{best_name}. {content}"
|
532
|
+
|
533
|
+
return content
|
534
|
+
|
535
|
+
def _extract_key_facts(
|
536
|
+
self, previous_knowledge: str, question_type: str
|
537
|
+
) -> str:
|
538
|
+
"""Extract key facts from previous knowledge."""
|
539
|
+
extraction_prompt = f"""Extract key facts related to a {question_type} question from this knowledge:
|
540
|
+
|
541
|
+
{previous_knowledge[:1500]}
|
542
|
+
|
543
|
+
List the most important facts (names, numbers, dates) found:"""
|
544
|
+
|
545
|
+
try:
|
546
|
+
facts = self.llm.invoke(extraction_prompt).content
|
547
|
+
return facts[:500]
|
548
|
+
except Exception:
|
549
|
+
# Fallback to truncated previous knowledge if LLM extraction fails
|
550
|
+
return previous_knowledge[:500]
|
@@ -0,0 +1,80 @@
|
|
1
|
+
"""
|
2
|
+
Standard citation handler - the original implementation.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import Any, Dict, List, Union
|
6
|
+
|
7
|
+
from ..utilities.db_utils import get_db_setting
|
8
|
+
from .base_citation_handler import BaseCitationHandler
|
9
|
+
|
10
|
+
|
11
|
+
class StandardCitationHandler(BaseCitationHandler):
    """Standard citation handler with detailed analysis.

    Builds numbered-source prompts, asks the LLM for an analysis with
    [n]-style citations, and returns the text together with the document
    objects so callers can render the bibliography themselves.
    """

    @staticmethod
    def _normalize_response(response) -> str:
        """Return plain text from an LLM response.

        Some LLM wrappers return a plain string while others return a
        message object exposing ``.content``; accept both. (Previously
        analyze_followup assumed a message object and would raise
        AttributeError on a string response.)
        """
        if isinstance(response, str):
            return response
        return response.content

    def analyze_initial(
        self, query: str, search_results: Union[str, List[Dict]]
    ) -> Dict[str, Any]:
        """Produce an initial cited analysis of the search results.

        Args:
            query: The research question.
            search_results: Raw results, either preformatted text or a
                list of result dicts.

        Returns:
            Dict with "content" (the LLM's cited analysis text) and
            "documents" (the document objects the citations refer to).
        """
        documents = self._create_documents(search_results)
        formatted_sources = self._format_sources(documents)
        prompt = f"""Analyze the following information concerning the question and include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source.

Question: {query}

Sources:
{formatted_sources}

Provide a detailed analysis with citations. Do not create the bibliography, it will be provided automatically. Never make up sources. Never write or create urls. Only write text relevant to the question. Example format: "According to the research [1], ..."
"""

        response = self._normalize_response(self.llm.invoke(prompt))
        return {"content": response, "documents": documents}

    def analyze_followup(
        self,
        question: str,
        search_results: Union[str, List[Dict]],
        previous_knowledge: str,
        nr_of_links: int,
    ) -> Dict[str, Any]:
        """Process follow-up analysis with citations.

        Optionally runs a fact-checking pass over the new sources (gated
        by the "general.enable_fact_checking" setting) and feeds any
        conflicts it finds into the final answer prompt.

        Args:
            question: The follow-up question to answer.
            search_results: New raw results (text or list of dicts).
            previous_knowledge: Accumulated knowledge from prior rounds.
            nr_of_links: Citation-number offset for the new documents.

        Returns:
            Dict with "content" (cited answer text) and "documents".
        """
        documents = self._create_documents(
            search_results, nr_of_links=nr_of_links
        )
        formatted_sources = self._format_sources(documents)
        # Add fact-checking step
        fact_check_prompt = f"""Analyze these sources for factual consistency:
1. Cross-reference major claims between sources
2. Identify and flag any contradictions
3. Verify basic facts (dates, company names, ownership)
4. Note when sources disagree

Previous Knowledge:
{previous_knowledge}

New Sources:
{formatted_sources}

Return any inconsistencies or conflicts found."""
        if get_db_setting("general.enable_fact_checking", True):
            fact_check_response = self._normalize_response(
                self.llm.invoke(fact_check_prompt)
            )
        else:
            fact_check_response = ""

        prompt = f"""Using the previous knowledge and new sources, answer the question. Include citations using numbers in square brackets [1], [2], etc. When citing, use the source number provided at the start of each source. Reflect information from sources critically.

Previous Knowledge:
{previous_knowledge}

Question: {question}

New Sources:
{formatted_sources}
Reflect information from sources critically based on: {fact_check_response}. Never invent sources.
Provide a detailed answer with citations. Example format: "According to [1], ..." """

        response = self._normalize_response(self.llm.invoke(prompt))

        return {"content": response, "documents": documents}