cite-agent 1.3.9-py3-none-any.whl → 1.4.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cite_agent/__init__.py +13 -13
- cite_agent/__version__.py +1 -1
- cite_agent/action_first_mode.py +150 -0
- cite_agent/adaptive_providers.py +413 -0
- cite_agent/archive_api_client.py +186 -0
- cite_agent/auth.py +0 -1
- cite_agent/auto_expander.py +70 -0
- cite_agent/cache.py +379 -0
- cite_agent/circuit_breaker.py +370 -0
- cite_agent/citation_network.py +377 -0
- cite_agent/cli.py +8 -16
- cite_agent/cli_conversational.py +113 -3
- cite_agent/confidence_calibration.py +381 -0
- cite_agent/deduplication.py +325 -0
- cite_agent/enhanced_ai_agent.py +689 -371
- cite_agent/error_handler.py +228 -0
- cite_agent/execution_safety.py +329 -0
- cite_agent/full_paper_reader.py +239 -0
- cite_agent/observability.py +398 -0
- cite_agent/offline_mode.py +348 -0
- cite_agent/paper_comparator.py +368 -0
- cite_agent/paper_summarizer.py +420 -0
- cite_agent/pdf_extractor.py +350 -0
- cite_agent/proactive_boundaries.py +266 -0
- cite_agent/quality_gate.py +442 -0
- cite_agent/request_queue.py +390 -0
- cite_agent/response_enhancer.py +257 -0
- cite_agent/response_formatter.py +458 -0
- cite_agent/response_pipeline.py +295 -0
- cite_agent/response_style_enhancer.py +259 -0
- cite_agent/self_healing.py +418 -0
- cite_agent/similarity_finder.py +524 -0
- cite_agent/streaming_ui.py +13 -9
- cite_agent/thinking_blocks.py +308 -0
- cite_agent/tool_orchestrator.py +416 -0
- cite_agent/trend_analyzer.py +540 -0
- cite_agent/unpaywall_client.py +226 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/METADATA +15 -1
- cite_agent-1.4.3.dist-info/RECORD +62 -0
- cite_agent-1.3.9.dist-info/RECORD +0 -32
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/WHEEL +0 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/entry_points.txt +0 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/licenses/LICENSE +0 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/top_level.txt +0 -0
cite_agent/pdf_extractor.py
@@ -0,0 +1,350 @@

```python
#!/usr/bin/env python3
"""
PDF Extraction Service - KILLER FEATURE
Extracts full text from academic papers so you don't have to read them!
"""

import io
import logging
import re
from typing import Dict, List, Optional, Any
from pathlib import Path

try:
    import PyPDF2
    import pdfplumber
    import fitz  # PyMuPDF
    PDF_LIBRARIES_AVAILABLE = True
except ImportError:
    PDF_LIBRARIES_AVAILABLE = False

import requests
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class ExtractedPaper:
    """Fully extracted paper content"""
    title: Optional[str] = None
    abstract: Optional[str] = None
    introduction: Optional[str] = None
    methodology: Optional[str] = None
    results: Optional[str] = None
    discussion: Optional[str] = None
    conclusion: Optional[str] = None
    references: Optional[List[str]] = None
    full_text: Optional[str] = None
    tables: Optional[List[Dict[str, Any]]] = None
    figures_count: int = 0
    page_count: int = 0
    word_count: int = 0
    extraction_method: str = "unknown"
    extraction_quality: str = "unknown"  # high, medium, low
    error_message: Optional[str] = None


class PDFExtractor:
    """Extract and analyze full text from academic PDFs"""

    def __init__(self):
        if not PDF_LIBRARIES_AVAILABLE:
            logger.warning("PDF libraries not installed. Install: pip install pypdf2 pdfplumber pymupdf")

        self.max_file_size_mb = 50  # Don't download PDFs larger than 50MB
        self.timeout_seconds = 30

    async def extract_from_url(self, pdf_url: str) -> ExtractedPaper:
        """
        Download and extract full text from PDF URL

        Args:
            pdf_url: Direct link to PDF file

        Returns:
            ExtractedPaper with full content
        """
        if not PDF_LIBRARIES_AVAILABLE:
            return ExtractedPaper(
                error_message="PDF extraction libraries not installed",
                extraction_quality="low"
            )

        try:
            # Download PDF
            logger.info(f"Downloading PDF from {pdf_url}")
            response = requests.get(
                pdf_url,
                timeout=self.timeout_seconds,
                headers={'User-Agent': 'Mozilla/5.0 (Research Bot)'},
                stream=True
            )
            response.raise_for_status()

            # Check file size
            content_length = response.headers.get('content-length')
            if content_length and int(content_length) > self.max_file_size_mb * 1024 * 1024:
                return ExtractedPaper(
                    error_message=f"PDF too large ({int(content_length)/(1024*1024):.1f}MB > {self.max_file_size_mb}MB)",
                    extraction_quality="low"
                )

            pdf_bytes = response.content

            # Try extraction methods in order of quality
            # 1. PyMuPDF (best quality, fastest)
            extracted = self._extract_with_pymupdf(pdf_bytes)
            if extracted.extraction_quality == "high":
                return extracted

            # 2. pdfplumber (good for tables and layout)
            extracted = self._extract_with_pdfplumber(pdf_bytes)
            if extracted.extraction_quality in ("high", "medium"):
                return extracted

            # 3. PyPDF2 (basic fallback)
            extracted = self._extract_with_pypdf2(pdf_bytes)
            return extracted

        except requests.Timeout:
            return ExtractedPaper(
                error_message="PDF download timeout",
                extraction_quality="low"
            )
        except requests.RequestException as e:
            return ExtractedPaper(
                error_message=f"PDF download failed: {str(e)}",
                extraction_quality="low"
            )
        except Exception as e:
            logger.error(f"PDF extraction error: {e}")
            return ExtractedPaper(
                error_message=f"Extraction error: {str(e)}",
                extraction_quality="low"
            )

    def _extract_with_pymupdf(self, pdf_bytes: bytes) -> ExtractedPaper:
        """Extract using PyMuPDF (fitz) - fastest and most accurate"""
        try:
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")

            full_text = ""
            for page in doc:
                full_text += page.get_text()

            # Parse sections
            sections = self._parse_sections(full_text)

            # Count stats
            word_count = len(full_text.split())
            page_count = len(doc)

            # Extract tables (basic)
            tables = []
            for page in doc:
                tabs = page.find_tables()
                if tabs:
                    for tab in tabs:
                        tables.append({
                            'page': page.number + 1,
                            'rows': len(tab.extract()),
                            'data': tab.extract()[:5]  # First 5 rows only
                        })

            doc.close()

            quality = "high" if word_count > 500 else "medium"

            return ExtractedPaper(
                full_text=full_text,
                title=sections.get('title'),
                abstract=sections.get('abstract'),
                introduction=sections.get('introduction'),
                methodology=sections.get('methodology'),
                results=sections.get('results'),
                discussion=sections.get('discussion'),
                conclusion=sections.get('conclusion'),
                references=sections.get('references'),
                tables=tables if tables else None,
                page_count=page_count,
                word_count=word_count,
                extraction_method="pymupdf",
                extraction_quality=quality
            )

        except Exception as e:
            logger.warning(f"PyMuPDF extraction failed: {e}")
            return ExtractedPaper(
                error_message=f"PyMuPDF failed: {str(e)}",
                extraction_quality="low"
            )

    def _extract_with_pdfplumber(self, pdf_bytes: bytes) -> ExtractedPaper:
        """Extract using pdfplumber - good for tables"""
        try:
            pdf = pdfplumber.open(io.BytesIO(pdf_bytes))

            full_text = ""
            tables = []

            for page_num, page in enumerate(pdf.pages, start=1):
                # Extract text
                text = page.extract_text()
                if text:
                    full_text += text + "\n"

                # Extract tables
                page_tables = page.extract_tables()
                if page_tables:
                    for table in page_tables:
                        tables.append({
                            'page': page_num,
                            'rows': len(table),
                            'data': table[:5]  # First 5 rows
                        })

            pdf.close()

            # Parse sections
            sections = self._parse_sections(full_text)
            word_count = len(full_text.split())

            quality = "high" if word_count > 500 else "medium"

            return ExtractedPaper(
                full_text=full_text,
                title=sections.get('title'),
                abstract=sections.get('abstract'),
                introduction=sections.get('introduction'),
                methodology=sections.get('methodology'),
                results=sections.get('results'),
                discussion=sections.get('discussion'),
                conclusion=sections.get('conclusion'),
                references=sections.get('references'),
                tables=tables if tables else None,
                page_count=len(pdf.pages),
                word_count=word_count,
                extraction_method="pdfplumber",
                extraction_quality=quality
            )

        except Exception as e:
            logger.warning(f"pdfplumber extraction failed: {e}")
            return ExtractedPaper(
                error_message=f"pdfplumber failed: {str(e)}",
                extraction_quality="low"
            )

    def _extract_with_pypdf2(self, pdf_bytes: bytes) -> ExtractedPaper:
        """Extract using PyPDF2 - basic fallback"""
        try:
            pdf = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))

            full_text = ""
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    full_text += text + "\n"

            # Parse sections
            sections = self._parse_sections(full_text)
            word_count = len(full_text.split())

            quality = "medium" if word_count > 500 else "low"

            return ExtractedPaper(
                full_text=full_text,
                title=sections.get('title'),
                abstract=sections.get('abstract'),
                introduction=sections.get('introduction'),
                methodology=sections.get('methodology'),
                results=sections.get('results'),
                discussion=sections.get('discussion'),
                conclusion=sections.get('conclusion'),
                references=sections.get('references'),
                page_count=len(pdf.pages),
                word_count=word_count,
                extraction_method="pypdf2",
                extraction_quality=quality
            )

        except Exception as e:
            logger.warning(f"PyPDF2 extraction failed: {e}")
            return ExtractedPaper(
                error_message=f"PyPDF2 failed: {str(e)}",
                extraction_quality="low"
            )

    def _parse_sections(self, full_text: str) -> Dict[str, Optional[str]]:
        """
        Parse academic paper sections from full text
        Uses common section headers to split the paper
        """
        sections = {}

        # Common section patterns (case-insensitive)
        patterns = {
            'abstract': r'(?i)\bABSTRACT\b',
            'introduction': r'(?i)\b(INTRODUCTION|1\.\s*INTRODUCTION)\b',
            'methodology': r'(?i)\b(METHODOLOGY|METHODS|MATERIALS AND METHODS|2\.\s*METHOD)\b',
            'results': r'(?i)\b(RESULTS|FINDINGS|3\.\s*RESULTS)\b',
            'discussion': r'(?i)\b(DISCUSSION|4\.\s*DISCUSSION)\b',
            'conclusion': r'(?i)\b(CONCLUSION|CONCLUSIONS|5\.\s*CONCLUSION)\b',
            'references': r'(?i)\b(REFERENCES|BIBLIOGRAPHY)\b'
        }

        # Find all section positions
        section_positions = {}
        for section_name, pattern in patterns.items():
            match = re.search(pattern, full_text)
            if match:
                section_positions[section_name] = match.start()

        # Sort sections by position
        sorted_sections = sorted(section_positions.items(), key=lambda x: x[1])

        # Extract text between sections
        for i, (section_name, start_pos) in enumerate(sorted_sections):
            # Get end position (start of next section, or end of text)
            if i + 1 < len(sorted_sections):
                end_pos = sorted_sections[i + 1][1]
            else:
                end_pos = len(full_text)

            # Extract section text
            section_text = full_text[start_pos:end_pos].strip()

            # Remove section header from text
            section_text = re.sub(patterns[section_name], '', section_text, count=1).strip()

            # Limit length (first 3000 chars per section)
            if len(section_text) > 3000:
                section_text = section_text[:3000] + "... [truncated]"

            sections[section_name] = section_text if section_text else None

        # Extract title (usually first few lines)
        title_match = re.search(r'^(.+?)(?:\n\n|\n[A-Z])', full_text, re.MULTILINE)
        if title_match:
            title = title_match.group(1).strip()
            # Clean up title
            title = re.sub(r'\s+', ' ', title)
            if len(title) > 200:
                title = title[:200]
            sections['title'] = title

        # Extract references (last section, list of citations)
        if 'references' in sections and sections['references']:
            ref_text = sections['references']
            # Split by newlines and filter
            refs = [line.strip() for line in ref_text.split('\n') if line.strip()]
            # Keep only lines that look like citations (have year and authors)
            citations = [ref for ref in refs if re.search(r'\b(19|20)\d{2}\b', ref)]
            sections['references'] = citations[:20]  # First 20 refs

        return sections


# Global instance
pdf_extractor = PDFExtractor()
```
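For orientation, a minimal usage sketch of the new extractor follows (not part of the diff). The URL is a placeholder, and the optional PDF libraries (`pypdf2`, `pdfplumber`, `pymupdf`) must be installed for a real extraction:

```python
# Hypothetical caller for cite_agent/pdf_extractor.py (illustration only).
import asyncio

from cite_agent.pdf_extractor import pdf_extractor

async def main():
    # extract_from_url() downloads the PDF, then tries PyMuPDF, pdfplumber,
    # and PyPDF2 in turn, returning the first acceptable result.
    paper = await pdf_extractor.extract_from_url("https://example.org/paper.pdf")  # placeholder URL
    if paper.error_message:
        print(f"Extraction failed: {paper.error_message}")
    else:
        print(f"{paper.page_count} pages, {paper.word_count} words "
              f"via {paper.extraction_method} ({paper.extraction_quality})")
        if paper.abstract:
            print(paper.abstract[:300])

asyncio.run(main())
```

Note that the fallback order is quality-driven: pdfplumber is only consulted when PyMuPDF's result is below "high", and PyPDF2 only when pdfplumber's is below "medium".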
cite_agent/proactive_boundaries.py
@@ -0,0 +1,266 @@

```python
"""
Proactive Action Boundaries

Defines what actions the agent can do automatically vs what needs explicit permission

PHILOSOPHY: Be proactive with READ operations, cautious with WRITE operations
"""

from typing import Any, Dict, List, Set
import re


class ProactiveBoundaries:
    """
    Defines safe boundaries for proactive agent behavior

    SAFE TO AUTO-DO (read-only, informational):
    - These enhance user experience without risk

    NEEDS PERMISSION (write/destructive):
    - These could cause problems if done incorrectly
    """

    # Commands/actions that are SAFE to do proactively
    SAFE_AUTO_ACTIONS: Set[str] = {
        # File operations (read-only)
        'list_files',
        'read_file',
        'preview_file',
        'search_in_files',
        'find_files',
        'show_file_info',
        'cat',
        'head',
        'tail',
        'less',
        'grep',
        'find',

        # Directory operations (read-only)
        'list_directory',
        'show_directory_tree',
        'navigate_directory',  # cd is safe
        'pwd',
        'ls',
        'tree',

        # Code analysis (read-only)
        'explain_code',
        'show_functions',
        'analyze_structure',
        'find_definitions',

        # Data operations (read-only)
        'query_api',
        'fetch_data',
        'show_stats',
        'search_papers',
        'get_financial_data',

        # Git operations (read-only)
        'git_status',
        'git_log',
        'git_diff',
        'git_show',
        'git_blame',

        # System info (read-only)
        'show_env',
        'check_dependencies',
        'list_processes',
    }

    # Commands/actions that NEED EXPLICIT PERMISSION
    NEEDS_PERMISSION: Set[str] = {
        # File operations (write/destructive)
        'create_file',
        'delete_file',
        'modify_file',
        'move_file',
        'rename_file',
        'chmod',
        'chown',
        'touch',
        'mkdir',
        'rmdir',
        'rm',
        'mv',
        'cp',  # Can overwrite

        # Code execution (potentially dangerous)
        'run_script',
        'execute_code',
        'eval',
        'exec',

        # Package management
        'install_package',
        'uninstall_package',
        'update_packages',
        'pip',
        'npm',
        'apt',
        'brew',

        # Git operations (write)
        'git_add',
        'git_commit',
        'git_push',
        'git_pull',
        'git_merge',
        'git_rebase',
        'git_reset',

        # Network operations (write/external)
        'send_request',
        'post_data',
        'upload_file',
        'download_file',

        # System operations
        'change_settings',
        'modify_config',
        'kill_process',
        'start_service',
        'stop_service',
    }

    @classmethod
    def is_safe_to_auto_do(cls, action: str) -> bool:
        """
        Check if action is safe to do automatically

        Returns:
            True if safe to do proactively
            False if needs explicit user permission
        """
        action_lower = action.lower()

        # Check exact matches
        if action_lower in cls.SAFE_AUTO_ACTIONS:
            return True

        if action_lower in cls.NEEDS_PERMISSION:
            return False

        # Check patterns
        # Safe patterns
        safe_patterns = [
            r'^(ls|pwd|cd|find|grep|cat|head|tail|less)',
            r'^git\s+(status|log|diff|show|blame)',
            r'search|find|list|show|display|preview|read',
        ]

        for pattern in safe_patterns:
            if re.search(pattern, action_lower):
                return True

        # Dangerous patterns
        dangerous_patterns = [
            r'^(rm|mv|cp|touch|mkdir|chmod)',
            r'^git\s+(add|commit|push|pull|merge|rebase|reset)',
            r'(delete|remove|modify|edit|create|install|update)',
            r'^(pip|npm|apt|brew)',
        ]

        for pattern in dangerous_patterns:
            if re.search(pattern, action_lower):
                return False

        # Default: be conservative - if unsure, ask permission
        return False

    @classmethod
    def get_auto_expansion_for_query(cls, query: str, initial_result: str) -> Dict[str, Any]:
        """
        Determine what automatic expansion to do based on query and initial result

        Returns dict with:
        - should_expand: bool
        - expansion_actions: List[str] - actions to take automatically
        - reason: str - why expanding
        """
        query_lower = query.lower()
        result_lower = initial_result.lower()

        expansions = {
            'should_expand': False,
            'expansion_actions': [],
            'reason': ''
        }

        # Pattern 1: Listed files → preview main one
        if any(word in query_lower for word in ['list', 'show', 'find']) and \
           any(word in query_lower for word in ['file', 'files', 'py', 'js']):

            # Check if result is just a list (short, has bullets/lines, BUT no code/details)
            has_code_block = '```' in initial_result
            has_detailed_descriptions = ' - ' in initial_result or ': ' in result_lower
            is_short_list = len(initial_result) < 300 and ('•' in initial_result or '\n' in initial_result)

            if is_short_list and not has_code_block and not has_detailed_descriptions:
                expansions['should_expand'] = True
                expansions['expansion_actions'] = ['preview_main_file']
                expansions['reason'] = 'Listed files but no content shown - auto-preview main file'

        # Pattern 2: Found papers → show abstracts
        if 'paper' in query_lower and 'found' in result_lower:
            if 'abstract' not in result_lower:
                expansions['should_expand'] = True
                expansions['expansion_actions'] = ['show_paper_abstracts']
                expansions['reason'] = 'Found papers but no abstracts - auto-show summaries'

        # Pattern 3: Code query → show examples
        if any(word in query_lower for word in ['function', 'class', 'code']) and \
           'how' in query_lower:
            if '```' not in initial_result and len(initial_result) < 200:
                expansions['should_expand'] = True
                expansions['expansion_actions'] = ['show_code_examples']
                expansions['reason'] = 'Code explanation without examples - auto-show code'

        # Pattern 4: Data query → show sample/visualization
        if any(word in query_lower for word in ['data', 'revenue', 'metrics', 'stats']):
            if len(initial_result) < 150:  # Just a number, not detailed
                expansions['should_expand'] = True
                expansions['expansion_actions'] = ['show_data_breakdown']
                expansions['reason'] = 'Data query with minimal detail - auto-show breakdown'

        return expansions

    @classmethod
    def validate_proactive_action(cls, action: str, context: Dict) -> Dict[str, Any]:
        """
        Validate if a proactive action should be allowed

        Returns:
            - allowed: bool
            - reason: str
            - requires_confirmation: bool
        """
        is_safe = cls.is_safe_to_auto_do(action)

        if is_safe:
            return {
                'allowed': True,
                'reason': 'Safe read-only operation',
                'requires_confirmation': False
            }
        else:
            return {
                'allowed': False,
                'reason': 'Write/destructive operation requires explicit permission',
                'requires_confirmation': True
            }


# Convenience functions
def is_safe_to_auto_do(action: str) -> bool:
    """Quick check if action is safe to do automatically"""
    return ProactiveBoundaries.is_safe_to_auto_do(action)


def should_auto_expand(query: str, result: str) -> bool:
    """Quick check if result should be automatically expanded"""
    expansion = ProactiveBoundaries.get_auto_expansion_for_query(query, result)
    return expansion['should_expand']
```
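Likewise, a minimal sketch of how the boundary helpers behave (not part of the diff; the action strings are hypothetical inputs):

```python
# Hypothetical caller for cite_agent/proactive_boundaries.py (illustration only).
from cite_agent.proactive_boundaries import is_safe_to_auto_do, should_auto_expand

print(is_safe_to_auto_do("git status"))    # True:  matches the read-only git pattern
print(is_safe_to_auto_do("rm -rf build"))  # False: matches a destructive pattern
print(is_safe_to_auto_do("frobnicate"))    # False: unknown actions default to asking

# A short bullet list with no code block or detailed descriptions triggers
# auto-expansion (Pattern 1: listed files but no content shown):
print(should_auto_expand("list my python files", "• main.py\n• utils.py"))  # True
```

The conservative default matters here: any action that matches neither the exact sets nor the safe/dangerous regex patterns is treated as requiring permission.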