PyPI - document-analyser - Versions diffs - 0.1.0__py3-none-any.whl - Mend

document-analyser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

app/__init__.py +1 -0
app/analyzers/__init__.py +1 -0
app/analyzers/domain_mapper.py +173 -0
app/analyzers/integrity_checker.py +386 -0
app/analyzers/keyword_analyzer.py +159 -0
app/analyzers/ner_analyzer.py +40 -0
app/analyzers/ngram_analyzer.py +64 -0
app/analyzers/readability.py +150 -0
app/analyzers/sentiment_analyzer.py +279 -0
app/analyzers/structural_mismatch.py +276 -0
app/analyzers/word_analysis.py +289 -0
app/analyzers/writing_quality.py +334 -0
app/api/__init__.py +1 -0
app/api/routes/__init__.py +1 -0
app/api/routes/academic_analysis.py +373 -0
app/api/routes/advanced_text.py +147 -0
app/api/routes/analysis.py +205 -0
app/api/routes/future_endpoints.py +913 -0
app/api/routes/health.py +25 -0
app/api/routes/semantic_analysis.py +148 -0
app/api/routes/text_analysis.py +126 -0
app/core/__init__.py +1 -0
app/core/config.py +73 -0
app/data/ai_patterns.json +146 -0
app/main.py +115 -0
app/models/__init__.py +1 -0
app/models/schemas.py +318 -0
app/services/__init__.py +1 -0
app/services/document_processor.py +472 -0
app/services/doi_resolver.py +327 -0
app/services/reference_extractor.py +117 -0
app/services/url_verifier.py +370 -0
app/utils/__init__.py +1 -0
document_analyser-0.1.0.dist-info/METADATA +178 -0
document_analyser-0.1.0.dist-info/RECORD +37 -0
document_analyser-0.1.0.dist-info/WHEEL +4 -0
document_analyser-0.1.0.dist-info/licenses/LICENSE +21 -0

app/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # FastAPI backend for CiteSight

app/analyzers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # Analysis modules package

app/analyzers/domain_mapper.py ADDED Viewed

@@ -0,0 +1,173 @@
+"""Domain mapping analyzer using semantic similarity with sentence-transformers."""
+from typing import Any, Literal
+try:
+    from sentence_transformers import SentenceTransformer
+    import numpy as np
+except ImportError:
+    SentenceTransformer = None
+    np = None
+from app.models.schemas import DomainMapping, DomainMappingResponse
+class DomainMapper:
+    """Map document sections to user-defined domains using semantic similarity.
+    The embedding model is optional: when ``SentenceTransformer`` is unavailable
+    or the model cannot be loaded, ``self.model`` stays ``None`` and
+    ``analyze`` returns an empty response instead of raising.
+    """
+    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
+        """Initialize domain mapper with sentence-transformers model.
+        Args:
+            model_name: Model identifier passed to ``SentenceTransformer``.
+        """
+        self.model_name = model_name
+        self.model = None
+        if SentenceTransformer:
+            try:
+                self.model = SentenceTransformer(model_name)
+            except Exception:
+                # Loading stays best-effort, but record the reason so an
+                # always-empty analysis can be diagnosed.
+                import logging
+                logging.getLogger(__name__).warning(
+                    "Could not load sentence-transformers model %r",
+                    model_name,
+                    exc_info=True,
+                )
+                self.model = None
+    def analyze(self, text: str, domains: list[str]) -> DomainMappingResponse:
+        """
+        Map sections to domains using cosine similarity.
+        Args:
+            text: Document text to analyze
+            domains: List of domain labels to map sections to
+        Returns:
+            DomainMappingResponse with section-domain mappings. When the text
+            is blank, no model is loaded, or no domains are given, a response
+            with ``total_sections=0`` is returned.
+        """
+        if not text.strip() or not self.model or not domains:
+            return DomainMappingResponse(total_sections=0)
+        # 1. Detect sections using heuristic patterns
+        sections = self._detect_sections(text)
+        # 2. Generate embeddings for sections and domains
+        section_texts = [s["text"] for s in sections]
+        section_embeddings = self.model.encode(section_texts)
+        domain_embeddings = self.model.encode(domains)
+        # 3. Calculate cosine similarities and build mappings
+        mappings = []
+        for i, (section, section_emb) in enumerate(zip(sections, section_embeddings)):
+            similarities = self._cosine_similarities(section_emb, domain_embeddings)
+            # Find primary domain (highest similarity)
+            best_idx = int(np.argmax(similarities))
+            primary_domain = domains[best_idx]
+            best_score = float(similarities[best_idx])
+            # Determine confidence
+            confidence = self._calculate_confidence(best_score)
+            mappings.append(DomainMapping(
+                section_text=section["text"][:200],  # Truncate for response
+                section_index=i,
+                primary_domain=primary_domain,
+                similarity_score=best_score,
+                all_domain_scores={d: float(s) for d, s in zip(domains, similarities)},
+                confidence=confidence
+            ))
+        # 4. Calculate domain distribution
+        domain_distribution: dict[str, int] = {}
+        for m in mappings:
+            domain_distribution[m.primary_domain] = domain_distribution.get(m.primary_domain, 0) + 1
+        # 5. Average of the best similarity scores (reported as average_confidence)
+        avg_conf = float(np.mean([m.similarity_score for m in mappings])) if mappings else 0.0
+        return DomainMappingResponse(
+            total_sections=len(sections),
+            domains_analyzed=domains,
+            mappings=mappings,
+            domain_distribution=domain_distribution,
+            average_confidence=avg_conf
+        )
+    def _detect_sections(self, text: str) -> list[dict[str, str]]:
+        """
+        Detect sections using heuristic patterns.
+        Text is split on blank lines; the first line of a paragraph is a header when:
+        1. More than 50% of its characters are uppercase and it is under 100 chars
+        2. It is under 60 chars and contains a section keyword
+        A header in the very first paragraph (index 0) does not start a new
+        section; that paragraph is added to the default "Introduction" section.
+        Returns:
+            List of ``{"header", "text"}`` dicts; the whole text as a single
+            "Document" section when nothing with non-blank text was collected.
+        """
+        paragraphs = text.split("\n\n")
+        sections: list[dict[str, str]] = []
+        section_keywords = [
+            "introduction", "background", "methodology", "methods", "results",
+            "discussion", "conclusion", "abstract", "summary", "findings",
+            "recommendations", "analysis", "overview", "scope"
+        ]
+        current_section: dict[str, str] = {"header": "Introduction", "text": ""}
+        for i, para in enumerate(paragraphs):
+            if not para.strip():
+                continue
+            lines = para.split("\n")
+            first_line = lines[0].strip() if lines else ""
+            # Check if this is a section header
+            is_header = False
+            # Pattern 1: mostly uppercase (ratio is over all characters, spaces included)
+            if len(first_line) > 0:
+                upper_count = sum(1 for c in first_line if c.isupper())
+                upper_ratio = upper_count / len(first_line)
+                if upper_ratio > 0.5 and len(first_line) < 100:
+                    is_header = True
+            # Pattern 2: Short line with keywords
+            if len(first_line) < 60 and any(kw in first_line.lower() for kw in section_keywords):
+                is_header = True
+            if is_header and i > 0:
+                # Save previous section (sections with blank text are dropped)
+                if current_section["text"].strip():
+                    sections.append(current_section)
+                # Start new section; the header line itself is not part of the text
+                rest_lines = "\n".join(lines[1:]) if len(lines) > 1 else ""
+                current_section = {"header": first_line, "text": rest_lines}
+            else:
+                # Add to current section
+                current_section["text"] += "\n\n" + para
+        # Add final section
+        if current_section["text"].strip():
+            sections.append(current_section)
+        # If no sections detected, treat entire text as one section
+        if not sections:
+            sections = [{"header": "Document", "text": text}]
+        return sections
+    def _cosine_similarities(
+        self, vec1: Any, vec2_matrix: Any
+    ) -> Any:
+        """Calculate cosine similarity between vec1 and each row in vec2_matrix.
+        A zero-length vector has no direction, so its similarity is reported
+        as 0.0 instead of the NaN that dividing by a zero norm would produce.
+        Returns:
+            One similarity per row of ``vec2_matrix``; an empty list when
+            numpy is unavailable.
+        """
+        if np is None:
+            return []
+        vec1 = np.asarray(vec1)
+        vec2_matrix = np.asarray(vec2_matrix)
+        vec1_length = np.linalg.norm(vec1)
+        if vec1_length == 0:
+            return np.zeros(vec2_matrix.shape[0])
+        row_lengths = np.linalg.norm(vec2_matrix, axis=1, keepdims=True)
+        # All-zero rows stay all-zero after dividing by 1, giving similarity 0.
+        row_lengths = np.where(row_lengths == 0, 1.0, row_lengths)
+        # Dot product of unit vectors
+        return np.dot(vec2_matrix / row_lengths, vec1 / vec1_length)
+    def _calculate_confidence(
+        self, best_score: float
+    ) -> Literal["high", "medium", "low"]:
+        """Determine confidence level based on similarity score.
+        Above 0.7 is "high", above 0.5 is "medium", anything else is "low".
+        """
+        if best_score > 0.7:
+            return "high"
+        elif best_score > 0.5:
+            return "medium"
+        else:
+            return "low"

app/analyzers/integrity_checker.py ADDED Viewed

@@ -0,0 +1,386 @@
+"""
+Integrity checker for detecting AI patterns and content authenticity issues
+"""
+import json
+import re
+from collections import Counter
+from pathlib import Path
+from typing import Any
+from app.models.schemas import SuspiciousPatterns
+class IntegrityChecker:
+    """Detects AI patterns, suspicious content, and integrity issues in text"""
+    def __init__(self) -> None:
+        """Locate the AI pattern data file and load its contents."""
+        # The data file lives in "data/" two directory levels above this file.
+        base_dir = Path(__file__).parent.parent
+        self.patterns_file = base_dir / "data" / "ai_patterns.json"
+        self.ai_patterns = self._load_patterns()
+    def _load_patterns(self) -> dict[str, Any]:
+        """Load AI detection patterns from JSON file.
+        Returns:
+            The parsed JSON object, or a minimal built-in pattern set when the
+            file is missing, unreadable, not valid JSON, or not a JSON object.
+        """
+        data: Any = None
+        try:
+            with self.patterns_file.open(encoding='utf-8') as f:
+                data = json.load(f)
+        except (OSError, ValueError):
+            # OSError covers a missing or unreadable file; ValueError covers
+            # json.JSONDecodeError and UnicodeDecodeError. Either way the
+            # checker keeps working with the fallback below.
+            data = None
+        # Callers use dict methods on the result, so reject other JSON roots.
+        if isinstance(data, dict):
+            return data
+        # Fallback to minimal patterns if the file could not be used
+        fallback: dict[str, Any] = {
+            "patterns": {
+                "ai_phrases": [],
+                "ai_verbs": [],
+                "ai_adjectives": [],
+                "llm_artifacts": []
+            },
+            "thresholds": {
+                "ai_word_frequency": {"high": 0.15, "medium": 0.08, "low": 0.03}
+            }
+        }
+        return fallback
+    def detect_patterns(self, text: str, references: list, documents: list[str] | None = None) -> SuspiciousPatterns:
+        """
+        Run every integrity check on a text and bundle the findings.
+        Args:
+            text: The text to analyze
+            references: List of references/citations
+            documents: Optional list of documents to compare against; the
+                self-plagiarism check is skipped when it is empty or None
+        Returns:
+            SuspiciousPatterns holding the per-check results, the integrity
+            score and a flat list of all issue messages
+        """
+        # Lower-cased text and its word tokens feed the AI pattern check
+        lowered = text.lower()
+        tokens = re.findall(r'\b\w+\b', lowered)
+        ai_indicators = self._detect_ai_patterns(text, lowered, tokens, len(tokens))
+        # Comparison against other documents only happens when some were given
+        if documents:
+            self_plagiarism = self._detect_self_plagiarism(text, documents)
+        else:
+            self_plagiarism = []
+        citation_anomalies = self._detect_citation_anomalies(text, references)
+        style_inconsistencies = self._detect_style_inconsistencies(text)
+        integrity_score = self._calculate_integrity_score(
+            ai_indicators,
+            self_plagiarism,
+            citation_anomalies,
+            style_inconsistencies
+        )
+        # Flat issue list: AI findings first, then the other checks in order
+        all_issues = []
+        if ai_indicators['risk_level'] in ['medium', 'high']:
+            all_issues.append(f"AI content detected (confidence: {ai_indicators['confidence']:.1%})")
+            artifacts = ai_indicators['llm_artifacts']
+            if artifacts:
+                # Only the first three artifacts are listed in the message
+                all_issues.append(f"LLM artifacts found: {', '.join(artifacts[:3])}")
+        for found in (self_plagiarism, citation_anomalies, style_inconsistencies):
+            all_issues.extend(found)
+        return SuspiciousPatterns(
+            self_plagiarism=self_plagiarism,
+            citation_anomalies=citation_anomalies,
+            style_inconsistencies=style_inconsistencies,
+            ai_indicators=ai_indicators,
+            integrity_score=integrity_score,
+            all_issues=all_issues
+        )
+    def _detect_ai_patterns(self, text: str, text_lower: str, words: list[str], total_words: int) -> dict[str, Any]:
+        """Detect AI-generated content patterns
+        Args:
+            text: Original text (used for em-dash and bullet-line counting)
+            text_lower: Lower-cased text (used for phrase and artifact search)
+            words: Word tokens of the lower-cased text
+            total_words: Number of entries in ``words``
+        Returns:
+            Dict with keys ``risk_level`` ('low'/'medium'/'high'),
+            ``confidence`` (weighted score), ``detected_patterns``,
+            ``llm_artifacts`` and ``disclaimer``. The same keys are present
+            for empty input, where ``detected_patterns`` is an empty dict.
+        """
+        disclaimer = self.ai_patterns.get('disclaimer', '')
+        if total_words == 0:
+            return {
+                'risk_level': 'low',
+                'confidence': 0.0,
+                'detected_patterns': {},
+                'llm_artifacts': [],
+                'disclaimer': disclaimer
+            }
+        patterns = self.ai_patterns.get('patterns', {})
+        thresholds = self.ai_patterns.get('thresholds', {})
+        # Configured weights override the defaults key by key, so a partial
+        # "weights" object in the data file cannot cause a KeyError below.
+        weights = {
+            'ai_words': 0.25,
+            'ai_phrases': 0.3,
+            'llm_artifacts': 0.35,
+            'structural_markers': 0.1,
+            **(self.ai_patterns.get('weights') or {})
+        }
+        results: dict[str, Any] = {
+            'ai_word_frequency': 0.0,
+            'ai_phrase_count': 0,
+            'llm_artifacts': [],
+            'em_dash_frequency': 0.0,
+            'bullet_ratio': 0.0,
+            'detected_ai_words': [],
+            'detected_ai_phrases': []
+        }
+        # Check AI words (verbs and adjectives)
+        ai_words = set(patterns.get('ai_verbs', []) + patterns.get('ai_adjectives', []))
+        word_counter = Counter(words)
+        found_words = [(word, count) for word, count in word_counter.items() if word in ai_words]
+        results['ai_word_frequency'] = sum(count for _, count in found_words) / total_words
+        # Most frequent first; ties broken alphabetically so output is stable
+        results['detected_ai_words'] = sorted(found_words, key=lambda item: (-item[1], item[0]))
+        # Check AI phrases (empty strings are skipped: they would match everywhere)
+        for phrase in patterns.get('ai_phrases', []):
+            if phrase and phrase in text_lower:
+                results['ai_phrase_count'] += text_lower.count(phrase)
+                results['detected_ai_phrases'].append(phrase)
+        # Check for LLM artifacts
+        for artifact in patterns.get('llm_artifacts', []):
+            if artifact and artifact in text_lower:
+                results['llm_artifacts'].append(artifact)
+        # Check structural patterns
+        structural = patterns.get('structural_patterns', {})
+        # Em-dash frequency
+        em_dash = structural.get('excessive_em_dash', '—')
+        results['em_dash_frequency'] = text.count(em_dash) / total_words
+        # Bullet point ratio: share of lines (blank ones included) that start with a bullet
+        lines = text.split('\n')
+        bullets = tuple(structural.get('bullet_indicators', ['•', '-', '*']))
+        bullet_lines = sum(1 for line in lines if line.strip().startswith(bullets))
+        results['bullet_ratio'] = bullet_lines / len(lines)
+        def tier(value: float, levels: dict[str, Any], high: float, medium: float, low: float) -> float:
+            """Return the score multiplier for the highest threshold reached."""
+            if value >= levels.get('high', high):
+                return 1.0
+            if value >= levels.get('medium', medium):
+                return 0.6
+            if value >= levels.get('low', low):
+                return 0.3
+            return 0.0
+        # Calculate overall AI confidence score
+        score = 0.0
+        # Word frequency component
+        score += weights['ai_words'] * tier(
+            results['ai_word_frequency'], thresholds.get('ai_word_frequency', {}), 0.15, 0.08, 0.03
+        )
+        # Phrase density component
+        score += weights['ai_phrases'] * tier(
+            results['ai_phrase_count'], thresholds.get('ai_phrase_density', {}), 5, 3, 1
+        )
+        # LLM artifacts component: saturates at three distinct artifacts
+        if len(results['llm_artifacts']) > 0:
+            score += weights['llm_artifacts'] * min(1.0, len(results['llm_artifacts']) / 3)
+        # Structural markers component
+        structural_score = 0.0
+        if results['em_dash_frequency'] >= thresholds.get('em_dash_frequency', {}).get('high', 0.02):
+            structural_score += 0.5
+        if results['bullet_ratio'] >= thresholds.get('bullet_ratio', {}).get('high', 0.3):
+            structural_score += 0.5
+        score += weights['structural_markers'] * structural_score
+        # Determine risk level
+        overall_thresholds = thresholds.get('overall_risk', {})
+        if score >= overall_thresholds.get('high', 0.7):
+            risk_level = 'high'
+        elif score >= overall_thresholds.get('medium', 0.4):
+            risk_level = 'medium'
+        else:
+            risk_level = 'low'
+        return {
+            'risk_level': risk_level,
+            'confidence': score,
+            'detected_patterns': results,
+            'llm_artifacts': results['llm_artifacts'],
+            'disclaimer': disclaimer
+        }
+    def _detect_self_plagiarism(self, text: str, documents: list[str] | None) -> list[str]:
+        """Report documents that share many identical sentences with the text.
+        Nothing is compared unless at least two documents are supplied, and
+        documents equal to ``text`` are skipped. A document is reported when
+        more than 3 sentences match and they make up over 10% of the text's
+        sentences.
+        """
+        if not documents or len(documents) < 2:
+            return []
+        def long_sentences(body: str) -> set[str]:
+            """Normalised sentences whose stripped length exceeds 20 characters."""
+            return {
+                piece.strip().lower()
+                for piece in re.split(r'[.!?]+', body)
+                if len(piece.strip()) > 20
+            }
+        own_sentences = long_sentences(text)
+        findings = []
+        for number, other in enumerate(documents, start=1):
+            if other == text:
+                continue
+            shared = own_sentences & long_sentences(other)
+            if len(shared) <= 3:
+                continue
+            overlap_ratio = len(shared) / len(own_sentences) if own_sentences else 0
+            if overlap_ratio > 0.1:
+                findings.append(f"Significant text overlap ({overlap_ratio:.1%}) with document {number}")
+        return findings
+    def _detect_citation_anomalies(self, text: str, references: list) -> list[str]:
+        """Detect issues with citations and references
+        Args:
+            text: Document text to scan for in-text citations
+            references: Reference list; only its length is used
+        Returns:
+            Messages about excessive or missing in-text citations and about
+            citations concentrated in one third of the document
+        """
+        issues: list[str] = []
+        # Common in-text citation patterns
+        citation_patterns = [
+            re.compile(r'\([A-Z][a-z]+(?:\s+et\s+al\.)?,?\s*\d{4}\)'),  # (Author, 2024) or (Author et al., 2024)
+            re.compile(r'[A-Z][a-z]+(?:\s+et\s+al\.)?\s+\(\d{4}\)'),    # Author (2024) or Author et al. (2024)
+            re.compile(r'\[\d+\]'),                                       # [1] style citations
+            re.compile(r'\[[\w\s,]+\d{4}\]')                             # [Author 2024] style
+        ]
+        # Check for citation density.
+        # Do not split at the period of "et al.": that would cut citations
+        # such as "(Smith et al., 2020)" in half so they never match. The
+        # trade-off is that a sentence ending in "et al." is merged with the
+        # next one. Blank fragments are dropped so they do not dilute density.
+        sentences = [s for s in re.split(r'(?<!\bet al)[.!?]+', text) if s.strip()]
+        sentences_with_citations = sum(
+            1 for sentence in sentences
+            if any(pattern.search(sentence) for pattern in citation_patterns)
+        )
+        citation_density = sentences_with_citations / len(sentences) if sentences else 0
+        # Flag unusual citation patterns
+        if len(sentences) > 10:
+            if citation_density > 0.8:
+                issues.append(f"Excessive citation density ({citation_density:.1%} of sentences)")
+            elif citation_density < 0.05 and len(references) > 5:
+                issues.append("Many references but few in-text citations")
+        # Check for citation clustering across equal character thirds of the text
+        text_thirds = [text[:len(text)//3], text[len(text)//3:2*len(text)//3], text[2*len(text)//3:]]
+        citations_per_third = [
+            sum(len(pattern.findall(third)) for pattern in citation_patterns)
+            for third in text_thirds
+        ]
+        total_citations = sum(citations_per_third)
+        if total_citations > 10:
+            for i, count in enumerate(citations_per_third):
+                if count > total_citations * 0.7:
+                    position = ['beginning', 'middle', 'end'][i]
+                    issues.append(f"Citations heavily clustered in {position} of document")
+        return issues
+    def _detect_style_inconsistencies(self, text: str) -> list[str]:
+        """Detect inconsistencies in writing style
+        Checks, over blank-line separated paragraphs:
+        1. Average sentence length deviating more than 50% from the mean
+        2. US and UK spellings of the same word both present
+        3. A formal paragraph directly followed by an informal one, or the reverse
+        Returns:
+            Issue messages; empty when the text has fewer than 3 paragraphs.
+            Paragraph numbers are 1-based positions among non-blank paragraphs.
+        """
+        issues: list[str] = []
+        # Split text into paragraphs
+        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
+        if len(paragraphs) < 3:
+            return issues
+        # Analyze sentence complexity variation. The paragraph number is kept
+        # with each measurement because paragraphs without any sentence text
+        # are skipped and must not shift the numbering of later ones.
+        paragraph_complexities: list[tuple[int, float]] = []
+        for number, para in enumerate(paragraphs, start=1):
+            sentences = [s for s in re.split(r'[.!?]+', para) if s.strip()]
+            if sentences:
+                avg_length = sum(len(s.split()) for s in sentences) / len(sentences)
+                paragraph_complexities.append((number, avg_length))
+        if paragraph_complexities:
+            avg_complexity = sum(c for _, c in paragraph_complexities) / len(paragraph_complexities)
+            for number, complexity in paragraph_complexities:
+                deviation = abs(complexity - avg_complexity) / avg_complexity if avg_complexity > 0 else 0
+                if deviation > 0.5:  # More than 50% deviation
+                    issues.append(f"Paragraph {number} has significantly different sentence complexity")
+        # Check for spelling variety mixing (US vs UK)
+        us_uk_pairs = [
+            (r'\bcolor\b', r'\bcolour\b'),
+            (r'\banalyze\b', r'\banalyse\b'),
+            (r'\borganize\b', r'\borganise\b'),
+            (r'\bcenter\b', r'\bcentre\b'),
+            (r'\boptimize\b', r'\boptimise\b')
+        ]
+        mixed_spelling = any(
+            re.search(us_pattern, text, re.IGNORECASE) and re.search(uk_pattern, text, re.IGNORECASE)
+            for us_pattern, uk_pattern in us_uk_pairs
+        )
+        if mixed_spelling:
+            issues.append("Mixed US/UK spelling detected (possible copy-paste from multiple sources)")
+        # Check for sudden tone shifts
+        formal_indicators = ['furthermore', 'moreover', 'consequently', 'therefore', 'thus']
+        informal_indicators = ["it's", "don't", "won't", "can't", "shouldn't", "you'll"]
+        def count_present(indicators: list[str], body: str) -> int:
+            """Count indicators that occur in body as whole words."""
+            # Whole-word match: "thus" must not be found inside "enthusiasm".
+            return sum(
+                1 for word in indicators
+                if re.search(r'(?<!\w)' + re.escape(word) + r'(?!\w)', body)
+            )
+        para_formality = []
+        for para in paragraphs:
+            para_lower = para.lower()
+            formal_count = count_present(formal_indicators, para_lower)
+            informal_count = count_present(informal_indicators, para_lower)
+            if formal_count > informal_count * 2:
+                para_formality.append('formal')
+            elif informal_count > formal_count * 2:
+                para_formality.append('informal')
+            else:
+                para_formality.append('neutral')
+        # Check for abrupt tone changes between adjacent paragraphs
+        for i in range(1, len(para_formality)):
+            if para_formality[i-1] == 'formal' and para_formality[i] == 'informal':
+                issues.append(f"Abrupt tone shift from formal to informal at paragraph {i+1}")
+            elif para_formality[i-1] == 'informal' and para_formality[i] == 'formal':
+                issues.append(f"Abrupt tone shift from informal to formal at paragraph {i+1}")
+        return issues
+    def _calculate_integrity_score(self, ai_indicators: dict, self_plagiarism: list,
+                                  citation_anomalies: list, style_inconsistencies: list) -> float:
+        """Turn the individual findings into one score from 0 to 100 (higher is better).
+        Starting from 100, the score loses 30 points scaled by the AI
+        confidence, 10 per self-plagiarism issue, and 5 per citation anomaly
+        or style inconsistency. The result is clamped to the 0-100 range.
+        """
+        deductions = (
+            ai_indicators.get('confidence', 0) * 30,  # AI likelihood
+            len(self_plagiarism) * 10,                # overlap with other documents
+            len(citation_anomalies) * 5,              # citation problems
+            len(style_inconsistencies) * 5,           # style problems
+        )
+        remaining = 100.0
+        for amount in deductions:
+            remaining -= amount
+        # Clamp into the documented range
+        final_score: float = max(0.0, min(100.0, remaining))
+        return final_score