document-analyser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
app/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # FastAPI backend for CiteSight
@@ -0,0 +1 @@
1
+ # Analysis modules package
@@ -0,0 +1,173 @@
1
+ """Domain mapping analyzer using semantic similarity with sentence-transformers."""
2
+
3
+ from typing import Any, Literal
4
+
5
+ try:
6
+ from sentence_transformers import SentenceTransformer
7
+ import numpy as np
8
+ except ImportError:
9
+ SentenceTransformer = None
10
+ np = None
11
+
12
+ from app.models.schemas import DomainMapping, DomainMappingResponse
13
+
14
+
15
class DomainMapper:
    """Map document sections to user-defined domains using semantic similarity.

    The text is cut into sections with simple header heuristics, every section
    and every domain label is embedded with a sentence-transformers model, and
    each section is assigned the domain whose embedding has the highest cosine
    similarity to it.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        """Initialize the mapper and try to load the embedding model.

        ``self.model`` stays ``None`` when sentence-transformers is not
        installed or the model cannot be loaded; ``analyze`` then returns an
        empty response instead of raising.

        Args:
            model_name: Model name handed to ``SentenceTransformer``.
        """
        self.model_name = model_name
        self.model = None

        if SentenceTransformer:
            try:
                self.model = SentenceTransformer(model_name)
            except Exception:
                # Loading is best effort (the analyzer degrades to "no
                # mappings"), but the reason must not vanish silently.
                import logging

                logging.getLogger(__name__).warning(
                    "Could not load sentence-transformers model %r; "
                    "domain mapping is disabled",
                    model_name,
                    exc_info=True,
                )
                self.model = None

    def analyze(self, text: str, domains: list[str]) -> DomainMappingResponse:
        """
        Map sections to domains using cosine similarity.

        Args:
            text: Document text to analyze
            domains: List of domain labels to map sections to

        Returns:
            DomainMappingResponse with section-domain mappings. When the text
            is blank, no domains are given, or no model is loaded, a response
            with ``total_sections=0`` is returned.
        """
        if not text.strip() or not self.model or not domains:
            return DomainMappingResponse(total_sections=0)

        # 1. Detect sections using heuristic patterns
        sections = self._detect_sections(text)

        # 2. Generate embeddings for sections and domains
        section_embeddings = self.model.encode([s["text"] for s in sections])
        domain_embeddings = self.model.encode(domains)

        # 3. Calculate cosine similarities and build mappings
        mappings = []
        for index, (section, section_emb) in enumerate(zip(sections, section_embeddings)):
            similarities = self._cosine_similarities(section_emb, domain_embeddings)

            # The primary domain is the one with the highest similarity.
            best_idx = int(np.argmax(similarities))
            best_score = float(similarities[best_idx])

            mappings.append(DomainMapping(
                section_text=section["text"][:200],  # Truncate for response
                section_index=index,
                primary_domain=domains[best_idx],
                similarity_score=best_score,
                all_domain_scores={d: float(s) for d, s in zip(domains, similarities)},
                confidence=self._calculate_confidence(best_score),
            ))

        # 4. Count how many sections landed on each domain
        domain_distribution: dict[str, int] = {}
        for mapping in mappings:
            domain_distribution[mapping.primary_domain] = (
                domain_distribution.get(mapping.primary_domain, 0) + 1
            )

        # 5. The reported average confidence is the mean best similarity score
        avg_conf = float(np.mean([m.similarity_score for m in mappings])) if mappings else 0.0

        return DomainMappingResponse(
            total_sections=len(sections),
            domains_analyzed=domains,
            mappings=mappings,
            domain_distribution=domain_distribution,
            average_confidence=avg_conf,
        )

    def _detect_sections(self, text: str) -> list[dict[str, str]]:
        """
        Detect sections using heuristic patterns.

        The text is split into paragraphs on blank lines. A paragraph whose
        first line looks like a header (see ``_looks_like_header``) starts a
        new section; the header line itself is stored under ``"header"`` and
        is not part of the section text. Every other paragraph is appended to
        the current section. The first non-blank paragraph never starts a
        section: it always belongs to the implicit "Introduction" section.

        Args:
            text: Full document text

        Returns:
            List of ``{"header": ..., "text": ...}`` dicts for all sections
            with non-empty text. Paragraphs inside a section are joined with
            a blank line and carry no leading separator. If nothing was
            collected, the whole text is returned as one "Document" section.
        """
        section_keywords = (
            "introduction", "background", "methodology", "methods", "results",
            "discussion", "conclusion", "abstract", "summary", "findings",
            "recommendations", "analysis", "overview", "scope",
        )

        # Each entry is (header, paragraphs collected so far).
        collected: list[tuple[str, list[str]]] = [("Introduction", [])]
        seen_paragraph = False

        for raw_paragraph in text.split("\n\n"):
            para = raw_paragraph.strip()
            if not para:
                continue

            lines = para.split("\n")
            first_line = lines[0].strip()

            # Track "first paragraph" by content, not by split index, so that
            # leading blank lines do not change the outcome.
            is_first = not seen_paragraph
            seen_paragraph = True

            if not is_first and self._looks_like_header(first_line, section_keywords):
                rest = "\n".join(lines[1:]).strip()
                collected.append((first_line, [rest] if rest else []))
            else:
                collected[-1][1].append(para)

        sections = [
            {"header": header, "text": "\n\n".join(parts)}
            for header, parts in collected
            if parts
        ]

        # If no sections detected, treat entire text as one section
        if not sections:
            sections = [{"header": "Document", "text": text}]

        return sections

    @staticmethod
    def _looks_like_header(first_line: str, keywords: tuple[str, ...]) -> bool:
        """Return True when a paragraph's first line looks like a section header.

        Two patterns qualify:
        1. Mostly upper-case: more than 50% of the *letters* are upper-case
           and the line is shorter than 100 characters. Digits, spaces and
           punctuation are ignored so numbered headings ("3.1 DATA") count.
        2. A line shorter than 60 characters in which some word starts with
           one of ``keywords`` ("Conclusions" matches "conclusion", while
           "microscope" does not match "scope").
        """
        if not first_line:
            return False

        letters = [ch for ch in first_line if ch.isalpha()]
        if letters and len(first_line) < 100:
            upper_count = sum(1 for ch in letters if ch.isupper())
            if upper_count / len(letters) > 0.5:
                return True

        if len(first_line) < 60:
            # Replace punctuation with spaces so "Results:" yields "results".
            normalized = "".join(ch if ch.isalnum() else " " for ch in first_line.lower())
            return any(word.startswith(keywords) for word in normalized.split())

        return False

    def _cosine_similarities(
        self, vec1: Any, vec2_matrix: Any
    ) -> Any:
        """Calculate cosine similarity between vec1 and each row in vec2_matrix.

        Args:
            vec1: A single vector (array-like)
            vec2_matrix: Array-like with one vector per row; a single vector
                is treated as a one-row matrix

        Returns:
            NumPy array with one similarity per row, or an empty list when
            NumPy is unavailable. A zero-length vector on either side gives a
            similarity of 0.0 instead of NaN.
        """
        if np is None:
            return []

        vector = np.asarray(vec1, dtype=float)
        matrix = np.atleast_2d(np.asarray(vec2_matrix, dtype=float))

        # Product of the two norms for every row; zero means "no direction".
        denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
        has_direction = denominators > 0

        # Divide by 1 where the denominator is zero to avoid NaN and warnings;
        # those entries are overwritten with 0.0 below.
        safe_denominators = np.where(has_direction, denominators, 1.0)
        return np.where(has_direction, np.dot(matrix, vector) / safe_denominators, 0.0)

    def _calculate_confidence(
        self, best_score: float
    ) -> Literal["high", "medium", "low"]:
        """Determine confidence level based on similarity score.

        Scores above 0.7 are "high", scores above 0.5 are "medium",
        everything else is "low".
        """
        if best_score > 0.7:
            return "high"
        if best_score > 0.5:
            return "medium"
        return "low"
@@ -0,0 +1,386 @@
1
+ """
2
+ Integrity checker for detecting AI patterns and content authenticity issues
3
+ """
4
+
5
+ import json
6
+ import re
7
+ from collections import Counter
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from app.models.schemas import SuspiciousPatterns
12
+
13
+
14
class IntegrityChecker:
    """Detects AI patterns, suspicious content, and integrity issues in text"""

    def __init__(self) -> None:
        """Initialize the integrity checker with AI pattern data"""
        self.patterns_file = Path(__file__).parent.parent / "data" / "ai_patterns.json"
        self.ai_patterns = self._load_patterns()

    def _load_patterns(self) -> dict[str, Any]:
        """Load AI detection patterns from JSON file

        Returns:
            The parsed JSON document, or a minimal fallback (empty pattern
            lists plus default word-frequency thresholds) when the file does
            not exist. Other errors, such as invalid JSON, propagate.
        """
        try:
            with self.patterns_file.open(encoding='utf-8') as f:
                data: dict[str, Any] = json.load(f)
        except FileNotFoundError:
            # Fallback to minimal patterns if file not found
            fallback: dict[str, Any] = {
                "patterns": {
                    "ai_phrases": [],
                    "ai_verbs": [],
                    "ai_adjectives": [],
                    "llm_artifacts": []
                },
                "thresholds": {
                    "ai_word_frequency": {"high": 0.15, "medium": 0.08, "low": 0.03}
                }
            }
            return fallback
        return data

    def detect_patterns(self, text: str, references: list, documents: list[str] | None = None) -> SuspiciousPatterns:
        """
        Detect suspicious patterns including AI-generated content indicators

        Args:
            text: The text to analyze
            references: List of references/citations
            documents: Optional list of other documents for comparison

        Returns:
            SuspiciousPatterns with detected issues
        """
        # Normalize text for analysis
        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)

        ai_indicators = self._detect_ai_patterns(text, text_lower, words, len(words))
        self_plagiarism = self._detect_self_plagiarism(text, documents) if documents else []
        citation_anomalies = self._detect_citation_anomalies(text, references)
        style_inconsistencies = self._detect_style_inconsistencies(text)

        integrity_score = self._calculate_integrity_score(
            ai_indicators,
            self_plagiarism,
            citation_anomalies,
            style_inconsistencies
        )

        # Compile all issues, AI findings first
        all_issues = []
        if ai_indicators['risk_level'] in ['medium', 'high']:
            all_issues.append(f"AI content detected (confidence: {ai_indicators['confidence']:.1%})")

        # .get(): tolerate indicator dicts that carry no artifact list.
        artifacts = ai_indicators.get('llm_artifacts') or []
        if artifacts:
            all_issues.append(f"LLM artifacts found: {', '.join(artifacts[:3])}")

        all_issues.extend(self_plagiarism)
        all_issues.extend(citation_anomalies)
        all_issues.extend(style_inconsistencies)

        return SuspiciousPatterns(
            self_plagiarism=self_plagiarism,
            citation_anomalies=citation_anomalies,
            style_inconsistencies=style_inconsistencies,
            ai_indicators=ai_indicators,
            integrity_score=integrity_score,
            all_issues=all_issues
        )

    @staticmethod
    def _tier_factor(value: float, tiers: dict[str, Any], defaults: tuple[float, float, float]) -> float:
        """Map a measurement onto 1.0 / 0.6 / 0.3 / 0.0 using high/medium/low tiers.

        ``defaults`` supplies the (high, medium, low) limits for tiers that
        are missing from ``tiers``.
        """
        high, medium, low = defaults
        if value >= tiers.get('high', high):
            return 1.0
        if value >= tiers.get('medium', medium):
            return 0.6
        if value >= tiers.get('low', low):
            return 0.3
        return 0.0

    def _detect_ai_patterns(self, text: str, text_lower: str, words: list[str], total_words: int) -> dict[str, Any]:
        """Detect AI-generated content patterns

        Args:
            text: Original text (used for em-dash and bullet statistics)
            text_lower: Lower-cased text (used for phrase/artifact lookup)
            words: Lower-cased word tokens of the text
            total_words: Number of tokens in ``words``

        Returns:
            Dict with ``risk_level``, ``confidence`` (weighted score),
            ``detected_patterns``, ``llm_artifacts`` and ``disclaimer``. The
            same keys are present for empty input so callers can index them
            unconditionally.
        """
        disclaimer = self.ai_patterns.get('disclaimer', '')

        if total_words == 0:
            return {
                'risk_level': 'low',
                'confidence': 0.0,
                'detected_patterns': {},
                'llm_artifacts': [],
                'disclaimer': disclaimer
            }

        patterns = self.ai_patterns.get('patterns', {})
        thresholds = self.ai_patterns.get('thresholds', {})
        # Configured weights override the defaults key by key, so a partial
        # "weights" object in the pattern file cannot cause a KeyError.
        weights = {
            'ai_words': 0.25,
            'ai_phrases': 0.3,
            'llm_artifacts': 0.35,
            'structural_markers': 0.1,
            **(self.ai_patterns.get('weights') or {})
        }

        # AI words (verbs and adjectives), most frequent first; ties are
        # ordered alphabetically so the output is deterministic.
        ai_words = set(patterns.get('ai_verbs', []) + patterns.get('ai_adjectives', []))
        word_counter = Counter(words)
        detected_ai_words = sorted(
            ((word, word_counter[word]) for word in ai_words if word in word_counter),
            key=lambda item: (-item[1], item[0])
        )
        ai_word_count = sum(count for _, count in detected_ai_words)

        # AI phrases: total occurrences plus the list of phrases seen
        ai_phrase_count = 0
        detected_ai_phrases = []
        for phrase in patterns.get('ai_phrases', []):
            occurrences = text_lower.count(phrase)
            if occurrences:
                ai_phrase_count += occurrences
                detected_ai_phrases.append(phrase)

        llm_artifacts = [
            artifact for artifact in patterns.get('llm_artifacts', [])
            if artifact in text_lower
        ]

        # Structural patterns: em-dash frequency and share of bullet lines
        structural = patterns.get('structural_patterns', {})
        em_dash = structural.get('excessive_em_dash', '—')
        bullets = tuple(structural.get('bullet_indicators', ['•', '-', '*']))
        lines = text.split('\n')
        bullet_lines = sum(1 for line in lines if line.strip().startswith(bullets))

        results: dict[str, Any] = {
            'ai_word_frequency': ai_word_count / total_words,
            'ai_phrase_count': ai_phrase_count,
            'llm_artifacts': llm_artifacts,
            'em_dash_frequency': text.count(em_dash) / total_words,
            'bullet_ratio': bullet_lines / len(lines) if lines else 0,
            'detected_ai_words': detected_ai_words,
            'detected_ai_phrases': detected_ai_phrases
        }

        # Calculate overall AI confidence score as a weighted sum
        score = 0.0
        score += weights['ai_words'] * self._tier_factor(
            results['ai_word_frequency'],
            thresholds.get('ai_word_frequency', {}),
            (0.15, 0.08, 0.03)
        )
        score += weights['ai_phrases'] * self._tier_factor(
            results['ai_phrase_count'],
            thresholds.get('ai_phrase_density', {}),
            (5, 3, 1)
        )

        # LLM artifacts count fully once three distinct ones are found.
        if llm_artifacts:
            score += weights['llm_artifacts'] * min(1.0, len(llm_artifacts) / 3)

        structural_score = 0.0
        if results['em_dash_frequency'] >= thresholds.get('em_dash_frequency', {}).get('high', 0.02):
            structural_score += 0.5
        if results['bullet_ratio'] >= thresholds.get('bullet_ratio', {}).get('high', 0.3):
            structural_score += 0.5
        score += weights['structural_markers'] * structural_score

        # Determine risk level
        overall_thresholds = thresholds.get('overall_risk', {})
        if score >= overall_thresholds.get('high', 0.7):
            risk_level = 'high'
        elif score >= overall_thresholds.get('medium', 0.4):
            risk_level = 'medium'
        else:
            risk_level = 'low'

        return {
            'risk_level': risk_level,
            'confidence': score,
            'detected_patterns': results,
            'llm_artifacts': llm_artifacts,
            'disclaimer': disclaimer
        }

    @staticmethod
    def _sentence_set(text: str) -> set[str]:
        """Return the lower-cased sentences of ``text`` longer than 20 characters."""
        return {
            fragment.strip().lower()
            for fragment in re.split(r'[.!?]+', text)
            if len(fragment.strip()) > 20
        }

    def _detect_self_plagiarism(self, text: str, documents: list[str] | None) -> list[str]:
        """Detect potential self-plagiarism by comparing with other documents

        A document is reported when it shares more than three sentences with
        ``text`` and those sentences make up more than 10% of the sentences
        of ``text``. Documents equal to ``text`` are skipped, and nothing is
        compared when fewer than two documents are supplied.

        Args:
            text: The text being checked
            documents: Documents to compare against

        Returns:
            One message per overlapping document, numbered from 1 by its
            position in ``documents``
        """
        if not documents or len(documents) < 2:
            return []

        issues = []
        text_sentences = self._sentence_set(text)

        for position, doc in enumerate(documents, start=1):
            if doc == text:
                continue

            overlap = text_sentences & self._sentence_set(doc)
            if len(overlap) > 3:
                overlap_ratio = len(overlap) / len(text_sentences) if text_sentences else 0
                if overlap_ratio > 0.1:
                    issues.append(f"Significant text overlap ({overlap_ratio:.1%}) with document {position}")

        return issues

    def _detect_citation_anomalies(self, text: str, references: list) -> list[str]:
        """Detect issues with citations and references

        Checks the share of sentences that carry an in-text citation
        (only for texts with more than 10 sentences) and whether more than
        70% of all citations sit in one third of the text (only when there
        are more than 10 citations).

        Args:
            text: The text to analyze
            references: List of references/citations; only its length is used

        Returns:
            List of human-readable anomaly messages
        """
        issues = []

        # Common in-text citation patterns
        citation_patterns = [
            re.compile(r'\([A-Z][a-z]+(?:\s+et\s+al\.)?,?\s*\d{4}\)'),  # (Author, 2024) or (Author et al., 2024)
            re.compile(r'[A-Z][a-z]+(?:\s+et\s+al\.)?\s+\(\d{4}\)'),  # Author (2024) or Author et al. (2024)
            re.compile(r'\[\d+\]'),  # [1] style citations
            re.compile(r'\[[\w\s,]+\d{4}\]')  # [Author 2024] style
        ]

        # Do not split on the period of "et al." - that would cut such
        # citations in half and hide them from the density check. Empty
        # fragments (e.g. after the final period) are not sentences.
        sentences = [s for s in re.split(r'(?<!\bal)[.!?]+', text) if s.strip()]
        sentences_with_citations = sum(
            1 for sentence in sentences
            if any(pattern.search(sentence) for pattern in citation_patterns)
        )
        citation_density = sentences_with_citations / len(sentences) if sentences else 0

        # Flag unusual citation patterns
        if len(sentences) > 10:
            if citation_density > 0.8:
                issues.append(f"Excessive citation density ({citation_density:.1%} of sentences)")
            elif citation_density < 0.05 and len(references) > 5:
                issues.append("Many references but few in-text citations")

        # Check for citation clustering. Citations are located in the full
        # text and bucketed by start offset, so none is lost at a boundary;
        # identical spans matched by two patterns are counted once.
        spans = {
            match.span()
            for pattern in citation_patterns
            for match in pattern.finditer(text)
        }
        first_cut = len(text) // 3
        second_cut = 2 * len(text) // 3
        citations_per_third = [0, 0, 0]
        for start, _end in spans:
            if start < first_cut:
                citations_per_third[0] += 1
            elif start < second_cut:
                citations_per_third[1] += 1
            else:
                citations_per_third[2] += 1

        total_citations = len(spans)
        if total_citations > 10:
            for position, count in zip(('beginning', 'middle', 'end'), citations_per_third):
                if count > total_citations * 0.7:
                    issues.append(f"Citations heavily clustered in {position} of document")

        return issues

    def _detect_style_inconsistencies(self, text: str) -> list[str]:
        """Detect inconsistencies in writing style

        Looks at three signals across blank-line separated paragraphs:
        average sentence length per paragraph, mixed US/UK spelling, and
        formal/informal tone flips between neighbouring paragraphs. Texts
        with fewer than three paragraphs are not analysed.

        Args:
            text: The text to analyze

        Returns:
            List of human-readable messages; paragraph numbers are 1-based
            positions among the non-empty paragraphs
        """
        issues: list[str] = []

        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        if len(paragraphs) < 3:
            return issues

        # Average words per sentence, remembered together with the paragraph
        # number so a paragraph without sentences cannot shift the numbering.
        paragraph_complexities: list[tuple[int, float]] = []
        for number, para in enumerate(paragraphs, start=1):
            sentences = [s for s in re.split(r'[.!?]+', para) if s.strip()]
            if sentences:
                avg_length = sum(len(s.split()) for s in sentences) / len(sentences)
                paragraph_complexities.append((number, avg_length))

        if paragraph_complexities:
            avg_complexity = sum(c for _, c in paragraph_complexities) / len(paragraph_complexities)
            for number, complexity in paragraph_complexities:
                deviation = abs(complexity - avg_complexity) / avg_complexity if avg_complexity > 0 else 0
                if deviation > 0.5:  # More than 50% deviation
                    issues.append(f"Paragraph {number} has significantly different sentence complexity")

        # Check for spelling variety mixing (US vs UK)
        us_uk_pairs = [
            (r'\bcolor\b', r'\bcolour\b'),
            (r'\banalyze\b', r'\banalyse\b'),
            (r'\borganize\b', r'\borganise\b'),
            (r'\bcenter\b', r'\bcentre\b'),
            (r'\boptimize\b', r'\boptimise\b')
        ]
        mixed_spelling = any(
            re.search(us_pattern, text, re.IGNORECASE) and re.search(uk_pattern, text, re.IGNORECASE)
            for us_pattern, uk_pattern in us_uk_pairs
        )
        if mixed_spelling:
            issues.append("Mixed US/UK spelling detected (possible copy-paste from multiple sources)")

        # Check for sudden tone shifts
        formal_indicators = ['furthermore', 'moreover', 'consequently', 'therefore', 'thus']
        informal_indicators = ["it's", "don't", "won't", "can't", "shouldn't", "you'll"]

        para_formality = []
        for para in paragraphs:
            # Compare whole words: a substring test would find "thus" inside
            # "enthusiasm". Typographic apostrophes are folded to "'" so that
            # contractions are recognised either way.
            normalized = para.lower().replace('\u2019', "'")
            tokens = {token.strip("'") for token in re.findall(r"[a-z']+", normalized)}
            formal_count = sum(1 for word in formal_indicators if word in tokens)
            informal_count = sum(1 for word in informal_indicators if word in tokens)

            if formal_count > informal_count * 2:
                para_formality.append('formal')
            elif informal_count > formal_count * 2:
                para_formality.append('informal')
            else:
                para_formality.append('neutral')

        # Check for abrupt tone changes between neighbouring paragraphs
        for i in range(1, len(para_formality)):
            previous, current = para_formality[i - 1], para_formality[i]
            if previous == 'formal' and current == 'informal':
                issues.append(f"Abrupt tone shift from formal to informal at paragraph {i+1}")
            elif previous == 'informal' and current == 'formal':
                issues.append(f"Abrupt tone shift from informal to formal at paragraph {i+1}")

        return issues

    def _calculate_integrity_score(self, ai_indicators: dict, self_plagiarism: list,
                                   citation_anomalies: list, style_inconsistencies: list) -> float:
        """Calculate overall document integrity score (0-100, higher is better)

        Starts at 100 and deducts ``confidence * 30`` for AI indicators,
        10 points per self-plagiarism issue and 5 points per citation or
        style issue. The result is clamped to the 0-100 range.
        """
        base_score = 100.0
        base_score -= ai_indicators.get('confidence', 0) * 30  # Max 30 point deduction for AI
        base_score -= len(self_plagiarism) * 10  # 10 points per issue
        base_score -= len(citation_anomalies) * 5  # 5 points per issue
        base_score -= len(style_inconsistencies) * 5  # 5 points per issue

        # Ensure score stays within 0-100 range
        final_score: float = max(0.0, min(100.0, base_score))
        return final_score