cite-agent 1.3.6__py3-none-any.whl → 1.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cite-agent might be problematic. Click here for more details.
- cite_agent/__version__.py +1 -1
- cite_agent/cli.py +9 -2
- cite_agent/enhanced_ai_agent.py +332 -73
- {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/METADATA +1 -1
- cite_agent-1.3.7.dist-info/RECORD +31 -0
- {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/top_level.txt +0 -1
- cite_agent-1.3.6.dist-info/RECORD +0 -57
- src/__init__.py +0 -1
- src/services/__init__.py +0 -132
- src/services/auth_service/__init__.py +0 -3
- src/services/auth_service/auth_manager.py +0 -33
- src/services/graph/__init__.py +0 -1
- src/services/graph/knowledge_graph.py +0 -194
- src/services/llm_service/__init__.py +0 -5
- src/services/llm_service/llm_manager.py +0 -495
- src/services/paper_service/__init__.py +0 -5
- src/services/paper_service/openalex.py +0 -231
- src/services/performance_service/__init__.py +0 -1
- src/services/performance_service/rust_performance.py +0 -395
- src/services/research_service/__init__.py +0 -23
- src/services/research_service/chatbot.py +0 -2056
- src/services/research_service/citation_manager.py +0 -436
- src/services/research_service/context_manager.py +0 -1441
- src/services/research_service/conversation_manager.py +0 -597
- src/services/research_service/critical_paper_detector.py +0 -577
- src/services/research_service/enhanced_research.py +0 -121
- src/services/research_service/enhanced_synthesizer.py +0 -375
- src/services/research_service/query_generator.py +0 -777
- src/services/research_service/synthesizer.py +0 -1273
- src/services/search_service/__init__.py +0 -5
- src/services/search_service/indexer.py +0 -186
- src/services/search_service/search_engine.py +0 -342
- src/services/simple_enhanced_main.py +0 -287
- {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/WHEEL +0 -0
- {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/entry_points.txt +0 -0
- {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,577 +0,0 @@
|
|
|
1
|
-
# src/services/research_service/critical_paper_detector.py
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
import re
|
|
5
|
-
import math
|
|
6
|
-
from typing import List, Dict, Any, Set, Optional
|
|
7
|
-
from collections import Counter
|
|
8
|
-
import networkx as nx # type: ignore[import]
|
|
9
|
-
from datetime import datetime, timezone
|
|
10
|
-
|
|
11
|
-
# Configure structured logging
|
|
12
|
-
logger = logging.getLogger(__name__)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def _utc_timestamp() -> str:
|
|
16
|
-
return datetime.now(timezone.utc).isoformat()
|
|
17
|
-
|
|
18
|
-
class CriticalPaperDetector:
|
|
19
|
-
"""
|
|
20
|
-
Enhanced critical paper detector with comprehensive error handling, security, and observability.
|
|
21
|
-
|
|
22
|
-
Features:
|
|
23
|
-
- Secure paper analysis and scoring
|
|
24
|
-
- Input validation and sanitization
|
|
25
|
-
- Comprehensive error handling and fallback logic
|
|
26
|
-
- Structured logging and monitoring
|
|
27
|
-
- Protection against injection attacks
|
|
28
|
-
- Multi-factor paper importance scoring
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
def __init__(self, db_operations=None):
|
|
32
|
-
"""
|
|
33
|
-
Initialize detector with enhanced security and error handling.
|
|
34
|
-
|
|
35
|
-
Args:
|
|
36
|
-
db_operations: Optional database operations instance
|
|
37
|
-
|
|
38
|
-
Raises:
|
|
39
|
-
ValueError: If initialization fails
|
|
40
|
-
"""
|
|
41
|
-
try:
|
|
42
|
-
#logger.info("Initializing CriticalPaperDetector with enhanced security")
|
|
43
|
-
|
|
44
|
-
self.db = db_operations
|
|
45
|
-
|
|
46
|
-
# Define importance indicators with enhanced coverage
|
|
47
|
-
self.method_terms = {
|
|
48
|
-
"novel", "methodology", "approach", "framework", "technique",
|
|
49
|
-
"algorithm", "protocol", "procedure", "process", "method",
|
|
50
|
-
"implementation", "design", "architecture", "strategy",
|
|
51
|
-
"paradigm", "model", "system", "mechanism", "solution"
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
self.result_terms = {
|
|
55
|
-
"significant", "breakthrough", "discovery", "finding",
|
|
56
|
-
"evidence", "proves", "demonstrates", "shows", "reveals",
|
|
57
|
-
"establishes", "confirms", "validates", "supports", "indicates",
|
|
58
|
-
"suggests", "implies", "concludes", "determines", "identifies"
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
self.contradiction_terms = {
|
|
62
|
-
"contrary", "opposed", "conflict", "contradiction", "inconsistent",
|
|
63
|
-
"challenge", "dispute", "unlike", "differs", "contrast",
|
|
64
|
-
"disagreement", "debate", "controversy", "question", "doubt",
|
|
65
|
-
"skepticism", "criticism", "limitation", "weakness", "flaw"
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
#logger.info("CriticalPaperDetector initialized successfully")
|
|
69
|
-
|
|
70
|
-
except Exception as e:
|
|
71
|
-
logger.error(f"Failed to initialize CriticalPaperDetector: {str(e)}")
|
|
72
|
-
raise
|
|
73
|
-
|
|
74
|
-
def _validate_papers(self, papers: List[Dict[str, Any]]) -> None:
|
|
75
|
-
"""
|
|
76
|
-
Validate papers list for security and safety.
|
|
77
|
-
|
|
78
|
-
Args:
|
|
79
|
-
papers: Papers list to validate
|
|
80
|
-
|
|
81
|
-
Raises:
|
|
82
|
-
ValueError: If papers list is invalid
|
|
83
|
-
"""
|
|
84
|
-
if not isinstance(papers, list):
|
|
85
|
-
raise ValueError("Papers must be a list")
|
|
86
|
-
|
|
87
|
-
if len(papers) > 1000: # Reasonable limit
|
|
88
|
-
raise ValueError("Too many papers (max 1000)")
|
|
89
|
-
|
|
90
|
-
for i, paper in enumerate(papers):
|
|
91
|
-
if not isinstance(paper, dict):
|
|
92
|
-
raise ValueError(f"Paper at index {i} must be a dictionary")
|
|
93
|
-
|
|
94
|
-
# Validate required fields
|
|
95
|
-
if "id" not in paper:
|
|
96
|
-
raise ValueError(f"Paper at index {i} missing required 'id' field")
|
|
97
|
-
|
|
98
|
-
paper_id = str(paper["id"])
|
|
99
|
-
if len(paper_id) > 100:
|
|
100
|
-
raise ValueError(f"Paper ID at index {i} too long (max 100 characters)")
|
|
101
|
-
|
|
102
|
-
# Check for potentially dangerous content in text fields
|
|
103
|
-
text_fields = ["title", "abstract", "summary"]
|
|
104
|
-
for field in text_fields:
|
|
105
|
-
if field in paper and paper[field]:
|
|
106
|
-
content = str(paper[field])
|
|
107
|
-
if len(content) > 10000: # Reasonable limit
|
|
108
|
-
raise ValueError(f"Paper {field} at index {i} too long (max 10000 characters)")
|
|
109
|
-
|
|
110
|
-
def _validate_threshold(self, threshold_percentage: int) -> None:
|
|
111
|
-
"""
|
|
112
|
-
Validate threshold percentage.
|
|
113
|
-
|
|
114
|
-
Args:
|
|
115
|
-
threshold_percentage: Threshold percentage to validate
|
|
116
|
-
|
|
117
|
-
Raises:
|
|
118
|
-
ValueError: If threshold is invalid
|
|
119
|
-
"""
|
|
120
|
-
if not isinstance(threshold_percentage, int):
|
|
121
|
-
raise ValueError("Threshold percentage must be an integer")
|
|
122
|
-
|
|
123
|
-
if threshold_percentage < 1 or threshold_percentage > 50:
|
|
124
|
-
raise ValueError("Threshold percentage must be between 1 and 50")
|
|
125
|
-
|
|
126
|
-
def _sanitize_text(self, text: str, max_length: int = 10000) -> str:
|
|
127
|
-
"""
|
|
128
|
-
Sanitize text to prevent injection attacks.
|
|
129
|
-
|
|
130
|
-
Args:
|
|
131
|
-
text: Text to sanitize
|
|
132
|
-
max_length: Maximum allowed length
|
|
133
|
-
|
|
134
|
-
Returns:
|
|
135
|
-
Sanitized text
|
|
136
|
-
"""
|
|
137
|
-
if not isinstance(text, str):
|
|
138
|
-
return ""
|
|
139
|
-
|
|
140
|
-
if len(text) > max_length:
|
|
141
|
-
text = text[:max_length]
|
|
142
|
-
|
|
143
|
-
# Basic XSS protection
|
|
144
|
-
sanitized = text.replace('<', '<').replace('>', '>')
|
|
145
|
-
|
|
146
|
-
# Remove null bytes and other control characters
|
|
147
|
-
sanitized = ''.join(char for char in sanitized if ord(char) >= 32 or char in '\n\r\t')
|
|
148
|
-
|
|
149
|
-
return sanitized.strip()
|
|
150
|
-
|
|
151
|
-
async def identify_critical_papers(self, papers: List[Dict[str, Any]],
                                   threshold_percentage: int = 20) -> List[Dict[str, Any]]:
    """
    Score every paper and return the highest-scoring slice.

    Args:
        papers: List of paper dictionaries with metadata; each needs an "id".
        threshold_percentage: Percentage of the input to mark as critical.
            At least one paper is selected for non-empty input.

    Returns:
        One dict per selected paper, highest score first, with the keys
        "paper_id", "title", "score", "factors" and "analyzed_at". An empty
        list is returned for empty input or when an unexpected
        (non-validation) error occurs.

    Raises:
        ValueError: If ``papers`` or ``threshold_percentage`` fail validation.
    """
    try:
        # Input validation
        self._validate_papers(papers)
        self._validate_threshold(threshold_percentage)

        if not papers:
            return []

        # Score each paper and index it by id in the same pass, so the
        # result loop below needs no linear scan per selected paper.
        # For duplicate ids the first paper is kept, the last score wins.
        paper_scores = {}
        papers_by_id = {}
        for i, paper in enumerate(papers):
            paper_id = paper["id"]
            papers_by_id.setdefault(paper_id, paper)
            try:
                score = self._calculate_paper_score(paper)
            except Exception as e:
                logger.warning(f"Error calculating score for paper {i}: {str(e)}")
                score = 0.0  # Default score
            paper_scores[paper_id] = score

        # Determine threshold based on percentage
        num_critical = max(1, int(len(papers) * threshold_percentage / 100))

        # Get top scoring papers (stable sort: ties keep input order)
        top_papers = sorted(
            paper_scores.items(),
            key=lambda item: item[1],
            reverse=True
        )[:num_critical]

        # Format results; a paper that fails to format is skipped, not fatal.
        results = []
        for paper_id, score in top_papers:
            try:
                paper_data = papers_by_id[paper_id]
                results.append({
                    "paper_id": paper_id,
                    "title": self._sanitize_text(paper_data.get("title", "Unknown"), max_length=200),
                    "score": round(score, 2),
                    "factors": self._get_factor_breakdown(paper_data, score),
                    "analyzed_at": _utc_timestamp()
                })
            except Exception as e:
                logger.warning(f"Error formatting result for paper {paper_id}: {str(e)}")
                continue

        return results

    except ValueError as e:
        logger.error(f"Invalid input for critical paper identification: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"Error identifying critical papers: {str(e)}")
        return []
|
|
223
|
-
|
|
224
|
-
def _calculate_paper_score(self, paper: Dict[str, Any]) -> float:
|
|
225
|
-
"""
|
|
226
|
-
Calculate importance score for a paper with enhanced error handling.
|
|
227
|
-
|
|
228
|
-
Args:
|
|
229
|
-
paper: Paper dictionary with metadata
|
|
230
|
-
|
|
231
|
-
Returns:
|
|
232
|
-
Numerical score (higher = more important)
|
|
233
|
-
"""
|
|
234
|
-
try:
|
|
235
|
-
score = 0.0
|
|
236
|
-
|
|
237
|
-
# Factor 1: Citation impact (if available)
|
|
238
|
-
try:
|
|
239
|
-
citation_score = self._calculate_citation_score(paper)
|
|
240
|
-
score += citation_score * 0.25 # 25% weight
|
|
241
|
-
except Exception as e:
|
|
242
|
-
logger.warning(f"Error calculating citation score: {str(e)}")
|
|
243
|
-
score += 0.0
|
|
244
|
-
|
|
245
|
-
# Factor 2: Recency
|
|
246
|
-
try:
|
|
247
|
-
recency_score = self._calculate_recency_score(paper)
|
|
248
|
-
score += recency_score * 0.15 # 15% weight
|
|
249
|
-
except Exception as e:
|
|
250
|
-
logger.warning(f"Error calculating recency score: {str(e)}")
|
|
251
|
-
score += 5.0 # Default middle score
|
|
252
|
-
|
|
253
|
-
# Factor 3: Title and abstract significance
|
|
254
|
-
try:
|
|
255
|
-
significance_score = self._calculate_significance_score(paper)
|
|
256
|
-
score += significance_score * 0.20 # 20% weight
|
|
257
|
-
except Exception as e:
|
|
258
|
-
logger.warning(f"Error calculating significance score: {str(e)}")
|
|
259
|
-
score += 0.0
|
|
260
|
-
|
|
261
|
-
# Factor 4: Methodology novelty
|
|
262
|
-
try:
|
|
263
|
-
methodology_score = self._calculate_methodology_score(paper)
|
|
264
|
-
score += methodology_score * 0.20 # 20% weight
|
|
265
|
-
except Exception as e:
|
|
266
|
-
logger.warning(f"Error calculating methodology score: {str(e)}")
|
|
267
|
-
score += 0.0
|
|
268
|
-
|
|
269
|
-
# Factor 5: Contradiction potential
|
|
270
|
-
try:
|
|
271
|
-
contradiction_score = self._calculate_contradiction_score(paper)
|
|
272
|
-
score += contradiction_score * 0.20 # 20% weight
|
|
273
|
-
except Exception as e:
|
|
274
|
-
logger.warning(f"Error calculating contradiction score: {str(e)}")
|
|
275
|
-
score += 0.0
|
|
276
|
-
|
|
277
|
-
return max(0.0, min(10.0, score)) # Ensure score is between 0 and 10
|
|
278
|
-
|
|
279
|
-
except Exception as e:
|
|
280
|
-
logger.error(f"Error calculating paper score: {str(e)}")
|
|
281
|
-
return 0.0
|
|
282
|
-
|
|
283
|
-
def _calculate_citation_score(self, paper: Dict[str, Any]) -> float:
|
|
284
|
-
"""
|
|
285
|
-
Calculate score based on citation count with enhanced error handling.
|
|
286
|
-
|
|
287
|
-
Args:
|
|
288
|
-
paper: Paper dictionary
|
|
289
|
-
|
|
290
|
-
Returns:
|
|
291
|
-
Citation score
|
|
292
|
-
"""
|
|
293
|
-
try:
|
|
294
|
-
citation_count = paper.get("citation_count", 0)
|
|
295
|
-
|
|
296
|
-
# Validate citation count
|
|
297
|
-
if not isinstance(citation_count, (int, float)):
|
|
298
|
-
return 0.0
|
|
299
|
-
|
|
300
|
-
citation_count = max(0, int(citation_count))
|
|
301
|
-
|
|
302
|
-
# Log-scale to prevent extremely cited papers from dominating
|
|
303
|
-
if citation_count > 0:
|
|
304
|
-
return min(10.0, 2.0 * math.log10(citation_count + 1))
|
|
305
|
-
return 0.0
|
|
306
|
-
|
|
307
|
-
except Exception as e:
|
|
308
|
-
logger.warning(f"Error calculating citation score: {str(e)}")
|
|
309
|
-
return 0.0
|
|
310
|
-
|
|
311
|
-
def _calculate_recency_score(self, paper: Dict[str, Any]) -> float:
|
|
312
|
-
"""
|
|
313
|
-
Calculate score based on paper recency with enhanced error handling.
|
|
314
|
-
|
|
315
|
-
Args:
|
|
316
|
-
paper: Paper dictionary
|
|
317
|
-
|
|
318
|
-
Returns:
|
|
319
|
-
Recency score
|
|
320
|
-
"""
|
|
321
|
-
try:
|
|
322
|
-
year = paper.get("year")
|
|
323
|
-
|
|
324
|
-
# Try to extract year from various fields
|
|
325
|
-
if not year:
|
|
326
|
-
# Try published_date
|
|
327
|
-
if paper.get("published_date"):
|
|
328
|
-
year_match = re.search(r'20\d\d', str(paper.get("published_date", "")))
|
|
329
|
-
if year_match:
|
|
330
|
-
year = int(year_match.group(0))
|
|
331
|
-
|
|
332
|
-
# Try publication_date
|
|
333
|
-
if not year and paper.get("publication_date"):
|
|
334
|
-
year_match = re.search(r'20\d\d', str(paper.get("publication_date", "")))
|
|
335
|
-
if year_match:
|
|
336
|
-
year = int(year_match.group(0))
|
|
337
|
-
|
|
338
|
-
if not year:
|
|
339
|
-
return 5.0 # Middle score if unknown
|
|
340
|
-
|
|
341
|
-
# Convert to int if it's a string
|
|
342
|
-
if isinstance(year, str):
|
|
343
|
-
try:
|
|
344
|
-
year = int(year)
|
|
345
|
-
except ValueError:
|
|
346
|
-
return 5.0
|
|
347
|
-
|
|
348
|
-
# Validate year range
|
|
349
|
-
current_year = datetime.now().year
|
|
350
|
-
if year < 1900 or year > current_year + 1:
|
|
351
|
-
return 5.0 # Invalid year, use middle score
|
|
352
|
-
|
|
353
|
-
# Scoring by recency
|
|
354
|
-
years_old = current_year - year
|
|
355
|
-
|
|
356
|
-
if years_old <= 1:
|
|
357
|
-
return 10.0 # Very recent (0-1 years)
|
|
358
|
-
elif years_old <= 3:
|
|
359
|
-
return 8.0 # Recent (1-3 years)
|
|
360
|
-
elif years_old <= 5:
|
|
361
|
-
return 6.0 # Somewhat recent (3-5 years)
|
|
362
|
-
elif years_old <= 10:
|
|
363
|
-
return 4.0 # Older but still relevant (5-10 years)
|
|
364
|
-
else:
|
|
365
|
-
return 2.0 # Much older (>10 years)
|
|
366
|
-
|
|
367
|
-
except Exception as e:
|
|
368
|
-
logger.warning(f"Error calculating recency score: {str(e)}")
|
|
369
|
-
return 5.0 # Default middle score
|
|
370
|
-
|
|
371
|
-
def _calculate_significance_score(self, paper: Dict[str, Any]) -> float:
|
|
372
|
-
"""
|
|
373
|
-
Calculate score based on title and abstract significance indicators with enhanced error handling.
|
|
374
|
-
|
|
375
|
-
Args:
|
|
376
|
-
paper: Paper dictionary
|
|
377
|
-
|
|
378
|
-
Returns:
|
|
379
|
-
Significance score
|
|
380
|
-
"""
|
|
381
|
-
try:
|
|
382
|
-
title = self._sanitize_text(paper.get("title", ""), max_length=1000).lower()
|
|
383
|
-
abstract = self._sanitize_text(paper.get("abstract", ""), max_length=5000).lower()
|
|
384
|
-
|
|
385
|
-
combined_text = title + " " + abstract
|
|
386
|
-
|
|
387
|
-
# Check for significant result terms
|
|
388
|
-
result_count = sum(1 for term in self.result_terms if term in combined_text)
|
|
389
|
-
|
|
390
|
-
# Score based on significance indicators
|
|
391
|
-
score = min(10.0, result_count * 2.0)
|
|
392
|
-
|
|
393
|
-
return score
|
|
394
|
-
|
|
395
|
-
except Exception as e:
|
|
396
|
-
logger.warning(f"Error calculating significance score: {str(e)}")
|
|
397
|
-
return 0.0
|
|
398
|
-
|
|
399
|
-
def _calculate_methodology_score(self, paper: Dict[str, Any]) -> float:
|
|
400
|
-
"""
|
|
401
|
-
Calculate score based on methodology innovation indicators with enhanced error handling.
|
|
402
|
-
|
|
403
|
-
Args:
|
|
404
|
-
paper: Paper dictionary
|
|
405
|
-
|
|
406
|
-
Returns:
|
|
407
|
-
Methodology score
|
|
408
|
-
"""
|
|
409
|
-
try:
|
|
410
|
-
title = self._sanitize_text(paper.get("title", ""), max_length=1000).lower()
|
|
411
|
-
abstract = self._sanitize_text(paper.get("abstract", ""), max_length=5000).lower()
|
|
412
|
-
|
|
413
|
-
combined_text = title + " " + abstract
|
|
414
|
-
|
|
415
|
-
# Check for methodology terms
|
|
416
|
-
method_count = sum(1 for term in self.method_terms if term in combined_text)
|
|
417
|
-
|
|
418
|
-
# Score based on methodology indicators
|
|
419
|
-
score = min(10.0, method_count * 2.0)
|
|
420
|
-
|
|
421
|
-
return score
|
|
422
|
-
|
|
423
|
-
except Exception as e:
|
|
424
|
-
logger.warning(f"Error calculating methodology score: {str(e)}")
|
|
425
|
-
return 0.0
|
|
426
|
-
|
|
427
|
-
def _calculate_contradiction_score(self, paper: Dict[str, Any]) -> float:
|
|
428
|
-
"""
|
|
429
|
-
Calculate score based on contradiction/challenge indicators with enhanced error handling.
|
|
430
|
-
|
|
431
|
-
Args:
|
|
432
|
-
paper: Paper dictionary
|
|
433
|
-
|
|
434
|
-
Returns:
|
|
435
|
-
Contradiction score
|
|
436
|
-
"""
|
|
437
|
-
try:
|
|
438
|
-
title = self._sanitize_text(paper.get("title", ""), max_length=1000).lower()
|
|
439
|
-
abstract = self._sanitize_text(paper.get("abstract", ""), max_length=5000).lower()
|
|
440
|
-
|
|
441
|
-
combined_text = title + " " + abstract
|
|
442
|
-
|
|
443
|
-
# Check for contradiction terms
|
|
444
|
-
contradiction_count = sum(1 for term in self.contradiction_terms if term in combined_text)
|
|
445
|
-
|
|
446
|
-
# Score based on contradiction indicators
|
|
447
|
-
score = min(10.0, contradiction_count * 2.0)
|
|
448
|
-
|
|
449
|
-
return score
|
|
450
|
-
|
|
451
|
-
except Exception as e:
|
|
452
|
-
logger.warning(f"Error calculating contradiction score: {str(e)}")
|
|
453
|
-
return 0.0
|
|
454
|
-
|
|
455
|
-
def _get_factor_breakdown(self, paper: Dict[str, Any], total_score: float) -> Dict[str, Any]:
    """
    Report each factor's weighted contribution to a paper's score.

    The factor scorers are re-run here. A scorer that raises contributes
    0.0 to the breakdown.

    Args:
        paper: Paper dictionary.
        total_score: Total calculated score, echoed back rounded.

    Returns:
        Dict with the rounded weighted value of every factor plus
        "total_score" and "calculated_at"; on failure, a dict with only
        "total_score" and an "error" message.
    """
    # (output key, scorer method, weight) in reporting order.
    weighted_factors = (
        ("citation_impact", "_calculate_citation_score", 0.25),
        ("recency", "_calculate_recency_score", 0.15),
        ("significance", "_calculate_significance_score", 0.20),
        ("methodology", "_calculate_methodology_score", 0.20),
        ("contradiction_potential", "_calculate_contradiction_score", 0.20),
    )

    try:
        breakdown = {}

        for key, scorer_name, weight in weighted_factors:
            try:
                breakdown[key] = round(getattr(self, scorer_name)(paper) * weight, 2)
            except Exception:
                breakdown[key] = 0.0

        # Add metadata
        breakdown["total_score"] = round(total_score, 2)
        breakdown["calculated_at"] = _utc_timestamp()

        return breakdown

    except Exception as e:
        logger.warning(f"Error getting factor breakdown: {str(e)}")
        return {
            "total_score": round(total_score, 2),
            "error": "Factor breakdown unavailable"
        }
|
|
507
|
-
|
|
508
|
-
async def health_check(self) -> Dict[str, Any]:
    """
    Report the detector's health.

    Returns:
        Dict with "status" ("healthy", "degraded" or "error"), a UTC
        "timestamp" and, unless the check itself failed, a "components"
        dict with "term_sets" (sizes of the three vocabularies) and
        "database" ("available" or "not_configured").
    """
    try:
        health_status = {
            "status": "healthy",
            "timestamp": _utc_timestamp(),
            "components": {}
        }
        components = health_status["components"]

        # Check term sets; a missing or unsized vocabulary degrades the status.
        try:
            components["term_sets"] = {
                "status": "healthy",
                "counts": {
                    "method_terms": len(self.method_terms),
                    "result_terms": len(self.result_terms),
                    "contradiction_terms": len(self.contradiction_terms)
                }
            }
        except Exception as e:
            components["term_sets"] = {"status": "error", "error": str(e)}
            health_status["status"] = "degraded"

        # Compare against None, as get_stats does, so a configured handle
        # that happens to be falsy is not reported as missing. No probe is
        # made against the handle, so nothing here can raise.
        if self.db is not None:
            components["database"] = {"status": "available"}
        else:
            components["database"] = {"status": "not_configured"}

        return health_status

    except Exception as e:
        logger.error(f"Health check failed: {str(e)}")
        return {
            "status": "error",
            "error": str(e),
            "timestamp": _utc_timestamp()
        }
|
|
557
|
-
|
|
558
|
-
def get_stats(self) -> Dict[str, Any]:
|
|
559
|
-
"""
|
|
560
|
-
Get statistics about the detector.
|
|
561
|
-
|
|
562
|
-
Returns:
|
|
563
|
-
Statistics dictionary
|
|
564
|
-
"""
|
|
565
|
-
try:
|
|
566
|
-
stats = {
|
|
567
|
-
"method_terms_count": len(self.method_terms),
|
|
568
|
-
"result_terms_count": len(self.result_terms),
|
|
569
|
-
"contradiction_terms_count": len(self.contradiction_terms),
|
|
570
|
-
"database_configured": self.db is not None
|
|
571
|
-
}
|
|
572
|
-
|
|
573
|
-
return stats
|
|
574
|
-
|
|
575
|
-
except Exception as e:
|
|
576
|
-
logger.error(f"Error getting stats: {str(e)}")
|
|
577
|
-
return {"error": str(e)}
|
|
@@ -1,121 +0,0 @@
|
|
|
1
|
-
"""High-level orchestration service combining search and synthesis."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import logging
|
|
6
|
-
from typing import Any, Dict, List, Optional
|
|
7
|
-
|
|
8
|
-
from src.services.llm_service import LLMManager
|
|
9
|
-
from src.services.paper_service import OpenAlexClient
|
|
10
|
-
from src.services.performance_service.rust_performance import HighPerformanceService
|
|
11
|
-
from src.services.research_service.enhanced_synthesizer import EnhancedSynthesizer
|
|
12
|
-
from src.services.search_service import SearchEngine
|
|
13
|
-
|
|
14
|
-
logger = logging.getLogger(__name__)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class EnhancedResearchService:
|
|
18
|
-
"""Bundle search + synthesis into a cohesive workflow."""
|
|
19
|
-
|
|
20
|
-
def __init__(
    self,
    *,
    search_engine: Optional[SearchEngine] = None,
    synthesizer: Optional[EnhancedSynthesizer] = None,
    llm_manager: Optional[LLMManager] = None,
    openalex_client: Optional[OpenAlexClient] = None,
    performance_service: Optional[HighPerformanceService] = None,
) -> None:
    """Wire up collaborators, creating a default for each one not supplied.

    The OpenAlex client, LLM manager and performance service are resolved
    first because the default search engine and synthesizer are built from
    them.
    """
    self.openalex = openalex_client if openalex_client else OpenAlexClient()
    self.llm = llm_manager if llm_manager else LLMManager()
    self.performance = performance_service if performance_service else HighPerformanceService()

    if search_engine:
        self.search_engine = search_engine
    else:
        self.search_engine = SearchEngine(
            openalex_client=self.openalex,
            performance_service=self.performance,
        )

    if synthesizer:
        self.synthesizer = synthesizer
    else:
        self.synthesizer = EnhancedSynthesizer(
            llm_manager=self.llm,
            openalex_client=self.openalex,
            performance_service=self.performance,
        )
|
|
41
|
-
|
|
42
|
-
async def conduct_research(
    self,
    query: str,
    *,
    limit: int = 10,
    max_words: int = 600,
    style: str = "comprehensive",
    include_advanced: bool = True,
    context: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Search for papers matching ``query`` and synthesize the findings.

    Args:
        query: Research question; must be a non-blank string.
        limit: Passed to the search engine as ``limit``.
        max_words: Passed to the synthesizer as ``max_words``.
        style: Passed to the synthesizer as ``style``.
        include_advanced: When true, "pubmed" is searched in addition to
            "openalex".
        context: Optional extra context for the synthesizer. It is copied,
            so the caller's dict is left untouched.

    Returns:
        Dict with "query", the raw "search" payload and the "synthesis".

    Raises:
        ValueError: If ``query`` is not a non-empty string.
    """
    if not isinstance(query, str) or not query.strip():
        raise ValueError("Query must be a non-empty string")

    # Work on a shallow copy: setdefault below must not add keys to the
    # dict the caller handed in.
    context = dict(context) if context else {}
    context.setdefault("original_query", query)

    sources = ("openalex", "pubmed") if include_advanced else ("openalex",)
    search_payload = await self.search_engine.search_papers(
        query,
        limit=limit,
        sources=sources,
        include_metadata=True,
        include_abstracts=True,
    )

    found_papers = search_payload.get("papers", []) or []

    # Hits without an id cannot be bulk-fetched. Skip them here instead of
    # raising KeyError, since _paper_stub tolerates a missing id as well.
    paper_ids = [paper["id"] for paper in found_papers if paper.get("id") is not None]
    raw_papers: List[Dict[str, Any]] = []
    if paper_ids:
        raw_papers = await self.openalex.get_papers_bulk(paper_ids)

    if not raw_papers:
        # Fall back to lightly formatted payloads if the bulk fetch returned nothing
        raw_papers = [self._paper_stub(paper) for paper in found_papers]

    synthesis = await self.synthesizer.synthesize_research(
        papers=raw_papers,
        max_words=max_words,
        style=style,
        context=context,
        include_visualizations=True,
        include_topic_modeling=True,
        include_quality_assessment=True,
    )

    return {
        "query": query,
        "search": search_payload,
        "synthesis": synthesis,
    }
|
|
90
|
-
|
|
91
|
-
async def get_health_status(self) -> Dict[str, Any]:
    """Collect a health snapshot of search, knowledge graph and LLM.

    The search flags are fixed values. Knowledge-graph stats fall back to
    zero counts if they cannot be fetched.
    """
    try:
        graph_stats = await self.synthesizer.kg.stats()
    except Exception as exc:  # pragma: no cover - KG optional
        logger.info("Knowledge graph stats unavailable", extra={"error": str(exc)})
        graph_stats = {"entities": 0, "relationships": 0}

    return {
        "search": {
            "openalex": True,
            "web_search": True,
        },
        "knowledge_graph": graph_stats,
        "llm": await self.llm.health_check(),
    }
|
|
108
|
-
|
|
109
|
-
def _paper_stub(self, payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
110
|
-
return {
|
|
111
|
-
"id": payload.get("id"),
|
|
112
|
-
"title": payload.get("title"),
|
|
113
|
-
"abstract": payload.get("abstract", ""),
|
|
114
|
-
"authors": payload.get("authors", []),
|
|
115
|
-
"publication_year": payload.get("year"),
|
|
116
|
-
"doi": payload.get("doi"),
|
|
117
|
-
"concepts": [{"display_name": kw} for kw in payload.get("keywords", [])],
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
__all__ = ["EnhancedResearchService"]
|