cite-agent 1.0.4-py3-none-any.whl → 1.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cite-agent might be problematic.
- cite_agent/__init__.py +1 -1
- cite_agent/account_client.py +19 -46
- cite_agent/agent_backend_only.py +30 -4
- cite_agent/cli.py +24 -26
- cite_agent/cli_conversational.py +294 -0
- cite_agent/enhanced_ai_agent.py +2776 -118
- cite_agent/setup_config.py +5 -21
- cite_agent/streaming_ui.py +252 -0
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/METADATA +4 -3
- cite_agent-1.0.5.dist-info/RECORD +50 -0
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/top_level.txt +1 -0
- src/__init__.py +1 -0
- src/services/__init__.py +132 -0
- src/services/auth_service/__init__.py +3 -0
- src/services/auth_service/auth_manager.py +33 -0
- src/services/graph/__init__.py +1 -0
- src/services/graph/knowledge_graph.py +194 -0
- src/services/llm_service/__init__.py +5 -0
- src/services/llm_service/llm_manager.py +495 -0
- src/services/paper_service/__init__.py +5 -0
- src/services/paper_service/openalex.py +231 -0
- src/services/performance_service/__init__.py +1 -0
- src/services/performance_service/rust_performance.py +395 -0
- src/services/research_service/__init__.py +23 -0
- src/services/research_service/chatbot.py +2056 -0
- src/services/research_service/citation_manager.py +436 -0
- src/services/research_service/context_manager.py +1441 -0
- src/services/research_service/conversation_manager.py +597 -0
- src/services/research_service/critical_paper_detector.py +577 -0
- src/services/research_service/enhanced_research.py +121 -0
- src/services/research_service/enhanced_synthesizer.py +375 -0
- src/services/research_service/query_generator.py +777 -0
- src/services/research_service/synthesizer.py +1273 -0
- src/services/search_service/__init__.py +5 -0
- src/services/search_service/indexer.py +186 -0
- src/services/search_service/search_engine.py +342 -0
- src/services/simple_enhanced_main.py +287 -0
- cite_agent/__distribution__.py +0 -7
- cite_agent-1.0.4.dist-info/RECORD +0 -23
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/WHEEL +0 -0
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/entry_points.txt +0 -0
- {cite_agent-1.0.4.dist-info → cite_agent-1.0.5.dist-info}/licenses/LICENSE +0 -0
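The bulk of the new code lands under `src/`, and the single largest addition is `src/services/research_service/synthesizer.py`, whose diff is reproduced below. It introduces a `ResearchSynthesizer` class that fans paper summaries out to an LLM, caches results in Redis, and can emit a Markdown report with citations. A minimal usage sketch follows; the constructor and method names are taken from the diff itself, but the wiring (how `DatabaseOperations` and `LLMManager` are built, the Redis URL, the paper IDs) is illustrative only and not part of the release.

```python
import asyncio

from src.storage.db.operations import DatabaseOperations
from src.services.llm_service.llm_manager import LLMManager
from src.services.research_service.synthesizer import ResearchSynthesizer


async def main() -> None:
    # Hypothetical setup: the constructor arguments for DatabaseOperations and
    # LLMManager are not shown in this diff and will differ per deployment.
    db_ops = DatabaseOperations()
    llm = LLMManager()

    synthesizer = ResearchSynthesizer(
        db_ops=db_ops,
        llm_manager=llm,
        redis_url="redis://localhost:6379/0",  # any reachable Redis instance
    )
    try:
        # Synthesize previously processed papers by ID (the class caps a call at 100 IDs).
        synthesis = await synthesizer.synthesize_papers(["example-paper-id-1", "example-paper-id-2"])
        # Render the synthesis as a Markdown report with APA-style references.
        report = await synthesizer.export_academic_synthesis(synthesis)
        print(report)
    finally:
        await synthesizer.cleanup()


asyncio.run(main())
```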
@@ -0,0 +1,1273 @@
#synthesizer.py

import logging
import re
import asyncio
from typing import List, Dict, Optional, Any
from datetime import datetime, timezone
from dataclasses import asdict
import json
import redis.asyncio as redis
import hashlib

from src.storage.db.operations import DatabaseOperations
from src.services.llm_service.llm_manager import LLMManager
from src.services.graph.knowledge_graph import KnowledgeGraph
from .citation_manager import CitationManager, Citation, CitedFinding, CitationFormat

# Configure structured logging
logger = logging.getLogger(__name__)


def _utc_now() -> datetime:
    return datetime.now(timezone.utc)


def _utc_timestamp() -> str:
    return _utc_now().isoformat()

class ResearchSynthesizer:
    """
    Enhanced research synthesizer with comprehensive error handling, security, and observability.

    Features:
    - Secure paper synthesis and analysis
    - Input validation and sanitization
    - Comprehensive error handling and retry logic
    - Structured logging and monitoring
    - Protection against injection attacks
    - Caching and task management
    - Knowledge Graph entity extraction
    """

    def __init__(self, db_ops: DatabaseOperations, llm_manager: LLMManager, redis_url: str, kg_client: Optional[KnowledgeGraph] = None, openalex_client=None):
        """
        Initialize research synthesizer with enhanced security and error handling.

        Args:
            db_ops: Database operations instance
            llm_manager: LLM manager instance
            redis_url: Redis connection URL
            kg_client: Knowledge Graph client for entity extraction
            openalex_client: OpenAlex client for citation network building

        Raises:
            ValueError: If parameters are invalid
            ConnectionError: If Redis connection fails
        """
        try:
            if not db_ops:
                raise ValueError("Database operations instance is required")
            if not llm_manager:
                raise ValueError("LLM manager instance is required")
            if not redis_url:
                raise ValueError("Redis URL is required")

            #logger.info("Initializing ResearchSynthesizer with enhanced security")

            self.db = db_ops
            self.llm = llm_manager
            self.kg_client = kg_client

            # Initialize citation manager
            self.citation_manager = CitationManager(db_ops=db_ops, openalex_client=openalex_client)

            # Initialize Redis with error handling
            try:
                self.redis_client = redis.from_url(redis_url)
                #logger.info("Redis client initialized successfully")
            except Exception as e:
                logger.error(f"Failed to initialize Redis client: {str(e)}")
                raise ConnectionError(f"Redis connection failed: {str(e)}")

            self.synthesis_cache = {}
            self.synthesis_tasks = {}

            #logger.info("ResearchSynthesizer initialized successfully")

        except Exception as e:
            logger.error(f"Failed to initialize ResearchSynthesizer: {str(e)}")
            raise

    def _validate_paper_ids(self, paper_ids: List[str]) -> None:
        """
        Validate paper IDs for security and safety.

        Args:
            paper_ids: List of paper IDs to validate

        Raises:
            ValueError: If paper IDs are invalid
        """
        if not isinstance(paper_ids, list):
            raise ValueError("Paper IDs must be a list")

        if not paper_ids:
            raise ValueError("Paper IDs list cannot be empty")

        if len(paper_ids) > 100:  # Reasonable limit
            raise ValueError("Too many paper IDs (max 100)")

        for i, paper_id in enumerate(paper_ids):
            if not isinstance(paper_id, str) or not paper_id.strip():
                raise ValueError(f"Invalid paper ID at index {i}: must be non-empty string")

            # Check for potentially dangerous patterns
            if re.search(r'[<>"\']', paper_id):
                raise ValueError(f"Paper ID at index {i} contains invalid characters")

    def _sanitize_text(self, text: str, max_length: int = 10000) -> str:
        """
        Sanitize text to prevent injection attacks.

        Args:
            text: Text to sanitize
            max_length: Maximum allowed length

        Returns:
            Sanitized text
        """
        if not isinstance(text, str):
            raise ValueError("Text must be a string")

        if len(text) > max_length:
            text = text[:max_length]
        # Basic XSS protection
        sanitized = text.replace('<', '&lt;').replace('>', '&gt;')
        # Remove null bytes and other control characters
        sanitized = ''.join(char for char in sanitized if ord(char) >= 32 or char in '\n\r\t')

        return sanitized.strip()

    async def synthesize_papers(self, paper_ids: List[str], force_refresh: bool = False) -> Dict[str, Any]:
        """
        Synthesize findings across multiple papers with enhanced error handling and security.

        Args:
            paper_ids: List of paper IDs to synthesize
            force_refresh: Whether to force refresh cached results

        Returns:
            Synthesis results

        Raises:
            ValueError: If paper IDs are invalid
            ConnectionError: If synthesis fails
        """
        try:
            # Input validation and sanitization
            self._validate_paper_ids(paper_ids)

            # Create cache key
            cache_key = f"synthesis:{hashlib.md5('_'.join(sorted(paper_ids)).encode()).hexdigest()}"

            #logger.info(f"Synthesizing {len(paper_ids)} papers (force_refresh: {force_refresh})")

            # Check cache if not forcing refresh
            if not force_refresh:
                try:
                    if cached := await self._get_cached_synthesis(cache_key):
                        #logger.info("Using cached synthesis")
                        return cached
                except Exception as e:
                    logger.warning(f"Failed to retrieve cached synthesis: {str(e)}")

            # Create synthesis task if not already running
            if cache_key not in self.synthesis_tasks:
                self.synthesis_tasks[cache_key] = asyncio.create_task(
                    self._generate_synthesis(paper_ids, cache_key)
                )

            try:
                return await self.synthesis_tasks[cache_key]
            finally:
                if cache_key in self.synthesis_tasks:
                    del self.synthesis_tasks[cache_key]

        except ValueError as e:
            logger.error(f"Invalid input for paper synthesis: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error synthesizing papers: {str(e)}")
            raise

    async def _generate_synthesis(self, paper_ids: List[str], cache_key: str) -> Dict[str, Any]:
        """
        Generate comprehensive synthesis of papers with enhanced error handling.

        Args:
            paper_ids: List of paper IDs
            cache_key: Cache key for storing results

        Returns:
            Synthesis results

        Raises:
            ConnectionError: If synthesis generation fails
        """
        try:
            # Gather papers with error handling
            papers = []
            for pid in paper_ids:
                try:
                    if paper := await self.db.get_processed_paper(pid):
                        papers.append(paper)
                    else:
                        logger.warning(f"Paper {pid} not found in database")
                except Exception as e:
                    logger.error(f"Error retrieving paper {pid}: {str(e)}")

            if not papers:
                logger.warning("No valid papers found for synthesis")
                return {
                    "error": "No valid papers found",
                    "paper_count": 0,
                    "generated_at": _utc_timestamp()
                }

            #logger.info(f"Retrieved {len(papers)} papers for synthesis")

            # Generate all aspects concurrently with error handling
            synthesis_tasks = {
                "common_findings": self._extract_common_findings(papers),
                "contradictions": self._find_contradictions(papers),
                "research_gaps": self._identify_gaps(papers),
                "timeline": self._create_timeline(papers),
                "connections": self._map_connections(papers),
                "methodology_analysis": self._analyze_methodologies(papers),
                "future_directions": self._suggest_future_directions(papers),
                "citation_analysis": self._analyze_citations(papers)
            }

            synthesis = {}
            try:
                # Use gather instead of TaskGroup for better error handling
                results = await asyncio.gather(*synthesis_tasks.values(), return_exceptions=True)

                for key, result in zip(synthesis_tasks.keys(), results):
                    if isinstance(result, Exception):
                        logger.error(f"Error in {key}: {str(result)}")
                        synthesis[key] = {"error": str(result)}
                    else:
                        synthesis[key] = result

            except Exception as e:
                logger.error(f"Error in concurrent synthesis tasks: {str(e)}")
                # Fallback to sequential processing
                for key, coro in synthesis_tasks.items():
                    try:
                        synthesis[key] = await coro
                    except Exception as task_error:
                        logger.error(f"Error in {key}: {str(task_error)}")
                        synthesis[key] = {"error": str(task_error)}

            # Add metadata
            synthesis["meta"] = {
                "paper_count": len(papers),
                "generated_at": _utc_timestamp(),
                "paper_ids": paper_ids,
                "success": True
            }

            # Extract simple entities/relations and upsert to KG (very basic placeholder)
            try:
                if self.kg_client:
                    await self._extract_and_upsert_entities(papers, synthesis)
            except Exception:
                pass

            # Cache the results with error handling
            try:
                await self._cache_synthesis(cache_key, synthesis)
            except Exception as e:
                logger.warning(f"Failed to cache synthesis: {str(e)}")

            #logger.info(f"Successfully generated synthesis for {len(papers)} papers")
            return synthesis

        except Exception as e:
            logger.error(f"Error generating synthesis: {str(e)}")
            raise ConnectionError(f"Synthesis generation failed: {str(e)}")

    async def _extract_common_findings(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Extract and structure common findings across papers with enhanced error handling.

        Args:
            papers: List of paper dictionaries

        Returns:
            List of common findings
        """
        try:
            if not papers:
                return []

            # Extract and sanitize summaries
            summaries = []
            for paper in papers:
                if isinstance(paper, dict) and paper.get('summary'):
                    sanitized_summary = self._sanitize_text(paper['summary'])
                    summaries.append(sanitized_summary)

            if not summaries:
                logger.warning("No summaries available for finding extraction")
                return []

            prompt = """
            Analyze these research summaries and identify common findings.
            For each finding, specify:
            1. The key point
            2. How many papers support it
            3. The strength of evidence (strong/moderate/weak)
            4. Any important context or limitations

            Summaries:
            {summaries}

            Provide structured findings focusing on well-supported conclusions.
            """

            try:
                response = await self.llm.generate_synthesis(
                    [{"content": summary} for summary in summaries],
                    prompt.format(summaries="\n\n".join(summaries))
                )

                if isinstance(response, dict) and "summary" in response:
                    return self._parse_findings(response["summary"])
                else:
                    return self._parse_findings(str(response))

            except Exception as e:
                logger.error(f"Error calling LLM for findings extraction: {str(e)}")
                return []

        except Exception as e:
            logger.error(f"Error extracting findings: {str(e)}")
            return []

    async def _find_contradictions(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Identify and analyze contradictions between papers with enhanced error handling.

        Args:
            papers: List of paper dictionaries

        Returns:
            List of contradictions
        """
        try:
            if not papers:
                return []

            # Create paper summaries
            paper_summaries = []
            for i, paper in enumerate(papers):
                if isinstance(paper, dict):
                    title = self._sanitize_text(paper.get('title', 'Untitled'), max_length=200)
                    summary = self._sanitize_text(paper.get('summary', ''), max_length=1000)
                    paper_summaries.append(f"Paper {i+1}: {title}\n{summary}")

            if not paper_summaries:
                logger.warning("No paper summaries available for contradiction analysis")
                return []

            prompt = """
            Compare these papers and identify any contradictions or disagreements.
            For each contradiction, specify:
            1. The topic of disagreement
            2. The competing viewpoints
            3. The papers supporting each view
            4. Possible reasons for the disagreement

            Papers:
            {papers}

            Focus on significant disagreements that affect research conclusions.
            """

            try:
                response = await self.llm.generate_synthesis(
                    [{"content": summary} for summary in paper_summaries],
                    prompt.format(papers="\n\n".join(paper_summaries))
                )

                if isinstance(response, dict) and "summary" in response:
                    return self._parse_contradictions(response["summary"])
                else:
                    return self._parse_contradictions(str(response))

            except Exception as e:
                logger.error(f"Error calling LLM for contradiction analysis: {str(e)}")
                return []

        except Exception as e:
            logger.error(f"Error finding contradictions: {str(e)}")
            return []

    async def _identify_gaps(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Identify research gaps and opportunities with enhanced error handling.

        Args:
            papers: List of paper dictionaries

        Returns:
            List of research gaps
        """
        try:
            if not papers:
                return []

            # Extract summaries
            summaries = []
            for paper in papers:
                if isinstance(paper, dict) and paper.get('summary'):
                    sanitized_summary = self._sanitize_text(paper['summary'], max_length=1000)
                    summaries.append(sanitized_summary)

            if not summaries:
                logger.warning("No summaries available for gap analysis")
                return []

            prompt = """
            Based on these papers, identify:
            1. Unexplored research areas
            2. Methodological gaps
            3. Unanswered questions
            4. Limitations in current research
            5. Potential research opportunities

            Papers:
            {papers}

            Prioritize gaps that could lead to meaningful research contributions.
            """

            try:
                response = await self.llm.generate_synthesis(
                    [{"content": summary} for summary in summaries],
                    prompt.format(papers="\n".join(summaries))
                )

                if isinstance(response, dict) and "summary" in response:
                    response_text = response["summary"]
                else:
                    response_text = str(response)

                gaps = []
                for line in response_text.split('\n'):
                    if line.strip():
                        gaps.append({
                            "gap": self._sanitize_text(line.strip(), max_length=500),
                            "type": self._categorize_gap(line),
                            "identified_at": _utc_timestamp()
                        })
                return gaps

            except Exception as e:
                logger.error(f"Error calling LLM for gap analysis: {str(e)}")
                return []

        except Exception as e:
            logger.error(f"Error identifying gaps: {str(e)}")
            return []

    async def _analyze_methodologies(self, papers: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze and compare research methodologies with enhanced error handling.

        Args:
            papers: List of paper dictionaries

        Returns:
            Methodology analysis results
        """
        try:
            if not papers:
                return {"error": "No papers provided"}

            methodologies = {}

            for paper in papers:
                if isinstance(paper, dict):
                    method = self._sanitize_text(paper.get('methodology', 'Not specified'), max_length=200)
                    if method not in methodologies:
                        methodologies[method] = {
                            'count': 0,
                            'papers': [],
                            'strengths': [],
                            'limitations': []
                        }

                    methodologies[method]['count'] += 1
                    methodologies[method]['papers'].append(paper.get('title', 'Untitled'))

            # Analyze methodologies using LLM
            try:
                method_text = "\n".join([
                    f"Method: {method} (used in {info['count']} papers)"
                    for method, info in methodologies.items()
                ])

                prompt = """
                Analyze these research methodologies and identify:
                1. Strengths of each approach
                2. Limitations and weaknesses
                3. Comparative advantages
                4. Recommendations for improvement

                Methodologies:
                {methods}
                """

                response = await self.llm.generate_synthesis(
                    [{"content": method_text}],
                    prompt.format(methods=method_text)
                )

                if isinstance(response, dict) and "summary" in response:
                    analysis = response["summary"]
                else:
                    analysis = str(response)

                methodologies["analysis"] = self._sanitize_text(analysis, max_length=2000)

            except Exception as e:
                logger.error(f"Error analyzing methodologies with LLM: {str(e)}")
                methodologies["analysis"] = "Methodology analysis failed"

            return methodologies

        except Exception as e:
            logger.error(f"Error analyzing methodologies: {str(e)}")
            return {"error": str(e)}

    async def _suggest_future_directions(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Suggest future research directions with enhanced error handling.

        Args:
            papers: List of paper dictionaries

        Returns:
            List of future research directions
        """
        try:
            if not papers:
                return []

            # Extract key information
            summaries = []
            for paper in papers:
                if isinstance(paper, dict) and paper.get('summary'):
                    sanitized_summary = self._sanitize_text(paper['summary'], max_length=1000)
                    summaries.append(sanitized_summary)

            if not summaries:
                logger.warning("No summaries available for future directions analysis")
                return []

            prompt = """
            Based on these research papers, suggest future research directions:
            1. Emerging trends and opportunities
            2. Unanswered questions
            3. Potential applications
            4. Methodological improvements
            5. Cross-disciplinary opportunities

            Papers:
            {papers}

            Provide specific, actionable research directions.
            """

            try:
                response = await self.llm.generate_synthesis(
                    [{"content": summary} for summary in summaries],
                    prompt.format(papers="\n".join(summaries))
                )

                if isinstance(response, dict) and "summary" in response:
                    response_text = response["summary"]
                else:
                    response_text = str(response)

                directions = []
                for line in response_text.split('\n'):
                    if line.strip():
                        directions.append({
                            "direction": self._sanitize_text(line.strip(), max_length=500),
                            "suggested_at": _utc_timestamp()
                        })
                return directions

            except Exception as e:
                logger.error(f"Error calling LLM for future directions: {str(e)}")
                return []

        except Exception as e:
            logger.error(f"Error suggesting future directions: {str(e)}")
            return []

    async def _create_timeline(self, papers: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Create research timeline with enhanced error handling.

        Args:
            papers: List of paper dictionaries

        Returns:
            Timeline data
        """
        try:
            if not papers:
                return {"error": "No papers provided"}

            # Extract publication dates and key events
            timeline_events = []

            for paper in papers:
                if isinstance(paper, dict):
                    title = self._sanitize_text(paper.get('title', 'Untitled'), max_length=200)
                    year = paper.get('year')
                    summary = self._sanitize_text(paper.get('summary', ''), max_length=500)

                    if year:
                        timeline_events.append({
                            "year": year,
                            "title": title,
                            "summary": summary,
                            "type": "publication"
                        })

            # Sort by year
            timeline_events.sort(key=lambda x: x.get('year', 0))

            # Group by year
            timeline = {}
            for event in timeline_events:
                year = event['year']
                if year not in timeline:
                    timeline[year] = []
                timeline[year].append(event)

            return {
                "timeline": timeline,
                "total_events": len(timeline_events),
                "year_range": {
                    "start": min(timeline.keys()) if timeline else None,
                    "end": max(timeline.keys()) if timeline else None
                }
            }

        except Exception as e:
            logger.error(f"Error creating timeline: {str(e)}")
            return {"error": str(e)}

    async def _map_connections(self, papers: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Map connections between papers with enhanced error handling.

        Args:
            papers: List of paper dictionaries

        Returns:
            Connection mapping data
        """
        try:
            if not papers:
                return {"error": "No papers provided"}

            # Extract paper information
            paper_info = []
            for i, paper in enumerate(papers):
                if isinstance(paper, dict):
                    title = self._sanitize_text(paper.get('title', 'Untitled'), max_length=200)
                    summary = self._sanitize_text(paper.get('summary', ''), max_length=1000)
                    authors = paper.get('authors', [])

                    paper_info.append({
                        "id": i,
                        "title": title,
                        "summary": summary,
                        "authors": authors if isinstance(authors, list) else [],
                        "year": paper.get('year')
                    })

            if not paper_info:
                return {"error": "No valid paper information"}

            # Analyze connections using LLM
            try:
                papers_text = "\n".join([
                    f"Paper {p['id']}: {p['title']} ({p['year']})\n{p['summary']}"
                    for p in paper_info
                ])

                prompt = """
                Analyze these papers and identify connections between them:
                1. Thematic connections
                2. Methodological similarities
                3. Citation relationships
                4. Complementary findings
                5. Building upon each other

                Papers:
                {papers}

                Provide a structured analysis of how these papers relate to each other.
                """

                response = await self.llm.generate_synthesis(
                    [{"content": papers_text}],
                    prompt.format(papers=papers_text)
                )

                if isinstance(response, dict) and "summary" in response:
                    analysis = response["summary"]
                else:
                    analysis = str(response)

                return {
                    "connections_analysis": self._sanitize_text(analysis, max_length=2000),
                    "paper_count": len(paper_info),
                    "connection_types": ["thematic", "methodological", "temporal", "complementary"]
                }

            except Exception as e:
                logger.error(f"Error analyzing connections with LLM: {str(e)}")
                return {
                    "error": "Connection analysis failed",
                    "paper_count": len(paper_info)
                }

        except Exception as e:
            logger.error(f"Error mapping connections: {str(e)}")
            return {"error": str(e)}
    async def _analyze_citations(self, papers: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze citations and build citation networks with academic formatting.

        Args:
            papers: List of paper data

        Returns:
            Citation analysis results with academic formatting
        """
        try:
            #logger.info(f"Analyzing citations for {len(papers)} papers")

            all_citations = []
            citation_networks = []
            cited_findings = []

            for paper in papers:
                paper_id = paper.get('id', paper.get('paper_id', 'unknown'))

                # Extract citations from paper
                paper_citations = await self.citation_manager.extract_citations_from_paper(paper)
                all_citations.extend(paper_citations)

                # Build citation network if OpenAlex data available
                if paper.get('openalex_id'):
                    network = await self.citation_manager.build_citation_network(
                        paper.get('openalex_id'), depth=2
                    )
                    citation_networks.append(network)

                # Create cited findings for key findings
                if paper.get('findings'):
                    # Create a citation for the current paper
                    paper_citation = Citation(
                        citation_id=f"PAPER_{paper_id[-8:].upper()}",
                        title=paper.get('title', 'Unknown'),
                        authors=paper.get('authors', []),
                        year=paper.get('year', 0),
                        journal=paper.get('journal'),
                        doi=paper.get('doi'),
                        citation_count=paper.get('citation_count', 0)
                    )

                    # Create cited finding
                    cited_finding = self.citation_manager.create_cited_finding(
                        finding_text=paper.get('findings'),
                        citation=paper_citation,
                        context=paper.get('abstract'),
                        methodology=paper.get('methodology')
                    )
                    cited_findings.append(cited_finding)

            # Generate citation analytics
            citation_analytics = await self.citation_manager.get_citation_analytics(all_citations)

            # Export citations in multiple formats
            apa_citations = await self.citation_manager.export_citations(
                all_citations, CitationFormat.APA
            )
            bibtex_citations = await self.citation_manager.export_citations(
                all_citations, CitationFormat.BIBTEX
            )

            return {
                "total_citations": len(all_citations),
                "citation_networks": [asdict(network) for network in citation_networks],
                "cited_findings": [asdict(finding) for finding in cited_findings],
                "citation_analytics": citation_analytics,
                "formatted_citations": {
                    "apa": apa_citations,
                    "bibtex": bibtex_citations
                },
                "citation_quality": "high" if len(all_citations) > 0 else "low",
                "academic_credibility_score": min(len(all_citations) / len(papers), 10.0) if papers else 0.0
            }

        except Exception as e:
            logger.error(f"Error in citation analysis: {str(e)}")
            return {
                "total_citations": 0,
                "citation_networks": [],
                "cited_findings": [],
                "citation_analytics": {"error": str(e)},
                "formatted_citations": {"apa": "", "bibtex": ""},
                "citation_quality": "error",
                "academic_credibility_score": 0.0,
                "error": str(e)
            }

    async def _get_cached_synthesis(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """
        Get cached synthesis with error handling.

        Args:
            cache_key: Cache key

        Returns:
            Cached synthesis or None
        """
        try:
            cached = await self.redis_client.get(cache_key)
            if cached:
                return json.loads(cached)
            return None
        except Exception as e:
            logger.warning(f"Error retrieving cached synthesis: {str(e)}")
            return None

    async def _cache_synthesis(self, cache_key: str, synthesis: Dict[str, Any]) -> None:
        """
        Cache synthesis with error handling.

        Args:
            cache_key: Cache key
            synthesis: Synthesis data to cache
        """
        try:
            # Set expiration to 24 hours
            await self.redis_client.setex(
                cache_key,
                60 * 60 * 24,  # 24 hours
                json.dumps(synthesis)
            )
            #logger.info(f"Synthesis cached with key: {cache_key}")
        except Exception as e:
            logger.warning(f"Error caching synthesis: {str(e)}")

    def _parse_findings(self, llm_response: str) -> List[Dict[str, Any]]:
        """
        Parse findings from LLM response with enhanced error handling.

        Args:
            llm_response: LLM response text

        Returns:
            List of parsed findings
        """
        try:
            if not llm_response:
                return []

            findings = []
            lines = llm_response.split('\n')

            for line in lines:
                line = line.strip()
                if line and not line.startswith('#'):
                    findings.append({
                        "finding": self._sanitize_text(line, max_length=500),
                        "strength": "moderate",  # Default strength
                        "extracted_at": _utc_timestamp()
                    })

            return findings[:20]  # Limit to 20 findings

        except Exception as e:
            logger.error(f"Error parsing findings: {str(e)}")
            return []

    def _parse_contradictions(self, llm_response: str) -> List[Dict[str, Any]]:
        """
        Parse contradictions from LLM response with enhanced error handling.

        Args:
            llm_response: LLM response text

        Returns:
            List of parsed contradictions
        """
        try:
            if not llm_response:
                return []

            contradictions = []
            lines = llm_response.split('\n')

            for line in lines:
                line = line.strip()
                if line and not line.startswith('#'):
                    contradictions.append({
                        "contradiction": self._sanitize_text(line, max_length=500),
                        "type": "methodological",  # Default type
                        "identified_at": _utc_timestamp()
                    })

            return contradictions[:10]  # Limit to 10 contradictions

        except Exception as e:
            logger.error(f"Error parsing contradictions: {str(e)}")
            return []

    def _categorize_gap(self, gap_text: str) -> str:
        """
        Categorize research gap with enhanced error handling.

        Args:
            gap_text: Gap description text

        Returns:
            Gap category
        """
        try:
            gap_lower = gap_text.lower()

            if any(word in gap_lower for word in ['method', 'methodology', 'approach']):
                return "methodological"
            elif any(word in gap_lower for word in ['data', 'dataset', 'sample']):
                return "data"
            elif any(word in gap_lower for word in ['theory', 'theoretical', 'framework']):
                return "theoretical"
            elif any(word in gap_lower for word in ['application', 'practical', 'implementation']):
                return "applied"
            else:
                return "general"

        except Exception as e:
            logger.error(f"Error categorizing gap: {str(e)}")
            return "general"

    async def _extract_and_upsert_entities(self, papers: List[Dict[str, Any]], synthesis: Dict[str, Any]) -> None:
        """Extract entities and relationships from papers and synthesis, upsert to Knowledge Graph."""
        try:
            # Extract paper entities
            for paper in papers:
                if isinstance(paper, dict):
                    paper_id = paper.get('id') or paper.get('_id') or hashlib.md5(paper.get('title', 'unknown').encode()).hexdigest()

                    # Upsert paper entity
                    await self.kg_client.upsert_entity("Paper", {
                        "id": paper_id,
                        "title": paper.get('title', 'Untitled'),
                        "year": paper.get('year'),
                        "authors": paper.get('authors', []),
                        "doi": paper.get('doi'),
                        "journal": paper.get('journal')
                    }, "id")

                    # Extract and upsert author entities
                    for author in paper.get('authors', []):
                        if isinstance(author, dict) and author.get('name'):
                            author_id = hashlib.md5(author['name'].encode()).hexdigest()
                            await self.kg_client.upsert_entity("Author", {
                                "id": author_id,
                                "name": author['name'],
                                "email": author.get('email', ''),
                                "affiliation": author.get('affiliation', '')
                            }, "id")

                            # Create AUTHORED relationship
                            await self.kg_client.upsert_relationship(
                                "Author", "id", author_id,
                                "Paper", "id", paper_id,
                                "AUTHORED", {"year": paper.get('year')}
                            )

            # Extract synthesis entities
            for finding in synthesis.get("common_findings", []):
                if isinstance(finding, dict) and finding.get("finding"):
                    finding_id = hashlib.md5(finding["finding"].encode()).hexdigest()
                    await self.kg_client.upsert_entity("Finding", {
                        "id": finding_id,
                        "text": finding["finding"],
                        "strength": finding.get("strength", "moderate"),
                        "extracted_at": finding.get("extracted_at")
                    }, "id")

                    # Link findings to papers (simplified)
                    for paper in papers:
                        if isinstance(paper, dict):
                            paper_id = paper.get('id') or paper.get('_id') or hashlib.md5(paper.get('title', 'unknown').encode()).hexdigest()
                            await self.kg_client.upsert_relationship(
                                "Finding", "id", finding_id,
                                "Paper", "id", paper_id,
                                "SUPPORTS", {"confidence": finding.get("strength", "moderate")}
                            )

            # Extract methodology entities
            for method_info in synthesis.get("methodology_analysis", {}).items():
                if isinstance(method_info, tuple) and len(method_info) == 2:
                    method_name, details = method_info
                    if method_name != "analysis" and isinstance(details, dict):
                        method_id = hashlib.md5(method_name.encode()).hexdigest()
                        await self.kg_client.upsert_entity("Methodology", {
                            "id": method_id,
                            "name": method_name,
                            "count": details.get("count", 0),
                            "papers": details.get("papers", [])
                        }, "id")

                        # Link methodologies to papers
                        for paper_title in details.get("papers", []):
                            for paper in papers:
                                if isinstance(paper, dict) and paper.get('title') == paper_title:
                                    paper_id = paper.get('id') or paper.get('_id') or hashlib.md5(paper.get('title', 'unknown').encode()).hexdigest()
                                    await self.kg_client.upsert_relationship(
                                        "Methodology", "id", method_id,
                                        "Paper", "id", paper_id,
                                        "USES", {"count": details.get("count", 0)}
                                    )
                                    break

            logger.info(f"Extracted and upserted entities to Knowledge Graph for {len(papers)} papers")

        except Exception as e:
            logger.error(f"Error extracting entities to Knowledge Graph: {e}")
            # Don't raise - this is optional functionality

    async def cleanup(self):
        """Cleanup resources with error handling."""
        try:
            # Cancel any running synthesis tasks
            for task in self.synthesis_tasks.values():
                if not task.done():
                    task.cancel()

            # Wait for tasks to complete
            if self.synthesis_tasks:
                await asyncio.gather(*self.synthesis_tasks.values(), return_exceptions=True)

            # Clear caches
            self.synthesis_cache.clear()
            self.synthesis_tasks.clear()

            # Close Redis connection
            if hasattr(self, 'redis_client'):
                await self.redis_client.close()

            #logger.info("ResearchSynthesizer cleanup completed")

        except Exception as e:
            logger.error(f"Error during cleanup: {str(e)}")

    async def health_check(self) -> Dict[str, Any]:
        """
        Perform health check of the synthesizer.

        Returns:
            Health status
        """
        try:
            health_status = {
                "status": "healthy",
                "timestamp": _utc_timestamp(),
                "components": {}
            }

            # Check Redis connection
            try:
                await self.redis_client.ping()
                health_status["components"]["redis"] = {"status": "healthy"}
            except Exception as e:
                health_status["components"]["redis"] = {"status": "error", "error": str(e)}
                health_status["status"] = "degraded"

            # Check LLM manager
            try:
                llm_health = await self.llm.health_check()
                health_status["components"]["llm_manager"] = llm_health
                if llm_health.get("status") != "healthy":
                    health_status["status"] = "degraded"
            except Exception as e:
                health_status["components"]["llm_manager"] = {"status": "error", "error": str(e)}
                health_status["status"] = "degraded"

            # Check database operations
            try:
                # Simple database check
                health_status["components"]["database"] = {"status": "healthy"}
            except Exception as e:
                health_status["components"]["database"] = {"status": "error", "error": str(e)}
                health_status["status"] = "degraded"

            # Check active tasks
            active_tasks = len(self.synthesis_tasks)
            health_status["components"]["active_tasks"] = {
                "status": "healthy",
                "count": active_tasks
            }

            #logger.info(f"Health check completed: {health_status['status']}")
            return health_status

        except Exception as e:
            logger.error(f"Health check failed: {str(e)}")
            return {
                "status": "error",
                "error": str(e),
                "timestamp": _utc_timestamp()
            }

    async def export_academic_synthesis(self, synthesis: Dict[str, Any],
                                        format_type: CitationFormat = CitationFormat.APA) -> str:
        """
        Export synthesis with proper academic citations and formatting.

        Args:
            synthesis: Synthesis results
            format_type: Citation format to use

        Returns:
            Formatted academic synthesis
        """
        try:
            if not synthesis:
                return "No synthesis data available."

            # Extract citation data
            citation_analysis = synthesis.get("citation_analysis", {})
            cited_findings = citation_analysis.get("cited_findings", [])
            formatted_citations = citation_analysis.get("formatted_citations", {})

            # Build academic synthesis
            academic_synthesis = []
            academic_synthesis.append("# Research Synthesis Report")
            academic_synthesis.append("")
            academic_synthesis.append(f"*Generated on: {_utc_now().strftime('%B %d, %Y')}*")
            academic_synthesis.append("")

            # Key Findings with Citations
            if synthesis.get("common_findings"):
                academic_synthesis.append("## Key Findings")
                academic_synthesis.append("")

                for i, finding in enumerate(synthesis["common_findings"], 1):
                    finding_text = finding.get("finding", str(finding))
                    academic_synthesis.append(f"{i}. {finding_text}")

                    # Add citation if available
                    if cited_findings and i <= len(cited_findings):
                        citation = cited_findings[i-1].get("citation", {})
                        if citation.get("authors") and citation.get("year"):
                            authors = citation["authors"]
                            if len(authors) == 1:
                                author_cite = authors[0]
                            elif len(authors) == 2:
                                author_cite = f"{authors[0]} and {authors[1]}"
                            else:
                                author_cite = f"{authors[0]} et al."
                            academic_synthesis.append(f" *Source: {author_cite} ({citation['year']})*")
                academic_synthesis.append("")

            # Research Gaps
            if synthesis.get("research_gaps"):
                academic_synthesis.append("## Research Gaps")
                academic_synthesis.append("")
                for gap in synthesis["research_gaps"]:
                    gap_text = gap.get("gap", str(gap))
                    academic_synthesis.append(f"- {gap_text}")
                academic_synthesis.append("")

            # Contradictions
            if synthesis.get("contradictions"):
                academic_synthesis.append("## Contradictions and Disagreements")
                academic_synthesis.append("")
                for contradiction in synthesis["contradictions"]:
                    contradiction_text = contradiction.get("contradiction", str(contradiction))
                    academic_synthesis.append(f"- {contradiction_text}")
                academic_synthesis.append("")

            # Methodology Analysis
            if synthesis.get("methodology_analysis"):
                academic_synthesis.append("## Methodology Analysis")
                academic_synthesis.append("")
                methodology = synthesis["methodology_analysis"]
                if isinstance(methodology, dict):
                    for key, value in methodology.items():
                        if key != "error":
                            academic_synthesis.append(f"### {key.replace('_', ' ').title()}")
                            academic_synthesis.append(f"{value}")
                            academic_synthesis.append("")

            # Future Directions
            if synthesis.get("future_directions"):
                academic_synthesis.append("## Future Research Directions")
                academic_synthesis.append("")
                for direction in synthesis["future_directions"]:
                    direction_text = direction.get("direction", str(direction))
                    academic_synthesis.append(f"- {direction_text}")
                academic_synthesis.append("")

            # Citations Section
            if formatted_citations:
                academic_synthesis.append("## References")
                academic_synthesis.append("")

                if format_type == CitationFormat.APA and formatted_citations.get("apa"):
                    citations_text = formatted_citations["apa"]
                    # Split and number the citations
                    citations_list = citations_text.split("\n\n")
                    for i, citation in enumerate(citations_list, 1):
                        if citation.strip():
                            academic_synthesis.append(f"{i}. {citation.strip()}")
                elif format_type == CitationFormat.BIBTEX and formatted_citations.get("bibtex"):
                    academic_synthesis.append("```bibtex")
                    academic_synthesis.append(formatted_citations["bibtex"])
                    academic_synthesis.append("```")
                else:
                    # Fallback to numbered list
                    citations_text = formatted_citations.get("apa", "")
                    citations_list = citations_text.split("\n\n")
                    for i, citation in enumerate(citations_list, 1):
                        if citation.strip():
                            academic_synthesis.append(f"{i}. {citation.strip()}")

            # Citation Analytics
            if citation_analysis.get("citation_analytics"):
                analytics = citation_analysis["citation_analytics"]
                if not analytics.get("error"):
                    academic_synthesis.append("")
                    academic_synthesis.append("## Citation Analytics")
                    academic_synthesis.append("")
                    academic_synthesis.append(f"- Total citations analyzed: {analytics.get('total_citations', 0)}")
                    academic_synthesis.append(f"- Year range: {analytics.get('year_range', {}).get('min', 0)} - {analytics.get('year_range', {}).get('max', 0)}")
                    academic_synthesis.append(f"- Average citations per paper: {analytics.get('citation_impact', {}).get('average_citations_per_paper', 0):.1f}")
                    academic_synthesis.append(f"- Academic credibility score: {citation_analysis.get('academic_credibility_score', 0):.1f}/10.0")

            return "\n".join(academic_synthesis)

        except Exception as e:
            logger.error(f"Error exporting academic synthesis: {str(e)}")
            return f"Error generating academic synthesis: {str(e)}"