cite-agent 1.0.3-py3-none-any.whl → 1.0.5-py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Potentially problematic release: this version of cite-agent might be problematic.
- cite_agent/__init__.py +1 -1
- cite_agent/agent_backend_only.py +30 -4
- cite_agent/cli.py +24 -26
- cite_agent/cli_conversational.py +294 -0
- cite_agent/enhanced_ai_agent.py +2776 -118
- cite_agent/streaming_ui.py +252 -0
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/METADATA +4 -3
- cite_agent-1.0.5.dist-info/RECORD +50 -0
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/top_level.txt +1 -0
- src/__init__.py +1 -0
- src/services/__init__.py +132 -0
- src/services/auth_service/__init__.py +3 -0
- src/services/auth_service/auth_manager.py +33 -0
- src/services/graph/__init__.py +1 -0
- src/services/graph/knowledge_graph.py +194 -0
- src/services/llm_service/__init__.py +5 -0
- src/services/llm_service/llm_manager.py +495 -0
- src/services/paper_service/__init__.py +5 -0
- src/services/paper_service/openalex.py +231 -0
- src/services/performance_service/__init__.py +1 -0
- src/services/performance_service/rust_performance.py +395 -0
- src/services/research_service/__init__.py +23 -0
- src/services/research_service/chatbot.py +2056 -0
- src/services/research_service/citation_manager.py +436 -0
- src/services/research_service/context_manager.py +1441 -0
- src/services/research_service/conversation_manager.py +597 -0
- src/services/research_service/critical_paper_detector.py +577 -0
- src/services/research_service/enhanced_research.py +121 -0
- src/services/research_service/enhanced_synthesizer.py +375 -0
- src/services/research_service/query_generator.py +777 -0
- src/services/research_service/synthesizer.py +1273 -0
- src/services/search_service/__init__.py +5 -0
- src/services/search_service/indexer.py +186 -0
- src/services/search_service/search_engine.py +342 -0
- src/services/simple_enhanced_main.py +287 -0
- cite_agent/__distribution__.py +0 -7
- cite_agent-1.0.3.dist-info/RECORD +0 -23
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/WHEEL +0 -0
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/entry_points.txt +0 -0
- {cite_agent-1.0.3.dist-info → cite_agent-1.0.5.dist-info}/licenses/LICENSE +0 -0
src/services/research_service/context_manager.py
@@ -0,0 +1,1441 @@
+#context_manager.py
+
+from typing import Dict, List, Optional, Any
+import asyncio
+from datetime import datetime, timezone
+import uuid
+import json
+import os
+import redis.asyncio as redis
+import logging
+from typing import Dict, Any, Optional, List
+import hashlib
+
+from ...utils.logger import logger, log_operation
+from ...storage.db.operations import DatabaseOperations
+from ...storage.db.models import ResearchSession
+from .synthesizer import ResearchSynthesizer
+
+logger = logging.getLogger(__name__)
+
+
+def _utc_now() -> datetime:
+    return datetime.now(timezone.utc)
+
+
+def _utc_timestamp() -> str:
+    return _utc_now().isoformat()
+
+class ResearchContextManager:
+    """
+    Manages research context and provides real-time streaming updates.
+    """
+
+    def __init__(self, db_ops: DatabaseOperations, synthesizer: ResearchSynthesizer, redis_url: str):
+        self.db = db_ops
+        self.synthesizer = synthesizer
+        self.redis_client = redis.from_url(redis_url)
+        self.active_sessions: Dict[str, Dict] = {}
+        self.stream_subscribers: Dict[str, List[asyncio.Queue]] = {}
+
+    async def create_research_session(self, user_id: str, topic: str, research_questions: List[str]) -> str:
+        """Create a new research session with streaming support."""
+        session_id = hashlib.md5(f"{user_id}_{topic}_{_utc_timestamp()}".encode()).hexdigest()
+
+        session_data = {
+            "id": session_id,
+            "user_id": user_id,
+            "topic": topic,
+            "research_questions": json.dumps(research_questions),  # Convert list to JSON string
+            "status": "initialized",
+            "progress": 0.0,
+            "current_step": "Initializing research session",
+            "papers": json.dumps([]),  # Convert list to JSON string
+            "notes": json.dumps([]),  # Convert list to JSON string
+            "created_at": _utc_timestamp(),
+            "updated_at": _utc_timestamp(),
+            "synthesis": "",  # Convert None to empty string
+            "error": ""  # Convert None to empty string
+        }
+
+        # Store in Redis for persistence
+        await self.redis_client.hset(f"research_session:{session_id}", mapping=session_data)
+
+        # Store in memory with proper types
+        self.active_sessions[session_id] = {
+            "id": session_id,
+            "user_id": user_id,
+            "topic": topic,
+            "research_questions": research_questions,
+            "status": "initialized",
+            "progress": 0.0,
+            "current_step": "Initializing research session",
+            "papers": [],
+            "notes": [],
+            "created_at": _utc_timestamp(),
+            "updated_at": _utc_timestamp(),
+            "synthesis": None,
+            "error": None
+        }
+        self.stream_subscribers[session_id] = []
+
+        # Send initial update
+        await self._broadcast_update(session_id, {
+            "type": "session_created",
+            "session_id": session_id,
+            "data": self.active_sessions[session_id]
+        })
+
+        logger.info(f"Created research session {session_id} for user {user_id}")
+        return session_id
+
+    async def update_session_status(self, session_id: str, status: str, message: str, progress: Optional[float] = None):
+        """Update session status and broadcast to subscribers."""
+        if session_id not in self.active_sessions:
+            logger.warning(f"Session {session_id} not found")
+            return
+
+        session = self.active_sessions[session_id]
+        session["status"] = status
+        session["current_step"] = message
+        session["updated_at"] = _utc_timestamp()
+
+        if progress is not None:
+            session["progress"] = progress
+
+        # Update Redis with serialized data
+        redis_data = {
+            "status": status,
+            "current_step": message,
+            "updated_at": session["updated_at"],
+            "papers": json.dumps(session["papers"]),
+            "notes": json.dumps(session["notes"]),
+            "research_questions": json.dumps(session["research_questions"])
+        }
+        if progress is not None:
+            redis_data["progress"] = progress
+
+        await self.redis_client.hset(f"research_session:{session_id}", mapping=redis_data)
+
+        # Broadcast update
+        update_data = {
+            "type": "status_update",
+            "session_id": session_id,
+            "status": status,
+            "message": message,
+            "progress": session["progress"],
+            "timestamp": _utc_timestamp()
+        }
+        await self._broadcast_update(session_id, update_data)
+
+        logger.info(f"Session {session_id} status: {status} - {message}")
+
+    async def add_paper_to_session(self, session_id: str, paper_id: str, paper_info: Dict[str, Any]):
+        """Add a paper to the session and broadcast update."""
+        if session_id not in self.active_sessions:
+            return
+
+        session = self.active_sessions[session_id]
+        session["papers"].append(paper_id)
+        session["updated_at"] = _utc_timestamp()
+
+        # Update Redis
+        await self.redis_client.hset(f"research_session:{session_id}", mapping=session)
+
+        # Broadcast paper addition
+        await self._broadcast_update(session_id, {
+            "type": "paper_added",
+            "session_id": session_id,
+            "paper_id": paper_id,
+            "paper_info": paper_info,
+            "total_papers": len(session["papers"]),
+            "timestamp": _utc_timestamp()
+        })
+
+    async def update_session_synthesis(self, session_id: str, synthesis: Dict[str, Any]):
+        """Update session with synthesis results and broadcast."""
+        if session_id not in self.active_sessions:
+            return
+
+        session = self.active_sessions[session_id]
+        session["synthesis"] = synthesis
+        session["status"] = "completed"
+        session["progress"] = 100.0
+        session["updated_at"] = _utc_timestamp()
+
+        # Update Redis
+        await self.redis_client.hset(f"research_session:{session_id}", mapping=session)
+
+        # Broadcast synthesis completion
+        await self._broadcast_update(session_id, {
+            "type": "synthesis_complete",
+            "session_id": session_id,
+            "synthesis_summary": {
+                "paper_count": synthesis.get("meta", {}).get("paper_count", 0),
+                "findings_count": len(synthesis.get("common_findings", [])),
+                "gaps_count": len(synthesis.get("research_gaps", [])),
+                "contradictions_count": len(synthesis.get("contradictions", []))
+            },
+            "timestamp": _utc_timestamp()
+        })
+
+    async def subscribe_to_updates(self, session_id: str) -> asyncio.Queue:
+        """Subscribe to real-time updates for a session."""
+        if session_id not in self.stream_subscribers:
+            self.stream_subscribers[session_id] = []
+
+        queue = asyncio.Queue()
+        self.stream_subscribers[session_id].append(queue)
+
+        # Send current session state
+        if session_id in self.active_sessions:
+            await queue.put({
+                "type": "session_state",
+                "session_id": session_id,
+                "data": self.active_sessions[session_id]
+            })
+
+        logger.info(f"New subscriber added to session {session_id}")
+        return queue
+
+    async def unsubscribe_from_updates(self, session_id: str, queue: asyncio.Queue):
+        """Unsubscribe from session updates."""
+        if session_id in self.stream_subscribers:
+            try:
+                self.stream_subscribers[session_id].remove(queue)
+            except ValueError:
+                pass
+
+    async def _broadcast_update(self, session_id: str, update: Dict[str, Any]):
+        """Broadcast update to all subscribers."""
+        if session_id not in self.stream_subscribers:
+            return
+
+        # Send to Redis pub/sub for cross-process communication
+        await self.redis_client.publish(f"research_updates:{session_id}", json.dumps(update))
+
+        # Send to local subscribers
+        dead_queues = []
+        for queue in self.stream_subscribers[session_id]:
+            try:
+                await queue.put(update)
+            except Exception as e:
+                logger.warning(f"Failed to send update to subscriber: {e}")
+                dead_queues.append(queue)
+
+        # Clean up dead queues
+        for queue in dead_queues:
+            try:
+                self.stream_subscribers[session_id].remove(queue)
+            except ValueError:
+                pass
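For context, the methods above form a small per-session streaming API: subscribe_to_updates() hands back an asyncio.Queue that _broadcast_update() feeds and cleanup_session() later terminates with a None sentinel. A minimal consumer sketch, not part of the wheel; `manager` and `session_id` are assumed to already exist:

# Illustrative sketch: draining the per-session queue returned by subscribe_to_updates().
import asyncio

async def follow_session(manager, session_id: str) -> None:
    queue = await manager.subscribe_to_updates(session_id)
    try:
        while True:
            update = await queue.get()
            if update is None:  # cleanup_session() pushes None as a shutdown signal
                break
            print(update.get("type"), update.get("session_id"))
    finally:
        await manager.unsubscribe_from_updates(session_id, queue)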
+
+    async def get_session_status(self, session_id: str) -> Optional[Dict[str, Any]]:
+        """Get current session status."""
+        if session_id in self.active_sessions:
+            return self.active_sessions[session_id]
+
+        # Try to load from Redis
+        try:
+            session_data = await self.redis_client.hgetall(f"research_session:{session_id}")
+            if session_data:
+                return session_data
+        except Exception as e:
+            logger.error(f"Error loading session from Redis: {e}")
+
+        return None
+
+    async def list_user_sessions(self, user_id: str) -> List[Dict[str, Any]]:
+        """List all sessions for a user."""
+        sessions = []
+        for session_id, session_data in self.active_sessions.items():
+            if session_data.get("user_id") == user_id:
+                sessions.append(session_data)
+        return sessions
+
+    async def cleanup_session(self, session_id: str):
+        """Clean up a session and its subscribers."""
+        if session_id in self.active_sessions:
+            del self.active_sessions[session_id]
+
+        if session_id in self.stream_subscribers:
+            # Cancel all subscribers
+            for queue in self.stream_subscribers[session_id]:
+                try:
+                    await queue.put(None)  # Signal shutdown
+                except Exception:
+                    pass
+            del self.stream_subscribers[session_id]
+
+        # Remove from Redis
+        try:
+            await self.redis_client.delete(f"research_session:{session_id}")
+        except Exception as e:
+            logger.error(f"Error cleaning up session from Redis: {e}")
+
+        logger.info(f"Cleaned up session {session_id}")
+
+    async def health_check(self) -> Dict[str, Any]:
+        """Health check for the context manager."""
+        try:
+            active_sessions = len(self.active_sessions)
+            total_subscribers = sum(len(subscribers) for subscribers in self.stream_subscribers.values())
+
+            return {
+                "status": "healthy",
+                "active_sessions": active_sessions,
+                "total_subscribers": total_subscribers,
+                "timestamp": _utc_timestamp()
+            }
+        except Exception as e:
+            return {
+                "status": "error",
+                "error": str(e),
+                "timestamp": _utc_timestamp()
+            }
+
+    # ---------------- Caching & Reporting Utilities ----------------
+    def _normalize_topic(self, text: str) -> str:
+        text = (text or "").lower()
+        cleaned = ''.join(ch if ch.isalnum() or ch.isspace() else ' ' for ch in text)
+        tokens = [t for t in cleaned.split() if t]
+        return ' '.join(sorted(tokens))
+
+    def _cache_key(self, topic: str, questions: List[str]) -> str:
+        norm_topic = self._normalize_topic(topic)
+        norm_questions = [self._normalize_topic(q) for q in (questions or [])]
+        return json.dumps({"t": norm_topic, "q": norm_questions}, sort_keys=True)
+
+    async def _get_cached_synthesis(self, topic: str, questions: List[str]) -> Optional[Dict[str, Any]]:
+        try:
+            key = self._cache_key(topic, questions)
+            data = await self.redis_client.hget("synthesis_cache", key)
+            if not data:
+                return None
+            if isinstance(data, bytes):
+                try:
+                    data = data.decode('utf-8', 'ignore')
+                except Exception:
+                    return None
+            return json.loads(data)
+        except Exception:
+            return None
+
+    async def _store_synthesis_cache(self, topic: str, questions: List[str], synthesis: Dict[str, Any]):
+        try:
+            key = self._cache_key(topic, questions)
+            def _convert(obj):
+                if isinstance(obj, set):
+                    return list(obj)
+                return obj
+            payload = json.dumps(synthesis, ensure_ascii=False, default=_convert)
+            await self.redis_client.hset("synthesis_cache", key, payload)
+        except Exception:
+            pass
+
+    async def _find_similar_cached(self, topic: str, questions: List[str], min_score: float = 0.7) -> Optional[Dict[str, Any]]:
+        """Find a semantically similar cached synthesis using simple token Jaccard overlap."""
+        try:
+            entries = await self.redis_client.hgetall("synthesis_cache")
+            if not entries:
+                return None
+            target_tokens = set(self._normalize_topic(topic).split())
+            best = None
+            best_score = 0.0
+            for key, val in entries.items():
+                try:
+                    if isinstance(key, bytes):
+                        key_str = key.decode('utf-8', 'ignore')
+                    else:
+                        key_str = str(key)
+                    parsed = json.loads(key_str)
+                    cached_topic = parsed.get('t', '')
+                    tokens = set(str(cached_topic).split())
+                    inter = len(tokens & target_tokens)
+                    union = len(tokens | target_tokens) or 1
+                    score = inter / union
+                    if score > best_score:
+                        best_score = score
+                        best = val
+                except Exception:
+                    continue
+            if best_score >= min_score and best is not None:
+                data = best.decode('utf-8', 'ignore') if isinstance(best, bytes) else str(best)
+                return json.loads(data)
+        except Exception:
+            return None
+        return None
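The cache lookup above keys entries by a normalized topic/questions JSON string and falls back to token-level Jaccard overlap against a 0.7 threshold. A standalone sketch of that matching logic, runnable without Redis; the sample topics are invented:

# Illustrative sketch of the normalization and Jaccard scoring used by
# _cache_key() and _find_similar_cached().
import json

def normalize_topic(text: str) -> str:
    text = (text or "").lower()
    cleaned = ''.join(ch if ch.isalnum() or ch.isspace() else ' ' for ch in text)
    return ' '.join(sorted(t for t in cleaned.split() if t))

def cache_key(topic: str, questions: list) -> str:
    return json.dumps({"t": normalize_topic(topic),
                       "q": [normalize_topic(q) for q in (questions or [])]},
                      sort_keys=True)

def jaccard(a: str, b: str) -> float:
    ta, tb = set(normalize_topic(a).split()), set(normalize_topic(b).split())
    return len(ta & tb) / (len(ta | tb) or 1)

# "CRISPR gene editing" and "gene editing with CRISPR" share 3 of 4 tokens,
# so the 0.75 score clears the default min_score of 0.7 and the cache hit is reused.
print(jaccard("CRISPR gene editing", "gene editing with CRISPR"))  # 0.75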
+
+    async def _write_reports(self, session_id: str, topic: str, synthesis: Dict[str, Any]):
+        try:
+            reports_dir = os.path.join(os.getcwd(), "reports")
+            os.makedirs(reports_dir, exist_ok=True)
+            # JSON report
+            json_path = os.path.join(reports_dir, f"{session_id}.json")
+            with open(json_path, 'w', encoding='utf-8') as f:
+                json.dump(synthesis, f, ensure_ascii=False, indent=2)
+            # Markdown report
+            md_path = os.path.join(reports_dir, f"{session_id}.md")
+            summary = synthesis.get('summary') or synthesis.get('synthesis') or ""
+            body = json.dumps({k: v for k, v in synthesis.items() if k not in ['summary', 'synthesis']}, ensure_ascii=False, indent=2)
+            md_lines = [
+                f"# Research Report: {topic}",
+                "",
+                f"Generated: {_utc_timestamp()}Z",
+                "",
+                "## Summary",
+                summary if isinstance(summary, str) else json.dumps(summary, ensure_ascii=False, indent=2),
+                "",
+                "## Details",
+                body
+            ]
+            with open(md_path, 'w', encoding='utf-8') as f:
+                f.write("\n".join(md_lines))
+        except Exception:
+            pass
+
+    @log_operation("create_session")
+    async def create_session(self, topic: str, context: Optional[Dict] = None, user_id: str = "default_user") -> str:
+        """Create a new research session with enhanced context handling."""
+        #logger.info(f"Creating research session for topic: {topic}")
+
+        session_id = str(uuid.uuid4())
+        session = ResearchSession(
+            id=session_id,
+            user_id=user_id,
+            topic=topic,
+            context=context or {},
+            created_at=_utc_now(),
+            updated_at=_utc_now(),
+            status="initializing",
+            papers=[],
+            notes=[],
+            progress={
+                "stage": "created",
+                "percentage": 0,
+                "papers_found": 0,
+                "papers_processed": 0
+            }
+        )
+
+        # Store in database and cache
+        await self.db.store_research_session(session.dict())
+        self.active_sessions[session_id] = session
+
+        # Start background processing
+        self.session_tasks[session_id] = asyncio.create_task(
+            self._process_session(session_id)
+        )
+
+        #logger.info(f"Created research session: {session_id}")
+        return session_id
+
+    @log_operation("process_session")
+    async def _process_session(self, session_id: str):
+        """Handle long-running session processing."""
+        try:
+            session = self.active_sessions[session_id]
+
+            # Update status
+            session.status = "searching"
+            await self._update_session(session)
+
+            # Search for papers
+            papers = await self._search_papers(session.topic, session.context)
+            session.papers = [p["id"] for p in papers]
+            session.progress["papers_found"] = len(papers)
+            await self._update_session(session)
+
+            # Process papers
+            for paper in papers:
+                await self._queue_paper_processing(session_id, paper)
+                session.progress["papers_processed"] += 1
+                await self._update_session(session)
+
+            # After synthesis, collect citations
+            if session.papers:
+                session.status = "synthesizing"
+                await self._update_session(session)
+                synthesis = await self.synthesizer.synthesize_papers(session.papers)
+                session.synthesis = synthesis
+                # Collect citations from synthesis
+                session.citations = synthesis.get('citations', [])
+
+            session.status = "completed"
+            session.progress["percentage"] = 100
+            await self._update_session(session)
+
+        except Exception as e:
+            logger.error(f"Error processing session {session_id}: {str(e)}")
+            session.status = "error"
+            session.error = str(e)
+            await self._update_session(session)
+
+    async def _search_papers(self, topic: str, context: Dict) -> List[Dict]:
+        """Search for relevant papers with parallelization, deduplication, and ranking."""
+        import heapq
+        import operator
+        #logger.info(f"Searching for papers on: {topic}")
+        papers = []
+        errors = {}
+        try:
+            # Gather from multiple sources in parallel
+            from src.services.paper_service.openalex import OpenAlexClient
+            from src.services.paper_service.paper_access import PaperAccessManager
+            # Optionally add more sources here
+            async with OpenAlexClient() as openalex, PaperAccessManager() as pam:
+                tasks = [
+                    openalex.search_works(topic, per_page=15),
+                    # Add more sources here as needed, e.g. pam.semantic_scholar_search(topic), pam.core_search(topic)
+                ]
+                results = await asyncio.gather(*tasks, return_exceptions=True)
+                for res in results:
+                    if isinstance(res, Exception):
+                        errors[type(res).__name__] = str(res)
+                        continue
+                    if isinstance(res, dict) and res.get('results'):
+                        for paper in res['results']:
+                            papers.append(paper)
+                    elif isinstance(res, list):
+                        papers.extend(res)
+        except Exception as e:
+            logger.warning(f"Paper search failed: {str(e)}")
+            errors[type(e).__name__] = str(e)
+        # Deduplicate by DOI or title
+        seen = set()
+        deduped = []
+        for paper in papers:
+            doi = paper.get('doi') or paper.get('id')
+            title = paper.get('title', '').lower()
+            key = (doi, title)
+            if key in seen:
+                continue
+            seen.add(key)
+            deduped.append(paper)
+        # Rank by relevance, citation count, recency
+        def paper_score(p):
+            score = 0
+            # Prefer more recent
+            year = p.get('year') or p.get('publication_year') or 0
+            score += (int(year) if year else 0) * 1.0
+            # Prefer more citations
+            score += (p.get('citations') or p.get('cited_by_count') or 0) * 2.0
+            # Prefer open access
+            if p.get('open_access') or (p.get('open_access', {}).get('is_oa', False)):
+                score += 10
+            return score
+        deduped.sort(key=paper_score, reverse=True)
+        # If nothing found, fall back to mock papers
+        if not deduped:
+            #logger.info(f"Using mock papers for topic: {topic}")
+            return [
+                {
+                    'id': f"mock-paper-1-{uuid.uuid4()}",
+                    'title': f"Advances in {topic}",
+                    'doi': "https://doi.org/10.1234/mock.123",
+                    'authors': ["A. Researcher", "B. Scientist", "C. Professor"],
+                    'year': 2024,
+                    'citations': 42,
+                    'open_access': True,
+                    'abstract': f"This paper explores recent developments in {topic}, focusing on practical applications.",
+                },
+                {
+                    'id': f"mock-paper-2-{uuid.uuid4()}",
+                    'title': f"Review of {topic} Technologies",
+                    'doi': "https://doi.org/10.5678/mock.456",
+                    'authors': ["D. Expert", "E. Analyst"],
+                    'year': 2023,
+                    'citations': 28,
+                    'open_access': True,
+                    'abstract': f"A comprehensive review of current {topic} technologies and methodologies.",
+                },
+                {
+                    'id': f"mock-paper-3-{uuid.uuid4()}",
+                    'title': f"Future Directions in {topic}",
+                    'doi': "https://doi.org/10.9012/mock.789",
+                    'authors': ["F. Visionary", "G. Pioneer"],
+                    'year': 2024,
+                    'citations': 15,
+                    'open_access': True,
+                    'abstract': f"This forward-looking paper examines emerging trends and future directions in {topic}.",
+                }
+            ]
+        return deduped[:20]  # Limit to top 20 papers
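The search path above deduplicates on a (DOI-or-id, lowercased title) key and ranks by year, citations, and open access. A hand-checkable sketch of that ordering; the sample records are invented and the open-access check is written slightly more defensively than the in-package expression:

# Illustrative sketch of the dedup key and ranking heuristic from _search_papers().
def paper_score(p: dict) -> float:
    year = p.get('year') or p.get('publication_year') or 0
    citations = p.get('citations') or p.get('cited_by_count') or 0
    oa = p.get('open_access')
    is_open = oa.get('is_oa', False) if isinstance(oa, dict) else bool(oa)
    return int(year) * 1.0 + citations * 2.0 + (10 if is_open else 0)

papers = [
    {"doi": "10.1/x", "title": "Old but cited", "year": 2015, "citations": 200},
    {"doi": "10.1/x", "title": "Old but cited", "year": 2015, "citations": 200},  # duplicate
    {"doi": "10.1/y", "title": "Fresh preprint", "year": 2024, "cited_by_count": 3,
     "open_access": {"is_oa": True}},
]

seen, deduped = set(), []
for p in papers:
    key = (p.get("doi") or p.get("id"), p.get("title", "").lower())
    if key not in seen:
        seen.add(key)
        deduped.append(p)
deduped.sort(key=paper_score, reverse=True)
print([p["title"] for p in deduped])  # ['Old but cited', 'Fresh preprint']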
+
+    def _extract_abstract(self, paper):
+        """Extract abstract from OpenAlex format"""
+        abstract_index = paper.get("abstract_inverted_index")
+        if not abstract_index:
+            return "No abstract available"
+
+        # Reconstruct from inverted index
+        words = [""] * 300
+        for word, positions in abstract_index.items():
+            for pos in positions:
+                if pos < len(words):
+                    words[pos] = word
+
+        abstract = " ".join(word for word in words if word).strip()
+        return abstract[:800] + "..." if len(abstract) > 800 else abstract
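_extract_abstract() rebuilds prose from OpenAlex's abstract_inverted_index, which maps each word to the positions where it occurs. A tiny worked example of that reconstruction; the index is invented:

# Illustrative sketch: rebuilding an abstract from an inverted index.
inverted = {"Deep": [0], "learning": [1], "aids": [2], "drug": [3], "discovery.": [4]}

words = [""] * 300
for word, positions in inverted.items():
    for pos in positions:
        if pos < len(words):
            words[pos] = word
print(" ".join(w for w in words if w))  # "Deep learning aids drug discovery."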
+
+    async def _queue_paper_processing(self, session_id: str, paper: Dict):
+        """Queue paper for processing with session context."""
+        try:
+            processing_request = {
+                "session_id": session_id,
+                "paper": paper,
+                "timestamp": _utc_timestamp()
+            }
+
+            await self.redis_client.lpush(
+                "processing_queue",
+                json.dumps(processing_request)
+            )
+
+        except Exception as e:
+            logger.error(f"Error queuing paper for processing: {str(e)}")
+            raise
+
+    @log_operation("get_session_status")
+    async def get_session_status(self, session_id: str) -> Dict:
+        """Get detailed session status."""
+        session = await self._get_session(session_id)
+        if not session:
+            return {"error": "Session not found"}
+
+        # Get processing progress
+        progress = await self.redis_client.hgetall(f"progress:{session_id}")
+
+        return {
+            "id": session_id,
+            "status": session.status,
+            "progress": session.progress,
+            "papers_total": len(session.papers),
+            "processing_stage": progress.get("stage", "unknown"),
+            "processing_percentage": progress.get("percentage", 0),
+            "last_updated": session.updated_at.isoformat(),
+            "error": session.error if hasattr(session, 'error') else None
+        }
+
+    @log_operation("add_note")
+    async def add_note(self, session_id: str, note: Dict) -> bool:
+        """Add a structured note to research session."""
+        session = await self._get_session(session_id)
+        if not session:
+            return False
+
+        note_entry = {
+            "id": str(uuid.uuid4()),
+            "content": note.get("content"),
+            "type": note.get("type", "general"),
+            "references": note.get("references", []),
+            "created_at": _utc_timestamp()
+        }
+
+        session.notes.append(note_entry)
+        session.updated_at = _utc_now()
+
+        await self._update_session(session)
+        return True
+
+    async def _update_session(self, session: ResearchSession):
+        """Update session in database and cache."""
+        session.updated_at = _utc_now()
+        await self.db.update_research_session(session.dict())
+        self.active_sessions[session.id] = session
+
+        # Prepare Redis-compatible mapping
+        redis_mapping = {}
+        for key, value in session.dict().items():
+            if value is None:
+                # Convert None to empty string for Redis
+                redis_mapping[key] = ""
+            elif isinstance(value, datetime):
+                # Convert datetime to ISO format string
+                redis_mapping[key] = value.isoformat()
+            elif isinstance(value, (dict, list)):
+                # Serialize complex types
+                redis_mapping[key] = json.dumps(value)
+            else:
+                redis_mapping[key] = value
+
+        # Update Redis cache
+        try:
+            await self.redis_client.hset(
+                f"session:{session.id}",
+                mapping=redis_mapping  # Use serialized mapping
+            )
+        except Exception as e:
+            logger.error(f"Error updating Redis cache: {str(e)}")
+
+    async def _get_session(self, session_id: str) -> Optional[ResearchSession]:
+        """Get research session by ID with caching."""
+        # Check memory cache first
+        if session_id in self.active_sessions:
+            return self.active_sessions[session_id]
+
+        # Try Redis cache
+        session_data = await self.redis_client.hgetall(f"session:{session_id}")
+        if session_data:
+            # Convert JSON strings and datetime strings back to Python objects
+            for key, value in session_data.items():
+                if key in ['progress', 'context', 'synthesis']:
+                    try:
+                        session_data[key] = json.loads(value)
+                    except (json.JSONDecodeError, TypeError):
+                        pass
+                elif key in ['created_at', 'updated_at'] and isinstance(value, str):
+                    try:
+                        session_data[key] = datetime.fromisoformat(value)
+                    except ValueError:
+                        pass
+
+            session = ResearchSession(**session_data)
+            self.active_sessions[session_id] = session
+            return session
+
+        # Finally try database
+        session_data = await self.db.get_research_session(session_id)
+        if session_data:
+            session = ResearchSession(**session_data)
+            self.active_sessions[session_id] = session
+
+            # Update cache
+            await self.redis_client.hset(
+                f"session:{session_id}",
+                mapping=session.dict()
+            )
+            return session
+
+        return None
+
+    async def cleanup(self):
+        """Cleanup manager resources."""
+        try:
+            # Cancel all running tasks
+            for task in self.session_tasks.values():
+                task.cancel()
+
+            # Wait for tasks to complete
+            await asyncio.gather(*self.session_tasks.values(), return_exceptions=True)
+
+            # Close Redis connection
+            await self.redis_client.close()
+
+        except Exception as e:
+            logger.error(f"Error during cleanup: {str(e)}")
+
+    @log_operation("start_layered_research")
+    async def start_layered_research(self, topic: str, research_questions: List[str], max_layers: int = 3, user_id: str = "default_user") -> str:
+        """Start a multi-layered research process that explores connected concepts"""
+        #logger.info(f"Starting layered research on: {topic}")
+
+        # Create a new session
+        session_id = str(uuid.uuid4())
+        session = ResearchSession(
+            id=session_id,
+            topic=topic,
+            user_id=user_id,
+            context={
+                "research_type": "layered",
+                "questions": research_questions,
+                "max_layers": max_layers,
+                "current_layer": 0,
+                "explored_concepts": [topic],
+                "discovered_concepts": []
+            },
+            created_at=_utc_now(),
+            updated_at=_utc_now(),
+            status="initializing",
+            papers=[],
+            notes=[],
+            progress={
+                "stage": "created",
+                "percentage": 0,
+                "web_sources_found": 0,
+                "papers_found": 0,
+                "papers_processed": 0
+            }
+        )
+
+        # Store in database and cache
+        await self.db.store_research_session(session.dict())
+        self.active_sessions[session_id] = session
+
+        # Serve from cache if available
+        cached = await self._get_cached_synthesis(topic, research_questions)
+        if cached:
+            session.synthesis = cached
+            session.status = "completed"
+            session.progress["percentage"] = 100
+            await self._update_session(session)
+        else:
+            # Start layered research process
+            self.session_tasks[session_id] = asyncio.create_task(
+                self._process_layered_research(session_id)
+            )
+
+        #logger.info(f"Created layered research session: {session_id}")
+        return session_id
+
+    async def _process_layered_research(self, session_id: str):
+        """Process a layered research session with LLM-powered query generation, semantic clustering, and source scoring."""
+        try:
+            session = self.active_sessions[session_id]
+            session.status = "exploring_main_topic"
+            await self._update_session(session)
+
+            import time
+            SATURATION_CRITERIA = {
+                'min_papers': 10,
+                'min_web_sources': 15,
+                'novelty_threshold': 0.15,
+                'time_budget': 1800,  # 30 min
+                'ideal_paper_range': (20, 100),
+            }
+            state = {
+                'web_sources': [],
+                'academic_papers': [],
+                'key_insights': set(),
+                'information_graph': {},
+                'saturation_score': 0,
+                'start_time': time.time(),
+                'diversity_metrics': {},
+                'contradictions': [],
+                'gap_queries': set(),
+                'clusters': [],
+                'source_scores': {},
+            }
+            topic = session.topic
+            layer = 1
+            max_layers = session.context.get("max_layers", 3)
+            concepts_to_explore = [topic]
+            explored_concepts = set()
+            gap_queries = set()
+            while True:
+                if not concepts_to_explore or layer > max_layers:
+                    break
+                concept = concepts_to_explore.pop(0)
+                if concept in explored_concepts:
+                    continue
+                explored_concepts.add(concept)
+                session.context["current_layer"] = layer
+                session.context.setdefault("explored_concepts", []).append(concept)
+                session.progress["percentage"] = min(80, int((layer / max_layers) * 80))
+                session.status = f"exploring_layer_{layer}"
+                await self._update_session(session)
+
+                # --- Parallel search ---
+                web_results, papers = await asyncio.gather(
+                    self._search_web_sources(concept),
+                    self._search_papers(concept, session.context)
+                )
+                # Fetch web content for important results
+                web_content_tasks = []
+                for result in web_results:
+                    if self._is_important_result(result):
+                        task = asyncio.create_task(self._fetch_web_content(result.get("url", "")))
+                        web_content_tasks.append((result, task))
+                for result, task in web_content_tasks:
+                    try:
+                        content = await task
+                        result["extracted_content"] = content
+                    except Exception as e:
+                        result["extracted_content"] = f"Failed to extract: {str(e)}"
+                # --- Source credibility scoring ---
+                for paper in papers:
+                    score = self._score_academic_source(paper)
+                    state['source_scores'][paper.get('id')] = score
+                for web in web_results:
+                    score = self._score_web_source(web)
+                    state['source_scores'][web.get('url')] = score
+                # Deduplicate web results by (url, title)
+                seen = set()
+                uniq_web = []
+                for w in web_results:
+                    key = (w.get('url','').strip().lower(), w.get('title','').strip().lower())
+                    if key in seen:
+                        continue
+                    seen.add(key)
+                    uniq_web.append(w)
+                web_results = uniq_web
+                # Update state
+                state['web_sources'].extend(web_results)
+                state['academic_papers'].extend(papers)
+                session.context.setdefault("web_sources", []).extend(web_results)
+                session.papers.extend([p["id"] for p in papers])
+                session.progress["web_sources_found"] += len(web_results)
+                session.progress["papers_found"] += len(papers)
+                await self._update_session(session)
+                # --- Extract insights ---
+                new_insights_web = self._extract_insights_from_web(web_results)
+                new_insights_papers = self._extract_insights_from_papers(papers)
+                # --- Semantic clustering ---
+                state['clusters'] = self._cluster_insights(new_insights_web | new_insights_papers)
+                # --- Calculate novelty ---
+                web_novelty = self._calculate_novelty(new_insights_web, state['key_insights'])
+                paper_novelty = self._calculate_novelty(new_insights_papers, state['key_insights'])
+                # --- Update key insights ---
+                state['key_insights'].update(new_insights_web)
+                state['key_insights'].update(new_insights_papers)
+                # --- Diversity & contradiction analysis ---
+                state['diversity_metrics'] = self._analyze_diversity(state['academic_papers'], state['web_sources'])
+                state['contradictions'].extend(self._detect_contradictions(new_insights_web, new_insights_papers))
+                # --- Gap analysis & LLM-powered query refinement ---
+                new_gap_queries = self._generate_gap_queries(state, concept)
+                # Use LLM to suggest additional queries
+                llm_gap_queries = await self._llm_generate_gap_queries(state, concept)
+                for q in new_gap_queries.union(llm_gap_queries):
+                    if q not in gap_queries:
+                        gap_queries.add(q)
+                        concepts_to_explore.append(q)
+                # --- Calculate saturation ---
+                state['saturation_score'] = self._calculate_saturation(
+                    web_novelty, paper_novelty,
+                    len(state['academic_papers']), len(state['web_sources'])
+                )
+                # --- Coverage checks ---
+                coverage_met = (
+                    len(state['academic_papers']) >= SATURATION_CRITERIA['min_papers'] and
+                    len(state['web_sources']) >= SATURATION_CRITERIA['min_web_sources']
+                )
+                time_elapsed = time.time() - state['start_time']
+                # --- Stopping conditions ---
+                if (
+                    (web_novelty < SATURATION_CRITERIA['novelty_threshold'] and paper_novelty < SATURATION_CRITERIA['novelty_threshold']) or
+                    state['saturation_score'] >= 0.95 or
+                    coverage_met or
+                    time_elapsed > SATURATION_CRITERIA['time_budget']
+                ):
+                    break
+                layer += 1
+            # --- Synthesis ---
+            session.status = "synthesizing"
+            await self._update_session(session)
+            synthesis = await self._generate_layered_synthesis(session_id)
+            # Attach advanced metrics to synthesis
+            synthesis['diversity_metrics'] = state['diversity_metrics']
+            synthesis['contradictions'] = state['contradictions']
+            synthesis['gap_queries'] = list(gap_queries)
+            synthesis['clusters'] = state['clusters']
+            synthesis['source_scores'] = state['source_scores']
+            session.synthesis = synthesis
+            session.status = "completed"
+            session.progress["percentage"] = 100
+            await self._update_session(session)
+            # Persist for reuse and create reports
+            await self._store_synthesis_cache(session.topic, session.context.get('questions', []), synthesis)
+            await self._write_reports(session_id, session.topic, synthesis)
+        except Exception as e:
+            logger.error(f"Error in layered research {session_id}: {str(e)}")
+            session.status = "error"
+            session.error = str(e)
+            await self._update_session(session)
+
+    async def _explore_concept(self, session_id: str, concept: str, layer: int):
+        """Explore a concept in the layered research process"""
+        session = self.active_sessions[session_id]
+        max_layers = session.context.get("max_layers", 3)
+
+        # Don't explore beyond max layers or already explored concepts
+        if layer > max_layers or concept in session.context.get("explored_concepts", []):
+            return
+
+        #logger.info(f"Exploring concept '{concept}' at layer {layer} for session {session_id}")
+
+        # Update session state
+        session.context["current_layer"] = layer
+        session.context.get("explored_concepts", []).append(concept)
+        session.progress["percentage"] = min(80, int((layer / max_layers) * 80))  # Reserve 20% for synthesis
+        session.status = f"exploring_layer_{layer}"
+        await self._update_session(session)
+
+        # Search for both web and academic sources
+        web_results, papers = await asyncio.gather(
+            self._search_web_sources(concept),
+            self._search_papers(concept, session.context)
+        )
+
+        web_content_tasks = []
+        for result in web_results:
+            if self._is_important_result(result):
+                task = asyncio.create_task(self._fetch_web_content(result.get("url", "")))
+                web_content_tasks.append((result, task))
+
+        # Wait for content fetching to complete
+        for result, task in web_content_tasks:
+            try:
+                content = await task
+                result["extracted_content"] = content
+            except Exception as e:
+                logger.error(f"Error fetching content: {str(e)}")
+                result["extracted_content"] = f"Failed to extract: {str(e)}"
+
+        # Update session with new sources
+        if "web_sources" not in session.context:
+            session.context["web_sources"] = []
+        session.context["web_sources"].extend(web_results)
+        session.progress["web_sources_found"] += len(web_results)
+
+        # Add papers to session
+        paper_ids = [p["id"] for p in papers]
+        session.papers.extend(paper_ids)
+        session.progress["papers_found"] += len(papers)
+        await self._update_session(session)
+
+        # Process papers to extract information
+        for paper in papers:
+            await self._queue_paper_processing(session_id, paper)
+            session.progress["papers_processed"] += 1
+            await self._update_session(session)
+
+        # Discover new concepts from current sources
+        new_concepts = await self._discover_related_concepts(session_id, concept, web_results, papers)
+
+        # Store discovered concepts
+        if "discovered_concepts" not in session.context:
+            session.context["discovered_concepts"] = []
+
+        concept_layer = {
+            "concept": concept,
+            "layer": layer,
+            "related_concepts": new_concepts
+        }
+        session.context["discovered_concepts"].append(concept_layer)
+        await self._update_session(session)
+
+        # Recursively explore new concepts in next layer
+        for new_concept in new_concepts[:3]:  # Limit to top 3 concepts per layer
+            await self._explore_concept(session_id, new_concept, layer + 1)
+
+    async def _search_web_sources(self, concept: str) -> List[Dict]:
+        """Search for web sources related to a concept (Google -> Bing -> mock)."""
+
+        # Mock fallback
+        mock_results = [
+            {
+                "title": f"Overview of {concept}",
+                "url": "https://example.com/mock1",
+                "snippet": f"This is a comprehensive overview of {concept} and its applications in scientific research.",
+                "source": "mock"
+            },
+            {
+                "title": f"Recent developments in {concept}",
+                "url": "https://example.com/mock2",
+                "snippet": f"Recent studies have shown significant progress in the field of {concept}.",
+                "source": "mock"
+            }
+        ]
+
+        try:
+            # Prefer Google Custom Search if configured
+            google_results = await self._try_google_search(concept)
+            if google_results:
+                return google_results
+
+            # Fallback to Bing Search API
+            bing_results = await self._try_bing_search(concept)
+            if bing_results:
+                return bing_results
+
+            logger.warning(f"All web searches failed - using mock results for: {concept}")
+            return mock_results
+        except Exception as e:
+            logger.error(f"Web search error: {str(e)}")
+            return mock_results
+
+    async def _try_google_search(self, query: str, num_results: int = 10) -> List[Dict]:
+        """Try to search using Google Custom Search API"""
+        import os
+        import aiohttp
+
+        # Support multiple common env var names for compatibility
+        api_key = (
+            os.environ.get("GOOGLE_SEARCH_API_KEY")
+            or os.environ.get("GOOGLE_CUSTOM_SEARCH_API_KEY")
+            or os.environ.get("GOOGLE_API_KEY")
+        )
+        cx = (
+            os.environ.get("GOOGLE_SEARCH_CX")
+            or os.environ.get("GOOGLE_SEARCH_ENGINE_ID")
+            or os.environ.get("GOOGLE_SEARCH_ENGINE_CX")
+        )
+
+        if not api_key or not cx:
+            logger.warning("Google Search API credentials not configured")
+            return []
+
+        url = "https://www.googleapis.com/customsearch/v1"
+        params = {
+            "key": api_key,
+            "cx": cx,
+            "q": query,
+            "num": min(num_results, 10)
+        }
+
+        # Retry with basic backoff
+        for attempt in range(3):
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(url, params=params, timeout=10) as response:
+                        if response.status == 200:
+                            data = await response.json()
+                            results = []
+                            for item in data.get("items", []):
+                                results.append({
+                                    "title": item.get("title"),
+                                    "url": item.get("link"),
+                                    "snippet": item.get("snippet"),
+                                    "source": "google_search"
+                                })
+                            return results
+                        else:
+                            logger.warning(f"Google search failed with status {response.status}")
+            except Exception as e:
+                logger.error(f"Google search error (attempt {attempt+1}): {str(e)}")
+            await asyncio.sleep(1.0 * (attempt + 1))
+        return []
+
+    async def _try_bing_search(self, query: str, num_results: int = 10) -> List[Dict]:
+        """Try to search using Bing Search API"""
+        import os
+        import aiohttp
+
+        api_key = os.environ.get("BING_SEARCH_API_KEY")
+
+        if not api_key:
+            logger.warning("Bing Search API key not configured")
+            return []
+
+        url = "https://api.bing.microsoft.com/v7.0/search"
+        headers = {"Ocp-Apim-Subscription-Key": api_key}
+        params = {
+            "q": query,
+            "count": min(num_results, 50),
+            "responseFilter": "Webpages"
+        }
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, headers=headers, params=params) as response:
+                    if response.status == 200:
+                        data = await response.json()
+
+                        results = []
+                        for item in data.get("webPages", {}).get("value", []):
+                            results.append({
+                                "title": item.get("name"),
+                                "url": item.get("url"),
+                                "snippet": item.get("snippet"),
+                                "source": "bing_search"
+                            })
+                        return results
+                    else:
+                        logger.warning(f"Bing search failed with status {response.status}")
+                        return []
+        except Exception as e:
+            logger.error(f"Bing search error: {str(e)}")
+            return []
+
+    async def _fetch_web_content(self, url: str) -> str:
+        """Fetch and extract main content from web page"""
+        import aiohttp
+        from bs4 import BeautifulSoup
+
+        for attempt in range(3):
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(url, timeout=10) as response:
+                        if response.status == 200:
+                            html = await response.text()
+                            # Parse with BeautifulSoup
+                            soup = BeautifulSoup(html, 'html.parser')
+                            # Remove non-content elements
+                            for element in soup(["script", "style", "header", "footer", "nav"]):
+                                element.decompose()
+                            # Get text content
+                            text = soup.get_text(separator='\n')
+                            # Clean up text
+                            lines = (line.strip() for line in text.splitlines())
+                            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+                            text = '\n'.join(chunk for chunk in chunks if chunk)
+                            # Truncate if too long
+                            if len(text) > 5000:
+                                text = text[:5000] + "..."
+                            return text
+                        else:
+                            msg = f"Failed to fetch content: HTTP {response.status}"
+                            if attempt == 2:
+                                return msg
+            except Exception as e:
+                logger.error(f"Error fetching content from {url} (attempt {attempt+1}): {str(e)}")
+            await asyncio.sleep(1.0 * (attempt + 1))
+        return "Error fetching content after retries"
+
+    def _is_important_result(self, result: Dict) -> bool:
+        """Determine if a web result is important enough to fetch full content"""
+        important_keywords = ["research", "study", "analysis", "review", "journal"]
+        return any(keyword in result.get("title", "").lower() for keyword in important_keywords)
+
+    async def _discover_related_concepts(self, session_id: str, concept: str,
+                                         web_results: List[Dict], papers: List[Dict]) -> List[str]:
+        """Discover related concepts from current research sources"""
+        session = self.active_sessions[session_id]
+
+        # Check if llm_manager is available
+        if not hasattr(self, 'llm_manager') or self.llm_manager is None:
+            logger.warning(f"No LLM manager available - returning mock concepts for: {concept}")
+            # Return mock concepts for testing without LLM
+            return ["Quantum algorithms", "Molecular modeling", "Drug discovery optimization"]
+
+        # Use the query generator to discover concepts
+        from .query_generator import EnhancedQueryGenerator
+        query_generator = EnhancedQueryGenerator(self.llm_manager)  # Use llm_manager here
+
+        # Extract context from web results
+        web_context = []
+        for result in web_results:
+            if "extracted_content" in result:
+                web_context.append(f"Title: {result['title']}\nContent: {result['extracted_content'][:500]}...")
+            else:
+                web_context.append(f"Title: {result['title']}\nSnippet: {result['snippet']}")
+
+        # Extract context from papers
+        paper_context = []
+        for paper in papers:
+            paper_context.append(f"Title: {paper.get('title', '')}\nAuthors: {', '.join(paper.get('authors', []))}")
+
+        # Generate concepts
+        concepts_prompt = f"""
+        Based on research about "{concept}", identify 3-5 important related concepts that should be explored next.
+
+        RESEARCH QUESTIONS:
+        {json.dumps(session.context.get('questions', []))}
+
+        WEB SOURCES:
+        {json.dumps(web_context)}
+
+        ACADEMIC PAPERS:
+        {json.dumps(paper_context)}
+
+        Identify concepts that:
+        1. Are mentioned across multiple sources
+        2. Appear important but not fully explained
+        3. Would deepen understanding of the main topic
+        4. Are distinct from the current focus
+
+        Return ONLY a list of concepts, one per line.
+        """
+
+        # Get concepts from LLM
+        response = await self.llm_manager.generate_text(concepts_prompt)  # Use llm_manager here
+        concepts = [c.strip() for c in response.split('\n') if c.strip()]
+
+        return concepts[:5]  # Limit to 5 concepts
|
|
1235
|
+
|
|
1236
|
+
async def _generate_layered_synthesis(self, session_id: str) -> Dict:
|
|
1237
|
+
"""Generate comprehensive synthesis for layered research"""
|
|
1238
|
+
session = self.active_sessions[session_id]
|
|
1239
|
+
|
|
1240
|
+
# Collect all web sources
|
|
1241
|
+
web_sources = session.context.get("web_sources", [])
|
|
1242
|
+
|
|
1243
|
+
# Collect all processed papers
|
|
1244
|
+
processed_papers = []
|
|
1245
|
+
for paper_id in session.papers:
|
|
1246
|
+
paper_data = await self.db.get_processed_paper(paper_id)
|
|
1247
|
+
if paper_data:
|
|
1248
|
+
processed_papers.append(paper_data)
|
|
1249
|
+
|
|
1250
|
+
# Get concept layers
|
|
1251
|
+
concept_layers = session.context.get("discovered_concepts", [])
|
|
1252
|
+
|
|
1253
|
+
# Use synthesizer to generate synthesis
|
|
1254
|
+
synthesis = await self.synthesizer.synthesize_layered_research(
|
|
1255
|
+
session.topic,
|
|
1256
|
+
session.context.get("questions", []),
|
|
1257
|
+
web_sources,
|
|
1258
|
+
processed_papers,
|
|
1259
|
+
concept_layers
|
|
1260
|
+
)
|
|
1261
|
+
# Attach helpful artifacts
|
|
1262
|
+
synthesis['artifacts'] = {
|
|
1263
|
+
'report_json': f"/reports/{session_id}.json",
|
|
1264
|
+
'report_markdown': f"/reports/{session_id}.md"
|
|
1265
|
+
}
|
|
1266
|
+
|
|
1267
|
+
return synthesis
|
|
1268
|
+
|
|
1269
|
+
    def _extract_insights_from_web(self, web_results):
        """Extract key insights from web results."""
        insights = set()
        for result in web_results:
            snippet = result.get("snippet") or result.get("extracted_content")
            if snippet:
                for sent in snippet.split(". "):
                    if len(sent.split()) > 6:
                        insights.add(sent.strip())
        return insights

    def _extract_insights_from_papers(self, papers):
        """Extract key insights from academic papers."""
        insights = set()
        for paper in papers:
            abstract = paper.get("abstract")
            if abstract:
                for sent in abstract.split(". "):
                    if len(sent.split()) > 6:
                        insights.add(sent.strip())
        return insights

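A quick illustration of the sentence-level filter used by the two extractors above (a standalone sketch, not part of the package; the sample abstract is invented):

# Illustrative sketch only: mirrors _extract_insights_from_papers on a made-up record.
sample_papers = [{
    "abstract": "Quantum error correction remains the main obstacle to scaling. "
                "Surface codes require thousands of physical qubits per logical qubit. "
                "Noise models vary widely."
}]
insights = set()
for paper in sample_papers:
    for sent in (paper.get("abstract") or "").split(". "):
        if len(sent.split()) > 6:   # keep only sentences longer than six words
            insights.add(sent.strip())
# "Noise models vary widely." is dropped (four words); the other two sentences are kept.
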
    def _calculate_novelty(self, new_insights, known_insights):
        """Calculate novelty as the fraction of new insights."""
        if not new_insights:
            return 0.0
        new = set(new_insights) - set(known_insights)
        return len(new) / max(1, len(new_insights))

    def _calculate_saturation(self, web_novelty, paper_novelty, num_papers, num_web):
        """Calculate a saturation score based on novelty and coverage."""
        novelty_score = 1.0 - max(web_novelty, paper_novelty)
        coverage_score = min(num_papers / 30, num_web / 30, 1.0)
        return 0.7 * novelty_score + 0.3 * coverage_score

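A worked example of how the two scores above combine (standalone sketch; all numbers are invented):

# Illustrative sketch only: mirrors _calculate_novelty and _calculate_saturation.
new_insights = {"a", "b", "c", "d"}        # 4 insights found in this round
known_insights = {"a", "b", "c"}           # 3 of them were already known
web_novelty = len(new_insights - known_insights) / max(1, len(new_insights))  # 0.25
paper_novelty = 0.10                       # assume papers yielded even less that was new
novelty_score = 1.0 - max(web_novelty, paper_novelty)    # 0.75
coverage_score = min(15 / 30, 12 / 30, 1.0)              # 15 papers, 12 web hits -> 0.4
saturation = 0.7 * novelty_score + 0.3 * coverage_score  # 0.645
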
    def _analyze_diversity(self, papers, web_sources):
        """Analyze diversity of institutions, methodologies, viewpoints, and publication years."""
        diversity = {
            'institutions': set(),
            'methodologies': set(),
            'years': set(),
            'source_types': set(),
        }
        for paper in papers:
            if 'institution' in paper:
                diversity['institutions'].add(paper['institution'])
            if 'methodology' in paper:
                diversity['methodologies'].add(paper['methodology'])
            if 'year' in paper:
                diversity['years'].add(paper['year'])
            diversity['source_types'].add('academic')
        for web in web_sources:
            if 'source' in web:
                diversity['source_types'].add(web['source'])
            if 'date' in web:
                diversity['years'].add(web['date'][:4])
        # Convert sets to lists for serialization
        for k in diversity:
            diversity[k] = list(diversity[k])
        return diversity

    def _detect_contradictions(self, web_insights, paper_insights):
        """Detect contradictions between web and academic insights."""
        contradictions = []
        web_set = set(web_insights)
        paper_set = set(paper_insights)
        # Simple contradiction: same topic, opposite claims (heuristic)
        for w in web_set:
            for p in paper_set:
                if w.lower() in p.lower() or p.lower() in w.lower():
                    continue
                # Heuristic: if both mention the same keyword but have different sentiment
                # (This can be improved with LLM or sentiment analysis)
                if any(neg in w.lower() for neg in ['not ', 'no ', 'fail', 'lack']) != any(neg in p.lower() for neg in ['not ', 'no ', 'fail', 'lack']):
                    contradictions.append({'web': w, 'paper': p})
        return contradictions

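For illustration, the negation-mismatch check above flags pairs like the following (standalone sketch; both sentences are invented):

# Illustrative sketch only: mirrors the polarity comparison in _detect_contradictions.
negations = ['not ', 'no ', 'fail', 'lack']
w = "Error mitigation does not scale beyond small circuits"
p = "Error mitigation scales well on current hardware"
web_negated = any(neg in w.lower() for neg in negations)    # True  ("not ")
paper_negated = any(neg in p.lower() for neg in negations)  # False
if web_negated != paper_negated:
    print({'web': w, 'paper': p})   # reported as a potential contradiction
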
    def _generate_gap_queries(self, state, concept):
        """Generate new queries to fill gaps in coverage."""
        # Heuristic: look for missing years, institutions, or methodologies
        gap_queries = set()
        # Example: if not enough recent papers, add query for 'recent advances in ...'
        if len(state['diversity_metrics'].get('years', [])) < 3:
            gap_queries.add(f"recent advances in {concept}")
        if len(state['diversity_metrics'].get('institutions', [])) < 2:
            gap_queries.add(f"institutional perspectives on {concept}")
        if len(state['diversity_metrics'].get('methodologies', [])) < 2:
            gap_queries.add(f"methodological approaches to {concept}")
        # Add more heuristics as needed
        return gap_queries

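A minimal sketch of the state these heuristics inspect (the dict below is invented; only the keys match the code above):

# Illustrative sketch only: a state shaped like the one _generate_gap_queries reads.
state = {'diversity_metrics': {'years': ['2023', '2024'],                      # fewer than 3 years
                               'institutions': ['MIT'],                        # fewer than 2 institutions
                               'methodologies': ['simulation', 'benchmarking']}}
# With this state, the first two heuristics fire ("recent advances in ..." and
# "institutional perspectives on ..."), but not the methodology query, since two
# methodologies are already covered.
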
    async def _llm_generate_gap_queries(self, state, concept: str) -> set:
        """Use LLM to propose additional gap-filling queries."""
        try:
            if not hasattr(self, 'llm_manager') or self.llm_manager is None:
                return set()
            web_sources = state.get('web_sources', [])[-5:]
            papers = state.get('academic_papers', [])[-5:]
            prompt = (
                f"Given the topic '{concept}', propose 3-6 additional search queries to fill knowledge gaps.\n"
                f"Recent web sources: {json.dumps(web_sources[:3])}\n"
                f"Recent papers titles: {[p.get('title', '') for p in papers[:5]]}\n"
                f"Focus on missing institutions, methodologies, time ranges, and alternative viewpoints.\n"
                f"Return one query per line, no bullets or numbering."
            )
            text = await self.llm_manager.generate_text(prompt)
            queries = {q.strip() for q in text.split('\n') if q.strip()}
            return set(list(queries)[:6])
        except Exception:
            return set()

    def _cluster_insights(self, insights):
        """Cluster insights by semantic similarity (using embeddings)."""
        # Placeholder: In production, use embeddings + clustering (e.g., KMeans)
        # For now, group by keyword overlap as a simple heuristic
        clusters = []
        insights = list(insights)
        used = set()
        for i, sent in enumerate(insights):
            if i in used:
                continue
            cluster = [sent]
            for j, other in enumerate(insights):
                if i != j and j not in used:
                    if len(set(sent.lower().split()) & set(other.lower().split())) > 3:
                        cluster.append(other)
                        used.add(j)
            used.add(i)
            clusters.append(cluster)
        return clusters

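To make the keyword-overlap rule concrete (standalone sketch; both sentences are invented):

# Illustrative sketch only: two insights sharing more than three lowercase tokens
# end up in the same cluster under _cluster_insights.
a = "Surface codes need many physical qubits per logical qubit"
b = "Recent experiments show surface codes need many physical qubits"
shared = set(a.lower().split()) & set(b.lower().split())
print(len(shared) > 3)   # True -> clustered together
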
    def _score_academic_source(self, paper):
        """Score academic source credibility (citation count, journal, recency)."""
        score = 0.0
        if 'citation_count' in paper:
            try:
                c = int(paper['citation_count'])
                score += min(5.0, 2.0 * (c ** 0.5))
            except (TypeError, ValueError):
                pass
        if 'year' in paper:
            try:
                y = int(paper['year'])
                if y >= 2022:
                    score += 2.0
                elif y >= 2018:
                    score += 1.0
            except (TypeError, ValueError):
                pass
        if 'journal' in paper and paper['journal']:
            score += 1.0
        return min(score, 10.0)

    def _score_web_source(self, web):
        """Score web source credibility (domain, recency, cross-references)."""
        score = 0.0
        url = web.get('url', '')
        if any(domain in url for domain in ['.edu', '.ac.', '.gov']):
            score += 3.0
        if any(domain in url.lower() for domain in ['ieee', 'springer', 'nature', 'acm', 'nejm', 'science.org', 'cell.com', 'wiley']):
            score += 2.0
        if 'date' in web:
            try:
                y = int(web['date'][:4])
                if y >= 2022:
                    score += 2.0
                elif y >= 2018:
                    score += 1.0
            except (TypeError, ValueError):
                pass
        if 'source' in web and web['source'] in ['web_search', 'bing_search']:
            score += 1.0
        return min(score, 10.0)
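
A rough sanity check of the credibility weights above (standalone sketch; the record and the shortened domain list are invented for the example):

# Illustrative sketch only: scoring a made-up web source with the same rules as _score_web_source.
web = {'url': 'https://www.nature.com/articles/example', 'date': '2024-03-01', 'source': 'web_search'}
score = 0.0
score += 2.0 if any(d in web['url'].lower() for d in ['ieee', 'springer', 'nature', 'acm']) else 0.0
score += 2.0 if int(web['date'][:4]) >= 2022 else 0.0
score += 1.0 if web.get('source') in ['web_search', 'bing_search'] else 0.0
print(min(score, 10.0))   # 5.0 -- the .edu/.ac./.gov bonus does not apply to this URL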