cite-agent: cite_agent-1.3.6-py3-none-any.whl → cite_agent-1.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cite-agent might be problematic. Click here for more details.

Files changed (36)
  1. cite_agent/__version__.py +1 -1
  2. cite_agent/cli.py +9 -2
  3. cite_agent/enhanced_ai_agent.py +332 -73
  4. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/METADATA +1 -1
  5. cite_agent-1.3.7.dist-info/RECORD +31 -0
  6. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/top_level.txt +0 -1
  7. cite_agent-1.3.6.dist-info/RECORD +0 -57
  8. src/__init__.py +0 -1
  9. src/services/__init__.py +0 -132
  10. src/services/auth_service/__init__.py +0 -3
  11. src/services/auth_service/auth_manager.py +0 -33
  12. src/services/graph/__init__.py +0 -1
  13. src/services/graph/knowledge_graph.py +0 -194
  14. src/services/llm_service/__init__.py +0 -5
  15. src/services/llm_service/llm_manager.py +0 -495
  16. src/services/paper_service/__init__.py +0 -5
  17. src/services/paper_service/openalex.py +0 -231
  18. src/services/performance_service/__init__.py +0 -1
  19. src/services/performance_service/rust_performance.py +0 -395
  20. src/services/research_service/__init__.py +0 -23
  21. src/services/research_service/chatbot.py +0 -2056
  22. src/services/research_service/citation_manager.py +0 -436
  23. src/services/research_service/context_manager.py +0 -1441
  24. src/services/research_service/conversation_manager.py +0 -597
  25. src/services/research_service/critical_paper_detector.py +0 -577
  26. src/services/research_service/enhanced_research.py +0 -121
  27. src/services/research_service/enhanced_synthesizer.py +0 -375
  28. src/services/research_service/query_generator.py +0 -777
  29. src/services/research_service/synthesizer.py +0 -1273
  30. src/services/search_service/__init__.py +0 -5
  31. src/services/search_service/indexer.py +0 -186
  32. src/services/search_service/search_engine.py +0 -342
  33. src/services/simple_enhanced_main.py +0 -287
  34. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/WHEEL +0 -0
  35. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/entry_points.txt +0 -0
  36. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/licenses/LICENSE +0 -0
@@ -1,1441 +0,0 @@
1
- #context_manager.py
2
-
3
- from typing import Dict, List, Optional, Any
4
- import asyncio
5
- from datetime import datetime, timezone
6
- import uuid
7
- import json
8
- import os
9
- import redis.asyncio as redis
10
- import logging
11
- from typing import Dict, Any, Optional, List
12
- import hashlib
13
-
14
- from ...utils.logger import logger, log_operation
15
- from ...storage.db.operations import DatabaseOperations
16
- from ...storage.db.models import ResearchSession
17
- from .synthesizer import ResearchSynthesizer
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- def _utc_now() -> datetime:
23
- return datetime.now(timezone.utc)
24
-
25
-
26
- def _utc_timestamp() -> str:
27
- return _utc_now().isoformat()
28
-
29
- class ResearchContextManager:
30
- """
31
- Manages research context and provides real-time streaming updates.
32
- """
33
-
34
- def __init__(self, db_ops: DatabaseOperations, synthesizer: ResearchSynthesizer, redis_url: str):
35
- self.db = db_ops
36
- self.synthesizer = synthesizer
37
- self.redis_client = redis.from_url(redis_url)
38
- self.active_sessions: Dict[str, Dict] = {}
39
- self.stream_subscribers: Dict[str, List[asyncio.Queue]] = {}
40
-
41
- async def create_research_session(self, user_id: str, topic: str, research_questions: List[str]) -> str:
42
- """Create a new research session with streaming support."""
43
- session_id = hashlib.md5(f"{user_id}_{topic}_{_utc_timestamp()}".encode()).hexdigest()
44
-
45
- session_data = {
46
- "id": session_id,
47
- "user_id": user_id,
48
- "topic": topic,
49
- "research_questions": json.dumps(research_questions), # Convert list to JSON string
50
- "status": "initialized",
51
- "progress": 0.0,
52
- "current_step": "Initializing research session",
53
- "papers": json.dumps([]), # Convert list to JSON string
54
- "notes": json.dumps([]), # Convert list to JSON string
55
- "created_at": _utc_timestamp(),
56
- "updated_at": _utc_timestamp(),
57
- "synthesis": "", # Convert None to empty string
58
- "error": "" # Convert None to empty string
59
- }
60
-
61
- # Store in Redis for persistence
62
- await self.redis_client.hset(f"research_session:{session_id}", mapping=session_data)
63
-
64
- # Store in memory with proper types
65
- self.active_sessions[session_id] = {
66
- "id": session_id,
67
- "user_id": user_id,
68
- "topic": topic,
69
- "research_questions": research_questions,
70
- "status": "initialized",
71
- "progress": 0.0,
72
- "current_step": "Initializing research session",
73
- "papers": [],
74
- "notes": [],
75
- "created_at": _utc_timestamp(),
76
- "updated_at": _utc_timestamp(),
77
- "synthesis": None,
78
- "error": None
79
- }
80
- self.stream_subscribers[session_id] = []
81
-
82
- # Send initial update
83
- await self._broadcast_update(session_id, {
84
- "type": "session_created",
85
- "session_id": session_id,
86
- "data": self.active_sessions[session_id]
87
- })
88
-
89
- logger.info(f"Created research session {session_id} for user {user_id}")
90
- return session_id
91
-
92
- async def update_session_status(self, session_id: str, status: str, message: str, progress: Optional[float] = None):
93
- """Update session status and broadcast to subscribers."""
94
- if session_id not in self.active_sessions:
95
- logger.warning(f"Session {session_id} not found")
96
- return
97
-
98
- session = self.active_sessions[session_id]
99
- session["status"] = status
100
- session["current_step"] = message
101
- session["updated_at"] = _utc_timestamp()
102
-
103
- if progress is not None:
104
- session["progress"] = progress
105
-
106
- # Update Redis with serialized data
107
- redis_data = {
108
- "status": status,
109
- "current_step": message,
110
- "updated_at": session["updated_at"],
111
- "papers": json.dumps(session["papers"]),
112
- "notes": json.dumps(session["notes"]),
113
- "research_questions": json.dumps(session["research_questions"])
114
- }
115
- if progress is not None:
116
- redis_data["progress"] = progress
117
-
118
- await self.redis_client.hset(f"research_session:{session_id}", mapping=redis_data)
119
-
120
- # Broadcast update
121
- update_data = {
122
- "type": "status_update",
123
- "session_id": session_id,
124
- "status": status,
125
- "message": message,
126
- "progress": session["progress"],
127
- "timestamp": _utc_timestamp()
128
- }
129
- await self._broadcast_update(session_id, update_data)
130
-
131
- logger.info(f"Session {session_id} status: {status} - {message}")
132
-
133
- async def add_paper_to_session(self, session_id: str, paper_id: str, paper_info: Dict[str, Any]):
134
- """Add a paper to the session and broadcast update."""
135
- if session_id not in self.active_sessions:
136
- return
137
-
138
- session = self.active_sessions[session_id]
139
- session["papers"].append(paper_id)
140
- session["updated_at"] = _utc_timestamp()
141
-
142
- # Update Redis
143
- await self.redis_client.hset(f"research_session:{session_id}", mapping=session)
144
-
145
- # Broadcast paper addition
146
- await self._broadcast_update(session_id, {
147
- "type": "paper_added",
148
- "session_id": session_id,
149
- "paper_id": paper_id,
150
- "paper_info": paper_info,
151
- "total_papers": len(session["papers"]),
152
- "timestamp": _utc_timestamp()
153
- })
154
-
155
- async def update_session_synthesis(self, session_id: str, synthesis: Dict[str, Any]):
156
- """Update session with synthesis results and broadcast."""
157
- if session_id not in self.active_sessions:
158
- return
159
-
160
- session = self.active_sessions[session_id]
161
- session["synthesis"] = synthesis
162
- session["status"] = "completed"
163
- session["progress"] = 100.0
164
- session["updated_at"] = _utc_timestamp()
165
-
166
- # Update Redis
167
- await self.redis_client.hset(f"research_session:{session_id}", mapping=session)
168
-
169
- # Broadcast synthesis completion
170
- await self._broadcast_update(session_id, {
171
- "type": "synthesis_complete",
172
- "session_id": session_id,
173
- "synthesis_summary": {
174
- "paper_count": synthesis.get("meta", {}).get("paper_count", 0),
175
- "findings_count": len(synthesis.get("common_findings", [])),
176
- "gaps_count": len(synthesis.get("research_gaps", [])),
177
- "contradictions_count": len(synthesis.get("contradictions", []))
178
- },
179
- "timestamp": _utc_timestamp()
180
- })
181
-
182
- async def subscribe_to_updates(self, session_id: str) -> asyncio.Queue:
183
- """Subscribe to real-time updates for a session."""
184
- if session_id not in self.stream_subscribers:
185
- self.stream_subscribers[session_id] = []
186
-
187
- queue = asyncio.Queue()
188
- self.stream_subscribers[session_id].append(queue)
189
-
190
- # Send current session state
191
- if session_id in self.active_sessions:
192
- await queue.put({
193
- "type": "session_state",
194
- "session_id": session_id,
195
- "data": self.active_sessions[session_id]
196
- })
197
-
198
- logger.info(f"New subscriber added to session {session_id}")
199
- return queue
200
-
201
- async def unsubscribe_from_updates(self, session_id: str, queue: asyncio.Queue):
202
- """Unsubscribe from session updates."""
203
- if session_id in self.stream_subscribers:
204
- try:
205
- self.stream_subscribers[session_id].remove(queue)
206
- except ValueError:
207
- pass
208
-
209
- async def _broadcast_update(self, session_id: str, update: Dict[str, Any]):
210
- """Broadcast update to all subscribers."""
211
- if session_id not in self.stream_subscribers:
212
- return
213
-
214
- # Send to Redis pub/sub for cross-process communication
215
- await self.redis_client.publish(f"research_updates:{session_id}", json.dumps(update))
216
-
217
- # Send to local subscribers
218
- dead_queues = []
219
- for queue in self.stream_subscribers[session_id]:
220
- try:
221
- await queue.put(update)
222
- except Exception as e:
223
- logger.warning(f"Failed to send update to subscriber: {e}")
224
- dead_queues.append(queue)
225
-
226
- # Clean up dead queues
227
- for queue in dead_queues:
228
- try:
229
- self.stream_subscribers[session_id].remove(queue)
230
- except ValueError:
231
- pass
232
-
233
- async def get_session_status(self, session_id: str) -> Optional[Dict[str, Any]]:
234
- """Get current session status."""
235
- if session_id in self.active_sessions:
236
- return self.active_sessions[session_id]
237
-
238
- # Try to load from Redis
239
- try:
240
- session_data = await self.redis_client.hgetall(f"research_session:{session_id}")
241
- if session_data:
242
- return session_data
243
- except Exception as e:
244
- logger.error(f"Error loading session from Redis: {e}")
245
-
246
- return None
247
-
248
- async def list_user_sessions(self, user_id: str) -> List[Dict[str, Any]]:
249
- """List all sessions for a user."""
250
- sessions = []
251
- for session_id, session_data in self.active_sessions.items():
252
- if session_data.get("user_id") == user_id:
253
- sessions.append(session_data)
254
- return sessions
255
-
256
- async def cleanup_session(self, session_id: str):
257
- """Clean up a session and its subscribers."""
258
- if session_id in self.active_sessions:
259
- del self.active_sessions[session_id]
260
-
261
- if session_id in self.stream_subscribers:
262
- # Cancel all subscribers
263
- for queue in self.stream_subscribers[session_id]:
264
- try:
265
- await queue.put(None) # Signal shutdown
266
- except Exception:
267
- pass
268
- del self.stream_subscribers[session_id]
269
-
270
- # Remove from Redis
271
- try:
272
- await self.redis_client.delete(f"research_session:{session_id}")
273
- except Exception as e:
274
- logger.error(f"Error cleaning up session from Redis: {e}")
275
-
276
- logger.info(f"Cleaned up session {session_id}")
277
-
278
- async def health_check(self) -> Dict[str, Any]:
279
- """Health check for the context manager."""
280
- try:
281
- active_sessions = len(self.active_sessions)
282
- total_subscribers = sum(len(subscribers) for subscribers in self.stream_subscribers.values())
283
-
284
- return {
285
- "status": "healthy",
286
- "active_sessions": active_sessions,
287
- "total_subscribers": total_subscribers,
288
- "timestamp": _utc_timestamp()
289
- }
290
- except Exception as e:
291
- return {
292
- "status": "error",
293
- "error": str(e),
294
- "timestamp": _utc_timestamp()
295
- }
296
-
297
- # ---------------- Caching & Reporting Utilities ----------------
298
- def _normalize_topic(self, text: str) -> str:
299
- text = (text or "").lower()
300
- cleaned = ''.join(ch if ch.isalnum() or ch.isspace() else ' ' for ch in text)
301
- tokens = [t for t in cleaned.split() if t]
302
- return ' '.join(sorted(tokens))
303
-
304
- def _cache_key(self, topic: str, questions: List[str]) -> str:
305
- norm_topic = self._normalize_topic(topic)
306
- norm_questions = [self._normalize_topic(q) for q in (questions or [])]
307
- return json.dumps({"t": norm_topic, "q": norm_questions}, sort_keys=True)
308
-
309
- async def _get_cached_synthesis(self, topic: str, questions: List[str]) -> Optional[Dict[str, Any]]:
310
- try:
311
- key = self._cache_key(topic, questions)
312
- data = await self.redis_client.hget("synthesis_cache", key)
313
- if not data:
314
- return None
315
- if isinstance(data, bytes):
316
- try:
317
- data = data.decode('utf-8', 'ignore')
318
- except Exception:
319
- return None
320
- return json.loads(data)
321
- except Exception:
322
- return None
323
-
324
- async def _store_synthesis_cache(self, topic: str, questions: List[str], synthesis: Dict[str, Any]):
325
- try:
326
- key = self._cache_key(topic, questions)
327
- def _convert(obj):
328
- if isinstance(obj, set):
329
- return list(obj)
330
- return obj
331
- payload = json.dumps(synthesis, ensure_ascii=False, default=_convert)
332
- await self.redis_client.hset("synthesis_cache", key, payload)
333
- except Exception:
334
- pass
335
-
336
- async def _find_similar_cached(self, topic: str, questions: List[str], min_score: float = 0.7) -> Optional[Dict[str, Any]]:
337
- """Find a semantically similar cached synthesis using simple token Jaccard overlap."""
338
- try:
339
- entries = await self.redis_client.hgetall("synthesis_cache")
340
- if not entries:
341
- return None
342
- target_tokens = set(self._normalize_topic(topic).split())
343
- best = None
344
- best_score = 0.0
345
- for key, val in entries.items():
346
- try:
347
- if isinstance(key, bytes):
348
- key_str = key.decode('utf-8', 'ignore')
349
- else:
350
- key_str = str(key)
351
- parsed = json.loads(key_str)
352
- cached_topic = parsed.get('t', '')
353
- tokens = set(str(cached_topic).split())
354
- inter = len(tokens & target_tokens)
355
- union = len(tokens | target_tokens) or 1
356
- score = inter / union
357
- if score > best_score:
358
- best_score = score
359
- best = val
360
- except Exception:
361
- continue
362
- if best_score >= min_score and best is not None:
363
- data = best.decode('utf-8', 'ignore') if isinstance(best, bytes) else str(best)
364
- return json.loads(data)
365
- except Exception:
366
- return None
367
- return None
368
-
369
- async def _write_reports(self, session_id: str, topic: str, synthesis: Dict[str, Any]):
370
- try:
371
- reports_dir = os.path.join(os.getcwd(), "reports")
372
- os.makedirs(reports_dir, exist_ok=True)
373
- # JSON report
374
- json_path = os.path.join(reports_dir, f"{session_id}.json")
375
- with open(json_path, 'w', encoding='utf-8') as f:
376
- json.dump(synthesis, f, ensure_ascii=False, indent=2)
377
- # Markdown report
378
- md_path = os.path.join(reports_dir, f"{session_id}.md")
379
- summary = synthesis.get('summary') or synthesis.get('synthesis') or ""
380
- body = json.dumps({k: v for k, v in synthesis.items() if k not in ['summary', 'synthesis']}, ensure_ascii=False, indent=2)
381
- md_lines = [
382
- f"# Research Report: {topic}",
383
- "",
384
- f"Generated: {_utc_timestamp()}Z",
385
- "",
386
- "## Summary",
387
- summary if isinstance(summary, str) else json.dumps(summary, ensure_ascii=False, indent=2),
388
- "",
389
- "## Details",
390
- body
391
- ]
392
- with open(md_path, 'w', encoding='utf-8') as f:
393
- f.write("\n".join(md_lines))
394
- except Exception:
395
- pass
396
-
397
- @log_operation("create_session")
398
- async def create_session(self, topic: str, context: Optional[Dict] = None, user_id: str = "default_user") -> str:
399
- """Create a new research session with enhanced context handling."""
400
- #logger.info(f"Creating research session for topic: {topic}")
401
-
402
- session_id = str(uuid.uuid4())
403
- session = ResearchSession(
404
- id=session_id,
405
- user_id=user_id,
406
- topic=topic,
407
- context=context or {},
408
- created_at=_utc_now(),
409
- updated_at=_utc_now(),
410
- status="initializing",
411
- papers=[],
412
- notes=[],
413
- progress={
414
- "stage": "created",
415
- "percentage": 0,
416
- "papers_found": 0,
417
- "papers_processed": 0
418
- }
419
- )
420
-
421
- # Store in database and cache
422
- await self.db.store_research_session(session.dict())
423
- self.active_sessions[session_id] = session
424
-
425
- # Start background processing
426
- self.session_tasks[session_id] = asyncio.create_task(
427
- self._process_session(session_id)
428
- )
429
-
430
- #logger.info(f"Created research session: {session_id}")
431
- return session_id
432
-
433
- @log_operation("process_session")
434
- async def _process_session(self, session_id: str):
435
- """Handle long-running session processing."""
436
- try:
437
- session = self.active_sessions[session_id]
438
-
439
- # Update status
440
- session.status = "searching"
441
- await self._update_session(session)
442
-
443
- # Search for papers
444
- papers = await self._search_papers(session.topic, session.context)
445
- session.papers = [p["id"] for p in papers]
446
- session.progress["papers_found"] = len(papers)
447
- await self._update_session(session)
448
-
449
- # Process papers
450
- for paper in papers:
451
- await self._queue_paper_processing(session_id, paper)
452
- session.progress["papers_processed"] += 1
453
- await self._update_session(session)
454
-
455
- # After synthesis, collect citations
456
- if session.papers:
457
- session.status = "synthesizing"
458
- await self._update_session(session)
459
- synthesis = await self.synthesizer.synthesize_papers(session.papers)
460
- session.synthesis = synthesis
461
- # Collect citations from synthesis
462
- session.citations = synthesis.get('citations', [])
463
-
464
- session.status = "completed"
465
- session.progress["percentage"] = 100
466
- await self._update_session(session)
467
-
468
- except Exception as e:
469
- logger.error(f"Error processing session {session_id}: {str(e)}")
470
- session.status = "error"
471
- session.error = str(e)
472
- await self._update_session(session)
473
-
474
- async def _search_papers(self, topic: str, context: Dict) -> List[Dict]:
475
- """Search for relevant papers with parallelization, deduplication, and ranking."""
476
- import heapq
477
- import operator
478
- #logger.info(f"Searching for papers on: {topic}")
479
- papers = []
480
- errors = {}
481
- try:
482
- # Gather from multiple sources in parallel
483
- from src.services.paper_service.openalex import OpenAlexClient
484
- from src.services.paper_service.paper_access import PaperAccessManager
485
- # Optionally add more sources here
486
- async with OpenAlexClient() as openalex, PaperAccessManager() as pam:
487
- tasks = [
488
- openalex.search_works(topic, per_page=15),
489
- # Add more sources here as needed, e.g. pam.semantic_scholar_search(topic), pam.core_search(topic)
490
- ]
491
- results = await asyncio.gather(*tasks, return_exceptions=True)
492
- for res in results:
493
- if isinstance(res, Exception):
494
- errors[type(res).__name__] = str(res)
495
- continue
496
- if isinstance(res, dict) and res.get('results'):
497
- for paper in res['results']:
498
- papers.append(paper)
499
- elif isinstance(res, list):
500
- papers.extend(res)
501
- except Exception as e:
502
- logger.warning(f"Paper search failed: {str(e)}")
503
- errors[type(e).__name__] = str(e)
504
- # Deduplicate by DOI or title
505
- seen = set()
506
- deduped = []
507
- for paper in papers:
508
- doi = paper.get('doi') or paper.get('id')
509
- title = paper.get('title', '').lower()
510
- key = (doi, title)
511
- if key in seen:
512
- continue
513
- seen.add(key)
514
- deduped.append(paper)
515
- # Rank by relevance, citation count, recency
516
- def paper_score(p):
517
- score = 0
518
- # Prefer more recent
519
- year = p.get('year') or p.get('publication_year') or 0
520
- score += (int(year) if year else 0) * 1.0
521
- # Prefer more citations
522
- score += (p.get('citations') or p.get('cited_by_count') or 0) * 2.0
523
- # Prefer open access
524
- if p.get('open_access') or (p.get('open_access', {}).get('is_oa', False)):
525
- score += 10
526
- return score
527
- deduped.sort(key=paper_score, reverse=True)
528
- # If nothing found, fall back to mock papers
529
- if not deduped:
530
- #logger.info(f"Using mock papers for topic: {topic}")
531
- return [
532
- {
533
- 'id': f"mock-paper-1-{uuid.uuid4()}",
534
- 'title': f"Advances in {topic}",
535
- 'doi': "https://doi.org/10.1234/mock.123",
536
- 'authors': ["A. Researcher", "B. Scientist", "C. Professor"],
537
- 'year': 2024,
538
- 'citations': 42,
539
- 'open_access': True,
540
- 'abstract': f"This paper explores recent developments in {topic}, focusing on practical applications.",
541
- },
542
- {
543
- 'id': f"mock-paper-2-{uuid.uuid4()}",
544
- 'title': f"Review of {topic} Technologies",
545
- 'doi': "https://doi.org/10.5678/mock.456",
546
- 'authors': ["D. Expert", "E. Analyst"],
547
- 'year': 2023,
548
- 'citations': 28,
549
- 'open_access': True,
550
- 'abstract': f"A comprehensive review of current {topic} technologies and methodologies.",
551
- },
552
- {
553
- 'id': f"mock-paper-3-{uuid.uuid4()}",
554
- 'title': f"Future Directions in {topic}",
555
- 'doi': "https://doi.org/10.9012/mock.789",
556
- 'authors': ["F. Visionary", "G. Pioneer"],
557
- 'year': 2024,
558
- 'citations': 15,
559
- 'open_access': True,
560
- 'abstract': f"This forward-looking paper examines emerging trends and future directions in {topic}.",
561
- }
562
- ]
563
- return deduped[:20] # Limit to top 20 papers
564
-
565
- def _extract_abstract(self, paper):
566
- """Extract abstract from OpenAlex format"""
567
- abstract_index = paper.get("abstract_inverted_index")
568
- if not abstract_index:
569
- return "No abstract available"
570
-
571
- # Reconstruct from inverted index
572
- words = [""] * 300
573
- for word, positions in abstract_index.items():
574
- for pos in positions:
575
- if pos < len(words):
576
- words[pos] = word
577
-
578
- abstract = " ".join(word for word in words if word).strip()
579
- return abstract[:800] + "..." if len(abstract) > 800 else abstract
580
-
581
- async def _queue_paper_processing(self, session_id: str, paper: Dict):
582
- """Queue paper for processing with session context."""
583
- try:
584
- processing_request = {
585
- "session_id": session_id,
586
- "paper": paper,
587
- "timestamp": _utc_timestamp()
588
- }
589
-
590
- await self.redis_client.lpush(
591
- "processing_queue",
592
- json.dumps(processing_request)
593
- )
594
-
595
- except Exception as e:
596
- logger.error(f"Error queuing paper for processing: {str(e)}")
597
- raise
598
-
599
- @log_operation("get_session_status")
600
- async def get_session_status(self, session_id: str) -> Dict:
601
- """Get detailed session status."""
602
- session = await self._get_session(session_id)
603
- if not session:
604
- return {"error": "Session not found"}
605
-
606
- # Get processing progress
607
- progress = await self.redis_client.hgetall(f"progress:{session_id}")
608
-
609
- return {
610
- "id": session_id,
611
- "status": session.status,
612
- "progress": session.progress,
613
- "papers_total": len(session.papers),
614
- "processing_stage": progress.get("stage", "unknown"),
615
- "processing_percentage": progress.get("percentage", 0),
616
- "last_updated": session.updated_at.isoformat(),
617
- "error": session.error if hasattr(session, 'error') else None
618
- }
619
-
620
- @log_operation("add_note")
621
- async def add_note(self, session_id: str, note: Dict) -> bool:
622
- """Add a structured note to research session."""
623
- session = await self._get_session(session_id)
624
- if not session:
625
- return False
626
-
627
- note_entry = {
628
- "id": str(uuid.uuid4()),
629
- "content": note.get("content"),
630
- "type": note.get("type", "general"),
631
- "references": note.get("references", []),
632
- "created_at": _utc_timestamp()
633
- }
634
-
635
- session.notes.append(note_entry)
636
- session.updated_at = _utc_now()
637
-
638
- await self._update_session(session)
639
- return True
640
-
641
- async def _update_session(self, session: ResearchSession):
642
- """Update session in database and cache."""
643
- session.updated_at = _utc_now()
644
- await self.db.update_research_session(session.dict())
645
- self.active_sessions[session.id] = session
646
-
647
- # Prepare Redis-compatible mapping
648
- redis_mapping = {}
649
- for key, value in session.dict().items():
650
- if value is None:
651
- # Convert None to empty string for Redis
652
- redis_mapping[key] = ""
653
- elif isinstance(value, datetime):
654
- # Convert datetime to ISO format string
655
- redis_mapping[key] = value.isoformat()
656
- elif isinstance(value, (dict, list)):
657
- # Serialize complex types
658
- redis_mapping[key] = json.dumps(value)
659
- else:
660
- redis_mapping[key] = value
661
-
662
- # Update Redis cache
663
- try:
664
- await self.redis_client.hset(
665
- f"session:{session.id}",
666
- mapping=redis_mapping # Use serialized mapping
667
- )
668
- except Exception as e:
669
- logger.error(f"Error updating Redis cache: {str(e)}")
670
-
671
- async def _get_session(self, session_id: str) -> Optional[ResearchSession]:
672
- """Get research session by ID with caching."""
673
- # Check memory cache first
674
- if session_id in self.active_sessions:
675
- return self.active_sessions[session_id]
676
-
677
- # Try Redis cache
678
- session_data = await self.redis_client.hgetall(f"session:{session_id}")
679
- if session_data:
680
- # Convert JSON strings and datetime strings back to Python objects
681
- for key, value in session_data.items():
682
- if key in ['progress', 'context', 'synthesis']:
683
- try:
684
- session_data[key] = json.loads(value)
685
- except (json.JSONDecodeError, TypeError):
686
- pass
687
- elif key in ['created_at', 'updated_at'] and isinstance(value, str):
688
- try:
689
- session_data[key] = datetime.fromisoformat(value)
690
- except ValueError:
691
- pass
692
-
693
- session = ResearchSession(**session_data)
694
- self.active_sessions[session_id] = session
695
- return session
696
-
697
- # Finally try database
698
- session_data = await self.db.get_research_session(session_id)
699
- if session_data:
700
- session = ResearchSession(**session_data)
701
- self.active_sessions[session_id] = session
702
-
703
- # Update cache
704
- await self.redis_client.hset(
705
- f"session:{session_id}",
706
- mapping=session.dict()
707
- )
708
- return session
709
-
710
- return None
711
-
712
- async def cleanup(self):
713
- """Cleanup manager resources."""
714
- try:
715
- # Cancel all running tasks
716
- for task in self.session_tasks.values():
717
- task.cancel()
718
-
719
- # Wait for tasks to complete
720
- await asyncio.gather(*self.session_tasks.values(), return_exceptions=True)
721
-
722
- # Close Redis connection
723
- await self.redis_client.close()
724
-
725
- except Exception as e:
726
- logger.error(f"Error during cleanup: {str(e)}")
727
-
728
- @log_operation("start_layered_research")
729
- async def start_layered_research(self, topic: str, research_questions: List[str], max_layers: int = 3, user_id: str = "default_user") -> str:
730
- """Start a multi-layered research process that explores connected concepts"""
731
- #logger.info(f"Starting layered research on: {topic}")
732
-
733
- # Create a new session
734
- session_id = str(uuid.uuid4())
735
- session = ResearchSession(
736
- id=session_id,
737
- topic=topic,
738
- user_id=user_id,
739
- context={
740
- "research_type": "layered",
741
- "questions": research_questions,
742
- "max_layers": max_layers,
743
- "current_layer": 0,
744
- "explored_concepts": [topic],
745
- "discovered_concepts": []
746
- },
747
- created_at=_utc_now(),
748
- updated_at=_utc_now(),
749
- status="initializing",
750
- papers=[],
751
- notes=[],
752
- progress={
753
- "stage": "created",
754
- "percentage": 0,
755
- "web_sources_found": 0,
756
- "papers_found": 0,
757
- "papers_processed": 0
758
- }
759
- )
760
-
761
- # Store in database and cache
762
- await self.db.store_research_session(session.dict())
763
- self.active_sessions[session_id] = session
764
-
765
- # Serve from cache if available
766
- cached = await self._get_cached_synthesis(topic, research_questions)
767
- if cached:
768
- session.synthesis = cached
769
- session.status = "completed"
770
- session.progress["percentage"] = 100
771
- await self._update_session(session)
772
- else:
773
- # Start layered research process
774
- self.session_tasks[session_id] = asyncio.create_task(
775
- self._process_layered_research(session_id)
776
- )
777
-
778
- #logger.info(f"Created layered research session: {session_id}")
779
- return session_id
780
-
781
- async def _process_layered_research(self, session_id: str):
782
- """Process a layered research session with LLM-powered query generation, semantic clustering, and source scoring."""
783
- try:
784
- session = self.active_sessions[session_id]
785
- session.status = "exploring_main_topic"
786
- await self._update_session(session)
787
-
788
- import time
789
- SATURATION_CRITERIA = {
790
- 'min_papers': 10,
791
- 'min_web_sources': 15,
792
- 'novelty_threshold': 0.15,
793
- 'time_budget': 1800, # 30 min
794
- 'ideal_paper_range': (20, 100),
795
- }
796
- state = {
797
- 'web_sources': [],
798
- 'academic_papers': [],
799
- 'key_insights': set(),
800
- 'information_graph': {},
801
- 'saturation_score': 0,
802
- 'start_time': time.time(),
803
- 'diversity_metrics': {},
804
- 'contradictions': [],
805
- 'gap_queries': set(),
806
- 'clusters': [],
807
- 'source_scores': {},
808
- }
809
- topic = session.topic
810
- layer = 1
811
- max_layers = session.context.get("max_layers", 3)
812
- concepts_to_explore = [topic]
813
- explored_concepts = set()
814
- gap_queries = set()
815
- while True:
816
- if not concepts_to_explore or layer > max_layers:
817
- break
818
- concept = concepts_to_explore.pop(0)
819
- if concept in explored_concepts:
820
- continue
821
- explored_concepts.add(concept)
822
- session.context["current_layer"] = layer
823
- session.context.setdefault("explored_concepts", []).append(concept)
824
- session.progress["percentage"] = min(80, int((layer / max_layers) * 80))
825
- session.status = f"exploring_layer_{layer}"
826
- await self._update_session(session)
827
-
828
- # --- Parallel search ---
829
- web_results, papers = await asyncio.gather(
830
- self._search_web_sources(concept),
831
- self._search_papers(concept, session.context)
832
- )
833
- # Fetch web content for important results
834
- web_content_tasks = []
835
- for result in web_results:
836
- if self._is_important_result(result):
837
- task = asyncio.create_task(self._fetch_web_content(result.get("url", "")))
838
- web_content_tasks.append((result, task))
839
- for result, task in web_content_tasks:
840
- try:
841
- content = await task
842
- result["extracted_content"] = content
843
- except Exception as e:
844
- result["extracted_content"] = f"Failed to extract: {str(e)}"
845
- # --- Source credibility scoring ---
846
- for paper in papers:
847
- score = self._score_academic_source(paper)
848
- state['source_scores'][paper.get('id')] = score
849
- for web in web_results:
850
- score = self._score_web_source(web)
851
- state['source_scores'][web.get('url')] = score
852
- # Deduplicate web results by (url, title)
853
- seen = set()
854
- uniq_web = []
855
- for w in web_results:
856
- key = (w.get('url','').strip().lower(), w.get('title','').strip().lower())
857
- if key in seen:
858
- continue
859
- seen.add(key)
860
- uniq_web.append(w)
861
- web_results = uniq_web
862
- # Update state
863
- state['web_sources'].extend(web_results)
864
- state['academic_papers'].extend(papers)
865
- session.context.setdefault("web_sources", []).extend(web_results)
866
- session.papers.extend([p["id"] for p in papers])
867
- session.progress["web_sources_found"] += len(web_results)
868
- session.progress["papers_found"] += len(papers)
869
- await self._update_session(session)
870
- # --- Extract insights ---
871
- new_insights_web = self._extract_insights_from_web(web_results)
872
- new_insights_papers = self._extract_insights_from_papers(papers)
873
- # --- Semantic clustering ---
874
- state['clusters'] = self._cluster_insights(new_insights_web | new_insights_papers)
875
- # --- Calculate novelty ---
876
- web_novelty = self._calculate_novelty(new_insights_web, state['key_insights'])
877
- paper_novelty = self._calculate_novelty(new_insights_papers, state['key_insights'])
878
- # --- Update key insights ---
879
- state['key_insights'].update(new_insights_web)
880
- state['key_insights'].update(new_insights_papers)
881
- # --- Diversity & contradiction analysis ---
882
- state['diversity_metrics'] = self._analyze_diversity(state['academic_papers'], state['web_sources'])
883
- state['contradictions'].extend(self._detect_contradictions(new_insights_web, new_insights_papers))
884
- # --- Gap analysis & LLM-powered query refinement ---
885
- new_gap_queries = self._generate_gap_queries(state, concept)
886
- # Use LLM to suggest additional queries
887
- llm_gap_queries = await self._llm_generate_gap_queries(state, concept)
888
- for q in new_gap_queries.union(llm_gap_queries):
889
- if q not in gap_queries:
890
- gap_queries.add(q)
891
- concepts_to_explore.append(q)
892
- # --- Calculate saturation ---
893
- state['saturation_score'] = self._calculate_saturation(
894
- web_novelty, paper_novelty,
895
- len(state['academic_papers']), len(state['web_sources'])
896
- )
897
- # --- Coverage checks ---
898
- coverage_met = (
899
- len(state['academic_papers']) >= SATURATION_CRITERIA['min_papers'] and
900
- len(state['web_sources']) >= SATURATION_CRITERIA['min_web_sources']
901
- )
902
- time_elapsed = time.time() - state['start_time']
903
- # --- Stopping conditions ---
904
- if (
905
- (web_novelty < SATURATION_CRITERIA['novelty_threshold'] and paper_novelty < SATURATION_CRITERIA['novelty_threshold']) or
906
- state['saturation_score'] >= 0.95 or
907
- coverage_met or
908
- time_elapsed > SATURATION_CRITERIA['time_budget']
909
- ):
910
- break
911
- layer += 1
912
- # --- Synthesis ---
913
- session.status = "synthesizing"
914
- await self._update_session(session)
915
- synthesis = await self._generate_layered_synthesis(session_id)
916
- # Attach advanced metrics to synthesis
917
- synthesis['diversity_metrics'] = state['diversity_metrics']
918
- synthesis['contradictions'] = state['contradictions']
919
- synthesis['gap_queries'] = list(gap_queries)
920
- synthesis['clusters'] = state['clusters']
921
- synthesis['source_scores'] = state['source_scores']
922
- session.synthesis = synthesis
923
- session.status = "completed"
924
- session.progress["percentage"] = 100
925
- await self._update_session(session)
926
- # Persist for reuse and create reports
927
- await self._store_synthesis_cache(session.topic, session.context.get('questions', []), synthesis)
928
- await self._write_reports(session_id, session.topic, synthesis)
929
- except Exception as e:
930
- logger.error(f"Error in layered research {session_id}: {str(e)}")
931
- session.status = "error"
932
- session.error = str(e)
933
- await self._update_session(session)
934
-
935
async def _explore_concept(self, session_id: str, concept: str, layer: int):
    """Recursively explore one concept of a layered research session.

    Searches web and academic sources for ``concept`` in parallel, fetches
    full page content for important web hits, queues papers for processing,
    then discovers related concepts and recurses into them one layer deeper.

    Args:
        session_id: Key into ``self.active_sessions``.
        concept: The concept/topic to search at this layer.
        layer: Current depth (1-based); recursion stops past ``max_layers``.
    """
    session = self.active_sessions[session_id]
    max_layers = session.context.get("max_layers", 3)

    # Don't explore beyond max layers or already explored concepts
    if layer > max_layers or concept in session.context.get("explored_concepts", []):
        return

    #logger.info(f"Exploring concept '{concept}' at layer {layer} for session {session_id}")

    # Update session state
    session.context["current_layer"] = layer
    # FIX: .get("explored_concepts", []) returned a throwaway list when the
    # key was missing, so the concept was silently dropped and could be
    # re-explored; setdefault stores the list on the context first.
    session.context.setdefault("explored_concepts", []).append(concept)
    session.progress["percentage"] = min(80, int((layer / max_layers) * 80))  # Reserve 20% for synthesis
    session.status = f"exploring_layer_{layer}"
    await self._update_session(session)

    # Search for both web and academic sources concurrently
    web_results, papers = await asyncio.gather(
        self._search_web_sources(concept),
        self._search_papers(concept, session.context)
    )

    # Only fetch full page content for results that look worth the cost
    web_content_tasks = []
    for result in web_results:
        if self._is_important_result(result):
            task = asyncio.create_task(self._fetch_web_content(result.get("url", "")))
            web_content_tasks.append((result, task))

    # Wait for content fetching to complete; a failed fetch is recorded on
    # the result rather than aborting the whole layer
    for result, task in web_content_tasks:
        try:
            content = await task
            result["extracted_content"] = content
        except Exception as e:
            logger.error(f"Error fetching content: {str(e)}")
            result["extracted_content"] = f"Failed to extract: {str(e)}"

    # Update session with new sources
    if "web_sources" not in session.context:
        session.context["web_sources"] = []
    session.context["web_sources"].extend(web_results)
    session.progress["web_sources_found"] += len(web_results)

    # Add papers to session
    paper_ids = [p["id"] for p in papers]
    session.papers.extend(paper_ids)
    session.progress["papers_found"] += len(papers)
    await self._update_session(session)

    # Process papers to extract information
    for paper in papers:
        await self._queue_paper_processing(session_id, paper)
        session.progress["papers_processed"] += 1
        await self._update_session(session)

    # Discover new concepts from current sources
    new_concepts = await self._discover_related_concepts(session_id, concept, web_results, papers)

    # Store discovered concepts
    if "discovered_concepts" not in session.context:
        session.context["discovered_concepts"] = []

    concept_layer = {
        "concept": concept,
        "layer": layer,
        "related_concepts": new_concepts
    }
    session.context["discovered_concepts"].append(concept_layer)
    await self._update_session(session)

    # Recursively explore new concepts in next layer
    for new_concept in new_concepts[:3]:  # Limit to top 3 concepts per layer
        await self._explore_concept(session_id, new_concept, layer + 1)
1010
-
1011
async def _search_web_sources(self, concept: str) -> List[Dict]:
    """Search the web for *concept*: Google first, Bing as fallback, then mocks."""
    # Deterministic fallback used when every real provider fails
    fallback = [
        {
            "title": f"Overview of {concept}",
            "url": "https://example.com/mock1",
            "snippet": f"This is a comprehensive overview of {concept} and its applications in scientific research.",
            "source": "mock"
        },
        {
            "title": f"Recent developments in {concept}",
            "url": "https://example.com/mock2",
            "snippet": f"Recent studies have shown significant progress in the field of {concept}.",
            "source": "mock"
        }
    ]

    try:
        # Providers in preference order: Google Custom Search, then Bing
        for provider in (self._try_google_search, self._try_bing_search):
            hits = await provider(concept)
            if hits:
                return hits

        logger.warning(f"All web searches failed - using mock results for: {concept}")
        return fallback
    except Exception as e:
        logger.error(f"Web search error: {str(e)}")
        return fallback
1046
-
1047
async def _try_google_search(self, query: str, num_results: int = 10) -> List[Dict]:
    """Try to search using Google Custom Search API"""
    import os
    import aiohttp

    # Support multiple common env var names for compatibility
    key_candidates = ("GOOGLE_SEARCH_API_KEY", "GOOGLE_CUSTOM_SEARCH_API_KEY", "GOOGLE_API_KEY")
    cx_candidates = ("GOOGLE_SEARCH_CX", "GOOGLE_SEARCH_ENGINE_ID", "GOOGLE_SEARCH_ENGINE_CX")
    api_key = next(filter(None, (os.environ.get(name) for name in key_candidates)), None)
    cx = next(filter(None, (os.environ.get(name) for name in cx_candidates)), None)

    if not api_key or not cx:
        logger.warning("Google Search API credentials not configured")
        return []

    endpoint = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": api_key,
        "cx": cx,
        "q": query,
        "num": min(num_results, 10)  # API caps a single page at 10 items
    }

    # Up to three attempts; linear backoff only after an exception
    for attempt in range(3):
        try:
            async with aiohttp.ClientSession() as http:
                async with http.get(endpoint, params=params, timeout=10) as response:
                    if response.status != 200:
                        logger.warning(f"Google search failed with status {response.status}")
                    else:
                        payload = await response.json()
                        return [
                            {
                                "title": item.get("title"),
                                "url": item.get("link"),
                                "snippet": item.get("snippet"),
                                "source": "google_search"
                            }
                            for item in payload.get("items", [])
                        ]
        except Exception as e:
            logger.error(f"Google search error (attempt {attempt+1}): {str(e)}")
            await asyncio.sleep(1.0 * (attempt + 1))
    return []
1098
-
1099
async def _try_bing_search(self, query: str, num_results: int = 10) -> List[Dict]:
    """Try to search using Bing Search API.

    Reads ``BING_SEARCH_API_KEY`` from the environment. Returns a list of
    result dicts (title/url/snippet/source) or ``[]`` when the key is
    missing, on a non-200 response, or on any exception.
    """
    import os
    import aiohttp

    api_key = os.environ.get("BING_SEARCH_API_KEY")

    if not api_key:
        logger.warning("Bing Search API key not configured")
        return []

    url = "https://api.bing.microsoft.com/v7.0/search"
    headers = {"Ocp-Apim-Subscription-Key": api_key}
    params = {
        "q": query,
        "count": min(num_results, 50),  # Bing caps a single page at 50
        "responseFilter": "Webpages"
    }

    try:
        async with aiohttp.ClientSession() as session:
            # FIX: add a timeout so a stalled request cannot hang the
            # research pipeline (mirrors _try_google_search's timeout=10).
            async with session.get(url, headers=headers, params=params, timeout=10) as response:
                if response.status == 200:
                    data = await response.json()

                    results = []
                    for item in data.get("webPages", {}).get("value", []):
                        results.append({
                            "title": item.get("name"),
                            "url": item.get("url"),
                            "snippet": item.get("snippet"),
                            "source": "bing_search"
                        })
                    return results
                else:
                    logger.warning(f"Bing search failed with status {response.status}")
                    return []
    except Exception as e:
        logger.error(f"Bing search error: {str(e)}")
        return []
1139
-
1140
async def _fetch_web_content(self, url: str) -> str:
    """Fetch a web page and extract its main readable text.

    Retries up to 3 times with linear backoff on exceptions. Always
    returns a string: the cleaned page text (truncated to ~5000 chars),
    or a human-readable error message on failure — callers store the
    return value directly as ``extracted_content``.
    """
    import aiohttp
    from bs4 import BeautifulSoup

    for attempt in range(3):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=10) as response:
                    if response.status == 200:
                        html = await response.text()
                        # Parse with BeautifulSoup
                        soup = BeautifulSoup(html, 'html.parser')
                        # Remove non-content elements (boilerplate chrome)
                        for element in soup(["script", "style", "header", "footer", "nav"]):
                            element.decompose()
                        # Get text content
                        text = soup.get_text(separator='\n')
                        # Clean up text: strip each line, then split runs of
                        # double spaces so multi-headline lines become one
                        # phrase per line, and drop empty chunks
                        lines = (line.strip() for line in text.splitlines())
                        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
                        text = '\n'.join(chunk for chunk in chunks if chunk)
                        # Truncate if too long
                        if len(text) > 5000:
                            text = text[:5000] + "..."
                        return text
                    else:
                        # Non-200: retry silently; only surface the HTTP
                        # error message on the final attempt
                        msg = f"Failed to fetch content: HTTP {response.status}"
                        if attempt == 2:
                            return msg
        except Exception as e:
            logger.error(f"Error fetching content from {url} (attempt {attempt+1}): {str(e)}")
            await asyncio.sleep(1.0 * (attempt + 1))
    return "Error fetching content after retries"
1174
-
1175
- def _is_important_result(self, result: Dict) -> bool:
1176
- """Determine if a web result is important enough to fetch full content"""
1177
- important_keywords = ["research", "study", "analysis", "review", "journal"]
1178
- return any(keyword in result.get("title", "").lower() for keyword in important_keywords)
1179
-
1180
async def _discover_related_concepts(self, session_id: str, concept: str,
                                     web_results: List[Dict], papers: List[Dict]) -> List[str]:
    """Discover related concepts from current research sources.

    Builds a prompt from the session's research questions plus summaries of
    the supplied web results and papers, asks the LLM for follow-up
    concepts, and returns at most 5 of them. Falls back to fixed mock
    concepts when no LLM manager is configured.
    """
    session = self.active_sessions[session_id]

    # Check if llm_manager is available
    if not hasattr(self, 'llm_manager') or self.llm_manager is None:
        logger.warning(f"No LLM manager available - returning mock concepts for: {concept}")
        # Return mock concepts for testing without LLM
        return ["Quantum algorithms", "Molecular modeling", "Drug discovery optimization"]

    # FIX: removed dead code — an EnhancedQueryGenerator was imported and
    # instantiated here but never used; the LLM is prompted directly below.

    # Extract context from web results (prefer fetched content over snippet)
    web_context = []
    for result in web_results:
        if "extracted_content" in result:
            web_context.append(f"Title: {result['title']}\nContent: {result['extracted_content'][:500]}...")
        else:
            web_context.append(f"Title: {result['title']}\nSnippet: {result['snippet']}")

    # Extract context from papers
    paper_context = []
    for paper in papers:
        paper_context.append(f"Title: {paper.get('title', '')}\nAuthors: {', '.join(paper.get('authors', []))}")

    # Generate concepts
    concepts_prompt = f"""
Based on research about "{concept}", identify 3-5 important related concepts that should be explored next.

RESEARCH QUESTIONS:
{json.dumps(session.context.get('questions', []))}

WEB SOURCES:
{json.dumps(web_context)}

ACADEMIC PAPERS:
{json.dumps(paper_context)}

Identify concepts that:
1. Are mentioned across multiple sources
2. Appear important but not fully explained
3. Would deepen understanding of the main topic
4. Are distinct from the current focus

Return ONLY a list of concepts, one per line.
"""

    # Get concepts from LLM and keep one concept per non-empty line
    response = await self.llm_manager.generate_text(concepts_prompt)
    concepts = [c.strip() for c in response.split('\n') if c.strip()]

    return concepts[:5]  # Limit to 5 concepts
1235
-
1236
async def _generate_layered_synthesis(self, session_id: str) -> Dict:
    """Generate comprehensive synthesis for layered research"""
    session = self.active_sessions[session_id]

    # Gather the raw material: web sources, processed papers, concept layers
    web_sources = session.context.get("web_sources", [])

    processed_papers = []
    for paper_id in session.papers:
        record = await self.db.get_processed_paper(paper_id)
        if record:
            processed_papers.append(record)

    concept_layers = session.context.get("discovered_concepts", [])

    # Delegate the actual synthesis to the configured synthesizer
    synthesis = await self.synthesizer.synthesize_layered_research(
        session.topic,
        session.context.get("questions", []),
        web_sources,
        processed_papers,
        concept_layers
    )

    # Attach pointers to the report files written for this session
    synthesis['artifacts'] = {
        'report_json': f"/reports/{session_id}.json",
        'report_markdown': f"/reports/{session_id}.md"
    }

    return synthesis
1268
-
1269
- def _extract_insights_from_web(self, web_results):
1270
- """Extract key insights from web results."""
1271
- insights = set()
1272
- for result in web_results:
1273
- snippet = result.get("snippet") or result.get("extracted_content")
1274
- if snippet:
1275
- for sent in snippet.split(". "):
1276
- if len(sent.split()) > 6:
1277
- insights.add(sent.strip())
1278
- return insights
1279
-
1280
- def _extract_insights_from_papers(self, papers):
1281
- """Extract key insights from academic papers."""
1282
- insights = set()
1283
- for paper in papers:
1284
- abstract = paper.get("abstract")
1285
- if abstract:
1286
- for sent in abstract.split(". "):
1287
- if len(sent.split()) > 6:
1288
- insights.add(sent.strip())
1289
- return insights
1290
-
1291
- def _calculate_novelty(self, new_insights, known_insights):
1292
- """Calculate novelty as the fraction of new insights."""
1293
- if not new_insights:
1294
- return 0.0
1295
- new = set(new_insights) - set(known_insights)
1296
- return len(new) / max(1, len(new_insights))
1297
-
1298
- def _calculate_saturation(self, web_novelty, paper_novelty, num_papers, num_web):
1299
- """Calculate a saturation score based on novelty and coverage."""
1300
- novelty_score = 1.0 - max(web_novelty, paper_novelty)
1301
- coverage_score = min(num_papers / 30, num_web / 30, 1.0)
1302
- return 0.7 * novelty_score + 0.3 * coverage_score
1303
-
1304
- def _analyze_diversity(self, papers, web_sources):
1305
- """Analyze diversity of institutions, methodologies, viewpoints, and publication years."""
1306
- diversity = {
1307
- 'institutions': set(),
1308
- 'methodologies': set(),
1309
- 'years': set(),
1310
- 'source_types': set(),
1311
- }
1312
- for paper in papers:
1313
- if 'institution' in paper:
1314
- diversity['institutions'].add(paper['institution'])
1315
- if 'methodology' in paper:
1316
- diversity['methodologies'].add(paper['methodology'])
1317
- if 'year' in paper:
1318
- diversity['years'].add(paper['year'])
1319
- diversity['source_types'].add('academic')
1320
- for web in web_sources:
1321
- if 'source' in web:
1322
- diversity['source_types'].add(web['source'])
1323
- if 'date' in web:
1324
- diversity['years'].add(web['date'][:4])
1325
- # Convert sets to lists for serialization
1326
- for k in diversity:
1327
- diversity[k] = list(diversity[k])
1328
- return diversity
1329
-
1330
- def _detect_contradictions(self, web_insights, paper_insights):
1331
- """Detect contradictions between web and academic insights."""
1332
- contradictions = []
1333
- web_set = set(web_insights)
1334
- paper_set = set(paper_insights)
1335
- # Simple contradiction: same topic, opposite claims (heuristic)
1336
- for w in web_set:
1337
- for p in paper_set:
1338
- if w.lower() in p.lower() or p.lower() in w.lower():
1339
- continue
1340
- # Heuristic: if both mention the same keyword but have different sentiment
1341
- # (This can be improved with LLM or sentiment analysis)
1342
- if any(neg in w.lower() for neg in ['not ', 'no ', 'fail', 'lack']) != any(neg in p.lower() for neg in ['not ', 'no ', 'fail', 'lack']):
1343
- contradictions.append({'web': w, 'paper': p})
1344
- return contradictions
1345
-
1346
- def _generate_gap_queries(self, state, concept):
1347
- """Generate new queries to fill gaps in coverage."""
1348
- # Heuristic: look for missing years, institutions, or methodologies
1349
- gap_queries = set()
1350
- # Example: if not enough recent papers, add query for 'recent advances in ...'
1351
- if len(state['diversity_metrics'].get('years', [])) < 3:
1352
- gap_queries.add(f"recent advances in {concept}")
1353
- if len(state['diversity_metrics'].get('institutions', [])) < 2:
1354
- gap_queries.add(f"institutional perspectives on {concept}")
1355
- if len(state['diversity_metrics'].get('methodologies', [])) < 2:
1356
- gap_queries.add(f"methodological approaches to {concept}")
1357
- # Add more heuristics as needed
1358
- return gap_queries
1359
-
1360
- async def _llm_generate_gap_queries(self, state, concept: str) -> set:
1361
- """Use LLM to propose additional gap-filling queries."""
1362
- try:
1363
- if not hasattr(self, 'llm_manager') or self.llm_manager is None:
1364
- return set()
1365
- web_sources = state.get('web_sources', [])[-5:]
1366
- papers = state.get('academic_papers', [])[-5:]
1367
- prompt = (
1368
- f"Given the topic '{concept}', propose 3-6 additional search queries to fill knowledge gaps.\n"
1369
- f"Recent web sources: {json.dumps(web_sources[:3])}\n"
1370
- f"Recent papers titles: {[p.get('title','') for p in papers[:5]]}\n"
1371
- f"Focus on missing institutions, methodologies, time ranges, and alternative viewpoints.\n"
1372
- f"Return one query per line, no bullets or numbering."
1373
- )
1374
- text = await self.llm_manager.generate_text(prompt)
1375
- queries = {q.strip() for q in text.split('\n') if q.strip()}
1376
- return set(list(queries)[:6])
1377
- except Exception:
1378
- return set()
1379
-
1380
- def _cluster_insights(self, insights):
1381
- """Cluster insights by semantic similarity (using embeddings)."""
1382
- # Placeholder: In production, use embeddings + clustering (e.g., KMeans)
1383
- # For now, group by keyword overlap as a simple heuristic
1384
- clusters = []
1385
- insights = list(insights)
1386
- used = set()
1387
- for i, sent in enumerate(insights):
1388
- if i in used:
1389
- continue
1390
- cluster = [sent]
1391
- for j, other in enumerate(insights):
1392
- if i != j and j not in used:
1393
- if len(set(sent.lower().split()) & set(other.lower().split())) > 3:
1394
- cluster.append(other)
1395
- used.add(j)
1396
- used.add(i)
1397
- clusters.append(cluster)
1398
- return clusters
1399
-
1400
- def _score_academic_source(self, paper):
1401
- """Score academic source credibility (citation count, journal, recency)."""
1402
- score = 0.0
1403
- if 'citation_count' in paper:
1404
- try:
1405
- c = int(paper['citation_count'])
1406
- score += min(5.0, 2.0 * (c ** 0.5))
1407
- except:
1408
- pass
1409
- if 'year' in paper:
1410
- try:
1411
- y = int(paper['year'])
1412
- if y >= 2022:
1413
- score += 2.0
1414
- elif y >= 2018:
1415
- score += 1.0
1416
- except:
1417
- pass
1418
- if 'journal' in paper and paper['journal']:
1419
- score += 1.0
1420
- return min(score, 10.0)
1421
-
1422
- def _score_web_source(self, web):
1423
- """Score web source credibility (domain, recency, cross-references)."""
1424
- score = 0.0
1425
- url = web.get('url', '')
1426
- if any(domain in url for domain in ['.edu', '.ac.', '.gov']):
1427
- score += 3.0
1428
- if any(domain in url.lower() for domain in ['ieee', 'springer', 'nature', 'acm', 'nejm', 'science.org', 'cell.com', 'wiley']):
1429
- score += 2.0
1430
- if 'date' in web:
1431
- try:
1432
- y = int(web['date'][:4])
1433
- if y >= 2022:
1434
- score += 2.0
1435
- elif y >= 2018:
1436
- score += 1.0
1437
- except:
1438
- pass
1439
- if 'source' in web and web['source'] in ['web_search', 'bing_search']:
1440
- score += 1.0
1441
- return min(score, 10.0)