cite-agent 1.3.6__py3-none-any.whl → 1.3.7__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.

Potentially problematic release: this version of cite-agent might be problematic.

Files changed (36)
  1. cite_agent/__version__.py +1 -1
  2. cite_agent/cli.py +9 -2
  3. cite_agent/enhanced_ai_agent.py +332 -73
  4. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/METADATA +1 -1
  5. cite_agent-1.3.7.dist-info/RECORD +31 -0
  6. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/top_level.txt +0 -1
  7. cite_agent-1.3.6.dist-info/RECORD +0 -57
  8. src/__init__.py +0 -1
  9. src/services/__init__.py +0 -132
  10. src/services/auth_service/__init__.py +0 -3
  11. src/services/auth_service/auth_manager.py +0 -33
  12. src/services/graph/__init__.py +0 -1
  13. src/services/graph/knowledge_graph.py +0 -194
  14. src/services/llm_service/__init__.py +0 -5
  15. src/services/llm_service/llm_manager.py +0 -495
  16. src/services/paper_service/__init__.py +0 -5
  17. src/services/paper_service/openalex.py +0 -231
  18. src/services/performance_service/__init__.py +0 -1
  19. src/services/performance_service/rust_performance.py +0 -395
  20. src/services/research_service/__init__.py +0 -23
  21. src/services/research_service/chatbot.py +0 -2056
  22. src/services/research_service/citation_manager.py +0 -436
  23. src/services/research_service/context_manager.py +0 -1441
  24. src/services/research_service/conversation_manager.py +0 -597
  25. src/services/research_service/critical_paper_detector.py +0 -577
  26. src/services/research_service/enhanced_research.py +0 -121
  27. src/services/research_service/enhanced_synthesizer.py +0 -375
  28. src/services/research_service/query_generator.py +0 -777
  29. src/services/research_service/synthesizer.py +0 -1273
  30. src/services/search_service/__init__.py +0 -5
  31. src/services/search_service/indexer.py +0 -186
  32. src/services/search_service/search_engine.py +0 -342
  33. src/services/simple_enhanced_main.py +0 -287
  34. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/WHEEL +0 -0
  35. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/entry_points.txt +0 -0
  36. {cite_agent-1.3.6.dist-info → cite_agent-1.3.7.dist-info}/licenses/LICENSE +0 -0
src/services/research_service/synthesizer.py
@@ -1,1273 +0,0 @@
- #synthesizer.py
-
- import logging
- import re
- import asyncio
- from typing import List, Dict, Optional, Any
- from datetime import datetime, timezone
- from dataclasses import asdict
- import json
- import redis.asyncio as redis
- import hashlib
-
- from src.storage.db.operations import DatabaseOperations
- from src.services.llm_service.llm_manager import LLMManager
- from src.services.graph.knowledge_graph import KnowledgeGraph
- from .citation_manager import CitationManager, Citation, CitedFinding, CitationFormat
-
- # Configure structured logging
- logger = logging.getLogger(__name__)
-
-
- def _utc_now() -> datetime:
-     return datetime.now(timezone.utc)
-
-
- def _utc_timestamp() -> str:
-     return _utc_now().isoformat()
-
- class ResearchSynthesizer:
-     """
-     Enhanced research synthesizer with comprehensive error handling, security, and observability.
-
-     Features:
-     - Secure paper synthesis and analysis
-     - Input validation and sanitization
-     - Comprehensive error handling and retry logic
-     - Structured logging and monitoring
-     - Protection against injection attacks
-     - Caching and task management
-     - Knowledge Graph entity extraction
-     """
-
-     def __init__(self, db_ops: DatabaseOperations, llm_manager: LLMManager, redis_url: str, kg_client: Optional[KnowledgeGraph] = None, openalex_client=None):
-         """
-         Initialize research synthesizer with enhanced security and error handling.
-
-         Args:
-             db_ops: Database operations instance
-             llm_manager: LLM manager instance
-             redis_url: Redis connection URL
-             kg_client: Knowledge Graph client for entity extraction
-             openalex_client: OpenAlex client for citation network building
-
-         Raises:
-             ValueError: If parameters are invalid
-             ConnectionError: If Redis connection fails
-         """
-         try:
-             if not db_ops:
-                 raise ValueError("Database operations instance is required")
-             if not llm_manager:
-                 raise ValueError("LLM manager instance is required")
-             if not redis_url:
-                 raise ValueError("Redis URL is required")
-
-             #logger.info("Initializing ResearchSynthesizer with enhanced security")
-
-             self.db = db_ops
-             self.llm = llm_manager
-             self.kg_client = kg_client
-
-             # Initialize citation manager
-             self.citation_manager = CitationManager(db_ops=db_ops, openalex_client=openalex_client)
-
-             # Initialize Redis with error handling
-             try:
-                 self.redis_client = redis.from_url(redis_url)
-                 #logger.info("Redis client initialized successfully")
-             except Exception as e:
-                 logger.error(f"Failed to initialize Redis client: {str(e)}")
-                 raise ConnectionError(f"Redis connection failed: {str(e)}")
-
-             self.synthesis_cache = {}
-             self.synthesis_tasks = {}
-
-             #logger.info("ResearchSynthesizer initialized successfully")
-
-         except Exception as e:
-             logger.error(f"Failed to initialize ResearchSynthesizer: {str(e)}")
-             raise
-
-     def _validate_paper_ids(self, paper_ids: List[str]) -> None:
-         """
-         Validate paper IDs for security and safety.
-
-         Args:
-             paper_ids: List of paper IDs to validate
-
-         Raises:
-             ValueError: If paper IDs are invalid
-         """
-         if not isinstance(paper_ids, list):
-             raise ValueError("Paper IDs must be a list")
-
-         if not paper_ids:
-             raise ValueError("Paper IDs list cannot be empty")
-
-         if len(paper_ids) > 100: # Reasonable limit
-             raise ValueError("Too many paper IDs (max 100)")
-
-         for i, paper_id in enumerate(paper_ids):
-             if not isinstance(paper_id, str) or not paper_id.strip():
-                 raise ValueError(f"Invalid paper ID at index {i}: must be non-empty string")
-
-             # Check for potentially dangerous patterns
-             if re.search(r'[<>"\']', paper_id):
-                 raise ValueError(f"Paper ID at index {i} contains invalid characters")
-
-     def _sanitize_text(self, text: str, max_length: int = 10000) -> str:
-         """
-         Sanitize text to prevent injection attacks.
-
-         Args:
-             text: Text to sanitize
-             max_length: Maximum allowed length
-
-         Returns:
-             Sanitized text
-         """
-         if not isinstance(text, str):
-             raise ValueError("Text must be a string")
-
-         if len(text) > max_length:
-             text = text[:max_length]
-
-         # Basic XSS protection
-         sanitized = text.replace('<', '&lt;').replace('>', '&gt;')
-
-         # Remove null bytes and other control characters
-         sanitized = ''.join(char for char in sanitized if ord(char) >= 32 or char in '\n\r\t')
-
-         return sanitized.strip()
-
-     async def synthesize_papers(self, paper_ids: List[str], force_refresh: bool = False) -> Dict[str, Any]:
-         """
-         Synthesize findings across multiple papers with enhanced error handling and security.
-
-         Args:
-             paper_ids: List of paper IDs to synthesize
-             force_refresh: Whether to force refresh cached results
-
-         Returns:
-             Synthesis results
-
-         Raises:
-             ValueError: If paper IDs are invalid
-             ConnectionError: If synthesis fails
-         """
-         try:
-             # Input validation and sanitization
-             self._validate_paper_ids(paper_ids)
-
-             # Create cache key
-             cache_key = f"synthesis:{hashlib.md5('_'.join(sorted(paper_ids)).encode()).hexdigest()}"
-
-             #logger.info(f"Synthesizing {len(paper_ids)} papers (force_refresh: {force_refresh})")
-
-             # Check cache if not forcing refresh
-             if not force_refresh:
-                 try:
-                     if cached := await self._get_cached_synthesis(cache_key):
-                         #logger.info("Using cached synthesis")
-                         return cached
-                 except Exception as e:
-                     logger.warning(f"Failed to retrieve cached synthesis: {str(e)}")
-
-             # Create synthesis task if not already running
-             if cache_key not in self.synthesis_tasks:
-                 self.synthesis_tasks[cache_key] = asyncio.create_task(
-                     self._generate_synthesis(paper_ids, cache_key)
-                 )
-
-             try:
-                 return await self.synthesis_tasks[cache_key]
-             finally:
-                 if cache_key in self.synthesis_tasks:
-                     del self.synthesis_tasks[cache_key]
-
-         except ValueError as e:
-             logger.error(f"Invalid input for paper synthesis: {str(e)}")
-             raise
-         except Exception as e:
-             logger.error(f"Error synthesizing papers: {str(e)}")
-             raise
-
-     async def _generate_synthesis(self, paper_ids: List[str], cache_key: str) -> Dict[str, Any]:
-         """
-         Generate comprehensive synthesis of papers with enhanced error handling.
-
-         Args:
-             paper_ids: List of paper IDs
-             cache_key: Cache key for storing results
-
-         Returns:
-             Synthesis results
-
-         Raises:
-             ConnectionError: If synthesis generation fails
-         """
-         try:
-             # Gather papers with error handling
-             papers = []
-             for pid in paper_ids:
-                 try:
-                     if paper := await self.db.get_processed_paper(pid):
-                         papers.append(paper)
-                     else:
-                         logger.warning(f"Paper {pid} not found in database")
-                 except Exception as e:
-                     logger.error(f"Error retrieving paper {pid}: {str(e)}")
-
-             if not papers:
-                 logger.warning("No valid papers found for synthesis")
-                 return {
-                     "error": "No valid papers found",
-                     "paper_count": 0,
-                     "generated_at": _utc_timestamp()
-                 }
-
-             #logger.info(f"Retrieved {len(papers)} papers for synthesis")
-
-             # Generate all aspects concurrently with error handling
-             synthesis_tasks = {
-                 "common_findings": self._extract_common_findings(papers),
-                 "contradictions": self._find_contradictions(papers),
-                 "research_gaps": self._identify_gaps(papers),
-                 "timeline": self._create_timeline(papers),
-                 "connections": self._map_connections(papers),
-                 "methodology_analysis": self._analyze_methodologies(papers),
-                 "future_directions": self._suggest_future_directions(papers),
-                 "citation_analysis": self._analyze_citations(papers)
-             }
-
-             synthesis = {}
-             try:
-                 # Use gather instead of TaskGroup for better error handling
-                 results = await asyncio.gather(*synthesis_tasks.values(), return_exceptions=True)
-
-                 for key, result in zip(synthesis_tasks.keys(), results):
-                     if isinstance(result, Exception):
-                         logger.error(f"Error in {key}: {str(result)}")
-                         synthesis[key] = {"error": str(result)}
-                     else:
-                         synthesis[key] = result
-
-             except Exception as e:
-                 logger.error(f"Error in concurrent synthesis tasks: {str(e)}")
-                 # Fallback to sequential processing
-                 for key, coro in synthesis_tasks.items():
-                     try:
-                         synthesis[key] = await coro
-                     except Exception as task_error:
-                         logger.error(f"Error in {key}: {str(task_error)}")
-                         synthesis[key] = {"error": str(task_error)}
-
-             # Add metadata
-             synthesis["meta"] = {
-                 "paper_count": len(papers),
-                 "generated_at": _utc_timestamp(),
-                 "paper_ids": paper_ids,
-                 "success": True
-             }
-
-             # Extract simple entities/relations and upsert to KG (very basic placeholder)
-             try:
-                 if self.kg_client:
-                     await self._extract_and_upsert_entities(papers, synthesis)
-             except Exception:
-                 pass
-
-             # Cache the results with error handling
-             try:
-                 await self._cache_synthesis(cache_key, synthesis)
-             except Exception as e:
-                 logger.warning(f"Failed to cache synthesis: {str(e)}")
-
-             #logger.info(f"Successfully generated synthesis for {len(papers)} papers")
-             return synthesis
-
-         except Exception as e:
-             logger.error(f"Error generating synthesis: {str(e)}")
-             raise ConnectionError(f"Synthesis generation failed: {str(e)}")
-
-     async def _extract_common_findings(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-         """
-         Extract and structure common findings across papers with enhanced error handling.
-
-         Args:
-             papers: List of paper dictionaries
-
-         Returns:
-             List of common findings
-         """
-         try:
-             if not papers:
-                 return []
-
-             # Extract and sanitize summaries
-             summaries = []
-             for paper in papers:
-                 if isinstance(paper, dict) and paper.get('summary'):
-                     sanitized_summary = self._sanitize_text(paper['summary'])
-                     summaries.append(sanitized_summary)
-
-             if not summaries:
-                 logger.warning("No summaries available for finding extraction")
-                 return []
-
-             prompt = """
-             Analyze these research summaries and identify common findings.
-             For each finding, specify:
-             1. The key point
-             2. How many papers support it
-             3. The strength of evidence (strong/moderate/weak)
-             4. Any important context or limitations
-
-             Summaries:
-             {summaries}
-
-             Provide structured findings focusing on well-supported conclusions.
-             """
-
-             try:
-                 response = await self.llm.generate_synthesis(
-                     [{"content": summary} for summary in summaries],
-                     prompt.format(summaries="\n\n".join(summaries))
-                 )
-
-                 if isinstance(response, dict) and "summary" in response:
-                     return self._parse_findings(response["summary"])
-                 else:
-                     return self._parse_findings(str(response))
-
-             except Exception as e:
-                 logger.error(f"Error calling LLM for findings extraction: {str(e)}")
-                 return []
-
-         except Exception as e:
-             logger.error(f"Error extracting findings: {str(e)}")
-             return []
-
-     async def _find_contradictions(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-         """
-         Identify and analyze contradictions between papers with enhanced error handling.
-
-         Args:
-             papers: List of paper dictionaries
-
-         Returns:
-             List of contradictions
-         """
-         try:
-             if not papers:
-                 return []
-
-             # Create paper summaries
-             paper_summaries = []
-             for i, paper in enumerate(papers):
-                 if isinstance(paper, dict):
-                     title = self._sanitize_text(paper.get('title', 'Untitled'), max_length=200)
-                     summary = self._sanitize_text(paper.get('summary', ''), max_length=1000)
-                     paper_summaries.append(f"Paper {i+1}: {title}\n{summary}")
-
-             if not paper_summaries:
-                 logger.warning("No paper summaries available for contradiction analysis")
-                 return []
-
-             prompt = """
-             Compare these papers and identify any contradictions or disagreements.
-             For each contradiction, specify:
-             1. The topic of disagreement
-             2. The competing viewpoints
-             3. The papers supporting each view
-             4. Possible reasons for the disagreement
-
-             Papers:
-             {papers}
-
-             Focus on significant disagreements that affect research conclusions.
-             """
-
-             try:
-                 response = await self.llm.generate_synthesis(
-                     [{"content": summary} for summary in paper_summaries],
-                     prompt.format(papers="\n\n".join(paper_summaries))
-                 )
-
-                 if isinstance(response, dict) and "summary" in response:
-                     return self._parse_contradictions(response["summary"])
-                 else:
-                     return self._parse_contradictions(str(response))
-
-             except Exception as e:
-                 logger.error(f"Error calling LLM for contradiction analysis: {str(e)}")
-                 return []
-
-         except Exception as e:
-             logger.error(f"Error finding contradictions: {str(e)}")
-             return []
-
-     async def _identify_gaps(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-         """
-         Identify research gaps and opportunities with enhanced error handling.
-
-         Args:
-             papers: List of paper dictionaries
-
-         Returns:
-             List of research gaps
-         """
-         try:
-             if not papers:
-                 return []
-
-             # Extract summaries
-             summaries = []
-             for paper in papers:
-                 if isinstance(paper, dict) and paper.get('summary'):
-                     sanitized_summary = self._sanitize_text(paper['summary'], max_length=1000)
-                     summaries.append(sanitized_summary)
-
-             if not summaries:
-                 logger.warning("No summaries available for gap analysis")
-                 return []
-
-             prompt = """
-             Based on these papers, identify:
-             1. Unexplored research areas
-             2. Methodological gaps
-             3. Unanswered questions
-             4. Limitations in current research
-             5. Potential research opportunities
-
-             Papers:
-             {papers}
-
-             Prioritize gaps that could lead to meaningful research contributions.
-             """
-
-             try:
-                 response = await self.llm.generate_synthesis(
-                     [{"content": summary} for summary in summaries],
-                     prompt.format(papers="\n".join(summaries))
-                 )
-
-                 if isinstance(response, dict) and "summary" in response:
-                     response_text = response["summary"]
-                 else:
-                     response_text = str(response)
-
-                 gaps = []
-                 for line in response_text.split('\n'):
-                     if line.strip():
-                         gaps.append({
-                             "gap": self._sanitize_text(line.strip(), max_length=500),
-                             "type": self._categorize_gap(line),
-                             "identified_at": _utc_timestamp()
-                         })
-                 return gaps
-
-             except Exception as e:
-                 logger.error(f"Error calling LLM for gap analysis: {str(e)}")
-                 return []
-
-         except Exception as e:
-             logger.error(f"Error identifying gaps: {str(e)}")
-             return []
-
-     async def _analyze_methodologies(self, papers: List[Dict[str, Any]]) -> Dict[str, Any]:
-         """
-         Analyze and compare research methodologies with enhanced error handling.
-
-         Args:
-             papers: List of paper dictionaries
-
-         Returns:
-             Methodology analysis results
-         """
-         try:
-             if not papers:
-                 return {"error": "No papers provided"}
-
-             methodologies = {}
-
-             for paper in papers:
-                 if isinstance(paper, dict):
-                     method = self._sanitize_text(paper.get('methodology', 'Not specified'), max_length=200)
-                     if method not in methodologies:
-                         methodologies[method] = {
-                             'count': 0,
-                             'papers': [],
-                             'strengths': [],
-                             'limitations': []
-                         }
-
-                     methodologies[method]['count'] += 1
-                     methodologies[method]['papers'].append(paper.get('title', 'Untitled'))
-
-             # Analyze methodologies using LLM
-             try:
-                 method_text = "\n".join([
-                     f"Method: {method} (used in {info['count']} papers)"
-                     for method, info in methodologies.items()
-                 ])
-
-                 prompt = """
-                 Analyze these research methodologies and identify:
-                 1. Strengths of each approach
-                 2. Limitations and weaknesses
-                 3. Comparative advantages
-                 4. Recommendations for improvement
-
-                 Methodologies:
-                 {methods}
-                 """
-
-                 response = await self.llm.generate_synthesis(
-                     [{"content": method_text}],
-                     prompt.format(methods=method_text)
-                 )
-
-                 if isinstance(response, dict) and "summary" in response:
-                     analysis = response["summary"]
-                 else:
-                     analysis = str(response)
-
-                 methodologies["analysis"] = self._sanitize_text(analysis, max_length=2000)
-
-             except Exception as e:
-                 logger.error(f"Error analyzing methodologies with LLM: {str(e)}")
-                 methodologies["analysis"] = "Methodology analysis failed"
-
-             return methodologies
-
-         except Exception as e:
-             logger.error(f"Error analyzing methodologies: {str(e)}")
-             return {"error": str(e)}
-
-     async def _suggest_future_directions(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-         """
-         Suggest future research directions with enhanced error handling.
-
-         Args:
-             papers: List of paper dictionaries
-
-         Returns:
-             List of future research directions
-         """
-         try:
-             if not papers:
-                 return []
-
-             # Extract key information
-             summaries = []
-             for paper in papers:
-                 if isinstance(paper, dict) and paper.get('summary'):
-                     sanitized_summary = self._sanitize_text(paper['summary'], max_length=1000)
-                     summaries.append(sanitized_summary)
-
-             if not summaries:
-                 logger.warning("No summaries available for future directions analysis")
-                 return []
-
-             prompt = """
-             Based on these research papers, suggest future research directions:
-             1. Emerging trends and opportunities
-             2. Unanswered questions
-             3. Potential applications
-             4. Methodological improvements
-             5. Cross-disciplinary opportunities
-
-             Papers:
-             {papers}
-
-             Provide specific, actionable research directions.
-             """
-
-             try:
-                 response = await self.llm.generate_synthesis(
-                     [{"content": summary} for summary in summaries],
-                     prompt.format(papers="\n".join(summaries))
-                 )
-
-                 if isinstance(response, dict) and "summary" in response:
-                     response_text = response["summary"]
-                 else:
-                     response_text = str(response)
-
-                 directions = []
-                 for line in response_text.split('\n'):
-                     if line.strip():
-                         directions.append({
-                             "direction": self._sanitize_text(line.strip(), max_length=500),
-                             "suggested_at": _utc_timestamp()
-                         })
-                 return directions
-
-             except Exception as e:
-                 logger.error(f"Error calling LLM for future directions: {str(e)}")
-                 return []
-
-         except Exception as e:
-             logger.error(f"Error suggesting future directions: {str(e)}")
-             return []
-
-     async def _create_timeline(self, papers: List[Dict[str, Any]]) -> Dict[str, Any]:
-         """
-         Create research timeline with enhanced error handling.
-
-         Args:
-             papers: List of paper dictionaries
-
-         Returns:
-             Timeline data
-         """
-         try:
-             if not papers:
-                 return {"error": "No papers provided"}
-
-             # Extract publication dates and key events
-             timeline_events = []
-
-             for paper in papers:
-                 if isinstance(paper, dict):
-                     title = self._sanitize_text(paper.get('title', 'Untitled'), max_length=200)
-                     year = paper.get('year')
-                     summary = self._sanitize_text(paper.get('summary', ''), max_length=500)
-
-                     if year:
-                         timeline_events.append({
-                             "year": year,
-                             "title": title,
-                             "summary": summary,
-                             "type": "publication"
-                         })
-
-             # Sort by year
-             timeline_events.sort(key=lambda x: x.get('year', 0))
-
-             # Group by year
-             timeline = {}
-             for event in timeline_events:
-                 year = event['year']
-                 if year not in timeline:
-                     timeline[year] = []
-                 timeline[year].append(event)
-
-             return {
-                 "timeline": timeline,
-                 "total_events": len(timeline_events),
-                 "year_range": {
-                     "start": min(timeline.keys()) if timeline else None,
-                     "end": max(timeline.keys()) if timeline else None
-                 }
-             }
-
-         except Exception as e:
-             logger.error(f"Error creating timeline: {str(e)}")
-             return {"error": str(e)}
-
-     async def _map_connections(self, papers: List[Dict[str, Any]]) -> Dict[str, Any]:
-         """
-         Map connections between papers with enhanced error handling.
-
-         Args:
-             papers: List of paper dictionaries
-
-         Returns:
-             Connection mapping data
-         """
-         try:
-             if not papers:
-                 return {"error": "No papers provided"}
-
-             # Extract paper information
-             paper_info = []
-             for i, paper in enumerate(papers):
-                 if isinstance(paper, dict):
-                     title = self._sanitize_text(paper.get('title', 'Untitled'), max_length=200)
-                     summary = self._sanitize_text(paper.get('summary', ''), max_length=1000)
-                     authors = paper.get('authors', [])
-
-                     paper_info.append({
-                         "id": i,
-                         "title": title,
-                         "summary": summary,
-                         "authors": authors if isinstance(authors, list) else [],
-                         "year": paper.get('year')
-                     })
-
-             if not paper_info:
-                 return {"error": "No valid paper information"}
-
-             # Analyze connections using LLM
-             try:
-                 papers_text = "\n".join([
-                     f"Paper {p['id']}: {p['title']} ({p['year']})\n{p['summary']}"
-                     for p in paper_info
-                 ])
-
-                 prompt = """
-                 Analyze these papers and identify connections between them:
-                 1. Thematic connections
-                 2. Methodological similarities
-                 3. Citation relationships
-                 4. Complementary findings
-                 5. Building upon each other
-
-                 Papers:
-                 {papers}
-
-                 Provide a structured analysis of how these papers relate to each other.
-                 """
-
-                 response = await self.llm.generate_synthesis(
-                     [{"content": papers_text}],
-                     prompt.format(papers=papers_text)
-                 )
-
-                 if isinstance(response, dict) and "summary" in response:
-                     analysis = response["summary"]
-                 else:
-                     analysis = str(response)
-
-                 return {
-                     "connections_analysis": self._sanitize_text(analysis, max_length=2000),
-                     "paper_count": len(paper_info),
-                     "connection_types": ["thematic", "methodological", "temporal", "complementary"]
-                 }
-
-             except Exception as e:
-                 logger.error(f"Error analyzing connections with LLM: {str(e)}")
-                 return {
-                     "error": "Connection analysis failed",
-                     "paper_count": len(paper_info)
-                 }
-
-         except Exception as e:
-             logger.error(f"Error mapping connections: {str(e)}")
-             return {"error": str(e)}
-
-     async def _analyze_citations(self, papers: List[Dict[str, Any]]) -> Dict[str, Any]:
-         """
-         Analyze citations and build citation networks with academic formatting.
-
-         Args:
-             papers: List of paper data
-
-         Returns:
-             Citation analysis results with academic formatting
-         """
-         try:
-             #logger.info(f"Analyzing citations for {len(papers)} papers")
-
-             all_citations = []
-             citation_networks = []
-             cited_findings = []
-
-             for paper in papers:
-                 paper_id = paper.get('id', paper.get('paper_id', 'unknown'))
-
-                 # Extract citations from paper
-                 paper_citations = await self.citation_manager.extract_citations_from_paper(paper)
-                 all_citations.extend(paper_citations)
-
-                 # Build citation network if OpenAlex data available
-                 if paper.get('openalex_id'):
-                     network = await self.citation_manager.build_citation_network(
-                         paper.get('openalex_id'), depth=2
-                     )
-                     citation_networks.append(network)
-
-                 # Create cited findings for key findings
-                 if paper.get('findings'):
-                     # Create a citation for the current paper
-                     paper_citation = Citation(
-                         citation_id=f"PAPER_{paper_id[-8:].upper()}",
-                         title=paper.get('title', 'Unknown'),
-                         authors=paper.get('authors', []),
-                         year=paper.get('year', 0),
-                         journal=paper.get('journal'),
-                         doi=paper.get('doi'),
-                         citation_count=paper.get('citation_count', 0)
-                     )
-
-                     # Create cited finding
-                     cited_finding = self.citation_manager.create_cited_finding(
-                         finding_text=paper.get('findings'),
-                         citation=paper_citation,
-                         context=paper.get('abstract'),
-                         methodology=paper.get('methodology')
-                     )
-                     cited_findings.append(cited_finding)
-
-             # Generate citation analytics
-             citation_analytics = await self.citation_manager.get_citation_analytics(all_citations)
-
-             # Export citations in multiple formats
-             apa_citations = await self.citation_manager.export_citations(
-                 all_citations, CitationFormat.APA
-             )
-             bibtex_citations = await self.citation_manager.export_citations(
-                 all_citations, CitationFormat.BIBTEX
-             )
-
-             return {
-                 "total_citations": len(all_citations),
-                 "citation_networks": [asdict(network) for network in citation_networks],
-                 "cited_findings": [asdict(finding) for finding in cited_findings],
-                 "citation_analytics": citation_analytics,
-                 "formatted_citations": {
-                     "apa": apa_citations,
-                     "bibtex": bibtex_citations
-                 },
-                 "citation_quality": "high" if len(all_citations) > 0 else "low",
-                 "academic_credibility_score": min(len(all_citations) / len(papers), 10.0) if papers else 0.0
-             }
-
-         except Exception as e:
-             logger.error(f"Error in citation analysis: {str(e)}")
-             return {
-                 "total_citations": 0,
-                 "citation_networks": [],
-                 "cited_findings": [],
-                 "citation_analytics": {"error": str(e)},
-                 "formatted_citations": {"apa": "", "bibtex": ""},
-                 "citation_quality": "error",
-                 "academic_credibility_score": 0.0,
-                 "error": str(e)
-             }
-
-     async def _get_cached_synthesis(self, cache_key: str) -> Optional[Dict[str, Any]]:
-         """
-         Get cached synthesis with error handling.
-
-         Args:
-             cache_key: Cache key
-
-         Returns:
-             Cached synthesis or None
-         """
-         try:
-             cached = await self.redis_client.get(cache_key)
-             if cached:
-                 return json.loads(cached)
-             return None
-         except Exception as e:
-             logger.warning(f"Error retrieving cached synthesis: {str(e)}")
-             return None
-
-     async def _cache_synthesis(self, cache_key: str, synthesis: Dict[str, Any]) -> None:
-         """
-         Cache synthesis with error handling.
-
-         Args:
-             cache_key: Cache key
-             synthesis: Synthesis data to cache
-         """
-         try:
-             # Set expiration to 24 hours
-             await self.redis_client.setex(
-                 cache_key,
-                 60 * 60 * 24, # 24 hours
-                 json.dumps(synthesis)
-             )
-             #logger.info(f"Synthesis cached with key: {cache_key}")
-         except Exception as e:
-             logger.warning(f"Error caching synthesis: {str(e)}")
-
-     def _parse_findings(self, llm_response: str) -> List[Dict[str, Any]]:
-         """
-         Parse findings from LLM response with enhanced error handling.
-
-         Args:
-             llm_response: LLM response text
-
-         Returns:
-             List of parsed findings
-         """
-         try:
-             if not llm_response:
-                 return []
-
-             findings = []
-             lines = llm_response.split('\n')
-
-             for line in lines:
-                 line = line.strip()
-                 if line and not line.startswith('#'):
-                     findings.append({
-                         "finding": self._sanitize_text(line, max_length=500),
-                         "strength": "moderate", # Default strength
-                         "extracted_at": _utc_timestamp()
-                     })
-
-             return findings[:20] # Limit to 20 findings
-
-         except Exception as e:
-             logger.error(f"Error parsing findings: {str(e)}")
-             return []
-
-     def _parse_contradictions(self, llm_response: str) -> List[Dict[str, Any]]:
-         """
-         Parse contradictions from LLM response with enhanced error handling.
-
-         Args:
-             llm_response: LLM response text
-
-         Returns:
-             List of parsed contradictions
-         """
-         try:
-             if not llm_response:
-                 return []
-
-             contradictions = []
-             lines = llm_response.split('\n')
-
-             for line in lines:
-                 line = line.strip()
-                 if line and not line.startswith('#'):
-                     contradictions.append({
-                         "contradiction": self._sanitize_text(line, max_length=500),
-                         "type": "methodological", # Default type
-                         "identified_at": _utc_timestamp()
-                     })
-
-             return contradictions[:10] # Limit to 10 contradictions
-
-         except Exception as e:
-             logger.error(f"Error parsing contradictions: {str(e)}")
-             return []
-
-     def _categorize_gap(self, gap_text: str) -> str:
-         """
-         Categorize research gap with enhanced error handling.
-
-         Args:
-             gap_text: Gap description text
-
-         Returns:
-             Gap category
-         """
-         try:
-             gap_lower = gap_text.lower()
-
-             if any(word in gap_lower for word in ['method', 'methodology', 'approach']):
-                 return "methodological"
-             elif any(word in gap_lower for word in ['data', 'dataset', 'sample']):
-                 return "data"
-             elif any(word in gap_lower for word in ['theory', 'theoretical', 'framework']):
-                 return "theoretical"
-             elif any(word in gap_lower for word in ['application', 'practical', 'implementation']):
-                 return "applied"
-             else:
-                 return "general"
-
-         except Exception as e:
-             logger.error(f"Error categorizing gap: {str(e)}")
-             return "general"
-
-     async def _extract_and_upsert_entities(self, papers: List[Dict[str, Any]], synthesis: Dict[str, Any]) -> None:
-         """Extract entities and relationships from papers and synthesis, upsert to Knowledge Graph."""
-         try:
-             # Extract paper entities
-             for paper in papers:
-                 if isinstance(paper, dict):
-                     paper_id = paper.get('id') or paper.get('_id') or hashlib.md5(paper.get('title', 'unknown').encode()).hexdigest()
-
-                     # Upsert paper entity
-                     await self.kg_client.upsert_entity("Paper", {
-                         "id": paper_id,
-                         "title": paper.get('title', 'Untitled'),
-                         "year": paper.get('year'),
-                         "authors": paper.get('authors', []),
-                         "doi": paper.get('doi'),
-                         "journal": paper.get('journal')
-                     }, "id")
-
-                     # Extract and upsert author entities
-                     for author in paper.get('authors', []):
-                         if isinstance(author, dict) and author.get('name'):
-                             author_id = hashlib.md5(author['name'].encode()).hexdigest()
-                             await self.kg_client.upsert_entity("Author", {
-                                 "id": author_id,
-                                 "name": author['name'],
-                                 "email": author.get('email', ''),
-                                 "affiliation": author.get('affiliation', '')
-                             }, "id")
-
-                             # Create AUTHORED relationship
-                             await self.kg_client.upsert_relationship(
-                                 "Author", "id", author_id,
-                                 "Paper", "id", paper_id,
-                                 "AUTHORED", {"year": paper.get('year')}
-                             )
-
-             # Extract synthesis entities
-             for finding in synthesis.get("common_findings", []):
-                 if isinstance(finding, dict) and finding.get("finding"):
-                     finding_id = hashlib.md5(finding["finding"].encode()).hexdigest()
-                     await self.kg_client.upsert_entity("Finding", {
-                         "id": finding_id,
-                         "text": finding["finding"],
-                         "strength": finding.get("strength", "moderate"),
-                         "extracted_at": finding.get("extracted_at")
-                     }, "id")
-
-                     # Link findings to papers (simplified)
-                     for paper in papers:
-                         if isinstance(paper, dict):
-                             paper_id = paper.get('id') or paper.get('_id') or hashlib.md5(paper.get('title', 'unknown').encode()).hexdigest()
-                             await self.kg_client.upsert_relationship(
-                                 "Finding", "id", finding_id,
-                                 "Paper", "id", paper_id,
-                                 "SUPPORTS", {"confidence": finding.get("strength", "moderate")}
-                             )
-
-             # Extract methodology entities
-             for method_info in synthesis.get("methodology_analysis", {}).items():
-                 if isinstance(method_info, tuple) and len(method_info) == 2:
-                     method_name, details = method_info
-                     if method_name != "analysis" and isinstance(details, dict):
-                         method_id = hashlib.md5(method_name.encode()).hexdigest()
-                         await self.kg_client.upsert_entity("Methodology", {
-                             "id": method_id,
-                             "name": method_name,
-                             "count": details.get("count", 0),
-                             "papers": details.get("papers", [])
-                         }, "id")
-
-                         # Link methodologies to papers
-                         for paper_title in details.get("papers", []):
-                             for paper in papers:
-                                 if isinstance(paper, dict) and paper.get('title') == paper_title:
-                                     paper_id = paper.get('id') or paper.get('_id') or hashlib.md5(paper.get('title', 'unknown').encode()).hexdigest()
-                                     await self.kg_client.upsert_relationship(
-                                         "Methodology", "id", method_id,
-                                         "Paper", "id", paper_id,
-                                         "USES", {"count": details.get("count", 0)}
-                                     )
-                                     break
-
-             logger.info(f"Extracted and upserted entities to Knowledge Graph for {len(papers)} papers")
-
-         except Exception as e:
-             logger.error(f"Error extracting entities to Knowledge Graph: {e}")
-             # Don't raise - this is optional functionality
-
-     async def cleanup(self):
-         """Cleanup resources with error handling."""
-         try:
-             # Cancel any running synthesis tasks
-             for task in self.synthesis_tasks.values():
-                 if not task.done():
-                     task.cancel()
-
-             # Wait for tasks to complete
-             if self.synthesis_tasks:
-                 await asyncio.gather(*self.synthesis_tasks.values(), return_exceptions=True)
-
-             # Clear caches
-             self.synthesis_cache.clear()
-             self.synthesis_tasks.clear()
-
-             # Close Redis connection
-             if hasattr(self, 'redis_client'):
-                 await self.redis_client.close()
-
-             #logger.info("ResearchSynthesizer cleanup completed")
-
-         except Exception as e:
-             logger.error(f"Error during cleanup: {str(e)}")
-
-     async def health_check(self) -> Dict[str, Any]:
-         """
-         Perform health check of the synthesizer.
-
-         Returns:
-             Health status
-         """
-         try:
-             health_status = {
-                 "status": "healthy",
-                 "timestamp": _utc_timestamp(),
-                 "components": {}
-             }
-
-             # Check Redis connection
-             try:
-                 await self.redis_client.ping()
-                 health_status["components"]["redis"] = {"status": "healthy"}
-             except Exception as e:
-                 health_status["components"]["redis"] = {"status": "error", "error": str(e)}
-                 health_status["status"] = "degraded"
-
-             # Check LLM manager
-             try:
-                 llm_health = await self.llm.health_check()
-                 health_status["components"]["llm_manager"] = llm_health
-                 if llm_health.get("status") != "healthy":
-                     health_status["status"] = "degraded"
-             except Exception as e:
-                 health_status["components"]["llm_manager"] = {"status": "error", "error": str(e)}
-                 health_status["status"] = "degraded"
-
-             # Check database operations
-             try:
-                 # Simple database check
-                 health_status["components"]["database"] = {"status": "healthy"}
-             except Exception as e:
-                 health_status["components"]["database"] = {"status": "error", "error": str(e)}
-                 health_status["status"] = "degraded"
-
-             # Check active tasks
-             active_tasks = len(self.synthesis_tasks)
-             health_status["components"]["active_tasks"] = {
-                 "status": "healthy",
-                 "count": active_tasks
-             }
-
-             #logger.info(f"Health check completed: {health_status['status']}")
-             return health_status
-
-         except Exception as e:
-             logger.error(f"Health check failed: {str(e)}")
-             return {
-                 "status": "error",
-                 "error": str(e),
-                 "timestamp": _utc_timestamp()
-             }
-
-     async def export_academic_synthesis(self, synthesis: Dict[str, Any],
-                                         format_type: CitationFormat = CitationFormat.APA) -> str:
-         """
-         Export synthesis with proper academic citations and formatting.
-
-         Args:
-             synthesis: Synthesis results
-             format_type: Citation format to use
-
-         Returns:
-             Formatted academic synthesis
-         """
-         try:
-             if not synthesis:
-                 return "No synthesis data available."
-
-             # Extract citation data
-             citation_analysis = synthesis.get("citation_analysis", {})
-             cited_findings = citation_analysis.get("cited_findings", [])
-             formatted_citations = citation_analysis.get("formatted_citations", {})
-
-             # Build academic synthesis
-             academic_synthesis = []
-             academic_synthesis.append("# Research Synthesis Report")
-             academic_synthesis.append("")
-             academic_synthesis.append(f"*Generated on: {_utc_now().strftime('%B %d, %Y')}*")
-             academic_synthesis.append("")
-
-             # Key Findings with Citations
-             if synthesis.get("common_findings"):
-                 academic_synthesis.append("## Key Findings")
-                 academic_synthesis.append("")
-
-                 for i, finding in enumerate(synthesis["common_findings"], 1):
-                     finding_text = finding.get("finding", str(finding))
-                     academic_synthesis.append(f"{i}. {finding_text}")
-
-                     # Add citation if available
-                     if cited_findings and i <= len(cited_findings):
-                         citation = cited_findings[i-1].get("citation", {})
-                         if citation.get("authors") and citation.get("year"):
-                             authors = citation["authors"]
-                             if len(authors) == 1:
-                                 author_cite = authors[0]
-                             elif len(authors) == 2:
-                                 author_cite = f"{authors[0]} and {authors[1]}"
-                             else:
-                                 author_cite = f"{authors[0]} et al."
-                             academic_synthesis.append(f" *Source: {author_cite} ({citation['year']})*")
-                     academic_synthesis.append("")
-
-             # Research Gaps
-             if synthesis.get("research_gaps"):
-                 academic_synthesis.append("## Research Gaps")
-                 academic_synthesis.append("")
-                 for gap in synthesis["research_gaps"]:
-                     gap_text = gap.get("gap", str(gap))
-                     academic_synthesis.append(f"- {gap_text}")
-                 academic_synthesis.append("")
-
-             # Contradictions
-             if synthesis.get("contradictions"):
-                 academic_synthesis.append("## Contradictions and Disagreements")
-                 academic_synthesis.append("")
-                 for contradiction in synthesis["contradictions"]:
-                     contradiction_text = contradiction.get("contradiction", str(contradiction))
-                     academic_synthesis.append(f"- {contradiction_text}")
-                 academic_synthesis.append("")
-
-             # Methodology Analysis
-             if synthesis.get("methodology_analysis"):
-                 academic_synthesis.append("## Methodology Analysis")
-                 academic_synthesis.append("")
-                 methodology = synthesis["methodology_analysis"]
-                 if isinstance(methodology, dict):
-                     for key, value in methodology.items():
-                         if key != "error":
-                             academic_synthesis.append(f"### {key.replace('_', ' ').title()}")
-                             academic_synthesis.append(f"{value}")
-                             academic_synthesis.append("")
-
-             # Future Directions
-             if synthesis.get("future_directions"):
-                 academic_synthesis.append("## Future Research Directions")
-                 academic_synthesis.append("")
-                 for direction in synthesis["future_directions"]:
-                     direction_text = direction.get("direction", str(direction))
-                     academic_synthesis.append(f"- {direction_text}")
-                 academic_synthesis.append("")
-
-             # Citations Section
-             if formatted_citations:
-                 academic_synthesis.append("## References")
-                 academic_synthesis.append("")
-
-                 if format_type == CitationFormat.APA and formatted_citations.get("apa"):
-                     citations_text = formatted_citations["apa"]
-                     # Split and number the citations
-                     citations_list = citations_text.split("\n\n")
-                     for i, citation in enumerate(citations_list, 1):
-                         if citation.strip():
-                             academic_synthesis.append(f"{i}. {citation.strip()}")
-                 elif format_type == CitationFormat.BIBTEX and formatted_citations.get("bibtex"):
-                     academic_synthesis.append("```bibtex")
-                     academic_synthesis.append(formatted_citations["bibtex"])
-                     academic_synthesis.append("```")
-                 else:
-                     # Fallback to numbered list
-                     citations_text = formatted_citations.get("apa", "")
-                     citations_list = citations_text.split("\n\n")
-                     for i, citation in enumerate(citations_list, 1):
-                         if citation.strip():
-                             academic_synthesis.append(f"{i}. {citation.strip()}")
-
-             # Citation Analytics
-             if citation_analysis.get("citation_analytics"):
-                 analytics = citation_analysis["citation_analytics"]
-                 if not analytics.get("error"):
-                     academic_synthesis.append("")
-                     academic_synthesis.append("## Citation Analytics")
-                     academic_synthesis.append("")
-                     academic_synthesis.append(f"- Total citations analyzed: {analytics.get('total_citations', 0)}")
-                     academic_synthesis.append(f"- Year range: {analytics.get('year_range', {}).get('min', 0)} - {analytics.get('year_range', {}).get('max', 0)}")
-                     academic_synthesis.append(f"- Average citations per paper: {analytics.get('citation_impact', {}).get('average_citations_per_paper', 0):.1f}")
-                     academic_synthesis.append(f"- Academic credibility score: {citation_analysis.get('academic_credibility_score', 0):.1f}/10.0")
-
-             return "\n".join(academic_synthesis)
-
-         except Exception as e:
-             logger.error(f"Error exporting academic synthesis: {str(e)}")
-             return f"Error generating academic synthesis: {str(e)}"