cite-agent 1.3.9__py3-none-any.whl → 1.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. cite_agent/__init__.py +13 -13
  2. cite_agent/__version__.py +1 -1
  3. cite_agent/action_first_mode.py +150 -0
  4. cite_agent/adaptive_providers.py +413 -0
  5. cite_agent/archive_api_client.py +186 -0
  6. cite_agent/auth.py +0 -1
  7. cite_agent/auto_expander.py +70 -0
  8. cite_agent/cache.py +379 -0
  9. cite_agent/circuit_breaker.py +370 -0
  10. cite_agent/citation_network.py +377 -0
  11. cite_agent/cli.py +8 -16
  12. cite_agent/cli_conversational.py +113 -3
  13. cite_agent/confidence_calibration.py +381 -0
  14. cite_agent/deduplication.py +325 -0
  15. cite_agent/enhanced_ai_agent.py +689 -371
  16. cite_agent/error_handler.py +228 -0
  17. cite_agent/execution_safety.py +329 -0
  18. cite_agent/full_paper_reader.py +239 -0
  19. cite_agent/observability.py +398 -0
  20. cite_agent/offline_mode.py +348 -0
  21. cite_agent/paper_comparator.py +368 -0
  22. cite_agent/paper_summarizer.py +420 -0
  23. cite_agent/pdf_extractor.py +350 -0
  24. cite_agent/proactive_boundaries.py +266 -0
  25. cite_agent/quality_gate.py +442 -0
  26. cite_agent/request_queue.py +390 -0
  27. cite_agent/response_enhancer.py +257 -0
  28. cite_agent/response_formatter.py +458 -0
  29. cite_agent/response_pipeline.py +295 -0
  30. cite_agent/response_style_enhancer.py +259 -0
  31. cite_agent/self_healing.py +418 -0
  32. cite_agent/similarity_finder.py +524 -0
  33. cite_agent/streaming_ui.py +13 -9
  34. cite_agent/thinking_blocks.py +308 -0
  35. cite_agent/tool_orchestrator.py +416 -0
  36. cite_agent/trend_analyzer.py +540 -0
  37. cite_agent/unpaywall_client.py +226 -0
  38. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/METADATA +15 -1
  39. cite_agent-1.4.3.dist-info/RECORD +62 -0
  40. cite_agent-1.3.9.dist-info/RECORD +0 -32
  41. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/WHEEL +0 -0
  42. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/entry_points.txt +0 -0
  43. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/licenses/LICENSE +0 -0
  44. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,524 @@
1
+ """
2
+ Similarity Finder - Discover similar papers and researchers
3
+
4
+ Provides tools for:
5
+ - Finding similar papers
6
+ - Discovering related researchers
7
+ - Institution rankings
8
+ - Collaboration network analysis
9
+ """
10
+
11
+ from typing import List, Dict, Any, Optional, Set
12
+ from collections import defaultdict, Counter
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class SimilarityFinder:
    """Find similar papers and researchers.

    Offers paper-to-paper similarity (by citations, keywords, or authors),
    researcher discovery for a topic, collaborator analysis, and a stub for
    institution rankings, all backed by an ArchiveAPIClient. Every public
    method degrades gracefully to an empty list when no client is configured
    or when an API call fails.
    """

    def __init__(self, archive_client=None):
        """
        Initialize similarity finder

        Args:
            archive_client: ArchiveAPIClient instance (may be None; all
                lookups then return empty results)
        """
        self.archive_client = archive_client

    def find_similar_papers(self, paper_id: str, limit: int = 10, method: str = "citations") -> List[Dict[str, Any]]:
        """
        Find papers similar to a given paper

        Args:
            paper_id: Paper ID (DOI, arXiv, or Semantic Scholar ID)
            limit: Maximum papers to return
            method: Similarity method ("citations", "keywords", or "authors");
                any other value uses a citation+keyword hybrid

        Returns:
            List of similar papers, each with 'similarity_score' and
            'similarity_reasons' added, best matches first
        """
        if not self.archive_client:
            return []

        try:
            # Fetch the reference paper with all fields the scorers need
            base_paper = self.archive_client.get_paper(
                paper_id,
                fields=['paperId', 'title', 'authors', 'year', 'citationCount', 'abstract', 'references']
            )

            if not base_paper:
                return []

            # Over-fetch (limit * 2) so deduplication and base-paper removal
            # still leave enough candidates.
            if method == "citations":
                buckets = [self._find_similar_by_citations(base_paper, limit * 2)]
            elif method == "keywords":
                buckets = [self._find_similar_by_keywords(base_paper, limit * 2)]
            elif method == "authors":
                buckets = [self._find_similar_by_authors(base_paper, limit * 2)]
            else:
                # Hybrid approach: combine citation- and keyword-based hits
                buckets = [
                    self._find_similar_by_citations(base_paper, limit),
                    self._find_similar_by_keywords(base_paper, limit),
                ]

            # Merge + dedupe by paperId for every method, not just the hybrid
            # path: citation-based search in particular can surface the same
            # paper several times, which previously produced duplicate results.
            candidates = self._merge_similarity_results(buckets)

            # Score and rank every candidate against the base paper
            scored = []
            for paper in candidates:
                scored.append({
                    **paper,
                    'similarity_score': self._calculate_similarity_score(base_paper, paper, method),
                    'similarity_reasons': self._explain_similarity(base_paper, paper),
                })

            scored.sort(key=lambda x: x['similarity_score'], reverse=True)

            # Drop the base paper itself if the search returned it
            base_id = base_paper.get('paperId')
            scored = [p for p in scored if p.get('paperId') != base_id]

            return scored[:limit]

        except Exception as e:
            logger.error(f"Error finding similar papers: {e}")
            return []

    def find_similar_researchers(self, topic: str, limit: int = 20) -> List[Dict[str, Any]]:
        """
        Find researchers working on a topic

        Args:
            topic: Research topic
            limit: Maximum researchers to return

        Returns:
            List of researchers with publication counts and metrics,
            sorted by relevance score (one-paper authors are excluded)
        """
        if not self.archive_client:
            return []

        try:
            # Sample up to 100 papers on the topic and aggregate per author
            results = self.archive_client.search_papers(
                query=topic,
                limit=100,
                fields=['authors', 'year', 'citationCount']
            )

            papers = results.get('data', [])

            author_stats = defaultdict(lambda: {
                'papers': [],
                'total_citations': 0,
                'years': set(),
                'paper_count': 0
            })

            for paper in papers:
                citations = paper.get('citationCount', 0)
                year = paper.get('year')

                for author in paper.get('authors', []):
                    author_id = author.get('authorId')
                    author_name = author.get('name')

                    if not author_name:
                        continue

                    # Prefer the stable authorId; fall back to the name so
                    # authors without an ID are still aggregated.
                    key = author_id or author_name

                    author_stats[key]['name'] = author_name
                    author_stats[key]['author_id'] = author_id
                    author_stats[key]['papers'].append(paper)
                    author_stats[key]['total_citations'] += citations
                    if year:
                        author_stats[key]['years'].add(year)
                    author_stats[key]['paper_count'] += 1

            # Calculate metrics and rank
            researchers = []

            for key, stats in author_stats.items():
                if stats['paper_count'] < 2:  # Filter out one-time authors
                    continue

                years = sorted(stats['years'])
                active_years = max(years) - min(years) + 1 if len(years) > 1 else 1

                h_index = self._calculate_h_index([p.get('citationCount', 0) for p in stats['papers']])

                researchers.append({
                    'name': stats['name'],
                    'author_id': stats['author_id'],
                    'paper_count': stats['paper_count'],
                    'total_citations': stats['total_citations'],
                    'avg_citations': round(stats['total_citations'] / stats['paper_count'], 1),
                    'h_index': h_index,
                    'active_years': active_years,
                    'first_year': min(years) if years else None,
                    'latest_year': max(years) if years else None,
                    'productivity': round(stats['paper_count'] / active_years, 2),
                    'relevance_score': self._calculate_researcher_relevance(stats)
                })

            researchers.sort(key=lambda x: x['relevance_score'], reverse=True)

            return researchers[:limit]

        except Exception as e:
            logger.error(f"Error finding similar researchers: {e}")
            return []

    def find_collaborators(self, author_name: str, limit: int = 20) -> List[Dict[str, Any]]:
        """
        Find an author's collaborators

        Args:
            author_name: Author name (matched case-insensitively to exclude
                the author themselves from the co-author list)
            limit: Maximum collaborators to return

        Returns:
            List of co-authors with collaboration metrics, sorted by
            number of joint papers
        """
        if not self.archive_client:
            return []

        try:
            # Search for the author's papers
            results = self.archive_client.search_papers(
                query=f'author:"{author_name}"',
                limit=100,
                fields=['authors', 'year', 'citationCount']
            )

            papers = results.get('data', [])

            # Count co-authorships
            collaborator_stats = defaultdict(lambda: {
                'papers': [],
                'years': set(),
                'total_citations': 0
            })

            for paper in papers:
                citations = paper.get('citationCount', 0)
                year = paper.get('year')

                for author in paper.get('authors', []):
                    name = author.get('name', '')

                    # Skip the author themselves
                    if name.lower() == author_name.lower():
                        continue

                    collaborator_stats[name]['papers'].append(paper)
                    collaborator_stats[name]['total_citations'] += citations
                    if year:
                        collaborator_stats[name]['years'].add(year)

            # Format results
            collaborators = []

            for name, stats in collaborator_stats.items():
                years = sorted(stats['years'])

                collaborators.append({
                    'name': name,
                    'joint_papers': len(stats['papers']),
                    'total_citations': stats['total_citations'],
                    'avg_citations': round(stats['total_citations'] / len(stats['papers']), 1),
                    'first_collaboration': min(years) if years else None,
                    'latest_collaboration': max(years) if years else None,
                    'collaboration_span': max(years) - min(years) + 1 if len(years) > 1 else 1
                })

            collaborators.sort(key=lambda x: x['joint_papers'], reverse=True)

            return collaborators[:limit]

        except Exception as e:
            logger.error(f"Error finding collaborators: {e}")
            return []

    def institution_rankings(self, topic: str, limit: int = 20) -> List[Dict[str, Any]]:
        """
        Rank institutions by research output in a topic

        NOT IMPLEMENTED: the Semantic Scholar author payload used here does
        not carry reliable affiliation data, so there is nothing to rank.
        Always returns an empty list (after logging a warning). To implement
        this properly one would need:
          1. The OpenAlex API, which has better affiliation data
          2. Affiliations parsed from paper metadata
          3. A separate affiliation database

        Args:
            topic: Research topic
            limit: Maximum institutions to return

        Returns:
            Empty list (affiliation data unavailable)
        """
        if not self.archive_client:
            return []

        # The previous implementation issued a paper search and then looped
        # over authors with an unconditional `continue` before returning [].
        # That API call was dead work; skip straight to the warning.
        logger.warning("Institution rankings require affiliation data not available in current API")
        return []

    def _find_similar_by_citations(self, base_paper: Dict[str, Any], limit: int) -> List[Dict[str, Any]]:
        """Find papers with similar citation patterns.

        Papers that cite the same references as the base paper are likely
        related, so we sample the base paper's references and collect the
        papers citing each of them. May contain duplicates; callers dedupe.
        """
        similar = []

        try:
            references = base_paper.get('references', [])

            if not references:
                # Fallback: fetch references via the API
                paper_id = base_paper.get('paperId')
                if paper_id:
                    references = self.archive_client.get_paper_references(paper_id, limit=50)

            # Sample the top references only, to bound API traffic
            for ref in references[:10]:
                ref_id = ref.get('paperId')
                if ref_id:
                    citing_papers = self.archive_client.get_paper_citations(ref_id, limit=20)
                    similar.extend(citing_papers)

        except Exception as e:
            logger.warning(f"Error in citation-based similarity: {e}")

        return similar

    def _find_similar_by_keywords(self, base_paper: Dict[str, Any], limit: int) -> List[Dict[str, Any]]:
        """Find papers with similar keywords/topics.

        Uses a naive keyword heuristic: the first few title words longer
        than four characters become the search query.
        """
        similar = []

        try:
            title = base_paper.get('title', '')
            abstract = base_paper.get('abstract', '')

            # Simple keyword extraction: first 4 significant title words
            keywords = [w for w in title.lower().split() if len(w) > 4][:4]

            query = " ".join(keywords)

            results = self.archive_client.search_papers(
                query=query,
                limit=limit,
                fields=['paperId', 'title', 'authors', 'year', 'citationCount', 'abstract']
            )

            similar = results.get('data', [])

        except Exception as e:
            logger.warning(f"Error in keyword-based similarity: {e}")

        return similar

    def _find_similar_by_authors(self, base_paper: Dict[str, Any], limit: int) -> List[Dict[str, Any]]:
        """Find other papers by the base paper's first author."""
        similar = []

        try:
            authors = base_paper.get('authors', [])

            if not authors:
                return []

            first_author = authors[0].get('name')

            if first_author:
                results = self.archive_client.search_papers(
                    query=f'author:"{first_author}"',
                    limit=limit,
                    fields=['paperId', 'title', 'authors', 'year', 'citationCount', 'abstract']
                )

                similar = results.get('data', [])

        except Exception as e:
            logger.warning(f"Error in author-based similarity: {e}")

        return similar

    def _merge_similarity_results(self, result_lists: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """Merge and deduplicate similarity results from multiple methods.

        Keeps first occurrence per paperId; papers without a paperId are
        dropped (they cannot be deduplicated or safely re-fetched).
        """
        seen = set()
        merged = []

        for results in result_lists:
            for paper in results:
                paper_id = paper.get('paperId')

                if not paper_id or paper_id in seen:
                    continue

                seen.add(paper_id)
                merged.append(paper)

        return merged

    def _calculate_similarity_score(self, base_paper: Dict[str, Any], candidate: Dict[str, Any], method: str) -> float:
        """Calculate similarity score between two papers (0-100).

        Weights: year proximity up to 20, citation-count similarity up to 30,
        author overlap up to 50.
        """
        score = 0.0

        # Year proximity. Use `or 2020` rather than a .get() default: the API
        # frequently returns an explicit None for 'year', and .get('year', 2020)
        # would hand that None straight to abs() and raise TypeError.
        base_year = base_paper.get('year') or 2020
        cand_year = candidate.get('year') or 2020
        year_diff = abs(base_year - cand_year)
        score += max(0, 1 - year_diff / 20) * 20

        # Citation count similarity (papers of comparable impact)
        base_cites = base_paper.get('citationCount', 0) or 0
        cand_cites = candidate.get('citationCount', 0) or 0

        if base_cites > 0:
            cite_ratio = min(cand_cites / base_cites, base_cites / max(cand_cites, 1))
            score += cite_ratio * 30

        # Author overlap (case-insensitive name match)
        base_authors = {a.get('name', '').lower() for a in base_paper.get('authors', [])}
        cand_authors = {a.get('name', '').lower() for a in candidate.get('authors', [])}

        overlap = len(base_authors & cand_authors)
        score += min(overlap * 25, 50)

        return round(score, 2)

    def _explain_similarity(self, base_paper: Dict[str, Any], candidate: Dict[str, Any]) -> List[str]:
        """Generate human-readable similarity explanations."""
        reasons = []

        # Author overlap (sorted so the output is deterministic; joining the
        # raw set produced a different order on every run)
        base_authors = {a.get('name', '').lower() for a in base_paper.get('authors', [])}
        cand_authors = {a.get('name', '').lower() for a in candidate.get('authors', [])}

        overlap = base_authors & cand_authors
        if overlap:
            reasons.append(f"Shared authors: {', '.join(sorted(overlap))}")

        # Year proximity
        base_year = base_paper.get('year')
        cand_year = candidate.get('year')

        if base_year and cand_year and abs(base_year - cand_year) <= 2:
            reasons.append(f"Published around same time ({cand_year})")

        # Citation similarity (within a factor of 2 either way)
        base_cites = base_paper.get('citationCount', 0)
        cand_cites = candidate.get('citationCount', 0)

        if base_cites > 0 and cand_cites > 0:
            ratio = cand_cites / base_cites
            if 0.5 <= ratio <= 2.0:
                reasons.append("Similar citation count")

        if not reasons:
            reasons.append("Related topic")

        return reasons

    def _calculate_h_index(self, citation_counts: List[int]) -> int:
        """Calculate h-index: largest h such that h papers have >= h citations."""
        h = 0
        for rank, citations in enumerate(sorted(citation_counts, reverse=True), 1):
            if citations >= rank:
                h = rank
            else:
                break

        return h

    def _calculate_researcher_relevance(self, stats: Dict[str, Any]) -> float:
        """Calculate relevance score for a researcher (0-100).

        Weights: paper count up to 40, total citations up to 30,
        papers-per-active-year up to 30.
        """
        score = 0.0

        paper_count = stats['paper_count']
        score += min(paper_count * 10, 40)

        score += min(stats['total_citations'] / 10, 30)

        years = len(stats['years'])
        if years > 0:
            score += min((paper_count / years) * 10, 30)

        return round(score, 2)
512
+
513
+
514
def get_similarity_finder(archive_client=None) -> SimilarityFinder:
    """Build a SimilarityFinder bound to the given client.

    Args:
        archive_client: ArchiveAPIClient instance (optional)

    Returns:
        A freshly constructed SimilarityFinder
    """
    finder = SimilarityFinder(archive_client)
    return finder
@@ -5,7 +5,6 @@ Minimal, clean, conversational interface for data analysis assistant
5
5
  """
6
6
 
7
7
  import sys
8
- import time
9
8
  import asyncio
10
9
  from typing import Optional, AsyncGenerator
11
10
  from rich.console import Console
@@ -31,7 +30,8 @@ class StreamingChatUI:
31
30
  self.app_name = app_name
32
31
  self.working_dir = working_dir
33
32
  self.console = Console()
34
- self.typing_speed = 0.015 # ~60 chars/sec
33
+ # Stream responses as full chunks (no artificial typing delay)
34
+ self.typing_speed = 0.0
35
35
 
36
36
  def show_header(self):
37
37
  """Display minimal header on startup"""
@@ -61,12 +61,15 @@ class StreamingChatUI:
61
61
  # No prefix for agent - just stream naturally
62
62
  buffer = ""
63
63
 
64
- async for chunk in content_generator:
65
- buffer += chunk
66
- # Stream character by character for natural feel
67
- for char in chunk:
68
- self.console.print(char, end="", style="white")
69
- await asyncio.sleep(self.typing_speed)
64
+ try:
65
+ async for chunk in content_generator:
66
+ buffer += chunk
67
+ self.console.print(chunk, end="", style="white")
68
+ if self.typing_speed:
69
+ await asyncio.sleep(self.typing_speed)
70
+ except KeyboardInterrupt:
71
+ self.console.print("\n[dim]⏹️ Streaming interrupted by user.[/dim]")
72
+ return buffer
70
73
 
71
74
  self.console.print() # Newline after response
72
75
  self.console.print() # Extra space for readability
@@ -180,7 +183,8 @@ async def simulate_streaming(text: str, chunk_size: int = 5) -> AsyncGenerator[s
180
183
  for i in range(0, len(text), chunk_size):
181
184
  chunk = text[i:i + chunk_size]
182
185
  yield chunk
183
- await asyncio.sleep(0.05) # Simulate network delay
186
+ # No artificial delay; mimic immediate chunk availability
187
+ await asyncio.sleep(0)
184
188
 
185
189
 
186
190
  # Example usage