cite-agent 1.3.9-py3-none-any.whl → 1.4.3-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- cite_agent/__init__.py +13 -13
- cite_agent/__version__.py +1 -1
- cite_agent/action_first_mode.py +150 -0
- cite_agent/adaptive_providers.py +413 -0
- cite_agent/archive_api_client.py +186 -0
- cite_agent/auth.py +0 -1
- cite_agent/auto_expander.py +70 -0
- cite_agent/cache.py +379 -0
- cite_agent/circuit_breaker.py +370 -0
- cite_agent/citation_network.py +377 -0
- cite_agent/cli.py +8 -16
- cite_agent/cli_conversational.py +113 -3
- cite_agent/confidence_calibration.py +381 -0
- cite_agent/deduplication.py +325 -0
- cite_agent/enhanced_ai_agent.py +689 -371
- cite_agent/error_handler.py +228 -0
- cite_agent/execution_safety.py +329 -0
- cite_agent/full_paper_reader.py +239 -0
- cite_agent/observability.py +398 -0
- cite_agent/offline_mode.py +348 -0
- cite_agent/paper_comparator.py +368 -0
- cite_agent/paper_summarizer.py +420 -0
- cite_agent/pdf_extractor.py +350 -0
- cite_agent/proactive_boundaries.py +266 -0
- cite_agent/quality_gate.py +442 -0
- cite_agent/request_queue.py +390 -0
- cite_agent/response_enhancer.py +257 -0
- cite_agent/response_formatter.py +458 -0
- cite_agent/response_pipeline.py +295 -0
- cite_agent/response_style_enhancer.py +259 -0
- cite_agent/self_healing.py +418 -0
- cite_agent/similarity_finder.py +524 -0
- cite_agent/streaming_ui.py +13 -9
- cite_agent/thinking_blocks.py +308 -0
- cite_agent/tool_orchestrator.py +416 -0
- cite_agent/trend_analyzer.py +540 -0
- cite_agent/unpaywall_client.py +226 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/METADATA +15 -1
- cite_agent-1.4.3.dist-info/RECORD +62 -0
- cite_agent-1.3.9.dist-info/RECORD +0 -32
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/WHEEL +0 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/entry_points.txt +0 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/licenses/LICENSE +0 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/top_level.txt +0 -0
cite_agent/trend_analyzer.py

```diff
@@ -0,0 +1,540 @@
+"""
+Research Trend Analyzer - Analyze research trends and predict future directions
+
+Provides tools for:
+- Topic evolution analysis
+- Emerging topic detection
+- Publication trend visualization
+- Research direction prediction
+"""
+
+from typing import List, Dict, Any, Optional, Tuple
+from collections import defaultdict, Counter
+from datetime import datetime
+import logging
+import re
+
+logger = logging.getLogger(__name__)
+
+
+class ResearchTrendAnalyzer:
+    """Analyze trends in academic research"""
+
+    def __init__(self, archive_client=None):
+        """
+        Initialize trend analyzer
+
+        Args:
+            archive_client: ArchiveAPIClient instance for fetching papers
+        """
+        self.archive_client = archive_client
+
+    def analyze_topic_evolution(self, topic: str, years: int = 10, granularity: str = "year") -> Dict[str, Any]:
+        """
+        Analyze how a research topic has evolved over time
+
+        Args:
+            topic: Research topic to analyze
+            years: Number of years to look back
+            granularity: Time granularity ("year" or "quarter")
+
+        Returns:
+            Evolution data with publication counts, citation trends, key papers
+        """
+        if not self.archive_client:
+            return {"error": "Archive client required"}
+
+        try:
+            current_year = datetime.now().year
+            start_year = current_year - years
+
+            # Fetch papers from each year
+            yearly_data = {}
+
+            for year in range(start_year, current_year + 1):
+                papers = self._fetch_papers_for_year(topic, year)
+
+                yearly_data[year] = {
+                    'year': year,
+                    'paper_count': len(papers),
+                    'total_citations': sum(p.get('citationCount', 0) for p in papers),
+                    'avg_citations': sum(p.get('citationCount', 0) for p in papers) / max(len(papers), 1),
+                    'top_papers': sorted(
+                        papers,
+                        key=lambda x: x.get('citationCount', 0),
+                        reverse=True
+                    )[:5],
+                    'keywords': self._extract_trending_keywords(papers)
+                }
+
+            # Calculate growth metrics
+            growth_rate = self._calculate_growth_rate(yearly_data)
+
+            # Detect inflection points
+            inflection_points = self._detect_inflection_points(yearly_data)
+
+            # Extract emerging keywords
+            emerging_keywords = self._identify_emerging_keywords(yearly_data)
+
+            return {
+                'topic': topic,
+                'time_range': f'{start_year}-{current_year}',
+                'yearly_data': yearly_data,
+                'growth_rate': growth_rate,
+                'inflection_points': inflection_points,
+                'emerging_keywords': emerging_keywords,
+                'trend': self._classify_trend(growth_rate)
+            }
+
+        except Exception as e:
+            logger.error(f"Error analyzing topic evolution: {e}")
+            return {"error": str(e)}
+
+    def emerging_topics(self, field: str, min_papers: int = 20, time_window: int = 2) -> List[Dict[str, Any]]:
+        """
+        Detect emerging research topics in a field
+
+        Args:
+            field: Research field to analyze
+            min_papers: Minimum papers for a topic to be considered
+            time_window: Years to look back for "emerging" status
+
+        Returns:
+            List of emerging topics with growth metrics
+        """
+        if not self.archive_client:
+            return []
+
+        try:
+            current_year = datetime.now().year
+            recent_years = range(current_year - time_window, current_year + 1)
+            older_years = range(current_year - time_window * 2, current_year - time_window)
+
+            # Fetch papers from both periods
+            recent_papers = []
+            older_papers = []
+
+            for year in recent_years:
+                papers = self._fetch_papers_for_year(field, year, limit=200)
+                recent_papers.extend(papers)
+
+            for year in older_years:
+                papers = self._fetch_papers_for_year(field, year, limit=200)
+                older_papers.extend(papers)
+
+            # Extract keywords/phrases from both periods
+            recent_keywords = self._extract_all_keywords(recent_papers)
+            older_keywords = self._extract_all_keywords(older_papers)
+
+            # Find keywords with significant growth
+            emerging = []
+
+            for keyword, recent_count in recent_keywords.items():
+                if recent_count < min_papers:
+                    continue
+
+                older_count = older_keywords.get(keyword, 0)
+
+                # Calculate growth
+                if older_count == 0:
+                    growth = float('inf') if recent_count > 0 else 0
+                else:
+                    growth = (recent_count - older_count) / older_count
+
+                # Filter for significant growth
+                if growth > 1.0:  # 100% growth
+                    emerging.append({
+                        'topic': keyword,
+                        'recent_papers': recent_count,
+                        'older_papers': older_count,
+                        'growth_rate': round(growth * 100, 1),
+                        'status': 'emerging' if older_count < 10 else 'accelerating'
+                    })
+
+            # Sort by growth rate
+            emerging.sort(key=lambda x: x['growth_rate'], reverse=True)
+
+            return emerging[:20]  # Top 20 emerging topics
+
+        except Exception as e:
+            logger.error(f"Error detecting emerging topics: {e}")
+            return []
+
+    def predict_next_papers(self, topic: str, limit: int = 10) -> List[Dict[str, Any]]:
+        """
+        Predict/recommend next papers to read based on trends
+
+        Args:
+            topic: Research topic
+            limit: Maximum papers to return
+
+        Returns:
+            List of recommended papers sorted by relevance and recency
+        """
+        if not self.archive_client:
+            return []
+
+        try:
+            # Get recent papers (last 2 years)
+            current_year = datetime.now().year
+            recent_papers = []
+
+            for year in range(current_year - 1, current_year + 1):
+                papers = self._fetch_papers_for_year(topic, year, limit=50)
+                recent_papers.extend(papers)
+
+            # Score papers by multiple factors
+            scored_papers = []
+
+            for paper in recent_papers:
+                score = self._calculate_relevance_score(paper)
+
+                scored_papers.append({
+                    'paper': paper,
+                    'score': score,
+                    'title': paper.get('title'),
+                    'authors': [a.get('name') for a in paper.get('authors', [])[:3]],
+                    'year': paper.get('year'),
+                    'citations': paper.get('citationCount', 0),
+                    'reason': self._generate_recommendation_reason(paper, score)
+                })
+
+            # Sort by score
+            scored_papers.sort(key=lambda x: x['score'], reverse=True)
+
+            return scored_papers[:limit]
+
+        except Exception as e:
+            logger.error(f"Error predicting next papers: {e}")
+            return []
+
+    def compare_research_trends(self, topics: List[str], years: int = 10) -> Dict[str, Any]:
+        """
+        Compare research trends across multiple topics
+
+        Args:
+            topics: List of topics to compare
+            years: Number of years to analyze
+
+        Returns:
+            Comparative trend data
+        """
+        if not self.archive_client:
+            return {"error": "Archive client required"}
+
+        try:
+            current_year = datetime.now().year
+            start_year = current_year - years
+
+            comparison = {
+                'topics': topics,
+                'time_range': f'{start_year}-{current_year}',
+                'data': {}
+            }
+
+            # Analyze each topic
+            for topic in topics:
+                yearly_counts = {}
+
+                for year in range(start_year, current_year + 1):
+                    papers = self._fetch_papers_for_year(topic, year, limit=100)
+                    yearly_counts[year] = len(papers)
+
+                comparison['data'][topic] = {
+                    'yearly_counts': yearly_counts,
+                    'total_papers': sum(yearly_counts.values()),
+                    'avg_per_year': sum(yearly_counts.values()) / len(yearly_counts),
+                    'peak_year': max(yearly_counts, key=yearly_counts.get),
+                    'trend': self._classify_trend(self._calculate_simple_growth_rate(yearly_counts))
+                }
+
+            # Determine leader
+            leader = max(
+                comparison['data'].items(),
+                key=lambda x: x[1]['total_papers']
+            )[0]
+
+            comparison['leader'] = leader
+            comparison['insights'] = self._generate_comparison_insights(comparison['data'])
+
+            return comparison
+
+        except Exception as e:
+            logger.error(f"Error comparing trends: {e}")
+            return {"error": str(e)}
+
+    def _fetch_papers_for_year(self, topic: str, year: int, limit: int = 100) -> List[Dict[str, Any]]:
+        """Fetch papers for a specific year"""
+        if not self.archive_client:
+            return []
+
+        try:
+            # Query with year filter
+            query = f"{topic} year:{year}"
+
+            results = self.archive_client.search_papers(
+                query=query,
+                limit=limit,
+                fields=['paperId', 'title', 'authors', 'year', 'citationCount', 'abstract']
+            )
+
+            papers = results.get('data', [])
+
+            # Filter by year (sometimes API returns adjacent years)
+            return [p for p in papers if p.get('year') == year]
+
+        except Exception as e:
+            logger.warning(f"Could not fetch papers for {year}: {e}")
+            return []
+
+    def _extract_trending_keywords(self, papers: List[Dict[str, Any]], top_n: int = 10) -> List[str]:
+        """Extract trending keywords from papers"""
+        all_words = []
+
+        for paper in papers:
+            # Extract from title and abstract
+            text = f"{paper.get('title', '')} {paper.get('abstract', '')}"
+            text = text.lower()
+
+            # Simple keyword extraction (could be enhanced with NLP)
+            words = re.findall(r'\b[a-z]{4,}\b', text)  # Words 4+ chars
+            all_words.extend(words)
+
+        # Count and filter common words
+        stop_words = {'that', 'this', 'with', 'from', 'have', 'been', 'using', 'which', 'their', 'they'}
+        word_counts = Counter(w for w in all_words if w not in stop_words)
+
+        return [word for word, count in word_counts.most_common(top_n)]
+
+    def _calculate_growth_rate(self, yearly_data: Dict[int, Dict]) -> float:
+        """Calculate overall growth rate"""
+        years = sorted(yearly_data.keys())
+
+        if len(years) < 2:
+            return 0.0
+
+        first_year_count = yearly_data[years[0]]['paper_count']
+        last_year_count = yearly_data[years[-1]]['paper_count']
+
+        if first_year_count == 0:
+            return float('inf') if last_year_count > 0 else 0.0
+
+        return (last_year_count - first_year_count) / first_year_count
+
+    def _calculate_simple_growth_rate(self, yearly_counts: Dict[int, int]) -> float:
+        """Calculate simple growth rate from year->count mapping"""
+        years = sorted(yearly_counts.keys())
+
+        if len(years) < 2:
+            return 0.0
+
+        first_count = yearly_counts[years[0]]
+        last_count = yearly_counts[years[-1]]
+
+        if first_count == 0:
+            return float('inf') if last_count > 0 else 0.0
+
+        return (last_count - first_count) / first_count
+
+    def _detect_inflection_points(self, yearly_data: Dict[int, Dict]) -> List[Dict[str, Any]]:
+        """Detect significant inflection points in trend"""
+        inflection_points = []
+        years = sorted(yearly_data.keys())
+
+        for i in range(1, len(years) - 1):
+            prev_year = years[i - 1]
+            curr_year = years[i]
+            next_year = years[i + 1]
+
+            prev_count = yearly_data[prev_year]['paper_count']
+            curr_count = yearly_data[curr_year]['paper_count']
+            next_count = yearly_data[next_year]['paper_count']
+
+            # Check for significant change in direction
+            if curr_count > prev_count * 1.5 and curr_count > next_count:
+                inflection_points.append({
+                    'year': curr_year,
+                    'type': 'peak',
+                    'paper_count': curr_count
+                })
+            elif curr_count < prev_count * 0.5 and curr_count < next_count:
+                inflection_points.append({
+                    'year': curr_year,
+                    'type': 'trough',
+                    'paper_count': curr_count
+                })
+
+        return inflection_points
+
+    def _identify_emerging_keywords(self, yearly_data: Dict[int, Dict]) -> List[Dict[str, Any]]:
+        """Identify keywords that emerged recently"""
+        years = sorted(yearly_data.keys())
+
+        if len(years) < 2:
+            return []
+
+        # Compare recent vs older keywords
+        recent_years = years[-3:] if len(years) >= 3 else years[-2:]
+        older_years = years[:-3] if len(years) >= 3 else years[:-2]
+
+        recent_keywords = Counter()
+        older_keywords = Counter()
+
+        for year in recent_years:
+            keywords = yearly_data[year]['keywords']
+            recent_keywords.update(keywords)
+
+        for year in older_years:
+            keywords = yearly_data[year]['keywords']
+            older_keywords.update(keywords)
+
+        # Find new keywords
+        emerging = []
+        for keyword, recent_count in recent_keywords.items():
+            older_count = older_keywords.get(keyword, 0)
+
+            if older_count == 0 and recent_count >= 2:
+                emerging.append({
+                    'keyword': keyword,
+                    'recent_mentions': recent_count,
+                    'status': 'new'
+                })
+            elif older_count > 0 and recent_count > older_count * 2:
+                emerging.append({
+                    'keyword': keyword,
+                    'recent_mentions': recent_count,
+                    'older_mentions': older_count,
+                    'growth': round((recent_count - older_count) / older_count * 100, 1),
+                    'status': 'growing'
+                })
+
+        return emerging[:10]
+
+    def _classify_trend(self, growth_rate: float) -> str:
+        """Classify trend based on growth rate"""
+        if growth_rate > 1.0:
+            return 'exponential_growth'
+        elif growth_rate > 0.5:
+            return 'strong_growth'
+        elif growth_rate > 0.2:
+            return 'moderate_growth'
+        elif growth_rate > -0.2:
+            return 'stable'
+        elif growth_rate > -0.5:
+            return 'declining'
+        else:
+            return 'strong_decline'
+
+    def _extract_all_keywords(self, papers: List[Dict[str, Any]]) -> Counter:
+        """Extract and count all keywords from papers"""
+        keywords = Counter()
+
+        for paper in papers:
+            text = f"{paper.get('title', '')} {paper.get('abstract', '')}"
+            text = text.lower()
+
+            # Extract bigrams and trigrams (more meaningful than single words)
+            words = re.findall(r'\b[a-z]+\b', text)
+
+            # Bigrams
+            for i in range(len(words) - 1):
+                bigram = f"{words[i]} {words[i+1]}"
+                if len(bigram) > 8:  # Filter very short bigrams
+                    keywords[bigram] += 1
+
+            # Trigrams
+            for i in range(len(words) - 2):
+                trigram = f"{words[i]} {words[i+1]} {words[i+2]}"
+                if len(trigram) > 12:
+                    keywords[trigram] += 1
+
+        return keywords
+
+    def _calculate_relevance_score(self, paper: Dict[str, Any]) -> float:
+        """Calculate relevance score for paper recommendation"""
+        score = 0.0
+
+        # Recency (papers from current year get boost)
+        current_year = datetime.now().year
+        year = paper.get('year', current_year - 10)
+        recency = max(0, 1 - (current_year - year) / 10)  # 0-1 score
+        score += recency * 30
+
+        # Citations (normalize to 0-40 range)
+        citations = paper.get('citationCount', 0)
+        citation_score = min(citations / 100, 1.0) * 40
+        score += citation_score
+
+        # Citation velocity (citations per year)
+        age = max(1, current_year - year)
+        velocity = citations / age
+        velocity_score = min(velocity / 50, 1.0) * 30
+        score += velocity_score
+
+        return round(score, 2)
+
+    def _generate_recommendation_reason(self, paper: Dict[str, Any], score: float) -> str:
+        """Generate human-readable reason for recommendation"""
+        current_year = datetime.now().year
+        year = paper.get('year', current_year)
+        citations = paper.get('citationCount', 0)
+
+        reasons = []
+
+        if year >= current_year:
+            reasons.append("Very recent")
+        elif year >= current_year - 1:
+            reasons.append("Recent")
+
+        if citations > 100:
+            reasons.append("Highly cited")
+        elif citations > 50:
+            reasons.append("Well cited")
+
+        age = max(1, current_year - year)
+        velocity = citations / age
+
+        if velocity > 50:
+            reasons.append("High impact")
+
+        if not reasons:
+            reasons.append("Relevant")
+
+        return " · ".join(reasons)
+
+    def _generate_comparison_insights(self, comparison_data: Dict[str, Any]) -> List[str]:
+        """Generate insights from comparison data"""
+        insights = []
+
+        # Find fastest growing
+        growth_rates = {
+            topic: self._calculate_simple_growth_rate(data['yearly_counts'])
+            for topic, data in comparison_data.items()
+        }
+
+        fastest = max(growth_rates, key=growth_rates.get)
+        insights.append(f"{fastest} shows the fastest growth")
+
+        # Find most established
+        total_papers = {
+            topic: data['total_papers']
+            for topic, data in comparison_data.items()
+        }
+
+        most_established = max(total_papers, key=total_papers.get)
+        insights.append(f"{most_established} has the most publications")
+
+        return insights
+
+
+def get_trend_analyzer(archive_client=None) -> ResearchTrendAnalyzer:
+    """
+    Get ResearchTrendAnalyzer instance
+
+    Args:
+        archive_client: ArchiveAPIClient instance
+
+    Returns:
+        ResearchTrendAnalyzer instance
+    """
+    return ResearchTrendAnalyzer(archive_client)
```