cite-agent 1.3.9-py3-none-any.whl → 1.4.3-py3-none-any.whl
This diff shows the changes between two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in that public registry.
- cite_agent/__init__.py +13 -13
- cite_agent/__version__.py +1 -1
- cite_agent/action_first_mode.py +150 -0
- cite_agent/adaptive_providers.py +413 -0
- cite_agent/archive_api_client.py +186 -0
- cite_agent/auth.py +0 -1
- cite_agent/auto_expander.py +70 -0
- cite_agent/cache.py +379 -0
- cite_agent/circuit_breaker.py +370 -0
- cite_agent/citation_network.py +377 -0
- cite_agent/cli.py +8 -16
- cite_agent/cli_conversational.py +113 -3
- cite_agent/confidence_calibration.py +381 -0
- cite_agent/deduplication.py +325 -0
- cite_agent/enhanced_ai_agent.py +689 -371
- cite_agent/error_handler.py +228 -0
- cite_agent/execution_safety.py +329 -0
- cite_agent/full_paper_reader.py +239 -0
- cite_agent/observability.py +398 -0
- cite_agent/offline_mode.py +348 -0
- cite_agent/paper_comparator.py +368 -0
- cite_agent/paper_summarizer.py +420 -0
- cite_agent/pdf_extractor.py +350 -0
- cite_agent/proactive_boundaries.py +266 -0
- cite_agent/quality_gate.py +442 -0
- cite_agent/request_queue.py +390 -0
- cite_agent/response_enhancer.py +257 -0
- cite_agent/response_formatter.py +458 -0
- cite_agent/response_pipeline.py +295 -0
- cite_agent/response_style_enhancer.py +259 -0
- cite_agent/self_healing.py +418 -0
- cite_agent/similarity_finder.py +524 -0
- cite_agent/streaming_ui.py +13 -9
- cite_agent/thinking_blocks.py +308 -0
- cite_agent/tool_orchestrator.py +416 -0
- cite_agent/trend_analyzer.py +540 -0
- cite_agent/unpaywall_client.py +226 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/METADATA +15 -1
- cite_agent-1.4.3.dist-info/RECORD +62 -0
- cite_agent-1.3.9.dist-info/RECORD +0 -32
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/WHEEL +0 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/entry_points.txt +0 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/licenses/LICENSE +0 -0
- {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/cite_agent/paper_summarizer.py
@@ -0,0 +1,420 @@
+#!/usr/bin/env python3
+"""
+Paper Summarization Service - The Magic Happens Here!
+Turns full papers into structured summaries so you don't have to read them
+"""
+
+import logging
+import json
+from typing import Dict, List, Optional, Any, Union
+from dataclasses import dataclass, asdict
+from .pdf_extractor import ExtractedPaper
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PaperSummary:
+    """Structured summary of an academic paper"""
+    # Core info
+    doi: Optional[str] = None
+    title: Optional[str] = None
+    authors: Optional[List[str]] = None
+    year: Optional[int] = None
+
+    # Summary sections
+    research_question: Optional[str] = None
+    methodology: Optional[str] = None
+    key_findings: Optional[List[str]] = None
+    limitations: Optional[str] = None
+    implications: Optional[str] = None
+
+    # Additional
+    keywords: Optional[List[str]] = None
+    citations_to: Optional[List[str]] = None  # Papers this cites
+    tables_summary: Optional[List[Dict[str, Any]]] = None
+
+    # Meta
+    word_count: int = 0
+    confidence: str = "medium"  # high, medium, low
+    extraction_quality: str = "unknown"
+
+    def to_markdown(self) -> str:
+        """Convert to readable markdown format"""
+        lines = []
+
+        # Header
+        if self.title:
+            lines.append(f"# {self.title}")
+        if self.authors:
+            lines.append(f"**Authors:** {', '.join(self.authors[:3])}" +
+                         (" et al." if len(self.authors) > 3 else ""))
+        if self.year:
+            lines.append(f"**Year:** {self.year}")
+        if self.doi:
+            lines.append(f"**DOI:** {self.doi}")
+
+        lines.append("")
+
+        # Research Question
+        if self.research_question:
+            lines.append("## Research Question")
+            lines.append(self.research_question)
+            lines.append("")
+
+        # Methodology
+        if self.methodology:
+            lines.append("## Methodology")
+            lines.append(self.methodology)
+            lines.append("")
+
+        # Key Findings
+        if self.key_findings:
+            lines.append("## Key Findings")
+            for i, finding in enumerate(self.key_findings, 1):
+                lines.append(f"{i}. {finding}")
+            lines.append("")
+
+        # Limitations
+        if self.limitations:
+            lines.append("## Limitations")
+            lines.append(self.limitations)
+            lines.append("")
+
+        # Implications
+        if self.implications:
+            lines.append("## Implications")
+            lines.append(self.implications)
+            lines.append("")
+
+        return "\n".join(lines)
+
+
+class PaperSummarizer:
+    """
+    Summarize academic papers using LLM
+    Extracts methodology, findings, and implications
+    """
+
+    def __init__(self, llm_client=None):
+        """
+        Initialize summarizer
+
+        Args:
+            llm_client: Optional LLM client (Groq, OpenAI, Anthropic)
+                        If None, uses rule-based extraction
+        """
+        self.llm_client = llm_client
+        self.use_llm = llm_client is not None
+
+    async def summarize_paper(
+        self,
+        extracted: ExtractedPaper,
+        doi: Optional[str] = None,
+        authors: Optional[List[str]] = None,
+        year: Optional[int] = None
+    ) -> PaperSummary:
+        """
+        Summarize an extracted paper
+
+        Args:
+            extracted: ExtractedPaper from PDF extraction
+            doi: DOI of paper
+            authors: List of author names
+            year: Publication year
+
+        Returns:
+            PaperSummary with structured information
+        """
+        if self.use_llm and self.llm_client:
+            return await self._summarize_with_llm(extracted, doi, authors, year)
+        else:
+            return self._summarize_rule_based(extracted, doi, authors, year)
+
+    async def _summarize_with_llm(
+        self,
+        extracted: ExtractedPaper,
+        doi: Optional[str],
+        authors: Optional[List[str]],
+        year: Optional[int]
+    ) -> PaperSummary:
+        """Summarize using LLM (best quality)"""
+
+        # Build prompt
+        prompt = self._build_summary_prompt(extracted)
+
+        try:
+            # Call LLM
+            response = await self._call_llm(prompt)
+
+            # If LLM failed or unavailable, fall back to rule-based
+            if response is None:
+                logger.info("LLM not available, using rule-based extraction")
+                return self._summarize_rule_based(extracted, doi, authors, year)
+
+            # Parse structured response
+            summary = self._parse_llm_response(response)
+
+            # Add metadata
+            summary.doi = doi
+            summary.authors = authors
+            summary.year = year
+            summary.word_count = extracted.word_count
+            summary.extraction_quality = extracted.extraction_quality
+            summary.confidence = "high" if extracted.extraction_quality == "high" else "medium"
+
+            return summary
+
+        except Exception as e:
+            logger.error(f"LLM summarization failed: {e}")
+            # Fallback to rule-based
+            return self._summarize_rule_based(extracted, doi, authors, year)
+
+    def _summarize_rule_based(
+        self,
+        extracted: ExtractedPaper,
+        doi: Optional[str],
+        authors: Optional[List[str]],
+        year: Optional[int]
+    ) -> PaperSummary:
+        """Summarize using rules (fallback, no LLM needed)"""
+
+        summary = PaperSummary(
+            doi=doi,
+            title=extracted.title,
+            authors=authors,
+            year=year,
+            word_count=extracted.word_count,
+            extraction_quality=extracted.extraction_quality,
+            confidence="medium" if extracted.extraction_quality == "high" else "low"
+        )
+
+        # Extract research question from introduction
+        if extracted.introduction:
+            summary.research_question = self._extract_research_question(extracted.introduction)
+        elif extracted.abstract:
+            summary.research_question = self._extract_research_question(extracted.abstract)
+
+        # Methodology
+        if extracted.methodology:
+            summary.methodology = self._truncate(extracted.methodology, 500)
+
+        # Key findings from results
+        if extracted.results:
+            summary.key_findings = self._extract_key_findings(extracted.results)
+
+        # Limitations from discussion
+        if extracted.discussion:
+            summary.limitations = self._extract_limitations(extracted.discussion)
+
+        # Implications from conclusion
+        if extracted.conclusion:
+            summary.implications = self._truncate(extracted.conclusion, 500)
+
+        # Tables
+        if extracted.tables:
+            summary.tables_summary = [
+                {
+                    'page': t.get('page'),
+                    'rows': t.get('rows'),
+                    'preview': str(t.get('data', [])[:2])
+                }
+                for t in extracted.tables[:3]  # First 3 tables
+            ]
+
+        return summary
+
+    def _build_summary_prompt(self, extracted: ExtractedPaper) -> str:
+        """Build prompt for LLM summarization"""
+
+        sections = []
+
+        if extracted.abstract:
+            sections.append(f"ABSTRACT:\n{extracted.abstract[:2000]}")
+
+        if extracted.introduction:
+            sections.append(f"INTRODUCTION:\n{extracted.introduction[:2000]}")
+
+        if extracted.methodology:
+            sections.append(f"METHODOLOGY:\n{extracted.methodology[:2000]}")
+
+        if extracted.results:
+            sections.append(f"RESULTS:\n{extracted.results[:2000]}")
+
+        if extracted.conclusion:
+            sections.append(f"CONCLUSION:\n{extracted.conclusion[:1000]}")
+
+        paper_text = "\n\n".join(sections)
+
+        prompt = f"""Analyze this academic paper and extract key information.
+
+{paper_text}
+
+Provide a structured summary in JSON format:
+{{
+    "research_question": "What research question does this paper address?",
+    "methodology": "What methods did they use? (2-3 sentences)",
+    "key_findings": ["Finding 1", "Finding 2", "Finding 3"],
+    "limitations": "What are the main limitations? (1-2 sentences)",
+    "implications": "What are the implications/conclusions? (2-3 sentences)",
+    "keywords": ["keyword1", "keyword2", "keyword3"]
+}}
+
+Be concise. Extract ONLY what's explicitly stated, do not infer."""
+
+        return prompt
+
+    async def _call_llm(self, prompt: str) -> Optional[str]:
+        """Call LLM client (Groq, OpenAI, Anthropic, etc.)"""
+        if self.llm_client is None:
+            logger.warning("No LLM client available, will use rule-based extraction")
+            return None
+
+        try:
+            # Try Groq/OpenAI-compatible API
+            if hasattr(self.llm_client, 'chat'):
+                response = await self.llm_client.chat.completions.create(
+                    model="llama-3.3-70b-versatile",
+                    messages=[
+                        {"role": "system", "content": "You are a research assistant that summarizes academic papers. Be concise and accurate."},
+                        {"role": "user", "content": prompt}
+                    ],
+                    temperature=0.2,
+                    max_tokens=1000
+                )
+                return response.choices[0].message.content
+
+            # Try Anthropic API
+            elif hasattr(self.llm_client, 'messages'):
+                response = await self.llm_client.messages.create(
+                    model="claude-3-haiku-20240307",
+                    max_tokens=1000,
+                    temperature=0.2,
+                    messages=[
+                        {"role": "user", "content": f"You are a research assistant that summarizes academic papers. Be concise and accurate.\n\n{prompt}"}
+                    ]
+                )
+                return response.content[0].text
+
+            else:
+                logger.warning(f"LLM client type not recognized: {type(self.llm_client).__name__}")
+                logger.info("Falling back to rule-based extraction")
+                return None
+
+        except Exception as e:
+            logger.warning(f"LLM call failed: {e}")
+            logger.info("Falling back to rule-based extraction")
+            return None
+
+    def _parse_llm_response(self, response: str) -> PaperSummary:
+        """Parse structured JSON from LLM"""
+        try:
+            # Extract JSON from response
+            json_start = response.find('{')
+            json_end = response.rfind('}') + 1
+            json_str = response[json_start:json_end]
+
+            data = json.loads(json_str)
+
+            return PaperSummary(
+                research_question=data.get('research_question'),
+                methodology=data.get('methodology'),
+                key_findings=data.get('key_findings', []),
+                limitations=data.get('limitations'),
+                implications=data.get('implications'),
+                keywords=data.get('keywords', []),
+                confidence="high"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to parse LLM response: {e}")
+            return PaperSummary(confidence="low")
+
+    def _extract_research_question(self, text: str) -> Optional[str]:
+        """Extract research question from introduction/abstract"""
+        # Look for common patterns
+        patterns = [
+            r"[Ww]e (?:investigate|examine|explore|study|analyze) (.+?)\.",
+            r"[Tt]his (?:paper|study|research) (?:investigates|examines|explores) (.+?)\.",
+            r"[Tt]he (?:aim|goal|objective|purpose) (?:is|was) to (.+?)\.",
+            r"[Rr]esearch question[:\s]+(.+?)\.",
+        ]
+
+        import re
+        for pattern in patterns:
+            match = re.search(pattern, text)
+            if match:
+                return match.group(1).strip()[:200]
+
+        # Fallback: first sentence of abstract/intro
+        sentences = text.split('.')
+        if sentences:
+            return sentences[0].strip()[:200]
+
+        return None
+
+    def _extract_key_findings(self, results_text: str) -> Optional[List[str]]:
+        """Extract key findings from results section"""
+        findings = []
+
+        # Look for sentences with statistical significance or strong claims
+        sentences = results_text.split('.')
+
+        import re
+        for sentence in sentences:
+            # Look for p-values, significant, correlation, etc.
+            if any(keyword in sentence.lower() for keyword in [
+                'significant', 'p <', 'p=', 'correlation', 'showed', 'demonstrated',
+                'increased', 'decreased', 'higher', 'lower'
+            ]):
+                finding = sentence.strip()
+                if 50 < len(finding) < 300:  # Reasonable length
+                    findings.append(finding)
+                    if len(findings) >= 5:
+                        break
+
+        return findings[:5] if findings else None
+
+    def _extract_limitations(self, discussion_text: str) -> Optional[str]:
+        """Extract limitations from discussion"""
+        import re
+
+        # Look for limitations section
+        patterns = [
+            r"[Ll]imitations?[:\s]+(.+?)(?:\.|$)",
+            r"[Aa] limitation (?:is|was) (.+?)\.",
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, discussion_text)
+            if match:
+                return match.group(1).strip()[:300]
+
+        return None
+
+    def _truncate(self, text: str, max_length: int) -> str:
+        """Truncate text to max length"""
+        if len(text) <= max_length:
+            return text
+        return text[:max_length].rsplit(' ', 1)[0] + "..."
+
+    async def batch_summarize(
+        self,
+        papers: List[tuple[ExtractedPaper, str, List[str], int]]
+    ) -> List[Union[PaperSummary, BaseException]]:
+        """
+        Summarize multiple papers in parallel
+
+        Args:
+            papers: List of (extracted, doi, authors, year) tuples
+
+        Returns:
+            List of PaperSummary objects (failures are returned as the raised exceptions)
+        """
+        import asyncio
+
+        tasks = [
+            self.summarize_paper(ext, doi, authors, year)
+            for ext, doi, authors, year in papers
+        ]
+
+        return await asyncio.gather(*tasks, return_exceptions=True)
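For orientation, here is a minimal usage sketch of the new `paper_summarizer` module, exercising the rule-based fallback (no `llm_client`). The `ExtractedPaper` constructor call is an assumption: the keyword arguments mirror the attributes this summarizer reads (`title`, `abstract`, `introduction`, `methodology`, `results`, `discussion`, `conclusion`, `tables`, `word_count`, `extraction_quality`), but the actual dataclass in `pdf_extractor.py` may take different or additional fields.

```python
# Sketch: rule-based summarization path (no LLM client configured).
# The ExtractedPaper kwargs are hypothetical; they mirror the attributes
# paper_summarizer.py reads, not a confirmed constructor signature.
import asyncio

from cite_agent.pdf_extractor import ExtractedPaper
from cite_agent.paper_summarizer import PaperSummarizer


async def main() -> None:
    paper = ExtractedPaper(
        title="Sleep and Memory Consolidation",
        abstract="We investigate the effect of sleep duration on recall.",
        introduction="The aim is to quantify how sleep affects next-day recall.",
        methodology="Randomized crossover design with 80 adult participants.",
        results=(
            "Recall was significantly higher after eight hours of sleep "
            "than after four hours across every age group we tested."
        ),
        discussion="A limitation is the short follow-up period.",
        conclusion="Longer sleep measurably improves next-day recall.",
        tables=[],
        word_count=4200,
        extraction_quality="high",
    )

    # No llm_client, so summarize_paper() routes to _summarize_rule_based()
    summarizer = PaperSummarizer()
    summary = await summarizer.summarize_paper(
        paper, doi="10.1234/example", authors=["A. Author"], year=2024
    )

    print(summary.confidence)     # "medium": rule-based + high-quality extraction
    print(summary.to_markdown())  # headed sections: question, methods, findings


asyncio.run(main())
```

Passing a Groq/OpenAI-style or Anthropic-style client as `llm_client` routes the same call through `_summarize_with_llm()`, which falls back to this rule-based path whenever the client type is unrecognized or the call fails.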