cite-agent 1.3.9-py3-none-any.whl → 1.4.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. cite_agent/__init__.py +13 -13
  2. cite_agent/__version__.py +1 -1
  3. cite_agent/action_first_mode.py +150 -0
  4. cite_agent/adaptive_providers.py +413 -0
  5. cite_agent/archive_api_client.py +186 -0
  6. cite_agent/auth.py +0 -1
  7. cite_agent/auto_expander.py +70 -0
  8. cite_agent/cache.py +379 -0
  9. cite_agent/circuit_breaker.py +370 -0
  10. cite_agent/citation_network.py +377 -0
  11. cite_agent/cli.py +8 -16
  12. cite_agent/cli_conversational.py +113 -3
  13. cite_agent/confidence_calibration.py +381 -0
  14. cite_agent/deduplication.py +325 -0
  15. cite_agent/enhanced_ai_agent.py +689 -371
  16. cite_agent/error_handler.py +228 -0
  17. cite_agent/execution_safety.py +329 -0
  18. cite_agent/full_paper_reader.py +239 -0
  19. cite_agent/observability.py +398 -0
  20. cite_agent/offline_mode.py +348 -0
  21. cite_agent/paper_comparator.py +368 -0
  22. cite_agent/paper_summarizer.py +420 -0
  23. cite_agent/pdf_extractor.py +350 -0
  24. cite_agent/proactive_boundaries.py +266 -0
  25. cite_agent/quality_gate.py +442 -0
  26. cite_agent/request_queue.py +390 -0
  27. cite_agent/response_enhancer.py +257 -0
  28. cite_agent/response_formatter.py +458 -0
  29. cite_agent/response_pipeline.py +295 -0
  30. cite_agent/response_style_enhancer.py +259 -0
  31. cite_agent/self_healing.py +418 -0
  32. cite_agent/similarity_finder.py +524 -0
  33. cite_agent/streaming_ui.py +13 -9
  34. cite_agent/thinking_blocks.py +308 -0
  35. cite_agent/tool_orchestrator.py +416 -0
  36. cite_agent/trend_analyzer.py +540 -0
  37. cite_agent/unpaywall_client.py +226 -0
  38. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/METADATA +15 -1
  39. cite_agent-1.4.3.dist-info/RECORD +62 -0
  40. cite_agent-1.3.9.dist-info/RECORD +0 -32
  41. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/WHEEL +0 -0
  42. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/entry_points.txt +0 -0
  43. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/licenses/LICENSE +0 -0
  44. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/top_level.txt +0 -0
cite_agent/paper_summarizer.py
@@ -0,0 +1,420 @@
+#!/usr/bin/env python3
+"""
+Paper Summarization Service - The Magic Happens Here!
+Turns full papers into structured summaries so you don't have to read them
+"""
+
+import asyncio
+import json
+import logging
+import re
+from dataclasses import dataclass, asdict
+from typing import Any, Dict, List, Optional, Tuple
+
+from .pdf_extractor import ExtractedPaper
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PaperSummary:
+    """Structured summary of an academic paper"""
+    # Core info
+    doi: Optional[str] = None
+    title: Optional[str] = None
+    authors: Optional[List[str]] = None
+    year: Optional[int] = None
+
+    # Summary sections
+    research_question: Optional[str] = None
+    methodology: Optional[str] = None
+    key_findings: Optional[List[str]] = None
+    limitations: Optional[str] = None
+    implications: Optional[str] = None
+
+    # Additional
+    keywords: Optional[List[str]] = None
+    citations_to: Optional[List[str]] = None  # Papers this cites
+    tables_summary: Optional[List[Dict[str, Any]]] = None
+
+    # Meta
+    word_count: int = 0
+    confidence: str = "medium"  # high, medium, low
+    extraction_quality: str = "unknown"
+
+    def to_markdown(self) -> str:
+        """Convert to readable markdown format"""
+        lines = []
+
+        # Header
+        if self.title:
+            lines.append(f"# {self.title}")
+        if self.authors:
+            lines.append(f"**Authors:** {', '.join(self.authors[:3])}" +
+                         (" et al." if len(self.authors) > 3 else ""))
+        if self.year:
+            lines.append(f"**Year:** {self.year}")
+        if self.doi:
+            lines.append(f"**DOI:** {self.doi}")
+
+        lines.append("")
+
+        # Research Question
+        if self.research_question:
+            lines.append("## Research Question")
+            lines.append(self.research_question)
+            lines.append("")
+
+        # Methodology
+        if self.methodology:
+            lines.append("## Methodology")
+            lines.append(self.methodology)
+            lines.append("")
+
+        # Key Findings
+        if self.key_findings:
+            lines.append("## Key Findings")
+            for i, finding in enumerate(self.key_findings, 1):
+                lines.append(f"{i}. {finding}")
+            lines.append("")
+
+        # Limitations
+        if self.limitations:
+            lines.append("## Limitations")
+            lines.append(self.limitations)
+            lines.append("")
+
+        # Implications
+        if self.implications:
+            lines.append("## Implications")
+            lines.append(self.implications)
+            lines.append("")
+
+        return "\n".join(lines)
+
+
+class PaperSummarizer:
+    """
+    Summarize academic papers using LLM
+    Extracts methodology, findings, and implications
+    """
+
+    def __init__(self, llm_client=None):
+        """
+        Initialize summarizer
+
+        Args:
+            llm_client: Optional LLM client (Groq, OpenAI, Anthropic)
+                        If None, uses rule-based extraction
+        """
+        self.llm_client = llm_client
+        self.use_llm = llm_client is not None
+
+    async def summarize_paper(
+        self,
+        extracted: ExtractedPaper,
+        doi: Optional[str] = None,
+        authors: Optional[List[str]] = None,
+        year: Optional[int] = None
+    ) -> PaperSummary:
+        """
+        Summarize an extracted paper
+
+        Args:
+            extracted: ExtractedPaper from PDF extraction
+            doi: DOI of paper
+            authors: List of author names
+            year: Publication year
+
+        Returns:
+            PaperSummary with structured information
+        """
+        if self.use_llm and self.llm_client:
+            return await self._summarize_with_llm(extracted, doi, authors, year)
+        else:
+            return self._summarize_rule_based(extracted, doi, authors, year)
+
+    async def _summarize_with_llm(
+        self,
+        extracted: ExtractedPaper,
+        doi: Optional[str],
+        authors: Optional[List[str]],
+        year: Optional[int]
+    ) -> PaperSummary:
+        """Summarize using LLM (best quality)"""
+
+        # Build prompt
+        prompt = self._build_summary_prompt(extracted)
+
+        try:
+            # Call LLM
+            response = await self._call_llm(prompt)
+
+            # If LLM failed or unavailable, fall back to rule-based
+            if response is None:
+                logger.info("LLM not available, using rule-based extraction")
+                return self._summarize_rule_based(extracted, doi, authors, year)
+
+            # Parse structured response
+            summary = self._parse_llm_response(response)
+
+            # Add metadata
+            summary.doi = doi
+            summary.authors = authors
+            summary.year = year
+            summary.word_count = extracted.word_count
+            summary.extraction_quality = extracted.extraction_quality
+            summary.confidence = "high" if extracted.extraction_quality == "high" else "medium"
+
+            return summary
+
+        except Exception as e:
+            logger.error(f"LLM summarization failed: {e}")
+            # Fallback to rule-based
+            return self._summarize_rule_based(extracted, doi, authors, year)
+
+    def _summarize_rule_based(
+        self,
+        extracted: ExtractedPaper,
+        doi: Optional[str],
+        authors: Optional[List[str]],
+        year: Optional[int]
+    ) -> PaperSummary:
+        """Summarize using rules (fallback, no LLM needed)"""
+
+        summary = PaperSummary(
+            doi=doi,
+            title=extracted.title,
+            authors=authors,
+            year=year,
+            word_count=extracted.word_count,
+            extraction_quality=extracted.extraction_quality,
+            confidence="medium" if extracted.extraction_quality == "high" else "low"
+        )
+
+        # Extract research question from introduction
+        if extracted.introduction:
+            summary.research_question = self._extract_research_question(extracted.introduction)
+        elif extracted.abstract:
+            summary.research_question = self._extract_research_question(extracted.abstract)
+
+        # Methodology
+        if extracted.methodology:
+            summary.methodology = self._truncate(extracted.methodology, 500)
+
+        # Key findings from results
+        if extracted.results:
+            summary.key_findings = self._extract_key_findings(extracted.results)
+
+        # Limitations from discussion
+        if extracted.discussion:
+            summary.limitations = self._extract_limitations(extracted.discussion)
+
+        # Implications from conclusion
+        if extracted.conclusion:
+            summary.implications = self._truncate(extracted.conclusion, 500)
+
+        # Tables
+        if extracted.tables:
+            summary.tables_summary = [
+                {
+                    'page': t.get('page'),
+                    'rows': t.get('rows'),
+                    'preview': str(t.get('data', [])[:2])
+                }
+                for t in extracted.tables[:3]  # First 3 tables
+            ]
+
+        return summary
+
+    def _build_summary_prompt(self, extracted: ExtractedPaper) -> str:
+        """Build prompt for LLM summarization"""
+
+        sections = []
+
+        if extracted.abstract:
+            sections.append(f"ABSTRACT:\n{extracted.abstract[:2000]}")
+
+        if extracted.introduction:
+            sections.append(f"INTRODUCTION:\n{extracted.introduction[:2000]}")
+
+        if extracted.methodology:
+            sections.append(f"METHODOLOGY:\n{extracted.methodology[:2000]}")
+
+        if extracted.results:
+            sections.append(f"RESULTS:\n{extracted.results[:2000]}")
+
+        if extracted.conclusion:
+            sections.append(f"CONCLUSION:\n{extracted.conclusion[:1000]}")
+
+        paper_text = "\n\n".join(sections)
+
+        prompt = f"""Analyze this academic paper and extract key information.
+
+{paper_text}
+
+Provide a structured summary in JSON format:
+{{
+    "research_question": "What research question does this paper address?",
+    "methodology": "What methods did they use? (2-3 sentences)",
+    "key_findings": ["Finding 1", "Finding 2", "Finding 3"],
+    "limitations": "What are the main limitations? (1-2 sentences)",
+    "implications": "What are the implications/conclusions? (2-3 sentences)",
+    "keywords": ["keyword1", "keyword2", "keyword3"]
+}}
+
+Be concise. Extract ONLY what's explicitly stated, do not infer."""
+
+        return prompt
+
+    async def _call_llm(self, prompt: str) -> Optional[str]:
+        """Call LLM client (Groq, OpenAI, Anthropic, etc.); returns None on failure"""
+        if self.llm_client is None:
+            logger.warning("No LLM client available, will use rule-based extraction")
+            return None
+
+        try:
+            # Try Groq/OpenAI-compatible API
+            if hasattr(self.llm_client, 'chat'):
+                response = await self.llm_client.chat.completions.create(
+                    model="llama-3.3-70b-versatile",
+                    messages=[
+                        {"role": "system", "content": "You are a research assistant that summarizes academic papers. Be concise and accurate."},
+                        {"role": "user", "content": prompt}
+                    ],
+                    temperature=0.2,
+                    max_tokens=1000
+                )
+                return response.choices[0].message.content
+
+            # Try Anthropic API
+            elif hasattr(self.llm_client, 'messages'):
+                response = await self.llm_client.messages.create(
+                    model="claude-3-haiku-20240307",
+                    max_tokens=1000,
+                    temperature=0.2,
+                    messages=[
+                        {"role": "user", "content": f"You are a research assistant that summarizes academic papers. Be concise and accurate.\n\n{prompt}"}
+                    ]
+                )
+                return response.content[0].text
+
+            else:
+                logger.warning(f"LLM client type not recognized: {type(self.llm_client).__name__}")
+                logger.info("Falling back to rule-based extraction")
+                return None
+
+        except Exception as e:
+            logger.warning(f"LLM call failed: {e}")
+            logger.info("Falling back to rule-based extraction")
+            return None
+
+    def _parse_llm_response(self, response: str) -> PaperSummary:
+        """Parse structured JSON from LLM"""
+        try:
+            # Extract JSON object from response
+            json_start = response.find('{')
+            json_end = response.rfind('}') + 1
+            if json_start == -1 or json_end <= json_start:
+                raise ValueError("no JSON object found in LLM response")
+            json_str = response[json_start:json_end]
+
+            data = json.loads(json_str)
+
+            return PaperSummary(
+                research_question=data.get('research_question'),
+                methodology=data.get('methodology'),
+                key_findings=data.get('key_findings', []),
+                limitations=data.get('limitations'),
+                implications=data.get('implications'),
+                keywords=data.get('keywords', []),
+                confidence="high"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to parse LLM response: {e}")
+            return PaperSummary(confidence="low")
+
+    def _extract_research_question(self, text: str) -> Optional[str]:
+        """Extract research question from introduction/abstract"""
+        # Look for common patterns
+        patterns = [
+            r"[Ww]e (?:investigate|examine|explore|study|analyze) (.+?)\.",
+            r"[Tt]his (?:paper|study|research) (?:investigates|examines|explores) (.+?)\.",
+            r"[Tt]he (?:aim|goal|objective|purpose) (?:is|was) to (.+?)\.",
+            r"[Rr]esearch question[:\s]+(.+?)\.",
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, text)
+            if match:
+                return match.group(1).strip()[:200]
+
+        # Fallback: first sentence of abstract/intro
+        sentences = text.split('.')
+        if sentences:
+            return sentences[0].strip()[:200]
+
+        return None
+
+    def _extract_key_findings(self, results_text: str) -> Optional[List[str]]:
+        """Extract key findings from results section"""
+        findings = []
+
+        # Look for sentences with statistical significance or strong claims
+        sentences = results_text.split('.')
+
+        for sentence in sentences:
+            # Look for p-values, significant, correlation, etc.
+            if any(keyword in sentence.lower() for keyword in [
+                'significant', 'p <', 'p=', 'correlation', 'showed', 'demonstrated',
+                'increased', 'decreased', 'higher', 'lower'
+            ]):
+                finding = sentence.strip()
+                if 50 < len(finding) < 300:  # Reasonable length
+                    findings.append(finding)
+                    if len(findings) >= 5:
+                        break
+
+        return findings[:5] if findings else None
+
+    def _extract_limitations(self, discussion_text: str) -> Optional[str]:
+        """Extract limitations from discussion"""
+        # Look for limitations section
+        patterns = [
+            r"[Ll]imitations?[:\s]+(.+?)(?:\.|$)",
+            r"[Aa] limitation (?:is|was) (.+?)\.",
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, discussion_text)
+            if match:
+                return match.group(1).strip()[:300]
+
+        return None
+
+    def _truncate(self, text: str, max_length: int) -> str:
+        """Truncate text to max length"""
+        if len(text) <= max_length:
+            return text
+        return text[:max_length].rsplit(' ', 1)[0] + "..."
+
+    async def batch_summarize(
+        self,
+        papers: List[Tuple[ExtractedPaper, Optional[str], Optional[List[str]], Optional[int]]]
+    ) -> List[PaperSummary]:
+        """
+        Summarize multiple papers in parallel
+
+        Args:
+            papers: List of (extracted, doi, authors, year) tuples
+
+        Returns:
+            List of PaperSummary objects (papers whose summarization raised are skipped)
+        """
+        tasks = [
+            self.summarize_paper(ext, doi, authors, year)
+            for ext, doi, authors, year in papers
+        ]
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # gather(return_exceptions=True) mixes exceptions into the results;
+        # log and drop them so the return type matches the annotation
+        summaries: List[PaperSummary] = []
+        for result in results:
+            if isinstance(result, BaseException):
+                logger.warning(f"Batch summarization failed for one paper: {result}")
+            else:
+                summaries.append(result)
+        return summaries
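
For context, here is a minimal usage sketch of the new summarizer (illustrative only, not part of the diff). The real input type is cite_agent.pdf_extractor.ExtractedPaper; since its constructor is not shown in this diff, the sketch uses a hypothetical stand-in dataclass carrying just the attributes paper_summarizer.py reads, and exercises the rule-based path (no llm_client). Assumes the cite-agent wheel is installed.

# Illustrative sketch only. FakeExtractedPaper is a hypothetical stand-in
# for cite_agent.pdf_extractor.ExtractedPaper, whose constructor this diff
# does not show; the field names below are taken from how paper_summarizer
# reads them (abstract, results, discussion, word_count, ...).
import asyncio
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

from cite_agent.paper_summarizer import PaperSummarizer

@dataclass
class FakeExtractedPaper:
    title: Optional[str] = "A Study of Widget Performance"
    abstract: Optional[str] = "We investigate widget throughput under load."
    introduction: Optional[str] = None
    methodology: Optional[str] = "Benchmarks were run on 10 machines."
    results: Optional[str] = "Throughput was significantly higher with caching enabled (p < 0.05)."
    discussion: Optional[str] = "A limitation is the small sample size."
    conclusion: Optional[str] = "Caching improves throughput."
    tables: List[Dict[str, Any]] = field(default_factory=list)
    word_count: int = 4200
    extraction_quality: str = "high"

async def main() -> None:
    # No llm_client, so this takes the rule-based extraction path;
    # passing an OpenAI-compatible async client would use the LLM path.
    summarizer = PaperSummarizer()
    summary = await summarizer.summarize_paper(
        FakeExtractedPaper(),
        doi="10.1234/example",  # hypothetical metadata
        authors=["A. Author", "B. Author"],
        year=2024,
    )
    print(summary.to_markdown())

asyncio.run(main())

With this input, the rule-based path pulls the research question from the abstract via the "We investigate ..." pattern, keeps the significance-flavored results sentence as a key finding, and matches "A limitation is ..." in the discussion, then to_markdown() renders the result with the headers shown above.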