cite-agent 1.3.9-py3-none-any.whl → 1.4.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. cite_agent/__init__.py +13 -13
  2. cite_agent/__version__.py +1 -1
  3. cite_agent/action_first_mode.py +150 -0
  4. cite_agent/adaptive_providers.py +413 -0
  5. cite_agent/archive_api_client.py +186 -0
  6. cite_agent/auth.py +0 -1
  7. cite_agent/auto_expander.py +70 -0
  8. cite_agent/cache.py +379 -0
  9. cite_agent/circuit_breaker.py +370 -0
  10. cite_agent/citation_network.py +377 -0
  11. cite_agent/cli.py +8 -16
  12. cite_agent/cli_conversational.py +113 -3
  13. cite_agent/confidence_calibration.py +381 -0
  14. cite_agent/deduplication.py +325 -0
  15. cite_agent/enhanced_ai_agent.py +689 -371
  16. cite_agent/error_handler.py +228 -0
  17. cite_agent/execution_safety.py +329 -0
  18. cite_agent/full_paper_reader.py +239 -0
  19. cite_agent/observability.py +398 -0
  20. cite_agent/offline_mode.py +348 -0
  21. cite_agent/paper_comparator.py +368 -0
  22. cite_agent/paper_summarizer.py +420 -0
  23. cite_agent/pdf_extractor.py +350 -0
  24. cite_agent/proactive_boundaries.py +266 -0
  25. cite_agent/quality_gate.py +442 -0
  26. cite_agent/request_queue.py +390 -0
  27. cite_agent/response_enhancer.py +257 -0
  28. cite_agent/response_formatter.py +458 -0
  29. cite_agent/response_pipeline.py +295 -0
  30. cite_agent/response_style_enhancer.py +259 -0
  31. cite_agent/self_healing.py +418 -0
  32. cite_agent/similarity_finder.py +524 -0
  33. cite_agent/streaming_ui.py +13 -9
  34. cite_agent/thinking_blocks.py +308 -0
  35. cite_agent/tool_orchestrator.py +416 -0
  36. cite_agent/trend_analyzer.py +540 -0
  37. cite_agent/unpaywall_client.py +226 -0
  38. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/METADATA +15 -1
  39. cite_agent-1.4.3.dist-info/RECORD +62 -0
  40. cite_agent-1.3.9.dist-info/RECORD +0 -32
  41. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/WHEEL +0 -0
  42. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/entry_points.txt +0 -0
  43. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/licenses/LICENSE +0 -0
  44. {cite_agent-1.3.9.dist-info → cite_agent-1.4.3.dist-info}/top_level.txt +0 -0
cite_agent/quality_gate.py
@@ -0,0 +1,442 @@
+"""
+Response Quality Gate - Assess Before Sending
+Ensures every response meets quality standards
+
+This is the "reflection" step that Claude/Cursor has but this agent was missing
+"""
+
+import re
+import logging
+from typing import Dict, Any, List, Optional
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class QualityAssessment:
+    """Results of quality assessment"""
+    overall_score: float  # 0.0-1.0
+    should_retry: bool  # True if quality too low
+    issues: List[str]  # Problems found
+    suggestions: List[str]  # How to improve
+    strengths: List[str]  # What's good
+    category_scores: Dict[str, float]  # Detailed scores
+
+
+class ResponseQualityGate:
+    """
+    Gate-keeper for response quality
+    Assesses responses before sending to ensure they meet standards
+
+    Standards:
+    1. Clarity - Is it easy to understand?
+    2. Completeness - Does it answer the question?
+    3. Structure - Is it scannable?
+    4. Appropriateness - Right tone and level?
+    5. Safety - No technical errors exposed?
+    """
+
+    # Minimum acceptable quality score
+    MIN_ACCEPTABLE_SCORE = 0.6  # Below this, we should retry
+
+    # Weights for different aspects
+    WEIGHTS = {
+        'clarity': 0.25,
+        'completeness': 0.30,
+        'structure': 0.20,
+        'appropriateness': 0.15,
+        'safety': 0.10
+    }
+
+    @classmethod
+    def assess(
+        cls,
+        response: str,
+        original_query: str,
+        context: Dict[str, Any]
+    ) -> QualityAssessment:
+        """
+        Comprehensive quality assessment
+
+        Returns assessment with scores, issues, and suggestions
+        """
+        issues = []
+        suggestions = []
+        strengths = []
+        scores = {}
+
+        # 1. Clarity - Is it understandable?
+        scores['clarity'] = cls._assess_clarity(response, issues, suggestions, strengths)
+
+        # 2. Completeness - Does it answer the question?
+        scores['completeness'] = cls._assess_completeness(
+            response, original_query, context, issues, suggestions, strengths
+        )
+
+        # 3. Structure - Is it scannable?
+        scores['structure'] = cls._assess_structure(response, issues, suggestions, strengths)
+
+        # 4. Appropriateness - Right tone/level?
+        scores['appropriateness'] = cls._assess_appropriateness(
+            response, original_query, issues, suggestions, strengths
+        )
+
+        # 5. Safety - No technical leakage?
+        scores['safety'] = cls._assess_safety(response, issues, suggestions, strengths)
+
+        # Calculate overall score
+        overall_score = sum(scores[k] * cls.WEIGHTS[k] for k in cls.WEIGHTS)
+
+        # Determine if we should retry
+        should_retry = overall_score < cls.MIN_ACCEPTABLE_SCORE
+
+        return QualityAssessment(
+            overall_score=overall_score,
+            should_retry=should_retry,
+            issues=issues,
+            suggestions=suggestions,
+            strengths=strengths,
+            category_scores=scores
+        )
+
+    @classmethod
+    def _assess_clarity(
+        cls,
+        response: str,
+        issues: List[str],
+        suggestions: List[str],
+        strengths: List[str]
+    ) -> float:
+        """Assess: Is the response clear and easy to understand?"""
+        score = 1.0
+
+        # Check for excessive hedging/uncertainty
+        hedge_words = ['might', 'could', 'possibly', 'perhaps', 'maybe', 'probably']
+        hedge_count = sum(1 for word in hedge_words if f' {word} ' in response.lower())
+
+        if hedge_count > 4:
+            score -= 0.2
+            issues.append(f"Too many uncertain words ({hedge_count})")
+            suggestions.append("Be more definitive where possible")
+
+        # Check for run-on sentences
+        sentences = re.split(r'[.!?]+', response)
+        long_sentences = [s for s in sentences if len(s.split()) > 40]
+
+        if len(long_sentences) > 2:
+            score -= 0.15
+            issues.append("Has overly long sentences")
+            suggestions.append("Break up long sentences for clarity")
+
+        # Check for jargon without explanation
+        jargon_terms = [
+            'api', 'json', 'http', 'ssl', 'tls', 'tcp', 'latency',
+            'throughput', 'endpoint', 'payload', 'schema'
+        ]
+        jargon_found = [term for term in jargon_terms if term in response.lower()]
+
+        if len(jargon_found) > 3:
+            score -= 0.2
+            issues.append(f"Contains unexplained jargon: {', '.join(jargon_found[:3])}")
+            suggestions.append("Explain technical terms or use simpler language")
+
+        # Positive signals
+        if any(marker in response for marker in ['•', '- ', '**', '__']):
+            strengths.append("Uses formatting for clarity")
+            score = min(1.0, score + 0.1)
+
+        return max(0.0, score)
+
+    @classmethod
+    def _assess_completeness(
+        cls,
+        response: str,
+        query: str,
+        context: Dict[str, Any],
+        issues: List[str],
+        suggestions: List[str],
+        strengths: List[str]
+    ) -> float:
+        """Assess: Does the response actually answer the question?"""
+        score = 0.7  # Start with assumption it's mostly complete
+
+        query_lower = query.lower()
+        response_lower = response.lower()
+
+        # Check for deflection without attempt
+        deflection_phrases = [
+            "i don't have access",
+            "i can't help",
+            "i'm not sure",
+            "i don't know"
+        ]
+
+        has_deflection = any(phrase in response_lower for phrase in deflection_phrases)
+
+        if has_deflection:
+            # Deflection is OK if response offers alternatives
+            offers_alternative = any(word in response_lower for word in ['try', 'instead', 'alternatively', 'you could'])
+
+            if not offers_alternative:
+                score -= 0.3
+                issues.append("Deflects without offering alternatives")
+                suggestions.append("Suggest what the user can do instead")
+            else:
+                strengths.append("Deflects gracefully with alternatives")
+
+        # Check if response is too brief for complex query
+        query_complexity = len(query.split())
+        response_length = len(response.split())
+
+        if query_complexity > 15 and response_length < 30:
+            score -= 0.2
+            issues.append("Response too brief for complex query")
+            suggestions.append("Provide more detailed explanation")
+
+        # Check if response addresses key terms from query
+        # Extract important words from query (not stop words)
+        stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'what', 'how', 'why', 'when', 'where', 'who'}
+        query_keywords = [
+            word.lower().strip('?.,!')
+            for word in query.split()
+            if len(word) > 3 and word.lower() not in stop_words
+        ]
+
+        # Check how many keywords are addressed
+        if query_keywords:
+            addressed = sum(1 for kw in query_keywords if kw in response_lower)
+            coverage = addressed / len(query_keywords)
+
+            if coverage < 0.3:
+                score -= 0.25
+                issues.append("Doesn't address key terms from query")
+                suggestions.append(f"Address these terms: {', '.join(query_keywords[:3])}")
+            elif coverage > 0.7:
+                strengths.append("Addresses key terms from query")
+
+        # Positive: Provides specific information
+        has_specifics = (
+            any(char.isdigit() for char in response) or  # Numbers
+            '/' in response or  # Paths
+            bool(re.search(r'\w+\.\w+', response))  # File extensions or domains
+        )
+
+        if has_specifics:
+            strengths.append("Provides specific information")
+            score = min(1.0, score + 0.1)
+
+        return max(0.0, score)
+
+    @classmethod
+    def _assess_structure(
+        cls,
+        response: str,
+        issues: List[str],
+        suggestions: List[str],
+        strengths: List[str]
+    ) -> float:
+        """Assess: Is the response well-structured and scannable?"""
+        score = 0.5  # Neutral start
+
+        # Good structure indicators
+        has_bullets = ('•' in response or
+                       re.search(r'^\s*[-*]\s', response, re.MULTILINE) or
+                       re.search(r'^\s*\d+\.\s', response, re.MULTILINE))
+
+        has_paragraphs = '\n\n' in response or response.count('\n') >= 2
+
+        has_emphasis = '**' in response or '__' in response
+
+        # Score based on structure elements
+        if has_bullets:
+            score += 0.2
+            strengths.append("Uses bullets/lists for structure")
+
+        if has_paragraphs:
+            score += 0.15
+            strengths.append("Breaks into readable paragraphs")
+
+        if has_emphasis:
+            score += 0.15
+            strengths.append("Uses emphasis for key points")
+
+        # Negative: Wall of text
+        if not has_paragraphs and len(response) > 200:
+            score -= 0.25
+            issues.append("Wall of text - hard to scan")
+            suggestions.append("Break into paragraphs or use bullets")
+
+        # Negative: Lines too long
+        lines = response.split('\n')
+        max_line_length = max(len(line) for line in lines) if lines else 0
+
+        if max_line_length > 200:
+            score -= 0.2
+            issues.append("Very long lines - hard to scan")
+            suggestions.append("Break up long lines")
+
+        # Check if it starts well
+        if response and response[0].isupper():
+            strengths.append("Starts with proper capitalization")
+        else:
+            score -= 0.1
+            issues.append("Doesn't start with capital letter")
+
+        return min(1.0, max(0.0, score))
+
+    @classmethod
+    def _assess_appropriateness(
+        cls,
+        response: str,
+        query: str,
+        issues: List[str],
+        suggestions: List[str],
+        strengths: List[str]
+    ) -> float:
+        """Assess: Is the tone and level appropriate for the query?"""
+        score = 1.0
+
+        query_lower = query.lower()
+        response_lower = response.lower()
+
+        # Check tone matches query
+        # Simple/casual query should get simple/casual response
+        is_casual_query = any(word in query_lower for word in ['hey', 'hi', 'hello', 'thanks'])
+
+        if is_casual_query:
+            # Response should be brief and friendly
+            if len(response.split()) > 50:
+                score -= 0.2
+                issues.append("Too verbose for casual query")
+                suggestions.append("Keep casual responses brief")
+
+        # Technical query should get detailed response
+        is_technical_query = any(word in query_lower for word in ['function', 'class', 'code', 'bug', 'error'])
+
+        if is_technical_query:
+            # Response should have details
+            if len(response.split()) < 30:
+                score -= 0.15
+                issues.append("Too brief for technical query")
+                suggestions.append("Provide more technical detail")
+
+        # Check for inappropriate phrasing
+        inappropriate_phrases = [
+            "i think you're confused",
+            "that's wrong",
+            "you should know",
+            "obviously",
+            "clearly you"
+        ]
+
+        for phrase in inappropriate_phrases:
+            if phrase in response_lower:
+                score -= 0.3
+                issues.append(f"Inappropriate phrasing: '{phrase}'")
+                suggestions.append("Be more respectful and helpful")
+
+        # Check for overly apologetic
+        apology_count = response_lower.count("sorry") + response_lower.count("apolog")
+
+        if apology_count > 2:
+            score -= 0.1
+            issues.append("Too apologetic")
+            suggestions.append("Be helpful without excessive apologies")
+
+        return max(0.0, score)
+
+    @classmethod
+    def _assess_safety(
+        cls,
+        response: str,
+        issues: List[str],
+        suggestions: List[str],
+        strengths: List[str]
+    ) -> float:
+        """Assess: Are technical errors hidden from user?"""
+        score = 1.0
+
+        # Technical error patterns that should NEVER appear
+        forbidden_patterns = [
+            ('traceback', 'Contains stack trace'),
+            ('exception:', 'Shows exception'),
+            ('error:', 'Shows raw error'),
+            ('tls_error', 'Shows TLS error'),
+            ('certificate_verify_failed', 'Shows certificate error'),
+            ('upstream connect error', 'Shows connection error'),
+            ('api call failed', 'Shows API error'),
+            ('⚠️ i couldn\'t finish', 'Shows internal error message')
+        ]
+
+        response_lower = response.lower()
+
+        for pattern, description in forbidden_patterns:
+            if pattern in response_lower:
+                score = 0.0  # Immediate fail
+                issues.append(f"CRITICAL: {description}")
+                suggestions.append("Replace with user-friendly error message")
+                break
+
+        if score == 1.0:
+            strengths.append("No technical errors exposed")
+
+        return score
+
+    @classmethod
+    def improve_response(
+        cls,
+        response: str,
+        assessment: QualityAssessment,
+        query: str
+    ) -> str:
+        """
+        Apply automatic improvements based on assessment
+
+        This can fix some issues without LLM call:
+        - Add structure (bullets)
+        - Fix capitalization
+        - Remove technical errors
+        - Trim excessive length
+        """
+        improved = response
+
+        # Fix: No paragraphs (add line breaks at sentence boundaries)
+        if "Wall of text" in str(assessment.issues):
+            sentences = re.split(r'([.!?]+)', improved)
+            chunks = []
+            for i in range(0, len(sentences), 4):  # Group ~2 sentences
+                chunk = ''.join(sentences[i:i+4])
+                if chunk.strip():
+                    chunks.append(chunk.strip())
+            improved = '\n\n'.join(chunks)
+
+        # Fix: Doesn't start with capital
+        if improved and not improved[0].isupper():
+            improved = improved[0].upper() + improved[1:]
+
+        # Fix: Technical errors exposed (match the "CRITICAL: ..." issues from _assess_safety)
+        if any(issue.startswith("CRITICAL") for issue in assessment.issues):
+            # Remove lines with technical errors
+            forbidden_terms = ['traceback', 'exception:', 'error:', 'tls_error', 'certificate']
+            lines = improved.split('\n')
+            cleaned_lines = [
+                line for line in lines
+                if not any(term in line.lower() for term in forbidden_terms)
+            ]
+            improved = '\n'.join(cleaned_lines)
+
+            # If we removed too much, add generic message
+            if len(improved.strip()) < 20:
+                improved = "I encountered an issue while processing that. Could you try rephrasing?"
+
+        return improved
+
+
+def assess_response_quality(
+    response: str,
+    query: str,
+    context: Optional[Dict[str, Any]] = None
+) -> QualityAssessment:
+    """Convenience function for quality assessment"""
+    return ResponseQualityGate.assess(response, query, context or {})
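
A minimal usage sketch of the new quality gate, for orientation only: it is not part of the released package, and the call site, sample strings, and the import path cite_agent.quality_gate (inferred from the file list above) are assumptions.

    from cite_agent.quality_gate import ResponseQualityGate, assess_response_quality

    query = "Why did my literature search fail?"
    draft = "error: upstream connect error while querying the archive API"

    # Score the draft before it reaches the user; the context dict is optional.
    assessment = assess_response_quality(draft, query)

    if assessment.should_retry:
        # Apply the cheap, non-LLM fixes first (structure, capitalization,
        # scrubbing of technical error text); a caller could then regenerate
        # the response if the score is still too low.
        draft = ResponseQualityGate.improve_response(draft, assessment, query)

    print(f"score={assessment.overall_score:.2f}", assessment.issues)

Per the diffed module, the overall score is the weighted sum of the five category scores (clarity 0.25, completeness 0.30, structure 0.20, appropriateness 0.15, safety 0.10), and should_retry is set whenever that sum falls below MIN_ACCEPTABLE_SCORE (0.6).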