@clawos-dev/clawd 0.2.50 → 0.2.51-beta.78.2024c11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/dist/persona-defaults/persona-clawd-helper/CLAUDE.md +1 -1
  2. package/dist/persona-defaults/persona-knowledge-base/CLAUDE.md +19 -0
  3. package/dist/persona-defaults/persona-researcher/CLAUDE.md +20 -1
  4. package/package.json +1 -1
  5. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/SKILL.md +0 -187
  6. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/archive-template.md +0 -21
  7. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/article-template.md +0 -20
  8. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/index-template.md +0 -18
  9. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/raw-template.md +0 -7
  10. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/README.md +0 -119
  11. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/SKILL.md +0 -108
  12. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/continuation.md +0 -167
  13. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/html-generation.md +0 -103
  14. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/methodology.md +0 -421
  15. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/quality-gates.md +0 -192
  16. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/report-assembly.md +0 -130
  17. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/weasyprint_guidelines.md +0 -324
  18. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/requirements.txt +0 -14
  19. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/claim.schema.json +0 -49
  20. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/evidence.schema.json +0 -43
  21. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/run_manifest.schema.json +0 -97
  22. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/source.schema.json +0 -49
  23. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/citation_manager.py +0 -300
  24. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/evidence_store.py +0 -205
  25. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/extract_claims.py +0 -358
  26. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/md_to_html.py +0 -330
  27. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/research_engine.py +0 -584
  28. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/source_evaluator.py +0 -292
  29. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/validate_report.py +0 -354
  30. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_citations.py +0 -426
  31. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_claim_support.py +0 -344
  32. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_html.py +0 -220
  33. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/mckinsey_report_template.html +0 -443
  34. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/report_template.md +0 -414
  35. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/invalid_report.md +0 -27
  36. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/valid_report.md +0 -114
  37. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_citation_manager.py +0 -195
  38. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_evidence_store.py +0 -166
  39. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_extract_claims.py +0 -213
  40. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_verify_claim_support.py +0 -230
  41. package/dist/persona-defaults/persona-researcher/skills-lock.json +0 -11
@@ -1,426 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Citation Verification Script
4
-
5
- Catches fabricated citations by checking:
6
- 1. DOI resolution (via doi.org)
7
- 2. Basic metadata matching (title similarity, year match)
8
- 3. URL accessibility verification
9
- 4. Hallucination pattern detection (generic titles, suspicious patterns)
10
- 5. Flags suspicious entries for manual review
11
-
12
- Usage:
13
- python verify_citations.py --report [path]
14
- python verify_citations.py --report [path] --strict # Fail on any unverified
15
-
16
- Does NOT require API keys - uses free DOI resolver and heuristics.
17
- """
18
-
19
- import sys
20
- import argparse
21
- import re
22
- from pathlib import Path
23
- from typing import List, Dict, Tuple
24
- from urllib import request, error
25
- from urllib.parse import quote
26
- import json
27
- import time
28
- from datetime import datetime
29
-
30
class CitationVerifier:
    """Verify the bibliography of a markdown research report.

    The verifier extracts ``[N] ...`` entries from the report's
    ``## Bibliography`` section and, for each entry, runs title/year
    heuristics, resolves the DOI through doi.org (CSL-JSON content
    negotiation) and, when no DOI could be verified, checks that the URL
    is reachable.

    Attributes:
        report_path: Path of the report being checked.
        strict_mode: When true, ``verify_all`` fails on any suspicious or
            unverified citation.
        content: Full text of the report.
        suspicious_patterns: ``(regex, description)`` pairs matched against
            the start of each title.
        verified: After ``verify_all``, results that were DOI- or
            URL-verified.
        suspicious: After ``verify_all``, results flagged for manual review.
        errors: Parse-level problems (e.g. missing bibliography section).
    """

    # Title shapes that are typical of fabricated citations.
    DEFAULT_SUSPICIOUS_PATTERNS = (
        # Generic academic-sounding but fake patterns
        (r'^(A |An |The )?(Study|Analysis|Review|Survey|Investigation) (of|on|into)',
         "Generic academic title pattern"),
        (r'^(Recent|Current|Modern|Contemporary) (Advances|Developments|Trends) in',
         "Generic 'advances' title pattern"),
        # Too perfect, templated titles
        (r'^[A-Z][a-z]+ [A-Z][a-z]+: A (Comprehensive|Complete|Systematic) (Review|Analysis|Guide)$',
         "Too perfect, templated structure"),
    )

    def __init__(self, report_path: Path, strict_mode: bool = False):
        """Load the report at *report_path*; exits the process if unreadable."""
        self.report_path = report_path
        self.strict_mode = strict_mode
        self.content = self._read_report()
        self.suspicious = []
        self.verified = []
        self.errors = []
        # Per-instance copy so callers may extend the list safely.
        self.suspicious_patterns = list(self.DEFAULT_SUSPICIOUS_PATTERNS)

    def _read_report(self) -> str:
        """Return the report text, or print an error and exit with status 1."""
        try:
            with open(self.report_path, 'r', encoding='utf-8') as f:
                return f.read()
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file; ValueError: bad path or
            # undecodable bytes (UnicodeDecodeError is a ValueError).
            print(f"ERROR: Cannot read report: {e}")
            sys.exit(1)

    @staticmethod
    def _trim_link(token: str) -> str:
        """Strip sentence punctuation that trails a DOI or URL token.

        Closing brackets are removed only when they are unbalanced, so
        links that legitimately end in ``)`` (e.g. Wikipedia titles) are
        kept intact.
        """
        closers = {')': '(', ']': '[', '>': '<'}
        while token:
            last = token[-1]
            if last in '.,;:\'"':
                token = token[:-1]
            elif last in closers and token.count(last) > token.count(closers[last]):
                token = token[:-1]
            else:
                break
        return token

    def _parse_entry(self, num: str, raw: str) -> Dict:
        """Build an entry dict from the complete (joined) text of a citation.

        Expected shape: ``Author (Year). "Title". Venue. URL``; every field
        is optional and is ``None`` when it cannot be found.
        """
        year_match = re.search(r'\((\d{4})\)', raw)
        title_match = re.search(r'"([^"]+)"', raw)
        doi_match = re.search(r'doi\.org/(10\.\S+)', raw, re.IGNORECASE)
        url_match = re.search(r'https?://\S+', raw)

        doi = self._trim_link(doi_match.group(1)) if doi_match else None
        url = self._trim_link(url_match.group(0)) if url_match else None

        return {
            'num': num,
            'raw': raw,
            'year': year_match.group(1) if year_match else None,
            'title': title_match.group(1) if title_match else None,
            'doi': doi or None,
            'url': url or None
        }

    def extract_bibliography(self) -> List[Dict]:
        """Extract bibliography entries from the report.

        Returns:
            One dict per ``[N]`` entry with keys ``num``, ``raw``, ``year``,
            ``title``, ``doi`` and ``url``. Returns ``[]`` (and records an
            error in ``self.errors``) when no Bibliography heading exists.
        """
        # The section runs from the heading to the next heading that starts
        # a line; a '##' in the middle of a line (e.g. a URL fragment) must
        # not end it.
        section = re.search(
            r'^#{2,}[ \t]*Bibliography[^\n]*(.*?)(?=^#{1,6}[ \t]|\Z)',
            self.content,
            re.DOTALL | re.IGNORECASE | re.MULTILINE,
        )
        if not section:
            self.errors.append("No Bibliography section found")
            return []

        # First join continuation lines, then parse: fields such as the URL
        # frequently sit on the second line of an entry.
        collected = []  # [num, raw] pairs, in document order
        for line in section.group(1).strip().split('\n'):
            line = line.strip()
            if not line:
                continue
            numbered = re.match(r'^\[(\d+)\]\s+(.+)$', line)
            if numbered:
                collected.append([numbered.group(1), numbered.group(2)])
            elif collected:
                collected[-1][1] += ' ' + line

        return [self._parse_entry(num, raw) for num, raw in collected]

    @staticmethod
    def _first_text(value) -> str:
        """Return *value* as text; CSL fields may be a string or a list."""
        if isinstance(value, (list, tuple)):
            value = value[0] if value else ''
        return value if isinstance(value, str) else ''

    @classmethod
    def _csl_to_metadata(cls, data: Dict) -> Dict:
        """Reduce a CSL-JSON record to title, year, authors and venue.

        Missing or malformed ``issued`` data yields ``year=None`` instead of
        raising, so a resolvable DOI is not reported as failed.
        """
        issued = data.get('issued') or {}
        date_parts = issued.get('date-parts') or []
        try:
            year = int(date_parts[0][0])
        except (IndexError, TypeError, ValueError):
            year = None

        return {
            'title': cls._first_text(data.get('title', '')),
            'year': year,
            'authors': [
                f"{a.get('family', '')} {a.get('given', '')}"
                for a in data.get('author') or []
            ],
            'venue': cls._first_text(data.get('container-title', ''))
        }

    def verify_doi(self, doi: str) -> Tuple[bool, Dict]:
        """Resolve *doi* via doi.org and fetch its metadata.

        Returns:
            ``(True, metadata)`` on success, otherwise ``(False, info)``
            where ``info`` is empty (no DOI given) or holds an ``error``
            message.
        """
        if not doi:
            return False, {}

        try:
            # Use content negotiation to get JSON metadata
            url = f"https://doi.org/{quote(doi)}"
            req = request.Request(url)
            req.add_header('Accept', 'application/vnd.citationstyles.csl+json')

            with request.urlopen(req, timeout=10) as response:
                data = json.loads(response.read().decode('utf-8'))

            return True, self._csl_to_metadata(data)
        except error.HTTPError as e:
            if e.code == 404:
                return False, {'error': 'DOI not found (404)'}
            return False, {'error': f'HTTP {e.code}'}
        except Exception as e:
            # Best-effort network boundary: any other failure (timeout,
            # non-JSON body, ...) is reported, never raised.
            return False, {'error': str(e)}

    def verify_url(self, url: str) -> Tuple[bool, str]:
        """Check that *url* is reachable.

        A HEAD request is tried first to avoid downloading the body; servers
        that reject HEAD (405/501) are retried once with GET.

        Returns:
            ``(accessible, status_message)``.
        """
        if not url:
            return False, "No URL"

        for method in ('HEAD', 'GET'):
            try:
                req = request.Request(url, method=method)
                req.add_header('User-Agent', 'Mozilla/5.0 (Research Citation Verifier)')

                with request.urlopen(req, timeout=10) as response:
                    if response.status == 200:
                        return True, "URL accessible"
                    return False, f"HTTP {response.status}"
            except error.HTTPError as e:
                if method == 'HEAD' and e.code in (405, 501):
                    continue  # HEAD unsupported; fall back to GET
                return False, f"HTTP {e.code}"
            except error.URLError as e:
                return False, f"URL error: {e.reason}"
            except Exception as e:
                return False, f"Connection error: {str(e)[:50]}"

        return False, "URL check failed"

    def detect_hallucination_patterns(self, entry: Dict) -> List[str]:
        """Detect common LLM hallucination patterns in a citation.

        Title heuristics run only when a title is present; year heuristics
        run whenever a year is present, even without a title.

        Returns:
            Human-readable descriptions of every issue found.
        """
        issues = []
        title = entry.get('title') or ''
        lowered = title.lower()

        if title:
            # Check against suspicious patterns
            for pattern, description in self.suspicious_patterns:
                if re.match(pattern, title, re.IGNORECASE):
                    issues.append(f"Suspicious title pattern: {description}")

            # Check for overly generic titles
            generic_words = ['overview', 'introduction', 'guide', 'handbook', 'manual']
            if any(word in lowered for word in generic_words) and len(title.split()) < 5:
                issues.append("Very generic short title")

            # Placeholder words must match as whole words, not substrings.
            if re.search(r'\b(?:tbd|todo|placeholder|example)\b', lowered):
                issues.append("Placeholder text in title")

        year = None
        if entry.get('year'):
            try:
                year = int(entry['year'])
            except (TypeError, ValueError):
                year = None

        if year is not None:
            current_year = datetime.now().year
            # Very recent without DOI or URL is suspicious
            if year >= current_year - 1 and not entry.get('doi') and not entry.get('url'):
                issues.append(f"Recent year ({year}) with no verification method")
            # Future year is definitely wrong
            if year > current_year:
                issues.append(f"Future year: {year} (current: {current_year})")
            # Very old with modern phrasing is suspicious. Whole-word match:
            # a plain substring test would flag e.g. "Training" for "ai".
            if year < 2000 and re.search(r'\b(?:ai|llms?|gpt\w*|transformers?)\b', lowered):
                issues.append(f"Anachronistic: pre-2000 ({year}) citation mentioning modern AI terms")

        return issues

    def check_title_similarity(self, title1: str, title2: str) -> float:
        """Return the word-overlap (Jaccard) similarity of two titles.

        Comparison is case-insensitive and ignores punctuation. The score
        lies in 0.0-1.0; it is 0.0 when either title has no words.
        """
        if not title1 or not title2:
            return 0.0

        # Normalize: lowercase, remove punctuation, split
        def normalize(s):
            s = s.lower()
            s = re.sub(r'[^\w\s]', ' ', s)
            return set(s.split())

        words1 = normalize(title1)
        words2 = normalize(title2)

        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    def verify_entry(self, entry: Dict) -> Dict:
        """Verify a single bibliography entry.

        Returns:
            Dict with ``num``, ``status`` (``verified``, ``url_verified``,
            ``suspicious``, ``unverified`` or ``unknown``), ``issues``,
            ``metadata`` (from the DOI lookup) and ``verification_methods``
            (``'DOI'`` and/or ``'URL'``).
        """
        result = {
            'num': entry['num'],
            'status': 'unknown',
            'issues': [],
            'metadata': {},
            'verification_methods': []
        }
        doi = entry.get('doi')
        url = entry.get('url')
        title = entry.get('title')
        year = entry.get('year')

        # STEP 1: Run hallucination detection
        hallucination_issues = self.detect_hallucination_patterns(entry)
        if hallucination_issues:
            result['issues'].extend(hallucination_issues)
            result['status'] = 'suspicious'

        # STEP 2: Has DOI?
        if doi:
            print(f" [{entry['num']}] Checking DOI {doi}...", end=' ')
            success, metadata = self.verify_doi(doi)

            if success:
                result['metadata'] = metadata
                # A resolving DOI outranks the title heuristics of step 1;
                # the mismatch checks below can still downgrade it.
                result['status'] = 'verified'
                result['verification_methods'].append('DOI')
                print("✓")

                # Check title similarity if we have both
                if title and metadata.get('title'):
                    similarity = self.check_title_similarity(title, metadata['title'])

                    if similarity < 0.5:
                        result['issues'].append(
                            f"Title mismatch (similarity: {similarity:.1%})"
                        )
                        result['status'] = 'suspicious'

                # Check year match
                if year and metadata.get('year'):
                    if int(year) != int(metadata['year']):
                        result['issues'].append(
                            f"Year mismatch: report says {year}, DOI says {metadata['year']}"
                        )
                        result['status'] = 'suspicious'

            else:
                print(f"✗ {metadata.get('error', 'Failed')}")
                result['status'] = 'unverified'
                result['issues'].append(f"DOI resolution failed: {metadata.get('error', 'unknown')}")

        # STEP 3: Check URL accessibility (if no DOI or DOI failed)
        if url and result['status'] != 'verified':
            url_ok, url_status = self.verify_url(url)
            if url_ok:
                result['verification_methods'].append('URL')
                # Upgrade status if URL verifies; 'suspicious' is kept.
                if result['status'] in ['unknown', 'no_doi', 'unverified']:
                    result['status'] = 'url_verified'
                print(f" [{entry['num']}] URL accessible ✓")
            else:
                result['issues'].append(f"URL check failed: {url_status}")

        # STEP 4: Final fallback - no verification method
        if not doi and not url:
            result['issues'].append("No DOI or URL - cannot verify")
            result['status'] = 'suspicious'

        return result

    def verify_all(self) -> bool:
        """Verify every bibliography entry and print a summary.

        Returns:
            ``False`` when no entries are found, or in strict mode when any
            citation is suspicious or unverified; ``True`` otherwise (a low
            verification rate only produces a warning).
        """
        print(f"\n{'='*60}")
        print(f"CITATION VERIFICATION: {self.report_path.name}")
        print(f"{'='*60}\n")

        entries = self.extract_bibliography()

        if not entries:
            print("ERROR: No bibliography entries found\n")
            return False

        print(f"Found {len(entries)} citations\n")

        results = []
        for entry in entries:
            results.append(self.verify_entry(entry))

            # Rate limiting: only entries that triggered a network lookup.
            if entry.get('doi') or entry.get('url'):
                time.sleep(0.5)

        # Summarize
        print(f"\n{'='*60}")
        print("VERIFICATION SUMMARY")
        print(f"{'='*60}\n")

        verified = [r for r in results if r['status'] == 'verified']
        url_verified = [r for r in results if r['status'] == 'url_verified']
        suspicious = [r for r in results if r['status'] == 'suspicious']
        unverified = [r for r in results if r['status'] in ['unverified', 'no_doi', 'unknown']]

        # Expose the outcome to programmatic callers.
        self.verified = verified + url_verified
        self.suspicious = suspicious

        print(f'DOI Verified: {len(verified)}/{len(results)}')
        print(f'URL Verified: {len(url_verified)}/{len(results)}')
        print(f'Suspicious: {len(suspicious)}/{len(results)}')
        print(f'Unverified: {len(unverified)}/{len(results)}')
        print()

        if suspicious:
            print('SUSPICIOUS CITATIONS (Manual Review Needed):')
            for r in suspicious:
                print(f"\n [{r['num']}]")
                for issue in r['issues']:
                    print(f" - {issue}")
            print()

        if unverified:
            print('UNVERIFIED CITATIONS (Could not check):')
            for r in unverified:
                print(f" [{r['num']}] {r['issues'][0] if r['issues'] else 'Unknown'}")
            print()

        # Decision: URL-verified citations count as acceptable.
        total_verified = len(verified) + len(url_verified)

        if suspicious:
            print('WARNING: Suspicious citations detected')
            if self.strict_mode:
                print(' STRICT MODE: Failing due to suspicious citations')
                return False
            print(' (Continuing in non-strict mode)')

        if self.strict_mode and unverified:
            print('STRICT MODE: Unverified citations found')
            return False

        if total_verified / len(results) < 0.5:
            print('WARNING: Less than 50% citations verified')
            return True  # Pass with warning

        print('CITATION VERIFICATION PASSED')
        return True
384
-
385
-
386
def main():
    """Command-line entry point.

    Parses ``--report`` / ``--strict``, verifies the report's citations and
    exits with status 0 when verification passes, 1 otherwise (including
    when the report file does not exist).
    """
    usage_notes = """
Examples:
python verify_citations.py --report report.md

Note: Requires internet connection to check DOIs.
Uses free DOI resolver - no API key needed.
"""
    cli = argparse.ArgumentParser(
        description="Verify citations in research report",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_notes,
    )
    cli.add_argument(
        '--report', '-r',
        type=str,
        required=True,
        help='Path to research report markdown file',
    )
    cli.add_argument(
        '--strict',
        action='store_true',
        help='Strict mode: fail on any unverified or suspicious citations',
    )

    options = cli.parse_args()
    target = Path(options.report)

    # Bail out before constructing the verifier if the file is missing.
    if not target.exists():
        print(f"ERROR: Report file not found: {target}")
        sys.exit(1)

    outcome = CitationVerifier(target, strict_mode=options.strict).verify_all()
    exit_code = 0 if outcome else 1
    sys.exit(exit_code)


if __name__ == '__main__':
    main()