@clawos-dev/clawd 0.2.50 → 0.2.51-beta.78.2024c11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/persona-defaults/persona-clawd-helper/CLAUDE.md +1 -1
- package/dist/persona-defaults/persona-knowledge-base/CLAUDE.md +19 -0
- package/dist/persona-defaults/persona-researcher/CLAUDE.md +20 -1
- package/package.json +1 -1
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/SKILL.md +0 -187
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/archive-template.md +0 -21
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/article-template.md +0 -20
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/index-template.md +0 -18
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/raw-template.md +0 -7
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/README.md +0 -119
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/SKILL.md +0 -108
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/continuation.md +0 -167
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/html-generation.md +0 -103
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/methodology.md +0 -421
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/quality-gates.md +0 -192
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/report-assembly.md +0 -130
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/weasyprint_guidelines.md +0 -324
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/requirements.txt +0 -14
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/claim.schema.json +0 -49
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/evidence.schema.json +0 -43
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/run_manifest.schema.json +0 -97
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/source.schema.json +0 -49
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/citation_manager.py +0 -300
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/evidence_store.py +0 -205
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/extract_claims.py +0 -358
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/md_to_html.py +0 -330
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/research_engine.py +0 -584
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/source_evaluator.py +0 -292
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/validate_report.py +0 -354
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_citations.py +0 -426
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_claim_support.py +0 -344
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_html.py +0 -220
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/mckinsey_report_template.html +0 -443
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/report_template.md +0 -414
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/invalid_report.md +0 -27
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/valid_report.md +0 -114
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_citation_manager.py +0 -195
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_evidence_store.py +0 -166
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_extract_claims.py +0 -213
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_verify_claim_support.py +0 -230
- package/dist/persona-defaults/persona-researcher/skills-lock.json +0 -11
|
@@ -1,358 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Atomic Claim Extractor — decomposes report sections into typed claims.
|
|
4
|
-
|
|
5
|
-
CLI subcommands:
|
|
6
|
-
extract Parse a markdown report into atomic claims (claims.jsonl)
|
|
7
|
-
add Manually add a single claim
|
|
8
|
-
list List claims, optionally filtered by section or type
|
|
9
|
-
stats Show claim statistics (counts by type/status)
|
|
10
|
-
|
|
11
|
-
Claim identity:
|
|
12
|
-
claim_id = sha256(section_id + normalized_text)[:16]
|
|
13
|
-
|
|
14
|
-
Claim types (per GPT Pro's refinement of Codex's proposal):
|
|
15
|
-
- factual: hard-fails on lack of support
|
|
16
|
-
- synthesis: needs traceability, softer threshold
|
|
17
|
-
- recommendation: needs traceability, softer threshold
|
|
18
|
-
- speculation: labeled, no support gate
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
import argparse
|
|
22
|
-
import hashlib
|
|
23
|
-
import json
|
|
24
|
-
import os
|
|
25
|
-
import re
|
|
26
|
-
import sys
|
|
27
|
-
from datetime import datetime, timezone
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
# ---------------------------------------------------------------------------
|
|
31
|
-
# Claim ID computation
|
|
32
|
-
# ---------------------------------------------------------------------------
|
|
33
|
-
|
|
34
|
-
# Matches any run of whitespace so it can be collapsed to a single space.
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_text(text: str) -> str:
    """Return *text* trimmed, whitespace-collapsed and lower-cased.

    This canonical form keeps hashes stable across cosmetic differences in
    spacing and letter case.
    """
    trimmed = text.strip()
    collapsed = _WHITESPACE_RE.sub(' ', trimmed)
    return collapsed.lower()
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def compute_claim_id(section_id: str, text: str) -> str:
    """Derive the 16-hex-character id of a claim.

    The id is the first 16 hex digits of the SHA-256 digest of the section id
    concatenated directly (no separator) with the normalized claim text,
    encoded as UTF-8.
    """
    digest = hashlib.sha256()
    digest.update((section_id + normalize_text(text)).encode('utf-8'))
    return digest.hexdigest()[:16]
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
# ---------------------------------------------------------------------------
|
|
49
|
-
# JSONL helpers
|
|
50
|
-
# ---------------------------------------------------------------------------
|
|
51
|
-
|
|
52
|
-
def append_jsonl(path: str, obj: dict) -> None:
    """Append *obj* to the JSONL file at *path* as one JSON line.

    Args:
        path: File to append to; it is created if it does not exist.
        obj: JSON-serializable mapping to write.
    """
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the locale default (e.g. cp1252) could
    # raise UnicodeEncodeError or produce a file other tools cannot read.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
def read_jsonl(path: str) -> list[dict]:
    """Load every non-blank line of the JSONL file at *path*.

    Args:
        path: File to read, decoded as UTF-8.

    Returns:
        The parsed rows in file order, or an empty list when the file does
        not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Open directly instead of checking existence first: one syscall fewer
    # and no window in which the file can disappear between check and open.
    try:
        with open(path, encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]
    except FileNotFoundError:
        return []
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
# ---------------------------------------------------------------------------
|
|
70
|
-
# Report parsing helpers
|
|
71
|
-
# ---------------------------------------------------------------------------
|
|
72
|
-
|
|
73
|
-
# Section header patterns.
# Each entry pairs a compiled heading regex with either a fixed section id or
# a callable that derives the id from the match. The generic '## <title>'
# pattern comes last so the specific headings above it are tried first.

def _finding_section_id(match: re.Match) -> str:
    """Build 'finding_<N>' from a numbered '## Finding N' heading."""
    return f'finding_{match.group(1)}'


def _slug_section_id(match: re.Match) -> str:
    """Slugify an arbitrary heading title, truncated to 30 characters."""
    title = match.group(1).strip().lower()
    return re.sub(r'\W+', '_', title)[:30]


SECTION_PATTERNS = [
    (re.compile(r'^##\s+Executive\s+Summary', re.I), 'executive_summary'),
    (re.compile(r'^##\s+Introduction', re.I), 'introduction'),
    (re.compile(r'^##\s+Finding\s+(\d+)', re.I), _finding_section_id),
    (re.compile(r'^##\s+Synthesis', re.I), 'synthesis'),
    (re.compile(r'^##\s+Limitations', re.I), 'limitations'),
    (re.compile(r'^##\s+Recommendations', re.I), 'recommendations'),
    (re.compile(r'^##\s+Conclusion', re.I), 'conclusion'),
    (re.compile(r'^##\s+(.+)', re.I), _slug_section_id),
]

# Inline citations: [N] or a comma-separated group such as [N, M].
CITATION_RE = re.compile(r'\[(\d+(?:,\s*\d+)*)\]')

# Naive sentence boundary: whitespace that follows '.', '!' or '?' and
# precedes an uppercase ASCII letter. Abbreviations followed by a capitalized
# word (e.g. "Dr. Smith") are split as well.
SENTENCE_RE = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
# Substring cues used by classify_claim, grouped by the type they indicate.
_RECOMMENDATION_CUES = ('should', 'recommend', 'suggest', 'advise', 'consider')
_SPECULATION_CUES = ('might', 'could potentially', 'it is possible', 'may eventually',
                     'hypothetically', 'speculatively')
_SYNTHESIS_CUES = ('overall', 'taken together', 'collectively',
                   'the evidence suggests', 'this implies')
_SYNTHESIS_SECTIONS = ('synthesis', 'conclusion', 'limitations')


def classify_claim(text: str, section_id: str) -> str:
    """Heuristically classify a claim sentence.

    Cues are matched as case-insensitive substrings, in this priority order:
    recommendation, speculation, synthesis (only inside synthesis-like
    sections), and finally the default.

    Args:
        text: The claim sentence.
        section_id: Id of the report section the sentence came from.

    Returns:
        One of 'recommendation', 'speculation', 'synthesis' or 'factual'.
    """
    lower = text.lower()

    # 'the evidence suggests' is a synthesis cue, but it contains the
    # recommendation cue 'suggest'. Mask the phrase before the recommendation
    # check; otherwise the synthesis cue can never be reached and plain
    # evidence statements are labelled as recommendations.
    advisory_probe = lower.replace('the evidence suggests', ' ')
    if any(cue in advisory_probe for cue in _RECOMMENDATION_CUES):
        return 'recommendation'

    if any(cue in lower for cue in _SPECULATION_CUES):
        return 'speculation'

    if section_id in _SYNTHESIS_SECTIONS and any(cue in lower for cue in _SYNTHESIS_CUES):
        return 'synthesis'

    # Default: factual
    return 'factual'
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def parse_sections(markdown: str) -> list[tuple[str, str]]:
    """Split a markdown report into ``(section_id, content)`` pairs.

    A line that matches one of SECTION_PATTERNS starts a new section and is
    not itself part of any content. Text before the first heading is filed
    under 'preamble'. Sections that collected no lines are omitted.
    """
    parsed = []
    active_id = 'preamble'
    body = []

    for raw_line in markdown.split('\n'):
        for regex, target in SECTION_PATTERNS:
            hit = regex.match(raw_line)
            if hit is None:
                continue
            # Heading found: flush what was collected for the previous section.
            if body:
                parsed.append((active_id, '\n'.join(body)))
            active_id = target(hit) if callable(target) else target
            body = []
            break
        else:
            # No heading pattern matched, so the line is section content.
            body.append(raw_line)

    if body:
        parsed.append((active_id, '\n'.join(body)))

    return parsed
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
def extract_sentences(text: str) -> list[str]:
    """Break section text into candidate claim sentences.

    Bullet markers and bold/italic asterisks are stripped first. The text is
    then split on SENTENCE_RE, and fragments that are 30 characters or
    shorter, or that start with '#' or '|', are discarded.
    """
    cleanup_steps = (
        (r'^[-*]\s+', '', re.M),           # leading bullet markers
        (r'\*\*([^*]+)\*\*', r'\1', 0),    # bold
        (r'\*([^*]+)\*', r'\1', 0),        # italic
    )
    for pattern, replacement, flags in cleanup_steps:
        text = re.sub(pattern, replacement, text, flags=flags)

    candidates = (piece.strip() for piece in SENTENCE_RE.split(text))
    return [c for c in candidates if len(c) > 30 and not c.startswith(('#', '|'))]
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
# ---------------------------------------------------------------------------
|
|
162
|
-
# Subcommands
|
|
163
|
-
# ---------------------------------------------------------------------------
|
|
164
|
-
|
|
165
|
-
def cmd_extract(args: argparse.Namespace) -> None:
    """Extract atomic claims from a markdown report into claims.jsonl.

    Reads ``args.report``, splits it into sections and sentences, and appends
    one claim record per new sentence to ``<args.dir>/claims.jsonl``.
    Sentences whose claim id is already in the ledger are skipped. A JSON
    summary is printed to stdout.

    Args:
        args: Parsed CLI namespace with ``report`` and ``dir`` attributes.

    Raises:
        SystemExit: With code 1 when the report file does not exist.
    """
    report_path = args.report
    if not os.path.exists(report_path):
        print(json.dumps({'error': f'Report not found: {report_path}'}), file=sys.stderr)
        sys.exit(1)

    # Decode explicitly as UTF-8 so non-ASCII report text does not depend on
    # the platform's locale encoding.
    with open(report_path, encoding='utf-8') as f:
        markdown = f.read()

    claims_path = os.path.join(args.dir, 'claims.jsonl')
    # Skip ledger rows that carry no claim_id instead of failing with
    # KeyError; the other subcommands read that field with .get().
    existing_ids = {r['claim_id'] for r in read_jsonl(claims_path) if 'claim_id' in r}

    added = 0
    skipped = 0

    for section_id, content in parse_sections(markdown):
        if section_id == 'preamble':
            continue
        for sentence in extract_sentences(content):
            claim_id = compute_claim_id(section_id, sentence)
            if claim_id in existing_ids:
                skipped += 1
                continue

            # Collect every citation number referenced in the sentence.
            citation_nums = []
            for m in CITATION_RE.finditer(sentence):
                citation_nums.extend(int(n.strip()) for n in m.group(1).split(','))

            claim = {
                'claim_id': claim_id,
                'section_id': section_id,
                'text': sentence,
                'claim_type': classify_claim(sentence, section_id),
                'cited_source_ids': [],  # Populated by linking step
                'evidence_ids': [],  # Populated by verify_claim_support
                'support_status': 'unverified',
                'extracted_at': datetime.now(timezone.utc).isoformat(),
                '_citation_numbers': citation_nums,  # Temporary, for linking
            }
            append_jsonl(claims_path, claim)
            # Record the id so a repeated sentence in this run is skipped too.
            existing_ids.add(claim_id)
            added += 1

    print(json.dumps({
        'status': 'ok',
        'claims_added': added,
        'claims_skipped': skipped,
        'total_claims': len(existing_ids),
    }))
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
def cmd_add(args: argparse.Namespace) -> None:
    """Manually add a single claim to ``<args.dir>/claims.jsonl``.

    ``args.json`` must be a JSON object with a non-empty ``text`` and,
    optionally, ``section_id``, ``claim_type``, ``cited_source_ids`` and
    ``evidence_ids``. Prints a JSON status ('added' or 'duplicate') to
    stdout.

    Args:
        args: Parsed CLI namespace with ``json`` and ``dir`` attributes.

    Raises:
        SystemExit: With code 1 when the payload is not valid JSON, is not a
            JSON object, or has no ``text``.
    """
    # Report bad payloads in the same JSON-on-stderr style as the other
    # errors rather than letting a traceback escape.
    try:
        data = json.loads(args.json)
    except json.JSONDecodeError as exc:
        print(json.dumps({'error': f'invalid JSON: {exc}'}), file=sys.stderr)
        sys.exit(1)
    if not isinstance(data, dict):
        print(json.dumps({'error': 'JSON payload must be an object'}), file=sys.stderr)
        sys.exit(1)

    section_id = data.get('section_id', 'unknown')
    text = data.get('text', '')
    if not text:
        print(json.dumps({'error': 'text is required'}), file=sys.stderr)
        sys.exit(1)

    claim_id = compute_claim_id(section_id, text)
    claims_path = os.path.join(args.dir, 'claims.jsonl')

    if any(row.get('claim_id') == claim_id for row in read_jsonl(claims_path)):
        print(json.dumps({'status': 'duplicate', 'claim_id': claim_id}))
        return

    valid_types = {'factual', 'synthesis', 'recommendation', 'speculation'}
    claim_type = data.get('claim_type', 'factual')
    if claim_type not in valid_types:
        # Unknown types fall back to 'factual'.
        claim_type = 'factual'

    claim = {
        'claim_id': claim_id,
        'section_id': section_id,
        'text': text,
        'claim_type': claim_type,
        'cited_source_ids': data.get('cited_source_ids', []),
        'evidence_ids': data.get('evidence_ids', []),
        'support_status': 'unverified',
        'extracted_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(claims_path, claim)
    print(json.dumps({'status': 'added', 'claim_id': claim_id}))
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
def cmd_list(args: argparse.Namespace) -> None:
    """Print the claims ledger as JSON, optionally filtered.

    Rows from ``<args.dir>/claims.jsonl`` are narrowed by whichever of
    ``args.section``, ``args.type`` and ``args.status`` are set, then reduced
    to the first row seen for each claim id.
    """
    ledger = read_jsonl(os.path.join(args.dir, 'claims.jsonl'))

    criteria = (
        ('section_id', args.section),
        ('claim_type', args.type),
        ('support_status', args.status),
    )
    for field, wanted in criteria:
        if wanted:
            ledger = [entry for entry in ledger if entry.get(field) == wanted]

    # Keep only the first row per claim id, preserving ledger order.
    first_by_id = {}
    for entry in ledger:
        first_by_id.setdefault(entry.get('claim_id'), entry)
    distinct = list(first_by_id.values())

    print(json.dumps({'count': len(distinct), 'claims': distinct}, indent=2, ensure_ascii=False))
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
def cmd_stats(args: argparse.Namespace) -> None:
    """Print claim counts by type, support status and section as JSON.

    Rows from ``<args.dir>/claims.jsonl`` are first reduced to the first row
    seen for each claim id. Missing fields are counted under 'unknown'.
    """
    ledger = read_jsonl(os.path.join(args.dir, 'claims.jsonl'))

    # Keep only the first row per claim id, preserving ledger order.
    first_by_id = {}
    for entry in ledger:
        first_by_id.setdefault(entry.get('claim_id'), entry)

    tallies = {'by_type': {}, 'by_status': {}, 'by_section': {}}
    grouping = (
        ('by_type', 'claim_type'),
        ('by_status', 'support_status'),
        ('by_section', 'section_id'),
    )
    for entry in first_by_id.values():
        for bucket, field in grouping:
            key = entry.get(field, 'unknown')
            tallies[bucket][key] = tallies[bucket].get(key, 0) + 1

    print(json.dumps({'total': len(first_by_id), **tallies}, indent=2))
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
# ---------------------------------------------------------------------------
|
|
316
|
-
# CLI entry point
|
|
317
|
-
# ---------------------------------------------------------------------------
|
|
318
|
-
|
|
319
|
-
def main() -> None:
    """Parse the command line and run the selected subcommand.

    Subcommands: ``extract``, ``add``, ``list`` and ``stats``. Each one
    requires ``--dir``, the run directory.
    """
    cli = argparse.ArgumentParser(
        prog='extract_claims',
        description='Atomic claim extraction and ledger for deep-research v3.0',
    )
    commands = cli.add_subparsers(dest='command', required=True)

    # extract
    extract_cmd = commands.add_parser('extract', help='Extract claims from markdown report')
    extract_cmd.add_argument('--report', required=True, help='Path to report.md')
    extract_cmd.add_argument('--dir', required=True, help='Run directory containing claims.jsonl')

    # add
    add_cmd = commands.add_parser('add', help='Manually add a single claim')
    add_cmd.add_argument('--json', required=True, help='JSON with section_id, text, claim_type')
    add_cmd.add_argument('--dir', required=True, help='Run directory')

    # list
    list_cmd = commands.add_parser('list', help='List claims')
    list_cmd.add_argument('--dir', required=True, help='Run directory')
    for flag, field in (('--section', 'section_id'),
                        ('--type', 'claim_type'),
                        ('--status', 'support_status')):
        list_cmd.add_argument(flag, default=None, help=f'Filter by {field}')

    # stats
    stats_cmd = commands.add_parser('stats', help='Claim statistics')
    stats_cmd.add_argument('--dir', required=True, help='Run directory')

    parsed = cli.parse_args()
    handler = {
        'extract': cmd_extract,
        'add': cmd_add,
        'list': cmd_list,
        'stats': cmd_stats,
    }[parsed.command]
    handler(parsed)


if __name__ == '__main__':
    main()
|
package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/md_to_html.py
DELETED
|
@@ -1,330 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Markdown to HTML converter for research reports
|
|
4
|
-
Properly converts markdown sections to HTML while preserving structure and formatting
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import re
|
|
8
|
-
from typing import Tuple
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def convert_markdown_to_html(markdown_text: str) -> Tuple[str, str]:
    """
    Convert markdown to HTML in two parts: content and bibliography

    Args:
        markdown_text: Full markdown report text

    Returns:
        Tuple of (content_html, bibliography_html)
    """
    # Split at the first line that starts with '## Bibliography'. Anchoring
    # to the line start keeps '### Bibliography ...' subsections and in-prose
    # mentions from being cut in half, and maxsplit=1 keeps everything after
    # the heading in the bibliography instead of dropping whatever follows a
    # second occurrence.
    parts = re.split(r'^## Bibliography', markdown_text, maxsplit=1, flags=re.MULTILINE)
    content_md = parts[0]
    bibliography_md = parts[1] if len(parts) > 1 else ""

    # Convert content (everything except bibliography)
    content_html = _convert_content_section(content_md)

    # Convert bibliography separately
    bibliography_html = _convert_bibliography_section(bibliography_md)

    return content_html, bibliography_html
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def _convert_content_section(markdown: str) -> str:
    """Convert the main (non-bibliography) part of a report to HTML.

    Everything before the first '## ' heading is dropped. Headings, inline
    emphasis/code, lists, tables and paragraphs are then converted in that
    order. Each '## ' section is wrapped in a ``<div class="section">`` and
    an 'Executive Summary' section additionally gets an inner
    ``<div class="executive-summary">`` wrapper.
    """
    lines = markdown.split('\n')

    # Remove title and front matter: keep from the first level-2 heading on.
    # ('## ' with its trailing space can never match a '### ' line.)
    first_section = next(
        (idx for idx, line in enumerate(lines) if line.startswith('## ')),
        len(lines),
    )
    html = '\n'.join(lines[first_section:])

    # Convert headers
    # ## Section Title -> opens <div class="section"> (closed by _close_sections)
    html = re.sub(
        r'^## (.+)$',
        r'<div class="section"><h2 class="section-title">\1</h2>',
        html,
        flags=re.MULTILINE
    )

    # ### Subsection -> <h3 class="subsection-title">Subsection</h3>
    html = re.sub(
        r'^### (.+)$',
        r'<h3 class="subsection-title">\1</h3>',
        html,
        flags=re.MULTILINE
    )

    # #### Subsubsection -> <h4 class="subsubsection-title">Title</h4>
    html = re.sub(
        r'^#### (.+)$',
        r'<h4 class="subsubsection-title">\1</h4>',
        html,
        flags=re.MULTILINE
    )

    # Convert **bold** text (must run before the single-asterisk rule)
    html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)

    # Convert *italic* text
    html = re.sub(r'\*(.+?)\*', r'<em>\1</em>', html)

    # Convert inline code `code`
    html = re.sub(r'`(.+?)`', r'<code>\1</code>', html)

    # Convert lists
    html = _convert_lists(html)

    # Convert tables
    html = _convert_tables(html)

    # Convert paragraphs (wrap non-HTML lines in <p> tags)
    html = _convert_paragraphs(html)

    # Close all open sections
    html = _close_sections(html)

    # Wrap executive summary if present
    summary_heading = '<h2 class="section-title">Executive Summary</h2>'
    summary_open = '<div class="executive-summary">'
    html = html.replace(summary_heading, summary_open + summary_heading)

    # Close each executive-summary wrapper at the end of its section. After
    # _close_sections every later section line is preceded by a '</div>'
    # line, so the wrapper needs one extra '</div>' right before the next
    # section opens (or at the very end when it is the last section).
    next_section_marker = '\n<div class="section">'
    pos = html.find(summary_open)
    while pos != -1:
        boundary = html.find(next_section_marker, pos)
        if boundary == -1:
            html += '</div>'
            break
        html = html[:boundary] + '</div>' + html[boundary:]
        pos = html.find(summary_open, boundary)

    return html
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
def _convert_bibliography_section(markdown: str) -> str:
|
|
119
|
-
"""Convert bibliography section to HTML"""
|
|
120
|
-
if not markdown.strip():
|
|
121
|
-
return ""
|
|
122
|
-
|
|
123
|
-
html = markdown
|
|
124
|
-
|
|
125
|
-
# Convert each [N] citation to a proper bibliography entry
|
|
126
|
-
# Look for patterns like [1] Title - URL
|
|
127
|
-
html = re.sub(
|
|
128
|
-
r'\[(\d+)\]\s*(.+?)\s*-\s*(https?://[^\s\)]+)',
|
|
129
|
-
r'<div class="bib-entry"><span class="bib-number">[\1]</span> <a href="\3" target="_blank">\2</a></div>',
|
|
130
|
-
html
|
|
131
|
-
)
|
|
132
|
-
|
|
133
|
-
# Convert any remaining **bold** sections
|
|
134
|
-
html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
|
|
135
|
-
|
|
136
|
-
# Wrap in bibliography content div
|
|
137
|
-
html = f'<div class="bibliography-content">{html}</div>'
|
|
138
|
-
|
|
139
|
-
return html
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
def _convert_lists(html: str) -> str:
|
|
143
|
-
"""Convert markdown lists to HTML lists"""
|
|
144
|
-
lines = html.split('\n')
|
|
145
|
-
result = []
|
|
146
|
-
in_list = False
|
|
147
|
-
list_level = 0
|
|
148
|
-
|
|
149
|
-
for i, line in enumerate(lines):
|
|
150
|
-
stripped = line.strip()
|
|
151
|
-
|
|
152
|
-
# Check for unordered list item
|
|
153
|
-
if stripped.startswith('- ') or stripped.startswith('* '):
|
|
154
|
-
if not in_list:
|
|
155
|
-
result.append('<ul>')
|
|
156
|
-
in_list = True
|
|
157
|
-
list_level = len(line) - len(line.lstrip())
|
|
158
|
-
|
|
159
|
-
# Get the content after the marker
|
|
160
|
-
content = stripped[2:]
|
|
161
|
-
result.append(f'<li>{content}</li>')
|
|
162
|
-
|
|
163
|
-
# Check for ordered list item
|
|
164
|
-
elif re.match(r'^\d+\.\s', stripped):
|
|
165
|
-
if not in_list:
|
|
166
|
-
result.append('<ol>')
|
|
167
|
-
in_list = True
|
|
168
|
-
list_level = len(line) - len(line.lstrip())
|
|
169
|
-
|
|
170
|
-
# Get the content after the number and period
|
|
171
|
-
content = re.sub(r'^\d+\.\s', '', stripped)
|
|
172
|
-
result.append(f'<li>{content}</li>')
|
|
173
|
-
|
|
174
|
-
else:
|
|
175
|
-
# Not a list item
|
|
176
|
-
if in_list:
|
|
177
|
-
# Check if we're still in the list (indented continuation)
|
|
178
|
-
current_level = len(line) - len(line.lstrip())
|
|
179
|
-
if current_level > list_level and stripped:
|
|
180
|
-
# Continuation of previous list item
|
|
181
|
-
if result[-1].endswith('</li>'):
|
|
182
|
-
result[-1] = result[-1][:-5] + ' ' + stripped + '</li>'
|
|
183
|
-
continue
|
|
184
|
-
else:
|
|
185
|
-
# End of list
|
|
186
|
-
result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
|
|
187
|
-
in_list = False
|
|
188
|
-
list_level = 0
|
|
189
|
-
|
|
190
|
-
result.append(line)
|
|
191
|
-
|
|
192
|
-
# Close any remaining open list
|
|
193
|
-
if in_list:
|
|
194
|
-
result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
|
|
195
|
-
|
|
196
|
-
return '\n'.join(result)
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
def _convert_tables(html: str) -> str:
|
|
200
|
-
"""Convert markdown tables to HTML tables"""
|
|
201
|
-
lines = html.split('\n')
|
|
202
|
-
result = []
|
|
203
|
-
in_table = False
|
|
204
|
-
|
|
205
|
-
for i, line in enumerate(lines):
|
|
206
|
-
if '|' in line and line.strip().startswith('|'):
|
|
207
|
-
if not in_table:
|
|
208
|
-
result.append('<table>')
|
|
209
|
-
in_table = True
|
|
210
|
-
# This is the header row
|
|
211
|
-
cells = [cell.strip() for cell in line.split('|')[1:-1]]
|
|
212
|
-
result.append('<thead><tr>')
|
|
213
|
-
for cell in cells:
|
|
214
|
-
result.append(f'<th>{cell}</th>')
|
|
215
|
-
result.append('</tr></thead>')
|
|
216
|
-
result.append('<tbody>')
|
|
217
|
-
elif '---' in line:
|
|
218
|
-
# Skip separator row
|
|
219
|
-
continue
|
|
220
|
-
else:
|
|
221
|
-
# Data row
|
|
222
|
-
cells = [cell.strip() for cell in line.split('|')[1:-1]]
|
|
223
|
-
result.append('<tr>')
|
|
224
|
-
for cell in cells:
|
|
225
|
-
result.append(f'<td>{cell}</td>')
|
|
226
|
-
result.append('</tr>')
|
|
227
|
-
else:
|
|
228
|
-
if in_table:
|
|
229
|
-
result.append('</tbody></table>')
|
|
230
|
-
in_table = False
|
|
231
|
-
result.append(line)
|
|
232
|
-
|
|
233
|
-
if in_table:
|
|
234
|
-
result.append('</tbody></table>')
|
|
235
|
-
|
|
236
|
-
return '\n'.join(result)
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
def _convert_paragraphs(html: str) -> str:
|
|
240
|
-
"""Wrap non-HTML lines in paragraph tags"""
|
|
241
|
-
lines = html.split('\n')
|
|
242
|
-
result = []
|
|
243
|
-
in_paragraph = False
|
|
244
|
-
|
|
245
|
-
for line in lines:
|
|
246
|
-
stripped = line.strip()
|
|
247
|
-
|
|
248
|
-
# Skip empty lines
|
|
249
|
-
if not stripped:
|
|
250
|
-
if in_paragraph:
|
|
251
|
-
result.append('</p>')
|
|
252
|
-
in_paragraph = False
|
|
253
|
-
result.append(line)
|
|
254
|
-
continue
|
|
255
|
-
|
|
256
|
-
# Skip lines that are already HTML tags
|
|
257
|
-
if (stripped.startswith('<') and stripped.endswith('>')) or \
|
|
258
|
-
stripped.startswith('</') or \
|
|
259
|
-
'<h' in stripped or '<div' in stripped or '<ul' in stripped or \
|
|
260
|
-
'<ol' in stripped or '<li' in stripped or '<table' in stripped or \
|
|
261
|
-
'</div>' in stripped or '</ul>' in stripped or '</ol>' in stripped:
|
|
262
|
-
if in_paragraph:
|
|
263
|
-
result.append('</p>')
|
|
264
|
-
in_paragraph = False
|
|
265
|
-
result.append(line)
|
|
266
|
-
continue
|
|
267
|
-
|
|
268
|
-
# Regular text line - wrap in paragraph
|
|
269
|
-
if not in_paragraph:
|
|
270
|
-
result.append('<p>' + line)
|
|
271
|
-
in_paragraph = True
|
|
272
|
-
else:
|
|
273
|
-
result.append(line)
|
|
274
|
-
|
|
275
|
-
if in_paragraph:
|
|
276
|
-
result.append('</p>')
|
|
277
|
-
|
|
278
|
-
return '\n'.join(result)
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
def _close_sections(html: str) -> str:
|
|
282
|
-
"""Close all open section divs"""
|
|
283
|
-
# Count open and closed divs
|
|
284
|
-
open_divs = html.count('<div class="section">')
|
|
285
|
-
closed_divs = html.count('</div>')
|
|
286
|
-
|
|
287
|
-
# Add closing divs for sections
|
|
288
|
-
# Each section should be closed before the next section starts
|
|
289
|
-
lines = html.split('\n')
|
|
290
|
-
result = []
|
|
291
|
-
section_open = False
|
|
292
|
-
|
|
293
|
-
for i, line in enumerate(lines):
|
|
294
|
-
if '<div class="section">' in line:
|
|
295
|
-
if section_open:
|
|
296
|
-
result.append('</div>') # Close previous section
|
|
297
|
-
section_open = True
|
|
298
|
-
result.append(line)
|
|
299
|
-
|
|
300
|
-
# Close final section if still open
|
|
301
|
-
if section_open:
|
|
302
|
-
result.append('</div>')
|
|
303
|
-
|
|
304
|
-
return '\n'.join(result)
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
def main():
    """Convert the markdown file named on the command line and print a preview.

    Prints the first 1000 characters of the content HTML and the first 500
    characters of the bibliography HTML. Exits with status 1 when no file
    argument is given or the file does not exist.
    """
    import sys

    # Usage and error messages go to stderr so stdout carries only the preview.
    if len(sys.argv) < 2:
        print("Usage: python md_to_html.py <markdown_file>", file=sys.stderr)
        sys.exit(1)

    md_file = Path(sys.argv[1])
    if not md_file.exists():
        print(f"Error: File {md_file} not found", file=sys.stderr)
        sys.exit(1)

    # Decode explicitly as UTF-8 rather than with the locale default.
    markdown_text = md_file.read_text(encoding='utf-8')
    content_html, bib_html = convert_markdown_to_html(markdown_text)

    print("=== CONTENT HTML ===")
    print(content_html[:1000])
    print("\n=== BIBLIOGRAPHY HTML ===")
    print(bib_html[:500])


if __name__ == "__main__":
    main()
|