@clawos-dev/clawd 0.2.50-beta.77.3a9364e → 0.2.51-beta.78.2024c11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/dist/persona-defaults/persona-clawd-helper/CLAUDE.md +1 -1
  2. package/dist/persona-defaults/persona-knowledge-base/CLAUDE.md +19 -0
  3. package/dist/persona-defaults/persona-researcher/CLAUDE.md +20 -1
  4. package/package.json +1 -1
  5. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/SKILL.md +0 -187
  6. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/archive-template.md +0 -21
  7. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/article-template.md +0 -20
  8. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/index-template.md +0 -18
  9. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/raw-template.md +0 -7
  10. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/README.md +0 -119
  11. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/SKILL.md +0 -108
  12. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/continuation.md +0 -167
  13. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/html-generation.md +0 -103
  14. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/methodology.md +0 -421
  15. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/quality-gates.md +0 -192
  16. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/report-assembly.md +0 -130
  17. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/weasyprint_guidelines.md +0 -324
  18. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/requirements.txt +0 -14
  19. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/claim.schema.json +0 -49
  20. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/evidence.schema.json +0 -43
  21. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/run_manifest.schema.json +0 -97
  22. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/source.schema.json +0 -49
  23. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/citation_manager.py +0 -300
  24. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/evidence_store.py +0 -205
  25. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/extract_claims.py +0 -358
  26. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/md_to_html.py +0 -330
  27. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/research_engine.py +0 -584
  28. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/source_evaluator.py +0 -292
  29. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/validate_report.py +0 -354
  30. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_citations.py +0 -426
  31. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_claim_support.py +0 -344
  32. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_html.py +0 -220
  33. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/mckinsey_report_template.html +0 -443
  34. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/report_template.md +0 -414
  35. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/invalid_report.md +0 -27
  36. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/valid_report.md +0 -114
  37. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_citation_manager.py +0 -195
  38. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_evidence_store.py +0 -166
  39. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_extract_claims.py +0 -213
  40. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_verify_claim_support.py +0 -230
  41. package/dist/persona-defaults/persona-researcher/skills-lock.json +0 -11
@@ -1,358 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Atomic Claim Extractor — decomposes report sections into typed claims.
4
-
5
- CLI subcommands:
6
- extract Parse a markdown report into atomic claims (claims.jsonl)
7
- add Manually add a single claim
8
- list List claims, optionally filtered by section or type
9
- stats Show claim statistics (counts by type/status)
10
-
11
- Claim identity:
12
- claim_id = sha256(section_id + normalized_text)[:16]
13
-
14
- Claim types (per GPT Pro's refinement of Codex's proposal):
15
- - factual: hard-fails on lack of support
16
- - synthesis: needs traceability, softer threshold
17
- - recommendation: needs traceability, softer threshold
18
- - speculation: labeled, no support gate
19
- """
20
-
21
- import argparse
22
- import hashlib
23
- import json
24
- import os
25
- import re
26
- import sys
27
- from datetime import datetime, timezone
28
-
29
-
30
- # ---------------------------------------------------------------------------
31
- # Claim ID computation
32
- # ---------------------------------------------------------------------------
33
-
34
# Any run of whitespace (spaces, tabs, newlines, ...) is collapsed by this pattern.
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_text(text: str) -> str:
    """Return *text* in the canonical form used for claim hashing.

    Leading/trailing whitespace is removed, every internal whitespace run is
    replaced by a single space, and the result is lower-cased.
    """
    trimmed = text.strip()
    collapsed = _WHITESPACE_RE.sub(' ', trimmed)
    return collapsed.lower()
40
-
41
-
42
def compute_claim_id(section_id: str, text: str) -> str:
    """Derive the 16-hex-character identifier of a claim.

    The identifier is the first 16 hex digits of the SHA-256 digest of
    ``section_id`` directly concatenated (no separator) with the normalized
    claim text, encoded as UTF-8.
    """
    digest = hashlib.sha256()
    digest.update((section_id + normalize_text(text)).encode('utf-8'))
    return digest.hexdigest()[:16]
46
-
47
-
48
- # ---------------------------------------------------------------------------
49
- # JSONL helpers
50
- # ---------------------------------------------------------------------------
51
-
52
def append_jsonl(path: str, obj: dict) -> None:
    """Append *obj* to the JSONL file at *path* as one JSON line.

    The file is created if it does not exist. Records are serialized with
    ``ensure_ascii=False``, so non-ASCII characters are written verbatim; the
    file is therefore opened explicitly as UTF-8 instead of relying on the
    platform default encoding.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')
55
-
56
-
57
def read_jsonl(path: str) -> list[dict]:
    """Load every JSON record from the JSONL file at *path*.

    Returns an empty list when the file does not exist. Blank lines are
    skipped. The file is decoded as UTF-8 (records may contain non-ASCII
    text) rather than with the platform default encoding.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    rows = []
    if not os.path.exists(path):
        return rows
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                rows.append(json.loads(line))
    return rows
67
-
68
-
69
- # ---------------------------------------------------------------------------
70
- # Report parsing helpers
71
- # ---------------------------------------------------------------------------
72
-
73
- # Section header patterns
74
def _finding_section_id(match):
    """Build ``finding_<N>`` from the number captured in a "## Finding N" heading."""
    return f'finding_{match.group(1)}'


def _slug_section_id(match):
    """Slugify an arbitrary "## Title" heading: lower-case, non-word runs -> '_', max 30 chars."""
    return re.sub(r'\W+', '_', match.group(1).strip().lower())[:30]


# (heading regex, section id or callable producing one from the match).
# Order matters: the first matching rule wins, and the last rule is a
# catch-all for any other level-2 heading.
_SECTION_RULES = (
    (r'^##\s+Executive\s+Summary', 'executive_summary'),
    (r'^##\s+Introduction', 'introduction'),
    (r'^##\s+Finding\s+(\d+)', _finding_section_id),
    (r'^##\s+Synthesis', 'synthesis'),
    (r'^##\s+Limitations', 'limitations'),
    (r'^##\s+Recommendations', 'recommendations'),
    (r'^##\s+Conclusion', 'conclusion'),
    (r'^##\s+(.+)', _slug_section_id),
)
SECTION_PATTERNS = [
    (re.compile(regex, re.I), section) for regex, section in _SECTION_RULES
]

# Inline numeric citations: "[N]" or a comma-separated group "[N, M]".
CITATION_RE = re.compile(r'\[(\d+(?:,\s*\d+)*)\]')

# Sentence boundary: whitespace that follows '.', '!' or '?' and precedes an
# upper-case ASCII letter.
SENTENCE_RE = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
90
-
91
-
92
def classify_claim(text: str, section_id: str) -> str:
    """Heuristically classify a claim sentence.

    Checks are applied in order and the first hit wins:

    1. ``'recommendation'`` - the text contains a recommendation keyword
       (in any section).
    2. ``'speculation'`` - the text contains a hedging phrase.
    3. ``'synthesis'`` - the claim sits in a synthesis/conclusion/limitations
       section and contains a summarizing phrase.
    4. ``'factual'`` - everything else.

    Matching is a case-insensitive *substring* test, so a keyword also matches
    inside longer words (e.g. ``'consider'`` inside ``'considerable'``).

    Args:
        text: The claim sentence.
        section_id: Identifier of the section the sentence came from.

    Returns:
        One of ``'recommendation'``, ``'speculation'``, ``'synthesis'``,
        ``'factual'``.
    """
    lower = text.lower()

    # Recommendation indicators. The section does not influence the outcome:
    # a recommendation keyword yields 'recommendation' everywhere.
    if any(w in lower for w in ('should', 'recommend', 'suggest', 'advise', 'consider')):
        return 'recommendation'

    # Speculation indicators
    if any(w in lower for w in ('might', 'could potentially', 'it is possible',
                                'may eventually', 'hypothetically', 'speculatively')):
        return 'speculation'

    # Synthesis indicators only count inside summarizing sections
    if section_id in ('synthesis', 'conclusion', 'limitations') and any(
        w in lower for w in ('overall', 'taken together', 'collectively',
                             'the evidence suggests', 'this implies')
    ):
        return 'synthesis'

    # Default: factual
    return 'factual'
115
-
116
-
117
def parse_sections(markdown: str) -> list[tuple[str, str]]:
    """Split a markdown report into ``(section_id, content)`` pairs.

    Each line is tested against ``SECTION_PATTERNS``; the first matching
    pattern starts a new section and the heading line itself is not kept.
    Lines before the first heading are collected under ``'preamble'``.
    A section with no collected lines is not emitted.
    """
    sections = []
    section_id = 'preamble'
    buffer = []

    for line in markdown.split('\n'):
        heading_id = None
        for pattern, id_or_fn in SECTION_PATTERNS:
            m = pattern.match(line)
            if m:
                heading_id = id_or_fn(m) if callable(id_or_fn) else id_or_fn
                break

        if heading_id is None:
            # Ordinary content line: belongs to the section in progress.
            buffer.append(line)
            continue

        # Heading line: flush what was collected, then start the new section.
        if buffer:
            sections.append((section_id, '\n'.join(buffer)))
        section_id = heading_id
        buffer = []

    if buffer:
        sections.append((section_id, '\n'.join(buffer)))

    return sections
142
-
143
-
144
def extract_sentences(text: str) -> list[str]:
    """Return the candidate claim sentences found in *text*.

    Bullet markers and bold/italic asterisks are stripped first, then the
    text is split with ``SENTENCE_RE``. A piece is kept only if, after
    trimming, it is longer than 30 characters and does not start with ``#``
    (heading) or ``|`` (table row).
    """
    cleaned = re.sub(r'^[-*]\s+', '', text, flags=re.M)     # bullet points
    cleaned = re.sub(r'\*\*([^*]+)\*\*', r'\1', cleaned)    # bold
    cleaned = re.sub(r'\*([^*]+)\*', r'\1', cleaned)        # italic

    pieces = (piece.strip() for piece in SENTENCE_RE.split(cleaned))
    return [
        piece for piece in pieces
        if len(piece) > 30 and not piece.startswith(('#', '|'))
    ]
159
-
160
-
161
- # ---------------------------------------------------------------------------
162
- # Subcommands
163
- # ---------------------------------------------------------------------------
164
-
165
def cmd_extract(args: argparse.Namespace) -> None:
    """Extract atomic claims from a markdown report into ``claims.jsonl``.

    Reads ``args.report``, splits it into sections and sentences, and appends
    one claim record per new sentence to ``<args.dir>/claims.jsonl``. Content
    before the first heading (the ``'preamble'`` section) is ignored, and
    sentences whose claim id is already in the ledger are skipped. A JSON
    summary is printed to stdout.

    Exits with status 1 (JSON error on stderr) if the report does not exist.
    """
    report_path = args.report
    if not os.path.exists(report_path):
        print(json.dumps({'error': f'Report not found: {report_path}'}), file=sys.stderr)
        sys.exit(1)

    # Decode explicitly as UTF-8 instead of the platform default encoding.
    with open(report_path, encoding='utf-8') as f:
        markdown = f.read()

    claims_path = os.path.join(args.dir, 'claims.jsonl')
    # Tolerate ledger rows without a claim_id instead of raising KeyError,
    # in line with the .get() access used by the other subcommands.
    existing_ids = {
        r['claim_id'] for r in read_jsonl(claims_path) if 'claim_id' in r
    }

    sections = parse_sections(markdown)
    added = 0
    skipped = 0

    for section_id, content in sections:
        if section_id == 'preamble':
            continue
        for sentence in extract_sentences(content):
            claim_id = compute_claim_id(section_id, sentence)
            if claim_id in existing_ids:
                skipped += 1
                continue

            # Collect every citation number referenced in the sentence,
            # flattening grouped citations such as "[2, 3]".
            citation_nums = [
                int(n.strip())
                for m in CITATION_RE.finditer(sentence)
                for n in m.group(1).split(',')
            ]

            claim = {
                'claim_id': claim_id,
                'section_id': section_id,
                'text': sentence,
                'claim_type': classify_claim(sentence, section_id),
                'cited_source_ids': [],  # Populated by linking step
                'evidence_ids': [],  # Populated by verify_claim_support
                'support_status': 'unverified',
                'extracted_at': datetime.now(timezone.utc).isoformat(),
                '_citation_numbers': citation_nums,  # Temporary, for linking
            }
            append_jsonl(claims_path, claim)
            existing_ids.add(claim_id)
            added += 1

    print(json.dumps({
        'status': 'ok',
        'claims_added': added,
        'claims_skipped': skipped,
        'total_claims': len(existing_ids),
    }))
219
-
220
-
221
def cmd_add(args: argparse.Namespace) -> None:
    """Manually add a single claim described by the ``--json`` payload.

    The payload must be a JSON object with a non-empty ``text``; ``section_id``
    defaults to ``'unknown'`` and an unrecognized ``claim_type`` falls back to
    ``'factual'``. If a claim with the same id already exists in
    ``<args.dir>/claims.jsonl`` a ``duplicate`` status is printed and nothing
    is written.

    Exits with status 1 (JSON error on stderr) when the payload is not valid
    JSON, is not an object, or has no ``text``.
    """
    # Report malformed input the same way as the other errors of this CLI
    # instead of crashing with a traceback.
    try:
        data = json.loads(args.json)
    except json.JSONDecodeError as exc:
        print(json.dumps({'error': f'Invalid JSON: {exc}'}), file=sys.stderr)
        sys.exit(1)
    if not isinstance(data, dict):
        print(json.dumps({'error': 'JSON payload must be an object'}), file=sys.stderr)
        sys.exit(1)

    section_id = data.get('section_id', 'unknown')
    text = data.get('text', '')
    if not text:
        print(json.dumps({'error': 'text is required'}), file=sys.stderr)
        sys.exit(1)

    claim_id = compute_claim_id(section_id, text)
    claims_path = os.path.join(args.dir, 'claims.jsonl')

    if any(row.get('claim_id') == claim_id for row in read_jsonl(claims_path)):
        print(json.dumps({'status': 'duplicate', 'claim_id': claim_id}))
        return

    valid_types = {'factual', 'synthesis', 'recommendation', 'speculation'}
    claim_type = data.get('claim_type', 'factual')
    if claim_type not in valid_types:
        claim_type = 'factual'

    claim = {
        'claim_id': claim_id,
        'section_id': section_id,
        'text': text,
        'claim_type': claim_type,
        'cited_source_ids': data.get('cited_source_ids', []),
        'evidence_ids': data.get('evidence_ids', []),
        'support_status': 'unverified',
        'extracted_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(claims_path, claim)
    print(json.dumps({'status': 'added', 'claim_id': claim_id}))
256
-
257
-
258
def cmd_list(args: argparse.Namespace) -> None:
    """Print the claims in ``<args.dir>/claims.jsonl`` as indented JSON.

    Optional ``args.section``, ``args.type`` and ``args.status`` restrict the
    output to rows whose ``section_id``, ``claim_type`` and ``support_status``
    equal the given value. After filtering, only the first row per
    ``claim_id`` is kept.
    """
    rows = read_jsonl(os.path.join(args.dir, 'claims.jsonl'))

    # Apply each filter only when the corresponding option was supplied.
    criteria = (
        ('section_id', args.section),
        ('claim_type', args.type),
        ('support_status', args.status),
    )
    for field, wanted in criteria:
        if wanted:
            rows = [row for row in rows if row.get(field) == wanted]

    # Keep the first occurrence of every claim_id, preserving file order.
    first_by_id = {}
    for row in rows:
        first_by_id.setdefault(row.get('claim_id'), row)
    unique = list(first_by_id.values())

    print(json.dumps({'count': len(unique), 'claims': unique}, indent=2, ensure_ascii=False))
280
-
281
-
282
def cmd_stats(args: argparse.Namespace) -> None:
    """Print claim counts for ``<args.dir>/claims.jsonl`` as indented JSON.

    Rows are de-duplicated by ``claim_id`` (first occurrence wins) and then
    tallied by ``claim_type``, ``support_status`` and ``section_id``; a row
    missing one of these fields is counted under ``'unknown'``.
    """
    first_by_id = {}
    for row in read_jsonl(os.path.join(args.dir, 'claims.jsonl')):
        first_by_id.setdefault(row.get('claim_id'), row)
    unique = list(first_by_id.values())

    # Output key -> record field that is being counted.
    groupings = (
        ('by_type', 'claim_type'),
        ('by_status', 'support_status'),
        ('by_section', 'section_id'),
    )
    tallies = {out_key: {} for out_key, _ in groupings}
    for row in unique:
        for out_key, field in groupings:
            value = row.get(field, 'unknown')
            tallies[out_key][value] = tallies[out_key].get(value, 0) + 1

    print(json.dumps({'total': len(unique), **tallies}, indent=2))
313
-
314
-
315
- # ---------------------------------------------------------------------------
316
- # CLI entry point
317
- # ---------------------------------------------------------------------------
318
-
319
def main() -> None:
    """Parse the command line and run the selected subcommand.

    Subcommands: ``extract``, ``add``, ``list`` and ``stats``; one of them is
    required.
    """
    parser = argparse.ArgumentParser(
        prog='extract_claims',
        description='Atomic claim extraction and ledger for deep-research v3.0',
    )
    commands = parser.add_subparsers(dest='command', required=True)

    # extract
    extract_parser = commands.add_parser('extract', help='Extract claims from markdown report')
    extract_parser.add_argument('--report', required=True, help='Path to report.md')
    extract_parser.add_argument('--dir', required=True, help='Run directory containing claims.jsonl')

    # add
    add_parser = commands.add_parser('add', help='Manually add a single claim')
    add_parser.add_argument('--json', required=True, help='JSON with section_id, text, claim_type')
    add_parser.add_argument('--dir', required=True, help='Run directory')

    # list
    list_parser = commands.add_parser('list', help='List claims')
    list_parser.add_argument('--dir', required=True, help='Run directory')
    optional_filters = (
        ('--section', 'Filter by section_id'),
        ('--type', 'Filter by claim_type'),
        ('--status', 'Filter by support_status'),
    )
    for flag, help_text in optional_filters:
        list_parser.add_argument(flag, default=None, help=help_text)

    # stats
    stats_parser = commands.add_parser('stats', help='Claim statistics')
    stats_parser.add_argument('--dir', required=True, help='Run directory')

    namespace = parser.parse_args()
    handlers = {
        'extract': cmd_extract,
        'add': cmd_add,
        'list': cmd_list,
        'stats': cmd_stats,
    }
    handler = handlers[namespace.command]
    handler(namespace)
355
-
356
-
357
- if __name__ == '__main__':
358
- main()
@@ -1,330 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Markdown to HTML converter for research reports
4
- Properly converts markdown sections to HTML while preserving structure and formatting
5
- """
6
-
7
- import re
8
- from typing import Tuple
9
- from pathlib import Path
10
-
11
-
12
def convert_markdown_to_html(markdown_text: str) -> Tuple[str, str]:
    """
    Convert markdown to HTML in two parts: content and bibliography

    The report is cut at the first occurrence of ``## Bibliography``;
    everything before it is the content and everything after it is the
    bibliography. Splitting only once keeps the whole remainder in the
    bibliography even if the marker text appears again later.

    Args:
        markdown_text: Full markdown report text

    Returns:
        Tuple of (content_html, bibliography_html); bibliography_html is an
        empty string when the report has no bibliography.
    """
    # Split content and bibliography (at the first marker only)
    parts = markdown_text.split('## Bibliography', 1)
    content_md = parts[0]
    bibliography_md = parts[1] if len(parts) > 1 else ""

    # Convert content (everything except bibliography)
    content_html = _convert_content_section(content_md)

    # Convert bibliography separately
    bibliography_html = _convert_bibliography_section(bibliography_md)

    return content_html, bibliography_html
34
-
35
-
36
def _convert_content_section(markdown: str) -> str:
    """Convert the main (non-bibliography) part of a report to HTML.

    Steps, in order: drop everything before the first ``## `` heading,
    convert headings, convert bold/italic/inline code, then run the list,
    table, paragraph and section-closing passes, and finally wrap the
    "Executive Summary" heading in an ``executive-summary`` div.
    """
    # Discard the title / front matter: keep lines from the first level-2
    # heading onwards (nothing is kept if there is no such heading).
    all_lines = markdown.split('\n')
    first_section = next(
        (idx for idx, text in enumerate(all_lines) if text.startswith('## ')),
        len(all_lines),
    )
    html = '\n'.join(all_lines[first_section:])

    # Headings. A level-2 heading also opens a section div, which is closed
    # later by _close_sections.
    heading_rules = (
        (r'^## (.+)$', r'<div class="section"><h2 class="section-title">\1</h2>'),
        (r'^### (.+)$', r'<h3 class="subsection-title">\1</h3>'),
        (r'^#### (.+)$', r'<h4 class="subsubsection-title">\1</h4>'),
    )
    for pattern, replacement in heading_rules:
        html = re.sub(pattern, replacement, html, flags=re.MULTILINE)

    # Inline markup: bold must run before italic so "**" is not read as "*".
    inline_rules = (
        (r'\*\*(.+?)\*\*', r'<strong>\1</strong>'),
        (r'\*(.+?)\*', r'<em>\1</em>'),
        (r'`(.+?)`', r'<code>\1</code>'),
    )
    for pattern, replacement in inline_rules:
        html = re.sub(pattern, replacement, html)

    # Block-level passes: lists, tables, paragraphs, then section closing.
    html = _convert_lists(html)
    html = _convert_tables(html)
    html = _convert_paragraphs(html)
    html = _close_sections(html)

    # Wrap executive summary if present
    summary_heading = '<h2 class="section-title">Executive Summary</h2>'
    html = html.replace(
        summary_heading,
        '<div class="executive-summary">' + summary_heading,
    )
    if '<div class="executive-summary">' in html:
        # Close the wrapper at the first heading that is directly followed
        # by a section div on the next line.
        html = html.replace(
            '</h2>\n<div class="section">',
            '</h2></div>\n<div class="section">',
            1,
        )

    return html
116
-
117
-
118
- def _convert_bibliography_section(markdown: str) -> str:
119
- """Convert bibliography section to HTML"""
120
- if not markdown.strip():
121
- return ""
122
-
123
- html = markdown
124
-
125
- # Convert each [N] citation to a proper bibliography entry
126
- # Look for patterns like [1] Title - URL
127
- html = re.sub(
128
- r'\[(\d+)\]\s*(.+?)\s*-\s*(https?://[^\s\)]+)',
129
- r'<div class="bib-entry"><span class="bib-number">[\1]</span> <a href="\3" target="_blank">\2</a></div>',
130
- html
131
- )
132
-
133
- # Convert any remaining **bold** sections
134
- html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
135
-
136
- # Wrap in bibliography content div
137
- html = f'<div class="bibliography-content">{html}</div>'
138
-
139
- return html
140
-
141
-
142
- def _convert_lists(html: str) -> str:
143
- """Convert markdown lists to HTML lists"""
144
- lines = html.split('\n')
145
- result = []
146
- in_list = False
147
- list_level = 0
148
-
149
- for i, line in enumerate(lines):
150
- stripped = line.strip()
151
-
152
- # Check for unordered list item
153
- if stripped.startswith('- ') or stripped.startswith('* '):
154
- if not in_list:
155
- result.append('<ul>')
156
- in_list = True
157
- list_level = len(line) - len(line.lstrip())
158
-
159
- # Get the content after the marker
160
- content = stripped[2:]
161
- result.append(f'<li>{content}</li>')
162
-
163
- # Check for ordered list item
164
- elif re.match(r'^\d+\.\s', stripped):
165
- if not in_list:
166
- result.append('<ol>')
167
- in_list = True
168
- list_level = len(line) - len(line.lstrip())
169
-
170
- # Get the content after the number and period
171
- content = re.sub(r'^\d+\.\s', '', stripped)
172
- result.append(f'<li>{content}</li>')
173
-
174
- else:
175
- # Not a list item
176
- if in_list:
177
- # Check if we're still in the list (indented continuation)
178
- current_level = len(line) - len(line.lstrip())
179
- if current_level > list_level and stripped:
180
- # Continuation of previous list item
181
- if result[-1].endswith('</li>'):
182
- result[-1] = result[-1][:-5] + ' ' + stripped + '</li>'
183
- continue
184
- else:
185
- # End of list
186
- result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
187
- in_list = False
188
- list_level = 0
189
-
190
- result.append(line)
191
-
192
- # Close any remaining open list
193
- if in_list:
194
- result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
195
-
196
- return '\n'.join(result)
197
-
198
-
199
- def _convert_tables(html: str) -> str:
200
- """Convert markdown tables to HTML tables"""
201
- lines = html.split('\n')
202
- result = []
203
- in_table = False
204
-
205
- for i, line in enumerate(lines):
206
- if '|' in line and line.strip().startswith('|'):
207
- if not in_table:
208
- result.append('<table>')
209
- in_table = True
210
- # This is the header row
211
- cells = [cell.strip() for cell in line.split('|')[1:-1]]
212
- result.append('<thead><tr>')
213
- for cell in cells:
214
- result.append(f'<th>{cell}</th>')
215
- result.append('</tr></thead>')
216
- result.append('<tbody>')
217
- elif '---' in line:
218
- # Skip separator row
219
- continue
220
- else:
221
- # Data row
222
- cells = [cell.strip() for cell in line.split('|')[1:-1]]
223
- result.append('<tr>')
224
- for cell in cells:
225
- result.append(f'<td>{cell}</td>')
226
- result.append('</tr>')
227
- else:
228
- if in_table:
229
- result.append('</tbody></table>')
230
- in_table = False
231
- result.append(line)
232
-
233
- if in_table:
234
- result.append('</tbody></table>')
235
-
236
- return '\n'.join(result)
237
-
238
-
239
- def _convert_paragraphs(html: str) -> str:
240
- """Wrap non-HTML lines in paragraph tags"""
241
- lines = html.split('\n')
242
- result = []
243
- in_paragraph = False
244
-
245
- for line in lines:
246
- stripped = line.strip()
247
-
248
- # Skip empty lines
249
- if not stripped:
250
- if in_paragraph:
251
- result.append('</p>')
252
- in_paragraph = False
253
- result.append(line)
254
- continue
255
-
256
- # Skip lines that are already HTML tags
257
- if (stripped.startswith('<') and stripped.endswith('>')) or \
258
- stripped.startswith('</') or \
259
- '<h' in stripped or '<div' in stripped or '<ul' in stripped or \
260
- '<ol' in stripped or '<li' in stripped or '<table' in stripped or \
261
- '</div>' in stripped or '</ul>' in stripped or '</ol>' in stripped:
262
- if in_paragraph:
263
- result.append('</p>')
264
- in_paragraph = False
265
- result.append(line)
266
- continue
267
-
268
- # Regular text line - wrap in paragraph
269
- if not in_paragraph:
270
- result.append('<p>' + line)
271
- in_paragraph = True
272
- else:
273
- result.append(line)
274
-
275
- if in_paragraph:
276
- result.append('</p>')
277
-
278
- return '\n'.join(result)
279
-
280
-
281
- def _close_sections(html: str) -> str:
282
- """Close all open section divs"""
283
- # Count open and closed divs
284
- open_divs = html.count('<div class="section">')
285
- closed_divs = html.count('</div>')
286
-
287
- # Add closing divs for sections
288
- # Each section should be closed before the next section starts
289
- lines = html.split('\n')
290
- result = []
291
- section_open = False
292
-
293
- for i, line in enumerate(lines):
294
- if '<div class="section">' in line:
295
- if section_open:
296
- result.append('</div>') # Close previous section
297
- section_open = True
298
- result.append(line)
299
-
300
- # Close final section if still open
301
- if section_open:
302
- result.append('</div>')
303
-
304
- return '\n'.join(result)
305
-
306
-
307
def main():
    """Test the converter with a sample markdown file.

    Usage: ``python md_to_html.py <markdown_file>``. Prints the first 1000
    characters of the content HTML and the first 500 characters of the
    bibliography HTML. Exits with status 1 when no file is given or the file
    does not exist.
    """
    import sys

    if len(sys.argv) < 2:
        print("Usage: python md_to_html.py <markdown_file>")
        sys.exit(1)

    md_file = Path(sys.argv[1])
    if not md_file.exists():
        print(f"Error: File {md_file} not found")
        sys.exit(1)

    # Decode explicitly as UTF-8 instead of the platform default encoding.
    markdown_text = md_file.read_text(encoding="utf-8")
    content_html, bib_html = convert_markdown_to_html(markdown_text)

    print("=== CONTENT HTML ===")
    print(content_html[:1000])
    print("\n=== BIBLIOGRAPHY HTML ===")
    print(bib_html[:500])
327
-
328
-
329
- if __name__ == "__main__":
330
- main()