@clawos-dev/clawd 0.2.50 → 0.2.51-beta.78.2024c11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/dist/persona-defaults/persona-clawd-helper/CLAUDE.md +1 -1
  2. package/dist/persona-defaults/persona-knowledge-base/CLAUDE.md +19 -0
  3. package/dist/persona-defaults/persona-researcher/CLAUDE.md +20 -1
  4. package/package.json +1 -1
  5. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/SKILL.md +0 -187
  6. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/archive-template.md +0 -21
  7. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/article-template.md +0 -20
  8. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/index-template.md +0 -18
  9. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/raw-template.md +0 -7
  10. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/README.md +0 -119
  11. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/SKILL.md +0 -108
  12. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/continuation.md +0 -167
  13. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/html-generation.md +0 -103
  14. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/methodology.md +0 -421
  15. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/quality-gates.md +0 -192
  16. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/report-assembly.md +0 -130
  17. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/weasyprint_guidelines.md +0 -324
  18. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/requirements.txt +0 -14
  19. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/claim.schema.json +0 -49
  20. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/evidence.schema.json +0 -43
  21. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/run_manifest.schema.json +0 -97
  22. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/source.schema.json +0 -49
  23. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/citation_manager.py +0 -300
  24. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/evidence_store.py +0 -205
  25. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/extract_claims.py +0 -358
  26. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/md_to_html.py +0 -330
  27. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/research_engine.py +0 -584
  28. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/source_evaluator.py +0 -292
  29. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/validate_report.py +0 -354
  30. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_citations.py +0 -426
  31. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_claim_support.py +0 -344
  32. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_html.py +0 -220
  33. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/mckinsey_report_template.html +0 -443
  34. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/report_template.md +0 -414
  35. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/invalid_report.md +0 -27
  36. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/valid_report.md +0 -114
  37. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_citation_manager.py +0 -195
  38. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_evidence_store.py +0 -166
  39. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_extract_claims.py +0 -213
  40. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_verify_claim_support.py +0 -230
  41. package/dist/persona-defaults/persona-researcher/skills-lock.json +0 -11
@@ -1,43 +0,0 @@
1
- {
2
- "$schema": "https://json-schema.org/draft/2020-12/schema",
3
- "title": "Evidence",
4
- "description": "A piece of evidence extracted from a source. evidence_id = sha256(source_id + normalized_quote + locator)[:16].",
5
- "type": "object",
6
- "required": ["evidence_id", "source_id", "quote", "evidence_type", "captured_at"],
7
- "properties": {
8
- "evidence_id": {
9
- "type": "string",
10
- "pattern": "^[0-9a-f]{16}$",
11
- "description": "sha256(source_id + normalized_quote + locator)[:16]"
12
- },
13
- "source_id": {
14
- "type": "string",
15
- "pattern": "^[0-9a-f]{16}$",
16
- "description": "References a source in sources.jsonl"
17
- },
18
- "retrieval_query": {
19
- "type": ["string", "null"],
20
- "description": "The search query or prompt that led to this evidence",
21
- "default": null
22
- },
23
- "locator": {
24
- "type": ["string", "null"],
25
- "description": "Page number, section heading, URL fragment, or timestamp within the source",
26
- "default": null
27
- },
28
- "quote": {
29
- "type": "string",
30
- "description": "Exact or near-exact text extracted from the source"
31
- },
32
- "evidence_type": {
33
- "type": "string",
34
- "enum": ["direct_quote", "paraphrase", "data_point", "figure_reference", "methodology"],
35
- "description": "How the evidence was captured"
36
- },
37
- "captured_at": {
38
- "type": "string",
39
- "format": "date-time"
40
- }
41
- },
42
- "additionalProperties": false
43
- }
@@ -1,97 +0,0 @@
1
- {
2
- "$schema": "https://json-schema.org/draft/2020-12/schema",
3
- "title": "RunManifest",
4
- "description": "Manifest for a single research run. Created at init, updated throughout.",
5
- "type": "object",
6
- "required": ["version", "query", "mode", "started_at", "report_dir", "artifact_paths"],
7
- "properties": {
8
- "version": {
9
- "type": "string",
10
- "const": "3.0.0"
11
- },
12
- "query": {
13
- "type": "string",
14
- "description": "Original research question"
15
- },
16
- "mode": {
17
- "type": "string",
18
- "enum": ["quick", "standard", "deep", "ultradeep"]
19
- },
20
- "started_at": {
21
- "type": "string",
22
- "format": "date-time"
23
- },
24
- "finished_at": {
25
- "type": ["string", "null"],
26
- "format": "date-time",
27
- "default": null
28
- },
29
- "assumptions": {
30
- "type": "array",
31
- "items": {
32
- "type": "object",
33
- "required": ["assumption_id", "text", "materiality", "status"],
34
- "properties": {
35
- "assumption_id": {
36
- "type": "string",
37
- "pattern": "^asm_[0-9a-f]{8}$"
38
- },
39
- "text": { "type": "string" },
40
- "materiality": {
41
- "type": "string",
42
- "enum": ["low", "medium", "high"]
43
- },
44
- "status": {
45
- "type": "string",
46
- "enum": ["implicit", "user_confirmed", "evidence_validated"]
47
- }
48
- },
49
- "additionalProperties": false
50
- },
51
- "default": []
52
- },
53
- "provider_config": {
54
- "type": "object",
55
- "properties": {
56
- "primary": {
57
- "type": "string",
58
- "description": "Primary search provider (e.g. WebSearch)"
59
- },
60
- "scholarly": {
61
- "type": ["string", "null"],
62
- "description": "Scholarly API provider if configured (e.g. openalex, semantic_scholar)"
63
- }
64
- },
65
- "default": { "primary": "search-cli", "scholarly": null }
66
- },
67
- "report_dir": {
68
- "type": "string",
69
- "description": "Absolute path to the report directory"
70
- },
71
- "artifact_paths": {
72
- "type": "object",
73
- "required": ["sources", "evidence", "claims", "report"],
74
- "properties": {
75
- "sources": { "type": "string", "default": "sources.jsonl" },
76
- "evidence": { "type": "string", "default": "evidence.jsonl" },
77
- "claims": { "type": "string", "default": "claims.jsonl" },
78
- "report": { "type": "string", "default": "report.md" }
79
- },
80
- "additionalProperties": false
81
- },
82
- "continuation": {
83
- "type": ["object", "null"],
84
- "description": "Populated when resuming a previous run",
85
- "properties": {
86
- "previous_run_manifest": { "type": "string" },
87
- "resumed_at": { "type": "string", "format": "date-time" },
88
- "sections_completed": {
89
- "type": "array",
90
- "items": { "type": "string" }
91
- }
92
- },
93
- "default": null
94
- }
95
- },
96
- "additionalProperties": false
97
- }
@@ -1,49 +0,0 @@
1
- {
2
- "$schema": "https://json-schema.org/draft/2020-12/schema",
3
- "title": "Source",
4
- "description": "A research source with stable identity. source_id = sha256(canonical_locator)[:16].",
5
- "type": "object",
6
- "required": ["source_id", "canonical_locator", "raw_url", "title", "source_type", "metadata_status", "registered_at"],
7
- "properties": {
8
- "source_id": {
9
- "type": "string",
10
- "pattern": "^[0-9a-f]{16}$",
11
- "description": "sha256(canonical_locator)[:16] — stable across edits and continuation"
12
- },
13
- "canonical_locator": {
14
- "type": "string",
15
- "description": "Canonical identifier: doi:10.1038/..., arxiv:2305.14251, or normalized URL (scheme+host+path, no fragment/tracking params)"
16
- },
17
- "raw_url": {
18
- "type": "string",
19
- "description": "Original URL as retrieved, before normalization"
20
- },
21
- "title": {
22
- "type": "string"
23
- },
24
- "authors": {
25
- "type": ["array", "null"],
26
- "items": { "type": "string" },
27
- "default": null
28
- },
29
- "year": {
30
- "type": ["string", "null"],
31
- "default": null
32
- },
33
- "source_type": {
34
- "type": "string",
35
- "enum": ["web", "academic", "documentation", "code", "news", "government", "book"]
36
- },
37
- "metadata_status": {
38
- "type": "string",
39
- "enum": ["unverified", "doi_verified", "url_verified", "title_matched"],
40
- "description": "How far metadata has been verified"
41
- },
42
- "registered_at": {
43
- "type": "string",
44
- "format": "date-time",
45
- "description": "ISO 8601 timestamp when source was registered"
46
- }
47
- },
48
- "additionalProperties": false
49
- }
@@ -1,300 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Citation Manager — stable source identity and run manifest management.
4
-
5
- CLI subcommands:
6
- init-run Create run_manifest.json + empty artifact JSONL files
7
- register-source Append a source to sources.jsonl, return source_id
8
- assign-display-numbers Generate stable_id -> display_number mapping
9
- export-bibliography Render bibliography from sources.jsonl
10
-
11
- Source identity:
12
- source_id = sha256(canonical_locator)[:16]
13
- canonical_locator = doi:..., arxiv:..., or normalized URL
14
-
15
- All state is append-only JSONL. No mutable citation numbers in state files.
16
- """
17
-
18
- import argparse
19
- import hashlib
20
- import json
21
- import os
22
- import re
23
- import sys
24
- from datetime import datetime, timezone
25
- from urllib.parse import urlparse, urlunparse
26
-
27
-
28
- # ---------------------------------------------------------------------------
29
- # Canonical locator normalization
30
- # ---------------------------------------------------------------------------
31
-
32
DOI_RE = re.compile(r'(?:https?://(?:dx\.)?doi\.org/|doi:)(10\.\d{4,}/\S+)', re.IGNORECASE)
ARXIV_RE = re.compile(r'(?:https?://arxiv\.org/abs/|arxiv:)(\d{4}\.\d{4,}(?:v\d+)?)', re.IGNORECASE)

# URL query params that are tracking noise, not content identifiers
TRACKING_PARAMS = frozenset([
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'ref', 'source', 'fbclid', 'gclid', 'mc_cid', 'mc_eid',
])

# Ports implied by the scheme; these are omitted from the canonical form
_DEFAULT_PORTS = {'http': 80, 'https': 443}


def canonicalize_locator(raw_url: str) -> str:
    """Derive a canonical locator from a raw URL or identifier string.

    Priority: DOI > arXiv > normalized URL.

    The normalized URL keeps a lowercased scheme and host, an explicit
    non-default port, the path without trailing slashes, and the
    non-tracking query parameters in sorted order. The fragment is dropped.

    Args:
        raw_url: URL or identifier (``doi:...``, ``arxiv:...``) as retrieved.

    Returns:
        ``doi:<id>``, ``arxiv:<id>``, or the normalized URL string.
    """
    # DOI
    m = DOI_RE.search(raw_url)
    if m:
        # A trailing '.' is sentence punctuation, not part of the DOI
        return f'doi:{m.group(1).rstrip(".")}'

    # arXiv
    m = ARXIV_RE.search(raw_url)
    if m:
        return f'arxiv:{m.group(1)}'

    # Normalized URL: lowercase scheme+host, strip fragment and tracking params
    parsed = urlparse(raw_url)
    scheme = (parsed.scheme or 'https').lower()
    host = (parsed.hostname or '').lower()
    if ':' in host:
        # .hostname strips the brackets of an IPv6 literal; restore them
        host = f'[{host}]'

    # .hostname also drops the port. Keep an explicit non-default port so
    # that services on different ports do not share one source_id.
    try:
        port = parsed.port
    except ValueError:
        # Malformed port: fall back to the host-only form
        port = None
    if port is not None and port != _DEFAULT_PORTS.get(scheme):
        host = f'{host}:{port}'

    path = parsed.path.rstrip('/')

    # Drop tracking params and empty segments (e.g. from 'a=1&&b=2'), then
    # sort so that parameter order does not affect identity.
    kept = sorted(
        part
        for part in parsed.query.split('&')
        if part and part.split('=', 1)[0].lower() not in TRACKING_PARAMS
    )
    return urlunparse((scheme, host, path, '', '&'.join(kept), ''))
73
-
74
-
75
def compute_source_id(canonical_locator: str) -> str:
    """Return the first 16 hex characters of sha256(canonical_locator).

    The locator is hashed as UTF-8 bytes.
    """
    encoded = canonical_locator.encode('utf-8')
    full_digest = hashlib.sha256(encoded).hexdigest()
    return full_digest[:16]
78
-
79
-
80
- # ---------------------------------------------------------------------------
81
- # JSONL helpers
82
- # ---------------------------------------------------------------------------
83
-
84
def append_jsonl(path: str, obj: dict) -> None:
    """Append ``obj`` to ``path`` as a single JSON line.

    The file is created if it does not exist.

    Args:
        path: JSONL file to append to.
        obj: JSON-serializable mapping to write.
    """
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # pinned to UTF-8 rather than left to the locale default.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')
87
-
88
-
89
def read_jsonl(path: str) -> list[dict]:
    """Load every non-blank line of a JSONL file as a parsed JSON value.

    Args:
        path: JSONL file to read.

    Returns:
        Parsed rows in file order; an empty list when ``path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(path):
        return []
    # Rows may contain raw non-ASCII text, so decode as UTF-8 instead of
    # relying on the locale default.
    with open(path, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [json.loads(line) for line in stripped if line]
99
-
100
-
101
- # ---------------------------------------------------------------------------
102
- # Subcommands
103
- # ---------------------------------------------------------------------------
104
-
105
def cmd_init_run(args: argparse.Namespace) -> None:
    """Create run_manifest.json and empty JSONL artifact files.

    Reads ``args.out_dir``, ``args.query`` and ``args.mode``. The output
    directory is created if needed. An existing manifest is overwritten;
    existing artifact files are left untouched. A JSON status object is
    printed to stdout.
    """
    out_dir = os.path.abspath(args.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    artifact_paths = {
        'sources': 'sources.jsonl',
        'evidence': 'evidence.jsonl',
        'claims': 'claims.jsonl',
        'report': 'report.md',
    }

    manifest = {
        'version': '3.0.0',
        'query': args.query or '',
        'mode': args.mode,
        'started_at': datetime.now(timezone.utc).isoformat(),
        'finished_at': None,
        'assumptions': [],
        'provider_config': {
            'primary': 'search-cli',
            'scholarly': None,
        },
        'report_dir': out_dir,
        'artifact_paths': artifact_paths,
        'continuation': None,
    }

    manifest_path = os.path.join(out_dir, 'run_manifest.json')
    # ensure_ascii=False writes raw non-ASCII characters (e.g. in the query),
    # so the file encoding is pinned to UTF-8.
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
        f.write('\n')

    # Create empty artifact files; report.md is not created here
    for name in ('sources', 'evidence', 'claims'):
        p = os.path.join(out_dir, artifact_paths[name])
        if not os.path.exists(p):
            with open(p, 'w', encoding='utf-8'):
                pass

    print(json.dumps({'status': 'ok', 'manifest': manifest_path, 'dir': out_dir}))
145
-
146
-
147
def cmd_register_source(args: argparse.Namespace) -> None:
    """Register a source, append to sources.jsonl, print source_id.

    ``args.json`` is a JSON object that must carry ``raw_url`` (or ``url``).
    ``args.dir`` is the run directory holding sources.jsonl.

    Prints a JSON status object to stdout: ``registered`` for a new source,
    ``duplicate`` when a row with the same source_id already exists. Exits
    with status 1 when no URL is supplied.
    """
    data = json.loads(args.json)
    raw_url = data.get('raw_url', data.get('url', ''))
    if not raw_url:
        print(json.dumps({'error': 'raw_url is required'}), file=sys.stderr)
        sys.exit(1)

    # A caller-supplied canonical_locator takes precedence over derivation
    canonical = data.get('canonical_locator') or canonicalize_locator(raw_url)
    source_id = compute_source_id(canonical)

    sources_path = os.path.join(args.dir, 'sources.jsonl')

    # Check for duplicate
    existing = read_jsonl(sources_path)
    if any(row.get('source_id') == source_id for row in existing):
        print(json.dumps({
            'status': 'duplicate',
            'source_id': source_id,
            'canonical_locator': canonical,
        }))
        return

    # The Source schema types year as string-or-null; callers commonly pass
    # a bare number, so coerce it rather than store a schema-invalid row.
    year = data.get('year')
    if year is not None:
        year = str(year)

    source = {
        'source_id': source_id,
        'canonical_locator': canonical,
        'raw_url': raw_url,
        'title': data.get('title', ''),
        'authors': data.get('authors'),
        'year': year,
        'source_type': data.get('source_type', 'web'),
        'metadata_status': data.get('metadata_status', 'unverified'),
        'registered_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(sources_path, source)
    print(json.dumps({
        'status': 'registered',
        'source_id': source_id,
        'canonical_locator': canonical,
    }))
188
-
189
-
190
def cmd_assign_display_numbers(args: argparse.Namespace) -> None:
    """Read sources.jsonl, assign stable display numbers in registration order.

    Prints a JSON object mapping source_id to a 1-based display number.
    Numbers are consecutive over unique source_ids, so they match the
    numbering that export-bibliography produces even when sources.jsonl
    contains repeated rows.
    """
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    mapping: dict[str, int] = {}
    for src in sources:
        sid = src['source_id']
        if sid not in mapping:
            # Count unique ids, not rows: a row index would leave gaps
            # after any duplicate row.
            mapping[sid] = len(mapping) + 1

    print(json.dumps(mapping, indent=2))
202
-
203
-
204
def cmd_export_bibliography(args: argparse.Namespace) -> None:
    """Generate bibliography from sources.jsonl.

    ``args.dir`` is the run directory; ``args.style`` selects the output:
    ``markdown`` prints a numbered reference list, ``json`` prints an array
    of entries with their display numbers. Any other style prints an error
    to stderr and exits with status 1.
    """
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    # Deduplicate by source_id, preserve order (first occurrence wins)
    seen = set()
    unique = []
    for src in sources:
        if src['source_id'] not in seen:
            seen.add(src['source_id'])
            unique.append(src)

    style = args.style

    if style == 'markdown':
        lines = ['## Bibliography', '']
        for i, src in enumerate(unique, 1):
            author_str = ''
            authors = src.get('authors')
            if authors:
                if len(authors) == 1:
                    author_str = f'{authors[0]}. '
                elif len(authors) == 2:
                    author_str = f'{authors[0]} & {authors[1]}. '
                else:
                    author_str = f'{authors[0]} et al. '

            year_str = f'({src["year"]})' if src.get('year') else '(n.d.)'
            # register-source stores a missing title as '', so fall back on
            # any falsy value; a key-only default would never trigger and
            # would leave the link text empty.
            title = src.get('title') or 'Untitled'
            url = src.get('raw_url', '')
            lines.append(f'[{i}] {author_str}{year_str}. [{title}]({url})')
        print('\n'.join(lines))

    elif style == 'json':
        out = [
            {
                'display_number': i,
                'source_id': src['source_id'],
                'canonical_locator': src['canonical_locator'],
                'title': src.get('title', ''),
                'authors': src.get('authors'),
                'year': src.get('year'),
                'raw_url': src.get('raw_url', ''),
            }
            for i, src in enumerate(unique, 1)
        ]
        print(json.dumps(out, indent=2, ensure_ascii=False))

    else:
        print(f'Unknown style: {style}', file=sys.stderr)
        sys.exit(1)
255
-
256
-
257
- # ---------------------------------------------------------------------------
258
- # CLI entry point
259
- # ---------------------------------------------------------------------------
260
-
261
def main() -> None:
    """Parse the command line and run the selected citation_manager subcommand."""
    handlers = {
        'init-run': cmd_init_run,
        'register-source': cmd_register_source,
        'assign-display-numbers': cmd_assign_display_numbers,
        'export-bibliography': cmd_export_bibliography,
    }

    cli = argparse.ArgumentParser(
        prog='citation_manager',
        description='Stable source identity and run manifest management for deep-research v3.0',
    )
    commands = cli.add_subparsers(dest='command', required=True)

    # init-run: set up a new run directory
    init_cmd = commands.add_parser('init-run', help='Create run manifest and empty artifact files')
    init_cmd.add_argument('--out-dir', required=True, help='Output directory for the research run')
    init_cmd.add_argument('--query', default='', help='Original research question')
    init_cmd.add_argument('--mode', default='standard', choices=['quick', 'standard', 'deep', 'ultradeep'])

    # register-source: add one source row
    register_cmd = commands.add_parser('register-source', help='Register a source and return its stable ID')
    register_cmd.add_argument('--json', required=True, help='JSON object with at least raw_url and title')
    register_cmd.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # assign-display-numbers: stable id -> display number
    number_cmd = commands.add_parser('assign-display-numbers', help='Map stable source IDs to display numbers')
    number_cmd.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # export-bibliography: render the reference list
    biblio_cmd = commands.add_parser('export-bibliography', help='Generate bibliography from sources')
    biblio_cmd.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')
    biblio_cmd.add_argument('--style', default='markdown', choices=['markdown', 'json'])

    namespace = cli.parse_args()
    handler = handlers[namespace.command]
    handler(namespace)


if __name__ == '__main__':
    main()
@@ -1,205 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Evidence Store — append-only evidence persistence for deep-research v3.0.
4
-
5
- CLI subcommands:
6
- init Create empty evidence.jsonl in a run directory
7
- add Append an evidence row, return evidence_id
8
- list List evidence rows, optionally filtered by source_id
9
- export Export evidence as JSON array
10
-
11
- Evidence identity:
12
- evidence_id = sha256(source_id + normalized_quote + locator)[:16]
13
-
14
- All state is append-only JSONL. Evidence is never modified after capture.
15
- """
16
-
17
- import argparse
18
- import hashlib
19
- import json
20
- import os
21
- import re
22
- import sys
23
- from datetime import datetime, timezone
24
-
25
-
26
- # ---------------------------------------------------------------------------
27
- # Evidence ID computation
28
- # ---------------------------------------------------------------------------
29
-
30
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_quote(quote: str) -> str:
    """Return ``quote`` in the canonical form used for hashing.

    Leading and trailing whitespace is removed, every internal whitespace
    run becomes a single space, and the result is lowercased.
    """
    trimmed = quote.strip()
    collapsed = _WHITESPACE_RE.sub(' ', trimmed)
    return collapsed.lower()
36
-
37
-
38
def compute_evidence_id(source_id: str, quote: str, locator: str | None) -> str:
    """Return the first 16 hex characters of sha256(source_id + normalized_quote + locator).

    A missing locator contributes an empty string. The parts are joined
    without a separator and hashed as UTF-8 bytes.
    """
    parts = (source_id, normalize_quote(quote), locator or '')
    payload = ''.join(parts).encode('utf-8')
    full_digest = hashlib.sha256(payload).hexdigest()
    return full_digest[:16]
42
-
43
-
44
- # ---------------------------------------------------------------------------
45
- # JSONL helpers (shared pattern with citation_manager)
46
- # ---------------------------------------------------------------------------
47
-
48
def append_jsonl(path: str, obj: dict) -> None:
    """Serialize ``obj`` as JSON and append it to ``path`` as one line.

    The file is created if it does not exist.

    Args:
        path: JSONL file to append to.
        obj: JSON-serializable mapping to write.
    """
    line = json.dumps(obj, ensure_ascii=False) + '\n'
    # The line may hold raw non-ASCII text (ensure_ascii=False), so write it
    # as UTF-8 instead of the locale default encoding.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
51
-
52
-
53
def read_jsonl(path: str) -> list[dict]:
    """Parse each non-blank line of a JSONL file.

    Args:
        path: JSONL file to read.

    Returns:
        Parsed rows in file order; an empty list when ``path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(path):
        return []
    rows = []
    # Decode explicitly as UTF-8: stored quotes may contain raw non-ASCII
    # text and the locale default encoding is not guaranteed to be UTF-8.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                rows.append(json.loads(line))
    return rows
63
-
64
-
65
- # ---------------------------------------------------------------------------
66
- # Subcommands
67
- # ---------------------------------------------------------------------------
68
-
69
def cmd_init(args: argparse.Namespace) -> None:
    """Ensure an evidence.jsonl exists in the run directory ``args.dir``.

    A missing file (and its directory) is created empty; an existing file is
    left as it is. Prints a JSON status object with the file path.
    """
    run_dir = os.path.abspath(args.dir)
    target = os.path.join(run_dir, 'evidence.jsonl')

    already_present = os.path.exists(target)
    if not already_present:
        os.makedirs(run_dir, exist_ok=True)
        with open(target, 'w'):
            pass

    status = {'status': 'ok', 'path': target}
    print(json.dumps(status))
77
-
78
-
79
def cmd_add(args: argparse.Namespace) -> None:
    """Append evidence row, print evidence_id.

    ``args.json`` is a JSON object that must carry ``source_id`` and
    ``quote``; ``locator``, ``evidence_type`` and ``retrieval_query`` are
    optional. ``args.dir`` is the run directory holding evidence.jsonl.

    Prints a JSON status object to stdout: ``added`` for a new row,
    ``duplicate`` when a row with the same evidence_id already exists. Exits
    with status 1 when a required field is missing. An unrecognized
    ``evidence_type`` is stored as ``direct_quote`` and a warning is written
    to stderr.
    """
    data = json.loads(args.json)
    source_id = data.get('source_id', '')
    quote = data.get('quote', '')
    if not source_id or not quote:
        print(json.dumps({'error': 'source_id and quote are required'}), file=sys.stderr)
        sys.exit(1)

    locator = data.get('locator')
    evidence_id = compute_evidence_id(source_id, quote, locator)
    evidence_path = os.path.join(args.dir, 'evidence.jsonl')

    # Check for duplicate
    existing = read_jsonl(evidence_path)
    if any(row.get('evidence_id') == evidence_id for row in existing):
        print(json.dumps({
            'status': 'duplicate',
            'evidence_id': evidence_id,
        }))
        return

    valid_types = {'direct_quote', 'paraphrase', 'data_point', 'figure_reference', 'methodology'}
    evidence_type = data.get('evidence_type', 'direct_quote')
    if evidence_type not in valid_types:
        # Keep the best-effort fallback, but say so: silently relabelling
        # e.g. a misspelled 'paraphrase' as a direct quote hides bad input.
        print(json.dumps({
            'warning': f'unknown evidence_type {evidence_type!r}; stored as direct_quote',
        }), file=sys.stderr)
        evidence_type = 'direct_quote'

    row = {
        'evidence_id': evidence_id,
        'source_id': source_id,
        'retrieval_query': data.get('retrieval_query'),
        'locator': locator,
        'quote': quote,
        'evidence_type': evidence_type,
        'captured_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(evidence_path, row)
    print(json.dumps({
        'status': 'added',
        'evidence_id': evidence_id,
        'source_id': source_id,
    }))
122
-
123
-
124
def cmd_list(args: argparse.Namespace) -> None:
    """Print the evidence rows of a run, optionally limited to one source.

    ``args.dir`` is the run directory; a truthy ``args.source_id`` keeps only
    rows with that source_id. Rows sharing an evidence_id are reported once
    (first occurrence). Output is a JSON object with ``count`` and
    ``evidence``.
    """
    rows = read_jsonl(os.path.join(args.dir, 'evidence.jsonl'))

    wanted_source = args.source_id
    if wanted_source:
        rows = [row for row in rows if row.get('source_id') == wanted_source]

    # Keep the first row seen for each evidence_id; dicts preserve
    # insertion order, so file order is retained.
    first_by_id = {}
    for row in rows:
        first_by_id.setdefault(row.get('evidence_id'), row)
    unique = list(first_by_id.values())

    summary = {
        'count': len(unique),
        'evidence': unique,
    }
    print(json.dumps(summary, indent=2, ensure_ascii=False))
145
-
146
-
147
def cmd_export(args: argparse.Namespace) -> None:
    """Print every evidence row of the run in ``args.dir`` as one JSON array.

    Rows sharing an evidence_id are emitted once (first occurrence), in file
    order.
    """
    all_rows = read_jsonl(os.path.join(args.dir, 'evidence.jsonl'))

    # First occurrence of each evidence_id wins
    emitted_ids = set()
    deduplicated = []
    for row in all_rows:
        key = row.get('evidence_id')
        if key in emitted_ids:
            continue
        emitted_ids.add(key)
        deduplicated.append(row)

    print(json.dumps(deduplicated, indent=2, ensure_ascii=False))
162
-
163
-
164
- # ---------------------------------------------------------------------------
165
- # CLI entry point
166
- # ---------------------------------------------------------------------------
167
-
168
def main() -> None:
    """Parse the command line and run the selected evidence_store subcommand."""
    handlers = {
        'init': cmd_init,
        'add': cmd_add,
        'list': cmd_list,
        'export': cmd_export,
    }

    cli = argparse.ArgumentParser(
        prog='evidence_store',
        description='Append-only evidence persistence for deep-research v3.0',
    )
    commands = cli.add_subparsers(dest='command', required=True)

    # init: create the evidence file
    init_cmd = commands.add_parser('init', help='Create empty evidence.jsonl')
    init_cmd.add_argument('--dir', required=True, help='Run directory')

    # add: append one evidence row
    add_cmd = commands.add_parser('add', help='Append evidence row')
    add_cmd.add_argument('--json', required=True, help='JSON with source_id, quote, locator, evidence_type, retrieval_query')
    add_cmd.add_argument('--dir', required=True, help='Run directory containing evidence.jsonl')

    # list: show rows, optionally for a single source
    list_cmd = commands.add_parser('list', help='List evidence rows')
    list_cmd.add_argument('--dir', required=True, help='Run directory')
    list_cmd.add_argument('--source-id', default=None, help='Filter by source_id')

    # export: dump everything as a JSON array
    export_cmd = commands.add_parser('export', help='Export all evidence as JSON array')
    export_cmd.add_argument('--dir', required=True, help='Run directory')

    namespace = cli.parse_args()
    handler = handlers[namespace.command]
    handler(namespace)


if __name__ == '__main__':
    main()