@clawos-dev/clawd 0.2.46 → 0.2.47-beta.70.6ec7522

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/dist/cli.cjs +166 -213
  2. package/package.json +2 -2
  3. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/SKILL.md +0 -187
  4. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/archive-template.md +0 -21
  5. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/article-template.md +0 -20
  6. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/index-template.md +0 -18
  7. package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/raw-template.md +0 -7
  8. package/dist/persona-defaults/persona-knowledge-base/CLAUDE.md +0 -105
  9. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/README.md +0 -119
  10. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/SKILL.md +0 -108
  11. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/continuation.md +0 -167
  12. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/html-generation.md +0 -103
  13. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/methodology.md +0 -421
  14. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/quality-gates.md +0 -192
  15. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/report-assembly.md +0 -130
  16. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/weasyprint_guidelines.md +0 -324
  17. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/requirements.txt +0 -14
  18. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/claim.schema.json +0 -49
  19. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/evidence.schema.json +0 -43
  20. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/run_manifest.schema.json +0 -97
  21. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/source.schema.json +0 -49
  22. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/citation_manager.py +0 -300
  23. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/evidence_store.py +0 -205
  24. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/extract_claims.py +0 -358
  25. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/md_to_html.py +0 -330
  26. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/research_engine.py +0 -584
  27. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/source_evaluator.py +0 -292
  28. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/validate_report.py +0 -354
  29. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_citations.py +0 -426
  30. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_claim_support.py +0 -344
  31. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_html.py +0 -220
  32. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/mckinsey_report_template.html +0 -443
  33. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/report_template.md +0 -414
  34. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/invalid_report.md +0 -27
  35. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/valid_report.md +0 -114
  36. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_citation_manager.py +0 -195
  37. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_evidence_store.py +0 -166
  38. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_extract_claims.py +0 -213
  39. package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_verify_claim_support.py +0 -230
  40. package/dist/persona-defaults/persona-researcher/CLAUDE.md +0 -30
  41. package/dist/persona-defaults/persona-researcher/skills-lock.json +0 -11
@@ -1,43 +0,0 @@
1
- {
2
- "$schema": "https://json-schema.org/draft/2020-12/schema",
3
- "title": "Evidence",
4
- "description": "A piece of evidence extracted from a source. evidence_id = sha256(source_id + normalized_quote + locator)[:16].",
5
- "type": "object",
6
- "required": ["evidence_id", "source_id", "quote", "evidence_type", "captured_at"],
7
- "properties": {
8
- "evidence_id": {
9
- "type": "string",
10
- "pattern": "^[0-9a-f]{16}$",
11
- "description": "sha256(source_id + normalized_quote + locator)[:16]"
12
- },
13
- "source_id": {
14
- "type": "string",
15
- "pattern": "^[0-9a-f]{16}$",
16
- "description": "References a source in sources.jsonl"
17
- },
18
- "retrieval_query": {
19
- "type": ["string", "null"],
20
- "description": "The search query or prompt that led to this evidence",
21
- "default": null
22
- },
23
- "locator": {
24
- "type": ["string", "null"],
25
- "description": "Page number, section heading, URL fragment, or timestamp within the source",
26
- "default": null
27
- },
28
- "quote": {
29
- "type": "string",
30
- "description": "Exact or near-exact text extracted from the source"
31
- },
32
- "evidence_type": {
33
- "type": "string",
34
- "enum": ["direct_quote", "paraphrase", "data_point", "figure_reference", "methodology"],
35
- "description": "How the evidence was captured"
36
- },
37
- "captured_at": {
38
- "type": "string",
39
- "format": "date-time"
40
- }
41
- },
42
- "additionalProperties": false
43
- }
@@ -1,97 +0,0 @@
1
- {
2
- "$schema": "https://json-schema.org/draft/2020-12/schema",
3
- "title": "RunManifest",
4
- "description": "Manifest for a single research run. Created at init, updated throughout.",
5
- "type": "object",
6
- "required": ["version", "query", "mode", "started_at", "report_dir", "artifact_paths"],
7
- "properties": {
8
- "version": {
9
- "type": "string",
10
- "const": "3.0.0"
11
- },
12
- "query": {
13
- "type": "string",
14
- "description": "Original research question"
15
- },
16
- "mode": {
17
- "type": "string",
18
- "enum": ["quick", "standard", "deep", "ultradeep"]
19
- },
20
- "started_at": {
21
- "type": "string",
22
- "format": "date-time"
23
- },
24
- "finished_at": {
25
- "type": ["string", "null"],
26
- "format": "date-time",
27
- "default": null
28
- },
29
- "assumptions": {
30
- "type": "array",
31
- "items": {
32
- "type": "object",
33
- "required": ["assumption_id", "text", "materiality", "status"],
34
- "properties": {
35
- "assumption_id": {
36
- "type": "string",
37
- "pattern": "^asm_[0-9a-f]{8}$"
38
- },
39
- "text": { "type": "string" },
40
- "materiality": {
41
- "type": "string",
42
- "enum": ["low", "medium", "high"]
43
- },
44
- "status": {
45
- "type": "string",
46
- "enum": ["implicit", "user_confirmed", "evidence_validated"]
47
- }
48
- },
49
- "additionalProperties": false
50
- },
51
- "default": []
52
- },
53
- "provider_config": {
54
- "type": "object",
55
- "properties": {
56
- "primary": {
57
- "type": "string",
58
- "description": "Primary search provider (e.g. WebSearch)"
59
- },
60
- "scholarly": {
61
- "type": ["string", "null"],
62
- "description": "Scholarly API provider if configured (e.g. openalex, semantic_scholar)"
63
- }
64
- },
65
- "default": { "primary": "search-cli", "scholarly": null }
66
- },
67
- "report_dir": {
68
- "type": "string",
69
- "description": "Absolute path to the report directory"
70
- },
71
- "artifact_paths": {
72
- "type": "object",
73
- "required": ["sources", "evidence", "claims", "report"],
74
- "properties": {
75
- "sources": { "type": "string", "default": "sources.jsonl" },
76
- "evidence": { "type": "string", "default": "evidence.jsonl" },
77
- "claims": { "type": "string", "default": "claims.jsonl" },
78
- "report": { "type": "string", "default": "report.md" }
79
- },
80
- "additionalProperties": false
81
- },
82
- "continuation": {
83
- "type": ["object", "null"],
84
- "description": "Populated when resuming a previous run",
85
- "properties": {
86
- "previous_run_manifest": { "type": "string" },
87
- "resumed_at": { "type": "string", "format": "date-time" },
88
- "sections_completed": {
89
- "type": "array",
90
- "items": { "type": "string" }
91
- }
92
- },
93
- "default": null
94
- }
95
- },
96
- "additionalProperties": false
97
- }
@@ -1,49 +0,0 @@
1
- {
2
- "$schema": "https://json-schema.org/draft/2020-12/schema",
3
- "title": "Source",
4
- "description": "A research source with stable identity. source_id = sha256(canonical_locator)[:16].",
5
- "type": "object",
6
- "required": ["source_id", "canonical_locator", "raw_url", "title", "source_type", "metadata_status", "registered_at"],
7
- "properties": {
8
- "source_id": {
9
- "type": "string",
10
- "pattern": "^[0-9a-f]{16}$",
11
- "description": "sha256(canonical_locator)[:16] — stable across edits and continuation"
12
- },
13
- "canonical_locator": {
14
- "type": "string",
15
- "description": "Canonical identifier: doi:10.1038/..., arxiv:2305.14251, or normalized URL (scheme+host+path, no fragment/tracking params)"
16
- },
17
- "raw_url": {
18
- "type": "string",
19
- "description": "Original URL as retrieved, before normalization"
20
- },
21
- "title": {
22
- "type": "string"
23
- },
24
- "authors": {
25
- "type": ["array", "null"],
26
- "items": { "type": "string" },
27
- "default": null
28
- },
29
- "year": {
30
- "type": ["string", "null"],
31
- "default": null
32
- },
33
- "source_type": {
34
- "type": "string",
35
- "enum": ["web", "academic", "documentation", "code", "news", "government", "book"]
36
- },
37
- "metadata_status": {
38
- "type": "string",
39
- "enum": ["unverified", "doi_verified", "url_verified", "title_matched"],
40
- "description": "How far metadata has been verified"
41
- },
42
- "registered_at": {
43
- "type": "string",
44
- "format": "date-time",
45
- "description": "ISO 8601 timestamp when source was registered"
46
- }
47
- },
48
- "additionalProperties": false
49
- }
@@ -1,300 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Citation Manager — stable source identity and run manifest management.
4
-
5
- CLI subcommands:
6
- init-run Create run_manifest.json + empty artifact JSONL files
7
- register-source Append a source to sources.jsonl, return source_id
8
- assign-display-numbers Generate stable_id -> display_number mapping
9
- export-bibliography Render bibliography from sources.jsonl
10
-
11
- Source identity:
12
- source_id = sha256(canonical_locator)[:16]
13
- canonical_locator = doi:..., arxiv:..., or normalized URL
14
-
15
- All state is append-only JSONL. No mutable citation numbers in state files.
16
- """
17
-
18
- import argparse
19
- import hashlib
20
- import json
21
- import os
22
- import re
23
- import sys
24
- from datetime import datetime, timezone
25
- from urllib.parse import urlparse, urlunparse
26
-
27
-
28
- # ---------------------------------------------------------------------------
29
- # Canonical locator normalization
30
- # ---------------------------------------------------------------------------
31
-
32
# DOI given either as a doi.org / dx.doi.org URL or with a "doi:" prefix;
# group 1 captures the bare DOI (e.g. 10.1038/...).
DOI_RE = re.compile(r'(?:https?://(?:dx\.)?doi\.org/|doi:)(10\.\d{4,}/\S+)', re.IGNORECASE)
# arXiv abs-page URL or "arxiv:" prefix; group 1 captures the identifier,
# including an optional version suffix such as "v2".
ARXIV_RE = re.compile(r'(?:https?://arxiv\.org/abs/|arxiv:)(\d{4}\.\d{4,}(?:v\d+)?)', re.IGNORECASE)

# URL query params that are tracking noise, not content identifiers
TRACKING_PARAMS = frozenset([
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'ref', 'source', 'fbclid', 'gclid', 'mc_cid', 'mc_eid',
])

# Ports implied by the scheme; an explicit port equal to one of these is dropped.
_DEFAULT_PORTS = {'http': 80, 'https': 443}


def canonicalize_locator(raw_url: str) -> str:
    """Derive a canonical locator from a raw URL or identifier string.

    Priority: DOI > arXiv > normalized URL.

    Args:
        raw_url: A URL, or a ``doi:...`` / ``arxiv:...`` identifier string.

    Returns:
        ``doi:<doi>`` or ``arxiv:<id>`` when the input matches one of those
        patterns. Otherwise a normalized URL: lowercase scheme (``https``
        when absent) and host, non-default port kept, trailing slashes
        stripped from the path, fragment and tracking query parameters
        removed, remaining query parameters sorted.
    """
    # DOI
    m = DOI_RE.search(raw_url)
    if m:
        # A trailing "." is sentence punctuation picked up by \S+, not part of the DOI.
        return f'doi:{m.group(1).rstrip(".")}'

    # arXiv
    m = ARXIV_RE.search(raw_url)
    if m:
        return f'arxiv:{m.group(1)}'

    # Normalized URL
    parsed = urlparse(raw_url)
    scheme = (parsed.scheme or 'https').lower()
    host = (parsed.hostname or '').lower()
    if ':' in host:
        # .hostname strips the brackets from IPv6 literals; restore them so
        # the rebuilt netloc stays unambiguous.
        host = f'[{host}]'

    # Keep a non-default port: services on different ports of the same host
    # are different resources and must not share a locator.
    try:
        port = parsed.port
    except ValueError:
        # Malformed port text: fall back to the host alone rather than crash.
        port = None
    if host and port is not None and port != _DEFAULT_PORTS.get(scheme):
        host = f'{host}:{port}'

    path = parsed.path.rstrip('/')

    # Drop tracking params; sort the rest so parameter order does not affect identity.
    kept = []
    if parsed.query:
        kept = [
            part for part in parsed.query.split('&')
            if part.split('=', 1)[0].lower() not in TRACKING_PARAMS
        ]
    query = '&'.join(sorted(kept))

    return urlunparse((scheme, host, path, '', query, ''))
73
-
74
-
75
def compute_source_id(canonical_locator: str) -> str:
    """Return the stable source ID for a canonical locator.

    The ID is the first 16 hex characters of the SHA-256 digest of the
    UTF-8 encoded locator.
    """
    hasher = hashlib.sha256()
    hasher.update(canonical_locator.encode('utf-8'))
    full_hex = hasher.hexdigest()
    return full_hex[:16]
78
-
79
-
80
- # ---------------------------------------------------------------------------
81
- # JSONL helpers
82
- # ---------------------------------------------------------------------------
83
-
84
def append_jsonl(path: str, obj: dict) -> None:
    """Append ``obj`` to the JSONL file at ``path`` as one JSON line.

    The file is created if it does not exist. It is opened as UTF-8
    explicitly because the row is serialized with ``ensure_ascii=False``;
    relying on the locale default encoding can fail or mis-encode
    non-ASCII text on platforms whose default is not UTF-8.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')
87
-
88
-
89
def read_jsonl(path: str) -> list[dict]:
    """Read a JSONL file and return its rows in file order.

    Blank and whitespace-only lines are skipped. A missing file yields an
    empty list. The file is decoded as UTF-8 explicitly, matching how rows
    are written (``ensure_ascii=False``), instead of depending on the
    locale default encoding.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    if not os.path.exists(path):
        return []
    with open(path, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [json.loads(line) for line in stripped if line]
99
-
100
-
101
- # ---------------------------------------------------------------------------
102
- # Subcommands
103
- # ---------------------------------------------------------------------------
104
-
105
def cmd_init_run(args: argparse.Namespace) -> None:
    """Create run_manifest.json and empty JSONL artifact files.

    Reads ``args.out_dir``, ``args.query`` and ``args.mode``. The output
    directory is created if needed. The manifest is (re)written on every
    call; existing sources/evidence/claims files are left untouched.
    Prints a JSON status object with the manifest path and directory.
    """
    out_dir = os.path.abspath(args.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    artifact_paths = {
        'sources': 'sources.jsonl',
        'evidence': 'evidence.jsonl',
        'claims': 'claims.jsonl',
        'report': 'report.md',
    }

    manifest = {
        'version': '3.0.0',
        'query': args.query or '',
        'mode': args.mode,
        'started_at': datetime.now(timezone.utc).isoformat(),
        'finished_at': None,
        'assumptions': [],
        'provider_config': {
            'primary': 'search-cli',
            'scholarly': None,
        },
        'report_dir': out_dir,
        'artifact_paths': artifact_paths,
        'continuation': None,
    }

    manifest_path = os.path.join(out_dir, 'run_manifest.json')
    # Explicit UTF-8: the query is dumped with ensure_ascii=False, so the
    # locale default encoding could fail on non-ASCII research questions.
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
        f.write('\n')

    # Create empty artifact files; the report itself is not pre-created.
    for name in ('sources', 'evidence', 'claims'):
        p = os.path.join(out_dir, artifact_paths[name])
        if not os.path.exists(p):
            with open(p, 'w', encoding='utf-8'):
                pass

    print(json.dumps({'status': 'ok', 'manifest': manifest_path, 'dir': out_dir}))
145
-
146
-
147
def cmd_register_source(args: argparse.Namespace) -> None:
    """Register a source, append to sources.jsonl, print source_id.

    ``args.json`` must be a JSON object carrying ``raw_url`` (or ``url``);
    ``args.dir`` is the run directory containing sources.jsonl.

    Prints a JSON status object: ``registered`` for a new source, or
    ``duplicate`` when a row with the same source_id already exists (in
    which case nothing is appended). On invalid input a JSON error object
    is written to stderr and the process exits with status 1.
    """
    # Report malformed input the same way as other validation failures
    # instead of letting a traceback escape.
    try:
        data = json.loads(args.json)
    except json.JSONDecodeError as exc:
        print(json.dumps({'error': f'invalid JSON: {exc}'}), file=sys.stderr)
        sys.exit(1)
    if not isinstance(data, dict):
        print(json.dumps({'error': '--json must be a JSON object'}), file=sys.stderr)
        sys.exit(1)

    # Fall back to "url" when "raw_url" is absent OR present-but-empty.
    raw_url = data.get('raw_url') or data.get('url') or ''
    if not raw_url:
        print(json.dumps({'error': 'raw_url is required'}), file=sys.stderr)
        sys.exit(1)

    # A caller-supplied canonical_locator is trusted as-is.
    canonical = data.get('canonical_locator') or canonicalize_locator(raw_url)
    source_id = compute_source_id(canonical)

    sources_path = os.path.join(args.dir, 'sources.jsonl')

    # Check for duplicate
    existing = read_jsonl(sources_path)
    if any(row.get('source_id') == source_id for row in existing):
        print(json.dumps({
            'status': 'duplicate',
            'source_id': source_id,
            'canonical_locator': canonical,
        }))
        return

    source = {
        'source_id': source_id,
        'canonical_locator': canonical,
        'raw_url': raw_url,
        'title': data.get('title', ''),
        'authors': data.get('authors'),
        'year': data.get('year'),
        'source_type': data.get('source_type', 'web'),
        'metadata_status': data.get('metadata_status', 'unverified'),
        'registered_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(sources_path, source)
    print(json.dumps({
        'status': 'registered',
        'source_id': source_id,
        'canonical_locator': canonical,
    }))
188
-
189
-
190
def cmd_assign_display_numbers(args: argparse.Namespace) -> None:
    """Read sources.jsonl, assign stable display numbers in registration order.

    Prints a JSON object mapping each source_id to a 1-based display
    number. Numbers are consecutive over *unique* source IDs in order of
    first appearance, so repeated rows for one source do not leave gaps and
    the numbering agrees with what export-bibliography emits.
    """
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    mapping: dict[str, int] = {}
    for src in sources:
        sid = src['source_id']
        if sid not in mapping:
            # Number by count of unique IDs seen, not by row position.
            mapping[sid] = len(mapping) + 1

    print(json.dumps(mapping, indent=2))
202
-
203
-
204
def cmd_export_bibliography(args: argparse.Namespace) -> None:
    """Generate bibliography from sources.jsonl.

    ``args.dir`` is the run directory; ``args.style`` is ``markdown`` or
    ``json``. Sources are deduplicated by source_id (first occurrence
    wins) and numbered from 1 in that order. Output goes to stdout. An
    unknown style prints an error to stderr and exits with status 1.
    """
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    # Deduplicate by source_id, preserve order
    seen = set()
    unique = []
    for src in sources:
        if src['source_id'] not in seen:
            seen.add(src['source_id'])
            unique.append(src)

    style = args.style

    if style == 'markdown':
        lines = ['## Bibliography', '']
        for i, src in enumerate(unique, 1):
            authors = src.get('authors')
            if not authors:
                author_str = ''
            elif len(authors) == 1:
                author_str = f'{authors[0]}. '
            elif len(authors) == 2:
                author_str = f'{authors[0]} & {authors[1]}. '
            else:
                author_str = f'{authors[0]} et al. '

            year_str = f'({src["year"]})' if src.get('year') else '(n.d.)'
            # register-source stores a missing title as '', so fall back on
            # any empty value, not only on an absent key.
            title = src.get('title') or 'Untitled'
            url = src.get('raw_url', '')
            lines.append(f'[{i}] {author_str}{year_str}. [{title}]({url})')
        print('\n'.join(lines))

    elif style == 'json':
        out = [
            {
                'display_number': i,
                'source_id': src['source_id'],
                'canonical_locator': src['canonical_locator'],
                'title': src.get('title', ''),
                'authors': src.get('authors'),
                'year': src.get('year'),
                'raw_url': src.get('raw_url', ''),
            }
            for i, src in enumerate(unique, 1)
        ]
        print(json.dumps(out, indent=2, ensure_ascii=False))

    else:
        print(f'Unknown style: {style}', file=sys.stderr)
        sys.exit(1)
255
-
256
-
257
- # ---------------------------------------------------------------------------
258
- # CLI entry point
259
- # ---------------------------------------------------------------------------
260
-
261
def main() -> None:
    """Parse the command line and run the selected citation_manager subcommand."""
    # Subcommand name -> handler function.
    handlers = {
        'init-run': cmd_init_run,
        'register-source': cmd_register_source,
        'assign-display-numbers': cmd_assign_display_numbers,
        'export-bibliography': cmd_export_bibliography,
    }

    cli = argparse.ArgumentParser(
        prog='citation_manager',
        description='Stable source identity and run manifest management for deep-research v3.0',
    )
    commands = cli.add_subparsers(dest='command', required=True)

    # init-run
    init_cmd = commands.add_parser('init-run', help='Create run manifest and empty artifact files')
    init_cmd.add_argument('--out-dir', required=True, help='Output directory for the research run')
    init_cmd.add_argument('--query', default='', help='Original research question')
    init_cmd.add_argument('--mode', default='standard', choices=['quick', 'standard', 'deep', 'ultradeep'])

    # register-source
    register_cmd = commands.add_parser('register-source', help='Register a source and return its stable ID')
    register_cmd.add_argument('--json', required=True, help='JSON object with at least raw_url and title')
    register_cmd.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # assign-display-numbers
    number_cmd = commands.add_parser('assign-display-numbers', help='Map stable source IDs to display numbers')
    number_cmd.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # export-bibliography
    biblio_cmd = commands.add_parser('export-bibliography', help='Generate bibliography from sources')
    biblio_cmd.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')
    biblio_cmd.add_argument('--style', default='markdown', choices=['markdown', 'json'])

    parsed = cli.parse_args()
    handler = handlers[parsed.command]
    handler(parsed)


if __name__ == '__main__':
    main()
@@ -1,205 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Evidence Store — append-only evidence persistence for deep-research v3.0.
4
-
5
- CLI subcommands:
6
- init Create empty evidence.jsonl in a run directory
7
- add Append an evidence row, return evidence_id
8
- list List evidence rows, optionally filtered by source_id
9
- export Export evidence as JSON array
10
-
11
- Evidence identity:
12
- evidence_id = sha256(source_id + normalized_quote + locator)[:16]
13
-
14
- All state is append-only JSONL. Evidence is never modified after capture.
15
- """
16
-
17
- import argparse
18
- import hashlib
19
- import json
20
- import os
21
- import re
22
- import sys
23
- from datetime import datetime, timezone
24
-
25
-
26
- # ---------------------------------------------------------------------------
27
- # Evidence ID computation
28
- # ---------------------------------------------------------------------------
29
-
30
# Matches any run of one or more whitespace characters.
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_quote(quote: str) -> str:
    """Return ``quote`` trimmed, with whitespace runs collapsed, lowercased.

    Used so that the same quote hashes identically regardless of spacing,
    line breaks, or letter case.
    """
    trimmed = quote.strip()
    single_spaced = _WHITESPACE_RE.sub(' ', trimmed)
    return single_spaced.lower()
36
-
37
-
38
def compute_evidence_id(source_id: str, quote: str, locator: str | None) -> str:
    """Return the 16-hex-character evidence ID.

    The ID is sha256(source_id + normalized_quote + locator)[:16], where
    the three parts are concatenated without a separator and a missing
    locator contributes the empty string.
    """
    pieces = [source_id, normalize_quote(quote), locator or '']
    digest = hashlib.sha256(''.join(pieces).encode('utf-8'))
    return digest.hexdigest()[:16]
42
-
43
-
44
- # ---------------------------------------------------------------------------
45
- # JSONL helpers (shared pattern with citation_manager)
46
- # ---------------------------------------------------------------------------
47
-
48
def append_jsonl(path: str, obj: dict) -> None:
    """Append ``obj`` to the JSONL file at ``path`` as one JSON line.

    The file is created if it does not exist. It is opened as UTF-8
    explicitly because the row is serialized with ``ensure_ascii=False``;
    relying on the locale default encoding can fail or mis-encode
    non-ASCII quotes on platforms whose default is not UTF-8.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')
51
-
52
-
53
def read_jsonl(path: str) -> list[dict]:
    """Read a JSONL file and return its rows in file order.

    Blank and whitespace-only lines are skipped. A missing file yields an
    empty list. The file is decoded as UTF-8 explicitly, matching how rows
    are written (``ensure_ascii=False``), instead of depending on the
    locale default encoding.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    if not os.path.exists(path):
        return []
    with open(path, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [json.loads(line) for line in stripped if line]
63
-
64
-
65
- # ---------------------------------------------------------------------------
66
- # Subcommands
67
- # ---------------------------------------------------------------------------
68
-
69
def cmd_init(args: argparse.Namespace) -> None:
    """Ensure an evidence.jsonl exists in the run directory.

    An existing file is left untouched; otherwise the directory is created
    as needed and an empty file is written. Prints a JSON status object
    with the absolute path of the file.
    """
    run_dir = os.path.abspath(args.dir)
    evidence_file = os.path.join(run_dir, 'evidence.jsonl')

    already_present = os.path.exists(evidence_file)
    if not already_present:
        os.makedirs(run_dir, exist_ok=True)
        with open(evidence_file, 'w'):
            pass  # creating the empty file is the whole point

    status = {'status': 'ok', 'path': evidence_file}
    print(json.dumps(status))
77
-
78
-
79
def cmd_add(args: argparse.Namespace) -> None:
    """Append evidence row, print evidence_id.

    ``args.json`` must be a JSON object with non-empty ``source_id`` and
    ``quote``; ``locator``, ``evidence_type`` and ``retrieval_query`` are
    optional. ``args.dir`` is the run directory containing evidence.jsonl.

    Prints a JSON status object: ``added`` for a new row, or ``duplicate``
    when a row with the same evidence_id already exists (nothing is
    appended). On invalid input a JSON error object is written to stderr
    and the process exits with status 1.
    """
    # Report malformed input the same way as other validation failures
    # instead of letting a traceback escape.
    try:
        data = json.loads(args.json)
    except json.JSONDecodeError as exc:
        print(json.dumps({'error': f'invalid JSON: {exc}'}), file=sys.stderr)
        sys.exit(1)
    if not isinstance(data, dict):
        print(json.dumps({'error': '--json must be a JSON object'}), file=sys.stderr)
        sys.exit(1)

    source_id = data.get('source_id', '')
    quote = data.get('quote', '')
    if not source_id or not quote:
        print(json.dumps({'error': 'source_id and quote are required'}), file=sys.stderr)
        sys.exit(1)

    locator = data.get('locator')
    evidence_id = compute_evidence_id(source_id, quote, locator)
    evidence_path = os.path.join(args.dir, 'evidence.jsonl')

    # Check for duplicate
    existing = read_jsonl(evidence_path)
    if any(row.get('evidence_id') == evidence_id for row in existing):
        print(json.dumps({
            'status': 'duplicate',
            'evidence_id': evidence_id,
        }))
        return

    valid_types = {'direct_quote', 'paraphrase', 'data_point', 'figure_reference', 'methodology'}
    evidence_type = data.get('evidence_type', 'direct_quote')
    if evidence_type not in valid_types:
        # Best-effort: unrecognized types are stored as the default rather than rejected.
        evidence_type = 'direct_quote'

    row = {
        'evidence_id': evidence_id,
        'source_id': source_id,
        'retrieval_query': data.get('retrieval_query'),
        'locator': locator,
        'quote': quote,
        'evidence_type': evidence_type,
        'captured_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(evidence_path, row)
    print(json.dumps({
        'status': 'added',
        'evidence_id': evidence_id,
        'source_id': source_id,
    }))
122
-
123
-
124
def cmd_list(args: argparse.Namespace) -> None:
    """Print evidence rows as JSON, optionally restricted to one source.

    Rows come from evidence.jsonl in ``args.dir``. When ``args.source_id``
    is set, only rows with that source_id are kept. Rows are deduplicated
    by evidence_id (first occurrence wins) and printed together with their
    count.
    """
    rows = read_jsonl(os.path.join(args.dir, 'evidence.jsonl'))

    wanted_source = args.source_id
    if wanted_source:
        rows = [row for row in rows if row.get('source_id') == wanted_source]

    # Keep the first row seen for each evidence_id, in file order.
    first_by_id = {}
    for row in rows:
        first_by_id.setdefault(row.get('evidence_id'), row)
    unique = list(first_by_id.values())

    payload = {
        'count': len(unique),
        'evidence': unique,
    }
    print(json.dumps(payload, indent=2, ensure_ascii=False))
145
-
146
-
147
def cmd_export(args: argparse.Namespace) -> None:
    """Print every evidence row from evidence.jsonl as one JSON array.

    Rows are deduplicated by evidence_id; the first occurrence of each ID
    is kept and file order is preserved.
    """
    rows = read_jsonl(os.path.join(args.dir, 'evidence.jsonl'))

    # Keep the first row seen for each evidence_id, in file order.
    first_by_id = {}
    for row in rows:
        first_by_id.setdefault(row.get('evidence_id'), row)

    print(json.dumps(list(first_by_id.values()), indent=2, ensure_ascii=False))
162
-
163
-
164
- # ---------------------------------------------------------------------------
165
- # CLI entry point
166
- # ---------------------------------------------------------------------------
167
-
168
def main() -> None:
    """Parse the command line and run the selected evidence_store subcommand."""
    # Subcommand name -> handler function.
    handlers = {
        'init': cmd_init,
        'add': cmd_add,
        'list': cmd_list,
        'export': cmd_export,
    }

    cli = argparse.ArgumentParser(
        prog='evidence_store',
        description='Append-only evidence persistence for deep-research v3.0',
    )
    commands = cli.add_subparsers(dest='command', required=True)

    # init
    init_cmd = commands.add_parser('init', help='Create empty evidence.jsonl')
    init_cmd.add_argument('--dir', required=True, help='Run directory')

    # add
    add_cmd = commands.add_parser('add', help='Append evidence row')
    add_cmd.add_argument('--json', required=True, help='JSON with source_id, quote, locator, evidence_type, retrieval_query')
    add_cmd.add_argument('--dir', required=True, help='Run directory containing evidence.jsonl')

    # list
    list_cmd = commands.add_parser('list', help='List evidence rows')
    list_cmd.add_argument('--dir', required=True, help='Run directory')
    list_cmd.add_argument('--source-id', default=None, help='Filter by source_id')

    # export
    export_cmd = commands.add_parser('export', help='Export all evidence as JSON array')
    export_cmd.add_argument('--dir', required=True, help='Run directory')

    parsed = cli.parse_args()
    handler = handlers[parsed.command]
    handler(parsed)


if __name__ == '__main__':
    main()