@clawos-dev/clawd 0.2.50-beta.77.3a9364e → 0.2.51-beta.78.2024c11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/persona-defaults/persona-clawd-helper/CLAUDE.md +1 -1
- package/dist/persona-defaults/persona-knowledge-base/CLAUDE.md +19 -0
- package/dist/persona-defaults/persona-researcher/CLAUDE.md +20 -1
- package/package.json +1 -1
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/SKILL.md +0 -187
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/archive-template.md +0 -21
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/article-template.md +0 -20
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/index-template.md +0 -18
- package/dist/persona-defaults/persona-knowledge-base/.claude/skills/karpathy-llm-wiki/references/raw-template.md +0 -7
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/README.md +0 -119
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/SKILL.md +0 -108
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/continuation.md +0 -167
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/html-generation.md +0 -103
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/methodology.md +0 -421
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/quality-gates.md +0 -192
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/report-assembly.md +0 -130
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/reference/weasyprint_guidelines.md +0 -324
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/requirements.txt +0 -14
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/claim.schema.json +0 -49
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/evidence.schema.json +0 -43
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/run_manifest.schema.json +0 -97
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/schemas/source.schema.json +0 -49
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/citation_manager.py +0 -300
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/evidence_store.py +0 -205
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/extract_claims.py +0 -358
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/md_to_html.py +0 -330
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/research_engine.py +0 -584
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/source_evaluator.py +0 -292
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/validate_report.py +0 -354
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_citations.py +0 -426
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_claim_support.py +0 -344
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/scripts/verify_html.py +0 -220
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/mckinsey_report_template.html +0 -443
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/templates/report_template.md +0 -414
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/invalid_report.md +0 -27
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/fixtures/valid_report.md +0 -114
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_citation_manager.py +0 -195
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_evidence_store.py +0 -166
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_extract_claims.py +0 -213
- package/dist/persona-defaults/persona-researcher/.claude/skills/deep-research/tests/test_verify_claim_support.py +0 -230
- package/dist/persona-defaults/persona-researcher/skills-lock.json +0 -11
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
-
"title": "Evidence",
|
|
4
|
-
"description": "A piece of evidence extracted from a source. evidence_id = sha256(source_id + normalized_quote + locator)[:16].",
|
|
5
|
-
"type": "object",
|
|
6
|
-
"required": ["evidence_id", "source_id", "quote", "evidence_type", "captured_at"],
|
|
7
|
-
"properties": {
|
|
8
|
-
"evidence_id": {
|
|
9
|
-
"type": "string",
|
|
10
|
-
"pattern": "^[0-9a-f]{16}$",
|
|
11
|
-
"description": "sha256(source_id + normalized_quote + locator)[:16]"
|
|
12
|
-
},
|
|
13
|
-
"source_id": {
|
|
14
|
-
"type": "string",
|
|
15
|
-
"pattern": "^[0-9a-f]{16}$",
|
|
16
|
-
"description": "References a source in sources.jsonl"
|
|
17
|
-
},
|
|
18
|
-
"retrieval_query": {
|
|
19
|
-
"type": ["string", "null"],
|
|
20
|
-
"description": "The search query or prompt that led to this evidence",
|
|
21
|
-
"default": null
|
|
22
|
-
},
|
|
23
|
-
"locator": {
|
|
24
|
-
"type": ["string", "null"],
|
|
25
|
-
"description": "Page number, section heading, URL fragment, or timestamp within the source",
|
|
26
|
-
"default": null
|
|
27
|
-
},
|
|
28
|
-
"quote": {
|
|
29
|
-
"type": "string",
|
|
30
|
-
"description": "Exact or near-exact text extracted from the source"
|
|
31
|
-
},
|
|
32
|
-
"evidence_type": {
|
|
33
|
-
"type": "string",
|
|
34
|
-
"enum": ["direct_quote", "paraphrase", "data_point", "figure_reference", "methodology"],
|
|
35
|
-
"description": "How the evidence was captured"
|
|
36
|
-
},
|
|
37
|
-
"captured_at": {
|
|
38
|
-
"type": "string",
|
|
39
|
-
"format": "date-time"
|
|
40
|
-
}
|
|
41
|
-
},
|
|
42
|
-
"additionalProperties": false
|
|
43
|
-
}
|
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
-
"title": "RunManifest",
|
|
4
|
-
"description": "Manifest for a single research run. Created at init, updated throughout.",
|
|
5
|
-
"type": "object",
|
|
6
|
-
"required": ["version", "query", "mode", "started_at", "report_dir", "artifact_paths"],
|
|
7
|
-
"properties": {
|
|
8
|
-
"version": {
|
|
9
|
-
"type": "string",
|
|
10
|
-
"const": "3.0.0"
|
|
11
|
-
},
|
|
12
|
-
"query": {
|
|
13
|
-
"type": "string",
|
|
14
|
-
"description": "Original research question"
|
|
15
|
-
},
|
|
16
|
-
"mode": {
|
|
17
|
-
"type": "string",
|
|
18
|
-
"enum": ["quick", "standard", "deep", "ultradeep"]
|
|
19
|
-
},
|
|
20
|
-
"started_at": {
|
|
21
|
-
"type": "string",
|
|
22
|
-
"format": "date-time"
|
|
23
|
-
},
|
|
24
|
-
"finished_at": {
|
|
25
|
-
"type": ["string", "null"],
|
|
26
|
-
"format": "date-time",
|
|
27
|
-
"default": null
|
|
28
|
-
},
|
|
29
|
-
"assumptions": {
|
|
30
|
-
"type": "array",
|
|
31
|
-
"items": {
|
|
32
|
-
"type": "object",
|
|
33
|
-
"required": ["assumption_id", "text", "materiality", "status"],
|
|
34
|
-
"properties": {
|
|
35
|
-
"assumption_id": {
|
|
36
|
-
"type": "string",
|
|
37
|
-
"pattern": "^asm_[0-9a-f]{8}$"
|
|
38
|
-
},
|
|
39
|
-
"text": { "type": "string" },
|
|
40
|
-
"materiality": {
|
|
41
|
-
"type": "string",
|
|
42
|
-
"enum": ["low", "medium", "high"]
|
|
43
|
-
},
|
|
44
|
-
"status": {
|
|
45
|
-
"type": "string",
|
|
46
|
-
"enum": ["implicit", "user_confirmed", "evidence_validated"]
|
|
47
|
-
}
|
|
48
|
-
},
|
|
49
|
-
"additionalProperties": false
|
|
50
|
-
},
|
|
51
|
-
"default": []
|
|
52
|
-
},
|
|
53
|
-
"provider_config": {
|
|
54
|
-
"type": "object",
|
|
55
|
-
"properties": {
|
|
56
|
-
"primary": {
|
|
57
|
-
"type": "string",
|
|
58
|
-
"description": "Primary search provider (e.g. WebSearch)"
|
|
59
|
-
},
|
|
60
|
-
"scholarly": {
|
|
61
|
-
"type": ["string", "null"],
|
|
62
|
-
"description": "Scholarly API provider if configured (e.g. openalex, semantic_scholar)"
|
|
63
|
-
}
|
|
64
|
-
},
|
|
65
|
-
"default": { "primary": "search-cli", "scholarly": null }
|
|
66
|
-
},
|
|
67
|
-
"report_dir": {
|
|
68
|
-
"type": "string",
|
|
69
|
-
"description": "Absolute path to the report directory"
|
|
70
|
-
},
|
|
71
|
-
"artifact_paths": {
|
|
72
|
-
"type": "object",
|
|
73
|
-
"required": ["sources", "evidence", "claims", "report"],
|
|
74
|
-
"properties": {
|
|
75
|
-
"sources": { "type": "string", "default": "sources.jsonl" },
|
|
76
|
-
"evidence": { "type": "string", "default": "evidence.jsonl" },
|
|
77
|
-
"claims": { "type": "string", "default": "claims.jsonl" },
|
|
78
|
-
"report": { "type": "string", "default": "report.md" }
|
|
79
|
-
},
|
|
80
|
-
"additionalProperties": false
|
|
81
|
-
},
|
|
82
|
-
"continuation": {
|
|
83
|
-
"type": ["object", "null"],
|
|
84
|
-
"description": "Populated when resuming a previous run",
|
|
85
|
-
"properties": {
|
|
86
|
-
"previous_run_manifest": { "type": "string" },
|
|
87
|
-
"resumed_at": { "type": "string", "format": "date-time" },
|
|
88
|
-
"sections_completed": {
|
|
89
|
-
"type": "array",
|
|
90
|
-
"items": { "type": "string" }
|
|
91
|
-
}
|
|
92
|
-
},
|
|
93
|
-
"default": null
|
|
94
|
-
}
|
|
95
|
-
},
|
|
96
|
-
"additionalProperties": false
|
|
97
|
-
}
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
-
"title": "Source",
|
|
4
|
-
"description": "A research source with stable identity. source_id = sha256(canonical_locator)[:16].",
|
|
5
|
-
"type": "object",
|
|
6
|
-
"required": ["source_id", "canonical_locator", "raw_url", "title", "source_type", "metadata_status", "registered_at"],
|
|
7
|
-
"properties": {
|
|
8
|
-
"source_id": {
|
|
9
|
-
"type": "string",
|
|
10
|
-
"pattern": "^[0-9a-f]{16}$",
|
|
11
|
-
"description": "sha256(canonical_locator)[:16] — stable across edits and continuation"
|
|
12
|
-
},
|
|
13
|
-
"canonical_locator": {
|
|
14
|
-
"type": "string",
|
|
15
|
-
"description": "Canonical identifier: doi:10.1038/..., arxiv:2305.14251, or normalized URL (scheme+host+path, no fragment/tracking params)"
|
|
16
|
-
},
|
|
17
|
-
"raw_url": {
|
|
18
|
-
"type": "string",
|
|
19
|
-
"description": "Original URL as retrieved, before normalization"
|
|
20
|
-
},
|
|
21
|
-
"title": {
|
|
22
|
-
"type": "string"
|
|
23
|
-
},
|
|
24
|
-
"authors": {
|
|
25
|
-
"type": ["array", "null"],
|
|
26
|
-
"items": { "type": "string" },
|
|
27
|
-
"default": null
|
|
28
|
-
},
|
|
29
|
-
"year": {
|
|
30
|
-
"type": ["string", "null"],
|
|
31
|
-
"default": null
|
|
32
|
-
},
|
|
33
|
-
"source_type": {
|
|
34
|
-
"type": "string",
|
|
35
|
-
"enum": ["web", "academic", "documentation", "code", "news", "government", "book"]
|
|
36
|
-
},
|
|
37
|
-
"metadata_status": {
|
|
38
|
-
"type": "string",
|
|
39
|
-
"enum": ["unverified", "doi_verified", "url_verified", "title_matched"],
|
|
40
|
-
"description": "How far metadata has been verified"
|
|
41
|
-
},
|
|
42
|
-
"registered_at": {
|
|
43
|
-
"type": "string",
|
|
44
|
-
"format": "date-time",
|
|
45
|
-
"description": "ISO 8601 timestamp when source was registered"
|
|
46
|
-
}
|
|
47
|
-
},
|
|
48
|
-
"additionalProperties": false
|
|
49
|
-
}
|
|
@@ -1,300 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Citation Manager — stable source identity and run manifest management.
|
|
4
|
-
|
|
5
|
-
CLI subcommands:
|
|
6
|
-
init-run Create run_manifest.json + empty artifact JSONL files
|
|
7
|
-
register-source Append a source to sources.jsonl, return source_id
|
|
8
|
-
assign-display-numbers Generate stable_id -> display_number mapping
|
|
9
|
-
export-bibliography Render bibliography from sources.jsonl
|
|
10
|
-
|
|
11
|
-
Source identity:
|
|
12
|
-
source_id = sha256(canonical_locator)[:16]
|
|
13
|
-
canonical_locator = doi:..., arxiv:..., or normalized URL
|
|
14
|
-
|
|
15
|
-
All state is append-only JSONL. No mutable citation numbers in state files.
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
import argparse
|
|
19
|
-
import hashlib
|
|
20
|
-
import json
|
|
21
|
-
import os
|
|
22
|
-
import re
|
|
23
|
-
import sys
|
|
24
|
-
from datetime import datetime, timezone
|
|
25
|
-
from urllib.parse import urlparse, urlunparse
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
# ---------------------------------------------------------------------------
|
|
29
|
-
# Canonical locator normalization
|
|
30
|
-
# ---------------------------------------------------------------------------
|
|
31
|
-
|
|
32
|
-
DOI_RE = re.compile(r'(?:https?://(?:dx\.)?doi\.org/|doi:)(10\.\d{4,}/\S+)', re.IGNORECASE)
ARXIV_RE = re.compile(r'(?:https?://arxiv\.org/abs/|arxiv:)(\d{4}\.\d{4,}(?:v\d+)?)', re.IGNORECASE)

# URL query params that are tracking noise, not content identifiers
TRACKING_PARAMS = frozenset([
    'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
    'ref', 'source', 'fbclid', 'gclid', 'mc_cid', 'mc_eid',
])

# Ports implied by the scheme; dropped so "https://h:443/x" and "https://h/x"
# normalize to the same locator.
_DEFAULT_PORTS = {'http': 80, 'https': 443}


def canonicalize_locator(raw_url: str) -> str:
    """Derive a canonical locator from a raw URL or identifier string.

    Priority: DOI > arXiv > normalized URL.

    * A DOI (``doi:...`` or a ``doi.org`` URL) becomes ``doi:<id>`` with
      trailing dots removed.
    * An arXiv identifier (``arxiv:...`` or an ``arxiv.org/abs`` URL) becomes
      ``arxiv:<id>``.
    * Anything else is treated as a URL: scheme and host are lower-cased
      (scheme defaults to ``https``), a trailing slash on the path is removed,
      the fragment is dropped, tracking query parameters are removed and the
      remaining query parts are sorted.

    An explicit non-default port is kept in the result, so two services on the
    same host but different ports do not collapse into one locator. IPv6 hosts
    are re-bracketed so the rebuilt URL stays well-formed.
    """
    # DOI
    m = DOI_RE.search(raw_url)
    if m:
        return f'doi:{m.group(1).rstrip(".")}'

    # arXiv
    m = ARXIV_RE.search(raw_url)
    if m:
        return f'arxiv:{m.group(1)}'

    # Normalized URL: lowercase scheme+host, strip fragment and tracking params
    parsed = urlparse(raw_url)
    scheme = (parsed.scheme or 'https').lower()
    host = (parsed.hostname or '').lower()
    if ':' in host:
        # ``hostname`` strips the brackets from IPv6 literals; restore them.
        host = f'[{host}]'

    try:
        port = parsed.port
    except ValueError:
        # Malformed port text: ignore it rather than fail the registration.
        port = None
    if host and port is not None and port != _DEFAULT_PORTS.get(scheme):
        host = f'{host}:{port}'

    path = parsed.path.rstrip('/')

    # Filter query params
    if parsed.query:
        kept = [
            part
            for part in parsed.query.split('&')
            if part.split('=', 1)[0].lower() not in TRACKING_PARAMS
        ]
        query = '&'.join(sorted(kept))
    else:
        query = ''
    return urlunparse((scheme, host, path, '', query, ''))
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
def compute_source_id(canonical_locator: str) -> str:
    """Return the stable source ID for *canonical_locator*.

    The ID is the first 16 hexadecimal characters of the SHA-256 digest of
    the UTF-8 encoded locator.
    """
    hasher = hashlib.sha256()
    hasher.update(canonical_locator.encode('utf-8'))
    full_hex = hasher.hexdigest()
    return full_hex[:16]
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
# ---------------------------------------------------------------------------
|
|
81
|
-
# JSONL helpers
|
|
82
|
-
# ---------------------------------------------------------------------------
|
|
83
|
-
|
|
84
|
-
def append_jsonl(path: str, obj: dict) -> None:
    """Append *obj* to the file at *path* as a single JSON line.

    The file is opened in append mode (created if missing). Non-ASCII
    characters are written verbatim (``ensure_ascii=False``), so the file is
    always encoded as UTF-8 instead of the platform's locale encoding.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def read_jsonl(path: str) -> list[dict]:
    """Read a JSONL file and return its rows in file order.

    Returns an empty list when *path* does not exist. Blank lines are
    skipped. The file is decoded as UTF-8 rather than the platform's locale
    encoding, because rows may contain non-ASCII text written verbatim.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    if not os.path.exists(path):
        return []
    with open(path, encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
# ---------------------------------------------------------------------------
|
|
102
|
-
# Subcommands
|
|
103
|
-
# ---------------------------------------------------------------------------
|
|
104
|
-
|
|
105
|
-
def cmd_init_run(args: argparse.Namespace) -> None:
    """Create run_manifest.json and empty JSONL artifact files.

    Reads ``args.out_dir``, ``args.query`` and ``args.mode``. The output
    directory is created if needed. The manifest is (re)written on every call;
    the ``sources``, ``evidence`` and ``claims`` artifact files are only
    created when they do not exist yet, so existing rows are preserved.
    A one-line JSON status object is printed to stdout.
    """
    out_dir = os.path.abspath(args.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    artifact_paths = {
        'sources': 'sources.jsonl',
        'evidence': 'evidence.jsonl',
        'claims': 'claims.jsonl',
        'report': 'report.md',
    }

    manifest = {
        'version': '3.0.0',
        'query': args.query or '',
        'mode': args.mode,
        'started_at': datetime.now(timezone.utc).isoformat(),
        'finished_at': None,
        'assumptions': [],
        'provider_config': {
            'primary': 'search-cli',
            'scholarly': None,
        },
        'report_dir': out_dir,
        'artifact_paths': artifact_paths,
        'continuation': None,
    }

    manifest_path = os.path.join(out_dir, 'run_manifest.json')
    # ensure_ascii=False emits raw non-ASCII text, so pin the encoding to
    # UTF-8 instead of relying on the locale default.
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
        f.write('\n')

    # Create empty artifact files (report.md is intentionally not created here)
    for name in ('sources', 'evidence', 'claims'):
        p = os.path.join(out_dir, artifact_paths[name])
        if not os.path.exists(p):
            with open(p, 'w', encoding='utf-8'):
                pass

    print(json.dumps({'status': 'ok', 'manifest': manifest_path, 'dir': out_dir}))
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
def cmd_register_source(args: argparse.Namespace) -> None:
    """Register a source, append to sources.jsonl, print source_id.

    ``args.json`` must be a JSON object containing ``raw_url`` (or the alias
    ``url``); ``args.dir`` is the run directory holding ``sources.jsonl``.

    If a row with the same ``source_id`` already exists, nothing is written
    and a ``duplicate`` status is printed; otherwise the new row is appended
    and a ``registered`` status is printed. Invalid input (malformed JSON, a
    non-object payload, or a missing URL) prints a JSON error object to stderr
    and exits with status 1.
    """
    try:
        data = json.loads(args.json)
    except json.JSONDecodeError as exc:
        print(json.dumps({'error': f'invalid JSON: {exc}'}), file=sys.stderr)
        sys.exit(1)
    if not isinstance(data, dict):
        print(json.dumps({'error': '--json must be a JSON object'}), file=sys.stderr)
        sys.exit(1)

    # Fall back to the ``url`` alias when ``raw_url`` is absent, empty or null.
    raw_url = data.get('raw_url') or data.get('url') or ''
    if not raw_url:
        print(json.dumps({'error': 'raw_url is required'}), file=sys.stderr)
        sys.exit(1)

    canonical = data.get('canonical_locator') or canonicalize_locator(raw_url)
    source_id = compute_source_id(canonical)

    sources_path = os.path.join(args.dir, 'sources.jsonl')

    # Check for duplicate
    already_registered = any(
        row.get('source_id') == source_id for row in read_jsonl(sources_path)
    )
    if already_registered:
        print(json.dumps({
            'status': 'duplicate',
            'source_id': source_id,
            'canonical_locator': canonical,
        }))
        return

    source = {
        'source_id': source_id,
        'canonical_locator': canonical,
        'raw_url': raw_url,
        'title': data.get('title', ''),
        'authors': data.get('authors'),
        'year': data.get('year'),
        'source_type': data.get('source_type', 'web'),
        'metadata_status': data.get('metadata_status', 'unverified'),
        'registered_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(sources_path, source)
    print(json.dumps({
        'status': 'registered',
        'source_id': source_id,
        'canonical_locator': canonical,
    }))
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
def cmd_assign_display_numbers(args: argparse.Namespace) -> None:
    """Read sources.jsonl, assign stable display numbers in registration order.

    Prints a JSON object mapping each ``source_id`` to a 1-based display
    number. Numbers are consecutive over *unique* sources in first-seen order,
    so repeated rows for the same source do not leave gaps and the numbering
    matches the one ``export-bibliography`` produces after deduplication.
    """
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    mapping: dict[str, int] = {}
    for src in sources:
        sid = src['source_id']
        if sid not in mapping:
            # Number by count of unique sources seen, not by row index.
            mapping[sid] = len(mapping) + 1

    print(json.dumps(mapping, indent=2))
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def _format_author_prefix(authors) -> str:
    """Return the author prefix of a markdown bibliography entry ('' if none)."""
    if not authors:
        return ''
    if isinstance(authors, str):
        # Tolerate a bare string; otherwise its characters would be counted
        # as individual authors.
        authors = [authors]
    if len(authors) == 1:
        return f'{authors[0]}. '
    if len(authors) == 2:
        return f'{authors[0]} & {authors[1]}. '
    return f'{authors[0]} et al. '


def cmd_export_bibliography(args: argparse.Namespace) -> None:
    """Generate bibliography from sources.jsonl.

    Rows are deduplicated by ``source_id`` (first occurrence wins) and
    numbered from 1 in that order. ``args.style`` selects the output:

    * ``markdown`` -- a ``## Bibliography`` section with one line per source.
      A missing or empty title is rendered as ``Untitled`` and a missing year
      as ``(n.d.)``.
    * ``json`` -- a JSON array of entries with ``display_number`` and the
      stored source fields.

    Any other style prints an error to stderr and exits with status 1.
    """
    sources_path = os.path.join(args.dir, 'sources.jsonl')
    sources = read_jsonl(sources_path)

    # Deduplicate by source_id, preserve order (dicts keep insertion order)
    first_by_id = {}
    for src in sources:
        first_by_id.setdefault(src['source_id'], src)
    unique = list(first_by_id.values())

    style = args.style

    if style == 'markdown':
        lines = ['## Bibliography', '']
        for i, src in enumerate(unique, 1):
            author_str = _format_author_prefix(src.get('authors'))
            year_str = f'({src["year"]})' if src.get('year') else '(n.d.)'
            # register-source stores '' when no title was given, so fall back
            # on any falsy value rather than only on a missing key.
            title = src.get('title') or 'Untitled'
            url = src.get('raw_url', '')
            lines.append(f'[{i}] {author_str}{year_str}. [{title}]({url})')
        print('\n'.join(lines))

    elif style == 'json':
        out = [
            {
                'display_number': i,
                'source_id': src['source_id'],
                'canonical_locator': src['canonical_locator'],
                'title': src.get('title', ''),
                'authors': src.get('authors'),
                'year': src.get('year'),
                'raw_url': src.get('raw_url', ''),
            }
            for i, src in enumerate(unique, 1)
        ]
        print(json.dumps(out, indent=2, ensure_ascii=False))

    else:
        print(f'Unknown style: {style}', file=sys.stderr)
        sys.exit(1)
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
# ---------------------------------------------------------------------------
|
|
258
|
-
# CLI entry point
|
|
259
|
-
# ---------------------------------------------------------------------------
|
|
260
|
-
|
|
261
|
-
def main() -> None:
    """Parse the command line and run the selected citation_manager subcommand."""
    cli = argparse.ArgumentParser(
        prog='citation_manager',
        description='Stable source identity and run manifest management for deep-research v3.0',
    )
    commands = cli.add_subparsers(dest='command', required=True)

    # init-run: set up a fresh run directory
    init_cmd = commands.add_parser('init-run', help='Create run manifest and empty artifact files')
    init_cmd.add_argument('--out-dir', required=True, help='Output directory for the research run')
    init_cmd.add_argument('--query', default='', help='Original research question')
    init_cmd.add_argument('--mode', default='standard', choices=['quick', 'standard', 'deep', 'ultradeep'])

    # register-source: add one source row
    register_cmd = commands.add_parser('register-source', help='Register a source and return its stable ID')
    register_cmd.add_argument('--json', required=True, help='JSON object with at least raw_url and title')
    register_cmd.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # assign-display-numbers: stable ID -> citation number
    numbers_cmd = commands.add_parser('assign-display-numbers', help='Map stable source IDs to display numbers')
    numbers_cmd.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')

    # export-bibliography: render the source list
    biblio_cmd = commands.add_parser('export-bibliography', help='Generate bibliography from sources')
    biblio_cmd.add_argument('--dir', required=True, help='Run directory containing sources.jsonl')
    biblio_cmd.add_argument('--style', default='markdown', choices=['markdown', 'json'])

    parsed_args = cli.parse_args()

    # Subcommand name -> handler; argparse already rejected unknown names.
    handlers = {
        'init-run': cmd_init_run,
        'register-source': cmd_register_source,
        'assign-display-numbers': cmd_assign_display_numbers,
        'export-bibliography': cmd_export_bibliography,
    }
    handler = handlers[parsed_args.command]
    handler(parsed_args)


if __name__ == '__main__':
    main()
|
|
@@ -1,205 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Evidence Store — append-only evidence persistence for deep-research v3.0.
|
|
4
|
-
|
|
5
|
-
CLI subcommands:
|
|
6
|
-
init Create empty evidence.jsonl in a run directory
|
|
7
|
-
add Append an evidence row, return evidence_id
|
|
8
|
-
list List evidence rows, optionally filtered by source_id
|
|
9
|
-
export Export evidence as JSON array
|
|
10
|
-
|
|
11
|
-
Evidence identity:
|
|
12
|
-
evidence_id = sha256(source_id + normalized_quote + locator)[:16]
|
|
13
|
-
|
|
14
|
-
All state is append-only JSONL. Evidence is never modified after capture.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
import argparse
|
|
18
|
-
import hashlib
|
|
19
|
-
import json
|
|
20
|
-
import os
|
|
21
|
-
import re
|
|
22
|
-
import sys
|
|
23
|
-
from datetime import datetime, timezone
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
# ---------------------------------------------------------------------------
|
|
27
|
-
# Evidence ID computation
|
|
28
|
-
# ---------------------------------------------------------------------------
|
|
29
|
-
|
|
30
|
-
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_quote(quote: str) -> str:
    """Return *quote* in the normalized form used for stable hashing.

    Leading and trailing whitespace is removed, every internal run of
    whitespace is collapsed to a single space, and the result is lower-cased.
    """
    trimmed = quote.strip()
    single_spaced = _WHITESPACE_RE.sub(' ', trimmed)
    return single_spaced.lower()
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def compute_evidence_id(source_id: str, quote: str, locator: str | None) -> str:
    """Return the 16-hex-character evidence ID.

    The ID is ``sha256(source_id + normalized_quote + locator)[:16]``, where
    the quote is passed through ``normalize_quote`` and a missing or empty
    locator contributes the empty string. The three parts are concatenated
    without a separator before hashing.
    """
    locator_part = locator if locator else ''
    pieces = (source_id, normalize_quote(quote), locator_part)
    digest = hashlib.sha256(''.join(pieces).encode('utf-8'))
    return digest.hexdigest()[:16]
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
# ---------------------------------------------------------------------------
|
|
45
|
-
# JSONL helpers (shared pattern with citation_manager)
|
|
46
|
-
# ---------------------------------------------------------------------------
|
|
47
|
-
|
|
48
|
-
def append_jsonl(path: str, obj: dict) -> None:
    """Append *obj* to the file at *path* as a single JSON line.

    The file is opened in append mode (created if missing). Non-ASCII
    characters are written verbatim (``ensure_ascii=False``), so the file is
    always encoded as UTF-8 instead of the platform's locale encoding.
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def read_jsonl(path: str) -> list[dict]:
    """Read a JSONL file and return its rows in file order.

    Returns an empty list when *path* does not exist. Blank lines are
    skipped. The file is decoded as UTF-8 rather than the platform's locale
    encoding, because rows may contain non-ASCII text written verbatim.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    if not os.path.exists(path):
        return []
    with open(path, encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
# ---------------------------------------------------------------------------
|
|
66
|
-
# Subcommands
|
|
67
|
-
# ---------------------------------------------------------------------------
|
|
68
|
-
|
|
69
|
-
def cmd_init(args: argparse.Namespace) -> None:
    """Ensure ``evidence.jsonl`` exists in the run directory ``args.dir``.

    When the file is missing, the directory is created if necessary and an
    empty file is written; an existing file is left untouched. Either way a
    JSON status object containing the absolute file path is printed.
    """
    run_dir = os.path.abspath(args.dir)
    evidence_file = os.path.join(run_dir, 'evidence.jsonl')

    file_missing = not os.path.exists(evidence_file)
    if file_missing:
        os.makedirs(run_dir, exist_ok=True)
        with open(evidence_file, 'w'):
            pass  # create the file empty

    status = {'status': 'ok', 'path': evidence_file}
    print(json.dumps(status))
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def cmd_add(args: argparse.Namespace) -> None:
    """Append evidence row, print evidence_id.

    ``args.json`` must be a JSON object with non-empty ``source_id`` and
    ``quote``; ``locator``, ``evidence_type`` and ``retrieval_query`` are
    optional. ``args.dir`` is the run directory holding ``evidence.jsonl``.

    If a row with the same ``evidence_id`` already exists, nothing is written
    and a ``duplicate`` status is printed; otherwise the row is appended and
    an ``added`` status is printed. An ``evidence_type`` outside the known set
    is stored as ``direct_quote``. Invalid input (malformed JSON, a non-object
    payload, or missing required fields) prints a JSON error object to stderr
    and exits with status 1.
    """
    try:
        data = json.loads(args.json)
    except json.JSONDecodeError as exc:
        print(json.dumps({'error': f'invalid JSON: {exc}'}), file=sys.stderr)
        sys.exit(1)
    if not isinstance(data, dict):
        print(json.dumps({'error': '--json must be a JSON object'}), file=sys.stderr)
        sys.exit(1)

    source_id = data.get('source_id', '')
    quote = data.get('quote', '')
    if not source_id or not quote:
        print(json.dumps({'error': 'source_id and quote are required'}), file=sys.stderr)
        sys.exit(1)

    locator = data.get('locator')
    evidence_id = compute_evidence_id(source_id, quote, locator)
    evidence_path = os.path.join(args.dir, 'evidence.jsonl')

    # Check for duplicate
    already_stored = any(
        row.get('evidence_id') == evidence_id for row in read_jsonl(evidence_path)
    )
    if already_stored:
        print(json.dumps({
            'status': 'duplicate',
            'evidence_id': evidence_id,
        }))
        return

    valid_types = {'direct_quote', 'paraphrase', 'data_point', 'figure_reference', 'methodology'}
    evidence_type = data.get('evidence_type', 'direct_quote')
    if evidence_type not in valid_types:
        # Best-effort: unknown types are recorded as direct quotes.
        evidence_type = 'direct_quote'

    row = {
        'evidence_id': evidence_id,
        'source_id': source_id,
        'retrieval_query': data.get('retrieval_query'),
        'locator': locator,
        'quote': quote,
        'evidence_type': evidence_type,
        'captured_at': datetime.now(timezone.utc).isoformat(),
    }
    append_jsonl(evidence_path, row)
    print(json.dumps({
        'status': 'added',
        'evidence_id': evidence_id,
        'source_id': source_id,
    }))
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
def cmd_list(args: argparse.Namespace) -> None:
    """Print the evidence rows of a run, optionally restricted to one source.

    Rows are read from ``evidence.jsonl`` in ``args.dir``. When
    ``args.source_id`` is set, only rows with that ``source_id`` are kept.
    Rows are then deduplicated by ``evidence_id`` (first occurrence wins) and
    printed as a JSON object with ``count`` and ``evidence`` keys.
    """
    records = read_jsonl(os.path.join(args.dir, 'evidence.jsonl'))

    wanted_source = args.source_id
    if wanted_source:
        records = [rec for rec in records if rec.get('source_id') == wanted_source]

    # Keep the first row per evidence_id; dicts preserve insertion order.
    first_by_id = {}
    for rec in records:
        first_by_id.setdefault(rec.get('evidence_id'), rec)
    deduped = list(first_by_id.values())

    report = {
        'count': len(deduped),
        'evidence': deduped,
    }
    print(json.dumps(report, indent=2, ensure_ascii=False))
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
def cmd_export(args: argparse.Namespace) -> None:
    """Print every evidence row of the run in ``args.dir`` as a JSON array.

    Rows are read from ``evidence.jsonl`` and deduplicated by ``evidence_id``,
    keeping the first occurrence and the original order.
    """
    records = read_jsonl(os.path.join(args.dir, 'evidence.jsonl'))

    # Keep the first row per evidence_id; dicts preserve insertion order.
    first_by_id = {}
    for rec in records:
        first_by_id.setdefault(rec.get('evidence_id'), rec)

    print(json.dumps(list(first_by_id.values()), indent=2, ensure_ascii=False))
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
# ---------------------------------------------------------------------------
|
|
165
|
-
# CLI entry point
|
|
166
|
-
# ---------------------------------------------------------------------------
|
|
167
|
-
|
|
168
|
-
def main() -> None:
    """Parse the command line and run the selected evidence_store subcommand."""
    cli = argparse.ArgumentParser(
        prog='evidence_store',
        description='Append-only evidence persistence for deep-research v3.0',
    )
    commands = cli.add_subparsers(dest='command', required=True)

    # init: make sure the evidence file exists
    init_cmd = commands.add_parser('init', help='Create empty evidence.jsonl')
    init_cmd.add_argument('--dir', required=True, help='Run directory')

    # add: store one evidence row
    add_cmd = commands.add_parser('add', help='Append evidence row')
    add_cmd.add_argument('--json', required=True, help='JSON with source_id, quote, locator, evidence_type, retrieval_query')
    add_cmd.add_argument('--dir', required=True, help='Run directory containing evidence.jsonl')

    # list: show rows, optionally for a single source
    list_cmd = commands.add_parser('list', help='List evidence rows')
    list_cmd.add_argument('--dir', required=True, help='Run directory')
    list_cmd.add_argument('--source-id', default=None, help='Filter by source_id')

    # export: dump everything
    export_cmd = commands.add_parser('export', help='Export all evidence as JSON array')
    export_cmd.add_argument('--dir', required=True, help='Run directory')

    parsed_args = cli.parse_args()

    # Subcommand name -> handler; argparse already rejected unknown names.
    handlers = {
        'init': cmd_init,
        'add': cmd_add,
        'list': cmd_list,
        'export': cmd_export,
    }
    handler = handlers[parsed_args.command]
    handler(parsed_args)


if __name__ == '__main__':
    main()
|