lumina-wiki 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/installer/template-engine.js +3 -1
- package/src/scripts/lint.mjs +67 -3
- package/src/scripts/schemas.mjs +1 -0
- package/src/scripts/wiki.mjs +66 -0
- package/src/skills/core/check/SKILL.md +7 -2
- package/src/skills/core/ingest/SKILL.md +26 -0
- package/src/skills/core/ingest/references/dedup-policy.md +30 -0
- package/src/skills/packs/research/discover/SKILL.md +30 -9
- package/src/skills/packs/research/prefill/SKILL.md +61 -2
- package/src/templates/_lumina/schema/page-templates.md +1 -0
- package/src/tools/fetch_wikipedia.py +10 -1
- package/src/tools/init_discovery.py +32 -9
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "$schema": "https://json.schemastore.org/package.json",
   "name": "lumina-wiki",
-  "version": "0.4.0",
+  "version": "0.5.0",
   "description": "Domain-agnostic, multi-IDE wiki scaffolder — Karpathy's LLM-Wiki vision, cross-platform and pack-based.",
   "keywords": [
     "llm-wiki",

package/src/installer/template-engine.js
CHANGED

@@ -124,6 +124,8 @@ export function renderReadme(template, variables, purpose = '') {
   return [
     titleLine,
     '',
+    '## Project Purpose',
+    '',
     purposeText,
     '',
     '<!-- lumina:schema -->',
@@ -140,7 +142,7 @@ export function renderReadme(template, variables, purpose = '') {
   // Find end of title block (first non-empty, non-H1 line before marker)
   let insertIdx = schemaMarkerIdx;
   // Insert purpose region before schema marker
-  const purposeLines = ['', purposeText, ''];
+  const purposeLines = ['', '## Project Purpose', '', purposeText, ''];
   lines.splice(insertIdx, 0, ...purposeLines);

   return lines.join('\n');

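Both code paths now emit the same purpose region. A minimal sketch of the resulting README shape; `titleLine` and `purposeText` here are placeholder values, not package code:

```js
// Sketch: mirrors the lines array built in renderReadme above.
const titleLine = '# My Wiki';                             // placeholder
const purposeText = 'Track RLHF literature for the lab.';  // placeholder

const readme = [
  titleLine,
  '',
  '## Project Purpose',
  '',
  purposeText,
  '',
  '<!-- lumina:schema -->',
].join('\n');
// The purpose paragraph now sits under an explicit "## Project Purpose"
// heading, which the research discover skill later reads back from README.md.
```
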
package/src/scripts/lint.mjs
CHANGED

@@ -1,6 +1,6 @@
 /**
  * @module lint
- * @description LuminaWiki v0.1 wiki linter —
+ * @description LuminaWiki v0.1 wiki linter — 10 schema checks, optional --fix.
  *
  * CLI usage:
  *   node lint.mjs [path] [--fix] [--dry-run] [--suggest] [--json]
@@ -62,7 +62,7 @@ const INDEX_MARKER_OPEN = '<!-- lumina:index -->';
 const INDEX_MARKER_CLOSE = '<!-- /lumina:index -->';

 /** All check IDs in run order. */
-const ALL_CHECK_IDS = ['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 'L07', 'L08', 'L09'];
+const ALL_CHECK_IDS = ['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 'L07', 'L08', 'L09', 'L10'];

 /** Kebab-case pattern: lowercase letters, digits, hyphens; no leading/trailing hyphen. */
 const KEBAB_RE = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
@@ -611,6 +611,57 @@ function checkL09(indexPath, indexContent, entityFiles) {
   return [];
 }

+/**
+ * L10: Two foundations share an alias, or a foundation's alias collides with
+ * another foundation's title. Foundations-only; no --fix mode.
+ *
+ * @param {Array<{wikiRelPath: string, fm: Record<string,unknown>}>} foundationEntries
+ *   Each entry has the wiki-relative path and parsed frontmatter of one foundation file.
+ * @returns {Finding[]}
+ */
+function checkL10(foundationEntries) {
+  /** @type {Map<string, Array<{slug: string, source: 'title'|'alias', original: string}>>} */
+  const index = new Map();
+
+  for (const { wikiRelPath, fm } of foundationEntries) {
+    const slug = wikiRelPath; // e.g. "foundations/transformer.md"
+
+    // Collect title.
+    if (typeof fm.title === 'string') {
+      const norm = fm.title.trim().toLowerCase();
+      if (!index.has(norm)) index.set(norm, []);
+      index.get(norm).push({ slug, source: 'title', original: fm.title });
+    }
+
+    // Collect aliases (skip non-string entries defensively).
+    const aliases = Array.isArray(fm.aliases) ? fm.aliases : [];
+    for (const alias of aliases) {
+      if (typeof alias !== 'string') continue;
+      const norm = alias.trim().toLowerCase();
+      if (!index.has(norm)) index.set(norm, []);
+      index.get(norm).push({ slug, source: 'alias', original: alias });
+    }
+  }
+
+  const findings = [];
+  for (const [, claimants] of index) {
+    if (claimants.length < 2) continue;
+    // Each claimant gets a finding mentioning the others.
+    for (const claimant of claimants) {
+      const others = claimants.filter(c => c !== claimant);
+      const othersDesc = others
+        .map(c => `${c.slug} (as ${c.source})`)
+        .join(', ');
+      findings.push(finding(
+        'L10-alias-conflict', 'error', false,
+        claimant.slug, null,
+        `alias conflict on "${claimant.original}" — also claimed by ${othersDesc}`
+      ));
+    }
+  }
+  return findings;
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 // FIXERS
 // ─────────────────────────────────────────────────────────────────────────────
@@ -873,6 +924,19 @@ async function runLint(projectRoot, opts) {
     allFindings.push(...checkL09(indexPath, indexContent, entityFiles));
   }

+  // L10: collect all foundation frontmatters in one pass, then check for alias conflicts.
+  {
+    const foundationEntries = [];
+    for (const wikiRelPath of entityFiles) {
+      if (!wikiRelPath.startsWith('foundations/')) continue;
+      const abs = safejoin(wikiRoot, wikiRelPath);
+      const content = await readFile(abs, 'utf8');
+      const parsed = parseFrontmatter(content);
+      foundationEntries.push({ wikiRelPath, fm: parsed ? parsed.data : {} });
+    }
+    allFindings.push(...checkL10(foundationEntries));
+  }
+
   // Apply fixes if requested.
   if (opts.fix || opts.dryRun) {
     await applyFixes(allFindings, wikiRoot, edgesPath, indexPath, indexContent, entityFiles, allAbsMd, edges, edgeSet, opts);
@@ -1119,7 +1183,7 @@ export {
   isExempt,
   entityTypeForPath,
   checkL01, checkL02, checkL03, checkL04, checkL05,
-  checkL06, checkL07, checkL08, checkL09,
+  checkL06, checkL07, checkL08, checkL09, checkL10,
   fixL01, fixL03, fixL06, fixL07, fixL09,
   runLint,
   INDEX_MARKER_OPEN,

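To see what L10 reports, here is a self-contained sketch of the collision logic above. The `finding()` helper is stubbed: only its call shape is visible in this diff, so the stub's field names are assumptions.

```js
// Stand-in for lint.mjs's finding() helper; argument order inferred from the
// call sites above (id, severity, fixable, slug, line, message). Assumption.
const finding = (id, severity, fixable, slug, line, message) =>
  ({ id, severity, fixable, slug, line, message });

// Two foundations both claim "RLHF": one as an alias, one as its title.
const entries = [
  { wikiRelPath: 'foundations/reinforcement-learning-from-human-feedback.md',
    fm: { title: 'Reinforcement Learning from Human Feedback', aliases: ['RLHF'] } },
  { wikiRelPath: 'foundations/rlhf.md',
    fm: { title: 'RLHF', aliases: [] } },
];

// Same indexing scheme as checkL10: normalized name -> list of claimants.
const index = new Map();
for (const { wikiRelPath, fm } of entries) {
  const claims = [{ source: 'title', original: fm.title }]
    .concat((fm.aliases ?? []).map(a => ({ source: 'alias', original: a })));
  for (const { source, original } of claims) {
    if (typeof original !== 'string') continue;
    const norm = original.trim().toLowerCase();
    if (!index.has(norm)) index.set(norm, []);
    index.get(norm).push({ slug: wikiRelPath, source, original });
  }
}

// Any name with two or more claimants yields one finding per claimant.
for (const [, claimants] of index) {
  if (claimants.length < 2) continue;
  for (const c of claimants) {
    const others = claimants.filter(o => o !== c)
      .map(o => `${o.slug} (as ${o.source})`).join(', ');
    console.log(finding('L10-alias-conflict', 'error', false, c.slug, null,
      `alias conflict on "${c.original}" — also claimed by ${others}`).message);
  }
}
// → alias conflict on "RLHF" — also claimed by foundations/rlhf.md (as title)
// → alias conflict on "RLHF" — also claimed by
//   foundations/reinforcement-learning-from-human-feedback.md (as alias)
```
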
package/src/scripts/schemas.mjs
CHANGED

@@ -299,6 +299,7 @@ export const REQUIRED_FRONTMATTER = {
     { key: 'type', type: 'string', required: true, pack: 'research' },
     { key: 'created', type: 'iso-date', required: true, pack: 'research' },
     { key: 'updated', type: 'iso-date', required: true, pack: 'research' },
+    { key: 'aliases', type: 'array', required: false, pack: 'research' },
   ],

   // Research pack: topic page

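The new rule makes `aliases` optional but typed: absent is fine, present must be an array. A hedged sketch of that intent (the actual validator lives elsewhere in the package and is not shown in this diff):

```js
// Assumed semantics of { key: 'aliases', type: 'array', required: false }.
function checkAliases(fm) {
  if (!('aliases' in fm)) return 'ok';                  // optional: absent is valid
  return Array.isArray(fm.aliases) ? 'ok' : 'wrong-type';
}

console.log(checkAliases({ title: 'RLHF basics' }));    // ok
console.log(checkAliases({ aliases: ['RLHF'] }));       // ok
console.log(checkAliases({ aliases: 'RLHF' }));         // wrong-type
```
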
package/src/scripts/wiki.mjs
CHANGED

@@ -1247,6 +1247,7 @@ async function main(argv) {
     '  batch-edges <json-file>                Apply array of edges from JSON file',
     '  dedup-edges                            Deduplicate edges.jsonl',
     '  list-entities [path-prefix] [--type <type>]  List entity slugs as JSON',
+    '  resolve-alias <text>                   Map free-text query to a foundations/* slug',
     '  read-edges <slug>|--from <slug> [--type <type>] [--direction outbound|inbound|both]',
     '  read-citations <slug>                  Read all citations for a slug',
     '  verify-frontmatter <slug>              Validate frontmatter fields',
@@ -1602,6 +1603,71 @@ async function main(argv) {
       break;
     }

+    // -----------------------------------------------------------------------
+    case 'resolve-alias': {
+      const text = positional.join(' ').trim();
+      if (!text) {
+        emitError('resolve-alias requires <text>', 2);
+        process.exit(2);
+      }
+      const projectRoot = await requireProjectRoot();
+      const allEntities = await listEntities(projectRoot);
+      const foundations = allEntities.filter(e => e.type === 'foundations');
+
+      const needle = text.toLowerCase();
+      const matches = [];
+
+      for (const entity of foundations) {
+        const content = await readFile(entity.filePath, 'utf8');
+        const { frontmatter } = parseFrontmatter(content);
+
+        // Build candidate set with priority: slug > title > alias
+        const slugNorm = entity.slug.toLowerCase().trim();
+        const titleNorm = typeof frontmatter.title === 'string'
+          ? frontmatter.title.toLowerCase().trim()
+          : null;
+
+        let matchSource = null;
+
+        if (slugNorm === needle) {
+          matchSource = 'slug';
+        } else if (titleNorm !== null && titleNorm === needle) {
+          matchSource = 'title';
+        } else {
+          // Check aliases defensively
+          const aliases = frontmatter.aliases;
+          if (Array.isArray(aliases)) {
+            for (const alias of aliases) {
+              if (typeof alias !== 'string') continue;
+              if (alias.toLowerCase().trim() === needle) {
+                matchSource = 'alias';
+                break;
+              }
+            }
+          }
+        }
+
+        if (matchSource !== null) {
+          matches.push({ slug: entity.slug, path: entity.path, source: matchSource });
+        }
+      }
+
+      if (matches.length === 0) {
+        emitError(`no match for query: ${text}`, 2);
+        process.exit(2);
+      }
+
+      // Sort ascending by slug for deterministic output
+      matches.sort((a, b) => a.slug < b.slug ? -1 : a.slug > b.slug ? 1 : 0);
+
+      emitJson({
+        query: text,
+        matches,
+        ambiguous: matches.length >= 2,
+      });
+      break;
+    }
+
     // -----------------------------------------------------------------------
     default: {
       emitError(`Unknown subcommand: ${subcommand}. Run node wiki.mjs --help for usage.`, 2);

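The subcommand's contract is visible above: matches print as JSON on stdout with exit 0, and a miss exits 2 via `emitError`. A sketch of driving it from another Node script; the relative script path is an assumption:

```js
import { execFileSync } from 'node:child_process';

// Returns the parsed result object, or null when nothing matches (exit 2).
function resolveAlias(text) {
  try {
    const out = execFileSync(
      'node', ['_lumina/scripts/wiki.mjs', 'resolve-alias', text],
      { encoding: 'utf8' },
    );
    return JSON.parse(out); // { query, matches: [{ slug, path, source }], ambiguous }
  } catch (err) {
    if (err.status === 2) return null; // no slug, title, or alias matched
    throw err;                         // unexpected failure: surface it
  }
}

const res = resolveAlias('RLHF');
if (res && !res.ambiguous) console.log(res.matches[0].slug);
```
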
package/src/skills/core/check/SKILL.md
CHANGED

@@ -4,7 +4,6 @@ description: >
   Run lint.mjs --json, summarize findings by severity, offer to apply --fix for
   auto-fixable checks (L01/L03/L06/L07/L09), self-check re-run to confirm 0
   errors, and surface advisory warnings for user attention.
-  Single-model self-check only — no cross-model review.
   Use this whenever the user asks to "check the wiki", "run lint", "verify the
   graph", "are there broken links?", "what's wrong with the wiki?", "health
   check", or "are there missing reverse links?". Also fires for: weekly review
@@ -17,10 +16,16 @@ allowed-tools:

 # /lumi-check

+> If you were spawned in the same session that just ran `/lumi-ingest`, surface
+> a one-line note to the user suggesting they re-run this check in a fresh
+> session or via a subagent for an independent read — then proceed normally.
+> Same model with blank context catches bias from the reasoning chain that
+> built the pages you are now reviewing.
+
 ## Role

 You are the wiki's quality gate. You run the linter, classify findings, apply
-safe fixes with a
+safe fixes with a self-check re-run, and surface the issues the user
 must resolve manually. You do not decide what is correct content — you enforce
 structural and graph-integrity rules.

package/src/skills/core/ingest/SKILL.md
CHANGED

@@ -128,6 +128,12 @@ Write checkpoint: `phase: "source-page"`.

 ### Phase 4 — Write concept and person stubs

+For every candidate concept name extracted in Phase 3, first run
+`node _lumina/scripts/wiki.mjs resolve-alias "<concept-name>"`. If it resolves to
+a foundation, link to that foundation via `[[foundations/<slug>]]` and add a
+`grounded_in` edge instead of creating a concept stub. See
+`references/dedup-policy.md` § Foundation Resolution for the full decision tree.
+
 Apply `references/dedup-policy.md` before creating or updating stubs. Existing
 concept/person pages are updated conservatively; new pages use the templates
 below.
@@ -250,6 +256,20 @@ Ask whether they want a minimal ingest (source page only, no stubs) or a full
 ingest. Proceed only with explicit direction. Log which phases were skipped.
 </example>

+<example>
+User: "/lumi-ingest raw/sources/rlhf-overview.pdf"
+
+Foundation resolution — concept name maps to an existing foundation:
+```bash
+node _lumina/scripts/wiki.mjs resolve-alias "RLHF"
+# → {"query":"RLHF","matches":[{"slug":"reinforcement-learning-from-human-feedback","path":"foundations/reinforcement-learning-from-human-feedback","source":"alias"}],"ambiguous":false}
+node _lumina/scripts/wiki.mjs add-edge sources/rlhf-overview grounded_in foundations/reinforcement-learning-from-human-feedback
+# (no concept stub created for "RLHF")
+```
+Link added to `## Concepts` in `wiki/sources/rlhf-overview.md`:
+`[[foundations/reinforcement-learning-from-human-feedback]]`
+</example>
+
 ## Guardrails

 - Never modify files in `raw/`. Read-only.
@@ -270,3 +290,9 @@ Before reporting done, verify:
 (c) Running `/lumi-ingest` again with the same file produces byte-identical `wiki/`
 output (all add-edge calls are no-ops; stubs have same content; index.md entry
 already present)
+
+## Next step
+
+Tell the user to run `/lumi-check` to validate the wiki state — ideally in a
+fresh session or via a subagent. Same model with blank context catches bias
+from the reasoning chain that just built these pages.

package/src/skills/core/ingest/references/dedup-policy.md
CHANGED

@@ -15,6 +15,36 @@ If `wiki/sources/<slug>.md` already exists, treat the run as a re-ingest. Confirm
 with the user before overwriting body content. If the user confirms, keep stable
 frontmatter values when possible and only update fields supported by the source.

+## Foundation Resolution (Before Creating Concept Stubs)
+
+Before creating any concept stub, check whether the term already has a foundation
+page. This avoids duplicate concept pages when a foundation covers the same term
+under its canonical name.
+
+```bash
+node _lumina/scripts/wiki.mjs resolve-alias "<concept-name>"
+```
+
+Decision tree by exit code:
+
+- **exit 0, exactly 1 match (`ambiguous: false`)** — do NOT create a concept stub.
+  Link to `[[foundations/<match-slug>]]` in the source page's `## Concepts` section.
+  Add edge:
+  ```bash
+  node _lumina/scripts/wiki.mjs add-edge sources/<source-slug> grounded_in foundations/<match-slug>
+  ```
+  Note: `grounded_in` is terminal — no reverse edge will be written.
+
+- **exit 0, `ambiguous: true`** — present the candidate foundations to the user
+  with their slugs and ask which one applies. If none match the source's intended
+  meaning, fall back to creating a concept stub.
+
+- **exit 2 (no match)** — proceed with normal concept stub creation per the next
+  section.
+
+Run resolve-alias for every candidate concept name extracted in Phase 4, before
+making any `add-edge concepts/<slug>` calls.
+
 ## Concept And Person Stubs

 Before creating a concept or person page, check metadata:

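The three branches reduce to a small dispatch on the resolver result. A sketch with illustrative names (`conceptAction` is not part of the package):

```js
// result: parsed resolve-alias JSON on exit 0, or null on exit 2 (no match).
function conceptAction(result) {
  if (result === null) {
    return { action: 'create-stub' };  // exit 2: no foundation claims the term
  }
  if (!result.ambiguous) {
    const { slug } = result.matches[0];  // single claimant: link, no stub
    return { action: 'link-foundation', edge: ['grounded_in', slug] };
  }
  return {                               // two or more claimants: defer to the user
    action: 'ask-user',
    candidates: result.matches.map(m => m.slug),
  };
}
```
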
package/src/skills/packs/research/discover/SKILL.md
CHANGED

@@ -32,7 +32,18 @@ References:
 ## Instructions

 1. Clarify the discovery query if the topic, domain, or source type is unclear.
-2.
+2. Build the exclude list from already-ingested sources. Run:
+
+   ```bash
+   node _lumina/scripts/wiki.mjs list-entities
+   ```
+
+   For each entity with `type: "sources"`, `Read` the `filePath` and extract any
+   arXiv ID or Semantic Scholar paperId from frontmatter or body URLs. Patterns
+   to scan: `arxiv.org/abs/<id>`, `arXiv:<id>`, `semanticscholar.org/paper/<id>`.
+   Pass the deduped list to `init_discovery.py --exclude-ids id1,id2,...`. If
+   no sources exist yet, skip this step (omit the flag).
+3. Check research tool setup:

 ```bash
 python3 _lumina/tools/init_discovery.py --help
@@ -43,15 +54,21 @@ python3 _lumina/tools/fetch_deepxiv.py --help
 python3 _lumina/tools/discover.py --help
 ```

-
+4. Pick one seed mode from `references/source-modes.md`: `topic`, `anchor`, or
 `from-wiki`. Use only the documented commands and flags.
-
+5. Deduplicate candidates against existing wiki/discovered/checkpoint state using
 `references/ranking-signals.md`.
-
+6. Rank candidate JSON with `discover.py --topic "<topic>"`; preserve returned
 `_score`, then add a human-readable rationale and risk note.
-
+7. Apply purpose alignment. Read the `## Project Purpose` section in
+   `README.md`. For each shortlisted candidate, judge alignment with that
+   purpose (high / medium / low) and include the judgment in the rationale.
+   Move clearly off-purpose candidates to MAYBE or SKIP regardless of `_score`.
+   If the purpose section is empty or contains only the placeholder text, skip
+   this step and note "no project purpose set" in the response.
+8. Present a checkpointed shortlist with title, authors/year, URL or identifier,
 `_score`, rationale, duplicate status, and recommended next action.
-
+9. Ask the user which candidates should be ingested. Do not create source pages
 or graph edges in this skill.

 ## Constraints
@@ -59,14 +76,18 @@ python3 _lumina/tools/discover.py --help
 - Do not mutate `wiki/`.
 - Do not invent source metadata not returned by a fetcher or supplied by the user.
 - Do not invent tool flags. Use only `--topic`, `--project-root`, `--phases`,
-  `--resume`, `--fetchers`, and `--
-
-
+  `--resume`, `--fetchers`, `--limit`, and `--exclude-ids` for
+  `init_discovery.py`.
+- Do not include any non-FR35 workflows such as ideation, LaTeX writing, or
+  orchestrator mode.

 ## Definition of Done

 - Shortlist is deduped against wiki sources and discovered state.
 - Every shortlisted item includes `_score`, rationale, and risk/duplicate note.
+- Purpose alignment is reflected in each candidate's rationale (or the response
+  explicitly notes "no project purpose set" when the README purpose is empty
+  or placeholder).
 - Discovery checkpoints or an explicit resume decision are reflected in the
   response.
 - No `wiki/` files, index entries, graph edges, or log entries are written.

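Step 2's ID scan maps directly onto three regexes. A hedged sketch: the capture groups (new-style arXiv IDs, 40-hex Semantic Scholar paperIds) are assumptions beyond the literal URL shapes the step names:

```js
// Extract arXiv IDs and Semantic Scholar paperIds from a source page's text.
const PATTERNS = [
  /arxiv\.org\/abs\/([0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)/g, // arxiv.org/abs/<id>
  /arXiv:([0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)/g,            // arXiv:<id>
  /semanticscholar\.org\/paper\/([0-9a-f]{40})/g,         // semanticscholar.org/paper/<id>
];

function extractPaperIds(text) {
  const ids = new Set();
  for (const re of PATTERNS) {
    for (const m of text.matchAll(re)) ids.add(m[1]);
  }
  return [...ids];
}

// Compose the flag for init_discovery.py; omit it when nothing was found.
const ids = extractPaperIds('See arxiv.org/abs/2203.02155 and arXiv:1706.03762.');
const flag = ids.length ? `--exclude-ids ${ids.join(',')}` : '';
console.log(flag); // --exclude-ids 2203.02155,1706.03762
```
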
package/src/skills/packs/research/prefill/SKILL.md
CHANGED

@@ -35,10 +35,61 @@ node _lumina/scripts/wiki.mjs read-meta foundations/<slug>
 node _lumina/scripts/wiki.mjs slug "<topic title>"
 ```

-2. Check whether `wiki/foundations/<slug>.md` already exists
-
+2. Check whether `wiki/foundations/<slug>.md` already exists:
+
+   ```bash
+   node _lumina/scripts/wiki.mjs read-meta foundations/<slug>
+   ```
+
+   - **exit 2 (not found)** — continue to step 3.
+   - **exit 0 (exists)** — read the file, then show the user the `title`, `created`
+     date, and the most recent line in `wiki/log.md` that references this slug.
+     Ask exactly one question with three options (no other action until answered):
+
+     ```
+     Foundation "<title>" already exists.
+     [s] skip — abort, no changes (default)
+     [r] refresh — re-fetch from Wikipedia, update non-marked sections and `updated`; preserve `created`, `aliases`, and `<!-- user-edited -->` sections
+     [a] abort — same as skip but log the user's intent
+     ```
+
+     Do not proceed without an explicit choice. Map blank/Enter to `skip`.
+
+3. Fetch or handle background material based on exit code from the Wikipedia fetcher:
+
+   ```bash
+   python3 _lumina/tools/fetch_wikipedia.py page "<title>"
+   ```
+
+   - **exit 0** — use the JSON output directly.
+   - **exit 2 AND stderr is JSON with `kind == "disambiguation"`** — run a search
+     to surface candidates:
+
+     ```bash
+     python3 _lumina/tools/fetch_wikipedia.py search "<title>" --limit 5
+     ```
+
+     Present the numbered results (title + snippet) and let the user:
+     - pick a candidate number,
+     - type a more specific title, or
+     - type `manual` to paste content directly.
+
+     Re-run `page` with the chosen title, or accept the user-pasted content.
+
+   - **exit 2 for any other reason** (empty title, page not found) — surface the
+     `error` field from stderr JSON and abort.
+   - **exit 3 (network error)** — tell the user and offer two options: retry, or
+     paste content manually.
 4. Write `wiki/foundations/<slug>.md` with valid foundation frontmatter:
    `id`, `title`, `type: foundation`, `created`, `updated`.
+
+   Also include the optional `aliases` field — an array of strings listing
+   alternative names users or sources might write for this concept (abbreviations,
+   expansions, common misspellings). Example: for a foundation titled "Reinforcement
+   Learning from Human Feedback", use `aliases: ["RLHF", "human feedback RL"]`.
+   Propose a list of 2–5 plausible aliases, then ask the user to confirm or edit
+   before writing. An empty array `[]` is fine if nothing obvious applies. Aliases
+   must be unique across all foundations — `lint.mjs` L10 will error on collisions.
 5. Keep the body concise: definition, scope notes, and external references.
 6. Log the addition:
@@ -58,6 +109,12 @@ node _lumina/scripts/lint.mjs --fix --json
   knowledge extracted from project sources.
 - Do not store secrets or API keys in foundation pages.
 - Do not add reverse graph edges for foundations.
+- When refreshing an existing foundation, preserve the original `created` date and
+  any `<!-- user-edited -->` sections verbatim. Only `updated` and non-marked
+  sections may change.
+- Aliases must be unique across all foundations. If `lint.mjs --fix --json` reports
+  `L10-alias-conflict`, resolve manually before completing the run — there is no
+  automatic fix.

 ## Definition of Done

@@ -65,3 +122,5 @@ node _lumina/scripts/lint.mjs --fix --json
 - `node _lumina/scripts/lint.mjs --fix --json` has updated `wiki/index.md` if
   needed and leaves `summary.errors === 0`.
 - `wiki/log.md` has an append-only `lumi-research-prefill` entry.
+- If the page already existed, the user's choice (skip / refresh / abort) is logged
+  in `wiki/log.md` with the actual decision taken.

package/src/tools/fetch_wikipedia.py
CHANGED

@@ -218,7 +218,16 @@ def main(argv: list[str] | None = None) -> None:
         sys.exit(0)

     except ValueError as exc:
-
+        msg = str(exc)
+        if "disambiguation" in msg:
+            err_obj: dict[str, str] = {
+                "error": msg,
+                "kind": "disambiguation",
+                "hint": "Use the search subcommand to enumerate candidates.",
+            }
+        else:
+            err_obj = {"error": msg}
+        print(json.dumps(err_obj, ensure_ascii=False), file=sys.stderr)
         sys.exit(2)
     except requests.exceptions.ConnectionError as exc:
         _err(f"Network error: {exc}")

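The rewritten error path gives callers a machine-readable contract: exit 2 with JSON on stderr, plus `kind: "disambiguation"` when a search fallback applies, and exit 3 for network failures. A sketch of branching on that contract from Node, mirroring the prefill skill's decision tree; the relative tool path is an assumption:

```js
import { spawnSync } from 'node:child_process';

// Fetch a page; decide the follow-up from the documented exit codes.
function fetchPage(title) {
  const r = spawnSync('python3',
    ['_lumina/tools/fetch_wikipedia.py', 'page', title],
    { encoding: 'utf8' });

  if (r.status === 0) return { ok: true, page: JSON.parse(r.stdout) };
  if (r.status === 3) return { ok: false, next: 'retry-or-paste' }; // network error
  // exit 2: stderr carries {"error": ..., "kind"?: "disambiguation", "hint"?: ...}
  let err = {};
  try { err = JSON.parse(r.stderr); } catch { /* stderr not JSON; keep {} */ }
  return err.kind === 'disambiguation'
    ? { ok: false, next: 'search', hint: err.hint }
    : { ok: false, next: 'abort', error: err.error };
}
```
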
package/src/tools/init_discovery.py
CHANGED

@@ -229,6 +229,7 @@ def phase1_keyword_search(
     fetchers: list[str],
     limit: int,
     env: dict[str, str],
+    exclude_ids: set[str] = set(),
 ) -> list[dict[str, Any]]:
     """Phase 1: keyword search across configured fetchers."""
     results: list[dict[str, Any]] = []
@@ -249,7 +250,7 @@ def phase1_keyword_search(

         for paper in papers:
             pid = paper.get("id") or paper.get("paperId") or ""
-            if pid and pid in seen_ids:
+            if pid and (pid in seen_ids or pid in exclude_ids):
                 continue
             if pid:
                 seen_ids.add(pid)
@@ -265,6 +266,7 @@ def phase2_author_backfill(
     discovered_dir: Path,
     limit: int,
     env: dict[str, str],
+    exclude_ids: set[str] = set(),
 ) -> list[dict[str, Any]]:
     """Phase 2: fetch more papers by the most prolific authors from phase 1."""
     # Count author occurrences across phase-1 results
@@ -290,7 +292,7 @@ def phase2_author_backfill(
             continue
         for paper in papers:
             pid = paper.get("id") or paper.get("paperId") or ""
-            if pid and pid in seen_ids:
+            if pid and (pid in seen_ids or pid in exclude_ids):
                 continue
             if pid:
                 seen_ids.add(pid)
@@ -305,6 +307,7 @@ def phase3_citation_expansion(
     slug: str,
     discovered_dir: Path,
     env: dict[str, str],
+    exclude_ids: set[str] = set(),
 ) -> list[dict[str, Any]]:
     """Phase 3: fetch citations of top phase-1 papers."""
     # Sort by citation count to pick the most-cited seeds
@@ -330,7 +333,7 @@ def phase3_citation_expansion(
             continue
         for paper in citations:
             cid = paper.get("id") or paper.get("paperId") or ""
-            if cid and cid in seen_ids:
+            if cid and (cid in seen_ids or cid in exclude_ids):
                 continue
             if cid:
                 seen_ids.add(cid)
@@ -364,6 +367,11 @@ def main(argv: list[str] | None = None) -> None:
                         help=f"Comma-separated fetchers (default: {DEFAULT_FETCHERS}).")
     parser.add_argument("--limit", type=int, default=DEFAULT_LIMIT,
                         help=f"Max results per fetcher per phase (default: {DEFAULT_LIMIT}).")
+    parser.add_argument(
+        "--exclude-ids", default="",
+        help="Comma-separated list of paper IDs (arXiv IDs or S2 paperIds) to "
+             "skip. Use to exclude papers already ingested into wiki/sources/.",
+    )

     args = parser.parse_args(argv)

@@ -382,6 +390,9 @@ def main(argv: list[str] | None = None) -> None:
         sys.exit(2)

     fetchers = [f.strip() for f in args.fetchers.split(",") if f.strip()]
+    exclude_ids: set[str] = {
+        s.strip() for s in args.exclude_ids.split(",") if s.strip()
+    }
     slug = _slugify(args.topic)
     env = load_env(project_root)

@@ -406,16 +417,20 @@ def main(argv: list[str] | None = None) -> None:
                     summary["phases"]["1"] = {"status": "resumed", "count": len(phase1_results)}
                     _err(f"Phase 1 resumed from checkpoint ({len(phase1_results)} results).")
                 else:
+                    _err(f"Phase 1: keyword search across {fetchers} (limit={args.limit})...")
                     phase1_results = phase1_keyword_search(
-                        args.topic, slug, discovered_dir, fetchers, args.limit, env
+                        args.topic, slug, discovered_dir, fetchers, args.limit, env, exclude_ids
                     )
                     _save_checkpoint(state_dir, 1, {"results": phase1_results, "slug": slug})
+                    _err(f"Phase 1 complete: {len(phase1_results)} unique candidates.")
                     summary["phases"]["1"] = {"status": "complete", "count": len(phase1_results)}
             else:
+                _err(f"Phase 1: keyword search across {fetchers} (limit={args.limit})...")
                 phase1_results = phase1_keyword_search(
-                    args.topic, slug, discovered_dir, fetchers, args.limit, env
+                    args.topic, slug, discovered_dir, fetchers, args.limit, env, exclude_ids
                 )
                 _save_checkpoint(state_dir, 1, {"results": phase1_results, "slug": slug})
+                _err(f"Phase 1 complete: {len(phase1_results)} unique candidates.")
                 summary["phases"]["1"] = {"status": "complete", "count": len(phase1_results)}
         else:
             # Load from checkpoint if phase 1 was run previously
@@ -432,16 +447,20 @@ def main(argv: list[str] | None = None) -> None:
                     summary["phases"]["2"] = {"status": "resumed", "count": len(phase2_results)}
                     _err(f"Phase 2 resumed from checkpoint ({len(phase2_results)} results).")
                 else:
+                    _err(f"Phase 2: author backfill (top {MAX_AUTHORS_BACKFILL} authors, limit={args.limit})...")
                     phase2_results = phase2_author_backfill(
-                        phase1_results, slug, discovered_dir, args.limit, env
+                        phase1_results, slug, discovered_dir, args.limit, env, exclude_ids
                     )
                     _save_checkpoint(state_dir, 2, {"results": phase2_results, "slug": slug})
+                    _err(f"Phase 2 complete: {len(phase2_results)} unique candidates.")
                     summary["phases"]["2"] = {"status": "complete", "count": len(phase2_results)}
             else:
+                _err(f"Phase 2: author backfill (top {MAX_AUTHORS_BACKFILL} authors, limit={args.limit})...")
                 phase2_results = phase2_author_backfill(
-                    phase1_results, slug, discovered_dir, args.limit, env
+                    phase1_results, slug, discovered_dir, args.limit, env, exclude_ids
                 )
                 _save_checkpoint(state_dir, 2, {"results": phase2_results, "slug": slug})
+                _err(f"Phase 2 complete: {len(phase2_results)} unique candidates.")
                 summary["phases"]["2"] = {"status": "complete", "count": len(phase2_results)}

         # --- Phase 3 ---
@@ -453,16 +472,20 @@ def main(argv: list[str] | None = None) -> None:
                     summary["phases"]["3"] = {"status": "resumed", "count": len(phase3_results)}
                     _err(f"Phase 3 resumed from checkpoint ({len(phase3_results)} results).")
                 else:
+                    _err(f"Phase 3: citation expansion (top 5 seeds × {CITATIONS_PER_SEED} citations)...")
                     phase3_results = phase3_citation_expansion(
-                        phase1_results, slug, discovered_dir, env
+                        phase1_results, slug, discovered_dir, env, exclude_ids
                     )
                     _save_checkpoint(state_dir, 3, {"results": phase3_results, "slug": slug})
+                    _err(f"Phase 3 complete: {len(phase3_results)} unique candidates.")
                     summary["phases"]["3"] = {"status": "complete", "count": len(phase3_results)}
             else:
+                _err(f"Phase 3: citation expansion (top 5 seeds × {CITATIONS_PER_SEED} citations)...")
                 phase3_results = phase3_citation_expansion(
-                    phase1_results, slug, discovered_dir, env
+                    phase1_results, slug, discovered_dir, env, exclude_ids
                 )
                 _save_checkpoint(state_dir, 3, {"results": phase3_results, "slug": slug})
+                _err(f"Phase 3 complete: {len(phase3_results)} unique candidates.")
                 summary["phases"]["3"] = {"status": "complete", "count": len(phase3_results)}

     except ValueError as exc: