lumina-wiki 0.4.0 → 0.5.0

package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "$schema": "https://json.schemastore.org/package.json",
   "name": "lumina-wiki",
-  "version": "0.4.0",
+  "version": "0.5.0",
   "description": "Domain-agnostic, multi-IDE wiki scaffolder — Karpathy's LLM-Wiki vision, cross-platform and pack-based.",
   "keywords": [
     "llm-wiki",
@@ -124,6 +124,8 @@ export function renderReadme(template, variables, purpose = '') {
   return [
     titleLine,
     '',
+    '## Project Purpose',
+    '',
     purposeText,
     '',
     '<!-- lumina:schema -->',
@@ -140,7 +142,7 @@ export function renderReadme(template, variables, purpose = '') {
   // Find end of title block (first non-empty, non-H1 line before marker)
   let insertIdx = schemaMarkerIdx;
   // Insert purpose region before schema marker
-  const purposeLines = ['', purposeText, ''];
+  const purposeLines = ['', '## Project Purpose', '', purposeText, ''];
   lines.splice(insertIdx, 0, ...purposeLines);
 
   return lines.join('\n');
@@ -1,6 +1,6 @@
 /**
  * @module lint
- * @description LuminaWiki v0.1 wiki linter — 9 schema checks, optional --fix.
+ * @description LuminaWiki v0.1 wiki linter — 10 schema checks, optional --fix.
  *
  * CLI usage:
  *   node lint.mjs [path] [--fix] [--dry-run] [--suggest] [--json]
@@ -62,7 +62,7 @@ const INDEX_MARKER_OPEN = '<!-- lumina:index -->';
 const INDEX_MARKER_CLOSE = '<!-- /lumina:index -->';
 
 /** All check IDs in run order. */
-const ALL_CHECK_IDS = ['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 'L07', 'L08', 'L09'];
+const ALL_CHECK_IDS = ['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 'L07', 'L08', 'L09', 'L10'];
 
 /** Kebab-case pattern: lowercase letters, digits, hyphens; no leading/trailing hyphen. */
 const KEBAB_RE = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
@@ -611,6 +611,57 @@ function checkL09(indexPath, indexContent, entityFiles) {
   return [];
 }
 
+/**
+ * L10: Two foundations share an alias, or a foundation's alias collides with
+ * another foundation's title. Foundations-only; no --fix mode.
+ *
+ * @param {Array<{wikiRelPath: string, fm: Record<string,unknown>}>} foundationEntries
+ *   Each entry has the wiki-relative path and parsed frontmatter of one foundation file.
+ * @returns {Finding[]}
+ */
+function checkL10(foundationEntries) {
+  /** @type {Map<string, Array<{slug: string, source: 'title'|'alias', original: string}>>} */
+  const index = new Map();
+
+  for (const { wikiRelPath, fm } of foundationEntries) {
+    const slug = wikiRelPath; // e.g. "foundations/transformer.md"
+
+    // Collect title.
+    if (typeof fm.title === 'string') {
+      const norm = fm.title.trim().toLowerCase();
+      if (!index.has(norm)) index.set(norm, []);
+      index.get(norm).push({ slug, source: 'title', original: fm.title });
+    }
+
+    // Collect aliases (skip non-string entries defensively).
+    const aliases = Array.isArray(fm.aliases) ? fm.aliases : [];
+    for (const alias of aliases) {
+      if (typeof alias !== 'string') continue;
+      const norm = alias.trim().toLowerCase();
+      if (!index.has(norm)) index.set(norm, []);
+      index.get(norm).push({ slug, source: 'alias', original: alias });
+    }
+  }
+
+  const findings = [];
+  for (const [, claimants] of index) {
+    if (claimants.length < 2) continue;
+    // Each claimant gets a finding mentioning the others.
+    for (const claimant of claimants) {
+      const others = claimants.filter(c => c !== claimant);
+      const othersDesc = others
+        .map(c => `${c.slug} (as ${c.source})`)
+        .join(', ');
+      findings.push(finding(
+        'L10-alias-conflict', 'error', false,
+        claimant.slug, null,
+        `alias conflict on "${claimant.original}" — also claimed by ${othersDesc}`
+      ));
+    }
+  }
+  return findings;
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 // FIXERS
 // ─────────────────────────────────────────────────────────────────────────────
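To make the new check concrete, here is a minimal sketch of calling `checkL10` directly with two invented foundation entries; nothing below comes from a real wiki, and the expected count follows from the dedup map in the hunk above.

```js
// Hypothetical input: two foundations that both claim the normalized key "rlhf",
// one through an alias and one through its title.
const entries = [
  { wikiRelPath: 'foundations/rlhf.md',
    fm: { title: 'Reinforcement Learning from Human Feedback', aliases: ['RLHF'] } },
  { wikiRelPath: 'foundations/rl-from-feedback.md',
    fm: { title: 'RLHF', aliases: [] } },
];

const findings = checkL10(entries);
// Each claimant of the shared key gets its own error naming the other, so:
console.log(findings.length); // → 2 ('L10-alias-conflict', severity 'error')
```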
@@ -873,6 +924,19 @@ async function runLint(projectRoot, opts) {
     allFindings.push(...checkL09(indexPath, indexContent, entityFiles));
   }
 
+  // L10: collect all foundation frontmatters in one pass, then check for alias conflicts.
+  {
+    const foundationEntries = [];
+    for (const wikiRelPath of entityFiles) {
+      if (!wikiRelPath.startsWith('foundations/')) continue;
+      const abs = safejoin(wikiRoot, wikiRelPath);
+      const content = await readFile(abs, 'utf8');
+      const parsed = parseFrontmatter(content);
+      foundationEntries.push({ wikiRelPath, fm: parsed ? parsed.data : {} });
+    }
+    allFindings.push(...checkL10(foundationEntries));
+  }
+
   // Apply fixes if requested.
   if (opts.fix || opts.dryRun) {
     await applyFixes(allFindings, wikiRoot, edgesPath, indexPath, indexContent, entityFiles, allAbsMd, edges, edgeSet, opts);
@@ -1119,7 +1183,7 @@ export {
   isExempt,
   entityTypeForPath,
   checkL01, checkL02, checkL03, checkL04, checkL05,
-  checkL06, checkL07, checkL08, checkL09,
+  checkL06, checkL07, checkL08, checkL09, checkL10,
   fixL01, fixL03, fixL06, fixL07, fixL09,
   runLint,
   INDEX_MARKER_OPEN,
@@ -299,6 +299,7 @@ export const REQUIRED_FRONTMATTER = {
     { key: 'type', type: 'string', required: true, pack: 'research' },
     { key: 'created', type: 'iso-date', required: true, pack: 'research' },
     { key: 'updated', type: 'iso-date', required: true, pack: 'research' },
+    { key: 'aliases', type: 'array', required: false, pack: 'research' },
   ],
 
   // Research pack: topic page
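For context, a sketch of what an optional `array` entry implies for validation; the `validateField` helper below is hypothetical and not the package's actual checker:

```js
// Hypothetical validator sketch: an optional field passes when absent, and
// when present, its declared type ('array' here) must hold.
function validateField(spec, fm) {
  const value = fm[spec.key];
  if (value === undefined) return spec.required ? `${spec.key}: missing` : null;
  if (spec.type === 'array' && !Array.isArray(value)) return `${spec.key}: expected array`;
  return null; // valid
}

const spec = { key: 'aliases', type: 'array', required: false, pack: 'research' };
console.log(validateField(spec, {}));                    // → null (optional, absent)
console.log(validateField(spec, { aliases: 'RLHF' }));   // → "aliases: expected array"
console.log(validateField(spec, { aliases: ['RLHF'] })); // → null
```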
@@ -1247,6 +1247,7 @@ async function main(argv) {
   ' batch-edges <json-file> Apply array of edges from JSON file',
   ' dedup-edges Deduplicate edges.jsonl',
   ' list-entities [path-prefix] [--type <type>] List entity slugs as JSON',
+  ' resolve-alias <text> Map free-text query to a foundations/* slug',
   ' read-edges <slug>|--from <slug> [--type <type>] [--direction outbound|inbound|both]',
   ' read-citations <slug> Read all citations for a slug',
   ' verify-frontmatter <slug> Validate frontmatter fields',
@@ -1602,6 +1603,71 @@ async function main(argv) {
       break;
     }
 
+    // -----------------------------------------------------------------------
+    case 'resolve-alias': {
+      const text = positional.join(' ').trim();
+      if (!text) {
+        emitError('resolve-alias requires <text>', 2);
+        process.exit(2);
+      }
+      const projectRoot = await requireProjectRoot();
+      const allEntities = await listEntities(projectRoot);
+      const foundations = allEntities.filter(e => e.type === 'foundations');
+
+      const needle = text.toLowerCase();
+      const matches = [];
+
+      for (const entity of foundations) {
+        const content = await readFile(entity.filePath, 'utf8');
+        const { frontmatter } = parseFrontmatter(content);
+
+        // Build candidate set with priority: slug > title > alias
+        const slugNorm = entity.slug.toLowerCase().trim();
+        const titleNorm = typeof frontmatter.title === 'string'
+          ? frontmatter.title.toLowerCase().trim()
+          : null;
+
+        let matchSource = null;
+
+        if (slugNorm === needle) {
+          matchSource = 'slug';
+        } else if (titleNorm !== null && titleNorm === needle) {
+          matchSource = 'title';
+        } else {
+          // Check aliases defensively
+          const aliases = frontmatter.aliases;
+          if (Array.isArray(aliases)) {
+            for (const alias of aliases) {
+              if (typeof alias !== 'string') continue;
+              if (alias.toLowerCase().trim() === needle) {
+                matchSource = 'alias';
+                break;
+              }
+            }
+          }
+        }
+
+        if (matchSource !== null) {
+          matches.push({ slug: entity.slug, path: entity.path, source: matchSource });
+        }
+      }
+
+      if (matches.length === 0) {
+        emitError(`no match for query: ${text}`, 2);
+        process.exit(2);
+      }
+
+      // Sort ascending by slug for deterministic output
+      matches.sort((a, b) => a.slug < b.slug ? -1 : a.slug > b.slug ? 1 : 0);
+
+      emitJson({
+        query: text,
+        matches,
+        ambiguous: matches.length >= 2,
+      });
+      break;
+    }
+
     // -----------------------------------------------------------------------
     default: {
       emitError(`Unknown subcommand: ${subcommand}. Run node wiki.mjs --help for usage.`, 2);
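For reference, the output contract of the subcommand above, sketched with invented slugs; the exit codes and JSON shape follow directly from the code:

```js
// Hypothetical invocations; every slug below is made up for illustration.
//
// $ node _lumina/scripts/wiki.mjs resolve-alias "transformer"
// exit 0 → {"query":"transformer","matches":[{"slug":"transformer",
//           "path":"foundations/transformer","source":"slug"}],"ambiguous":false}
//
// $ node _lumina/scripts/wiki.mjs resolve-alias "attention"
// exit 0 → two or more matches sorted by slug, "ambiguous":true; the caller must disambiguate
//
// $ node _lumina/scripts/wiki.mjs resolve-alias "no-such-term"
// exit 2 → error emitted via emitError; nothing on stdout
```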
@@ -4,7 +4,6 @@ description: >
   Run lint.mjs --json, summarize findings by severity, offer to apply --fix for
   auto-fixable checks (L01/L03/L06/L07/L09), self-check re-run to confirm 0
   errors, and surface advisory warnings for user attention.
-  Single-model self-check only — no cross-model review.
   Use this whenever the user asks to "check the wiki", "run lint", "verify the
   graph", "are there broken links?", "what's wrong with the wiki?", "health
   check", or "are there missing reverse links?". Also fires for: weekly review
@@ -17,10 +16,16 @@ allowed-tools:
 
 # /lumi-check
 
+> If you were spawned in the same session that just ran `/lumi-ingest`, surface
+> a one-line note to the user suggesting they re-run this check in a fresh
+> session or via a subagent for an independent read — then proceed normally.
+> The same model with a blank context can still catch bias from the reasoning
+> chain that built the pages you are now reviewing.
+
 ## Role
 
 You are the wiki's quality gate. You run the linter, classify findings, apply
-safe fixes with a single-model self-check re-run, and surface the issues the user
+safe fixes with a self-check re-run, and surface the issues the user
 must resolve manually. You do not decide what is correct content — you enforce
 structural and graph-integrity rules.
@@ -128,6 +128,12 @@ Write checkpoint: `phase: "source-page"`.
 
 ### Phase 4 — Write concept and person stubs
 
+For every candidate concept name extracted in Phase 3, first run
+`node _lumina/scripts/wiki.mjs resolve-alias "<concept-name>"`. If it resolves to
+a foundation, link to that foundation via `[[foundations/<slug>]]` and add a
+`grounded_in` edge instead of creating a concept stub. See
+`references/dedup-policy.md` § Foundation Resolution for the full decision tree.
+
 Apply `references/dedup-policy.md` before creating or updating stubs. Existing
 concept/person pages are updated conservatively; new pages use the templates
 below.
@@ -250,6 +256,20 @@ Ask whether they want a minimal ingest (source page only, no stubs) or a full
 ingest. Proceed only with explicit direction. Log which phases were skipped.
 </example>
 
+<example>
+User: "/lumi-ingest raw/sources/rlhf-overview.pdf"
+
+Foundation resolution — concept name maps to an existing foundation:
+```bash
+node _lumina/scripts/wiki.mjs resolve-alias "RLHF"
+# → {"query":"RLHF","matches":[{"slug":"reinforcement-learning-from-human-feedback","path":"foundations/reinforcement-learning-from-human-feedback","source":"alias"}],"ambiguous":false}
+node _lumina/scripts/wiki.mjs add-edge sources/rlhf-overview grounded_in foundations/reinforcement-learning-from-human-feedback
+# (no concept stub created for "RLHF")
+```
+Link added to `## Concepts` in `wiki/sources/rlhf-overview.md`:
+`[[foundations/reinforcement-learning-from-human-feedback]]`
+</example>
+
 ## Guardrails
 
 - Never modify files in `raw/`. Read-only.
@@ -270,3 +290,9 @@ Before reporting done, verify:
 (c) Running `/lumi-ingest` again with the same file produces byte-identical `wiki/`
     output (all add-edge calls are no-ops; stubs have same content; index.md entry
     already present)
+
+## Next step
+
+Tell the user to run `/lumi-check` to validate the wiki state — ideally in a
+fresh session or via a subagent. The same model with a blank context can still
+catch bias from the reasoning chain that just built these pages.
@@ -15,6 +15,36 @@ If `wiki/sources/<slug>.md` already exists, treat the run as a re-ingest. Confirm
 with the user before overwriting body content. If the user confirms, keep stable
 frontmatter values when possible and only update fields supported by the source.
 
+## Foundation Resolution (Before Creating Concept Stubs)
+
+Before creating any concept stub, check whether the term already has a foundation
+page. This avoids duplicate concept pages when a foundation covers the same term
+under its canonical name.
+
+```bash
+node _lumina/scripts/wiki.mjs resolve-alias "<concept-name>"
+```
+
+Decision tree by exit code:
+
+- **exit 0, exactly 1 match (`ambiguous: false`)** — do NOT create a concept stub.
+  Link to `[[foundations/<match-slug>]]` in the source page's `## Concepts` section.
+  Add edge:
+  ```bash
+  node _lumina/scripts/wiki.mjs add-edge sources/<source-slug> grounded_in foundations/<match-slug>
+  ```
+  Note: `grounded_in` is terminal — no reverse edge will be written.
+
+- **exit 0, `ambiguous: true`** — present the candidate foundations to the user
+  with their slugs and ask which one applies. If none match the source's intended
+  meaning, fall back to creating a concept stub.
+
+- **exit 2 (no match)** — proceed with normal concept stub creation per the next
+  section.
+
+Run resolve-alias for every candidate concept name during Phase 4, before
+making any `add-edge concepts/<slug>` calls.
+
 ## Concept And Person Stubs
 
 Before creating a concept or person page, check metadata:
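Since the whole tree dispatches on one exit code plus the `ambiguous` flag, it collapses into a small helper. A minimal sketch assuming only the contract documented above; the function name and return shape are illustrative, not part of the toolkit:

```js
// Hypothetical orchestration of the Foundation Resolution decision tree.
import { execFileSync } from 'node:child_process';

function resolveConcept(name) {
  try {
    const out = execFileSync(
      'node', ['_lumina/scripts/wiki.mjs', 'resolve-alias', name],
      { encoding: 'utf8' },
    );
    const { matches, ambiguous } = JSON.parse(out);
    if (!ambiguous) return { action: 'link-foundation', slug: matches[0].slug };
    return { action: 'ask-user', candidates: matches.map((m) => m.slug) }; // ambiguous: true
  } catch (err) {
    if (err.status === 2) return { action: 'create-concept-stub' }; // exit 2: no match
    throw err; // anything else is an unexpected failure
  }
}
```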
@@ -32,7 +32,18 @@ References:
 ## Instructions
 
 1. Clarify the discovery query if the topic, domain, or source type is unclear.
-2. Check research tool setup:
+2. Build the exclude list from already-ingested sources. Run:
+
+   ```bash
+   node _lumina/scripts/wiki.mjs list-entities
+   ```
+
+   For each entity with `type: "sources"`, `Read` the `filePath` and extract any
+   arXiv ID or Semantic Scholar paperId from frontmatter or body URLs. Patterns
+   to scan: `arxiv.org/abs/<id>`, `arXiv:<id>`, `semanticscholar.org/paper/<id>`.
+   Pass the deduped list to `init_discovery.py --exclude-ids id1,id2,...`. If
+   no sources exist yet, skip this step (omit the flag).
+3. Check research tool setup:
 
    ```bash
    python3 _lumina/tools/init_discovery.py --help
   ```
@@ -43,15 +54,21 @@ python3 _lumina/tools/fetch_deepxiv.py --help
 python3 _lumina/tools/discover.py --help
 ```
 
-3. Pick one seed mode from `references/source-modes.md`: `topic`, `anchor`, or
+4. Pick one seed mode from `references/source-modes.md`: `topic`, `anchor`, or
    `from-wiki`. Use only the documented commands and flags.
-4. Deduplicate candidates against existing wiki/discovered/checkpoint state using
+5. Deduplicate candidates against existing wiki/discovered/checkpoint state using
    `references/ranking-signals.md`.
-5. Rank candidate JSON with `discover.py --topic "<topic>"`; preserve returned
+6. Rank candidate JSON with `discover.py --topic "<topic>"`; preserve returned
    `_score`, then add a human-readable rationale and risk note.
-6. Present a checkpointed shortlist with title, authors/year, URL or identifier,
+7. Apply purpose alignment. Read the `## Project Purpose` section in
+   `README.md`. For each shortlisted candidate, judge alignment with that
+   purpose (high / medium / low) and include the judgment in the rationale.
+   Move clearly off-purpose candidates to MAYBE or SKIP regardless of `_score`.
+   If the purpose section is empty or contains only the placeholder text, skip
+   this step and note "no project purpose set" in the response.
+8. Present a checkpointed shortlist with title, authors/year, URL or identifier,
    `_score`, rationale, duplicate status, and recommended next action.
-7. Ask the user which candidates should be ingested. Do not create source pages
+9. Ask the user which candidates should be ingested. Do not create source pages
    or graph edges in this skill.
 
 ## Constraints
@@ -59,14 +76,18 @@ python3 _lumina/tools/discover.py --help
 - Do not mutate `wiki/`.
 - Do not invent source metadata not returned by a fetcher or supplied by the user.
 - Do not invent tool flags. Use only `--topic`, `--project-root`, `--phases`,
-  `--resume`, `--fetchers`, and `--limit` for `init_discovery.py`.
-- Do not include any non-FR35 workflows such as ideation, LaTeX writing,
-  orchestrator mode, or cross-model debate.
+  `--resume`, `--fetchers`, `--limit`, and `--exclude-ids` for
+  `init_discovery.py`.
+- Do not include any non-FR35 workflows such as ideation, LaTeX writing, or
+  orchestrator mode.
 
 ## Definition of Done
 
 - Shortlist is deduped against wiki sources and discovered state.
 - Every shortlisted item includes `_score`, rationale, and risk/duplicate note.
+- Purpose alignment is reflected in each candidate's rationale (or the response
+  explicitly notes "no project purpose set" when the README purpose is empty
+  or placeholder).
 - Discovery checkpoints or an explicit resume decision are reflected in the
   response.
 - No `wiki/` files, index entries, graph edges, or log entries are written.
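Step 7's purpose lookup is mechanical enough to sketch. It assumes the heading is exactly `## Project Purpose` (the form renderReadme writes in this release); the helper itself is hypothetical:

```js
// Hypothetical helper for step 7: read the purpose section, or return null so
// the skill can note "no project purpose set" instead.
import { readFileSync } from 'node:fs';

function readProjectPurpose(readmePath = 'README.md') {
  const lines = readFileSync(readmePath, 'utf8').split('\n');
  const start = lines.findIndex((l) => l.trim() === '## Project Purpose');
  if (start === -1) return null;
  const body = [];
  for (const line of lines.slice(start + 1)) {
    if (line.startsWith('## ') || line.startsWith('<!--')) break; // next section or marker
    body.push(line);
  }
  const purpose = body.join('\n').trim();
  // Placeholder detection is left to the skill; empty means no purpose set.
  return purpose.length > 0 ? purpose : null;
}
```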
@@ -35,10 +35,61 @@ node _lumina/scripts/wiki.mjs read-meta foundations/<slug>
 node _lumina/scripts/wiki.mjs slug "<topic title>"
 ```
 
-2. Check whether `wiki/foundations/<slug>.md` already exists with `read-meta`.
-3. Fetch or use user-provided background material.
+2. Check whether `wiki/foundations/<slug>.md` already exists:
+
+   ```bash
+   node _lumina/scripts/wiki.mjs read-meta foundations/<slug>
+   ```
+
+   - **exit 2 (not found)** — continue to step 3.
+   - **exit 0 (exists)** — read the file, then show the user the `title`, `created`
+     date, and the most recent line in `wiki/log.md` that references this slug.
+     Ask exactly one question with three options (no other action until answered):
+
+     ```
+     Foundation "<title>" already exists.
+     [s] skip — abort, no changes (default)
+     [r] refresh — re-fetch from Wikipedia, update non-marked sections and `updated`; preserve `created`, `aliases`, and `<!-- user-edited -->` sections
+     [a] abort — same as skip but log the user's intent
+     ```
+
+     Do not proceed without an explicit choice. Map blank/Enter to `skip`.
+
+3. Fetch or handle background material based on the exit code from the Wikipedia fetcher:
+
+   ```bash
+   python3 _lumina/tools/fetch_wikipedia.py page "<title>"
+   ```
+
+   - **exit 0** — use the JSON output directly.
+   - **exit 2 AND stderr is JSON with `kind == "disambiguation"`** — run a search
+     to surface candidates:
+
+     ```bash
+     python3 _lumina/tools/fetch_wikipedia.py search "<title>" --limit 5
+     ```
+
+     Present the numbered results (title + snippet) and let the user:
+     - pick a candidate number,
+     - type a more specific title, or
+     - type `manual` to paste content directly.
+
+     Re-run `page` with the chosen title, or accept the user-pasted content.
+
+   - **exit 2 for any other reason** (empty title, page not found) — surface the
+     `error` field from stderr JSON and abort.
+   - **exit 3 (network error)** — tell the user and offer two options: retry, or
+     paste content manually.
 4. Write `wiki/foundations/<slug>.md` with valid foundation frontmatter:
    `id`, `title`, `type: foundation`, `created`, `updated`.
+
+   Also include the optional `aliases` field — an array of strings listing
+   alternative names users or sources might write for this concept (abbreviations,
+   expansions, common misspellings). Example: for a foundation titled "Reinforcement
+   Learning from Human Feedback", use `aliases: ["RLHF", "human feedback RL"]`.
+   Propose a list of 2–5 plausible aliases, then ask the user to confirm or edit
+   before writing. An empty array `[]` is fine if nothing obvious applies. Aliases
+   must be unique across all foundations — `lint.mjs` L10 will error on collisions.
 5. Keep the body concise: definition, scope notes, and external references.
 6. Log the addition:
@@ -58,6 +109,12 @@ node _lumina/scripts/lint.mjs --fix --json
   knowledge extracted from project sources.
 - Do not store secrets or API keys in foundation pages.
 - Do not add reverse graph edges for foundations.
+- When refreshing an existing foundation, preserve the original `created` date and
+  any `<!-- user-edited -->` sections verbatim. Only `updated` and non-marked
+  sections may change.
+- Aliases must be unique across all foundations. If `lint.mjs --fix --json` reports
+  `L10-alias-conflict`, resolve manually before completing the run — there is no
+  automatic fix.
 
 ## Definition of Done
 
@@ -65,3 +122,5 @@ node _lumina/scripts/lint.mjs --fix --json
 - `node _lumina/scripts/lint.mjs --fix --json` has updated `wiki/index.md` if
   needed and leaves `summary.errors === 0`.
 - `wiki/log.md` has an append-only `lumi-research-prefill` entry.
+- If the page already existed, the user's choice (skip / refresh / abort) is logged
+  in `wiki/log.md` with the actual decision taken.
@@ -133,6 +133,7 @@ title: "Foundation concept"
 slug: foundation-slug
 date_added: YYYY-MM-DD
 tags: []
+aliases: []
 ---
 ```
@@ -218,7 +218,16 @@ def main(argv: list[str] | None = None) -> None:
         sys.exit(0)
 
     except ValueError as exc:
-        _err(f"Error: {exc}")
+        msg = str(exc)
+        if "disambiguation" in msg:
+            err_obj: dict[str, str] = {
+                "error": msg,
+                "kind": "disambiguation",
+                "hint": "Use the search subcommand to enumerate candidates.",
+            }
+        else:
+            err_obj = {"error": msg}
+        print(json.dumps(err_obj, ensure_ascii=False), file=sys.stderr)
         sys.exit(2)
     except requests.exceptions.ConnectionError as exc:
         _err(f"Network error: {exc}")
@@ -229,6 +229,7 @@ def phase1_keyword_search(
     fetchers: list[str],
     limit: int,
     env: dict[str, str],
+    exclude_ids: set[str] = set(),
 ) -> list[dict[str, Any]]:
     """Phase 1: keyword search across configured fetchers."""
     results: list[dict[str, Any]] = []
@@ -249,7 +250,7 @@
 
         for paper in papers:
             pid = paper.get("id") or paper.get("paperId") or ""
-            if pid and pid in seen_ids:
+            if pid and (pid in seen_ids or pid in exclude_ids):
                 continue
             if pid:
                 seen_ids.add(pid)
@@ -265,6 +266,7 @@ def phase2_author_backfill(
     discovered_dir: Path,
     limit: int,
     env: dict[str, str],
+    exclude_ids: set[str] = set(),
 ) -> list[dict[str, Any]]:
     """Phase 2: fetch more papers by the most prolific authors from phase 1."""
     # Count author occurrences across phase-1 results
@@ -290,7 +292,7 @@
             continue
         for paper in papers:
             pid = paper.get("id") or paper.get("paperId") or ""
-            if pid and pid in seen_ids:
+            if pid and (pid in seen_ids or pid in exclude_ids):
                 continue
             if pid:
                 seen_ids.add(pid)
@@ -305,6 +307,7 @@ def phase3_citation_expansion(
     slug: str,
     discovered_dir: Path,
     env: dict[str, str],
+    exclude_ids: set[str] = set(),
 ) -> list[dict[str, Any]]:
     """Phase 3: fetch citations of top phase-1 papers."""
     # Sort by citation count to pick the most-cited seeds
@@ -330,7 +333,7 @@
             continue
         for paper in citations:
             cid = paper.get("id") or paper.get("paperId") or ""
-            if cid and cid in seen_ids:
+            if cid and (cid in seen_ids or cid in exclude_ids):
                 continue
             if cid:
                 seen_ids.add(cid)
@@ -364,6 +367,11 @@ def main(argv: list[str] | None = None) -> None:
                         help=f"Comma-separated fetchers (default: {DEFAULT_FETCHERS}).")
     parser.add_argument("--limit", type=int, default=DEFAULT_LIMIT,
                         help=f"Max results per fetcher per phase (default: {DEFAULT_LIMIT}).")
+    parser.add_argument(
+        "--exclude-ids", default="",
+        help="Comma-separated list of paper IDs (arXiv IDs or S2 paperIds) to "
+             "skip. Use to exclude papers already ingested into wiki/sources/.",
+    )
 
     args = parser.parse_args(argv)
 
@@ -382,6 +390,9 @@
         sys.exit(2)
 
     fetchers = [f.strip() for f in args.fetchers.split(",") if f.strip()]
+    exclude_ids: set[str] = {
+        s.strip() for s in args.exclude_ids.split(",") if s.strip()
+    }
     slug = _slugify(args.topic)
     env = load_env(project_root)
@@ -406,16 +417,20 @@ def main(argv: list[str] | None = None) -> None:
                     summary["phases"]["1"] = {"status": "resumed", "count": len(phase1_results)}
                     _err(f"Phase 1 resumed from checkpoint ({len(phase1_results)} results).")
                 else:
+                    _err(f"Phase 1: keyword search across {fetchers} (limit={args.limit})...")
                     phase1_results = phase1_keyword_search(
-                        args.topic, slug, discovered_dir, fetchers, args.limit, env
+                        args.topic, slug, discovered_dir, fetchers, args.limit, env, exclude_ids
                     )
                     _save_checkpoint(state_dir, 1, {"results": phase1_results, "slug": slug})
+                    _err(f"Phase 1 complete: {len(phase1_results)} unique candidates.")
                     summary["phases"]["1"] = {"status": "complete", "count": len(phase1_results)}
             else:
+                _err(f"Phase 1: keyword search across {fetchers} (limit={args.limit})...")
                 phase1_results = phase1_keyword_search(
-                    args.topic, slug, discovered_dir, fetchers, args.limit, env
+                    args.topic, slug, discovered_dir, fetchers, args.limit, env, exclude_ids
                 )
                 _save_checkpoint(state_dir, 1, {"results": phase1_results, "slug": slug})
+                _err(f"Phase 1 complete: {len(phase1_results)} unique candidates.")
                 summary["phases"]["1"] = {"status": "complete", "count": len(phase1_results)}
         else:
             # Load from checkpoint if phase 1 was run previously
@@ -432,16 +447,20 @@ def main(argv: list[str] | None = None) -> None:
                     summary["phases"]["2"] = {"status": "resumed", "count": len(phase2_results)}
                     _err(f"Phase 2 resumed from checkpoint ({len(phase2_results)} results).")
                 else:
+                    _err(f"Phase 2: author backfill (top {MAX_AUTHORS_BACKFILL} authors, limit={args.limit})...")
                     phase2_results = phase2_author_backfill(
-                        phase1_results, slug, discovered_dir, args.limit, env
+                        phase1_results, slug, discovered_dir, args.limit, env, exclude_ids
                     )
                     _save_checkpoint(state_dir, 2, {"results": phase2_results, "slug": slug})
+                    _err(f"Phase 2 complete: {len(phase2_results)} unique candidates.")
                     summary["phases"]["2"] = {"status": "complete", "count": len(phase2_results)}
             else:
+                _err(f"Phase 2: author backfill (top {MAX_AUTHORS_BACKFILL} authors, limit={args.limit})...")
                 phase2_results = phase2_author_backfill(
-                    phase1_results, slug, discovered_dir, args.limit, env
+                    phase1_results, slug, discovered_dir, args.limit, env, exclude_ids
                 )
                 _save_checkpoint(state_dir, 2, {"results": phase2_results, "slug": slug})
+                _err(f"Phase 2 complete: {len(phase2_results)} unique candidates.")
                 summary["phases"]["2"] = {"status": "complete", "count": len(phase2_results)}
 
         # --- Phase 3 ---
@@ -453,16 +472,20 @@ def main(argv: list[str] | None = None) -> None:
                     summary["phases"]["3"] = {"status": "resumed", "count": len(phase3_results)}
                     _err(f"Phase 3 resumed from checkpoint ({len(phase3_results)} results).")
                 else:
+                    _err(f"Phase 3: citation expansion (top 5 seeds × {CITATIONS_PER_SEED} citations)...")
                     phase3_results = phase3_citation_expansion(
-                        phase1_results, slug, discovered_dir, env
+                        phase1_results, slug, discovered_dir, env, exclude_ids
                     )
                     _save_checkpoint(state_dir, 3, {"results": phase3_results, "slug": slug})
+                    _err(f"Phase 3 complete: {len(phase3_results)} unique candidates.")
                     summary["phases"]["3"] = {"status": "complete", "count": len(phase3_results)}
             else:
+                _err(f"Phase 3: citation expansion (top 5 seeds × {CITATIONS_PER_SEED} citations)...")
                 phase3_results = phase3_citation_expansion(
-                    phase1_results, slug, discovered_dir, env
+                    phase1_results, slug, discovered_dir, env, exclude_ids
                 )
                 _save_checkpoint(state_dir, 3, {"results": phase3_results, "slug": slug})
+                _err(f"Phase 3 complete: {len(phase3_results)} unique candidates.")
                 summary["phases"]["3"] = {"status": "complete", "count": len(phase3_results)}
 
     except ValueError as exc: