lumina-wiki 0.4.0 → 0.5.0

package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "$schema": "https://json.schemastore.org/package.json",
   "name": "lumina-wiki",
-  "version": "0.4.0",
+  "version": "0.5.0",
   "description": "Domain-agnostic, multi-IDE wiki scaffolder — Karpathy's LLM-Wiki vision, cross-platform and pack-based.",
   "keywords": [
     "llm-wiki",
@@ -124,6 +124,8 @@ export function renderReadme(template, variables, purpose = '') {
   return [
     titleLine,
     '',
+    '## Project Purpose',
+    '',
     purposeText,
     '',
     '<!-- lumina:schema -->',
@@ -140,7 +142,7 @@ export function renderReadme(template, variables, purpose = '') {
   // Find end of title block (first non-empty, non-H1 line before marker)
   let insertIdx = schemaMarkerIdx;
   // Insert purpose region before schema marker
-  const purposeLines = ['', purposeText, ''];
+  const purposeLines = ['', '## Project Purpose', '', purposeText, ''];
   lines.splice(insertIdx, 0, ...purposeLines);
 
   return lines.join('\n');
@@ -1,6 +1,6 @@
 /**
  * @module lint
- * @description LuminaWiki v0.1 wiki linter — 9 schema checks, optional --fix.
+ * @description LuminaWiki v0.1 wiki linter — 10 schema checks, optional --fix.
  *
  * CLI usage:
  *   node lint.mjs [path] [--fix] [--dry-run] [--suggest] [--json]
@@ -62,7 +62,7 @@ const INDEX_MARKER_OPEN = '<!-- lumina:index -->';
 const INDEX_MARKER_CLOSE = '<!-- /lumina:index -->';
 
 /** All check IDs in run order. */
-const ALL_CHECK_IDS = ['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 'L07', 'L08', 'L09'];
+const ALL_CHECK_IDS = ['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 'L07', 'L08', 'L09', 'L10'];
 
 /** Kebab-case pattern: lowercase letters, digits, hyphens; no leading/trailing hyphen. */
 const KEBAB_RE = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
@@ -611,6 +611,57 @@ function checkL09(indexPath, indexContent, entityFiles) {
   return [];
 }
 
+/**
+ * L10: Two foundations share an alias, or a foundation's alias collides with
+ * another foundation's title. Foundations-only; no --fix mode.
+ *
+ * @param {Array<{wikiRelPath: string, fm: Record<string,unknown>}>} foundationEntries
+ *   Each entry has the wiki-relative path and parsed frontmatter of one foundation file.
+ * @returns {Finding[]}
+ */
+function checkL10(foundationEntries) {
+  /** @type {Map<string, Array<{slug: string, source: 'title'|'alias', original: string}>>} */
+  const index = new Map();
+
+  for (const { wikiRelPath, fm } of foundationEntries) {
+    const slug = wikiRelPath; // e.g. "foundations/transformer.md"
+
+    // Collect title.
+    if (typeof fm.title === 'string') {
+      const norm = fm.title.trim().toLowerCase();
+      if (!index.has(norm)) index.set(norm, []);
+      index.get(norm).push({ slug, source: 'title', original: fm.title });
+    }
+
+    // Collect aliases (skip non-string entries defensively).
+    const aliases = Array.isArray(fm.aliases) ? fm.aliases : [];
+    for (const alias of aliases) {
+      if (typeof alias !== 'string') continue;
+      const norm = alias.trim().toLowerCase();
+      if (!index.has(norm)) index.set(norm, []);
+      index.get(norm).push({ slug, source: 'alias', original: alias });
+    }
+  }
+
+  const findings = [];
+  for (const [, claimants] of index) {
+    if (claimants.length < 2) continue;
+    // Each claimant gets a finding mentioning the others.
+    for (const claimant of claimants) {
+      const others = claimants.filter(c => c !== claimant);
+      const othersDesc = others
+        .map(c => `${c.slug} (as ${c.source})`)
+        .join(', ');
+      findings.push(finding(
+        'L10-alias-conflict', 'error', false,
+        claimant.slug, null,
+        `alias conflict on "${claimant.original}" — also claimed by ${othersDesc}`
+      ));
+    }
+  }
+  return findings;
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 // FIXERS
 // ─────────────────────────────────────────────────────────────────────────────
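To make the new check concrete, here is a minimal sketch of calling `checkL10` directly with two invented foundation entries; nothing below comes from a real wiki, and the expected count follows from the dedup map in the hunk above.

```js
// Hypothetical input: two foundations that both claim the normalized key "rlhf",
// one through an alias and one through its title.
const entries = [
  { wikiRelPath: 'foundations/rlhf.md',
    fm: { title: 'Reinforcement Learning from Human Feedback', aliases: ['RLHF'] } },
  { wikiRelPath: 'foundations/rl-from-feedback.md',
    fm: { title: 'RLHF', aliases: [] } },
];

const findings = checkL10(entries);
// Each claimant of the shared key gets its own error naming the other, so:
console.log(findings.length); // → 2 ('L10-alias-conflict', severity 'error')
```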
@@ -873,6 +924,19 @@ async function runLint(projectRoot, opts) {
     allFindings.push(...checkL09(indexPath, indexContent, entityFiles));
   }
 
+  // L10: collect all foundation frontmatters in one pass, then check for alias conflicts.
+  {
+    const foundationEntries = [];
+    for (const wikiRelPath of entityFiles) {
+      if (!wikiRelPath.startsWith('foundations/')) continue;
+      const abs = safejoin(wikiRoot, wikiRelPath);
+      const content = await readFile(abs, 'utf8');
+      const parsed = parseFrontmatter(content);
+      foundationEntries.push({ wikiRelPath, fm: parsed ? parsed.data : {} });
+    }
+    allFindings.push(...checkL10(foundationEntries));
+  }
+
   // Apply fixes if requested.
   if (opts.fix || opts.dryRun) {
     await applyFixes(allFindings, wikiRoot, edgesPath, indexPath, indexContent, entityFiles, allAbsMd, edges, edgeSet, opts);
@@ -1119,7 +1183,7 @@ export {
   isExempt,
   entityTypeForPath,
   checkL01, checkL02, checkL03, checkL04, checkL05,
-  checkL06, checkL07, checkL08, checkL09,
+  checkL06, checkL07, checkL08, checkL09, checkL10,
   fixL01, fixL03, fixL06, fixL07, fixL09,
   runLint,
   INDEX_MARKER_OPEN,
@@ -299,6 +299,7 @@ export const REQUIRED_FRONTMATTER = {
     { key: 'type', type: 'string', required: true, pack: 'research' },
     { key: 'created', type: 'iso-date', required: true, pack: 'research' },
     { key: 'updated', type: 'iso-date', required: true, pack: 'research' },
+    { key: 'aliases', type: 'array', required: false, pack: 'research' },
   ],
 
   // Research pack: topic page
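For context, a sketch of what an optional `array` entry implies for validation; the `validateField` helper below is hypothetical and not the package's actual checker:

```js
// Hypothetical validator sketch: an optional field passes when absent, and
// when present, its declared type ('array' here) must hold.
function validateField(spec, fm) {
  const value = fm[spec.key];
  if (value === undefined) return spec.required ? `${spec.key}: missing` : null;
  if (spec.type === 'array' && !Array.isArray(value)) return `${spec.key}: expected array`;
  return null; // valid
}

const spec = { key: 'aliases', type: 'array', required: false, pack: 'research' };
console.log(validateField(spec, {}));                    // → null (optional, absent)
console.log(validateField(spec, { aliases: 'RLHF' }));   // → "aliases: expected array"
console.log(validateField(spec, { aliases: ['RLHF'] })); // → null
```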
@@ -1247,6 +1247,7 @@ async function main(argv) {
   ' batch-edges <json-file> Apply array of edges from JSON file',
   ' dedup-edges Deduplicate edges.jsonl',
   ' list-entities [path-prefix] [--type <type>] List entity slugs as JSON',
+  ' resolve-alias <text> Map free-text query to a foundations/* slug',
   ' read-edges <slug>|--from <slug> [--type <type>] [--direction outbound|inbound|both]',
   ' read-citations <slug> Read all citations for a slug',
   ' verify-frontmatter <slug> Validate frontmatter fields',
@@ -1602,6 +1603,71 @@ async function main(argv) {
       break;
     }
 
+    // -----------------------------------------------------------------------
+    case 'resolve-alias': {
+      const text = positional.join(' ').trim();
+      if (!text) {
+        emitError('resolve-alias requires <text>', 2);
+        process.exit(2);
+      }
+      const projectRoot = await requireProjectRoot();
+      const allEntities = await listEntities(projectRoot);
+      const foundations = allEntities.filter(e => e.type === 'foundations');
+
+      const needle = text.toLowerCase();
+      const matches = [];
+
+      for (const entity of foundations) {
+        const content = await readFile(entity.filePath, 'utf8');
+        const { frontmatter } = parseFrontmatter(content);
+
+        // Build candidate set with priority: slug > title > alias
+        const slugNorm = entity.slug.toLowerCase().trim();
+        const titleNorm = typeof frontmatter.title === 'string'
+          ? frontmatter.title.toLowerCase().trim()
+          : null;
+
+        let matchSource = null;
+
+        if (slugNorm === needle) {
+          matchSource = 'slug';
+        } else if (titleNorm !== null && titleNorm === needle) {
+          matchSource = 'title';
+        } else {
+          // Check aliases defensively
+          const aliases = frontmatter.aliases;
+          if (Array.isArray(aliases)) {
+            for (const alias of aliases) {
+              if (typeof alias !== 'string') continue;
+              if (alias.toLowerCase().trim() === needle) {
+                matchSource = 'alias';
+                break;
+              }
+            }
+          }
+        }
+
+        if (matchSource !== null) {
+          matches.push({ slug: entity.slug, path: entity.path, source: matchSource });
+        }
+      }
+
+      if (matches.length === 0) {
+        emitError(`no match for query: ${text}`, 2);
+        process.exit(2);
+      }
+
+      // Sort ascending by slug for deterministic output
+      matches.sort((a, b) => a.slug < b.slug ? -1 : a.slug > b.slug ? 1 : 0);
+
+      emitJson({
+        query: text,
+        matches,
+        ambiguous: matches.length >= 2,
+      });
+      break;
+    }
+
     // -----------------------------------------------------------------------
     default: {
       emitError(`Unknown subcommand: ${subcommand}. Run node wiki.mjs --help for usage.`, 2);
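For reference, the output contract of the subcommand above, sketched with invented slugs; the exit codes and JSON shape follow directly from the code:

```js
// Hypothetical invocations; every slug below is made up for illustration.
//
// $ node _lumina/scripts/wiki.mjs resolve-alias "transformer"
// exit 0 → {"query":"transformer","matches":[{"slug":"transformer",
//           "path":"foundations/transformer","source":"slug"}],"ambiguous":false}
//
// $ node _lumina/scripts/wiki.mjs resolve-alias "attention"
// exit 0 → two or more matches sorted by slug, "ambiguous":true; the caller must disambiguate
//
// $ node _lumina/scripts/wiki.mjs resolve-alias "no-such-term"
// exit 2 → error emitted via emitError; nothing on stdout
```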
@@ -4,7 +4,6 @@ description: >
   Run lint.mjs --json, summarize findings by severity, offer to apply --fix for
   auto-fixable checks (L01/L03/L06/L07/L09), self-check re-run to confirm 0
   errors, and surface advisory warnings for user attention.
-  Single-model self-check only — no cross-model review.
   Use this whenever the user asks to "check the wiki", "run lint", "verify the
   graph", "are there broken links?", "what's wrong with the wiki?", "health
   check", or "are there missing reverse links?". Also fires for: weekly review
@@ -17,10 +16,16 @@ allowed-tools:
 
 # /lumi-check
 
+> If you were spawned in the same session that just ran `/lumi-ingest`, surface
+> a one-line note to the user suggesting they re-run this check in a fresh
+> session or via a subagent for an independent read — then proceed normally.
+> The same model with a blank context can still catch bias from the reasoning
+> chain that built the pages you are now reviewing.
+
 ## Role
 
 You are the wiki's quality gate. You run the linter, classify findings, apply
-safe fixes with a single-model self-check re-run, and surface the issues the user
+safe fixes with a self-check re-run, and surface the issues the user
 must resolve manually. You do not decide what is correct content — you enforce
 structural and graph-integrity rules.
@@ -128,6 +128,12 @@ Write checkpoint: `phase: "source-page"`.
 
 ### Phase 4 — Write concept and person stubs
 
+For every candidate concept name extracted in Phase 3, first run
+`node _lumina/scripts/wiki.mjs resolve-alias "<concept-name>"`. If it resolves to
+a foundation, link to that foundation via `[[foundations/<slug>]]` and add a
+`grounded_in` edge instead of creating a concept stub. See
+`references/dedup-policy.md` § Foundation Resolution for the full decision tree.
+
 Apply `references/dedup-policy.md` before creating or updating stubs. Existing
 concept/person pages are updated conservatively; new pages use the templates
 below.
@@ -250,6 +256,20 @@ Ask whether they want a minimal ingest (source page only, no stubs) or a full
 ingest. Proceed only with explicit direction. Log which phases were skipped.
 </example>
 
+<example>
+User: "/lumi-ingest raw/sources/rlhf-overview.pdf"
+
+Foundation resolution — concept name maps to an existing foundation:
+```bash
+node _lumina/scripts/wiki.mjs resolve-alias "RLHF"
+# → {"query":"RLHF","matches":[{"slug":"reinforcement-learning-from-human-feedback","path":"foundations/reinforcement-learning-from-human-feedback","source":"alias"}],"ambiguous":false}
+node _lumina/scripts/wiki.mjs add-edge sources/rlhf-overview grounded_in foundations/reinforcement-learning-from-human-feedback
+# (no concept stub created for "RLHF")
+```
+Link added to `## Concepts` in `wiki/sources/rlhf-overview.md`:
+`[[foundations/reinforcement-learning-from-human-feedback]]`
+</example>
+
 ## Guardrails
 
 - Never modify files in `raw/`. Read-only.
@@ -270,3 +290,9 @@ Before reporting done, verify:
 (c) Running `/lumi-ingest` again with the same file produces byte-identical `wiki/`
     output (all add-edge calls are no-ops; stubs have same content; index.md entry
     already present)
+
+## Next step
+
+Tell the user to run `/lumi-check` to validate the wiki state — ideally in a
+fresh session or via a subagent. The same model with a blank context can still
+catch bias from the reasoning chain that just built these pages.
@@ -15,6 +15,36 @@ If `wiki/sources/<slug>.md` already exists, treat the run as a re-ingest. Confirm
 with the user before overwriting body content. If the user confirms, keep stable
 frontmatter values when possible and only update fields supported by the source.
 
+## Foundation Resolution (Before Creating Concept Stubs)
+
+Before creating any concept stub, check whether the term already has a foundation
+page. This avoids duplicate concept pages when a foundation covers the same term
+under its canonical name.
+
+```bash
+node _lumina/scripts/wiki.mjs resolve-alias "<concept-name>"
+```
+
+Decision tree by exit code:
+
+- **exit 0, exactly 1 match (`ambiguous: false`)** — do NOT create a concept stub.
+  Link to `[[foundations/<match-slug>]]` in the source page's `## Concepts` section.
+  Add edge:
+  ```bash
+  node _lumina/scripts/wiki.mjs add-edge sources/<source-slug> grounded_in foundations/<match-slug>
+  ```
+  Note: `grounded_in` is terminal — no reverse edge will be written.
+
+- **exit 0, `ambiguous: true`** — present the candidate foundations to the user
+  with their slugs and ask which one applies. If none match the source's intended
+  meaning, fall back to creating a concept stub.
+
+- **exit 2 (no match)** — proceed with normal concept stub creation per the next
+  section.
+
+Run resolve-alias for every candidate concept name during Phase 4, before
+making any `add-edge concepts/<slug>` calls.
+
 ## Concept And Person Stubs
 
 Before creating a concept or person page, check metadata:
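Since the whole tree dispatches on one exit code plus the `ambiguous` flag, it collapses into a small helper. A minimal sketch assuming only the contract documented above; the function name and return shape are illustrative, not part of the toolkit:

```js
// Hypothetical orchestration of the Foundation Resolution decision tree.
import { execFileSync } from 'node:child_process';

function resolveConcept(name) {
  try {
    const out = execFileSync(
      'node', ['_lumina/scripts/wiki.mjs', 'resolve-alias', name],
      { encoding: 'utf8' },
    );
    const { matches, ambiguous } = JSON.parse(out);
    if (!ambiguous) return { action: 'link-foundation', slug: matches[0].slug };
    return { action: 'ask-user', candidates: matches.map((m) => m.slug) }; // ambiguous: true
  } catch (err) {
    if (err.status === 2) return { action: 'create-concept-stub' }; // exit 2: no match
    throw err; // anything else is an unexpected failure
  }
}
```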
@@ -32,7 +32,18 @@ References:
 ## Instructions
 
 1. Clarify the discovery query if the topic, domain, or source type is unclear.
-2. Check research tool setup:
+2. Build the exclude list from already-ingested sources. Run:
+
+   ```bash
+   node _lumina/scripts/wiki.mjs list-entities
+   ```
+
+   For each entity with `type: "sources"`, `Read` the `filePath` and extract any
+   arXiv ID or Semantic Scholar paperId from frontmatter or body URLs. Patterns
+   to scan: `arxiv.org/abs/<id>`, `arXiv:<id>`, `semanticscholar.org/paper/<id>`.
+   Pass the deduped list to `init_discovery.py --exclude-ids id1,id2,...`. If
+   no sources exist yet, skip this step (omit the flag).
+3. Check research tool setup:
 
    ```bash
    python3 _lumina/tools/init_discovery.py --help
   ```
@@ -43,15 +54,21 @@ python3 _lumina/tools/fetch_deepxiv.py --help
 python3 _lumina/tools/discover.py --help
 ```
 
-3. Pick one seed mode from `references/source-modes.md`: `topic`, `anchor`, or
+4. Pick one seed mode from `references/source-modes.md`: `topic`, `anchor`, or
    `from-wiki`. Use only the documented commands and flags.
-4. Deduplicate candidates against existing wiki/discovered/checkpoint state using
+5. Deduplicate candidates against existing wiki/discovered/checkpoint state using
    `references/ranking-signals.md`.
-5. Rank candidate JSON with `discover.py --topic "<topic>"`; preserve returned
+6. Rank candidate JSON with `discover.py --topic "<topic>"`; preserve returned
    `_score`, then add a human-readable rationale and risk note.
-6. Present a checkpointed shortlist with title, authors/year, URL or identifier,
+7. Apply purpose alignment. Read the `## Project Purpose` section in
+   `README.md`. For each shortlisted candidate, judge alignment with that
+   purpose (high / medium / low) and include the judgment in the rationale.
+   Move clearly off-purpose candidates to MAYBE or SKIP regardless of `_score`.
+   If the purpose section is empty or contains only the placeholder text, skip
+   this step and note "no project purpose set" in the response.
+8. Present a checkpointed shortlist with title, authors/year, URL or identifier,
    `_score`, rationale, duplicate status, and recommended next action.
-7. Ask the user which candidates should be ingested. Do not create source pages
+9. Ask the user which candidates should be ingested. Do not create source pages
    or graph edges in this skill.
 
 ## Constraints
@@ -59,14 +76,18 @@ python3 _lumina/tools/discover.py --help
 - Do not mutate `wiki/`.
 - Do not invent source metadata not returned by a fetcher or supplied by the user.
 - Do not invent tool flags. Use only `--topic`, `--project-root`, `--phases`,
-  `--resume`, `--fetchers`, and `--limit` for `init_discovery.py`.
-- Do not include any non-FR35 workflows such as ideation, LaTeX writing,
-  orchestrator mode, or cross-model debate.
+  `--resume`, `--fetchers`, `--limit`, and `--exclude-ids` for
+  `init_discovery.py`.
+- Do not include any non-FR35 workflows such as ideation, LaTeX writing, or
+  orchestrator mode.
 
 ## Definition of Done
 
 - Shortlist is deduped against wiki sources and discovered state.
 - Every shortlisted item includes `_score`, rationale, and risk/duplicate note.
+- Purpose alignment is reflected in each candidate's rationale (or the response
+  explicitly notes "no project purpose set" when the README purpose is empty
+  or placeholder).
 - Discovery checkpoints or an explicit resume decision are reflected in the
   response.
 - No `wiki/` files, index entries, graph edges, or log entries are written.
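Step 7's purpose lookup is mechanical enough to sketch. It assumes the heading is exactly `## Project Purpose` (the form renderReadme writes in this release); the helper itself is hypothetical:

```js
// Hypothetical helper for step 7: read the purpose section, or return null so
// the skill can note "no project purpose set" instead.
import { readFileSync } from 'node:fs';

function readProjectPurpose(readmePath = 'README.md') {
  const lines = readFileSync(readmePath, 'utf8').split('\n');
  const start = lines.findIndex((l) => l.trim() === '## Project Purpose');
  if (start === -1) return null;
  const body = [];
  for (const line of lines.slice(start + 1)) {
    if (line.startsWith('## ') || line.startsWith('<!--')) break; // next section or marker
    body.push(line);
  }
  const purpose = body.join('\n').trim();
  // Placeholder detection is left to the skill; empty means no purpose set.
  return purpose.length > 0 ? purpose : null;
}
```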
@@ -35,10 +35,61 @@ node _lumina/scripts/wiki.mjs read-meta foundations/<slug>
 node _lumina/scripts/wiki.mjs slug "<topic title>"
 ```
 
-2. Check whether `wiki/foundations/<slug>.md` already exists with `read-meta`.
-3. Fetch or use user-provided background material.
+2. Check whether `wiki/foundations/<slug>.md` already exists:
+
+   ```bash
+   node _lumina/scripts/wiki.mjs read-meta foundations/<slug>
+   ```
+
+   - **exit 2 (not found)** — continue to step 3.
+   - **exit 0 (exists)** — read the file, then show the user the `title`, `created`
+     date, and the most recent line in `wiki/log.md` that references this slug.
+     Ask exactly one question with three options (no other action until answered):
+
+     ```
+     Foundation "<title>" already exists.
+     [s] skip — abort, no changes (default)
+     [r] refresh — re-fetch from Wikipedia, update non-marked sections and `updated`; preserve `created`, `aliases`, and `<!-- user-edited -->` sections
+     [a] abort — same as skip but log the user's intent
+     ```
+
+     Do not proceed without an explicit choice. Map blank/Enter to `skip`.
+
+3. Fetch or handle background material based on the exit code from the Wikipedia fetcher:
+
+   ```bash
+   python3 _lumina/tools/fetch_wikipedia.py page "<title>"
+   ```
+
+   - **exit 0** — use the JSON output directly.
+   - **exit 2 AND stderr is JSON with `kind == "disambiguation"`** — run a search
+     to surface candidates:
+
+     ```bash
+     python3 _lumina/tools/fetch_wikipedia.py search "<title>" --limit 5
+     ```
+
+     Present the numbered results (title + snippet) and let the user:
+     - pick a candidate number,
+     - type a more specific title, or
+     - type `manual` to paste content directly.
+
+     Re-run `page` with the chosen title, or accept the user-pasted content.
+
+   - **exit 2 for any other reason** (empty title, page not found) — surface the
+     `error` field from stderr JSON and abort.
+   - **exit 3 (network error)** — tell the user and offer two options: retry, or
+     paste content manually.
 4. Write `wiki/foundations/<slug>.md` with valid foundation frontmatter:
    `id`, `title`, `type: foundation`, `created`, `updated`.
+
+   Also include the optional `aliases` field — an array of strings listing
+   alternative names users or sources might write for this concept (abbreviations,
+   expansions, common misspellings). Example: for a foundation titled "Reinforcement
+   Learning from Human Feedback", use `aliases: ["RLHF", "human feedback RL"]`.
+   Propose a list of 2–5 plausible aliases, then ask the user to confirm or edit
+   before writing. An empty array `[]` is fine if nothing obvious applies. Aliases
+   must be unique across all foundations — `lint.mjs` L10 will error on collisions.
 5. Keep the body concise: definition, scope notes, and external references.
 6. Log the addition:
@@ -58,6 +109,12 @@ node _lumina/scripts/lint.mjs --fix --json
   knowledge extracted from project sources.
 - Do not store secrets or API keys in foundation pages.
 - Do not add reverse graph edges for foundations.
+- When refreshing an existing foundation, preserve the original `created` date and
+  any `<!-- user-edited -->` sections verbatim. Only `updated` and non-marked
+  sections may change.
+- Aliases must be unique across all foundations. If `lint.mjs --fix --json` reports
+  `L10-alias-conflict`, resolve manually before completing the run — there is no
+  automatic fix.
 
 ## Definition of Done
 
@@ -65,3 +122,5 @@ node _lumina/scripts/lint.mjs --fix --json
 - `node _lumina/scripts/lint.mjs --fix --json` has updated `wiki/index.md` if
   needed and leaves `summary.errors === 0`.
 - `wiki/log.md` has an append-only `lumi-research-prefill` entry.
+- If the page already existed, the user's choice (skip / refresh / abort) is logged
+  in `wiki/log.md` with the actual decision taken.
@@ -133,6 +133,7 @@ title: "Foundation concept"
 slug: foundation-slug
 date_added: YYYY-MM-DD
 tags: []
+aliases: []
 ---
 ```
@@ -218,7 +218,16 @@ def main(argv: list[str] | None = None) -> None:
         sys.exit(0)
 
     except ValueError as exc:
-        _err(f"Error: {exc}")
+        msg = str(exc)
+        if "disambiguation" in msg:
+            err_obj: dict[str, str] = {
+                "error": msg,
+                "kind": "disambiguation",
+                "hint": "Use the search subcommand to enumerate candidates.",
+            }
+        else:
+            err_obj = {"error": msg}
+        print(json.dumps(err_obj, ensure_ascii=False), file=sys.stderr)
         sys.exit(2)
     except requests.exceptions.ConnectionError as exc:
         _err(f"Network error: {exc}")
@@ -229,6 +229,7 @@ def phase1_keyword_search(
     fetchers: list[str],
     limit: int,
     env: dict[str, str],
+    exclude_ids: set[str] = set(),
 ) -> list[dict[str, Any]]:
     """Phase 1: keyword search across configured fetchers."""
     results: list[dict[str, Any]] = []
@@ -249,7 +250,7 @@
 
         for paper in papers:
             pid = paper.get("id") or paper.get("paperId") or ""
-            if pid and pid in seen_ids:
+            if pid and (pid in seen_ids or pid in exclude_ids):
                 continue
             if pid:
                 seen_ids.add(pid)
@@ -265,6 +266,7 @@ def phase2_author_backfill(
     discovered_dir: Path,
     limit: int,
     env: dict[str, str],
+    exclude_ids: set[str] = set(),
 ) -> list[dict[str, Any]]:
     """Phase 2: fetch more papers by the most prolific authors from phase 1."""
     # Count author occurrences across phase-1 results
@@ -290,7 +292,7 @@
             continue
         for paper in papers:
             pid = paper.get("id") or paper.get("paperId") or ""
-            if pid and pid in seen_ids:
+            if pid and (pid in seen_ids or pid in exclude_ids):
                 continue
             if pid:
                 seen_ids.add(pid)
@@ -305,6 +307,7 @@ def phase3_citation_expansion(
     slug: str,
     discovered_dir: Path,
     env: dict[str, str],
+    exclude_ids: set[str] = set(),
 ) -> list[dict[str, Any]]:
     """Phase 3: fetch citations of top phase-1 papers."""
     # Sort by citation count to pick the most-cited seeds
@@ -330,7 +333,7 @@
             continue
         for paper in citations:
             cid = paper.get("id") or paper.get("paperId") or ""
-            if cid and cid in seen_ids:
+            if cid and (cid in seen_ids or cid in exclude_ids):
                 continue
             if cid:
                 seen_ids.add(cid)
@@ -364,6 +367,11 @@ def main(argv: list[str] | None = None) -> None:
                         help=f"Comma-separated fetchers (default: {DEFAULT_FETCHERS}).")
     parser.add_argument("--limit", type=int, default=DEFAULT_LIMIT,
                         help=f"Max results per fetcher per phase (default: {DEFAULT_LIMIT}).")
+    parser.add_argument(
+        "--exclude-ids", default="",
+        help="Comma-separated list of paper IDs (arXiv IDs or S2 paperIds) to "
+             "skip. Use to exclude papers already ingested into wiki/sources/.",
+    )
 
     args = parser.parse_args(argv)
 
@@ -382,6 +390,9 @@
         sys.exit(2)
 
     fetchers = [f.strip() for f in args.fetchers.split(",") if f.strip()]
+    exclude_ids: set[str] = {
+        s.strip() for s in args.exclude_ids.split(",") if s.strip()
+    }
     slug = _slugify(args.topic)
     env = load_env(project_root)
@@ -406,16 +417,20 @@ def main(argv: list[str] | None = None) -> None:
                     summary["phases"]["1"] = {"status": "resumed", "count": len(phase1_results)}
                     _err(f"Phase 1 resumed from checkpoint ({len(phase1_results)} results).")
                 else:
+                    _err(f"Phase 1: keyword search across {fetchers} (limit={args.limit})...")
                     phase1_results = phase1_keyword_search(
-                        args.topic, slug, discovered_dir, fetchers, args.limit, env
+                        args.topic, slug, discovered_dir, fetchers, args.limit, env, exclude_ids
                     )
                     _save_checkpoint(state_dir, 1, {"results": phase1_results, "slug": slug})
+                    _err(f"Phase 1 complete: {len(phase1_results)} unique candidates.")
                     summary["phases"]["1"] = {"status": "complete", "count": len(phase1_results)}
             else:
+                _err(f"Phase 1: keyword search across {fetchers} (limit={args.limit})...")
                 phase1_results = phase1_keyword_search(
-                    args.topic, slug, discovered_dir, fetchers, args.limit, env
+                    args.topic, slug, discovered_dir, fetchers, args.limit, env, exclude_ids
                 )
                 _save_checkpoint(state_dir, 1, {"results": phase1_results, "slug": slug})
+                _err(f"Phase 1 complete: {len(phase1_results)} unique candidates.")
                 summary["phases"]["1"] = {"status": "complete", "count": len(phase1_results)}
         else:
             # Load from checkpoint if phase 1 was run previously
@@ -432,16 +447,20 @@ def main(argv: list[str] | None = None) -> None:
                     summary["phases"]["2"] = {"status": "resumed", "count": len(phase2_results)}
                     _err(f"Phase 2 resumed from checkpoint ({len(phase2_results)} results).")
                 else:
+                    _err(f"Phase 2: author backfill (top {MAX_AUTHORS_BACKFILL} authors, limit={args.limit})...")
                     phase2_results = phase2_author_backfill(
-                        phase1_results, slug, discovered_dir, args.limit, env
+                        phase1_results, slug, discovered_dir, args.limit, env, exclude_ids
                     )
                     _save_checkpoint(state_dir, 2, {"results": phase2_results, "slug": slug})
+                    _err(f"Phase 2 complete: {len(phase2_results)} unique candidates.")
                     summary["phases"]["2"] = {"status": "complete", "count": len(phase2_results)}
             else:
+                _err(f"Phase 2: author backfill (top {MAX_AUTHORS_BACKFILL} authors, limit={args.limit})...")
                 phase2_results = phase2_author_backfill(
-                    phase1_results, slug, discovered_dir, args.limit, env
+                    phase1_results, slug, discovered_dir, args.limit, env, exclude_ids
                 )
                 _save_checkpoint(state_dir, 2, {"results": phase2_results, "slug": slug})
+                _err(f"Phase 2 complete: {len(phase2_results)} unique candidates.")
                 summary["phases"]["2"] = {"status": "complete", "count": len(phase2_results)}
 
         # --- Phase 3 ---
@@ -453,16 +472,20 @@ def main(argv: list[str] | None = None) -> None:
                     summary["phases"]["3"] = {"status": "resumed", "count": len(phase3_results)}
                     _err(f"Phase 3 resumed from checkpoint ({len(phase3_results)} results).")
                 else:
+                    _err(f"Phase 3: citation expansion (top 5 seeds × {CITATIONS_PER_SEED} citations)...")
                     phase3_results = phase3_citation_expansion(
-                        phase1_results, slug, discovered_dir, env
+                        phase1_results, slug, discovered_dir, env, exclude_ids
                     )
                     _save_checkpoint(state_dir, 3, {"results": phase3_results, "slug": slug})
+                    _err(f"Phase 3 complete: {len(phase3_results)} unique candidates.")
                     summary["phases"]["3"] = {"status": "complete", "count": len(phase3_results)}
             else:
+                _err(f"Phase 3: citation expansion (top 5 seeds × {CITATIONS_PER_SEED} citations)...")
                 phase3_results = phase3_citation_expansion(
-                    phase1_results, slug, discovered_dir, env
+                    phase1_results, slug, discovered_dir, env, exclude_ids
                 )
                 _save_checkpoint(state_dir, 3, {"results": phase3_results, "slug": slug})
+                _err(f"Phase 3 complete: {len(phase3_results)} unique candidates.")
                 summary["phases"]["3"] = {"status": "complete", "count": len(phase3_results)}
 
     except ValueError as exc: