skillwiki 0.8.2 → 0.8.3-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -3158,6 +3158,79 @@ function validateCliRefs(text, page, surface) {
3158
3158
  return violations;
3159
3159
  }
3160
3160
 
3161
// src/utils/source-identity.ts

/**
 * Case-insensitive patterns that signal a known project, keyed by project name.
 * Key order is significant: collectSignals reports signals in this order.
 * (Module-level bindings stay `var`, matching the rest of this bundled file.)
 */
var PROJECT_PATTERNS = {
  hermes: [/\bhermes\b/i, /nousresearch\s*hermes/i, /nousresearch\/hermes-agent/i, /hermes agent/i],
  skillwiki: [/\bskillwiki\b/i, /\bllm[-_ ]?wiki\b/i, /karpathy'?s llm wiki/i],
  superpowers: [/\bsuperpowers\b/i, /obra\/superpowers/i, /complete software development methodology/i],
  playwright: [/\bplaywright\b/i, /microsoft\s*playwright/i, /microsoft\/playwright/i],
  convex: [/\bconvex\b/i],
  newapi: [/\bnew[-_ ]?api\b/i, /quantumnous\/new-api/i],
  coolify: [/\bcoolify\b/i, /coollabsio\/coolify/i],
  seaweedfs: [/\bseaweed\s*fs\b/i],
  proxmox: [/\bproxmox\b/i, /proxmoxve/i],
  codestable: [/\bcodestable\b/i]
};

/**
 * Ordered "left|right" pairs of distinct projects that may appear together
 * without being treated as a mismatch. Both directions are listed explicitly.
 */
var COMPATIBLE = /* @__PURE__ */ new Set([
  "hermes|skillwiki",
  "skillwiki|hermes",
  "proxmox|seaweedfs",
  "seaweedfs|proxmox",
  "coolify|seaweedfs",
  "seaweedfs|coolify"
]);

/**
 * Splits lower→upper camelCase boundaries with a space, lowercases the text,
 * and collapses runs of `_` / `-` into a single space.
 *
 * @param {string} text
 * @returns {string}
 */
function normalize(text) {
  return text.replace(/([a-z])([A-Z])/g, "$1 $2").toLowerCase().replace(/[_-]+/g, " ");
}

/**
 * Returns the first 2000 characters of `body` after removing a leading
 * `---`-delimited frontmatter block, or "" when `body` is falsy.
 *
 * @param {string | undefined | null} body
 * @returns {string}
 */
function firstBodyWindow(body) {
  if (!body) return "";
  const withoutFrontmatter = body.replace(/^---\r?\n[\s\S]*?\r?\n---\r?\n?/, "");
  return withoutFrontmatter.slice(0, 2e3);
}

/**
 * Lists the project names whose patterns match `text`.
 *
 * Each pattern is tried against both the normalized text and the raw text.
 * Normalization alone is lossy: it turns "SkillWiki" into "skill wiki" and
 * "hermes-agent" into "hermes agent", so patterns such as /\bskillwiki\b/,
 * /\bcodestable\b/, /proxmoxve/ or /quantumnous\/new-api/ could never match a
 * camelCased or hyphenated spelling. The raw text alone would miss
 * "hermes_agent" (`_` is a word character), hence both candidates.
 *
 * @param {string} text - Path, URL, or body excerpt; non-strings yield [].
 * @returns {string[]} Project names in PROJECT_PATTERNS key order, no duplicates.
 */
function collectSignals(text) {
  const raw = typeof text === "string" ? text : "";
  if (raw === "") return [];
  const candidates = [normalize(raw), raw];
  const found = [];
  for (const [name, patterns] of Object.entries(PROJECT_PATTERNS)) {
    const hit = patterns.some((pattern) => candidates.some((candidate) => pattern.test(candidate)));
    if (hit) found.push(name);
  }
  return found;
}

/**
 * True when two project names are identical or listed as a compatible pair.
 *
 * @param {string} left
 * @param {string} right
 * @returns {boolean}
 */
function compatible(left, right) {
  return left === right || COMPATIBLE.has(`${left}|${right}`);
}

/**
 * True when both lists are non-empty and at least one (left, right) pair is
 * not compatible.
 *
 * @param {string[]} leftSignals
 * @param {string[]} rightSignals
 * @returns {boolean}
 */
function hasAnyIncompatibleSignals(leftSignals, rightSignals) {
  if (leftSignals.length === 0 || rightSignals.length === 0) return false;
  return leftSignals.some((left) => rightSignals.some((right) => !compatible(left, right)));
}

/**
 * True when at least one (left, right) pair is compatible.
 *
 * @param {string[]} leftSignals
 * @param {string[]} rightSignals
 * @returns {boolean}
 */
function hasAnyCompatibleSignals(leftSignals, rightSignals) {
  return leftSignals.some((left) => rightSignals.some((right) => compatible(left, right)));
}

/**
 * Compares the project signals found in a raw file's path, its source URL and
 * the start of its body.
 *
 * - "conflict": path and source URL contain an incompatible pair, or path and
 *   body both have signals but share no compatible pair.
 * - "suspicious": the path has no signals, and source URL and body both have
 *   signals but share no compatible pair.
 * - "ok": otherwise.
 *
 * @param {{ rawPath: string, sourceUrl?: string, body?: string }} input
 * @returns {{ status: "conflict" | "suspicious" | "ok", pathSignals: string[], sourceSignals: string[], bodySignals: string[], reasons: string[] }}
 */
function assessSourceIdentity(input) {
  const pathSignals = collectSignals(input.rawPath);
  const sourceSignals = collectSignals(input.sourceUrl ?? "");
  const bodySignals = collectSignals(firstBodyWindow(input.body));
  const reasons = [];
  if (hasAnyIncompatibleSignals(pathSignals, sourceSignals)) {
    reasons.push(`filename/path signals [${pathSignals.join(", ")}] but source_url signals [${sourceSignals.join(", ")}]`);
  }
  if (pathSignals.length > 0 && bodySignals.length > 0 && !hasAnyCompatibleSignals(pathSignals, bodySignals)) {
    reasons.push(`filename/path signals [${pathSignals.join(", ")}] but body signals [${bodySignals.join(", ")}]`);
  }
  if (reasons.length > 0) {
    return { status: "conflict", pathSignals, sourceSignals, bodySignals, reasons };
  }
  if (pathSignals.length === 0 && sourceSignals.length > 0 && bodySignals.length > 0 && !hasAnyCompatibleSignals(sourceSignals, bodySignals)) {
    return {
      status: "suspicious",
      pathSignals,
      sourceSignals,
      bodySignals,
      reasons: [`source_url signals [${sourceSignals.join(", ")}] but body signals [${bodySignals.join(", ")}]`]
    };
  }
  return { status: "ok", pathSignals, sourceSignals, bodySignals, reasons };
}
3233
+
3161
3234
  // src/commands/lint.ts
3162
3235
  var STRUCT_MIN_BODY_LINES = 60;
3163
3236
  var STRUCT_MIN_SECTIONS = 3;
@@ -3189,7 +3262,7 @@ function extractSourceEntries(rawFm) {
3189
3262
  }
3190
3263
  return entries;
3191
3264
  }
3192
- var ERROR_ORDER = ["broken_wikilinks", "invalid_frontmatter", "raw_dedup", "broken_sources", "tag_not_in_taxonomy", "path_too_long"];
3265
+ var ERROR_ORDER = ["broken_wikilinks", "invalid_frontmatter", "raw_source_identity_conflict", "raw_dedup", "broken_sources", "tag_not_in_taxonomy", "path_too_long"];
3193
3266
  var WARNING_ORDER = ["raw_body_duplicate", "raw_subdirectory_duplicate", "file_source_url", "index_incomplete", "index_link_format", "stale_page", "page_too_large", "log_rotate_needed", "orphans", "compound_refs", "legacy_citation_style", "orphaned_citations", "duplicate_frontmatter", "work_item_health", "orphaned_project_pages", "missing_overview", "missing_diagram"];
3194
3267
  var INFO_ORDER = ["bridges", "sparse_community", "page_structure", "topic_map_recommended", "frontmatter_wikilink", "wikilink_citation", "missing_tldr", "stale_sections", "cli_refs"];
3195
3268
  var KNOWN_BUCKETS = [...ERROR_ORDER, ...WARNING_ORDER, ...INFO_ORDER];
@@ -3290,6 +3363,7 @@ async function runLint(input) {
3290
3363
  buckets.raw_subdirectory_duplicate = subDirDupes;
3291
3364
  }
3292
3365
  const fileSourceUrlFlags = [];
3366
+ const rawIdentityConflicts = [];
3293
3367
  for (const raw of scan.data.raw) {
3294
3368
  const text = await readPage(raw);
3295
3369
  const split = splitFrontmatter(text);
@@ -3297,8 +3371,25 @@ async function runLint(input) {
3297
3371
  if (/^source_url:\s*file:\/\//m.test(split.data.rawFrontmatter)) {
3298
3372
  fileSourceUrlFlags.push(raw.relPath);
3299
3373
  }
3374
+ const sourceUrl = split.data.rawFrontmatter.match(/^source_url:\s*(.+)$/m)?.[1]?.trim().replace(/^["']|["']$/g, "") ?? "";
3375
+ const assessment = assessSourceIdentity({
3376
+ rawPath: raw.relPath,
3377
+ sourceUrl,
3378
+ body: split.data.body
3379
+ });
3380
+ if (assessment.status === "conflict") {
3381
+ rawIdentityConflicts.push({
3382
+ file: raw.relPath,
3383
+ status: assessment.status,
3384
+ reasons: assessment.reasons,
3385
+ pathSignals: assessment.pathSignals,
3386
+ sourceSignals: assessment.sourceSignals,
3387
+ bodySignals: assessment.bodySignals
3388
+ });
3389
+ }
3300
3390
  }
3301
3391
  if (fileSourceUrlFlags.length > 0) buckets.file_source_url = fileSourceUrlFlags;
3392
+ if (rawIdentityConflicts.length > 0) buckets.raw_source_identity_conflict = rawIdentityConflicts;
3302
3393
  const legacyPages = [];
3303
3394
  const orphanedPages = [];
3304
3395
  const structFlags = [];
@@ -4576,6 +4667,120 @@ function checkSyncLastPush(resolvedPath) {
4576
4667
  }
4577
4668
  return check("pass", "sync_last_push", "Vault sync recency", `Last push: ${dateStr} (${daysSince2} day(s) ago)`);
4578
4669
  }
4670
/**
 * Reports whether `git rev-parse --verify --quiet origin/main` succeeds when
 * run in `resolvedPath`. Any failure of that command yields false.
 *
 * @param {string} resolvedPath - Working directory for the git command.
 * @returns {boolean}
 */
function hasOriginMain(resolvedPath) {
  const gitOptions = {
    cwd: resolvedPath,
    encoding: "utf8",
    stdio: ["pipe", "pipe", "pipe"]
  };
  let resolved = true;
  try {
    execSync2("git rev-parse --verify --quiet origin/main", gitOptions);
  } catch {
    resolved = false;
  }
  return resolved;
}
4682
/**
 * Doctor check: warns when `git status --porcelain` lists any entries in the
 * vault worktree. Skipped (reported as "pass") when there is no vault path or
 * no `.git` entry inside it.
 *
 * @param {string | undefined} resolvedPath - Vault directory, if resolved.
 * @returns {*} Whatever `check(...)` builds for the chosen status.
 */
function checkVaultGitDirty(resolvedPath) {
  const id = "vault_git_dirty";
  const label = "Vault git dirty state";
  if (resolvedPath === void 0) {
    return check("pass", id, label, "No vault path \u2014 check skipped");
  }
  const gitDir = join27(resolvedPath, ".git");
  if (!existsSync9(gitDir)) {
    return check("pass", id, label, "No git repo \u2014 check skipped");
  }
  try {
    const porcelain = execSync2("git status --porcelain", {
      cwd: resolvedPath,
      encoding: "utf8",
      stdio: ["pipe", "pipe", "pipe"]
    });
    // One porcelain line per dirty path; blank lines are dropped.
    const dirtyEntries = porcelain.trim().split("\n").filter(Boolean);
    return dirtyEntries.length > 0
      ? check("warn", id, label, `${dirtyEntries.length} dirty file(s) in vault worktree`)
      : check("pass", id, label, "Clean worktree");
  } catch {
    return check("warn", id, label, "Could not read git status");
  }
}
4703
/**
 * Doctor check for commits in `origin/main..HEAD`, delegated to
 * checkVaultGitComparison.
 *
 * @param {string | undefined} resolvedPath - Vault directory, if resolved.
 * @returns {*} The result of checkVaultGitComparison.
 */
function checkVaultGitAhead(resolvedPath) {
  const id = "vault_git_ahead";
  const label = "Vault commits ahead";
  const range = "origin/main..HEAD";
  const nonZeroSuffix = "ahead of origin/main";
  const zeroDetail = "0 commits ahead of origin/main";
  return checkVaultGitComparison(resolvedPath, id, label, range, nonZeroSuffix, zeroDetail);
}
4713
/**
 * Doctor check for commits in `HEAD..origin/main`, delegated to
 * checkVaultGitComparison.
 *
 * @param {string | undefined} resolvedPath - Vault directory, if resolved.
 * @returns {*} The result of checkVaultGitComparison.
 */
function checkVaultGitBehind(resolvedPath) {
  const id = "vault_git_behind";
  const label = "Vault commits behind";
  const range = "HEAD..origin/main";
  const nonZeroSuffix = "behind origin/main";
  const zeroDetail = "0 commits behind origin/main";
  return checkVaultGitComparison(resolvedPath, id, label, range, nonZeroSuffix, zeroDetail);
}
4723
/**
 * Shared doctor check that counts commits in a git revision range and warns
 * when the count is non-zero.
 *
 * Skipped (reported as "pass") when there is no vault path, no `.git` entry in
 * it, or `origin/main` cannot be resolved.
 *
 * @param {string | undefined} resolvedPath - Vault directory, if resolved.
 * @param {string} id - Check identifier passed through to `check`.
 * @param {string} label - Human-readable check label passed through to `check`.
 * @param {string} range - Revision range for `git rev-list --count`; it is
 *   interpolated into a shell command, so callers must pass trusted constants.
 * @param {string} nonZeroSuffix - Text appended after "N commit(s) " on warn.
 * @param {string} zeroDetail - Detail text used when the count is zero.
 * @returns {*} Whatever `check(...)` builds for the chosen status.
 */
function checkVaultGitComparison(resolvedPath, id, label, range, nonZeroSuffix, zeroDetail) {
  if (resolvedPath === void 0) {
    return check("pass", id, label, "No vault path \u2014 check skipped");
  }
  if (!existsSync9(join27(resolvedPath, ".git"))) {
    return check("pass", id, label, "No git repo \u2014 check skipped");
  }
  if (!hasOriginMain(resolvedPath)) {
    return check("pass", id, label, "origin/main unavailable \u2014 check skipped");
  }
  try {
    const output = execSync2(`git rev-list --count ${range}`, {
      cwd: resolvedPath,
      encoding: "utf8",
      stdio: ["pipe", "pipe", "pipe"]
    });
    const count = Number.parseInt(output.trim(), 10);
    // Unparseable output yields NaN; `NaN > 0` is false, which would otherwise
    // be reported as a passing "0 commits" result.
    if (!Number.isFinite(count)) {
      return check("warn", id, label, "Could not compare HEAD with origin/main");
    }
    if (count > 0) {
      return check("warn", id, label, `${count} commit(s) ${nonZeroSuffix}`);
    }
    return check("pass", id, label, zeroDetail);
  } catch {
    return check("warn", id, label, "Could not compare HEAD with origin/main");
  }
}
4747
/**
 * Candidate locations of `wiki-pull.log` under `home`, in lookup order: the
 * `Library/Logs` path first on darwin, the `.local/state/vault-sync/log` path
 * first elsewhere. Duplicate entries are removed.
 *
 * @param {string} home - Home directory to resolve the log paths against.
 * @returns {string[]}
 */
function pullLogPaths(home) {
  const libraryLog = join27(home, "Library", "Logs", "wiki-pull.log");
  const stateLog = join27(home, ".local", "state", "vault-sync", "log", "wiki-pull.log");
  const ordered = platform2() === "darwin" ? [libraryLog, stateLog] : [stateLog, libraryLog];
  return Array.from(new Set(ordered));
}
4757
/**
 * Decides whether a log line counts as "recent" (at most 24 hours before
 * `nowMs`) based on a leading ISO-8601 timestamp.
 *
 * Accepts `YYYY-MM-DDTHH:MM:SS` followed by optional fractional seconds and
 * either `Z` or a `±HH:MM` offset. Lines without such a prefix, or whose
 * timestamp cannot be parsed, are treated as recent so they are never
 * silently dropped.
 *
 * @param {string} line - One log line.
 * @param {number} nowMs - Reference time in epoch milliseconds.
 * @returns {boolean}
 */
function isRecentLogLine(line, nowMs) {
  const DAY_MS = 24 * 60 * 60 * 1e3;
  // Timestamps with fractional seconds or a numeric offset must be parsed too;
  // otherwise they fall into the "no timestamp" branch and never age out.
  const match = line.match(/^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2}))/);
  if (!match) return true;
  const ts = Date.parse(match[1]);
  if (!Number.isFinite(ts)) return true;
  return nowMs - ts <= DAY_MS;
}
4764
/**
 * Doctor check: scans the first existing `wiki-pull.log` candidate under
 * `home` for recent lines that look like pull/rebase failures.
 *
 * Passes when no log exists or no recent failure line is found; warns with a
 * count plus the last two matching lines (each cut to 100 characters) when
 * failures are found, or when the log cannot be read.
 *
 * @param {string} home - Home directory used to locate the log file.
 * @returns {*} Whatever `check(...)` builds for the chosen status.
 */
function checkVaultGitPullFailures(home) {
  const id = "vault_git_pull_failures";
  const label = "Vault pull failures";
  const path = pullLogPaths(home).find((p) => existsSync9(p));
  if (!path) {
    return check("pass", id, label, "No wiki-pull.log found \u2014 check skipped");
  }
  try {
    const failurePattern = /(pre-push pull failed|FAIL .*pull|FAIL .*rebase|cannot pull with rebase|unstaged changes)/i;
    const now = Date.now();
    const failures = [];
    for (const line of readFileSync7(path, "utf8").split(/\r?\n/)) {
      if (!line) continue;
      if (isRecentLogLine(line, now) && failurePattern.test(line)) failures.push(line);
    }
    if (failures.length === 0) {
      return check("pass", id, label, "No recent pull failures logged");
    }
    const sample = failures.slice(-2).map((line) => line.slice(0, 100)).join(" | ");
    return check("warn", id, label, `${failures.length} recent pull failure(s): ${sample}`);
  } catch {
    return check("warn", id, label, `Could not read ${path}`);
  }
}
4579
4784
  function checkS3MountPerf(resolvedPath) {
4580
4785
  if (resolvedPath === void 0) {
4581
4786
  return check("pass", "s3_mount_perf", "S3 mount performance", "No vault path \u2014 check skipped");
@@ -5253,6 +5458,10 @@ async function runDoctor(input) {
5253
5458
  checks.push(checkObsidianTemplates(resolvedPath));
5254
5459
  checks.push(checkVaultGitRemote(resolvedPath));
5255
5460
  checks.push(checkSyncLastPush(resolvedPath));
5461
+ checks.push(checkVaultGitDirty(resolvedPath));
5462
+ checks.push(checkVaultGitAhead(resolvedPath));
5463
+ checks.push(checkVaultGitBehind(resolvedPath));
5464
+ checks.push(checkVaultGitPullFailures(input.home));
5256
5465
  checks.push(checkDotStoreClean(resolvedPath));
5257
5466
  checks.push(checkS3MountPerf(resolvedPath));
5258
5467
  checks.push(checkS3MountFreshness(resolvedPath));
@@ -5509,6 +5718,22 @@ async function runDrift(input) {
5509
5718
  continue;
5510
5719
  }
5511
5720
  const currentHash = createHash3("sha256").update(Buffer.from(resp.data.body, "utf8")).digest("hex");
5721
+ const identity = assessSourceIdentity({
5722
+ rawPath: raw.relPath,
5723
+ sourceUrl,
5724
+ body: resp.data.body
5725
+ });
5726
+ if (identity.status === "conflict") {
5727
+ results.push({
5728
+ raw_path: raw.relPath,
5729
+ source_url: sourceUrl,
5730
+ stored_sha256: storedHash,
5731
+ current_sha256: currentHash,
5732
+ status: "identity_conflict",
5733
+ identity
5734
+ });
5735
+ continue;
5736
+ }
5512
5737
  const drifted2 = currentHash !== storedHash;
5513
5738
  if (drifted2 && input.apply) {
5514
5739
  const newFm = rawFrontmatter.replace(/^sha256:\s*[a-f0-9]+$/m, `sha256: ${currentHash}`);
@@ -5536,12 +5761,19 @@ ${body}`;
5536
5761
  }
5537
5762
  const drifted = results.filter((r) => r.status === "drifted");
5538
5763
  const fetchFailed = results.filter((r) => r.status === "fetch_failed");
5764
+ const identityConflicts = results.filter((r) => r.status === "identity_conflict");
5539
5765
  const updated = results.filter((r) => r.status === "updated");
5540
5766
  const unchanged = results.filter((r) => r.status === "unchanged").length;
5541
- const exitCode = drifted.length > 0 ? ExitCode.DRIFT_DETECTED : ExitCode.OK;
5767
+ const exitCode = drifted.length > 0 || identityConflicts.length > 0 ? ExitCode.DRIFT_DETECTED : ExitCode.OK;
5542
5768
  const hintLines = [`scanned: ${results.length}, unchanged: ${unchanged}`];
5543
5769
  if (newResults.length > 0) hintLines.push(`new: ${newResults.length}`, ...newResults.map((n) => ` ${n.raw_path} (ingested: ${n.ingested})`));
5544
5770
  if (drifted.length > 0) hintLines.push(`drifted: ${drifted.length}`, ...drifted.map((d) => ` ${d.raw_path}`));
5771
+ if (identityConflicts.length > 0) {
5772
+ hintLines.push(
5773
+ `identity_conflicts: ${identityConflicts.length}`,
5774
+ ...identityConflicts.map((c) => ` ${c.raw_path}: ${c.identity?.reasons.join("; ") ?? "source identity conflict"}`)
5775
+ );
5776
+ }
5545
5777
  if (fetchFailed.length > 0) hintLines.push(`fetch_failed: ${fetchFailed.length}`, ...fetchFailed.map((f) => ` ${f.raw_path}: ${f.fetch_error}`));
5546
5778
  if (updated.length > 0) hintLines.push(`updated: ${updated.length}`, ...updated.map((u) => ` ${u.raw_path}`));
5547
5779
  if (input.apply && updated.length > 0) {
@@ -5554,7 +5786,7 @@ ${body}`;
5554
5786
  }
5555
5787
  return {
5556
5788
  exitCode,
5557
- result: ok({ scanned: results.length, drifted, fetch_failed: fetchFailed, updated, newFiles: newResults, unchanged, humanHint: hintLines.join("\n") })
5789
+ result: ok({ scanned: results.length, drifted, fetch_failed: fetchFailed, identity_conflicts: identityConflicts, updated, newFiles: newResults, unchanged, humanHint: hintLines.join("\n") })
5558
5790
  };
5559
5791
  }
5560
5792
 
@@ -6756,6 +6988,25 @@ async function runIngest(input) {
6756
6988
  const typedRelPath = `${typedDir}/${slug}.md`;
6757
6989
  const rawAbsPath = join35(input.vault, rawRelPath);
6758
6990
  const typedAbsPath = join35(input.vault, typedRelPath);
6991
+ const identity = assessSourceIdentity({
6992
+ rawPath: rawRelPath,
6993
+ sourceUrl: sourceUrl ?? void 0,
6994
+ body: sourceContent
6995
+ });
6996
+ if (identity.status === "conflict") {
6997
+ return {
6998
+ exitCode: ExitCode.INGEST_VALIDATION_FAILED,
6999
+ result: err("INGEST_VALIDATION_FAILED", {
7000
+ message: "source identity conflict",
7001
+ raw_path: rawRelPath,
7002
+ source_url: sourceUrl,
7003
+ reasons: identity.reasons,
7004
+ pathSignals: identity.pathSignals,
7005
+ sourceSignals: identity.sourceSignals,
7006
+ bodySignals: identity.bodySignals
7007
+ })
7008
+ };
7009
+ }
6759
7010
  const rawContent = buildRawContent(sourceUrl, today, sha256, sourceContent);
6760
7011
  const typedContent = buildTypedContent(
6761
7012
  input.title,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "skillwiki",
3
- "version": "0.8.2",
3
+ "version": "0.8.3-beta.2",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "skillwiki": "dist/cli.js"
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "skillwiki",
3
- "version": "0.8.2",
3
+ "version": "0.8.3-beta.2",
4
4
  "skills": "./",
5
5
  "description": "Project-aware Karpathy-style knowledge base for Claude Code: 18 prompt-only skills (wiki-*, proj-*, using-skillwiki) backed by the deterministic `skillwiki` CLI.",
6
6
  "author": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "skillwiki",
3
- "version": "0.8.2",
3
+ "version": "0.8.3-beta.2",
4
4
  "description": "Project-aware Karpathy-style knowledge base for Codex with 18 prompt-only skills backed by the deterministic skillwiki CLI.",
5
5
  "author": {
6
6
  "name": "karlorz",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@skillwiki/skills",
3
- "version": "0.8.2",
3
+ "version": "0.8.3-beta.2",
4
4
  "private": true,
5
5
  "files": [
6
6
  "wiki-*",
@@ -18,13 +18,14 @@ Run `skillwiki lang` at the start. Generate page-body prose, narrative sections,
18
18
  0. **Resolve vault and language.** Run `skillwiki path` (fail if NO_VAULT_CONFIGURED) and `skillwiki lang`. Use the resolved vault path for all writes; use the canonical language for all generated prose.
19
19
  1. **Guard.** For each URL: run `skillwiki fetch-guard <url>`. If exit ≠ 0, STOP and surface the error. Do not retry.
20
20
  2. **Fetch.** Use `web_fetch` (or read local file) under Layer 2 controls (the CLI Layer 2 fetcher applies in tests; in skill runtime use `web_fetch` directly and treat any error as STOP).
21
- 3. **Hash.** Write the raw file (frontmatter + body). Run `skillwiki hash <raw-file>` and embed the result in raw frontmatter `sha256:`.
22
- 4. **Generate page(s).** Compose typed-knowledge page(s) with citations pre-attached (`^[raw/...]` markers). Every page MUST include:
21
+ 3. **Identity guard.** Before writing raw files, ensure the target raw filename/title, `source_url`, fetched H1/title, and early body subject agree. If `skillwiki ingest` reports `INGEST_VALIDATION_FAILED` with `source identity conflict`, STOP. Do not fix by renaming after the fact; choose the correct title/source pair or ask the user.
22
+ 4. **Hash.** Write the raw file (frontmatter + body). Run `skillwiki hash <raw-file>` and embed the result in raw frontmatter `sha256:`.
23
+ 5. **Generate page(s).** Compose typed-knowledge page(s) with citations pre-attached (`^[raw/...]` markers). Every page MUST include:
23
24
  - `> **TL;DR:**` blockquote as the first content after the title heading — a one-sentence summary of the page's key takeaway (under 200 chars). See SCHEMA.md `## TL;DR Convention`.
24
25
  - For pages tagged `architecture` or explaining workflows/systems: include a Mermaid diagram (`graph TB` or `sequenceDiagram`) in the body. Follow Obsidian-compatible Mermaid rules (see SCHEMA.md `## Mermaid Diagrams`).
25
- 5. **Validate.** For each generated page: run `skillwiki validate <page>`. If exit ≠ 0, STOP — do not write index/log.
26
- 6. **Apply writes in order.** raw → page(s) → `index.md` → `log.md`.
27
- 7. **Confidence flag.** If only one source is cited, set `confidence: low`.
26
+ 6. **Validate.** For each generated page: run `skillwiki validate <page>`. If exit ≠ 0, STOP — do not write index/log.
27
+ 7. **Apply writes in order.** raw → page(s) → `index.md` → `log.md`.
28
+ 8. **Confidence flag.** If only one source is cited, set `confidence: low`.
28
29
  ## Provenance defaults
29
30
  - Default `provenance: research`.
30
31
  - If cwd is inside `projects/{slug}/`, set `provenance: project` and add `provenance_projects: ["[[slug]]"]`.
@@ -36,6 +37,7 @@ Raw ephemeral data (market feeds, logs, transient JSON) must be written to the *
36
37
  ## Stop conditions
37
38
  - `fetch-guard` non-zero.
38
39
  - Fetch timeout / size limit exceeded.
40
+ - `INGEST_VALIDATION_FAILED` with `source identity conflict`.
39
41
  - `validate` non-zero on any page.
40
42
  - sha256 already exists in vault for the same source.
41
43
  ## Forbidden
@@ -46,7 +48,7 @@ Raw ephemeral data (market feeds, logs, transient JSON) must be written to the *
46
48
  - Writing `[[wikilinks]]` to pages that don't exist in the vault. Before linking, verify the target exists: check `index.md` or `ls` the target directory. If the target doesn't exist yet, use plain text instead of a wikilink.
47
49
  ## Batch Mode
48
50
  When the user provides multiple sources (a directory of files, a list of URLs, or a multi-document input):
49
- 1. **Loop per source.** Execute steps 1–5 for each source individually (guard → fetch → hash → generate → validate).
51
+ 1. **Loop per source.** Execute steps 1–6 for each source individually (guard → fetch → identity guard → hash → generate → validate).
50
52
  2. **Accumulate, don't write yet.** Collect all raw files and pages in memory. Do not write `index.md` or `log.md` until every source has validated.
51
53
  3. **Fail fast.** If any page fails validation, STOP. Report all failures. Do not write index/log for any source.
52
54
  4. **Deduplication.** Before writing each raw file, check `sha256` against existing vault raw sources. Skip sources whose content is already present.
@@ -25,6 +25,7 @@ Standard four reads (SCHEMA, index, log, project context if applicable).
25
25
  1. Run `skillwiki drift [vault]`. Read the JSON output.
26
26
  2. Present findings grouped by status:
27
27
  - **drifted:** Source content has changed. Show stored vs current sha256.
28
+ - **identity_conflicts:** The fetched source no longer matches the raw filename/source identity. STOP and surface the conflict. Do not archive or re-ingest until a human chooses the correct source/filename pair.
28
29
  - **fetch_failed:** Could not re-fetch. Show error details.
29
30
  - **unchanged:** No action needed.
30
31
  3. For each drifted source, ask the user: archive old + ingest new, or skip?
@@ -51,4 +52,5 @@ Raw files are immutable (N9). Re-ingest never modifies an existing raw file. Ins
51
52
 
52
53
  - Modifying files in `raw/` directly (N9).
53
54
  - Re-ingesting without user approval for each drifted source.
55
+ - Re-ingesting a source listed under `identity_conflicts` without explicit user approval and a corrected target filename/source URL.
54
56
  - Skipping the drift check and assuming sources have changed.
@@ -18,13 +18,14 @@ Run `skillwiki lang` at the start. Generate page-body prose, narrative sections,
18
18
  0. **Resolve vault and language.** Run `skillwiki path` (fail if NO_VAULT_CONFIGURED) and `skillwiki lang`. Use the resolved vault path for all writes; use the canonical language for all generated prose.
19
19
  1. **Guard.** For each URL: run `skillwiki fetch-guard <url>`. If exit ≠ 0, STOP and surface the error. Do not retry.
20
20
  2. **Fetch.** Use `web_fetch` (or read local file) under Layer 2 controls (the CLI Layer 2 fetcher applies in tests; in skill runtime use `web_fetch` directly and treat any error as STOP).
21
- 3. **Hash.** Write the raw file (frontmatter + body). Run `skillwiki hash <raw-file>` and embed the result in raw frontmatter `sha256:`.
22
- 4. **Generate page(s).** Compose typed-knowledge page(s) with citations pre-attached (`^[raw/...]` markers). Every page MUST include:
21
+ 3. **Identity guard.** Before writing raw files, ensure the target raw filename/title, `source_url`, fetched H1/title, and early body subject agree. If `skillwiki ingest` reports `INGEST_VALIDATION_FAILED` with `source identity conflict`, STOP. Do not fix by renaming after the fact; choose the correct title/source pair or ask the user.
22
+ 4. **Hash.** Write the raw file (frontmatter + body). Run `skillwiki hash <raw-file>` and embed the result in raw frontmatter `sha256:`.
23
+ 5. **Generate page(s).** Compose typed-knowledge page(s) with citations pre-attached (`^[raw/...]` markers). Every page MUST include:
23
24
  - `> **TL;DR:**` blockquote as the first content after the title heading — a one-sentence summary of the page's key takeaway (under 200 chars). See SCHEMA.md `## TL;DR Convention`.
24
25
  - For pages tagged `architecture` or explaining workflows/systems: include a Mermaid diagram (`graph TB` or `sequenceDiagram`) in the body. Follow Obsidian-compatible Mermaid rules (see SCHEMA.md `## Mermaid Diagrams`).
25
- 5. **Validate.** For each generated page: run `skillwiki validate <page>`. If exit ≠ 0, STOP — do not write index/log.
26
- 6. **Apply writes in order.** raw → page(s) → `index.md` → `log.md`.
27
- 7. **Confidence flag.** If only one source is cited, set `confidence: low`.
26
+ 6. **Validate.** For each generated page: run `skillwiki validate <page>`. If exit ≠ 0, STOP — do not write index/log.
27
+ 7. **Apply writes in order.** raw → page(s) → `index.md` → `log.md`.
28
+ 8. **Confidence flag.** If only one source is cited, set `confidence: low`.
28
29
  ## Provenance defaults
29
30
  - Default `provenance: research`.
30
31
  - If cwd is inside `projects/{slug}/`, set `provenance: project` and add `provenance_projects: ["[[slug]]"]`.
@@ -36,6 +37,7 @@ Raw ephemeral data (market feeds, logs, transient JSON) must be written to the *
36
37
  ## Stop conditions
37
38
  - `fetch-guard` non-zero.
38
39
  - Fetch timeout / size limit exceeded.
40
+ - `INGEST_VALIDATION_FAILED` with `source identity conflict`.
39
41
  - `validate` non-zero on any page.
40
42
  - sha256 already exists in vault for the same source.
41
43
  ## Forbidden
@@ -46,7 +48,7 @@ Raw ephemeral data (market feeds, logs, transient JSON) must be written to the *
46
48
  - Writing `[[wikilinks]]` to pages that don't exist in the vault. Before linking, verify the target exists: check `index.md` or `ls` the target directory. If the target doesn't exist yet, use plain text instead of a wikilink.
47
49
  ## Batch Mode
48
50
  When the user provides multiple sources (a directory of files, a list of URLs, or a multi-document input):
49
- 1. **Loop per source.** Execute steps 1–5 for each source individually (guard → fetch → hash → generate → validate).
51
+ 1. **Loop per source.** Execute steps 1–6 for each source individually (guard → fetch → identity guard → hash → generate → validate).
50
52
  2. **Accumulate, don't write yet.** Collect all raw files and pages in memory. Do not write `index.md` or `log.md` until every source has validated.
51
53
  3. **Fail fast.** If any page fails validation, STOP. Report all failures. Do not write index/log for any source.
52
54
  4. **Deduplication.** Before writing each raw file, check `sha256` against existing vault raw sources. Skip sources whose content is already present.
@@ -25,6 +25,7 @@ Standard four reads (SCHEMA, index, log, project context if applicable).
25
25
  1. Run `skillwiki drift [vault]`. Read the JSON output.
26
26
  2. Present findings grouped by status:
27
27
  - **drifted:** Source content has changed. Show stored vs current sha256.
28
+ - **identity_conflicts:** The fetched source no longer matches the raw filename/source identity. STOP and surface the conflict. Do not archive or re-ingest until a human chooses the correct source/filename pair.
28
29
  - **fetch_failed:** Could not re-fetch. Show error details.
29
30
  - **unchanged:** No action needed.
30
31
  3. For each drifted source, ask the user: archive old + ingest new, or skip?
@@ -51,4 +52,5 @@ Raw files are immutable (N9). Re-ingest never modifies an existing raw file. Ins
51
52
 
52
53
  - Modifying files in `raw/` directly (N9).
53
54
  - Re-ingesting without user approval for each drifted source.
55
+ - Re-ingesting a source listed under `identity_conflicts` without explicit user approval and a corrected target filename/source URL.
54
56
  - Skipping the drift check and assuming sources have changed.