dogsbay 0.2.0-beta.45 → 0.2.0-beta.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,7 @@
22
22
  * `_resetSitemapCache()`.
23
23
  */
24
24
  import { existsSync, readFileSync } from "node:fs";
25
- import { join } from "node:path";
25
+ import { basename, dirname, join } from "node:path";
26
26
  const STATE_CACHE = new Map();
27
27
  export function _resetSitemapCache() {
28
28
  STATE_CACHE.clear();
@@ -50,11 +50,26 @@ function getState(distRoot) {
50
50
  // sitemap integration which produces well-formed XML, so
51
51
  // the only failure mode is "missing entirely" or
52
52
  // "doesn't parse at all."
53
- if (/<(urlset|sitemapindex)[\s>]/.test(content)) {
53
+ const isIndex = /<sitemapindex[\s>]/.test(content);
54
+ const isUrlset = /<urlset[\s>]/.test(content);
55
+ if (isIndex || isUrlset) {
54
56
  sitemapValid = true;
55
- const matches = content.matchAll(/<loc>([^<]+)<\/loc>/g);
56
- for (const m of matches) {
57
- sitemapLocs.add(m[1].trim());
57
+ if (isIndex) {
58
+ // In sitemap-index.xml, each <loc> points at a child
59
+ // sitemap file (not a page). Read each child and union
60
+ // its <loc> entries (those ARE the page URLs). Without
61
+ // this recursion, sitemapLocs would carry sitemap file
62
+ // URLs only and every page would be flagged "missing
63
+ // from sitemap" — the entire audit class is unusable.
64
+ collectChildSitemapLocs(content, sitemapPath, sitemapLocs);
65
+ }
66
+ else {
67
+ // Plain <urlset> — extract <loc>s directly. Same as
68
+ // before.
69
+ const matches = content.matchAll(/<loc>([^<]+)<\/loc>/g);
70
+ for (const m of matches) {
71
+ sitemapLocs.add(m[1].trim());
72
+ }
58
73
  }
59
74
  }
60
75
  }
@@ -214,6 +229,51 @@ export const sitemapRobotsCoherence = {
214
229
  return issues;
215
230
  },
216
231
  };
232
+ /**
233
+ * Recurse into a `<sitemapindex>` document: for each child
234
+ * sitemap referenced by `<loc>`, read the file from disk and
235
+ * union its page-level `<loc>` entries into the supplied set.
236
+ *
237
+ * Resolution strategy: take the basename of the child URL and
238
+ * resolve it against the directory that contains the index file.
239
+ * That covers both layouts in use today — host-root
240
+ * (`dist/sitemap-index.xml` + `dist/sitemap-0.xml`) and
241
+ * per-mount (`dist/<basePath>/sitemap-index.xml` +
242
+ * `dist/<basePath>/sitemap-0.xml`). Children that don't resolve
243
+ * to a local file are silently skipped — better to miss a few
244
+ * URLs than to crash the audit when a deploy ships a partial
245
+ * sitemap. The page-level rule still surfaces orphan pages, so
246
+ * an incomplete recursion just downgrades to the previous
247
+ * misbehaviour (all pages flagged as missing) — never worse.
248
+ */
249
+ function collectChildSitemapLocs(indexContent, indexPath, out) { // adds child-sitemap <loc> values to `out`; returns nothing
250
+ const indexDir = dirname(indexPath); // child files are looked up in the index file's own directory
251
+ for (const m of indexContent.matchAll(/<loc>([^<]+)<\/loc>/g)) {
252
+ const childUrl = m[1].trim();
253
+ let childName;
254
+ try {
255
+ const u = new URL(childUrl); // throws when the <loc> text does not parse as an absolute URL
256
+ childName = basename(u.pathname); // keep only the last path segment; host and directories are dropped
257
+ }
258
+ catch {
259
+ childName = basename(childUrl); // fallback: treat the raw <loc> text as a path
260
+ }
261
+ if (!childName || childName === "/")
262
+ continue; // no usable file name in this <loc>
263
+ const childPath = join(indexDir, childName);
264
+ if (!existsSync(childPath))
265
+ continue; // child sitemap is not on disk next to the index: skip it
266
+ try {
267
+ const childContent = readFileSync(childPath, "utf-8");
268
+ for (const cm of childContent.matchAll(/<loc>([^<]+)<\/loc>/g)) {
269
+ out.add(cm[1].trim()); // every <loc> found in the child goes into `out`, trimmed
270
+ }
271
+ }
272
+ catch {
273
+ // Skip unreadable child — see function-level comment.
274
+ }
275
+ }
276
+ }
217
277
  /**
218
278
  * Convert an HTML file path within `dist/` into the public URL
219
279
  * path the sitemap is likely to list. Drops `/index.html` and
@@ -16,6 +16,7 @@ import { internalLinks } from "./internal-links.js";
16
16
  import { localeCoherence } from "./locale-coherence.js";
17
17
  import { namespaceCoherence } from "./namespace-coherence.js";
18
18
  import { navTargetExists } from "./nav-target-exists.js";
19
+ import { unresolvedDirectives } from "./unresolved-directives.js";
19
20
  import { versionCoherence } from "./version-coherence.js";
20
21
  let registered = false;
21
22
  /**
@@ -32,6 +33,7 @@ export function registerStructureRules() {
32
33
  registerRule(navTargetExists);
33
34
  registerRule(internalLinks);
34
35
  registerRule(assetRefs);
36
+ registerRule(unresolvedDirectives);
35
37
  }
36
38
  /**
37
39
  * Test-only: reset the "registered" flag so unit tests can
@@ -0,0 +1,108 @@
1
+ /**
2
+ * Strict variable form: `{{ name }}`, `{{ obj.field }}`,
3
+ * `{{ name|filter }}`, `{{ name | filter }}`. Identifier-then-optional-filter
4
+ * shape so a stray `{{` in legitimate prose (e.g. discussing
5
+ * mathematical notation) doesn't false-positive.
6
+ */
7
+ const VAR_RE = /\{\{\s*([a-zA-Z_][\w.-]*)\s*(?:\|[^}]+)?\s*\}\}/g; // group 1 = the identifier; the /g flag makes lastIndex stateful across exec() calls
8
+ /**
9
+ * Strict block form: `{% if name %}` / `{% endif %}` / `{% set ... %}` /
10
+ * `{% include "..." %}` / `{%- ... -%}` (whitespace control variants).
11
+ * Anchored to a known keyword so a generic `{% ... %}` we don't
12
+ * recognise isn't flagged (Minja's grammar is a subset of Jinja —
13
+ * surfacing only the patterns the engine actually emits keeps the
14
+ * rule focused on what the preprocessor would have resolved).
15
+ */
16
+ const BLOCK_RE = /\{%-?\s*(if|elif|else|endif|for|endfor|set|include|raw|endraw|leveloffset|switch|case|endswitch|default)\b[^%]*%\}/g; // group 1 = the keyword; a trailing "-" before the closing delimiter is absorbed by [^%]*
17
+ /**
18
+ * Walk a TreeNode tree, accumulating directive hits from text
19
+ * content. Skips code-flavoured nodes so documentation of Jinja
20
+ * syntax doesn't false-positive.
21
+ */
22
+ function findHits(nodes, out) { // appends hits to `out`; returns nothing
23
+ for (const node of nodes) {
24
+ if (node.type === "code")
25
+ continue; // a code node is skipped entirely: its inline, html, title and children are never inspected
26
+ if (node.inline)
27
+ findInlineHits(node.inline, out); // inline content is handled by the inline walker
28
+ if (typeof node.html === "string")
29
+ scanText(node.html, out);
30
+ if (typeof node.props?.title === "string")
31
+ scanText(node.props.title, out); // a string `props.title` is scanned as well
32
+ if (node.children)
33
+ findHits(node.children, out); // recurse into child nodes
34
+ }
35
+ }
36
+ function findInlineHits(nodes, out) { // appends hits to `out`; returns nothing
37
+ for (const node of nodes) {
38
+ if (node.type === "code")
39
+ continue; // inline code is never scanned
40
+ if (node.type === "text") {
41
+ scanText(node.text, out);
42
+ }
43
+ else if (node.type === "link" && node.children) {
44
+ findInlineHits(node.children, out); // recurse into the link's children
45
+ }
46
+ else if (node.type === "highlight" && node.children) {
47
+ findInlineHits(node.children, out); // same recursion as for links
48
+ }
49
+ else if (node.type === "html-inline" && typeof node.html === "string") {
50
+ scanText(node.html, out);
51
+ } // there is no final else: any other inline node type is ignored
52
+ }
53
+ }
54
+ function scanText(text, out) { // pushes { match, kind } records onto `out`; returns nothing
55
+ let m;
56
+ VAR_RE.lastIndex = 0; // shared /g regex: reset so exec() starts at the beginning of this text
57
+ while ((m = VAR_RE.exec(text)) !== null) {
58
+ out.push({ match: m[0], kind: "variable" }); // m[0] is the whole matched directive, delimiters included
59
+ }
60
+ BLOCK_RE.lastIndex = 0; // same reset for the block regex
61
+ while ((m = BLOCK_RE.exec(text)) !== null) {
62
+ out.push({ match: m[0], kind: "block" }); // block hits come after all variable hits, so `out` is not in document order
63
+ }
64
+ }
65
+ /**
66
+ * Build the three-fix message for one hit. Keep it dense but
67
+ * actionable — the audit text formatter renders one line per
68
+ * finding plus the message.
69
+ */
70
+ function formatMessage(hit) { // reads hit.kind and hit.match; returns one message string
71
+ const kindLabel = hit.kind === "variable"
72
+ ? "variable reference"
73
+ : "conditional / block directive"; // any kind other than "variable" gets this label
74
+ return (`Unresolved Minja ${kindLabel} \`${hit.match}\` survived to the rendered ` +
75
+ `page. Fix one of: ` +
76
+ `(1) add the missing value to the \`attributes:\` block in ` +
77
+ `\`dogsbay.config.yml\`; ` +
78
+ `(2) pass \`dogsbay site build --attribute name=value\` for per-build ` +
79
+ `overrides (CI secrets, deploy targets); ` +
80
+ `(3) if the literal is intentional (e.g. the page documents Jinja ` +
81
+ `syntax), add \`preprocess: false\` to the page's frontmatter.`);
82
+ }
83
+ export const unresolvedDirectives = { // rule object: metadata fields plus a run() method
84
+ id: "structure/unresolved-directives",
85
+ category: "structure",
86
+ stage: "source",
87
+ severity: "warning",
88
+ description: "Flags surviving Minja `{{ var }}` / `{% if %}` directives in page content " +
89
+ "(the preprocessor couldn't resolve them and they're heading to the rendered HTML).",
90
+ run(rawCtx) { // returns an array of findings, empty when there is nothing to report
91
+ const ctx = rawCtx; // plain alias; presumably left over from a compiled-away type cast — confirm against the original source
92
+ if (!ctx.page)
93
+ return []; // no page on the context: nothing to scan
94
+ if (ctx.page.frontmatter?.preprocess === false)
95
+ return []; // page opted out via `preprocess: false` in its frontmatter
96
+ const hits = [];
97
+ findHits(ctx.page.tree, hits); // fills `hits` in place
98
+ if (hits.length === 0)
99
+ return [];
100
+ return hits.map((hit) => ({ // one finding per hit
101
+ ruleId: "structure/unresolved-directives",
102
+ severity: "warning",
103
+ file: `${ctx.page.slug}.md`, // reported path is the page slug with ".md" appended
104
+ message: formatMessage(hit),
105
+ context: hit.match, // the raw directive text as matched
106
+ }));
107
+ },
108
+ };