@blamejs/exceptd-skills 0.13.19 → 0.13.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,344 @@
1
+ "use strict";
2
+ /**
3
+ * lib/xml-tokenizer.js
4
+ *
5
+ * Minimal but proper XML/RSS/Atom tokenizer. Replaces the regex-based
6
+ * parser in lib/source-advisories.js. The regex approach silently
7
+ * failed on:
8
+ * - XML namespaces (`<atom:entry>` vs `<entry>`)
9
+ * - Nested CDATA
10
+ * - Self-closing `<link href="..."/>` vs container `<link>...</link>`
11
+ * - HTML-escaped entities inside titles
12
+ * - Multi-line title content
13
+ *
14
+ * Failures returned `[]` silently — operators never saw the parser was
15
+ * broken on a given feed. This module fails loudly via tokenizer
16
+ * errors so a parser regression is visible in the refresh report.
17
+ *
18
+ * Design constraints:
19
+ * - Zero runtime dependencies (the project ships with no deps).
20
+ * - Streaming-friendly via a callback API (does not buffer the whole
21
+ * DOM — relevant for the 15 MB IETF RFC index).
22
+ * - Namespace-aware via element-localname matching (the local-name
23
+ * of `<atom:entry>` is `entry`).
24
+ * - CDATA-correct: `<![CDATA[...]]>` content is passed through verbatim
25
+ * including unescaped `<` and `&`.
26
+ * - Entity-correct: the five named XML entities (lt, gt, amp, apos,
27
+ * quot) plus numeric character references (`&#NNN;`, `&#xHH;`)
28
+ * are decoded. Other named entities pass through unchanged (HTML
29
+ * entities in RSS bodies are a recoverable variant we tolerate).
30
+ *
31
+ * Not designed for: DTD parsing, XInclude, XSLT, or any external-entity
32
+ * resolution. Feeds that need those features are outside the scope of
33
+ * a security-tooling intake pipeline.
34
+ *
35
+ * API:
36
+ * const { parseFeed } = require("./xml-tokenizer");
37
+ * const items = parseFeed(xmlString); // returns [{title, link, published, body}, ...]
38
+ *
39
+ * For lower-level use, the underlying tokenizer is exported too:
40
+ * const { tokenize } = require("./xml-tokenizer");
41
+ * tokenize(xml, { onTagOpen, onTagClose, onText, onCData });
42
+ */
43
+
44
// Decode the five canonical XML entities + numeric character references.
// Unknown named entities pass through unchanged (we're tolerant of
// HTML-style entities that legitimately appear in RSS body text).
//
// Numeric references that cannot denote a usable character are left
// untouched as well: NUL (`&#0;`), the UTF-16 surrogate range
// (U+D800..U+DFFF, which would yield an ill-formed string) and anything
// above U+10FFFF. None of these are legal XML characters, and decoding
// them would smuggle NUL bytes / lone surrogates into downstream output.
//
// Non-string input is returned as-is. Decoding is single-pass, so
// `&amp;lt;` becomes `&lt;`, not `<`.
function decodeEntities(s) {
  if (typeof s !== "string") return s;
  return s.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g, (m, ref) => {
    if (ref[0] === "#") {
      // The pattern only admits a lowercase `x` as the hex marker.
      const codepoint = ref[1] === "x"
        ? parseInt(ref.slice(2), 16)
        : parseInt(ref.slice(1), 10);
      const isSurrogate = codepoint >= 0xd800 && codepoint <= 0xdfff;
      if (
        !Number.isInteger(codepoint) ||
        codepoint < 1 ||
        codepoint > 0x10ffff ||
        isSurrogate
      ) {
        return m; // not a decodable character — leave the reference untouched
      }
      return String.fromCodePoint(codepoint);
    }
    switch (ref) {
      case "lt": return "<";
      case "gt": return ">";
      case "amp": return "&";
      case "apos": return "'";
      case "quot": return '"';
      default: return m; // unknown named entity — leave untouched
    }
  });
}
67
+
68
// Return the local part of a possibly namespace-prefixed name: everything
// after the first `:`. A name without a colon is returned unchanged.
function localName(qname) {
  const colonAt = qname.indexOf(":");
  if (colonAt === -1) {
    return qname;
  }
  return qname.slice(colonAt + 1);
}
73
+
74
// Parse the raw attribute text of a tag (everything after the element
// name) into an object keyed by attribute local-name.
//
//   - Quoted values (single or double) may contain `=`, `>` and
//     whitespace; an unterminated quoted value runs to the end of input.
//   - Unquoted values run up to the next whitespace.
//   - A name with no `=` is stored with the empty string as its value.
//   - Values are entity-decoded; names have any `prefix:` removed.
//   - An empty name (e.g. a stray leading `=`) stops parsing; whatever
//     was collected so far is returned.
function parseAttrs(rawAttrs) {
  const attrs = {};
  if (!rawAttrs) return attrs;

  const end = rawAttrs.length;
  let pos = 0;

  const isSpace = (ch) => /\s/.test(ch);
  // Advance the cursor past any run of whitespace.
  const skipSpaces = () => {
    while (pos < end && isSpace(rawAttrs[pos])) pos += 1;
  };
  // Consume characters until `stop(ch)` holds (or input ends) and return
  // the consumed text. The stopping character itself is not consumed.
  const takeUntil = (stop) => {
    const from = pos;
    while (pos < end && !stop(rawAttrs[pos])) pos += 1;
    return rawAttrs.slice(from, pos);
  };

  while (pos < end) {
    skipSpaces();
    if (pos >= end) break;

    const qname = takeUntil((ch) => ch === "=" || isSpace(ch));
    if (!qname) break;
    const key = localName(qname);

    skipSpaces();
    if (rawAttrs[pos] !== "=") {
      // Bare attribute without a value — uncommon in XML but tolerated.
      attrs[key] = "";
      continue;
    }
    pos += 1; // step over '='
    skipSpaces();

    const delimiter = rawAttrs[pos];
    if (delimiter === '"' || delimiter === "'") {
      pos += 1; // step over the opening quote
      attrs[key] = decodeEntities(takeUntil((ch) => ch === delimiter));
      pos += 1; // step over the closing quote
    } else {
      // Unquoted value — runs until whitespace or end of input.
      attrs[key] = decodeEntities(takeUntil(isSpace));
    }
  }
  return attrs;
}
112
+
113
/**
 * Find the index of the `>` that ends the element tag opening at `start`
 * (the index of its `<`).
 *
 * `>` is legal inside a quoted attribute value (`<a title="x > y">`), so
 * the scan skips over quoted values. A quote only opens a value when it
 * directly follows `=` (optionally separated by whitespace); stray quote
 * characters elsewhere in a sloppy tag are treated as ordinary text.
 *
 * If a quoted value is never closed, the search falls back to the first
 * `>` after `start`, so malformed tags are delimited the same way a
 * plain `indexOf(">")` would delimit them. Returns -1 when the input
 * contains no `>` at or after `start`.
 */
function findElementTagEnd(xml, start) {
  let quote = "";
  let afterEquals = false;
  for (let j = start + 1; j < xml.length; j++) {
    const ch = xml[j];
    if (quote) {
      if (ch === quote) quote = "";
      continue;
    }
    if (ch === ">") return j;
    if (afterEquals && (ch === '"' || ch === "'")) {
      quote = ch;
      afterEquals = false;
    } else if (ch === "=") {
      afterEquals = true;
    } else if (afterEquals && !/\s/.test(ch)) {
      afterEquals = false;
    }
  }
  return xml.indexOf(">", start);
}

/**
 * Streaming tokenizer. Calls handlers in document order. Returns no
 * value — accumulation is the caller's responsibility.
 *
 * Handlers (all optional):
 *   onTagOpen(name, attrs, selfClosing)
 *   onTagClose(name)                     also called right after onTagOpen
 *                                        for a self-closing tag
 *   onText(text)                         decoded
 *   onCData(text)                        verbatim, not decoded; falls back
 *                                        to onText when onCData is absent
 *   onComment(text)
 *   onPI(name, content)                  processing instructions (<?xml-stylesheet?>)
 *   onError(message, position)
 *
 * Element names passed to handlers are local names (any `prefix:` is
 * stripped). An unterminated comment, CDATA section, processing
 * instruction, declaration or element tag is reported through onError
 * and stops tokenizing. Elements still open at end of input are reported
 * through onError as well.
 */
function tokenize(xml, handlers) {
  const H = handlers || {};
  if (typeof xml !== "string") {
    if (H.onError) H.onError("input must be a string", 0);
    return;
  }
  const len = xml.length;
  let i = 0;
  // Open-tag stack — surfaces EOF-with-unclosed-elements as an error
  // instead of silently dropping the residual content, so a truncated
  // feed is visible to the caller.
  const openStack = [];
  while (i < len) {
    const next = xml.indexOf("<", i);
    if (next === -1) {
      // Trailing text — flush.
      const tail = xml.slice(i);
      if (tail.length && H.onText) H.onText(decodeEntities(tail));
      if (openStack.length && H.onError) {
        H.onError("unterminated element at EOF: " + openStack[openStack.length - 1], len);
      }
      return;
    }
    if (next > i) {
      const text = xml.slice(i, next);
      if (text.length && H.onText) H.onText(decodeEntities(text));
    }
    // Now at `<` — classify the construct.
    if (xml.startsWith("<!--", next)) {
      const end = xml.indexOf("-->", next + 4);
      if (end === -1) {
        if (H.onError) H.onError("unterminated comment", next);
        return;
      }
      if (H.onComment) H.onComment(xml.slice(next + 4, end));
      i = end + 3;
      continue;
    }
    if (xml.startsWith("<![CDATA[", next)) {
      const end = xml.indexOf("]]>", next + 9);
      if (end === -1) {
        if (H.onError) H.onError("unterminated CDATA section", next);
        return;
      }
      // CDATA content is verbatim — entities NOT decoded.
      if (H.onCData) H.onCData(xml.slice(next + 9, end));
      else if (H.onText) H.onText(xml.slice(next + 9, end));
      i = end + 3;
      continue;
    }
    if (xml.startsWith("<?", next)) {
      const end = xml.indexOf("?>", next + 2);
      if (end === -1) {
        if (H.onError) H.onError("unterminated processing instruction", next);
        return;
      }
      if (H.onPI) {
        const piBody = xml.slice(next + 2, end).trim();
        // The target is separated from its content by any whitespace
        // (space, tab or newline), not only by a space character.
        const wsAt = piBody.search(/\s/);
        const name = wsAt === -1 ? piBody : piBody.slice(0, wsAt);
        const content = wsAt === -1 ? "" : piBody.slice(wsAt + 1).trimStart();
        H.onPI(name, content);
      }
      i = end + 2;
      continue;
    }
    if (xml.startsWith("<!", next)) {
      // DOCTYPE or other declaration — skip to the `>` that balances the
      // opening `<`, counting nested `<` / `>` pairs.
      let depth = 1;
      let j = next + 2;
      while (j < len && depth > 0) {
        if (xml[j] === "<") depth++;
        else if (xml[j] === ">") depth--;
        if (depth > 0) j++;
      }
      if (depth !== 0) {
        if (H.onError) H.onError("unterminated declaration", next);
        return;
      }
      i = j + 1;
      continue;
    }
    // Element tag — open / close / self-closing. The end of the tag is
    // located with a quote-aware scan so `>` inside an attribute value
    // does not cut the tag short.
    const close = findElementTagEnd(xml, next);
    if (close === -1) {
      if (H.onError) H.onError("unterminated element tag", next);
      return;
    }
    let inner = xml.slice(next + 1, close);
    let isClose = false;
    let selfClose = false;
    if (inner.startsWith("/")) { isClose = true; inner = inner.slice(1); }
    if (inner.endsWith("/")) { selfClose = true; inner = inner.slice(0, -1); }
    inner = inner.trim();
    // Split name and attrs at the first whitespace.
    const wsAt = inner.search(/\s/);
    const rawName = wsAt === -1 ? inner : inner.slice(0, wsAt);
    const rawAttrs = wsAt === -1 ? "" : inner.slice(wsAt + 1);
    const name = localName(rawName);
    if (isClose) {
      // Only a close tag matching the innermost open element pops the
      // stack; a mismatched close tag is still forwarded to the handler.
      if (openStack.length && openStack[openStack.length - 1] === name) openStack.pop();
      if (H.onTagClose) H.onTagClose(name);
    } else {
      const attrs = parseAttrs(rawAttrs);
      if (!selfClose) openStack.push(name);
      if (H.onTagOpen) H.onTagOpen(name, attrs, selfClose);
      if (selfClose && H.onTagClose) H.onTagClose(name);
    }
    i = close + 1;
  }
  if (openStack.length && H.onError) {
    H.onError("unterminated element at EOF: " + openStack[openStack.length - 1], len);
  }
}
242
+
243
/**
 * Parse an RSS / Atom feed into a flat array of items:
 *   [{ title, link, published, body }, ...]
 *
 * Every key is always present; a field the feed does not supply is the
 * empty string. An item is emitted when its closing `</item>` /
 * `</entry>` tag is seen, so an item still open when tokenizing stops is
 * not returned — the array then holds only the items completed up to
 * that point (possibly none).
 *
 * Field rules:
 *   - title / description / content / summary have HTML tags stripped.
 *   - pubDate / published / updated all feed `published`; the last
 *     non-empty one wins.
 *   - link: element text (`<link>url</link>`) or the `href` attribute.
 *     A link whose `rel` is absent or "alternate" is never replaced by a
 *     link with another `rel` (self, enclosure, replies, ...).
 *   - An empty or self-closing field element never clears a value that
 *     was already captured.
 *
 * @param {string} xml     feed source text
 * @param {Array|null} [errors]  optional out-parameter; tokenizer errors
 *     are appended to it as `{ message, position }`
 * @returns {Array<{title: string, link: string, published: string, body: string}>}
 */
function parseFeed(xml, errors = null) {
  const items = [];
  // RSS uses <item>; Atom uses <entry>; both nest title / link /
  // pubDate / published / updated / description / content / summary.
  const ITEM_LOCALS = new Set(["item", "entry"]);
  // A Map rather than a plain object: element names come from untrusted
  // input, and names such as `constructor` or `__proto__` must not
  // resolve to inherited Object.prototype members.
  const FIELD_MAP = new Map([
    ["title", "title"],
    ["link", "link"],
    ["pubDate", "published"],
    ["published", "published"],
    ["updated", "published"],
    ["description", "body"],
    ["content", "body"],
    ["summary", "body"]
  ]);
  let current = null;              // active item context
  let activeField = null;          // output key of the field being collected
  let buffer = "";                 // accumulator for current field text
  let linkHref = null;             // href of the <link> element currently open
  let linkHrefIsAlternate = false; // whether that href is an "alternate" link
  let linkIsAlternate = false;     // whether current.link holds an "alternate" link

  // A link with no `rel` defaults to "alternate" in Atom.
  const isAlternate = (attrs) => {
    const rel = typeof attrs.rel === "string" ? attrs.rel.trim().toLowerCase() : "";
    return rel === "" || rel === "alternate";
  };
  // Record a link unless it would replace an alternate link with a
  // non-alternate one.
  const storeLink = (value, alternate) => {
    if (linkIsAlternate && !alternate) return;
    current.link = value;
    linkIsAlternate = alternate;
  };
  const resetField = () => {
    activeField = null;
    buffer = "";
    linkHref = null;
    linkHrefIsAlternate = false;
  };

  tokenize(xml, {
    onTagOpen(name, attrs, selfClosing) {
      if (ITEM_LOCALS.has(name)) {
        current = { title: "", link: "", published: "", body: "" };
        linkIsAlternate = false;
        resetField();
        return;
      }
      if (!current) return;
      const field = FIELD_MAP.get(name);
      if (field === undefined) return;
      resetField();
      // Atom <link href="..."/> — capture the href attribute as the
      // link value. RSS <link>...</link> uses element text instead.
      if (name === "link" && attrs && attrs.href) {
        linkHref = attrs.href;
        linkHrefIsAlternate = isAlternate(attrs);
      }
      if (selfClosing) {
        // Nothing to collect from a self-closing element: take the href
        // if there is one and leave every other field untouched. The
        // close event that follows finds no active field and is a no-op.
        if (linkHref) storeLink(linkHref, linkHrefIsAlternate);
        resetField();
        return;
      }
      activeField = field;
    },
    onTagClose(name) {
      if (ITEM_LOCALS.has(name)) {
        if (current) items.push(current);
        current = null;
        linkIsAlternate = false;
        resetField();
        return;
      }
      if (!current) return;
      const field = FIELD_MAP.get(name);
      if (field === undefined || field !== activeField) return;
      const value = buffer.trim();
      if (field === "link") {
        // Element-text link overrides the attribute capture when both
        // are present on the same element.
        if (value) storeLink(value, true);
        else if (linkHref) storeLink(linkHref, linkHrefIsAlternate);
      } else {
        // Strip HTML tags from title + description / content / summary.
        // Many feeds embed inline HTML (<b>, <em>, <a>) for emphasis;
        // the operational consumer wants plain text. CDATA content
        // reaches here verbatim, so this also strips HTML that was
        // wrapped in CDATA to dodge entity-encoding.
        const text = field === "published" ? value : stripHtml(value);
        // An empty element must not clobber an earlier non-empty value.
        if (text) current[field] = text;
      }
      resetField();
    },
    onText(text) {
      if (activeField) buffer += text;
    },
    onCData(text) {
      if (activeField) buffer += text;
    },
    onError(msg, pos) {
      if (errors) errors.push({ message: msg, position: pos });
    }
  });

  return items;
}
336
+
337
// Reduce an HTML-ish fragment to plain text. Anything that is not a
// string yields the empty string.
function stripHtml(s) {
  if (typeof s !== "string") {
    return "";
  }
  // Each tag becomes a space so words on either side of it stay apart.
  // Entity decoding has already happened by the time text gets here.
  const withoutTags = s.replace(/<[^>]+>/g, " ");
  // Collapse every whitespace run (newlines included) to a single space.
  const singleSpaced = withoutTags.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
343
+
344
+ module.exports = { tokenize, parseFeed, decodeEntities, localName, parseAttrs, stripHtml };