@dwk/webmention 0.1.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +140 -0
  3. package/dist/discovery.d.ts +43 -0
  4. package/dist/discovery.d.ts.map +1 -0
  5. package/dist/discovery.js +128 -0
  6. package/dist/discovery.js.map +1 -0
  7. package/dist/fetch.d.ts +28 -0
  8. package/dist/fetch.d.ts.map +1 -0
  9. package/dist/fetch.js +73 -0
  10. package/dist/fetch.js.map +1 -0
  11. package/dist/html.d.ts +68 -0
  12. package/dist/html.d.ts.map +1 -0
  13. package/dist/html.js +183 -0
  14. package/dist/html.js.map +1 -0
  15. package/dist/inbox.d.ts +41 -0
  16. package/dist/inbox.d.ts.map +1 -0
  17. package/dist/inbox.js +73 -0
  18. package/dist/inbox.js.map +1 -0
  19. package/dist/index.d.ts +96 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +161 -0
  22. package/dist/index.js.map +1 -0
  23. package/dist/log.d.ts +42 -0
  24. package/dist/log.d.ts.map +1 -0
  25. package/dist/log.js +40 -0
  26. package/dist/log.js.map +1 -0
  27. package/dist/safe-fetch.d.ts +101 -0
  28. package/dist/safe-fetch.d.ts.map +1 -0
  29. package/dist/safe-fetch.js +348 -0
  30. package/dist/safe-fetch.js.map +1 -0
  31. package/dist/sender.d.ts +43 -0
  32. package/dist/sender.d.ts.map +1 -0
  33. package/dist/sender.js +80 -0
  34. package/dist/sender.js.map +1 -0
  35. package/dist/validate.d.ts +47 -0
  36. package/dist/validate.d.ts.map +1 -0
  37. package/dist/validate.js +76 -0
  38. package/dist/validate.js.map +1 -0
  39. package/dist/verify.d.ts +61 -0
  40. package/dist/verify.d.ts.map +1 -0
  41. package/dist/verify.js +216 -0
  42. package/dist/verify.js.map +1 -0
  43. package/package.json +45 -0
  44. package/src/discovery.ts +167 -0
  45. package/src/fetch.ts +84 -0
  46. package/src/html.ts +206 -0
  47. package/src/inbox.ts +121 -0
  48. package/src/index.ts +297 -0
  49. package/src/log.ts +44 -0
  50. package/src/safe-fetch.ts +405 -0
  51. package/src/sender.ts +131 -0
  52. package/src/validate.ts +116 -0
  53. package/src/verify.ts +294 -0
package/src/verify.ts ADDED
@@ -0,0 +1,294 @@
1
+ /**
2
+ * `@dwk/webmention` — asynchronous source verification (receiver side).
3
+ *
4
+ * After the receiver has returned `202 Accepted`, a queued worker fetches the
5
+ * `source` and confirms it actually links to `target` (Webmention §3.2.2).
6
+ * Verification is link-level: the source document must contain a link
7
+ * (`href`/`src`) that resolves to the target. Full Microformats2 extraction is
8
+ * intentionally out of scope here — it would pull a parser into the Worker
9
+ * bundle the runtime budget rules out. See `spec/packages/webmention.md`.
10
+ *
11
+ * @packageDocumentation
12
+ */
13
+
14
+ import {
15
+ hostFromUrl,
16
+ noopLogger,
17
+ noopMetrics,
18
+ type Logger,
19
+ type Metrics,
20
+ } from "@dwk/log";
21
+ import {
22
+ isHtmlContentType,
23
+ isJsonContentType,
24
+ resolveUrl,
25
+ scanElements,
26
+ } from "./html";
27
+ import { readBodyCapped, type FetchLike } from "./fetch";
28
+ import { WebmentionLogEvent } from "./log";
29
+ import { safeFetch } from "./safe-fetch";
30
+
31
/** Tag names whose `href` attribute counts as a link to the target. */
const HREF_TAGS = new Set(["a", "link", "area"]);
/** Tag names whose `src` attribute counts as a link to the target. */
const SRC_TAGS = new Set(["img", "video", "audio", "source", "track"]);

/** Selector passed to `scanElements`: `<base>` plus every link-bearing tag. */
const LINK_SELECTOR = "base, a, link, area, img, video, audio, source, track";

/**
 * Collect every link URL (`href` and `src`) found in an HTML document, each
 * resolved to an absolute URL.
 *
 * Relative values resolve against the first `<base>` element carrying a
 * non-empty `href` (itself resolved against `baseUrl`); when there is none, or
 * it cannot be resolved, `baseUrl` is used. Values that are missing, empty, or
 * unresolvable are skipped.
 *
 * @param html - The HTML document text to scan.
 * @param baseUrl - URL the document was fetched from.
 * @returns All `href` links in scan order, followed by all `src` links.
 */
export async function extractLinks(
  html: string,
  baseUrl: string,
): Promise<string[]> {
  const elements = await scanElements(html, LINK_SELECTOR, ["href", "src"]);

  // Only the first <base> with a usable href is consulted; later ones are ignored.
  let documentBase = baseUrl;
  for (const { name, attrs } of elements) {
    if (name !== "base" || !attrs.href) {
      continue;
    }
    documentBase = resolveUrl(attrs.href, baseUrl) ?? baseUrl;
    break;
  }

  // Two passes over the same elements keep every `href` ahead of every `src`.
  const passes = [
    [HREF_TAGS, "href"],
    [SRC_TAGS, "src"],
  ] as const;

  const found: string[] = [];
  for (const [tags, attr] of passes) {
    for (const { name, attrs } of elements) {
      if (!tags.has(name)) {
        continue;
      }
      const raw = attrs[attr];
      if (raw === null || raw === undefined || raw === "") {
        continue;
      }
      const absolute = resolveUrl(raw, documentBase);
      if (absolute !== null) {
        found.push(absolute);
      }
    }
  }
  return found;
}
79
+
80
/**
 * Characters that unambiguously continue a URL token: an alphanumeric, or a
 * structural delimiter (path / query / fragment / userinfo). One of these
 * abutting the target means the target is part of a longer URL.
 */
const URL_CORE_CHAR = /[A-Za-z0-9_/\-~%+=&?#@]/;

/**
 * The full RFC 3986 URL character set (unreserved + reserved + `%`). Punctuation
 * such as `.` `,` `;` `)` `]` is valid inside a URL but also routinely trails or
 * wraps one in prose, so on its own it does not prove continuation — only when
 * it is itself followed by a {@link URL_CORE_CHAR} (e.g. the `.` in `…/post.html`).
 */
const URL_CHAR = /[A-Za-z0-9\-._~:/?#[\]@!$&'()*+,;=%]/;

/**
 * Whether `body` contains `target` as a standalone URL token — present, with
 * neither neighbour continuing a URL. This rejects the over-matches a bare
 * substring admits (`…/post` inside `…/posting`, `…/target` inside
 * `…/target/extra`, or the target as a suffix of a longer URL) while still
 * accepting a target trailed by sentence punctuation or wrapped in brackets.
 *
 * @param body - Text to search.
 * @param target - URL to look for; an empty string never matches.
 * @returns `true` when at least one occurrence of `target` stands alone.
 */
function textHasUrlToken(body: string, target: string): boolean {
  // An empty needle "occurs" at every index, and `indexOf("", i)` clamps to
  // `body.length`, so the scan below would stop advancing and never terminate.
  if (target === "") {
    return false;
  }

  let from = body.indexOf(target);
  while (from !== -1) {
    const end = from + target.length;
    // `charAt` yields "" outside the string, which no character class matches.
    const before = body.charAt(from - 1);
    const after = body.charAt(end);
    const afterNext = body.charAt(end + 1);

    // A preceding core URL char makes the target a suffix of a longer URL.
    const beforeContinues = URL_CORE_CHAR.test(before);
    // A following core URL char continues the URL; a punctuation URL char only
    // continues it when itself followed by a core char (so `.html` continues,
    // but a sentence-ending `.` or a wrapping `)` is a boundary).
    const afterContinues =
      URL_CORE_CHAR.test(after) ||
      (URL_CHAR.test(after) && URL_CORE_CHAR.test(afterNext));

    if (!beforeContinues && !afterContinues) {
      return true;
    }
    // Step one character, not one match, so overlapping occurrences are seen.
    from = body.indexOf(target, from + 1);
  }
  return false;
}
125
+
126
/**
 * Whether any string value within a parsed JSON value equals `target` exactly.
 * Webmention §3.2.2 requires an exact match of the target URL in a non-HTML
 * source, so a JSON body is walked for a string property value identical to the
 * target rather than substring-scanned.
 *
 * Only values are inspected — object keys are never compared. The walk uses an
 * explicit work stack instead of recursion because the value comes from an
 * untrusted remote document: arbitrarily deep nesting must not be able to
 * overflow the call stack.
 *
 * @param value - A parsed JSON value (any depth of arrays/objects).
 * @param target - The URL string to look for.
 * @returns `true` when some string value is identical to `target`.
 */
function jsonHasTargetValue(value: unknown, target: string): boolean {
  const pending: unknown[] = [value];
  while (pending.length > 0) {
    const current = pending.pop();
    if (typeof current === "string") {
      if (current === target) {
        return true;
      }
      continue;
    }
    // Push children one at a time: spreading a very large array into `push`
    // could exceed the engine's argument-count limit.
    if (Array.isArray(current)) {
      for (const item of current) {
        pending.push(item);
      }
    } else if (current !== null && typeof current === "object") {
      for (const item of Object.values(current)) {
        pending.push(item);
      }
    }
  }
  return false;
}
146
+
147
/**
 * Decide whether a fetched source document (`body`) links to `target`.
 *
 * The check depends on `contentType`:
 * - HTML: some `href`/`src` in the document must resolve to the normalized
 *   target.
 * - JSON: some string value in the parsed body must equal the target exactly;
 *   a body that fails to parse never matches.
 * - Anything else (e.g. plain text): the target must appear as a standalone
 *   URL token.
 *
 * For the non-HTML cases the target is tried as given and, when normalization
 * changes it, in its normalized form too (Webmention §3.2.2 asks for an exact
 * match rather than a loose substring).
 *
 * @param body - Text of the fetched source document.
 * @param target - The URL the source is claimed to mention.
 * @param baseUrl - URL used to resolve relative links in an HTML body.
 * @param contentType - The source's `Content-Type` value.
 * @returns `true` when the source links to the target; `false` when it does
 *   not, or when `target` cannot be resolved as a URL.
 */
export async function sourceLinksTo(
  body: string,
  target: string,
  baseUrl: string,
  contentType: string,
): Promise<boolean> {
  const canonical = resolveUrl(target, target);
  if (canonical === null) {
    return false;
  }

  if (isHtmlContentType(contentType)) {
    const links = await extractLinks(body, baseUrl);
    return links.includes(canonical);
  }

  // Raw spelling first, then the normalized one only if it actually differs.
  const candidates = canonical === target ? [target] : [target, canonical];

  if (isJsonContentType(contentType)) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(body);
    } catch {
      // A body that claims to be JSON but does not parse cannot exact-match.
      return false;
    }
    return candidates.some((candidate) =>
      jsonHasTargetValue(parsed, candidate),
    );
  }

  return candidates.some((candidate) => textHasUrlToken(body, candidate));
}
190
+
191
+ /** Optional collaborators for {@link verifySource}; every field may be omitted. */
192
+ export interface VerifyOptions {
193
+ /** `fetch` implementation used for the source request; when omitted, the global `fetch` is called. */
194
+ readonly fetch?: FetchLike;
195
+ /** Logger receiving verification-outcome and fetch-failure events; defaults to `noopLogger`. */
196
+ readonly logger?: Logger;
197
+ /** Metrics sink counting verification outcomes; defaults to `noopMetrics`. */
198
+ readonly metrics?: Metrics;
199
+ }
200
+
201
/**
 * Report one verification outcome to both the logger and the metrics sink,
 * then return `result` untouched so callers can `return` this call directly.
 *
 * Only the hosts of `source` and `target` (via `hostFromUrl`) are reported,
 * never the full URLs. The log entry and the counter carry the same fields, so
 * a success rate can be charted from `links`/`status`.
 *
 * @param logger - Receives the `VerifyCompleted` event at info level.
 * @param metrics - Receives a `VerifyCompleted` count with the same fields.
 * @param source - Source URL that was verified.
 * @param target - Target URL the source was checked against.
 * @param result - The outcome being reported.
 * @returns The same `result` object that was passed in.
 */
function recordVerifyOutcome(
  logger: Logger,
  metrics: Metrics,
  source: string,
  target: string,
  result: VerifyResult,
): VerifyResult {
  const sourceHost = hostFromUrl(source);
  const targetHost = hostFromUrl(target);
  const { links, status } = result;
  // One shared object so the log line and the counter can never disagree.
  const fields = { sourceHost, targetHost, links, status };

  logger.info(WebmentionLogEvent.VerifyCompleted, fields);
  metrics.count(WebmentionLogEvent.VerifyCompleted, fields);
  return result;
}
223
+
224
+ /** Outcome of fetching a source document and checking it for a link to the target. */
225
+ export interface VerifyResult {
226
+ /** Whether the source links to the target; `false` when the fetch threw, was non-2xx, or the body was unreadable. */
227
+ readonly links: boolean;
228
+ /** The source's HTTP status (`0` when the fetch threw). */
229
+ readonly status: number;
230
+ }
231
+
232
/**
 * Fetch `source` and verify that it links to `target`.
 *
 * Fetches through the SSRF-safe wrapper ({@link safeFetch}): the source host —
 * and every redirect hop — is validated against private/loopback/link-local
 * ranges, redirects are capped, and the request is bounded by a timeout.
 * Relative links resolve against the final URL. A failed, blocked, or non-2xx
 * fetch yields `{ links: false }` — a removed/unreachable source no longer
 * endorses the mention.
 *
 * @param source - URL of the document claimed to mention `target`.
 * @param target - URL the source must link to.
 * @param options - Optional `fetch`, logger and metrics overrides.
 * @returns Whether the source links to the target, plus the HTTP status
 *   (`0` when the fetch itself threw).
 */
export async function verifySource(
  source: string,
  target: string,
  options?: VerifyOptions,
): Promise<VerifyResult> {
  const doFetch: FetchLike =
    options?.fetch ?? ((input, init) => fetch(input, init));
  const logger = options?.logger ?? noopLogger;
  const metrics = options?.metrics ?? noopMetrics;

  let response: Response;
  let base: string;
  try {
    const result = await safeFetch(
      doFetch,
      source,
      { method: "GET", headers: { accept: "text/html, */*" } },
      { logger, metrics },
    );
    response = result.response;
    base = result.url;
  } catch (err) {
    // A blocked attempt is already logged as `ssrf.blocked` inside safeFetch;
    // record the verification-level failure too so the outcome isn't silent.
    logger.debug(WebmentionLogEvent.VerifyFetchFailed, {
      sourceHost: hostFromUrl(source),
      error: err instanceof Error ? err.name : "unknown",
    });
    return { links: false, status: 0 };
  }

  if (!response.ok) {
    // The error body is never read, so cancel it explicitly: an unconsumed
    // body stream keeps the underlying connection held open.
    try {
      await response.body?.cancel();
    } catch {
      // Already consumed or locked — nothing left to release.
    }
    return recordVerifyOutcome(logger, metrics, source, target, {
      links: false,
      status: response.status,
    });
  }

  const contentType = response.headers.get("content-type") ?? "";
  const body = await readBodyCapped(response);
  if (body === null) {
    // Unreadable or oversized body: treat as no longer endorsing the mention.
    return recordVerifyOutcome(logger, metrics, source, target, {
      links: false,
      status: response.status,
    });
  }

  return recordVerifyOutcome(logger, metrics, source, target, {
    links: await sourceLinksTo(body, target, base, contentType),
    status: response.status,
  });
}