@dwk/webmention 0.1.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +140 -0
- package/dist/discovery.d.ts +43 -0
- package/dist/discovery.d.ts.map +1 -0
- package/dist/discovery.js +128 -0
- package/dist/discovery.js.map +1 -0
- package/dist/fetch.d.ts +28 -0
- package/dist/fetch.d.ts.map +1 -0
- package/dist/fetch.js +73 -0
- package/dist/fetch.js.map +1 -0
- package/dist/html.d.ts +68 -0
- package/dist/html.d.ts.map +1 -0
- package/dist/html.js +183 -0
- package/dist/html.js.map +1 -0
- package/dist/inbox.d.ts +41 -0
- package/dist/inbox.d.ts.map +1 -0
- package/dist/inbox.js +73 -0
- package/dist/inbox.js.map +1 -0
- package/dist/index.d.ts +96 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +161 -0
- package/dist/index.js.map +1 -0
- package/dist/log.d.ts +42 -0
- package/dist/log.d.ts.map +1 -0
- package/dist/log.js +40 -0
- package/dist/log.js.map +1 -0
- package/dist/safe-fetch.d.ts +101 -0
- package/dist/safe-fetch.d.ts.map +1 -0
- package/dist/safe-fetch.js +348 -0
- package/dist/safe-fetch.js.map +1 -0
- package/dist/sender.d.ts +43 -0
- package/dist/sender.d.ts.map +1 -0
- package/dist/sender.js +80 -0
- package/dist/sender.js.map +1 -0
- package/dist/validate.d.ts +47 -0
- package/dist/validate.d.ts.map +1 -0
- package/dist/validate.js +76 -0
- package/dist/validate.js.map +1 -0
- package/dist/verify.d.ts +61 -0
- package/dist/verify.d.ts.map +1 -0
- package/dist/verify.js +216 -0
- package/dist/verify.js.map +1 -0
- package/package.json +45 -0
- package/src/discovery.ts +167 -0
- package/src/fetch.ts +84 -0
- package/src/html.ts +206 -0
- package/src/inbox.ts +121 -0
- package/src/index.ts +297 -0
- package/src/log.ts +44 -0
- package/src/safe-fetch.ts +405 -0
- package/src/sender.ts +131 -0
- package/src/validate.ts +116 -0
- package/src/verify.ts +294 -0
package/src/verify.ts
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@dwk/webmention` — asynchronous source verification (receiver side).
|
|
3
|
+
*
|
|
4
|
+
* After the receiver has returned `202 Accepted`, a queued worker fetches the
|
|
5
|
+
* `source` and confirms it actually links to `target` (Webmention §3.2.1).
|
|
6
|
+
* Verification is link-level: the source document must contain a link
|
|
7
|
+
* (`href`/`src`) that resolves to the target. Full Microformats2 extraction is
|
|
8
|
+
* intentionally out of scope here — it would pull a parser into the Worker
|
|
9
|
+
* bundle the runtime budget rules out. See `spec/packages/webmention.md`.
|
|
10
|
+
*
|
|
11
|
+
* @packageDocumentation
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import {
|
|
15
|
+
hostFromUrl,
|
|
16
|
+
noopLogger,
|
|
17
|
+
noopMetrics,
|
|
18
|
+
type Logger,
|
|
19
|
+
type Metrics,
|
|
20
|
+
} from "@dwk/log";
|
|
21
|
+
import {
|
|
22
|
+
isHtmlContentType,
|
|
23
|
+
isJsonContentType,
|
|
24
|
+
resolveUrl,
|
|
25
|
+
scanElements,
|
|
26
|
+
} from "./html";
|
|
27
|
+
import { readBodyCapped, type FetchLike } from "./fetch";
|
|
28
|
+
import { WebmentionLogEvent } from "./log";
|
|
29
|
+
import { safeFetch } from "./safe-fetch";
|
|
30
|
+
|
|
31
|
+
/** Tag names whose `href` attribute is treated as a candidate link to the target. */
const HREF_TAGS = new Set(["a", "link", "area"]);

/** Tag names whose `src` attribute is treated as a candidate link to the target. */
const SRC_TAGS = new Set(["img", "video", "audio", "source", "track"]);

/**
 * Selector handed to the element scanner: `<base>` (needed for relative
 * resolution) followed by every link-bearing tag, in set insertion order.
 * Evaluates to `"base, a, link, area, img, video, audio, source, track"`.
 */
const LINK_SELECTOR = ["base", ...HREF_TAGS, ...SRC_TAGS].join(", ");
|
|
37
|
+
|
|
38
|
+
/**
 * Collect the absolute URL of every link (`href` or `src`) found in an HTML
 * document. All `href` links come first, followed by all `src` links; within
 * each group the scan order is preserved.
 *
 * Relative URLs resolve against the first `<base>` element carrying a
 * non-empty `href` (itself resolved against `baseUrl`); without a usable
 * `<base>` they resolve against `baseUrl` directly. Values that fail to
 * resolve are dropped.
 *
 * Async because HTML scanning runs through the runtime's `HTMLRewriter`, which
 * also means links inside comments are ignored without a separate stripping
 * pass.
 *
 * @param html - The HTML document text to scan.
 * @param baseUrl - The URL the document was retrieved from.
 * @returns The resolved link URLs, `href` links before `src` links.
 */
export async function extractLinks(
  html: string,
  baseUrl: string,
): Promise<string[]> {
  const scanned = await scanElements(html, LINK_SELECTOR, ["href", "src"]);

  // Only the first <base> with a usable href influences resolution.
  let effectiveBase = baseUrl;
  for (const { name, attrs } of scanned) {
    if (name !== "base" || !attrs.href) {
      continue;
    }
    effectiveBase = resolveUrl(attrs.href, baseUrl) ?? baseUrl;
    break;
  }

  // Resolve `attr` on every scanned element whose tag is in `tags`.
  const resolveAll = (
    tags: ReadonlySet<string>,
    attr: "href" | "src",
  ): string[] => {
    const found: string[] = [];
    for (const { name, attrs } of scanned) {
      const raw = tags.has(name) ? attrs[attr] : undefined;
      if (raw === null || raw === undefined || raw === "") {
        continue;
      }
      const absolute = resolveUrl(raw, effectiveBase);
      if (absolute !== null) {
        found.push(absolute);
      }
    }
    return found;
  };

  return [...resolveAll(HREF_TAGS, "href"), ...resolveAll(SRC_TAGS, "src")];
}
|
|
79
|
+
|
|
80
|
+
/**
 * Characters that unambiguously continue a URL token: an alphanumeric, or a
 * structural delimiter (path / query / fragment / userinfo). One of these
 * abutting the target means the target is part of a longer URL.
 */
const URL_CORE_CHAR = /[A-Za-z0-9_/\-~%+=&?#@]/;

/**
 * The full RFC 3986 URL character set (unreserved + reserved + `%`). Punctuation
 * such as `.` `,` `;` `)` `]` is valid inside a URL but also routinely trails or
 * wraps one in prose, so on its own it does not prove continuation — only when
 * it is itself followed by a {@link URL_CORE_CHAR} (e.g. the `.` in `…/post.html`).
 */
const URL_CHAR = /[A-Za-z0-9\-._~:/?#[\]@!$&'()*+,;=%]/;

/**
 * Whether `body` contains `target` as a standalone URL token — present, with
 * neither neighbour continuing a URL. This rejects the over-matches a bare
 * substring admits (`…/post` inside `…/posting`, `…/target` inside
 * `…/target/extra`, or the target as a suffix of a longer URL) while still
 * accepting a target trailed by sentence punctuation or wrapped in brackets.
 *
 * Every occurrence is examined; the first one with clean boundaries on both
 * sides wins.
 *
 * @param body - The text to search.
 * @param target - The URL to look for. An empty string never matches.
 * @returns `true` when at least one occurrence of `target` stands alone.
 */
function textHasUrlToken(body: string, target: string): boolean {
  // An empty needle is not a URL. It must also be rejected up front because
  // `indexOf("", from)` clamps to `body.length` instead of returning -1, so the
  // occurrence scan below would never terminate on some bodies.
  if (target === "") {
    return false;
  }

  let at = body.indexOf(target);
  while (at !== -1) {
    const end = at + target.length;
    // `charAt` yields "" outside the string, which matches neither pattern.
    const before = at === 0 ? "" : body.charAt(at - 1);
    const after = body.charAt(end);
    const afterNext = body.charAt(end + 1);

    // A preceding core URL char makes the target a suffix of a longer URL.
    const beforeContinues = URL_CORE_CHAR.test(before);
    // A following core URL char continues the URL; a punctuation URL char only
    // continues it when itself followed by a core char (so `.html` continues,
    // but a sentence-ending `.` or a wrapping `)` is a boundary).
    const afterContinues =
      URL_CORE_CHAR.test(after) ||
      (URL_CHAR.test(after) && URL_CORE_CHAR.test(afterNext));

    if (!beforeContinues && !afterContinues) {
      return true;
    }
    // Step by one so overlapping occurrences are still considered.
    at = body.indexOf(target, at + 1);
  }
  return false;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Whether any string value within a parsed JSON value equals `target` exactly.
 * Webmention §3.2.2 requires an exact match of the target URL in a non-HTML
 * source, so a JSON body is walked for a string value identical to the target
 * rather than substring-scanned. Object keys are never compared — only values.
 *
 * The walk is iterative (explicit work stack) rather than recursive: the value
 * comes from an untrusted remote document, and a pathologically deep nesting
 * would otherwise exhaust the call stack and surface as a `RangeError`.
 *
 * @param value - A value as produced by `JSON.parse`.
 * @param target - The exact string to look for.
 * @returns `true` when some string reachable through arrays/objects equals
 *   `target`.
 */
function jsonHasTargetValue(value: unknown, target: string): boolean {
  const pending: unknown[] = [value];
  while (pending.length > 0) {
    const current = pending.pop();
    if (typeof current === "string") {
      if (current === target) {
        return true;
      }
      continue;
    }
    if (Array.isArray(current)) {
      // Push one by one: spreading a huge array into `push` can itself
      // overflow the argument limit.
      for (const item of current) {
        pending.push(item);
      }
      continue;
    }
    if (current !== null && typeof current === "object") {
      for (const item of Object.values(current)) {
        pending.push(item);
      }
    }
    // Numbers, booleans and null can never equal a string target.
  }
  return false;
}
|
|
146
|
+
|
|
147
|
+
/**
 * Decide whether `body` (a fetched source document) links to `target`.
 *
 * The check depends on the declared content type:
 * - HTML: some `href`/`src` in the document must resolve to the normalized
 *   target.
 * - JSON: the parsed document must hold a string value equal to the target; a
 *   body that fails to parse never matches.
 * - Anything else (e.g. plain text): the body must contain the target as a
 *   standalone URL token.
 *
 * Non-HTML types require an **exact** match (Webmention §3.2.2), not a loose
 * substring. For those, the target is tried as given and, when normalization
 * changes it, in its normalized spelling as well.
 *
 * @param body - The source document text.
 * @param target - The target URL the source is claimed to mention.
 * @param baseUrl - URL used to resolve relative links in an HTML body.
 * @param contentType - The source's `Content-Type` header value.
 * @returns `true` when the source links to the target; `false` otherwise,
 *   including when `target` cannot be normalized.
 */
export async function sourceLinksTo(
  body: string,
  target: string,
  baseUrl: string,
  contentType: string,
): Promise<boolean> {
  const canonical = resolveUrl(target, target);
  if (canonical === null) {
    return false;
  }

  if (isHtmlContentType(contentType)) {
    const links = await extractLinks(body, baseUrl);
    return links.includes(canonical);
  }

  // Raw spelling first, then the normalized one only if it actually differs.
  const spellings = canonical === target ? [target] : [target, canonical];

  if (isJsonContentType(contentType)) {
    let document: unknown;
    try {
      document = JSON.parse(body);
    } catch {
      // A body that claims to be JSON but does not parse cannot exact-match.
      return false;
    }
    return spellings.some((candidate) =>
      jsonHasTargetValue(document, candidate),
    );
  }

  return spellings.some((candidate) => textHasUrlToken(body, candidate));
}
|
|
190
|
+
|
|
191
|
+
/**
 * Optional collaborators accepted by {@link verifySource}. Every member may be
 * omitted; the documented default then applies.
 */
export interface VerifyOptions {
  /** `fetch` implementation used for the source request (default: global `fetch`). */
  readonly fetch?: FetchLike;
  /** Receives verification outcome and failure events (default: no-op logger). */
  readonly logger?: Logger;
  /** Receives verification-outcome counters (default: no-op metrics sink). */
  readonly metrics?: Metrics;
}
|
|
200
|
+
|
|
201
|
+
/**
 * Emit a verification outcome to both the logger and the metrics sink, then
 * hand the result back unchanged so callers can `return` it directly.
 *
 * Only the hosts of `source` and `target` are recorded (never full URLs),
 * alongside the result's `links` and `status`. The same field set goes to both
 * seams so "verification success rate" is chartable from `links`/`status`.
 *
 * @param logger - Destination for the `VerifyCompleted` info event.
 * @param metrics - Destination for the `VerifyCompleted` counter.
 * @param source - Source URL of the mention.
 * @param target - Target URL of the mention.
 * @param result - The outcome being recorded.
 * @returns The same `result` object that was passed in.
 */
function recordVerifyOutcome(
  logger: Logger,
  metrics: Metrics,
  source: string,
  target: string,
  result: VerifyResult,
): VerifyResult {
  const { links, status } = result;
  const payload = {
    sourceHost: hostFromUrl(source),
    targetHost: hostFromUrl(target),
    links,
    status,
  };
  logger.info(WebmentionLogEvent.VerifyCompleted, payload);
  metrics.count(WebmentionLogEvent.VerifyCompleted, payload);
  return result;
}
|
|
223
|
+
|
|
224
|
+
/** What {@link verifySource} learned from fetching and inspecting a source. */
export interface VerifyResult {
  /** `true` when the source document links to the target. */
  readonly links: boolean;
  /** HTTP status returned for the source; `0` when the fetch itself threw. */
  readonly status: number;
}
|
|
231
|
+
|
|
232
|
+
/**
 * Fetch `source` and verify that it links to `target`.
 *
 * The request goes through the SSRF-safe wrapper ({@link safeFetch}): the
 * source host — and every redirect hop — is validated against
 * private/loopback/link-local ranges, redirects are capped, and the request is
 * bounded by a timeout. Relative links resolve against the final URL reported
 * by the wrapper.
 *
 * Outcomes:
 * - the fetch throws (failed or blocked): a `VerifyFetchFailed` debug event is
 *   logged and `{ links: false, status: 0 }` is returned;
 * - the response is non-2xx, or its body is unreadable/oversized:
 *   `{ links: false }` with the response status — a removed/unreachable source
 *   no longer endorses the mention;
 * - otherwise `links` is whatever {@link sourceLinksTo} decides.
 *
 * Every outcome except a thrown fetch is also recorded as `VerifyCompleted`.
 *
 * @param source - URL of the document claimed to mention `target`.
 * @param target - URL that should be linked from the source.
 * @param options - Optional `fetch`, logger and metrics overrides.
 * @returns The verification result; this function maps fetch failures to a
 *   result rather than rethrowing them.
 */
export async function verifySource(
  source: string,
  target: string,
  options?: VerifyOptions,
): Promise<VerifyResult> {
  const logger = options?.logger ?? noopLogger;
  const metrics = options?.metrics ?? noopMetrics;
  const doFetch: FetchLike =
    options?.fetch ?? ((input, init) => fetch(input, init));

  // Record the outcome on both seams and produce the value to return.
  const conclude = (links: boolean, status: number): VerifyResult =>
    recordVerifyOutcome(logger, metrics, source, target, { links, status });

  let response: Response;
  let finalUrl: string;
  try {
    ({ response, url: finalUrl } = await safeFetch(
      doFetch,
      source,
      { method: "GET", headers: { accept: "text/html, */*" } },
      { logger, metrics },
    ));
  } catch (err) {
    // A blocked attempt is already logged as `ssrf.blocked` inside safeFetch;
    // record the verification-level failure too so the outcome isn't silent.
    logger.debug(WebmentionLogEvent.VerifyFetchFailed, {
      sourceHost: hostFromUrl(source),
      error: err instanceof Error ? err.name : "unknown",
    });
    return { links: false, status: 0 };
  }

  if (!response.ok) {
    return conclude(false, response.status);
  }

  const contentType = response.headers.get("content-type") ?? "";
  const body = await readBodyCapped(response);
  if (body === null) {
    // Unreadable or oversized body: treat as no longer endorsing the mention.
    return conclude(false, response.status);
  }

  const links = await sourceLinksTo(body, target, finalUrl, contentType);
  return conclude(links, response.status);
}
|