@dwk/microsub 0.1.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/LICENSE +15 -0
  2. package/README.md +92 -0
  3. package/dist/auth.d.ts +53 -0
  4. package/dist/auth.d.ts.map +1 -0
  5. package/dist/auth.js +102 -0
  6. package/dist/auth.js.map +1 -0
  7. package/dist/config.d.ts +102 -0
  8. package/dist/config.d.ts.map +1 -0
  9. package/dist/config.js +64 -0
  10. package/dist/config.js.map +1 -0
  11. package/dist/consumer.d.ts +40 -0
  12. package/dist/consumer.d.ts.map +1 -0
  13. package/dist/consumer.js +87 -0
  14. package/dist/consumer.js.map +1 -0
  15. package/dist/discovery.d.ts +59 -0
  16. package/dist/discovery.d.ts.map +1 -0
  17. package/dist/discovery.js +190 -0
  18. package/dist/discovery.js.map +1 -0
  19. package/dist/fetch.d.ts +28 -0
  20. package/dist/fetch.d.ts.map +1 -0
  21. package/dist/fetch.js +72 -0
  22. package/dist/fetch.js.map +1 -0
  23. package/dist/handler.d.ts +24 -0
  24. package/dist/handler.d.ts.map +1 -0
  25. package/dist/handler.js +434 -0
  26. package/dist/handler.js.map +1 -0
  27. package/dist/hfeed.d.ts +25 -0
  28. package/dist/hfeed.d.ts.map +1 -0
  29. package/dist/hfeed.js +252 -0
  30. package/dist/hfeed.js.map +1 -0
  31. package/dist/index.d.ts +39 -0
  32. package/dist/index.d.ts.map +1 -0
  33. package/dist/index.js +32 -0
  34. package/dist/index.js.map +1 -0
  35. package/dist/jf2.d.ts +69 -0
  36. package/dist/jf2.d.ts.map +1 -0
  37. package/dist/jf2.js +295 -0
  38. package/dist/jf2.js.map +1 -0
  39. package/dist/log.d.ts +44 -0
  40. package/dist/log.d.ts.map +1 -0
  41. package/dist/log.js +42 -0
  42. package/dist/log.js.map +1 -0
  43. package/dist/poll.d.ts +22 -0
  44. package/dist/poll.d.ts.map +1 -0
  45. package/dist/poll.js +39 -0
  46. package/dist/poll.js.map +1 -0
  47. package/dist/queue.d.ts +25 -0
  48. package/dist/queue.d.ts.map +1 -0
  49. package/dist/queue.js +13 -0
  50. package/dist/queue.js.map +1 -0
  51. package/dist/replay.d.ts +34 -0
  52. package/dist/replay.d.ts.map +1 -0
  53. package/dist/replay.js +49 -0
  54. package/dist/replay.js.map +1 -0
  55. package/dist/safe-fetch.d.ts +86 -0
  56. package/dist/safe-fetch.d.ts.map +1 -0
  57. package/dist/safe-fetch.js +311 -0
  58. package/dist/safe-fetch.js.map +1 -0
  59. package/dist/store.d.ts +131 -0
  60. package/dist/store.d.ts.map +1 -0
  61. package/dist/store.js +393 -0
  62. package/dist/store.js.map +1 -0
  63. package/dist/xml.d.ts +51 -0
  64. package/dist/xml.d.ts.map +1 -0
  65. package/dist/xml.js +196 -0
  66. package/dist/xml.js.map +1 -0
  67. package/package.json +49 -0
  68. package/src/auth.ts +184 -0
  69. package/src/config.ts +156 -0
  70. package/src/consumer.ts +140 -0
  71. package/src/discovery.ts +270 -0
  72. package/src/fetch.ts +82 -0
  73. package/src/handler.ts +594 -0
  74. package/src/hfeed.ts +287 -0
  75. package/src/index.ts +86 -0
  76. package/src/jf2.ts +394 -0
  77. package/src/log.ts +46 -0
  78. package/src/poll.ts +72 -0
  79. package/src/queue.ts +26 -0
  80. package/src/replay.ts +68 -0
  81. package/src/safe-fetch.ts +346 -0
  82. package/src/store.ts +644 -0
  83. package/src/xml.ts +229 -0
@@ -0,0 +1,270 @@
1
+ /**
2
+ * `@dwk/microsub` — feed discovery and fetching.
3
+ *
4
+ * `follow` and `preview` take a URL the user typed; the server must work out
5
+ * what to actually poll. {@link discoverFeed} fetches that URL (through the
6
+ * SSRF-safe wrapper), and:
7
+ *
8
+ * - if it is already a syndication feed (Atom / RSS / JSON Feed), parses it;
9
+ * - if it is HTML, looks for a `<link rel="alternate">` feed and follows the
10
+ * first one; failing that, parses the page's own `h-feed` microformats.
11
+ *
12
+ * {@link fetchFeed} re-fetches an already-resolved feed URL on each poll,
13
+ * sending the cached `ETag` / `Last-Modified` so an unchanged feed returns `304`
14
+ * and is skipped. Every outbound request goes through {@link safeFetch}, and the
15
+ * body is read with a hard size cap.
16
+ *
17
+ * @packageDocumentation
18
+ */
19
+
20
+ import { noopLogger, noopMetrics, type Logger, type Metrics } from "@dwk/log";
21
+
22
+ import { readTextCapped, type FetchLike } from "./fetch";
23
+ import { parseHFeed } from "./hfeed";
24
+ import { parseFeed, type Jf2Entry } from "./jf2";
25
+ import { safeFetch } from "./safe-fetch";
26
+
27
/**
 * Optional collaborators accepted by {@link discoverFeed} and
 * {@link fetchFeed}. Any member left unset is replaced with a default by
 * `resolveDeps`.
 */
export interface DiscoveryOptions {
  /** HTTP implementation to call; the global `fetch` is used when omitted. */
  readonly fetch?: FetchLike;
  /** Logger handed on to `safeFetch`; `noopLogger` is used when omitted. */
  readonly logger?: Logger;
  /** Metrics sink handed on to `safeFetch`; `noopMetrics` is used when omitted. */
  readonly metrics?: Metrics;
}
33
+
34
/**
 * Outcome of a successful discovery: where the poller should look from now on,
 * and what the discovery fetch itself yielded.
 */
export interface DiscoveredFeed {
  /** URL to re-fetch on each poll — a syndication feed, or an HTML `h-feed` page. */
  readonly feedUrl: string;
  /** Entries parsed while discovering the feed. */
  readonly entries: readonly Jf2Entry[];
}
41
+
42
/** Result of one completed fetch of an already-resolved feed URL. */
export interface FetchedFeed {
  /** Entries parsed from the body; empty when not modified or unparseable. */
  readonly entries: readonly Jf2Entry[];
  /** `true` when the server replied `304 Not Modified`. */
  readonly notModified: boolean;
  /** `ETag` validator reported for this fetch, or `null` when there is none. */
  readonly etag: string | null;
  /** `Last-Modified` validator reported for this fetch, or `null` when there is none. */
  readonly lastModified: string | null;
}
51
+
52
/**
 * `Accept` header value used for discovery and feed fetches. Syndication
 * formats carry no q-value; HTML follows at `q=0.9` and anything else at
 * `q=0.8`.
 */
const FEED_ACCEPT = [
  "application/atom+xml",
  "application/rss+xml",
  "application/feed+json",
  "application/json",
  "text/xml",
  "application/xml",
  "text/html;q=0.9",
  "*/*;q=0.8",
].join(", ");
55
+
56
/**
 * Whether a media-type string (a `Content-Type` header or a `<link type>`)
 * names a syndication format we recognise. Parameters after `;`, surrounding
 * whitespace, and letter case are ignored.
 */
function isFeedType(contentType: string): boolean {
  const [mediaType = ""] = contentType.split(";");
  switch (mediaType.trim().toLowerCase()) {
    case "application/atom+xml":
    case "application/rss+xml":
    case "application/feed+json":
    case "application/json":
    case "text/xml":
    case "application/xml":
    case "application/rdf+xml":
      return true;
    default:
      return false;
  }
}
69
+
70
/**
 * Whether a `Content-Type` value names an HTML or XHTML document. Parameters
 * after `;`, surrounding whitespace, and letter case are ignored.
 */
function isHtmlType(contentType: string): boolean {
  const [mediaType = ""] = contentType.split(";");
  const normalized = mediaType.trim().toLowerCase();
  return ["text/html", "application/xhtml+xml"].includes(normalized);
}
74
+
75
/** One `<link>` element harvested from an HTML document by `findFeedLinks`. */
interface FeedLink {
  /** Lower-cased, whitespace-separated tokens of the `rel` attribute (possibly empty). */
  readonly rels: string[];
  /** Lower-cased `type` attribute, or `""` when the attribute is absent. */
  readonly type: string;
  /** The `href` attribute resolved to an absolute URL against the page URL. */
  readonly href: string;
}
81
+
82
/**
 * Gather every `<link>` element of an HTML document as a {@link FeedLink}.
 *
 * Links without an `href` (missing or empty), or whose `href` cannot be
 * resolved against `baseUrl`, are left out. The result is in document order.
 *
 * @param html - The HTML source to scan.
 * @param baseUrl - URL the relative `href`s are resolved against.
 */
async function findFeedLinks(
  html: string,
  baseUrl: string,
): Promise<FeedLink[]> {
  const collected: FeedLink[] = [];

  // The rewritten output is discarded; reading it to the end is what runs the
  // element handler over the whole document.
  await new HTMLRewriter()
    .on("link", {
      element(node) {
        const rawHref = node.getAttribute("href");
        if (!rawHref) return;

        let absolute: string;
        try {
          absolute = new URL(rawHref, baseUrl).toString();
        } catch {
          // Unresolvable href: skip this link.
          return;
        }

        const relAttr = node.getAttribute("rel") ?? "";
        const typeAttr = node.getAttribute("type") ?? "";
        collected.push({
          rels: relAttr
            .toLowerCase()
            .split(/\s+/)
            .filter((token) => token !== ""),
          type: typeAttr.toLowerCase(),
          href: absolute,
        });
      },
    })
    .transform(new Response(html))
    .text();

  return collected;
}
109
+
110
/**
 * Pick the first `<link>` (in document order) that advertises a feed.
 *
 * A link qualifies when:
 * - its `rel` tokens include `alternate`, or it has no `rel` tokens at all
 *   (links that declare some other `rel` are skipped); and
 * - its `type` names Atom, RSS, or JSON Feed (`application/json` included).
 *
 * The `type` is reduced to its media-type essence before comparison, so a
 * value with parameters or stray whitespace such as
 * `application/rss+xml; charset=utf-8` is still recognised.
 *
 * @param links - Candidate links, in document order.
 * @returns The first qualifying link, or `null` when none qualifies.
 */
function pickFeedLink(links: readonly FeedLink[]): FeedLink | null {
  for (const link of links) {
    if (link.rels.length > 0 && !link.rels.includes("alternate")) continue;

    // Drop any `;param=value` suffix and normalise case/whitespace.
    const essence = link.type.split(";")[0]?.trim().toLowerCase() ?? "";
    if (
      essence === "application/atom+xml" ||
      essence === "application/rss+xml" ||
      essence === "application/feed+json" ||
      essence === "application/json"
    ) {
      return link;
    }
  }
  return null;
}
125
+
126
/**
 * Turn a response body into JF2 entries. HTML / XHTML bodies go through the
 * `h-feed` parser; every other content type is handed to the syndication-feed
 * parser together with that content type.
 *
 * @param body - The decoded response text.
 * @param contentType - The response's `Content-Type` value (may be empty).
 * @param url - The URL the body was fetched from, passed on to the parser.
 */
async function parseBody(
  body: string,
  contentType: string,
  url: string,
): Promise<Jf2Entry[]> {
  const parsed = isHtmlType(contentType)
    ? parseHFeed(body, url)
    : parseFeed(body, contentType, url);
  return parsed;
}
137
+
138
/**
 * Fill in defaults for the optional collaborators: the global `fetch`,
 * `noopLogger`, and `noopMetrics` stand in for whatever the caller left unset.
 */
function resolveDeps(options?: DiscoveryOptions): {
  doFetch: FetchLike;
  logger: Logger;
  metrics: Metrics;
} {
  // Wrapped in a closure so the global is looked up at call time.
  const viaGlobal: FetchLike = (input, init) => fetch(input, init);

  const doFetch = options?.fetch ?? viaGlobal;
  const logger = options?.logger ?? noopLogger;
  const metrics = options?.metrics ?? noopMetrics;
  return { doFetch, logger, metrics };
}
149
+
150
/**
 * Discover the feed behind a user-supplied URL.
 *
 * The target is fetched through {@link safeFetch}, then:
 * 1. if the response is a syndication feed (by `Content-Type`, or by sniffing
 *    the body when the type is not HTML), it is parsed directly;
 * 2. otherwise, if it looks like HTML, the first alternate feed `<link>` is
 *    fetched via {@link fetchFeed}; if there is no such link or that fetch
 *    fails, the page's own `h-feed` entries are used when there are any.
 *
 * @param target - The URL the user typed.
 * @param options - Optional injected `fetch`, logger, and metrics.
 * @returns The URL to poll plus the entries from this first fetch, or `null`
 *   when the target is unreachable, blocked, answers with a non-OK status, has
 *   an oversized/unreadable body, or has no feed and no `h-entry` content.
 */
export async function discoverFeed(
  target: string,
  options?: DiscoveryOptions,
): Promise<DiscoveredFeed | null> {
  const { doFetch, logger, metrics } = resolveDeps(options);

  let response: Response;
  let finalUrl: string;
  try {
    const result = await safeFetch(
      doFetch,
      target,
      { method: "GET", headers: { accept: FEED_ACCEPT } },
      { logger, metrics },
    );
    response = result.response;
    finalUrl = result.url;
  } catch {
    // Any fetch failure is reported to the caller as "no feed".
    return null;
  }

  if (!response.ok) {
    // Release the unread body rather than leaving the stream dangling, the
    // same way fetchFeed treats its non-OK responses.
    await response.body?.cancel().catch(() => undefined);
    return null;
  }
  const contentType = response.headers.get("content-type") ?? "";
  const body = await readTextCapped(response);
  if (body === null) return null;

  // A syndication feed: parse it directly.
  if (
    isFeedType(contentType) ||
    (!isHtmlType(contentType) && looksLikeFeedBody(body))
  ) {
    return {
      feedUrl: finalUrl,
      entries: parseFeed(body, contentType, finalUrl),
    };
  }

  // HTML: prefer a declared alternate feed, else fall back to h-feed.
  if (isHtmlType(contentType) || body.trimStart().startsWith("<")) {
    const links = await findFeedLinks(body, finalUrl);
    const feedLink = pickFeedLink(links);
    if (feedLink !== null) {
      const fetched = await fetchFeed(feedLink.href, options);
      if (fetched !== null) {
        return { feedUrl: feedLink.href, entries: fetched.entries };
      }
    }
    // No usable alternate feed: use the page's own microformats, if any.
    const entries = await parseHFeed(body, finalUrl);
    if (entries.length > 0) {
      return { feedUrl: finalUrl, entries };
    }
  }

  return null;
}
210
+
211
/**
 * Sniff the first 512 characters of a body (after leading whitespace, compared
 * case-insensitively) for a feed root: an `<rss`, `<feed`, or `<rdf:RDF` tag,
 * or a JSON object that mentions `jsonfeed.org`. Used when the content type
 * does not settle the question.
 */
function looksLikeFeedBody(body: string): boolean {
  const sniffed = body.trimStart().slice(0, 512).toLowerCase();

  const xmlRoots = ["<rss", "<feed", "<rdf:rdf"];
  if (xmlRoots.some((marker) => sniffed.includes(marker))) {
    return true;
  }
  return sniffed.startsWith("{") && sniffed.includes("jsonfeed.org");
}
221
+
222
/**
 * Fetch and parse an already-resolved feed URL, sending conditional-request
 * validators when supplied.
 *
 * @param feedUrl - The feed (or `h-feed` page) URL to fetch.
 * @param options - Optional injected `fetch`, logger, and metrics.
 * @param cache - Validators from the previous fetch; non-empty values are sent
 *   as `If-None-Match` / `If-Modified-Since`.
 * @returns On `304`, `notModified: true` with no entries; otherwise the parsed
 *   entries. `null` when the fetch fails or is blocked, the status is not OK,
 *   or the body is too large or unreadable.
 *
 * On a `304` the returned validators fall back to the ones in `cache` when the
 * response does not repeat them. Servers often omit `Last-Modified` (and
 * sometimes `ETag`) on a 304, and the resource has by definition not changed,
 * so the previous validators are still the right ones to send next time.
 */
export async function fetchFeed(
  feedUrl: string,
  options?: DiscoveryOptions,
  cache?: { etag: string | null; lastModified: string | null },
): Promise<FetchedFeed | null> {
  const { doFetch, logger, metrics } = resolveDeps(options);

  const headers: Record<string, string> = { accept: FEED_ACCEPT };
  if (cache?.etag) headers["if-none-match"] = cache.etag;
  if (cache?.lastModified) headers["if-modified-since"] = cache.lastModified;

  let response: Response;
  let finalUrl: string;
  try {
    const result = await safeFetch(
      doFetch,
      feedUrl,
      { method: "GET", headers },
      { logger, metrics },
    );
    response = result.response;
    finalUrl = result.url;
  } catch {
    // Any fetch failure is reported to the caller as `null`.
    return null;
  }

  const etag = response.headers.get("etag");
  const lastModified = response.headers.get("last-modified");

  if (response.status === 304) {
    // Nothing to read; release the body.
    await response.body?.cancel().catch(() => undefined);
    return {
      entries: [],
      notModified: true,
      // Keep the validators we sent when the 304 does not restate them.
      etag: etag ?? cache?.etag ?? null,
      lastModified: lastModified ?? cache?.lastModified ?? null,
    };
  }
  if (!response.ok) {
    await response.body?.cancel().catch(() => undefined);
    return null;
  }

  const contentType = response.headers.get("content-type") ?? "";
  const body = await readTextCapped(response);
  if (body === null) return null;
  const entries = await parseBody(body, contentType, finalUrl);
  return { entries, notModified: false, etag, lastModified };
}
package/src/fetch.ts ADDED
@@ -0,0 +1,82 @@
1
+ /**
2
+ * `@dwk/microsub` — injectable `fetch` type and a body-size-capped text reader.
3
+ *
4
+ * Discovery, polling, preview, and search all perform HTTP I/O against
5
+ * attacker-influenced URLs. They accept a {@link FetchLike} so callers can
6
+ * inject a stub in tests (no network) and so the package never reaches for a
7
+ * global it did not receive.
8
+ *
9
+ * @packageDocumentation
10
+ */
11
+
12
/**
 * The narrow slice of `fetch` this package depends on: a URL string plus
 * optional `RequestInit`, resolving to a `Response`. Callers can supply their
 * own implementation, e.g. a network-free stub in tests.
 */
export type FetchLike = (input: string, init?: RequestInit) => Promise<Response>;
17
+
18
/**
 * Default cap on a fetched feed body (4 MB). A feed is modest; a larger body is
 * almost certainly hostile or irrelevant, and buffering it would risk an OOM
 * (the Worker memory limit is 128 MB). See `spec/non-functional-requirements.md`.
 */
export const MAX_BODY_BYTES = 4 * 1024 * 1024;

/**
 * Read a response body as UTF-8 text, refusing bodies larger than `maxBytes`.
 *
 * A declared `Content-Length` over the cap is rejected up front (and the unread
 * stream is cancelled so it is not left dangling). Otherwise the stream is read
 * incrementally and cancelled the moment the cap is exceeded, so a missing or
 * lying `Content-Length` cannot force the whole body into memory.
 *
 * @param response - The response whose body should be read.
 * @param maxBytes - Maximum accepted body size in bytes; defaults to
 *   {@link MAX_BODY_BYTES}.
 * @returns The decoded text, or `null` when the body is too large or cannot be
 *   read.
 */
export async function readTextCapped(
  response: Response,
  maxBytes = MAX_BODY_BYTES,
): Promise<string | null> {
  const declared = response.headers.get("content-length");
  if (declared !== null) {
    const length = Number.parseInt(declared, 10);
    // An unparseable header yields NaN and is ignored; the streaming cap below
    // still applies.
    if (Number.isFinite(length) && length > maxBytes) {
      // Rejecting without reading must still release the body stream.
      await response.body?.cancel().catch(() => undefined);
      return null;
    }
  }

  const body = response.body;
  if (body === null) {
    // No stream exposed: fall back to text().
    try {
      const text = await response.text();
      // The cap is in bytes. UTF-8 never uses fewer bytes than there are
      // UTF-16 code units, so the length check is a cheap early exit before
      // measuring the real encoded size.
      if (text.length > maxBytes) return null;
      return new TextEncoder().encode(text).byteLength > maxBytes ? null : text;
    } catch {
      return null;
    }
  }

  const reader = body.getReader();
  const chunks: Uint8Array[] = [];
  let total = 0;
  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      if (value !== undefined) {
        total += value.byteLength;
        if (total > maxBytes) {
          // Stop pulling data as soon as the cap is crossed.
          await reader.cancel();
          return null;
        }
        chunks.push(value);
      }
    }
  } catch {
    return null;
  }

  // Stitch the chunks together and decode once, so multi-byte sequences that
  // straddle chunk boundaries decode correctly.
  const merged = new Uint8Array(total);
  let offset = 0;
  for (const chunk of chunks) {
    merged.set(chunk, offset);
    offset += chunk.byteLength;
  }
  return new TextDecoder("utf-8").decode(merged);
}