@syengup/friday-channel-next 0.1.27 → 0.1.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { decodeHtmlEntities, parseOpenGraph } from "./og-parse.js";
3
+
4
+ const BASE = "https://example.com/article/42";
5
+
6
describe("decodeHtmlEntities", () => {
  // Inputs are written as literal entity references. In the rendered diff they had been
  // pre-decoded, which broke the first string literal and left the decimal/hex cases untested.
  it("decodes named, decimal, and hex entities", () => {
    expect(decodeHtmlEntities("Tom &amp; Jerry &mdash; &quot;fun&quot;")).toBe('Tom & Jerry — "fun"');
    // U+4E2D as a decimal reference, U+6587 as a hex reference.
    expect(decodeHtmlEntities("&#20013;&#x6587;")).toBe("中文");
    expect(decodeHtmlEntities("&#39;quoted&apos;")).toBe("'quoted'");
  });

  it("leaves unknown entities untouched", () => {
    expect(decodeHtmlEntities("&unknownentity; stays")).toBe("&unknownentity; stays");
  });
});
17
+
18
+ describe("parseOpenGraph", () => {
19
+ it("extracts the standard og tags", () => {
20
+ const html = `<html><head>
21
+ <meta property="og:title" content="Hello World" />
22
+ <meta property="og:description" content="A page about things" />
23
+ <meta property="og:image" content="https://cdn.example.com/cover.jpg" />
24
+ <meta property="og:site_name" content="Example" />
25
+ </head><body></body></html>`;
26
+ expect(parseOpenGraph(html, BASE)).toEqual({
27
+ title: "Hello World",
28
+ description: "A page about things",
29
+ imageUrl: "https://cdn.example.com/cover.jpg",
30
+ siteName: "Example",
31
+ iconUrl: null,
32
+ });
33
+ });
34
+
35
+ it("handles name= variant, swapped attribute order, and single quotes", () => {
36
+ const html = `
37
+ <meta content="Swapped" property="og:title">
38
+ <meta name='og:description' content='Single quoted'>
39
+ `;
40
+ const result = parseOpenGraph(html, BASE);
41
+ expect(result.title).toBe("Swapped");
42
+ expect(result.description).toBe("Single quoted");
43
+ });
44
+
45
+ it("first occurrence wins for duplicate og tags", () => {
46
+ const html = `
47
+ <meta property="og:title" content="First">
48
+ <meta property="og:title" content="Second">
49
+ `;
50
+ expect(parseOpenGraph(html, BASE).title).toBe("First");
51
+ });
52
+
53
+ it("falls back to <title> and meta description", () => {
54
+ const html = `<html><head>
55
+ <title> Fallback Title </title>
56
+ <meta name="description" content="Fallback description">
57
+ </head></html>`;
58
+ const result = parseOpenGraph(html, BASE);
59
+ expect(result.title).toBe("Fallback Title");
60
+ expect(result.description).toBe("Fallback description");
61
+ });
62
+
63
+ it("decodes entities and collapses whitespace in text fields", () => {
64
+ const html = `<meta property="og:title" content="Q&amp;A:&#x20;What&#39;s
65
+ new">`;
66
+ expect(parseOpenGraph(html, BASE).title).toBe("Q&A: What's new");
67
+ });
68
+
69
+ it("resolves a relative og:image against the page URL", () => {
70
+ const html = `<meta property="og:image" content="/img/cover.png">`;
71
+ expect(parseOpenGraph(html, BASE).imageUrl).toBe("https://example.com/img/cover.png");
72
+ });
73
+
74
+ it("drops non-http og:image values", () => {
75
+ const html = `<meta property="og:image" content="data:image/png;base64,AAAA">`;
76
+ expect(parseOpenGraph(html, BASE).imageUrl).toBeNull();
77
+ });
78
+
79
+ it("returns nulls for a page with no usable metadata", () => {
80
+ expect(parseOpenGraph("<html><body>plain</body></html>", BASE)).toEqual({
81
+ title: null,
82
+ description: null,
83
+ imageUrl: null,
84
+ siteName: null,
85
+ iconUrl: null,
86
+ });
87
+ });
88
+
89
+ it("extracts and resolves a favicon, preferring apple-touch-icon, skipping mask-icon", () => {
90
+ const html = `<head>
91
+ <link rel="mask-icon" href="/safari.svg" color="#000">
92
+ <link rel="icon" type="image/png" href="/favicon-32.png">
93
+ <link rel="apple-touch-icon" href="https://cdn.example.com/touch.png">
94
+ </head>`;
95
+ expect(parseOpenGraph(html, BASE).iconUrl).toBe("https://cdn.example.com/touch.png");
96
+ });
97
+
98
+ it("falls back to a regular icon link and resolves relative hrefs", () => {
99
+ const html = `<link rel="shortcut icon" href="/static/fav.ico">`;
100
+ expect(parseOpenGraph(html, BASE).iconUrl).toBe("https://example.com/static/fav.ico");
101
+ });
102
+
103
+ it("returns null icon when only a mask-icon is present", () => {
104
+ expect(parseOpenGraph(`<link rel="mask-icon" href="/m.svg">`, BASE).iconUrl).toBeNull();
105
+ });
106
+
107
+ it("falls back to twitter card tags when og is absent", () => {
108
+ const html = `
109
+ <meta name="twitter:title" content="TW Title">
110
+ <meta name="twitter:description" content="TW Desc">
111
+ <meta name="twitter:image" content="https://cdn.example.com/tw.jpg">
112
+ `;
113
+ const r = parseOpenGraph(html, BASE);
114
+ expect(r.title).toBe("TW Title");
115
+ expect(r.description).toBe("TW Desc");
116
+ expect(r.imageUrl).toBe("https://cdn.example.com/tw.jpg");
117
+ });
118
+
119
+ it("falls back to JSON-LD (headline/description/image, incl. @graph and ImageObject)", () => {
120
+ const html = `<script type="application/ld+json">
121
+ {"@context":"https://schema.org","@graph":[
122
+ {"@type":"NewsArticle","headline":"LD Headline","description":"LD Desc",
123
+ "image":{"@type":"ImageObject","url":"https://cdn.example.com/ld.jpg"}}
124
+ ]}
125
+ </script>`;
126
+ const r = parseOpenGraph(html, BASE);
127
+ expect(r.title).toBe("LD Headline");
128
+ expect(r.description).toBe("LD Desc");
129
+ expect(r.imageUrl).toBe("https://cdn.example.com/ld.jpg");
130
+ });
131
+
132
+ it("prefers a body article-title over a generic <title> (QQ-style SPA shell)", () => {
133
+ const html = `<head><title>搜索资讯页</title></head>
134
+ <body><div class="article-wrapper"><div class="article-title">钉钉"两篇大作文"事件——离职副总裁万字长文</div></div></body>`;
135
+ expect(parseOpenGraph(html, BASE).title).toBe('钉钉"两篇大作文"事件——离职副总裁万字长文');
136
+ });
137
+
138
+ it("prefers an <h1> over a generic <title>", () => {
139
+ const html = `<title>Home</title><h1>The Real Headline</h1>`;
140
+ expect(parseOpenGraph(html, BASE).title).toBe("The Real Headline");
141
+ });
142
+
143
+ it("extracts a cover image from inline JSON (extensionless, escaped slashes)", () => {
144
+ const html = `<title>搜索资讯页</title>
145
+ <script>window.__INFO__={"imgUrl":"http:\\/\\/qqpublic.qpic.cn\\/qq_public_cover\\/0\\/0-2342_op"}</script>`;
146
+ expect(parseOpenGraph(html, BASE).imageUrl).toBe("http://qqpublic.qpic.cn/qq_public_cover/0/0-2342_op");
147
+ });
148
+
149
+ it("standard og tags still win over body/json fallbacks", () => {
150
+ const html = `<meta property="og:title" content="OG Wins">
151
+ <h1>Body H1</h1>
152
+ <div class="article-title">Body Title</div>`;
153
+ expect(parseOpenGraph(html, BASE).title).toBe("OG Wins");
154
+ });
155
+
156
+ it("does not throw on malformed or truncated HTML", () => {
157
+ expect(() => parseOpenGraph(`<meta property="og:title" content="Trunc`, BASE)).not.toThrow();
158
+ expect(() => parseOpenGraph("<<<>>><meta<meta>", BASE)).not.toThrow();
159
+ });
160
+
161
+ it("ignores empty content values", () => {
162
+ const html = `<meta property="og:title" content="">
163
+ <title>Real Title</title>`;
164
+ // An empty og:title content is skipped while meta tags are collected (it is never stored), so the title chain falls back to <title>.
165
+ const result = parseOpenGraph(html, BASE);
166
+ expect(result.title).toBe("Real Title");
167
+ });
168
+ });
@@ -0,0 +1,249 @@
1
+ /**
2
+ * Open Graph metadata extraction via regex — no HTML parser dependency.
3
+ *
4
+ * Good enough for the link-preview card use case: og:* meta tags are flat, attribute-ordered
5
+ * variants are handled generically, and pages where this fails simply degrade to "no card".
6
+ */
7
+
8
+ const MAX_PARSE_BYTES = 512 * 1024;
9
+
10
+ export interface OpenGraphResult {
11
+ title: string | null;
12
+ description: string | null;
13
+ imageUrl: string | null;
14
+ siteName: string | null;
15
+ /** Favicon URL parsed from `<link rel="...icon...">`, resolved absolute. */
16
+ iconUrl: string | null;
17
+ }
18
+
19
+ const META_TAG_RE = /<meta\b[^>]*>/gi;
20
+ const TITLE_TAG_RE = /<title[^>]*>([\s\S]*?)<\/title>/i;
21
+ const LINK_TAG_RE = /<link\b[^>]*>/gi;
22
+
23
/**
 * Extract one attribute value from the source text of a single tag.
 *
 * The tag is tokenized attribute by attribute, left to right, so a lookup for `content` is not
 * fooled by `data-content="..."` and a lookup for `property` is not fooled by the text
 * `property=...` appearing inside another attribute's quoted value.
 *
 * @param tag  Full tag text, e.g. `<meta property="og:title" content="Hi">`.
 * @param name Attribute name to look up (compared case-insensitively).
 * @returns The value of the first occurrence that carries a value (double-quoted, single-quoted,
 *          or unquoted); `null` when the attribute is absent or has no value.
 */
function attributeValue(tag: string, name: string): string | null {
  const wanted = name.toLowerCase();
  // group 1: attribute name; groups 2/3/4: double-quoted / single-quoted / unquoted value.
  const attrRe = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+)))?/g;
  // Begin after "<tagname" so the element name itself is never read as an attribute.
  attrRe.lastIndex = /^<\/?[^\s\/>]*/.exec(tag)?.[0].length ?? 0;
  for (let m = attrRe.exec(tag); m !== null; m = attrRe.exec(tag)) {
    if (m[1]?.toLowerCase() !== wanted) continue;
    const value = m[2] ?? m[3] ?? m[4];
    // A bare attribute (no "=value") yields nothing useful; keep looking.
    if (value !== undefined) return value;
  }
  return null;
}
30
+
31
/** Named character references understood by `decodeHtmlEntities` (keys are lower-case). */
const NAMED_ENTITIES: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  // NOTE(review): rendered as a plain space here — confirm whether U+00A0 was intended.
  nbsp: " ",
  ndash: "–",
  mdash: "—",
  hellip: "…",
  middot: "·",
  copy: "©",
  reg: "®",
  trade: "™",
  lsquo: "‘",
  rsquo: "’",
  ldquo: "“",
  rdquo: "”",
  laquo: "«",
  raquo: "»",
};

/**
 * Decode `&name;`, `&#123;` and `&#x1f;` references in a string.
 *
 * Anything that cannot be decoded is left exactly as written: unknown names, and numeric
 * references that are not a valid Unicode code point. Out-of-range values must be filtered
 * here because `String.fromCodePoint` throws a `RangeError` for them.
 *
 * @param s Text that may contain HTML character references.
 * @returns The text with every recognized reference replaced by its character.
 */
export function decodeHtmlEntities(s: string): string {
  const maxCodePoint = 0x10ffff;
  return s.replace(/&(#x?[0-9a-f]+|[a-z]+);/gi, (whole, body: string) => {
    if (body.startsWith("#")) {
      const isHex = body[1] === "x" || body[1] === "X";
      const code = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return Number.isInteger(code) && code <= maxCodePoint ? String.fromCodePoint(code) : whole;
    }
    const key = body.toLowerCase();
    // Own-property check: "&constructor;" must not resolve through Object.prototype.
    return Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, key) ? (NAMED_ENTITIES[key] ?? whole) : whole;
  });
}
66
+
67
/**
 * Normalize a raw text field: decode character references, squash every whitespace run to one
 * space, and trim. Returns `null` for null/undefined input or when nothing is left.
 */
function cleanText(raw: string | null | undefined): string | null {
  if (raw === null || raw === undefined) {
    return null;
  }
  const decoded = decodeHtmlEntities(raw);
  const normalized = decoded.split(/\s+/).join(" ").trim();
  return normalized.length > 0 ? normalized : null;
}
72
+
73
/**
 * Resolve a possibly-relative image/icon reference against the final page URL.
 *
 * @param raw     Candidate URL as found in the page (may be relative, padded, or missing).
 * @param baseUrl Absolute URL of the page the candidate came from.
 * @returns The absolute URL string when it is http(s); `null` for missing or blank input,
 *          unparseable input, or any other scheme (`data:`, `javascript:`, ...).
 */
function resolveImageUrl(raw: string | null | undefined, baseUrl: string): string | null {
  // Trim before the emptiness check: `new URL("", base)` resolves to the page URL itself, so a
  // whitespace-only value would otherwise be reported as an image.
  const candidate = raw?.trim();
  if (!candidate) return null;
  try {
    const url = new URL(candidate, baseUrl);
    if (url.protocol !== "http:" && url.protocol !== "https:") return null;
    return url.toString();
  } catch {
    // Invalid candidate or invalid base — treat as "no image".
    return null;
  }
}
84
+
85
/**
 * Extract link-preview metadata from raw HTML.
 *
 * Only the first `MAX_PARSE_BYTES` characters are scanned (the limit is compared against
 * `html.length`). Each field walks a fallback chain and ends up `null` when nothing usable is
 * found:
 * - title: og → twitter → JSON-LD → body headline → `<title>`
 * - description: og → twitter → JSON-LD → `<meta name="description">`
 * - image: og → twitter → JSON-LD → inline-JSON cover, each resolved against `baseUrl`
 *
 * @param html    Page markup.
 * @param baseUrl Final page URL, used to resolve relative image/icon references.
 */
export function parseOpenGraph(html: string, baseUrl: string): OpenGraphResult {
  const scanned = html.length > MAX_PARSE_BYTES ? html.slice(0, MAX_PARSE_BYTES) : html;

  // Per key, the first tag seen is kept and later duplicates are ignored.
  const og: Record<string, string> = {};
  const tw: Record<string, string> = {};
  let metaDescription: string | null = null;

  for (const [tag] of scanned.matchAll(META_TAG_RE)) {
    const rawKey = attributeValue(tag, "property") ?? attributeValue(tag, "name");
    const key = rawKey?.trim().toLowerCase();
    if (!key) continue;

    // Tags with a missing or blank content attribute are skipped entirely.
    const content = attributeValue(tag, "content");
    if (content == null || content.trim() === "") continue;

    if (key.startsWith("og:")) {
      const field = key.slice("og:".length);
      if (!(field in og)) og[field] = content;
      continue;
    }
    if (key.startsWith("twitter:")) {
      const field = key.slice("twitter:".length);
      if (!(field in tw)) tw[field] = content;
      continue;
    }
    if (key === "description" && metaDescription == null) {
      metaDescription = content;
    }
  }

  const ld = parseJsonLd(scanned);
  const headTitle = TITLE_TAG_RE.exec(scanned)?.[1] ?? null;

  // The body headline is consulted before <title> because SPA/news shells often ship a generic
  // <title> while the real headline is rendered in the body.
  const title =
    cleanText(og["title"]) ??
    cleanText(tw["title"]) ??
    cleanText(ld.title) ??
    cleanText(parseBodyTitle(scanned)) ??
    cleanText(headTitle);

  const description =
    cleanText(og["description"]) ??
    cleanText(tw["description"]) ??
    cleanText(ld.description) ??
    cleanText(metaDescription);

  const imageUrl =
    resolveImageUrl(og["image"] ?? null, baseUrl) ??
    resolveImageUrl(tw["image"] ?? null, baseUrl) ??
    resolveImageUrl(ld.image, baseUrl) ??
    resolveImageUrl(parseBodyCoverImage(scanned), baseUrl);

  const siteName = cleanText(og["site_name"] ?? tw["site"] ?? null);
  const iconUrl = parseFaviconUrl(scanned, baseUrl);

  return { title, description, imageUrl, siteName, iconUrl };
}
142
+
143
const JSON_LD_RE = /<script[^>]*type\s*=\s*["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

/**
 * Read title/description/image from `application/ld+json` script blocks.
 *
 * Blocks are visited in document order, and blocks that are not valid JSON are skipped. Inside
 * a block the candidates are the top-level array items, the `@graph` items, or the single
 * object. The first candidate that yields at least one of the three fields is returned; fields
 * it lacks are `null`.
 */
function parseJsonLd(html: string): { title: string | null; description: string | null; image: string | null } {
  for (const [, payload] of html.matchAll(JSON_LD_RE)) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(payload.trim());
    } catch {
      continue;
    }

    let candidates: unknown[];
    if (Array.isArray(parsed)) {
      candidates = parsed;
    } else if (isRecord(parsed) && Array.isArray(parsed["@graph"])) {
      candidates = parsed["@graph"] as unknown[];
    } else {
      candidates = [parsed];
    }

    for (const candidate of candidates) {
      if (!isRecord(candidate)) continue;
      const title = asString(candidate.headline) ?? asString(candidate.name);
      const description = asString(candidate.description);
      const image = firstImage(candidate.image) ?? asString(candidate.thumbnailUrl);
      if (!title && !description && !image) continue;
      return { title: title ?? null, description: description ?? null, image: image ?? null };
    }
  }
  return { title: null, description: null, image: null };
}
172
+
173
/** True for any non-null object (arrays included); narrows to a string-keyed record. */
function isRecord(v: unknown): v is Record<string, unknown> {
  return typeof v === "object" && v !== null;
}

/** Returns `v` unchanged when it is a string with non-whitespace content, otherwise `null`. */
function asString(v: unknown): string | null {
  return typeof v === "string" && v.trim() ? v : null;
}

/**
 * Pick the first usable image URL from a JSON-LD `image` value, which may be a string, an
 * array (searched depth-first, in order), or an ImageObject-like `{ url }`.
 *
 * Empty and whitespace-only strings count as "no image". That lets a caller's
 * `?? thumbnailUrl` fallback still apply, and stops a blank array entry from shadowing a
 * later real URL.
 */
function firstImage(v: unknown): string | null {
  if (typeof v === "string") return asString(v);
  if (Array.isArray(v)) {
    for (const item of v) {
      const found = firstImage(item);
      if (found) return found;
    }
    return null;
  }
  if (isRecord(v)) return asString(v.url);
  return null;
}
194
+
195
// Whitelist of class names commonly used for a server-rendered article headline. A whitelist
// (rather than "any class containing title") avoids matching e.g. a sidebar "related-titles" block.
const BODY_TITLE_CLASS_RE =
  /class\s*=\s*["'][^"']*\b(?:article-title|post-title|entry-title|news-title|content-title|headline|title-text)\b[^"']*["'][^>]*>\s*([^<]{4,200}?)\s*</i;
const H1_RE = /<h1\b[^>]*>\s*([\s\S]{4,200}?)\s*<\/h1>/i;

/**
 * Headline fallback taken from the page body.
 *
 * Tries the first `<h1>` that H1_RE matches (its inner markup is stripped, and the result must
 * be at least 4 characters), then the text captured after an element with a whitelisted
 * headline class. Returns `null` when neither applies.
 */
function parseBodyTitle(html: string): string | null {
  const h1Inner = H1_RE.exec(html)?.[1];
  if (h1Inner) {
    const heading = stripTags(h1Inner).trim();
    if (heading.length >= 4) {
      return heading;
    }
  }
  const classMatch = BODY_TITLE_CLASS_RE.exec(html);
  return classMatch?.[1] ?? null;
}
210
+
211
// Cover image embedded in inline JSON (e.g. QQ's `"imgUrl":"http:\/\/...cover..."`). The URL may
// have no file extension, so a match is not guaranteed to be an image; callers are expected to
// validate the downloaded bytes.
const JSON_COVER_RE =
  /"(?:imgUrl|imageUrl|coverUrl|coverImage|cover|ogImage|thumbnail|picUrl)"\s*:\s*"(https?:(?:\\?\/){2}[^"]+?)"/i;

/**
 * Cover image taken from inline JSON, for pages with no og/twitter/JSON-LD image.
 *
 * The captured text is the body of a JSON string literal, so it is decoded as one. This handles
 * every JSON escape (`\/`, `\u0026`, ...), not only escaped slashes. If it is not a valid JSON
 * string body, only `\/` is unescaped, as before.
 *
 * @returns The decoded URL text of the first match, or `null` when nothing matches.
 */
function parseBodyCoverImage(html: string): string | null {
  const raw = html.match(JSON_COVER_RE)?.[1];
  if (!raw) return null;
  try {
    // `raw` cannot contain a double quote (the regex excludes it), so wrapping it is safe.
    const decoded: unknown = JSON.parse(`"${raw}"`);
    if (typeof decoded === "string") return decoded;
  } catch {
    // Invalid escape sequence or raw control character — fall back to the minimal unescape.
  }
  return raw.replace(/\\\//g, "/"); // unescape JSON `\/`
}
222
+
223
/** Replace every `<...>` run with a space, then collapse whitespace runs to a single space. */
function stripTags(s: string): string {
  const withoutMarkup = s.split(/<[^>]*>/).join(" ");
  return withoutMarkup.split(/\s+/).join(" ");
}
226
+
227
/**
 * Choose a favicon from the page's `<link>` tags.
 *
 * A tag qualifies when its `rel` contains "icon" but not "mask-icon", and its `href` resolves to
 * an absolute http(s) URL. The first qualifying `apple-touch-icon` wins; otherwise the first
 * other qualifying icon is returned. `null` when none qualifies.
 */
export function parseFaviconUrl(html: string, baseUrl: string): string | null {
  let touchIcon: string | null = null;
  let plainIcon: string | null = null;

  for (const [tag] of html.matchAll(LINK_TAG_RE)) {
    const rel = attributeValue(tag, "rel")?.trim().toLowerCase();
    if (!rel) continue;
    if (!rel.includes("icon")) continue;
    if (rel.includes("mask-icon")) continue;

    const href = attributeValue(tag, "href");
    if (!href) continue;

    const absolute = resolveImageUrl(href, baseUrl);
    if (!absolute) continue;

    if (rel.includes("apple-touch-icon")) {
      if (touchIcon === null) touchIcon = absolute;
    } else if (plainIcon === null) {
      plainIcon = absolute;
    }
  }

  return touchIcon ?? plainIcon;
}
@@ -0,0 +1,247 @@
1
+ /**
2
+ * Link-preview orchestration: fetch page → parse Open Graph → re-host the cover image through
3
+ * the gateway's stored files → cache.
4
+ *
5
+ * The cover image is downloaded server-side and served from /friday-next/files/ so the app only
6
+ * ever talks to the trusted gateway host (same rationale as `downloadRemoteMedia` for outbound
7
+ * media). Failures degrade to "no card" on the app side, so every error path returns a typed
8
+ * error instead of throwing.
9
+ */
10
+
11
+ import { createFridayNextLogger } from "../logging.js";
12
+ import { storeFile } from "../http/handlers/files.js";
13
+ import { parseOpenGraph } from "./og-parse.js";
14
+ import { BlockedUrlError, fetchPublicUrl, parseHttpUrl } from "./ssrf-guard.js";
15
+
16
+ const HTML_MAX_BYTES = 2 * 1024 * 1024;
17
+ const HTML_TIMEOUT_MS = 10_000;
18
+ const IMAGE_MAX_BYTES = 8 * 1024 * 1024;
19
+ const IMAGE_TIMEOUT_MS = 10_000;
20
+
21
+ const SUCCESS_TTL_MS = 24 * 60 * 60 * 1000;
22
+ const FAILURE_TTL_MS = 10 * 60 * 1000;
23
+ const MAX_CACHE_ENTRIES = 1000;
24
+
25
+ const logger = createFridayNextLogger("link-preview");
26
+
27
+ export interface LinkPreviewPayload {
28
+ url: string;
29
+ finalUrl: string;
30
+ siteName: string | null;
31
+ title: string;
32
+ description: string | null;
33
+ /** Gateway-relative cover URL ("/friday-next/files/{token}") or null. */
34
+ imageUrl: string | null;
35
+ /** Gateway-relative favicon URL ("/friday-next/files/{token}") or null. */
36
+ iconUrl: string | null;
37
+ fetchedAt: number;
38
+ }
39
+
40
+ export type LinkPreviewError = "invalid_url" | "blocked_url" | "fetch_failed" | "no_metadata";
41
+
42
+ export type LinkPreviewResult =
43
+ | { ok: true; preview: LinkPreviewPayload }
44
+ | { ok: false; error: LinkPreviewError };
45
+
46
+ interface CacheEntry {
47
+ result: LinkPreviewResult;
48
+ cachedAt: number;
49
+ }
50
+
51
+ const cache = new Map<string, CacheEntry>();
52
+ const inFlight = new Map<string, Promise<LinkPreviewResult>>();
53
+
54
/** Test hook: empties the result cache and the in-flight request table. */
export function resetLinkPreviewCacheForTest(): void {
  for (const store of [cache, inFlight]) {
    store.clear();
  }
}
58
+
59
/**
 * Return the link preview for a URL, using the cache and de-duplicating concurrent requests.
 *
 * The normalized URL string is the cache key. A cached result is served while it is younger
 * than its TTL (successes and failures have separate TTLs), and an expired entry is dropped.
 * Concurrent calls for the same key share one in-flight build, whose result is cached when it
 * settles successfully.
 *
 * @param rawUrl URL as received from the caller.
 * @returns `{ ok: false, error: "invalid_url" }` when `parseHttpUrl` rejects the input,
 *          otherwise the cached or freshly built result.
 */
export async function getLinkPreview(rawUrl: string): Promise<LinkPreviewResult> {
  const parsed = parseHttpUrl(rawUrl);
  if (!parsed) return { ok: false, error: "invalid_url" };
  const key = parsed.toString();

  const hit = cache.get(key);
  if (hit !== undefined) {
    const maxAge = hit.result.ok ? SUCCESS_TTL_MS : FAILURE_TTL_MS;
    const age = Date.now() - hit.cachedAt;
    if (age < maxAge) return hit.result;
    cache.delete(key);
  }

  const existing = inFlight.get(key);
  if (existing !== undefined) return existing;

  const task = (async (): Promise<LinkPreviewResult> => {
    try {
      const result = await buildPreview(key);
      writeCache(key, result);
      return result;
    } finally {
      // Always release the slot, whether the build resolved or rejected.
      inFlight.delete(key);
    }
  })();
  inFlight.set(key, task);
  return task;
}
85
+
86
/**
 * Store a result under `key`, stamped with the current time. When the cache already holds
 * `MAX_CACHE_ENTRIES` entries, the entry with the smallest `cachedAt` is evicted first.
 */
function writeCache(key: string, result: LinkPreviewResult): void {
  const isFull = cache.size >= MAX_CACHE_ENTRIES;
  if (isFull) {
    let evictKey: string | null = null;
    let minStamp = Number.POSITIVE_INFINITY;
    for (const [candidate, { cachedAt }] of cache.entries()) {
      if (!(cachedAt < minStamp)) continue;
      minStamp = cachedAt;
      evictKey = candidate;
    }
    if (evictKey) {
      cache.delete(evictKey);
    }
  }
  const entry: CacheEntry = { result, cachedAt: Date.now() };
  cache.set(key, entry);
}
100
+
101
/**
 * Build a preview for one page: fetch the HTML, parse metadata, re-host favicon and cover.
 *
 * Outcomes:
 * - The fetch is rejected by the SSRF guard → `blocked_url`.
 * - Any other fetch failure → a minimal card (hostname as title) if a favicon could be
 *   re-hosted, else `fetch_failed`.
 * - The page was fetched but no title and no hostname are available → `no_metadata`.
 * - Otherwise → a card; the title falls back to the hostname when the page has none.
 *
 * A throw from the metadata parser is logged and treated as "no metadata" rather than
 * propagated, so a hostile page cannot turn into a rejected promise.
 *
 * @param pageUrl Normalized page URL (also reported back as `preview.url`).
 */
async function buildPreview(pageUrl: string): Promise<LinkPreviewResult> {
  let page;
  try {
    page = await fetchPublicUrl(pageUrl, {
      maxBytes: HTML_MAX_BYTES,
      timeoutMs: HTML_TIMEOUT_MS,
      accept: "text/html,application/xhtml+xml",
      requireContentTypePrefixes: ["text/html", "application/xhtml+xml"],
    });
  } catch (err) {
    if (err instanceof BlockedUrlError) {
      logger.warn(`link-preview blocked: ${pageUrl} (${err.reason})`);
      return { ok: false, error: "blocked_url" };
    }
    page = null; // network/timeout — fall through to a favicon-only minimal card
  }

  const finalUrl = page?.finalUrl ?? pageUrl;

  let og: ReturnType<typeof parseOpenGraph> | null = null;
  if (page) {
    try {
      og = parseOpenGraph(page.body.toString("utf8"), finalUrl);
    } catch (err) {
      logger.warn(`link-preview parse failed for ${finalUrl}: ${String(err)}`);
    }
  }

  const hostname = (() => {
    try {
      return new URL(finalUrl).hostname;
    } catch {
      return null;
    }
  })();

  // Favicon: the parsed <link rel icon> first, then the conventional /favicon.ico, which can be
  // reachable even when the page itself refuses bots.
  const iconUrl = await resolveFavicon(og?.iconUrl ?? null, finalUrl);

  // A failed page fetch only yields a (minimal) card when the favicon resolved — that shows the
  // domain is reachable. If the favicon fails too, the preview collapses to an error.
  const reachable = page !== null || iconUrl !== null;
  const title = og?.title ?? hostname;
  if (!reachable || !title) {
    return { ok: false, error: page ? "no_metadata" : "fetch_failed" };
  }

  const imageUrl = og?.imageUrl ? await rehostCoverImage(og.imageUrl) : null;

  return {
    ok: true,
    preview: {
      url: pageUrl,
      finalUrl,
      siteName: og?.siteName ?? hostname,
      title, // non-empty here: the guard above returned otherwise
      description: og?.description ?? null,
      imageUrl,
      iconUrl,
      fetchedAt: Date.now(),
    },
  };
}
156
+
157
/**
 * Re-host a favicon: try the parsed `<link rel icon>` URL first, then `<origin>/favicon.ico`.
 *
 * Candidates are de-duplicated, so a page whose declared icon already is `/favicon.ico` is not
 * fetched (and timed out) twice when that URL fails.
 *
 * @param parsedIconUrl Absolute icon URL found in the page, or `null`.
 * @param finalUrl      Final page URL, used to derive the conventional fallback.
 * @returns The re-hosted URL of the first candidate that succeeds, or `null` if none does.
 */
async function resolveFavicon(parsedIconUrl: string | null, finalUrl: string): Promise<string | null> {
  const candidates = new Set<string>();
  if (parsedIconUrl) candidates.add(parsedIconUrl);
  try {
    candidates.add(new URL("/favicon.ico", finalUrl).toString());
  } catch {
    // finalUrl unparseable — skip the conventional fallback
  }
  // Sequential on purpose: the fallback is only fetched when the preferred icon failed.
  for (const candidate of candidates) {
    const rehosted = await rehostIconImage(candidate);
    if (rehosted) return rehosted;
  }
  return null;
}
172
+
173
/**
 * Download a favicon through `fetchPublicUrl` and re-publish it via stored files.
 *
 * @param iconUrl Absolute URL of the icon to download.
 * @returns A gateway-relative `/friday-next/files/{token}` URL. `null` when the download fails,
 *          the bytes are not a recognized image type, or storing fails (the store failure is
 *          logged, matching the cover-image path).
 */
async function rehostIconImage(iconUrl: string): Promise<string | null> {
  const iconMaxBytes = 1024 * 1024;
  let image;
  try {
    image = await fetchPublicUrl(iconUrl, {
      maxBytes: iconMaxBytes,
      timeoutMs: IMAGE_TIMEOUT_MS,
      accept: "image/*",
      requireContentTypePrefixes: ["image/"],
    });
  } catch {
    return null; // unreachable or blocked icon just means no icon
  }
  if (!image) return null;
  const sniffed = sniffImageType(image.body);
  if (!sniffed) return null;
  try {
    const stored = storeFile(image.body, `link-preview-icon.${sniffed.ext}`, sniffed.mime);
    return `/friday-next/files/${encodeURIComponent(stored.urlToken)}`;
  } catch (err) {
    logger.warn(`link-preview icon store failed for ${iconUrl}: ${String(err)}`);
    return null;
  }
}
196
+
197
/**
 * Fetch the cover image through `fetchPublicUrl` and re-publish it via stored files.
 * Returns the gateway-relative `/friday-next/files/{token}` URL, or `null` when the download
 * fails, the bytes are not a recognized image type, or storing fails (logged).
 */
async function rehostCoverImage(imageUrl: string): Promise<string | null> {
  let downloaded;
  try {
    downloaded = await fetchPublicUrl(imageUrl, {
      maxBytes: IMAGE_MAX_BYTES,
      timeoutMs: IMAGE_TIMEOUT_MS,
      accept: "image/*",
      requireContentTypePrefixes: ["image/"],
    });
  } catch {
    // A blocked or failed og:image simply means the card has no cover.
    return null;
  }
  if (!downloaded) {
    return null;
  }

  const kind = sniffImageType(downloaded.body);
  if (!kind) {
    return null;
  }

  try {
    const { urlToken } = storeFile(downloaded.body, `link-preview-cover.${kind.ext}`, kind.mime);
    return `/friday-next/files/${encodeURIComponent(urlToken)}`;
  } catch (err) {
    logger.warn(`link-preview cover store failed for ${imageUrl}: ${String(err)}`);
    return null;
  }
}
223
+
224
/**
 * Identify an image by its leading magic bytes.
 *
 * Recognizes ICO, PNG, JPEG, GIF and WebP, checked in that order. Buffers shorter than 12 bytes
 * and anything unrecognized yield `null`.
 */
function sniffImageType(buffer: Buffer): { ext: string; mime: string } | null {
  if (buffer.length < 12) return null;

  const hasBytes = (offset: number, ...expected: number[]): boolean =>
    expected.every((byte, i) => buffer[offset + i] === byte);
  const ascii = (start: number, end: number): string => buffer.subarray(start, end).toString("latin1");

  // ICO header: 00 00 01 00.
  if (hasBytes(0, 0x00, 0x00, 0x01, 0x00)) {
    return { ext: "ico", mime: "image/x-icon" };
  }
  if (hasBytes(0, 0x89, 0x50, 0x4e, 0x47)) {
    return { ext: "png", mime: "image/png" };
  }
  if (hasBytes(0, 0xff, 0xd8, 0xff)) {
    return { ext: "jpg", mime: "image/jpeg" };
  }
  if (ascii(0, 4) === "GIF8") {
    return { ext: "gif", mime: "image/gif" };
  }
  // WebP is a RIFF container whose form type at offset 8 is "WEBP".
  if (ascii(0, 4) === "RIFF" && ascii(8, 12) === "WEBP") {
    return { ext: "webp", mime: "image/webp" };
  }
  return null;
}