metanova 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +263 -0
- package/USAGE_GUIDE.md +829 -0
- package/dist/index.cjs +3756 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +493 -0
- package/dist/index.d.ts +493 -0
- package/dist/index.js +3674 -0
- package/dist/index.js.map +1 -0
- package/examples/behance.mjs +23 -0
- package/examples/commonjs.cjs +12 -0
- package/examples/custom-adapter.mjs +41 -0
- package/examples/custom-plugin.mjs +26 -0
- package/examples/diagnostics.mjs +17 -0
- package/examples/live-fetch.mjs +21 -0
- package/examples/parse-html.mjs +15 -0
- package/examples/pinterest.mjs +22 -0
- package/examples/preview-card.mjs +11 -0
- package/examples/quick-start.mjs +24 -0
- package/examples/reddit.mjs +23 -0
- package/examples/social-links.mjs +28 -0
- package/examples/social-preview.mjs +21 -0
- package/examples/youtube-playlist.mjs +19 -0
- package/examples/youtube-video.mjs +22 -0
- package/examples/youtube.mjs +22 -0
- package/package.json +70 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,3674 @@
|
|
|
1
|
+
// src/utils/url.ts
|
|
2
|
+
import net from "net";
|
|
3
|
+
// Known link-shortener hostnames (lower case, no leading "www.") mapped to
// the display name of the service behind them.
var SHORT_URL_HOSTS = Object.fromEntries([
  ["pin.it", "Pinterest"],
  ["redd.it", "Reddit"],
  ["t.co", "X"],
  ["bit.ly", "Bitly"],
  ["tinyurl.com", "TinyURL"],
  ["youtu.be", "YouTube"]
]);
|
|
11
|
+
/**
 * Parses `input` as an absolute URL and checks its protocol.
 *
 * @param {string} input - Text to parse with the WHATWG `URL` constructor.
 * @param {{allowedProtocols?: string[]}} [options] - Protocols (with trailing
 *   colon) that are accepted; defaults to http: and https:.
 * @returns {URL} The parsed URL object.
 * @throws {TypeError} When `input` cannot be parsed or its protocol is not allowed.
 */
function validateUrl(input, options = {}) {
  const parsed = new URL(input);
  const permitted = options.allowedProtocols ?? ["http:", "https:"];
  if (permitted.includes(parsed.protocol)) {
    return parsed;
  }
  throw new TypeError(`Unsupported URL protocol: ${parsed.protocol}`);
}
|
|
19
|
+
/**
 * Resolves `url` (optionally relative to `baseUrl`) and returns the result
 * of passing the absolute form through `normalizeUrl`.
 *
 * @param {string} url - Absolute or relative URL text.
 * @param {string} [baseUrl] - Base used when it is truthy.
 * @returns {string} Normalized absolute URL.
 * @throws {TypeError} When the URL cannot be parsed.
 */
function resolveUrl(url, baseUrl) {
  let absolute;
  if (baseUrl) {
    absolute = new URL(url, baseUrl);
  } else {
    absolute = new URL(url);
  }
  return normalizeUrl(absolute.toString());
}
|
|
23
|
+
/**
 * Non-throwing variant of `resolveUrl`.
 *
 * @param {string|undefined|null} url - URL text; falsy values yield `undefined`.
 * @param {string} [baseUrl] - Base forwarded to `resolveUrl`.
 * @returns {string|undefined} The resolved URL, or `undefined` when `url`
 *   is falsy or `resolveUrl` throws.
 */
function tryResolveUrl(url, baseUrl) {
  if (url) {
    try {
      return resolveUrl(url, baseUrl);
    } catch {
      // Any resolution failure is reported as "no URL".
    }
  }
  return void 0;
}
|
|
33
|
+
/**
 * Validates `input` and returns it in a canonical string form: fragment
 * removed, hostname lower-cased, and an explicit default port (80 for
 * http:, 443 for https:) cleared.
 *
 * @param {string} input - URL text.
 * @param {object} [options] - Forwarded unchanged to `validateUrl`.
 * @returns {string} The normalized URL.
 */
function normalizeUrl(input, options = {}) {
  const parsed = validateUrl(input, options);
  parsed.hash = "";
  parsed.hostname = parsed.hostname.toLowerCase();
  const { protocol, port } = parsed;
  const usesDefaultPort = (protocol === "http:" && port === "80") || (protocol === "https:" && port === "443");
  if (usesDefaultPort) {
    parsed.port = "";
  }
  return parsed.toString();
}
|
|
42
|
+
/**
 * Reports whether a URL points at one of the shortener hosts listed in
 * `SHORT_URL_HOSTS`.
 *
 * @param {string|URL} input - URL text (validated with `validateUrl`) or an
 *   object exposing a `hostname` string.
 * @returns {{isShortUrl: boolean, provider: (string|undefined)}} The match
 *   result; `provider` is the table value for the host, if any.
 */
function detectShortUrl(input) {
  const url = typeof input === "string" ? validateUrl(input) : input;
  const hostname = url.hostname.toLowerCase().replace(/^www\./, "");
  // Only own entries count: a bare property read would also find inherited
  // members, so hosts named "constructor" or "__proto__" would be reported
  // as short URLs with a non-string provider.
  const isKnownHost = Object.prototype.hasOwnProperty.call(SHORT_URL_HOSTS, hostname);
  const provider = isKnownHost ? SHORT_URL_HOSTS[hostname] : void 0;
  return {
    isShortUrl: Boolean(provider),
    provider
  };
}
|
|
51
|
+
/**
 * Resolves a page's canonical URL against `baseUrl` by delegating to
 * `tryResolveUrl`.
 *
 * @param {string|undefined} canonicalUrl - Canonical URL text, possibly relative.
 * @param {string} [baseUrl] - Base for resolution.
 * @returns {string|undefined} Whatever `tryResolveUrl` returns.
 */
function resolveCanonicalUrl(canonicalUrl, baseUrl) {
  const resolvedCanonical = tryResolveUrl(canonicalUrl, baseUrl);
  return resolvedCanonical;
}
|
|
54
|
+
/**
 * Tells whether `hostname` is "localhost" or a subdomain of it, ignoring
 * letter case and a single trailing dot.
 *
 * @param {string} hostname - Host name to test.
 * @returns {boolean} True for localhost-style names.
 */
function isLocalHostname(hostname) {
  let host = hostname.toLowerCase();
  if (host.endsWith(".")) {
    host = host.slice(0, -1);
  }
  if (host === "localhost") {
    return true;
  }
  return host.endsWith(".localhost");
}
|
|
58
|
+
/**
 * Classifies an IP address string by dispatching on the version reported
 * by `net.isIP`.
 *
 * @param {string} address - Candidate IP address.
 * @returns {string} The IPv4 or IPv6 classifier's result, or "public" when
 *   `net.isIP` does not recognise the text as an IP address.
 */
function classifyIp(address) {
  switch (net.isIP(address)) {
    case 4:
      return classifyIpv4(address);
    case 6:
      return classifyIpv6(address);
    default:
      return "public";
  }
}
|
|
68
|
+
/**
 * Classifies a dotted-quad IPv4 address using its first two octets.
 *
 * @param {string} address - IPv4 address text.
 * @returns {"loopback"|"private"|"linkLocal"|"reserved"|"public"} The
 *   category of the first matching range; "public" when none matches.
 */
function classifyIpv4(address) {
  const [first, second] = address.split(".").map((octet) => Number.parseInt(octet, 10));
  const secondWithin = (low, high) => second >= low && second <= high;

  if (first === 127) {
    return "loopback";
  }

  const isPrivate =
    first === 10 ||
    (first === 172 && secondWithin(16, 31)) ||
    (first === 192 && second === 168);
  if (isPrivate) {
    return "private";
  }

  if (first === 169 && second === 254) {
    return "linkLocal";
  }

  const isReserved =
    first === 0 ||
    first >= 224 ||
    (first === 100 && secondWithin(64, 127)) ||
    (first === 192 && second === 0) ||
    (first === 198 && (second === 18 || second === 19));
  return isReserved ? "reserved" : "public";
}
|
|
85
|
+
/**
 * Expands IPv6 text into its eight 16-bit groups.
 *
 * Handles "::" compression, a trailing dotted-quad IPv4 part and a "%zone"
 * suffix. Returns `undefined` when the text is not a well-formed address.
 *
 * @param {string} address - IPv6 address text.
 * @returns {number[]|undefined} Eight integers in the range 0-0xffff.
 */
function expandIpv6Groups(address) {
  let text = address.toLowerCase();
  const zoneAt = text.indexOf("%");
  if (zoneAt !== -1) {
    text = text.slice(0, zoneAt);
  }

  // Rewrite a trailing "a.b.c.d" as the two equivalent hex groups.
  const lastColon = text.lastIndexOf(":");
  const tail = text.slice(lastColon + 1);
  if (tail.includes(".")) {
    const octets = tail.split(".");
    const isValid = octets.length === 4 && octets.every((octet) => /^\d{1,3}$/.test(octet) && Number(octet) <= 255);
    if (!isValid) {
      return void 0;
    }
    const [a, b, c, d] = octets.map(Number);
    text = `${text.slice(0, lastColon + 1)}${((a << 8) | b).toString(16)}:${((c << 8) | d).toString(16)}`;
  }

  const halves = text.split("::");
  if (halves.length > 2) {
    return void 0;
  }
  const head = halves[0] ? halves[0].split(":") : [];
  const rest = halves.length === 2 && halves[1] ? halves[1].split(":") : [];
  const missing = 8 - head.length - rest.length;
  const isCompressed = halves.length === 2;
  if (isCompressed ? missing < 1 : missing !== 0) {
    return void 0;
  }
  const parts = [...head, ...new Array(isCompressed ? missing : 0).fill("0"), ...rest];
  if (!parts.every((part) => /^[0-9a-f]{1,4}$/.test(part))) {
    return void 0;
  }
  return parts.map((part) => Number.parseInt(part, 16));
}

/**
 * Classifies an IPv6 address.
 *
 * The address is expanded to numeric groups so that prefix checks do not
 * depend on how the text is abbreviated. IPv4-mapped addresses
 * (::ffff:a.b.c.d, in dotted or hex form) are judged by the embedded IPv4
 * address: anything `classifyIpv4` does not call "public" is reported as
 * "reserved", which keeps the result the original gave for mapped 127.x
 * and 10.x and extends it to every other non-public mapped range.
 *
 * @param {string} address - IPv6 address text.
 * @returns {"loopback"|"private"|"linkLocal"|"reserved"|"public"} The
 *   category; text that cannot be expanded is reported as "reserved" so
 *   that callers fail closed.
 */
function classifyIpv6(address) {
  const groups = expandIpv6Groups(address);
  if (!groups) {
    return "reserved";
  }
  const first = groups[0];
  const last = groups[7];
  const upperSevenZero = groups.slice(0, 7).every((group) => group === 0);

  if (upperSevenZero && last === 1) {
    return "loopback"; // ::1
  }
  if ((first & 0xfe00) === 0xfc00) {
    return "private"; // fc00::/7
  }
  if ((first & 0xffc0) === 0xfe80) {
    return "linkLocal"; // fe80::/10
  }
  if (upperSevenZero && last === 0) {
    return "reserved"; // :: (unspecified)
  }

  const isIpv4Mapped = groups.slice(0, 5).every((group) => group === 0) && groups[5] === 0xffff;
  if (isIpv4Mapped) {
    const embedded = [groups[6] >> 8, groups[6] & 0xff, last >> 8, last & 0xff].join(".");
    return classifyIpv4(embedded) === "public" ? "public" : "reserved";
  }
  return "public";
}
|
|
101
|
+
|
|
102
|
+
// src/fetcher/security.ts
|
|
103
|
+
import { lookup } from "dns/promises";
|
|
104
|
+
import net2 from "net";
|
|
105
|
+
// Error type thrown when a request URL or resolved address is rejected by
// the fetcher's safety checks.
var SecurityError = class extends Error {
  name = "SecurityError";

  /**
   * @param {string} message - Human-readable reason for the rejection.
   */
  constructor(message) {
    super(message);
  }
};
|
|
111
|
+
/**
 * Checks that a URL is acceptable to request and returns its normalized form.
 *
 * Steps: normalize and protocol-check the URL, reject localhost names,
 * classify literal IP hosts, and otherwise resolve the host through DNS and
 * classify every returned address.
 *
 * @param {string} input - URL to check.
 * @param {object} [options]
 * @param {string[]} [options.allowedProtocols] - Defaults to http: and https:.
 * @param {boolean} [options.allowLocalhost] - Permit localhost and loopback.
 * @param {boolean} [options.allowPrivateNetwork] - Permit private, link-local
 *   and reserved addresses.
 * @returns {Promise<string>} The normalized URL.
 * @throws {SecurityError} When the URL, its protocol, or an address it
 *   resolves to is blocked. DNS lookup failures propagate unchanged.
 */
async function assertSafeRequestUrl(input, options = {}) {
  const protocols = options.allowedProtocols ?? ["http:", "https:"];
  let normalizedUrl;
  try {
    normalizedUrl = normalizeUrl(input, { allowedProtocols: protocols });
  } catch (error) {
    // Parse and protocol failures surface as TypeError; report them as
    // security rejections and let anything else through untouched.
    if (error instanceof TypeError) {
      throw new SecurityError(error.message);
    }
    throw error;
  }
  const url = new URL(normalizedUrl);
  // Repeats the protocol check on the normalized URL as a second guard.
  if (!protocols.includes(url.protocol)) {
    throw new SecurityError(`Unsupported URL protocol: ${url.protocol}`);
  }
  const hostname = url.hostname;
  if (isLocalHostname(hostname) && !options.allowLocalhost) {
    throw new SecurityError("Localhost URLs are blocked by default.");
  }
  // URL.hostname keeps the square brackets around IPv6 literals, which
  // net.isIP() does not accept. Without stripping them an IPv6 literal is
  // never classified and falls through to a DNS lookup of "[...]".
  const bareHost = hostname.startsWith("[") && hostname.endsWith("]") ? hostname.slice(1, -1) : hostname;
  if (net2.isIP(bareHost)) {
    assertPublicAddressAllowed(bareHost, options);
    return url.toString();
  }
  // Nothing left to check when both relaxations are enabled.
  if (options.allowPrivateNetwork && options.allowLocalhost) {
    return url.toString();
  }
  const records = await lookup(bareHost, { all: true, verbatim: false });
  for (const record of records) {
    assertPublicAddressAllowed(record.address, options);
  }
  return url.toString();
}
|
|
144
|
+
/**
 * Throws when `address` falls in a category the options do not permit.
 *
 * @param {string} address - IP address to check.
 * @param {{allowLocalhost?: boolean, allowPrivateNetwork?: boolean}} options
 * @returns {void}
 * @throws {SecurityError} For loopback addresses without `allowLocalhost`,
 *   and for any other non-public category without `allowPrivateNetwork`.
 */
function assertPublicAddressAllowed(address, options) {
  const category = classifyIp(address);
  if (category === "public") {
    return;
  }
  if (category === "loopback") {
    if (!options.allowLocalhost) {
      throw new SecurityError(`Loopback address is blocked: ${address}`);
    }
    return;
  }
  if (!options.allowPrivateNetwork) {
    throw new SecurityError(`Private or reserved network address is blocked: ${address}`);
  }
}
|
|
153
|
+
|
|
154
|
+
// src/fetcher/fetcher.ts
|
|
155
|
+
// Fetcher defaults, each used when the matching option is not supplied.
var DEFAULT_TIMEOUT_MS = 8000; // per-request abort timer, in milliseconds
var DEFAULT_MAX_REDIRECTS = 5; // redirect hops followed before giving up
var DEFAULT_MAX_BYTES = 2000000; // response body size cap, in bytes

// Default request header values that mimic a desktop Chrome navigation.
var DEFAULT_BROWSER_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36";
var DEFAULT_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7";
var DEFAULT_ACCEPT_LANGUAGE = "en-US,en;q=0.9";
var DEFAULT_ACCEPT_ENCODING = "gzip, deflate, br";
|
|
162
|
+
/**
 * Fetches a page, consulting `options.cache` first and retrying failures.
 *
 * @param {string} inputUrl - URL to fetch; normalized before use and used
 *   as the cache key.
 * @param {object} [options] - Forwarded to `requestWithRedirects`.
 * @param {{get: Function, set: Function}} [options.cache] - Optional cache.
 * @param {number} [options.retries=1] - Extra attempts after the first
 *   failure; values that are not positive numbers mean "no retries".
 * @param {number} [options.retryDelayMs=250] - Pause between attempts.
 * @param {AbortSignal} [options.signal] - Caller's abort signal.
 * @returns {Promise<object>} The cached entry (via `fromCache`) or the
 *   freshly fetched page.
 * @throws The error from the last attempt.
 */
async function fetchPage(inputUrl, options = {}) {
  const startUrl = normalizeUrl(inputUrl);
  const cached = await options.cache?.get(startUrl);
  if (cached) {
    return fromCache(startUrl, cached);
  }
  // Always make at least one attempt: a negative or NaN retry count used to
  // skip the loop entirely and end in `throw undefined`.
  const requestedRetries = Number(options.retries ?? 1);
  const retries = requestedRetries > 0 ? requestedRetries : 0;
  let lastError;
  for (let attempt = 0; attempt <= retries; attempt += 1) {
    try {
      const page = await requestWithRedirects(startUrl, options);
      await options.cache?.set(startUrl, {
        html: page.html,
        finalUrl: page.finalUrl,
        statusCode: page.statusCode,
        contentType: page.contentType,
        redirects: page.redirects
      });
      return page;
    } catch (error) {
      lastError = error;
      // A blocked URL will be blocked again, and an aborted caller no longer
      // wants the result: stop instead of sleeping and retrying.
      if (error instanceof SecurityError || options.signal?.aborted) {
        break;
      }
      if (attempt < retries) {
        await delay(options.retryDelayMs ?? 250);
      }
    }
  }
  throw lastError;
}
|
|
190
|
+
/**
 * Follows redirects from `inputUrl` without reading any page content and
 * reports where the chain ends.
 *
 * Every hop is passed through `assertSafeRequestUrl` before it is requested.
 *
 * @param {string} inputUrl - Starting URL.
 * @param {object} [options] - Fetch and safety options.
 * @param {number} [options.maxRedirects] - Defaults to `DEFAULT_MAX_REDIRECTS`.
 * @param {Function} [options.fetch] - Fetch implementation; defaults to the
 *   global `fetch`.
 * @returns {Promise<{originalUrl: string, finalUrl: string, redirects: object[],
 *   isShortUrl: boolean, shortUrlProvider: (string|undefined)}>}
 * @throws {Error} When a redirect has no Location header or the redirect
 *   limit is exceeded.
 */
async function resolveRedirects(inputUrl, options = {}) {
  const originalUrl = normalizeUrl(inputUrl);
  const maxRedirects = options.maxRedirects ?? DEFAULT_MAX_REDIRECTS;
  const fetchImpl = options.fetch ?? fetch;
  const redirects = [];
  let currentUrl = await assertSafeRequestUrl(originalUrl, options);
  for (let redirectCount = 0; redirectCount <= maxRedirects; redirectCount += 1) {
    const response = await requestOnce(fetchImpl, currentUrl, options);
    const statusCode = response.status;
    // Only the status and headers are used here. Cancel the body so the
    // underlying connection is released instead of waiting on an unread stream.
    try {
      await response.body?.cancel();
    } catch {
      // Best effort: a custom fetch may return a body that cannot be cancelled.
    }
    if (!isRedirect(statusCode)) {
      const shortUrl = detectShortUrl(originalUrl);
      return {
        originalUrl,
        finalUrl: currentUrl,
        redirects,
        isShortUrl: shortUrl.isShortUrl,
        shortUrlProvider: shortUrl.provider
      };
    }
    const location = response.headers.get("location");
    if (!location) {
      throw new Error(`Redirect response from ${currentUrl} did not include a Location header.`);
    }
    const nextUrl = await assertSafeRequestUrl(resolveUrl(location, currentUrl), options);
    redirects.push({ from: currentUrl, to: nextUrl, statusCode });
    currentUrl = nextUrl;
  }
  throw new Error(`Too many redirects. Maximum allowed redirects: ${maxRedirects}.`);
}
|
|
219
|
+
/**
 * Requests `inputUrl`, following redirects manually so that every hop is
 * passed through `assertSafeRequestUrl`, and reads the final response body.
 *
 * @param {string} inputUrl - URL to request.
 * @param {object} options - Fetch and safety options.
 * @param {number} [options.maxRedirects] - Defaults to `DEFAULT_MAX_REDIRECTS`.
 * @param {number} [options.maxBytes] - Defaults to `DEFAULT_MAX_BYTES`.
 * @param {Function} [options.fetch] - Fetch implementation; defaults to the
 *   global `fetch`.
 * @returns {Promise<object>} Page record with the URLs, short-URL info,
 *   decoded `html`, raw `bytes`, status, content type, redirects and headers.
 * @throws {Error} When a redirect has no Location header or the redirect
 *   limit is exceeded.
 */
async function requestWithRedirects(inputUrl, options) {
  const fetchImpl = options.fetch ?? fetch;
  const maxRedirects = options.maxRedirects ?? DEFAULT_MAX_REDIRECTS;
  const redirects = [];
  const shortUrl = detectShortUrl(inputUrl);
  let currentUrl = await assertSafeRequestUrl(inputUrl, options);
  for (let redirectCount = 0; redirectCount <= maxRedirects; redirectCount += 1) {
    const response = await requestOnce(fetchImpl, currentUrl, options);
    const statusCode = response.status;
    if (isRedirect(statusCode)) {
      // The redirect body is never read. Cancel it so the connection is
      // released rather than held by an unconsumed stream.
      try {
        await response.body?.cancel();
      } catch {
        // Best effort: a custom fetch may return a body that cannot be cancelled.
      }
      const location = response.headers.get("location");
      if (!location) {
        throw new Error(`Redirect response from ${currentUrl} did not include a Location header.`);
      }
      const nextUrl = await assertSafeRequestUrl(resolveUrl(location, currentUrl), options);
      redirects.push({ from: currentUrl, to: nextUrl, statusCode });
      currentUrl = nextUrl;
      continue;
    }
    const headers = headersToRecord(response.headers);
    const body = await readLimitedBody(response, options.maxBytes ?? DEFAULT_MAX_BYTES);
    return {
      url: inputUrl,
      originalUrl: inputUrl,
      finalUrl: currentUrl,
      isShortUrl: shortUrl.isShortUrl,
      shortUrlProvider: shortUrl.provider,
      html: body.text,
      bytes: body.bytes,
      statusCode,
      contentType: response.headers.get("content-type") ?? void 0,
      redirects,
      headers
    };
  }
  throw new Error(`Too many redirects. Maximum allowed redirects: ${maxRedirects}.`);
}
|
|
256
|
+
/**
 * Performs a single request with redirects disabled, aborting it on timeout
 * or when the caller's signal fires.
 *
 * @param {Function} fetchImpl - Fetch-compatible function to call.
 * @param {string} url - URL to request.
 * @param {object} options
 * @param {number} [options.timeoutMs] - Defaults to `DEFAULT_TIMEOUT_MS`.
 * @param {AbortSignal} [options.signal] - Caller's abort signal.
 * @returns {Promise<*>} Whatever `fetchImpl` resolves to.
 */
async function requestOnce(fetchImpl, url, options) {
  const controller = new AbortController();
  const timer = setTimeout(
    () => controller.abort(new Error("Request timed out.")),
    options.timeoutMs ?? DEFAULT_TIMEOUT_MS
  );
  const externalSignal = options.signal;
  const relayAbort = () => controller.abort(externalSignal?.reason);

  if (externalSignal?.aborted) {
    controller.abort(externalSignal.reason);
  } else if (externalSignal) {
    externalSignal.addEventListener("abort", relayAbort, { once: true });
  }

  const init = {
    redirect: "manual",
    signal: controller.signal,
    headers: buildBrowserLikeHeaders(options)
  };
  try {
    return await fetchImpl(url, init);
  } finally {
    // Timer and listener cover only the call to fetchImpl.
    clearTimeout(timer);
    externalSignal?.removeEventListener("abort", relayAbort);
  }
}
|
|
279
|
+
/**
 * Builds the request headers for a fetch: browser-like defaults first, then
 * the caller's `options.headers` layered on top.
 *
 * @param {object} options
 * @param {string} [options.accept]
 * @param {string} [options.acceptLanguage]
 * @param {string} [options.acceptEncoding]
 * @param {string} [options.userAgent]
 * @param {*} [options.headers] - Extra headers, converted with `headersToObject`.
 * @returns {Object<string, string>} Header name/value map.
 */
function buildBrowserLikeHeaders(options) {
  const negotiation = {
    "accept": options.accept ?? DEFAULT_ACCEPT,
    "accept-language": options.acceptLanguage ?? DEFAULT_ACCEPT_LANGUAGE,
    "accept-encoding": options.acceptEncoding ?? DEFAULT_ACCEPT_ENCODING
  };
  const fixedHints = {
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "sec-ch-ua": '"Chromium";v="125", "Google Chrome";v="125", "Not.A/Brand";v="24"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1"
  };
  const identity = {
    "user-agent": options.userAgent ?? DEFAULT_BROWSER_USER_AGENT
  };
  const callerHeaders = headersToObject(options.headers);
  return { ...negotiation, ...fixedHints, ...identity, ...callerHeaders };
}
|
|
298
|
+
/**
 * Reads a response body while enforcing a size limit.
 *
 * @param {Response} response - Fetch response (or a compatible object with
 *   `headers.get`, `body` and `text()`).
 * @param {number} maxBytes - Largest body accepted, in bytes.
 * @returns {Promise<{text: string, bytes: Uint8Array}>} Decoded text and
 *   the raw bytes.
 * @throws {Error} When the declared or actual body size exceeds `maxBytes`.
 */
async function readLimitedBody(response, maxBytes) {
  const tooLarge = () => new Error(`Response body exceeds maxBytes (${maxBytes}).`);

  const contentLength = response.headers.get("content-length");
  if (contentLength && Number.parseInt(contentLength, 10) > maxBytes) {
    // Rejected up front: release the unread body instead of leaving it open.
    try {
      await response.body?.cancel();
    } catch {
      // Best effort: the body may not be cancellable.
    }
    throw tooLarge();
  }

  if (!response.body) {
    // No stream to meter, so measure the text after the fact; this path
    // previously ignored maxBytes altogether.
    const text = await response.text();
    const bytes = new TextEncoder().encode(text);
    if (bytes.byteLength > maxBytes) {
      throw tooLarge();
    }
    return { text, bytes };
  }

  const reader = response.body.getReader();
  const chunks = [];
  let received = 0;
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        break;
      }
      if (value) {
        received += value.byteLength;
        if (received > maxBytes) {
          throw tooLarge();
        }
        chunks.push(value);
      }
    }
  } catch (error) {
    // Stop the download on overflow or read failure so the connection is
    // not kept alive by a stream nobody will finish reading.
    try {
      await reader.cancel();
    } catch {
      // Cancelling an already-errored stream rejects; nothing more to do.
    }
    throw error;
  }

  const buffer = new Uint8Array(received);
  let offset = 0;
  for (const chunk of chunks) {
    buffer.set(chunk, offset);
    offset += chunk.byteLength;
  }
  return {
    text: decodeBytes(buffer, response.headers.get("content-type")),
    bytes: buffer
  };
}
|
|
337
|
+
/**
 * Turns a cache entry back into a page record for `url`.
 *
 * @param {string} url - URL the entry was stored under.
 * @param {object} entry - Cached data; `html` and `contentType` are copied
 *   as-is, while `finalUrl`, `statusCode` and `redirects` fall back to
 *   `url`, 200 and an empty list when null or undefined.
 * @returns {object} Page record with an empty `headers` object.
 */
function fromCache(url, entry) {
  const { isShortUrl, provider } = detectShortUrl(url);
  const finalUrl = entry.finalUrl ?? url;
  const statusCode = entry.statusCode ?? 200;
  const redirects = entry.redirects ?? [];
  return {
    url,
    originalUrl: url,
    finalUrl,
    isShortUrl,
    shortUrlProvider: provider,
    html: entry.html,
    statusCode,
    contentType: entry.contentType,
    redirects,
    headers: {}
  };
}
|
|
352
|
+
/**
 * Tells whether an HTTP status code is one of the redirect statuses this
 * fetcher follows (301, 302, 303, 307, 308).
 *
 * @param {number} statusCode - HTTP status.
 * @returns {boolean}
 */
function isRedirect(statusCode) {
  switch (statusCode) {
    case 301:
    case 302:
    case 303:
    case 307:
    case 308:
      return true;
    default:
      return false;
  }
}
|
|
355
|
+
/**
 * Pulls the `charset` parameter out of a Content-Type header value.
 *
 * @param {string|null|undefined} contentType - Header value.
 * @returns {string} The charset with surrounding quotes removed, or
 *   "utf-8" when none is present.
 */
function extractCharset(contentType) {
  const found = contentType?.match(/charset=([^;]+)/i);
  if (!found) {
    return "utf-8";
  }
  const label = found[1].trim().replace(/^["']|["']$/g, "");
  return label === "" ? "utf-8" : label;
}
|
|
359
|
+
/**
 * Decodes raw bytes to text using the charset named in `contentType`,
 * retrying as UTF-8 when that attempt throws (for example on an unknown
 * charset label).
 *
 * @param {Uint8Array} buffer - Bytes to decode.
 * @param {string|null|undefined} contentType - Content-Type header value.
 * @returns {string} Decoded text.
 */
function decodeBytes(buffer, contentType) {
  const decodeAs = (label) => new TextDecoder(label).decode(buffer);
  try {
    return decodeAs(extractCharset(contentType));
  } catch {
    return decodeAs("utf-8");
  }
}
|
|
366
|
+
/**
 * Copies every entry reported by `headers.forEach` into a plain object.
 * When a name is reported more than once, the last value wins.
 *
 * @param {{forEach: Function}} headers - Headers-like collection.
 * @returns {Object<string, string>} Plain name/value map.
 */
function headersToRecord(headers) {
  const record = {};
  const copyEntry = (headerValue, headerName) => {
    record[headerName] = headerValue;
  };
  headers.forEach(copyEntry);
  return record;
}
|
|
373
|
+
/**
 * Converts any value accepted by the `Headers` constructor into a plain
 * object.
 *
 * @param {*} headers - Headers init value; falsy values give `{}`.
 * @returns {Object<string, string>} Plain name/value map keyed by the names
 *   `Headers` reports.
 */
function headersToObject(headers) {
  return headers ? Object.fromEntries(new Headers(headers)) : {};
}
|
|
379
|
+
/**
 * Returns a promise that resolves once `setTimeout` fires after `ms`
 * milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function delay(ms) {
  const wait = (done) => {
    setTimeout(done, ms);
  };
  return new Promise(wait);
}
|
|
384
|
+
|
|
385
|
+
// src/utils/html.ts
|
|
386
|
+
import { load } from "cheerio";
|
|
387
|
+
/**
 * Parses HTML text with cheerio's `load`.
 *
 * @param {string} html - Markup to parse.
 * @returns {*} The value returned by cheerio's `load`.
 */
function loadDocument(html) {
  const document = load(html);
  return document;
}
|
|
390
|
+
/**
 * Collapses every run of whitespace to a single space and trims the ends.
 *
 * @param {string|undefined|null} value - Text to clean.
 * @returns {string|undefined} The cleaned text, or `undefined` when the
 *   input is falsy or nothing remains.
 */
function normalizeWhitespace(value) {
  const collapsed = value ? value.replace(/\s+/g, " ").trim() : "";
  return collapsed === "" ? void 0 : collapsed;
}
|
|
397
|
+
/**
 * Returns the first argument that is not `undefined`, `null`, `false` or
 * the empty string.
 *
 * @param {...*} values - Candidates in priority order.
 * @returns {*} The first usable value, or `undefined` when there is none.
 */
function firstDefined(...values) {
  const isUsable = (candidate) =>
    !(candidate === void 0 || candidate === null || candidate === false || candidate === "");
  return values.find(isUsable);
}
|
|
405
|
+
/**
 * Converts a value to an integer.
 *
 * @param {*} value - Finite numbers are rounded with `Math.round`; other
 *   truthy values are stringified and parsed in base 10.
 * @returns {number|undefined} The integer, or `undefined` for falsy input
 *   or text that does not parse to a finite number.
 */
function parseInteger(value) {
  const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
  if (isFiniteNumber) {
    return Math.round(value);
  }
  if (value) {
    const candidate = Number.parseInt(String(value), 10);
    if (Number.isFinite(candidate)) {
      return candidate;
    }
  }
  return void 0;
}
|
|
415
|
+
/**
 * Converts a value to a finite number.
 *
 * @param {*} value - Finite numbers are returned unchanged; other truthy
 *   values are stringified and parsed with `Number.parseFloat`.
 * @returns {number|undefined} The number, or `undefined` for falsy input
 *   or text that does not parse to a finite number.
 */
function parseNumber(value) {
  const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
  if (isFiniteNumber) {
    return value;
  }
  if (value) {
    const candidate = Number.parseFloat(String(value));
    if (Number.isFinite(candidate)) {
      return candidate;
    }
  }
  return void 0;
}
|
|
425
|
+
/**
 * Tries each selector in order and returns the first non-empty, whitespace
 * normalized `content` attribute of its first match.
 *
 * @param {Function} $ - Document query function (as produced by `loadDocument`).
 * @param {string[]} selectors - Selectors in priority order.
 * @returns {string|undefined} The first usable content value.
 */
function readMetaContent($, selectors) {
  for (const selector of selectors) {
    const firstMatch = $(selector).first();
    const content = normalizeWhitespace(firstMatch.attr("content"));
    if (!content) {
      continue;
    }
    return content;
  }
  return void 0;
}
|
|
434
|
+
/**
 * Stores `value` under `key`, accumulating repeats: a second value turns an
 * existing string into a two-element array, and later values are appended
 * to that array. Any other existing value is overwritten.
 *
 * @param {object} map - Object to update in place.
 * @param {string} key - Property name.
 * @param {*} value - Value to store or append.
 * @returns {void}
 */
function setMapValue(map, key, value) {
  const existing = map[key];
  if (Array.isArray(existing)) {
    existing.push(value);
  } else if (typeof existing === "string") {
    map[key] = [existing, value];
  } else {
    map[key] = value;
  }
}
|
|
446
|
+
/**
 * Splits comma-separated text into whitespace-normalized, non-empty items.
 *
 * @param {string|undefined|null} value - Text to split.
 * @returns {string[]|undefined} The items, or `undefined` when the input is
 *   falsy or no item survives.
 */
function splitList(value) {
  if (!value) {
    return void 0;
  }
  const entries = [];
  for (const piece of value.split(",")) {
    const cleaned = normalizeWhitespace(piece);
    if (cleaned) {
      entries.push(cleaned);
    }
  }
  return entries.length === 0 ? void 0 : entries;
}
|
|
453
|
+
/**
 * Extracts the URL part of each candidate in a `srcset` attribute value.
 * Candidates are separated on commas, and the URL is the first
 * whitespace-delimited token of each.
 *
 * @param {string|undefined|null} value - `srcset` text.
 * @returns {string[]} URLs in source order; empty for falsy input.
 */
function parseSrcset(value) {
  const urls = [];
  if (!value) {
    return urls;
  }
  for (const candidate of value.split(",")) {
    const [url] = candidate.trim().split(/\s+/);
    if (url) {
      urls.push(url);
    }
  }
  return urls;
}
|
|
459
|
+
/**
 * Removes assets that have no `url` or repeat an earlier asset's `url`,
 * keeping first occurrences in their original order.
 *
 * @param {Array<{url?: string}>} assets - Media assets.
 * @returns {Array<{url?: string}>} A new array with the retained assets.
 */
function uniqueMediaByUrl(assets) {
  const seenUrls = /* @__PURE__ */ new Set();
  const retained = [];
  for (const asset of assets) {
    const { url } = asset;
    const isNew = Boolean(url) && !seenUrls.has(url);
    if (isNew) {
      seenUrls.add(url);
      retained.push(asset);
    }
  }
  return retained;
}
|
|
472
|
+
/**
 * Creates a fresh diagnostics record with empty lists and the current time
 * as an ISO-8601 string.
 *
 * @returns {{redirects: Array, sourcesUsed: Array, warnings: Array,
 *   trace: Array, extractedAt: string}}
 */
function createEmptyDiagnostics() {
  const extractedAt = (/* @__PURE__ */ new Date()).toISOString();
  const diagnostics = {
    redirects: [],
    sourcesUsed: [],
    warnings: [],
    trace: [],
    extractedAt
  };
  return diagnostics;
}
|
|
481
|
+
|
|
482
|
+
// src/extractors/html.ts
|
|
483
|
+
/**
 * Reads document-level metadata from HTML: title, description, keywords,
 * robots, canonical URL, manifest URL, theme colour, application name,
 * favicons, the `image_src` link and alternate links.
 *
 * URLs are returned exactly as written in the markup (whitespace
 * normalized); nothing is resolved against a base URL here.
 *
 * @param {string} html - Page markup.
 * @returns {object} The collected metadata.
 */
function extractHtmlMetadata(html) {
  const $ = loadDocument(html);
  const attribute = (node, name) => normalizeWhitespace($(node).attr(name));
  const iconRelTokens = ["icon", "shortcut", "apple-touch-icon", "mask-icon"];

  const favicons = [];
  const alternates = [];
  let canonicalUrl;
  let imageSrc;

  $("link[rel][href]").each((_, link) => {
    const rel = attribute(link, "rel")?.toLowerCase();
    const href = attribute(link, "href");
    const type = attribute(link, "type");
    if (!rel || !href) {
      return;
    }
    // `rel` is a space-separated token list, so one link can match several roles.
    const relTokens = rel.split(/\s+/);
    const hasToken = (token) => relTokens.includes(token);

    if (hasToken("canonical")) {
      canonicalUrl = href;
    }
    if (hasToken("image_src")) {
      imageSrc = { url: href, kind: "image", source: "html" };
    }
    if (hasToken("alternate")) {
      const hreflang = attribute(link, "hreflang");
      const title = attribute(link, "title");
      alternates.push({ href, type, hreflang, title });
    }
    if (iconRelTokens.some(hasToken)) {
      const sizes = attribute(link, "sizes");
      favicons.push({
        url: href,
        kind: "favicon",
        source: "favicon",
        type,
        metadata: { sizes, rel }
      });
    }
  });

  const title = normalizeWhitespace($("title").first().text()) ?? readMetaContent($, ["meta[name='title']"]);
  const description = readMetaContent($, ["meta[name='description']", "meta[itemprop='description']"]);
  const keywords = splitList(readMetaContent($, ["meta[name='keywords']"]));
  const robots = readMetaContent($, ["meta[name='robots']"]);
  const manifestUrl = normalizeWhitespace($("link[rel='manifest']").first().attr("href"));
  const themeColor = readMetaContent($, ["meta[name='theme-color']"]);
  const applicationName = readMetaContent($, ["meta[name='application-name']", "meta[name='apple-mobile-web-app-title']"]);

  return {
    title,
    description,
    keywords,
    robots,
    canonicalUrl,
    manifestUrl,
    themeColor,
    applicationName,
    favicons,
    imageSrc,
    alternates
  };
}
|
|
542
|
+
|
|
543
|
+
// src/extractors/embeddedData.ts
|
|
544
|
+
// Inline scripts longer than this many characters are skipped by the
// embedded-data extractor.
var MAX_SCRIPT_CHARS = 1500000;

// Global assignment names searched for inside inline scripts, grouped by
// the `source` label attached to whatever payload they yield.
var ASSIGNMENT_PATTERNS = [
  {
    source: "nuxt",
    names: ["window.__NUXT__", "__NUXT__"]
  },
  {
    source: "initialState",
    names: ["window.__INITIAL_STATE__", "__INITIAL_STATE__"]
  },
  {
    source: "preloadedState",
    names: ["window.__PRELOADED_STATE__", "__PRELOADED_STATE__"]
  },
  {
    source: "apollo",
    names: ["window.__APOLLO_STATE__", "__APOLLO_STATE__", "window.__APOLLO_CLIENT__"]
  },
  {
    source: "youtubeInitialData",
    names: ["ytInitialData", "window.ytInitialData"]
  },
  {
    source: "youtubePlayerResponse",
    names: ["ytInitialPlayerResponse", "window.ytInitialPlayerResponse"]
  }
];
|
|
553
|
+
/**
 * Collects structured data embedded in a page's scripts.
 *
 * Sources, in the order their results are appended: the `#__NEXT_DATA__`
 * and `#__NUXT_DATA__` elements, then for every `<script>` element its
 * JSON body (for JSON / JSON-LD script types), any assignment listed in
 * `ASSIGNMENT_PATTERNS`, and an Apollo state payload. The combined list is
 * passed through `dedupeEmbeddedItems`.
 *
 * @param {string} html - Page markup.
 * @returns {{items: object[], warnings: string[]}} Extracted items and the
 *   parse warnings gathered by `parseJsonScript`.
 */
function extractEmbeddedData(html) {
  const $ = loadDocument(html);
  const items = [];
  const warnings = [];
  const bodyOf = (node) => node.html() ?? node.text();
  const keep = (item) => {
    if (item) {
      items.push(item);
    }
  };

  keep(parseJsonScript(bodyOf($("#__NEXT_DATA__").first()), "nextData", warnings));
  keep(parseJsonScript(bodyOf($("#__NUXT_DATA__").first()), "nuxt", warnings));

  $("script").each((index, element) => {
    const script = $(element);
    const type = normalizeWhitespace(script.attr("type"))?.toLowerCase();
    const id = normalizeWhitespace(script.attr("id"));
    const source = bodyOf(script);
    if (!source || source.length > MAX_SCRIPT_CHARS) {
      return;
    }

    const isJsonScript = type?.includes("application/json") || type?.includes("application/ld+json");
    if (isJsonScript) {
      const itemSource = type?.includes("ld+json") ? "jsonScript" : "applicationJson";
      keep(parseJsonScript(source, itemSource, warnings, id));
    }

    for (const { source: patternSource, names } of ASSIGNMENT_PATTERNS) {
      for (const name of names) {
        const data = parseAssignedJson(source, name);
        if (data) {
          items.push({ source: patternSource, path: name, data });
        }
      }
    }

    const apolloPayload = findApolloPayload(source);
    if (apolloPayload) {
      items.push({ source: "apollo", path: `script[${index}]`, data: apolloPayload });
    }
  });

  return {
    items: dedupeEmbeddedItems(items),
    warnings
  };
}
|
|
604
|
+
/**
 * Parses the text of a JSON script element into an embedded-data item.
 *
 * @param {string|null|undefined} source - Raw script text.
 * @param {string} itemSource - Label stored as the item's `source`.
 * @param {string[]} warnings - Receives a message when parsing fails.
 * @param {string} [path] - Optional locator stored on the item and quoted
 *   in the warning.
 * @returns {{source: string, path: (string|undefined), data: object}|undefined}
 *   The item, or `undefined` when the text is empty, fails to parse, or
 *   parses to something other than a plain object.
 */
function parseJsonScript(source, itemSource, warnings, path) {
  const jsonText = cleanJsonSource(source);
  if (!jsonText) {
    return void 0;
  }
  let data;
  try {
    data = JSON.parse(jsonText);
  } catch (error) {
    const location = path ? ` at ${path}` : "";
    const reason = error instanceof Error ? error.message : String(error);
    warnings.push(`Could not parse ${itemSource} embedded JSON${location}: ${reason}`);
    return void 0;
  }
  return isRecord(data) ? { source: itemSource, path, data } : void 0;
}
|
|
624
|
+
/**
 * Finds `assignmentName` in script text and parses the JSON object or array
 * literal that follows the next `=` after it.
 *
 * Only the first occurrence of the name is considered.
 *
 * @param {string} source - Script text.
 * @param {string} assignmentName - Name to look for, e.g. "window.__NUXT__".
 * @returns {object|undefined} The parsed object; non-object JSON (such as
 *   an array) is wrapped as `{ value }`. `undefined` when nothing parses.
 */
function parseAssignedJson(source, assignmentName) {
  const nameAt = source.indexOf(assignmentName);
  if (nameAt === -1) {
    return void 0;
  }
  const afterNameAt = nameAt + assignmentName.length;
  const assignment = /=\s*[[{]/.exec(source.slice(afterNameAt));
  if (!assignment) {
    return void 0;
  }
  // The match ends on the opening bracket, which is where the literal starts.
  const literalAt = afterNameAt + assignment.index + assignment[0].length - 1;
  const literal = readBalancedJson(source, literalAt);
  if (!literal) {
    return void 0;
  }
  let parsed;
  try {
    parsed = JSON.parse(literal);
  } catch {
    return void 0;
  }
  return isRecord(parsed) ? parsed : { value: parsed };
}
|
|
650
|
+
/**
 * Looks for an Apollo state assignment in script text.
 *
 * @param {string} source - Script text.
 * @returns {object|undefined} The payload assigned to
 *   `window.__APOLLO_STATE__` or `__APOLLO_STATE__`, or `undefined` when
 *   the text does not mention Apollo or no payload parses.
 */
function findApolloPayload(source) {
  const mentionsApollo = /apollo|__APOLLO/i.test(source);
  if (!mentionsApollo) {
    return void 0;
  }
  const payload = parseAssignedJson(source, "window.__APOLLO_STATE__") ?? parseAssignedJson(source, "__APOLLO_STATE__");
  return payload ? payload : void 0;
}
|
|
660
|
+
/**
 * Returns the bracket-balanced substring that starts at `start`.
 *
 * Scans forward from an opening `{` or `[`, counting only that bracket type
 * and ignoring brackets inside double-quoted strings (with backslash
 * escapes), until the matching closer is reached.
 *
 * @param {string} source - Text to scan.
 * @param {number} start - Index of the opening bracket.
 * @returns {string|undefined} The balanced substring, or `undefined` when
 *   `start` is not on `{` or `[` or the text ends before it closes.
 */
function readBalancedJson(source, start) {
  const opener = source[start];
  let closer;
  if (opener === "{") {
    closer = "}";
  } else if (opener === "[") {
    closer = "]";
  } else {
    return void 0;
  }

  let depth = 0;
  let insideString = false;
  let afterBackslash = false;
  let position = start;
  while (position < source.length) {
    const char = source[position];
    if (insideString) {
      if (afterBackslash) {
        afterBackslash = false;
      } else if (char === "\\") {
        afterBackslash = true;
      } else if (char === '"') {
        insideString = false;
      }
    } else if (char === '"') {
      insideString = true;
    } else if (char === opener) {
      depth += 1;
    } else if (char === closer) {
      depth -= 1;
      if (depth === 0) {
        return source.slice(start, position + 1);
      }
    }
    position += 1;
  }
  return void 0;
}
|
|
696
|
+
/**
 * Prepares raw script text for `JSON.parse` by removing surrounding
 * whitespace and an enclosing HTML comment wrapper.
 *
 * The text is trimmed before the `<!--` / `-->` markers are stripped:
 * script bodies usually begin and end with a newline, and the anchored
 * patterns would otherwise never match, leaving the wrapper in place.
 *
 * @param {string|null|undefined} source - Raw script text.
 * @returns {string|undefined} The cleaned text, or `undefined` when the
 *   input is nullish or nothing remains.
 */
function cleanJsonSource(source) {
  const cleaned = source?.trim().replace(/^<!--/, "").replace(/-->$/, "").trim();
  return cleaned || void 0;
}
|
|
700
|
+
/**
 * Drops embedded-data items that repeat an earlier item's source, path and
 * data, keeping first occurrences in order.
 *
 * The whole serialized payload is part of the comparison key. Comparing
 * only a leading slice of it treated distinct payloads that share a long
 * common prefix as duplicates and silently discarded them.
 *
 * @param {Array<{source: string, path?: string, data: *}>} items
 * @returns {Array<{source: string, path?: string, data: *}>} A new array
 *   without repeats.
 */
function dedupeEmbeddedItems(items) {
  const seen = /* @__PURE__ */ new Set();
  const unique = [];
  for (const item of items) {
    const key = `${item.source}:${item.path ?? ""}:${JSON.stringify(item.data)}`;
    if (seen.has(key)) {
      continue;
    }
    seen.add(key);
    unique.push(item);
  }
  return unique;
}
|
|
713
|
+
/**
 * Tells whether `value` is a non-null object that is not an array.
 *
 * @param {*} value - Value to test.
 * @returns {boolean}
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
716
|
+
|
|
717
|
+
// src/extractors/jsonLd.ts
|
|
718
|
+
/**
 * Collects JSON-LD nodes from every script whose `type` contains "ld+json".
 *
 * Each script's text is whitespace-normalized, cleaned with `cleanJson`,
 * parsed, and flattened with `flattenJsonLd`. A script that fails to parse
 * adds a warning and is skipped.
 *
 * @param {string} html - Page markup.
 * @returns {{nodes: object[], warnings: string[]}}
 */
function extractJsonLd(html) {
  const $ = loadDocument(html);
  const nodes = [];
  const warnings = [];

  $("script[type*='ld+json']").each((index, element) => {
    const script = $(element);
    const source = normalizeWhitespace(script.text()) ?? normalizeWhitespace(script.html());
    if (!source) {
      return;
    }
    try {
      const parsed = JSON.parse(cleanJson(source));
      flattenJsonLd(parsed).forEach((node) => {
        nodes.push(node);
      });
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      warnings.push(`Could not parse JSON-LD script at index ${index}: ${reason}`);
    }
  });

  return { nodes, warnings };
}
|
|
737
|
+
/**
 * Removes an HTML comment opener at the very start and a comment closer at
 * the very end of `source`, then trims surrounding whitespace.
 *
 * @param {string} source - Script text.
 * @returns {string} The cleaned text.
 */
function cleanJson(source) {
  let text = source;
  if (text.startsWith("<!--")) {
    text = text.slice("<!--".length);
  }
  if (text.endsWith("-->")) {
    text = text.slice(0, -"-->".length);
  }
  return text.trim();
}
|
|
740
|
+
/**
 * Flattens parsed JSON-LD into a list of node objects.
 *
 * Arrays are flattened recursively. An object contributes itself and, when
 * it has an `@graph` array, the plain-object members of that array (which
 * are not flattened further). Anything else contributes nothing.
 *
 * @param {*} value - Parsed JSON-LD value.
 * @returns {object[]} Node objects in document order.
 */
function flattenJsonLd(value) {
  if (Array.isArray(value)) {
    return value.reduce((collected, entry) => collected.concat(flattenJsonLd(entry)), []);
  }
  if (!isRecord2(value)) {
    return [];
  }
  const graph = value["@graph"];
  if (!Array.isArray(graph)) {
    return [value];
  }
  return [value, ...graph.filter(isRecord2)];
}
|
|
754
|
+
/**
 * Tells whether `value` is a non-null object that is not an array.
 *
 * @param {*} value - Value to test.
 * @returns {boolean}
 */
function isRecord2(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
757
|
+
|
|
758
|
+
// src/extractors/media.ts
|
|
759
|
+
// Attribute names listed as alternative sources for lazily loaded images.
var LAZY_IMAGE_ATTRIBUTES = [
  "data-src",
  "data-original",
  "data-lazy-src",
  "data-image",
  "data-thumbnail"
];

// Attribute names listed as alternative sources for lazily loaded media.
var LAZY_MEDIA_ATTRIBUTES = [
  "data-src",
  "data-original",
  "data-lazy-src",
  "data-video",
  "data-media"
];
|
|
761
|
+
/**
 * Gathers image candidates from a document's markup.
 *
 * Sources are visited in a fixed order: `link[rel='image_src']`, preload and
 * prefetch links that declare an image, <img>/<picture> elements, video
 * posters, and finally markup nested inside <noscript>.
 *
 * @param {string} html - Markup handed to `loadDocument`.
 * @param {string} baseUrl - Base used when resolving candidate URLs.
 * @returns {object[]} Image assets after `uniqueMediaByUrl`.
 */
function extractImages(html, baseUrl) {
  const $ = loadDocument(html);
  const images = [];
  const add = (asset) => pushResolved(images, asset, baseUrl);

  $("link[rel='image_src'][href]").each((_, element) => {
    add({
      url: normalizeWhitespace($(element).attr("href")),
      kind: "image",
      source: "html"
    });
  });

  $("link[rel][href]").each((_, element) => {
    const link = $(element);
    const relTokens = (normalizeWhitespace(link.attr("rel"))?.toLowerCase() ?? "").split(/\s+/);
    const asValue = normalizeWhitespace(link.attr("as"))?.toLowerCase();
    const type = normalizeWhitespace(link.attr("type"));
    const isResourceHint = relTokens.includes("preload") || relTokens.includes("prefetch");
    const declaresImage = asValue === "image" || type?.startsWith("image/");
    if (!isResourceHint || !declaresImage) {
      return;
    }
    add({
      url: normalizeWhitespace(link.attr("href")),
      kind: "image",
      source: "html",
      type,
      metadata: { discoveredFrom: "link.preload" }
    });
  });

  collectDocumentImages($, images, baseUrl, "html");

  $("video[poster]").each((_, element) => {
    const video = $(element);
    add({
      url: normalizeWhitespace(video.attr("poster")),
      kind: "image",
      source: "html",
      width: parseInteger(video.attr("width")),
      height: parseInteger(video.attr("height")),
      metadata: {
        discoveredFrom: "video.poster"
      }
    });
  });

  $("noscript").each((_, element) => {
    const container = $(element);
    const fallbackHtml = container.html() || container.text();
    if (fallbackHtml) {
      // The <noscript> content is parsed as its own document and scanned again.
      collectDocumentImages(loadDocument(fallbackHtml), images, baseUrl, "fallback");
    }
  });

  return uniqueMediaByUrl(images);
}
|
|
810
|
+
/**
 * Gathers video candidates from a document's markup.
 *
 * Looks at <video> elements (their `src`, the lazy-load attributes and nested
 * <source> children), iframes whose `src` passes `isLikelyVideoEmbed`, and
 * preload/prefetch links that declare a video.
 *
 * @param {string} html - Markup handed to `loadDocument`.
 * @param {string} baseUrl - Base used when resolving candidate URLs.
 * @returns {object[]} Video assets after `uniqueMediaByUrl`.
 */
function extractVideos(html, baseUrl) {
  const $ = loadDocument(html);
  const videos = [];
  const add = (asset) => pushResolved(videos, asset, baseUrl);

  $("video").each((_, element) => {
    const video = $(element);
    const src = normalizeWhitespace(video.attr("src")) ?? firstAttribute($, element, LAZY_MEDIA_ATTRIBUTES);
    const common = {
      kind: "video",
      source: "html",
      width: parseInteger(video.attr("width")),
      height: parseInteger(video.attr("height")),
      poster: tryResolveUrl(normalizeWhitespace(video.attr("poster")), baseUrl)
    };
    add({ ...common, url: src });
    // Every lazy attribute is offered as its own candidate as well.
    LAZY_MEDIA_ATTRIBUTES.forEach((attribute) => {
      add({
        ...common,
        url: normalizeWhitespace(video.attr(attribute))
      });
    });
    video.find("source[src]").each((_2, source) => {
      const child = $(source);
      add({
        ...common,
        url: normalizeWhitespace(child.attr("src")),
        type: normalizeWhitespace(child.attr("type"))
      });
    });
  });

  $("iframe[src]").each((_, element) => {
    const frame = $(element);
    const src = normalizeWhitespace(frame.attr("src"));
    if (src && isLikelyVideoEmbed(src)) {
      add({
        url: src,
        kind: "video",
        source: "html",
        width: parseInteger(frame.attr("width")),
        height: parseInteger(frame.attr("height"))
      });
    }
  });

  $("link[rel][href]").each((_, element) => {
    const link = $(element);
    const relTokens = (normalizeWhitespace(link.attr("rel"))?.toLowerCase() ?? "").split(/\s+/);
    const asValue = normalizeWhitespace(link.attr("as"))?.toLowerCase();
    const type = normalizeWhitespace(link.attr("type"));
    const isResourceHint = relTokens.includes("preload") || relTokens.includes("prefetch");
    const declaresVideo = asValue === "video" || type?.startsWith("video/");
    if (!isResourceHint || !declaresVideo) {
      return;
    }
    add({
      url: normalizeWhitespace(link.attr("href")),
      kind: "video",
      source: "html",
      type,
      metadata: { discoveredFrom: "link.preload" }
    });
  });

  return uniqueMediaByUrl(videos);
}
|
|
867
|
+
/**
 * Gathers audio candidates from <audio> elements and their <source> children.
 *
 * @param {string} html - Markup handed to `loadDocument`.
 * @param {string} baseUrl - Base used when resolving candidate URLs.
 * @returns {object[]} Audio assets after `uniqueMediaByUrl`.
 */
function extractAudio(html, baseUrl) {
  const $ = loadDocument(html);
  const audio = [];
  const add = (asset) => pushResolved(audio, asset, baseUrl);

  $("audio").each((_, element) => {
    const player = $(element);
    add({
      url: normalizeWhitespace(player.attr("src")),
      kind: "audio",
      source: "html"
    });
    player.find("source[src]").each((_2, source) => {
      const child = $(source);
      add({
        url: normalizeWhitespace(child.attr("src")),
        kind: "audio",
        source: "html",
        type: normalizeWhitespace(child.attr("type"))
      });
    });
  });

  return uniqueMediaByUrl(audio);
}
|
|
887
|
+
/**
 * Resolves `asset.url` against `baseUrl` and appends the asset to `assets`
 * unless the URL cannot be resolved or one of the ignore checks rejects it.
 *
 * @param {object[]} assets - Destination list, mutated in place.
 * @param {object} asset - Candidate asset; its `url` may be missing.
 * @param {string} baseUrl - Base used for resolution.
 * @returns {void}
 */
function pushResolved(assets, asset, baseUrl) {
  const resolved = tryResolveUrl(asset.url, baseUrl);
  if (!resolved) {
    return;
  }
  if (shouldIgnoreMediaUrl(resolved)) {
    return;
  }
  if (shouldIgnoreImageAsset(asset, resolved)) {
    return;
  }
  // The stored asset carries the resolved URL instead of the raw one.
  assets.push({ ...asset, url: resolved });
}
|
|
897
|
+
/**
 * Pushes image candidates found in <img> elements and image <source>
 * elements of an already-loaded document.
 *
 * For each <img>, the candidates are its `src`, every lazy-load attribute and
 * every `srcset` entry, in that order.
 *
 * @param {Function} $ - Loaded document query function.
 * @param {object[]} images - Destination list, mutated in place.
 * @param {string} baseUrl - Base used when resolving candidate URLs.
 * @param {string} source - Source label stored on each asset; "fallback" marks <noscript> content.
 * @returns {void}
 */
function collectDocumentImages($, images, baseUrl, source) {
  const isFallback = source === "fallback";

  $("img").each((_, element) => {
    const img = $(element);
    const common = {
      kind: "image",
      source,
      width: parseInteger(img.attr("width")),
      height: parseInteger(img.attr("height")),
      alt: normalizeWhitespace(img.attr("alt")),
      title: normalizeWhitespace(img.attr("title"))
    };
    const candidates = [
      normalizeWhitespace(img.attr("src")),
      ...LAZY_IMAGE_ATTRIBUTES.map((attribute) => normalizeWhitespace(img.attr(attribute))),
      ...parseSrcset(img.attr("srcset"))
    ];
    candidates.forEach((candidate) => {
      pushResolved(images, {
        ...common,
        url: candidate,
        metadata: {
          discoveredFrom: isFallback ? "noscript" : "img"
        }
      }, baseUrl);
    });
  });

  $("picture source[srcset], source[type^='image/'][srcset]").each((_, element) => {
    const sourceElement = $(element);
    for (const candidate of parseSrcset(sourceElement.attr("srcset"))) {
      pushResolved(images, {
        url: candidate,
        kind: "image",
        source,
        type: normalizeWhitespace(sourceElement.attr("type")),
        metadata: {
          discoveredFrom: isFallback ? "noscript.picture.source" : "picture.source"
        }
      }, baseUrl);
    }
  });
}
|
|
936
|
+
/**
 * Decides whether a media URL should be discarded.
 *
 * Rejects `data:`, `blob:` and `javascript:` URLs, URLs containing a
 * placeholder/tracking keyword followed by a separator, and URLs with a
 * delimited `1x1` token.
 *
 * @param {string} url - Resolved media URL.
 * @returns {boolean} `true` when the URL should be ignored.
 */
function shouldIgnoreMediaUrl(url) {
  const normalized = url.toLowerCase();
  const hasInlineScheme = ["data:", "blob:", "javascript:"].some((scheme) => normalized.startsWith(scheme));
  if (hasInlineScheme) {
    return true;
  }
  if (/(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel)(?:[._/-]|$|\?)/i.test(normalized)) {
    return true;
  }
  return /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(normalized);
}
|
|
940
|
+
/**
 * Decides whether an image or favicon asset should be dropped based on its
 * declared dimensions.
 *
 * Only acts when both width and height are defined: anything with a side of
 * 2px or less is dropped, and anything up to 64x64 is dropped when its URL
 * contains an icon/logo/avatar-style keyword.
 *
 * @param {object} asset - Candidate asset (`kind`, `width`, `height`).
 * @param {string} url - Resolved URL of the asset.
 * @returns {boolean} `true` when the asset should be ignored.
 */
function shouldIgnoreImageAsset(asset, url) {
  const isImageLike = asset.kind === "image" || asset.kind === "favicon";
  if (!isImageLike) {
    return false;
  }
  const { width, height } = asset;
  const normalizedUrl = url.toLowerCase();
  if (width === void 0 || height === void 0) {
    return false;
  }
  if (width <= 2 || height <= 2) {
    return true;
  }
  const isSmall = width <= 64 && height <= 64;
  return isSmall && /(?:icon|favicon|apple-touch-icon|sprite|logo|avatar)/i.test(normalizedUrl);
}
|
|
957
|
+
/**
 * Heuristically recognises iframe sources that point at a video player.
 *
 * @param {string} src - The iframe `src` value.
 * @returns {boolean} `true` when the value mentions a known video host or `player.`.
 */
function isLikelyVideoEmbed(src) {
  const videoHostPattern = /youtube\.com|youtu\.be|vimeo\.com|dailymotion\.com|tiktok\.com|instagram\.com|facebook\.com|player\./i;
  return videoHostPattern.test(src);
}
|
|
960
|
+
/**
 * Returns the first truthy normalised value among the given attributes of an
 * element, checking them in order.
 *
 * @param {Function} $ - Loaded document query function.
 * @param {object} element - Element to read attributes from.
 * @param {string[]} attributes - Attribute names in priority order.
 * @returns {string|undefined} The first usable value, or `undefined`.
 */
function firstAttribute($, element, attributes) {
  let match;
  // `some` stops at the first attribute that yields a usable value.
  attributes.some((attribute) => {
    const candidate = normalizeWhitespace($(element).attr(attribute));
    if (candidate) {
      match = candidate;
    }
    return Boolean(candidate);
  });
  return match;
}
|
|
969
|
+
|
|
970
|
+
// src/extractors/oEmbed.ts
|
|
971
|
+
/**
 * Finds oEmbed discovery links in a document.
 *
 * A `link[rel][href]` element is kept when its `type` or `title` mentions
 * "oembed", or when its `href` carries a `format=json` / `format=xml` query
 * parameter. The `rel` tokens do not affect the outcome: only links that look
 * like oEmbed are ever accepted.
 *
 * @param {string} html - Markup handed to `loadDocument`.
 * @param {string} url - Base used when resolving link targets.
 * @returns {{ links: object[], data: [] }} Discovered links and an empty `data` list.
 */
function extractOEmbed(html, url) {
  const $ = loadDocument(html);
  const links = [];
  $("link[rel][href]").each((_, element) => {
    const link = $(element);
    const type = normalizeWhitespace(link.attr("type"));
    const title = normalizeWhitespace(link.attr("title"));
    const hrefValue = normalizeWhitespace(link.attr("href"));
    const looksLikeOEmbed = [
      /oembed/i.test(type ?? ""),
      /oembed/i.test(title ?? ""),
      /[?&]format=(?:json|xml)/i.test(hrefValue ?? "")
    ].some(Boolean);
    if (!looksLikeOEmbed) {
      return;
    }
    const href = tryResolveUrl(hrefValue, url);
    if (href) {
      links.push({ href, type, title });
    }
  });
  return { links, data: [] };
}
|
|
1002
|
+
/**
 * Fetches and parses the JSON payload of each oEmbed link.
 *
 * Requests run concurrently, but `data` and `warnings` follow the order of
 * `links` rather than the order in which responses happen to arrive, so the
 * result is deterministic. Links whose declared `type` does not mention JSON
 * are skipped with a warning; fetch or parse failures also become warnings
 * instead of rejecting.
 *
 * @param {Array<{ href: string, type?: string }>} links - Discovered oEmbed links.
 * @param {object} [options] - Options forwarded to `fetchPage`; `options.headers` is merged over a JSON `accept` default.
 * @returns {Promise<{ data: unknown[], warnings: string[] }>} Parsed payloads and warnings.
 */
async function fetchOEmbedData(links, options = {}) {
  const outcomes = await Promise.all(
    links.map(async (link) => {
      if (link.type && !/json/i.test(link.type)) {
        return { warning: `Skipping non-JSON oEmbed endpoint: ${link.href}.` };
      }
      try {
        const page = await fetchPage(link.href, {
          ...options,
          headers: {
            "accept": "application/json,*/*;q=0.8",
            ...headersToObject2(options.headers)
          }
        });
        return { payload: JSON.parse(page.html) };
      } catch (error) {
        return { warning: `Could not fetch oEmbed endpoint ${link.href}: ${error instanceof Error ? error.message : String(error)}` };
      }
    })
  );
  const data = [];
  const warnings = [];
  // Promise.all keeps input order, so folding here yields link-ordered output.
  for (const outcome of outcomes) {
    if ("warning" in outcome) {
      warnings.push(outcome.warning);
    } else {
      data.push(outcome.payload);
    }
  }
  return { data, warnings };
}
|
|
1027
|
+
/**
 * Converts any value accepted by the `Headers` constructor into a plain
 * object; a falsy input produces an empty object.
 *
 * @param {HeadersInit|undefined} headers - Header collection, if any.
 * @returns {Record<string, string>} Header names mapped to values.
 */
function headersToObject2(headers) {
  return headers ? Object.fromEntries(new Headers(headers).entries()) : {};
}
|
|
1033
|
+
|
|
1034
|
+
// src/extractors/openGraph.ts
|
|
1035
|
+
/**
 * Extracts Open Graph metadata, including `article:*` and `product:*` tags.
 *
 * All accepted tags are stored in `raw` through `setMapValue`; the named
 * fields of the result are read back from that map.
 *
 * @param {string} html - Markup handed to `loadDocument`.
 * @returns {object} Structured Open Graph data plus the `raw` tag map.
 */
function extractOpenGraph(html) {
  const $ = loadDocument(html);
  const raw = {};
  const acceptedPrefixes = ["og:", "article:", "product:"];

  $("meta[property], meta[name]").each((_, element) => {
    const key = readMetaKey($, element);
    const content = normalizeWhitespace($(element).attr("content"));
    const accepted = Boolean(key) && Boolean(content) && acceptedPrefixes.some((prefix) => key.startsWith(prefix));
    if (accepted) {
      setMapValue(raw, key, content);
    }
  });

  const first = (key) => firstRaw(raw, key);
  return {
    title: first("og:title"),
    description: first("og:description"),
    type: first("og:type"),
    url: first("og:url"),
    siteName: first("og:site_name"),
    locale: first("og:locale"),
    determiner: first("og:determiner"),
    images: collectStructuredMedia($, "image"),
    videos: collectStructuredMedia($, "video"),
    audio: collectStructuredMedia($, "audio"),
    article: {
      publishedTime: first("article:published_time"),
      modifiedTime: first("article:modified_time"),
      expirationTime: first("article:expiration_time"),
      section: first("article:section"),
      tags: allRaw(raw, "article:tag"),
      authors: allRaw(raw, "article:author")?.map((name) => ({ name }))
    },
    product: {
      price: first("product:price:amount"),
      currency: first("product:price:currency"),
      availability: first("product:availability"),
      condition: first("product:condition")
    },
    raw
  };
}
|
|
1074
|
+
/**
 * Builds media assets from `og:<kind>` meta tags, in document order.
 *
 * A bare `og:<kind>` tag starts a new asset. `og:<kind>:url` fills the URL of
 * the latest asset when that asset has none yet, and otherwise starts a new
 * one. Other structured properties (`secure_url`, `type`, `width`, `height`,
 * `alt`) attach to the latest asset, creating a placeholder when none exists.
 * Assets that end up without a URL are discarded.
 *
 * @param {Function} $ - Loaded document query function.
 * @param {string} kind - Media kind, used both in the tag prefix and on each asset.
 * @returns {object[]} Assets that have a URL.
 */
function collectStructuredMedia($, kind) {
  const assets = [];
  const prefix = `og:${kind}`;

  $("meta[property], meta[name]").each((_, element) => {
    const key = readMetaKey($, element);
    const content = normalizeWhitespace($(element).attr("content"));
    if (!key?.startsWith(prefix) || !content) {
      return;
    }
    const current = assets.at(-1);

    if (key === prefix || key === `${prefix}:url`) {
      const fillsPlaceholder = key.endsWith(":url") && current && !current.url;
      if (fillsPlaceholder) {
        current.url = content;
      } else {
        assets.push({ url: content, kind, source: "openGraph" });
      }
      return;
    }

    const target = current ?? pushEmptyAsset(assets, kind);
    switch (key.slice(prefix.length + 1)) {
      case "secure_url":
        target.secureUrl = content;
        break;
      case "type":
        target.type = content;
        break;
      case "width":
        target.width = parseInteger(content);
        break;
      case "height":
        target.height = parseInteger(content);
        break;
      case "alt":
        target.alt = content;
        break;
      default:
        break;
    }
  });

  return assets.filter((asset) => Boolean(asset.url));
}
|
|
1112
|
+
/**
 * Appends a URL-less Open Graph placeholder asset and returns it so that
 * structured properties can be attached to it.
 *
 * @param {object[]} assets - Destination list, mutated in place.
 * @param {string} kind - Media kind stored on the placeholder.
 * @returns {object} The placeholder that was appended.
 */
function pushEmptyAsset(assets, kind) {
  const position = assets.push({ url: "", kind, source: "openGraph" }) - 1;
  return assets[position];
}
|
|
1121
|
+
/**
 * Reads a meta tag's key, preferring the `property` attribute over `name`.
 *
 * @param {Function} $ - Loaded document query function.
 * @param {object} element - The <meta> element.
 * @returns {string|undefined} The normalised key, if any.
 */
function readMetaKey($, element) {
  const fromProperty = normalizeWhitespace($(element).attr("property"));
  return fromProperty ?? normalizeWhitespace($(element).attr("name"));
}
|
|
1124
|
+
/**
 * Returns the first value stored under `key`: the first element when the
 * entry is an array, otherwise the entry itself.
 *
 * @param {object} raw - Raw tag map.
 * @param {string} key - Tag name.
 * @returns {*} The first value, or `undefined` when absent.
 */
function firstRaw(raw, key) {
  const entry = raw[key];
  if (!Array.isArray(entry)) {
    return entry;
  }
  return entry[0];
}
|
|
1128
|
+
/**
 * Returns every value stored under `key` as an array, or `undefined` when the
 * entry is missing or falsy.
 *
 * @param {object} raw - Raw tag map.
 * @param {string} key - Tag name.
 * @returns {Array|undefined} The values, if any.
 */
function allRaw(raw, key) {
  const entry = raw[key];
  if (Array.isArray(entry)) {
    return entry;
  }
  return entry ? [entry] : void 0;
}
|
|
1135
|
+
|
|
1136
|
+
// src/extractors/twitter.ts
|
|
1137
|
+
/**
 * Extracts Twitter Card metadata from `twitter:*` meta tags.
 *
 * @param {string} html - Markup handed to `loadDocument`.
 * @returns {object} Card fields, image and video assets, and the `raw` tag map.
 */
function extractTwitterCards(html) {
  const $ = loadDocument(html);
  const raw = {};

  $("meta[name], meta[property]").each((_, element) => {
    const key = readMetaKey2($, element);
    const content = normalizeWhitespace($(element).attr("content"));
    if (key?.startsWith("twitter:") && content) {
      setMapValue(raw, key, content);
    }
  });

  const first = (key) => firstRaw2(raw, key);
  return {
    card: first("twitter:card"),
    site: first("twitter:site"),
    creator: first("twitter:creator"),
    title: first("twitter:title"),
    description: first("twitter:description"),
    images: collectImages(raw),
    videos: collectVideos(raw),
    raw
  };
}
|
|
1159
|
+
/**
 * Builds image assets from Twitter Card tags.
 *
 * URLs come from a fixed list of image keys followed by any
 * `twitter:image` / `twitter:image:<n>` entries, de-duplicated by
 * `uniqueStrings`. The single `twitter:image:alt`, `:width` and `:height`
 * values are applied to every resulting asset.
 *
 * @param {object} raw - Raw `twitter:*` tag map.
 * @returns {object[]} Image assets with `source: "twitter"`.
 */
function collectImages(raw) {
  const fixedKeys = [
    "twitter:image",
    "twitter:image:src",
    "twitter:images",
    "twitter:image0",
    "twitter:image1",
    "twitter:image2",
    "twitter:image3",
    "twitter:player:image"
  ];
  const candidates = [];
  for (const key of fixedKeys) {
    candidates.push(...allRaw2(raw, key));
  }
  for (const [key, value] of Object.entries(raw)) {
    if (/^twitter:image(?::\d+)?$/i.test(key) || /^twitter:image:\d+$/i.test(key)) {
      candidates.push(...(Array.isArray(value) ? value : [value]));
    }
  }

  const shared = {
    alt: firstRaw2(raw, "twitter:image:alt"),
    width: parseInteger(firstRaw2(raw, "twitter:image:width")),
    height: parseInteger(firstRaw2(raw, "twitter:image:height"))
  };
  return uniqueStrings(candidates).map((url) => ({
    url,
    kind: "image",
    source: "twitter",
    ...shared
  }));
}
|
|
1183
|
+
/**
 * Builds video assets from Twitter player tags.
 *
 * Uses the `twitter:player` URLs when present and otherwise falls back to
 * `twitter:player:stream`. `allRaw2` returns an empty array (never a nullish
 * value) for a missing key, so the fallback has to test the list length; a
 * `??` chain would never reach the stream key.
 *
 * @param {object} raw - Raw `twitter:*` tag map.
 * @returns {object[]} Video assets with `source: "twitter"` and the shared player dimensions.
 */
function collectVideos(raw) {
  const embeddedPlayers = allRaw2(raw, "twitter:player");
  const players = embeddedPlayers.length > 0 ? embeddedPlayers : allRaw2(raw, "twitter:player:stream");
  const width = parseInteger(firstRaw2(raw, "twitter:player:width"));
  const height = parseInteger(firstRaw2(raw, "twitter:player:height"));
  return players.map((url) => ({
    url,
    kind: "video",
    source: "twitter",
    width,
    height
  }));
}
|
|
1195
|
+
/**
 * Reads a meta tag's key, preferring the `name` attribute over `property`.
 *
 * @param {Function} $ - Loaded document query function.
 * @param {object} element - The <meta> element.
 * @returns {string|undefined} The normalised key, if any.
 */
function readMetaKey2($, element) {
  const fromName = normalizeWhitespace($(element).attr("name"));
  return fromName ?? normalizeWhitespace($(element).attr("property"));
}
|
|
1198
|
+
/**
 * Returns the first value stored under `key` in the raw tag map.
 *
 * @param {object} raw - Raw tag map.
 * @param {string} key - Tag name.
 * @returns {*} First array element, the scalar entry, or `undefined`.
 */
function firstRaw2(raw, key) {
  const { [key]: entry } = raw;
  return Array.isArray(entry) ? entry[0] : entry;
}
|
|
1202
|
+
/**
 * Returns every value stored under `key` as an array; a missing or falsy
 * entry produces an empty array.
 *
 * @param {object} raw - Raw tag map.
 * @param {string} key - Tag name.
 * @returns {Array} The values, possibly empty.
 */
function allRaw2(raw, key) {
  const entry = raw[key];
  if (Array.isArray(entry)) {
    return entry;
  }
  return entry ? [entry] : [];
}
|
|
1209
|
+
/**
 * Removes falsy entries and duplicates while keeping first-seen order.
 *
 * @param {Array} values - Candidate values.
 * @returns {Array} Distinct truthy values.
 */
function uniqueStrings(values) {
  const seen = new Set();
  for (const value of values) {
    if (value) {
      seen.add(value);
    }
  }
  return Array.from(seen);
}
|
|
1212
|
+
|
|
1213
|
+
// src/scorers/image.ts
|
|
1214
|
+
// Base score assigned to an image according to the source that produced it.
// Sources missing from this table fall back to 50 in scoreImageWithDetails.
var SOURCE_WEIGHT = {
  // Adapter-provided and declared preview metadata.
  adapter: 96,
  openGraph: 92,
  oEmbed: 88,
  jsonLd: 82,
  twitter: 78,
  // Embedded application data.
  nextData: 76,
  nuxt: 74,
  initialState: 73,
  preloadedState: 73,
  apollo: 72,
  applicationJson: 70,
  jsonScript: 66,
  // Markup scanning and last resorts.
  html: 42,
  fallback: 36,
  favicon: 8
};
|
|
1231
|
+
/**
 * Scores every image and returns the scored copies sorted best-first.
 *
 * The built-in score is combined with the sum of all custom scorer results
 * and clamped to 0..100. `confidence` is the score divided by 100, rounded to
 * two decimals, and the explanation is stored in `metadata.scoreReasons`.
 *
 * @param {object[]} images - Candidate images.
 * @param {Function[]} [customScorers] - Each is called with `(image, { index, images })` and returns a score delta.
 * @returns {object[]} Scored images in descending score order.
 */
function scoreImages(images, customScorers = []) {
  const duplicateCounts = countDuplicates(images);
  const scoredImages = images.map((image, index) => {
    const builtIn = scoreImageWithDetails(image, index, images, duplicateCounts);
    let customScore = 0;
    for (const scorer of customScorers) {
      customScore += scorer(image, { index, images });
    }
    const scoreReasons = customScore === 0
      ? builtIn.reasons
      : [...builtIn.reasons, `custom scorers adjusted score by ${formatSigned(customScore)}`];
    const score = clamp(builtIn.score + customScore, 0, 100);
    return {
      ...image,
      score,
      confidence: Number((score / 100).toFixed(2)),
      metadata: { ...image.metadata, scoreReasons }
    };
  });
  return scoredImages.sort((left, right) => (right.score ?? 0) - (left.score ?? 0));
}
|
|
1249
|
+
/**
 * Scores the images and reports the top-ranked one with an explanation.
 *
 * @param {object[]} images - Candidate images.
 * @param {Function[]} [customScorers] - Extra scorers forwarded to `scoreImages`.
 * @returns {{ best: object|undefined, images: object[], reason: string|undefined }} Ranking result; `best` and `reason` are undefined when there are no images.
 */
function selectBestImage(images, customScorers = []) {
  const ranked = scoreImages(images, customScorers);
  const top = ranked[0];
  const reason = top ? explainImageChoice(top) : void 0;
  return { best: top, images: ranked, reason };
}
|
|
1258
|
+
/**
 * Computes the unclamped built-in score for one image with its reasons.
 *
 * Starts from the source's base weight (50 when the source is unknown), adds
 * the dimension, format and URL-hint scores, then subtracts the URL penalty,
 * the duplicate penalty and a position penalty of 1.5 points per index,
 * capped at 10. A lone candidate receives a 4-point bonus.
 *
 * @param {object} image - Image being scored.
 * @param {number} index - Position of the image among the candidates.
 * @param {object[]} images - All candidates.
 * @param {Map<string, number>} duplicateCounts - Signature counts from `countDuplicates`.
 * @returns {{ score: number, reasons: string[] }} Score and human-readable reasons.
 */
function scoreImageWithDetails(image, index, images, duplicateCounts) {
  const baseScore = SOURCE_WEIGHT[image.source] ?? 50;
  const reasons = [`source ${sourceLabel(image.source)} added ${baseScore} base points`];

  const dimensions = scoreDimensions(image);
  const format = scoreFormat(image);
  const urlSignal = scoreUrlSignal(image);
  const urlPenalty = scoreUrlPenalty(image);
  const duplicatePenalty = scoreDuplicatePenalty(image, duplicateCounts);
  const positionPenalty = Math.min(index * 1.5, 10);

  let score = baseScore;
  score += dimensions.score;
  score += format.score;
  score += urlSignal.score;
  score -= urlPenalty;
  score -= duplicatePenalty.score;
  score -= positionPenalty;
  reasons.push(...dimensions.reasons, ...format.reasons, ...urlSignal.reasons, ...duplicatePenalty.reasons);

  if (images.length === 1) {
    score += 4;
    reasons.push("only candidate image added 4 points");
  }
  if (urlPenalty > 0) {
    reasons.push(`URL penalties subtracted ${urlPenalty} points`);
  }
  if (positionPenalty > 0) {
    reasons.push(`candidate position subtracted ${formatNumber(positionPenalty)} points`);
  }
  return { score, reasons };
}
|
|
1286
|
+
/**
 * Scores an image by its declared size and aspect ratio.
 *
 * Images narrower than 120px or shorter than 90px lose 35 points; larger
 * areas earn tiered bonuses. Ratios near 1.91 earn 12 points, ratios near
 * 16:9 or 1:1 earn 8, and ratios above 4 or below 0.25 lose 16.
 *
 * @param {object} image - Image with optional `width` and `height`.
 * @returns {{ score: number, reasons: string[] }} Score delta and reasons.
 */
function scoreDimensions(image) {
  const { width, height } = image;
  if (!width || !height) {
    return { score: 0, reasons: ["dimensions are unknown"] };
  }
  const area = width * height;
  const ratio = width / height;
  const size = `${width}x${height}`;
  const ratioText = ratio.toFixed(2);
  const reasons = [];
  let score = 0;

  // [minimum area, bonus] pairs, checked from the largest tier down.
  const areaTiers = [[1e6, 12], [3e5, 9], [9e4, 5]];
  if (width < 120 || height < 90) {
    score -= 35;
    reasons.push(`${size} is below preview minimum and subtracted 35 points`);
  } else {
    const tier = areaTiers.find(([minimumArea]) => area >= minimumArea);
    if (tier) {
      score += tier[1];
      reasons.push(`${size} dimensions added ${tier[1]} points`);
    } else if (area < 1e4) {
      score -= 20;
      reasons.push(`${size} dimensions subtracted 20 points`);
    }
  }

  if (isNear(ratio, 1.91, 0.18)) {
    score += 12;
    reasons.push(`aspect ratio ${ratioText} matched social preview ratio`);
  } else if (isNear(ratio, 16 / 9, 0.2) || isNear(ratio, 1, 0.2)) {
    score += 8;
    reasons.push(`aspect ratio ${ratioText} matched a common preview ratio`);
  } else if (ratio > 4 || ratio < 0.25) {
    score -= 16;
    reasons.push(`aspect ratio ${ratioText} is unlikely to preview well`);
  }
  return { score, reasons };
}
|
|
1324
|
+
/**
 * Scores an image by its format, using the declared MIME type or the file
 * extension in the URL.
 *
 * Every extension check tolerates a trailing query string or fragment. WebP
 * was previously matched with `endsWith(".webp")` only, so a URL such as
 * `photo.webp?w=800` received no format score while the equivalent JPEG,
 * PNG and AVIF URLs did.
 *
 * @param {{ url: string, type?: string }} image - Image to inspect.
 * @returns {{ score: number, reasons: string[] }} Score delta and reasons.
 */
function scoreFormat(image) {
  const type = image.type?.toLowerCase() ?? "";
  const url = image.url.toLowerCase();
  if (type.includes("webp") || /\.webp(?:[?#]|$)/.test(url)) {
    return { score: 4, reasons: ["modern WebP format added 4 points"] };
  }
  if (type.includes("avif") || /\.avif(?:[?#]|$)/.test(url)) {
    return { score: 4, reasons: ["modern AVIF format added 4 points"] };
  }
  if (type.includes("jpeg") || type.includes("jpg") || /\.jpe?g(?:[?#]|$)/.test(url)) {
    return { score: 3, reasons: ["JPEG format added 3 points"] };
  }
  if (type.includes("png") || /\.png(?:[?#]|$)/.test(url)) {
    return { score: 2, reasons: ["PNG format added 2 points"] };
  }
  if (/\.(?:gif|svg|ico)(?:[?#]|$)/.test(url)) {
    return { score: -8, reasons: ["GIF/SVG/ICO formats are weaker preview candidates"] };
  }
  return { score: 0, reasons: [] };
}
|
|
1344
|
+
/**
 * Awards points when the URL contains words that suggest a preview image.
 *
 * Each distinct hint adds 4 points, capped at 12. The `og` hint only counts
 * when it is not preceded by a letter; matched as a bare substring it also
 * fired inside words such as "logo", "blog" and "catalog", rewarding URLs
 * that carry no preview hint at all.
 *
 * @param {{ url: string }} image - Image to inspect.
 * @returns {{ score: number, reasons: string[] }} Score delta and reasons.
 */
function scoreUrlSignal(image) {
  const url = image.url.toLowerCase();
  const matches = url.match(/cover|preview|thumbnail|thumb|(?<![a-z])og|card|media|hero|share|social/g) ?? [];
  if (matches.length === 0) {
    return { score: 0, reasons: [] };
  }
  const uniqueMatches = [...new Set(matches)];
  const score = Math.min(uniqueMatches.length * 4, 12);
  return {
    score,
    reasons: [`URL matched preview hints (${uniqueMatches.join(", ")}) and added ${score} points`]
  };
}
|
|
1357
|
+
/**
 * Sums the penalties for images that look like icons, logos, avatars,
 * placeholders or favicons, judged by URL, alt text and kind.
 *
 * @param {{ url: string, alt?: string, kind?: string }} image - Image to inspect.
 * @returns {number} Total penalty points (0 when nothing matches).
 */
function scoreUrlPenalty(image) {
  const url = image.url.toLowerCase();
  // Each rule pairs "does it apply" with the points it costs.
  const rules = [
    [/favicon|apple-touch-icon|sprite|icon-|\/icon|placeholder|blank|spacer|pixel|tracking|emoji/.test(url), 30],
    [/logo|avatar|profile|headshot|badge/.test(url), 22],
    [Boolean(image.alt) && /logo|avatar|icon|emoji/i.test(image.alt), 14],
    [image.kind === "favicon", 35]
  ];
  return rules.reduce((penalty, [applies, points]) => (applies ? penalty + points : penalty), 0);
}
|
|
1374
|
+
/**
 * Penalises images whose signature is shared with other candidates:
 * 4 points per additional copy, capped at 12.
 *
 * @param {{ url: string }} image - Image to inspect.
 * @param {Map<string, number>} duplicateCounts - Signature counts from `countDuplicates`.
 * @returns {{ score: number, reasons: string[] }} Penalty (as a positive number) and reasons.
 */
function scoreDuplicatePenalty(image, duplicateCounts) {
  const groupSize = duplicateCounts.get(mediaSignature(image.url)) ?? 0;
  const extraCopies = groupSize - 1;
  if (extraCopies <= 0) {
    return { score: 0, reasons: [] };
  }
  const penalty = Math.min(extraCopies * 4, 12);
  return {
    score: penalty,
    reasons: [`duplicate-like URL group subtracted ${penalty} points`]
  };
}
|
|
1385
|
+
/**
 * Produces a one-line explanation of why an image was selected, citing its
 * source, its dimensions, its rounded score and up to four score reasons.
 *
 * @param {object} image - Scored image.
 * @returns {string} Human-readable explanation.
 */
function explainImageChoice(image) {
  const hasDimensions = image.width && image.height;
  const dimensions = hasDimensions ? `${image.width}x${image.height}` : "unknown dimensions";
  const scoreReasons = image.metadata?.scoreReasons;
  const reasons = Array.isArray(scoreReasons) ? scoreReasons.slice(0, 4).join("; ") : "";
  const reasonSuffix = reasons ? ` Reasons: ${reasons}.` : "";
  const roundedScore = Math.round(image.score ?? 0);
  return `Selected because it came from ${sourceLabel(image.source)}, has ${dimensions}, and scored ${roundedScore}.${reasonSuffix}`;
}
|
|
1391
|
+
/**
 * Checks whether `value` lies within `tolerance` of `target` (inclusive).
 *
 * @param {number} value - Value to test.
 * @param {number} target - Reference value.
 * @param {number} tolerance - Maximum allowed absolute difference.
 * @returns {boolean} Whether the value is close enough.
 */
function isNear(value, target, tolerance) {
  const distance = Math.abs(value - target);
  return distance <= tolerance;
}
|
|
1394
|
+
/**
 * Restricts `value` to the range between `min` and `max`.
 *
 * @param {number} value - Value to restrict.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} The bounded value.
 */
function clamp(value, min, max) {
  const capped = Math.min(max, value);
  return Math.max(min, capped);
}
|
|
1397
|
+
/**
 * Maps an internal source identifier to the label used in explanations.
 * Identifiers without a dedicated label are returned unchanged.
 *
 * @param {string} source - Internal source identifier.
 * @returns {string} Display label.
 */
function sourceLabel(source) {
  switch (source) {
    case "openGraph":
      return "og:image";
    case "twitter":
      return "Twitter Card metadata";
    case "jsonLd":
      return "JSON-LD";
    case "oEmbed":
      return "oEmbed";
    case "nextData":
    case "nuxt":
    case "initialState":
    case "preloadedState":
    case "apollo":
    case "applicationJson":
    case "jsonScript":
      return "embedded application data";
    default:
      return source;
  }
}
|
|
1415
|
+
/**
 * Formats a number with an explicit leading "+" when it is zero or positive.
 *
 * @param {number} value - Number to format.
 * @returns {string} Signed text representation.
 */
function formatSigned(value) {
  const text = formatNumber(value);
  return value >= 0 ? `+${text}` : text;
}
|
|
1418
|
+
/**
 * Formats integers without decimals and every other number with one decimal.
 *
 * @param {number} value - Number to format.
 * @returns {string} Text representation.
 */
function formatNumber(value) {
  if (Number.isInteger(value)) {
    return String(value);
  }
  return value.toFixed(1);
}
|
|
1421
|
+
/**
 * Counts how many images share each media signature.
 *
 * @param {Array<{ url: string }>} images - Candidate images.
 * @returns {Map<string, number>} Signature mapped to occurrence count.
 */
function countDuplicates(images) {
  return images.reduce((counts, { url }) => {
    const signature = mediaSignature(url);
    return counts.set(signature, (counts.get(signature) ?? 0) + 1);
  }, new Map());
}
|
|
1429
|
+
/**
 * Derives a comparison key for a media URL so size variants of one file can
 * be grouped together.
 *
 * The key is the lower-cased host plus path, with the first size-style suffix
 * that sits directly before a dot removed (for example `-300x200`, `_640w`,
 * `-thumb`). The query string is not part of the key. Input that cannot be
 * parsed as a URL is simply lower-cased.
 *
 * @param {string} url - Media URL.
 * @returns {string} Signature used for duplicate detection.
 */
function mediaSignature(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url.toLowerCase();
  }
  const sizeSuffix = /[-_](?:\d{2,5}x\d{2,5}|\d{2,5}w|small|medium|large|thumb|thumbnail)(?=\.)/i;
  const pathWithoutSize = parsed.pathname.replace(sizeSuffix, "");
  return `${parsed.hostname}${pathWithoutSize}`.toLowerCase();
}
|
|
1437
|
+
|
|
1438
|
+
// src/media/MediaDiscoveryEngine.ts
|
|
1439
|
+
// Property names passed to mediaFromEmbeddedItem when collecting images from embedded data.
var IMAGE_KEYS = [
  "image", "images",
  "thumbnail", "thumbnailUrl", "thumbnail_url", "thumbnailSrc",
  "previewImage", "preview_image",
  "ogImage", "cardImage",
  "cover", "coverImage",
  "poster",
  "media"
];
|
|
1455
|
+
// Property names passed to mediaFromEmbeddedItem when collecting videos from embedded data.
var VIDEO_KEYS = ["video", "videos", "videoUrl", "video_url", "contentUrl", "embedUrl", "playbackUrl"];
|
|
1456
|
+
// Property names passed to mediaFromEmbeddedItem when collecting audio from embedded data.
var AUDIO_KEYS = ["audio", "audios", "audioUrl", "audio_url", "podcastUrl"];
|
|
1457
|
+
/**
 * Merges media candidates from every extraction source into de-duplicated
 * image, video and audio lists, and records which sources contributed.
 *
 * Candidates are concatenated in a fixed order with plugin and adapter
 * results first, then normalised against `finalUrl`. The trace entry for
 * adapter/plugin media now also fires when the only externally supplied
 * media is audio; previously audio-only results were merged but left out of
 * the trace.
 *
 * @param {object} rawSources - Extraction results (`plugins`, `adapters`, `openGraph`, `twitter`, `jsonLd`, `embeddedData`, `html`, `images`, `videos`, `audio`).
 * @param {string} finalUrl - Base used when resolving asset URLs.
 * @returns {{ images: object[], videos: object[], audio: object[], trace: string[] }} Merged media and a discovery trace.
 */
function discoverMedia(rawSources, finalUrl) {
  const trace = [];
  const externalResults = [...rawSources.plugins, ...rawSources.adapters];
  const fromExternal = (field) => externalResults.flatMap((result) => result[field] ?? []);

  const images = normalizeAssets(
    [
      ...fromExternal("images"),
      ...rawSources.openGraph.images,
      ...rawSources.twitter.images,
      ...jsonLdImages(rawSources.jsonLd.nodes),
      ...embeddedImages(rawSources.embeddedData.items),
      ...(rawSources.html.imageSrc ? [rawSources.html.imageSrc] : []),
      ...rawSources.images,
      ...oEmbedImages(rawSources)
    ],
    finalUrl
  );
  const videos = normalizeAssets(
    [
      ...fromExternal("videos"),
      ...rawSources.openGraph.videos,
      ...rawSources.twitter.videos,
      ...jsonLdVideos(rawSources.jsonLd.nodes),
      ...embeddedVideos(rawSources.embeddedData.items),
      ...rawSources.videos
    ],
    finalUrl
  );
  const audio = normalizeAssets(
    [
      ...fromExternal("audio"),
      ...rawSources.openGraph.audio,
      ...jsonLdAudio(rawSources.jsonLd.nodes),
      ...embeddedAudio(rawSources.embeddedData.items),
      ...rawSources.audio
    ],
    finalUrl
  );

  if (rawSources.openGraph.images.length > 0) {
    trace.push("media discovery collected og:image candidates");
  }
  if (rawSources.twitter.images.length > 0) {
    trace.push("media discovery collected twitter:image candidates");
  }
  if (rawSources.jsonLd.nodes.length > 0) {
    trace.push("media discovery scanned JSON-LD media");
  }
  if (rawSources.embeddedData.items.length > 0) {
    trace.push("media discovery scanned embedded application data");
  }
  if (rawSources.images.length > 0) {
    trace.push("media discovery scanned HTML images, srcset, lazy images, posters, and fallbacks");
  }
  // Audio counts as externally supplied media too.
  const hasExternalMedia = externalResults.some((result) =>
    ["images", "videos", "audio"].some((field) => (result[field]?.length ?? 0) > 0)
  );
  if (hasExternalMedia) {
    trace.push("media discovery included adapter and plugin media");
  }

  return {
    images: dedupeMediaBySignature(uniqueMediaByUrl(images)),
    videos: dedupeMediaBySignature(uniqueMediaByUrl(videos)),
    audio: dedupeMediaBySignature(uniqueMediaByUrl(audio)),
    trace
  };
}
|
|
1519
|
+
/**
 * Resolves the URL fields of each media asset against `baseUrl` and drops unusable entries.
 *
 * For every asset, `secureUrl`, `url` (preferring the resolved `secureUrl` when it is
 * non-nullish) and `poster` are passed through `tryResolveUrl`. An asset is omitted when its
 * resolved `url` is falsy or is rejected by `shouldIgnoreMediaUrl2`.
 *
 * @param {Array<object>} assets - Media asset candidates.
 * @param {string} baseUrl - Base URL handed to `tryResolveUrl`.
 * @returns {Array<object>} Copies of the surviving assets with the resolved URL fields,
 *   each passed through `stripUndefined`.
 */
function normalizeAssets(assets, baseUrl) {
  return assets.flatMap((asset) => {
    const secureUrl = tryResolveUrl(asset.secureUrl, baseUrl);
    const url = tryResolveUrl(secureUrl ?? asset.url, baseUrl);
    const poster = tryResolveUrl(asset.poster, baseUrl);
    const usable = Boolean(url) && !shouldIgnoreMediaUrl2(url);
    // An empty array makes flatMap drop the entry; a one-element array keeps it.
    return usable ? [stripUndefined({ ...asset, url, secureUrl, poster })] : [];
  });
}
|
|
1535
|
+
/**
 * Collects image candidates from JSON-LD nodes.
 *
 * Reads the `image`, `thumbnailUrl`, `thumbnail` and `logo` properties of every node, in that
 * order, and converts each value with `mediaFromJsonValue` (kind "image", source "jsonLd").
 *
 * @param {Array<object>} nodes - JSON-LD nodes.
 * @returns {Array<object>} Flat list of image assets in node order.
 */
function jsonLdImages(nodes) {
  const imageProperties = ["image", "thumbnailUrl", "thumbnail", "logo"];
  return nodes.flatMap((node) =>
    imageProperties.flatMap((property) => mediaFromJsonValue(node[property], "image", "jsonLd"))
  );
}
|
|
1543
|
+
/**
 * Collects video candidates from JSON-LD nodes.
 *
 * Reads the `contentUrl`, `embedUrl` and `video` properties of every node, in that order, and
 * converts each value with `mediaFromJsonValue` (kind "video", source "jsonLd").
 *
 * @param {Array<object>} nodes - JSON-LD nodes.
 * @returns {Array<object>} Flat list of video assets in node order.
 */
function jsonLdVideos(nodes) {
  const videoProperties = ["contentUrl", "embedUrl", "video"];
  return nodes.flatMap((node) =>
    videoProperties.flatMap((property) => mediaFromJsonValue(node[property], "video", "jsonLd"))
  );
}
|
|
1550
|
+
/**
 * Collects audio candidates from JSON-LD nodes.
 *
 * Reads the `contentUrl` and `encoding` properties of every node, in that order, and converts
 * each value with `mediaFromJsonValue` (kind "audio", source "jsonLd").
 *
 * @param {Array<object>} nodes - JSON-LD nodes.
 * @returns {Array<object>} Flat list of audio assets in node order.
 */
function jsonLdAudio(nodes) {
  const audioProperties = ["contentUrl", "encoding"];
  return nodes.flatMap((node) =>
    audioProperties.flatMap((property) => mediaFromJsonValue(node[property], "audio", "jsonLd"))
  );
}
|
|
1556
|
+
/**
 * Collects image candidates from embedded application data items.
 *
 * @param {Array<object>} items - Embedded data items.
 * @returns {Array<object>} Flat list of the assets `mediaFromEmbeddedItem` finds under `IMAGE_KEYS`.
 */
function embeddedImages(items) {
  const collectImages = (item) => mediaFromEmbeddedItem(item, IMAGE_KEYS, "image");
  return items.flatMap(collectImages);
}
|
|
1559
|
+
/**
 * Collects video candidates from embedded application data items.
 *
 * @param {Array<object>} items - Embedded data items.
 * @returns {Array<object>} Flat list of the assets `mediaFromEmbeddedItem` finds under `VIDEO_KEYS`.
 */
function embeddedVideos(items) {
  const collectVideos = (item) => mediaFromEmbeddedItem(item, VIDEO_KEYS, "video");
  return items.flatMap(collectVideos);
}
|
|
1562
|
+
/**
 * Collects audio candidates from embedded application data items.
 *
 * @param {Array<object>} items - Embedded data items.
 * @returns {Array<object>} Flat list of the assets `mediaFromEmbeddedItem` finds under `AUDIO_KEYS`.
 */
function embeddedAudio(items) {
  const collectAudio = (item) => mediaFromEmbeddedItem(item, AUDIO_KEYS, "audio");
  return items.flatMap(collectAudio);
}
|
|
1565
|
+
/**
 * Walks one embedded data item and gathers media assets found under matching property keys.
 *
 * Matching is case-insensitive and has two tiers:
 *  - exact key match: the value (string, array or object) is converted by `mediaFromJsonValue`;
 *  - partial match (the key merely contains one of `keys`): only string values that pass
 *    `looksLikeMediaUrl` are accepted, and they are wrapped by `assetFromEmbedded`.
 * Values visited without a key (the root and array elements) are never collected directly.
 *
 * @param {object} item - Embedded item; `data`, `source` and `path` are read.
 * @param {string[]} keys - Property names that indicate media of the requested kind.
 * @param {string} kind - "image", "video" or "audio".
 * @returns {Array<object>} Assets in visiting order.
 */
function mediaFromEmbeddedItem(item, keys, kind) {
  const collected = [];
  const wantedKeys = keys.map((candidate) => candidate.toLowerCase());
  walkEmbeddedData(item.data, (value, key, parent) => {
    if (!key) {
      return;
    }
    const loweredKey = key.toLowerCase();
    if (wantedKeys.includes(loweredKey)) {
      collected.push(...mediaFromJsonValue(value, kind, item.source));
      return;
    }
    const isMediaString = typeof value === "string" && looksLikeMediaUrl(value, kind);
    if (isMediaString && wantedKeys.some((candidate) => loweredKey.includes(candidate))) {
      collected.push(assetFromEmbedded(value, kind, item, parent));
    }
  });
  return collected;
}
|
|
1578
|
+
/**
 * Converts a JSON value (string, array, or object) into zero or more media assets.
 *
 *  - Falsy values and unsupported types produce an empty list.
 *  - A string becomes a single asset when it passes `looksLikeMediaUrl`.
 *  - An array is processed element by element (recursively) and flattened.
 *  - An object contributes at most one asset. Its URL is the first of `url`, `src`,
 *    `contentUrl`, `thumbnailUrl` that passes `looksLikeMediaUrl` for the requested kind.
 *    Earlier candidates that are present but not media-like (for example a `url` pointing at
 *    the page while `contentUrl` points at the file) no longer cause the object to be skipped.
 *
 * @param {unknown} value - Raw JSON value to inspect.
 * @param {string} kind - "image", "video" or "audio".
 * @param {string} source - Source label copied onto every produced asset.
 * @returns {Array<object>} Media assets; object-derived assets also carry `width`, `height`,
 *   `alt`, `title` and `type`, which may be undefined.
 */
function mediaFromJsonValue(value, kind, source) {
  if (!value) {
    return [];
  }
  if (typeof value === "string") {
    return looksLikeMediaUrl(value, kind) ? [{ url: value, kind, source }] : [];
  }
  if (Array.isArray(value)) {
    return value.flatMap((item) => mediaFromJsonValue(item, kind, source));
  }
  if (!isRecord3(value)) {
    return [];
  }
  // Same priority order as before, but keep looking when a candidate is not media-like.
  const url = [value.url, value.src, value.contentUrl, value.thumbnailUrl]
    .map((candidate) => stringFromUnknown(candidate))
    .find((candidate) => candidate !== void 0 && looksLikeMediaUrl(candidate, kind));
  if (!url) {
    return [];
  }
  return [
    {
      url,
      kind,
      source,
      width: parseNumber(stringFromUnknown(value.width)),
      height: parseNumber(stringFromUnknown(value.height)),
      alt: stringFromUnknown(value.alt) ?? stringFromUnknown(value.caption) ?? stringFromUnknown(value.name),
      title: stringFromUnknown(value.title),
      type: stringFromUnknown(value.type) ?? stringFromUnknown(value.mimeType) ?? stringFromUnknown(value.encodingFormat)
    }
  ];
}
|
|
1608
|
+
/**
 * Builds a media asset for a URL string found inside embedded application data.
 *
 * Descriptive fields (`width`, `height`, `alt`/`caption`, `title`) are read from the object
 * that contained the URL, when one was supplied.
 *
 * @param {string} value - The media URL.
 * @param {string} kind - "image", "video" or "audio".
 * @param {object} item - Embedded item; `source` and `path` are copied onto the asset.
 * @param {object} [parent] - Object that held the URL property.
 * @returns {object} Asset with `metadata.embeddedPath` set to `item.path`.
 */
function assetFromEmbedded(value, kind, item, parent) {
  const parentText = (field) => stringFromUnknown(parent?.[field]);
  const parentNumber = (field) => parseNumber(parentText(field));
  return {
    url: value,
    kind,
    source: item.source,
    width: parentNumber("width"),
    height: parentNumber("height"),
    alt: parentText("alt") ?? parentText("caption"),
    title: parentText("title"),
    metadata: { embeddedPath: item.path }
  };
}
|
|
1622
|
+
/**
 * Derives image assets from oEmbed responses.
 *
 * Each response can contribute up to two images, in this order: its thumbnail (when
 * `thumbnail_url` is set) and, for responses of type "photo", the photo `url` itself.
 *
 * @param {object} rawSources - Raw sources; only `oEmbed.data` is read.
 * @returns {Array<object>} Image assets with source "oEmbed".
 */
function oEmbedImages(rawSources) {
  return rawSources.oEmbed.data.flatMap((response) => {
    const thumbnail = response.thumbnail_url
      ? [
          {
            url: response.thumbnail_url,
            kind: "image",
            source: "oEmbed",
            width: response.thumbnail_width,
            height: response.thumbnail_height
          }
        ]
      : [];
    const photo =
      response.type === "photo" && response.url
        ? [
            {
              url: response.url,
              kind: "image",
              source: "oEmbed",
              width: response.width,
              height: response.height
            }
          ]
        : [];
    return [...thumbnail, ...photo];
  });
}
|
|
1646
|
+
/**
 * Removes near-duplicate assets that share the same `mediaSignature2` of their URL.
 *
 * When several assets map to one signature, the one whose source has the strictly highest
 * `sourceRank` wins; on a tie the earliest asset is kept. Output order follows the first
 * appearance of each signature.
 *
 * @param {Array<object>} assets - Assets with `url` and `source`.
 * @returns {Array<object>} One asset per signature.
 */
function dedupeMediaBySignature(assets) {
  const bestBySignature = new Map();
  for (const candidate of assets) {
    const signature = mediaSignature2(candidate.url);
    const kept = bestBySignature.get(signature);
    const outranksKept = !kept || sourceRank(candidate.source) > sourceRank(kept.source);
    if (outranksKept) {
      // Map#set on an existing key keeps its original insertion position.
      bestBySignature.set(signature, candidate);
    }
  }
  return Array.from(bestBySignature.values());
}
|
|
1657
|
+
/**
 * Produces a comparison key for a media URL so size variants of one file collapse together.
 *
 * For parseable absolute URLs the key is the lower-cased hostname followed by the lower-cased
 * path, with the first size-style suffix placed right before a dot (for example `-300x200`,
 * `_640w`, `-thumb`) removed. Protocol, port, query string and fragment are not part of the
 * key. Strings that `URL` cannot parse are simply lower-cased.
 *
 * @param {string} url - Media URL.
 * @returns {string} Signature used for de-duplication.
 */
function mediaSignature2(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url.toLowerCase();
  }
  const sizeSuffix = /[-_](?:\d{2,5}x\d{2,5}|\d{2,5}w|small|medium|large|thumb|thumbnail)(?=\.)/i;
  const normalizedPath = parsed.pathname.replace(sizeSuffix, "").toLowerCase();
  return `${parsed.hostname.toLowerCase()}${normalizedPath}`;
}
|
|
1666
|
+
/**
 * Returns the priority of a media source label; higher values win during de-duplication.
 *
 * Only the labels listed in the table are ranked. Any other value, including `undefined` and
 * names that exist on `Object.prototype` such as "constructor" or "toString", gets the
 * neutral default of 50.
 *
 * @param {string} [source] - Source label of a media asset.
 * @returns {number} Rank between 30 and 90 for known labels, 50 otherwise.
 */
function sourceRank(source) {
  const ranks = {
    adapter: 90,
    openGraph: 80,
    twitter: 75,
    jsonLd: 70,
    oEmbed: 65,
    nextData: 64,
    nuxt: 62,
    initialState: 61,
    preloadedState: 60,
    apollo: 59,
    applicationJson: 58,
    jsonScript: 55,
    html: 40,
    fallback: 30
  };
  // Own-property check: a plain `ranks[source]` would also find inherited members
  // (e.g. `ranks["constructor"]`), returning a function instead of a number.
  return Object.prototype.hasOwnProperty.call(ranks, source) ? ranks[source] : 50;
}
|
|
1685
|
+
/**
 * Decides whether a URL should be excluded from media discovery.
 *
 * A URL is ignored when, compared case-insensitively, it
 *  - starts with `data:`, `blob:` or `javascript:`;
 *  - contains one of the decorative/tracking words (sprite, spacer, blank, transparent,
 *    placeholder, tracking, beacon, pixel, emoji) directly followed by `.`, `_`, `/`, `-`,
 *    `?` or the end of the string; or
 *  - contains a delimited `1x1` token.
 * The whole string is tested, so the host name takes part in the match as well.
 *
 * @param {string} url - URL to test.
 * @returns {boolean} True when the URL should be skipped.
 */
function shouldIgnoreMediaUrl2(url) {
  const lowered = url.toLowerCase();
  const inlineSchemes = ["data:", "blob:", "javascript:"];
  if (inlineSchemes.some((scheme) => lowered.startsWith(scheme))) {
    return true;
  }
  if (/(?:sprite|spacer|blank|transparent|placeholder|tracking|beacon|pixel|emoji)(?:[._/-]|$|\?)/i.test(lowered)) {
    return true;
  }
  return /(?:^|[/?_-])1x1(?:[._/-]|$|\?)/i.test(lowered);
}
|
|
1689
|
+
/**
 * Heuristically checks whether a string looks like a URL for media of the given kind.
 *
 * The value must not be rejected by `shouldIgnoreMediaUrl2`, and must start with `http://`,
 * `https://`, `/`, `./` or `../`. It then qualifies when it either ends in a known file
 * extension for the kind (optionally followed by a query or fragment) or contains one of the
 * kind's hint words anywhere in the string.
 *
 * @param {string} value - Candidate URL.
 * @param {string} kind - "image", "video" or "audio"; any other kind never matches.
 * @returns {boolean} True when the value is accepted as a media URL.
 */
function looksLikeMediaUrl(value, kind) {
  if (shouldIgnoreMediaUrl2(value)) {
    return false;
  }
  const isAbsoluteHttp = /^https?:\/\//i.test(value);
  const isAddressable = isAbsoluteHttp || ["/", "./", "../"].some((prefix) => value.startsWith(prefix));
  if (!isAddressable) {
    return false;
  }
  switch (kind) {
    case "image":
      return /\.(?:avif|webp|png|jpe?g|gif)(?:[?#].*)?$/i.test(value) || /(?:image|thumb|thumbnail|cover|poster|preview|media|og|card|photo)/i.test(value);
    case "video":
      return /\.(?:mp4|webm|m3u8|mov)(?:[?#].*)?$/i.test(value) || /(?:video|embed|player|watch|reel|shorts)/i.test(value);
    case "audio":
      return /\.(?:mp3|m4a|wav|ogg|aac)(?:[?#].*)?$/i.test(value) || /(?:audio|podcast)/i.test(value);
    default:
      return false;
  }
}
|
|
1706
|
+
/**
 * Depth-first traversal of a JSON-like value, calling `visit` for every value reached.
 *
 * Limits: recursion stops once `depth` exceeds 8, only the first 250 elements of an array and
 * the first 500 entries of an object are followed.
 *
 * Array elements are visited with `key` undefined and with the array's own `parent` (not the
 * array itself); object properties are visited with their key and the containing object.
 *
 * @param {unknown} value - Current value.
 * @param {(value: unknown, key: (string|undefined), parent: (object|undefined)) => void} visit - Callback.
 * @param {string} [key] - Property name under which `value` was found.
 * @param {object} [parent] - Object associated with `value` as described above.
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {void}
 */
function walkEmbeddedData(value, visit, key, parent, depth = 0) {
  if (depth > 8) {
    return;
  }
  visit(value, key, parent);
  const childDepth = depth + 1;
  if (Array.isArray(value)) {
    const elements = value.slice(0, 250);
    for (let index = 0; index < elements.length; index += 1) {
      walkEmbeddedData(elements[index], visit, void 0, parent, childDepth);
    }
    return;
  }
  if (!isRecord3(value)) {
    return;
  }
  Object.entries(value)
    .slice(0, 500)
    .forEach(([childKey, childValue]) => {
      walkEmbeddedData(childValue, visit, childKey, value, childDepth);
    });
}
|
|
1723
|
+
/**
 * Returns a shallow copy of an object without "empty" properties.
 *
 * A property is dropped when its value is `undefined`, `null` or an empty array. Other falsy
 * values (`0`, `""`, `false`) are kept.
 *
 * @param {object} value - Source object.
 * @returns {object} New object containing only the meaningful own enumerable properties.
 */
function stripUndefined(value) {
  const isMeaningful = ([, item]) => {
    if (item === void 0 || item === null) {
      return false;
    }
    return !(Array.isArray(item) && item.length === 0);
  };
  return Object.fromEntries(Object.entries(value).filter(isMeaningful));
}
|
|
1726
|
+
/**
 * Extracts a usable string from a loosely typed value.
 *
 *  - Strings are trimmed; blank strings yield `undefined`.
 *  - Finite numbers are converted with `String`.
 *  - Plain objects are searched, in order, under `url`, `src` and `name` (recursively).
 *  - Everything else yields `undefined`.
 *
 * @param {unknown} value - Value to convert.
 * @returns {string|undefined} The extracted string, if any.
 */
function stringFromUnknown(value) {
  if (typeof value === "string") {
    const trimmed = value.trim();
    return trimmed ? trimmed : void 0;
  }
  if (typeof value === "number") {
    return Number.isFinite(value) ? String(value) : void 0;
  }
  if (!isRecord3(value)) {
    return void 0;
  }
  for (const field of ["url", "src", "name"]) {
    const nested = stringFromUnknown(value[field]);
    if (nested !== void 0) {
      return nested;
    }
  }
  return void 0;
}
|
|
1738
|
+
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for objects other than `null` and arrays.
 */
function isRecord3(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
1741
|
+
|
|
1742
|
+
// src/engines/ConfidenceEngine.ts
|
|
1743
|
+
/**
 * Scores how trustworthy the extracted metadata is, as an integer from 0 to 100.
 *
 * Contributions, added in this order:
 *  - title quality (up to 18) and description quality (up to 16), via `qualityPoints`;
 *  - with a best image: 18, plus 12% of its score (score capped at 100), plus a bonus for
 *    the image's source via `sourceConfidenceBonus`;
 *  - flat bonuses: canonical URL 10, structured data 12, an adapter that produced content 8,
 *    Open Graph used 6, Twitter used 4, embedded application data present 6;
 *  - a penalty of 3 per warning, capped at 18.
 *
 * @param {object} input - Reads `title`, `description`, `bestImage`, `canonicalUrl`,
 *   `hasStructuredData`, `rawSources`, `sourcesUsed` and `warnings`.
 * @returns {number} Rounded score clamped to [0, 100].
 */
function calculateConfidence(input) {
  const { bestImage, rawSources, sourcesUsed } = input;
  let total = 0;
  total += qualityPoints(input.title, 18, 6, 120);
  total += qualityPoints(input.description, 16, 24, 300);
  if (bestImage) {
    total += 18;
    total += Math.min(bestImage.score ?? 0, 100) * 0.12;
    total += sourceConfidenceBonus(bestImage.source);
  }
  // Evaluated lazily and in order so the running sum matches a step-by-step accumulation.
  const flatBonuses = [
    [() => input.canonicalUrl, 10],
    [() => input.hasStructuredData, 12],
    [() => adapterSucceeded(rawSources.adapters), 8],
    [() => sourcesUsed.includes("openGraph"), 6],
    [() => sourcesUsed.includes("twitter"), 4],
    [() => rawSources.embeddedData.items.length > 0, 6]
  ];
  for (const [applies, points] of flatBonuses) {
    if (applies()) {
      total += points;
    }
  }
  total -= Math.min(input.warnings.length * 3, 18);
  return Math.round(clamp2(total, 0, 100));
}
|
|
1773
|
+
/**
 * Scores how many of the expected metadata fields were found, as an integer from 0 to 100.
 *
 * Weights: title 20, description 16, best image 20, canonical URL 12, site name 8, author 8,
 * publisher 5, a known type (anything truthy other than "unknown") 6, published time 3, and
 * more than one media item 2. The weights add up to 100.
 *
 * @param {object} input - Reads the fields listed above plus `mediaCount`.
 * @returns {number} Rounded score clamped to [0, 100].
 */
function calculateCompleteness(input) {
  const checks = [
    [input.title, 20],
    [input.description, 16],
    [input.bestImage, 20],
    [input.canonicalUrl, 12],
    [input.siteName, 8],
    [input.author, 8],
    [input.publisher, 5],
    [input.type && input.type !== "unknown", 6],
    [input.publishedTime, 3],
    [input.mediaCount > 1, 2]
  ];
  let total = 0;
  for (const [present, weight] of checks) {
    if (present) {
      total += weight;
    }
  }
  return Math.round(clamp2(total, 0, 100));
}
|
|
1788
|
+
/**
 * Combines confidence and completeness into an overall reliability score (0 to 100).
 *
 * Base: 45% of `confidence` plus 30% of `completeness`. Then +10 when an adapter matched,
 * +10 when the best image scored at least 80 (or +5 for any other best image), and a penalty
 * of 4 per warning, capped at 20.
 *
 * @param {object} input - Reads `confidence`, `completeness`, `adapterMatched`, `bestImage`
 *   and `warnings`.
 * @returns {number} Rounded score clamped to [0, 100].
 */
function calculateReliability(input) {
  const { bestImage } = input;
  let total = input.confidence * 0.45 + input.completeness * 0.3;
  if (input.adapterMatched) {
    total += 10;
  }
  const imageScore = bestImage?.score ?? 0;
  if (imageScore >= 80) {
    total += 10;
  } else if (bestImage) {
    total += 5;
  }
  const warningPenalty = Math.min(input.warnings.length * 4, 20);
  total -= warningPenalty;
  return Math.round(clamp2(total, 0, 100));
}
|
|
1801
|
+
/**
 * Awards points for a text field based on its length and content.
 *
 * Whitespace runs are collapsed and the text trimmed before measuring. Starting from
 * `maxPoints`, the score is scaled down proportionally when the text is shorter than
 * `idealMinLength`, scaled by `idealMaxLength / length` (never below 0.4) when it is longer
 * than `idealMaxLength`, and multiplied by 0.35 when the whole text is a generic placeholder
 * ("home", "untitled", "index", "login", "sign in", case-insensitive).
 *
 * @param {string} [value] - Text to rate.
 * @param {number} maxPoints - Points for an ideal value.
 * @param {number} idealMinLength - Shortest length that earns full points.
 * @param {number} idealMaxLength - Longest length that earns full points.
 * @returns {number} Possibly fractional points; 0 for missing or blank text.
 */
function qualityPoints(value, maxPoints, idealMinLength, idealMaxLength) {
  const text = value ? value.replace(/\s+/g, " ").trim() : "";
  if (!text) {
    return 0;
  }
  const length = text.length;
  let points = maxPoints;
  if (length < idealMinLength) {
    points = points * (length / idealMinLength);
  }
  if (length > idealMaxLength) {
    points = points * Math.max(0.4, idealMaxLength / length);
  }
  const isGenericPlaceholder = /^(home|untitled|index|login|sign in)$/i.test(text);
  return isGenericPlaceholder ? points * 0.35 : points;
}
|
|
1821
|
+
/**
 * Returns the confidence bonus granted for the source of the selected image.
 *
 * adapter 8; openGraph/twitter 7; jsonLd/oEmbed 6; embedded application data sources
 * (nextData, nuxt, initialState, preloadedState, apollo, applicationJson) 5; html 3;
 * anything else 1.
 *
 * @param {string} [source] - Source label of the image.
 * @returns {number} Bonus points.
 */
function sourceConfidenceBonus(source) {
  switch (source) {
    case "adapter":
      return 8;
    case "openGraph":
    case "twitter":
      return 7;
    case "jsonLd":
    case "oEmbed":
      return 6;
    case "nextData":
    case "nuxt":
    case "initialState":
    case "preloadedState":
    case "apollo":
    case "applicationJson":
      return 5;
    case "html":
      return 3;
    default:
      return 1;
  }
}
|
|
1839
|
+
/**
 * Reports whether at least one adapter result carries real content.
 *
 * A result counts when it has a truthy `title` or `description`, or a non-empty `images` or
 * `videos` list.
 *
 * @param {Array<object>} adapters - Adapter results.
 * @returns {boolean} True when any adapter produced content.
 */
function adapterSucceeded(adapters) {
  const producedContent = (adapter) => {
    if (adapter.title || adapter.description) {
      return true;
    }
    return Boolean(adapter.images?.length) || Boolean(adapter.videos?.length);
  };
  return adapters.some(producedContent);
}
|
|
1842
|
+
/**
 * Restricts a number to the inclusive range [min, max].
 *
 * @param {number} value - Number to clamp.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} `value` limited to the range; `NaN` stays `NaN`.
 */
function clamp2(value, min, max) {
  const cappedAtMax = Math.min(max, value);
  return Math.max(min, cappedAtMax);
}
|
|
1845
|
+
|
|
1846
|
+
// src/normalizers/normalize.ts
|
|
1847
|
+
/**
 * Merges the raw output of every extraction source into the final metadata result.
 *
 * Scalar fields (canonical URL, title, description, site name) are picked with `firstDefined`
 * from candidates listed in priority order: plugin/adapter results first, then Open Graph,
 * Twitter, JSON-LD, embedded application data and finally plain HTML values. Media comes from
 * `discoverMedia`, the best image from `selectBestImage`, and the typed sub-objects from the
 * `merge*` helpers. Confidence, completeness and reliability scores are computed from the
 * merged values.
 *
 * Side effect: when `context.diagnostics` is supplied, that same object is updated in place
 * (sources used, warnings, selected image reason, URLs, adapter info, trace) and returned as
 * `diagnostics`; otherwise a fresh diagnostics object is created.
 *
 * @param {object} rawSources - Raw extraction output; reads `plugins`, `adapters`, `jsonLd`,
 *   `embeddedData`, `html`, `openGraph` and `twitter`.
 * @param {object} [context] - Optional `url`, `finalUrl`, `imageScorers`, `includeRaw` and
 *   `diagnostics`.
 * @returns {object} Result object passed through `stripUndefined2`, with `ok: true`.
 */
function normalizeMetadata(rawSources, context = {}) {
  const finalUrl = context.finalUrl ?? context.url ?? "";
  const url = context.url ?? finalUrl;
  const { html, openGraph, twitter } = rawSources;
  const externalResults = [...rawSources.plugins, ...rawSources.adapters];
  const jsonLdNodes = rawSources.jsonLd.nodes;
  const embeddedNodes = rawSources.embeddedData.items.map(({ data }) => data);

  const articleNode = findJsonLdNode(jsonLdNodes, ["Article", "NewsArticle", "BlogPosting"]);
  const productNode = findJsonLdNode(jsonLdNodes, ["Product"]);
  const appNode = findJsonLdNode(jsonLdNodes, ["SoftwareApplication", "MobileApplication", "WebApplication"]);
  const organizationNode = findJsonLdNode(jsonLdNodes, ["Organization", "NewsMediaOrganization", "WebSite"]);

  // Small local helpers to keep the candidate lists below readable.
  const fromResults = (pick) => firstResultValue(externalResults, pick);
  const typedNodes = () => [articleNode, productNode, appNode, organizationNode].filter(isJsonLdNode);

  const canonicalCandidate = firstDefined(
    fromResults((result) => result.canonicalUrl),
    html.canonicalUrl,
    openGraph.url,
    jsonString(jsonLdNodes, "url"),
    findStringByKeys(embeddedNodes, ["canonicalUrl", "canonical", "permalink", "pageUrl", "shareUrl"])
  );
  const canonicalUrl = tryResolveUrl(canonicalCandidate, finalUrl);

  const title = firstDefined(
    fromResults((result) => result.title),
    openGraph.title,
    twitter.title,
    jsonString(typedNodes(), "headline"),
    jsonString(typedNodes(), "name"),
    findStringByKeys(embeddedNodes, ["title", "headline", "pageTitle", "ogTitle", "name"]),
    html.title
  );
  const description = firstDefined(
    fromResults((result) => result.description),
    openGraph.description,
    twitter.description,
    jsonString(typedNodes(), "description"),
    findStringByKeys(embeddedNodes, ["description", "excerpt", "summary", "subtitle", "ogDescription"]),
    html.description
  );
  const siteName = firstDefined(
    fromResults((result) => result.siteName),
    openGraph.siteName,
    jsonString([organizationNode].filter(isJsonLdNode), "name"),
    findStringByKeys(embeddedNodes, ["siteName", "site_name", "appName", "applicationName"]),
    domainName(finalUrl)
  );

  const mediaDiscovery = discoverMedia(rawSources, finalUrl);
  const { images, videos, audio } = mediaDiscovery;
  const favicons = normalizeAssets2(html.favicons, finalUrl);
  const selectedImage = selectBestImage(images, context.imageScorers);
  const bestImage = selectedImage.best;

  const article = mergeArticle(rawSources, externalResults, articleNode, embeddedNodes);
  const product = mergeProduct(rawSources, externalResults, productNode);
  const app = mergeApp(rawSources, externalResults, appNode);
  const video = mergeVideo(rawSources, externalResults, jsonLdNodes, embeddedNodes, videos);
  const playlist = mergePlaylist(externalResults);
  const type = inferType(rawSources, externalResults, jsonLdNodes, article, product, app, playlist, videos, audio);

  const author =
    fromResults((result) => result.author) ??
    firstEntity(article?.authors) ??
    entityFromEmbedded(embeddedNodes, ["author", "creator", "owner", "user"]);
  const publisher =
    article?.publisher ??
    fromResults((result) => result.publisher) ??
    entityFromJsonLd(organizationNode) ??
    entityFromEmbedded(embeddedNodes, ["publisher", "provider", "organization"]);

  const sourcesUsed = detectSourcesUsed(rawSources);
  const warnings = diagnosticsWarnings(rawSources, externalResults, context.diagnostics);

  const sourceOf = (field) => fieldSource(rawSources, externalResults, embeddedNodes, field, bestImage);
  const fieldSources = {
    title: sourceOf("title"),
    description: sourceOf("description"),
    author: sourceOf("author"),
    image: sourceOf("image")
  };

  const confidence = calculateConfidence({
    title,
    description,
    bestImage,
    canonicalUrl,
    hasStructuredData: jsonLdNodes.length > 0,
    rawSources,
    sourcesUsed,
    warnings
  });
  const completeness = calculateCompleteness({
    title,
    description,
    bestImage,
    canonicalUrl,
    siteName,
    author,
    publisher,
    type,
    publishedTime: article?.publishedTime,
    mediaCount: images.length + videos.length + audio.length
  });
  const reliability = calculateReliability({
    confidence,
    completeness,
    adapterMatched: rawSources.adapters.length > 0,
    bestImage,
    warnings
  });

  // Reuse the caller's diagnostics object when given; it is mutated below.
  const diagnostics = context.diagnostics ?? {
    redirects: [],
    sourcesUsed: [],
    warnings: [],
    trace: [],
    extractedAt: new Date().toISOString()
  };
  const externalWarnings = externalResults.flatMap((result) => result.warnings ?? []);
  diagnostics.sourcesUsed = uniqueStrings2([...diagnostics.sourcesUsed, ...sourcesUsed]);
  diagnostics.warnings = uniqueStrings2([...diagnostics.warnings, ...rawSources.jsonLd.warnings, ...externalWarnings]);
  diagnostics.selectedImageReason = selectedImage.reason;
  diagnostics.originalUrl = diagnostics.originalUrl ?? url;
  diagnostics.finalUrl = diagnostics.finalUrl ?? finalUrl;
  diagnostics.canonicalUrl = canonicalUrl;
  diagnostics.adapter = adapterDiagnostics(rawSources.adapters);
  const imageTrace = bestImage ? [`selected image from ${sourceLabel2(bestImage)}`] : [];
  diagnostics.trace = uniqueStrings2([...diagnostics.trace, ...mediaDiscovery.trace, ...imageTrace]);

  return stripUndefined2({
    ok: true,
    url,
    finalUrl,
    type,
    title,
    description,
    siteName,
    canonicalUrl,
    confidence,
    completeness,
    reliability,
    bestImage: bestImage?.url,
    images: selectedImage.images,
    videos,
    audio,
    favicons,
    article,
    product,
    video,
    playlist,
    author,
    publisher,
    app,
    sources: fieldSources,
    raw: context.includeRaw ? rawSources : void 0,
    diagnostics,
    trace: diagnostics.trace
  });
}
|
|
1991
|
+
/**
 * Resolves the URL fields of each asset against `baseUrl` and drops assets without a URL.
 *
 * `secureUrl`, `url` (preferring the resolved `secureUrl` when it is non-nullish) and `poster`
 * are passed through `tryResolveUrl`. Unlike the media-discovery variant, no ignore-list
 * filtering is applied here; only a falsy resolved `url` removes an asset.
 *
 * @param {Array<object>} assets - Asset candidates (used for favicons in this file).
 * @param {string} baseUrl - Base URL handed to `tryResolveUrl`.
 * @returns {Array<object>} Copies of the surviving assets, each passed through `stripUndefined2`.
 */
function normalizeAssets2(assets, baseUrl) {
  return assets.flatMap((asset) => {
    const secureUrl = tryResolveUrl(asset.secureUrl, baseUrl);
    const url = tryResolveUrl(secureUrl ?? asset.url, baseUrl);
    const poster = tryResolveUrl(asset.poster, baseUrl);
    // An empty array makes flatMap drop the entry; a one-element array keeps it.
    return url ? [stripUndefined2({ ...asset, url, secureUrl, poster })] : [];
  });
}
|
|
2007
|
+
/**
 * Builds the merged `article` object from Open Graph article data, plugin/adapter results,
 * the JSON-LD article node and embedded application data.
 *
 * The object starts as a spread of the Open Graph article data overlaid with the merged
 * plugin/adapter article data. The explicitly listed fields are then recomputed with
 * `firstDefined`, which is given plugin/adapter values first.
 *
 * `tags` is the de-duplicated union of plugin/adapter tags, Open Graph tags and JSON-LD
 * keywords. Previously the explicit `tags` key was built from the last two only, which
 * overwrote (and lost) any tags supplied by a plugin or adapter.
 *
 * @param {object} rawSources - Raw sources; `openGraph.article` and `openGraph.title` are read.
 * @param {Array<object>} externalResults - Plugin and adapter results.
 * @param {object} [articleNode] - JSON-LD node describing the article, if any.
 * @param {Array<unknown>} embeddedNodes - Data payloads of embedded application data items.
 * @returns {object|undefined} Whatever `emptyToUndefined` returns for the merged object.
 */
function mergeArticle(rawSources, externalResults, articleNode, embeddedNodes) {
  const embeddedAuthor = entityFromEmbedded(embeddedNodes, ["author", "creator", "owner", "user"]);
  const embeddedPublisher = entityFromEmbedded(embeddedNodes, ["publisher", "provider", "organization"]);
  const externalTags = firstResultValue(externalResults, (result) => result.article?.tags);
  const article = {
    ...rawSources.openGraph.article,
    ...mergePartialObjects(externalResults.map((result) => result.article)),
    headline: firstDefined(
      firstResultValue(externalResults, (result) => result.article?.headline),
      jsonString([articleNode].filter(isJsonLdNode), "headline"),
      findStringByKeys(embeddedNodes, ["headline", "title", "postTitle", "pinTitle", "projectTitle"]),
      rawSources.openGraph.article?.headline,
      rawSources.openGraph.title
    ),
    section: firstDefined(
      firstResultValue(externalResults, (result) => result.article?.section),
      rawSources.openGraph.article?.section,
      jsonString([articleNode].filter(isJsonLdNode), "articleSection")
    ),
    publishedTime: firstDefined(
      firstResultValue(externalResults, (result) => result.article?.publishedTime),
      rawSources.openGraph.article?.publishedTime,
      jsonString([articleNode].filter(isJsonLdNode), "datePublished"),
      findStringByKeys(embeddedNodes, ["datePublished", "publishedTime", "published_at", "createdAt", "created_at", "timestamp"])
    ),
    modifiedTime: firstDefined(
      firstResultValue(externalResults, (result) => result.article?.modifiedTime),
      rawSources.openGraph.article?.modifiedTime,
      jsonString([articleNode].filter(isJsonLdNode), "dateModified")
    ),
    expirationTime: firstDefined(
      firstResultValue(externalResults, (result) => result.article?.expirationTime),
      rawSources.openGraph.article?.expirationTime
    ),
    tags: uniqueStrings2([
      // This key replaces the spread value above, so external tags must be re-added here.
      ...(Array.isArray(externalTags) ? externalTags : []),
      ...rawSources.openGraph.article?.tags ?? [],
      ...jsonKeywords(articleNode)
    ]),
    authors: firstDefined(
      firstResultValue(externalResults, (result) => result.article?.authors),
      rawSources.openGraph.article?.authors,
      entitiesFromJsonLd(articleNode?.author),
      embeddedAuthor ? [embeddedAuthor] : void 0
    ),
    publisher: firstDefined(
      firstResultValue(externalResults, (result) => result.article?.publisher),
      rawSources.openGraph.article?.publisher,
      entityFromJsonLd(articleNode?.publisher),
      embeddedPublisher
    )
  };
  return emptyToUndefined(article);
}
|
|
2059
|
+
/**
 * Builds the merged `product` object from plugin/adapter results and the JSON-LD product node.
 *
 * The object starts as the merged plugin/adapter product data. Each listed field is then
 * recomputed with `firstDefined`, given the plugin/adapter value first and the JSON-LD value
 * second. Price, currency, availability and condition come from the node's first `offers`
 * record; rating value and review count from its first `aggregateRating` record.
 *
 * @param {object} _rawSources - Unused.
 * @param {Array<object>} externalResults - Plugin and adapter results.
 * @param {object} [productNode] - JSON-LD node describing the product, if any.
 * @returns {object|undefined} Whatever `emptyToUndefined` returns for the merged object.
 */
function mergeProduct(_rawSources, externalResults, productNode) {
  const offers = firstRecord(productNode?.offers);
  const aggregateRating = firstRecord(productNode?.aggregateRating);

  const fromResults = (pick) => firstResultValue(externalResults, pick);
  const productText = (key) => jsonString([productNode].filter(isJsonLdNode), key);
  const offerText = (key) => jsonString([offers].filter(isJsonLdNode), key);
  const ratingNumber = (key) => parseNumber(jsonString([aggregateRating].filter(isJsonLdNode), key));

  const product = {
    ...mergePartialObjects(externalResults.map((result) => result.product)),
    name: firstDefined(fromResults((result) => result.product?.name), productText("name")),
    brand: firstDefined(fromResults((result) => result.product?.brand), entityFromJsonLd(productNode?.brand)),
    sku: firstDefined(fromResults((result) => result.product?.sku), productText("sku")),
    price: firstDefined(fromResults((result) => result.product?.price), offerText("price")),
    currency: firstDefined(fromResults((result) => result.product?.currency), offerText("priceCurrency")),
    availability: firstDefined(
      fromResults((result) => result.product?.availability),
      shortSchemaValue(offerText("availability"))
    ),
    condition: firstDefined(
      fromResults((result) => result.product?.condition),
      shortSchemaValue(offerText("itemCondition"))
    ),
    ratingValue: firstDefined(fromResults((result) => result.product?.ratingValue), ratingNumber("ratingValue")),
    reviewCount: firstDefined(fromResults((result) => result.product?.reviewCount), ratingNumber("reviewCount"))
  };
  return emptyToUndefined(product);
}
|
|
2091
|
+
/**
 * Builds the merged `app` object from plugin/adapter results, the HTML application name and
 * the JSON-LD application node.
 *
 * The object starts as the merged plugin/adapter app data. Each listed field is then
 * recomputed with `firstDefined`, given the plugin/adapter value first; `name` additionally
 * considers `rawSources.html.applicationName` before the JSON-LD name. Price and currency
 * come from the node's first `offers` record.
 *
 * @param {object} rawSources - Raw sources; only `html.applicationName` is read.
 * @param {Array<object>} externalResults - Plugin and adapter results.
 * @param {object} [appNode] - JSON-LD node describing the application, if any.
 * @returns {object|undefined} Whatever `emptyToUndefined` returns for the merged object.
 */
function mergeApp(rawSources, externalResults, appNode) {
  const offers = firstRecord(appNode?.offers);

  const fromResults = (pick) => firstResultValue(externalResults, pick);
  const appText = (key) => jsonString([appNode].filter(isJsonLdNode), key);
  const offerText = (key) => jsonString([offers].filter(isJsonLdNode), key);

  const app = {
    ...mergePartialObjects(externalResults.map((result) => result.app)),
    name: firstDefined(
      fromResults((result) => result.app?.name),
      rawSources.html.applicationName,
      appText("name")
    ),
    category: firstDefined(fromResults((result) => result.app?.category), appText("applicationCategory")),
    operatingSystem: firstDefined(fromResults((result) => result.app?.operatingSystem), appText("operatingSystem")),
    price: firstDefined(fromResults((result) => result.app?.price), offerText("price")),
    currency: firstDefined(fromResults((result) => result.app?.currency), offerText("priceCurrency"))
  };
  return emptyToUndefined(app);
}
|
|
2110
|
+
/**
 * Builds the merged `video` object, or returns `undefined` when the page does not appear to
 * describe a video.
 *
 * Embedded/Open Graph data is trusted when no plugin or adapter declared an explicit type,
 * when that type is "video" or "playlist", when the Open Graph type mentions "video", or when
 * a JSON-LD `VideoObject` node exists. If nothing external supplied video data and the
 * embedded data is not trusted, the function returns early.
 *
 * After merging, the result is kept only if plugin/adapter video data exists, or if there is
 * some video evidence: an id, duration, channel or published time, any discovered video
 * asset, Open Graph/Twitter video entries, or a `VideoObject` node.
 *
 * @param {object} rawSources - Raw sources; `openGraph` and `twitter` are read.
 * @param {Array<object>} externalResults - Plugin and adapter results.
 * @param {Array<object>} jsonLdNodes - All JSON-LD nodes.
 * @param {Array<unknown>} embeddedNodes - Data payloads of embedded application data items.
 * @param {Array<object>} videos - Video assets already discovered for the page.
 * @returns {object|undefined} The merged video object, if it should be reported.
 */
function mergeVideo(rawSources, externalResults, jsonLdNodes, embeddedNodes, videos) {
  const externalVideo = mergePartialObjects(externalResults.map((result) => result.video));
  const hasExternalVideo = Object.keys(externalVideo).length > 0;
  const explicitExternalType = firstResultValue(externalResults, (result) => result.type);
  const videoNode = findJsonLdNode(jsonLdNodes, ["VideoObject"]);
  const openGraphType = rawSources.openGraph.type?.toLowerCase() ?? "";
  const canTrustEmbeddedVideo =
    !explicitExternalType ||
    ["video", "playlist"].includes(explicitExternalType) ||
    openGraphType.includes("video") ||
    Boolean(videoNode);
  if (!hasExternalVideo && !canTrustEmbeddedVideo) {
    return void 0;
  }

  // Past the guard above at least one operand is true, so this flag is always true here.
  const canUseEmbeddedFallbacks = hasExternalVideo || canTrustEmbeddedVideo;
  const fromResults = (pick) => firstResultValue(externalResults, pick);
  const videoNodeText = (key) => jsonString([videoNode].filter(isJsonLdNode), key);
  const whenAllowed = (produce) => (canUseEmbeddedFallbacks ? produce() : void 0);

  const video = {
    ...externalVideo,
    id: firstDefined(
      fromResults((result) => result.video?.id),
      whenAllowed(() => findStringByKeys([videoNode, ...embeddedNodes].filter(isJsonLdNode), ["videoId", "video_id"]))
    ),
    title: firstDefined(
      fromResults((result) => result.video?.title),
      videoNodeText("name"),
      whenAllowed(() => rawSources.openGraph.title),
      whenAllowed(() => findStringByKeys(embeddedNodes, ["videoTitle", "title"]))
    ),
    channel: firstDefined(
      fromResults((result) => result.video?.channel),
      entityFromEmbedded(embeddedNodes, ["channel", "ownerChannelName", "author"])
    ),
    publishedTime: firstDefined(
      fromResults((result) => result.video?.publishedTime),
      videoNodeText("uploadDate"),
      whenAllowed(() => findStringByKeys(embeddedNodes, ["publishDate", "publishedTime", "uploadDate", "datePublished"]))
    ),
    duration: firstDefined(
      fromResults((result) => result.video?.duration),
      videoNodeText("duration"),
      whenAllowed(() => findStringByKeys(embeddedNodes, ["duration", "lengthSeconds"]))
    ),
    category: firstDefined(
      fromResults((result) => result.video?.category),
      whenAllowed(() => findStringByKeys(embeddedNodes, ["category"]))
    ),
    viewCount: firstDefined(
      fromResults((result) => result.video?.viewCount),
      whenAllowed(() => parseNumber(findStringByKeys(embeddedNodes, ["viewCount", "views"])))
    ),
    tags: firstDefined(
      fromResults((result) => result.video?.tags),
      whenAllowed(() => arrayOfStrings(findValueByKeys(embeddedNodes, ["tags", "keywords"])))
    )
  };

  const cleaned = emptyToUndefined(video);
  const hasUsefulVideoIdentity =
    Boolean(cleaned?.id) ||
    Boolean(cleaned?.duration) ||
    Boolean(cleaned?.channel) ||
    Boolean(cleaned?.publishedTime) ||
    videos.length > 0 ||
    rawSources.openGraph.videos.length > 0 ||
    rawSources.twitter.videos.length > 0 ||
    Boolean(videoNode);
  const shouldReport = Boolean(cleaned) && (hasExternalVideo || hasUsefulVideoIdentity);
  return shouldReport ? cleaned : void 0;
}
|
|
2158
|
+
/**
 * Returns the playlist reported by the first external result that has one.
 *
 * The result is a shallow copy that always carries a `videos` array.
 *
 * @param {Array<object>} externalResults - Adapter/plugin results, scanned in order.
 * @returns {object|undefined} The playlist copy, or `undefined` when no result has a playlist.
 */
function mergePlaylist(externalResults) {
  const playlist = firstResultValue(externalResults, (result) => result.playlist);
  if (!playlist) {
    return void 0;
  }
  // Spreading `playlist` over a `videos: []` default lets an explicit
  // `videos: undefined` (or null) overwrite the default, so the fallback is
  // resolved explicitly. `videos` stays the first key, as before.
  const { videos, ...rest } = playlist;
  return {
    videos: videos ?? [],
    ...rest
  };
}
|
|
2168
|
+
/**
 * Decides the content type label for a page.
 *
 * Precedence: an explicit `type` from an external result, then a merged
 * playlist, then the first matching rule in the order article, product, image,
 * video, audio, app, profile. Each rule combines the lower-cased Open Graph
 * type, the merged objects passed in, and JSON-LD `@type` values. If nothing
 * matches, the result is "website" when an `og:type` tag or an HTML title
 * exists and "unknown" otherwise.
 *
 * @returns {string} The inferred type label.
 */
function inferType(rawSources, externalResults, jsonLdNodes, article, product, app, playlist, videos, audio) {
  const declared = firstResultValue(externalResults, (result) => result.type);
  if (declared) {
    return declared;
  }
  if (playlist) {
    return "playlist";
  }
  const ogType = rawSources.openGraph.type?.toLowerCase();
  const ogMentions = (word) => Boolean(ogType?.includes(word));
  const jsonLdHas = (...names) => hasJsonLdType(jsonLdNodes, names);
  // Ordered rules; predicates are lazy so later checks only run when needed.
  const rules = [
    ["article", () => ogMentions("article") || Boolean(article)],
    ["product", () => ogMentions("product") || Boolean(product) || jsonLdHas("Product")],
    ["image", () => ogMentions("image") || jsonLdHas("ImageObject")],
    ["video", () => ogMentions("video") || videos.length > 0 || jsonLdHas("VideoObject")],
    ["audio", () => ogMentions("audio") || audio.length > 0 || jsonLdHas("AudioObject", "MusicRecording", "PodcastEpisode")],
    ["app", () => Boolean(app) || jsonLdHas("SoftwareApplication", "MobileApplication", "WebApplication")],
    ["profile", () => ogMentions("profile") || jsonLdHas("Person")]
  ];
  const matched = rules.find(([, applies]) => applies());
  if (matched) {
    return matched[0];
  }
  const looksLikePage = rawSources.openGraph.raw["og:type"] || rawSources.html.title;
  return looksLikePage ? "website" : "unknown";
}
|
|
2200
|
+
/**
 * Lists the labels of every metadata source that contributed data.
 *
 * Built-in sources are reported in a fixed order (openGraph, twitter, jsonLd,
 * embeddedData plus each embedded item's own source, oEmbed, html, media),
 * followed by the `source` of every adapter and plugin result.
 *
 * @param {object} rawSources - Raw extraction results grouped by source.
 * @returns {string[]} Unique, non-empty source labels in first-seen order.
 */
function detectSourcesUsed(rawSources) {
  const sources = [];
  if (Object.keys(rawSources.openGraph.raw).length > 0) {
    sources.push("openGraph");
  }
  if (Object.keys(rawSources.twitter.raw).length > 0) {
    sources.push("twitter");
  }
  if (rawSources.jsonLd.nodes.length > 0) {
    sources.push("jsonLd");
  }
  if (rawSources.embeddedData.items.length > 0) {
    sources.push("embeddedData", ...rawSources.embeddedData.items.map((item) => item.source));
  }
  if (rawSources.oEmbed.links.length > 0 || rawSources.oEmbed.data.length > 0) {
    sources.push("oEmbed");
  }
  if (rawSources.html.title || rawSources.html.description || rawSources.html.canonicalUrl) {
    sources.push("html");
  }
  if (rawSources.images.length > 0 || rawSources.videos.length > 0 || rawSources.audio.length > 0) {
    sources.push("media");
  }
  sources.push(
    ...rawSources.adapters.map((result) => result.source),
    ...rawSources.plugins.map((result) => result.source)
  );
  // Several embedded items (or results) can share one label, and a result may
  // lack a `source`; report each real label once, in first-seen order.
  return [...new Set(sources.filter(Boolean))];
}
|
|
2226
|
+
/**
 * Gathers warnings from the optional diagnostics object, the JSON-LD parser
 * and every external result, then passes them through `uniqueStrings2`.
 *
 * @param {object} rawSources - Provides `jsonLd.warnings`.
 * @param {Array<object>} externalResults - Results that may carry `warnings`.
 * @param {object} [diagnostics] - Optional object with a `warnings` list.
 * @returns {string[]} The combined warnings as returned by `uniqueStrings2`.
 */
function diagnosticsWarnings(rawSources, externalResults, diagnostics) {
  const fromDiagnostics = diagnostics?.warnings ?? [];
  const fromJsonLd = rawSources.jsonLd.warnings;
  const fromExternal = externalResults.flatMap((result) => result.warnings ?? []);
  return uniqueStrings2([...fromDiagnostics, ...fromJsonLd, ...fromExternal]);
}
|
|
2233
|
+
/**
 * Summarises how well the first adapter result matched.
 *
 * Confidence starts at 55 and gains 15 for a title, 10 for a description,
 * 15 for any image or video, and 5 for an author; it is capped at 100.
 *
 * @param {Array<object>} adapters - Adapter results; only the first is inspected.
 * @returns {{matched: boolean, name?: string, confidence?: number}}
 */
function adapterDiagnostics(adapters) {
  const primary = adapters[0];
  if (!primary) {
    return { matched: false };
  }
  const hasMedia = (primary.images?.length ?? 0) > 0 || (primary.videos?.length ?? 0) > 0;
  const bonuses = [
    [primary.title, 15],
    [primary.description, 10],
    [hasMedia, 15],
    [primary.author, 5]
  ];
  const score = bonuses.reduce((total, [present, points]) => (present ? total + points : total), 55);
  return {
    matched: true,
    name: primary.source,
    confidence: Math.min(score, 100)
  };
}
|
|
2257
|
+
/**
 * Reports which source label is credited for a given output field.
 *
 * "image" is labelled from `bestImage`. For "title", "description" and
 * "author" the first external result with a truthy value for that field wins;
 * otherwise built-in sources are probed in a fixed per-field order.
 *
 * @param {object} rawSources - Raw extraction results grouped by source.
 * @param {Array<object>} externalResults - Adapter/plugin results, in priority order.
 * @param {Array<unknown>} embeddedNodes - Embedded data roots to search.
 * @param {string} field - Field name being attributed.
 * @param {object} [bestImage] - The selected image asset, if any.
 * @returns {string|undefined} Source label, or `undefined` when none applies.
 */
function fieldSource(rawSources, externalResults, embeddedNodes, field, bestImage) {
  if (field === "image") {
    return bestImage ? sourceLabel2(bestImage) : void 0;
  }
  const tracked = field === "title" || field === "description" || field === "author";
  for (const result of externalResults) {
    if (tracked && result[field]) {
      return result.source;
    }
  }
  const { openGraph, twitter, html } = rawSources;
  switch (field) {
    case "title":
      if (openGraph.title) return "openGraph";
      if (twitter.title) return "twitter";
      if (findStringByKeys(embeddedNodes, ["title", "headline", "name"])) return "embeddedData";
      if (html.title) return "html";
      break;
    case "description":
      if (openGraph.description) return "openGraph";
      if (twitter.description) return "twitter";
      if (findStringByKeys(embeddedNodes, ["description", "summary", "excerpt"])) return "embeddedData";
      if (html.description) return "html";
      break;
    case "author":
      if (entityFromEmbedded(embeddedNodes, ["author", "creator", "owner", "user"])) return "embeddedData";
      if (openGraph.article?.authors?.length) return "openGraph";
      break;
    default:
      break;
  }
  return void 0;
}
|
|
2290
|
+
/**
 * Returns the first node whose `@type` matches one of `types`, as decided by
 * `hasJsonLdType`, or `undefined` when none matches.
 */
function findJsonLdNode(nodes, types) {
  for (const node of nodes) {
    if (hasJsonLdType([node], types)) {
      return node;
    }
  }
  return void 0;
}
|
|
2293
|
+
/**
 * Tests whether any node declares one of the wanted JSON-LD types.
 *
 * `@type` may be a single value or an array. Only string entries count, and a
 * type matches when it ends with a candidate name, compared case-insensitively
 * (so "https://schema.org/Person" matches "Person").
 *
 * @param {Array<object>} nodes - JSON-LD nodes.
 * @param {string[]} types - Candidate type names.
 * @returns {boolean}
 */
function hasJsonLdType(nodes, types) {
  const isWanted = (type) => {
    if (typeof type !== "string") {
      return false;
    }
    const lowered = type.toLowerCase();
    return types.some((candidate) => lowered.endsWith(candidate.toLowerCase()));
  };
  return nodes.some((node) => {
    const declared = node["@type"];
    return Array.isArray(declared) ? declared.some(isWanted) : isWanted(declared);
  });
}
|
|
2299
|
+
/**
 * Returns the first non-empty string form of `node[key]` across `nodes`,
 * using `stringFromUnknown2` for the conversion; `undefined` if none is found.
 */
function jsonString(nodes, key) {
  for (const node of nodes) {
    const text = stringFromUnknown2(node[key]);
    if (!text) {
      continue;
    }
    return text;
  }
  return void 0;
}
|
|
2308
|
+
/**
 * Searches nested data for string values stored under any of `keys`
 * (case-insensitive, exact key match) and returns the highest-scoring one.
 *
 * Candidates shorter than two characters or made only of digits are discarded,
 * so purely numeric values are never returned by this function. Ranking uses
 * `scoreTextCandidate`; on equal scores the earliest candidate wins.
 *
 * @param {Array<unknown>} nodes - Roots to walk with `walkJson`.
 * @param {string[]} keys - Property names to look for.
 * @returns {string|undefined}
 */
function findStringByKeys(nodes, keys) {
  const wanted = new Set(keys.map((key) => key.toLowerCase()));
  const texts = [];
  const collect = (value, key) => {
    if (key && wanted.has(key.toLowerCase())) {
      const text = stringFromUnknown2(value);
      if (text) {
        texts.push(text);
      }
    }
  };
  for (const node of nodes) {
    walkJson(node, collect);
  }
  let best;
  let bestScore = -Infinity;
  for (const text of texts) {
    if (text.length < 2 || /^\d+$/.test(text)) {
      continue;
    }
    const score = scoreTextCandidate(text);
    // Strict comparison keeps the earliest candidate on ties.
    if (score > bestScore) {
      best = text;
      bestScore = score;
    }
  }
  return best;
}
|
|
2324
|
+
/**
 * Returns the first value (in `walkJson` traversal order) stored under any of
 * `keys`, matched case-insensitively. Values equal to `undefined` are skipped.
 *
 * @param {Array<unknown>} nodes - Roots to search, in order.
 * @param {string[]} keys - Property names to look for.
 * @returns {unknown} The raw value, or `undefined` when nothing matches.
 */
function findValueByKeys(nodes, keys) {
  const wanted = keys.map((key) => key.toLowerCase());
  for (const node of nodes) {
    let match;
    walkJson(node, (value, key) => {
      if (match === void 0 && key && wanted.includes(key.toLowerCase())) {
        match = value;
      }
    });
    if (match !== void 0) {
      return match;
    }
  }
  return void 0;
}
|
|
2340
|
+
/**
 * Finds the first named entity in embedded data.
 *
 * A property is considered when its lower-cased key contains any of the
 * lower-cased `keys` as a substring. Its value is converted with
 * `entityFromJsonLd`, and the first result that has a `name` is returned.
 *
 * @param {Array<unknown>} nodes - Roots to walk with `walkJson`.
 * @param {string[]} keys - Key fragments to look for.
 * @returns {object|undefined}
 */
function entityFromEmbedded(nodes, keys) {
  const needles = keys.map((key) => key.toLowerCase());
  let firstNamed;
  const inspect = (value, key) => {
    if (firstNamed !== void 0 || !key) {
      return;
    }
    const lowered = key.toLowerCase();
    if (!needles.some((needle) => lowered.includes(needle))) {
      return;
    }
    const entity = entityFromJsonLd(value);
    if (entity?.name) {
      firstNamed = entity;
    }
  };
  for (const node of nodes) {
    walkJson(node, inspect);
  }
  return firstNamed;
}
|
|
2356
|
+
/**
 * Depth-first, pre-order walk over JSON-like data.
 *
 * `visit(value, key)` is called for every visited value; `key` is the owning
 * property name and is `undefined` for array items. Traversal stops below
 * depth 8, reads at most the first 250 items of an array and the first 500
 * entries of an object.
 *
 * @param {unknown} value - Current value.
 * @param {(value: unknown, key?: string) => void} visit - Visitor callback.
 * @param {string} [key] - Property name under which `value` was found.
 * @param {number} [depth=0] - Current nesting depth.
 */
function walkJson(value, visit, key, depth = 0) {
  if (depth > 8) {
    return;
  }
  visit(value, key);
  const childDepth = depth + 1;
  if (Array.isArray(value)) {
    const limit = Math.min(value.length, 250);
    for (let index = 0; index < limit; index += 1) {
      walkJson(value[index], visit, void 0, childDepth);
    }
  } else if (isJsonLdNode(value)) {
    const entries = Object.entries(value);
    const limit = Math.min(entries.length, 500);
    for (let index = 0; index < limit; index += 1) {
      const [childKey, childValue] = entries[index];
      walkJson(childValue, visit, childKey, childDepth);
    }
  }
}
|
|
2373
|
+
/**
 * Scores a text candidate: its length (capped at 160), plus 50 when the length
 * is between 12 and 120 inclusive, minus 80 when the whole text is one of the
 * placeholder words home/login/index/untitled (case-insensitive).
 *
 * @param {string} value - Candidate text.
 * @returns {number}
 */
function scoreTextCandidate(value) {
  const length = value.length;
  const base = Math.min(length, 160);
  const bonus = length >= 12 && length <= 120 ? 50 : 0;
  const penalty = /^(home|login|index|untitled)$/i.test(value) ? 80 : 0;
  return base + bonus - penalty;
}
|
|
2383
|
+
/**
 * Extracts keywords from a node's `keywords` property.
 *
 * Arrays are converted item by item with `stringFromUnknown2`; any other value
 * is converted to a string and split on commas. Empty entries are dropped.
 *
 * @param {object} [node] - Node that may carry `keywords`.
 * @returns {string[]} Keywords, or an empty array.
 */
function jsonKeywords(node) {
  const raw = node?.keywords;
  if (!raw) {
    return [];
  }
  if (Array.isArray(raw)) {
    const listed = [];
    for (const entry of raw) {
      const text = stringFromUnknown2(entry);
      if (text) {
        listed.push(text);
      }
    }
    return listed;
  }
  const joined = stringFromUnknown2(raw);
  if (!joined) {
    return [];
  }
  return joined.split(",").map((part) => part.trim()).filter(Boolean);
}
|
|
2394
|
+
/**
 * Converts a JSON-LD value into an entity object.
 *
 * When `firstRecord` finds an object, its `name`, `url`, `logo` and `sameAs`
 * are normalised and the result is passed through `emptyToUndefined`.
 * Otherwise the value itself is converted to a string and used as the name.
 *
 * @param {unknown} value - Object, array, or scalar.
 * @returns {object|undefined}
 */
function entityFromJsonLd(value) {
  const record = firstRecord(value);
  if (record) {
    return emptyToUndefined({
      name: stringFromUnknown2(record.name),
      url: stringFromUnknown2(record.url),
      logo: stringFromUnknown2(record.logo),
      sameAs: arrayOfStrings(record.sameAs)
    });
  }
  const label = stringFromUnknown2(value);
  if (label) {
    return { name: label };
  }
  return void 0;
}
|
|
2407
|
+
/**
 * Converts a single value or an array of values into a list of entities via
 * `entityFromJsonLd`, dropping values that yield nothing.
 *
 * @param {unknown} value - One entity value or an array of them.
 * @returns {object[]|undefined} A non-empty list, or `undefined`.
 */
function entitiesFromJsonLd(value) {
  const inputs = Array.isArray(value) ? value : [value];
  const converted = [];
  for (const input of inputs) {
    const entity = entityFromJsonLd(input);
    if (entity) {
      converted.push(entity);
    }
  }
  if (converted.length === 0) {
    return void 0;
  }
  return converted;
}
|
|
2415
|
+
/**
 * Returns `value` when it is a plain (non-array) object, the first such object
 * when `value` is an array, and `undefined` otherwise.
 */
function firstRecord(value) {
  if (!Array.isArray(value)) {
    return isJsonLdNode(value) ? value : void 0;
  }
  for (const item of value) {
    if (isJsonLdNode(item)) {
      return item;
    }
  }
  return void 0;
}
|
|
2421
|
+
/**
 * Applies `select` to each result in order and returns the first value that is
 * not `undefined` (so `null`, `0` and `""` count as found).
 *
 * @param {Iterable<object>} results - Results to scan.
 * @param {(result: object) => unknown} select - Picks a value from a result.
 * @returns {unknown} The first defined selection, or `undefined`.
 */
function firstResultValue(results, select) {
  for (const entry of results) {
    const picked = select(entry);
    if (picked === void 0) {
      continue;
    }
    return picked;
  }
  return void 0;
}
|
|
2430
|
+
/**
 * Returns the first element of `entities`, or `undefined` when the list is
 * null/undefined or empty.
 */
function firstEntity(entities) {
  if (entities === null || entities === void 0) {
    return void 0;
  }
  return entities[0];
}
|
|
2433
|
+
/**
 * Shallow-merges the truthy entries of `objects` into a new object; later
 * entries overwrite earlier ones. The inputs are not modified.
 *
 * @param {Array<object|null|undefined>} objects - Partial objects to combine.
 * @returns {object}
 */
function mergePartialObjects(objects) {
  const merged = {};
  for (const part of objects) {
    if (part) {
      Object.assign(merged, part);
    }
  }
  return merged;
}
|
|
2436
|
+
/**
 * Best-effort conversion of an arbitrary JSON value to a non-empty string.
 *
 * - strings are trimmed (blank strings yield `undefined`);
 * - finite numbers are stringified;
 * - arrays yield the first element that converts;
 * - plain objects yield their `name`, then `url`, then `@id`.
 *
 * @param {unknown} value - Value to convert.
 * @returns {string|undefined}
 */
function stringFromUnknown2(value) {
  if (typeof value === "string") {
    const trimmed = value.trim();
    return trimmed ? trimmed : void 0;
  }
  if (typeof value === "number") {
    return Number.isFinite(value) ? String(value) : void 0;
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      const text = stringFromUnknown2(item);
      if (text) {
        return text;
      }
    }
    return void 0;
  }
  if (!isJsonLdNode(value)) {
    return void 0;
  }
  for (const field of ["name", "url", "@id"]) {
    const text = stringFromUnknown2(value[field]);
    // Mirrors a `??` chain: only null/undefined fall through to the next field.
    if (text !== void 0 && text !== null) {
      return text;
    }
  }
  return void 0;
}
|
|
2451
|
+
/**
 * Normalises a value (or array of values) into a non-empty array of strings
 * using `stringFromUnknown2`; entries that do not convert are dropped.
 *
 * @param {unknown} value - Single value or array.
 * @returns {string[]|undefined} Strings, or `undefined` when none remain.
 */
function arrayOfStrings(value) {
  const inputs = Array.isArray(value) ? value : [value];
  const strings = [];
  for (const input of inputs) {
    const text = stringFromUnknown2(input);
    if (text) {
      strings.push(text);
    }
  }
  if (strings.length === 0) {
    return void 0;
  }
  return strings;
}
|
|
2459
|
+
/**
 * Returns the last non-empty "/"-separated segment of `value`
 * (e.g. "https://schema.org/InStock" gives "InStock").
 *
 * @param {string} [value] - Possibly URL-shaped value.
 * @returns {string|undefined} The last segment; `undefined` for nullish input
 *   or when there is no non-empty segment.
 */
function shortSchemaValue(value) {
  if (value === null || value === void 0) {
    return void 0;
  }
  const segments = value.split("/").filter((segment) => segment !== "");
  return segments[segments.length - 1];
}
|
|
2462
|
+
/**
 * Extracts the hostname from a URL string, without a leading "www.".
 *
 * @param {string} url - Absolute URL.
 * @returns {string|undefined} Hostname, or `undefined` when `url` cannot be parsed.
 */
function domainName(url) {
  let host;
  try {
    host = new URL(url).hostname;
  } catch {
    return void 0;
  }
  return host.startsWith("www.") ? host.slice("www.".length) : host;
}
|
|
2469
|
+
/**
 * Removes falsy entries and duplicates from `values`, keeping first-seen order.
 *
 * @param {Array<string|undefined|null>} values - Input list.
 * @returns {string[]}
 */
function uniqueStrings2(values) {
  const unique = /* @__PURE__ */ new Set();
  for (const value of values) {
    if (value) {
      unique.add(value);
    }
  }
  return Array.from(unique);
}
|
|
2472
|
+
/**
 * Cleans an object with `stripUndefined2` and returns the cleaned copy, or
 * `undefined` when no properties remain.
 */
function emptyToUndefined(value) {
  const cleaned = stripUndefined2(value);
  if (Object.keys(cleaned).length === 0) {
    return void 0;
  }
  return cleaned;
}
|
|
2476
|
+
/**
 * Returns a shallow copy of `value` without properties whose value is
 * `undefined`, `null`, or an empty array. Other falsy values are kept.
 *
 * @param {object} value - Source object.
 * @returns {object}
 */
function stripUndefined2(value) {
  const kept = [];
  for (const [key, item] of Object.entries(value)) {
    const isEmptyArray = Array.isArray(item) && item.length === 0;
    if (item === void 0 || item === null || isEmptyArray) {
      continue;
    }
    kept.push([key, item]);
  }
  return Object.fromEntries(kept);
}
|
|
2479
|
+
/**
 * Type guard: true for any non-null object that is not an array.
 */
function isJsonLdNode(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
2482
|
+
/**
 * Builds a human-readable source label for an image asset.
 *
 * Non-adapter images report their `source` unchanged. Adapter images report
 * the adapter name from `metadata.adapter` (falling back to "adapter"),
 * followed by the original source in parentheses when one is recorded.
 *
 * @param {object} image - Image asset with `source` and optional `metadata`.
 * @returns {string}
 */
function sourceLabel2(image) {
  if (image.source !== "adapter") {
    return image.source;
  }
  const details = image.metadata;
  const adapterName = typeof details?.adapter === "string" ? details.adapter : "adapter";
  const origin = typeof details?.originalSource === "string" ? details.originalSource : void 0;
  if (!origin) {
    return adapterName;
  }
  return `${adapterName} (${origin})`;
}
|
|
2490
|
+
|
|
2491
|
+
// src/adapters/siteAdapters.ts
|
|
2492
|
+
/**
 * Site adapter for YouTube URLs (youtube.com, youtu.be, youtube-nocookie.com).
 *
 * `extract` derives video, playlist and community-post identifiers from the
 * final URL and combines them with metadata already collected in
 * `context.raw`; `normalize` delegates to `normalizePlatformResult`.
 */
var youtubeAdapter = {
  name: "youtubeAdapter",
  /** Returns whether `url` belongs to a YouTube host, as decided by `hostMatches`. */
  detect(url) {
    return hostMatches(url, ["youtube.com", "youtu.be", "youtube-nocookie.com"]);
  },
  /** Same verdict as `detect`; `false` when no `detect` is available on `this`. */
  canHandle(url) {
    return this.detect?.(url) ?? false;
  },
  /**
   * Builds the adapter result for a fetched YouTube page.
   *
   * The type is "playlist" when a playlist id is present, "social_post" for
   * community posts, and "video" otherwise.
   *
   * @param {object} context - Extraction context (`finalUrl`, `raw`).
   * @returns {object} Result produced by `compactAdapterResult`.
   */
  extract(context) {
    const url = new URL(context.finalUrl);
    const videoId = getYouTubeVideoId(url);
    const playlistId = getYouTubePlaylistId(url);
    const communityPostId = getYouTubeCommunityPostId(url);
    const channel = entityFromContext(context, ["author", "ownerChannelName", "channel", "owner"]);
    const playlistVideos = playlistId ? extractPlaylistVideos(context) : [];
    // Media discovery and the publish-time lookup each feed two fields below;
    // compute them once instead of once per field.
    const media = mediaFromContext(context);
    const publishedTime = publishedTimeFromContext(context);
    return compactAdapterResult({
      source: "youtubeAdapter",
      platform: "YouTube",
      type: playlistId ? "playlist" : communityPostId ? "social_post" : "video",
      siteName: "YouTube",
      canonicalUrl: videoId ? `https://www.youtube.com/watch?v=${videoId}` : context.raw.openGraph.url,
      title: titleFromContext(context, ["videoDetails", "title", "headline", "name", "contentText"]),
      description: descriptionFromContext(context),
      videos: markAdapterMedia(media.videos, "youtubeAdapter"),
      images: markAdapterMedia(media.images, "youtubeAdapter"),
      author: channel,
      article: { publishedTime },
      video: videoId ? {
        id: videoId,
        title: titleFromContext(context, ["videoDetails", "title"]),
        channel,
        publishedTime,
        duration: findEmbeddedString(context, ["duration", "lengthSeconds", "approxDurationMs"]),
        tags: arrayFromContext(context, ["tags", "keywords"]),
        category: findEmbeddedString(context, ["category"]),
        viewCount: numberFromContext(context, ["viewCount", "views"])
      } : void 0,
      playlist: playlistId ? {
        id: playlistId,
        title: findEmbeddedString(context, ["playlistTitle", "playlistName", "title"]) ?? context.raw.openGraph.title,
        channel,
        videos: playlistVideos
      } : void 0,
      identifiers: { videoId, playlistId, communityPostId }
    });
  },
  /** Normalises a raw adapter result via `normalizePlatformResult`. */
  normalize(rawData) {
    return normalizePlatformResult(rawData);
  }
};
|
|
2542
|
+
/**
 * Site adapter for Reddit URLs (reddit.com, redd.it).
 *
 * `extract` parses the final URL with `parseRedditUrl` and combines the
 * subreddit/post/user identifiers with metadata from `context.raw`.
 */
var redditAdapter = {
  name: "redditAdapter",
  /** Returns whether `url` belongs to a Reddit host, as decided by `hostMatches`. */
  detect(url) {
    return hostMatches(url, ["reddit.com", "redd.it"]);
  },
  /** Same verdict as `detect`; `false` when no `detect` is available on `this`. */
  canHandle(url) {
    return this.detect?.(url) ?? false;
  },
  /**
   * Builds the adapter result for a fetched Reddit page.
   *
   * @param {object} context - Extraction context (`finalUrl`, `raw`).
   * @returns {object} Result produced by `compactAdapterResult`.
   */
  extract(context) {
    const url = new URL(context.finalUrl);
    const reddit = parseRedditUrl(url);
    const username = typeof reddit.username === "string" ? reddit.username : void 0;
    // Discover media once; both `images` and `videos` read from the same result.
    const media = mediaFromContext(context);
    return compactAdapterResult({
      source: "redditAdapter",
      platform: "Reddit",
      type: reddit.isPost ? "social_post" : "website",
      siteName: "Reddit",
      canonicalUrl: context.raw.openGraph.url,
      title: cleanSocialTitle(titleFromContext(context, ["title", "postTitle", "headline"])),
      description: descriptionFromContext(context),
      images: markAdapterMedia(media.images, "redditAdapter"),
      videos: markAdapterMedia(media.videos, "redditAdapter"),
      // A username parsed from the URL takes precedence over page metadata.
      author: username ? { name: username } : entityFromContext(context, ["author", "submitter", "user"]),
      article: { publishedTime: publishedTimeFromContext(context) },
      identifiers: { subreddit: reddit.subreddit, postId: reddit.postId, username: reddit.username },
      raw: { ...reddit }
    });
  },
  /** Normalises a raw adapter result via `normalizePlatformResult`. */
  normalize(rawData) {
    return normalizePlatformResult(rawData);
  }
};
|
|
2574
|
+
/**
 * Site adapter for Pinterest URLs (pinterest.com, pin.it).
 *
 * `extract` reads the pin id from a `/pin/<id>` path and combines it with
 * metadata from `context.raw`.
 */
var pinterestAdapter = {
  name: "pinterestAdapter",
  /** Returns whether `url` belongs to a Pinterest host, as decided by `hostMatches`. */
  detect(url) {
    return hostMatches(url, ["pinterest.com", "pin.it"]);
  },
  /** Same verdict as `detect`; `false` when no `detect` is available on `this`. */
  canHandle(url) {
    return this.detect?.(url) ?? false;
  },
  /**
   * Builds the adapter result for a fetched Pinterest page.
   *
   * The type is "social_post" for pin URLs and pin.it links, "image" otherwise.
   *
   * @param {object} context - Extraction context (`finalUrl`, `raw`).
   * @returns {object} Result produced by `compactAdapterResult`.
   */
  extract(context) {
    const url = new URL(context.finalUrl);
    const pinId = url.pathname.match(/\/pin\/([^/]+)/)?.[1];
    // Discover media once; both `images` and `videos` read from the same result.
    const media = mediaFromContext(context);
    return compactAdapterResult({
      source: "pinterestAdapter",
      platform: "Pinterest",
      type: pinId || hostMatches(url, ["pin.it"]) ? "social_post" : "image",
      siteName: "Pinterest",
      canonicalUrl: context.raw.openGraph.url,
      title: titleFromContext(context, ["title", "pinTitle", "gridTitle", "headline", "name"]),
      description: descriptionFromContext(context),
      images: markAdapterMedia(media.images, "pinterestAdapter"),
      videos: markAdapterMedia(media.videos, "pinterestAdapter"),
      author: entityFromContext(context, ["pinner", "author", "creator", "owner", "user"]),
      article: { publishedTime: publishedTimeFromContext(context) },
      identifiers: { pinId }
    });
  },
  /** Normalises a raw adapter result via `normalizePlatformResult`. */
  normalize(rawData) {
    return normalizePlatformResult(rawData);
  }
};
|
|
2604
|
+
/**
 * Site adapter for Behance URLs (behance.net).
 *
 * `extract` reads a numeric project id from a `/gallery/<id>` path and
 * combines it with metadata from `context.raw`.
 */
var behanceAdapter = {
  name: "behanceAdapter",
  /** Returns whether `url` belongs to a Behance host, as decided by `hostMatches`. */
  detect(url) {
    return hostMatches(url, ["behance.net"]);
  },
  /** Same verdict as `detect`; `false` when no `detect` is available on `this`. */
  canHandle(url) {
    return this.detect?.(url) ?? false;
  },
  /**
   * Builds the adapter result for a fetched Behance page.
   *
   * The type is "image" for gallery project URLs and "website" otherwise.
   *
   * @param {object} context - Extraction context (`finalUrl`, `raw`).
   * @returns {object} Result produced by `compactAdapterResult`.
   */
  extract(context) {
    const url = new URL(context.finalUrl);
    const projectId = url.pathname.match(/\/gallery\/(\d+)/)?.[1];
    // Discover media once; both `images` and `videos` read from the same result.
    const media = mediaFromContext(context);
    return compactAdapterResult({
      source: "behanceAdapter",
      platform: "Behance",
      type: projectId ? "image" : "website",
      siteName: "Behance",
      canonicalUrl: context.raw.openGraph.url,
      title: titleFromContext(context, ["title", "projectTitle", "name", "headline"]),
      description: descriptionFromContext(context),
      images: markAdapterMedia(media.images, "behanceAdapter"),
      videos: markAdapterMedia(media.videos, "behanceAdapter"),
      author: entityFromContext(context, ["owners", "author", "creator", "user"]),
      article: { publishedTime: publishedTimeFromContext(context) },
      raw: { projectId }
    });
  },
  /** Normalises a raw adapter result via `normalizePlatformResult`. */
  normalize(rawData) {
    return normalizePlatformResult(rawData);
  }
};
|
|
2634
|
+
/**
 * Site adapter for TikTok URLs (tiktok.com). Extraction is delegated to the
 * shared `socialVideoResult` helper.
 */
var tiktokAdapter = {
  name: "tiktokAdapter",
  /** Returns whether `url` belongs to a TikTok host, as decided by `hostMatches`. */
  detect(url) {
    return hostMatches(url, ["tiktok.com"]);
  },
  /** Same verdict as `detect`; `false` when no `detect` is available on `this`. */
  canHandle(url) {
    const verdict = this.detect?.(url);
    return verdict ?? false;
  },
  /** Builds the adapter result through `socialVideoResult`. */
  extract(context) {
    return socialVideoResult("tiktokAdapter", "TikTok", context);
  },
  /** Normalises a raw adapter result via `normalizePlatformResult`. */
  normalize(rawData) {
    return normalizePlatformResult(rawData);
  }
};
|
|
2649
|
+
/**
 * Site adapter for Facebook URLs (facebook.com, fb.watch).
 *
 * `extract` classifies the URL as a photo, a post or a generic page from its
 * path and query string, then combines that with metadata from `context.raw`.
 */
var facebookAdapter = {
  name: "facebookAdapter",
  /** Returns whether `url` belongs to a Facebook host, as decided by `hostMatches`. */
  detect(url) {
    return hostMatches(url, ["facebook.com", "fb.watch"]);
  },
  /** Same verdict as `detect`; `false` when no `detect` is available on `this`. */
  canHandle(url) {
    return this.detect?.(url) ?? false;
  },
  /**
   * Builds the adapter result for a fetched Facebook page.
   *
   * Type precedence: "image" for photo URLs; "social_post" for post-like paths
   * or when any media was discovered; otherwise "website".
   *
   * @param {object} context - Extraction context (`finalUrl`, `raw`).
   * @returns {object} Result produced by `compactAdapterResult`.
   */
  extract(context) {
    const url = new URL(context.finalUrl);
    const { pathname, searchParams } = url;
    const isPost = /\/(?:posts|photo|videos|watch|reel|share)\//i.test(pathname) || searchParams.has("story_fbid");
    const isPhoto = pathname.includes("photo.php") || searchParams.has("fbid");
    const postId = searchParams.get("story_fbid") ?? pathname.match(/\/(?:posts|videos|reel)\/([^/]+)/)?.[1];
    const media = mediaFromContext(context);
    let type;
    if (isPhoto) {
      type = "image";
    } else if (isPost || media.images.length > 0 || media.videos.length > 0) {
      type = "social_post";
    } else {
      type = "website";
    }
    return compactAdapterResult({
      source: "facebookAdapter",
      platform: "Facebook",
      type,
      siteName: "Facebook",
      title: titleFromContext(context, ["title", "headline", "name"]),
      description: descriptionFromContext(context),
      images: markAdapterMedia(media.images, "facebookAdapter"),
      videos: markAdapterMedia(media.videos, "facebookAdapter"),
      author: entityFromContext(context, ["author", "owner", "profile", "user"]),
      article: { publishedTime: publishedTimeFromContext(context) },
      identifiers: { postId }
    });
  },
  /** Normalises a raw adapter result via `normalizePlatformResult`. */
  normalize(rawData) {
    return normalizePlatformResult(rawData);
  }
};
|
|
2681
|
+
/**
 * Site adapter for X/Twitter URLs (twitter.com, x.com, t.co).
 *
 * `extract` reads a numeric status id from `/status/<id>` or `/statuses/<id>`
 * and combines it with metadata from `context.raw`.
 */
var twitterAdapter = {
  name: "twitterAdapter",
  /** Returns whether `url` belongs to an X/Twitter host, as decided by `hostMatches`. */
  detect(url) {
    return hostMatches(url, ["twitter.com", "x.com", "t.co"]);
  },
  /** Same verdict as `detect`; `false` when no `detect` is available on `this`. */
  canHandle(url) {
    return this.detect?.(url) ?? false;
  },
  /**
   * Builds the adapter result for a fetched X/Twitter page.
   *
   * The type is "social_post" for status URLs and t.co links, "profile" otherwise.
   *
   * @param {object} context - Extraction context (`finalUrl`, `raw`).
   * @returns {object} Result produced by `compactAdapterResult`.
   */
  extract(context) {
    const url = new URL(context.finalUrl);
    const statusId = url.pathname.match(/\/status(?:es)?\/(\d+)/)?.[1];
    // Discover media once; both `images` and `videos` read from the same result.
    const media = mediaFromContext(context);
    return compactAdapterResult({
      source: "twitterAdapter",
      platform: "X",
      type: statusId || hostMatches(url, ["t.co"]) ? "social_post" : "profile",
      siteName: "X",
      canonicalUrl: context.raw.openGraph.url,
      title: titleFromContext(context, ["title", "full_text", "text", "headline"]),
      description: descriptionFromContext(context),
      images: markAdapterMedia(media.images, "twitterAdapter"),
      videos: markAdapterMedia(media.videos, "twitterAdapter"),
      author: entityFromContext(context, ["author", "user", "screen_name", "creator"]),
      article: { publishedTime: publishedTimeFromContext(context) },
      identifiers: { statusId }
    });
  },
  /** Normalises a raw adapter result via `normalizePlatformResult`. */
  normalize(rawData) {
    return normalizePlatformResult(rawData);
  }
};
|
|
2711
|
+
/**
 * Site adapter for Instagram URLs (instagram.com).
 *
 * `extract` reads the shortcode from `/p/`, `/reel/` or `/tv/` paths and
 * combines it with metadata from `context.raw`.
 */
var instagramAdapter = {
  name: "instagramAdapter",
  /** Returns whether `url` belongs to an Instagram host, as decided by `hostMatches`. */
  detect(url) {
    return hostMatches(url, ["instagram.com"]);
  },
  /** Same verdict as `detect`; `false` when no `detect` is available on `this`. */
  canHandle(url) {
    return this.detect?.(url) ?? false;
  },
  /**
   * Builds the adapter result for a fetched Instagram page.
   *
   * The type is "social_post" when a shortcode is present, "profile" otherwise.
   *
   * @param {object} context - Extraction context (`finalUrl`, `raw`).
   * @returns {object} Result produced by `compactAdapterResult`.
   */
  extract(context) {
    const url = new URL(context.finalUrl);
    const shortcode = url.pathname.match(/\/(?:p|reel|tv)\/([^/]+)/)?.[1];
    // Discover media once; both `images` and `videos` read from the same result.
    const media = mediaFromContext(context);
    return compactAdapterResult({
      source: "instagramAdapter",
      platform: "Instagram",
      type: shortcode ? "social_post" : "profile",
      siteName: "Instagram",
      canonicalUrl: context.raw.openGraph.url,
      title: titleFromContext(context, ["title", "caption", "edge_media_to_caption", "headline"]),
      description: descriptionFromContext(context),
      images: markAdapterMedia(media.images, "instagramAdapter"),
      videos: markAdapterMedia(media.videos, "instagramAdapter"),
      author: entityFromContext(context, ["owner", "author", "user", "username"]),
      article: { publishedTime: publishedTimeFromContext(context) },
      identifiers: { shortcode }
    });
  },
  /** Normalises a raw adapter result via `normalizePlatformResult`. */
  normalize(rawData) {
    return normalizePlatformResult(rawData);
  }
};
|
|
2741
|
+
// Built-in site adapters, listed in this fixed order.
var defaultAdapters = [
  youtubeAdapter,
  redditAdapter,
  pinterestAdapter,
  behanceAdapter,
  tiktokAdapter,
  facebookAdapter,
  twitterAdapter,
  instagramAdapter
];
|
|
2751
|
+
/**
 * Shared extractor for short-video social platforms.
 *
 * Derives the username from an `@handle` path segment and the post id from a
 * `/video/<id>` or `/photo/<id>` segment, falling back to the last path
 * segment when neither is present.
 *
 * @param {string} source - Adapter name recorded on the result and its media.
 * @param {string} platform - Platform label, also used as the site name.
 * @param {object} context - Extraction context (`finalUrl`, `raw`).
 * @returns {object} Result produced by `compactAdapterResult`.
 */
function socialVideoResult(source, platform, context) {
  const url = new URL(context.finalUrl);
  const username = url.pathname.match(/@([^/]+)/)?.[1];
  const lastSegment = url.pathname.split("/").filter(Boolean).at(-1);
  // On profile URLs ("/@handle") the last segment is the handle itself, which
  // must not be reported as a post id.
  const fallbackPostId = lastSegment?.startsWith("@") ? void 0 : lastSegment;
  const postId = url.pathname.match(/\/(?:video|photo)\/([^/]+)/)?.[1] ?? fallbackPostId;
  // Discover media once; both `images` and `videos` read from the same result.
  const media = mediaFromContext(context);
  return compactAdapterResult({
    source,
    platform,
    type: "social_post",
    siteName: platform,
    canonicalUrl: context.raw.openGraph.url,
    title: titleFromContext(context, ["title", "desc", "description", "caption"]),
    description: descriptionFromContext(context),
    images: markAdapterMedia(media.images, source),
    videos: markAdapterMedia(media.videos, source),
    // A handle parsed from the URL takes precedence over page metadata.
    author: username ? { name: username } : entityFromContext(context, ["author", "user", "creator", "owner"]),
    article: { publishedTime: publishedTimeFromContext(context) },
    identifiers: { username, postId }
  });
}
|
|
2770
|
+
/**
 * Normalises a raw adapter result: fills in `type` via `inferAdapterType` when
 * it is missing and mirrors `platform` and `identifiers` into `raw`.
 *
 * @param {object} rawData - Raw adapter result.
 * @returns {object} Result produced by `compactAdapterResult`.
 */
function normalizePlatformResult(rawData) {
  const type = rawData.type ?? inferAdapterType(rawData);
  const raw = {
    ...rawData.raw ?? {},
    platform: rawData.platform,
    identifiers: rawData.identifiers
  };
  const normalized = { ...rawData, type, raw };
  return compactAdapterResult(normalized);
}
|
|
2782
|
+
/**
 * Guesses a type from the media on a raw adapter result: "video" when it has
 * videos, else "image" when it has images, else "website".
 */
function inferAdapterType(rawData) {
  const count = (list) => list?.length ?? 0;
  if (count(rawData.videos) > 0) {
    return "video";
  }
  return count(rawData.images) > 0 ? "image" : "website";
}
|
|
2791
|
+
/**
 * De-duplicates media assets by `url` (first occurrence wins) and re-labels
 * them as adapter-provided. Each returned asset is a copy with
 * `source: "adapter"`, and its metadata records the adapter name and the
 * asset's original source.
 *
 * @param {Array<object>} assets - Discovered media assets.
 * @param {string} adapterName - Name stored in `metadata.adapter`.
 * @returns {Array<object>} New asset objects; inputs are not modified.
 */
function markAdapterMedia(assets, adapterName) {
  const seenUrls = /* @__PURE__ */ new Set();
  const marked = [];
  for (const asset of assets) {
    if (seenUrls.has(asset.url)) {
      continue;
    }
    seenUrls.add(asset.url);
    marked.push({
      ...asset,
      source: "adapter",
      metadata: {
        ...asset.metadata,
        adapter: adapterName,
        originalSource: asset.source
      }
    });
  }
  return marked;
}
|
|
2809
|
+
/**
 * Picks a title from the Open Graph title, the Twitter title, an embedded-data
 * string found under `embeddedKeys`, and the HTML title. Selection among the
 * available candidates is delegated to `firstText`.
 *
 * @param {object} context - Extraction context with `raw` sources.
 * @param {string[]} embeddedKeys - Embedded-data keys to search.
 * @returns {string|undefined}
 */
function titleFromContext(context, embeddedKeys) {
  const { raw } = context;
  const ogTitle = raw.openGraph.title;
  const twitterTitle = raw.twitter.title;
  const embeddedTitle = findEmbeddedString(context, embeddedKeys);
  const htmlTitle = raw.html.title;
  return firstText(ogTitle, twitterTitle, embeddedTitle, htmlTitle);
}
|
|
2817
|
+
/**
 * Picks a description from the Open Graph description, the Twitter
 * description, an embedded-data string under common description keys, and the
 * HTML description. Selection among candidates is delegated to `firstText`.
 *
 * @param {object} context - Extraction context with `raw` sources.
 * @returns {string|undefined}
 */
function descriptionFromContext(context) {
  const { raw } = context;
  const ogDescription = raw.openGraph.description;
  const twitterDescription = raw.twitter.description;
  const embeddedDescription = findEmbeddedString(context, ["description", "desc", "summary", "excerpt", "caption", "text"]);
  const htmlDescription = raw.html.description;
  return firstText(ogDescription, twitterDescription, embeddedDescription, htmlDescription);
}
|
|
2825
|
+
/**
 * Picks a publish time from the Open Graph article data, JSON-LD date
 * properties, and embedded-data date properties. Selection among candidates is
 * delegated to `firstText`.
 *
 * @param {object} context - Extraction context with `raw` sources.
 * @returns {string|undefined}
 */
function publishedTimeFromContext(context) {
  const { raw } = context;
  const fromOpenGraph = raw.openGraph.article?.publishedTime;
  const fromJsonLd = findJsonLdString(raw.jsonLd.nodes, ["datePublished", "uploadDate", "createdAt"]);
  const fromEmbedded = findEmbeddedString(context, ["datePublished", "publishedTime", "published_at", "createdAt", "created_at", "uploadDate", "timestamp"]);
  return firstText(fromOpenGraph, fromJsonLd, fromEmbedded);
}
|
|
2832
|
+
/**
 * Runs `discoverMedia` for the context and returns only its images and videos.
 *
 * @param {object} context - Extraction context (`raw`, `finalUrl`).
 * @returns {{images: Array<object>, videos: Array<object>}}
 */
function mediaFromContext(context) {
  const { images, videos } = discoverMedia(context.raw, context.finalUrl);
  return { images, videos };
}
|
|
2839
|
+
/**
 * Finds an entity (author, channel, owner, ...) for the page.
 *
 * JSON-LD nodes are consulted first; if they yield nothing, each embedded data
 * item is searched in order and the first entity found is returned.
 *
 * @param {object} context - Extraction context with `raw` sources.
 * @param {string[]} keys - Property names to look for.
 * @returns {object|undefined}
 */
function entityFromContext(context, keys) {
  const fromJsonLd = entityFromJsonValue(findJsonLdValue(context.raw.jsonLd.nodes, keys));
  if (fromJsonLd) {
    return fromJsonLd;
  }
  for (const { data } of context.raw.embeddedData.items) {
    const fromEmbedded = entityFromJsonValue(findValueByKeys2(data, keys));
    if (fromEmbedded) {
      return fromEmbedded;
    }
  }
  return void 0;
}
|
|
2852
|
+
/**
 * Collects every string found under a key accepted by `matchesKey` across all
 * embedded data items and returns the one chosen by `bestTextCandidate`.
 *
 * @param {object} context - Extraction context with `raw.embeddedData.items`.
 * @param {string[]} keys - Keys handed to `matchesKey`.
 * @returns {string|undefined}
 */
function findEmbeddedString(context, keys) {
  const texts = [];
  const collect = (value, key) => {
    if (key && matchesKey(key, keys)) {
      const text = stringFromUnknown3(value);
      if (text) {
        texts.push(text);
      }
    }
  };
  for (const { data } of context.raw.embeddedData.items) {
    walkData(data, collect);
  }
  return bestTextCandidate(texts);
}
|
|
2867
|
+
/**
 * Looks up the first JSON-LD value stored under any of `keys` and converts it
 * to a string with `stringFromUnknown3`.
 */
function findJsonLdString(nodes, keys) {
  return stringFromUnknown3(findJsonLdValue(nodes, keys));
}
|
|
2871
|
+
/**
 * Returns the first defined value that `findValueByKeys2` finds in any of the
 * nodes, searching the nodes in order.
 *
 * @param {Array<unknown>} nodes - JSON-LD nodes.
 * @param {string[]} keys - Property names to look for.
 * @returns {unknown} The value, or `undefined` when no node has one.
 */
function findJsonLdValue(nodes, keys) {
  for (const node of nodes) {
    const hit = findValueByKeys2(node, keys);
    if (hit === void 0) {
      continue;
    }
    return hit;
  }
  return void 0;
}
|
|
2880
|
+
/**
 * Walks `node` with `walkData` and returns the first defined value whose key
 * is accepted by `matchesKey`.
 *
 * @param {unknown} node - Root value to search.
 * @param {string[]} keys - Keys handed to `matchesKey`.
 * @returns {unknown} The raw value, or `undefined`.
 */
function findValueByKeys2(node, keys) {
  let match;
  const inspect = (value, key) => {
    // Once a match is recorded, later visits are ignored.
    if (match === void 0 && key && matchesKey(key, keys)) {
      match = value;
    }
  };
  walkData(node, inspect);
  return match;
}
|
|
2890
|
+
/**
 * Depth-first traversal that calls `visit(value, key)` for the root and every
 * nested value. Recursion stops below depth 8; at most the first 250 array
 * elements and the first 500 object entries of each container are followed.
 *
 * @param {*} value - Current value.
 * @param {(value: *, key: (string|undefined)) => void} visit - Visitor callback.
 * @param {string} [key] - Property name under which `value` was found (array elements get `undefined`).
 * @param {number} [depth=0] - Current nesting depth.
 * @returns {void}
 */
function walkData(value, visit, key, depth = 0) {
  if (depth > 8) {
    return;
  }
  visit(value, key);
  const childDepth = depth + 1;
  if (Array.isArray(value)) {
    const limit = Math.min(value.length, 250);
    for (let index = 0; index < limit; index += 1) {
      walkData(value[index], visit, void 0, childDepth);
    }
    return;
  }
  if (isRecord4(value)) {
    const entries = Object.entries(value);
    const limit = Math.min(entries.length, 500);
    for (let index = 0; index < limit; index += 1) {
      const [childKey, childValue] = entries[index];
      walkData(childValue, visit, childKey, childDepth);
    }
  }
}
|
|
2907
|
+
/**
 * Converts a loosely typed JSON value into a `{ name, url, logo }` entity.
 *
 * - strings become `{ name }`;
 * - arrays yield the first element that converts to an entity;
 * - objects take their name from `name`, `username`, `screen_name`,
 *   `ownerChannelName` or `title` (first that has text) and are rejected when none does.
 *
 * @param {*} value - Raw value.
 * @returns {object|undefined}
 */
function entityFromJsonValue(value) {
  if (!value) {
    return void 0;
  }
  if (typeof value === "string") {
    return { name: value };
  }
  if (Array.isArray(value)) {
    for (const element of value) {
      const entity = entityFromJsonValue(element);
      if (entity) {
        return entity;
      }
    }
    return void 0;
  }
  if (!isRecord4(value)) {
    return void 0;
  }
  // Returns the text of the first listed property that yields a non-nullish string.
  const firstTextOf = (...properties) => {
    for (const property of properties) {
      const text = stringFromUnknown3(value[property]);
      if (text !== void 0 && text !== null) {
        return text;
      }
    }
    return void 0;
  };
  const name = firstTextOf("name", "username", "screen_name", "ownerChannelName", "title");
  if (!name) {
    return void 0;
  }
  const url = firstTextOf("url", "canonicalUrl");
  const logo = firstTextOf("logo", "avatar", "image");
  return { name, url, logo };
}
|
|
2930
|
+
/**
 * Drops falsy arguments and returns the candidate `bestTextCandidate` ranks highest.
 *
 * @param {...*} values - Candidate texts.
 * @returns {string|undefined}
 */
function firstText(...values) {
  const present = values.filter(Boolean);
  return bestTextCandidate(present);
}
|
|
2933
|
+
/**
 * Collapses whitespace in every candidate, discards empty ones and returns the
 * candidate with the highest `scoreText` value (the earliest wins on ties).
 *
 * @param {string[]} values - Candidate texts.
 * @returns {string|undefined} `undefined` when no non-empty candidate exists.
 */
function bestTextCandidate(values) {
  let winner;
  let winnerScore;
  for (const raw of values) {
    const text = raw.replace(/\s+/g, " ").trim();
    if (text.length === 0) {
      continue;
    }
    const score = scoreText(text);
    // Strict comparison keeps the first of several equally scored candidates.
    if (winner === void 0 || score > winnerScore) {
      winner = text;
      winnerScore = score;
    }
  }
  return winner;
}
|
|
2936
|
+
/**
 * Heuristic quality score for a text candidate.
 *
 * The base score is the length capped at 180; lengths of 10-140 characters earn
 * +60, and the exact placeholders "home", "login", "index" and "untitled"
 * (case-insensitive) lose 100.
 *
 * @param {string} value - Candidate text.
 * @returns {number}
 */
function scoreText(value) {
  const { length } = value;
  const base = Math.min(length, 180);
  const lengthBonus = length >= 10 && length <= 140 ? 60 : 0;
  const placeholderPenalty = /^(home|login|index|untitled)$/i.test(value) ? 100 : 0;
  return base + lengthBonus - placeholderPenalty;
}
|
|
2946
|
+
/**
 * Best-effort conversion of an arbitrary JSON value to text.
 *
 * - non-blank strings are trimmed;
 * - finite numbers are stringified;
 * - arrays concatenate (without separator) the text of their elements;
 * - objects are probed through `text`, `simpleText`, `runs`, `title`, `name`,
 *   `value` and `url`, in that order.
 *
 * @param {*} value - Raw value.
 * @returns {string|undefined} `undefined` when nothing textual is found.
 */
function stringFromUnknown3(value) {
  switch (typeof value) {
    case "string": {
      const trimmed = value.trim();
      return trimmed ? trimmed : void 0;
    }
    case "number":
      return Number.isFinite(value) ? String(value) : void 0;
    case "object":
      break;
    default:
      return void 0;
  }
  if (Array.isArray(value)) {
    let joined = "";
    let pieces = 0;
    for (const element of value) {
      const piece = stringFromUnknown3(element);
      if (piece) {
        joined += piece;
        pieces += 1;
      }
    }
    return pieces > 0 ? joined : void 0;
  }
  if (value === null) {
    return void 0;
  }
  for (const property of ["text", "simpleText", "runs", "title", "name", "value", "url"]) {
    const nested = stringFromUnknown3(value[property]);
    if (nested !== void 0) {
      return nested;
    }
  }
  return void 0;
}
|
|
2962
|
+
/**
 * Case-insensitive key test: `key` matches when it equals, or ends with, any candidate.
 *
 * @param {string} key - Property name encountered in the data.
 * @param {string[]} keys - Candidate names.
 * @returns {boolean}
 */
function matchesKey(key, keys) {
  const lowered = key.toLowerCase();
  for (const candidate of keys) {
    const wanted = candidate.toLowerCase();
    if (lowered === wanted || lowered.endsWith(wanted)) {
      return true;
    }
  }
  return false;
}
|
|
2966
|
+
/**
 * Tells whether `value` is a non-null object that is not an array.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isRecord4(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
2969
|
+
/**
 * Extracts post / subreddit / user identifiers from a Reddit URL.
 *
 * Short links are recognised only on the exact `redd.it` host (optionally with
 * a `www.` prefix). Sub-domains such as `i.redd.it` or `v.redd.it` are not
 * treated as short links, so their first path segment (typically a file name)
 * is never reported as a post id. Profile paths are accepted both as
 * `/user/<name>` and as `/u/<name>`.
 *
 * @param {URL} url - Parsed URL to inspect.
 * @returns {{isPost: boolean, subreddit: (string|undefined), postId: (string|undefined), username: (string|undefined)}}
 */
function parseRedditUrl(url) {
  const parts = url.pathname.split("/").filter(Boolean);
  const commentsIndex = parts.indexOf("comments");
  const hasComments = commentsIndex !== -1;
  const bareHost = url.hostname.toLowerCase().replace(/^www\./, "");
  // Exact comparison on purpose: a suffix match would also accept *.redd.it hosts.
  const shortPostId = bareHost === "redd.it" ? parts[0] : void 0;
  const isProfile = parts[0] === "user" || parts[0] === "u";
  return {
    isPost: hasComments || Boolean(shortPostId),
    subreddit: parts[0] === "r" ? parts[1] : void 0,
    postId: hasComments ? parts[commentsIndex + 1] : shortPostId,
    username: isProfile ? parts[1] : void 0
  };
}
|
|
2980
|
+
/**
 * Removes a trailing " : r/<subreddit>" suffix from a title and trims the result.
 *
 * @param {string|null|undefined} title
 * @returns {string|undefined} `undefined` when `title` is nullish.
 */
function cleanSocialTitle(title) {
  if (title === null || title === void 0) {
    return void 0;
  }
  const withoutSuffix = title.replace(/\s*:\s*r\/[A-Za-z0-9_]+$/i, "");
  return withoutSuffix.trim();
}
|
|
2983
|
+
/**
 * Tests whether the URL's host (lower-cased, leading "www." removed) equals one
 * of `domains` or is a sub-domain of one.
 *
 * @param {URL} url
 * @param {string[]} domains
 * @returns {boolean}
 */
function hostMatches(url, domains) {
  const bareHost = url.hostname.toLowerCase().replace(/^www\./, "");
  for (const domain of domains) {
    if (bareHost === domain || bareHost.endsWith(`.${domain}`)) {
      return true;
    }
  }
  return false;
}
|
|
2987
|
+
/**
 * Derives a YouTube video id from a URL.
 *
 * Handles `youtu.be/<id>`, `/watch?v=<id>` and `/embed/<id>` or `/shorts/<id>` paths.
 *
 * @param {URL} url
 * @returns {string|undefined}
 */
function getYouTubeVideoId(url) {
  const { hostname, pathname } = url;
  const bareHost = hostname.toLowerCase().replace(/^www\./, "");
  if (bareHost === "youtu.be") {
    const [firstSegment] = pathname.split("/").filter(Boolean);
    return firstSegment;
  }
  if (pathname === "/watch") {
    const fromQuery = url.searchParams.get("v");
    return fromQuery ?? void 0;
  }
  const pathMatch = pathname.match(/\/(?:embed|shorts)\/([^/?]+)/);
  return pathMatch === null ? void 0 : pathMatch[1];
}
|
|
2998
|
+
/**
 * Reads the `list` query parameter of a URL.
 *
 * @param {URL} url
 * @returns {string|undefined} `undefined` when the parameter is absent.
 */
function getYouTubePlaylistId(url) {
  const listId = url.searchParams.get("list");
  return listId === null ? void 0 : listId;
}
|
|
3001
|
+
/**
 * Extracts the id following a `/post/` path segment.
 *
 * @param {URL} url
 * @returns {string|undefined}
 */
function getYouTubeCommunityPostId(url) {
  const found = /\/post\/([^/?]+)/.exec(url.pathname);
  return found ? found[1] : void 0;
}
|
|
3004
|
+
/**
 * Collects up to 100 distinct videos referenced anywhere in the embedded data.
 *
 * A record contributes when a video id can be read from `videoId`, `video_id`,
 * `watchEndpoint.videoId` or `navigationEndpoint.watchEndpoint.videoId`. The
 * first record seen for an id wins; its title comes from `title`, `headline`
 * or `shortBylineText`.
 *
 * @param {object} context - Reads `context.raw.embeddedData.items[].data`.
 * @returns {Array<{id: string, title: (string|undefined), url: string}>}
 */
function extractPlaylistVideos(context) {
  const byId = /* @__PURE__ */ new Map();
  // Resolves the video id of one record, trying the lookups in priority order.
  const resolveVideoId = (record) => {
    let id = stringFromUnknown3(record.videoId);
    if (id === void 0 || id === null) {
      id = stringFromUnknown3(record.video_id);
    }
    if (id === void 0 || id === null) {
      id = isRecord4(record.watchEndpoint) ? stringFromUnknown3(record.watchEndpoint.videoId) : void 0;
    }
    if (id === void 0 || id === null) {
      const navigation = record.navigationEndpoint;
      id = isRecord4(navigation) && isRecord4(navigation.watchEndpoint) ? stringFromUnknown3(navigation.watchEndpoint.videoId) : void 0;
    }
    return id;
  };
  const collect = (value) => {
    if (!isRecord4(value)) {
      return;
    }
    const videoId = resolveVideoId(value);
    if (videoId && !byId.has(videoId)) {
      const title = stringFromUnknown3(value.title) ?? stringFromUnknown3(value.headline) ?? stringFromUnknown3(value.shortBylineText);
      byId.set(videoId, {
        id: videoId,
        title,
        url: `https://www.youtube.com/watch?v=${videoId}`
      });
    }
  };
  for (const entry of context.raw.embeddedData.items) {
    walkData(entry.data, collect);
  }
  return Array.from(byId.values()).slice(0, 100);
}
|
|
3024
|
+
/**
 * Reads a list of strings stored under any of `keys`, preferring JSON-LD over
 * embedded data items (which are tried in order).
 *
 * @param {object} context - Reads `context.raw.jsonLd.nodes` and `context.raw.embeddedData.items`.
 * @param {string[]} keys - Candidate property names.
 * @returns {string[]|undefined}
 */
function arrayFromContext(context, keys) {
  const structured = arrayOfStrings2(findJsonLdValue(context.raw.jsonLd.nodes, keys));
  if (structured) {
    return structured;
  }
  for (const { data } of context.raw.embeddedData.items) {
    const list = arrayOfStrings2(findValueByKeys2(data, keys));
    if (!list) {
      continue;
    }
    return list;
  }
  return void 0;
}
|
|
3037
|
+
/**
 * Reads a number stored under any of `keys`, preferring JSON-LD over embedded
 * data items (which are tried in order). A result of `0` counts as found.
 *
 * @param {object} context - Reads `context.raw.jsonLd.nodes` and `context.raw.embeddedData.items`.
 * @param {string[]} keys - Candidate property names.
 * @returns {number|undefined}
 */
function numberFromContext(context, keys) {
  const structured = numberFromUnknown(findJsonLdValue(context.raw.jsonLd.nodes, keys));
  if (structured !== void 0) {
    return structured;
  }
  for (const { data } of context.raw.embeddedData.items) {
    const amount = numberFromUnknown(findValueByKeys2(data, keys));
    if (amount === void 0) {
      continue;
    }
    return amount;
  }
  return void 0;
}
|
|
3050
|
+
/**
 * Normalises a value into a list of strings.
 *
 * Arrays are converted element-wise (elements without text are dropped, and an
 * empty result becomes `undefined`); any other value is converted to text and
 * split on commas.
 *
 * @param {*} value
 * @returns {string[]|undefined}
 */
function arrayOfStrings2(value) {
  if (!value) {
    return void 0;
  }
  if (Array.isArray(value)) {
    const texts = [];
    for (const element of value) {
      const text = stringFromUnknown3(element);
      if (text) {
        texts.push(text);
      }
    }
    return texts.length > 0 ? texts : void 0;
  }
  const joined = stringFromUnknown3(value);
  if (!joined) {
    return void 0;
  }
  const parts = [];
  for (const segment of joined.split(",")) {
    const trimmed = segment.trim();
    if (trimmed) {
      parts.push(trimmed);
    }
  }
  return parts;
}
|
|
3064
|
+
/**
 * Converts a loosely typed value (number, numeric string, "1,234 views", "1.2K", ...)
 * into an integer.
 *
 * - Finite numbers are truncated toward zero directly, so `4.9` no longer
 *   becomes `49` through digit stripping.
 * - Text that starts with a compact figure such as "1.2K", "3,5M" or "2B" is
 *   expanded using the K / M / B multiplier.
 * - Any other text keeps the previous behaviour: every non-digit character is
 *   removed and the remaining digits are parsed.
 *
 * @param {*} value - Raw value; non-numeric inputs are converted with `stringFromUnknown3`.
 * @returns {number|undefined} `undefined` when no number can be derived.
 */
function numberFromUnknown(value) {
  if (typeof value === "number") {
    return Number.isFinite(value) ? Math.trunc(value) : void 0;
  }
  const text = stringFromUnknown3(value);
  if (!text) {
    return void 0;
  }
  // At most two fractional digits are accepted so grouped numbers like "1,234" are not read as decimals.
  const compact = /^\s*(\d+(?:[.,]\d{1,2})?)\s*([kmb])\b/i.exec(text);
  if (compact) {
    const multipliers = { k: 1e3, m: 1e6, b: 1e9 };
    const base = Number.parseFloat(compact[1].replace(",", "."));
    return Math.round(base * multipliers[compact[2].toLowerCase()]);
  }
  const parsed = Number.parseInt(text.replace(/\D/g, ""), 10);
  return Number.isFinite(parsed) ? parsed : void 0;
}
|
|
3072
|
+
/**
 * Returns a copy of `result` without properties whose value is `undefined` or an empty array.
 *
 * @param {object} result
 * @returns {object}
 */
function compactAdapterResult(result) {
  const isMeaningful = (value) => {
    if (value === void 0) {
      return false;
    }
    return Array.isArray(value) ? value.length > 0 : true;
  };
  const kept = Object.entries(result).filter((entry) => isMeaningful(entry[1]));
  return Object.fromEntries(kept);
}
|
|
3077
|
+
|
|
3078
|
+
// src/plugins/registry.ts
|
|
3079
|
+
var globalPlugins = [];
|
|
3080
|
+
/**
 * Adds `plugin` to the module-level `globalPlugins` list unless a plugin with
 * the same `name` is already registered.
 *
 * @param {{name: string}} plugin
 * @returns {void}
 */
function registerGlobalPlugin(plugin) {
  const alreadyRegistered = globalPlugins.some(({ name }) => name === plugin.name);
  if (alreadyRegistered) {
    return;
  }
  globalPlugins.push(plugin);
}
|
|
3085
|
+
/**
 * Builds the adapter / extractor / image-scorer registry for one parse run.
 *
 * The registry starts with `defaultAdapters` followed by `options.adapters`,
 * and with `options.imageScorers`. Every global plugin, then every plugin in
 * `options.plugins`, has its `setup(api)` called and may append to the registry.
 *
 * @param {object} [options] - Reads `adapters`, `imageScorers` and `plugins`.
 * @returns {{adapters: Array, extractors: Array, imageScorers: Array}}
 */
function createRegistry(options = {}) {
  const adapters = [...defaultAdapters, ...options.adapters ?? []];
  const extractors = [];
  const imageScorers = [...options.imageScorers ?? []];
  const registry = { adapters, extractors, imageScorers };
  const api = {
    addAdapter(adapter) {
      adapters.push(adapter);
    },
    addExtractor(name, extractor) {
      extractors.push({ name, extract: extractor });
    },
    addImageScorer(scorer) {
      imageScorers.push(scorer);
    }
  };
  const plugins = [...globalPlugins, ...options.plugins ?? []];
  for (const plugin of plugins) {
    plugin.setup(api);
  }
  return registry;
}
|
|
3107
|
+
|
|
3108
|
+
// src/parse.ts
|
|
3109
|
+
/**
 * Synchronously parses metadata out of an HTML document.
 *
 * Runs every built-in extractor, then plugin extractors, then each adapter
 * that matches the resolved URL, and hands the collected raw sources to
 * `normalizeMetadata`. Plugins or adapters that return a Promise, or that
 * throw, are skipped and reported in `diagnostics.warnings`. oEmbed endpoints
 * are discovered but never fetched here.
 *
 * @param {string} html - Document markup.
 * @param {string} url - URL the document was loaded from.
 * @param {object} [options] - Parse options (`includeRaw`, `fetchOEmbed`, plus the options read by `createRegistry`).
 * @returns {object} Whatever `normalizeMetadata` returns.
 */
function parseMetadata(html, url, options = {}) {
  const finalUrl = resolveUrl(url);
  const registry = createRegistry(options);
  const diagnostics = createEmptyDiagnostics();
  diagnostics.originalUrl = url;
  diagnostics.finalUrl = finalUrl;
  diagnostics.trace.push("validated and normalized URL");
  const rawSources = {
    html: extractHtmlMetadata(html),
    openGraph: extractOpenGraph(html),
    twitter: extractTwitterCards(html),
    jsonLd: extractJsonLd(html),
    embeddedData: extractEmbeddedData(html),
    oEmbed: extractOEmbed(html, finalUrl),
    images: extractImages(html, finalUrl),
    videos: extractVideos(html, finalUrl),
    audio: extractAudio(html, finalUrl),
    adapters: [],
    plugins: []
  };
  appendExtractionTrace(rawSources, diagnostics.trace);
  const $ = loadDocument(html);
  const { warnings, trace } = diagnostics;
  const describe = (error) => (error instanceof Error ? error.message : String(error));
  // Each plugin/adapter receives its own context object.
  const buildContext = () => ({ html, url, finalUrl, $, raw: rawSources, options });

  if (options.fetchOEmbed && rawSources.oEmbed.links.length > 0) {
    warnings.push("parseMetadata is synchronous; oEmbed endpoints are discovered but not fetched. Use fetchMetadata or parseMetadataAsync.");
  }

  for (const extractor of registry.extractors) {
    try {
      const produced = extractor.extract(buildContext());
      if (isPromise(produced)) {
        warnings.push(`Plugin extractor "${extractor.name}" returned a Promise during parseMetadata; use fetchMetadata or pre-resolve async work.`);
      } else {
        rawSources.plugins.push(withSource(produced, extractor.name));
        trace.push(`plugin extractor matched: ${extractor.name}`);
      }
    } catch (error) {
      warnings.push(`Plugin extractor "${extractor.name}" failed: ${describe(error)}`);
    }
  }

  const parsedUrl = new URL(finalUrl);
  for (const adapter of registry.adapters) {
    if (!adapterMatches(adapter, parsedUrl)) {
      continue;
    }
    try {
      const context = buildContext();
      const extracted = adapter.extract(context);
      if (isPromise(extracted)) {
        warnings.push(`Adapter "${adapter.name}" returned a Promise during parseMetadata; use fetchMetadata or a synchronous adapter.`);
        continue;
      }
      const normalized = adapter.normalize?.(extracted, context);
      if (normalized && isPromise(normalized)) {
        warnings.push(`Adapter "${adapter.name}" normalize returned a Promise during parseMetadata; use fetchMetadata or a synchronous adapter.`);
        continue;
      }
      rawSources.adapters.push(withSource(normalized ?? extracted, adapter.name));
      trace.push(`adapter matched: ${adapter.name}`);
    } catch (error) {
      warnings.push(`Adapter "${adapter.name}" failed: ${describe(error)}`);
    }
  }

  return normalizeMetadata(rawSources, {
    url,
    finalUrl,
    diagnostics,
    imageScorers: registry.imageScorers,
    includeRaw: options.includeRaw
  });
}
|
|
3179
|
+
/**
 * Asynchronous counterpart of `parseMetadata`.
 *
 * Runs the same extraction pipeline, but awaits plugin extractors and adapters
 * one after another and, when `options.fetchOEmbed` is set and oEmbed links
 * were discovered, fetches them through `fetchOEmbedData`. Failures of
 * individual plugins or adapters are recorded in `diagnostics.warnings`.
 *
 * @param {string} html - Document markup.
 * @param {string} url - URL the document was loaded from.
 * @param {object} [options] - Parse options (`includeRaw`, `fetchOEmbed`, plus the options read by `createRegistry`).
 * @returns {Promise<object>} Whatever `normalizeMetadata` returns.
 */
async function parseMetadataAsync(html, url, options = {}) {
  const finalUrl = resolveUrl(url);
  const registry = createRegistry(options);
  const diagnostics = createEmptyDiagnostics();
  diagnostics.originalUrl = url;
  diagnostics.finalUrl = finalUrl;
  diagnostics.trace.push("validated and normalized URL");
  const rawSources = {
    html: extractHtmlMetadata(html),
    openGraph: extractOpenGraph(html),
    twitter: extractTwitterCards(html),
    jsonLd: extractJsonLd(html),
    embeddedData: extractEmbeddedData(html),
    oEmbed: extractOEmbed(html, finalUrl),
    images: extractImages(html, finalUrl),
    videos: extractVideos(html, finalUrl),
    audio: extractAudio(html, finalUrl),
    adapters: [],
    plugins: []
  };
  appendExtractionTrace(rawSources, diagnostics.trace);
  const $ = loadDocument(html);
  const { warnings, trace } = diagnostics;
  const describe = (error) => (error instanceof Error ? error.message : String(error));
  // Each plugin/adapter receives its own context object.
  const buildContext = () => ({ html, url, finalUrl, $, raw: rawSources, options });

  if (options.fetchOEmbed && rawSources.oEmbed.links.length > 0) {
    const fetched = await fetchOEmbedData(rawSources.oEmbed.links, options);
    rawSources.oEmbed.data = fetched.data;
    warnings.push(...fetched.warnings);
    trace.push("fetched discovered oEmbed JSON endpoints");
  }

  for (const extractor of registry.extractors) {
    try {
      const produced = await extractor.extract(buildContext());
      rawSources.plugins.push(withSource(produced, extractor.name));
      trace.push(`plugin extractor matched: ${extractor.name}`);
    } catch (error) {
      warnings.push(`Plugin extractor "${extractor.name}" failed: ${describe(error)}`);
    }
  }

  const parsedUrl = new URL(finalUrl);
  for (const adapter of registry.adapters) {
    if (!adapterMatches(adapter, parsedUrl)) {
      continue;
    }
    try {
      const context = buildContext();
      const extracted = await adapter.extract(context);
      let normalized = extracted;
      if (adapter.normalize) {
        normalized = await adapter.normalize(extracted, context);
      }
      rawSources.adapters.push(withSource(normalized, adapter.name));
      trace.push(`adapter matched: ${adapter.name}`);
    } catch (error) {
      warnings.push(`Adapter "${adapter.name}" failed: ${describe(error)}`);
    }
  }

  return normalizeMetadata(rawSources, {
    url,
    finalUrl,
    diagnostics,
    imageScorers: registry.imageScorers,
    includeRaw: options.includeRaw
  });
}
|
|
3239
|
+
/**
 * Returns a shallow copy of `result` whose `source` falls back to `source`
 * when the result does not carry a truthy one.
 *
 * @param {object} result
 * @param {string} source - Default source label.
 * @returns {object}
 */
function withSource(result, source) {
  const label = result.source || source;
  return { ...result, source: label };
}
|
|
3245
|
+
/**
 * Duck-typed thenable check: true for any truthy value with a callable `then`.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isPromise(value) {
  if (!value) {
    return false;
  }
  return typeof value.then === "function";
}
|
|
3248
|
+
/**
 * Asks an adapter whether it handles `url`.
 *
 * `adapter.detect` is consulted first; `adapter.canHandle` is used only when
 * `detect` is missing or returns a nullish value. Defaults to `false`.
 *
 * @param {object} adapter
 * @param {URL} url
 * @returns {*} The adapter's answer (normally a boolean).
 */
function adapterMatches(adapter, url) {
  const detected = adapter.detect?.(url);
  if (detected !== void 0 && detected !== null) {
    return detected;
  }
  const handled = adapter.canHandle?.(url);
  return handled ?? false;
}
|
|
3251
|
+
/**
 * Appends one trace message per raw source that produced data.
 *
 * @param {object} rawSources - Reads `openGraph.raw`, `twitter.raw`, `jsonLd.nodes`,
 *   `embeddedData.items`, `oEmbed.links`, `images`, `videos` and `audio`.
 * @param {string[]} trace - Trace list, mutated in place.
 * @returns {void}
 */
function appendExtractionTrace(rawSources, trace) {
  // Checks are evaluated lazily, in order, so messages keep a stable sequence.
  const checks = [
    [() => Object.keys(rawSources.openGraph.raw).length > 0, "parsed Open Graph"],
    [() => Object.keys(rawSources.twitter.raw).length > 0, "parsed Twitter Cards"],
    [() => rawSources.jsonLd.nodes.length > 0, "parsed JSON-LD"],
    [() => rawSources.embeddedData.items.length > 0, "parsed embedded application data"],
    [() => rawSources.oEmbed.links.length > 0, "discovered oEmbed endpoints"],
    [
      () => rawSources.images.length > 0 || rawSources.videos.length > 0 || rawSources.audio.length > 0,
      "discovered HTML media candidates"
    ]
  ];
  for (const [hasData, message] of checks) {
    if (hasData()) {
      trace.push(message);
    }
  }
}
|
|
3271
|
+
|
|
3272
|
+
// src/utils/imageDimensions.ts
|
|
3273
|
+
/**
 * Detects the pixel dimensions of a PNG, JPEG or WebP image from its leading bytes.
 *
 * The format is chosen from the byte signature only. The declared content type
 * is deliberately not used: it can be wrong (for instance JPEG bytes served as
 * "image/png"), and picking the reader from it would read unrelated header
 * bytes as dimensions.
 *
 * @param {Uint8Array|number[]|null|undefined} bytes - Leading bytes of the response body.
 * @param {string} [contentType] - Declared content type; accepted for backward compatibility and ignored.
 * @returns {{width?: number, height?: number}} Empty object when the format is unknown or fewer than 16 bytes are given.
 */
function detectImageDimensions(bytes, contentType) {
  if (!bytes || bytes.length < 16) {
    return {};
  }
  if (isPng(bytes)) {
    return readPngDimensions(bytes);
  }
  if (isJpeg(bytes)) {
    return readJpegDimensions(bytes);
  }
  if (isWebp(bytes)) {
    return readWebpDimensions(bytes);
  }
  return {};
}
|
|
3289
|
+
/**
 * Checks the first four bytes against 0x89 'P' 'N' 'G'.
 *
 * @param {ArrayLike<number>} bytes
 * @returns {boolean}
 */
function isPng(bytes) {
  const signature = [137, 80, 78, 71];
  return signature.every((expected, index) => bytes[index] === expected);
}
|
|
3292
|
+
/**
 * Checks for the two-byte 0xFF 0xD8 marker at the start of the data.
 *
 * @param {ArrayLike<number>} bytes
 * @returns {boolean}
 */
function isJpeg(bytes) {
  const [first, second] = [bytes[0], bytes[1]];
  return first === 255 && second === 216;
}
|
|
3295
|
+
/**
 * Checks for a RIFF container ("RIFF" at offset 0) tagged "WEBP" at offset 8.
 *
 * @param {ArrayLike<number>} bytes
 * @returns {boolean}
 */
function isWebp(bytes) {
  if (ascii(bytes, 0, 4) !== "RIFF") {
    return false;
  }
  return ascii(bytes, 8, 4) === "WEBP";
}
|
|
3298
|
+
/**
 * Reads width and height from the IHDR chunk of a PNG.
 *
 * The chunk type at bytes 12-15 must be "IHDR". If the first chunk is anything
 * else, the dimension fields are not at offsets 16-23, so an empty object is
 * returned instead of values decoded from unrelated bytes.
 *
 * @param {ArrayLike<number>} bytes - Leading bytes of the file (at least 24 are needed).
 * @returns {{width?: number, height?: number}}
 */
function readPngDimensions(bytes) {
  if (bytes.length < 24) {
    return {};
  }
  // 0x49 0x48 0x44 0x52 spells "IHDR".
  const startsWithIhdr = bytes[12] === 73 && bytes[13] === 72 && bytes[14] === 68 && bytes[15] === 82;
  if (!startsWithIhdr) {
    return {};
  }
  return {
    width: readUint32(bytes, 16),
    height: readUint32(bytes, 20)
  };
}
|
|
3307
|
+
/**
 * Scans JPEG segments for the first start-of-frame marker and returns its dimensions.
 *
 * Differences from a plain "marker + length" walk:
 * - 0xFF fill bytes before a marker are skipped one at a time;
 * - markers that carry no length field (0x00 stuffing, TEM 0x01, RST0-7 and
 *   SOI 0xD0-0xD8) are stepped over without reading a length;
 * - EOI (0xD9) ends the scan.
 * Without this, such bytes would be read as a segment length and desynchronise the scan.
 *
 * @param {ArrayLike<number>} bytes - Leading bytes of the file; scanning starts at offset 2.
 * @returns {{width?: number, height?: number}} Empty object when no frame header is found.
 */
function readJpegDimensions(bytes) {
  let offset = 2;
  while (offset + 9 < bytes.length) {
    if (bytes[offset] !== 255) {
      offset += 1;
      continue;
    }
    const marker = bytes[offset + 1];
    if (marker === 255) {
      // Fill byte: the real marker code follows.
      offset += 1;
      continue;
    }
    if (marker === 217) {
      return {};
    }
    const isStandalone = marker === 0 || marker === 1 || (marker >= 208 && marker <= 216);
    if (isStandalone) {
      offset += 2;
      continue;
    }
    const length = readUint16(bytes, offset + 2);
    if (length < 2) {
      return {};
    }
    // SOF0-SOF15, excluding DHT (0xC4), JPG (0xC8) and DAC (0xCC).
    const isFrameHeader = marker >= 192 && marker <= 207 && marker !== 196 && marker !== 200 && marker !== 204;
    if (isFrameHeader) {
      return {
        height: readUint16(bytes, offset + 5),
        width: readUint16(bytes, offset + 7)
      };
    }
    offset += 2 + length;
  }
  return {};
}
|
|
3329
|
+
/**
 * Reads width and height from the first chunk of a WebP file.
 *
 * Supports the "VP8X" (extended), "VP8 " (lossy) and "VP8L" (lossless) chunk
 * types; anything else, or too few bytes, yields an empty object.
 *
 * @param {ArrayLike<number>} bytes - Leading bytes of the file.
 * @returns {{width?: number, height?: number}}
 */
function readWebpDimensions(bytes) {
  const available = bytes.length;
  switch (ascii(bytes, 12, 4)) {
    case "VP8X":
      if (available >= 30) {
        // Canvas size is stored minus one in two 24-bit little-endian fields.
        return {
          width: 1 + readUint24Le(bytes, 24),
          height: 1 + readUint24Le(bytes, 27)
        };
      }
      break;
    case "VP8 ":
      if (available >= 30) {
        // The low 14 bits of each 16-bit field hold the size.
        return {
          width: readUint16Le(bytes, 26) & 16383,
          height: readUint16Le(bytes, 28) & 16383
        };
      }
      break;
    case "VP8L":
      if (available >= 25) {
        // Two 14-bit fields (size minus one) packed across bytes 21-24.
        const [b0, b1, b2, b3] = [bytes[21], bytes[22], bytes[23], bytes[24]];
        return {
          width: 1 + ((b1 & 63) << 8 | b0),
          height: 1 + ((b3 & 15) << 10 | b2 << 2 | (b1 & 192) >> 6)
        };
      }
      break;
    default:
      break;
  }
  return {};
}
|
|
3355
|
+
/**
 * Reads a big-endian unsigned 16-bit integer.
 *
 * @param {ArrayLike<number>} bytes
 * @param {number} offset - Index of the most significant byte.
 * @returns {number}
 */
function readUint16(bytes, offset) {
  const high = bytes[offset] << 8;
  const low = bytes[offset + 1];
  return high + low;
}
|
|
3358
|
+
/**
 * Reads a little-endian unsigned 16-bit integer.
 *
 * @param {ArrayLike<number>} bytes
 * @param {number} offset - Index of the least significant byte.
 * @returns {number}
 */
function readUint16Le(bytes, offset) {
  const low = bytes[offset];
  const high = bytes[offset + 1] << 8;
  return low + high;
}
|
|
3361
|
+
/**
 * Reads a little-endian unsigned 24-bit integer.
 *
 * @param {ArrayLike<number>} bytes
 * @param {number} offset - Index of the least significant byte.
 * @returns {number}
 */
function readUint24Le(bytes, offset) {
  const low = bytes[offset];
  const middle = bytes[offset + 1] << 8;
  const high = bytes[offset + 2] << 16;
  return low + middle + high;
}
|
|
3364
|
+
/**
 * Reads a big-endian unsigned 32-bit integer.
 *
 * @param {ArrayLike<number>} bytes
 * @param {number} offset - Index of the most significant byte.
 * @returns {number}
 */
function readUint32(bytes, offset) {
  // `>>> 0` reinterprets the shifted top byte as unsigned so values >= 2^31 stay positive.
  const top = (bytes[offset] << 24) >>> 0;
  const upper = bytes[offset + 1] << 16;
  const lower = bytes[offset + 2] << 8;
  const bottom = bytes[offset + 3];
  return top + upper + lower + bottom;
}
|
|
3367
|
+
/**
 * Decodes `length` bytes starting at `offset` as one character per byte.
 *
 * @param {{slice: Function}} bytes - Array or typed array.
 * @param {number} offset
 * @param {number} length
 * @returns {string} Shorter than `length` when the data ends early.
 */
function ascii(bytes, offset, length) {
  let text = "";
  for (const code of bytes.slice(offset, offset + length)) {
    text += String.fromCharCode(code);
  }
  return text;
}
|
|
3370
|
+
|
|
3371
|
+
// src/fetchMetadata.ts
|
|
3372
|
+
/**
 * Fetches a URL and returns its metadata.
 *
 * The page is downloaded with `fetchPage`, optionally replaced through
 * `maybeFetchRedditFallback`, and returned directly as media metadata when
 * `createDirectMediaMetadata` recognises it. Otherwise the markup goes through
 * `parseMetadataAsync` and the result is annotated with fetch diagnostics.
 *
 * This function does not reject: any error is converted into a result with
 * `ok: false` and the message in `diagnostics.errors`.
 *
 * @param {string} url - URL to fetch.
 * @param {object} [options] - Passed to `fetchPage`, the fallback and the parser.
 * @returns {Promise<object>}
 */
async function fetchMetadata(url, options = {}) {
  const startedAt = Date.now();
  const elapsed = () => Date.now() - startedAt;
  try {
    const requestedUrl = normalizeUrl(url);
    const firstPage = await fetchPage(requestedUrl, options);
    const fallback = await maybeFetchRedditFallback(firstPage, options);
    const { page } = fallback;
    const directMedia = createDirectMediaMetadata(page, requestedUrl, elapsed());
    if (directMedia) {
      return directMedia;
    }
    const metadata = await parseMetadataAsync(page.html, page.finalUrl, options);
    const { diagnostics } = metadata;
    metadata.url = requestedUrl;
    metadata.finalUrl = page.finalUrl;
    metadata.ok = page.statusCode >= 200 && page.statusCode < 300;
    diagnostics.statusCode = page.statusCode;
    diagnostics.contentType = page.contentType;
    diagnostics.originalUrl = requestedUrl;
    diagnostics.finalUrl = page.finalUrl;
    diagnostics.canonicalUrl = metadata.canonicalUrl;
    diagnostics.isShortUrl = page.isShortUrl;
    diagnostics.shortUrlProvider = page.shortUrlProvider;
    diagnostics.redirects = page.redirects;
    diagnostics.fetchDurationMs = elapsed();

    // Fetch-level events go before the parser's own trace; canonical resolution goes last.
    const redirectCount = page.redirects.length;
    const fetchTrace = [];
    if (page.isShortUrl) {
      fetchTrace.push(`detected short URL provider: ${page.shortUrlProvider ?? "unknown"}`);
    }
    if (redirectCount > 0) {
      fetchTrace.push(`resolved ${redirectCount} redirect${redirectCount === 1 ? "" : "s"}`);
    }
    if (fallback.used) {
      fetchTrace.push("retried Reddit page through old.reddit fallback");
    }
    fetchTrace.push("downloaded page");
    const canonicalTrace = metadata.canonicalUrl ? ["resolved canonical URL"] : [];
    diagnostics.trace = [...fetchTrace, ...diagnostics.trace, ...canonicalTrace];
    metadata.trace = diagnostics.trace;

    if (!metadata.ok) {
      diagnostics.warnings.push(`Fetch completed with non-success status code ${page.statusCode}.`);
    }
    if (page.contentType && !/html|xml|text/i.test(page.contentType)) {
      diagnostics.warnings.push(`Response content type may not contain parseable metadata: ${page.contentType}.`);
    }
    return metadata;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    const safeUrl = safeNormalize(url);
    const failureDiagnostics = {
      originalUrl: safeUrl,
      finalUrl: safeUrl,
      redirects: [],
      sourcesUsed: [],
      warnings: [],
      trace: ["fetch failed"],
      errors: [message],
      fetchDurationMs: elapsed(),
      extractedAt: (/* @__PURE__ */ new Date()).toISOString()
    };
    return {
      ok: false,
      url: safeUrl,
      finalUrl: safeUrl,
      type: "unknown",
      confidence: 0,
      completeness: 0,
      reliability: 0,
      images: [],
      videos: [],
      audio: [],
      favicons: [],
      trace: ["fetch failed"],
      diagnostics: failureDiagnostics
    };
  }
}
|
|
3442
|
+
/**
 * Retries a Reddit page through old.reddit.com when the first response looks
 * like a verification / block page with no preview tags.
 *
 * The fallback is attempted only when the final host is reddit.com (or a
 * sub-domain other than old.reddit.com), the HTML contains one of the
 * block-page phrases, and no Open Graph / Twitter title, image or description
 * marker is present. The retry URL drops the query string. Any failure of the
 * retry keeps the original page.
 *
 * @param {object} page - Fetched page; reads `finalUrl` and `html`.
 * @param {object} options - Passed through to `fetchPage`.
 * @returns {Promise<{page: object, used: boolean}>}
 */
async function maybeFetchRedditFallback(page, options) {
  const keepOriginal = () => ({ page, used: false });
  let parsed;
  try {
    parsed = new URL(page.finalUrl);
  } catch {
    return keepOriginal();
  }
  const host = parsed.hostname.toLowerCase();
  const onReddit = host === "www.reddit.com" || host === "reddit.com" || host.endsWith(".reddit.com");
  if (!onReddit || host === "old.reddit.com") {
    return keepOriginal();
  }
  if (/og:(?:title|image|description)|twitter:(?:title|image|description)/i.test(page.html)) {
    return keepOriginal();
  }
  if (!/please wait for verification|blocked|whoa there, pardner/i.test(page.html)) {
    return keepOriginal();
  }
  const fallbackUrl = new URL(page.finalUrl);
  fallbackUrl.hostname = "old.reddit.com";
  fallbackUrl.search = "";
  try {
    const fallbackPage = await fetchPage(fallbackUrl.toString(), options);
    return { page: fallbackPage, used: true };
  } catch {
    // Best effort: a failed retry falls back to the page we already have.
    return keepOriginal();
  }
}
|
|
3467
|
+
/**
 * Builds a metadata result for a response that is itself an image, video or
 * audio resource (as judged by `directMediaKind`).
 *
 * For images, dimensions hinted by the URL are combined with dimensions
 * detected from the bytes; detected values override the hints.
 *
 * @param {object} page - Fetched page; reads `contentType`, `finalUrl`, `bytes`,
 *   `statusCode`, `redirects`, `isShortUrl` and `shortUrlProvider`.
 * @param {string} requestedUrl - URL originally requested.
 * @param {number} fetchDurationMs - Time spent fetching, in milliseconds.
 * @returns {object|undefined} `undefined` when the response is not direct media.
 */
function createDirectMediaMetadata(page, requestedUrl, fetchDurationMs) {
  const contentType = page.contentType?.toLowerCase() ?? "";
  const kind = directMediaKind(contentType, page.finalUrl);
  if (!kind) {
    return void 0;
  }
  const isImage = kind === "image";
  let dimensions = {};
  if (isImage) {
    dimensions = {
      ...imageDimensionsFromUrl(page.finalUrl),
      ...detectImageDimensions(page.bytes, page.contentType)
    };
  }
  const asset = {
    url: page.finalUrl,
    kind,
    source: "direct",
    type: page.contentType,
    width: dimensions.width,
    height: dimensions.height,
    score: isImage ? 90 : void 0,
    confidence: isImage ? 0.9 : void 0
  };
  const confidence = isImage ? 82 : 70;
  let completeness = 35;
  if (isImage) {
    completeness = dimensions.width && dimensions.height ? 60 : 45;
  }
  const redirectCount = page.redirects.length;
  const trace = [];
  if (redirectCount > 0) {
    trace.push(`resolved ${redirectCount} redirect${redirectCount === 1 ? "" : "s"}`);
  }
  trace.push("downloaded direct media", `detected direct ${kind}`);
  const assetsFor = (wanted) => (kind === wanted ? [asset] : []);
  return {
    ok: page.statusCode >= 200 && page.statusCode < 300,
    url: requestedUrl,
    finalUrl: page.finalUrl,
    type: kind,
    confidence,
    completeness,
    reliability: Math.round((confidence + completeness) / 2),
    bestImage: isImage ? page.finalUrl : void 0,
    images: assetsFor("image"),
    videos: assetsFor("video"),
    audio: assetsFor("audio"),
    favicons: [],
    trace,
    sources: {
      image: isImage ? "direct" : void 0
    },
    diagnostics: {
      originalUrl: requestedUrl,
      finalUrl: page.finalUrl,
      isShortUrl: page.isShortUrl,
      shortUrlProvider: page.shortUrlProvider,
      statusCode: page.statusCode,
      contentType: page.contentType,
      redirects: page.redirects,
      sourcesUsed: ["direct"],
      warnings: [],
      trace,
      selectedImageReason: isImage ? "Selected direct image URL because the response content type is an image." : void 0,
      fetchDurationMs,
      extractedAt: (/* @__PURE__ */ new Date()).toISOString()
    }
  };
}
|
|
3528
|
+
/**
 * Guesses image dimensions from query parameters of a URL.
 *
 * Uses `width`/`w` together with `height`/`h` when both parse to a dimension;
 * otherwise looks for a "<w>:<h>" pair (2-5 digits each) inside `crop`.
 *
 * @param {string} url
 * @returns {{width?: number, height?: number}} Empty object when nothing usable is found or the URL is invalid.
 */
function imageDimensionsFromUrl(url) {
  let query;
  try {
    query = new URL(url).searchParams;
  } catch {
    return {};
  }
  const width = parseDimension(query.get("width") ?? query.get("w"));
  const height = parseDimension(query.get("height") ?? query.get("h"));
  if (width && height) {
    return { width, height };
  }
  const cropMatch = query.get("crop")?.match(/(\d{2,5})\s*:\s*(\d{2,5})/);
  if (!cropMatch) {
    return {};
  }
  const [, cropWidth, cropHeight] = cropMatch;
  return { width: Number(cropWidth), height: Number(cropHeight) };
}
|
|
3546
|
+
/**
 * Parses a query-string value as a positive, finite number.
 *
 * @param {string|null|undefined} value
 * @returns {number|undefined} `undefined` for empty, non-numeric, zero or negative input.
 */
function parseDimension(value) {
  if (!value) {
    return void 0;
  }
  const numeric = Number(value);
  const usable = Number.isFinite(numeric) && numeric > 0;
  return usable ? numeric : void 0;
}
|
|
3553
|
+
/**
 * Classifies a response as a direct image, video or audio resource.
 *
 * The content type decides whenever it names a media type. The file extension
 * is only a fallback, and it is
 * - ignored when the server explicitly returned an HTML document (a page whose
 *   path happens to end in ".png" is still a page), and
 * - read from the URL path only, so a query value such as "?file=a.png" does
 *   not turn an arbitrary URL into an image.
 *
 * @param {string} contentType - Content type of the response ("" when unknown).
 * @param {string} url - Final URL of the response.
 * @returns {"image"|"video"|"audio"|undefined}
 */
function directMediaKind(contentType, url) {
  const declared = /^\s*(image|video|audio)\//i.exec(contentType);
  if (declared) {
    return declared[1].toLowerCase();
  }
  if (/^\s*(?:text\/html|application\/xhtml\+xml)\b/i.test(contentType)) {
    return void 0;
  }
  let path;
  try {
    path = new URL(url).pathname;
  } catch {
    // Not an absolute URL: strip query and fragment by hand.
    path = url.split(/[?#]/, 1)[0];
  }
  path = path.toLowerCase();
  if (/\.(?:avif|webp|png|jpe?g|gif)$/.test(path)) {
    return "image";
  }
  if (/\.(?:mp4|webm|m3u8|mov)$/.test(path)) {
    return "video";
  }
  if (/\.(?:mp3|m4a|wav|ogg|aac)$/.test(path)) {
    return "audio";
  }
  return void 0;
}
|
|
3566
|
+
/**
 * Normalises a URL without ever throwing: when `normalizeUrl` fails, the input
 * is returned unchanged.
 *
 * @param {string} url
 * @returns {string}
 */
function safeNormalize(url) {
  let normalized = url;
  try {
    normalized = normalizeUrl(url);
  } catch {
    // Keep the raw input; this helper is used while reporting an earlier failure.
  }
  return normalized;
}
|
|
3573
|
+
|
|
3574
|
+
// src/preview.ts
|
|
3575
|
+
/**
 * Projects a metadata result onto the fields needed to render a link preview.
 * The canonical URL is preferred over the final URL for both `url` and `domain`.
 *
 * @param {object} metadata - Metadata result.
 * @returns {{title: *, description: *, image: *, url: *, siteName: *, domain: (string|undefined), author: *, type: *, confidence: *}}
 */
function createPreviewCard(metadata) {
  const { title, description, bestImage, siteName, author, type, confidence } = metadata;
  const url = metadata.canonicalUrl ?? metadata.finalUrl;
  const domain = domainFromUrl(metadata.canonicalUrl ?? metadata.finalUrl);
  return {
    title,
    description,
    image: bestImage,
    url,
    siteName,
    domain,
    author: author?.name,
    type,
    confidence
  };
}
|
|
3588
|
+
/**
 * Returns the host name of a URL without a leading "www.".
 *
 * @param {string} url
 * @returns {string|undefined} `undefined` when the URL cannot be parsed.
 */
function domainFromUrl(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    return void 0;
  }
  return hostname.replace(/^www\./, "");
}
|
|
3595
|
+
|
|
3596
|
+
// src/core.ts
|
|
3597
|
+
/**
 * Facade object bundling the main entry points.
 * `use` registers a plugin globally and returns the facade so calls can be chained.
 */
var MetaNova = {
  /**
   * @param {object} plugin - Plugin forwarded to `registerGlobalPlugin`.
   * @returns {object} The `MetaNova` facade itself.
   */
  use(plugin) {
    registerGlobalPlugin(plugin);
    return MetaNova;
  },
  fetchMetadata,
  parseMetadata,
  parseMetadataAsync,
  createPreviewCard
};
|
|
3607
|
+
|
|
3608
|
+
// src/diagnostics/index.ts
|
|
3609
|
+
/**
 * Create an empty diagnostics record.
 *
 * @returns {{redirects: Array, sourcesUsed: Array, warnings: Array, trace: Array, extractedAt: string}}
 *   A fresh object with empty collections and `extractedAt` set to the
 *   current time as an ISO 8601 string.
 */
function createDiagnostics() {
  const timestamp = /* @__PURE__ */ new Date();
  const diagnostics = {
    redirects: [],
    sourcesUsed: [],
    warnings: [],
    trace: [],
    extractedAt: timestamp.toISOString()
  };
  return diagnostics;
}
|
|
3618
|
+
/**
 * Append a warning to a diagnostics record, mutating it in place.
 *
 * @param {{warnings: Array}} diagnostics - Record whose `warnings` array receives the entry.
 * @param {*} warning - Warning value to append.
 * @returns {object} The same `diagnostics` object that was passed in.
 */
function addWarning(diagnostics, warning) {
  const { warnings } = diagnostics;
  warnings.push(warning);
  return diagnostics;
}
|
|
3622
|
+
|
|
3623
|
+
// src/index.ts
|
|
3624
|
+
var index_default = MetaNova;
|
|
3625
|
+
export {
|
|
3626
|
+
DEFAULT_ACCEPT,
|
|
3627
|
+
DEFAULT_ACCEPT_ENCODING,
|
|
3628
|
+
DEFAULT_ACCEPT_LANGUAGE,
|
|
3629
|
+
DEFAULT_BROWSER_USER_AGENT,
|
|
3630
|
+
MetaNova,
|
|
3631
|
+
SecurityError,
|
|
3632
|
+
addWarning,
|
|
3633
|
+
assertSafeRequestUrl,
|
|
3634
|
+
behanceAdapter,
|
|
3635
|
+
calculateCompleteness,
|
|
3636
|
+
calculateConfidence,
|
|
3637
|
+
calculateReliability,
|
|
3638
|
+
createDiagnostics,
|
|
3639
|
+
createPreviewCard,
|
|
3640
|
+
createRegistry,
|
|
3641
|
+
index_default as default,
|
|
3642
|
+
defaultAdapters,
|
|
3643
|
+
detectShortUrl,
|
|
3644
|
+
discoverMedia,
|
|
3645
|
+
extractAudio,
|
|
3646
|
+
extractEmbeddedData,
|
|
3647
|
+
extractHtmlMetadata,
|
|
3648
|
+
extractImages,
|
|
3649
|
+
extractJsonLd,
|
|
3650
|
+
extractOEmbed,
|
|
3651
|
+
extractOpenGraph,
|
|
3652
|
+
extractTwitterCards,
|
|
3653
|
+
extractVideos,
|
|
3654
|
+
facebookAdapter,
|
|
3655
|
+
fetchMetadata,
|
|
3656
|
+
fetchPage,
|
|
3657
|
+
instagramAdapter,
|
|
3658
|
+
normalizeMetadata,
|
|
3659
|
+
normalizeUrl,
|
|
3660
|
+
parseMetadata,
|
|
3661
|
+
parseMetadataAsync,
|
|
3662
|
+
pinterestAdapter,
|
|
3663
|
+
redditAdapter,
|
|
3664
|
+
registerGlobalPlugin,
|
|
3665
|
+
resolveCanonicalUrl,
|
|
3666
|
+
resolveRedirects,
|
|
3667
|
+
resolveUrl,
|
|
3668
|
+
scoreImages,
|
|
3669
|
+
tiktokAdapter,
|
|
3670
|
+
twitterAdapter,
|
|
3671
|
+
validateUrl,
|
|
3672
|
+
youtubeAdapter
|
|
3673
|
+
};
|
|
3674
|
+
//# sourceMappingURL=index.js.map
|