smart-web-mcp 0.8.8 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/CHANGELOG.md +72 -18
  2. package/README.md +23 -15
  3. package/dist/assessment.js +1 -1
  4. package/dist/assessment.js.map +1 -1
  5. package/dist/browser-session.d.ts +1 -0
  6. package/dist/browser-session.js +11 -11
  7. package/dist/browser-session.js.map +1 -1
  8. package/dist/cli.js +5 -1
  9. package/dist/cli.js.map +1 -1
  10. package/dist/dev-runtime.js +2 -1
  11. package/dist/dev-runtime.js.map +1 -1
  12. package/dist/index.js +9 -2
  13. package/dist/index.js.map +1 -1
  14. package/dist/mcp-server.js +2 -2
  15. package/dist/mcp-server.js.map +1 -1
  16. package/dist/runtime-temp.d.ts +1 -1
  17. package/dist/runtime-temp.js +12 -17
  18. package/dist/runtime-temp.js.map +1 -1
  19. package/dist/settings.d.ts +25 -5
  20. package/dist/settings.js +55 -60
  21. package/dist/settings.js.map +1 -1
  22. package/dist/shared.d.ts +3 -3
  23. package/dist/shared.js +29 -9
  24. package/dist/shared.js.map +1 -1
  25. package/dist/smartcrawl.js +2 -2
  26. package/dist/smartcrawl.js.map +1 -1
  27. package/dist/smartfetch/academic-fallback.d.ts +7 -0
  28. package/dist/smartfetch/academic-fallback.js +777 -0
  29. package/dist/smartfetch/academic-fallback.js.map +1 -0
  30. package/dist/smartfetch/archive-fallback.js +7 -57
  31. package/dist/smartfetch/archive-fallback.js.map +1 -1
  32. package/dist/smartfetch/generic-fallbacks.d.ts +92 -0
  33. package/dist/smartfetch/generic-fallbacks.js +616 -0
  34. package/dist/smartfetch/generic-fallbacks.js.map +1 -0
  35. package/dist/smartfetch/jina-reader.js +62 -3
  36. package/dist/smartfetch/jina-reader.js.map +1 -1
  37. package/dist/smartfetch/paywall.d.ts +7 -0
  38. package/dist/smartfetch/paywall.js +77 -0
  39. package/dist/smartfetch/paywall.js.map +1 -0
  40. package/dist/smartfetch/pipeline.d.ts +1 -1
  41. package/dist/smartfetch/pipeline.js +4 -3
  42. package/dist/smartfetch/pipeline.js.map +1 -1
  43. package/dist/smartfetch/provider-policy.js +2 -2
  44. package/dist/smartfetch/provider-policy.js.map +1 -1
  45. package/dist/smartfetch/provider-types.d.ts +1 -0
  46. package/dist/smartfetch/providers/acmicpc.d.ts +2 -0
  47. package/dist/smartfetch/providers/acmicpc.js +57 -0
  48. package/dist/smartfetch/providers/acmicpc.js.map +1 -0
  49. package/dist/smartfetch/providers/article.d.ts +1 -2
  50. package/dist/smartfetch/providers/article.js +161 -1
  51. package/dist/smartfetch/providers/article.js.map +1 -1
  52. package/dist/smartfetch/providers/atcoder.d.ts +2 -0
  53. package/dist/smartfetch/providers/atcoder.js +54 -0
  54. package/dist/smartfetch/providers/atcoder.js.map +1 -0
  55. package/dist/smartfetch/providers/codeforces.d.ts +2 -0
  56. package/dist/smartfetch/providers/codeforces.js +54 -0
  57. package/dist/smartfetch/providers/codeforces.js.map +1 -0
  58. package/dist/smartfetch/providers/cp-common.d.ts +8 -0
  59. package/dist/smartfetch/providers/cp-common.js +62 -0
  60. package/dist/smartfetch/providers/cp-common.js.map +1 -0
  61. package/dist/smartfetch/providers/index.js +15 -1
  62. package/dist/smartfetch/providers/index.js.map +1 -1
  63. package/dist/smartfetch/providers/jungol.d.ts +2 -0
  64. package/dist/smartfetch/providers/jungol.js +43 -0
  65. package/dist/smartfetch/providers/jungol.js.map +1 -0
  66. package/dist/smartfetch/providers/kakao-map.d.ts +2 -0
  67. package/dist/smartfetch/providers/kakao-map.js +150 -0
  68. package/dist/smartfetch/providers/kakao-map.js.map +1 -0
  69. package/dist/smartfetch/providers/linkedin.js +5 -5
  70. package/dist/smartfetch/providers/linkedin.js.map +1 -1
  71. package/dist/smartfetch/providers/map-utils.d.ts +7 -0
  72. package/dist/smartfetch/providers/map-utils.js +56 -0
  73. package/dist/smartfetch/providers/map-utils.js.map +1 -0
  74. package/dist/smartfetch/providers/naver-map.d.ts +2 -0
  75. package/dist/smartfetch/providers/naver-map.js +183 -0
  76. package/dist/smartfetch/providers/naver-map.js.map +1 -0
  77. package/dist/smartfetch/providers/qoj.d.ts +2 -0
  78. package/dist/smartfetch/providers/qoj.js +54 -0
  79. package/dist/smartfetch/providers/qoj.js.map +1 -0
  80. package/dist/smartfetch/providers/reddit.js +2 -2
  81. package/dist/smartfetch/providers/reddit.js.map +1 -1
  82. package/dist/smartfetch/providers/solvedac.js +193 -2
  83. package/dist/smartfetch/providers/solvedac.js.map +1 -1
  84. package/dist/smartfetch/providers/x.js +4 -7
  85. package/dist/smartfetch/providers/x.js.map +1 -1
  86. package/dist/smartfetch/providers/youtube.js +2 -2
  87. package/dist/smartfetch/providers/youtube.js.map +1 -1
  88. package/dist/smartfetch.js +22 -9
  89. package/dist/smartfetch.js.map +1 -1
  90. package/dist/smartsearch.js +199 -26
  91. package/dist/smartsearch.js.map +1 -1
  92. package/dist/test-settings.d.ts +9 -0
  93. package/dist/test-settings.js +40 -0
  94. package/dist/test-settings.js.map +1 -0
  95. package/package.json +7 -6
@@ -0,0 +1,616 @@
1
+ import { absolutizeUrl, asString, decodeHtml, dedupeUrls, extractMetaDescription, extractMetaName, extractMetaProperty, extractTitleFromHtml, extractUrls, stripTags, } from "../shared.js";
2
+ import { fetchProviderText } from "./provider-policy.js";
3
/**
 * Reports whether `value` is a plain-object-like record.
 *
 * @param {unknown} value - Any value.
 * @returns {boolean} `true` for truthy, non-array values whose `typeof` is "object".
 */
function isRecord(value) {
    if (!value || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
6
/**
 * Converts a value to a single-line string: falsy values become "", runs of
 * whitespace collapse to one space, and both ends are trimmed.
 *
 * @param {unknown} value - Value to stringify.
 * @returns {string} The normalized text.
 */
function cleanText(value) {
    const raw = String(value || "");
    const collapsed = raw.replace(/\s+/g, " ");
    return collapsed.trim();
}
9
/**
 * Normalizes `value` with cleanText and shortens it to at most `maxLength`
 * characters, replacing the cut-off tail with a single "…".
 *
 * @param {unknown} value - Text to shorten.
 * @param {number} maxLength - Maximum length of the returned string.
 * @returns {string} The normalized text, truncated when it was too long.
 */
function truncate(value, maxLength) {
    const normalized = cleanText(value);
    const fits = normalized.length <= maxLength;
    // Keep one slot for the ellipsis and drop whitespace left dangling by the cut.
    return fits ? normalized : `${normalized.slice(0, maxLength - 1).trimEnd()}…`;
}
15
/**
 * Splits text into unique lower-case tokens of at least three characters.
 * Token characters are ASCII letters, digits and Hangul syllables; everything
 * else is a separator.
 *
 * @param {unknown} value - Text to tokenize.
 * @returns {string[]} Distinct tokens in first-seen order.
 */
function tokenize(value) {
    const unique = new Set();
    for (const piece of cleanText(value).toLowerCase().split(/[^a-z0-9가-힣]+/)) {
        if (piece.length >= 3) {
            unique.add(piece);
        }
    }
    return [...unique];
}
18
/**
 * Counts how many distinct tokens of `left` also occur in `right`.
 *
 * @param {unknown} left - First text.
 * @param {unknown} right - Second text.
 * @returns {number} Number of shared tokens, as produced by tokenize.
 */
function overlapCount(left, right) {
    const known = new Set(tokenize(right));
    let shared = 0;
    for (const token of tokenize(left)) {
        if (known.has(token)) {
            shared += 1;
        }
    }
    return shared;
}
22
/**
 * Turns the path of a URL into space-separated words: the path is
 * percent-decoded, "-", "_" and "/" become spaces, and standalone numbers are
 * removed.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The slug words, or "" when the URL cannot be parsed or decoded.
 */
function slugWords(url) {
    let decodedPath;
    try {
        decodedPath = decodeURIComponent(new URL(url).pathname);
    }
    catch {
        // Unparseable URL or malformed percent-escape.
        return "";
    }
    const separated = decodedPath.replace(/[-_/]+/g, " ");
    const withoutNumbers = separated.replace(/\b\d+\b/g, " ");
    return withoutNumbers.replace(/\s+/g, " ").trim();
}
34
/**
 * Heuristically decides whether a URL points at a single product page.
 *
 * A URL qualifies when its lower-cased path contains a product-style segment
 * (for example "/products/" or "/p/<slug>"), or when its query string carries
 * one of the known product keys.
 *
 * Query keys are compared case-insensitively. URLSearchParams.has() is
 * case-sensitive, so the previous lookup against lower-case names could never
 * match spellings such as "productId" or "SKU".
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {boolean} `true` when the URL looks product-like; `false` otherwise,
 *   including when the URL cannot be parsed.
 */
function looksProductLikeUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return false;
    }
    const path = parsed.pathname.toLowerCase();
    if (/\/(?:product|products|item|shop|store|sku|dp|goods)\//.test(path)) {
        return true;
    }
    if (/(^|\/)(?:p|pd)\/[a-z0-9-]+/i.test(path)) {
        return true;
    }
    const productKeys = new Set(["id", "item", "sku", "product", "productid"]);
    return Array.from(parsed.searchParams.keys()).some((key) => productKeys.has(key.toLowerCase()));
}
48
/**
 * Heuristically decides whether a URL points at a listing, search or catalog
 * page rather than a single item.
 *
 * A URL qualifies when its lower-cased path contains a collection-style
 * segment (for example "/search", "/collections/", "/category/"), or when its
 * query string carries one of the known search keys.
 *
 * Query keys are compared case-insensitively. URLSearchParams.has() is
 * case-sensitive, so the previous lookup against lower-case names could never
 * match spellings such as "SearchText" or "Keyword".
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {boolean} `true` when the URL looks collection-like; `false`
 *   otherwise, including when the URL cannot be parsed.
 */
function looksCollectionLikeUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return false;
    }
    const path = parsed.pathname.toLowerCase();
    if (/\/(?:search|shop|store|catalog|collection|collections|products|items|category|tag)s?(?:\/|$)/.test(path)) {
        return true;
    }
    const searchKeys = new Set(["q", "query", "keyword", "keywords", "search", "searchtext"]);
    return Array.from(parsed.searchParams.keys()).some((key) => searchKeys.has(key.toLowerCase()));
}
60
/**
 * Heuristically decides whether a URL points at a person or team profile.
 *
 * Matches the exact paths "/about" and "/team", any path containing a
 * profile-style segment followed by "/", or an "/@handle" segment.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {boolean} `true` when the URL looks profile-like; `false` otherwise,
 *   including when the URL cannot be parsed.
 */
function looksProfileLikeUrl(url) {
    let path;
    try {
        path = new URL(url).pathname.toLowerCase();
    }
    catch {
        return false;
    }
    if (["/about", "/team"].includes(path)) {
        return true;
    }
    if (/\/(?:about|profile|profiles|people|team|authors?|members?|u|user)\//.test(path)) {
        return true;
    }
    return /\/@[a-z0-9._-]+/i.test(path);
}
73
/**
 * Heuristically decides whether a URL is an index-style page (site root, blog
 * or news landing page, category/tag listing, paginated or searched view).
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {boolean} `true` for index-like URLs; `false` otherwise, including
 *   when the URL cannot be parsed.
 */
function looksFeedDiscoveryPage(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return false;
    }
    // Trailing slashes are ignored; an empty remainder means the site root.
    const path = parsed.pathname.toLowerCase().replace(/\/+$/, "") || "/";
    const isRoot = path === "/";
    if (isRoot || /\/(?:blog|blogs|news|updates|articles|posts|journal|category|categories|tags?)$/.test(path)) {
        return true;
    }
    for (const key of ["page", "q", "query", "tag", "category", "search"]) {
        if (parsed.searchParams.has(key)) {
            return true;
        }
    }
    return false;
}
87
/**
 * Extracts display text from an arbitrarily shaped JSON value.
 *
 * - Strings, numbers and booleans are normalized with cleanText.
 * - Arrays yield the non-empty texts of their items joined with ", ".
 * - Records yield the first non-empty text among their `name`, `text`,
 *   `description` and `headline` properties, in that order.
 * - Anything else yields "".
 *
 * @param {unknown} value - Value to read text from.
 * @returns {string} The extracted text, possibly empty.
 */
function textFromUnknown(value) {
    switch (typeof value) {
        case "string":
        case "number":
        case "boolean":
            return cleanText(value);
        default:
            break;
    }
    if (Array.isArray(value)) {
        const parts = [];
        for (const item of value) {
            const text = textFromUnknown(item);
            if (text) {
                parts.push(text);
            }
        }
        return parts.join(", ");
    }
    if (!isRecord(value)) {
        return "";
    }
    for (const key of ["name", "text", "description", "headline"]) {
        const text = textFromUnknown(value[key]);
        if (text) {
            return cleanText(text);
        }
    }
    return "";
}
99
/**
 * Finds the first usable URL inside an arbitrarily shaped JSON value.
 *
 * Strings are resolved against `baseUrl` with absolutizeUrl (imported from
 * ../shared.js). Arrays are searched item by item. Records are searched
 * through their `url`, `@id`, `contentUrl` and `thumbnailUrl` properties, in
 * that order.
 *
 * @param {unknown} value - Value to search.
 * @param {string} baseUrl - Base for resolving relative URLs.
 * @returns {string} The first truthy resolved URL, or "" when none is found.
 */
function firstUrl(value, baseUrl) {
    if (typeof value === "string") {
        return absolutizeUrl(value, baseUrl);
    }
    const isList = Array.isArray(value);
    if (!isList && !isRecord(value)) {
        return "";
    }
    // Arrays are walked by index, records by the fixed list of URL-bearing keys.
    const keys = isList ? value.keys() : ["url", "@id", "contentUrl", "thumbnailUrl"];
    for (const key of keys) {
        const resolved = firstUrl(value[key], baseUrl);
        if (resolved) {
            return resolved;
        }
    }
    return "";
}
119
/**
 * Returns the `@type` names declared on a JSON-LD record.
 *
 * @param {object} record - JSON-LD record; `@type` may be a single value or an array.
 * @returns {string[]} The non-empty type names, normalized with cleanText.
 */
function typeNames(record) {
    const declared = record["@type"];
    const candidates = Array.isArray(declared) ? declared : [declared];
    return candidates.map((candidate) => cleanText(candidate)).filter(Boolean);
}
126
/**
 * Flattens a parsed JSON-LD value into `out`.
 *
 * Every record encountered is appended to `out`. The walk then descends into
 * array-valued `@graph` and `itemListElement` and into record-valued `item`
 * and `offers`. Other properties are not followed.
 *
 * @param {unknown} value - Parsed JSON-LD value (record, array, or anything else).
 * @param {object[]} out - Accumulator, mutated in place.
 * @returns {void}
 */
function collectJsonRecords(value, out) {
    if (Array.isArray(value)) {
        value.forEach((entry) => collectJsonRecords(entry, out));
        return;
    }
    if (!isRecord(value)) {
        return;
    }
    out.push(value);
    for (const listKey of ["@graph", "itemListElement"]) {
        if (Array.isArray(value[listKey])) {
            collectJsonRecords(value[listKey], out);
        }
    }
    for (const recordKey of ["item", "offers"]) {
        if (isRecord(value[recordKey])) {
            collectJsonRecords(value[recordKey], out);
        }
    }
}
144
/**
 * Parses every `<script type="application/ld+json">` block in an HTML document
 * and returns the records found in them, flattened by collectJsonRecords.
 *
 * @param {string} html - HTML source.
 * @returns {object[]} All JSON-LD records found; blocks that fail to parse are skipped.
 */
function extractJsonLdRecords(html) {
    const scriptPattern = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
    const records = [];
    for (const [, body] of html.matchAll(scriptPattern)) {
        const payload = asString(body).trim();
        if (payload) {
            try {
                collectJsonRecords(JSON.parse(payload), records);
            }
            catch {
                // Malformed JSON-LD is common in the wild; skip the block.
            }
        }
    }
    return records;
}
159
/**
 * Walks a JSON value and collects prose-like strings into `out`.
 *
 * Primitive values are entity-decoded, tag-stripped and whitespace-normalized.
 * They are kept when the result is at least 24 characters long and does not
 * start with "http://" or "https://". Arrays and records are walked
 * recursively; values nested deeper than 6 levels are ignored.
 *
 * @param {unknown} value - Value to walk.
 * @param {string[]} out - Accumulator, mutated in place.
 * @param {number} [depth=0] - Current nesting depth.
 * @returns {void}
 */
function interestingText(value, out, depth = 0) {
    if (depth > 6) {
        return;
    }
    const kind = typeof value;
    if (kind === "string" || kind === "number" || kind === "boolean") {
        const text = cleanText(stripTags(decodeHtml(String(value))));
        const isBareUrl = /^https?:\/\//i.test(text);
        if (text.length >= 24 && !isBareUrl) {
            out.push(text);
        }
        return;
    }
    let children;
    if (Array.isArray(value)) {
        children = value;
    }
    else if (isRecord(value)) {
        children = Object.values(value);
    }
    else {
        return;
    }
    for (const child of children) {
        interestingText(child, out, depth + 1);
    }
}
180
/**
 * Decodes the inside of a JSON string literal (without its surrounding quotes),
 * resolving escapes such as `\n` and `\u0041`.
 *
 * @param {string} value - Escaped string body.
 * @returns {string} The decoded string, or "" when the body is not valid JSON string content.
 */
function decodeJsonStringLiteral(value) {
    let decoded = "";
    try {
        decoded = JSON.parse(`"${value}"`);
    }
    catch {
        // Not a valid string body; fall through with the empty default.
    }
    return decoded;
}
188
/**
 * Collects prose-like text embedded in Next.js hydration payloads.
 *
 * Two sources are read: the JSON body of the `__NEXT_DATA__` script tag, and
 * the string chunks passed to `self.__next_f.push([1, "..."])` calls. Text is
 * gathered with interestingText, de-duplicated and joined with spaces.
 *
 * @param {string} html - HTML source.
 * @returns {string} The combined text, or "" when nothing usable is found.
 */
function extractNextPayloadText(html) {
    const fragments = [];
    const nextDataBody = html.match(/<script[^>]*id=["']__NEXT_DATA__["'][^>]*>([\s\S]*?)<\/script>/i)?.[1];
    if (nextDataBody) {
        try {
            interestingText(JSON.parse(nextDataBody), fragments);
        }
        catch {
            // Invalid payload; nothing to collect from it.
        }
    }
    for (const [, chunk] of html.matchAll(/self\.__next_f\.push\(\[1,\s*(?:\\?")([\s\S]*?)(?:\\?")\]\)/g)) {
        const decoded = decodeJsonStringLiteral(asString(chunk));
        if (decoded) {
            interestingText(decoded, fragments);
        }
    }
    return [...new Set(fragments)].join(" ").trim();
}
207
/**
 * Formats the `aggregateRating` of a JSON-LD record as "<value> (<count>)".
 *
 * @param {object} record - JSON-LD record.
 * @returns {string} The rating value, followed by the review (or rating) count
 *   in parentheses when present; "" when there is no rating value.
 */
function ratingText(record) {
    const { aggregateRating } = record;
    if (!isRecord(aggregateRating)) {
        return "";
    }
    const score = cleanText(aggregateRating.ratingValue);
    if (!score) {
        return "";
    }
    const votes = cleanText(aggregateRating.reviewCount || aggregateRating.ratingCount);
    return votes ? `${score} (${votes})` : score;
}
215
/**
 * Builds a "structured_product" result from JSON-LD Product records.
 *
 * The Product record whose name shares the most tokens with the page title is
 * used (the first one wins ties). The result is rejected (`null`) when:
 *   - no Product record exists,
 *   - no title can be determined,
 *   - none of price, description and rating is available,
 *   - there is no strong product signal (product-like URL, at least two tokens
 *     shared between page title and product title, or a
 *     `product:price:amount` meta property), or
 *   - the page carries more than four Product records.
 *
 * @param {object} context - Fetch context; reads `url`, `resolvedUrl`,
 *   `errors` and `active.content` (the page HTML).
 * @param {object[]} records - JSON-LD records extracted from the page.
 * @returns {object|null} The normalized result, or `null` when not applicable.
 */
function productFallback(context, records) {
    const html = context.active.content;
    const baseUrl = context.resolvedUrl || context.url;
    const pageTitle = cleanText(extractMetaProperty(html, "og:title") || extractTitleFromHtml(html));
    const productRecords = records.filter((record) => typeNames(record).some((type) => /Product/i.test(type)));
    // Highest title overlap wins; the strict comparison keeps the earliest record on ties.
    let product;
    let bestScore = -1;
    for (const record of productRecords) {
        const score = overlapCount(pageTitle, cleanText(textFromUnknown(record.name)));
        if (score > bestScore) {
            bestScore = score;
            product = record;
        }
    }
    if (!product) {
        return null;
    }
    const firstOffer = Array.isArray(product.offers) ? product.offers[0] : product.offers;
    const offer = isRecord(firstOffer) ? firstOffer : {};
    const title = cleanText(textFromUnknown(product.name) || extractMetaProperty(html, "og:title") || extractTitleFromHtml(html));
    const description = cleanText(textFromUnknown(product.description) || extractMetaProperty(html, "og:description") || extractMetaDescription(html));
    const price = cleanText(textFromUnknown(offer.price) || textFromUnknown(product.price));
    // The last path segment of the availability value is used as the label.
    const availability = cleanText(textFromUnknown(offer.availability).split("/").at(-1) || "");
    const image = firstUrl(product.image, baseUrl)
        || absolutizeUrl(extractMetaProperty(html, "og:image") || extractMetaName(html, "twitter:image"), baseUrl);
    const url = firstUrl(product.url, baseUrl) || context.url;
    const rating = ratingText(product);
    const strongSignal = looksProductLikeUrl(baseUrl)
        || overlapCount(pageTitle, title) >= 2
        || Boolean(extractMetaProperty(html, "product:price:amount"));
    if (!title) {
        return null;
    }
    const hasDetails = Boolean(price || description || rating);
    if (!hasDetails || !strongSignal || productRecords.length > 4) {
        return null;
    }
    const summaryLines = [
        description,
        price ? `Price: ${price}` : "",
        rating ? `Rating: ${rating}` : "",
        availability ? `Availability: ${availability}` : "",
    ].filter(Boolean);
    return {
        post: {
            url: context.url,
            canonical_url: url,
            title,
            description,
            text: summaryLines.join("\n") || description,
            ...(price ? { price } : {}),
            ...(availability ? { availability } : {}),
            ...(rating ? { rating } : {}),
            ...(image ? { image } : {}),
            kind: "structured_product",
            extractor: "jsonld_product",
            status: "ok",
        },
        thread: [],
        comments: [],
        outbound_links: dedupeUrls([url, image, ...extractUrls(description)].filter(Boolean)),
        partial: false,
        errors: context.errors,
        method: "generic_jsonld_product",
    };
}
260
/**
 * Converts the `itemListElement` entries of a JSON-LD list into thread items.
 *
 * Each element (or its nested `item` record, when present) must provide both
 * a title and a URL; others are dropped. `index` is the element's 1-based
 * position in the original list, so dropped elements leave gaps.
 *
 * @param {object} record - JSON-LD record carrying `itemListElement`.
 * @param {string} baseUrl - Base for resolving relative item URLs.
 * @returns {object[]} Items with `index`, `title`, `url`, optional `price`
 *   and `snippet`, and `source`.
 */
function collectionItems(record, baseUrl) {
    const elements = Array.isArray(record.itemListElement) ? record.itemListElement : [];
    const items = [];
    elements.forEach((element, position) => {
        if (!isRecord(element)) {
            return;
        }
        const node = isRecord(element.item) ? element.item : element;
        const title = cleanText(textFromUnknown(node.name) || textFromUnknown(node.headline));
        const url = firstUrl(node.url, baseUrl);
        if (!title || !url) {
            return;
        }
        const firstOffer = Array.isArray(node.offers) ? node.offers[0] : node.offers;
        const offer = isRecord(firstOffer) ? firstOffer : {};
        const price = cleanText(textFromUnknown(offer.price) || textFromUnknown(node.price));
        const snippet = truncate(cleanText(textFromUnknown(node.description)), 220);
        const item = { index: position + 1, title, url };
        if (price) {
            item.price = price;
        }
        if (snippet) {
            item.snippet = snippet;
        }
        item.source = "jsonld_itemlist";
        items.push(item);
    });
    return items;
}
284
/**
 * Builds a "structured_collection" result from a JSON-LD CollectionPage or
 * ItemList record.
 *
 * The first matching record with an array `itemListElement` is used and at
 * most 12 items are kept. The result is rejected (`null`) when fewer than two
 * usable items exist, or when there is no strong collection signal
 * (collection-like URL, a listing keyword in the title, or at least four items).
 *
 * @param {object} context - Fetch context; reads `url`, `resolvedUrl`,
 *   `errors` and `active.content` (the page HTML).
 * @param {object[]} records - JSON-LD records extracted from the page.
 * @returns {object|null} The normalized result, or `null` when not applicable.
 */
function collectionFallback(context, records) {
    const isListRecord = (record) => Array.isArray(record.itemListElement)
        && typeNames(record).some((type) => /(?:CollectionPage|ItemList)/i.test(type));
    // typeNames is evaluated first in the original predicate; both checks are side-effect free.
    const collection = records.find((record) => typeNames(record).some((type) => /(?:CollectionPage|ItemList)/i.test(type)) && isListRecord(record));
    if (!collection) {
        return null;
    }
    const html = context.active.content;
    const baseUrl = context.resolvedUrl || context.url;
    const thread = collectionItems(collection, baseUrl).slice(0, 12);
    if (thread.length < 2) {
        return null;
    }
    // The hostname fallback is only evaluated when every other title source is empty.
    const title = cleanText(textFromUnknown(collection.name) || extractMetaProperty(html, "og:title") || extractTitleFromHtml(html) || `Items from ${(new URL(context.url)).hostname}`);
    const description = cleanText(textFromUnknown(collection.description) || extractMetaProperty(html, "og:description") || extractMetaDescription(html));
    const strongSignal = looksCollectionLikeUrl(baseUrl)
        || /results|search|catalog|collection|products|items|shop/i.test(title)
        || thread.length >= 4;
    if (!strongSignal) {
        return null;
    }
    const summaryLines = thread.slice(0, 8).map((item, index) => {
        const price = cleanText(item.price);
        return `${index + 1}. ${cleanText(item.title)}${price ? ` — ${price}` : ""}`;
    });
    return {
        post: {
            url: context.url,
            title,
            description,
            text: summaryLines.join("\n"),
            item_count: thread.length,
            kind: "structured_collection",
            extractor: "jsonld_itemlist",
            status: "ok",
        },
        thread,
        comments: [],
        outbound_links: dedupeUrls(thread.map((item) => cleanText(item.url))),
        partial: false,
        errors: context.errors,
        method: "generic_jsonld_itemlist",
    };
}
317
/**
 * Builds an article result from JSON-LD article data or, failing that, from
 * text recovered out of a Next.js hydration payload.
 *
 * JSON-LD records are scored on their type, the presence of `articleBody`,
 * `headline` and `description`, and on how well they match the current page
 * (exact URL, title token overlap, slug token overlap). Only records that
 * match the page by at least one of those three criteria are considered.
 *
 * The result is rejected (`null`) when:
 *   - no candidate text exists,
 *   - only Next.js payload text exists but the page is not a Next.js shell or
 *     the text is shorter than 60 characters, or
 *   - structured text exists but is shorter than
 *     max(current article text length + 120, 280) characters.
 *
 * @param {object} context - Fetch context; reads `url`, `resolvedUrl`,
 *   `errors`, `active.content` (the page HTML) and `active.links`.
 * @param {object} article - Previously extracted article; reads `title`,
 *   `description`, `text`, `author`, `image` and `published`.
 * @param {object[]} records - JSON-LD records extracted from the page.
 * @param {string} nextPayloadText - Text recovered from Next.js payloads.
 * @returns {object|null} The normalized result, or `null` when not applicable.
 */
function articleFallback(context, article, records, nextPayloadText) {
    const pageTitle = cleanText(article.title || extractMetaProperty(context.active.content, "og:title") || extractTitleFromHtml(context.active.content));
    const pageSlug = slugWords(context.url);
    const ranked = records
        .map((record) => {
        const title = cleanText(textFromUnknown(record.headline) || textFromUnknown(record.name));
        const recordUrl = firstUrl(record.url || record["@id"], context.resolvedUrl || context.url);
        const titleOverlap = overlapCount(pageTitle, title);
        const slugOverlap = overlapCount(pageSlug, `${title} ${recordUrl}`);
        // A record without a URL must never count as an exact URL match.
        // Without this guard, an empty record URL compared equal to an empty
        // `context.resolvedUrl`, so every URL-less record passed the filter.
        const normalizedRecordUrl = cleanText(recordUrl);
        const exactUrlMatch = normalizedRecordUrl !== ""
            && (normalizedRecordUrl === cleanText(context.url) || normalizedRecordUrl === cleanText(context.resolvedUrl));
        const score = typeNames(record).reduce((sum, type) => sum + (/Article|NewsArticle|BlogPosting|DiscussionForumPosting|SocialMediaPosting/i.test(type) ? 5 : 0), 0)
            + (cleanText(record.articleBody).length > 0 ? 4 : 0)
            + (cleanText(record.headline).length > 0 ? 2 : 0)
            + (cleanText(record.description).length > 0 ? 1 : 0)
            + (exactUrlMatch ? 6 : 0)
            + (titleOverlap >= 2 ? 4 : 0)
            + (slugOverlap >= 2 ? 2 : 0);
        return { record, score, exactUrlMatch, titleOverlap, slugOverlap };
    })
        // Keep only records that demonstrably describe this page.
        .filter((item) => item.score > 0 && (item.exactUrlMatch || item.titleOverlap >= 2 || item.slugOverlap >= 2))
        .sort((left, right) => right.score - left.score);
    const best = ranked[0]?.record;
    const structuredTitle = best ? cleanText(textFromUnknown(best.headline) || textFromUnknown(best.name)) : "";
    const structuredDescription = best ? cleanText(textFromUnknown(best.description)) : "";
    const structuredText = best ? cleanText(textFromUnknown(best.articleBody) || textFromUnknown(best.text)) : "";
    const candidateText = structuredText || nextPayloadText;
    const currentTextLength = cleanText(article.text).length;
    if (!candidateText)
        return null;
    if (!structuredText) {
        // Payload text is only trusted on pages that really are Next.js shells.
        const nextShell = /<script[^>]*id=["']__NEXT_DATA__["']|self\.__next_f\.push\(/i.test(context.active.content);
        if (!nextShell || candidateText.length < 60)
            return null;
    }
    else if (candidateText.length < Math.max(currentTextLength + 120, 280)) {
        // Structured text must be clearly longer than what was already extracted.
        return null;
    }
    const image = best ? firstUrl(best.image, context.resolvedUrl || context.url) : "";
    return {
        post: {
            url: context.url,
            title: structuredTitle || article.title || extractTitleFromHtml(context.active.content),
            description: structuredDescription || article.description || extractMetaProperty(context.active.content, "og:description") || extractMetaDescription(context.active.content),
            text: candidateText.slice(0, 50000),
            author: article.author,
            image: image || article.image,
            published: article.published,
            extractor: structuredText ? "jsonld_article" : "nextjs_payload",
            status: "ok",
        },
        thread: [],
        comments: [],
        outbound_links: dedupeUrls([...(context.active.links || []), ...extractUrls(candidateText), image].filter(Boolean)),
        partial: false,
        errors: context.errors,
        method: structuredText ? "generic_jsonld_article" : "generic_nextjs_payload",
    };
}
375
/**
 * Builds a "structured_profile" result from a JSON-LD Person record.
 *
 * The first Person record is used. The result is rejected (`null`) when no
 * Person record exists, the URL does not look like a profile page, the person
 * has no name, or the page already produced at least 400 characters of
 * article text while the record adds neither a job title nor an affiliation.
 *
 * @param {object} context - Fetch context; reads `url`, `resolvedUrl`,
 *   `errors` and `active.content` (the page HTML).
 * @param {object} article - Previously extracted article; reads `text`.
 * @param {object[]} records - JSON-LD records extracted from the page.
 * @returns {object|null} The normalized result, or `null` when not applicable.
 */
function profileFallback(context, article, records) {
    const profile = records.find((record) => typeNames(record).some((type) => /Person/i.test(type)));
    const baseUrl = context.resolvedUrl || context.url;
    if (!profile || !looksProfileLikeUrl(baseUrl)) {
        return null;
    }
    const html = context.active.content;
    const name = cleanText(textFromUnknown(profile.name));
    const jobTitle = cleanText(textFromUnknown(profile.jobTitle));
    const description = cleanText(textFromUnknown(profile.description) || extractMetaProperty(html, "og:description") || extractMetaDescription(html));
    const affiliation = cleanText(textFromUnknown(profile.worksFor) || textFromUnknown(profile.alumniOf));
    if (!name) {
        return null;
    }
    const addsNothing = !jobTitle && !affiliation;
    if (addsNothing && cleanText(article.text).length >= 400) {
        return null;
    }
    const image = firstUrl(profile.image, baseUrl);
    const headline = [jobTitle, affiliation].filter(Boolean).join(" · ");
    const bodyLines = [
        description,
        jobTitle ? `Role: ${jobTitle}` : "",
        affiliation ? `Affiliation: ${affiliation}` : "",
    ].filter(Boolean);
    return {
        post: {
            url: context.url,
            title: name,
            description: headline || description,
            text: bodyLines.join("\n") || description,
            ...(image ? { image } : {}),
            kind: "structured_profile",
            extractor: "jsonld_person",
            status: "ok",
        },
        thread: [],
        comments: [],
        outbound_links: dedupeUrls([firstUrl(profile.url, baseUrl), image].filter(Boolean)),
        partial: false,
        errors: context.errors,
        method: "generic_jsonld_profile",
    };
}
407
/**
 * Tries the structured-data fallbacks against the active page content, in
 * this order: collection, product, article, profile. The first one that
 * produces a result wins.
 *
 * @param {object} context - Fetch context; reads `active.content` here and is
 *   passed through to the individual fallbacks.
 * @param {object} article - Previously extracted article data.
 * @returns {object|null} The first fallback result, or `null` when the content
 *   does not look like HTML or no fallback applies.
 */
export function tryStructuredGenericFallback(context, article) {
    const html = context.active.content;
    const looksLikeHtml = /<(?:html|body|script|main|article|meta)\b/i.test(html);
    if (!looksLikeHtml) {
        return null;
    }
    const records = extractJsonLdRecords(html);
    const nextPayloadText = extractNextPayloadText(html);
    return collectionFallback(context, records)
        || productFallback(context, records)
        || articleFallback(context, article, records, nextPayloadText)
        || profileFallback(context, article, records);
}
424
/**
 * Parses the attributes of a single tag string into a plain object.
 *
 * Attribute names are lower-cased; values may be double-quoted, single-quoted
 * or bare, and are entity-decoded and trimmed. Attributes without a value map
 * to "". A token named exactly "link" is skipped.
 *
 * @param {string} tag - Raw tag text, e.g. `<link rel="alternate" href="/feed">`.
 * @returns {Object<string, string>} Attribute name to value.
 */
function parseLinkAttributes(tag) {
    const attributePattern = /([^\s=/>]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
    const parsed = {};
    for (const [, rawKey, doubleQuoted, singleQuoted, bare] of tag.matchAll(attributePattern)) {
        const key = cleanText(rawKey).toLowerCase();
        if (key && key !== "link") {
            parsed[key] = decodeHtml(String(doubleQuoted ?? singleQuoted ?? bare ?? "")).trim();
        }
    }
    return parsed;
}
435
/**
 * Reports whether two absolute URLs share the same origin (scheme, host, port).
 *
 * @param {string} left - First URL.
 * @param {string} right - Second URL.
 * @returns {boolean} `true` when the origins are equal; `false` otherwise,
 *   including when either URL cannot be parsed.
 */
function sameOrigin(left, right) {
    let first;
    let second;
    try {
        first = new URL(left);
        second = new URL(right);
    }
    catch {
        return false;
    }
    return first.origin === second.origin;
}
443
/**
 * Lists conventional feed locations at the origin of `url`.
 *
 * @param {string} url - Absolute URL.
 * @returns {string[]} Feed URL guesses ("/rss", "/feed", "/rss.xml",
 *   "/atom.xml", "/index.xml"), or an empty array when `url` cannot be parsed.
 */
function originCandidates(url) {
    let origin;
    try {
        ({ origin } = new URL(url));
    }
    catch {
        return [];
    }
    const wellKnownPaths = ["/rss", "/feed", "/rss.xml", "/atom.xml", "/index.xml"];
    return wellKnownPaths.map((path) => `${origin}${path}`);
}
452
/**
 * Finds same-origin feed URLs advertised by `<link rel="alternate">` tags.
 *
 * A tag qualifies when its `rel` contains "alternate" and its `type` mentions
 * rss, atom or xml. Hrefs are resolved against `pageUrl`, and only those on
 * the same origin as the page are kept.
 *
 * @param {string} html - HTML source.
 * @param {string} pageUrl - URL of the page, used as base and origin reference.
 * @returns {string[]} De-duplicated feed URLs.
 */
function htmlFeedCandidates(html, pageUrl) {
    const found = [];
    for (const [tag] of html.matchAll(/<link\b[^>]*>/gi)) {
        const attributes = parseLinkAttributes(asString(tag));
        const rel = cleanText(attributes.rel).toLowerCase();
        const type = cleanText(attributes.type).toLowerCase();
        const advertisesFeed = rel.includes("alternate") && /(rss|atom|xml)/i.test(type);
        if (!advertisesFeed) {
            continue;
        }
        const href = absolutizeUrl(attributes.href || "", pageUrl);
        if (href && sameOrigin(href, pageUrl)) {
            found.push(href);
        }
    }
    return dedupeUrls(found);
}
468
/**
 * Returns the text content of the first of `tagNames` found in an XML block.
 *
 * Tags are tried in the given order. The content of the first tag with a
 * non-empty body is unwrapped from CDATA, entity-decoded, stripped of markup
 * and whitespace-normalized.
 *
 * Two fixes over the previous pattern:
 *   - CDATA sections (`<![CDATA[...]]>`), common in RSS titles and
 *     descriptions, are unwrapped before tag stripping. stripTags is imported
 *     from ../shared.js and not visible here; presumably a generic tag
 *     stripper treats the CDATA wrapper as markup — TODO confirm.
 *   - The tag name must be followed by whitespace or ">", so "content" no
 *     longer matches "<content:encoded>" and "id" no longer matches
 *     "<identifier>".
 *
 * @param {string} block - XML fragment to search.
 * @param {string[]} tagNames - Tag names in order of preference.
 * @returns {string} The cleaned text, or "" when no listed tag has a body.
 */
function extractXmlTag(block, tagNames) {
    for (const tag of tagNames) {
        const match = block.match(new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)</${tag}>`, "i"));
        if (!match?.[1])
            continue;
        const unwrapped = match[1].replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1");
        return cleanText(stripTags(decodeHtml(unwrapped)));
    }
    return "";
}
476
/**
 * Picks the `href` of a `<link>` element inside an XML block.
 *
 * When `preferredRels` is non-empty, only links whose `rel` is in that list
 * are accepted. Per RFC 4287 section 4.2.7.2, an Atom link without a `rel`
 * attribute is interpreted as rel="alternate"; previously such links were
 * skipped when "alternate" was requested.
 *
 * When `preferredRels` is empty, the first link that has an `href` is returned.
 *
 * @param {string} block - XML fragment to search.
 * @param {string[]} preferredRels - Acceptable `rel` values (case-insensitive);
 *   empty to accept any link.
 * @returns {string} The chosen href (unresolved), or "" when none qualifies.
 */
function extractLinkHref(block, preferredRels) {
    const relWanted = preferredRels.map((value) => value.toLowerCase());
    const matches = Array.from(block.matchAll(/<link\b[^>]+href=["']([^"']+)["'][^>]*>/gi));
    for (const match of matches) {
        const attributes = parseLinkAttributes(asString(match[0]));
        // A missing rel defaults to "alternate" (RFC 4287).
        const rel = cleanText(attributes.rel).toLowerCase() || "alternate";
        if (relWanted.length > 0 && !relWanted.includes(rel))
            continue;
        if (attributes.href)
            return attributes.href;
    }
    if (relWanted.length > 0)
        return "";
    // No parsed href was usable; fall back to the raw capture of the first link.
    return cleanText(matches[0]?.[1] || "");
}
491
/**
 * Resolves the first usable URL carried by one of `tagNames` in an XML block.
 *
 * For each tag, in order, three sources are tried:
 *   1. for "link" only, the href chosen by extractLinkHref (honoring
 *      `preferredRels`);
 *   2. the `href` attribute of the first such tag that has one;
 *   3. the tag's text content, entity-decoded and stripped of markup.
 * Each candidate is resolved against `baseUrl` with absolutizeUrl (imported
 * from ../shared.js); the first truthy result is returned.
 *
 * @param {string} block - XML fragment to search.
 * @param {string[]} tagNames - Tag names in order of preference.
 * @param {string} baseUrl - Base for resolving relative URLs.
 * @param {string[]} [preferredRels=[]] - `rel` values passed to extractLinkHref.
 * @returns {string} The resolved URL, or "" when nothing usable is found.
 */
function extractXmlLink(block, tagNames, baseUrl, preferredRels = []) {
    for (const tag of tagNames) {
        const hrefPattern = new RegExp(`<${tag}[^>]+href=["']([^"']+)["'][^>]*>`, "i");
        const bodyPattern = new RegExp(`<${tag}[^>]*>([\\s\\S]*?)<\/${tag}>`, "i");
        // Each resolver is evaluated lazily so later sources are only touched on a miss.
        const resolvers = [
            () => (tag === "link" ? absolutizeUrl(extractLinkHref(block, preferredRels), baseUrl) : ""),
            () => {
                const rawHref = block.match(hrefPattern)?.[1];
                return rawHref ? absolutizeUrl(rawHref, baseUrl) : "";
            },
            () => {
                const rawBody = block.match(bodyPattern)?.[1];
                return rawBody ? absolutizeUrl(stripTags(decodeHtml(rawBody)), baseUrl) : "";
            },
        ];
        for (const resolve of resolvers) {
            const absolute = resolve();
            if (absolute) {
                return absolute;
            }
        }
    }
    return "";
}
514
/**
 * Parses RSS or Atom markup into a small feed description.
 *
 * Content containing `<rss` or `<channel` is read as RSS (items are taken
 * from inside the first `<channel>`, or from the whole document when no
 * channel element is found). Otherwise content containing `<feed` is read as
 * Atom. Entries lacking a title or URL are dropped.
 *
 * @param {string} xml - Feed source.
 * @param {string} baseUrl - URL the feed was loaded from; used to resolve
 *   relative links and as the feed URL of last resort.
 * @returns {{title: string, description: string, url: string, format: string, entries: object[]}|null}
 *   The parsed feed, or `null` when the input is empty, is neither RSS nor
 *   Atom, or yields no usable entries.
 */
function parseFeed(xml, baseUrl) {
    const source = xml.trim();
    if (!source) {
        return null;
    }
    const isRss = /<rss\b|<channel\b/i.test(source);
    if (!isRss && !/<feed\b/i.test(source)) {
        return null;
    }
    if (isRss) {
        const channel = source.match(/<channel\b[^>]*>([\s\S]*?)<\/channel>/i)?.[1] || source;
        const title = extractXmlTag(channel, ["title"]);
        const description = extractXmlTag(channel, ["description"]);
        const url = extractXmlLink(channel, ["link"], baseUrl) || baseUrl;
        const entries = [];
        for (const [, captured] of channel.matchAll(/<item\b[^>]*>([\s\S]*?)<\/item>/gi)) {
            const block = captured || "";
            const entry = {
                title: extractXmlTag(block, ["title"]),
                url: extractXmlLink(block, ["link", "guid"], url || baseUrl),
                snippet: truncate(extractXmlTag(block, ["description", "content:encoded"]), 220),
                published: extractXmlTag(block, ["pubDate", "dc:date"]),
            };
            if (entry.title && entry.url) {
                entries.push(entry);
            }
        }
        if (entries.length === 0) {
            return null;
        }
        return { title, description, url, format: "rss", entries };
    }
    const title = extractXmlTag(source, ["title"]);
    const description = extractXmlTag(source, ["subtitle"]);
    // Prefer the human-facing "alternate" link, then any link, then the feed URL itself.
    const url = extractXmlLink(source, ["link"], baseUrl, ["alternate"]) || extractXmlLink(source, ["link"], baseUrl) || baseUrl;
    const entries = [];
    for (const [, captured] of source.matchAll(/<entry\b[^>]*>([\s\S]*?)<\/entry>/gi)) {
        const block = captured || "";
        const entry = {
            title: extractXmlTag(block, ["title"]),
            url: extractXmlLink(block, ["link"], url || baseUrl, ["alternate"]) || extractXmlLink(block, ["link", "id"], url || baseUrl),
            snippet: truncate(extractXmlTag(block, ["summary", "content"]), 220),
            published: extractXmlTag(block, ["updated", "published"]),
        };
        if (entry.title && entry.url) {
            entries.push(entry);
        }
    }
    if (entries.length === 0) {
        return null;
    }
    return { title, description, url, format: "atom", entries };
}
552
/**
 * Converts a parsed feed into the normalized "feed_index" result shape.
 *
 * The post text lists up to 8 entries ("N. title — date"); the thread carries
 * up to 12 entries; `item_count` and `outbound_links` cover every entry.
 *
 * @param {object} context - Fetch context; reads `url`, `errors` and `active.content`.
 * @param {object} article - Previously extracted article; reads `title` and
 *   `description` as fallbacks.
 * @param {object} feed - Result of parseFeed.
 * @param {string} method - Value stored as the result's `method`.
 * @param {string} sourceUrl - URL the feed was read from.
 * @returns {object} The normalized result.
 */
function feedNormalization(context, article, feed, method, sourceUrl) {
    const { entries } = feed;
    const title = feed.title || article.title || extractTitleFromHtml(context.active.content) || sourceUrl;
    const description = feed.description || article.description;
    const summaryLines = entries.slice(0, 8).map((entry, index) => {
        const dateSuffix = entry.published ? ` — ${entry.published}` : "";
        return `${index + 1}. ${entry.title}${dateSuffix}`;
    });
    const thread = entries.slice(0, 12).map((entry, index) => {
        const item = { index: index + 1, title: entry.title, url: entry.url };
        if (entry.snippet) {
            item.snippet = entry.snippet;
        }
        if (entry.published) {
            item.published = entry.published;
        }
        item.source = "feed";
        return item;
    });
    return {
        post: {
            url: context.url,
            canonical_url: feed.url || sourceUrl,
            title,
            description,
            text: summaryLines.join("\n"),
            feed_url: sourceUrl,
            feed_format: feed.format,
            item_count: entries.length,
            kind: "feed_index",
            extractor: "feed_discovery",
            status: "ok",
        },
        thread,
        comments: [],
        outbound_links: dedupeUrls([sourceUrl, ...entries.map((entry) => entry.url)]),
        partial: false,
        errors: context.errors,
        method,
    };
}
584
/**
 * Produces a feed-index result for the current page when possible.
 *
 * 1. If the active content itself parses as RSS or Atom, it is used directly.
 * 2. Otherwise, when the extracted article text is shorter than 320
 *    characters and the URL looks like an index page, up to six same-origin
 *    feed candidates are fetched one at a time. These are feeds advertised in
 *    the HTML plus conventional origin paths. The first one that parses wins.
 *
 * fetchProviderText is imported from ./provider-policy.js and is not visible
 * here; this function relies only on its result exposing `ok` and `text`.
 *
 * @param {object} context - Fetch context; reads `url`, `resolvedUrl`,
 *   `timeoutMs`, `errors` and `active.content`.
 * @param {object} article - Previously extracted article data.
 * @returns {Promise<object|null>} The normalized feed result, or `null` when
 *   no feed is found.
 */
export async function tryFeedFallback(context, article) {
    const pageUrl = context.resolvedUrl || context.url;
    const direct = parseFeed(context.active.content, pageUrl);
    if (direct) {
        return feedNormalization(context, article, direct, "rss_feed_direct", pageUrl);
    }
    const hasEnoughText = cleanText(article.text).length >= 320;
    if (hasEnoughText || !looksFeedDiscoveryPage(pageUrl)) {
        return null;
    }
    const advertised = htmlFeedCandidates(context.active.content, pageUrl);
    const guessed = originCandidates(pageUrl);
    const candidates = dedupeUrls([...advertised, ...guessed]).slice(0, 6);
    const requestInit = {
        headers: {
            accept: "application/rss+xml,application/atom+xml,application/xml,text/xml,text/plain,*/*",
        },
    };
    const policy = {
        mode: "same-origin",
        sourceUrl: pageUrl,
    };
    // Candidates are tried sequentially so the search stops at the first usable feed.
    for (const candidate of candidates) {
        if (sameOrigin(candidate, pageUrl)) {
            const fetched = await fetchProviderText(candidate, Math.min(context.timeoutMs, 12000), requestInit, policy);
            if (fetched.ok) {
                const parsed = parseFeed(fetched.text, candidate);
                if (parsed) {
                    return feedNormalization(context, article, parsed, "rss_feed_discovery", candidate);
                }
            }
        }
    }
    return null;
}
616
+ //# sourceMappingURL=generic-fallbacks.js.map