@steipete/summarize-core 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. package/README.md +7 -0
  2. package/dist/esm/content/index.js +5 -0
  3. package/dist/esm/content/index.js.map +1 -0
  4. package/dist/esm/content/link-preview/client.js +28 -0
  5. package/dist/esm/content/link-preview/client.js.map +1 -0
  6. package/dist/esm/content/link-preview/content/article.js +155 -0
  7. package/dist/esm/content/link-preview/content/article.js.map +1 -0
  8. package/dist/esm/content/link-preview/content/cleaner.js +55 -0
  9. package/dist/esm/content/link-preview/content/cleaner.js.map +1 -0
  10. package/dist/esm/content/link-preview/content/constants.js +7 -0
  11. package/dist/esm/content/link-preview/content/constants.js.map +1 -0
  12. package/dist/esm/content/link-preview/content/fetcher.js +124 -0
  13. package/dist/esm/content/link-preview/content/fetcher.js.map +1 -0
  14. package/dist/esm/content/link-preview/content/firecrawl.js +86 -0
  15. package/dist/esm/content/link-preview/content/firecrawl.js.map +1 -0
  16. package/dist/esm/content/link-preview/content/html.js +162 -0
  17. package/dist/esm/content/link-preview/content/html.js.map +1 -0
  18. package/dist/esm/content/link-preview/content/index.js +345 -0
  19. package/dist/esm/content/link-preview/content/index.js.map +1 -0
  20. package/dist/esm/content/link-preview/content/jsonld.js +77 -0
  21. package/dist/esm/content/link-preview/content/jsonld.js.map +1 -0
  22. package/dist/esm/content/link-preview/content/parsers.js +77 -0
  23. package/dist/esm/content/link-preview/content/parsers.js.map +1 -0
  24. package/dist/esm/content/link-preview/content/podcast-utils.js +79 -0
  25. package/dist/esm/content/link-preview/content/podcast-utils.js.map +1 -0
  26. package/dist/esm/content/link-preview/content/readability.js +53 -0
  27. package/dist/esm/content/link-preview/content/readability.js.map +1 -0
  28. package/dist/esm/content/link-preview/content/twitter-utils.js +68 -0
  29. package/dist/esm/content/link-preview/content/twitter-utils.js.map +1 -0
  30. package/dist/esm/content/link-preview/content/types.js +4 -0
  31. package/dist/esm/content/link-preview/content/types.js.map +1 -0
  32. package/dist/esm/content/link-preview/content/utils.js +164 -0
  33. package/dist/esm/content/link-preview/content/utils.js.map +1 -0
  34. package/dist/esm/content/link-preview/content/video.js +96 -0
  35. package/dist/esm/content/link-preview/content/video.js.map +1 -0
  36. package/dist/esm/content/link-preview/content/youtube.js +82 -0
  37. package/dist/esm/content/link-preview/content/youtube.js.map +1 -0
  38. package/dist/esm/content/link-preview/deps.js +20 -0
  39. package/dist/esm/content/link-preview/deps.js.map +1 -0
  40. package/dist/esm/content/link-preview/fetch-with-timeout.js +35 -0
  41. package/dist/esm/content/link-preview/fetch-with-timeout.js.map +1 -0
  42. package/dist/esm/content/link-preview/types.js +2 -0
  43. package/dist/esm/content/link-preview/types.js.map +1 -0
  44. package/dist/esm/content/transcript/cache.js +79 -0
  45. package/dist/esm/content/transcript/cache.js.map +1 -0
  46. package/dist/esm/content/transcript/index.js +130 -0
  47. package/dist/esm/content/transcript/index.js.map +1 -0
  48. package/dist/esm/content/transcript/normalize.js +43 -0
  49. package/dist/esm/content/transcript/normalize.js.map +1 -0
  50. package/dist/esm/content/transcript/providers/generic.js +11 -0
  51. package/dist/esm/content/transcript/providers/generic.js.map +1 -0
  52. package/dist/esm/content/transcript/providers/podcast/apple-flow.js +222 -0
  53. package/dist/esm/content/transcript/providers/podcast/apple-flow.js.map +1 -0
  54. package/dist/esm/content/transcript/providers/podcast/apple.js +38 -0
  55. package/dist/esm/content/transcript/providers/podcast/apple.js.map +1 -0
  56. package/dist/esm/content/transcript/providers/podcast/constants.js +8 -0
  57. package/dist/esm/content/transcript/providers/podcast/constants.js.map +1 -0
  58. package/dist/esm/content/transcript/providers/podcast/flow-context.js +2 -0
  59. package/dist/esm/content/transcript/providers/podcast/flow-context.js.map +1 -0
  60. package/dist/esm/content/transcript/providers/podcast/itunes.js +134 -0
  61. package/dist/esm/content/transcript/providers/podcast/itunes.js.map +1 -0
  62. package/dist/esm/content/transcript/providers/podcast/json.js +34 -0
  63. package/dist/esm/content/transcript/providers/podcast/json.js.map +1 -0
  64. package/dist/esm/content/transcript/providers/podcast/media.js +345 -0
  65. package/dist/esm/content/transcript/providers/podcast/media.js.map +1 -0
  66. package/dist/esm/content/transcript/providers/podcast/results.js +28 -0
  67. package/dist/esm/content/transcript/providers/podcast/results.js.map +1 -0
  68. package/dist/esm/content/transcript/providers/podcast/rss.js +253 -0
  69. package/dist/esm/content/transcript/providers/podcast/rss.js.map +1 -0
  70. package/dist/esm/content/transcript/providers/podcast/spotify-flow.js +218 -0
  71. package/dist/esm/content/transcript/providers/podcast/spotify-flow.js.map +1 -0
  72. package/dist/esm/content/transcript/providers/podcast/spotify.js +113 -0
  73. package/dist/esm/content/transcript/providers/podcast/spotify.js.map +1 -0
  74. package/dist/esm/content/transcript/providers/podcast.js +222 -0
  75. package/dist/esm/content/transcript/providers/podcast.js.map +1 -0
  76. package/dist/esm/content/transcript/providers/youtube/api.js +257 -0
  77. package/dist/esm/content/transcript/providers/youtube/api.js.map +1 -0
  78. package/dist/esm/content/transcript/providers/youtube/apify.js +55 -0
  79. package/dist/esm/content/transcript/providers/youtube/apify.js.map +1 -0
  80. package/dist/esm/content/transcript/providers/youtube/captions.js +409 -0
  81. package/dist/esm/content/transcript/providers/youtube/captions.js.map +1 -0
  82. package/dist/esm/content/transcript/providers/youtube/yt-dlp.js +166 -0
  83. package/dist/esm/content/transcript/providers/youtube/yt-dlp.js.map +1 -0
  84. package/dist/esm/content/transcript/providers/youtube.js +173 -0
  85. package/dist/esm/content/transcript/providers/youtube.js.map +1 -0
  86. package/dist/esm/content/transcript/types.js +2 -0
  87. package/dist/esm/content/transcript/types.js.map +1 -0
  88. package/dist/esm/content/transcript/utils.js +259 -0
  89. package/dist/esm/content/transcript/utils.js.map +1 -0
  90. package/dist/esm/index.js +4 -0
  91. package/dist/esm/index.js.map +1 -0
  92. package/dist/esm/language.js +126 -0
  93. package/dist/esm/language.js.map +1 -0
  94. package/dist/esm/prompts/cli.js +20 -0
  95. package/dist/esm/prompts/cli.js.map +1 -0
  96. package/dist/esm/prompts/file.js +48 -0
  97. package/dist/esm/prompts/file.js.map +1 -0
  98. package/dist/esm/prompts/index.js +4 -0
  99. package/dist/esm/prompts/index.js.map +1 -0
  100. package/dist/esm/prompts/link-summary.js +116 -0
  101. package/dist/esm/prompts/link-summary.js.map +1 -0
  102. package/dist/esm/shared/contracts.js +2 -0
  103. package/dist/esm/shared/contracts.js.map +1 -0
  104. package/dist/esm/transcription/whisper/constants.js +8 -0
  105. package/dist/esm/transcription/whisper/constants.js.map +1 -0
  106. package/dist/esm/transcription/whisper/core.js +303 -0
  107. package/dist/esm/transcription/whisper/core.js.map +1 -0
  108. package/dist/esm/transcription/whisper/fal.js +41 -0
  109. package/dist/esm/transcription/whisper/fal.js.map +1 -0
  110. package/dist/esm/transcription/whisper/ffmpeg.js +179 -0
  111. package/dist/esm/transcription/whisper/ffmpeg.js.map +1 -0
  112. package/dist/esm/transcription/whisper/openai.js +47 -0
  113. package/dist/esm/transcription/whisper/openai.js.map +1 -0
  114. package/dist/esm/transcription/whisper/types.js +2 -0
  115. package/dist/esm/transcription/whisper/types.js.map +1 -0
  116. package/dist/esm/transcription/whisper/utils.js +63 -0
  117. package/dist/esm/transcription/whisper/utils.js.map +1 -0
  118. package/dist/esm/transcription/whisper/whisper-cpp.js +227 -0
  119. package/dist/esm/transcription/whisper/whisper-cpp.js.map +1 -0
  120. package/dist/esm/transcription/whisper.js +5 -0
  121. package/dist/esm/transcription/whisper.js.map +1 -0
  122. package/dist/types/content/index.d.ts +5 -0
  123. package/dist/types/content/link-preview/client.d.ts +18 -0
  124. package/dist/types/content/link-preview/content/article.d.ts +4 -0
  125. package/dist/types/content/link-preview/content/cleaner.d.ts +12 -0
  126. package/dist/types/content/link-preview/content/constants.d.ts +6 -0
  127. package/dist/types/content/link-preview/content/fetcher.d.ts +16 -0
  128. package/dist/types/content/link-preview/content/firecrawl.d.ts +14 -0
  129. package/dist/types/content/link-preview/content/html.d.ts +17 -0
  130. package/dist/types/content/link-preview/content/index.d.ts +4 -0
  131. package/dist/types/content/link-preview/content/jsonld.d.ts +6 -0
  132. package/dist/types/content/link-preview/content/parsers.d.ts +7 -0
  133. package/dist/types/content/link-preview/content/podcast-utils.d.ts +7 -0
  134. package/dist/types/content/link-preview/content/readability.d.ts +8 -0
  135. package/dist/types/content/link-preview/content/twitter-utils.d.ts +4 -0
  136. package/dist/types/content/link-preview/content/types.d.ts +61 -0
  137. package/dist/types/content/link-preview/content/utils.d.ts +17 -0
  138. package/dist/types/content/link-preview/content/video.d.ts +5 -0
  139. package/dist/types/content/link-preview/content/youtube.d.ts +1 -0
  140. package/dist/types/content/link-preview/deps.d.ts +167 -0
  141. package/dist/types/content/link-preview/fetch-with-timeout.d.ts +4 -0
  142. package/dist/types/content/link-preview/types.d.ts +37 -0
  143. package/dist/types/content/transcript/cache.d.ts +29 -0
  144. package/dist/types/content/transcript/index.d.ts +9 -0
  145. package/dist/types/content/transcript/normalize.d.ts +3 -0
  146. package/dist/types/content/transcript/providers/generic.d.ts +3 -0
  147. package/dist/types/content/transcript/providers/podcast/apple-flow.d.ts +4 -0
  148. package/dist/types/content/transcript/providers/podcast/apple.d.ts +6 -0
  149. package/dist/types/content/transcript/providers/podcast/constants.d.ts +7 -0
  150. package/dist/types/content/transcript/providers/podcast/flow-context.d.ts +11 -0
  151. package/dist/types/content/transcript/providers/podcast/itunes.d.ts +17 -0
  152. package/dist/types/content/transcript/providers/podcast/json.d.ts +8 -0
  153. package/dist/types/content/transcript/providers/podcast/media.d.ts +42 -0
  154. package/dist/types/content/transcript/providers/podcast/results.d.ts +10 -0
  155. package/dist/types/content/transcript/providers/podcast/rss.d.ts +22 -0
  156. package/dist/types/content/transcript/providers/podcast/spotify-flow.d.ts +3 -0
  157. package/dist/types/content/transcript/providers/podcast/spotify.d.ts +24 -0
  158. package/dist/types/content/transcript/providers/podcast.d.ts +20 -0
  159. package/dist/types/content/transcript/providers/youtube/api.d.ts +26 -0
  160. package/dist/types/content/transcript/providers/youtube/apify.d.ts +1 -0
  161. package/dist/types/content/transcript/providers/youtube/captions.d.ts +7 -0
  162. package/dist/types/content/transcript/providers/youtube/yt-dlp.d.ts +17 -0
  163. package/dist/types/content/transcript/providers/youtube.d.ts +3 -0
  164. package/dist/types/content/transcript/types.d.ts +30 -0
  165. package/dist/types/content/transcript/utils.d.ts +8 -0
  166. package/dist/types/index.d.ts +4 -0
  167. package/dist/types/language.d.ts +25 -0
  168. package/dist/types/prompts/cli.d.ts +10 -0
  169. package/dist/types/prompts/file.d.ts +17 -0
  170. package/dist/types/prompts/index.d.ts +4 -0
  171. package/dist/types/prompts/link-summary.d.ts +29 -0
  172. package/dist/types/shared/contracts.d.ts +2 -0
  173. package/dist/types/transcription/whisper/constants.d.ts +7 -0
  174. package/dist/types/transcription/whisper/core.d.ts +20 -0
  175. package/dist/types/transcription/whisper/fal.d.ts +1 -0
  176. package/dist/types/transcription/whisper/ffmpeg.d.ts +16 -0
  177. package/dist/types/transcription/whisper/openai.d.ts +2 -0
  178. package/dist/types/transcription/whisper/types.d.ts +17 -0
  179. package/dist/types/transcription/whisper/utils.d.ts +5 -0
  180. package/dist/types/transcription/whisper/whisper-cpp.d.ts +9 -0
  181. package/dist/types/transcription/whisper.d.ts +5 -0
  182. package/package.json +54 -0
@@ -0,0 +1,162 @@
1
+ import { resolveTranscriptForLink } from '../../transcript/index.js';
2
+ import { extractYouTubeVideoId, isYouTubeUrl, isYouTubeVideoUrl } from '../../transcript/utils.js';
3
+ import { extractArticleContent, sanitizeHtmlForMarkdownConversion } from './article.js';
4
+ import { normalizeForPrompt } from './cleaner.js';
5
+ import { MIN_HTML_CONTENT_CHARACTERS, MIN_METADATA_DESCRIPTION_CHARACTERS, MIN_READABILITY_CONTENT_CHARACTERS, READABILITY_RELATIVE_THRESHOLD, } from './constants.js';
6
+ import { extractJsonLdContent } from './jsonld.js';
7
+ import { extractMetadataFromHtml } from './parsers.js';
8
+ import { isPodcastHost, isPodcastLikeJsonLdType } from './podcast-utils.js';
9
+ import { extractReadabilityFromHtml, toReadabilityHtml } from './readability.js';
10
+ import { ensureTranscriptDiagnostics, finalizeExtractedLinkContent, pickFirstText, selectBaseContent, } from './utils.js';
11
+ import { detectPrimaryVideoFromHtml } from './video.js';
12
+ import { extractYouTubeShortDescription } from './youtube.js';
13
/** Matches a leading run of whitespace and Unicode control (`Cc`) characters. */
const LEADING_CONTROL_PATTERN = /^[\s\p{Cc}]+/u;

/**
 * Removes a duplicated title from the beginning of `content`.
 *
 * The comparison ignores case and leading whitespace in `content`, and
 * surrounding whitespace in `title`. When the content does not start with the
 * title (or either argument is empty/nullish) the original `content` value is
 * returned untouched.
 *
 * @param {string} content - Text that may begin with the title.
 * @param {string | null | undefined} title - Title to strip.
 * @returns {string} The content without the leading title and without the
 *   whitespace/control characters that directly followed it.
 */
function stripLeadingTitle(content, title) {
    if (!(content && title)) {
        return content;
    }
    const normalizedTitle = title.trim();
    if (normalizedTitle.length === 0) {
        return content;
    }
    const trimmedContent = content.trimStart();
    const foldedTitle = normalizedTitle.toLowerCase();
    if (!trimmedContent.toLowerCase().startsWith(foldedTitle)) {
        return content;
    }
    // Lower-casing can change a string's length (e.g. "İ" becomes "i" plus a
    // combining dot), so the title's length is not a reliable offset into the
    // original-case content. Walk the content code point by code point until
    // the lower-cased prefix covers the lower-cased title, and cut there.
    let foldedLength = 0;
    let titleEnd = 0;
    for (const codePoint of trimmedContent) {
        if (foldedLength >= foldedTitle.length) {
            break;
        }
        foldedLength += codePoint.toLowerCase().length;
        titleEnd += codePoint.length;
    }
    return trimmedContent.slice(titleEnd).replace(LEADING_CONTROL_PATTERN, '');
}
30
/**
 * Runs the optional HTML→Markdown conversion step.
 *
 * Failures while sanitizing, converting or normalizing are caught and reported
 * through the diagnostics notes rather than thrown.
 *
 * @returns {Promise<{ diagnostics: { requested: boolean, used: boolean, provider: string | null, notes: string | null }, content: string | null }>}
 *   `content` is the normalized Markdown when the conversion was used,
 *   otherwise `null`.
 */
async function convertDocumentToMarkdown({ url, html, readabilityHtml, markdownRequested, markdownMode, title, siteName, timeoutMs, deps, }) {
    // Shared shape for every outcome in which Markdown is not used.
    const unused = (requested, notes) => ({
        diagnostics: { requested, used: false, provider: null, notes },
        content: null,
    });
    if (!markdownRequested) {
        return unused(false, null);
    }
    if (isYouTubeUrl(url)) {
        return unused(true, 'Skipping Markdown conversion for YouTube URLs');
    }
    if (!deps.convertHtmlToMarkdown) {
        return unused(true, 'No HTML→Markdown converter configured');
    }
    try {
        const useReadabilityInput = markdownMode === 'readability' && readabilityHtml;
        const sanitizedHtml = sanitizeHtmlForMarkdownConversion(useReadabilityInput ? readabilityHtml : html);
        const markdown = await deps.convertHtmlToMarkdown({
            url,
            html: sanitizedHtml,
            title,
            siteName,
            timeoutMs,
        });
        const normalizedMarkdown = normalizeForPrompt(markdown);
        if (normalizedMarkdown.length === 0) {
            return unused(true, 'HTML→Markdown conversion returned empty content');
        }
        return {
            diagnostics: {
                requested: true,
                used: true,
                provider: 'llm',
                notes: useReadabilityInput ? 'Readability HTML used for markdown input' : null,
            },
            content: normalizedMarkdown,
        };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return unused(true, `HTML→Markdown conversion failed: ${message}`);
    }
}

/**
 * Builds the extracted link content for an already-fetched HTML document.
 *
 * Candidate texts are gathered from the raw HTML, the readability extraction
 * (HTML and plain text) and the merged JSON-LD/meta description; one is chosen
 * using the length thresholds from `constants.js`. The transcript resolution,
 * an optional YouTube short description and an optional Markdown conversion
 * can then replace that choice before the result is finalized.
 *
 * @param {object} params
 * @param {string} params.url - Address of the document.
 * @param {string} params.html - Raw HTML of the document.
 * @param {*} params.cacheMode - Forwarded to transcript resolution; `'default'` is used for diagnostics when nullish.
 * @param {*} params.maxCharacters - Forwarded to `finalizeExtractedLinkContent`.
 * @param {*} params.youtubeTranscriptMode - Forwarded to transcript resolution.
 * @param {*} params.firecrawlDiagnostics - Reported unchanged under `diagnostics.firecrawl`.
 * @param {boolean} params.markdownRequested - Whether Markdown output was requested.
 * @param {string} params.markdownMode - `'readability'` feeds the readability HTML (when available) to the converter.
 * @param {*} params.timeoutMs - Forwarded to the Markdown converter.
 * @param {object} params.deps - Dependencies; `convertHtmlToMarkdown` is optional.
 * @param {*} [params.readabilityCandidate] - Pre-computed readability result; extracted from `html` when nullish.
 * @returns {Promise<*>} Whatever `finalizeExtractedLinkContent` returns.
 * @throws {Error} When the URL is a YouTube video URL without an extractable video id.
 */
export async function buildResultFromHtmlDocument({ url, html, cacheMode, maxCharacters, youtubeTranscriptMode, firecrawlDiagnostics, markdownRequested, markdownMode, timeoutMs, deps, readabilityCandidate, }) {
    if (isYouTubeVideoUrl(url) && !extractYouTubeVideoId(url)) {
        throw new Error('Invalid YouTube video id in URL');
    }

    // Metadata: JSON-LD values take precedence over the parsed meta values.
    const { title: metaTitle, description: metaDescription, siteName } = extractMetadataFromHtml(html, url);
    const jsonLd = extractJsonLdContent(html);
    const mergedTitle = pickFirstText([jsonLd?.title, metaTitle]);
    const mergedDescription = pickFirstText([jsonLd?.description, metaDescription]);
    const resolvedTitle = mergedTitle ?? metaTitle;
    const podcastJsonLd = isPodcastLikeJsonLdType(jsonLd?.type);

    // Body text candidates.
    const readability = readabilityCandidate ?? (await extractReadabilityFromHtml(html, url));
    const readabilityText = readability?.text ? normalizeForPrompt(readability.text) : '';
    const readabilityHtml = toReadabilityHtml(readability);
    const htmlSegments = normalizeForPrompt(extractArticleContent(html));
    const readabilitySegments = readabilityHtml
        ? normalizeForPrompt(extractArticleContent(readabilityHtml))
        : '';

    // A readability candidate wins when it is long enough on its own and the
    // raw HTML extraction is either too short or not substantially longer.
    const beatsRawHtml = (candidate) => candidate.length >= MIN_READABILITY_CONTENT_CHARACTERS &&
        (htmlSegments.length < MIN_HTML_CONTENT_CHARACTERS ||
            candidate.length >= htmlSegments.length * READABILITY_RELATIVE_THRESHOLD);
    const preferReadabilityHtml = beatsRawHtml(readabilitySegments);
    const preferReadabilityText = !preferReadabilityHtml && beatsRawHtml(readabilityText);
    const preferReadability = preferReadabilityHtml || preferReadabilityText;
    const normalizedSegments = preferReadabilityHtml ? readabilitySegments : htmlSegments;
    const effectiveNormalized = preferReadabilityText ? readabilityText : normalizedSegments;

    // The description replaces the body for podcast pages, or when no
    // readability candidate won and the body is short relative to it.
    const descriptionCandidate = mergedDescription ? normalizeForPrompt(mergedDescription) : '';
    let preferDescription = false;
    if (descriptionCandidate.length >= MIN_METADATA_DESCRIPTION_CHARACTERS) {
        if (podcastJsonLd || isPodcastHost(url)) {
            preferDescription = true;
        }
        else if (!preferReadability) {
            preferDescription =
                effectiveNormalized.length < MIN_HTML_CONTENT_CHARACTERS ||
                    descriptionCandidate.length >=
                        effectiveNormalized.length * READABILITY_RELATIVE_THRESHOLD;
        }
    }
    const pageContent = preferDescription ? descriptionCandidate : effectiveNormalized;

    const transcriptResolution = await resolveTranscriptForLink(url, html, deps, {
        youtubeTranscriptMode,
        cacheMode,
    });
    // The YouTube short description is only consulted when no transcript text was resolved.
    const youtubeDescription = transcriptResolution.text === null ? extractYouTubeShortDescription(html) : null;
    const baseCandidate = youtubeDescription ? normalizeForPrompt(youtubeDescription) : pageContent;

    let baseContent = selectBaseContent(baseCandidate, transcriptResolution.text);
    if (baseContent === normalizedSegments) {
        // Only segment-derived content gets its leading title removed.
        baseContent = stripLeadingTitle(baseContent, resolvedTitle);
    }
    const transcriptDiagnostics = ensureTranscriptDiagnostics(transcriptResolution, cacheMode ?? 'default');

    const markdownOutcome = await convertDocumentToMarkdown({
        url,
        html,
        readabilityHtml,
        markdownRequested,
        markdownMode,
        title: resolvedTitle,
        siteName,
        timeoutMs,
        deps,
    });
    if (markdownOutcome.content !== null) {
        // A successful conversion overrides whatever was selected above.
        baseContent = markdownOutcome.content;
    }

    const video = detectPrimaryVideoFromHtml(html, url);
    const isVideoOnly = !transcriptResolution.text && baseContent.length < MIN_HTML_CONTENT_CHARACTERS && video !== null;

    return finalizeExtractedLinkContent({
        url,
        baseContent,
        maxCharacters,
        title: resolvedTitle,
        description: mergedDescription ?? metaDescription,
        siteName,
        transcriptResolution,
        video,
        isVideoOnly,
        diagnostics: {
            strategy: 'html',
            firecrawl: firecrawlDiagnostics,
            markdown: markdownOutcome.diagnostics,
            transcript: transcriptDiagnostics,
        },
    });
}
162
+ //# sourceMappingURL=html.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/html.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,2BAA2B,CAAA;AACpE,OAAO,EAAE,qBAAqB,EAAE,YAAY,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAA;AAGlG,OAAO,EAAE,qBAAqB,EAAE,iCAAiC,EAAE,MAAM,cAAc,CAAA;AACvF,OAAO,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAA;AACjD,OAAO,EACL,2BAA2B,EAC3B,mCAAmC,EACnC,kCAAkC,EAClC,8BAA8B,GAC/B,MAAM,gBAAgB,CAAA;AACvB,OAAO,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAA;AAClD,OAAO,EAAE,uBAAuB,EAAE,MAAM,cAAc,CAAA;AACtD,OAAO,EAAE,aAAa,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAA;AAC3E,OAAO,EAAE,0BAA0B,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAA;AAEhF,OAAO,EACL,2BAA2B,EAC3B,4BAA4B,EAC5B,aAAa,EACb,iBAAiB,GAClB,MAAM,YAAY,CAAA;AACnB,OAAO,EAAE,0BAA0B,EAAE,MAAM,YAAY,CAAA;AACvD,OAAO,EAAE,8BAA8B,EAAE,MAAM,cAAc,CAAA;AAE7D,MAAM,uBAAuB,GAAG,eAAe,CAAA;AAE/C,SAAS,iBAAiB,CAAC,OAAe,EAAE,KAAgC;IAC1E,IAAI,CAAC,CAAC,OAAO,IAAI,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,eAAe,GAAG,KAAK,CAAC,IAAI,EAAE,CAAA;IACpC,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACjC,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,cAAc,GAAG,OAAO,CAAC,SAAS,EAAE,CAAA;IAC1C,IAAI,CAAC,cAAc,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,eAAe,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;QAC5E,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,iBAAiB,GAAG,cAAc,CAAC,KAAK,CAAC,eAAe,CAAC,MAAM,CAAC,CAAA;IACtE,MAAM,SAAS,GAAG,iBAAiB,CAAC,OAAO,CAAC,uBAAuB,EAAE,EAAE,CAAC,CAAA;IACxE,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,2BAA2B,CAAC,EAChD,GAAG,EACH,IAAI,EACJ,SAAS,EACT,aAAa,EACb,qBAAqB,EACrB,oBAAoB,EACpB,iBAAiB,EACjB,YAAY,EACZ,SAAS,EACT,IAAI,EACJ,oBAAoB,GAarB;IACC,IAAI,iBAAiB,CAAC,GAAG,CAAC,IAAI,CAAC,qBAAqB,CAAC,GAAG,CAAC,EAAE,CAAC;QAC1D,MAAM,IAAI,KAAK,CAAC,iCAAiC,CAAC,CAAA;IACpD,CAAC;IAED,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,GAAG,uBAAuB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAA;IAC3E,MAAM,MAAM,GAAG,oBAAoB,CAAC,IAAI,CAAC,CAAA;IACzC,MAAM,WAAW,GAAG,aAAa,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAA;IACzD,MAAM,iBAAiB,GAAG,aAAa,CAAC,CAAC,MAAM,EAAE,WAAW,EAAE,WAAW,CAAC,CAAC,CAAA;IAC3E,MAA
M,eAAe,GAAG,uBAAuB,CAAC,MAAM,EAAE,IAAI,CAAC,CAAA;IAC7D,MAAM,WAAW,GAAG,oBAAoB,IAAI,CAAC,MAAM,0BAA0B,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,CAAA;IACzF,MAAM,eAAe,GAAG,WAAW,EAAE,IAAI,CAAC,CAAC,CAAC,kBAAkB,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;IACrF,MAAM,eAAe,GAAG,iBAAiB,CAAC,WAAW,CAAC,CAAA;IAEtD,MAAM,0BAA0B,GAAG,kBAAkB,CAAC,qBAAqB,CAAC,IAAI,CAAC,CAAC,CAAA;IAClF,MAAM,qCAAqC,GAAG,eAAe;QAC3D,CAAC,CAAC,kBAAkB,CAAC,qBAAqB,CAAC,eAAe,CAAC,CAAC;QAC5D,CAAC,CAAC,EAAE,CAAA;IACN,MAAM,qBAAqB,GACzB,qCAAqC,CAAC,MAAM,IAAI,kCAAkC;QAClF,CAAC,0BAA0B,CAAC,MAAM,GAAG,2BAA2B;YAC9D,qCAAqC,CAAC,MAAM;gBAC1C,0BAA0B,CAAC,MAAM,GAAG,8BAA8B,CAAC,CAAA;IACzE,MAAM,kBAAkB,GAAG,qBAAqB;QAC9C,CAAC,CAAC,qCAAqC;QACvC,CAAC,CAAC,0BAA0B,CAAA;IAE9B,MAAM,qBAAqB,GACzB,CAAC,qBAAqB;QACtB,eAAe,CAAC,MAAM,IAAI,kCAAkC;QAC5D,CAAC,0BAA0B,CAAC,MAAM,GAAG,2BAA2B;YAC9D,eAAe,CAAC,MAAM,IAAI,0BAA0B,CAAC,MAAM,GAAG,8BAA8B,CAAC,CAAA;IACjG,MAAM,iBAAiB,GAAG,qBAAqB,IAAI,qBAAqB,CAAA;IACxE,MAAM,mBAAmB,GAAG,qBAAqB,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,kBAAkB,CAAA;IACxF,MAAM,oBAAoB,GAAG,iBAAiB,CAAC,CAAC,CAAC,kBAAkB,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;IAC3F,MAAM,iBAAiB,GACrB,oBAAoB,CAAC,MAAM,IAAI,mCAAmC;QAClE,CAAC,eAAe;YACd,aAAa,CAAC,GAAG,CAAC;YAClB,CAAC,CAAC,iBAAiB;gBACjB,CAAC,mBAAmB,CAAC,MAAM,GAAG,2BAA2B;oBACvD,oBAAoB,CAAC,MAAM;wBACzB,mBAAmB,CAAC,MAAM,GAAG,8BAA8B,CAAC,CAAC,CAAC,CAAA;IACxE,MAAM,kCAAkC,GAAG,iBAAiB;QAC1D,CAAC,CAAC,oBAAoB;QACtB,CAAC,CAAC,mBAAmB,CAAA;IACvB,MAAM,oBAAoB,GAAG,MAAM,wBAAwB,CAAC,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE;QAC3E,qBAAqB;QACrB,SAAS;KACV,CAAC,CAAA;IAEF,MAAM,kBAAkB,GACtB,oBAAoB,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC,CAAC,8BAA8B,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAA;IAClF,MAAM,aAAa,GAAG,kBAAkB;QACtC,CAAC,CAAC,kBAAkB,CAAC,kBAAkB,CAAC;QACxC,CAAC,CAAC,kCAAkC,CAAA;IAEtC,IAAI,WAAW,GAAG,iBAAiB,CAAC,aAAa,EAAE,oBAAoB,CAAC,IAAI,CAAC,CAAA;IAC7E,IAAI,WAAW,KAAK,kBAAkB,EAAE,CAAC;QACvC,WAAW,GAAG,iBAAiB,CAAC,WAAW,EAAE,WAAW,IAAI,KAAK,CAAC,CAAA;IACpE,CAAC;IAED,MAAM,qBAAqB,GAAG,2BAA2B,CACvD,oBAAoB,EACpB,SAAS,IAAI,SAAS,CACvB,CAAA;IAED,MAA
M,mBAAmB,GAAwB,MAAM,CAAC,KAAK,IAAI,EAAE;QACjE,IAAI,CAAC,iBAAiB,EAAE,CAAC;YACvB,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAA;QACvE,CAAC;QAED,IAAI,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO;gBACL,SAAS,EAAE,IAAI;gBACf,IAAI,EAAE,KAAK;gBACX,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,+CAA+C;aACvD,CAAA;QACH,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,qBAAqB,EAAE,CAAC;YAChC,OAAO;gBACL,SAAS,EAAE,IAAI;gBACf,IAAI,EAAE,KAAK;gBACX,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,uCAAuC;aAC/C,CAAA;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,eAAe,GACnB,YAAY,KAAK,aAAa,IAAI,eAAe,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAA;YAC5E,MAAM,aAAa,GAAG,iCAAiC,CAAC,eAAe,CAAC,CAAA;YACxE,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,qBAAqB,CAAC;gBAChD,GAAG;gBACH,IAAI,EAAE,aAAa;gBACnB,KAAK,EAAE,WAAW,IAAI,KAAK;gBAC3B,QAAQ;gBACR,SAAS;aACV,CAAC,CAAA;YACF,MAAM,kBAAkB,GAAG,kBAAkB,CAAC,QAAQ,CAAC,CAAA;YACvD,IAAI,kBAAkB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACpC,OAAO;oBACL,SAAS,EAAE,IAAI;oBACf,IAAI,EAAE,KAAK;oBACX,QAAQ,EAAE,IAAI;oBACd,KAAK,EAAE,iDAAiD;iBACzD,CAAA;YACH,CAAC;YAED,WAAW,GAAG,kBAAkB,CAAA;YAChC,OAAO;gBACL,SAAS,EAAE,IAAI;gBACf,IAAI,EAAE,IAAI;gBACV,QAAQ,EAAE,KAAK;gBACf,KAAK,EACH,YAAY,KAAK,aAAa,IAAI,eAAe;oBAC/C,CAAC,CAAC,0CAA0C;oBAC5C,CAAC,CAAC,IAAI;aACX,CAAA;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;YACtE,OAAO;gBACL,SAAS,EAAE,IAAI;gBACf,IAAI,EAAE,KAAK;gBACX,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,oCAAoC,OAAO,EAAE;aACrD,CAAA;QACH,CAAC;IACH,CAAC,CAAC,EAAE,CAAA;IAEJ,MAAM,KAAK,GAAG,0BAA0B,CAAC,IAAI,EAAE,GAAG,CAAC,CAAA;IACnD,MAAM,WAAW,GACf,CAAC,oBAAoB,CAAC,IAAI,IAAI,WAAW,CAAC,MAAM,GAAG,2BAA2B,IAAI,KAAK,KAAK,IAAI,CAAA;IAElG,OAAO,4BAA4B,CAAC;QAClC,GAAG;QACH,WAAW;QACX,aAAa;QACb,KAAK,EAAE,WAAW,IAAI,KAAK;QAC3B,WAAW,EAAE,iBAAiB,IAAI,WAAW;QAC7C,QAAQ;QACR,oBAAoB;QACpB,KAAK;QACL,WAAW;QACX,WAAW,EAAE;YACX,QAAQ,EAAE,MAAM;YAChB,SAAS,EAAE,oBAAoB;YAC/B,QAAQ,EAAE,mBAAmB;YAC7B,UAAU,EAAE,qBAAqB;SAClC;KACF,CAAC,CAAA;AACJ,CAAC"}
@@ -0,0 +1,345 @@
1
+ import { resolveTranscriptForLink } from '../../transcript/index.js';
2
+ import { isYouTubeUrl } from '../../transcript/utils.js';
3
+ import { normalizeForPrompt } from './cleaner.js';
4
+ import { MIN_READABILITY_CONTENT_CHARACTERS } from './constants.js';
5
+ import { fetchHtmlDocument, fetchWithFirecrawl } from './fetcher.js';
6
+ import { buildResultFromFirecrawl, shouldFallbackToFirecrawl } from './firecrawl.js';
7
+ import { buildResultFromHtmlDocument } from './html.js';
8
+ import { extractApplePodcastIds, extractSpotifyEpisodeId } from './podcast-utils.js';
9
+ import { extractReadabilityFromHtml } from './readability.js';
10
+ import { isAnubisHtml, isBlockedTwitterContent, isTwitterStatusUrl, toNitterUrls, } from './twitter-utils.js';
11
+ import { appendNote, ensureTranscriptDiagnostics, finalizeExtractedLinkContent, resolveCacheMode, resolveFirecrawlMode, resolveMaxCharacters, resolveTimeoutMs, selectBaseContent, } from './utils.js';
12
+ export async function fetchLinkContent(url, options, deps) {
13
+ const timeoutMs = resolveTimeoutMs(options);
14
+ const cacheMode = resolveCacheMode(options);
15
+ const maxCharacters = resolveMaxCharacters(options);
16
+ const youtubeTranscriptMode = options?.youtubeTranscript ?? 'auto';
17
+ const firecrawlMode = resolveFirecrawlMode(options);
18
+ const markdownRequested = (options?.format ?? 'text') === 'markdown';
19
+ const markdownMode = options?.markdownMode ?? 'auto';
20
+ const canUseFirecrawl = firecrawlMode !== 'off' && deps.scrapeWithFirecrawl !== null && !isYouTubeUrl(url);
21
+ const spotifyEpisodeId = extractSpotifyEpisodeId(url);
22
+ if (spotifyEpisodeId) {
23
+ if (!deps.openaiApiKey && !deps.falApiKey) {
24
+ throw new Error('Spotify episode transcription requires OPENAI_API_KEY or FAL_KEY (Whisper); otherwise you may only get a captcha/recaptcha HTML page.');
25
+ }
26
+ const transcriptResolution = await resolveTranscriptForLink(url, null, deps, {
27
+ youtubeTranscriptMode,
28
+ cacheMode,
29
+ });
30
+ if (!transcriptResolution.text) {
31
+ const notes = transcriptResolution.diagnostics?.notes;
32
+ const suffix = notes ? ` (${notes})` : '';
33
+ throw new Error(`Failed to transcribe Spotify episode${suffix}`);
34
+ }
35
+ const transcriptDiagnostics = ensureTranscriptDiagnostics(transcriptResolution, cacheMode ?? 'default');
36
+ transcriptDiagnostics.notes = appendNote(transcriptDiagnostics.notes, 'Spotify episode: skipped HTML fetch to avoid captcha pages');
37
+ return finalizeExtractedLinkContent({
38
+ url,
39
+ baseContent: selectBaseContent('', transcriptResolution.text),
40
+ maxCharacters,
41
+ title: null,
42
+ description: null,
43
+ siteName: 'Spotify',
44
+ transcriptResolution,
45
+ video: null,
46
+ isVideoOnly: false,
47
+ diagnostics: {
48
+ strategy: 'html',
49
+ firecrawl: {
50
+ attempted: false,
51
+ used: false,
52
+ cacheMode,
53
+ cacheStatus: cacheMode === 'bypass' ? 'bypassed' : 'unknown',
54
+ notes: 'Spotify short-circuit skipped HTML/Firecrawl',
55
+ },
56
+ markdown: {
57
+ requested: markdownRequested,
58
+ used: false,
59
+ provider: null,
60
+ notes: 'Spotify short-circuit uses transcript content',
61
+ },
62
+ transcript: transcriptDiagnostics,
63
+ },
64
+ });
65
+ }
66
+ const appleIds = extractApplePodcastIds(url);
67
+ if (appleIds) {
68
+ if (!deps.openaiApiKey && !deps.falApiKey) {
69
+ throw new Error('Apple Podcasts transcription requires OPENAI_API_KEY or FAL_KEY (Whisper); otherwise you may only get a slow/blocked HTML page.');
70
+ }
71
+ const transcriptResolution = await resolveTranscriptForLink(url, null, deps, {
72
+ youtubeTranscriptMode,
73
+ cacheMode,
74
+ });
75
+ if (!transcriptResolution.text) {
76
+ const notes = transcriptResolution.diagnostics?.notes;
77
+ const suffix = notes ? ` (${notes})` : '';
78
+ throw new Error(`Failed to transcribe Apple Podcasts episode${suffix}`);
79
+ }
80
+ const transcriptDiagnostics = ensureTranscriptDiagnostics(transcriptResolution, cacheMode ?? 'default');
81
+ transcriptDiagnostics.notes = appendNote(transcriptDiagnostics.notes, 'Apple Podcasts: skipped HTML fetch (prefer iTunes lookup / enclosures)');
82
+ return finalizeExtractedLinkContent({
83
+ url,
84
+ baseContent: selectBaseContent('', transcriptResolution.text),
85
+ maxCharacters,
86
+ title: null,
87
+ description: null,
88
+ siteName: 'Apple Podcasts',
89
+ transcriptResolution,
90
+ video: null,
91
+ isVideoOnly: false,
92
+ diagnostics: {
93
+ strategy: 'html',
94
+ firecrawl: {
95
+ attempted: false,
96
+ used: false,
97
+ cacheMode,
98
+ cacheStatus: cacheMode === 'bypass' ? 'bypassed' : 'unknown',
99
+ notes: 'Apple Podcasts short-circuit skipped HTML/Firecrawl',
100
+ },
101
+ markdown: {
102
+ requested: markdownRequested,
103
+ used: false,
104
+ provider: null,
105
+ notes: 'Apple Podcasts short-circuit uses transcript content',
106
+ },
107
+ transcript: transcriptDiagnostics,
108
+ },
109
+ });
110
+ }
111
+ let firecrawlAttempted = false;
112
+ let firecrawlPayload = null;
113
+ const firecrawlDiagnostics = {
114
+ attempted: false,
115
+ used: false,
116
+ cacheMode,
117
+ cacheStatus: cacheMode === 'bypass' ? 'bypassed' : 'unknown',
118
+ notes: null,
119
+ };
120
+ const twitterStatus = isTwitterStatusUrl(url);
121
+ const nitterUrls = twitterStatus ? toNitterUrls(url) : [];
122
+ let birdError = null;
123
+ let nitterError = null;
124
+ const attemptFirecrawl = async (reason) => {
125
+ if (!canUseFirecrawl) {
126
+ return null;
127
+ }
128
+ if (!firecrawlAttempted) {
129
+ const attempt = await fetchWithFirecrawl(url, deps.scrapeWithFirecrawl, {
130
+ timeoutMs,
131
+ cacheMode,
132
+ onProgress: deps.onProgress ?? null,
133
+ reason,
134
+ });
135
+ firecrawlAttempted = true;
136
+ firecrawlPayload = attempt.payload;
137
+ firecrawlDiagnostics.attempted = attempt.diagnostics.attempted;
138
+ firecrawlDiagnostics.used = attempt.diagnostics.used;
139
+ firecrawlDiagnostics.cacheMode = attempt.diagnostics.cacheMode;
140
+ firecrawlDiagnostics.cacheStatus = attempt.diagnostics.cacheStatus;
141
+ firecrawlDiagnostics.notes = attempt.diagnostics.notes ?? null;
142
+ }
143
+ firecrawlDiagnostics.notes = appendNote(firecrawlDiagnostics.notes, reason);
144
+ if (!firecrawlPayload) {
145
+ return null;
146
+ }
147
+ const firecrawlResult = await buildResultFromFirecrawl({
148
+ url,
149
+ payload: firecrawlPayload,
150
+ cacheMode,
151
+ maxCharacters,
152
+ youtubeTranscriptMode,
153
+ firecrawlDiagnostics,
154
+ markdownRequested,
155
+ deps,
156
+ });
157
+ if (firecrawlResult) {
158
+ return firecrawlResult;
159
+ }
160
+ firecrawlDiagnostics.notes = appendNote(firecrawlDiagnostics.notes, 'Firecrawl returned empty content');
161
+ return null;
162
+ };
163
+ const attemptBird = async () => {
164
+ if (!deps.readTweetWithBird || !twitterStatus) {
165
+ return null;
166
+ }
167
+ deps.onProgress?.({ kind: 'bird-start', url });
168
+ try {
169
+ const tweet = await deps.readTweetWithBird({ url, timeoutMs });
170
+ const text = tweet?.text?.trim() ?? '';
171
+ if (text.length === 0) {
172
+ deps.onProgress?.({ kind: 'bird-done', url, ok: false, textBytes: null });
173
+ return null;
174
+ }
175
+ const title = tweet?.author?.username ? `@${tweet.author.username}` : null;
176
+ const description = null;
177
+ const siteName = 'X';
178
+ const transcriptResolution = { text: null, source: null };
179
+ const transcriptDiagnostics = ensureTranscriptDiagnostics(transcriptResolution, cacheMode ?? 'default');
180
+ const result = finalizeExtractedLinkContent({
181
+ url,
182
+ baseContent: text,
183
+ maxCharacters,
184
+ title,
185
+ description,
186
+ siteName,
187
+ transcriptResolution,
188
+ video: null,
189
+ isVideoOnly: false,
190
+ diagnostics: {
191
+ strategy: 'bird',
192
+ firecrawl: firecrawlDiagnostics,
193
+ markdown: {
194
+ requested: markdownRequested,
195
+ used: false,
196
+ provider: null,
197
+ notes: 'Bird tweet fetch provides plain text',
198
+ },
199
+ transcript: transcriptDiagnostics,
200
+ },
201
+ });
202
+ deps.onProgress?.({
203
+ kind: 'bird-done',
204
+ url,
205
+ ok: true,
206
+ textBytes: Buffer.byteLength(result.content, 'utf8'),
207
+ });
208
+ return result;
209
+ }
210
+ catch (error) {
211
+ birdError = error;
212
+ deps.onProgress?.({ kind: 'bird-done', url, ok: false, textBytes: null });
213
+ return null;
214
+ }
215
+ };
216
+ const birdResult = await attemptBird();
217
+ if (birdResult) {
218
+ return birdResult;
219
+ }
220
+ const attemptNitter = async () => {
221
+ if (nitterUrls.length === 0) {
222
+ return null;
223
+ }
224
+ for (const nitterUrl of nitterUrls) {
225
+ deps.onProgress?.({ kind: 'nitter-start', url: nitterUrl });
226
+ try {
227
+ const nitterHtml = await fetchHtmlDocument(deps.fetch, nitterUrl, { timeoutMs });
228
+ if (!nitterHtml.trim()) {
229
+ nitterError = new Error(`Nitter returned empty body from ${new URL(nitterUrl).host}`);
230
+ deps.onProgress?.({ kind: 'nitter-done', url: nitterUrl, ok: false, textBytes: null });
231
+ continue;
232
+ }
233
+ if (isAnubisHtml(nitterHtml)) {
234
+ nitterError = new Error(`Nitter returned Anubis challenge from ${new URL(nitterUrl).host}`);
235
+ deps.onProgress?.({ kind: 'nitter-done', url: nitterUrl, ok: false, textBytes: null });
236
+ continue;
237
+ }
238
+ deps.onProgress?.({
239
+ kind: 'nitter-done',
240
+ url: nitterUrl,
241
+ ok: true,
242
+ textBytes: Buffer.byteLength(nitterHtml, 'utf8'),
243
+ });
244
+ return nitterHtml;
245
+ }
246
+ catch (error) {
247
+ nitterError = error;
248
+ deps.onProgress?.({ kind: 'nitter-done', url: nitterUrl, ok: false, textBytes: null });
249
+ }
250
+ }
251
+ return null;
252
+ };
253
+ const nitterHtml = await attemptNitter();
254
+ if (nitterHtml) {
255
+ const nitterResult = await buildResultFromHtmlDocument({
256
+ url,
257
+ html: nitterHtml,
258
+ cacheMode,
259
+ maxCharacters,
260
+ youtubeTranscriptMode,
261
+ firecrawlDiagnostics,
262
+ markdownRequested,
263
+ markdownMode,
264
+ timeoutMs,
265
+ deps,
266
+ readabilityCandidate: null,
267
+ });
268
+ if (!isBlockedTwitterContent(nitterResult.content)) {
269
+ nitterResult.diagnostics.strategy = 'nitter';
270
+ return nitterResult;
271
+ }
272
+ nitterError = new Error('Nitter returned blocked or empty content');
273
+ }
274
+ if (firecrawlMode === 'always') {
275
+ const firecrawlResult = await attemptFirecrawl('Firecrawl forced via options');
276
+ if (firecrawlResult) {
277
+ return firecrawlResult;
278
+ }
279
+ }
280
+ let html = null;
281
+ let htmlError = null;
282
+ try {
283
+ html = await fetchHtmlDocument(deps.fetch, url, {
284
+ timeoutMs,
285
+ onProgress: deps.onProgress ?? null,
286
+ });
287
+ }
288
+ catch (error) {
289
+ htmlError = error;
290
+ }
291
+ if (!html) {
292
+ if (!canUseFirecrawl) {
293
+ throw htmlError instanceof Error ? htmlError : new Error('Failed to fetch HTML document');
294
+ }
295
+ const firecrawlResult = await attemptFirecrawl('HTML fetch failed; falling back to Firecrawl');
296
+ if (firecrawlResult) {
297
+ return firecrawlResult;
298
+ }
299
+ const firecrawlError = firecrawlDiagnostics.notes
300
+ ? `; Firecrawl notes: ${firecrawlDiagnostics.notes}`
301
+ : '';
302
+ throw new Error(`Failed to fetch HTML document${firecrawlError}${htmlError instanceof Error ? `; HTML error: ${htmlError.message}` : ''}`);
303
+ }
304
+ let readabilityCandidate = null;
305
+ if (firecrawlMode === 'auto' && shouldFallbackToFirecrawl(html)) {
306
+ readabilityCandidate = await extractReadabilityFromHtml(html, url);
307
+ const readabilityText = readabilityCandidate?.text
308
+ ? normalizeForPrompt(readabilityCandidate.text)
309
+ : '';
310
+ if (readabilityText.length < MIN_READABILITY_CONTENT_CHARACTERS) {
311
+ const firecrawlResult = await attemptFirecrawl('HTML content looked blocked/thin; falling back to Firecrawl');
312
+ if (firecrawlResult) {
313
+ return firecrawlResult;
314
+ }
315
+ }
316
+ }
317
+ const htmlResult = await buildResultFromHtmlDocument({
318
+ url,
319
+ html,
320
+ cacheMode,
321
+ maxCharacters,
322
+ youtubeTranscriptMode,
323
+ firecrawlDiagnostics,
324
+ markdownRequested,
325
+ markdownMode,
326
+ timeoutMs,
327
+ deps,
328
+ readabilityCandidate,
329
+ });
330
+ if (twitterStatus && isBlockedTwitterContent(htmlResult.content)) {
331
+ const birdNote = !deps.readTweetWithBird
332
+ ? 'Bird not available'
333
+ : birdError
334
+ ? `Bird failed: ${birdError instanceof Error ? birdError.message : String(birdError)}`
335
+ : 'Bird returned no text';
336
+ const nitterNote = nitterUrls.length > 0
337
+ ? nitterError
338
+ ? `Nitter failed: ${nitterError instanceof Error ? nitterError.message : String(nitterError)}`
339
+ : 'Nitter returned no text'
340
+ : 'Nitter not available';
341
+ throw new Error(`Unable to fetch tweet content from X. ${birdNote}. ${nitterNote}.`);
342
+ }
343
+ return htmlResult;
344
+ }
345
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,wBAAwB,EAAE,MAAM,2BAA2B,CAAA;AACpE,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAA;AAGxD,OAAO,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAA;AACjD,OAAO,EAAE,kCAAkC,EAAE,MAAM,gBAAgB,CAAA;AACnE,OAAO,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAA;AACpE,OAAO,EAAE,wBAAwB,EAAE,yBAAyB,EAAE,MAAM,gBAAgB,CAAA;AACpF,OAAO,EAAE,2BAA2B,EAAE,MAAM,WAAW,CAAA;AACvD,OAAO,EAAE,sBAAsB,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAA;AACpF,OAAO,EAAE,0BAA0B,EAAE,MAAM,kBAAkB,CAAA;AAC7D,OAAO,EACL,YAAY,EACZ,uBAAuB,EACvB,kBAAkB,EAClB,YAAY,GACb,MAAM,oBAAoB,CAAA;AAE3B,OAAO,EACL,UAAU,EACV,2BAA2B,EAC3B,4BAA4B,EAC5B,gBAAgB,EAChB,oBAAoB,EACpB,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,YAAY,CAAA;AAEnB,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,GAAW,EACX,OAA4C,EAC5C,IAAqB;IAErB,MAAM,SAAS,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAA;IAC3C,MAAM,SAAS,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAA;IAC3C,MAAM,aAAa,GAAG,oBAAoB,CAAC,OAAO,CAAC,CAAA;IACnD,MAAM,qBAAqB,GAAG,OAAO,EAAE,iBAAiB,IAAI,MAAM,CAAA;IAClE,MAAM,aAAa,GAAG,oBAAoB,CAAC,OAAO,CAAC,CAAA;IACnD,MAAM,iBAAiB,GAAG,CAAC,OAAO,EAAE,MAAM,IAAI,MAAM,CAAC,KAAK,UAAU,CAAA;IACpE,MAAM,YAAY,GAAiB,OAAO,EAAE,YAAY,IAAI,MAAM,CAAA;IAElE,MAAM,eAAe,GACnB,aAAa,KAAK,KAAK,IAAI,IAAI,CAAC,mBAAmB,KAAK,IAAI,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAA;IAEpF,MAAM,gBAAgB,GAAG,uBAAuB,CAAC,GAAG,CAAC,CAAA;IACrD,IAAI,gBAAgB,EAAE,CAAC;QACrB,IAAI,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC;YAC1C,MAAM,IAAI,KAAK,CACb,uIAAuI,CACxI,CAAA;QACH,CAAC;QAED,MAAM,oBAAoB,GAAG,MAAM,wBAAwB,CAAC,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE;YAC3E,qBAAqB;YACrB,SAAS;SACV,CAAC,CAAA;QACF,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE,CAAC;YAC/B,MAAM,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,KAAK,CAAA;YACrD,MAAM,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,KAAK,KAAK,GAAG,CAAC,CAAC,CAAC,EAAE,CAAA;YACzC,MAAM,IAAI,KAAK,CAAC,uCAAuC,MAAM,EAAE,CAAC,CAAA;QAClE,CAAC;QAED,MAAM,qBAAqB,GAAG,2BAA2B,CACvD,oBAAoB,EACpB,SAAS,IAAI,SAAS,CACvB,CAAA;QACD,qBAAqB,CAAC,KAAK,GAAG,UAAU,CACtC,qBAAqB,CAAC,K
AAK,EAC3B,4DAA4D,CAC7D,CAAA;QAED,OAAO,4BAA4B,CAAC;YAClC,GAAG;YACH,WAAW,EAAE,iBAAiB,CAAC,EAAE,EAAE,oBAAoB,CAAC,IAAI,CAAC;YAC7D,aAAa;YACb,KAAK,EAAE,IAAI;YACX,WAAW,EAAE,IAAI;YACjB,QAAQ,EAAE,SAAS;YACnB,oBAAoB;YACpB,KAAK,EAAE,IAAI;YACX,WAAW,EAAE,KAAK;YAClB,WAAW,EAAE;gBACX,QAAQ,EAAE,MAAM;gBAChB,SAAS,EAAE;oBACT,SAAS,EAAE,KAAK;oBAChB,IAAI,EAAE,KAAK;oBACX,SAAS;oBACT,WAAW,EAAE,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;oBAC5D,KAAK,EAAE,8CAA8C;iBACtD;gBACD,QAAQ,EAAE;oBACR,SAAS,EAAE,iBAAiB;oBAC5B,IAAI,EAAE,KAAK;oBACX,QAAQ,EAAE,IAAI;oBACd,KAAK,EAAE,+CAA+C;iBACvD;gBACD,UAAU,EAAE,qBAAqB;aAClC;SACF,CAAC,CAAA;IACJ,CAAC;IAED,MAAM,QAAQ,GAAG,sBAAsB,CAAC,GAAG,CAAC,CAAA;IAC5C,IAAI,QAAQ,EAAE,CAAC;QACb,IAAI,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC;YAC1C,MAAM,IAAI,KAAK,CACb,iIAAiI,CAClI,CAAA;QACH,CAAC;QAED,MAAM,oBAAoB,GAAG,MAAM,wBAAwB,CAAC,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE;YAC3E,qBAAqB;YACrB,SAAS;SACV,CAAC,CAAA;QACF,IAAI,CAAC,oBAAoB,CAAC,IAAI,EAAE,CAAC;YAC/B,MAAM,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,KAAK,CAAA;YACrD,MAAM,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,KAAK,KAAK,GAAG,CAAC,CAAC,CAAC,EAAE,CAAA;YACzC,MAAM,IAAI,KAAK,CAAC,8CAA8C,MAAM,EAAE,CAAC,CAAA;QACzE,CAAC;QAED,MAAM,qBAAqB,GAAG,2BAA2B,CACvD,oBAAoB,EACpB,SAAS,IAAI,SAAS,CACvB,CAAA;QACD,qBAAqB,CAAC,KAAK,GAAG,UAAU,CACtC,qBAAqB,CAAC,KAAK,EAC3B,wEAAwE,CACzE,CAAA;QAED,OAAO,4BAA4B,CAAC;YAClC,GAAG;YACH,WAAW,EAAE,iBAAiB,CAAC,EAAE,EAAE,oBAAoB,CAAC,IAAI,CAAC;YAC7D,aAAa;YACb,KAAK,EAAE,IAAI;YACX,WAAW,EAAE,IAAI;YACjB,QAAQ,EAAE,gBAAgB;YAC1B,oBAAoB;YACpB,KAAK,EAAE,IAAI;YACX,WAAW,EAAE,KAAK;YAClB,WAAW,EAAE;gBACX,QAAQ,EAAE,MAAM;gBAChB,SAAS,EAAE;oBACT,SAAS,EAAE,KAAK;oBAChB,IAAI,EAAE,KAAK;oBACX,SAAS;oBACT,WAAW,EAAE,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;oBAC5D,KAAK,EAAE,qDAAqD;iBAC7D;gBACD,QAAQ,EAAE;oBACR,SAAS,EAAE,iBAAiB;oBAC5B,IAAI,EAAE,KAAK;oBACX,QAAQ,EAAE,IAAI;oBACd,KAAK,EAAE,sDAAsD;iBAC9D;gBACD,UAAU,EAAE,qBAAqB;aAClC;SACF,CAAC,CAAA;IACJ,CAAC;IAED,IAAI,kBAAkB,GAAG,KAAK,CAAA;IAC9B,IAAI,gBAAgB,GAAiC,IAAI,CAAA;IACzD,MAAM,oBAAoB,GAAyB
;QACjD,SAAS,EAAE,KAAK;QAChB,IAAI,EAAE,KAAK;QACX,SAAS;QACT,WAAW,EAAE,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;QAC5D,KAAK,EAAE,IAAI;KACZ,CAAA;IAED,MAAM,aAAa,GAAG,kBAAkB,CAAC,GAAG,CAAC,CAAA;IAC7C,MAAM,UAAU,GAAG,aAAa,CAAC,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;IACzD,IAAI,SAAS,GAAY,IAAI,CAAA;IAC7B,IAAI,WAAW,GAAY,IAAI,CAAA;IAE/B,MAAM,gBAAgB,GAAG,KAAK,EAAE,MAAc,EAAwC,EAAE;QACtF,IAAI,CAAC,eAAe,EAAE,CAAC;YACrB,OAAO,IAAI,CAAA;QACb,CAAC;QAED,IAAI,CAAC,kBAAkB,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,IAAI,CAAC,mBAAmB,EAAE;gBACtE,SAAS;gBACT,SAAS;gBACT,UAAU,EAAE,IAAI,CAAC,UAAU,IAAI,IAAI;gBACnC,MAAM;aACP,CAAC,CAAA;YACF,kBAAkB,GAAG,IAAI,CAAA;YACzB,gBAAgB,GAAG,OAAO,CAAC,OAAO,CAAA;YAClC,oBAAoB,CAAC,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC,SAAS,CAAA;YAC9D,oBAAoB,CAAC,IAAI,GAAG,OAAO,CAAC,WAAW,CAAC,IAAI,CAAA;YACpD,oBAAoB,CAAC,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC,SAAS,CAAA;YAC9D,oBAAoB,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC,WAAW,CAAA;YAClE,oBAAoB,CAAC,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,KAAK,IAAI,IAAI,CAAA;QAChE,CAAC;QAED,oBAAoB,CAAC,KAAK,GAAG,UAAU,CAAC,oBAAoB,CAAC,KAAK,EAAE,MAAM,CAAC,CAAA;QAE3E,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACtB,OAAO,IAAI,CAAA;QACb,CAAC;QAED,MAAM,eAAe,GAAG,MAAM,wBAAwB,CAAC;YACrD,GAAG;YACH,OAAO,EAAE,gBAAgB;YACzB,SAAS;YACT,aAAa;YACb,qBAAqB;YACrB,oBAAoB;YACpB,iBAAiB;YACjB,IAAI;SACL,CAAC,CAAA;QACF,IAAI,eAAe,EAAE,CAAC;YACpB,OAAO,eAAe,CAAA;QACxB,CAAC;QAED,oBAAoB,CAAC,KAAK,GAAG,UAAU,CACrC,oBAAoB,CAAC,KAAK,EAC1B,kCAAkC,CACnC,CAAA;QACD,OAAO,IAAI,CAAA;IACb,CAAC,CAAA;IAED,MAAM,WAAW,GAAG,KAAK,IAA0C,EAAE;QACnE,IAAI,CAAC,IAAI,CAAC,iBAAiB,IAAI,CAAC,aAAa,EAAE,CAAC;YAC9C,OAAO,IAAI,CAAA;QACb,CAAC;QAED,IAAI,CAAC,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,YAAY,EAAE,GAAG,EAAE,CAAC,CAAA;QAC9C,IAAI,CAAC;YACH,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,CAAC,CAAA;YAC9D,MAAM,IAAI,GAAG,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,CAAA;YACtC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACtB,IAAI,CAAC,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE,SAAS,EAAE,
IAAI,EAAE,CAAC,CAAA;gBACzE,OAAO,IAAI,CAAA;YACb,CAAC;YAED,MAAM,KAAK,GAAG,KAAK,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,IAAI,CAAA;YAC1E,MAAM,WAAW,GAAG,IAAI,CAAA;YACxB,MAAM,QAAQ,GAAG,GAAG,CAAA;YACpB,MAAM,oBAAoB,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,CAAA;YACzD,MAAM,qBAAqB,GAAG,2BAA2B,CACvD,oBAAoB,EACpB,SAAS,IAAI,SAAS,CACvB,CAAA;YACD,MAAM,MAAM,GAAG,4BAA4B,CAAC;gBAC1C,GAAG;gBACH,WAAW,EAAE,IAAI;gBACjB,aAAa;gBACb,KAAK;gBACL,WAAW;gBACX,QAAQ;gBACR,oBAAoB;gBACpB,KAAK,EAAE,IAAI;gBACX,WAAW,EAAE,KAAK;gBAClB,WAAW,EAAE;oBACX,QAAQ,EAAE,MAAM;oBAChB,SAAS,EAAE,oBAAoB;oBAC/B,QAAQ,EAAE;wBACR,SAAS,EAAE,iBAAiB;wBAC5B,IAAI,EAAE,KAAK;wBACX,QAAQ,EAAE,IAAI;wBACd,KAAK,EAAE,sCAAsC;qBAC9C;oBACD,UAAU,EAAE,qBAAqB;iBAClC;aACF,CAAC,CAAA;YACF,IAAI,CAAC,UAAU,EAAE,CAAC;gBAChB,IAAI,EAAE,WAAW;gBACjB,GAAG;gBACH,EAAE,EAAE,IAAI;gBACR,SAAS,EAAE,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC;aACrD,CAAC,CAAA;YACF,OAAO,MAAM,CAAA;QACf,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,SAAS,GAAG,KAAK,CAAA;YACjB,IAAI,CAAC,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;YACzE,OAAO,IAAI,CAAA;QACb,CAAC;IACH,CAAC,CAAA;IAED,MAAM,UAAU,GAAG,MAAM,WAAW,EAAE,CAAA;IACtC,IAAI,UAAU,EAAE,CAAC;QACf,OAAO,UAAU,CAAA;IACnB,CAAC;IAED,MAAM,aAAa,GAAG,KAAK,IAA4B,EAAE;QACvD,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC5B,OAAO,IAAI,CAAA;QACb,CAAC;QACD,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;YACnC,IAAI,CAAC,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,cAAc,EAAE,GAAG,EAAE,SAAS,EAAE,CAAC,CAAA;YAC3D,IAAI,CAAC;gBACH,MAAM,UAAU,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,KAAK,EAAE,SAAS,EAAE,EAAE,SAAS,EAAE,CAAC,CAAA;gBAChF,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,EAAE,CAAC;oBACvB,WAAW,GAAG,IAAI,KAAK,CAAC,mCAAmC,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;oBACrF,IAAI,CAAC,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,aAAa,EAAE,GAAG,EAAE,SAAS,EAAE,EAAE,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;oBACtF,SAAQ;gBACV,CAAC;gBACD,IAAI,YAAY,CAAC,UAAU,CAAC,EAAE,CAAC;oBAC7B,WAAW,GAAG,IAAI,KAAK,CACrB,yCAAyC,IAAI,GAAG,C
AAC,SAAS,CAAC,CAAC,IAAI,EAAE,CACnE,CAAA;oBACD,IAAI,CAAC,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,aAAa,EAAE,GAAG,EAAE,SAAS,EAAE,EAAE,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;oBACtF,SAAQ;gBACV,CAAC;gBACD,IAAI,CAAC,UAAU,EAAE,CAAC;oBAChB,IAAI,EAAE,aAAa;oBACnB,GAAG,EAAE,SAAS;oBACd,EAAE,EAAE,IAAI;oBACR,SAAS,EAAE,MAAM,CAAC,UAAU,CAAC,UAAU,EAAE,MAAM,CAAC;iBACjD,CAAC,CAAA;gBACF,OAAO,UAAU,CAAA;YACnB,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,WAAW,GAAG,KAAK,CAAA;gBACnB,IAAI,CAAC,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,aAAa,EAAE,GAAG,EAAE,SAAS,EAAE,EAAE,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;YACxF,CAAC;QACH,CAAC;QACD,OAAO,IAAI,CAAA;IACb,CAAC,CAAA;IAED,MAAM,UAAU,GAAG,MAAM,aAAa,EAAE,CAAA;IACxC,IAAI,UAAU,EAAE,CAAC;QACf,MAAM,YAAY,GAAG,MAAM,2BAA2B,CAAC;YACrD,GAAG;YACH,IAAI,EAAE,UAAU;YAChB,SAAS;YACT,aAAa;YACb,qBAAqB;YACrB,oBAAoB;YACpB,iBAAiB;YACjB,YAAY;YACZ,SAAS;YACT,IAAI;YACJ,oBAAoB,EAAE,IAAI;SAC3B,CAAC,CAAA;QACF,IAAI,CAAC,uBAAuB,CAAC,YAAY,CAAC,OAAO,CAAC,EAAE,CAAC;YACnD,YAAY,CAAC,WAAW,CAAC,QAAQ,GAAG,QAAQ,CAAA;YAC5C,OAAO,YAAY,CAAA;QACrB,CAAC;QACD,WAAW,GAAG,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAA;IACrE,CAAC;IAED,IAAI,aAAa,KAAK,QAAQ,EAAE,CAAC;QAC/B,MAAM,eAAe,GAAG,MAAM,gBAAgB,CAAC,8BAA8B,CAAC,CAAA;QAC9E,IAAI,eAAe,EAAE,CAAC;YACpB,OAAO,eAAe,CAAA;QACxB,CAAC;IACH,CAAC;IAED,IAAI,IAAI,GAAkB,IAAI,CAAA;IAC9B,IAAI,SAAS,GAAY,IAAI,CAAA;IAE7B,IAAI,CAAC;QACH,IAAI,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,KAAK,EAAE,GAAG,EAAE;YAC9C,SAAS;YACT,UAAU,EAAE,IAAI,CAAC,UAAU,IAAI,IAAI;SACpC,CAAC,CAAA;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,SAAS,GAAG,KAAK,CAAA;IACnB,CAAC;IAED,IAAI,CAAC,IAAI,EAAE,CAAC;QACV,IAAI,CAAC,eAAe,EAAE,CAAC;YACrB,MAAM,SAAS,YAAY,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAA;QAC3F,CAAC;QAED,MAAM,eAAe,GAAG,MAAM,gBAAgB,CAAC,8CAA8C,CAAC,CAAA;QAC9F,IAAI,eAAe,EAAE,CAAC;YACpB,OAAO,eAAe,CAAA;QACxB,CAAC;QAED,MAAM,cAAc,GAAG,oBAAoB,CAAC,KAAK;YAC/C,CAAC,CAAC,sBAAsB,oBAAoB,CAAC,KAAK,EAAE;YACpD,CAAC,CAAC,EAAE,CAAA;QACN,MAAM,IAAI,KAAK,CACb,gCAAgC,cAAc,GAC5C,SAAS,YAAY,KAAK,CAAC,CAAC,CAAC,iBAAiB,SAAS,CAAC,OAAO,EAAE,C
AAC,CAAC,CAAC,EACtE,EAAE,CACH,CAAA;IACH,CAAC;IAED,IAAI,oBAAoB,GAAkE,IAAI,CAAA;IAE9F,IAAI,aAAa,KAAK,MAAM,IAAI,yBAAyB,CAAC,IAAI,CAAC,EAAE,CAAC;QAChE,oBAAoB,GAAG,MAAM,0BAA0B,CAAC,IAAI,EAAE,GAAG,CAAC,CAAA;QAClE,MAAM,eAAe,GAAG,oBAAoB,EAAE,IAAI;YAChD,CAAC,CAAC,kBAAkB,CAAC,oBAAoB,CAAC,IAAI,CAAC;YAC/C,CAAC,CAAC,EAAE,CAAA;QACN,IAAI,eAAe,CAAC,MAAM,GAAG,kCAAkC,EAAE,CAAC;YAChE,MAAM,eAAe,GAAG,MAAM,gBAAgB,CAC5C,6DAA6D,CAC9D,CAAA;YACD,IAAI,eAAe,EAAE,CAAC;gBACpB,OAAO,eAAe,CAAA;YACxB,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,2BAA2B,CAAC;QACnD,GAAG;QACH,IAAI;QACJ,SAAS;QACT,aAAa;QACb,qBAAqB;QACrB,oBAAoB;QACpB,iBAAiB;QACjB,YAAY;QACZ,SAAS;QACT,IAAI;QACJ,oBAAoB;KACrB,CAAC,CAAA;IACF,IAAI,aAAa,IAAI,uBAAuB,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QACjE,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,iBAAiB;YACtC,CAAC,CAAC,oBAAoB;YACtB,CAAC,CAAC,SAAS;gBACT,CAAC,CAAC,gBAAgB,SAAS,YAAY,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,EAAE;gBACtF,CAAC,CAAC,uBAAuB,CAAA;QAC7B,MAAM,UAAU,GACd,UAAU,CAAC,MAAM,GAAG,CAAC;YACnB,CAAC,CAAC,WAAW;gBACX,CAAC,CAAC,kBAAkB,WAAW,YAAY,KAAK,CAAC,CAAC,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE;gBAC9F,CAAC,CAAC,yBAAyB;YAC7B,CAAC,CAAC,sBAAsB,CAAA;QAC5B,MAAM,IAAI,KAAK,CAAC,yCAAyC,QAAQ,KAAK,UAAU,GAAG,CAAC,CAAA;IACtF,CAAC;IACD,OAAO,UAAU,CAAA;AACnB,CAAC"}
@@ -0,0 +1,77 @@
1
+ import { load } from 'cheerio';
2
+ import { normalizeCandidate } from './cleaner.js';
3
/**
 * Pick the most descriptive title/description pair found in a page's JSON-LD.
 *
 * Every `<script type="application/ld+json">` element is parsed on its own and
 * handed to `collectCandidates`. The gathered entries are normalised with
 * `normalizeCandidate`, entries with neither a title nor a description are
 * dropped, and the entry with the longest description is returned (the
 * earliest one wins a tie).
 *
 * @param {string} html - Markup passed to cheerio's `load`.
 * @returns {{title: string|null, description: string|null, type: string|null}|null}
 *   The winning entry, or `null` when nothing usable is found or anything throws.
 */
export function extractJsonLdContent(html) {
    try {
        const $ = load(html);
        const gathered = [];
        $('script[type="application/ld+json"]')
            .toArray()
            .forEach((node) => {
                const body = $(node).text();
                if (!body) {
                    return;
                }
                try {
                    collectCandidates(JSON.parse(body), gathered);
                }
                catch {
                    // One malformed JSON-LD script must not discard the others.
                }
            });
        if (gathered.length === 0) {
            return null;
        }
        // Normalise every entry first, keeping only those with something to show.
        const usable = [];
        for (const entry of gathered) {
            const title = entry.title ? normalizeCandidate(entry.title) : null;
            const description = entry.description ? normalizeCandidate(entry.description) : null;
            const type = entry.type ? normalizeCandidate(entry.type) : null;
            if (title || description) {
                usable.push({ title, description, type });
            }
        }
        // Strict comparison keeps the first of several equally long descriptions.
        let winner = null;
        let winnerLength = -1;
        for (const entry of usable) {
            const length = entry.description?.length ?? 0;
            if (length > winnerLength) {
                winner = entry;
                winnerLength = length;
            }
        }
        return winner;
    }
    catch {
        return null;
    }
}
36
/**
 * Walk parsed JSON-LD and append every typed node that has a title or a
 * description to `out`.
 *
 * Arrays are traversed element by element. For an object, the contents of its
 * `@graph` member are visited before the object itself is considered. Falsy
 * and non-object inputs are ignored.
 *
 * @param {unknown} input - Parsed JSON-LD value (object, array, or primitive).
 * @param {Array<{title: string|null, description: string|null, type: string}>} out
 *   Accumulator that receives the discovered candidates; mutated in place.
 * @returns {void}
 */
function collectCandidates(input, out) {
    if (!input) {
        return;
    }
    if (Array.isArray(input)) {
        for (const item of input) {
            collectCandidates(item, out);
        }
        return;
    }
    if (typeof input !== 'object') {
        return;
    }
    const record = input;
    // JSON-LD allows `@graph` to hold a single node object as well as an array
    // of nodes. The recursive call handles both shapes and ignores primitives,
    // so a one-node graph is no longer dropped.
    if (record['@graph']) {
        collectCandidates(record['@graph'], out);
    }
    const type = extractType(record);
    if (!type) {
        // Untyped nodes are never reported as candidates.
        return;
    }
    const title = firstString(record, ['name', 'headline', 'title']);
    const description = firstString(record, ['description', 'summary']);
    if (title || description) {
        out.push({ title: title ?? null, description: description ?? null, type });
    }
}
59
/**
 * Read a node's `@type` as a lower-cased string.
 *
 * When `@type` is an array, its first string entry is used.
 *
 * @param {Record<string, unknown>} record - Parsed JSON-LD node.
 * @returns {string|null} The lower-cased type, or `null` when `@type` is not a
 *   string and is not an array containing one.
 */
function extractType(record) {
    const declared = record['@type'];
    const candidate = Array.isArray(declared)
        ? declared.find((entry) => typeof entry === 'string')
        : declared;
    return typeof candidate === 'string' ? candidate.toLowerCase() : null;
}
69
/**
 * Return the first non-blank string found under the given keys, trimmed.
 *
 * Keys are tried in the order supplied. Values that are not strings, or that
 * are empty after trimming, are skipped.
 *
 * @param {Record<string, unknown>} record - Object to read from.
 * @param {Iterable<string>} keys - Property names in priority order.
 * @returns {string|null} The trimmed value, or `null` when no key qualifies.
 */
function firstString(record, keys) {
    for (const key of keys) {
        const value = record[key];
        if (typeof value !== 'string') {
            continue;
        }
        const trimmed = value.trim();
        if (trimmed) {
            return trimmed;
        }
    }
    return null;
}
77
+ //# sourceMappingURL=jsonld.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"jsonld.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/jsonld.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAA;AAE9B,OAAO,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAA;AAQjD,MAAM,UAAU,oBAAoB,CAAC,IAAY;IAC/C,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAA;QACpB,MAAM,OAAO,GAAG,CAAC,CAAC,oCAAoC,CAAC,CAAC,OAAO,EAAE,CAAA;QACjE,MAAM,UAAU,GAAoB,EAAE,CAAA;QAEtC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,MAAM,GAAG,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,CAAA;YAC5B,IAAI,CAAC,GAAG;gBAAE,SAAQ;YAClB,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;gBAC5B,iBAAiB,CAAC,IAAI,EAAE,UAAU,CAAC,CAAA;YACrC,CAAC;YAAC,MAAM,CAAC;gBACP,0BAA0B;YAC5B,CAAC;QACH,CAAC;QAED,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAA;QAExC,MAAM,MAAM,GAAG,UAAU;aACtB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACX,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI;YACnD,WAAW,EAAE,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI;YACrE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI;SACjD,CAAC,CAAC;aACF,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,WAAW,CAAC;aACvC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,MAAM,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,EAAE,MAAM,IAAI,CAAC,CAAC,CAAC,CAAA;QAE9E,OAAO,MAAM,CAAC,CAAC,CAAC,IAAI,IAAI,CAAA;IAC1B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAA;IACb,CAAC;AACH,CAAC;AAED,SAAS,iBAAiB,CAAC,KAAc,EAAE,GAAoB;IAC7D,IAAI,CAAC,KAAK;QAAE,OAAM;IAElB,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,KAAK,MAAM,IAAI,IAAI,KAAK;YAAE,iBAAiB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAA;QACtD,OAAM;IACR,CAAC;IAED,IAAI,OAAO,KAAK,KAAK,QAAQ;QAAE,OAAM;IAErC,MAAM,MAAM,GAAG,KAAgC,CAAA;IAC/C,IAAI,MAAM,CAAC,QAAQ,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;QACxD,iBAAiB,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,GAAG,CAAC,CAAA;IAC1C,CAAC;IAED,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,CAAC,CAA
A;IAChC,IAAI,IAAI,EAAE,CAAC;QACT,MAAM,KAAK,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,UAAU,EAAE,OAAO,CAAC,CAAC,CAAA;QAChE,MAAM,WAAW,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,aAAa,EAAE,SAAS,CAAC,CAAC,CAAA;QACnE,IAAI,KAAK,IAAI,WAAW,EAAE,CAAC;YACzB,GAAG,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,IAAI,IAAI,EAAE,WAAW,EAAE,WAAW,IAAI,IAAI,EAAE,IAAI,EAAE,CAAC,CAAA;QAC5E,CAAC;IACH,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,MAA+B;IAClD,MAAM,GAAG,GAAG,MAAM,CAAC,OAAO,CAAC,CAAA;IAC3B,IAAI,OAAO,GAAG,KAAK,QAAQ;QAAE,OAAO,GAAG,CAAC,WAAW,EAAE,CAAA;IACrD,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAA;QAC5D,OAAO,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,IAAI,CAAA;IAC/D,CAAC;IACD,OAAO,IAAI,CAAA;AACb,CAAC;AAED,SAAS,WAAW,CAAC,MAA+B,EAAE,IAAc;IAClE,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAA;QACzB,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,CAAC,IAAI,EAAE;YAAE,OAAO,KAAK,CAAC,IAAI,EAAE,CAAA;IACpE,CAAC;IACD,OAAO,IAAI,CAAA;AACb,CAAC"}