@steipete/summarize 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. package/CHANGELOG.md +52 -0
  2. package/LICENSE +21 -0
  3. package/README.md +185 -0
  4. package/dist/cli.cjs +74333 -0
  5. package/dist/cli.cjs.map +7 -0
  6. package/dist/esm/cli-main.js +80 -0
  7. package/dist/esm/cli-main.js.map +1 -0
  8. package/dist/esm/cli.js +18 -0
  9. package/dist/esm/cli.js.map +1 -0
  10. package/dist/esm/config.js +33 -0
  11. package/dist/esm/config.js.map +1 -0
  12. package/dist/esm/content/asset.js +167 -0
  13. package/dist/esm/content/asset.js.map +1 -0
  14. package/dist/esm/content/index.js +4 -0
  15. package/dist/esm/content/index.js.map +1 -0
  16. package/dist/esm/content/link-preview/client.js +20 -0
  17. package/dist/esm/content/link-preview/client.js.map +1 -0
  18. package/dist/esm/content/link-preview/content/article.js +150 -0
  19. package/dist/esm/content/link-preview/content/article.js.map +1 -0
  20. package/dist/esm/content/link-preview/content/cleaner.js +55 -0
  21. package/dist/esm/content/link-preview/content/cleaner.js.map +1 -0
  22. package/dist/esm/content/link-preview/content/fetcher.js +120 -0
  23. package/dist/esm/content/link-preview/content/fetcher.js.map +1 -0
  24. package/dist/esm/content/link-preview/content/index.js +275 -0
  25. package/dist/esm/content/link-preview/content/index.js.map +1 -0
  26. package/dist/esm/content/link-preview/content/parsers.js +77 -0
  27. package/dist/esm/content/link-preview/content/parsers.js.map +1 -0
  28. package/dist/esm/content/link-preview/content/types.js +4 -0
  29. package/dist/esm/content/link-preview/content/types.js.map +1 -0
  30. package/dist/esm/content/link-preview/content/utils.js +127 -0
  31. package/dist/esm/content/link-preview/content/utils.js.map +1 -0
  32. package/dist/esm/content/link-preview/content/youtube.js +82 -0
  33. package/dist/esm/content/link-preview/content/youtube.js.map +1 -0
  34. package/dist/esm/content/link-preview/deps.js +2 -0
  35. package/dist/esm/content/link-preview/deps.js.map +1 -0
  36. package/dist/esm/content/link-preview/fetch-with-timeout.js +35 -0
  37. package/dist/esm/content/link-preview/fetch-with-timeout.js.map +1 -0
  38. package/dist/esm/content/link-preview/transcript/cache.js +73 -0
  39. package/dist/esm/content/link-preview/transcript/cache.js.map +1 -0
  40. package/dist/esm/content/link-preview/transcript/index.js +95 -0
  41. package/dist/esm/content/link-preview/transcript/index.js.map +1 -0
  42. package/dist/esm/content/link-preview/transcript/normalize.js +43 -0
  43. package/dist/esm/content/link-preview/transcript/normalize.js.map +1 -0
  44. package/dist/esm/content/link-preview/transcript/providers/generic.js +11 -0
  45. package/dist/esm/content/link-preview/transcript/providers/generic.js.map +1 -0
  46. package/dist/esm/content/link-preview/transcript/providers/podcast.js +12 -0
  47. package/dist/esm/content/link-preview/transcript/providers/podcast.js.map +1 -0
  48. package/dist/esm/content/link-preview/transcript/providers/twitter.js +12 -0
  49. package/dist/esm/content/link-preview/transcript/providers/twitter.js.map +1 -0
  50. package/dist/esm/content/link-preview/transcript/providers/youtube/api.js +257 -0
  51. package/dist/esm/content/link-preview/transcript/providers/youtube/api.js.map +1 -0
  52. package/dist/esm/content/link-preview/transcript/providers/youtube/apify.js +55 -0
  53. package/dist/esm/content/link-preview/transcript/providers/youtube/apify.js.map +1 -0
  54. package/dist/esm/content/link-preview/transcript/providers/youtube/captions.js +409 -0
  55. package/dist/esm/content/link-preview/transcript/providers/youtube/captions.js.map +1 -0
  56. package/dist/esm/content/link-preview/transcript/providers/youtube/ytdlp.js +114 -0
  57. package/dist/esm/content/link-preview/transcript/providers/youtube/ytdlp.js.map +1 -0
  58. package/dist/esm/content/link-preview/transcript/providers/youtube.js +74 -0
  59. package/dist/esm/content/link-preview/transcript/providers/youtube.js.map +1 -0
  60. package/dist/esm/content/link-preview/transcript/types.js +2 -0
  61. package/dist/esm/content/link-preview/transcript/types.js.map +1 -0
  62. package/dist/esm/content/link-preview/transcript/utils.js +193 -0
  63. package/dist/esm/content/link-preview/transcript/utils.js.map +1 -0
  64. package/dist/esm/content/link-preview/types.js +2 -0
  65. package/dist/esm/content/link-preview/types.js.map +1 -0
  66. package/dist/esm/costs.js +57 -0
  67. package/dist/esm/costs.js.map +1 -0
  68. package/dist/esm/firecrawl.js +54 -0
  69. package/dist/esm/firecrawl.js.map +1 -0
  70. package/dist/esm/flags.js +97 -0
  71. package/dist/esm/flags.js.map +1 -0
  72. package/dist/esm/index.js +4 -0
  73. package/dist/esm/index.js.map +1 -0
  74. package/dist/esm/llm/generate-text.js +296 -0
  75. package/dist/esm/llm/generate-text.js.map +1 -0
  76. package/dist/esm/llm/google-models.js +112 -0
  77. package/dist/esm/llm/google-models.js.map +1 -0
  78. package/dist/esm/llm/html-to-markdown.js +44 -0
  79. package/dist/esm/llm/html-to-markdown.js.map +1 -0
  80. package/dist/esm/llm/model-id.js +45 -0
  81. package/dist/esm/llm/model-id.js.map +1 -0
  82. package/dist/esm/pricing/litellm.js +25 -0
  83. package/dist/esm/pricing/litellm.js.map +1 -0
  84. package/dist/esm/prompts/file.js +14 -0
  85. package/dist/esm/prompts/file.js.map +1 -0
  86. package/dist/esm/prompts/index.js +3 -0
  87. package/dist/esm/prompts/index.js.map +1 -0
  88. package/dist/esm/prompts/link-summary.js +105 -0
  89. package/dist/esm/prompts/link-summary.js.map +1 -0
  90. package/dist/esm/run.js +1674 -0
  91. package/dist/esm/run.js.map +1 -0
  92. package/dist/esm/shared/contracts.js +2 -0
  93. package/dist/esm/shared/contracts.js.map +1 -0
  94. package/dist/esm/summarizeHome.js +20 -0
  95. package/dist/esm/summarizeHome.js.map +1 -0
  96. package/dist/esm/tty/live-markdown.js +52 -0
  97. package/dist/esm/tty/live-markdown.js.map +1 -0
  98. package/dist/esm/tty/osc-progress.js +8 -0
  99. package/dist/esm/tty/osc-progress.js.map +1 -0
  100. package/dist/esm/tty/spinner.js +33 -0
  101. package/dist/esm/tty/spinner.js.map +1 -0
  102. package/dist/esm/version.js +44 -0
  103. package/dist/esm/version.js.map +1 -0
  104. package/dist/types/cli-main.d.ts +11 -0
  105. package/dist/types/cli.d.ts +1 -0
  106. package/dist/types/config.d.ts +15 -0
  107. package/dist/types/content/asset.d.ts +44 -0
  108. package/dist/types/content/index.d.ts +4 -0
  109. package/dist/types/content/link-preview/client.d.ts +14 -0
  110. package/dist/types/content/link-preview/content/article.d.ts +4 -0
  111. package/dist/types/content/link-preview/content/cleaner.d.ts +12 -0
  112. package/dist/types/content/link-preview/content/fetcher.d.ts +16 -0
  113. package/dist/types/content/link-preview/content/index.d.ts +4 -0
  114. package/dist/types/content/link-preview/content/parsers.d.ts +7 -0
  115. package/dist/types/content/link-preview/content/types.d.ts +44 -0
  116. package/dist/types/content/link-preview/content/utils.d.ts +16 -0
  117. package/dist/types/content/link-preview/content/youtube.d.ts +1 -0
  118. package/dist/types/content/link-preview/deps.d.ts +70 -0
  119. package/dist/types/content/link-preview/fetch-with-timeout.d.ts +4 -0
  120. package/dist/types/content/link-preview/transcript/cache.d.ts +29 -0
  121. package/dist/types/content/link-preview/transcript/index.d.ts +9 -0
  122. package/dist/types/content/link-preview/transcript/normalize.d.ts +3 -0
  123. package/dist/types/content/link-preview/transcript/providers/generic.d.ts +3 -0
  124. package/dist/types/content/link-preview/transcript/providers/podcast.d.ts +3 -0
  125. package/dist/types/content/link-preview/transcript/providers/twitter.d.ts +3 -0
  126. package/dist/types/content/link-preview/transcript/providers/youtube/api.d.ts +26 -0
  127. package/dist/types/content/link-preview/transcript/providers/youtube/apify.d.ts +1 -0
  128. package/dist/types/content/link-preview/transcript/providers/youtube/captions.d.ts +7 -0
  129. package/dist/types/content/link-preview/transcript/providers/youtube/ytdlp.d.ts +3 -0
  130. package/dist/types/content/link-preview/transcript/providers/youtube.d.ts +3 -0
  131. package/dist/types/content/link-preview/transcript/types.d.ts +23 -0
  132. package/dist/types/content/link-preview/transcript/utils.d.ts +7 -0
  133. package/dist/types/content/link-preview/types.d.ts +36 -0
  134. package/dist/types/costs.d.ts +31 -0
  135. package/dist/types/firecrawl.d.ts +5 -0
  136. package/dist/types/flags.d.ts +23 -0
  137. package/dist/types/index.d.ts +4 -0
  138. package/dist/types/llm/generate-text.d.ts +43 -0
  139. package/dist/types/llm/google-models.d.ts +10 -0
  140. package/dist/types/llm/html-to-markdown.d.ts +15 -0
  141. package/dist/types/llm/model-id.d.ts +14 -0
  142. package/dist/types/pricing/litellm.d.ts +13 -0
  143. package/dist/types/prompts/file.d.ts +6 -0
  144. package/dist/types/prompts/index.d.ts +3 -0
  145. package/dist/types/prompts/link-summary.d.ts +27 -0
  146. package/dist/types/run.d.ts +8 -0
  147. package/dist/types/shared/contracts.d.ts +2 -0
  148. package/dist/types/summarizeHome.d.ts +6 -0
  149. package/dist/types/tty/live-markdown.d.ts +10 -0
  150. package/dist/types/tty/osc-progress.d.ts +3 -0
  151. package/dist/types/tty/spinner.d.ts +10 -0
  152. package/dist/types/version.d.ts +2 -0
  153. package/docs/README.md +11 -0
  154. package/docs/config.md +28 -0
  155. package/docs/extract-only.md +13 -0
  156. package/docs/firecrawl.md +17 -0
  157. package/docs/llm.md +33 -0
  158. package/docs/openai.md +18 -0
  159. package/docs/site/.nojekyll +1 -0
  160. package/docs/site/404.html +37 -0
  161. package/docs/site/assets/site.css +577 -0
  162. package/docs/site/assets/site.js +69 -0
  163. package/docs/site/docs/config.html +73 -0
  164. package/docs/site/docs/extract-only.html +79 -0
  165. package/docs/site/docs/firecrawl.html +72 -0
  166. package/docs/site/docs/index.html +89 -0
  167. package/docs/site/docs/llm.html +70 -0
  168. package/docs/site/docs/openai.html +66 -0
  169. package/docs/site/docs/website.html +70 -0
  170. package/docs/site/docs/youtube.html +62 -0
  171. package/docs/site/index.html +125 -0
  172. package/docs/website.md +27 -0
  173. package/docs/youtube.md +32 -0
  174. package/package.json +76 -0
@@ -0,0 +1,120 @@
1
+ import { isYouTubeUrl } from '../transcript/utils.js';
2
+ import { appendNote } from './utils.js';
3
/**
 * Headers sent with every HTML document request. They present a desktop Chrome
 * User-Agent and ask intermediaries not to serve a cached copy.
 */
const REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'no-cache',
    Pragma: 'no-cache',
};
/** Timeout applied when the caller does not pass a finite numeric `timeoutMs`. */
const DEFAULT_REQUEST_TIMEOUT_MS = 5000;
/**
 * Downloads `url` as text, reporting progress along the way.
 *
 * Progress events emitted through `onProgress` (when provided):
 * `fetch-html-start`, zero or more `fetch-html-progress`, then `fetch-html-done`.
 *
 * @param {Function} fetchImpl - fetch-compatible function, called as `fetchImpl(url, init)`.
 * @param {string} url - Address of the document to download.
 * @param {object} [options]
 * @param {number} [options.timeoutMs] - Abort deadline in milliseconds; falls back to
 *   `DEFAULT_REQUEST_TIMEOUT_MS` when not a finite number.
 * @param {Function|null} [options.onProgress] - Optional progress callback.
 * @returns {Promise<string>} The decoded response body.
 * @throws {Error} On a non-OK status, on a content-type that is neither HTML/XHTML nor
 *   `text/*`, or with the message "Fetching HTML document timed out" when the deadline
 *   elapses (the underlying abort error is attached as `cause`).
 */
export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress, } = {}) {
    onProgress?.({ kind: 'fetch-html-start', url });
    const controller = new AbortController();
    const effectiveTimeoutMs = typeof timeoutMs === 'number' && Number.isFinite(timeoutMs)
        ? timeoutMs
        : DEFAULT_REQUEST_TIMEOUT_MS;
    const timeout = setTimeout(() => {
        controller.abort();
    }, effectiveTimeoutMs);
    try {
        const response = await fetchImpl(url, {
            headers: REQUEST_HEADERS,
            redirect: 'follow',
            signal: controller.signal,
        });
        if (!response.ok) {
            throw new Error(`Failed to fetch HTML document (status ${response.status})`);
        }
        const contentType = response.headers.get('content-type')?.toLowerCase() ?? null;
        // A missing content-type is tolerated; a present one must look textual.
        if (contentType &&
            !contentType.includes('text/html') &&
            !contentType.includes('application/xhtml+xml') &&
            !contentType.startsWith('text/')) {
            throw new Error(`Unsupported content-type for HTML document fetch: ${contentType}`);
        }
        // Content-Length is advisory only: used for progress reporting, never for validation.
        const totalBytes = (() => {
            const raw = response.headers.get('content-length');
            if (!raw)
                return null;
            const parsed = Number(raw);
            return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : null;
        })();
        const body = response.body;
        if (!body) {
            // No readable stream available: read everything at once and report a single "done".
            const text = await response.text();
            const bytes = new TextEncoder().encode(text).byteLength;
            onProgress?.({ kind: 'fetch-html-done', url, downloadedBytes: bytes, totalBytes });
            return text;
        }
        const reader = body.getReader();
        const decoder = new TextDecoder();
        let downloadedBytes = 0;
        let text = '';
        onProgress?.({ kind: 'fetch-html-progress', url, downloadedBytes: 0, totalBytes });
        while (true) {
            const { value, done } = await reader.read();
            if (done)
                break;
            if (!value)
                continue;
            downloadedBytes += value.byteLength;
            // stream: true keeps multi-byte sequences split across chunks intact.
            text += decoder.decode(value, { stream: true });
            onProgress?.({ kind: 'fetch-html-progress', url, downloadedBytes, totalBytes });
        }
        // Flush any bytes the decoder is still buffering.
        text += decoder.decode();
        onProgress?.({ kind: 'fetch-html-done', url, downloadedBytes, totalBytes });
        return text;
    }
    catch (error) {
        // The controller is only ever aborted by the timer above, so an aborted signal means
        // the deadline elapsed. The name check additionally covers fetch implementations
        // whose abort error is not a DOMException.
        const namedAbort = typeof error === 'object' && error !== null && error.name === 'AbortError';
        if (controller.signal.aborted || namedAbort) {
            throw new Error('Fetching HTML document timed out', { cause: error });
        }
        throw error;
    }
    finally {
        clearTimeout(timeout);
    }
}
79
/**
 * Runs the optional Firecrawl scraper for `url` and returns its payload together with a
 * diagnostics record. Scraper failures are recorded in `diagnostics.notes` and reported
 * as a null payload rather than thrown.
 *
 * @param {string} url - Page to scrape.
 * @param {Function|null|undefined} scrapeWithFirecrawl - Scraper called as
 *   `scrapeWithFirecrawl(url, { timeoutMs, cacheMode })`; a falsy value means "not configured".
 * @param {object} [options]
 * @param {number} [options.timeoutMs] - Forwarded to the scraper unchanged.
 * @param {string} [options.cacheMode] - Defaults to 'default'.
 * @param {Function} [options.onProgress] - Receives 'firecrawl-start' / 'firecrawl-done' events.
 * @param {string} [options.reason] - Reported in the start event; defaults to 'firecrawl'.
 * @returns {Promise<{payload: object|null, diagnostics: object}>}
 */
export async function fetchWithFirecrawl(url, scrapeWithFirecrawl, options = {}) {
    const timeoutMs = options.timeoutMs;
    const cacheMode = options.cacheMode ?? 'default';
    const onProgress = typeof options.onProgress === 'function' ? options.onProgress : null;
    const startReason = typeof options.reason === 'string' ? options.reason : 'firecrawl';
    const diagnostics = {
        attempted: false,
        used: false,
        cacheMode,
        cacheStatus: cacheMode === 'bypass' ? 'bypassed' : 'unknown',
        notes: null,
    };
    // Records a note and produces the "nothing scraped" result.
    const emptyResult = (note) => {
        diagnostics.notes = appendNote(diagnostics.notes, note);
        return { payload: null, diagnostics };
    };
    const reportFailure = () => {
        onProgress?.({ kind: 'firecrawl-done', url, ok: false, markdownBytes: null, htmlBytes: null });
    };
    if (isYouTubeUrl(url)) {
        return emptyResult('Skipped Firecrawl for YouTube URL');
    }
    if (!scrapeWithFirecrawl) {
        return emptyResult('Firecrawl is not configured');
    }
    diagnostics.attempted = true;
    onProgress?.({ kind: 'firecrawl-start', url, reason: startReason });
    try {
        const payload = await scrapeWithFirecrawl(url, { timeoutMs, cacheMode });
        if (!payload) {
            diagnostics.notes = appendNote(diagnostics.notes, 'Firecrawl returned no content payload');
            reportFailure();
            return { payload: null, diagnostics };
        }
        const encoder = new TextEncoder();
        // Byte size of a string field, or null when the field is absent / not a string.
        const sizeOf = (field) => (typeof field === 'string' ? encoder.encode(field).byteLength : null);
        const markdownBytes = sizeOf(payload.markdown);
        const htmlBytes = sizeOf(payload.html);
        onProgress?.({ kind: 'firecrawl-done', url, ok: true, markdownBytes, htmlBytes });
        return { payload, diagnostics };
    }
    catch (error) {
        const detail = error instanceof Error ? error.message : 'unknown error';
        diagnostics.notes = appendNote(diagnostics.notes, `Firecrawl error: ${detail}`);
        reportFailure();
        return { payload: null, diagnostics };
    }
}
120
+ //# sourceMappingURL=fetcher.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetcher.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/fetcher.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAA;AAGrD,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAA;AAEvC,MAAM,eAAe,GAA2B;IAC9C,YAAY,EACV,iHAAiH;IACnH,MAAM,EACJ,kGAAkG;IACpG,iBAAiB,EAAE,gBAAgB;IACnC,eAAe,EAAE,UAAU;IAC3B,MAAM,EAAE,UAAU;CACnB,CAAA;AAED,MAAM,0BAA0B,GAAG,IAAI,CAAA;AAOvC,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,SAAuB,EACvB,GAAW,EACX,EACE,SAAS,EACT,UAAU,MACiF,EAAE;IAE/F,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,kBAAkB,EAAE,GAAG,EAAE,CAAC,CAAA;IAE/C,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAA;IACxC,MAAM,kBAAkB,GACtB,OAAO,SAAS,KAAK,QAAQ,IAAI,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC;QACzD,CAAC,CAAC,SAAS;QACX,CAAC,CAAC,0BAA0B,CAAA;IAChC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE;QAC9B,UAAU,CAAC,KAAK,EAAE,CAAA;IACpB,CAAC,EAAE,kBAAkB,CAAC,CAAA;IAEtB,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE;YACpC,OAAO,EAAE,eAAe;YACxB,QAAQ,EAAE,QAAQ;YAClB,MAAM,EAAE,UAAU,CAAC,MAAM;SAC1B,CAAC,CAAA;QAEF,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CAAC,yCAAyC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAA;QAC9E,CAAC;QAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,EAAE,WAAW,EAAE,IAAI,IAAI,CAAA;QAC/E,IACE,WAAW;YACX,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC;YAClC,CAAC,WAAW,CAAC,QAAQ,CAAC,uBAAuB,CAAC;YAC9C,CAAC,WAAW,CAAC,UAAU,CAAC,OAAO,CAAC,EAChC,CAAC;YACD,MAAM,IAAI,KAAK,CAAC,qDAAqD,WAAW,EAAE,CAAC,CAAA;QACrF,CAAC;QAED,MAAM,UAAU,GAAG,CAAC,GAAG,EAAE;YACvB,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAA;YAClD,IAAI,CAAC,GAAG;gBAAE,OAAO,IAAI,CAAA;YACrB,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAA;YAC1B,OAAO,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAA;QAC1E,CAAC,CAAC,EAAE,CAAA;QAEJ,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAA;QAC1B,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAA;YAClC,MAAM,KAAK,GAAG,IAAI,WAAW,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,CAAA;YACvD,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,iBAAiB,EAA
E,GAAG,EAAE,eAAe,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC,CAAA;YAClF,OAAO,IAAI,CAAA;QACb,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,CAAC,SAAS,EAAE,CAAA;QAC/B,MAAM,OAAO,GAAG,IAAI,WAAW,EAAE,CAAA;QACjC,IAAI,eAAe,GAAG,CAAC,CAAA;QACvB,IAAI,IAAI,GAAG,EAAE,CAAA;QAEb,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,GAAG,EAAE,eAAe,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,CAAA;QAElF,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAA;YAC3C,IAAI,IAAI;gBAAE,MAAK;YACf,IAAI,CAAC,KAAK;gBAAE,SAAQ;YACpB,eAAe,IAAI,KAAK,CAAC,UAAU,CAAA;YACnC,IAAI,IAAI,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC,CAAA;YAC/C,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,GAAG,EAAE,eAAe,EAAE,UAAU,EAAE,CAAC,CAAA;QACjF,CAAC;QAED,IAAI,IAAI,OAAO,CAAC,MAAM,EAAE,CAAA;QACxB,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,iBAAiB,EAAE,GAAG,EAAE,eAAe,EAAE,UAAU,EAAE,CAAC,CAAA;QAC3E,OAAO,IAAI,CAAA;IACb,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;YACjE,MAAM,IAAI,KAAK,CAAC,kCAAkC,CAAC,CAAA;QACrD,CAAC;QACD,MAAM,KAAK,CAAA;IACb,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,OAAO,CAAC,CAAA;IACvB,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,GAAW,EACX,mBAA+C,EAC/C,UAKI,EAAE;IAEN,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,CAAA;IACnC,MAAM,SAAS,GAAc,OAAO,CAAC,SAAS,IAAI,SAAS,CAAA;IAC3D,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAA;IACvF,MAAM,MAAM,GAAG,OAAO,OAAO,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAA;IACzE,MAAM,WAAW,GAAyB;QACxC,SAAS,EAAE,KAAK;QAChB,IAAI,EAAE,KAAK;QACX,SAAS;QACT,WAAW,EAAE,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;QAC5D,KAAK,EAAE,IAAI;KACZ,CAAA;IAED,IAAI,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC;QACtB,WAAW,CAAC,KAAK,GAAG,UAAU,CAAC,WAAW,CAAC,KAAK,EAAE,mCAAmC,CAAC,CAAA;QACtF,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,CAAA;IACvC,CAAC;IAED,IAAI,CAAC,mBAAmB,EAAE,CAAC;QACzB,WAAW,CAAC,KAAK,GAAG,UAAU,CAAC,WAAW,CAAC,KAAK,EAAE,6BAA6B,CAAC,CAAA;QAChF,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,CAAA;IACvC,CAAC;IAED,WAAW,CAAC,SA
AS,GAAG,IAAI,CAAA;IAC5B,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,iBAAiB,EAAE,GAAG,EAAE,MAAM,EAAE,MAAM,IAAI,WAAW,EAAE,CAAC,CAAA;IAE7E,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,mBAAmB,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,CAAA;QACxE,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,WAAW,CAAC,KAAK,GAAG,UAAU,CAAC,WAAW,CAAC,KAAK,EAAE,uCAAuC,CAAC,CAAA;YAC1F,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE,aAAa,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;YAC9F,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,CAAA;QACvC,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,WAAW,EAAE,CAAA;QACjC,MAAM,aAAa,GACjB,OAAO,OAAO,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAA;QAC3F,MAAM,SAAS,GACb,OAAO,OAAO,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAA;QACnF,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,GAAG,EAAE,EAAE,EAAE,IAAI,EAAE,aAAa,EAAE,SAAS,EAAE,CAAC,CAAA;QAEjF,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,CAAA;IACjC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,WAAW,CAAC,KAAK,GAAG,UAAU,CAC5B,WAAW,CAAC,KAAK,EACjB,oBAAoB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CAC/E,CAAA;QACD,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE,aAAa,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;QAC9F,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,CAAA;IACvC,CAAC;AACH,CAAC"}
@@ -0,0 +1,275 @@
1
+ import { resolveTranscriptForLink } from '../transcript/index.js';
2
+ import { extractYouTubeVideoId, isYouTubeUrl, isYouTubeVideoUrl } from '../transcript/utils.js';
3
+ import { extractArticleContent, sanitizeHtmlForMarkdownConversion } from './article.js';
4
+ import { normalizeForPrompt } from './cleaner.js';
5
+ import { fetchHtmlDocument, fetchWithFirecrawl } from './fetcher.js';
6
+ import { extractMetadataFromFirecrawl, extractMetadataFromHtml } from './parsers.js';
7
+ import { appendNote, ensureTranscriptDiagnostics, finalizeExtractedLinkContent, pickFirstText, resolveCacheMode, resolveFirecrawlMode, resolveMaxCharacters, resolveTimeoutMs, safeHostname, selectBaseContent, } from './utils.js';
8
+ import { extractYouTubeShortDescription } from './youtube.js';
9
// Run of whitespace and Unicode control characters (category Cc) at the start of a string.
// The escapes must use single backslashes: inside a regex literal, `\\s` is a literal
// backslash followed by the letter "s", which made the class match the characters
// `\ s p { C c }` instead of whitespace and control characters.
const LEADING_CONTROL_PATTERN = /^[\s\p{Cc}]+/u;
// Phrases that typically appear on bot-wall / access-denied pages (matched case-insensitively).
const BLOCKED_HTML_HINT_PATTERN = /access denied|attention required|captcha|cloudflare|enable javascript|forbidden|please turn javascript on|verify you are human/i;
// Extracted article text at or above this length is treated as sufficient.
const MIN_HTML_CONTENT_CHARACTERS = 200;
// Raw HTML at or above this length with thin extracted text is treated as a fallback signal.
const MIN_HTML_DOCUMENT_CHARACTERS_FOR_FALLBACK = 5000;
13
/**
 * Removes the page title from the start of `content` when the content begins with it.
 *
 * The comparison ignores case and leading whitespace in `content`. After the title is
 * cut off, whatever `LEADING_CONTROL_PATTERN` matches at the start of the remainder is
 * removed as well.
 *
 * @param {string} content - Extracted text.
 * @param {string|null|undefined} title - Page title.
 * @returns {string} `content` unchanged when either argument is empty or the title is not
 *   a prefix; otherwise the text following the title.
 */
function stripLeadingTitle(content, title) {
    if (!content || !title) {
        return content;
    }
    const wantedPrefix = title.trim();
    if (!wantedPrefix.length) {
        return content;
    }
    const body = content.trimStart();
    const startsWithTitle = body.toLowerCase().startsWith(wantedPrefix.toLowerCase());
    return startsWithTitle
        ? body.slice(wantedPrefix.length).replace(LEADING_CONTROL_PATTERN, '')
        : content;
}
29
/**
 * Decides whether a directly fetched HTML document should be re-fetched through Firecrawl.
 *
 * @param {string} html - Raw HTML document.
 * @returns {boolean} true when the HTML matches a known "blocked" phrase, or when the
 *   extracted article text is short while the document itself is large.
 */
function shouldFallbackToFirecrawl(html) {
    const looksBlocked = BLOCKED_HTML_HINT_PATTERN.test(html);
    if (looksBlocked) {
        return true;
    }
    const extractedText = normalizeForPrompt(extractArticleContent(html));
    const hasEnoughContent = extractedText.length >= MIN_HTML_CONTENT_CHARACTERS;
    if (hasEnoughContent) {
        return false;
    }
    // Short extracted text alone is not a signal: small, simple pages (e.g. https://example.com)
    // are likely complete and not worth a Firecrawl call. Thin content only counts when the
    // document is large (SSR/app-shell pages, blocked pages the pattern above missed, etc.).
    const documentIsLarge = html.length >= MIN_HTML_DOCUMENT_CHARACTERS_FOR_FALLBACK;
    return documentIsLarge;
}
42
/**
 * Fetches and extracts the content of `url`, choosing between a direct HTML fetch and
 * the optional Firecrawl scraper.
 *
 * Order of attempts:
 *   1. Firecrawl first when the resolved Firecrawl mode is 'always'.
 *   2. Direct HTML fetch.
 *   3. Firecrawl when the HTML fetch produced nothing, or (mode 'auto') when the HTML
 *      looks blocked or thin.
 *   4. Otherwise the result is built from the fetched HTML.
 * The scraper itself is invoked at most once per call; later attempts reuse its payload.
 *
 * @param {string} url - Link to fetch.
 * @param {object|null|undefined} options - Fetch options (timeout, cache mode, max characters,
 *   `youtubeTranscript`, Firecrawl mode, `format`).
 * @param {object} deps - Injected dependencies (`fetch`, `scrapeWithFirecrawl`, `onProgress`, ...).
 * @returns {Promise<object>} The finalized extracted link content.
 * @throws {Error} When no HTML could be fetched and Firecrawl is unavailable or yields nothing.
 */
export async function fetchLinkContent(url, options, deps) {
    const timeoutMs = resolveTimeoutMs(options);
    const cacheMode = resolveCacheMode(options);
    const maxCharacters = resolveMaxCharacters(options);
    const youtubeTranscriptMode = options?.youtubeTranscript ?? 'auto';
    const firecrawlMode = resolveFirecrawlMode(options);
    const markdownRequested = (options?.format ?? 'text') === 'markdown';
    const canUseFirecrawl = !(firecrawlMode === 'off' || deps.scrapeWithFirecrawl === null || isYouTubeUrl(url));
    // Remembers the single scraper invocation so repeated attempts reuse its payload.
    const scrape = { done: false, payload: null };
    const firecrawlDiagnostics = {
        attempted: false,
        used: false,
        cacheMode,
        cacheStatus: cacheMode === 'bypass' ? 'bypassed' : 'unknown',
        notes: null,
    };
    const noteFirecrawl = (message) => {
        firecrawlDiagnostics.notes = appendNote(firecrawlDiagnostics.notes, message);
    };
    const attemptFirecrawl = async (reason) => {
        if (!canUseFirecrawl) {
            return null;
        }
        if (!scrape.done) {
            const { payload, diagnostics } = await fetchWithFirecrawl(url, deps.scrapeWithFirecrawl, {
                timeoutMs,
                cacheMode,
                onProgress: deps.onProgress ?? null,
                reason,
            });
            scrape.done = true;
            scrape.payload = payload;
            firecrawlDiagnostics.attempted = diagnostics.attempted;
            firecrawlDiagnostics.used = diagnostics.used;
            firecrawlDiagnostics.cacheMode = diagnostics.cacheMode;
            firecrawlDiagnostics.cacheStatus = diagnostics.cacheStatus;
            firecrawlDiagnostics.notes = diagnostics.notes ?? null;
        }
        // The reason is recorded on every attempt, including ones that reuse the payload.
        noteFirecrawl(reason);
        if (!scrape.payload) {
            return null;
        }
        const built = await buildResultFromFirecrawl({
            url,
            payload: scrape.payload,
            cacheMode,
            maxCharacters,
            youtubeTranscriptMode,
            firecrawlDiagnostics,
            markdownRequested,
            deps,
        });
        if (built) {
            return built;
        }
        noteFirecrawl('Firecrawl returned empty content');
        return null;
    };
    if (firecrawlMode === 'always') {
        const forced = await attemptFirecrawl('Firecrawl forced via options');
        if (forced) {
            return forced;
        }
    }
    let html = null;
    let htmlError = null;
    try {
        html = await fetchHtmlDocument(deps.fetch, url, {
            timeoutMs,
            onProgress: deps.onProgress ?? null,
        });
    }
    catch (error) {
        // Deferred: the failure is only fatal if Firecrawl cannot step in below.
        htmlError = error;
    }
    if (!html) {
        if (!canUseFirecrawl) {
            throw htmlError instanceof Error ? htmlError : new Error('Failed to fetch HTML document');
        }
        const rescued = await attemptFirecrawl('HTML fetch failed; falling back to Firecrawl');
        if (rescued) {
            return rescued;
        }
        const firecrawlSuffix = firecrawlDiagnostics.notes
            ? `; Firecrawl notes: ${firecrawlDiagnostics.notes}`
            : '';
        const htmlSuffix = htmlError instanceof Error ? `; HTML error: ${htmlError.message}` : '';
        throw new Error(`Failed to fetch HTML document${firecrawlSuffix}${htmlSuffix}`);
    }
    if (firecrawlMode === 'auto' && shouldFallbackToFirecrawl(html)) {
        const fallback = await attemptFirecrawl('HTML content looked blocked/thin; falling back to Firecrawl');
        if (fallback) {
            return fallback;
        }
    }
    return buildResultFromHtmlDocument({
        url,
        html,
        cacheMode,
        maxCharacters,
        youtubeTranscriptMode,
        firecrawlDiagnostics,
        markdownRequested,
        timeoutMs,
        deps,
    });
}
146
/**
 * Turns a Firecrawl payload into finalized link content.
 *
 * Returns null (after appending an explanatory note to `firecrawlDiagnostics.notes`) when
 * the payload's markdown, or the selected base content, is empty. On success it marks
 * `firecrawlDiagnostics.used` as true.
 *
 * @param {object} args
 * @param {string} args.url - Source URL.
 * @param {object} args.payload - Firecrawl payload (`markdown`, `html`, `metadata`).
 * @param {string} args.cacheMode - Cache mode forwarded to transcript resolution.
 * @param {*} args.maxCharacters - Forwarded to the finalizer.
 * @param {string} args.youtubeTranscriptMode - Forwarded to transcript resolution.
 * @param {object} args.firecrawlDiagnostics - Mutated in place.
 * @param {boolean} args.markdownRequested - Whether the caller asked for markdown output.
 * @param {object} args.deps - Injected dependencies.
 * @returns {Promise<object|null>}
 */
async function buildResultFromFirecrawl({ url, payload, cacheMode, maxCharacters, youtubeTranscriptMode, firecrawlDiagnostics, markdownRequested, deps, }) {
    const rejectWith = (note) => {
        firecrawlDiagnostics.notes = appendNote(firecrawlDiagnostics.notes, note);
        return null;
    };
    const markdownText = normalizeForPrompt(payload.markdown ?? '');
    if (markdownText.length === 0) {
        return rejectWith('Firecrawl markdown normalization yielded empty text');
    }
    const transcriptResolution = await resolveTranscriptForLink(url, payload.html ?? null, deps, {
        youtubeTranscriptMode,
        cacheMode,
    });
    const baseContent = selectBaseContent(markdownText, transcriptResolution.text);
    if (baseContent.length === 0) {
        return rejectWith('Firecrawl produced content that normalized to an empty string');
    }
    // Metadata reported by Firecrawl takes precedence; values parsed from the HTML are fallbacks.
    let fromHtml = { title: null, description: null, siteName: null };
    if (payload.html) {
        fromHtml = extractMetadataFromHtml(payload.html, url);
    }
    const fromFirecrawl = extractMetadataFromFirecrawl(payload.metadata ?? null);
    const title = pickFirstText([fromFirecrawl.title, fromHtml.title]);
    const description = pickFirstText([fromFirecrawl.description, fromHtml.description]);
    const siteName = pickFirstText([fromFirecrawl.siteName, fromHtml.siteName, safeHostname(url)]);
    firecrawlDiagnostics.used = true;
    const transcript = ensureTranscriptDiagnostics(transcriptResolution, cacheMode ?? 'default');
    const markdown = {
        requested: markdownRequested,
        used: true,
        provider: 'firecrawl',
    };
    return finalizeExtractedLinkContent({
        url,
        baseContent,
        maxCharacters,
        title,
        description,
        siteName,
        transcriptResolution,
        diagnostics: {
            strategy: 'firecrawl',
            firecrawl: firecrawlDiagnostics,
            markdown,
            transcript,
        },
    });
}
190
/**
 * Builds finalized link content from a directly fetched HTML document.
 *
 * Base content is chosen from the extracted article text, the YouTube short description
 * (only consulted when no transcript text was resolved), and the transcript. When markdown
 * was requested and a converter is configured (and the URL is not a YouTube URL), the
 * converter's non-empty output replaces the base content.
 *
 * @param {object} args
 * @param {string} args.url - Source URL.
 * @param {string} args.html - Fetched HTML document.
 * @param {string} args.cacheMode - Cache mode forwarded to transcript resolution.
 * @param {*} args.maxCharacters - Forwarded to the finalizer.
 * @param {string} args.youtubeTranscriptMode - Forwarded to transcript resolution.
 * @param {object} args.firecrawlDiagnostics - Included in the result diagnostics as-is.
 * @param {boolean} args.markdownRequested - Whether the caller asked for markdown output.
 * @param {number} args.timeoutMs - Forwarded to the HTML-to-Markdown converter.
 * @param {object} args.deps - Injected dependencies (`convertHtmlToMarkdown`, ...).
 * @returns {Promise<object>} The finalized extracted link content.
 * @throws {Error} When the URL is a YouTube video URL without an extractable video id.
 */
async function buildResultFromHtmlDocument({ url, html, cacheMode, maxCharacters, youtubeTranscriptMode, firecrawlDiagnostics, markdownRequested, timeoutMs, deps, }) {
    if (isYouTubeVideoUrl(url) && !extractYouTubeVideoId(url)) {
        throw new Error('Invalid YouTube video id in URL');
    }
    const { title, description, siteName } = extractMetadataFromHtml(html, url);
    const articleText = normalizeForPrompt(extractArticleContent(html));
    const transcriptResolution = await resolveTranscriptForLink(url, html, deps, {
        youtubeTranscriptMode,
        cacheMode,
    });
    let youtubeDescription = null;
    if (transcriptResolution.text === null) {
        youtubeDescription = extractYouTubeShortDescription(html);
    }
    const candidate = youtubeDescription ? normalizeForPrompt(youtubeDescription) : articleText;
    let baseContent = selectBaseContent(candidate, transcriptResolution.text);
    // Only the plain article text gets its leading title removed.
    if (baseContent === articleText) {
        baseContent = stripLeadingTitle(baseContent, title);
    }
    const transcriptDiagnostics = ensureTranscriptDiagnostics(transcriptResolution, cacheMode ?? 'default');
    // Diagnostics for a markdown request that was not fulfilled.
    const notConverted = (notes) => ({ requested: true, used: false, provider: null, notes });
    // Runs the optional HTML-to-Markdown conversion; on success it swaps `baseContent`.
    const resolveMarkdown = async () => {
        if (!markdownRequested) {
            return { requested: false, used: false, provider: null, notes: null };
        }
        if (isYouTubeUrl(url)) {
            return notConverted('Skipping Markdown conversion for YouTube URLs');
        }
        if (!deps.convertHtmlToMarkdown) {
            return notConverted('No HTML→Markdown converter configured');
        }
        try {
            const converted = await deps.convertHtmlToMarkdown({
                url,
                html: sanitizeHtmlForMarkdownConversion(html),
                title,
                siteName,
                timeoutMs,
            });
            const markdownText = normalizeForPrompt(converted);
            if (markdownText.length === 0) {
                return notConverted('HTML→Markdown conversion returned empty content');
            }
            baseContent = markdownText;
            return { requested: true, used: true, provider: 'llm', notes: null };
        }
        catch (error) {
            // Conversion is best-effort: the failure is recorded and the text content is kept.
            const message = error instanceof Error ? error.message : String(error);
            return notConverted(`HTML→Markdown conversion failed: ${message}`);
        }
    };
    const markdownDiagnostics = await resolveMarkdown();
    return finalizeExtractedLinkContent({
        url,
        baseContent,
        maxCharacters,
        title,
        description,
        siteName,
        transcriptResolution,
        diagnostics: {
            strategy: 'html',
            firecrawl: firecrawlDiagnostics,
            markdown: markdownDiagnostics,
            transcript: transcriptDiagnostics,
        },
    });
}
275
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,wBAAwB,EAAE,MAAM,wBAAwB,CAAA;AACjE,OAAO,EAAE,qBAAqB,EAAE,YAAY,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAA;AAE/F,OAAO,EAAE,qBAAqB,EAAE,iCAAiC,EAAE,MAAM,cAAc,CAAA;AACvF,OAAO,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAA;AACjD,OAAO,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAA;AACpE,OAAO,EAAE,4BAA4B,EAAE,uBAAuB,EAAE,MAAM,cAAc,CAAA;AAEpF,OAAO,EACL,UAAU,EACV,2BAA2B,EAC3B,4BAA4B,EAC5B,aAAa,EACb,gBAAgB,EAChB,oBAAoB,EACpB,oBAAoB,EACpB,gBAAgB,EAChB,YAAY,EACZ,iBAAiB,GAClB,MAAM,YAAY,CAAA;AACnB,OAAO,EAAE,8BAA8B,EAAE,MAAM,cAAc,CAAA;AAE7D,MAAM,uBAAuB,GAAG,iBAAiB,CAAA;AACjD,MAAM,yBAAyB,GAC7B,iIAAiI,CAAA;AACnI,MAAM,2BAA2B,GAAG,GAAG,CAAA;AACvC,MAAM,yCAAyC,GAAG,IAAI,CAAA;AAEtD,SAAS,iBAAiB,CAAC,OAAe,EAAE,KAAgC;IAC1E,IAAI,CAAC,CAAC,OAAO,IAAI,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,eAAe,GAAG,KAAK,CAAC,IAAI,EAAE,CAAA;IACpC,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACjC,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,cAAc,GAAG,OAAO,CAAC,SAAS,EAAE,CAAA;IAC1C,IAAI,CAAC,cAAc,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,eAAe,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;QAC5E,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,iBAAiB,GAAG,cAAc,CAAC,KAAK,CAAC,eAAe,CAAC,MAAM,CAAC,CAAA;IACtE,MAAM,SAAS,GAAG,iBAAiB,CAAC,OAAO,CAAC,uBAAuB,EAAE,EAAE,CAAC,CAAA;IACxE,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,SAAS,yBAAyB,CAAC,IAAY;IAC7C,IAAI,yBAAyB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QACzC,OAAO,IAAI,CAAA;IACb,CAAC;IACD,MAAM,UAAU,GAAG,kBAAkB,CAAC,qBAAqB,CAAC,IAAI,CAAC,CAAC,CAAA;IAClE,IAAI,UAAU,CAAC,MAAM,IAAI,2BAA2B,EAAE,CAAC;QACrD,OAAO,KAAK,CAAA;IACd,CAAC;IAED,qGAAqG;IACrG,mGAAmG;IACnG,gGAAgG;IAChG,OAAO,IAAI,CAAC,MAAM,IAAI,yCAAyC,CAAA;AACjE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,GAAW,EACX,OAA4C,EAC5C,IAAqB;IAErB,MAAM,SAAS,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAA;IAC3C,MAAM,SAAS,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAA;IAC3C,MAAM,aAAa,GAAG,oBAAoB,CAAC,OAAO,CAAC,CAAA;IACnD,MAAM,qBAAqB,GAAG,OAAO,EAAE,iBAAiB,IAAI,MAAM,CAAA;IAClE,MAAM,aAAa,GA
AG,oBAAoB,CAAC,OAAO,CAAC,CAAA;IACnD,MAAM,iBAAiB,GAAG,CAAC,OAAO,EAAE,MAAM,IAAI,MAAM,CAAC,KAAK,UAAU,CAAA;IAEpE,MAAM,eAAe,GACnB,aAAa,KAAK,KAAK,IAAI,IAAI,CAAC,mBAAmB,KAAK,IAAI,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAA;IAEpF,IAAI,kBAAkB,GAAG,KAAK,CAAA;IAC9B,IAAI,gBAAgB,GAAiC,IAAI,CAAA;IACzD,MAAM,oBAAoB,GAAyB;QACjD,SAAS,EAAE,KAAK;QAChB,IAAI,EAAE,KAAK;QACX,SAAS;QACT,WAAW,EAAE,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;QAC5D,KAAK,EAAE,IAAI;KACZ,CAAA;IAED,MAAM,gBAAgB,GAAG,KAAK,EAAE,MAAc,EAAwC,EAAE;QACtF,IAAI,CAAC,eAAe,EAAE,CAAC;YACrB,OAAO,IAAI,CAAA;QACb,CAAC;QAED,IAAI,CAAC,kBAAkB,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,GAAG,EAAE,IAAI,CAAC,mBAAmB,EAAE;gBACtE,SAAS;gBACT,SAAS;gBACT,UAAU,EAAE,IAAI,CAAC,UAAU,IAAI,IAAI;gBACnC,MAAM;aACP,CAAC,CAAA;YACF,kBAAkB,GAAG,IAAI,CAAA;YACzB,gBAAgB,GAAG,OAAO,CAAC,OAAO,CAAA;YAClC,oBAAoB,CAAC,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC,SAAS,CAAA;YAC9D,oBAAoB,CAAC,IAAI,GAAG,OAAO,CAAC,WAAW,CAAC,IAAI,CAAA;YACpD,oBAAoB,CAAC,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC,SAAS,CAAA;YAC9D,oBAAoB,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC,WAAW,CAAA;YAClE,oBAAoB,CAAC,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,KAAK,IAAI,IAAI,CAAA;QAChE,CAAC;QAED,oBAAoB,CAAC,KAAK,GAAG,UAAU,CAAC,oBAAoB,CAAC,KAAK,EAAE,MAAM,CAAC,CAAA;QAE3E,IAAI,CAAC,gBAAgB,EAAE,CAAC;YACtB,OAAO,IAAI,CAAA;QACb,CAAC;QAED,MAAM,eAAe,GAAG,MAAM,wBAAwB,CAAC;YACrD,GAAG;YACH,OAAO,EAAE,gBAAgB;YACzB,SAAS;YACT,aAAa;YACb,qBAAqB;YACrB,oBAAoB;YACpB,iBAAiB;YACjB,IAAI;SACL,CAAC,CAAA;QACF,IAAI,eAAe,EAAE,CAAC;YACpB,OAAO,eAAe,CAAA;QACxB,CAAC;QAED,oBAAoB,CAAC,KAAK,GAAG,UAAU,CACrC,oBAAoB,CAAC,KAAK,EAC1B,kCAAkC,CACnC,CAAA;QACD,OAAO,IAAI,CAAA;IACb,CAAC,CAAA;IAED,IAAI,aAAa,KAAK,QAAQ,EAAE,CAAC;QAC/B,MAAM,eAAe,GAAG,MAAM,gBAAgB,CAAC,8BAA8B,CAAC,CAAA;QAC9E,IAAI,eAAe,EAAE,CAAC;YACpB,OAAO,eAAe,CAAA;QACxB,CAAC;IACH,CAAC;IAED,IAAI,IAAI,GAAkB,IAAI,CAAA;IAC9B,IAAI,SAAS,GAAY,IAAI,CAAA;IAE7B,IAAI,CAAC;QACH,IAAI,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,KAAK,EAAE,GAAG,EAAE;YAC9C,SAAS;YACT,UAAU,EAAE,IAAI,CAAC,UAAU,IAAI,IAAI;SACpC,CAAC,CAAA;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,S
AAS,GAAG,KAAK,CAAA;IACnB,CAAC;IAED,IAAI,CAAC,IAAI,EAAE,CAAC;QACV,IAAI,CAAC,eAAe,EAAE,CAAC;YACrB,MAAM,SAAS,YAAY,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAA;QAC3F,CAAC;QAED,MAAM,eAAe,GAAG,MAAM,gBAAgB,CAAC,8CAA8C,CAAC,CAAA;QAC9F,IAAI,eAAe,EAAE,CAAC;YACpB,OAAO,eAAe,CAAA;QACxB,CAAC;QAED,MAAM,cAAc,GAAG,oBAAoB,CAAC,KAAK;YAC/C,CAAC,CAAC,sBAAsB,oBAAoB,CAAC,KAAK,EAAE;YACpD,CAAC,CAAC,EAAE,CAAA;QACN,MAAM,IAAI,KAAK,CACb,gCAAgC,cAAc,GAC5C,SAAS,YAAY,KAAK,CAAC,CAAC,CAAC,iBAAiB,SAAS,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,EACtE,EAAE,CACH,CAAA;IACH,CAAC;IAED,IAAI,aAAa,KAAK,MAAM,IAAI,yBAAyB,CAAC,IAAI,CAAC,EAAE,CAAC;QAChE,MAAM,eAAe,GAAG,MAAM,gBAAgB,CAC5C,6DAA6D,CAC9D,CAAA;QACD,IAAI,eAAe,EAAE,CAAC;YACpB,OAAO,eAAe,CAAA;QACxB,CAAC;IACH,CAAC;IAED,OAAO,2BAA2B,CAAC;QACjC,GAAG;QACH,IAAI;QACJ,SAAS;QACT,aAAa;QACb,qBAAqB;QACrB,oBAAoB;QACpB,iBAAiB;QACjB,SAAS;QACT,IAAI;KACL,CAAC,CAAA;AACJ,CAAC;AAED,KAAK,UAAU,wBAAwB,CAAC,EACtC,GAAG,EACH,OAAO,EACP,SAAS,EACT,aAAa,EACb,qBAAqB,EACrB,oBAAoB,EACpB,iBAAiB,EACjB,IAAI,GAUL;IACC,MAAM,kBAAkB,GAAG,kBAAkB,CAAC,OAAO,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAA;IACrE,IAAI,kBAAkB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACpC,oBAAoB,CAAC,KAAK,GAAG,UAAU,CACrC,oBAAoB,CAAC,KAAK,EAC1B,qDAAqD,CACtD,CAAA;QACD,OAAO,IAAI,CAAA;IACb,CAAC;IAED,MAAM,oBAAoB,GAAG,MAAM,wBAAwB,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,IAAI,IAAI,EAAE,IAAI,EAAE;QAC3F,qBAAqB;QACrB,SAAS;KACV,CAAC,CAAA;IACF,MAAM,WAAW,GAAG,iBAAiB,CAAC,kBAAkB,EAAE,oBAAoB,CAAC,IAAI,CAAC,CAAA;IACpF,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC7B,oBAAoB,CAAC,KAAK,GAAG,UAAU,CACrC,oBAAoB,CAAC,KAAK,EAC1B,+DAA+D,CAChE,CAAA;QACD,OAAO,IAAI,CAAA;IACb,CAAC;IAED,MAAM,YAAY,GAAG,OAAO,CAAC,IAAI;QAC/B,CAAC,CAAC,uBAAuB,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC;QAC5C,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAA;IACtD,MAAM,QAAQ,GAAG,4BAA4B,CAAC,OAAO,CAAC,QAAQ,IAAI,IAAI,CAAC,CAAA;IAEvE,MAAM,KAAK,GAAG,aAAa,CAAC,CAAC,QAAQ,CAAC,KAAK,EAAE,YAAY,CAAC,KAAK,CAAC,CAAC,CAAA;IACjE,MAAM,WAAW,GAAG,aAAa,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,YAAY,CAAC,WAAW
,CAAC,CAAC,CAAA;IACnF,MAAM,QAAQ,GAAG,aAAa,CAAC,CAAC,QAAQ,CAAC,QAAQ,EAAE,YAAY,CAAC,QAAQ,EAAE,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;IAE7F,oBAAoB,CAAC,IAAI,GAAG,IAAI,CAAA;IAEhC,MAAM,qBAAqB,GAAG,2BAA2B,CACvD,oBAAoB,EACpB,SAAS,IAAI,SAAS,CACvB,CAAA;IAED,OAAO,4BAA4B,CAAC;QAClC,GAAG;QACH,WAAW;QACX,aAAa;QACb,KAAK;QACL,WAAW;QACX,QAAQ;QACR,oBAAoB;QACpB,WAAW,EAAE;YACX,QAAQ,EAAE,WAAW;YACrB,SAAS,EAAE,oBAAoB;YAC/B,QAAQ,EAAE;gBACR,SAAS,EAAE,iBAAiB;gBAC5B,IAAI,EAAE,IAAI;gBACV,QAAQ,EAAE,WAAW;aACtB;YACD,UAAU,EAAE,qBAAqB;SAClC;KACF,CAAC,CAAA;AACJ,CAAC;AAED,KAAK,UAAU,2BAA2B,CAAC,EACzC,GAAG,EACH,IAAI,EACJ,SAAS,EACT,aAAa,EACb,qBAAqB,EACrB,oBAAoB,EACpB,iBAAiB,EACjB,SAAS,EACT,IAAI,GAWL;IACC,IAAI,iBAAiB,CAAC,GAAG,CAAC,IAAI,CAAC,qBAAqB,CAAC,GAAG,CAAC,EAAE,CAAC;QAC1D,MAAM,IAAI,KAAK,CAAC,iCAAiC,CAAC,CAAA;IACpD,CAAC;IAED,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,GAAG,uBAAuB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAA;IAC3E,MAAM,UAAU,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAA;IAC9C,MAAM,UAAU,GAAG,kBAAkB,CAAC,UAAU,CAAC,CAAA;IACjD,MAAM,oBAAoB,GAAG,MAAM,wBAAwB,CAAC,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE;QAC3E,qBAAqB;QACrB,SAAS;KACV,CAAC,CAAA;IAEF,MAAM,kBAAkB,GACtB,oBAAoB,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC,CAAC,8BAA8B,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAA;IAClF,MAAM,aAAa,GAAG,kBAAkB,CAAC,CAAC,CAAC,kBAAkB,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,UAAU,CAAA;IAE9F,IAAI,WAAW,GAAG,iBAAiB,CAAC,aAAa,EAAE,oBAAoB,CAAC,IAAI,CAAC,CAAA;IAC7E,IAAI,WAAW,KAAK,UAAU,EAAE,CAAC;QAC/B,WAAW,GAAG,iBAAiB,CAAC,WAAW,EAAE,KAAK,CAAC,CAAA;IACrD,CAAC;IAED,MAAM,qBAAqB,GAAG,2BAA2B,CACvD,oBAAoB,EACpB,SAAS,IAAI,SAAS,CACvB,CAAA;IAED,MAAM,mBAAmB,GAAwB,MAAM,CAAC,KAAK,IAAI,EAAE;QACjE,IAAI,CAAC,iBAAiB,EAAE,CAAC;YACvB,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAA;QACvE,CAAC;QAED,IAAI,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO;gBACL,SAAS,EAAE,IAAI;gBACf,IAAI,EAAE,KAAK;gBACX,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,+CAA+C;aACvD,CAAA;QACH,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,qBAAqB,EAAE,CAAC;YAChC,OAAO;gBACL,SAAS,EAAE,IAAI;gBACf,IAAI,EAAE,KAAK;gBACX,QAAQ,EAAE,IAA
I;gBACd,KAAK,EAAE,uCAAuC;aAC/C,CAAA;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,aAAa,GAAG,iCAAiC,CAAC,IAAI,CAAC,CAAA;YAC7D,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,qBAAqB,CAAC;gBAChD,GAAG;gBACH,IAAI,EAAE,aAAa;gBACnB,KAAK;gBACL,QAAQ;gBACR,SAAS;aACV,CAAC,CAAA;YACF,MAAM,kBAAkB,GAAG,kBAAkB,CAAC,QAAQ,CAAC,CAAA;YACvD,IAAI,kBAAkB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACpC,OAAO;oBACL,SAAS,EAAE,IAAI;oBACf,IAAI,EAAE,KAAK;oBACX,QAAQ,EAAE,IAAI;oBACd,KAAK,EAAE,iDAAiD;iBACzD,CAAA;YACH,CAAC;YAED,WAAW,GAAG,kBAAkB,CAAA;YAChC,OAAO,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,CAAA;QACtE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;YACtE,OAAO;gBACL,SAAS,EAAE,IAAI;gBACf,IAAI,EAAE,KAAK;gBACX,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,oCAAoC,OAAO,EAAE;aACrD,CAAA;QACH,CAAC;IACH,CAAC,CAAC,EAAE,CAAA;IAEJ,OAAO,4BAA4B,CAAC;QAClC,GAAG;QACH,WAAW;QACX,aAAa;QACb,KAAK;QACL,WAAW;QACX,QAAQ;QACR,oBAAoB;QACpB,WAAW,EAAE;YACX,QAAQ,EAAE,MAAM;YAChB,SAAS,EAAE,oBAAoB;YAC/B,QAAQ,EAAE,mBAAmB;YAC7B,UAAU,EAAE,qBAAqB;SAClC;KACF,CAAC,CAAA;AACJ,CAAC"}
@@ -0,0 +1,77 @@
1
+ import { load } from 'cheerio';
2
+ import { decodeHtmlEntities, normalizeCandidate } from './cleaner.js';
3
+ import { pickFirstText, safeHostname } from './utils.js';
4
/** Lower-cased tag names that `extractTagText` is willing to read text from. */
const ALLOWED_TEXT_TAGS = new Set(['title']);
5
/**
 * Read the title, description and site name out of an HTML document.
 *
 * Every field is resolved from a prioritized list of `<meta>` tags. The title
 * additionally falls back to the `<title>` element, and the site name falls
 * back to the hostname derived from `url`.
 *
 * @param html - HTML markup handed to cheerio's `load`.
 * @param url - URL of the page; only used for the site-name fallback.
 * @returns `{ title, description, siteName }`, each being the first usable
 *   candidate or `null` when none was found.
 */
export function extractMetadataFromHtml(html, url) {
    const $ = load(html);

    const metaTitle = pickMetaContent($, [
        { attribute: 'property', value: 'og:title' },
        { attribute: 'name', value: 'og:title' },
        { attribute: 'name', value: 'twitter:title' },
    ]);
    const documentTitle = extractTagText($, 'title');
    const title = pickFirstText([metaTitle, documentTitle]);

    const metaDescription = pickMetaContent($, [
        { attribute: 'property', value: 'og:description' },
        { attribute: 'name', value: 'description' },
        { attribute: 'name', value: 'twitter:description' },
    ]);
    const description = pickFirstText([metaDescription]);

    const metaSiteName = pickMetaContent($, [
        { attribute: 'property', value: 'og:site_name' },
        { attribute: 'name', value: 'application-name' },
    ]);
    const hostname = safeHostname(url);
    const siteName = pickFirstText([metaSiteName, hostname]);

    return { title, description, siteName };
}
31
/**
 * Build `{ title, description, siteName }` from a Firecrawl metadata record.
 *
 * For each field the plain key is tried first and its `og*` counterpart
 * second; the first usable string wins, otherwise the field is `null`.
 *
 * @param metadata - Metadata object (may be null/undefined, in which case
 *   every lookup yields `null`).
 * @returns Object with `title`, `description` and `siteName`.
 */
export function extractMetadataFromFirecrawl(metadata) {
    // Look the keys up in priority order and keep the first usable value.
    const firstOf = (...keys) => pickFirstText(keys.map((key) => metadataString(metadata, key)));

    const title = firstOf('title', 'ogTitle');
    const description = firstOf('description', 'ogDescription');
    const siteName = firstOf('siteName', 'ogSiteName');
    return { title, description, siteName };
}
44
/**
 * Return the first non-empty `<meta>` value among the given selectors.
 *
 * Selectors are tried in order. For each one only the first matching element
 * is inspected; its `content` attribute is preferred and `value` is used as a
 * fallback. The raw value is entity-decoded and normalized before it is
 * accepted.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param selectors - Iterable of `{ attribute, value }` pairs describing
 *   `meta[attribute="value"]` lookups.
 * @returns The normalized value, or `null` when no selector yields one.
 */
function pickMetaContent($, selectors) {
    for (const { attribute, value } of selectors) {
        const match = $(`meta[${attribute}="${value}"]`).first();
        if (match.length !== 0) {
            const raw = match.attr('content') ?? match.attr('value') ?? '';
            const candidate = normalizeCandidate(decodeHtmlEntities(raw));
            if (candidate) {
                return candidate;
            }
        }
    }
    return null;
}
58
/**
 * Read the text of the first element with the given tag name.
 *
 * The tag name is trimmed and lower-cased, and only names listed in
 * `ALLOWED_TEXT_TAGS` are looked up at all.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param tagName - Tag to read, e.g. `'title'`.
 * @returns The entity-decoded, normalized text, or `null` when the tag is not
 *   allowed or not present.
 */
function extractTagText($, tagName) {
    const tag = tagName.trim().toLowerCase();
    if (!ALLOWED_TEXT_TAGS.has(tag)) {
        return null;
    }
    const match = $(tag).first();
    return match.length === 0 ? null : normalizeCandidate(decodeHtmlEntities(match.text()));
}
70
/**
 * Read `metadata[key]` as a normalized string.
 *
 * @param metadata - Metadata record; may be null/undefined.
 * @param key - Property name to read.
 * @returns The normalized value when the property holds a string, otherwise
 *   `null` (missing record, missing key, or non-string value).
 */
function metadataString(metadata, key) {
    const value = metadata ? metadata[key] : undefined;
    if (typeof value !== 'string') {
        return null;
    }
    return normalizeCandidate(value);
}
77
+ //# sourceMappingURL=parsers.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parsers.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/parsers.ts"],"names":[],"mappings":"AAAA,OAAO,EAAmB,IAAI,EAAE,MAAM,SAAS,CAAA;AAE/C,OAAO,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAA;AACrE,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAExD,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAA;AAa5C,MAAM,UAAU,uBAAuB,CAAC,IAAY,EAAE,GAAW;IAC/D,MAAM,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAA;IAEpB,MAAM,KAAK,GAAG,aAAa,CAAC;QAC1B,eAAe,CAAC,CAAC,EAAE;YACjB,EAAE,SAAS,EAAE,UAAU,EAAE,KAAK,EAAE,UAAU,EAAE;YAC5C,EAAE,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,UAAU,EAAE;YACxC,EAAE,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,eAAe,EAAE;SAC9C,CAAC;QACF,cAAc,CAAC,CAAC,EAAE,OAAO,CAAC;KAC3B,CAAC,CAAA;IAEF,MAAM,WAAW,GAAG,aAAa,CAAC;QAChC,eAAe,CAAC,CAAC,EAAE;YACjB,EAAE,SAAS,EAAE,UAAU,EAAE,KAAK,EAAE,gBAAgB,EAAE;YAClD,EAAE,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,aAAa,EAAE;YAC3C,EAAE,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,qBAAqB,EAAE;SACpD,CAAC;KACH,CAAC,CAAA;IAEF,MAAM,QAAQ,GAAG,aAAa,CAAC;QAC7B,eAAe,CAAC,CAAC,EAAE;YACjB,EAAE,SAAS,EAAE,UAAU,EAAE,KAAK,EAAE,cAAc,EAAE;YAChD,EAAE,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,kBAAkB,EAAE;SACjD,CAAC;QACF,YAAY,CAAC,GAAG,CAAC;KAClB,CAAC,CAAA;IAEF,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,CAAA;AACzC,CAAC;AAED,MAAM,UAAU,4BAA4B,CAC1C,QAAoD;IAEpD,OAAO;QACL,KAAK,EAAE,aAAa,CAAC,CAAC,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,EAAE,cAAc,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC,CAAC;QAC9F,WAAW,EAAE,aAAa,CAAC;YACzB,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC;YACvC,cAAc,CAAC,QAAQ,EAAE,eAAe,CAAC;SAC1C,CAAC;QACF,QAAQ,EAAE,aAAa,CAAC;YACtB,cAAc,CAAC,QAAQ,EAAE,UAAU,CAAC;YACpC,cAAc,CAAC,QAAQ,EAAE,YAAY,CAAC;SACvC,CAAC;KACH,CAAA;AACH,CAAC;AAED,SAAS,eAAe,CAAC,CAAa,EAAE,SAAyB;IAC/D,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,MAAM,IAAI,GAAG,CAAC,CAAC,QAAQ,QAAQ,CAAC,SAAS,KAAK,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC,KAAK,EAAE,CAAA;QACzE,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtB,SAAQ;QACV,CAAC;QACD,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAA;QAC9D,MAAM,UAAU,GAAG,kBAAkB,CAAC,kBAAk
B,CAAC,KAAK,CAAC,CAAC,CAAA;QAChE,IAAI,UAAU,EAAE,CAAC;YACf,OAAO,UAAU,CAAA;QACnB,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAA;AACb,CAAC;AAED,SAAS,cAAc,CAAC,CAAa,EAAE,OAAe;IACpD,MAAM,aAAa,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;IAClD,IAAI,CAAC,iBAAiB,CAAC,GAAG,CAAC,aAAa,CAAC,EAAE,CAAC;QAC1C,OAAO,IAAI,CAAA;IACb,CAAC;IACD,MAAM,OAAO,GAAG,CAAC,CAAC,aAAa,CAAC,CAAC,KAAK,EAAE,CAAA;IACxC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,CAAA;IACb,CAAC;IACD,MAAM,IAAI,GAAG,kBAAkB,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAA;IAC/C,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAA;AACjC,CAAC;AAED,SAAS,cAAc,CACrB,QAAoD,EACpD,GAAW;IAEX,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,OAAO,IAAI,CAAA;IACb,CAAC;IACD,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAA;IAC3B,OAAO,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAA;AACrE,CAAC"}
@@ -0,0 +1,4 @@
1
/** Fallback timeout, in milliseconds (two minutes). */
export const DEFAULT_TIMEOUT_MS = 2 * 60 * 1000;

/** Fallback content budget, in characters. */
export const DEFAULT_MAX_CONTENT_CHARACTERS = 8_000;

/** Cache mode used when the caller does not specify one. */
export const DEFAULT_CACHE_MODE = 'default';
4
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/types.ts"],"names":[],"mappings":"AAOA,MAAM,CAAC,MAAM,kBAAkB,GAAG,OAAO,CAAA;AACzC,MAAM,CAAC,MAAM,8BAA8B,GAAG,IAAI,CAAA;AAClD,MAAM,CAAC,MAAM,kBAAkB,GAAc,SAAS,CAAA"}
@@ -0,0 +1,127 @@
1
+ import { applyContentBudget, normalizeCandidate, normalizeForPrompt } from './cleaner.js';
2
+ import { DEFAULT_CACHE_MODE, DEFAULT_MAX_CONTENT_CHARACTERS, DEFAULT_TIMEOUT_MS, } from './types.js';
3
/** Matches a leading `www.` (case-insensitive) so it can be stripped from hostnames. */
const WWW_PREFIX_PATTERN = /^www\./i;

/** Splits transcript text into lines on LF or CRLF. */
const TRANSCRIPT_LINE_SPLIT_PATTERN = /\r?\n/;

/** Splits text on runs of whitespace when counting words. */
const WORD_SPLIT_PATTERN = /\s+/g;
6
/**
 * Resolve the cache mode to use for a request.
 *
 * @param options - Optional options bag; may be null/undefined.
 * @returns `options.cacheMode` when it is neither null nor undefined,
 *   otherwise `DEFAULT_CACHE_MODE`.
 */
export function resolveCacheMode(options) {
    const requested = options?.cacheMode;
    return requested ?? DEFAULT_CACHE_MODE;
}
9
/**
 * Resolve the content budget (in characters) requested by the caller.
 *
 * @param options - Optional options bag; may be null/undefined.
 * @returns `null` when `options.maxCharacters` is not a positive finite
 *   number. Otherwise the value rounded down, except that anything at or below
 *   `DEFAULT_MAX_CONTENT_CHARACTERS` is raised to that default.
 */
export function resolveMaxCharacters(options) {
    const requested = options?.maxCharacters;
    const isPositiveNumber = typeof requested === 'number' && Number.isFinite(requested) && requested > 0;
    if (!isPositiveNumber) {
        return null;
    }
    return requested > DEFAULT_MAX_CONTENT_CHARACTERS
        ? Math.floor(requested)
        : DEFAULT_MAX_CONTENT_CHARACTERS;
}
19
/**
 * Resolve the timeout (in whole milliseconds) requested by the caller.
 *
 * @param options - Optional options bag; may be null/undefined.
 * @returns `options.timeoutMs` rounded down when that yields a positive
 *   integer, otherwise `DEFAULT_TIMEOUT_MS`.
 */
export function resolveTimeoutMs(options) {
    const candidate = options?.timeoutMs;
    if (typeof candidate !== 'number' || !Number.isFinite(candidate)) {
        return DEFAULT_TIMEOUT_MS;
    }
    // Validate AFTER rounding: a fractional value such as 0.5 is > 0 but floors
    // to 0, which would otherwise slip through as a zero-millisecond timeout.
    const wholeMs = Math.floor(candidate);
    return wholeMs > 0 ? wholeMs : DEFAULT_TIMEOUT_MS;
}
26
/**
 * Resolve the Firecrawl mode requested by the caller.
 *
 * @param options - Optional options bag; may be null/undefined.
 * @returns `'off'`, `'auto'` or `'always'` when `options.firecrawl` is exactly
 *   one of those strings; `'auto'` for anything else.
 */
export function resolveFirecrawlMode(options) {
    const requested = options?.firecrawl;
    switch (requested) {
        case 'off':
        case 'always':
            return requested;
        default:
            // Covers an explicit 'auto' as well as missing or unrecognized values.
            return 'auto';
    }
}
33
/**
 * Append a note to an existing `; `-separated note string.
 *
 * @param existing - Notes collected so far; may be null/undefined/empty.
 * @param next - Note to add; when falsy nothing is appended.
 * @returns `existing` (or `''` if it is null/undefined) when `next` is falsy,
 *   `next` alone when there are no existing notes, otherwise
 *   `"<existing>; <next>"`.
 */
export function appendNote(existing, next) {
    if (next) {
        const hasExisting = Boolean(existing) && existing.length !== 0;
        return hasExisting ? `${existing}; ${next}` : next;
    }
    return existing ?? '';
}
42
/**
 * Extract the hostname from a URL without a leading `www.`.
 *
 * @param rawUrl - URL string to parse.
 * @returns The hostname with any leading `www.` removed, or `null` when
 *   `rawUrl` cannot be parsed by the `URL` constructor.
 */
export function safeHostname(rawUrl) {
    try {
        const { hostname } = new URL(rawUrl);
        return hostname.replace(WWW_PREFIX_PATTERN, '');
    }
    catch {
        // Unparseable input is an expected case, not an error.
        return null;
    }
}
50
/**
 * Return the first candidate that survives normalization.
 *
 * @param candidates - Iterable of candidate values, in priority order.
 * @returns The normalized form of the first candidate for which
 *   `normalizeCandidate` yields a truthy value, or `null` if there is none.
 */
export function pickFirstText(candidates) {
    for (const raw of candidates) {
        const text = normalizeCandidate(raw);
        if (!text) {
            continue;
        }
        return text;
    }
    return null;
}
59
/**
 * Choose the text that downstream processing should start from.
 *
 * @param sourceContent - Content extracted from the page itself.
 * @param transcriptText - Transcript text, if any.
 * @returns `"Transcript:\n<normalized transcript>"` when a transcript is
 *   present and non-empty after normalization; otherwise `sourceContent`.
 */
export function selectBaseContent(sourceContent, transcriptText) {
    const transcript = transcriptText ? normalizeForPrompt(transcriptText) : '';
    return transcript.length === 0 ? sourceContent : `Transcript:\n${transcript}`;
}
69
/**
 * Compute simple size statistics for a transcript.
 *
 * @param transcriptText - Transcript text; may be null/undefined/empty.
 * @returns `{ transcriptCharacters, transcriptLines }` where
 *   `transcriptCharacters` is the string length and `transcriptLines` is the
 *   number of lines that are non-empty after trimming. Either value is `null`
 *   instead of 0, and both are `null` when no transcript is given.
 */
export function summarizeTranscript(transcriptText) {
    if (!transcriptText) {
        return { transcriptCharacters: null, transcriptLines: null };
    }
    const characterCount = transcriptText.length;
    let nonBlankLines = 0;
    for (const line of transcriptText.split(TRANSCRIPT_LINE_SPLIT_PATTERN)) {
        if (line.trim().length > 0) {
            nonBlankLines += 1;
        }
    }
    return {
        transcriptCharacters: characterCount > 0 ? characterCount : null,
        transcriptLines: nonBlankLines > 0 ? nonBlankLines : null,
    };
}
81
/**
 * Return transcript diagnostics, synthesizing them when the resolution has none.
 *
 * @param resolution - Transcript resolution; `diagnostics`, `text` and
 *   `source` are read from it.
 * @param cacheMode - Cache mode in effect for the request.
 * @returns `resolution.diagnostics` when truthy. Otherwise a new object whose
 *   `cacheStatus` is `'bypassed'` for cache mode `'bypass'`, `'miss'` when a
 *   non-empty text is present, and `'unknown'` in every other case.
 */
export function ensureTranscriptDiagnostics(resolution, cacheMode) {
    if (resolution.diagnostics) {
        return resolution.diagnostics;
    }
    const { text, source } = resolution;
    const bypassRequested = cacheMode === 'bypass';
    const textProvided = typeof text === 'string' && text.length > 0;

    let cacheStatus = 'unknown';
    if (bypassRequested) {
        cacheStatus = 'bypassed';
    }
    else if (textProvided) {
        cacheStatus = 'miss';
    }

    return {
        cacheMode,
        cacheStatus,
        textProvided,
        provider: source,
        attemptedProviders: source ? [source] : [],
        notes: bypassRequested ? 'Cache bypass requested' : null,
    };
}
96
/**
 * Assemble the final extracted-content record for a link.
 *
 * The base content is normalized with `normalizeForPrompt`. When
 * `maxCharacters` is a number, `applyContentBudget` decides the content,
 * truncation flag and counts; otherwise the normalized text is used in full
 * and words are counted by splitting on whitespace.
 *
 * @param params - Named arguments.
 * @param params.url - URL the content was extracted from.
 * @param params.baseContent - Raw content to normalize.
 * @param params.maxCharacters - Character budget; any non-number disables budgeting.
 * @param params.title - Title to pass through.
 * @param params.description - Description to pass through.
 * @param params.siteName - Site name to pass through.
 * @param params.transcriptResolution - Transcript resolution; `text` and `source` are read.
 * @param params.diagnostics - Diagnostics object to pass through.
 * @returns Record combining the pass-through fields with `content`,
 *   `truncated`, `totalCharacters`, `wordCount`, `transcriptCharacters`,
 *   `transcriptLines` and `transcriptSource`.
 */
export function finalizeExtractedLinkContent({ url, baseContent, maxCharacters, title, description, siteName, transcriptResolution, diagnostics, }) {
    const normalized = normalizeForPrompt(baseContent);

    let budgeted;
    if (typeof maxCharacters === 'number') {
        budgeted = applyContentBudget(normalized, maxCharacters);
    }
    else {
        // No budget: keep everything and count whitespace-separated words.
        let words = 0;
        if (normalized.length > 0) {
            for (const token of normalized.split(WORD_SPLIT_PATTERN)) {
                if (token.trim().length > 0) {
                    words += 1;
                }
            }
        }
        budgeted = {
            content: normalized,
            truncated: false,
            totalCharacters: normalized.length,
            wordCount: words,
        };
    }

    const { content, truncated, totalCharacters, wordCount } = budgeted;
    const { transcriptCharacters, transcriptLines } = summarizeTranscript(transcriptResolution.text);
    return {
        url,
        title,
        description,
        siteName,
        content,
        truncated,
        totalCharacters,
        wordCount,
        transcriptCharacters,
        transcriptLines,
        transcriptSource: transcriptResolution.source,
        diagnostics,
    };
}
127
+ //# sourceMappingURL=utils.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"utils.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/utils.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAA;AACzF,OAAO,EACL,kBAAkB,EAClB,8BAA8B,EAC9B,kBAAkB,GAMnB,MAAM,YAAY,CAAA;AAEnB,MAAM,kBAAkB,GAAG,SAAS,CAAA;AACpC,MAAM,6BAA6B,GAAG,OAAO,CAAA;AAC7C,MAAM,kBAAkB,GAAG,MAAM,CAAA;AAEjC,MAAM,UAAU,gBAAgB,CAAC,OAAiC;IAChE,OAAO,OAAO,EAAE,SAAS,IAAI,kBAAkB,CAAA;AACjD,CAAC;AAED,MAAM,UAAU,oBAAoB,CAAC,OAAiC;IACpE,MAAM,SAAS,GAAG,OAAO,EAAE,aAAa,CAAA;IACxC,IAAI,OAAO,SAAS,KAAK,QAAQ,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;QACnF,OAAO,IAAI,CAAA;IACb,CAAC;IACD,IAAI,SAAS,IAAI,8BAA8B,EAAE,CAAC;QAChD,OAAO,8BAA8B,CAAA;IACvC,CAAC;IACD,OAAO,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;AAC9B,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,OAAiC;IAChE,MAAM,SAAS,GAAG,OAAO,EAAE,SAAS,CAAA;IACpC,IAAI,OAAO,SAAS,KAAK,QAAQ,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;QACnF,OAAO,kBAAkB,CAAA;IAC3B,CAAC;IACD,OAAO,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;AAC9B,CAAC;AAED,MAAM,UAAU,oBAAoB,CAAC,OAAiC;IACpE,MAAM,SAAS,GAAG,OAAO,EAAE,SAAS,CAAA;IACpC,IAAI,SAAS,KAAK,KAAK,IAAI,SAAS,KAAK,MAAM,IAAI,SAAS,KAAK,QAAQ,EAAE,CAAC;QAC1E,OAAO,SAAS,CAAA;IAClB,CAAC;IACD,OAAO,MAAM,CAAA;AACf,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,QAAmC,EAAE,IAAY;IAC1E,IAAI,CAAC,IAAI,EAAE,CAAC;QACV,OAAO,QAAQ,IAAI,EAAE,CAAA;IACvB,CAAC;IACD,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvC,OAAO,IAAI,CAAA;IACb,CAAC;IACD,OAAO,GAAG,QAAQ,KAAK,IAAI,EAAE,CAAA;AAC/B,CAAC;AAED,MAAM,UAAU,YAAY,CAAC,MAAc;IACzC,IAAI,CAAC;QACH,OAAO,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAA;IACjE,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAA;IACb,CAAC;AACH,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,UAA4C;IACxE,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,UAAU,GAAG,kBAAkB,CAAC,SAAS,CAAC,CAAA;QAChD,IAAI,UAAU,EAAE,CAAC;YACf,OAAO,UAAU,CAAA;QACnB,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAA;AACb,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,aAAqB,EAAE,cAA6B;IACpF,IAA
I,CAAC,cAAc,EAAE,CAAC;QACpB,OAAO,aAAa,CAAA;IACtB,CAAC;IACD,MAAM,oBAAoB,GAAG,kBAAkB,CAAC,cAAc,CAAC,CAAA;IAC/D,IAAI,oBAAoB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,aAAa,CAAA;IACtB,CAAC;IACD,OAAO,gBAAgB,oBAAoB,EAAE,CAAA;AAC/C,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,cAA6B;IAC/D,IAAI,CAAC,cAAc,EAAE,CAAC;QACpB,OAAO,EAAE,oBAAoB,EAAE,IAAI,EAAE,eAAe,EAAE,IAAI,EAAE,CAAA;IAC9D,CAAC;IACD,MAAM,oBAAoB,GAAG,cAAc,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAA;IACrF,MAAM,kBAAkB,GAAG,cAAc;SACtC,KAAK,CAAC,6BAA6B,CAAC;SACpC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SAC1B,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,MAAM,CAAA;IAC3C,MAAM,eAAe,GAAG,kBAAkB,GAAG,CAAC,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,IAAI,CAAA;IAC1E,OAAO,EAAE,oBAAoB,EAAE,eAAe,EAAE,CAAA;AAClD,CAAC;AAED,MAAM,UAAU,2BAA2B,CACzC,UAAgC,EAChC,SAAoB;IAEpB,IAAI,UAAU,CAAC,WAAW,EAAE,CAAC;QAC3B,OAAO,UAAU,CAAC,WAAW,CAAA;IAC/B,CAAC;IACD,MAAM,OAAO,GAAG,OAAO,UAAU,CAAC,IAAI,KAAK,QAAQ,IAAI,UAAU,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAA;IACjF,MAAM,WAAW,GAAG,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAA;IACtF,OAAO;QACL,SAAS;QACT,WAAW;QACX,YAAY,EAAE,OAAO;QACrB,QAAQ,EAAE,UAAU,CAAC,MAAM;QAC3B,kBAAkB,EAAE,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE;QAChE,KAAK,EAAE,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,wBAAwB,CAAC,CAAC,CAAC,IAAI;KAChE,CAAA;AACH,CAAC;AAED,MAAM,UAAU,4BAA4B,CAAC,EAC3C,GAAG,EACH,WAAW,EACX,aAAa,EACb,KAAK,EACL,WAAW,EACX,QAAQ,EACR,oBAAoB,EACpB,WAAW,GACW;IACtB,MAAM,UAAU,GAAG,kBAAkB,CAAC,WAAW,CAAC,CAAA;IAClD,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,SAAS,EAAE,GACtD,OAAO,aAAa,KAAK,QAAQ;QAC/B,CAAC,CAAC,kBAAkB,CAAC,UAAU,EAAE,aAAa,CAAC;QAC/C,CAAC,CAAC;YACE,OAAO,EAAE,UAAU;YACnB,SAAS,EAAE,KAAK;YAChB,eAAe,EAAE,UAAU,CAAC,MAAM;YAClC,SAAS,EACP,UAAU,CAAC,MAAM,GAAG,CAAC;gBACnB,CAAC,CAAC,UAAU;qBACP,KAAK,CAAC,kBAAkB,CAAC;qBACzB,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;qBAC5B,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE
,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,MAAM;gBAC/C,CAAC,CAAC,CAAC;SACR,CAAA;IACP,MAAM,EAAE,oBAAoB,EAAE,eAAe,EAAE,GAAG,mBAAmB,CAAC,oBAAoB,CAAC,IAAI,CAAC,CAAA;IAEhG,OAAO;QACL,GAAG;QACH,KAAK;QACL,WAAW;QACX,QAAQ;QACR,OAAO;QACP,SAAS;QACT,eAAe;QACf,SAAS;QACT,oBAAoB;QACpB,eAAe;QACf,gBAAgB,EAAE,oBAAoB,CAAC,MAAM;QAC7C,WAAW;KACZ,CAAA;AACH,CAAC"}