@steipete/summarize-core 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. package/README.md +0 -1
  2. package/dist/esm/content/index.js +5 -5
  3. package/dist/esm/content/index.js.map +1 -1
  4. package/dist/esm/content/link-preview/client.js +20 -9
  5. package/dist/esm/content/link-preview/client.js.map +1 -1
  6. package/dist/esm/content/link-preview/content/article.js +84 -83
  7. package/dist/esm/content/link-preview/content/article.js.map +1 -1
  8. package/dist/esm/content/link-preview/content/cleaner.js +23 -20
  9. package/dist/esm/content/link-preview/content/cleaner.js.map +1 -1
  10. package/dist/esm/content/link-preview/content/constants.js.map +1 -1
  11. package/dist/esm/content/link-preview/content/fetcher.js +46 -40
  12. package/dist/esm/content/link-preview/content/fetcher.js.map +1 -1
  13. package/dist/esm/content/link-preview/content/firecrawl.js +16 -16
  14. package/dist/esm/content/link-preview/content/firecrawl.js.map +1 -1
  15. package/dist/esm/content/link-preview/content/html.js +29 -27
  16. package/dist/esm/content/link-preview/content/html.js.map +1 -1
  17. package/dist/esm/content/link-preview/content/index.js +141 -88
  18. package/dist/esm/content/link-preview/content/index.js.map +1 -1
  19. package/dist/esm/content/link-preview/content/jsonld.js +12 -12
  20. package/dist/esm/content/link-preview/content/jsonld.js.map +1 -1
  21. package/dist/esm/content/link-preview/content/parsers.js +20 -20
  22. package/dist/esm/content/link-preview/content/parsers.js.map +1 -1
  23. package/dist/esm/content/link-preview/content/podcast-utils.js +34 -34
  24. package/dist/esm/content/link-preview/content/podcast-utils.js.map +1 -1
  25. package/dist/esm/content/link-preview/content/readability.js +16 -15
  26. package/dist/esm/content/link-preview/content/readability.js.map +1 -1
  27. package/dist/esm/content/link-preview/content/twitter-utils.js +24 -11
  28. package/dist/esm/content/link-preview/content/twitter-utils.js.map +1 -1
  29. package/dist/esm/content/link-preview/content/types.js +1 -1
  30. package/dist/esm/content/link-preview/content/types.js.map +1 -1
  31. package/dist/esm/content/link-preview/content/utils.js +17 -17
  32. package/dist/esm/content/link-preview/content/utils.js.map +1 -1
  33. package/dist/esm/content/link-preview/content/video.js +19 -19
  34. package/dist/esm/content/link-preview/content/video.js.map +1 -1
  35. package/dist/esm/content/link-preview/content/visibility.js +121 -0
  36. package/dist/esm/content/link-preview/content/visibility.js.map +1 -0
  37. package/dist/esm/content/link-preview/content/youtube.js +10 -10
  38. package/dist/esm/content/link-preview/content/youtube.js.map +1 -1
  39. package/dist/esm/content/link-preview/deps.js +16 -16
  40. package/dist/esm/content/link-preview/deps.js.map +1 -1
  41. package/dist/esm/content/link-preview/fetch-with-timeout.js +4 -4
  42. package/dist/esm/content/link-preview/fetch-with-timeout.js.map +1 -1
  43. package/dist/esm/content/link-preview/types.js +1 -1
  44. package/dist/esm/content/link-preview/types.js.map +1 -1
  45. package/dist/esm/content/transcript/cache.js +22 -22
  46. package/dist/esm/content/transcript/cache.js.map +1 -1
  47. package/dist/esm/content/transcript/index.js +34 -24
  48. package/dist/esm/content/transcript/index.js.map +1 -1
  49. package/dist/esm/content/transcript/normalize.js +10 -10
  50. package/dist/esm/content/transcript/normalize.js.map +1 -1
  51. package/dist/esm/content/transcript/parse.js +31 -31
  52. package/dist/esm/content/transcript/parse.js.map +1 -1
  53. package/dist/esm/content/transcript/providers/generic.js +74 -78
  54. package/dist/esm/content/transcript/providers/generic.js.map +1 -1
  55. package/dist/esm/content/transcript/providers/podcast/apple-flow.js +36 -36
  56. package/dist/esm/content/transcript/providers/podcast/apple-flow.js.map +1 -1
  57. package/dist/esm/content/transcript/providers/podcast/apple.js +5 -5
  58. package/dist/esm/content/transcript/providers/podcast/apple.js.map +1 -1
  59. package/dist/esm/content/transcript/providers/podcast/constants.js +2 -2
  60. package/dist/esm/content/transcript/providers/podcast/constants.js.map +1 -1
  61. package/dist/esm/content/transcript/providers/podcast/itunes.js +44 -42
  62. package/dist/esm/content/transcript/providers/podcast/itunes.js.map +1 -1
  63. package/dist/esm/content/transcript/providers/podcast/json.js +4 -4
  64. package/dist/esm/content/transcript/providers/podcast/json.js.map +1 -1
  65. package/dist/esm/content/transcript/providers/podcast/media.js +58 -49
  66. package/dist/esm/content/transcript/providers/podcast/media.js.map +1 -1
  67. package/dist/esm/content/transcript/providers/podcast/results.js +2 -2
  68. package/dist/esm/content/transcript/providers/podcast/results.js.map +1 -1
  69. package/dist/esm/content/transcript/providers/podcast/rss.js +29 -29
  70. package/dist/esm/content/transcript/providers/podcast/rss.js.map +1 -1
  71. package/dist/esm/content/transcript/providers/podcast/spotify-flow.js +38 -38
  72. package/dist/esm/content/transcript/providers/podcast/spotify-flow.js.map +1 -1
  73. package/dist/esm/content/transcript/providers/podcast/spotify.js +32 -32
  74. package/dist/esm/content/transcript/providers/podcast/spotify.js.map +1 -1
  75. package/dist/esm/content/transcript/providers/podcast.js +43 -47
  76. package/dist/esm/content/transcript/providers/podcast.js.map +1 -1
  77. package/dist/esm/content/transcript/providers/transcription-start.js +59 -31
  78. package/dist/esm/content/transcript/providers/transcription-start.js.map +1 -1
  79. package/dist/esm/content/transcript/providers/youtube/api.js +56 -56
  80. package/dist/esm/content/transcript/providers/youtube/api.js.map +1 -1
  81. package/dist/esm/content/transcript/providers/youtube/apify.js +7 -7
  82. package/dist/esm/content/transcript/providers/youtube/apify.js.map +1 -1
  83. package/dist/esm/content/transcript/providers/youtube/captions.js +76 -76
  84. package/dist/esm/content/transcript/providers/youtube/captions.js.map +1 -1
  85. package/dist/esm/content/transcript/providers/youtube/yt-dlp.js +82 -75
  86. package/dist/esm/content/transcript/providers/youtube/yt-dlp.js.map +1 -1
  87. package/dist/esm/content/transcript/providers/youtube.js +84 -77
  88. package/dist/esm/content/transcript/providers/youtube.js.map +1 -1
  89. package/dist/esm/content/transcript/timestamps.js +8 -8
  90. package/dist/esm/content/transcript/timestamps.js.map +1 -1
  91. package/dist/esm/content/transcript/transcription-config.js +14 -0
  92. package/dist/esm/content/transcript/transcription-config.js.map +1 -0
  93. package/dist/esm/content/transcript/utils.js +35 -35
  94. package/dist/esm/content/transcript/utils.js.map +1 -1
  95. package/dist/esm/content/url.js +59 -28
  96. package/dist/esm/content/url.js.map +1 -1
  97. package/dist/esm/index.js +4 -3
  98. package/dist/esm/index.js.map +1 -1
  99. package/dist/esm/language.js +77 -77
  100. package/dist/esm/language.js.map +1 -1
  101. package/dist/esm/openai/base-url.js +35 -0
  102. package/dist/esm/openai/base-url.js.map +1 -0
  103. package/dist/esm/processes.js +16 -16
  104. package/dist/esm/processes.js.map +1 -1
  105. package/dist/esm/prompts/cli.js +17 -17
  106. package/dist/esm/prompts/cli.js.map +1 -1
  107. package/dist/esm/prompts/file.js +54 -54
  108. package/dist/esm/prompts/file.js.map +1 -1
  109. package/dist/esm/prompts/format.js +2 -2
  110. package/dist/esm/prompts/format.js.map +1 -1
  111. package/dist/esm/prompts/index.js +5 -5
  112. package/dist/esm/prompts/index.js.map +1 -1
  113. package/dist/esm/prompts/link-summary.js +65 -65
  114. package/dist/esm/prompts/link-summary.js.map +1 -1
  115. package/dist/esm/prompts/summary-lengths.js +10 -10
  116. package/dist/esm/prompts/summary-lengths.js.map +1 -1
  117. package/dist/esm/prompts/summary-system.js +9 -9
  118. package/dist/esm/prompts/summary-system.js.map +1 -1
  119. package/dist/esm/shared/contracts.js +1 -1
  120. package/dist/esm/shared/contracts.js.map +1 -1
  121. package/dist/esm/transcription/onnx-cli.js +69 -69
  122. package/dist/esm/transcription/onnx-cli.js.map +1 -1
  123. package/dist/esm/transcription/whisper/constants.js +3 -3
  124. package/dist/esm/transcription/whisper/constants.js.map +1 -1
  125. package/dist/esm/transcription/whisper/core.js +148 -59
  126. package/dist/esm/transcription/whisper/core.js.map +1 -1
  127. package/dist/esm/transcription/whisper/fal.js +14 -14
  128. package/dist/esm/transcription/whisper/fal.js.map +1 -1
  129. package/dist/esm/transcription/whisper/ffmpeg.js +106 -106
  130. package/dist/esm/transcription/whisper/ffmpeg.js.map +1 -1
  131. package/dist/esm/transcription/whisper/groq.js +46 -0
  132. package/dist/esm/transcription/whisper/groq.js.map +1 -0
  133. package/dist/esm/transcription/whisper/openai.js +19 -13
  134. package/dist/esm/transcription/whisper/openai.js.map +1 -1
  135. package/dist/esm/transcription/whisper/utils.js +19 -19
  136. package/dist/esm/transcription/whisper/utils.js.map +1 -1
  137. package/dist/esm/transcription/whisper/whisper-cpp.js +64 -64
  138. package/dist/esm/transcription/whisper/whisper-cpp.js.map +1 -1
  139. package/dist/esm/transcription/whisper.js +4 -4
  140. package/dist/esm/transcription/whisper.js.map +1 -1
  141. package/dist/types/content/cache/types.d.ts +1 -1
  142. package/dist/types/content/index.d.ts +7 -7
  143. package/dist/types/content/link-preview/client.d.ts +7 -4
  144. package/dist/types/content/link-preview/content/cleaner.d.ts +1 -0
  145. package/dist/types/content/link-preview/content/fetcher.d.ts +2 -2
  146. package/dist/types/content/link-preview/content/firecrawl.d.ts +7 -7
  147. package/dist/types/content/link-preview/content/html.d.ts +8 -8
  148. package/dist/types/content/link-preview/content/index.d.ts +3 -3
  149. package/dist/types/content/link-preview/content/twitter-utils.d.ts +1 -0
  150. package/dist/types/content/link-preview/content/types.d.ts +8 -8
  151. package/dist/types/content/link-preview/content/utils.d.ts +3 -3
  152. package/dist/types/content/link-preview/content/video.d.ts +1 -1
  153. package/dist/types/content/link-preview/content/visibility.d.ts +1 -0
  154. package/dist/types/content/link-preview/deps.d.ts +36 -33
  155. package/dist/types/content/link-preview/types.d.ts +4 -4
  156. package/dist/types/content/transcript/cache.d.ts +4 -4
  157. package/dist/types/content/transcript/index.d.ts +7 -7
  158. package/dist/types/content/transcript/parse.d.ts +1 -1
  159. package/dist/types/content/transcript/providers/generic.d.ts +1 -1
  160. package/dist/types/content/transcript/providers/podcast/apple-flow.d.ts +2 -2
  161. package/dist/types/content/transcript/providers/podcast/flow-context.d.ts +4 -4
  162. package/dist/types/content/transcript/providers/podcast/media.d.ts +9 -6
  163. package/dist/types/content/transcript/providers/podcast/results.d.ts +3 -3
  164. package/dist/types/content/transcript/providers/podcast/rss.d.ts +1 -1
  165. package/dist/types/content/transcript/providers/podcast/spotify-flow.d.ts +2 -2
  166. package/dist/types/content/transcript/providers/podcast/spotify.d.ts +2 -2
  167. package/dist/types/content/transcript/providers/podcast.d.ts +5 -5
  168. package/dist/types/content/transcript/providers/transcription-start.d.ts +14 -8
  169. package/dist/types/content/transcript/providers/youtube/api.d.ts +1 -1
  170. package/dist/types/content/transcript/providers/youtube/captions.d.ts +1 -1
  171. package/dist/types/content/transcript/providers/youtube/yt-dlp.d.ts +11 -8
  172. package/dist/types/content/transcript/providers/youtube.d.ts +1 -1
  173. package/dist/types/content/transcript/timestamps.d.ts +1 -1
  174. package/dist/types/content/transcript/transcription-config.d.ts +15 -0
  175. package/dist/types/content/transcript/types.d.ts +12 -9
  176. package/dist/types/content/transcript/utils.d.ts +1 -1
  177. package/dist/types/content/url.d.ts +5 -3
  178. package/dist/types/index.d.ts +5 -4
  179. package/dist/types/language.d.ts +4 -4
  180. package/dist/types/openai/base-url.d.ts +14 -0
  181. package/dist/types/processes.d.ts +2 -2
  182. package/dist/types/prompts/cli.d.ts +3 -3
  183. package/dist/types/prompts/file.d.ts +2 -2
  184. package/dist/types/prompts/index.d.ts +6 -6
  185. package/dist/types/prompts/link-summary.d.ts +3 -3
  186. package/dist/types/prompts/summary-lengths.d.ts +1 -1
  187. package/dist/types/transcription/onnx-cli.d.ts +3 -3
  188. package/dist/types/transcription/whisper/core.d.ts +6 -3
  189. package/dist/types/transcription/whisper/groq.d.ts +2 -0
  190. package/dist/types/transcription/whisper/openai.d.ts +6 -1
  191. package/dist/types/transcription/whisper/types.d.ts +1 -1
  192. package/dist/types/transcription/whisper/whisper-cpp.d.ts +1 -1
  193. package/dist/types/transcription/whisper.d.ts +5 -5
  194. package/package.json +18 -17
  195. package/LICENSE +0 -21
package/README.md CHANGED
@@ -4,4 +4,3 @@ Core library for Summarize (content extraction + prompt builders).
4
4
 
5
5
  - CLI package: `@steipete/summarize`
6
6
  - Recommended imports (library use): `@steipete/summarize-core/content`, `@steipete/summarize-core/prompts`
7
-
@@ -1,6 +1,6 @@
1
- export { createLinkPreviewClient, } from './link-preview/client.js';
2
- export { DEFAULT_CACHE_MODE, DEFAULT_MAX_CONTENT_CHARACTERS, DEFAULT_TIMEOUT_MS, } from './link-preview/content/types.js';
3
- export { ProgressKind } from './link-preview/deps.js';
4
- export { CACHE_MODES, } from './link-preview/types.js';
5
- export { extractYouTubeVideoId, isDirectMediaUrl, isPodcastHost, isTwitterStatusUrl, isYouTubeUrl, isYouTubeVideoUrl, shouldPreferUrlMode, } from './url.js';
1
+ export { createLinkPreviewClient, } from "./link-preview/client.js";
2
+ export { DEFAULT_CACHE_MODE, DEFAULT_MAX_CONTENT_CHARACTERS, DEFAULT_TIMEOUT_MS, } from "./link-preview/content/types.js";
3
+ export { ProgressKind } from "./link-preview/deps.js";
4
+ export { CACHE_MODES, } from "./link-preview/types.js";
5
+ export { DIRECT_MEDIA_EXTENSIONS, extractYouTubeVideoId, isDirectMediaExtension, isDirectMediaUrl, isPodcastHost, isTwitterBroadcastUrl, isTwitterStatusUrl, isYouTubeUrl, isYouTubeVideoUrl, shouldPreferUrlMode, } from "./url.js";
6
6
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/content/index.ts"],"names":[],"mappings":"AAOA,OAAO,EACL,uBAAuB,GAGxB,MAAM,0BAA0B,CAAA;AACjC,OAAO,EACL,kBAAkB,EAClB,8BAA8B,EAC9B,kBAAkB,GAGnB,MAAM,iCAAiC,CAAA;AASxC,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAA;AACrD,OAAO,EACL,WAAW,GAIZ,MAAM,yBAAyB,CAAA;AAChC,OAAO,EACL,qBAAqB,EACrB,gBAAgB,EAChB,aAAa,EACb,kBAAkB,EAClB,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,GACpB,MAAM,UAAU,CAAA"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/content/index.ts"],"names":[],"mappings":"AAOA,OAAO,EACL,uBAAuB,GAGxB,MAAM,0BAA0B,CAAC;AAClC,OAAO,EACL,kBAAkB,EAClB,8BAA8B,EAC9B,kBAAkB,GAGnB,MAAM,iCAAiC,CAAC;AASzC,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AACtD,OAAO,EACL,WAAW,GAIZ,MAAM,yBAAyB,CAAC;AACjC,OAAO,EACL,uBAAuB,EACvB,qBAAqB,EACrB,sBAAsB,EACtB,gBAAgB,EAChB,aAAa,EACb,qBAAqB,EACrB,kBAAkB,EAClB,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,GACpB,MAAM,UAAU,CAAC"}
@@ -1,19 +1,28 @@
1
- import { fetchLinkContent } from './content/index.js';
1
+ import { resolveTranscriptionConfig, } from "../transcript/transcription-config.js";
2
+ import { fetchLinkContent } from "./content/index.js";
2
3
  /** Public factory for a link preview client with injectable dependencies. */
3
4
  export function createLinkPreviewClient(options = {}) {
4
5
  const fetchImpl = options.fetch ?? ((...args) => globalThis.fetch(...args));
5
- const env = typeof options.env === 'object' && options.env ? options.env : undefined;
6
+ const env = typeof options.env === "object" && options.env ? options.env : undefined;
6
7
  const scrape = options.scrapeWithFirecrawl ?? null;
7
- const apifyApiToken = typeof options.apifyApiToken === 'string' ? options.apifyApiToken : null;
8
- const ytDlpPath = typeof options.ytDlpPath === 'string' ? options.ytDlpPath : null;
9
- const falApiKey = typeof options.falApiKey === 'string' ? options.falApiKey : null;
10
- const openaiApiKey = typeof options.openaiApiKey === 'string' ? options.openaiApiKey : null;
8
+ const apifyApiToken = typeof options.apifyApiToken === "string" ? options.apifyApiToken : null;
9
+ const ytDlpPath = typeof options.ytDlpPath === "string" ? options.ytDlpPath : null;
10
+ const falApiKey = typeof options.falApiKey === "string" ? options.falApiKey : null;
11
+ const groqApiKey = typeof options.groqApiKey === "string" ? options.groqApiKey : null;
12
+ const openaiApiKey = typeof options.openaiApiKey === "string" ? options.openaiApiKey : null;
13
+ const transcription = resolveTranscriptionConfig({
14
+ env,
15
+ transcription: options.transcription ?? null,
16
+ falApiKey,
17
+ groqApiKey,
18
+ openaiApiKey,
19
+ });
11
20
  const convertHtmlToMarkdown = options.convertHtmlToMarkdown ?? null;
12
21
  const transcriptCache = options.transcriptCache ?? null;
13
22
  const mediaCache = options.mediaCache ?? null;
14
- const readTweetWithBird = typeof options.readTweetWithBird === 'function' ? options.readTweetWithBird : null;
15
- const resolveTwitterCookies = typeof options.resolveTwitterCookies === 'function' ? options.resolveTwitterCookies : null;
16
- const onProgress = typeof options.onProgress === 'function' ? options.onProgress : null;
23
+ const readTweetWithBird = typeof options.readTweetWithBird === "function" ? options.readTweetWithBird : null;
24
+ const resolveTwitterCookies = typeof options.resolveTwitterCookies === "function" ? options.resolveTwitterCookies : null;
25
+ const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
17
26
  return {
18
27
  fetchLinkContent: (url, contentOptions) => fetchLinkContent(url, contentOptions, {
19
28
  fetch: fetchImpl,
@@ -21,7 +30,9 @@ export function createLinkPreviewClient(options = {}) {
21
30
  scrapeWithFirecrawl: scrape,
22
31
  apifyApiToken,
23
32
  ytDlpPath,
33
+ transcription,
24
34
  falApiKey,
35
+ groqApiKey,
25
36
  openaiApiKey,
26
37
  convertHtmlToMarkdown,
27
38
  transcriptCache,
@@ -1 +1 @@
1
- {"version":3,"file":"client.js","sourceRoot":"","sources":["../../../../src/content/link-preview/client.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAA;AAgCrD,6EAA6E;AAC7E,MAAM,UAAU,uBAAuB,CAAC,UAAoC,EAAE;IAC5E,MAAM,SAAS,GACb,OAAO,CAAC,KAAK,IAAI,CAAC,CAAC,GAAG,IAA8B,EAAE,EAAE,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC,CAAA;IACrF,MAAM,GAAG,GAAG,OAAO,OAAO,CAAC,GAAG,KAAK,QAAQ,IAAI,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAA;IACpF,MAAM,MAAM,GAA+B,OAAO,CAAC,mBAAmB,IAAI,IAAI,CAAA;IAC9E,MAAM,aAAa,GAAG,OAAO,OAAO,CAAC,aAAa,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAA;IAC9F,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAA;IAClF,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAA;IAClF,MAAM,YAAY,GAAG,OAAO,OAAO,CAAC,YAAY,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAA;IAC3F,MAAM,qBAAqB,GAAiC,OAAO,CAAC,qBAAqB,IAAI,IAAI,CAAA;IACjG,MAAM,eAAe,GAA2B,OAAO,CAAC,eAAe,IAAI,IAAI,CAAA;IAC/E,MAAM,UAAU,GAAsB,OAAO,CAAC,UAAU,IAAI,IAAI,CAAA;IAChE,MAAM,iBAAiB,GACrB,OAAO,OAAO,CAAC,iBAAiB,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,iBAAiB,CAAC,CAAC,CAAC,IAAI,CAAA;IACpF,MAAM,qBAAqB,GACzB,OAAO,OAAO,CAAC,qBAAqB,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,qBAAqB,CAAC,CAAC,CAAC,IAAI,CAAA;IAC5F,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAA;IAEvF,OAAO;QACL,gBAAgB,EAAE,CAAC,GAAW,EAAE,cAAwC,EAAE,EAAE,CAC1E,gBAAgB,CAAC,GAAG,EAAE,cAAc,EAAE;YACpC,KAAK,EAAE,SAAS;YAChB,GAAG;YACH,mBAAmB,EAAE,MAAM;YAC3B,aAAa;YACb,SAAS;YACT,SAAS;YACT,YAAY;YACZ,qBAAqB;YACrB,eAAe;YACf,UAAU;YACV,iBAAiB;YACjB,qBAAqB;YACrB,UAAU;SACX,CAAC;KACL,CAAA;AACH,CAAC"}
1
+ {"version":3,"file":"client.js","sourceRoot":"","sources":["../../../../src/content/link-preview/client.ts"],"names":[],"mappings":"AASA,OAAO,EACL,0BAA0B,GAE3B,MAAM,uCAAuC,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AA0BtD,6EAA6E;AAC7E,MAAM,UAAU,uBAAuB,CAAC,UAAoC,EAAE;IAC5E,MAAM,SAAS,GACb,OAAO,CAAC,KAAK,IAAI,CAAC,CAAC,GAAG,IAA8B,EAAE,EAAE,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC;IACtF,MAAM,GAAG,GAAG,OAAO,OAAO,CAAC,GAAG,KAAK,QAAQ,IAAI,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;IACrF,MAAM,MAAM,GAA+B,OAAO,CAAC,mBAAmB,IAAI,IAAI,CAAC;IAC/E,MAAM,aAAa,GAAG,OAAO,OAAO,CAAC,aAAa,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC;IAC/F,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC;IACnF,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC;IACnF,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IACtF,MAAM,YAAY,GAAG,OAAO,OAAO,CAAC,YAAY,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC;IAC5F,MAAM,aAAa,GAAG,0BAA0B,CAAC;QAC/C,GAAG;QACH,aAAa,EAAE,OAAO,CAAC,aAAa,IAAI,IAAI;QAC5C,SAAS;QACT,UAAU;QACV,YAAY;KACb,CAAC,CAAC;IACH,MAAM,qBAAqB,GAAiC,OAAO,CAAC,qBAAqB,IAAI,IAAI,CAAC;IAClG,MAAM,eAAe,GAA2B,OAAO,CAAC,eAAe,IAAI,IAAI,CAAC;IAChF,MAAM,UAAU,GAAsB,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC;IACjE,MAAM,iBAAiB,GACrB,OAAO,OAAO,CAAC,iBAAiB,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,iBAAiB,CAAC,CAAC,CAAC,IAAI,CAAC;IACrF,MAAM,qBAAqB,GACzB,OAAO,OAAO,CAAC,qBAAqB,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,qBAAqB,CAAC,CAAC,CAAC,IAAI,CAAC;IAC7F,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAExF,OAAO;QACL,gBAAgB,EAAE,CAAC,GAAW,EAAE,cAAwC,EAAE,EAAE,CAC1E,gBAAgB,CAAC,GAAG,EAAE,cAAc,EAAE;YACpC,KAAK,EAAE,SAAS;YAChB,GAAG;YACH,mBAAmB,EAAE,MAAM;YAC3B,aAAa;YACb,SAAS;YACT,aAAa;YACb,SAAS;YACT,UAAU;YACV,YAAY;YACZ,qBAAqB;YACrB,eAAe;YACf,UAAU;YACV,iBAAiB;YACjB,qBAAqB;
YACrB,UAAU;SACX,CAAC;KACL,CAAC;AACJ,CAAC"}
@@ -1,45 +1,46 @@
1
- import { load } from 'cheerio';
2
- import sanitizeHtml from 'sanitize-html';
3
- import { decodeHtmlEntities, normalizeWhitespace } from './cleaner.js';
1
+ import { load } from "cheerio";
2
+ import sanitizeHtml from "sanitize-html";
3
+ import { decodeHtmlEntities, normalizeWhitespace } from "./cleaner.js";
4
+ import { stripHiddenHtml } from "./visibility.js";
4
5
  const MIN_SEGMENT_LENGTH = 30;
5
6
  export function sanitizeHtmlForMarkdownConversion(html) {
6
- return sanitizeHtml(html, {
7
+ return sanitizeHtml(stripHiddenHtml(html), {
7
8
  allowedTags: [
8
- 'article',
9
- 'section',
10
- 'div',
11
- 'p',
12
- 'h1',
13
- 'h2',
14
- 'h3',
15
- 'h4',
16
- 'h5',
17
- 'h6',
18
- 'ol',
19
- 'ul',
20
- 'li',
21
- 'blockquote',
22
- 'pre',
23
- 'code',
24
- 'span',
25
- 'strong',
26
- 'em',
27
- 'br',
28
- 'a',
9
+ "article",
10
+ "section",
11
+ "div",
12
+ "p",
13
+ "h1",
14
+ "h2",
15
+ "h3",
16
+ "h4",
17
+ "h5",
18
+ "h6",
19
+ "ol",
20
+ "ul",
21
+ "li",
22
+ "blockquote",
23
+ "pre",
24
+ "code",
25
+ "span",
26
+ "strong",
27
+ "em",
28
+ "br",
29
+ "a",
29
30
  ],
30
31
  allowedAttributes: {
31
- a: ['href'],
32
+ a: ["href"],
32
33
  },
33
34
  nonTextTags: [
34
- 'style',
35
- 'script',
36
- 'noscript',
37
- 'template',
38
- 'svg',
39
- 'canvas',
40
- 'iframe',
41
- 'object',
42
- 'embed',
35
+ "style",
36
+ "script",
37
+ "noscript",
38
+ "template",
39
+ "svg",
40
+ "canvas",
41
+ "iframe",
42
+ "object",
43
+ "embed",
43
44
  ],
44
45
  textFilter(text) {
45
46
  return decodeHtmlEntities(text);
@@ -49,46 +50,46 @@ export function sanitizeHtmlForMarkdownConversion(html) {
49
50
  export function extractArticleContent(html) {
50
51
  const segments = collectSegmentsFromHtml(html);
51
52
  if (segments.length > 0) {
52
- return segments.join('\n');
53
+ return segments.join("\n");
53
54
  }
54
55
  const fallback = normalizeWhitespace(extractPlainText(html));
55
- return fallback ?? '';
56
+ return fallback ?? "";
56
57
  }
57
58
  export function collectSegmentsFromHtml(html) {
58
- const sanitized = sanitizeHtml(html, {
59
+ const sanitized = sanitizeHtml(stripHiddenHtml(html), {
59
60
  allowedTags: [
60
- 'article',
61
- 'section',
62
- 'div',
63
- 'p',
64
- 'h1',
65
- 'h2',
66
- 'h3',
67
- 'h4',
68
- 'h5',
69
- 'h6',
70
- 'ol',
71
- 'ul',
72
- 'li',
73
- 'blockquote',
74
- 'pre',
75
- 'code',
76
- 'span',
77
- 'strong',
78
- 'em',
79
- 'br',
61
+ "article",
62
+ "section",
63
+ "div",
64
+ "p",
65
+ "h1",
66
+ "h2",
67
+ "h3",
68
+ "h4",
69
+ "h5",
70
+ "h6",
71
+ "ol",
72
+ "ul",
73
+ "li",
74
+ "blockquote",
75
+ "pre",
76
+ "code",
77
+ "span",
78
+ "strong",
79
+ "em",
80
+ "br",
80
81
  ],
81
82
  allowedAttributes: {},
82
83
  nonTextTags: [
83
- 'style',
84
- 'script',
85
- 'noscript',
86
- 'template',
87
- 'svg',
88
- 'canvas',
89
- 'iframe',
90
- 'object',
91
- 'embed',
84
+ "style",
85
+ "script",
86
+ "noscript",
87
+ "template",
88
+ "svg",
89
+ "canvas",
90
+ "iframe",
91
+ "object",
92
+ "embed",
92
93
  ],
93
94
  textFilter(text) {
94
95
  return decodeHtmlEntities(text);
@@ -96,23 +97,23 @@ export function collectSegmentsFromHtml(html) {
96
97
  });
97
98
  const $ = load(sanitized);
98
99
  const segments = [];
99
- $('h1,h2,h3,h4,h5,h6,li,p,blockquote,pre').each((_, element) => {
100
- if (!('tagName' in element) || typeof element.tagName !== 'string') {
100
+ $("h1,h2,h3,h4,h5,h6,li,p,blockquote,pre").each((_, element) => {
101
+ if (!("tagName" in element) || typeof element.tagName !== "string") {
101
102
  return;
102
103
  }
103
104
  const tag = element.tagName.toLowerCase();
104
105
  const raw = $(element).text();
105
- const text = normalizeWhitespace(raw).replaceAll(/\n+/g, ' ');
106
+ const text = normalizeWhitespace(raw).replaceAll(/\n+/g, " ");
106
107
  if (!text || text.length === 0) {
107
108
  return;
108
109
  }
109
- if (tag.startsWith('h')) {
110
+ if (tag.startsWith("h")) {
110
111
  if (text.length >= 10) {
111
112
  segments.push(text);
112
113
  }
113
114
  return;
114
115
  }
115
- if (tag === 'li') {
116
+ if (tag === "li") {
116
117
  if (text.length >= 20) {
117
118
  segments.push(`• ${text}`);
118
119
  }
@@ -124,25 +125,25 @@ export function collectSegmentsFromHtml(html) {
124
125
  segments.push(text);
125
126
  });
126
127
  if (segments.length === 0) {
127
- const fallback = normalizeWhitespace($('body').text() || sanitized);
128
+ const fallback = normalizeWhitespace($("body").text() || sanitized);
128
129
  return fallback ? [fallback] : [];
129
130
  }
130
131
  return mergeConsecutiveSegments(segments);
131
132
  }
132
133
  export function extractPlainText(html) {
133
- const stripped = sanitizeHtml(html, {
134
+ const stripped = sanitizeHtml(stripHiddenHtml(html), {
134
135
  allowedTags: [],
135
136
  allowedAttributes: {},
136
137
  nonTextTags: [
137
- 'style',
138
- 'script',
139
- 'noscript',
140
- 'template',
141
- 'svg',
142
- 'canvas',
143
- 'iframe',
144
- 'object',
145
- 'embed',
138
+ "style",
139
+ "script",
140
+ "noscript",
141
+ "template",
142
+ "svg",
143
+ "canvas",
144
+ "iframe",
145
+ "object",
146
+ "embed",
146
147
  ],
147
148
  });
148
149
  return decodeHtmlEntities(stripped);
@@ -1 +1 @@
1
- {"version":3,"file":"article.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/article.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAA;AAC9B,OAAO,YAAY,MAAM,eAAe,CAAA;AAExC,OAAO,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,MAAM,cAAc,CAAA;AAEtE,MAAM,kBAAkB,GAAG,EAAE,CAAA;AAE7B,MAAM,UAAU,iCAAiC,CAAC,IAAY;IAC5D,OAAO,YAAY,CAAC,IAAI,EAAE;QACxB,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;YACJ,GAAG;SACJ;QACD,iBAAiB,EAAE;YACjB,CAAC,EAAE,CAAC,MAAM,CAAC;SACZ;QACD,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACjC,CAAC;KACF,CAAC,CAAA;AACJ,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,IAAY;IAChD,MAAM,QAAQ,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAA;IAC9C,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,OAAO,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC5B,CAAC;IACD,MAAM,QAAQ,GAAG,mBAAmB,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAA;IAC5D,OAAO,QAAQ,IAAI,EAAE,CAAA;AACvB,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,IAAY;IAClD,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,EAAE;QACnC,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;SACL;QACD,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAA;QACjC,CAAC;KACF,CAAC,CAAA;IAEF,MAAM,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,CAAA;IACzB,MAAM,QAAQ,GAAa,EAAE,CAAA;IAE7B,CAAC,CAAC,uCAAuC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAC7D,IAAI,CAAC,CAAC,SAAS,IAAI,OAAO,CAAC,IAAI,OAAO,OAAO,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YACnE,OAAM;QACR,CAAC;QAED,MAAM,GAAG,GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAA;QAEzC,MAAM,GAAG,GAAG,CAAC,CAAC,
OAAO,CAAC,CAAC,IAAI,EAAE,CAAA;QAC7B,MAAM,IAAI,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;QAC7D,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAM;QACR,CAAC;QAED,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YACxB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;YACrB,CAAC;YACD,OAAM;QACR,CAAC;QAED,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACjB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC,CAAA;YAC5B,CAAC;YACD,OAAM;QACR,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,GAAG,kBAAkB,EAAE,CAAC;YACrC,OAAM;QACR,CAAC;QAED,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IACrB,CAAC,CAAC,CAAA;IAEF,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,SAAS,CAAC,CAAA;QACnE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;IACnC,CAAC;IAED,OAAO,wBAAwB,CAAC,QAAQ,CAAC,CAAA;AAC3C,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,QAAQ,GAAG,YAAY,CAAC,IAAI,EAAE;QAClC,WAAW,EAAE,EAAE;QACf,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;KACF,CAAC,CAAA;IACF,OAAO,kBAAkB,CAAC,QAAQ,CAAC,CAAA;AACrC,CAAC;AAED,SAAS,wBAAwB,CAAC,QAAkB;IAClD,gGAAgG;IAChG,2FAA2F;IAC3F,OAAO,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;AACjC,CAAC"}
1
+ {"version":3,"file":"article.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/article.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAC/B,OAAO,YAAY,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,MAAM,cAAc,CAAC;AACvE,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAElD,MAAM,kBAAkB,GAAG,EAAE,CAAC;AAE9B,MAAM,UAAU,iCAAiC,CAAC,IAAY;IAC5D,OAAO,YAAY,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE;QACzC,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;YACJ,GAAG;SACJ;QACD,iBAAiB,EAAE;YACjB,CAAC,EAAE,CAAC,MAAM,CAAC;SACZ;QACD,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAClC,CAAC;KACF,CAAC,CAAC;AACL,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,IAAY;IAChD,MAAM,QAAQ,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAC;IAC/C,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,OAAO,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7B,CAAC;IACD,MAAM,QAAQ,GAAG,mBAAmB,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;IAC7D,OAAO,QAAQ,IAAI,EAAE,CAAC;AACxB,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,IAAY;IAClD,MAAM,SAAS,GAAG,YAAY,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE;QACpD,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;SACL;QACD,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAClC,CAAC;KACF,CAAC,CAAC;IAEH,MAAM,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC;IAC1B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,CAAC,CAAC,uCAAuC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAC7D,IAAI,CAAC,CAAC,SAAS,IAAI,OAAO,CAAC,IAAI,OAAO,OAAO,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YACnE,OAAO;QACT,CAAC;QAED,MAAM,GAA
G,GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;QAE1C,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;QAC9B,MAAM,IAAI,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9D,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAO;QACT,CAAC;QAED,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YACxB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;YACD,OAAO;QACT,CAAC;QAED,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACjB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC;YAC7B,CAAC;YACD,OAAO;QACT,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,GAAG,kBAAkB,EAAE,CAAC;YACrC,OAAO;QACT,CAAC;QAED,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtB,CAAC,CAAC,CAAC;IAEH,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,SAAS,CAAC,CAAC;QACpE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACpC,CAAC;IAED,OAAO,wBAAwB,CAAC,QAAQ,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,QAAQ,GAAG,YAAY,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE;QACnD,WAAW,EAAE,EAAE;QACf,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;KACF,CAAC,CAAC;IACH,OAAO,kBAAkB,CAAC,QAAQ,CAAC,CAAC;AACtC,CAAC;AAED,SAAS,wBAAwB,CAAC,QAAkB;IAClD,gGAAgG;IAChG,2FAA2F;IAC3F,OAAO,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;AAClC,CAAC"}
@@ -1,36 +1,39 @@
1
- import { compact } from 'es-toolkit';
1
+ import { compact } from "es-toolkit";
2
2
  const WORD_SPLIT_PATTERN = /\s+/g;
3
3
  export function normalizeForPrompt(input) {
4
- return input
5
- .replaceAll('\u00A0', ' ')
6
- .replaceAll(/[\t ]+/g, ' ')
7
- .replaceAll(/\s*\n\s*/g, '\n')
8
- .replaceAll(/\n{3,}/g, '\n\n')
4
+ return stripInvisibleUnicode(input)
5
+ .replaceAll("\u00A0", " ")
6
+ .replaceAll(/[\t ]+/g, " ")
7
+ .replaceAll(/\s*\n\s*/g, "\n")
8
+ .replaceAll(/\n{3,}/g, "\n\n")
9
9
  .trim();
10
10
  }
11
11
  export function normalizeWhitespace(input) {
12
- return input
13
- .replaceAll('\u00A0', ' ')
14
- .replaceAll(/[\t ]+/g, ' ')
15
- .replaceAll(/\s*\n\s*/g, '\n')
12
+ return stripInvisibleUnicode(input)
13
+ .replaceAll("\u00A0", " ")
14
+ .replaceAll(/[\t ]+/g, " ")
15
+ .replaceAll(/\s*\n\s*/g, "\n")
16
16
  .trim();
17
17
  }
18
18
  export function decodeHtmlEntities(input) {
19
19
  return input
20
- .replaceAll('&amp;', '&')
21
- .replaceAll('&lt;', '<')
22
- .replaceAll('&gt;', '>')
23
- .replaceAll('&quot;', '"')
24
- .replaceAll('&#39;', "'")
25
- .replaceAll('&#x27;', "'")
26
- .replaceAll('&#x2F;', '/')
27
- .replaceAll('&nbsp;', ' ');
20
+ .replaceAll("&amp;", "&")
21
+ .replaceAll("&lt;", "<")
22
+ .replaceAll("&gt;", ">")
23
+ .replaceAll("&quot;", '"')
24
+ .replaceAll("&#39;", "'")
25
+ .replaceAll("&#x27;", "'")
26
+ .replaceAll("&#x2F;", "/")
27
+ .replaceAll("&nbsp;", " ");
28
+ }
29
+ export function stripInvisibleUnicode(input) {
30
+ return input.replaceAll(/[\u200B-\u200F\u202A-\u202E\u2060-\u2069\uFEFF\u{E0000}-\u{E007F}]/gu, "");
28
31
  }
29
32
  export function normalizeCandidate(value) {
30
33
  if (!value) {
31
34
  return null;
32
35
  }
33
- const trimmed = value.replaceAll(/\s+/g, ' ').trim();
36
+ const trimmed = value.replaceAll(/\s+/g, " ").trim();
34
37
  return trimmed.length > 0 ? trimmed : null;
35
38
  }
36
39
  export function clipAtSentenceBoundary(input, maxLength) {
@@ -38,7 +41,7 @@ export function clipAtSentenceBoundary(input, maxLength) {
38
41
  return input;
39
42
  }
40
43
  const slice = input.slice(0, maxLength);
41
- const lastSentenceBreak = Math.max(slice.lastIndexOf('. '), slice.lastIndexOf('! '), slice.lastIndexOf('? '), slice.lastIndexOf('\n\n'));
44
+ const lastSentenceBreak = Math.max(slice.lastIndexOf(". "), slice.lastIndexOf("! "), slice.lastIndexOf("? "), slice.lastIndexOf("\n\n"));
42
45
  if (lastSentenceBreak > maxLength * 0.5) {
43
46
  return slice.slice(0, lastSentenceBreak + 1);
44
47
  }
@@ -1 +1 @@
1
- {"version":3,"file":"cleaner.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/cleaner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAA;AAEpC,MAAM,kBAAkB,GAAG,MAAM,CAAA;AASjC,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,OAAO,KAAK;SACT,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,UAAU,CAAC,SAAS,EAAE,MAAM,CAAC;SAC7B,IAAI,EAAE,CAAA;AACX,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,KAAa;IAC/C,OAAO,KAAK;SACT,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,IAAI,EAAE,CAAA;AACX,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,OAAO,KAAK;SACT,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAA;AAC9B,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAgC;IACjE,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,OAAO,IAAI,CAAA;IACb,CAAC;IACD,MAAM,OAAO,GAAG,KAAK,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACpD,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAA;AAC5C,CAAC;AAED,MAAM,UAAU,sBAAsB,CAAC,KAAa,EAAE,SAAiB;IACrE,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAA;IACd,CAAC;IACD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAA;IACvC,MAAM,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAChC,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAC1B,CAAA;IACD,IAAI,iBAAiB,GAAG,SAAS,GAAG,GAAG,EAAE,CAAC;QACxC,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAiB,GAAG,CAAC,CAAC,CAAA;IAC9C,CAAC;IACD,OAAO,KAAK,CAAA;AACd,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,WAAmB,EACnB,aAAqB;IAErB,MAAM,eAAe,GAAG,WAAW,CAAC,MAAM,CAAA;IAC1C,MAAM,SAAS,GAAG,eAAe,GAAG,aAAa,CAAA;IACjD,MAAM,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,sBAAsB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,WAAW
,CAAA;IAC5F,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IAC9B,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAA;IAC5F,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,SAAS,EAAE,CAAA;AAC3D,CAAC"}
1
+ {"version":3,"file":"cleaner.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/cleaner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAErC,MAAM,kBAAkB,GAAG,MAAM,CAAC;AASlC,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,OAAO,qBAAqB,CAAC,KAAK,CAAC;SAChC,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,UAAU,CAAC,SAAS,EAAE,MAAM,CAAC;SAC7B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,KAAa;IAC/C,OAAO,qBAAqB,CAAC,KAAK,CAAC;SAChC,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,OAAO,KAAK;SACT,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;AAC/B,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,KAAa;IACjD,OAAO,KAAK,CAAC,UAAU,CACrB,sEAAsE,EACtE,EAAE,CACH,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAgC;IACjE,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,OAAO,IAAI,CAAC;IACd,CAAC;IACD,MAAM,OAAO,GAAG,KAAK,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IACrD,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;AAC7C,CAAC;AAED,MAAM,UAAU,sBAAsB,CAAC,KAAa,EAAE,SAAiB;IACrE,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IACxC,MAAM,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAChC,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAC1B,CAAC;IACF,IAAI,iBAAiB,GAAG,SAAS,GAAG,GAAG,EAAE,CAAC;QACxC,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAiB,GAAG,CAAC,CAAC,CAAC;IAC/C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,WAAmB,EACnB,aAAqB;IAErB,MAAM,eAAe,GAAG,WAAW,CAAC,MAAM,
CAAC;IAC3C,MAAM,SAAS,GAAG,eAAe,GAAG,aAAa,CAAC;IAClD,MAAM,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,sBAAsB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC;IAC7F,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAC/B,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7F,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,SAAS,EAAE,CAAC;AAC5D,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"constants.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/constants.ts"],"names":[],"mappings":"AAAA,MAAM,CAAC,MAAM,yBAAyB,GACpC,iIAAiI,CAAA;AACnI,MAAM,CAAC,MAAM,2BAA2B,GAAG,GAAG,CAAA;AAC9C,MAAM,CAAC,MAAM,kCAAkC,GAAG,GAAG,CAAA;AACrD,MAAM,CAAC,MAAM,mCAAmC,GAAG,GAAG,CAAA;AACtD,MAAM,CAAC,MAAM,8BAA8B,GAAG,GAAG,CAAA;AACjD,MAAM,CAAC,MAAM,yCAAyC,GAAG,IAAI,CAAA"}
1
+ {"version":3,"file":"constants.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/constants.ts"],"names":[],"mappings":"AAAA,MAAM,CAAC,MAAM,yBAAyB,GACpC,iIAAiI,CAAC;AACpI,MAAM,CAAC,MAAM,2BAA2B,GAAG,GAAG,CAAC;AAC/C,MAAM,CAAC,MAAM,kCAAkC,GAAG,GAAG,CAAC;AACtD,MAAM,CAAC,MAAM,mCAAmC,GAAG,GAAG,CAAC;AACvD,MAAM,CAAC,MAAM,8BAA8B,GAAG,GAAG,CAAC;AAClD,MAAM,CAAC,MAAM,yCAAyC,GAAG,IAAI,CAAC"}
@@ -1,17 +1,17 @@
1
- import { isYouTubeUrl } from '../../url.js';
2
- import { appendNote } from './utils.js';
1
+ import { isYouTubeUrl } from "../../url.js";
2
+ import { appendNote } from "./utils.js";
3
3
  const REQUEST_HEADERS = {
4
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
5
- Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
6
- 'Accept-Language': 'en-US,en;q=0.9',
7
- 'Cache-Control': 'no-cache',
8
- Pragma: 'no-cache',
4
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
5
+ Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
6
+ "Accept-Language": "en-US,en;q=0.9",
7
+ "Cache-Control": "no-cache",
8
+ Pragma: "no-cache",
9
9
  };
10
10
  const DEFAULT_REQUEST_TIMEOUT_MS = 5000;
11
11
  export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress, } = {}) {
12
- onProgress?.({ kind: 'fetch-html-start', url });
12
+ onProgress?.({ kind: "fetch-html-start", url });
13
13
  const controller = new AbortController();
14
- const effectiveTimeoutMs = typeof timeoutMs === 'number' && Number.isFinite(timeoutMs)
14
+ const effectiveTimeoutMs = typeof timeoutMs === "number" && Number.isFinite(timeoutMs)
15
15
  ? timeoutMs
16
16
  : DEFAULT_REQUEST_TIMEOUT_MS;
17
17
  const timeout = setTimeout(() => {
@@ -20,26 +20,26 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
20
20
  try {
21
21
  const response = await fetchImpl(url, {
22
22
  headers: REQUEST_HEADERS,
23
- redirect: 'follow',
23
+ redirect: "follow",
24
24
  signal: controller.signal,
25
25
  });
26
26
  if (!response.ok) {
27
27
  throw new Error(`Failed to fetch HTML document (status ${response.status})`);
28
28
  }
29
29
  const finalUrl = response.url?.trim() || url;
30
- const contentType = response.headers.get('content-type')?.toLowerCase() ?? null;
30
+ const contentType = response.headers.get("content-type")?.toLowerCase() ?? null;
31
31
  if (contentType &&
32
- !contentType.includes('text/html') &&
33
- !contentType.includes('application/xhtml+xml') &&
34
- !contentType.includes('application/xml') &&
35
- !contentType.includes('text/xml') &&
36
- !contentType.includes('application/rss+xml') &&
37
- !contentType.includes('application/atom+xml') &&
38
- !contentType.startsWith('text/')) {
32
+ !contentType.includes("text/html") &&
33
+ !contentType.includes("application/xhtml+xml") &&
34
+ !contentType.includes("application/xml") &&
35
+ !contentType.includes("text/xml") &&
36
+ !contentType.includes("application/rss+xml") &&
37
+ !contentType.includes("application/atom+xml") &&
38
+ !contentType.startsWith("text/")) {
39
39
  throw new Error(`Unsupported content-type for HTML document fetch: ${contentType}`);
40
40
  }
41
41
  const totalBytes = (() => {
42
- const raw = response.headers.get('content-length');
42
+ const raw = response.headers.get("content-length");
43
43
  if (!raw)
44
44
  return null;
45
45
  const parsed = Number(raw);
@@ -49,14 +49,14 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
49
49
  if (!body) {
50
50
  const text = await response.text();
51
51
  const bytes = new TextEncoder().encode(text).byteLength;
52
- onProgress?.({ kind: 'fetch-html-done', url, downloadedBytes: bytes, totalBytes });
52
+ onProgress?.({ kind: "fetch-html-done", url, downloadedBytes: bytes, totalBytes });
53
53
  return { html: text, finalUrl };
54
54
  }
55
55
  const reader = body.getReader();
56
56
  const decoder = new TextDecoder();
57
57
  let downloadedBytes = 0;
58
- let text = '';
59
- onProgress?.({ kind: 'fetch-html-progress', url, downloadedBytes: 0, totalBytes });
58
+ let text = "";
59
+ onProgress?.({ kind: "fetch-html-progress", url, downloadedBytes: 0, totalBytes });
60
60
  while (true) {
61
61
  const { value, done } = await reader.read();
62
62
  if (done)
@@ -65,15 +65,15 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
65
65
  continue;
66
66
  downloadedBytes += value.byteLength;
67
67
  text += decoder.decode(value, { stream: true });
68
- onProgress?.({ kind: 'fetch-html-progress', url, downloadedBytes, totalBytes });
68
+ onProgress?.({ kind: "fetch-html-progress", url, downloadedBytes, totalBytes });
69
69
  }
70
70
  text += decoder.decode();
71
- onProgress?.({ kind: 'fetch-html-done', url, downloadedBytes, totalBytes });
71
+ onProgress?.({ kind: "fetch-html-done", url, downloadedBytes, totalBytes });
72
72
  return { html: text, finalUrl };
73
73
  }
74
74
  catch (error) {
75
- if (error instanceof DOMException && error.name === 'AbortError') {
76
- throw new Error('Fetching HTML document timed out');
75
+ if (error instanceof DOMException && error.name === "AbortError") {
76
+ throw new Error("Fetching HTML document timed out");
77
77
  }
78
78
  throw error;
79
79
  }
@@ -83,42 +83,48 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
83
83
  }
84
84
  export async function fetchWithFirecrawl(url, scrapeWithFirecrawl, options = {}) {
85
85
  const timeoutMs = options.timeoutMs;
86
- const cacheMode = options.cacheMode ?? 'default';
87
- const onProgress = typeof options.onProgress === 'function' ? options.onProgress : null;
88
- const reason = typeof options.reason === 'string' ? options.reason : null;
86
+ const cacheMode = options.cacheMode ?? "default";
87
+ const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
88
+ const reason = typeof options.reason === "string" ? options.reason : null;
89
89
  const diagnostics = {
90
90
  attempted: false,
91
91
  used: false,
92
92
  cacheMode,
93
- cacheStatus: cacheMode === 'bypass' ? 'bypassed' : 'unknown',
93
+ cacheStatus: cacheMode === "bypass" ? "bypassed" : "unknown",
94
94
  notes: null,
95
95
  };
96
96
  if (isYouTubeUrl(url)) {
97
- diagnostics.notes = appendNote(diagnostics.notes, 'Skipped Firecrawl for YouTube URL');
97
+ diagnostics.notes = appendNote(diagnostics.notes, "Skipped Firecrawl for YouTube URL");
98
98
  return { payload: null, diagnostics };
99
99
  }
100
100
  if (!scrapeWithFirecrawl) {
101
- diagnostics.notes = appendNote(diagnostics.notes, 'Firecrawl is not configured');
101
+ diagnostics.notes = appendNote(diagnostics.notes, "Firecrawl is not configured");
102
102
  return { payload: null, diagnostics };
103
103
  }
104
104
  diagnostics.attempted = true;
105
- onProgress?.({ kind: 'firecrawl-start', url, reason: reason ?? 'firecrawl' });
105
+ onProgress?.({ kind: "firecrawl-start", url, reason: reason ?? "firecrawl" });
106
106
  try {
107
107
  const payload = await scrapeWithFirecrawl(url, { timeoutMs, cacheMode });
108
108
  if (!payload) {
109
- diagnostics.notes = appendNote(diagnostics.notes, 'Firecrawl returned no content payload');
110
- onProgress?.({ kind: 'firecrawl-done', url, ok: false, markdownBytes: null, htmlBytes: null });
109
+ diagnostics.notes = appendNote(diagnostics.notes, "Firecrawl returned no content payload");
110
+ onProgress?.({
111
+ kind: "firecrawl-done",
112
+ url,
113
+ ok: false,
114
+ markdownBytes: null,
115
+ htmlBytes: null,
116
+ });
111
117
  return { payload: null, diagnostics };
112
118
  }
113
119
  const encoder = new TextEncoder();
114
- const markdownBytes = typeof payload.markdown === 'string' ? encoder.encode(payload.markdown).byteLength : null;
115
- const htmlBytes = typeof payload.html === 'string' ? encoder.encode(payload.html).byteLength : null;
116
- onProgress?.({ kind: 'firecrawl-done', url, ok: true, markdownBytes, htmlBytes });
120
+ const markdownBytes = typeof payload.markdown === "string" ? encoder.encode(payload.markdown).byteLength : null;
121
+ const htmlBytes = typeof payload.html === "string" ? encoder.encode(payload.html).byteLength : null;
122
+ onProgress?.({ kind: "firecrawl-done", url, ok: true, markdownBytes, htmlBytes });
117
123
  return { payload, diagnostics };
118
124
  }
119
125
  catch (error) {
120
- diagnostics.notes = appendNote(diagnostics.notes, `Firecrawl error: ${error instanceof Error ? error.message : 'unknown error'}`);
121
- onProgress?.({ kind: 'firecrawl-done', url, ok: false, markdownBytes: null, htmlBytes: null });
126
+ diagnostics.notes = appendNote(diagnostics.notes, `Firecrawl error: ${error instanceof Error ? error.message : "unknown error"}`);
127
+ onProgress?.({ kind: "firecrawl-done", url, ok: false, markdownBytes: null, htmlBytes: null });
122
128
  return { payload: null, diagnostics };
123
129
  }
124
130
  }