@steipete/summarize-core 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/dist/esm/content/index.js +5 -5
- package/dist/esm/content/index.js.map +1 -1
- package/dist/esm/content/link-preview/client.js +23 -8
- package/dist/esm/content/link-preview/client.js.map +1 -1
- package/dist/esm/content/link-preview/content/article.js +84 -83
- package/dist/esm/content/link-preview/content/article.js.map +1 -1
- package/dist/esm/content/link-preview/content/cleaner.js +23 -20
- package/dist/esm/content/link-preview/content/cleaner.js.map +1 -1
- package/dist/esm/content/link-preview/content/constants.js.map +1 -1
- package/dist/esm/content/link-preview/content/fetcher.js +46 -40
- package/dist/esm/content/link-preview/content/fetcher.js.map +1 -1
- package/dist/esm/content/link-preview/content/firecrawl.js +18 -17
- package/dist/esm/content/link-preview/content/firecrawl.js.map +1 -1
- package/dist/esm/content/link-preview/content/html.js +31 -28
- package/dist/esm/content/link-preview/content/html.js.map +1 -1
- package/dist/esm/content/link-preview/content/index.js +173 -90
- package/dist/esm/content/link-preview/content/index.js.map +1 -1
- package/dist/esm/content/link-preview/content/jsonld.js +12 -12
- package/dist/esm/content/link-preview/content/jsonld.js.map +1 -1
- package/dist/esm/content/link-preview/content/parsers.js +20 -20
- package/dist/esm/content/link-preview/content/parsers.js.map +1 -1
- package/dist/esm/content/link-preview/content/podcast-utils.js +34 -34
- package/dist/esm/content/link-preview/content/podcast-utils.js.map +1 -1
- package/dist/esm/content/link-preview/content/readability.js +16 -15
- package/dist/esm/content/link-preview/content/readability.js.map +1 -1
- package/dist/esm/content/link-preview/content/twitter-utils.js +24 -11
- package/dist/esm/content/link-preview/content/twitter-utils.js.map +1 -1
- package/dist/esm/content/link-preview/content/types.js +1 -1
- package/dist/esm/content/link-preview/content/types.js.map +1 -1
- package/dist/esm/content/link-preview/content/utils.js +30 -22
- package/dist/esm/content/link-preview/content/utils.js.map +1 -1
- package/dist/esm/content/link-preview/content/video.js +19 -19
- package/dist/esm/content/link-preview/content/video.js.map +1 -1
- package/dist/esm/content/link-preview/content/visibility.js +121 -0
- package/dist/esm/content/link-preview/content/visibility.js.map +1 -0
- package/dist/esm/content/link-preview/content/youtube.js +10 -10
- package/dist/esm/content/link-preview/content/youtube.js.map +1 -1
- package/dist/esm/content/link-preview/deps.js +16 -16
- package/dist/esm/content/link-preview/deps.js.map +1 -1
- package/dist/esm/content/link-preview/fetch-with-timeout.js +4 -4
- package/dist/esm/content/link-preview/fetch-with-timeout.js.map +1 -1
- package/dist/esm/content/link-preview/types.js +1 -1
- package/dist/esm/content/link-preview/types.js.map +1 -1
- package/dist/esm/content/transcript/cache.js +47 -22
- package/dist/esm/content/transcript/cache.js.map +1 -1
- package/dist/esm/content/transcript/index.js +71 -25
- package/dist/esm/content/transcript/index.js.map +1 -1
- package/dist/esm/content/transcript/normalize.js +10 -10
- package/dist/esm/content/transcript/normalize.js.map +1 -1
- package/dist/esm/content/transcript/parse.js +125 -13
- package/dist/esm/content/transcript/parse.js.map +1 -1
- package/dist/esm/content/transcript/providers/generic.js +112 -81
- package/dist/esm/content/transcript/providers/generic.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/apple-flow.js +38 -36
- package/dist/esm/content/transcript/providers/podcast/apple-flow.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/apple.js +5 -5
- package/dist/esm/content/transcript/providers/podcast/apple.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/constants.js +2 -2
- package/dist/esm/content/transcript/providers/podcast/constants.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/itunes.js +44 -42
- package/dist/esm/content/transcript/providers/podcast/itunes.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/json.js +4 -4
- package/dist/esm/content/transcript/providers/podcast/json.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/media.js +66 -56
- package/dist/esm/content/transcript/providers/podcast/media.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/results.js +2 -2
- package/dist/esm/content/transcript/providers/podcast/results.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/rss.js +42 -28
- package/dist/esm/content/transcript/providers/podcast/rss.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/spotify-flow.js +39 -38
- package/dist/esm/content/transcript/providers/podcast/spotify-flow.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/spotify.js +32 -32
- package/dist/esm/content/transcript/providers/podcast/spotify.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast.js +52 -43
- package/dist/esm/content/transcript/providers/podcast.js.map +1 -1
- package/dist/esm/content/transcript/providers/transcription-start.js +78 -0
- package/dist/esm/content/transcript/providers/transcription-start.js.map +1 -0
- package/dist/esm/content/transcript/providers/youtube/api.js +74 -57
- package/dist/esm/content/transcript/providers/youtube/api.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/apify.js +7 -7
- package/dist/esm/content/transcript/providers/youtube/apify.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/captions.js +112 -74
- package/dist/esm/content/transcript/providers/youtube/captions.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/yt-dlp.js +161 -101
- package/dist/esm/content/transcript/providers/youtube/yt-dlp.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube.js +97 -80
- package/dist/esm/content/transcript/providers/youtube.js.map +1 -1
- package/dist/esm/content/transcript/timestamps.js +79 -0
- package/dist/esm/content/transcript/timestamps.js.map +1 -0
- package/dist/esm/content/transcript/transcription-config.js +14 -0
- package/dist/esm/content/transcript/transcription-config.js.map +1 -0
- package/dist/esm/content/transcript/utils.js +35 -35
- package/dist/esm/content/transcript/utils.js.map +1 -1
- package/dist/esm/content/url.js +59 -28
- package/dist/esm/content/url.js.map +1 -1
- package/dist/esm/index.js +4 -3
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/language.js +77 -77
- package/dist/esm/language.js.map +1 -1
- package/dist/esm/openai/base-url.js +35 -0
- package/dist/esm/openai/base-url.js.map +1 -0
- package/dist/esm/processes.js +121 -0
- package/dist/esm/processes.js.map +1 -0
- package/dist/esm/prompts/cli.js +17 -17
- package/dist/esm/prompts/cli.js.map +1 -1
- package/dist/esm/prompts/file.js +56 -40
- package/dist/esm/prompts/file.js.map +1 -1
- package/dist/esm/prompts/format.js +2 -2
- package/dist/esm/prompts/format.js.map +1 -1
- package/dist/esm/prompts/index.js +5 -4
- package/dist/esm/prompts/index.js.map +1 -1
- package/dist/esm/prompts/link-summary.js +96 -46
- package/dist/esm/prompts/link-summary.js.map +1 -1
- package/dist/esm/prompts/summary-lengths.js +10 -10
- package/dist/esm/prompts/summary-lengths.js.map +1 -1
- package/dist/esm/prompts/summary-system.js +13 -0
- package/dist/esm/prompts/summary-system.js.map +1 -0
- package/dist/esm/shared/contracts.js +1 -1
- package/dist/esm/shared/contracts.js.map +1 -1
- package/dist/esm/transcription/onnx-cli.js +319 -0
- package/dist/esm/transcription/onnx-cli.js.map +1 -0
- package/dist/esm/transcription/whisper/constants.js +3 -3
- package/dist/esm/transcription/whisper/constants.js.map +1 -1
- package/dist/esm/transcription/whisper/core.js +209 -52
- package/dist/esm/transcription/whisper/core.js.map +1 -1
- package/dist/esm/transcription/whisper/fal.js +14 -14
- package/dist/esm/transcription/whisper/fal.js.map +1 -1
- package/dist/esm/transcription/whisper/ffmpeg.js +134 -82
- package/dist/esm/transcription/whisper/ffmpeg.js.map +1 -1
- package/dist/esm/transcription/whisper/groq.js +46 -0
- package/dist/esm/transcription/whisper/groq.js.map +1 -0
- package/dist/esm/transcription/whisper/openai.js +19 -13
- package/dist/esm/transcription/whisper/openai.js.map +1 -1
- package/dist/esm/transcription/whisper/utils.js +19 -19
- package/dist/esm/transcription/whisper/utils.js.map +1 -1
- package/dist/esm/transcription/whisper/whisper-cpp.js +68 -58
- package/dist/esm/transcription/whisper/whisper-cpp.js.map +1 -1
- package/dist/esm/transcription/whisper.js +4 -4
- package/dist/esm/transcription/whisper.js.map +1 -1
- package/dist/types/content/cache/types.d.ts +25 -1
- package/dist/types/content/index.d.ts +7 -7
- package/dist/types/content/link-preview/client.d.ts +9 -4
- package/dist/types/content/link-preview/content/cleaner.d.ts +1 -0
- package/dist/types/content/link-preview/content/fetcher.d.ts +2 -2
- package/dist/types/content/link-preview/content/firecrawl.d.ts +8 -7
- package/dist/types/content/link-preview/content/html.d.ts +9 -8
- package/dist/types/content/link-preview/content/index.d.ts +3 -3
- package/dist/types/content/link-preview/content/twitter-utils.d.ts +1 -0
- package/dist/types/content/link-preview/content/types.d.ts +13 -8
- package/dist/types/content/link-preview/content/utils.d.ts +3 -3
- package/dist/types/content/link-preview/content/video.d.ts +1 -1
- package/dist/types/content/link-preview/content/visibility.d.ts +1 -0
- package/dist/types/content/link-preview/deps.d.ts +44 -28
- package/dist/types/content/link-preview/types.d.ts +10 -4
- package/dist/types/content/transcript/cache.d.ts +9 -6
- package/dist/types/content/transcript/index.d.ts +9 -6
- package/dist/types/content/transcript/parse.d.ts +7 -0
- package/dist/types/content/transcript/providers/generic.d.ts +1 -1
- package/dist/types/content/transcript/providers/podcast/apple-flow.d.ts +2 -2
- package/dist/types/content/transcript/providers/podcast/flow-context.d.ts +4 -4
- package/dist/types/content/transcript/providers/podcast/media.d.ts +10 -6
- package/dist/types/content/transcript/providers/podcast/results.d.ts +3 -3
- package/dist/types/content/transcript/providers/podcast/rss.d.ts +2 -0
- package/dist/types/content/transcript/providers/podcast/spotify-flow.d.ts +2 -2
- package/dist/types/content/transcript/providers/podcast/spotify.d.ts +2 -2
- package/dist/types/content/transcript/providers/podcast.d.ts +5 -5
- package/dist/types/content/transcript/providers/transcription-start.d.ts +32 -0
- package/dist/types/content/transcript/providers/youtube/api.d.ts +7 -2
- package/dist/types/content/transcript/providers/youtube/captions.d.ts +6 -1
- package/dist/types/content/transcript/providers/youtube/yt-dlp.d.ts +13 -6
- package/dist/types/content/transcript/providers/youtube.d.ts +1 -1
- package/dist/types/content/transcript/timestamps.d.ts +5 -0
- package/dist/types/content/transcript/transcription-config.d.ts +15 -0
- package/dist/types/content/transcript/types.d.ts +15 -7
- package/dist/types/content/transcript/utils.d.ts +1 -1
- package/dist/types/content/url.d.ts +5 -3
- package/dist/types/index.d.ts +5 -4
- package/dist/types/language.d.ts +4 -4
- package/dist/types/openai/base-url.d.ts +14 -0
- package/dist/types/processes.d.ts +50 -0
- package/dist/types/prompts/cli.d.ts +3 -3
- package/dist/types/prompts/file.d.ts +2 -2
- package/dist/types/prompts/index.d.ts +6 -5
- package/dist/types/prompts/link-summary.d.ts +9 -4
- package/dist/types/prompts/summary-lengths.d.ts +1 -1
- package/dist/types/prompts/summary-system.d.ts +1 -0
- package/dist/types/transcription/onnx-cli.d.ts +25 -0
- package/dist/types/transcription/whisper/core.d.ts +10 -3
- package/dist/types/transcription/whisper/ffmpeg.d.ts +5 -0
- package/dist/types/transcription/whisper/groq.d.ts +2 -0
- package/dist/types/transcription/whisper/openai.d.ts +6 -1
- package/dist/types/transcription/whisper/types.d.ts +1 -1
- package/dist/types/transcription/whisper/whisper-cpp.d.ts +1 -1
- package/dist/types/transcription/whisper.d.ts +5 -5
- package/package.json +24 -18
- package/LICENSE +0 -21
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
export { createLinkPreviewClient, } from
|
|
2
|
-
export { DEFAULT_CACHE_MODE, DEFAULT_MAX_CONTENT_CHARACTERS, DEFAULT_TIMEOUT_MS, } from
|
|
3
|
-
export { ProgressKind } from
|
|
4
|
-
export { CACHE_MODES, } from
|
|
5
|
-
export { extractYouTubeVideoId, isDirectMediaUrl, isPodcastHost, isTwitterStatusUrl, isYouTubeUrl, isYouTubeVideoUrl, shouldPreferUrlMode, } from
|
|
1
|
+
export { createLinkPreviewClient, } from "./link-preview/client.js";
|
|
2
|
+
export { DEFAULT_CACHE_MODE, DEFAULT_MAX_CONTENT_CHARACTERS, DEFAULT_TIMEOUT_MS, } from "./link-preview/content/types.js";
|
|
3
|
+
export { ProgressKind } from "./link-preview/deps.js";
|
|
4
|
+
export { CACHE_MODES, } from "./link-preview/types.js";
|
|
5
|
+
export { DIRECT_MEDIA_EXTENSIONS, extractYouTubeVideoId, isDirectMediaExtension, isDirectMediaUrl, isPodcastHost, isTwitterBroadcastUrl, isTwitterStatusUrl, isYouTubeUrl, isYouTubeVideoUrl, shouldPreferUrlMode, } from "./url.js";
|
|
6
6
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/content/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/content/index.ts"],"names":[],"mappings":"AAOA,OAAO,EACL,uBAAuB,GAGxB,MAAM,0BAA0B,CAAC;AAClC,OAAO,EACL,kBAAkB,EAClB,8BAA8B,EAC9B,kBAAkB,GAGnB,MAAM,iCAAiC,CAAC;AASzC,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AACtD,OAAO,EACL,WAAW,GAIZ,MAAM,yBAAyB,CAAC;AACjC,OAAO,EACL,uBAAuB,EACvB,qBAAqB,EACrB,sBAAsB,EACtB,gBAAgB,EAChB,aAAa,EACb,qBAAqB,EACrB,kBAAkB,EAClB,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,GACpB,MAAM,UAAU,CAAC"}
|
|
@@ -1,27 +1,42 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { resolveTranscriptionConfig, } from "../transcript/transcription-config.js";
|
|
2
|
+
import { fetchLinkContent } from "./content/index.js";
|
|
2
3
|
/** Public factory for a link preview client with injectable dependencies. */
|
|
3
4
|
export function createLinkPreviewClient(options = {}) {
|
|
4
5
|
const fetchImpl = options.fetch ?? ((...args) => globalThis.fetch(...args));
|
|
6
|
+
const env = typeof options.env === "object" && options.env ? options.env : undefined;
|
|
5
7
|
const scrape = options.scrapeWithFirecrawl ?? null;
|
|
6
|
-
const apifyApiToken = typeof options.apifyApiToken ===
|
|
7
|
-
const ytDlpPath = typeof options.ytDlpPath ===
|
|
8
|
-
const falApiKey = typeof options.falApiKey ===
|
|
9
|
-
const
|
|
8
|
+
const apifyApiToken = typeof options.apifyApiToken === "string" ? options.apifyApiToken : null;
|
|
9
|
+
const ytDlpPath = typeof options.ytDlpPath === "string" ? options.ytDlpPath : null;
|
|
10
|
+
const falApiKey = typeof options.falApiKey === "string" ? options.falApiKey : null;
|
|
11
|
+
const groqApiKey = typeof options.groqApiKey === "string" ? options.groqApiKey : null;
|
|
12
|
+
const openaiApiKey = typeof options.openaiApiKey === "string" ? options.openaiApiKey : null;
|
|
13
|
+
const transcription = resolveTranscriptionConfig({
|
|
14
|
+
env,
|
|
15
|
+
transcription: options.transcription ?? null,
|
|
16
|
+
falApiKey,
|
|
17
|
+
groqApiKey,
|
|
18
|
+
openaiApiKey,
|
|
19
|
+
});
|
|
10
20
|
const convertHtmlToMarkdown = options.convertHtmlToMarkdown ?? null;
|
|
11
21
|
const transcriptCache = options.transcriptCache ?? null;
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
const
|
|
22
|
+
const mediaCache = options.mediaCache ?? null;
|
|
23
|
+
const readTweetWithBird = typeof options.readTweetWithBird === "function" ? options.readTweetWithBird : null;
|
|
24
|
+
const resolveTwitterCookies = typeof options.resolveTwitterCookies === "function" ? options.resolveTwitterCookies : null;
|
|
25
|
+
const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
|
|
15
26
|
return {
|
|
16
27
|
fetchLinkContent: (url, contentOptions) => fetchLinkContent(url, contentOptions, {
|
|
17
28
|
fetch: fetchImpl,
|
|
29
|
+
env,
|
|
18
30
|
scrapeWithFirecrawl: scrape,
|
|
19
31
|
apifyApiToken,
|
|
20
32
|
ytDlpPath,
|
|
33
|
+
transcription,
|
|
21
34
|
falApiKey,
|
|
35
|
+
groqApiKey,
|
|
22
36
|
openaiApiKey,
|
|
23
37
|
convertHtmlToMarkdown,
|
|
24
38
|
transcriptCache,
|
|
39
|
+
mediaCache,
|
|
25
40
|
readTweetWithBird,
|
|
26
41
|
resolveTwitterCookies,
|
|
27
42
|
onProgress,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"client.js","sourceRoot":"","sources":["../../../../src/content/link-preview/client.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"client.js","sourceRoot":"","sources":["../../../../src/content/link-preview/client.ts"],"names":[],"mappings":"AASA,OAAO,EACL,0BAA0B,GAE3B,MAAM,uCAAuC,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AA0BtD,6EAA6E;AAC7E,MAAM,UAAU,uBAAuB,CAAC,UAAoC,EAAE;IAC5E,MAAM,SAAS,GACb,OAAO,CAAC,KAAK,IAAI,CAAC,CAAC,GAAG,IAA8B,EAAE,EAAE,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC;IACtF,MAAM,GAAG,GAAG,OAAO,OAAO,CAAC,GAAG,KAAK,QAAQ,IAAI,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;IACrF,MAAM,MAAM,GAA+B,OAAO,CAAC,mBAAmB,IAAI,IAAI,CAAC;IAC/E,MAAM,aAAa,GAAG,OAAO,OAAO,CAAC,aAAa,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC;IAC/F,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC;IACnF,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC;IACnF,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IACtF,MAAM,YAAY,GAAG,OAAO,OAAO,CAAC,YAAY,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC;IAC5F,MAAM,aAAa,GAAG,0BAA0B,CAAC;QAC/C,GAAG;QACH,aAAa,EAAE,OAAO,CAAC,aAAa,IAAI,IAAI;QAC5C,SAAS;QACT,UAAU;QACV,YAAY;KACb,CAAC,CAAC;IACH,MAAM,qBAAqB,GAAiC,OAAO,CAAC,qBAAqB,IAAI,IAAI,CAAC;IAClG,MAAM,eAAe,GAA2B,OAAO,CAAC,eAAe,IAAI,IAAI,CAAC;IAChF,MAAM,UAAU,GAAsB,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC;IACjE,MAAM,iBAAiB,GACrB,OAAO,OAAO,CAAC,iBAAiB,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,iBAAiB,CAAC,CAAC,CAAC,IAAI,CAAC;IACrF,MAAM,qBAAqB,GACzB,OAAO,OAAO,CAAC,qBAAqB,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,qBAAqB,CAAC,CAAC,CAAC,IAAI,CAAC;IAC7F,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAExF,OAAO;QACL,gBAAgB,EAAE,CAAC,GAAW,EAAE,cAAwC,EAAE,EAAE,CAC1E,gBAAgB,CAAC,GAAG,EAAE,cAAc,EAAE;YACpC,KAAK,EAAE,SAAS;YAChB,GAAG;YACH,mBAAmB,EAAE,MAAM;YAC3B,aAAa;YACb,SAAS;YACT,aAAa;YACb,SAAS;YACT,UAAU;YACV,YAAY;YACZ,qBAAqB;YACrB,eAAe;YACf,UAAU;YACV,iBAAiB;YACjB,qBAAqB;YA
CrB,UAAU;SACX,CAAC;KACL,CAAC;AACJ,CAAC"}
|
|
@@ -1,45 +1,46 @@
|
|
|
1
|
-
import { load } from
|
|
2
|
-
import sanitizeHtml from
|
|
3
|
-
import { decodeHtmlEntities, normalizeWhitespace } from
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
import sanitizeHtml from "sanitize-html";
|
|
3
|
+
import { decodeHtmlEntities, normalizeWhitespace } from "./cleaner.js";
|
|
4
|
+
import { stripHiddenHtml } from "./visibility.js";
|
|
4
5
|
const MIN_SEGMENT_LENGTH = 30;
|
|
5
6
|
export function sanitizeHtmlForMarkdownConversion(html) {
|
|
6
|
-
return sanitizeHtml(html, {
|
|
7
|
+
return sanitizeHtml(stripHiddenHtml(html), {
|
|
7
8
|
allowedTags: [
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
9
|
+
"article",
|
|
10
|
+
"section",
|
|
11
|
+
"div",
|
|
12
|
+
"p",
|
|
13
|
+
"h1",
|
|
14
|
+
"h2",
|
|
15
|
+
"h3",
|
|
16
|
+
"h4",
|
|
17
|
+
"h5",
|
|
18
|
+
"h6",
|
|
19
|
+
"ol",
|
|
20
|
+
"ul",
|
|
21
|
+
"li",
|
|
22
|
+
"blockquote",
|
|
23
|
+
"pre",
|
|
24
|
+
"code",
|
|
25
|
+
"span",
|
|
26
|
+
"strong",
|
|
27
|
+
"em",
|
|
28
|
+
"br",
|
|
29
|
+
"a",
|
|
29
30
|
],
|
|
30
31
|
allowedAttributes: {
|
|
31
|
-
a: [
|
|
32
|
+
a: ["href"],
|
|
32
33
|
},
|
|
33
34
|
nonTextTags: [
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
35
|
+
"style",
|
|
36
|
+
"script",
|
|
37
|
+
"noscript",
|
|
38
|
+
"template",
|
|
39
|
+
"svg",
|
|
40
|
+
"canvas",
|
|
41
|
+
"iframe",
|
|
42
|
+
"object",
|
|
43
|
+
"embed",
|
|
43
44
|
],
|
|
44
45
|
textFilter(text) {
|
|
45
46
|
return decodeHtmlEntities(text);
|
|
@@ -49,46 +50,46 @@ export function sanitizeHtmlForMarkdownConversion(html) {
|
|
|
49
50
|
export function extractArticleContent(html) {
|
|
50
51
|
const segments = collectSegmentsFromHtml(html);
|
|
51
52
|
if (segments.length > 0) {
|
|
52
|
-
return segments.join(
|
|
53
|
+
return segments.join("\n");
|
|
53
54
|
}
|
|
54
55
|
const fallback = normalizeWhitespace(extractPlainText(html));
|
|
55
|
-
return fallback ??
|
|
56
|
+
return fallback ?? "";
|
|
56
57
|
}
|
|
57
58
|
export function collectSegmentsFromHtml(html) {
|
|
58
|
-
const sanitized = sanitizeHtml(html, {
|
|
59
|
+
const sanitized = sanitizeHtml(stripHiddenHtml(html), {
|
|
59
60
|
allowedTags: [
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
61
|
+
"article",
|
|
62
|
+
"section",
|
|
63
|
+
"div",
|
|
64
|
+
"p",
|
|
65
|
+
"h1",
|
|
66
|
+
"h2",
|
|
67
|
+
"h3",
|
|
68
|
+
"h4",
|
|
69
|
+
"h5",
|
|
70
|
+
"h6",
|
|
71
|
+
"ol",
|
|
72
|
+
"ul",
|
|
73
|
+
"li",
|
|
74
|
+
"blockquote",
|
|
75
|
+
"pre",
|
|
76
|
+
"code",
|
|
77
|
+
"span",
|
|
78
|
+
"strong",
|
|
79
|
+
"em",
|
|
80
|
+
"br",
|
|
80
81
|
],
|
|
81
82
|
allowedAttributes: {},
|
|
82
83
|
nonTextTags: [
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
84
|
+
"style",
|
|
85
|
+
"script",
|
|
86
|
+
"noscript",
|
|
87
|
+
"template",
|
|
88
|
+
"svg",
|
|
89
|
+
"canvas",
|
|
90
|
+
"iframe",
|
|
91
|
+
"object",
|
|
92
|
+
"embed",
|
|
92
93
|
],
|
|
93
94
|
textFilter(text) {
|
|
94
95
|
return decodeHtmlEntities(text);
|
|
@@ -96,23 +97,23 @@ export function collectSegmentsFromHtml(html) {
|
|
|
96
97
|
});
|
|
97
98
|
const $ = load(sanitized);
|
|
98
99
|
const segments = [];
|
|
99
|
-
$(
|
|
100
|
-
if (!(
|
|
100
|
+
$("h1,h2,h3,h4,h5,h6,li,p,blockquote,pre").each((_, element) => {
|
|
101
|
+
if (!("tagName" in element) || typeof element.tagName !== "string") {
|
|
101
102
|
return;
|
|
102
103
|
}
|
|
103
104
|
const tag = element.tagName.toLowerCase();
|
|
104
105
|
const raw = $(element).text();
|
|
105
|
-
const text = normalizeWhitespace(raw).replaceAll(/\n+/g,
|
|
106
|
+
const text = normalizeWhitespace(raw).replaceAll(/\n+/g, " ");
|
|
106
107
|
if (!text || text.length === 0) {
|
|
107
108
|
return;
|
|
108
109
|
}
|
|
109
|
-
if (tag.startsWith(
|
|
110
|
+
if (tag.startsWith("h")) {
|
|
110
111
|
if (text.length >= 10) {
|
|
111
112
|
segments.push(text);
|
|
112
113
|
}
|
|
113
114
|
return;
|
|
114
115
|
}
|
|
115
|
-
if (tag ===
|
|
116
|
+
if (tag === "li") {
|
|
116
117
|
if (text.length >= 20) {
|
|
117
118
|
segments.push(`• ${text}`);
|
|
118
119
|
}
|
|
@@ -124,25 +125,25 @@ export function collectSegmentsFromHtml(html) {
|
|
|
124
125
|
segments.push(text);
|
|
125
126
|
});
|
|
126
127
|
if (segments.length === 0) {
|
|
127
|
-
const fallback = normalizeWhitespace($(
|
|
128
|
+
const fallback = normalizeWhitespace($("body").text() || sanitized);
|
|
128
129
|
return fallback ? [fallback] : [];
|
|
129
130
|
}
|
|
130
131
|
return mergeConsecutiveSegments(segments);
|
|
131
132
|
}
|
|
132
133
|
export function extractPlainText(html) {
|
|
133
|
-
const stripped = sanitizeHtml(html, {
|
|
134
|
+
const stripped = sanitizeHtml(stripHiddenHtml(html), {
|
|
134
135
|
allowedTags: [],
|
|
135
136
|
allowedAttributes: {},
|
|
136
137
|
nonTextTags: [
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
138
|
+
"style",
|
|
139
|
+
"script",
|
|
140
|
+
"noscript",
|
|
141
|
+
"template",
|
|
142
|
+
"svg",
|
|
143
|
+
"canvas",
|
|
144
|
+
"iframe",
|
|
145
|
+
"object",
|
|
146
|
+
"embed",
|
|
146
147
|
],
|
|
147
148
|
});
|
|
148
149
|
return decodeHtmlEntities(stripped);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"article.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/article.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,
|
|
1
|
+
{"version":3,"file":"article.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/article.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAC/B,OAAO,YAAY,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,MAAM,cAAc,CAAC;AACvE,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAElD,MAAM,kBAAkB,GAAG,EAAE,CAAC;AAE9B,MAAM,UAAU,iCAAiC,CAAC,IAAY;IAC5D,OAAO,YAAY,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE;QACzC,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;YACJ,GAAG;SACJ;QACD,iBAAiB,EAAE;YACjB,CAAC,EAAE,CAAC,MAAM,CAAC;SACZ;QACD,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAClC,CAAC;KACF,CAAC,CAAC;AACL,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,IAAY;IAChD,MAAM,QAAQ,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAC;IAC/C,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,OAAO,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7B,CAAC;IACD,MAAM,QAAQ,GAAG,mBAAmB,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;IAC7D,OAAO,QAAQ,IAAI,EAAE,CAAC;AACxB,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,IAAY;IAClD,MAAM,SAAS,GAAG,YAAY,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE;QACpD,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;SACL;QACD,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAClC,CAAC;KACF,CAAC,CAAC;IAEH,MAAM,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC;IAC1B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,CAAC,CAAC,uCAAuC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAC7D,IAAI,CAAC,CAAC,SAAS,IAAI,OAAO,CAAC,IAAI,OAAO,OAAO,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YACnE,OAAO;QACT,CAAC;QAED,MAAM,GAAG,
GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;QAE1C,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;QAC9B,MAAM,IAAI,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9D,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAO;QACT,CAAC;QAED,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YACxB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;YACD,OAAO;QACT,CAAC;QAED,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACjB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC;YAC7B,CAAC;YACD,OAAO;QACT,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,GAAG,kBAAkB,EAAE,CAAC;YACrC,OAAO;QACT,CAAC;QAED,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtB,CAAC,CAAC,CAAC;IAEH,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,SAAS,CAAC,CAAC;QACpE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACpC,CAAC;IAED,OAAO,wBAAwB,CAAC,QAAQ,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,QAAQ,GAAG,YAAY,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE;QACnD,WAAW,EAAE,EAAE;QACf,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;KACF,CAAC,CAAC;IACH,OAAO,kBAAkB,CAAC,QAAQ,CAAC,CAAC;AACtC,CAAC;AAED,SAAS,wBAAwB,CAAC,QAAkB;IAClD,gGAAgG;IAChG,2FAA2F;IAC3F,OAAO,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;AAClC,CAAC"}
|
|
@@ -1,36 +1,39 @@
|
|
|
1
|
-
import { compact } from
|
|
1
|
+
import { compact } from "es-toolkit";
|
|
2
2
|
const WORD_SPLIT_PATTERN = /\s+/g;
|
|
3
3
|
export function normalizeForPrompt(input) {
|
|
4
|
-
return input
|
|
5
|
-
.replaceAll(
|
|
6
|
-
.replaceAll(/[\t ]+/g,
|
|
7
|
-
.replaceAll(/\s*\n\s*/g,
|
|
8
|
-
.replaceAll(/\n{3,}/g,
|
|
4
|
+
return stripInvisibleUnicode(input)
|
|
5
|
+
.replaceAll("\u00A0", " ")
|
|
6
|
+
.replaceAll(/[\t ]+/g, " ")
|
|
7
|
+
.replaceAll(/\s*\n\s*/g, "\n")
|
|
8
|
+
.replaceAll(/\n{3,}/g, "\n\n")
|
|
9
9
|
.trim();
|
|
10
10
|
}
|
|
11
11
|
export function normalizeWhitespace(input) {
|
|
12
|
-
return input
|
|
13
|
-
.replaceAll(
|
|
14
|
-
.replaceAll(/[\t ]+/g,
|
|
15
|
-
.replaceAll(/\s*\n\s*/g,
|
|
12
|
+
return stripInvisibleUnicode(input)
|
|
13
|
+
.replaceAll("\u00A0", " ")
|
|
14
|
+
.replaceAll(/[\t ]+/g, " ")
|
|
15
|
+
.replaceAll(/\s*\n\s*/g, "\n")
|
|
16
16
|
.trim();
|
|
17
17
|
}
|
|
18
18
|
export function decodeHtmlEntities(input) {
|
|
19
19
|
return input
|
|
20
|
-
.replaceAll(
|
|
21
|
-
.replaceAll(
|
|
22
|
-
.replaceAll(
|
|
23
|
-
.replaceAll(
|
|
24
|
-
.replaceAll(
|
|
25
|
-
.replaceAll(
|
|
26
|
-
.replaceAll(
|
|
27
|
-
.replaceAll(
|
|
20
|
+
.replaceAll("&amp;", "&")
|
|
21
|
+
.replaceAll("&lt;", "<")
|
|
22
|
+
.replaceAll("&gt;", ">")
|
|
23
|
+
.replaceAll("&quot;", '"')
|
|
24
|
+
.replaceAll("&#39;", "'")
|
|
25
|
+
.replaceAll("&#x27;", "'")
|
|
26
|
+
.replaceAll("&#x2F;", "/")
|
|
27
|
+
.replaceAll("&nbsp;", " ");
|
|
28
|
+
}
|
|
29
|
+
export function stripInvisibleUnicode(input) {
|
|
30
|
+
return input.replaceAll(/[\u200B-\u200F\u202A-\u202E\u2060-\u2069\uFEFF\u{E0000}-\u{E007F}]/gu, "");
|
|
28
31
|
}
|
|
29
32
|
export function normalizeCandidate(value) {
|
|
30
33
|
if (!value) {
|
|
31
34
|
return null;
|
|
32
35
|
}
|
|
33
|
-
const trimmed = value.replaceAll(/\s+/g,
|
|
36
|
+
const trimmed = value.replaceAll(/\s+/g, " ").trim();
|
|
34
37
|
return trimmed.length > 0 ? trimmed : null;
|
|
35
38
|
}
|
|
36
39
|
export function clipAtSentenceBoundary(input, maxLength) {
|
|
@@ -38,7 +41,7 @@ export function clipAtSentenceBoundary(input, maxLength) {
|
|
|
38
41
|
return input;
|
|
39
42
|
}
|
|
40
43
|
const slice = input.slice(0, maxLength);
|
|
41
|
-
const lastSentenceBreak = Math.max(slice.lastIndexOf(
|
|
44
|
+
const lastSentenceBreak = Math.max(slice.lastIndexOf(". "), slice.lastIndexOf("! "), slice.lastIndexOf("? "), slice.lastIndexOf("\n\n"));
|
|
42
45
|
if (lastSentenceBreak > maxLength * 0.5) {
|
|
43
46
|
return slice.slice(0, lastSentenceBreak + 1);
|
|
44
47
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cleaner.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/cleaner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,
|
|
1
|
+
{"version":3,"file":"cleaner.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/cleaner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAErC,MAAM,kBAAkB,GAAG,MAAM,CAAC;AASlC,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,OAAO,qBAAqB,CAAC,KAAK,CAAC;SAChC,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,UAAU,CAAC,SAAS,EAAE,MAAM,CAAC;SAC7B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,KAAa;IAC/C,OAAO,qBAAqB,CAAC,KAAK,CAAC;SAChC,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,OAAO,KAAK;SACT,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;AAC/B,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,KAAa;IACjD,OAAO,KAAK,CAAC,UAAU,CACrB,sEAAsE,EACtE,EAAE,CACH,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAgC;IACjE,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,OAAO,IAAI,CAAC;IACd,CAAC;IACD,MAAM,OAAO,GAAG,KAAK,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IACrD,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;AAC7C,CAAC;AAED,MAAM,UAAU,sBAAsB,CAAC,KAAa,EAAE,SAAiB;IACrE,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IACxC,MAAM,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAChC,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAC1B,CAAC;IACF,IAAI,iBAAiB,GAAG,SAAS,GAAG,GAAG,EAAE,CAAC;QACxC,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAiB,GAAG,CAAC,CAAC,CAAC;IAC/C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,WAAmB,EACnB,aAAqB;IAErB,MAAM,eAAe,GAAG,WAAW,CAAC,MAAM,CA
AC;IAC3C,MAAM,SAAS,GAAG,eAAe,GAAG,aAAa,CAAC;IAClD,MAAM,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,sBAAsB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC;IAC7F,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAC/B,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7F,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,SAAS,EAAE,CAAC;AAC5D,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"constants.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/constants.ts"],"names":[],"mappings":"AAAA,MAAM,CAAC,MAAM,yBAAyB,GACpC,iIAAiI,
|
|
1
|
+
{"version":3,"file":"constants.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/constants.ts"],"names":[],"mappings":"AAAA,MAAM,CAAC,MAAM,yBAAyB,GACpC,iIAAiI,CAAC;AACpI,MAAM,CAAC,MAAM,2BAA2B,GAAG,GAAG,CAAC;AAC/C,MAAM,CAAC,MAAM,kCAAkC,GAAG,GAAG,CAAC;AACtD,MAAM,CAAC,MAAM,mCAAmC,GAAG,GAAG,CAAC;AACvD,MAAM,CAAC,MAAM,8BAA8B,GAAG,GAAG,CAAC;AAClD,MAAM,CAAC,MAAM,yCAAyC,GAAG,IAAI,CAAC"}
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
import { isYouTubeUrl } from
|
|
2
|
-
import { appendNote } from
|
|
1
|
+
import { isYouTubeUrl } from "../../url.js";
|
|
2
|
+
import { appendNote } from "./utils.js";
|
|
3
3
|
const REQUEST_HEADERS = {
|
|
4
|
-
|
|
5
|
-
Accept:
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
Pragma:
|
|
4
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
|
5
|
+
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
|
6
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
7
|
+
"Cache-Control": "no-cache",
|
|
8
|
+
Pragma: "no-cache",
|
|
9
9
|
};
|
|
10
10
|
const DEFAULT_REQUEST_TIMEOUT_MS = 5000;
|
|
11
11
|
export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress, } = {}) {
|
|
12
|
-
onProgress?.({ kind:
|
|
12
|
+
onProgress?.({ kind: "fetch-html-start", url });
|
|
13
13
|
const controller = new AbortController();
|
|
14
|
-
const effectiveTimeoutMs = typeof timeoutMs ===
|
|
14
|
+
const effectiveTimeoutMs = typeof timeoutMs === "number" && Number.isFinite(timeoutMs)
|
|
15
15
|
? timeoutMs
|
|
16
16
|
: DEFAULT_REQUEST_TIMEOUT_MS;
|
|
17
17
|
const timeout = setTimeout(() => {
|
|
@@ -20,26 +20,26 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
|
|
|
20
20
|
try {
|
|
21
21
|
const response = await fetchImpl(url, {
|
|
22
22
|
headers: REQUEST_HEADERS,
|
|
23
|
-
redirect:
|
|
23
|
+
redirect: "follow",
|
|
24
24
|
signal: controller.signal,
|
|
25
25
|
});
|
|
26
26
|
if (!response.ok) {
|
|
27
27
|
throw new Error(`Failed to fetch HTML document (status ${response.status})`);
|
|
28
28
|
}
|
|
29
29
|
const finalUrl = response.url?.trim() || url;
|
|
30
|
-
const contentType = response.headers.get(
|
|
30
|
+
const contentType = response.headers.get("content-type")?.toLowerCase() ?? null;
|
|
31
31
|
if (contentType &&
|
|
32
|
-
!contentType.includes(
|
|
33
|
-
!contentType.includes(
|
|
34
|
-
!contentType.includes(
|
|
35
|
-
!contentType.includes(
|
|
36
|
-
!contentType.includes(
|
|
37
|
-
!contentType.includes(
|
|
38
|
-
!contentType.startsWith(
|
|
32
|
+
!contentType.includes("text/html") &&
|
|
33
|
+
!contentType.includes("application/xhtml+xml") &&
|
|
34
|
+
!contentType.includes("application/xml") &&
|
|
35
|
+
!contentType.includes("text/xml") &&
|
|
36
|
+
!contentType.includes("application/rss+xml") &&
|
|
37
|
+
!contentType.includes("application/atom+xml") &&
|
|
38
|
+
!contentType.startsWith("text/")) {
|
|
39
39
|
throw new Error(`Unsupported content-type for HTML document fetch: ${contentType}`);
|
|
40
40
|
}
|
|
41
41
|
const totalBytes = (() => {
|
|
42
|
-
const raw = response.headers.get(
|
|
42
|
+
const raw = response.headers.get("content-length");
|
|
43
43
|
if (!raw)
|
|
44
44
|
return null;
|
|
45
45
|
const parsed = Number(raw);
|
|
@@ -49,14 +49,14 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
|
|
|
49
49
|
if (!body) {
|
|
50
50
|
const text = await response.text();
|
|
51
51
|
const bytes = new TextEncoder().encode(text).byteLength;
|
|
52
|
-
onProgress?.({ kind:
|
|
52
|
+
onProgress?.({ kind: "fetch-html-done", url, downloadedBytes: bytes, totalBytes });
|
|
53
53
|
return { html: text, finalUrl };
|
|
54
54
|
}
|
|
55
55
|
const reader = body.getReader();
|
|
56
56
|
const decoder = new TextDecoder();
|
|
57
57
|
let downloadedBytes = 0;
|
|
58
|
-
let text =
|
|
59
|
-
onProgress?.({ kind:
|
|
58
|
+
let text = "";
|
|
59
|
+
onProgress?.({ kind: "fetch-html-progress", url, downloadedBytes: 0, totalBytes });
|
|
60
60
|
while (true) {
|
|
61
61
|
const { value, done } = await reader.read();
|
|
62
62
|
if (done)
|
|
@@ -65,15 +65,15 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
|
|
|
65
65
|
continue;
|
|
66
66
|
downloadedBytes += value.byteLength;
|
|
67
67
|
text += decoder.decode(value, { stream: true });
|
|
68
|
-
onProgress?.({ kind:
|
|
68
|
+
onProgress?.({ kind: "fetch-html-progress", url, downloadedBytes, totalBytes });
|
|
69
69
|
}
|
|
70
70
|
text += decoder.decode();
|
|
71
|
-
onProgress?.({ kind:
|
|
71
|
+
onProgress?.({ kind: "fetch-html-done", url, downloadedBytes, totalBytes });
|
|
72
72
|
return { html: text, finalUrl };
|
|
73
73
|
}
|
|
74
74
|
catch (error) {
|
|
75
|
-
if (error instanceof DOMException && error.name ===
|
|
76
|
-
throw new Error(
|
|
75
|
+
if (error instanceof DOMException && error.name === "AbortError") {
|
|
76
|
+
throw new Error("Fetching HTML document timed out");
|
|
77
77
|
}
|
|
78
78
|
throw error;
|
|
79
79
|
}
|
|
@@ -83,42 +83,48 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
|
|
|
83
83
|
}
|
|
84
84
|
export async function fetchWithFirecrawl(url, scrapeWithFirecrawl, options = {}) {
|
|
85
85
|
const timeoutMs = options.timeoutMs;
|
|
86
|
-
const cacheMode = options.cacheMode ??
|
|
87
|
-
const onProgress = typeof options.onProgress ===
|
|
88
|
-
const reason = typeof options.reason ===
|
|
86
|
+
const cacheMode = options.cacheMode ?? "default";
|
|
87
|
+
const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
|
|
88
|
+
const reason = typeof options.reason === "string" ? options.reason : null;
|
|
89
89
|
const diagnostics = {
|
|
90
90
|
attempted: false,
|
|
91
91
|
used: false,
|
|
92
92
|
cacheMode,
|
|
93
|
-
cacheStatus: cacheMode ===
|
|
93
|
+
cacheStatus: cacheMode === "bypass" ? "bypassed" : "unknown",
|
|
94
94
|
notes: null,
|
|
95
95
|
};
|
|
96
96
|
if (isYouTubeUrl(url)) {
|
|
97
|
-
diagnostics.notes = appendNote(diagnostics.notes,
|
|
97
|
+
diagnostics.notes = appendNote(diagnostics.notes, "Skipped Firecrawl for YouTube URL");
|
|
98
98
|
return { payload: null, diagnostics };
|
|
99
99
|
}
|
|
100
100
|
if (!scrapeWithFirecrawl) {
|
|
101
|
-
diagnostics.notes = appendNote(diagnostics.notes,
|
|
101
|
+
diagnostics.notes = appendNote(diagnostics.notes, "Firecrawl is not configured");
|
|
102
102
|
return { payload: null, diagnostics };
|
|
103
103
|
}
|
|
104
104
|
diagnostics.attempted = true;
|
|
105
|
-
onProgress?.({ kind:
|
|
105
|
+
onProgress?.({ kind: "firecrawl-start", url, reason: reason ?? "firecrawl" });
|
|
106
106
|
try {
|
|
107
107
|
const payload = await scrapeWithFirecrawl(url, { timeoutMs, cacheMode });
|
|
108
108
|
if (!payload) {
|
|
109
|
-
diagnostics.notes = appendNote(diagnostics.notes,
|
|
110
|
-
onProgress?.({
|
|
109
|
+
diagnostics.notes = appendNote(diagnostics.notes, "Firecrawl returned no content payload");
|
|
110
|
+
onProgress?.({
|
|
111
|
+
kind: "firecrawl-done",
|
|
112
|
+
url,
|
|
113
|
+
ok: false,
|
|
114
|
+
markdownBytes: null,
|
|
115
|
+
htmlBytes: null,
|
|
116
|
+
});
|
|
111
117
|
return { payload: null, diagnostics };
|
|
112
118
|
}
|
|
113
119
|
const encoder = new TextEncoder();
|
|
114
|
-
const markdownBytes = typeof payload.markdown ===
|
|
115
|
-
const htmlBytes = typeof payload.html ===
|
|
116
|
-
onProgress?.({ kind:
|
|
120
|
+
const markdownBytes = typeof payload.markdown === "string" ? encoder.encode(payload.markdown).byteLength : null;
|
|
121
|
+
const htmlBytes = typeof payload.html === "string" ? encoder.encode(payload.html).byteLength : null;
|
|
122
|
+
onProgress?.({ kind: "firecrawl-done", url, ok: true, markdownBytes, htmlBytes });
|
|
117
123
|
return { payload, diagnostics };
|
|
118
124
|
}
|
|
119
125
|
catch (error) {
|
|
120
|
-
diagnostics.notes = appendNote(diagnostics.notes, `Firecrawl error: ${error instanceof Error ? error.message :
|
|
121
|
-
onProgress?.({ kind:
|
|
126
|
+
diagnostics.notes = appendNote(diagnostics.notes, `Firecrawl error: ${error instanceof Error ? error.message : "unknown error"}`);
|
|
127
|
+
onProgress?.({ kind: "firecrawl-done", url, ok: false, markdownBytes: null, htmlBytes: null });
|
|
122
128
|
return { payload: null, diagnostics };
|
|
123
129
|
}
|
|
124
130
|
}
|