@steipete/summarize-core 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/dist/esm/content/index.js +5 -5
- package/dist/esm/content/index.js.map +1 -1
- package/dist/esm/content/link-preview/client.js +20 -9
- package/dist/esm/content/link-preview/client.js.map +1 -1
- package/dist/esm/content/link-preview/content/article.js +84 -83
- package/dist/esm/content/link-preview/content/article.js.map +1 -1
- package/dist/esm/content/link-preview/content/cleaner.js +23 -20
- package/dist/esm/content/link-preview/content/cleaner.js.map +1 -1
- package/dist/esm/content/link-preview/content/constants.js.map +1 -1
- package/dist/esm/content/link-preview/content/fetcher.js +46 -40
- package/dist/esm/content/link-preview/content/fetcher.js.map +1 -1
- package/dist/esm/content/link-preview/content/firecrawl.js +16 -16
- package/dist/esm/content/link-preview/content/firecrawl.js.map +1 -1
- package/dist/esm/content/link-preview/content/html.js +29 -27
- package/dist/esm/content/link-preview/content/html.js.map +1 -1
- package/dist/esm/content/link-preview/content/index.js +141 -88
- package/dist/esm/content/link-preview/content/index.js.map +1 -1
- package/dist/esm/content/link-preview/content/jsonld.js +12 -12
- package/dist/esm/content/link-preview/content/jsonld.js.map +1 -1
- package/dist/esm/content/link-preview/content/parsers.js +20 -20
- package/dist/esm/content/link-preview/content/parsers.js.map +1 -1
- package/dist/esm/content/link-preview/content/podcast-utils.js +34 -34
- package/dist/esm/content/link-preview/content/podcast-utils.js.map +1 -1
- package/dist/esm/content/link-preview/content/readability.js +16 -15
- package/dist/esm/content/link-preview/content/readability.js.map +1 -1
- package/dist/esm/content/link-preview/content/twitter-utils.js +24 -11
- package/dist/esm/content/link-preview/content/twitter-utils.js.map +1 -1
- package/dist/esm/content/link-preview/content/types.js +1 -1
- package/dist/esm/content/link-preview/content/types.js.map +1 -1
- package/dist/esm/content/link-preview/content/utils.js +17 -17
- package/dist/esm/content/link-preview/content/utils.js.map +1 -1
- package/dist/esm/content/link-preview/content/video.js +19 -19
- package/dist/esm/content/link-preview/content/video.js.map +1 -1
- package/dist/esm/content/link-preview/content/visibility.js +121 -0
- package/dist/esm/content/link-preview/content/visibility.js.map +1 -0
- package/dist/esm/content/link-preview/content/youtube.js +10 -10
- package/dist/esm/content/link-preview/content/youtube.js.map +1 -1
- package/dist/esm/content/link-preview/deps.js +16 -16
- package/dist/esm/content/link-preview/deps.js.map +1 -1
- package/dist/esm/content/link-preview/fetch-with-timeout.js +4 -4
- package/dist/esm/content/link-preview/fetch-with-timeout.js.map +1 -1
- package/dist/esm/content/link-preview/types.js +1 -1
- package/dist/esm/content/link-preview/types.js.map +1 -1
- package/dist/esm/content/transcript/cache.js +22 -22
- package/dist/esm/content/transcript/cache.js.map +1 -1
- package/dist/esm/content/transcript/index.js +34 -24
- package/dist/esm/content/transcript/index.js.map +1 -1
- package/dist/esm/content/transcript/normalize.js +10 -10
- package/dist/esm/content/transcript/normalize.js.map +1 -1
- package/dist/esm/content/transcript/parse.js +31 -31
- package/dist/esm/content/transcript/parse.js.map +1 -1
- package/dist/esm/content/transcript/providers/generic.js +74 -78
- package/dist/esm/content/transcript/providers/generic.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/apple-flow.js +36 -36
- package/dist/esm/content/transcript/providers/podcast/apple-flow.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/apple.js +5 -5
- package/dist/esm/content/transcript/providers/podcast/apple.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/constants.js +2 -2
- package/dist/esm/content/transcript/providers/podcast/constants.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/itunes.js +44 -42
- package/dist/esm/content/transcript/providers/podcast/itunes.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/json.js +4 -4
- package/dist/esm/content/transcript/providers/podcast/json.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/media.js +58 -49
- package/dist/esm/content/transcript/providers/podcast/media.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/results.js +2 -2
- package/dist/esm/content/transcript/providers/podcast/results.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/rss.js +29 -29
- package/dist/esm/content/transcript/providers/podcast/rss.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/spotify-flow.js +38 -38
- package/dist/esm/content/transcript/providers/podcast/spotify-flow.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/spotify.js +32 -32
- package/dist/esm/content/transcript/providers/podcast/spotify.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast.js +43 -47
- package/dist/esm/content/transcript/providers/podcast.js.map +1 -1
- package/dist/esm/content/transcript/providers/transcription-start.js +59 -31
- package/dist/esm/content/transcript/providers/transcription-start.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/api.js +56 -56
- package/dist/esm/content/transcript/providers/youtube/api.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/apify.js +7 -7
- package/dist/esm/content/transcript/providers/youtube/apify.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/captions.js +76 -76
- package/dist/esm/content/transcript/providers/youtube/captions.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/yt-dlp.js +82 -75
- package/dist/esm/content/transcript/providers/youtube/yt-dlp.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube.js +84 -77
- package/dist/esm/content/transcript/providers/youtube.js.map +1 -1
- package/dist/esm/content/transcript/timestamps.js +8 -8
- package/dist/esm/content/transcript/timestamps.js.map +1 -1
- package/dist/esm/content/transcript/transcription-config.js +14 -0
- package/dist/esm/content/transcript/transcription-config.js.map +1 -0
- package/dist/esm/content/transcript/utils.js +35 -35
- package/dist/esm/content/transcript/utils.js.map +1 -1
- package/dist/esm/content/url.js +59 -28
- package/dist/esm/content/url.js.map +1 -1
- package/dist/esm/index.js +4 -3
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/language.js +77 -77
- package/dist/esm/language.js.map +1 -1
- package/dist/esm/openai/base-url.js +35 -0
- package/dist/esm/openai/base-url.js.map +1 -0
- package/dist/esm/processes.js +16 -16
- package/dist/esm/processes.js.map +1 -1
- package/dist/esm/prompts/cli.js +17 -17
- package/dist/esm/prompts/cli.js.map +1 -1
- package/dist/esm/prompts/file.js +54 -54
- package/dist/esm/prompts/file.js.map +1 -1
- package/dist/esm/prompts/format.js +2 -2
- package/dist/esm/prompts/format.js.map +1 -1
- package/dist/esm/prompts/index.js +5 -5
- package/dist/esm/prompts/index.js.map +1 -1
- package/dist/esm/prompts/link-summary.js +65 -65
- package/dist/esm/prompts/link-summary.js.map +1 -1
- package/dist/esm/prompts/summary-lengths.js +10 -10
- package/dist/esm/prompts/summary-lengths.js.map +1 -1
- package/dist/esm/prompts/summary-system.js +9 -9
- package/dist/esm/prompts/summary-system.js.map +1 -1
- package/dist/esm/shared/contracts.js +1 -1
- package/dist/esm/shared/contracts.js.map +1 -1
- package/dist/esm/transcription/onnx-cli.js +69 -69
- package/dist/esm/transcription/onnx-cli.js.map +1 -1
- package/dist/esm/transcription/whisper/constants.js +3 -3
- package/dist/esm/transcription/whisper/constants.js.map +1 -1
- package/dist/esm/transcription/whisper/core.js +148 -59
- package/dist/esm/transcription/whisper/core.js.map +1 -1
- package/dist/esm/transcription/whisper/fal.js +14 -14
- package/dist/esm/transcription/whisper/fal.js.map +1 -1
- package/dist/esm/transcription/whisper/ffmpeg.js +106 -106
- package/dist/esm/transcription/whisper/ffmpeg.js.map +1 -1
- package/dist/esm/transcription/whisper/groq.js +46 -0
- package/dist/esm/transcription/whisper/groq.js.map +1 -0
- package/dist/esm/transcription/whisper/openai.js +19 -13
- package/dist/esm/transcription/whisper/openai.js.map +1 -1
- package/dist/esm/transcription/whisper/utils.js +19 -19
- package/dist/esm/transcription/whisper/utils.js.map +1 -1
- package/dist/esm/transcription/whisper/whisper-cpp.js +64 -64
- package/dist/esm/transcription/whisper/whisper-cpp.js.map +1 -1
- package/dist/esm/transcription/whisper.js +4 -4
- package/dist/esm/transcription/whisper.js.map +1 -1
- package/dist/types/content/cache/types.d.ts +1 -1
- package/dist/types/content/index.d.ts +7 -7
- package/dist/types/content/link-preview/client.d.ts +7 -4
- package/dist/types/content/link-preview/content/cleaner.d.ts +1 -0
- package/dist/types/content/link-preview/content/fetcher.d.ts +2 -2
- package/dist/types/content/link-preview/content/firecrawl.d.ts +7 -7
- package/dist/types/content/link-preview/content/html.d.ts +8 -8
- package/dist/types/content/link-preview/content/index.d.ts +3 -3
- package/dist/types/content/link-preview/content/twitter-utils.d.ts +1 -0
- package/dist/types/content/link-preview/content/types.d.ts +8 -8
- package/dist/types/content/link-preview/content/utils.d.ts +3 -3
- package/dist/types/content/link-preview/content/video.d.ts +1 -1
- package/dist/types/content/link-preview/content/visibility.d.ts +1 -0
- package/dist/types/content/link-preview/deps.d.ts +36 -33
- package/dist/types/content/link-preview/types.d.ts +4 -4
- package/dist/types/content/transcript/cache.d.ts +4 -4
- package/dist/types/content/transcript/index.d.ts +7 -7
- package/dist/types/content/transcript/parse.d.ts +1 -1
- package/dist/types/content/transcript/providers/generic.d.ts +1 -1
- package/dist/types/content/transcript/providers/podcast/apple-flow.d.ts +2 -2
- package/dist/types/content/transcript/providers/podcast/flow-context.d.ts +4 -4
- package/dist/types/content/transcript/providers/podcast/media.d.ts +9 -6
- package/dist/types/content/transcript/providers/podcast/results.d.ts +3 -3
- package/dist/types/content/transcript/providers/podcast/rss.d.ts +1 -1
- package/dist/types/content/transcript/providers/podcast/spotify-flow.d.ts +2 -2
- package/dist/types/content/transcript/providers/podcast/spotify.d.ts +2 -2
- package/dist/types/content/transcript/providers/podcast.d.ts +5 -5
- package/dist/types/content/transcript/providers/transcription-start.d.ts +14 -8
- package/dist/types/content/transcript/providers/youtube/api.d.ts +1 -1
- package/dist/types/content/transcript/providers/youtube/captions.d.ts +1 -1
- package/dist/types/content/transcript/providers/youtube/yt-dlp.d.ts +11 -8
- package/dist/types/content/transcript/providers/youtube.d.ts +1 -1
- package/dist/types/content/transcript/timestamps.d.ts +1 -1
- package/dist/types/content/transcript/transcription-config.d.ts +15 -0
- package/dist/types/content/transcript/types.d.ts +12 -9
- package/dist/types/content/transcript/utils.d.ts +1 -1
- package/dist/types/content/url.d.ts +5 -3
- package/dist/types/index.d.ts +5 -4
- package/dist/types/language.d.ts +4 -4
- package/dist/types/openai/base-url.d.ts +14 -0
- package/dist/types/processes.d.ts +2 -2
- package/dist/types/prompts/cli.d.ts +3 -3
- package/dist/types/prompts/file.d.ts +2 -2
- package/dist/types/prompts/index.d.ts +6 -6
- package/dist/types/prompts/link-summary.d.ts +3 -3
- package/dist/types/prompts/summary-lengths.d.ts +1 -1
- package/dist/types/transcription/onnx-cli.d.ts +3 -3
- package/dist/types/transcription/whisper/core.d.ts +6 -3
- package/dist/types/transcription/whisper/groq.d.ts +2 -0
- package/dist/types/transcription/whisper/openai.d.ts +6 -1
- package/dist/types/transcription/whisper/types.d.ts +1 -1
- package/dist/types/transcription/whisper/whisper-cpp.d.ts +1 -1
- package/dist/types/transcription/whisper.d.ts +5 -5
- package/package.json +18 -17
- package/LICENSE +0 -21
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
export { createLinkPreviewClient, } from
|
|
2
|
-
export { DEFAULT_CACHE_MODE, DEFAULT_MAX_CONTENT_CHARACTERS, DEFAULT_TIMEOUT_MS, } from
|
|
3
|
-
export { ProgressKind } from
|
|
4
|
-
export { CACHE_MODES, } from
|
|
5
|
-
export { extractYouTubeVideoId, isDirectMediaUrl, isPodcastHost, isTwitterStatusUrl, isYouTubeUrl, isYouTubeVideoUrl, shouldPreferUrlMode, } from
|
|
1
|
+
export { createLinkPreviewClient, } from "./link-preview/client.js";
|
|
2
|
+
export { DEFAULT_CACHE_MODE, DEFAULT_MAX_CONTENT_CHARACTERS, DEFAULT_TIMEOUT_MS, } from "./link-preview/content/types.js";
|
|
3
|
+
export { ProgressKind } from "./link-preview/deps.js";
|
|
4
|
+
export { CACHE_MODES, } from "./link-preview/types.js";
|
|
5
|
+
export { DIRECT_MEDIA_EXTENSIONS, extractYouTubeVideoId, isDirectMediaExtension, isDirectMediaUrl, isPodcastHost, isTwitterBroadcastUrl, isTwitterStatusUrl, isYouTubeUrl, isYouTubeVideoUrl, shouldPreferUrlMode, } from "./url.js";
|
|
6
6
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/content/index.ts"],"names":[],"mappings":"AAOA,OAAO,EACL,uBAAuB,GAGxB,MAAM,0BAA0B,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/content/index.ts"],"names":[],"mappings":"AAOA,OAAO,EACL,uBAAuB,GAGxB,MAAM,0BAA0B,CAAC;AAClC,OAAO,EACL,kBAAkB,EAClB,8BAA8B,EAC9B,kBAAkB,GAGnB,MAAM,iCAAiC,CAAC;AASzC,OAAO,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AACtD,OAAO,EACL,WAAW,GAIZ,MAAM,yBAAyB,CAAC;AACjC,OAAO,EACL,uBAAuB,EACvB,qBAAqB,EACrB,sBAAsB,EACtB,gBAAgB,EAChB,aAAa,EACb,qBAAqB,EACrB,kBAAkB,EAClB,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,GACpB,MAAM,UAAU,CAAC"}
|
|
@@ -1,19 +1,28 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { resolveTranscriptionConfig, } from "../transcript/transcription-config.js";
|
|
2
|
+
import { fetchLinkContent } from "./content/index.js";
|
|
2
3
|
/** Public factory for a link preview client with injectable dependencies. */
|
|
3
4
|
export function createLinkPreviewClient(options = {}) {
|
|
4
5
|
const fetchImpl = options.fetch ?? ((...args) => globalThis.fetch(...args));
|
|
5
|
-
const env = typeof options.env ===
|
|
6
|
+
const env = typeof options.env === "object" && options.env ? options.env : undefined;
|
|
6
7
|
const scrape = options.scrapeWithFirecrawl ?? null;
|
|
7
|
-
const apifyApiToken = typeof options.apifyApiToken ===
|
|
8
|
-
const ytDlpPath = typeof options.ytDlpPath ===
|
|
9
|
-
const falApiKey = typeof options.falApiKey ===
|
|
10
|
-
const
|
|
8
|
+
const apifyApiToken = typeof options.apifyApiToken === "string" ? options.apifyApiToken : null;
|
|
9
|
+
const ytDlpPath = typeof options.ytDlpPath === "string" ? options.ytDlpPath : null;
|
|
10
|
+
const falApiKey = typeof options.falApiKey === "string" ? options.falApiKey : null;
|
|
11
|
+
const groqApiKey = typeof options.groqApiKey === "string" ? options.groqApiKey : null;
|
|
12
|
+
const openaiApiKey = typeof options.openaiApiKey === "string" ? options.openaiApiKey : null;
|
|
13
|
+
const transcription = resolveTranscriptionConfig({
|
|
14
|
+
env,
|
|
15
|
+
transcription: options.transcription ?? null,
|
|
16
|
+
falApiKey,
|
|
17
|
+
groqApiKey,
|
|
18
|
+
openaiApiKey,
|
|
19
|
+
});
|
|
11
20
|
const convertHtmlToMarkdown = options.convertHtmlToMarkdown ?? null;
|
|
12
21
|
const transcriptCache = options.transcriptCache ?? null;
|
|
13
22
|
const mediaCache = options.mediaCache ?? null;
|
|
14
|
-
const readTweetWithBird = typeof options.readTweetWithBird ===
|
|
15
|
-
const resolveTwitterCookies = typeof options.resolveTwitterCookies ===
|
|
16
|
-
const onProgress = typeof options.onProgress ===
|
|
23
|
+
const readTweetWithBird = typeof options.readTweetWithBird === "function" ? options.readTweetWithBird : null;
|
|
24
|
+
const resolveTwitterCookies = typeof options.resolveTwitterCookies === "function" ? options.resolveTwitterCookies : null;
|
|
25
|
+
const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
|
|
17
26
|
return {
|
|
18
27
|
fetchLinkContent: (url, contentOptions) => fetchLinkContent(url, contentOptions, {
|
|
19
28
|
fetch: fetchImpl,
|
|
@@ -21,7 +30,9 @@ export function createLinkPreviewClient(options = {}) {
|
|
|
21
30
|
scrapeWithFirecrawl: scrape,
|
|
22
31
|
apifyApiToken,
|
|
23
32
|
ytDlpPath,
|
|
33
|
+
transcription,
|
|
24
34
|
falApiKey,
|
|
35
|
+
groqApiKey,
|
|
25
36
|
openaiApiKey,
|
|
26
37
|
convertHtmlToMarkdown,
|
|
27
38
|
transcriptCache,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"client.js","sourceRoot":"","sources":["../../../../src/content/link-preview/client.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"client.js","sourceRoot":"","sources":["../../../../src/content/link-preview/client.ts"],"names":[],"mappings":"AASA,OAAO,EACL,0BAA0B,GAE3B,MAAM,uCAAuC,CAAC;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AA0BtD,6EAA6E;AAC7E,MAAM,UAAU,uBAAuB,CAAC,UAAoC,EAAE;IAC5E,MAAM,SAAS,GACb,OAAO,CAAC,KAAK,IAAI,CAAC,CAAC,GAAG,IAA8B,EAAE,EAAE,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC;IACtF,MAAM,GAAG,GAAG,OAAO,OAAO,CAAC,GAAG,KAAK,QAAQ,IAAI,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;IACrF,MAAM,MAAM,GAA+B,OAAO,CAAC,mBAAmB,IAAI,IAAI,CAAC;IAC/E,MAAM,aAAa,GAAG,OAAO,OAAO,CAAC,aAAa,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC;IAC/F,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC;IACnF,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC;IACnF,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IACtF,MAAM,YAAY,GAAG,OAAO,OAAO,CAAC,YAAY,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC;IAC5F,MAAM,aAAa,GAAG,0BAA0B,CAAC;QAC/C,GAAG;QACH,aAAa,EAAE,OAAO,CAAC,aAAa,IAAI,IAAI;QAC5C,SAAS;QACT,UAAU;QACV,YAAY;KACb,CAAC,CAAC;IACH,MAAM,qBAAqB,GAAiC,OAAO,CAAC,qBAAqB,IAAI,IAAI,CAAC;IAClG,MAAM,eAAe,GAA2B,OAAO,CAAC,eAAe,IAAI,IAAI,CAAC;IAChF,MAAM,UAAU,GAAsB,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC;IACjE,MAAM,iBAAiB,GACrB,OAAO,OAAO,CAAC,iBAAiB,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,iBAAiB,CAAC,CAAC,CAAC,IAAI,CAAC;IACrF,MAAM,qBAAqB,GACzB,OAAO,OAAO,CAAC,qBAAqB,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,qBAAqB,CAAC,CAAC,CAAC,IAAI,CAAC;IAC7F,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAExF,OAAO;QACL,gBAAgB,EAAE,CAAC,GAAW,EAAE,cAAwC,EAAE,EAAE,CAC1E,gBAAgB,CAAC,GAAG,EAAE,cAAc,EAAE;YACpC,KAAK,EAAE,SAAS;YAChB,GAAG;YACH,mBAAmB,EAAE,MAAM;YAC3B,aAAa;YACb,SAAS;YACT,aAAa;YACb,SAAS;YACT,UAAU;YACV,YAAY;YACZ,qBAAqB;YACrB,eAAe;YACf,UAAU;YACV,iBAAiB;YACjB,qBAAqB;YA
CrB,UAAU;SACX,CAAC;KACL,CAAC;AACJ,CAAC"}
|
|
@@ -1,45 +1,46 @@
|
|
|
1
|
-
import { load } from
|
|
2
|
-
import sanitizeHtml from
|
|
3
|
-
import { decodeHtmlEntities, normalizeWhitespace } from
|
|
1
|
+
import { load } from "cheerio";
|
|
2
|
+
import sanitizeHtml from "sanitize-html";
|
|
3
|
+
import { decodeHtmlEntities, normalizeWhitespace } from "./cleaner.js";
|
|
4
|
+
import { stripHiddenHtml } from "./visibility.js";
|
|
4
5
|
const MIN_SEGMENT_LENGTH = 30;
|
|
5
6
|
export function sanitizeHtmlForMarkdownConversion(html) {
|
|
6
|
-
return sanitizeHtml(html, {
|
|
7
|
+
return sanitizeHtml(stripHiddenHtml(html), {
|
|
7
8
|
allowedTags: [
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
9
|
+
"article",
|
|
10
|
+
"section",
|
|
11
|
+
"div",
|
|
12
|
+
"p",
|
|
13
|
+
"h1",
|
|
14
|
+
"h2",
|
|
15
|
+
"h3",
|
|
16
|
+
"h4",
|
|
17
|
+
"h5",
|
|
18
|
+
"h6",
|
|
19
|
+
"ol",
|
|
20
|
+
"ul",
|
|
21
|
+
"li",
|
|
22
|
+
"blockquote",
|
|
23
|
+
"pre",
|
|
24
|
+
"code",
|
|
25
|
+
"span",
|
|
26
|
+
"strong",
|
|
27
|
+
"em",
|
|
28
|
+
"br",
|
|
29
|
+
"a",
|
|
29
30
|
],
|
|
30
31
|
allowedAttributes: {
|
|
31
|
-
a: [
|
|
32
|
+
a: ["href"],
|
|
32
33
|
},
|
|
33
34
|
nonTextTags: [
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
35
|
+
"style",
|
|
36
|
+
"script",
|
|
37
|
+
"noscript",
|
|
38
|
+
"template",
|
|
39
|
+
"svg",
|
|
40
|
+
"canvas",
|
|
41
|
+
"iframe",
|
|
42
|
+
"object",
|
|
43
|
+
"embed",
|
|
43
44
|
],
|
|
44
45
|
textFilter(text) {
|
|
45
46
|
return decodeHtmlEntities(text);
|
|
@@ -49,46 +50,46 @@ export function sanitizeHtmlForMarkdownConversion(html) {
|
|
|
49
50
|
export function extractArticleContent(html) {
|
|
50
51
|
const segments = collectSegmentsFromHtml(html);
|
|
51
52
|
if (segments.length > 0) {
|
|
52
|
-
return segments.join(
|
|
53
|
+
return segments.join("\n");
|
|
53
54
|
}
|
|
54
55
|
const fallback = normalizeWhitespace(extractPlainText(html));
|
|
55
|
-
return fallback ??
|
|
56
|
+
return fallback ?? "";
|
|
56
57
|
}
|
|
57
58
|
export function collectSegmentsFromHtml(html) {
|
|
58
|
-
const sanitized = sanitizeHtml(html, {
|
|
59
|
+
const sanitized = sanitizeHtml(stripHiddenHtml(html), {
|
|
59
60
|
allowedTags: [
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
61
|
+
"article",
|
|
62
|
+
"section",
|
|
63
|
+
"div",
|
|
64
|
+
"p",
|
|
65
|
+
"h1",
|
|
66
|
+
"h2",
|
|
67
|
+
"h3",
|
|
68
|
+
"h4",
|
|
69
|
+
"h5",
|
|
70
|
+
"h6",
|
|
71
|
+
"ol",
|
|
72
|
+
"ul",
|
|
73
|
+
"li",
|
|
74
|
+
"blockquote",
|
|
75
|
+
"pre",
|
|
76
|
+
"code",
|
|
77
|
+
"span",
|
|
78
|
+
"strong",
|
|
79
|
+
"em",
|
|
80
|
+
"br",
|
|
80
81
|
],
|
|
81
82
|
allowedAttributes: {},
|
|
82
83
|
nonTextTags: [
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
84
|
+
"style",
|
|
85
|
+
"script",
|
|
86
|
+
"noscript",
|
|
87
|
+
"template",
|
|
88
|
+
"svg",
|
|
89
|
+
"canvas",
|
|
90
|
+
"iframe",
|
|
91
|
+
"object",
|
|
92
|
+
"embed",
|
|
92
93
|
],
|
|
93
94
|
textFilter(text) {
|
|
94
95
|
return decodeHtmlEntities(text);
|
|
@@ -96,23 +97,23 @@ export function collectSegmentsFromHtml(html) {
|
|
|
96
97
|
});
|
|
97
98
|
const $ = load(sanitized);
|
|
98
99
|
const segments = [];
|
|
99
|
-
$(
|
|
100
|
-
if (!(
|
|
100
|
+
$("h1,h2,h3,h4,h5,h6,li,p,blockquote,pre").each((_, element) => {
|
|
101
|
+
if (!("tagName" in element) || typeof element.tagName !== "string") {
|
|
101
102
|
return;
|
|
102
103
|
}
|
|
103
104
|
const tag = element.tagName.toLowerCase();
|
|
104
105
|
const raw = $(element).text();
|
|
105
|
-
const text = normalizeWhitespace(raw).replaceAll(/\n+/g,
|
|
106
|
+
const text = normalizeWhitespace(raw).replaceAll(/\n+/g, " ");
|
|
106
107
|
if (!text || text.length === 0) {
|
|
107
108
|
return;
|
|
108
109
|
}
|
|
109
|
-
if (tag.startsWith(
|
|
110
|
+
if (tag.startsWith("h")) {
|
|
110
111
|
if (text.length >= 10) {
|
|
111
112
|
segments.push(text);
|
|
112
113
|
}
|
|
113
114
|
return;
|
|
114
115
|
}
|
|
115
|
-
if (tag ===
|
|
116
|
+
if (tag === "li") {
|
|
116
117
|
if (text.length >= 20) {
|
|
117
118
|
segments.push(`• ${text}`);
|
|
118
119
|
}
|
|
@@ -124,25 +125,25 @@ export function collectSegmentsFromHtml(html) {
|
|
|
124
125
|
segments.push(text);
|
|
125
126
|
});
|
|
126
127
|
if (segments.length === 0) {
|
|
127
|
-
const fallback = normalizeWhitespace($(
|
|
128
|
+
const fallback = normalizeWhitespace($("body").text() || sanitized);
|
|
128
129
|
return fallback ? [fallback] : [];
|
|
129
130
|
}
|
|
130
131
|
return mergeConsecutiveSegments(segments);
|
|
131
132
|
}
|
|
132
133
|
export function extractPlainText(html) {
|
|
133
|
-
const stripped = sanitizeHtml(html, {
|
|
134
|
+
const stripped = sanitizeHtml(stripHiddenHtml(html), {
|
|
134
135
|
allowedTags: [],
|
|
135
136
|
allowedAttributes: {},
|
|
136
137
|
nonTextTags: [
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
138
|
+
"style",
|
|
139
|
+
"script",
|
|
140
|
+
"noscript",
|
|
141
|
+
"template",
|
|
142
|
+
"svg",
|
|
143
|
+
"canvas",
|
|
144
|
+
"iframe",
|
|
145
|
+
"object",
|
|
146
|
+
"embed",
|
|
146
147
|
],
|
|
147
148
|
});
|
|
148
149
|
return decodeHtmlEntities(stripped);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"article.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/article.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,
|
|
1
|
+
{"version":3,"file":"article.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/article.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAC/B,OAAO,YAAY,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,MAAM,cAAc,CAAC;AACvE,OAAO,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAElD,MAAM,kBAAkB,GAAG,EAAE,CAAC;AAE9B,MAAM,UAAU,iCAAiC,CAAC,IAAY;IAC5D,OAAO,YAAY,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE;QACzC,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;YACJ,GAAG;SACJ;QACD,iBAAiB,EAAE;YACjB,CAAC,EAAE,CAAC,MAAM,CAAC;SACZ;QACD,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAClC,CAAC;KACF,CAAC,CAAC;AACL,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,IAAY;IAChD,MAAM,QAAQ,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAC;IAC/C,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,OAAO,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7B,CAAC;IACD,MAAM,QAAQ,GAAG,mBAAmB,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;IAC7D,OAAO,QAAQ,IAAI,EAAE,CAAC;AACxB,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,IAAY;IAClD,MAAM,SAAS,GAAG,YAAY,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE;QACpD,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;SACL;QACD,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAClC,CAAC;KACF,CAAC,CAAC;IAEH,MAAM,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC;IAC1B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,CAAC,CAAC,uCAAuC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAC7D,IAAI,CAAC,CAAC,SAAS,IAAI,OAAO,CAAC,IAAI,OAAO,OAAO,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YACnE,OAAO;QACT,CAAC;QAED,MAAM,GAAG,
GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;QAE1C,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;QAC9B,MAAM,IAAI,GAAG,mBAAmB,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9D,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAO;QACT,CAAC;QAED,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YACxB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,CAAC;YACD,OAAO;QACT,CAAC;QAED,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACjB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC;YAC7B,CAAC;YACD,OAAO;QACT,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,GAAG,kBAAkB,EAAE,CAAC;YACrC,OAAO;QACT,CAAC;QAED,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtB,CAAC,CAAC,CAAC;IAEH,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,SAAS,CAAC,CAAC;QACpE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACpC,CAAC;IAED,OAAO,wBAAwB,CAAC,QAAQ,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,QAAQ,GAAG,YAAY,CAAC,eAAe,CAAC,IAAI,CAAC,EAAE;QACnD,WAAW,EAAE,EAAE;QACf,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;KACF,CAAC,CAAC;IACH,OAAO,kBAAkB,CAAC,QAAQ,CAAC,CAAC;AACtC,CAAC;AAED,SAAS,wBAAwB,CAAC,QAAkB;IAClD,gGAAgG;IAChG,2FAA2F;IAC3F,OAAO,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;AAClC,CAAC"}
|
|
@@ -1,36 +1,39 @@
|
|
|
1
|
-
import { compact } from
|
|
1
|
+
import { compact } from "es-toolkit";
|
|
2
2
|
const WORD_SPLIT_PATTERN = /\s+/g;
|
|
3
3
|
export function normalizeForPrompt(input) {
|
|
4
|
-
return input
|
|
5
|
-
.replaceAll(
|
|
6
|
-
.replaceAll(/[\t ]+/g,
|
|
7
|
-
.replaceAll(/\s*\n\s*/g,
|
|
8
|
-
.replaceAll(/\n{3,}/g,
|
|
4
|
+
return stripInvisibleUnicode(input)
|
|
5
|
+
.replaceAll("\u00A0", " ")
|
|
6
|
+
.replaceAll(/[\t ]+/g, " ")
|
|
7
|
+
.replaceAll(/\s*\n\s*/g, "\n")
|
|
8
|
+
.replaceAll(/\n{3,}/g, "\n\n")
|
|
9
9
|
.trim();
|
|
10
10
|
}
|
|
11
11
|
export function normalizeWhitespace(input) {
|
|
12
|
-
return input
|
|
13
|
-
.replaceAll(
|
|
14
|
-
.replaceAll(/[\t ]+/g,
|
|
15
|
-
.replaceAll(/\s*\n\s*/g,
|
|
12
|
+
return stripInvisibleUnicode(input)
|
|
13
|
+
.replaceAll("\u00A0", " ")
|
|
14
|
+
.replaceAll(/[\t ]+/g, " ")
|
|
15
|
+
.replaceAll(/\s*\n\s*/g, "\n")
|
|
16
16
|
.trim();
|
|
17
17
|
}
|
|
18
18
|
export function decodeHtmlEntities(input) {
|
|
19
19
|
return input
|
|
20
|
-
.replaceAll(
|
|
21
|
-
.replaceAll(
|
|
22
|
-
.replaceAll(
|
|
23
|
-
.replaceAll(
|
|
24
|
-
.replaceAll(
|
|
25
|
-
.replaceAll(
|
|
26
|
-
.replaceAll(
|
|
27
|
-
.replaceAll(
|
|
20
|
+
.replaceAll("&amp;", "&")
|
|
21
|
+
.replaceAll("&lt;", "<")
|
|
22
|
+
.replaceAll("&gt;", ">")
|
|
23
|
+
.replaceAll("&quot;", '"')
|
|
24
|
+
.replaceAll("&#39;", "'")
|
|
25
|
+
.replaceAll("&#x27;", "'")
|
|
26
|
+
.replaceAll("&#x2F;", "/")
|
|
27
|
+
.replaceAll("&nbsp;", " ");
|
|
28
|
+
}
|
|
29
|
+
export function stripInvisibleUnicode(input) {
|
|
30
|
+
return input.replaceAll(/[\u200B-\u200F\u202A-\u202E\u2060-\u2069\uFEFF\u{E0000}-\u{E007F}]/gu, "");
|
|
28
31
|
}
|
|
29
32
|
export function normalizeCandidate(value) {
|
|
30
33
|
if (!value) {
|
|
31
34
|
return null;
|
|
32
35
|
}
|
|
33
|
-
const trimmed = value.replaceAll(/\s+/g,
|
|
36
|
+
const trimmed = value.replaceAll(/\s+/g, " ").trim();
|
|
34
37
|
return trimmed.length > 0 ? trimmed : null;
|
|
35
38
|
}
|
|
36
39
|
export function clipAtSentenceBoundary(input, maxLength) {
|
|
@@ -38,7 +41,7 @@ export function clipAtSentenceBoundary(input, maxLength) {
|
|
|
38
41
|
return input;
|
|
39
42
|
}
|
|
40
43
|
const slice = input.slice(0, maxLength);
|
|
41
|
-
const lastSentenceBreak = Math.max(slice.lastIndexOf(
|
|
44
|
+
const lastSentenceBreak = Math.max(slice.lastIndexOf(". "), slice.lastIndexOf("! "), slice.lastIndexOf("? "), slice.lastIndexOf("\n\n"));
|
|
42
45
|
if (lastSentenceBreak > maxLength * 0.5) {
|
|
43
46
|
return slice.slice(0, lastSentenceBreak + 1);
|
|
44
47
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cleaner.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/cleaner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,
|
|
1
|
+
{"version":3,"file":"cleaner.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/cleaner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAErC,MAAM,kBAAkB,GAAG,MAAM,CAAC;AASlC,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,OAAO,qBAAqB,CAAC,KAAK,CAAC;SAChC,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,UAAU,CAAC,SAAS,EAAE,MAAM,CAAC;SAC7B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,KAAa;IAC/C,OAAO,qBAAqB,CAAC,KAAK,CAAC;SAChC,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,IAAI,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC9C,OAAO,KAAK;SACT,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;AAC/B,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,KAAa;IACjD,OAAO,KAAK,CAAC,UAAU,CACrB,sEAAsE,EACtE,EAAE,CACH,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,KAAgC;IACjE,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,OAAO,IAAI,CAAC;IACd,CAAC;IACD,MAAM,OAAO,GAAG,KAAK,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IACrD,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;AAC7C,CAAC;AAED,MAAM,UAAU,sBAAsB,CAAC,KAAa,EAAE,SAAiB;IACrE,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IACxC,MAAM,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAChC,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAC1B,CAAC;IACF,IAAI,iBAAiB,GAAG,SAAS,GAAG,GAAG,EAAE,CAAC;QACxC,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAiB,GAAG,CAAC,CAAC,CAAC;IAC/C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,WAAmB,EACnB,aAAqB;IAErB,MAAM,eAAe,GAAG,WAAW,CAAC,MAAM,CA
AC;IAC3C,MAAM,SAAS,GAAG,eAAe,GAAG,aAAa,CAAC;IAClD,MAAM,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,sBAAsB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC;IAC7F,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAC/B,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7F,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,SAAS,EAAE,CAAC;AAC5D,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"constants.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/constants.ts"],"names":[],"mappings":"AAAA,MAAM,CAAC,MAAM,yBAAyB,GACpC,iIAAiI,
|
|
1
|
+
{"version":3,"file":"constants.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/constants.ts"],"names":[],"mappings":"AAAA,MAAM,CAAC,MAAM,yBAAyB,GACpC,iIAAiI,CAAC;AACpI,MAAM,CAAC,MAAM,2BAA2B,GAAG,GAAG,CAAC;AAC/C,MAAM,CAAC,MAAM,kCAAkC,GAAG,GAAG,CAAC;AACtD,MAAM,CAAC,MAAM,mCAAmC,GAAG,GAAG,CAAC;AACvD,MAAM,CAAC,MAAM,8BAA8B,GAAG,GAAG,CAAC;AAClD,MAAM,CAAC,MAAM,yCAAyC,GAAG,IAAI,CAAC"}
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
import { isYouTubeUrl } from
|
|
2
|
-
import { appendNote } from
|
|
1
|
+
import { isYouTubeUrl } from "../../url.js";
|
|
2
|
+
import { appendNote } from "./utils.js";
|
|
3
3
|
const REQUEST_HEADERS = {
|
|
4
|
-
|
|
5
|
-
Accept:
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
Pragma:
|
|
4
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
|
5
|
+
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
|
6
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
7
|
+
"Cache-Control": "no-cache",
|
|
8
|
+
Pragma: "no-cache",
|
|
9
9
|
};
|
|
10
10
|
const DEFAULT_REQUEST_TIMEOUT_MS = 5000;
|
|
11
11
|
export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress, } = {}) {
|
|
12
|
-
onProgress?.({ kind:
|
|
12
|
+
onProgress?.({ kind: "fetch-html-start", url });
|
|
13
13
|
const controller = new AbortController();
|
|
14
|
-
const effectiveTimeoutMs = typeof timeoutMs ===
|
|
14
|
+
const effectiveTimeoutMs = typeof timeoutMs === "number" && Number.isFinite(timeoutMs)
|
|
15
15
|
? timeoutMs
|
|
16
16
|
: DEFAULT_REQUEST_TIMEOUT_MS;
|
|
17
17
|
const timeout = setTimeout(() => {
|
|
@@ -20,26 +20,26 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
|
|
|
20
20
|
try {
|
|
21
21
|
const response = await fetchImpl(url, {
|
|
22
22
|
headers: REQUEST_HEADERS,
|
|
23
|
-
redirect:
|
|
23
|
+
redirect: "follow",
|
|
24
24
|
signal: controller.signal,
|
|
25
25
|
});
|
|
26
26
|
if (!response.ok) {
|
|
27
27
|
throw new Error(`Failed to fetch HTML document (status ${response.status})`);
|
|
28
28
|
}
|
|
29
29
|
const finalUrl = response.url?.trim() || url;
|
|
30
|
-
const contentType = response.headers.get(
|
|
30
|
+
const contentType = response.headers.get("content-type")?.toLowerCase() ?? null;
|
|
31
31
|
if (contentType &&
|
|
32
|
-
!contentType.includes(
|
|
33
|
-
!contentType.includes(
|
|
34
|
-
!contentType.includes(
|
|
35
|
-
!contentType.includes(
|
|
36
|
-
!contentType.includes(
|
|
37
|
-
!contentType.includes(
|
|
38
|
-
!contentType.startsWith(
|
|
32
|
+
!contentType.includes("text/html") &&
|
|
33
|
+
!contentType.includes("application/xhtml+xml") &&
|
|
34
|
+
!contentType.includes("application/xml") &&
|
|
35
|
+
!contentType.includes("text/xml") &&
|
|
36
|
+
!contentType.includes("application/rss+xml") &&
|
|
37
|
+
!contentType.includes("application/atom+xml") &&
|
|
38
|
+
!contentType.startsWith("text/")) {
|
|
39
39
|
throw new Error(`Unsupported content-type for HTML document fetch: ${contentType}`);
|
|
40
40
|
}
|
|
41
41
|
const totalBytes = (() => {
|
|
42
|
-
const raw = response.headers.get(
|
|
42
|
+
const raw = response.headers.get("content-length");
|
|
43
43
|
if (!raw)
|
|
44
44
|
return null;
|
|
45
45
|
const parsed = Number(raw);
|
|
@@ -49,14 +49,14 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
|
|
|
49
49
|
if (!body) {
|
|
50
50
|
const text = await response.text();
|
|
51
51
|
const bytes = new TextEncoder().encode(text).byteLength;
|
|
52
|
-
onProgress?.({ kind:
|
|
52
|
+
onProgress?.({ kind: "fetch-html-done", url, downloadedBytes: bytes, totalBytes });
|
|
53
53
|
return { html: text, finalUrl };
|
|
54
54
|
}
|
|
55
55
|
const reader = body.getReader();
|
|
56
56
|
const decoder = new TextDecoder();
|
|
57
57
|
let downloadedBytes = 0;
|
|
58
|
-
let text =
|
|
59
|
-
onProgress?.({ kind:
|
|
58
|
+
let text = "";
|
|
59
|
+
onProgress?.({ kind: "fetch-html-progress", url, downloadedBytes: 0, totalBytes });
|
|
60
60
|
while (true) {
|
|
61
61
|
const { value, done } = await reader.read();
|
|
62
62
|
if (done)
|
|
@@ -65,15 +65,15 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
|
|
|
65
65
|
continue;
|
|
66
66
|
downloadedBytes += value.byteLength;
|
|
67
67
|
text += decoder.decode(value, { stream: true });
|
|
68
|
-
onProgress?.({ kind:
|
|
68
|
+
onProgress?.({ kind: "fetch-html-progress", url, downloadedBytes, totalBytes });
|
|
69
69
|
}
|
|
70
70
|
text += decoder.decode();
|
|
71
|
-
onProgress?.({ kind:
|
|
71
|
+
onProgress?.({ kind: "fetch-html-done", url, downloadedBytes, totalBytes });
|
|
72
72
|
return { html: text, finalUrl };
|
|
73
73
|
}
|
|
74
74
|
catch (error) {
|
|
75
|
-
if (error instanceof DOMException && error.name ===
|
|
76
|
-
throw new Error(
|
|
75
|
+
if (error instanceof DOMException && error.name === "AbortError") {
|
|
76
|
+
throw new Error("Fetching HTML document timed out");
|
|
77
77
|
}
|
|
78
78
|
throw error;
|
|
79
79
|
}
|
|
@@ -83,42 +83,48 @@ export async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress,
|
|
|
83
83
|
}
|
|
84
84
|
export async function fetchWithFirecrawl(url, scrapeWithFirecrawl, options = {}) {
|
|
85
85
|
const timeoutMs = options.timeoutMs;
|
|
86
|
-
const cacheMode = options.cacheMode ??
|
|
87
|
-
const onProgress = typeof options.onProgress ===
|
|
88
|
-
const reason = typeof options.reason ===
|
|
86
|
+
const cacheMode = options.cacheMode ?? "default";
|
|
87
|
+
const onProgress = typeof options.onProgress === "function" ? options.onProgress : null;
|
|
88
|
+
const reason = typeof options.reason === "string" ? options.reason : null;
|
|
89
89
|
const diagnostics = {
|
|
90
90
|
attempted: false,
|
|
91
91
|
used: false,
|
|
92
92
|
cacheMode,
|
|
93
|
-
cacheStatus: cacheMode ===
|
|
93
|
+
cacheStatus: cacheMode === "bypass" ? "bypassed" : "unknown",
|
|
94
94
|
notes: null,
|
|
95
95
|
};
|
|
96
96
|
if (isYouTubeUrl(url)) {
|
|
97
|
-
diagnostics.notes = appendNote(diagnostics.notes,
|
|
97
|
+
diagnostics.notes = appendNote(diagnostics.notes, "Skipped Firecrawl for YouTube URL");
|
|
98
98
|
return { payload: null, diagnostics };
|
|
99
99
|
}
|
|
100
100
|
if (!scrapeWithFirecrawl) {
|
|
101
|
-
diagnostics.notes = appendNote(diagnostics.notes,
|
|
101
|
+
diagnostics.notes = appendNote(diagnostics.notes, "Firecrawl is not configured");
|
|
102
102
|
return { payload: null, diagnostics };
|
|
103
103
|
}
|
|
104
104
|
diagnostics.attempted = true;
|
|
105
|
-
onProgress?.({ kind:
|
|
105
|
+
onProgress?.({ kind: "firecrawl-start", url, reason: reason ?? "firecrawl" });
|
|
106
106
|
try {
|
|
107
107
|
const payload = await scrapeWithFirecrawl(url, { timeoutMs, cacheMode });
|
|
108
108
|
if (!payload) {
|
|
109
|
-
diagnostics.notes = appendNote(diagnostics.notes,
|
|
110
|
-
onProgress?.({
|
|
109
|
+
diagnostics.notes = appendNote(diagnostics.notes, "Firecrawl returned no content payload");
|
|
110
|
+
onProgress?.({
|
|
111
|
+
kind: "firecrawl-done",
|
|
112
|
+
url,
|
|
113
|
+
ok: false,
|
|
114
|
+
markdownBytes: null,
|
|
115
|
+
htmlBytes: null,
|
|
116
|
+
});
|
|
111
117
|
return { payload: null, diagnostics };
|
|
112
118
|
}
|
|
113
119
|
const encoder = new TextEncoder();
|
|
114
|
-
const markdownBytes = typeof payload.markdown ===
|
|
115
|
-
const htmlBytes = typeof payload.html ===
|
|
116
|
-
onProgress?.({ kind:
|
|
120
|
+
const markdownBytes = typeof payload.markdown === "string" ? encoder.encode(payload.markdown).byteLength : null;
|
|
121
|
+
const htmlBytes = typeof payload.html === "string" ? encoder.encode(payload.html).byteLength : null;
|
|
122
|
+
onProgress?.({ kind: "firecrawl-done", url, ok: true, markdownBytes, htmlBytes });
|
|
117
123
|
return { payload, diagnostics };
|
|
118
124
|
}
|
|
119
125
|
catch (error) {
|
|
120
|
-
diagnostics.notes = appendNote(diagnostics.notes, `Firecrawl error: ${error instanceof Error ? error.message :
|
|
121
|
-
onProgress?.({ kind:
|
|
126
|
+
diagnostics.notes = appendNote(diagnostics.notes, `Firecrawl error: ${error instanceof Error ? error.message : "unknown error"}`);
|
|
127
|
+
onProgress?.({ kind: "firecrawl-done", url, ok: false, markdownBytes: null, htmlBytes: null });
|
|
122
128
|
return { payload: null, diagnostics };
|
|
123
129
|
}
|
|
124
130
|
}
|