@steipete/summarize-core 0.7.1 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/content/cache/types.js +2 -0
- package/dist/esm/content/cache/types.js.map +1 -0
- package/dist/esm/content/index.js +1 -0
- package/dist/esm/content/index.js.map +1 -1
- package/dist/esm/content/link-preview/client.js +3 -0
- package/dist/esm/content/link-preview/client.js.map +1 -1
- package/dist/esm/content/link-preview/content/fetcher.js +1 -1
- package/dist/esm/content/link-preview/content/fetcher.js.map +1 -1
- package/dist/esm/content/link-preview/content/html.js +1 -1
- package/dist/esm/content/link-preview/content/html.js.map +1 -1
- package/dist/esm/content/link-preview/content/index.js +22 -3
- package/dist/esm/content/link-preview/content/index.js.map +1 -1
- package/dist/esm/content/link-preview/deps.js.map +1 -1
- package/dist/esm/content/transcript/index.js +1 -0
- package/dist/esm/content/transcript/index.js.map +1 -1
- package/dist/esm/content/transcript/providers/generic.js +84 -4
- package/dist/esm/content/transcript/providers/generic.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast.js +1 -0
- package/dist/esm/content/transcript/providers/podcast.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/captions.js +35 -14
- package/dist/esm/content/transcript/providers/youtube/captions.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/yt-dlp.js +84 -12
- package/dist/esm/content/transcript/providers/youtube/yt-dlp.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube.js +38 -2
- package/dist/esm/content/transcript/providers/youtube.js.map +1 -1
- package/dist/esm/content/transcript/utils.js +25 -69
- package/dist/esm/content/transcript/utils.js.map +1 -1
- package/dist/esm/content/url.js +76 -0
- package/dist/esm/content/url.js.map +1 -0
- package/dist/esm/prompts/cli.js +25 -5
- package/dist/esm/prompts/cli.js.map +1 -1
- package/dist/esm/prompts/file.js +51 -12
- package/dist/esm/prompts/file.js.map +1 -1
- package/dist/esm/prompts/format.js +26 -0
- package/dist/esm/prompts/format.js.map +1 -0
- package/dist/esm/prompts/link-summary.js +51 -22
- package/dist/esm/prompts/link-summary.js.map +1 -1
- package/dist/types/content/cache/types.d.ts +25 -0
- package/dist/types/content/index.d.ts +3 -1
- package/dist/types/content/link-preview/client.d.ts +6 -1
- package/dist/types/content/link-preview/content/types.d.ts +1 -1
- package/dist/types/content/link-preview/deps.d.ts +11 -20
- package/dist/types/content/transcript/cache.d.ts +1 -1
- package/dist/types/content/transcript/providers/generic.d.ts +1 -1
- package/dist/types/content/transcript/providers/youtube/captions.d.ts +3 -1
- package/dist/types/content/transcript/providers/youtube/yt-dlp.d.ts +3 -1
- package/dist/types/content/transcript/types.d.ts +2 -1
- package/dist/types/content/transcript/utils.d.ts +1 -3
- package/dist/types/content/url.d.ts +8 -0
- package/dist/types/prompts/cli.d.ts +4 -1
- package/dist/types/prompts/file.d.ts +9 -2
- package/dist/types/prompts/format.d.ts +14 -0
- package/dist/types/prompts/link-summary.d.ts +4 -1
- package/package.json +10 -10
- package/dist/cjs/content/index.js +0 -14
- package/dist/cjs/content/index.js.map +0 -1
- package/dist/cjs/content/link-preview/client.js +0 -31
- package/dist/cjs/content/link-preview/client.js.map +0 -1
- package/dist/cjs/content/link-preview/content/article.js +0 -164
- package/dist/cjs/content/link-preview/content/article.js.map +0 -1
- package/dist/cjs/content/link-preview/content/cleaner.js +0 -63
- package/dist/cjs/content/link-preview/content/cleaner.js.map +0 -1
- package/dist/cjs/content/link-preview/content/constants.js +0 -10
- package/dist/cjs/content/link-preview/content/constants.js.map +0 -1
- package/dist/cjs/content/link-preview/content/fetcher.js +0 -128
- package/dist/cjs/content/link-preview/content/fetcher.js.map +0 -1
- package/dist/cjs/content/link-preview/content/firecrawl.js +0 -90
- package/dist/cjs/content/link-preview/content/firecrawl.js.map +0 -1
- package/dist/cjs/content/link-preview/content/html.js +0 -165
- package/dist/cjs/content/link-preview/content/html.js.map +0 -1
- package/dist/cjs/content/link-preview/content/index.js +0 -348
- package/dist/cjs/content/link-preview/content/index.js.map +0 -1
- package/dist/cjs/content/link-preview/content/jsonld.js +0 -80
- package/dist/cjs/content/link-preview/content/jsonld.js.map +0 -1
- package/dist/cjs/content/link-preview/content/parsers.js +0 -81
- package/dist/cjs/content/link-preview/content/parsers.js.map +0 -1
- package/dist/cjs/content/link-preview/content/podcast-utils.js +0 -85
- package/dist/cjs/content/link-preview/content/podcast-utils.js.map +0 -1
- package/dist/cjs/content/link-preview/content/readability.js +0 -90
- package/dist/cjs/content/link-preview/content/readability.js.map +0 -1
- package/dist/cjs/content/link-preview/content/twitter-utils.js +0 -74
- package/dist/cjs/content/link-preview/content/twitter-utils.js.map +0 -1
- package/dist/cjs/content/link-preview/content/types.js +0 -7
- package/dist/cjs/content/link-preview/content/types.js.map +0 -1
- package/dist/cjs/content/link-preview/content/utils.js +0 -177
- package/dist/cjs/content/link-preview/content/utils.js.map +0 -1
- package/dist/cjs/content/link-preview/content/video.js +0 -99
- package/dist/cjs/content/link-preview/content/video.js.map +0 -1
- package/dist/cjs/content/link-preview/content/youtube.js +0 -85
- package/dist/cjs/content/link-preview/content/youtube.js.map +0 -1
- package/dist/cjs/content/link-preview/deps.js +0 -23
- package/dist/cjs/content/link-preview/deps.js.map +0 -1
- package/dist/cjs/content/link-preview/fetch-with-timeout.js +0 -38
- package/dist/cjs/content/link-preview/fetch-with-timeout.js.map +0 -1
- package/dist/cjs/content/link-preview/types.js +0 -5
- package/dist/cjs/content/link-preview/types.js.map +0 -1
- package/dist/cjs/content/transcript/cache.js +0 -85
- package/dist/cjs/content/transcript/cache.js.map +0 -1
- package/dist/cjs/content/transcript/index.js +0 -134
- package/dist/cjs/content/transcript/index.js.map +0 -1
- package/dist/cjs/content/transcript/normalize.js +0 -49
- package/dist/cjs/content/transcript/normalize.js.map +0 -1
- package/dist/cjs/content/transcript/providers/generic.js +0 -16
- package/dist/cjs/content/transcript/providers/generic.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/apple-flow.js +0 -226
- package/dist/cjs/content/transcript/providers/podcast/apple-flow.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/apple.js +0 -43
- package/dist/cjs/content/transcript/providers/podcast/apple.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/constants.js +0 -11
- package/dist/cjs/content/transcript/providers/podcast/constants.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/flow-context.js +0 -3
- package/dist/cjs/content/transcript/providers/podcast/flow-context.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/itunes.js +0 -139
- package/dist/cjs/content/transcript/providers/podcast/itunes.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/json.js +0 -43
- package/dist/cjs/content/transcript/providers/podcast/json.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/media.js +0 -355
- package/dist/cjs/content/transcript/providers/podcast/media.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/results.js +0 -32
- package/dist/cjs/content/transcript/providers/podcast/results.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/rss.js +0 -262
- package/dist/cjs/content/transcript/providers/podcast/rss.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/spotify-flow.js +0 -221
- package/dist/cjs/content/transcript/providers/podcast/spotify-flow.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast/spotify.js +0 -119
- package/dist/cjs/content/transcript/providers/podcast/spotify.js.map +0 -1
- package/dist/cjs/content/transcript/providers/podcast.js +0 -260
- package/dist/cjs/content/transcript/providers/podcast.js.map +0 -1
- package/dist/cjs/content/transcript/providers/youtube/api.js +0 -264
- package/dist/cjs/content/transcript/providers/youtube/api.js.map +0 -1
- package/dist/cjs/content/transcript/providers/youtube/apify.js +0 -59
- package/dist/cjs/content/transcript/providers/youtube/apify.js.map +0 -1
- package/dist/cjs/content/transcript/providers/youtube/captions.js +0 -413
- package/dist/cjs/content/transcript/providers/youtube/captions.js.map +0 -1
- package/dist/cjs/content/transcript/providers/youtube/yt-dlp.js +0 -170
- package/dist/cjs/content/transcript/providers/youtube/yt-dlp.js.map +0 -1
- package/dist/cjs/content/transcript/providers/youtube.js +0 -178
- package/dist/cjs/content/transcript/providers/youtube.js.map +0 -1
- package/dist/cjs/content/transcript/types.js +0 -3
- package/dist/cjs/content/transcript/types.js.map +0 -1
- package/dist/cjs/content/transcript/utils.js +0 -303
- package/dist/cjs/content/transcript/utils.js.map +0 -1
- package/dist/cjs/index.js +0 -22
- package/dist/cjs/index.js.map +0 -1
- package/dist/cjs/language.js +0 -132
- package/dist/cjs/language.js.map +0 -1
- package/dist/cjs/package.json +0 -3
- package/dist/cjs/prompts/cli.js +0 -23
- package/dist/cjs/prompts/cli.js.map +0 -1
- package/dist/cjs/prompts/file.js +0 -52
- package/dist/cjs/prompts/file.js.map +0 -1
- package/dist/cjs/prompts/index.js +0 -14
- package/dist/cjs/prompts/index.js.map +0 -1
- package/dist/cjs/prompts/link-summary.js +0 -122
- package/dist/cjs/prompts/link-summary.js.map +0 -1
- package/dist/cjs/shared/contracts.js +0 -5
- package/dist/cjs/shared/contracts.js.map +0 -1
- package/dist/cjs/transcription/whisper/constants.js +0 -11
- package/dist/cjs/transcription/whisper/constants.js.map +0 -1
- package/dist/cjs/transcription/whisper/core.js +0 -307
- package/dist/cjs/transcription/whisper/core.js.map +0 -1
- package/dist/cjs/transcription/whisper/fal.js +0 -44
- package/dist/cjs/transcription/whisper/fal.js.map +0 -1
- package/dist/cjs/transcription/whisper/ffmpeg.js +0 -187
- package/dist/cjs/transcription/whisper/ffmpeg.js.map +0 -1
- package/dist/cjs/transcription/whisper/openai.js +0 -51
- package/dist/cjs/transcription/whisper/openai.js.map +0 -1
- package/dist/cjs/transcription/whisper/types.js +0 -3
- package/dist/cjs/transcription/whisper/types.js.map +0 -1
- package/dist/cjs/transcription/whisper/utils.js +0 -70
- package/dist/cjs/transcription/whisper/utils.js.map +0 -1
- package/dist/cjs/transcription/whisper/whisper-cpp.js +0 -232
- package/dist/cjs/transcription/whisper/whisper-cpp.js.map +0 -1
- package/dist/cjs/transcription/whisper.js +0 -15
- package/dist/cjs/transcription/whisper.js.map +0 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@steipete/summarize-core",
|
|
3
|
-
"version": "0.7.1",
|
|
3
|
+
"version": "0.8.1",
|
|
4
4
|
"description": "Summarize core library (content extraction + prompts).",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/esm/index.js",
|
|
@@ -9,23 +9,23 @@
|
|
|
9
9
|
"exports": {
|
|
10
10
|
".": {
|
|
11
11
|
"types": "./dist/types/index.d.ts",
|
|
12
|
-
"import": "./dist/esm/index.js",
|
|
13
|
-
"require": "./dist/cjs/index.js"
|
|
12
|
+
"import": "./dist/esm/index.js"
|
|
14
13
|
},
|
|
15
14
|
"./content": {
|
|
16
15
|
"types": "./dist/types/content/index.d.ts",
|
|
17
|
-
"import": "./dist/esm/content/index.js",
|
|
18
|
-
"require": "./dist/cjs/content/index.js"
|
|
16
|
+
"import": "./dist/esm/content/index.js"
|
|
17
|
+
},
|
|
18
|
+
"./content/url": {
|
|
19
|
+
"types": "./dist/types/content/url.d.ts",
|
|
20
|
+
"import": "./dist/esm/content/url.js"
|
|
19
21
|
},
|
|
20
22
|
"./prompts": {
|
|
21
23
|
"types": "./dist/types/prompts/index.d.ts",
|
|
22
|
-
"import": "./dist/esm/prompts/index.js",
|
|
23
|
-
"require": "./dist/cjs/prompts/index.js"
|
|
24
|
+
"import": "./dist/esm/prompts/index.js"
|
|
24
25
|
},
|
|
25
26
|
"./language": {
|
|
26
27
|
"types": "./dist/types/language.d.ts",
|
|
27
|
-
"import": "./dist/esm/language.js",
|
|
28
|
-
"require": "./dist/cjs/language.js"
|
|
28
|
+
"import": "./dist/esm/language.js"
|
|
29
29
|
}
|
|
30
30
|
},
|
|
31
31
|
"files": [
|
|
@@ -51,7 +51,7 @@
|
|
|
51
51
|
},
|
|
52
52
|
"scripts": {
|
|
53
53
|
"clean": "rm -rf dist",
|
|
54
|
-
"build": "pnpm clean && tsc -p tsconfig.build.json
|
|
54
|
+
"build": "pnpm clean && tsc -p tsconfig.build.json",
|
|
55
55
|
"typecheck": "tsc -p tsconfig.build.json --noEmit"
|
|
56
56
|
}
|
|
57
57
|
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.CACHE_MODES = exports.ProgressKind = exports.DEFAULT_TIMEOUT_MS = exports.DEFAULT_MAX_CONTENT_CHARACTERS = exports.DEFAULT_CACHE_MODE = exports.createLinkPreviewClient = void 0;
|
|
4
|
-
var client_js_1 = require("./link-preview/client.js");
|
|
5
|
-
Object.defineProperty(exports, "createLinkPreviewClient", { enumerable: true, get: function () { return client_js_1.createLinkPreviewClient; } });
|
|
6
|
-
var types_js_1 = require("./link-preview/content/types.js");
|
|
7
|
-
Object.defineProperty(exports, "DEFAULT_CACHE_MODE", { enumerable: true, get: function () { return types_js_1.DEFAULT_CACHE_MODE; } });
|
|
8
|
-
Object.defineProperty(exports, "DEFAULT_MAX_CONTENT_CHARACTERS", { enumerable: true, get: function () { return types_js_1.DEFAULT_MAX_CONTENT_CHARACTERS; } });
|
|
9
|
-
Object.defineProperty(exports, "DEFAULT_TIMEOUT_MS", { enumerable: true, get: function () { return types_js_1.DEFAULT_TIMEOUT_MS; } });
|
|
10
|
-
var deps_js_1 = require("./link-preview/deps.js");
|
|
11
|
-
Object.defineProperty(exports, "ProgressKind", { enumerable: true, get: function () { return deps_js_1.ProgressKind; } });
|
|
12
|
-
var types_js_2 = require("./link-preview/types.js");
|
|
13
|
-
Object.defineProperty(exports, "CACHE_MODES", { enumerable: true, get: function () { return types_js_2.CACHE_MODES; } });
|
|
14
|
-
//# sourceMappingURL=index.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/content/index.ts"],"names":[],"mappings":";;;AAAA,sDAIiC;AAH/B,oHAAA,uBAAuB,OAAA;AAIzB,4DAMwC;AALtC,8GAAA,kBAAkB,OAAA;AAClB,0HAAA,8BAA8B,OAAA;AAC9B,8GAAA,kBAAkB,OAAA;AAapB,kDAAqD;AAA5C,uGAAA,YAAY,OAAA;AACrB,oDAKgC;AAJ9B,uGAAA,WAAW,OAAA"}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.createLinkPreviewClient = createLinkPreviewClient;
|
|
4
|
-
const index_js_1 = require("./content/index.js");
|
|
5
|
-
function createLinkPreviewClient(options = {}) {
|
|
6
|
-
const fetchImpl = options.fetch ?? ((...args) => globalThis.fetch(...args));
|
|
7
|
-
const scrape = options.scrapeWithFirecrawl ?? null;
|
|
8
|
-
const apifyApiToken = typeof options.apifyApiToken === 'string' ? options.apifyApiToken : null;
|
|
9
|
-
const ytDlpPath = typeof options.ytDlpPath === 'string' ? options.ytDlpPath : null;
|
|
10
|
-
const falApiKey = typeof options.falApiKey === 'string' ? options.falApiKey : null;
|
|
11
|
-
const openaiApiKey = typeof options.openaiApiKey === 'string' ? options.openaiApiKey : null;
|
|
12
|
-
const convertHtmlToMarkdown = options.convertHtmlToMarkdown ?? null;
|
|
13
|
-
const transcriptCache = options.transcriptCache ?? null;
|
|
14
|
-
const readTweetWithBird = typeof options.readTweetWithBird === 'function' ? options.readTweetWithBird : null;
|
|
15
|
-
const onProgress = typeof options.onProgress === 'function' ? options.onProgress : null;
|
|
16
|
-
return {
|
|
17
|
-
fetchLinkContent: (url, contentOptions) => (0, index_js_1.fetchLinkContent)(url, contentOptions, {
|
|
18
|
-
fetch: fetchImpl,
|
|
19
|
-
scrapeWithFirecrawl: scrape,
|
|
20
|
-
apifyApiToken,
|
|
21
|
-
ytDlpPath,
|
|
22
|
-
falApiKey,
|
|
23
|
-
openaiApiKey,
|
|
24
|
-
convertHtmlToMarkdown,
|
|
25
|
-
transcriptCache,
|
|
26
|
-
readTweetWithBird,
|
|
27
|
-
onProgress,
|
|
28
|
-
}),
|
|
29
|
-
};
|
|
30
|
-
}
|
|
31
|
-
//# sourceMappingURL=client.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"client.js","sourceRoot":"","sources":["../../../../src/content/link-preview/client.ts"],"names":[],"mappings":";;AA2BA,0DA6BC;AAxDD,iDAAqD;AA2BrD,SAAgB,uBAAuB,CAAC,UAAoC,EAAE;IAC5E,MAAM,SAAS,GACb,OAAO,CAAC,KAAK,IAAI,CAAC,CAAC,GAAG,IAA8B,EAAE,EAAE,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC,CAAA;IACrF,MAAM,MAAM,GAA+B,OAAO,CAAC,mBAAmB,IAAI,IAAI,CAAA;IAC9E,MAAM,aAAa,GAAG,OAAO,OAAO,CAAC,aAAa,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAA;IAC9F,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAA;IAClF,MAAM,SAAS,GAAG,OAAO,OAAO,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAA;IAClF,MAAM,YAAY,GAAG,OAAO,OAAO,CAAC,YAAY,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAA;IAC3F,MAAM,qBAAqB,GAAiC,OAAO,CAAC,qBAAqB,IAAI,IAAI,CAAA;IACjG,MAAM,eAAe,GAA2B,OAAO,CAAC,eAAe,IAAI,IAAI,CAAA;IAC/E,MAAM,iBAAiB,GACrB,OAAO,OAAO,CAAC,iBAAiB,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,iBAAiB,CAAC,CAAC,CAAC,IAAI,CAAA;IACpF,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAA;IAEvF,OAAO;QACL,gBAAgB,EAAE,CAAC,GAAW,EAAE,cAAwC,EAAE,EAAE,CAC1E,IAAA,2BAAgB,EAAC,GAAG,EAAE,cAAc,EAAE;YACpC,KAAK,EAAE,SAAS;YAChB,mBAAmB,EAAE,MAAM;YAC3B,aAAa;YACb,SAAS;YACT,SAAS;YACT,YAAY;YACZ,qBAAqB;YACrB,eAAe;YACf,iBAAiB;YACjB,UAAU;SACX,CAAC;KACL,CAAA;AACH,CAAC"}
|
|
@@ -1,164 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
-
};
|
|
5
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.sanitizeHtmlForMarkdownConversion = sanitizeHtmlForMarkdownConversion;
|
|
7
|
-
exports.extractArticleContent = extractArticleContent;
|
|
8
|
-
exports.collectSegmentsFromHtml = collectSegmentsFromHtml;
|
|
9
|
-
exports.extractPlainText = extractPlainText;
|
|
10
|
-
const cheerio_1 = require("cheerio");
|
|
11
|
-
const sanitize_html_1 = __importDefault(require("sanitize-html"));
|
|
12
|
-
const cleaner_js_1 = require("./cleaner.js");
|
|
13
|
-
const MIN_SEGMENT_LENGTH = 30;
|
|
14
|
-
function sanitizeHtmlForMarkdownConversion(html) {
|
|
15
|
-
return (0, sanitize_html_1.default)(html, {
|
|
16
|
-
allowedTags: [
|
|
17
|
-
'article',
|
|
18
|
-
'section',
|
|
19
|
-
'div',
|
|
20
|
-
'p',
|
|
21
|
-
'h1',
|
|
22
|
-
'h2',
|
|
23
|
-
'h3',
|
|
24
|
-
'h4',
|
|
25
|
-
'h5',
|
|
26
|
-
'h6',
|
|
27
|
-
'ol',
|
|
28
|
-
'ul',
|
|
29
|
-
'li',
|
|
30
|
-
'blockquote',
|
|
31
|
-
'pre',
|
|
32
|
-
'code',
|
|
33
|
-
'span',
|
|
34
|
-
'strong',
|
|
35
|
-
'em',
|
|
36
|
-
'br',
|
|
37
|
-
'a',
|
|
38
|
-
],
|
|
39
|
-
allowedAttributes: {
|
|
40
|
-
a: ['href'],
|
|
41
|
-
},
|
|
42
|
-
nonTextTags: [
|
|
43
|
-
'style',
|
|
44
|
-
'script',
|
|
45
|
-
'noscript',
|
|
46
|
-
'template',
|
|
47
|
-
'svg',
|
|
48
|
-
'canvas',
|
|
49
|
-
'iframe',
|
|
50
|
-
'object',
|
|
51
|
-
'embed',
|
|
52
|
-
],
|
|
53
|
-
textFilter(text) {
|
|
54
|
-
return (0, cleaner_js_1.decodeHtmlEntities)(text);
|
|
55
|
-
},
|
|
56
|
-
});
|
|
57
|
-
}
|
|
58
|
-
function extractArticleContent(html) {
|
|
59
|
-
const segments = collectSegmentsFromHtml(html);
|
|
60
|
-
if (segments.length > 0) {
|
|
61
|
-
return segments.join('\n');
|
|
62
|
-
}
|
|
63
|
-
const fallback = (0, cleaner_js_1.normalizeWhitespace)(extractPlainText(html));
|
|
64
|
-
return fallback ?? '';
|
|
65
|
-
}
|
|
66
|
-
function collectSegmentsFromHtml(html) {
|
|
67
|
-
const sanitized = (0, sanitize_html_1.default)(html, {
|
|
68
|
-
allowedTags: [
|
|
69
|
-
'article',
|
|
70
|
-
'section',
|
|
71
|
-
'div',
|
|
72
|
-
'p',
|
|
73
|
-
'h1',
|
|
74
|
-
'h2',
|
|
75
|
-
'h3',
|
|
76
|
-
'h4',
|
|
77
|
-
'h5',
|
|
78
|
-
'h6',
|
|
79
|
-
'ol',
|
|
80
|
-
'ul',
|
|
81
|
-
'li',
|
|
82
|
-
'blockquote',
|
|
83
|
-
'pre',
|
|
84
|
-
'code',
|
|
85
|
-
'span',
|
|
86
|
-
'strong',
|
|
87
|
-
'em',
|
|
88
|
-
'br',
|
|
89
|
-
],
|
|
90
|
-
allowedAttributes: {},
|
|
91
|
-
nonTextTags: [
|
|
92
|
-
'style',
|
|
93
|
-
'script',
|
|
94
|
-
'noscript',
|
|
95
|
-
'template',
|
|
96
|
-
'svg',
|
|
97
|
-
'canvas',
|
|
98
|
-
'iframe',
|
|
99
|
-
'object',
|
|
100
|
-
'embed',
|
|
101
|
-
],
|
|
102
|
-
textFilter(text) {
|
|
103
|
-
return (0, cleaner_js_1.decodeHtmlEntities)(text);
|
|
104
|
-
},
|
|
105
|
-
});
|
|
106
|
-
const $ = (0, cheerio_1.load)(sanitized);
|
|
107
|
-
const segments = [];
|
|
108
|
-
$('h1,h2,h3,h4,h5,h6,li,p,blockquote,pre').each((_, element) => {
|
|
109
|
-
if (!('tagName' in element) || typeof element.tagName !== 'string') {
|
|
110
|
-
return;
|
|
111
|
-
}
|
|
112
|
-
const tag = element.tagName.toLowerCase();
|
|
113
|
-
const raw = $(element).text();
|
|
114
|
-
const text = (0, cleaner_js_1.normalizeWhitespace)(raw).replaceAll(/\n+/g, ' ');
|
|
115
|
-
if (!text || text.length === 0) {
|
|
116
|
-
return;
|
|
117
|
-
}
|
|
118
|
-
if (tag.startsWith('h')) {
|
|
119
|
-
if (text.length >= 10) {
|
|
120
|
-
segments.push(text);
|
|
121
|
-
}
|
|
122
|
-
return;
|
|
123
|
-
}
|
|
124
|
-
if (tag === 'li') {
|
|
125
|
-
if (text.length >= 20) {
|
|
126
|
-
segments.push(`• ${text}`);
|
|
127
|
-
}
|
|
128
|
-
return;
|
|
129
|
-
}
|
|
130
|
-
if (text.length < MIN_SEGMENT_LENGTH) {
|
|
131
|
-
return;
|
|
132
|
-
}
|
|
133
|
-
segments.push(text);
|
|
134
|
-
});
|
|
135
|
-
if (segments.length === 0) {
|
|
136
|
-
const fallback = (0, cleaner_js_1.normalizeWhitespace)($('body').text() || sanitized);
|
|
137
|
-
return fallback ? [fallback] : [];
|
|
138
|
-
}
|
|
139
|
-
return mergeConsecutiveSegments(segments);
|
|
140
|
-
}
|
|
141
|
-
function extractPlainText(html) {
|
|
142
|
-
const stripped = (0, sanitize_html_1.default)(html, {
|
|
143
|
-
allowedTags: [],
|
|
144
|
-
allowedAttributes: {},
|
|
145
|
-
nonTextTags: [
|
|
146
|
-
'style',
|
|
147
|
-
'script',
|
|
148
|
-
'noscript',
|
|
149
|
-
'template',
|
|
150
|
-
'svg',
|
|
151
|
-
'canvas',
|
|
152
|
-
'iframe',
|
|
153
|
-
'object',
|
|
154
|
-
'embed',
|
|
155
|
-
],
|
|
156
|
-
});
|
|
157
|
-
return (0, cleaner_js_1.decodeHtmlEntities)(stripped);
|
|
158
|
-
}
|
|
159
|
-
function mergeConsecutiveSegments(segments) {
|
|
160
|
-
// Keep headings as separate segments; merging short segments mostly collapses headings into the
|
|
161
|
-
// previous paragraph ("... Conclusion"), which reads worse than a standalone heading line.
|
|
162
|
-
return segments.filter(Boolean);
|
|
163
|
-
}
|
|
164
|
-
//# sourceMappingURL=article.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"article.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/article.ts"],"names":[],"mappings":";;;;;AAOA,8EA2CC;AAED,sDAOC;AAED,0DAoFC;AAED,4CAiBC;AApKD,qCAA8B;AAC9B,kEAAwC;AAExC,6CAAsE;AAEtE,MAAM,kBAAkB,GAAG,EAAE,CAAA;AAE7B,SAAgB,iCAAiC,CAAC,IAAY;IAC5D,OAAO,IAAA,uBAAY,EAAC,IAAI,EAAE;QACxB,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;YACJ,GAAG;SACJ;QACD,iBAAiB,EAAE;YACjB,CAAC,EAAE,CAAC,MAAM,CAAC;SACZ;QACD,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,IAAA,+BAAkB,EAAC,IAAI,CAAC,CAAA;QACjC,CAAC;KACF,CAAC,CAAA;AACJ,CAAC;AAED,SAAgB,qBAAqB,CAAC,IAAY;IAChD,MAAM,QAAQ,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAA;IAC9C,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,OAAO,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC5B,CAAC;IACD,MAAM,QAAQ,GAAG,IAAA,gCAAmB,EAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAA;IAC5D,OAAO,QAAQ,IAAI,EAAE,CAAA;AACvB,CAAC;AAED,SAAgB,uBAAuB,CAAC,IAAY;IAClD,MAAM,SAAS,GAAG,IAAA,uBAAY,EAAC,IAAI,EAAE;QACnC,WAAW,EAAE;YACX,SAAS;YACT,SAAS;YACT,KAAK;YACL,GAAG;YACH,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,IAAI;YACJ,YAAY;YACZ,KAAK;YACL,MAAM;YACN,MAAM;YACN,QAAQ;YACR,IAAI;YACJ,IAAI;SACL;QACD,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;QACD,UAAU,CAAC,IAAY;YACrB,OAAO,IAAA,+BAAkB,EAAC,IAAI,CAAC,CAAA;QACjC,CAAC;KACF,CAAC,CAAA;IAEF,MAAM,CAAC,GAAG,IAAA,cAAI,EAAC,SAAS,CAAC,CAAA;IACzB,MAAM,QAAQ,GAAa,EAAE,CAAA;IAE7B,CAAC,CAAC,uCAAuC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE;QAC7D,IAAI,CAAC,CAAC,SAAS,IAAI,OAAO,CAAC,IAAI,OAAO,OAAO,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YACnE,OAAM;QACR,CAAC;QAED,MAAM,GAAG,GAAG,OAAO,CAAC,OAAO,CAAC,WAAW,EAAE,CAAA;QAEzC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IA
AI,EAAE,CAAA;QAC7B,MAAM,IAAI,GAAG,IAAA,gCAAmB,EAAC,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;QAC7D,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAM;QACR,CAAC;QAED,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YACxB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;YACrB,CAAC;YACD,OAAM;QACR,CAAC;QAED,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACjB,IAAI,IAAI,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;gBACtB,QAAQ,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC,CAAA;YAC5B,CAAC;YACD,OAAM;QACR,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,GAAG,kBAAkB,EAAE,CAAC;YACrC,OAAM;QACR,CAAC;QAED,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IACrB,CAAC,CAAC,CAAA;IAEF,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,QAAQ,GAAG,IAAA,gCAAmB,EAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,SAAS,CAAC,CAAA;QACnE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;IACnC,CAAC;IAED,OAAO,wBAAwB,CAAC,QAAQ,CAAC,CAAA;AAC3C,CAAC;AAED,SAAgB,gBAAgB,CAAC,IAAY;IAC3C,MAAM,QAAQ,GAAG,IAAA,uBAAY,EAAC,IAAI,EAAE;QAClC,WAAW,EAAE,EAAE;QACf,iBAAiB,EAAE,EAAE;QACrB,WAAW,EAAE;YACX,OAAO;YACP,QAAQ;YACR,UAAU;YACV,UAAU;YACV,KAAK;YACL,QAAQ;YACR,QAAQ;YACR,QAAQ;YACR,OAAO;SACR;KACF,CAAC,CAAA;IACF,OAAO,IAAA,+BAAkB,EAAC,QAAQ,CAAC,CAAA;AACrC,CAAC;AAED,SAAS,wBAAwB,CAAC,QAAkB;IAClD,gGAAgG;IAChG,2FAA2F;IAC3F,OAAO,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;AACjC,CAAC"}
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.normalizeForPrompt = normalizeForPrompt;
|
|
4
|
-
exports.normalizeWhitespace = normalizeWhitespace;
|
|
5
|
-
exports.decodeHtmlEntities = decodeHtmlEntities;
|
|
6
|
-
exports.normalizeCandidate = normalizeCandidate;
|
|
7
|
-
exports.clipAtSentenceBoundary = clipAtSentenceBoundary;
|
|
8
|
-
exports.applyContentBudget = applyContentBudget;
|
|
9
|
-
const es_toolkit_1 = require("es-toolkit");
|
|
10
|
-
const WORD_SPLIT_PATTERN = /\s+/g;
|
|
11
|
-
function normalizeForPrompt(input) {
|
|
12
|
-
return input
|
|
13
|
-
.replaceAll('\u00A0', ' ')
|
|
14
|
-
.replaceAll(/[\t ]+/g, ' ')
|
|
15
|
-
.replaceAll(/\s*\n\s*/g, '\n')
|
|
16
|
-
.replaceAll(/\n{3,}/g, '\n\n')
|
|
17
|
-
.trim();
|
|
18
|
-
}
|
|
19
|
-
function normalizeWhitespace(input) {
|
|
20
|
-
return input
|
|
21
|
-
.replaceAll('\u00A0', ' ')
|
|
22
|
-
.replaceAll(/[\t ]+/g, ' ')
|
|
23
|
-
.replaceAll(/\s*\n\s*/g, '\n')
|
|
24
|
-
.trim();
|
|
25
|
-
}
|
|
26
|
-
function decodeHtmlEntities(input) {
|
|
27
|
-
return input
|
|
28
|
-
.replaceAll('&amp;', '&')
|
|
29
|
-
.replaceAll('&lt;', '<')
|
|
30
|
-
.replaceAll('&gt;', '>')
|
|
31
|
-
.replaceAll('&quot;', '"')
|
|
32
|
-
.replaceAll('&#39;', "'")
|
|
33
|
-
.replaceAll('&#x27;', "'")
|
|
34
|
-
.replaceAll('&#x2F;', '/')
|
|
35
|
-
.replaceAll('&nbsp;', ' ');
|
|
36
|
-
}
|
|
37
|
-
function normalizeCandidate(value) {
|
|
38
|
-
if (!value) {
|
|
39
|
-
return null;
|
|
40
|
-
}
|
|
41
|
-
const trimmed = value.replaceAll(/\s+/g, ' ').trim();
|
|
42
|
-
return trimmed.length > 0 ? trimmed : null;
|
|
43
|
-
}
|
|
44
|
-
function clipAtSentenceBoundary(input, maxLength) {
|
|
45
|
-
if (input.length <= maxLength) {
|
|
46
|
-
return input;
|
|
47
|
-
}
|
|
48
|
-
const slice = input.slice(0, maxLength);
|
|
49
|
-
const lastSentenceBreak = Math.max(slice.lastIndexOf('. '), slice.lastIndexOf('! '), slice.lastIndexOf('? '), slice.lastIndexOf('\n\n'));
|
|
50
|
-
if (lastSentenceBreak > maxLength * 0.5) {
|
|
51
|
-
return slice.slice(0, lastSentenceBreak + 1);
|
|
52
|
-
}
|
|
53
|
-
return slice;
|
|
54
|
-
}
|
|
55
|
-
function applyContentBudget(baseContent, maxCharacters) {
|
|
56
|
-
const totalCharacters = baseContent.length;
|
|
57
|
-
const truncated = totalCharacters > maxCharacters;
|
|
58
|
-
const clipped = truncated ? clipAtSentenceBoundary(baseContent, maxCharacters) : baseContent;
|
|
59
|
-
const content = clipped.trim();
|
|
60
|
-
const wordCount = content.length > 0 ? (0, es_toolkit_1.compact)(content.split(WORD_SPLIT_PATTERN)).length : 0;
|
|
61
|
-
return { content, truncated, totalCharacters, wordCount };
|
|
62
|
-
}
|
|
63
|
-
//# sourceMappingURL=cleaner.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"cleaner.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/cleaner.ts"],"names":[],"mappings":";;AAWA,gDAOC;AAED,kDAMC;AAED,gDAUC;AAED,gDAMC;AAED,wDAeC;AAED,gDAUC;AA3ED,2CAAoC;AAEpC,MAAM,kBAAkB,GAAG,MAAM,CAAA;AASjC,SAAgB,kBAAkB,CAAC,KAAa;IAC9C,OAAO,KAAK;SACT,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,UAAU,CAAC,SAAS,EAAE,MAAM,CAAC;SAC7B,IAAI,EAAE,CAAA;AACX,CAAC;AAED,SAAgB,mBAAmB,CAAC,KAAa;IAC/C,OAAO,KAAK;SACT,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC;SAC1B,UAAU,CAAC,WAAW,EAAE,IAAI,CAAC;SAC7B,IAAI,EAAE,CAAA;AACX,CAAC;AAED,SAAgB,kBAAkB,CAAC,KAAa;IAC9C,OAAO,KAAK;SACT,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC;SACvB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,OAAO,EAAE,GAAG,CAAC;SACxB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC;SACzB,UAAU,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAA;AAC9B,CAAC;AAED,SAAgB,kBAAkB,CAAC,KAAgC;IACjE,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,OAAO,IAAI,CAAA;IACb,CAAC;IACD,MAAM,OAAO,GAAG,KAAK,CAAC,UAAU,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACpD,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAA;AAC5C,CAAC;AAED,SAAgB,sBAAsB,CAAC,KAAa,EAAE,SAAiB;IACrE,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAA;IACd,CAAC;IACD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAA;IACvC,MAAM,iBAAiB,GAAG,IAAI,CAAC,GAAG,CAChC,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,EACvB,KAAK,CAAC,WAAW,CAAC,MAAM,CAAC,CAC1B,CAAA;IACD,IAAI,iBAAiB,GAAG,SAAS,GAAG,GAAG,EAAE,CAAC;QACxC,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAiB,GAAG,CAAC,CAAC,CAAA;IAC9C,CAAC;IACD,OAAO,KAAK,CAAA;AACd,CAAC;AAED,SAAgB,kBAAkB,CAChC,WAAmB,EACnB,aAAqB;IAErB,MAAM,eAAe,GAAG,WAAW,CAAC,MAAM,CAAA;IAC1C,MAAM,SAAS,GAAG,eAAe,GAAG,aAAa,CAAA;IACjD,MAAM,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,sBAAsB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC
,CAAC,CAAC,WAAW,CAAA;IAC5F,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IAC9B,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAA,oBAAO,EAAC,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAA;IAC5F,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,eAAe,EAAE,SAAS,EAAE,CAAA;AAC3D,CAAC"}
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.MIN_HTML_DOCUMENT_CHARACTERS_FOR_FALLBACK = exports.READABILITY_RELATIVE_THRESHOLD = exports.MIN_METADATA_DESCRIPTION_CHARACTERS = exports.MIN_READABILITY_CONTENT_CHARACTERS = exports.MIN_HTML_CONTENT_CHARACTERS = exports.BLOCKED_HTML_HINT_PATTERN = void 0;
|
|
4
|
-
exports.BLOCKED_HTML_HINT_PATTERN = /access denied|attention required|captcha|cloudflare|enable javascript|forbidden|please turn javascript on|verify you are human/i;
|
|
5
|
-
exports.MIN_HTML_CONTENT_CHARACTERS = 200;
|
|
6
|
-
exports.MIN_READABILITY_CONTENT_CHARACTERS = 200;
|
|
7
|
-
exports.MIN_METADATA_DESCRIPTION_CHARACTERS = 120;
|
|
8
|
-
exports.READABILITY_RELATIVE_THRESHOLD = 0.6;
|
|
9
|
-
exports.MIN_HTML_DOCUMENT_CHARACTERS_FOR_FALLBACK = 5000;
|
|
10
|
-
//# sourceMappingURL=constants.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"constants.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/constants.ts"],"names":[],"mappings":";;;AAAa,QAAA,yBAAyB,GACpC,iIAAiI,CAAA;AACtH,QAAA,2BAA2B,GAAG,GAAG,CAAA;AACjC,QAAA,kCAAkC,GAAG,GAAG,CAAA;AACxC,QAAA,mCAAmC,GAAG,GAAG,CAAA;AACzC,QAAA,8BAA8B,GAAG,GAAG,CAAA;AACpC,QAAA,yCAAyC,GAAG,IAAI,CAAA"}
|
|
@@ -1,128 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.fetchHtmlDocument = fetchHtmlDocument;
|
|
4
|
-
exports.fetchWithFirecrawl = fetchWithFirecrawl;
|
|
5
|
-
const utils_js_1 = require("../../transcript/utils.js");
|
|
6
|
-
const utils_js_2 = require("./utils.js");
|
|
7
|
-
const REQUEST_HEADERS = {
|
|
8
|
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
|
|
9
|
-
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
|
10
|
-
'Accept-Language': 'en-US,en;q=0.9',
|
|
11
|
-
'Cache-Control': 'no-cache',
|
|
12
|
-
Pragma: 'no-cache',
|
|
13
|
-
};
|
|
14
|
-
const DEFAULT_REQUEST_TIMEOUT_MS = 5000;
|
|
15
|
-
async function fetchHtmlDocument(fetchImpl, url, { timeoutMs, onProgress, } = {}) {
|
|
16
|
-
onProgress?.({ kind: 'fetch-html-start', url });
|
|
17
|
-
const controller = new AbortController();
|
|
18
|
-
const effectiveTimeoutMs = typeof timeoutMs === 'number' && Number.isFinite(timeoutMs)
|
|
19
|
-
? timeoutMs
|
|
20
|
-
: DEFAULT_REQUEST_TIMEOUT_MS;
|
|
21
|
-
const timeout = setTimeout(() => {
|
|
22
|
-
controller.abort();
|
|
23
|
-
}, effectiveTimeoutMs);
|
|
24
|
-
try {
|
|
25
|
-
const response = await fetchImpl(url, {
|
|
26
|
-
headers: REQUEST_HEADERS,
|
|
27
|
-
redirect: 'follow',
|
|
28
|
-
signal: controller.signal,
|
|
29
|
-
});
|
|
30
|
-
if (!response.ok) {
|
|
31
|
-
throw new Error(`Failed to fetch HTML document (status ${response.status})`);
|
|
32
|
-
}
|
|
33
|
-
const contentType = response.headers.get('content-type')?.toLowerCase() ?? null;
|
|
34
|
-
if (contentType &&
|
|
35
|
-
!contentType.includes('text/html') &&
|
|
36
|
-
!contentType.includes('application/xhtml+xml') &&
|
|
37
|
-
!contentType.includes('application/xml') &&
|
|
38
|
-
!contentType.includes('text/xml') &&
|
|
39
|
-
!contentType.includes('application/rss+xml') &&
|
|
40
|
-
!contentType.includes('application/atom+xml') &&
|
|
41
|
-
!contentType.startsWith('text/')) {
|
|
42
|
-
throw new Error(`Unsupported content-type for HTML document fetch: ${contentType}`);
|
|
43
|
-
}
|
|
44
|
-
const totalBytes = (() => {
|
|
45
|
-
const raw = response.headers.get('content-length');
|
|
46
|
-
if (!raw)
|
|
47
|
-
return null;
|
|
48
|
-
const parsed = Number(raw);
|
|
49
|
-
return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : null;
|
|
50
|
-
})();
|
|
51
|
-
const body = response.body;
|
|
52
|
-
if (!body) {
|
|
53
|
-
const text = await response.text();
|
|
54
|
-
const bytes = new TextEncoder().encode(text).byteLength;
|
|
55
|
-
onProgress?.({ kind: 'fetch-html-done', url, downloadedBytes: bytes, totalBytes });
|
|
56
|
-
return text;
|
|
57
|
-
}
|
|
58
|
-
const reader = body.getReader();
|
|
59
|
-
const decoder = new TextDecoder();
|
|
60
|
-
let downloadedBytes = 0;
|
|
61
|
-
let text = '';
|
|
62
|
-
onProgress?.({ kind: 'fetch-html-progress', url, downloadedBytes: 0, totalBytes });
|
|
63
|
-
while (true) {
|
|
64
|
-
const { value, done } = await reader.read();
|
|
65
|
-
if (done)
|
|
66
|
-
break;
|
|
67
|
-
if (!value)
|
|
68
|
-
continue;
|
|
69
|
-
downloadedBytes += value.byteLength;
|
|
70
|
-
text += decoder.decode(value, { stream: true });
|
|
71
|
-
onProgress?.({ kind: 'fetch-html-progress', url, downloadedBytes, totalBytes });
|
|
72
|
-
}
|
|
73
|
-
text += decoder.decode();
|
|
74
|
-
onProgress?.({ kind: 'fetch-html-done', url, downloadedBytes, totalBytes });
|
|
75
|
-
return text;
|
|
76
|
-
}
|
|
77
|
-
catch (error) {
|
|
78
|
-
if (error instanceof DOMException && error.name === 'AbortError') {
|
|
79
|
-
throw new Error('Fetching HTML document timed out');
|
|
80
|
-
}
|
|
81
|
-
throw error;
|
|
82
|
-
}
|
|
83
|
-
finally {
|
|
84
|
-
clearTimeout(timeout);
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
async function fetchWithFirecrawl(url, scrapeWithFirecrawl, options = {}) {
|
|
88
|
-
const timeoutMs = options.timeoutMs;
|
|
89
|
-
const cacheMode = options.cacheMode ?? 'default';
|
|
90
|
-
const onProgress = typeof options.onProgress === 'function' ? options.onProgress : null;
|
|
91
|
-
const reason = typeof options.reason === 'string' ? options.reason : null;
|
|
92
|
-
const diagnostics = {
|
|
93
|
-
attempted: false,
|
|
94
|
-
used: false,
|
|
95
|
-
cacheMode,
|
|
96
|
-
cacheStatus: cacheMode === 'bypass' ? 'bypassed' : 'unknown',
|
|
97
|
-
notes: null,
|
|
98
|
-
};
|
|
99
|
-
if ((0, utils_js_1.isYouTubeUrl)(url)) {
|
|
100
|
-
diagnostics.notes = (0, utils_js_2.appendNote)(diagnostics.notes, 'Skipped Firecrawl for YouTube URL');
|
|
101
|
-
return { payload: null, diagnostics };
|
|
102
|
-
}
|
|
103
|
-
if (!scrapeWithFirecrawl) {
|
|
104
|
-
diagnostics.notes = (0, utils_js_2.appendNote)(diagnostics.notes, 'Firecrawl is not configured');
|
|
105
|
-
return { payload: null, diagnostics };
|
|
106
|
-
}
|
|
107
|
-
diagnostics.attempted = true;
|
|
108
|
-
onProgress?.({ kind: 'firecrawl-start', url, reason: reason ?? 'firecrawl' });
|
|
109
|
-
try {
|
|
110
|
-
const payload = await scrapeWithFirecrawl(url, { timeoutMs, cacheMode });
|
|
111
|
-
if (!payload) {
|
|
112
|
-
diagnostics.notes = (0, utils_js_2.appendNote)(diagnostics.notes, 'Firecrawl returned no content payload');
|
|
113
|
-
onProgress?.({ kind: 'firecrawl-done', url, ok: false, markdownBytes: null, htmlBytes: null });
|
|
114
|
-
return { payload: null, diagnostics };
|
|
115
|
-
}
|
|
116
|
-
const encoder = new TextEncoder();
|
|
117
|
-
const markdownBytes = typeof payload.markdown === 'string' ? encoder.encode(payload.markdown).byteLength : null;
|
|
118
|
-
const htmlBytes = typeof payload.html === 'string' ? encoder.encode(payload.html).byteLength : null;
|
|
119
|
-
onProgress?.({ kind: 'firecrawl-done', url, ok: true, markdownBytes, htmlBytes });
|
|
120
|
-
return { payload, diagnostics };
|
|
121
|
-
}
|
|
122
|
-
catch (error) {
|
|
123
|
-
diagnostics.notes = (0, utils_js_2.appendNote)(diagnostics.notes, `Firecrawl error: ${error instanceof Error ? error.message : 'unknown error'}`);
|
|
124
|
-
onProgress?.({ kind: 'firecrawl-done', url, ok: false, markdownBytes: null, htmlBytes: null });
|
|
125
|
-
return { payload: null, diagnostics };
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
|
-
//# sourceMappingURL=fetcher.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"fetcher.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/fetcher.ts"],"names":[],"mappings":";;AA2BA,8CAsFC;AAED,gDA2DC;AA9KD,wDAAwD;AAQxD,yCAAuC;AAEvC,MAAM,eAAe,GAA2B;IAC9C,YAAY,EACV,iHAAiH;IACnH,MAAM,EACJ,kGAAkG;IACpG,iBAAiB,EAAE,gBAAgB;IACnC,eAAe,EAAE,UAAU;IAC3B,MAAM,EAAE,UAAU;CACnB,CAAA;AAED,MAAM,0BAA0B,GAAG,IAAI,CAAA;AAOhC,KAAK,UAAU,iBAAiB,CACrC,SAAuB,EACvB,GAAW,EACX,EACE,SAAS,EACT,UAAU,MACiF,EAAE;IAE/F,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,kBAAkB,EAAE,GAAG,EAAE,CAAC,CAAA;IAE/C,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAA;IACxC,MAAM,kBAAkB,GACtB,OAAO,SAAS,KAAK,QAAQ,IAAI,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC;QACzD,CAAC,CAAC,SAAS;QACX,CAAC,CAAC,0BAA0B,CAAA;IAChC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE;QAC9B,UAAU,CAAC,KAAK,EAAE,CAAA;IACpB,CAAC,EAAE,kBAAkB,CAAC,CAAA;IAEtB,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,GAAG,EAAE;YACpC,OAAO,EAAE,eAAe;YACxB,QAAQ,EAAE,QAAQ;YAClB,MAAM,EAAE,UAAU,CAAC,MAAM;SAC1B,CAAC,CAAA;QAEF,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CAAC,yCAAyC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAA;QAC9E,CAAC;QAED,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,EAAE,WAAW,EAAE,IAAI,IAAI,CAAA;QAC/E,IACE,WAAW;YACX,CAAC,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC;YAClC,CAAC,WAAW,CAAC,QAAQ,CAAC,uBAAuB,CAAC;YAC9C,CAAC,WAAW,CAAC,QAAQ,CAAC,iBAAiB,CAAC;YACxC,CAAC,WAAW,CAAC,QAAQ,CAAC,UAAU,CAAC;YACjC,CAAC,WAAW,CAAC,QAAQ,CAAC,qBAAqB,CAAC;YAC5C,CAAC,WAAW,CAAC,QAAQ,CAAC,sBAAsB,CAAC;YAC7C,CAAC,WAAW,CAAC,UAAU,CAAC,OAAO,CAAC,EAChC,CAAC;YACD,MAAM,IAAI,KAAK,CAAC,qDAAqD,WAAW,EAAE,CAAC,CAAA;QACrF,CAAC;QAED,MAAM,UAAU,GAAG,CAAC,GAAG,EAAE;YACvB,MAAM,GAAG,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAA;YAClD,IAAI,CAAC,GAAG;gBAAE,OAAO,IAAI,CAAA;YACrB,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAA;YAC1B,OAAO,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAA;QAC1E,CAAC,CAAC,EAAE,CAAA;QAEJ,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAA;QAC1B,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE
,CAAA;YAClC,MAAM,KAAK,GAAG,IAAI,WAAW,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,CAAA;YACvD,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,iBAAiB,EAAE,GAAG,EAAE,eAAe,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC,CAAA;YAClF,OAAO,IAAI,CAAA;QACb,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,CAAC,SAAS,EAAE,CAAA;QAC/B,MAAM,OAAO,GAAG,IAAI,WAAW,EAAE,CAAA;QACjC,IAAI,eAAe,GAAG,CAAC,CAAA;QACvB,IAAI,IAAI,GAAG,EAAE,CAAA;QAEb,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,GAAG,EAAE,eAAe,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,CAAA;QAElF,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,CAAA;YAC3C,IAAI,IAAI;gBAAE,MAAK;YACf,IAAI,CAAC,KAAK;gBAAE,SAAQ;YACpB,eAAe,IAAI,KAAK,CAAC,UAAU,CAAA;YACnC,IAAI,IAAI,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC,CAAA;YAC/C,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,GAAG,EAAE,eAAe,EAAE,UAAU,EAAE,CAAC,CAAA;QACjF,CAAC;QAED,IAAI,IAAI,OAAO,CAAC,MAAM,EAAE,CAAA;QACxB,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,iBAAiB,EAAE,GAAG,EAAE,eAAe,EAAE,UAAU,EAAE,CAAC,CAAA;QAC3E,OAAO,IAAI,CAAA;IACb,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,YAAY,IAAI,KAAK,CAAC,IAAI,KAAK,YAAY,EAAE,CAAC;YACjE,MAAM,IAAI,KAAK,CAAC,kCAAkC,CAAC,CAAA;QACrD,CAAC;QACD,MAAM,KAAK,CAAA;IACb,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,OAAO,CAAC,CAAA;IACvB,CAAC;AACH,CAAC;AAEM,KAAK,UAAU,kBAAkB,CACtC,GAAW,EACX,mBAA+C,EAC/C,UAKI,EAAE;IAEN,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,CAAA;IACnC,MAAM,SAAS,GAAc,OAAO,CAAC,SAAS,IAAI,SAAS,CAAA;IAC3D,MAAM,UAAU,GAAG,OAAO,OAAO,CAAC,UAAU,KAAK,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAA;IACvF,MAAM,MAAM,GAAG,OAAO,OAAO,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAA;IACzE,MAAM,WAAW,GAAyB;QACxC,SAAS,EAAE,KAAK;QAChB,IAAI,EAAE,KAAK;QACX,SAAS;QACT,WAAW,EAAE,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;QAC5D,KAAK,EAAE,IAAI;KACZ,CAAA;IAED,IAAI,IAAA,uBAAY,EAAC,GAAG,CAAC,EAAE,CAAC;QACtB,WAAW,CAAC,KAAK,GAAG,IAAA,qBAAU,EAAC,WAAW,CAAC,KAAK,EAAE,mCAAmC,CAAC,CAAA;QACtF,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,CAAA;IACvC,CAAC;IAED,IAAI,CAAC,mBAAmB,EAAE,CAAC;QACzB,WAAW,CAAC,KAAK,G
AAG,IAAA,qBAAU,EAAC,WAAW,CAAC,KAAK,EAAE,6BAA6B,CAAC,CAAA;QAChF,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,CAAA;IACvC,CAAC;IAED,WAAW,CAAC,SAAS,GAAG,IAAI,CAAA;IAC5B,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,iBAAiB,EAAE,GAAG,EAAE,MAAM,EAAE,MAAM,IAAI,WAAW,EAAE,CAAC,CAAA;IAE7E,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,mBAAmB,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,CAAA;QACxE,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,WAAW,CAAC,KAAK,GAAG,IAAA,qBAAU,EAAC,WAAW,CAAC,KAAK,EAAE,uCAAuC,CAAC,CAAA;YAC1F,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE,aAAa,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;YAC9F,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,CAAA;QACvC,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,WAAW,EAAE,CAAA;QACjC,MAAM,aAAa,GACjB,OAAO,OAAO,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAA;QAC3F,MAAM,SAAS,GACb,OAAO,OAAO,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAA;QACnF,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,GAAG,EAAE,EAAE,EAAE,IAAI,EAAE,aAAa,EAAE,SAAS,EAAE,CAAC,CAAA;QAEjF,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,CAAA;IACjC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,WAAW,CAAC,KAAK,GAAG,IAAA,qBAAU,EAC5B,WAAW,CAAC,KAAK,EACjB,oBAAoB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE,CAC/E,CAAA;QACD,UAAU,EAAE,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE,aAAa,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;QAC9F,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,EAAE,CAAA;IACvC,CAAC;AACH,CAAC"}
|
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.shouldFallbackToFirecrawl = shouldFallbackToFirecrawl;
|
|
4
|
-
exports.buildResultFromFirecrawl = buildResultFromFirecrawl;
|
|
5
|
-
const index_js_1 = require("../../transcript/index.js");
|
|
6
|
-
const article_js_1 = require("./article.js");
|
|
7
|
-
const cleaner_js_1 = require("./cleaner.js");
|
|
8
|
-
const constants_js_1 = require("./constants.js");
|
|
9
|
-
const jsonld_js_1 = require("./jsonld.js");
|
|
10
|
-
const parsers_js_1 = require("./parsers.js");
|
|
11
|
-
const podcast_utils_js_1 = require("./podcast-utils.js");
|
|
12
|
-
const utils_js_1 = require("./utils.js");
|
|
13
|
-
const video_js_1 = require("./video.js");
|
|
14
|
-
function shouldFallbackToFirecrawl(html) {
|
|
15
|
-
const plainText = (0, cleaner_js_1.normalizeForPrompt)((0, article_js_1.extractPlainText)(html));
|
|
16
|
-
if (constants_js_1.BLOCKED_HTML_HINT_PATTERN.test(plainText))
|
|
17
|
-
return true;
|
|
18
|
-
const normalized = (0, cleaner_js_1.normalizeForPrompt)((0, article_js_1.extractArticleContent)(html));
|
|
19
|
-
if (normalized.length >= constants_js_1.MIN_HTML_CONTENT_CHARACTERS) {
|
|
20
|
-
return false;
|
|
21
|
-
}
|
|
22
|
-
// Avoid spending Firecrawl on truly small/simple pages where the extracted HTML content is short but
|
|
23
|
-
// likely complete (e.g. https://example.com). Only treat "thin" content as a Firecrawl signal when
|
|
24
|
-
// the HTML document itself is large (SSR/app-shell pages, blocked pages without a match, etc.).
|
|
25
|
-
return html.length >= constants_js_1.MIN_HTML_DOCUMENT_CHARACTERS_FOR_FALLBACK;
|
|
26
|
-
}
|
|
27
|
-
async function buildResultFromFirecrawl({ url, payload, cacheMode, maxCharacters, youtubeTranscriptMode, firecrawlDiagnostics, markdownRequested, deps, }) {
|
|
28
|
-
const normalizedMarkdown = (0, cleaner_js_1.normalizeForPrompt)(payload.markdown ?? '');
|
|
29
|
-
if (normalizedMarkdown.length === 0) {
|
|
30
|
-
firecrawlDiagnostics.notes = (0, utils_js_1.appendNote)(firecrawlDiagnostics.notes, 'Firecrawl markdown normalization yielded empty text');
|
|
31
|
-
return null;
|
|
32
|
-
}
|
|
33
|
-
const jsonLd = payload.html ? (0, jsonld_js_1.extractJsonLdContent)(payload.html) : null;
|
|
34
|
-
const isPodcastJsonLd = (0, podcast_utils_js_1.isPodcastLikeJsonLdType)(jsonLd?.type);
|
|
35
|
-
const transcriptResolution = await (0, index_js_1.resolveTranscriptForLink)(url, payload.html ?? null, deps, {
|
|
36
|
-
youtubeTranscriptMode,
|
|
37
|
-
cacheMode,
|
|
38
|
-
});
|
|
39
|
-
const htmlMetadata = payload.html
|
|
40
|
-
? (0, parsers_js_1.extractMetadataFromHtml)(payload.html, url)
|
|
41
|
-
: { title: null, description: null, siteName: null };
|
|
42
|
-
const metadata = (0, parsers_js_1.extractMetadataFromFirecrawl)(payload.metadata ?? null);
|
|
43
|
-
const title = (0, utils_js_1.pickFirstText)([jsonLd?.title, metadata.title, htmlMetadata.title]);
|
|
44
|
-
const description = (0, utils_js_1.pickFirstText)([
|
|
45
|
-
jsonLd?.description,
|
|
46
|
-
metadata.description,
|
|
47
|
-
htmlMetadata.description,
|
|
48
|
-
]);
|
|
49
|
-
const siteName = (0, utils_js_1.pickFirstText)([metadata.siteName, htmlMetadata.siteName, (0, utils_js_1.safeHostname)(url)]);
|
|
50
|
-
const descriptionCandidate = description ? (0, cleaner_js_1.normalizeForPrompt)(description) : '';
|
|
51
|
-
const preferDescription = descriptionCandidate.length >= constants_js_1.MIN_METADATA_DESCRIPTION_CHARACTERS &&
|
|
52
|
-
(isPodcastJsonLd ||
|
|
53
|
-
(0, podcast_utils_js_1.isPodcastHost)(url) ||
|
|
54
|
-
normalizedMarkdown.length < constants_js_1.MIN_HTML_CONTENT_CHARACTERS ||
|
|
55
|
-
descriptionCandidate.length >= normalizedMarkdown.length * constants_js_1.READABILITY_RELATIVE_THRESHOLD);
|
|
56
|
-
const baseCandidate = preferDescription ? descriptionCandidate : normalizedMarkdown;
|
|
57
|
-
const baseContent = (0, utils_js_1.selectBaseContent)(baseCandidate, transcriptResolution.text);
|
|
58
|
-
if (baseContent.length === 0) {
|
|
59
|
-
firecrawlDiagnostics.notes = (0, utils_js_1.appendNote)(firecrawlDiagnostics.notes, 'Firecrawl produced content that normalized to an empty string');
|
|
60
|
-
return null;
|
|
61
|
-
}
|
|
62
|
-
firecrawlDiagnostics.used = true;
|
|
63
|
-
const transcriptDiagnostics = (0, utils_js_1.ensureTranscriptDiagnostics)(transcriptResolution, cacheMode ?? 'default');
|
|
64
|
-
const video = payload.html ? (0, video_js_1.detectPrimaryVideoFromHtml)(payload.html, url) : null;
|
|
65
|
-
const isVideoOnly = !transcriptResolution.text &&
|
|
66
|
-
normalizedMarkdown.length < constants_js_1.MIN_HTML_CONTENT_CHARACTERS &&
|
|
67
|
-
video !== null;
|
|
68
|
-
return (0, utils_js_1.finalizeExtractedLinkContent)({
|
|
69
|
-
url,
|
|
70
|
-
baseContent,
|
|
71
|
-
maxCharacters,
|
|
72
|
-
title,
|
|
73
|
-
description,
|
|
74
|
-
siteName,
|
|
75
|
-
transcriptResolution,
|
|
76
|
-
video,
|
|
77
|
-
isVideoOnly,
|
|
78
|
-
diagnostics: {
|
|
79
|
-
strategy: 'firecrawl',
|
|
80
|
-
firecrawl: firecrawlDiagnostics,
|
|
81
|
-
markdown: {
|
|
82
|
-
requested: markdownRequested,
|
|
83
|
-
used: true,
|
|
84
|
-
provider: 'firecrawl',
|
|
85
|
-
},
|
|
86
|
-
transcript: transcriptDiagnostics,
|
|
87
|
-
},
|
|
88
|
-
});
|
|
89
|
-
}
|
|
90
|
-
//# sourceMappingURL=firecrawl.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"firecrawl.js","sourceRoot":"","sources":["../../../../../src/content/link-preview/content/firecrawl.ts"],"names":[],"mappings":";;AA0BA,8DAYC;AAED,4DAmGC;AA3ID,wDAAoE;AAGpE,6CAAsE;AACtE,6CAAiD;AACjD,iDAMuB;AACvB,2CAAkD;AAClD,6CAAoF;AACpF,yDAA2E;AAE3E,yCAOmB;AACnB,yCAAuD;AAEvD,SAAgB,yBAAyB,CAAC,IAAY;IACpD,MAAM,SAAS,GAAG,IAAA,+BAAkB,EAAC,IAAA,6BAAgB,EAAC,IAAI,CAAC,CAAC,CAAA;IAC5D,IAAI,wCAAyB,CAAC,IAAI,CAAC,SAAS,CAAC;QAAE,OAAO,IAAI,CAAA;IAC1D,MAAM,UAAU,GAAG,IAAA,+BAAkB,EAAC,IAAA,kCAAqB,EAAC,IAAI,CAAC,CAAC,CAAA;IAClE,IAAI,UAAU,CAAC,MAAM,IAAI,0CAA2B,EAAE,CAAC;QACrD,OAAO,KAAK,CAAA;IACd,CAAC;IAED,qGAAqG;IACrG,mGAAmG;IACnG,gGAAgG;IAChG,OAAO,IAAI,CAAC,MAAM,IAAI,wDAAyC,CAAA;AACjE,CAAC;AAEM,KAAK,UAAU,wBAAwB,CAAC,EAC7C,GAAG,EACH,OAAO,EACP,SAAS,EACT,aAAa,EACb,qBAAqB,EACrB,oBAAoB,EACpB,iBAAiB,EACjB,IAAI,GAUL;IACC,MAAM,kBAAkB,GAAG,IAAA,+BAAkB,EAAC,OAAO,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAA;IACrE,IAAI,kBAAkB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACpC,oBAAoB,CAAC,KAAK,GAAG,IAAA,qBAAU,EACrC,oBAAoB,CAAC,KAAK,EAC1B,qDAAqD,CACtD,CAAA;QACD,OAAO,IAAI,CAAA;IACb,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAA,gCAAoB,EAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAA;IACvE,MAAM,eAAe,GAAG,IAAA,0CAAuB,EAAC,MAAM,EAAE,IAAI,CAAC,CAAA;IAE7D,MAAM,oBAAoB,GAAG,MAAM,IAAA,mCAAwB,EAAC,GAAG,EAAE,OAAO,CAAC,IAAI,IAAI,IAAI,EAAE,IAAI,EAAE;QAC3F,qBAAqB;QACrB,SAAS;KACV,CAAC,CAAA;IACF,MAAM,YAAY,GAAG,OAAO,CAAC,IAAI;QAC/B,CAAC,CAAC,IAAA,oCAAuB,EAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC;QAC5C,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAA;IACtD,MAAM,QAAQ,GAAG,IAAA,yCAA4B,EAAC,OAAO,CAAC,QAAQ,IAAI,IAAI,CAAC,CAAA;IAEvE,MAAM,KAAK,GAAG,IAAA,wBAAa,EAAC,CAAC,MAAM,EAAE,KAAK,EAAE,QAAQ,CAAC,KAAK,EAAE,YAAY,CAAC,KAAK,CAAC,CAAC,CAAA;IAChF,MAAM,WAAW,GAAG,IAAA,wBAAa,EAAC;QAChC,MAAM,EAAE,WAAW;QACnB,QAAQ,CAAC,WAAW;QACpB,YAAY,CAAC,WAAW;KACzB,CAAC,CAAA;IACF,MAAM,QAAQ,GAAG,IAAA,wBAAa,EAAC,CAAC,QAAQ,CAAC,QAAQ,EAAE,YAAY,CAAC,QAAQ,EAAE,IAAA,uBAAY,EAAC,GAAG,CAAC,CAAC,CAAC,CAAA;IAE7F,MAAM,oBAAoB,G
AAG,WAAW,CAAC,CAAC,CAAC,IAAA,+BAAkB,EAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;IAC/E,MAAM,iBAAiB,GACrB,oBAAoB,CAAC,MAAM,IAAI,kDAAmC;QAClE,CAAC,eAAe;YACd,IAAA,gCAAa,EAAC,GAAG,CAAC;YAClB,kBAAkB,CAAC,MAAM,GAAG,0CAA2B;YACvD,oBAAoB,CAAC,MAAM,IAAI,kBAAkB,CAAC,MAAM,GAAG,6CAA8B,CAAC,CAAA;IAC9F,MAAM,aAAa,GAAG,iBAAiB,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,kBAAkB,CAAA;IACnF,MAAM,WAAW,GAAG,IAAA,4BAAiB,EAAC,aAAa,EAAE,oBAAoB,CAAC,IAAI,CAAC,CAAA;IAC/E,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC7B,oBAAoB,CAAC,KAAK,GAAG,IAAA,qBAAU,EACrC,oBAAoB,CAAC,KAAK,EAC1B,+DAA+D,CAChE,CAAA;QACD,OAAO,IAAI,CAAA;IACb,CAAC;IAED,oBAAoB,CAAC,IAAI,GAAG,IAAI,CAAA;IAEhC,MAAM,qBAAqB,GAAG,IAAA,sCAA2B,EACvD,oBAAoB,EACpB,SAAS,IAAI,SAAS,CACvB,CAAA;IAED,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAA,qCAA0B,EAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAA;IACjF,MAAM,WAAW,GACf,CAAC,oBAAoB,CAAC,IAAI;QAC1B,kBAAkB,CAAC,MAAM,GAAG,0CAA2B;QACvD,KAAK,KAAK,IAAI,CAAA;IAEhB,OAAO,IAAA,uCAA4B,EAAC;QAClC,GAAG;QACH,WAAW;QACX,aAAa;QACb,KAAK;QACL,WAAW;QACX,QAAQ;QACR,oBAAoB;QACpB,KAAK;QACL,WAAW;QACX,WAAW,EAAE;YACX,QAAQ,EAAE,WAAW;YACrB,SAAS,EAAE,oBAAoB;YAC/B,QAAQ,EAAE;gBACR,SAAS,EAAE,iBAAiB;gBAC5B,IAAI,EAAE,IAAI;gBACV,QAAQ,EAAE,WAAW;aACtB;YACD,UAAU,EAAE,qBAAqB;SAClC;KACF,CAAC,CAAA;AACJ,CAAC"}
|