@staticn0va/wigolo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +74 -0
- package/README.md +272 -0
- package/dist/cache/db.d.ts +5 -0
- package/dist/cache/db.d.ts.map +1 -0
- package/dist/cache/db.js +97 -0
- package/dist/cache/db.js.map +1 -0
- package/dist/cache/store.d.ts +26 -0
- package/dist/cache/store.d.ts.map +1 -0
- package/dist/cache/store.js +214 -0
- package/dist/cache/store.js.map +1 -0
- package/dist/cli/daemon.d.ts +2 -0
- package/dist/cli/daemon.d.ts.map +1 -0
- package/dist/cli/daemon.js +5 -0
- package/dist/cli/daemon.js.map +1 -0
- package/dist/cli/health.d.ts +2 -0
- package/dist/cli/health.d.ts.map +1 -0
- package/dist/cli/health.js +5 -0
- package/dist/cli/health.js.map +1 -0
- package/dist/cli/index.d.ts +7 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +9 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/warmup.d.ts +11 -0
- package/dist/cli/warmup.d.ts.map +1 -0
- package/dist/cli/warmup.js +107 -0
- package/dist/cli/warmup.js.map +1 -0
- package/dist/config.d.ts +41 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +66 -0
- package/dist/config.js.map +1 -0
- package/dist/crawl/crawler.d.ts +18 -0
- package/dist/crawl/crawler.d.ts.map +1 -0
- package/dist/crawl/crawler.js +228 -0
- package/dist/crawl/crawler.js.map +1 -0
- package/dist/crawl/dedup.d.ts +15 -0
- package/dist/crawl/dedup.d.ts.map +1 -0
- package/dist/crawl/dedup.js +93 -0
- package/dist/crawl/dedup.js.map +1 -0
- package/dist/crawl/mapper.d.ts +17 -0
- package/dist/crawl/mapper.d.ts.map +1 -0
- package/dist/crawl/mapper.js +178 -0
- package/dist/crawl/mapper.js.map +1 -0
- package/dist/crawl/rate-limiter.d.ts +10 -0
- package/dist/crawl/rate-limiter.d.ts.map +1 -0
- package/dist/crawl/rate-limiter.js +72 -0
- package/dist/crawl/rate-limiter.js.map +1 -0
- package/dist/crawl/robots.d.ts +9 -0
- package/dist/crawl/robots.d.ts.map +1 -0
- package/dist/crawl/robots.js +63 -0
- package/dist/crawl/robots.js.map +1 -0
- package/dist/crawl/sitemap.d.ts +4 -0
- package/dist/crawl/sitemap.d.ts.map +1 -0
- package/dist/crawl/sitemap.js +38 -0
- package/dist/crawl/sitemap.js.map +1 -0
- package/dist/crawl/url-utils.d.ts +3 -0
- package/dist/crawl/url-utils.d.ts.map +1 -0
- package/dist/crawl/url-utils.js +41 -0
- package/dist/crawl/url-utils.js.map +1 -0
- package/dist/extraction/defuddle.d.ts +3 -0
- package/dist/extraction/defuddle.d.ts.map +1 -0
- package/dist/extraction/defuddle.js +26 -0
- package/dist/extraction/defuddle.js.map +1 -0
- package/dist/extraction/extract.d.ts +5 -0
- package/dist/extraction/extract.d.ts.map +1 -0
- package/dist/extraction/extract.js +83 -0
- package/dist/extraction/extract.js.map +1 -0
- package/dist/extraction/jsonld.d.ts +4 -0
- package/dist/extraction/jsonld.d.ts.map +1 -0
- package/dist/extraction/jsonld.js +64 -0
- package/dist/extraction/jsonld.js.map +1 -0
- package/dist/extraction/markdown.d.ts +10 -0
- package/dist/extraction/markdown.d.ts.map +1 -0
- package/dist/extraction/markdown.js +107 -0
- package/dist/extraction/markdown.js.map +1 -0
- package/dist/extraction/pipeline.d.ts +11 -0
- package/dist/extraction/pipeline.d.ts.map +1 -0
- package/dist/extraction/pipeline.js +95 -0
- package/dist/extraction/pipeline.js.map +1 -0
- package/dist/extraction/readability.d.ts +3 -0
- package/dist/extraction/readability.d.ts.map +1 -0
- package/dist/extraction/readability.js +32 -0
- package/dist/extraction/readability.js.map +1 -0
- package/dist/extraction/schema.d.ts +7 -0
- package/dist/extraction/schema.d.ts.map +1 -0
- package/dist/extraction/schema.js +86 -0
- package/dist/extraction/schema.js.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.js +104 -0
- package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
- package/dist/extraction/site-extractors/github.d.ts +3 -0
- package/dist/extraction/site-extractors/github.d.ts.map +1 -0
- package/dist/extraction/site-extractors/github.js +107 -0
- package/dist/extraction/site-extractors/github.js.map +1 -0
- package/dist/extraction/site-extractors/mdn.d.ts +3 -0
- package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
- package/dist/extraction/site-extractors/mdn.js +58 -0
- package/dist/extraction/site-extractors/mdn.js.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.js +88 -0
- package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
- package/dist/extraction/trafilatura.d.ts +6 -0
- package/dist/extraction/trafilatura.d.ts.map +1 -0
- package/dist/extraction/trafilatura.js +105 -0
- package/dist/extraction/trafilatura.js.map +1 -0
- package/dist/fetch/auth.d.ts +8 -0
- package/dist/fetch/auth.d.ts.map +1 -0
- package/dist/fetch/auth.js +32 -0
- package/dist/fetch/auth.js.map +1 -0
- package/dist/fetch/browser-pool.d.ts +28 -0
- package/dist/fetch/browser-pool.d.ts.map +1 -0
- package/dist/fetch/browser-pool.js +138 -0
- package/dist/fetch/browser-pool.js.map +1 -0
- package/dist/fetch/content-check.d.ts +2 -0
- package/dist/fetch/content-check.d.ts.map +1 -0
- package/dist/fetch/content-check.js +62 -0
- package/dist/fetch/content-check.js.map +1 -0
- package/dist/fetch/http-client.d.ts +15 -0
- package/dist/fetch/http-client.d.ts.map +1 -0
- package/dist/fetch/http-client.js +146 -0
- package/dist/fetch/http-client.js.map +1 -0
- package/dist/fetch/router.d.ts +45 -0
- package/dist/fetch/router.d.ts.map +1 -0
- package/dist/fetch/router.js +89 -0
- package/dist/fetch/router.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +10 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +39 -0
- package/dist/logger.js.map +1 -0
- package/dist/search/dedup.d.ts +10 -0
- package/dist/search/dedup.d.ts.map +1 -0
- package/dist/search/dedup.js +35 -0
- package/dist/search/dedup.js.map +1 -0
- package/dist/search/engines/bing.d.ts +7 -0
- package/dist/search/engines/bing.d.ts.map +1 -0
- package/dist/search/engines/bing.js +48 -0
- package/dist/search/engines/bing.js.map +1 -0
- package/dist/search/engines/duckduckgo.d.ts +7 -0
- package/dist/search/engines/duckduckgo.d.ts.map +1 -0
- package/dist/search/engines/duckduckgo.js +50 -0
- package/dist/search/engines/duckduckgo.js.map +1 -0
- package/dist/search/engines/startpage.d.ts +7 -0
- package/dist/search/engines/startpage.d.ts.map +1 -0
- package/dist/search/engines/startpage.js +50 -0
- package/dist/search/engines/startpage.js.map +1 -0
- package/dist/search/filters.d.ts +16 -0
- package/dist/search/filters.d.ts.map +1 -0
- package/dist/search/filters.js +63 -0
- package/dist/search/filters.js.map +1 -0
- package/dist/search/flashrank.d.ts +12 -0
- package/dist/search/flashrank.d.ts.map +1 -0
- package/dist/search/flashrank.js +63 -0
- package/dist/search/flashrank.js.map +1 -0
- package/dist/search/query.d.ts +2 -0
- package/dist/search/query.d.ts.map +1 -0
- package/dist/search/query.js +41 -0
- package/dist/search/query.js.map +1 -0
- package/dist/search/rerank.d.ts +3 -0
- package/dist/search/rerank.d.ts.map +1 -0
- package/dist/search/rerank.js +40 -0
- package/dist/search/rerank.js.map +1 -0
- package/dist/search/searxng.d.ts +8 -0
- package/dist/search/searxng.d.ts.map +1 -0
- package/dist/search/searxng.js +87 -0
- package/dist/search/searxng.js.map +1 -0
- package/dist/search/validator.d.ts +6 -0
- package/dist/search/validator.d.ts.map +1 -0
- package/dist/search/validator.js +35 -0
- package/dist/search/validator.js.map +1 -0
- package/dist/searxng/bootstrap.d.ts +18 -0
- package/dist/searxng/bootstrap.d.ts.map +1 -0
- package/dist/searxng/bootstrap.js +136 -0
- package/dist/searxng/bootstrap.js.map +1 -0
- package/dist/searxng/docker.d.ts +9 -0
- package/dist/searxng/docker.d.ts.map +1 -0
- package/dist/searxng/docker.js +67 -0
- package/dist/searxng/docker.js.map +1 -0
- package/dist/searxng/process.d.ts +23 -0
- package/dist/searxng/process.d.ts.map +1 -0
- package/dist/searxng/process.js +188 -0
- package/dist/searxng/process.js.map +1 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +311 -0
- package/dist/server.js.map +1 -0
- package/dist/tools/cache.d.ts +3 -0
- package/dist/tools/cache.d.ts.map +1 -0
- package/dist/tools/cache.js +50 -0
- package/dist/tools/cache.js.map +1 -0
- package/dist/tools/crawl.d.ts +6 -0
- package/dist/tools/crawl.d.ts.map +1 -0
- package/dist/tools/crawl.js +97 -0
- package/dist/tools/crawl.js.map +1 -0
- package/dist/tools/extract.d.ts +4 -0
- package/dist/tools/extract.d.ts.map +1 -0
- package/dist/tools/extract.js +69 -0
- package/dist/tools/extract.js.map +1 -0
- package/dist/tools/fetch.d.ts +4 -0
- package/dist/tools/fetch.d.ts.map +1 -0
- package/dist/tools/fetch.js +76 -0
- package/dist/tools/fetch.js.map +1 -0
- package/dist/tools/search.d.ts +4 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +160 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/types.d.ts +222 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +61 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import TurndownService from 'turndown';
|
|
3
|
+
// Shared HTML-to-Markdown converter configured for ATX ("#") headings and fenced code blocks.
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
|
4
|
+
function parseVotes(el) {
|
|
5
|
+
if (!el)
|
|
6
|
+
return 0;
|
|
7
|
+
const voteEl = el.querySelector('.js-vote-count');
|
|
8
|
+
const val = voteEl?.getAttribute('data-value') ?? voteEl?.textContent?.trim() ?? '0';
|
|
9
|
+
return parseInt(val, 10) || 0;
|
|
10
|
+
}
|
|
11
|
+
/**
 * Collects every answer on the page (`#answers .answer`) in document order.
 *
 * @param document Parsed DOM document of a question page.
 * @returns One record per answer: whether it carries the `accepted-answer`
 *          class, its vote count, and the inner HTML of its body ('' when no
 *          body element matches).
 */
function parseAnswers(document) {
  const toAnswer = (answerEl) => {
    const accepted = answerEl.classList.contains('accepted-answer');
    const votes = parseVotes(answerEl);
    const body = answerEl.querySelector('.s-prose, .js-post-body, .post-text');
    return { accepted, votes, bodyHtml: body ? body.innerHTML : '' };
  };
  return Array.from(document.querySelectorAll('#answers .answer')).map(toAnswer);
}
|
|
23
|
+
/**
 * Renders a question and its answers as a single Markdown document.
 *
 * Layout: `# title`, a "Tags | Votes" line, the question body, then one
 * section per answer introduced by a `---` rule and a level-2 heading.
 * Accepted answers come first; the remaining answers are ordered by votes,
 * highest first.
 *
 * @param title        Question title (used verbatim in the H1).
 * @param tags         Tag names, joined with ", ".
 * @param votes        Vote count of the question.
 * @param questionHtml Inner HTML of the question body.
 * @param answers      Records of the form `{ accepted, votes, bodyHtml }`.
 * @returns The Markdown text, blocks separated by exactly one blank line.
 */
function buildMarkdown(title, tags, votes, questionHtml, answers) {
  const tagLine = `Tags: ${tags.join(', ')} | Votes: ${votes}`;
  const questionMd = turndown.turndown(questionHtml).trim();
  // Blocks are joined with '\n\n' below, so no explicit '' spacer entries are
  // needed; adding them produced runs of four newlines between blocks.
  const sections = [`# ${title}`, tagLine, questionMd];
  const accepted = answers.filter((a) => a.accepted);
  // filter() returns a fresh array, so sorting it does not mutate `answers`.
  const others = answers.filter((a) => !a.accepted).sort((a, b) => b.votes - a.votes);
  for (const answer of [...accepted, ...others]) {
    const heading = answer.accepted
      ? `## Accepted Answer (Votes: ${answer.votes})`
      : `## Answer (Votes: ${answer.votes})`;
    const bodyMd = turndown.turndown(answer.bodyHtml).trim();
    // The blank line before '---' keeps it a thematic break rather than a
    // setext heading underline for the preceding paragraph.
    sections.push('---', heading, bodyMd);
  }
  return sections.join('\n\n');
}
|
|
44
|
+
/**
 * Site-specific extractor for Stack Overflow / Stack Exchange question pages.
 *
 * `canHandle` decides by hostname only; `extract` returns null whenever the
 * page does not look like a question page (missing title or question body).
 */
export const stackoverflowExtractor = {
  name: 'stackoverflow',
  /**
   * Reports whether `url` points at a Stack Exchange network host.
   *
   * Matches each listed domain itself and any of its subdomains
   * (e.g. `meta.stackexchange.com`). Besides stackoverflow.com and
   * stackexchange.com this covers network sites hosted on their own domains.
   *
   * @param url Absolute URL string.
   * @returns false for unparseable URLs or unrelated hosts.
   */
  canHandle(url) {
    let hostname;
    try {
      hostname = new URL(url).hostname;
    } catch {
      return false;
    }
    const networkDomains = [
      'stackoverflow.com',
      'stackexchange.com',
      'superuser.com',
      'serverfault.com',
      'askubuntu.com',
      'mathoverflow.net',
      'stackapps.com',
    ];
    // The leading dot in the suffix check prevents look-alike hosts such as
    // "evilstackoverflow.com" from matching.
    return networkDomains.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`));
  },
  /**
   * Extracts title, tags, votes, question body and answers from a question
   * page and renders them as Markdown.
   *
   * @param html Raw page HTML.
   * @param url  Page URL (not used by the extraction itself).
   * @returns An extraction result with empty `metadata`, `links` and
   *          `images`, or null when `html` is empty or the expected title /
   *          question-body elements are absent.
   */
  extract(html, url) {
    if (!html) {
      return null;
    }
    const { document } = parseHTML(html);

    const titleEl = document.querySelector('.question-hyperlink');
    if (!titleEl) {
      return null;
    }
    const title = titleEl.textContent?.trim() ?? '';
    if (!title) {
      return null;
    }

    // Several selectors are tried to cover different body markup variants.
    const questionBodyEl = document.querySelector('#question .s-prose, #question .js-post-body, #question .post-text');
    if (!questionBodyEl) {
      return null;
    }
    const questionHtml = questionBodyEl.innerHTML;

    const tagEls = document.querySelectorAll('.js-post-tag-list-wrapper .post-tag, .post-taglist .post-tag');
    const tags = Array.from(tagEls)
      .map((el) => el.textContent?.trim() ?? '')
      .filter(Boolean);

    const votes = parseVotes(document.querySelector('#question'));
    const answers = parseAnswers(document);
    const markdown = buildMarkdown(title, tags, votes, questionHtml, answers);

    return {
      title,
      markdown,
      metadata: {},
      links: [],
      images: [],
      extractor: 'site-specific',
    };
  },
};
|
|
88
|
+
//# sourceMappingURL=stackoverflow.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"stackoverflow.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/stackoverflow.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AAQxF,SAAS,UAAU,CAAC,EAAkB;IACpC,IAAI,CAAC,EAAE;QAAE,OAAO,CAAC,CAAC;IAClB,MAAM,MAAM,GAAG,EAAE,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;IAClD,MAAM,GAAG,GAAG,MAAM,EAAE,YAAY,CAAC,YAAY,CAAC,IAAI,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,GAAG,CAAC;IACrF,OAAO,QAAQ,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;AAChC,CAAC;AAED,SAAS,YAAY,CAAC,QAAkB;IACtC,MAAM,SAAS,GAAG,QAAQ,CAAC,gBAAgB,CAAC,kBAAkB,CAAC,CAAC;IAChE,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;QAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,EAAa,CAAC,CAAC;QACxC,MAAM,MAAM,GAAG,EAAE,CAAC,aAAa,CAAC,qCAAqC,CAAC,CAAC;QACvE,MAAM,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAE,MAAkB,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,OAAO,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,aAAa,CACpB,KAAa,EACb,IAAc,EACd,KAAa,EACb,YAAoB,EACpB,OAAiB;IAEjB,MAAM,OAAO,GAAG,SAAS,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,KAAK,EAAE,CAAC;IAC7D,MAAM,UAAU,GAAG,QAAQ,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,IAAI,EAAE,CAAC;IAE1D,MAAM,QAAQ,GAAa;QACzB,KAAK,KAAK,EAAE;QACZ,OAAO;QACP,EAAE;QACF,UAAU;KACX,CAAC;IAEF,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IACnD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IACpF,MAAM,OAAO,GAAG,CAAC,GAAG,QAAQ,EAAE,GAAG,MAAM,CAAC,CAAC;IAEzC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ;YAC7B,CAAC,CAAC,8BAA8B,MAAM,CAAC,KAAK,GAAG;YAC/C,CAAC,CAAC,qBAAqB,MAAM,CAAC,KAAK,GAAG,CAAC;QACzC,MAAM,MAAM,GAAG,QAAQ,CAAC,QAAQ,CAAC,M
AAM,CAAC,QAAQ,CAAC,CAAC,IAAI,EAAE,CAAC;QACzD,QAAQ,CAAC,IAAI,CAAC,KAAK,EAAE,EAAE,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;IAChD,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC;AAED,MAAM,CAAC,MAAM,sBAAsB,GAAc;IAC/C,IAAI,EAAE,eAAe;IAErB,SAAS,CAAC,GAAW;QACnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,OAAO,QAAQ,KAAK,mBAAmB;gBACrC,QAAQ,CAAC,QAAQ,CAAC,oBAAoB,CAAC;gBACvC,QAAQ,KAAK,mBAAmB;gBAChC,QAAQ,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC;QAC5C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,GAAW;QAC/B,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,qBAAqB,CAAC,CAAC;QAC9D,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE1B,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAChD,IAAI,CAAC,KAAK;YAAE,OAAO,IAAI,CAAC;QAExB,MAAM,cAAc,GAAG,QAAQ,CAAC,aAAa,CAAC,mEAAmE,CAAC,CAAC;QACnH,IAAI,CAAC,cAAc;YAAE,OAAO,IAAI,CAAC;QAEjC,MAAM,YAAY,GAAI,cAA0B,CAAC,SAAS,CAAC;QAE3D,MAAM,MAAM,GAAG,QAAQ,CAAC,gBAAgB,CAAC,8DAA8D,CAAC,CAAC;QACzG,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAE1F,MAAM,UAAU,GAAG,QAAQ,CAAC,aAAa,CAAC,WAAW,CAAC,CAAC;QACvD,MAAM,KAAK,GAAG,UAAU,CAAC,UAA4B,CAAC,CAAC;QAEvD,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;QAEvC,MAAM,QAAQ,GAAG,aAAa,CAAC,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,YAAY,EAAE,OAAO,CAAC,CAAC;QAE1E,OAAO;YACL,KAAK;YACL,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,eAAe;SAC3B,CAAC;IACJ,CAAC;CACF,CAAC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { ExtractionResult } from '../types.js';
|
|
2
|
+
/**
 * Resolves to true when `python3 -c "import trafilatura"` succeeds.
 * The outcome (success or failure) is cached until `resetAvailabilityCache()` is called.
 */
export declare function isTrafilaturaAvailable(): Promise<boolean>;
|
|
3
|
+
/** Clears the cached availability result so the next `isTrafilaturaAvailable()` call probes Python again. */
export declare function resetAvailabilityCache(): void;
|
|
4
|
+
/**
 * Runs `python3 -c <script>`, writes `stdin` to the process and resolves with its stdout
 * when it exits with code 0. Rejects on a non-zero exit code, termination by signal,
 * a spawn error, or once `timeoutMs` milliseconds have elapsed.
 */
export declare function runPythonWithStdin(script: string, stdin: string, timeoutMs: number): Promise<string>;
|
|
5
|
+
/**
 * Extracts the main content of `html` via the Python `trafilatura` package.
 * Resolves to null (never rejects) when the subprocess fails, its output is not a JSON object,
 * or the extracted text is shorter than the minimum content threshold. `url` is used for log context.
 */
export declare function trafilaturaExtract(html: string, url: string): Promise<ExtractionResult | null>;
|
|
6
|
+
//# sourceMappingURL=trafilatura.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"trafilatura.d.ts","sourceRoot":"","sources":["../../src/extraction/trafilatura.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAoBpD,wBAAsB,sBAAsB,IAAI,OAAO,CAAC,OAAO,CAAC,CAW/D;AAED,wBAAgB,sBAAsB,IAAI,IAAI,CAE7C;AAED,wBAAgB,kBAAkB,CAChC,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,MAAM,GAChB,OAAO,CAAC,MAAM,CAAC,CAsCjB;AAED,wBAAsB,kBAAkB,CACtC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,GACV,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,CAuClC"}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// src/extraction/trafilatura.ts
|
|
2
|
+
import { spawn, execFile as execFileCb } from 'node:child_process';
|
|
3
|
+
import { promisify } from 'node:util';
|
|
4
|
+
import { createLogger } from '../logger.js';
|
|
5
|
+
// Promise-returning wrapper around child_process.execFile.
const execFileAsync = promisify(execFileCb);
|
|
6
|
+
// Logger for this module, created under the 'extract' name.
const log = createLogger('extract');
|
|
7
|
+
// Minimum length (in characters) of extracted text; shorter results are treated as a failed extraction.
const MIN_CONTENT_THRESHOLD = 100;
|
|
8
|
+
// Time budget, in milliseconds, for one trafilatura extraction subprocess.
const SUBPROCESS_TIMEOUT_MS = 15000;
|
|
9
|
+
// Time budget, in milliseconds, for the `import trafilatura` availability probe.
const AVAILABILITY_CHECK_TIMEOUT_MS = 5000;
|
|
10
|
+
// Python program passed to `python3 -c`: reads HTML from stdin, runs
// trafilatura's extract() with JSON output, and prints the JSON document
// (or '{}' when extract() returns nothing).
const TRAFILATURA_SCRIPT = [
  'import sys, json',
  'from trafilatura import extract',
  'html = sys.stdin.read()',
  "result = extract(html, output_format='json', include_links=True, include_images=True, favor_precision=True)",
  "print(result or '{}')",
].join('\n');
|
|
17
|
+
// Cached result of the availability probe: null = not probed yet, otherwise the boolean outcome.
let availableCache = null;
|
|
18
|
+
/**
 * Reports whether the Python `trafilatura` package can be imported.
 *
 * The first call runs `python3 -c "import trafilatura"` with a bounded
 * timeout; any failure (missing interpreter, import error, timeout) counts
 * as "not available". The outcome is cached in `availableCache` and reused
 * by later calls until the cache is reset.
 *
 * @returns true when the import probe succeeded.
 */
export async function isTrafilaturaAvailable() {
  if (availableCache === null) {
    let importable = true;
    try {
      await execFileAsync('python3', ['-c', 'import trafilatura'], {
        timeout: AVAILABILITY_CHECK_TIMEOUT_MS,
      });
    } catch {
      importable = false;
    }
    availableCache = importable;
  }
  return availableCache;
}
|
|
32
|
+
/**
 * Forgets the cached availability result so that the next
 * `isTrafilaturaAvailable()` call probes Python again.
 */
export function resetAvailabilityCache() {
  availableCache = null;
}
|
|
35
|
+
export function runPythonWithStdin(script, stdin, timeoutMs) {
|
|
36
|
+
const procRef = { current: null };
|
|
37
|
+
const procPromise = new Promise((resolve, reject) => {
|
|
38
|
+
const proc = spawn('python3', ['-c', script], { timeout: timeoutMs });
|
|
39
|
+
procRef.current = proc;
|
|
40
|
+
let stdout = '';
|
|
41
|
+
let stderr = '';
|
|
42
|
+
proc.stdout.on('data', (d) => {
|
|
43
|
+
stdout += d.toString();
|
|
44
|
+
});
|
|
45
|
+
proc.stderr.on('data', (d) => {
|
|
46
|
+
stderr += d.toString();
|
|
47
|
+
});
|
|
48
|
+
proc.on('close', (code, signal) => {
|
|
49
|
+
if (signal) {
|
|
50
|
+
reject(new Error(`Python killed by signal ${signal}: ${stderr}`));
|
|
51
|
+
}
|
|
52
|
+
else if (code === 0) {
|
|
53
|
+
resolve(stdout);
|
|
54
|
+
}
|
|
55
|
+
else {
|
|
56
|
+
reject(new Error(`Python exited ${code}: ${stderr}`));
|
|
57
|
+
}
|
|
58
|
+
});
|
|
59
|
+
proc.on('error', reject);
|
|
60
|
+
proc.stdin.write(stdin);
|
|
61
|
+
proc.stdin.end();
|
|
62
|
+
});
|
|
63
|
+
const timeoutPromise = new Promise((_, reject) => setTimeout(() => {
|
|
64
|
+
procRef.current?.kill();
|
|
65
|
+
reject(new Error(`Python timed out after ${timeoutMs}ms`));
|
|
66
|
+
}, timeoutMs));
|
|
67
|
+
return Promise.race([procPromise, timeoutPromise]);
|
|
68
|
+
}
|
|
69
|
+
/**
 * Extracts the main content of an HTML page by running trafilatura in a
 * Python subprocess.
 *
 * @param html Raw page HTML, piped to the subprocess on stdin.
 * @param url  Page URL; only used as context in debug log entries.
 * @returns An extraction result whose `markdown` field holds trafilatura's
 *          `text` output, or null when the subprocess fails, prints nothing
 *          (or `null`), prints something that is not a JSON object, or yields
 *          fewer than MIN_CONTENT_THRESHOLD characters of text. Never rejects.
 */
export async function trafilaturaExtract(html, url) {
  // Returns `value` when it is a string, otherwise the supplied fallback.
  const stringOr = (value, fallback) => (typeof value === 'string' ? value : fallback);
  try {
    const output = (await runPythonWithStdin(TRAFILATURA_SCRIPT, html, SUBPROCESS_TIMEOUT_MS)).trim();
    if (output === '' || output === 'null') {
      return null;
    }

    let doc;
    try {
      doc = JSON.parse(output);
    } catch {
      log.debug('Trafilatura output was not valid JSON', { url });
      return null;
    }
    if (typeof doc !== 'object' || doc === null) {
      return null;
    }

    const body = stringOr(doc.text, '');
    if (body.length < MIN_CONTENT_THRESHOLD) {
      return null;
    }

    return {
      title: stringOr(doc.title, ''),
      markdown: body,
      metadata: {
        author: stringOr(doc.author, undefined),
        date: stringOr(doc.date, undefined),
      },
      links: [],
      images: [],
      extractor: 'trafilatura',
    };
  } catch (err) {
    // Subprocess failures (spawn error, non-zero exit, timeout) end up here.
    log.debug('Trafilatura extraction failed', { url, error: String(err) });
    return null;
  }
}
|
|
105
|
+
//# sourceMappingURL=trafilatura.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"trafilatura.js","sourceRoot":"","sources":["../../src/extraction/trafilatura.ts"],"names":[],"mappings":"AAAA,gCAAgC;AAChC,OAAO,EAAE,KAAK,EAAE,QAAQ,IAAI,UAAU,EAAE,MAAM,oBAAoB,CAAC;AACnE,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAEtC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAE5C,MAAM,aAAa,GAAG,SAAS,CAAC,UAAU,CAAC,CAAC;AAC5C,MAAM,GAAG,GAAG,YAAY,CAAC,SAAS,CAAC,CAAC;AAEpC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,qBAAqB,GAAG,KAAK,CAAC;AACpC,MAAM,6BAA6B,GAAG,IAAI,CAAC;AAE3C,MAAM,kBAAkB,GAAG;;;;;;CAM1B,CAAC,IAAI,EAAE,CAAC;AAET,IAAI,cAAc,GAAmB,IAAI,CAAC;AAE1C,MAAM,CAAC,KAAK,UAAU,sBAAsB;IAC1C,IAAI,cAAc,KAAK,IAAI;QAAE,OAAO,cAAc,CAAC;IACnD,IAAI,CAAC;QACH,MAAM,aAAa,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,oBAAoB,CAAC,EAAE;YAC3D,OAAO,EAAE,6BAA6B;SACvC,CAAC,CAAC;QACH,cAAc,GAAG,IAAI,CAAC;IACxB,CAAC;IAAC,MAAM,CAAC;QACP,cAAc,GAAG,KAAK,CAAC;IACzB,CAAC;IACD,OAAO,cAAc,CAAC;AACxB,CAAC;AAED,MAAM,UAAU,sBAAsB;IACpC,cAAc,GAAG,IAAI,CAAC;AACxB,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,MAAc,EACd,KAAa,EACb,SAAiB;IAEjB,MAAM,OAAO,GAAG,EAAE,OAAO,EAAE,IAAuC,EAAE,CAAC;IAErE,MAAM,WAAW,GAAG,IAAI,OAAO,CAAS,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QAC1D,MAAM,IAAI,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC,CAAC;QACtE,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC;QACvB,IAAI,MAAM,GAAG,EAAE,CAAC;QAChB,IAAI,MAAM,GAAG,EAAE,CAAC;QAEhB,IAAI,CAAC,MAAO,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAS,EAAE,EAAE;YACpC,MAAM,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC;QACzB,CAAC,CAAC,CAAC;QACH,IAAI,CAAC,MAAO,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAS,EAAE,EAAE;YACpC,MAAM,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC;QACzB,CAAC,CAAC,CAAC;QACH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE;YAChC,IAAI,MAAM,EAAE,CAAC;gBACX,MAAM,CAAC,IAAI,KAAK,CAAC,2BAA2B,MAAM,KAAK,MAAM,EAAE,CAAC,CAAC,CAAC;YACpE,CAAC;iBAAM,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;gBACtB,OAAO,CAAC,MAAM,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,KAAK,CAAC,iBAAiB,IAAI,KAAK,MAAM,EAAE,CAAC,CAAC,CAAC;YACxD,CAAC;QACH,CAAC,CAAC,CAAC;QACH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;QAEzB,IAAI,CAAC,KA
AM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACzB,IAAI,CAAC,KAAM,CAAC,GAAG,EAAE,CAAC;IACpB,CAAC,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAI,OAAO,CAAS,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE,CACvD,UAAU,CAAC,GAAG,EAAE;QACd,OAAO,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC;QACxB,MAAM,CAAC,IAAI,KAAK,CAAC,0BAA0B,SAAS,IAAI,CAAC,CAAC,CAAC;IAC7D,CAAC,EAAE,SAAS,CAAC,CACd,CAAC;IAEF,OAAO,OAAO,CAAC,IAAI,CAAC,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC,CAAC;AACrD,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,IAAY,EACZ,GAAW;IAEX,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,kBAAkB,CACrC,kBAAkB,EAClB,IAAI,EACJ,qBAAqB,CACtB,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;QAC9B,IAAI,CAAC,OAAO,IAAI,OAAO,KAAK,MAAM;YAAE,OAAO,IAAI,CAAC;QAEhD,IAAI,MAA+B,CAAC;QACpC,IAAI,CAAC;YACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAC/B,CAAC;QAAC,MAAM,CAAC;YACP,GAAG,CAAC,KAAK,CAAC,uCAAuC,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YAC5D,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,CAAC,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ;YAAE,OAAO,IAAI,CAAC;QAEvD,MAAM,IAAI,GAAG,OAAO,MAAM,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;QAChE,IAAI,IAAI,CAAC,MAAM,GAAG,qBAAqB;YAAE,OAAO,IAAI,CAAC;QAErD,OAAO;YACL,KAAK,EAAE,OAAO,MAAM,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE;YAC3D,QAAQ,EAAE,IAAI;YACd,QAAQ,EAAE;gBACR,MAAM,EAAE,OAAO,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS;gBACrE,IAAI,EAAE,OAAO,MAAM,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS;aAChE;YACD,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,aAAa;SACzB,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,KAAK,CAAC,+BAA+B,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACxE,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { CDPSession } from '../types.js';
|
|
2
|
+
/**
 * Authentication material for a browser session, as produced by
 * `getAuthOptions()`. That function sets one of the two fields.
 */
export interface AuthOptions {
  /** Path to a saved auth/storage state file. */
  storageStatePath?: string;
  /** Path to a browser user-data (profile) directory. */
  userDataDir?: string;
}
|
|
6
|
+
/**
 * Derives browser auth options from the configuration: the auth state file when `authStatePath` is set
 * (throws if that file does not exist), otherwise a temporary copy of `chromeProfilePath` when set,
 * otherwise null.
 */
export declare function getAuthOptions(): AuthOptions | null;
|
|
7
|
+
/** Lists CDP sessions; the current implementation always resolves to an empty array. */
export declare function listSessions(): Promise<CDPSession[]>;
|
|
8
|
+
//# sourceMappingURL=auth.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"auth.d.ts","sourceRoot":"","sources":["../../src/fetch/auth.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAE9C,MAAM,WAAW,WAAW;IAC1B,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,wBAAgB,cAAc,IAAI,WAAW,GAAG,IAAI,CAyBnD;AAED,wBAAsB,YAAY,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC,CAE1D"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { existsSync, cpSync, lstatSync, mkdtempSync } from 'node:fs';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
import { tmpdir } from 'node:os';
|
|
4
|
+
import { getConfig } from '../config.js';
|
|
5
|
+
import { createLogger } from '../logger.js';
|
|
6
|
+
/**
 * Reports whether a directory entry exists at `path` without following
 * symlinks, so a dangling symlink still counts as present.
 */
function pathEntryExists(path) {
  try {
    lstatSync(path);
    return true;
  } catch {
    return false;
  }
}

/**
 * Derives browser authentication options from the configuration.
 *
 * Precedence:
 *  1. `config.authStatePath`: returned as `storageStatePath`.
 *  2. `config.chromeProfilePath`: the profile is copied into a fresh
 *     temporary directory, which is returned as `userDataDir`.
 *  3. Neither set: null.
 *
 * NOTE: every call that takes the profile branch creates a new temporary
 * directory, and this function does not remove it.
 *
 * @returns Auth options, or null when no auth source is configured.
 * @throws Error when `authStatePath` is configured but the file is missing;
 *         file-system errors from copying the profile also propagate.
 */
export function getAuthOptions() {
  const config = getConfig();
  const logger = createLogger('fetch');
  if (config.authStatePath) {
    if (!existsSync(config.authStatePath)) {
      throw new Error(`Auth state file not found: ${config.authStatePath}`);
    }
    return { storageStatePath: config.authStatePath };
  }
  if (config.chromeProfilePath) {
    const profileDir = config.chromeProfilePath;
    const lockFile = join(profileDir, 'SingletonLock');
    // On Linux/macOS Chrome's SingletonLock is a symlink whose target is not a
    // real file, so existsSync (which follows symlinks) reports it as missing.
    // lstat-based detection sees the link itself.
    if (pathEntryExists(lockFile)) {
      logger.warn('Chrome appears to be running (SingletonLock found) — close Chrome before using its profile', {
        profilePath: profileDir,
      });
    }
    const tempDir = mkdtempSync(join(tmpdir(), 'wigolo-chrome-'));
    // Skip Chrome's Singleton* artefacts (SingletonLock, SingletonCookie,
    // SingletonSocket): copied into the temp profile they can make the copy
    // look like it is already in use by another browser process.
    const singletonEntry = /(?:^|[\\/])Singleton[^\\/]*$/;
    cpSync(profileDir, tempDir, {
      recursive: true,
      filter: (src) => src === profileDir || !singletonEntry.test(src),
    });
    logger.debug('copied Chrome profile to temp directory', { from: profileDir, to: tempDir });
    return { userDataDir: tempDir };
  }
  return null;
}
|
|
29
|
+
export async function listSessions() {
|
|
30
|
+
return [];
|
|
31
|
+
}
|
|
32
|
+
//# sourceMappingURL=auth.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"auth.js","sourceRoot":"","sources":["../../src/fetch/auth.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAC1D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAQ5C,MAAM,UAAU,cAAc;IAC5B,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;IAC3B,MAAM,MAAM,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;IAErC,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;QACzB,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,aAAa,CAAC,EAAE,CAAC;YACtC,MAAM,IAAI,KAAK,CAAC,8BAA8B,MAAM,CAAC,aAAa,EAAE,CAAC,CAAC;QACxE,CAAC;QACD,OAAO,EAAE,gBAAgB,EAAE,MAAM,CAAC,aAAa,EAAE,CAAC;IACpD,CAAC;IAED,IAAI,MAAM,CAAC,iBAAiB,EAAE,CAAC;QAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,iBAAiB,EAAE,eAAe,CAAC,CAAC;QACjE,IAAI,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YACzB,MAAM,CAAC,IAAI,CAAC,4FAA4F,EAAE;gBACxG,WAAW,EAAE,MAAM,CAAC,iBAAiB;aACtC,CAAC,CAAC;QACL,CAAC;QACD,MAAM,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,gBAAgB,CAAC,CAAC,CAAC;QAC9D,MAAM,CAAC,MAAM,CAAC,iBAAiB,EAAE,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/D,MAAM,CAAC,KAAK,CAAC,yCAAyC,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,iBAAiB,EAAE,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;QACzG,OAAO,EAAE,WAAW,EAAE,OAAO,EAAE,CAAC;IAClC,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY;IAChC,OAAO,EAAE,CAAC;AACZ,CAAC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { type BrowserContext } from 'playwright';
|
|
2
|
+
import type { RawFetchResult, BrowserType } from '../types.js';
|
|
3
|
+
/**
 * Per-request options accepted by `BrowserPool.fetchWithBrowser`.
 * NOTE(review): the consuming implementation is not visible here; the field
 * notes below are inferred from names and types and should be confirmed.
 */
export interface BrowserFetchOptions {
  /** Timeout in milliseconds; presumably applied to the page load. */
  timeoutMs?: number;
  /** Path to a saved storage-state file (same field as in `AuthOptions`). */
  storageStatePath?: string;
  /** Path to a browser user-data directory (same field as in `AuthOptions`). */
  userDataDir?: string;
  /** Header name/value pairs; presumably sent with the request. */
  headers?: Record<string, string>;
  /** Presumably requests a screenshot of the page when true. */
  screenshot?: boolean;
}
|
|
10
|
+
/** Construction options for `BrowserPool`. */
export interface BrowserPoolOptions {
  /** Browser engine to launch; the pool falls back to 'chromium' when omitted. */
  browserType?: BrowserType;
}
|
|
13
|
+
/**
 * Pool of Playwright browser contexts backed by a single, lazily launched
 * headless browser.
 */
export declare class BrowserPool {
  /** The launched browser, or null until the first launch. */
  private browser;
  /** Idle contexts available for reuse. */
  private pool;
  /** Number of contexts counted against the configured `maxBrowsers` limit. */
  private activeCount;
  /** Resolvers of `acquire()` calls waiting for a released context. */
  private waitQueue;
  /** Idle-eviction timers keyed by pooled context. */
  private idleTimers;
  /** NOTE(review): presumably set by `shutdown()`; confirm in the implementation. */
  private shutdownCalled;
  /** Engine to launch ('chromium' unless overridden in the constructor options). */
  private readonly browserType;
  constructor(options?: BrowserPoolOptions);
  /** Launches the browser on first use and returns the shared instance. */
  private launchBrowser;
  /**
   * Obtains a context: reuses an idle one (cancelling its idle timer), else
   * creates a new one while below `maxBrowsers`, else waits until another
   * caller releases a context.
   */
  acquire(): Promise<BrowserContext>;
  /**
   * Returns a context: it is handed to the longest-waiting `acquire()` caller
   * if any, otherwise parked in the pool with an idle timer that later closes it.
   */
  release(ctx: BrowserContext): void;
  /** Fetches `url` with a browser. NOTE(review): implementation not visible here. */
  fetchWithBrowser(url: string, options?: BrowserFetchOptions): Promise<RawFetchResult>;
  /** Shuts the pool down. NOTE(review): implementation not visible here. */
  shutdown(): Promise<void>;
}
|
|
28
|
+
//# sourceMappingURL=browser-pool.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser-pool.d.ts","sourceRoot":"","sources":["../../src/fetch/browser-pool.ts"],"names":[],"mappings":"AAAA,OAAO,EAA2C,KAAK,cAAc,EAAE,MAAM,YAAY,CAAC;AAG1F,OAAO,KAAK,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAE/D,MAAM,WAAW,mBAAmB;IAClC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,UAAU,CAAC,EAAE,OAAO,CAAC;CACtB;AAED,MAAM,WAAW,kBAAkB;IACjC,WAAW,CAAC,EAAE,WAAW,CAAC;CAC3B;AAED,qBAAa,WAAW;IACtB,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,IAAI,CAAwB;IACpC,OAAO,CAAC,WAAW,CAAK;IACxB,OAAO,CAAC,SAAS,CAA4C;IAC7D,OAAO,CAAC,UAAU,CAA4D;IAC9E,OAAO,CAAC,cAAc,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAc;gBAE9B,OAAO,CAAC,EAAE,kBAAkB;YAI1B,aAAa;IAUrB,OAAO,IAAI,OAAO,CAAC,cAAc,CAAC;IAyBxC,OAAO,CAAC,GAAG,EAAE,cAAc,GAAG,IAAI;IAyB5B,gBAAgB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,mBAAwB,GAAG,OAAO,CAAC,cAAc,CAAC;IA+DzF,QAAQ,IAAI,OAAO,CAAC,IAAI,CAAC;CAoBhC"}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import { chromium, firefox, webkit } from 'playwright';
|
|
2
|
+
import { getConfig } from '../config.js';
|
|
3
|
+
import { createLogger } from '../logger.js';
|
|
4
|
+
/**
 * Pool of Playwright browser contexts backed by a single, lazily launched
 * browser process.
 *
 * At most `getConfig().maxBrowsers` contexts exist at a time; additional
 * `acquire()` callers wait until a context is released. Contexts that stay idle
 * for `getConfig().browserIdleTimeoutMs` are closed.
 */
export class BrowserPool {
    /** Shared browser instance; null until the first launch and after shutdown(). */
    browser = null;
    /** Launch currently in flight, shared so concurrent callers start only one browser. */
    launchPromise = null;
    /** Idle contexts ready for reuse (most recently released last). */
    pool = [];
    /** Number of live contexts, whether checked out or idle in the pool. */
    activeCount = 0;
    /** Resolvers of acquire() calls waiting for a context, oldest first. */
    waitQueue = [];
    /** Idle-close timer for each pooled context. */
    idleTimers = new Map();
    /** Set by the first shutdown() call so that repeated calls are no-ops. */
    shutdownCalled = false;
    /** Engine name chosen at construction ('chromium' unless overridden). */
    browserType;

    /**
     * @param options Optional settings; `options.browserType` selects the engine.
     */
    constructor(options) {
        this.browserType = options?.browserType ?? 'chromium';
    }

    /**
     * Returns the shared browser, launching it headless on first use.
     *
     * Concurrent callers share one in-flight launch; without that, every caller
     * arriving before the first launch finished would start its own browser and
     * all but the last one would never be closed.
     */
    async launchBrowser() {
        if (this.browser) {
            return this.browser;
        }
        if (!this.launchPromise) {
            const launcher = this.browserType === 'firefox' ? firefox
                : this.browserType === 'webkit' ? webkit
                    : chromium;
            this.launchPromise = launcher
                .launch({ headless: true })
                .then((browser) => {
                    this.browser = browser;
                    return browser;
                })
                .finally(() => {
                    // Allow a later retry after a failed launch, and a relaunch after shutdown().
                    this.launchPromise = null;
                });
        }
        return this.launchPromise;
    }

    /**
     * Obtains a browser context.
     *
     * Order of preference: an idle pooled context, a newly created context while
     * fewer than `maxBrowsers` are live, otherwise the next context handed to
     * release().
     *
     * @returns Promise resolving to a browser context owned by the caller until released.
     * @throws Whatever the browser launch or context creation throws; the reserved
     *   slot is given back in that case.
     */
    async acquire() {
        const config = getConfig();
        const maxBrowsers = config.maxBrowsers;
        if (this.pool.length > 0) {
            const ctx = this.pool.pop();
            const timer = this.idleTimers.get(ctx);
            if (timer !== undefined) {
                clearTimeout(timer);
                this.idleTimers.delete(ctx);
            }
            return ctx;
        }
        if (this.activeCount < maxBrowsers) {
            // Reserve the slot before awaiting so concurrent callers cannot exceed the limit.
            this.activeCount++;
            try {
                const browser = await this.launchBrowser();
                return await browser.newContext();
            }
            catch (err) {
                // No context was created: return the slot, otherwise capacity shrinks for good.
                this.activeCount = Math.max(0, this.activeCount - 1);
                throw err;
            }
        }
        return new Promise((resolve) => {
            this.waitQueue.push(resolve);
        });
    }

    /**
     * Gives a context back to the pool.
     *
     * A waiting acquire() caller receives it immediately. Otherwise it is parked
     * in the pool with a timer that closes it after `browserIdleTimeoutMs`.
     *
     * @param ctx Context previously obtained from acquire().
     */
    release(ctx) {
        const config = getConfig();
        const idleTimeoutMs = config.browserIdleTimeoutMs;
        if (this.waitQueue.length > 0) {
            const resolve = this.waitQueue.shift();
            resolve(ctx);
            return;
        }
        this.pool.push(ctx);
        const timer = setTimeout(() => {
            const idx = this.pool.indexOf(ctx);
            // Only close the context if it is still idle (it may have been re-acquired).
            if (idx !== -1) {
                this.pool.splice(idx, 1);
                this.idleTimers.delete(ctx);
                this.activeCount = Math.max(0, this.activeCount - 1);
                ctx.close().catch(() => { });
            }
        }, idleTimeoutMs);
        this.idleTimers.set(ctx, timer);
    }

    /**
     * Loads `url` in a pooled browser context and returns the rendered document.
     *
     * @param url Address to navigate to.
     * @param options Optional `timeoutMs` (navigation timeout override),
     *   `headers` (extra request headers) and `screenshot` (capture a full-page
     *   screenshot as base64).
     * @returns Object with `url`, `finalUrl`, `html`, `contentType`, `statusCode`,
     *   `method: 'playwright'`, `headers` and `screenshot`.
     * @throws Navigation and page errors; the context is released in every case.
     */
    async fetchWithBrowser(url, options = {}) {
        const config = getConfig();
        const logger = createLogger('fetch');
        const navTimeoutMs = options.timeoutMs ?? config.playwrightNavTimeoutMs;
        const loadTimeoutMs = config.playwrightLoadTimeoutMs;
        const ctx = await this.acquire();
        let page;
        try {
            // Page setup lives inside the try so a failure here still releases the context.
            page = await ctx.newPage();
            if (options.headers) {
                await page.setExtraHTTPHeaders(options.headers);
            }
            let statusCode = 200;
            let contentType = '';
            let responseHeaders = {};
            let finalUrl = url;
            const response = await page.goto(url, {
                timeout: navTimeoutMs,
                waitUntil: 'domcontentloaded',
            });
            if (response) {
                statusCode = response.status();
                finalUrl = response.url();
                const rawHeaders = response.headers();
                responseHeaders = rawHeaders;
                contentType = rawHeaders['content-type'] ?? '';
            }
            try {
                await page.waitForLoadState('networkidle', { timeout: loadTimeoutMs });
            }
            catch {
                // networkidle timeout is non-fatal — page content is still usable
                logger.debug('networkidle timeout, using page content as-is', { url });
            }
            const html = await page.content();
            let screenshotBase64;
            if (options.screenshot) {
                const buf = await page.screenshot({ fullPage: true });
                screenshotBase64 = buf.toString('base64');
            }
            return {
                url,
                finalUrl,
                html,
                contentType,
                statusCode,
                method: 'playwright',
                headers: responseHeaders,
                screenshot: screenshotBase64,
            };
        }
        finally {
            try {
                if (page) {
                    await page.close();
                }
            }
            finally {
                // Release even when closing the page throws, so the pool slot is never lost.
                this.release(ctx);
            }
        }
    }

    /**
     * Cancels all idle timers, closes every pooled context and the browser, and
     * resets the live-context count. Only the first call does any work.
     */
    async shutdown() {
        if (this.shutdownCalled)
            return;
        this.shutdownCalled = true;
        for (const [, timer] of this.idleTimers) {
            clearTimeout(timer);
        }
        this.idleTimers.clear();
        const closePromises = this.pool.map((ctx) => ctx.close().catch(() => { }));
        this.pool = [];
        await Promise.all(closePromises);
        if (this.browser) {
            await this.browser.close().catch(() => { });
            this.browser = null;
        }
        this.activeCount = 0;
    }
}
|
|
138
|
+
//# sourceMappingURL=browser-pool.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser-pool.js","sourceRoot":"","sources":["../../src/fetch/browser-pool.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,EAAqC,MAAM,YAAY,CAAC;AAC1F,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAe5C,MAAM,OAAO,WAAW;IACd,OAAO,GAAmB,IAAI,CAAC;IAC/B,IAAI,GAAqB,EAAE,CAAC;IAC5B,WAAW,GAAG,CAAC,CAAC;IAChB,SAAS,GAAyC,EAAE,CAAC;IACrD,UAAU,GAAG,IAAI,GAAG,EAAiD,CAAC;IACtE,cAAc,GAAG,KAAK,CAAC;IACd,WAAW,CAAc;IAE1C,YAAY,OAA4B;QACtC,IAAI,CAAC,WAAW,GAAG,OAAO,EAAE,WAAW,IAAI,UAAU,CAAC;IACxD,CAAC;IAEO,KAAK,CAAC,aAAa;QACzB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,KAAK,SAAS,CAAC,CAAC,CAAC,OAAO;gBACvD,CAAC,CAAC,IAAI,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM;oBACxC,CAAC,CAAC,QAAQ,CAAC;YACb,IAAI,CAAC,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QAC3D,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED,KAAK,CAAC,OAAO;QACX,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,CAAC;QAEvC,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzB,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAG,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACvC,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;gBACxB,YAAY,CAAC,KAAK,CAAC,CAAC;gBACpB,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAC9B,CAAC;YACD,OAAO,GAAG,CAAC;QACb,CAAC;QAED,IAAI,IAAI,CAAC,WAAW,GAAG,WAAW,EAAE,CAAC;YACnC,IAAI,CAAC,WAAW,EAAE,CAAC;YACnB,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,aAAa,EAAE,CAAC;YAC3C,OAAO,OAAO,CAAC,UAAU,EAAE,CAAC;QAC9B,CAAC;QAED,OAAO,IAAI,OAAO,CAAiB,CAAC,OAAO,EAAE,EAAE;YAC7C,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;IACL,CAAC;IAED,OAAO,CAAC,GAAmB;QACzB,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;QAC3B,MAAM,aAAa,GAAG,MAAM,CAAC,oBAAoB,CAAC;QAElD,IAAI,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,EAAG,CAAC;YACxC,OAAO,CAAC,GAAG,CAAC,CAAC;YACb,OAAO;QACT,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAEpB,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;Y
AC5B,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YACnC,IAAI,GAAG,KAAK,CAAC,CAAC,EAAE,CAAC;gBACf,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;gBACzB,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;gBAC5B,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,GAAG,CAAC,CAAC,CAAC;gBACrD,GAAG,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC,EAAE,aAAa,CAAC,CAAC;QAElB,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;IAClC,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,GAAW,EAAE,UAA+B,EAAE;QACnE,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;QACrC,MAAM,YAAY,GAAG,OAAO,CAAC,SAAS,IAAI,MAAM,CAAC,sBAAsB,CAAC;QACxE,MAAM,aAAa,GAAG,MAAM,CAAC,uBAAuB,CAAC;QAErD,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;QACjC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,EAAE,CAAC;QAEjC,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;YACpB,MAAM,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAClD,CAAC;QAED,IAAI,UAAU,GAAG,GAAG,CAAC;QACrB,IAAI,WAAW,GAAG,EAAE,CAAC;QACrB,IAAI,eAAe,GAA2B,EAAE,CAAC;QACjD,IAAI,QAAQ,GAAG,GAAG,CAAC;QAEnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACpC,OAAO,EAAE,YAAY;gBACrB,SAAS,EAAE,kBAAkB;aAC9B,CAAC,CAAC;YAEH,IAAI,QAAQ,EAAE,CAAC;gBACb,UAAU,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC;gBAC/B,QAAQ,GAAG,QAAQ,CAAC,GAAG,EAAE,CAAC;gBAC1B,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC;gBACtC,eAAe,GAAG,UAAU,CAAC;gBAC7B,WAAW,GAAG,UAAU,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YACjD,CAAC;YAED,IAAI,CAAC;gBACH,MAAM,IAAI,CAAC,gBAAgB,CAAC,aAAa,EAAE,EAAE,OAAO,EAAE,aAAa,EAAE,CAAC,CAAC;YACzE,CAAC;YAAC,MAAM,CAAC;gBACP,kEAAkE;gBAClE,MAAM,CAAC,KAAK,CAAC,+CAA+C,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YACzE,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAElC,IAAI,gBAAoC,CAAC;YACzC,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;gBACvB,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;gBACtD,gBAAgB,GAAG,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;YAC5C,CAAC;YAED,OAAO;gBACL,GAAG;gBACH,QAAQ;gBACR,IAAI;gBACJ,WAAW;gBACX,UAAU;gBACV,MAAM,EAAE,YAAY;gBACpB,OAA
O,EAAE,eAAe;gBACxB,UAAU,EAAE,gBAAgB;aAC7B,CAAC;QACJ,CAAC;gBAAS,CAAC;YACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YACnB,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACpB,CAAC;IACH,CAAC;IAED,KAAK,CAAC,QAAQ;QACZ,IAAI,IAAI,CAAC,cAAc;YAAE,OAAO;QAChC,IAAI,CAAC,cAAc,GAAG,IAAI,CAAC;QAE3B,KAAK,MAAM,CAAC,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACxC,YAAY,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC;QAExB,MAAM,aAAa,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC,CAAC;QAC1E,IAAI,CAAC,IAAI,GAAG,EAAE,CAAC;QACf,MAAM,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;QAEjC,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAC3C,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QAED,IAAI,CAAC,WAAW,GAAG,CAAC,CAAC;IACvB,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-check.d.ts","sourceRoot":"","sources":["../../src/fetch/content-check.ts"],"names":[],"mappings":"AAsDA,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAUzD"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
// Pages whose visible text is shorter than this many characters are treated as empty.
const VISIBLE_TEXT_THRESHOLD = 200;
// Pages whose body is made up of inline <script> markup beyond this fraction are treated as empty.
const SCRIPT_RATIO_THRESHOLD = 0.8;
|
|
3
|
+
/**
 * Removes every `<script>` and `<style>` element, including its contents.
 *
 * End tags are matched the way browsers parse them: `</script >`,
 * `</style\n>` and similar forms with whitespace (or stray characters) before
 * the closing `>` also terminate the element. Matching only the exact text
 * `</script>` would leave such elements in place and let their source code be
 * counted as page text.
 *
 * @param html Raw HTML document or fragment.
 * @returns The HTML with script and style elements removed.
 */
function stripScriptsAndStyles(html) {
    return html
        .replace(/<script\b[\s\S]*?<\/script\b[^>]*>/gi, '')
        .replace(/<style\b[\s\S]*?<\/style\b[^>]*>/gi, '');
}
|
|
8
|
+
/**
 * Approximates the text a reader would see on the page.
 *
 * Script and style elements are dropped, then HTML comments, then all
 * remaining tags; whitespace runs are collapsed to single spaces. Comments are
 * removed as a whole before tag stripping because a comment that contains `>`
 * (for example an IE conditional comment wrapping markup) would otherwise be
 * cut at the first `>` and leave its remaining text in the result.
 *
 * @param html Raw HTML document or fragment.
 * @returns Trimmed, whitespace-normalised text content.
 */
function extractVisibleText(html) {
    return stripScriptsAndStyles(html)
        .replace(/<!--[\s\S]*?-->/g, ' ')
        .replace(/<[^>]+>/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
}
|
|
13
|
+
/**
 * Detects an empty single-page-app mount point: a `<div>` whose `id` is
 * `root`, `app` or `__next` and that has no content besides whitespace.
 *
 * The id may be double-quoted, single-quoted or unquoted (minifiers commonly
 * drop attribute quotes). The attribute name must be preceded by whitespace so
 * that attributes merely ending in "id", such as `data-id`, do not match.
 *
 * @param html Raw HTML document or fragment.
 * @returns `true` when an empty mount-point div is present.
 */
function hasSpaShellIndicator(html) {
    const emptyMountPoint = /<div\b[^>]*\sid=(?:["'](?:root|app|__next)["']|(?:root|app|__next)(?=[\s>]))[^>]*>\s*<\/div>/i;
    return emptyMountPoint.test(html);
}
|
|
21
|
+
/**
 * Detects a Next.js page that ships its data in a `__NEXT_DATA__` payload but
 * renders little visible text.
 *
 * Uses the shared `extractVisibleText` helper instead of repeating the
 * strip-and-collapse steps inline, so both checks always measure visible text
 * the same way.
 *
 * @param html Raw HTML document or fragment.
 * @returns `true` when `__NEXT_DATA__` occurs in the HTML and the visible text
 *   is shorter than `VISIBLE_TEXT_THRESHOLD` characters.
 */
function hasNextData(html) {
    if (!html.includes('__NEXT_DATA__')) {
        return false;
    }
    return extractVisibleText(html).length < VISIBLE_TEXT_THRESHOLD;
}
|
|
28
|
+
/**
 * Checks whether any `<noscript>` element carries a message mentioning
 * "javascript" or "enable" (case-insensitive), which suggests the page needs
 * scripting to show its content.
 *
 * @param html Raw HTML document or fragment.
 * @returns `true` when at least one noscript element contains such a message.
 */
function hasNoscriptRequired(html) {
    const noscriptBlocks = html.match(/<noscript[^>]*>([\s\S]*?)<\/noscript>/gi) ?? [];
    for (const block of noscriptBlocks) {
        // Drop the noscript tags themselves and any nested markup before inspecting the text.
        const message = block.replace(/<[^>]+>/g, '').toLowerCase();
        if (message.includes('javascript') || message.includes('enable')) {
            return true;
        }
    }
    return false;
}
|
|
37
|
+
/**
 * Reports whether `<script>` elements make up more than
 * `SCRIPT_RATIO_THRESHOLD` of the document body, measured in characters.
 *
 * The contents of the first `<body>` element are examined; when no body
 * element is found the whole input is used instead.
 *
 * @param html Raw HTML document or fragment.
 * @returns `true` when the script share exceeds the threshold; `false` for an
 *   empty body.
 */
function hasHighScriptRatio(html) {
    const [, innerBody] = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i) ?? [];
    const scope = innerBody ?? html;
    if (scope.length === 0) {
        return false;
    }
    const scriptTags = scope.match(/<script[\s\S]*?<\/script>/gi) ?? [];
    const scriptChars = scriptTags.reduce((sum, tag) => sum + tag.length, 0);
    return scriptChars / scope.length > SCRIPT_RATIO_THRESHOLD;
}
|
|
48
|
+
/**
 * Heuristically decides whether fetched HTML lacks real content.
 *
 * The page counts as empty when its visible text is shorter than
 * `VISIBLE_TEXT_THRESHOLD` characters or when any of the following checks
 * fires, evaluated in this order and stopping at the first hit: empty SPA
 * mount point, `__NEXT_DATA__` with little text, a noscript message asking for
 * JavaScript, or a script-dominated body.
 *
 * @param html Raw HTML document.
 * @returns `true` when the page looks empty.
 */
export function contentAppearsEmpty(html) {
    const tooLittleText = extractVisibleText(html).length < VISIBLE_TEXT_THRESHOLD;
    return (tooLittleText ||
        hasSpaShellIndicator(html) ||
        hasNextData(html) ||
        hasNoscriptRequired(html) ||
        hasHighScriptRatio(html));
}
|
|
62
|
+
//# sourceMappingURL=content-check.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-check.js","sourceRoot":"","sources":["../../src/fetch/content-check.ts"],"names":[],"mappings":"AAAA,MAAM,sBAAsB,GAAG,GAAG,CAAC;AACnC,MAAM,sBAAsB,GAAG,GAAG,CAAC;AAEnC,SAAS,qBAAqB,CAAC,IAAY;IACzC,OAAO,IAAI;SACR,OAAO,CAAC,6BAA6B,EAAE,EAAE,CAAC;SAC1C,OAAO,CAAC,2BAA2B,EAAE,EAAE,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,kBAAkB,CAAC,IAAY;IACtC,MAAM,QAAQ,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAC;IAC7C,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC;IACjD,OAAO,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AAC5C,CAAC;AAED,SAAS,oBAAoB,CAAC,IAAY;IACxC,MAAM,WAAW,GAAG;QAClB,2CAA2C;QAC3C,0CAA0C;QAC1C,6CAA6C;KAC9C,CAAC;IACF,OAAO,WAAW,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;AAC3D,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IAC9C,MAAM,cAAc,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAC;IACnD,MAAM,WAAW,GAAG,cAAc,CAAC,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IACxF,OAAO,WAAW,CAAC,MAAM,GAAG,sBAAsB,CAAC;AACrD,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAY;IACvC,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,yCAAyC,CAAC,CAAC;IAC9E,IAAI,CAAC,eAAe;QAAE,OAAO,KAAK,CAAC;IACnC,OAAO,eAAe,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,EAAE;QAClC,MAAM,KAAK,GAAG,GAAG,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;QACxD,OAAO,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,kBAAkB,CAAC,IAAY;IACtC,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;IAC/D,MAAM,WAAW,GAAG,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAEpD,MAAM,aAAa,GAAG,WAAW,CAAC,KAAK,CAAC,6BAA6B,CAAC,IAAI,EAAE,CAAC;IAC7E,MAAM,UAAU,GAAG,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAE1C,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,CAAC;IACpC,MAAM,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC;IAEpC,IAAI,QAAQ,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACjC,OAAO,SAAS,GAAG,QAAQ,GAAG,sBAAsB,CAAC;AACvD,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,IAAY;IAC9C,MAAM,WAAW,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAC;IAC7C,IAAI,W
AAW,CAAC,MAAM,GAAG,sBAAsB;QAAE,OAAO,IAAI,CAAC;IAE7D,IAAI,oBAAoB,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5C,IAAI,WAAW,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IACnC,IAAI,mBAAmB,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAC3C,IAAI,kBAAkB,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAE1C,OAAO,KAAK,CAAC;AACf,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
 * Optional settings accepted by `httpFetch`.
 */
export interface HttpFetchOptions {
    /** Header name/value pairs for the request (presumably merged with defaults; confirm in http-client). */
    headers?: Record<string, string>;
    /** Timeout in milliseconds, as the name indicates (the fallback value is not visible here). */
    timeoutMs?: number;
}
|
|
5
|
+
/**
 * Outcome of an `httpFetch` call.
 *
 * NOTE(review): field meanings below are inferred from the names; confirm
 * against the http-client implementation.
 */
export interface HttpFetchResult {
    /** URL that was requested. */
    url: string;
    /** URL the response was ultimately served from (presumably after redirects). */
    finalUrl: string;
    /** Response body as text. */
    html: string;
    /** Content type reported for the response. */
    contentType: string;
    /** HTTP status code of the response. */
    statusCode: number;
    /** Response headers as name/value pairs. */
    headers: Record<string, string>;
    /** Raw response bytes, when provided (presumably for non-text payloads; confirm). */
    rawBuffer?: Buffer;
}
|
|
14
|
+
/**
 * Fetches `url` over plain HTTP (no browser involved, judging by the module name).
 *
 * @param url - Address to request.
 * @param options - Optional request headers and timeout.
 * @returns Promise resolving to the response body and metadata.
 */
export declare function httpFetch(
    url: string,
    options?: HttpFetchOptions,
): Promise<HttpFetchResult>;
|
|
15
|
+
//# sourceMappingURL=http-client.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"http-client.d.ts","sourceRoot":"","sources":["../../src/fetch/http-client.ts"],"names":[],"mappings":"AAGA,MAAM,WAAW,gBAAgB;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAmCD,wBAAsB,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,gBAAqB,GAAG,OAAO,CAAC,eAAe,CAAC,CAyCrG"}
|