@staticn0va/wigolo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +74 -0
- package/README.md +272 -0
- package/dist/cache/db.d.ts +5 -0
- package/dist/cache/db.d.ts.map +1 -0
- package/dist/cache/db.js +97 -0
- package/dist/cache/db.js.map +1 -0
- package/dist/cache/store.d.ts +26 -0
- package/dist/cache/store.d.ts.map +1 -0
- package/dist/cache/store.js +214 -0
- package/dist/cache/store.js.map +1 -0
- package/dist/cli/daemon.d.ts +2 -0
- package/dist/cli/daemon.d.ts.map +1 -0
- package/dist/cli/daemon.js +5 -0
- package/dist/cli/daemon.js.map +1 -0
- package/dist/cli/health.d.ts +2 -0
- package/dist/cli/health.d.ts.map +1 -0
- package/dist/cli/health.js +5 -0
- package/dist/cli/health.js.map +1 -0
- package/dist/cli/index.d.ts +7 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +9 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/warmup.d.ts +11 -0
- package/dist/cli/warmup.d.ts.map +1 -0
- package/dist/cli/warmup.js +107 -0
- package/dist/cli/warmup.js.map +1 -0
- package/dist/config.d.ts +41 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +66 -0
- package/dist/config.js.map +1 -0
- package/dist/crawl/crawler.d.ts +18 -0
- package/dist/crawl/crawler.d.ts.map +1 -0
- package/dist/crawl/crawler.js +228 -0
- package/dist/crawl/crawler.js.map +1 -0
- package/dist/crawl/dedup.d.ts +15 -0
- package/dist/crawl/dedup.d.ts.map +1 -0
- package/dist/crawl/dedup.js +93 -0
- package/dist/crawl/dedup.js.map +1 -0
- package/dist/crawl/mapper.d.ts +17 -0
- package/dist/crawl/mapper.d.ts.map +1 -0
- package/dist/crawl/mapper.js +178 -0
- package/dist/crawl/mapper.js.map +1 -0
- package/dist/crawl/rate-limiter.d.ts +10 -0
- package/dist/crawl/rate-limiter.d.ts.map +1 -0
- package/dist/crawl/rate-limiter.js +72 -0
- package/dist/crawl/rate-limiter.js.map +1 -0
- package/dist/crawl/robots.d.ts +9 -0
- package/dist/crawl/robots.d.ts.map +1 -0
- package/dist/crawl/robots.js +63 -0
- package/dist/crawl/robots.js.map +1 -0
- package/dist/crawl/sitemap.d.ts +4 -0
- package/dist/crawl/sitemap.d.ts.map +1 -0
- package/dist/crawl/sitemap.js +38 -0
- package/dist/crawl/sitemap.js.map +1 -0
- package/dist/crawl/url-utils.d.ts +3 -0
- package/dist/crawl/url-utils.d.ts.map +1 -0
- package/dist/crawl/url-utils.js +41 -0
- package/dist/crawl/url-utils.js.map +1 -0
- package/dist/extraction/defuddle.d.ts +3 -0
- package/dist/extraction/defuddle.d.ts.map +1 -0
- package/dist/extraction/defuddle.js +26 -0
- package/dist/extraction/defuddle.js.map +1 -0
- package/dist/extraction/extract.d.ts +5 -0
- package/dist/extraction/extract.d.ts.map +1 -0
- package/dist/extraction/extract.js +83 -0
- package/dist/extraction/extract.js.map +1 -0
- package/dist/extraction/jsonld.d.ts +4 -0
- package/dist/extraction/jsonld.d.ts.map +1 -0
- package/dist/extraction/jsonld.js +64 -0
- package/dist/extraction/jsonld.js.map +1 -0
- package/dist/extraction/markdown.d.ts +10 -0
- package/dist/extraction/markdown.d.ts.map +1 -0
- package/dist/extraction/markdown.js +107 -0
- package/dist/extraction/markdown.js.map +1 -0
- package/dist/extraction/pipeline.d.ts +11 -0
- package/dist/extraction/pipeline.d.ts.map +1 -0
- package/dist/extraction/pipeline.js +95 -0
- package/dist/extraction/pipeline.js.map +1 -0
- package/dist/extraction/readability.d.ts +3 -0
- package/dist/extraction/readability.d.ts.map +1 -0
- package/dist/extraction/readability.js +32 -0
- package/dist/extraction/readability.js.map +1 -0
- package/dist/extraction/schema.d.ts +7 -0
- package/dist/extraction/schema.d.ts.map +1 -0
- package/dist/extraction/schema.js +86 -0
- package/dist/extraction/schema.js.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.js +104 -0
- package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
- package/dist/extraction/site-extractors/github.d.ts +3 -0
- package/dist/extraction/site-extractors/github.d.ts.map +1 -0
- package/dist/extraction/site-extractors/github.js +107 -0
- package/dist/extraction/site-extractors/github.js.map +1 -0
- package/dist/extraction/site-extractors/mdn.d.ts +3 -0
- package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
- package/dist/extraction/site-extractors/mdn.js +58 -0
- package/dist/extraction/site-extractors/mdn.js.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.js +88 -0
- package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
- package/dist/extraction/trafilatura.d.ts +6 -0
- package/dist/extraction/trafilatura.d.ts.map +1 -0
- package/dist/extraction/trafilatura.js +105 -0
- package/dist/extraction/trafilatura.js.map +1 -0
- package/dist/fetch/auth.d.ts +8 -0
- package/dist/fetch/auth.d.ts.map +1 -0
- package/dist/fetch/auth.js +32 -0
- package/dist/fetch/auth.js.map +1 -0
- package/dist/fetch/browser-pool.d.ts +28 -0
- package/dist/fetch/browser-pool.d.ts.map +1 -0
- package/dist/fetch/browser-pool.js +138 -0
- package/dist/fetch/browser-pool.js.map +1 -0
- package/dist/fetch/content-check.d.ts +2 -0
- package/dist/fetch/content-check.d.ts.map +1 -0
- package/dist/fetch/content-check.js +62 -0
- package/dist/fetch/content-check.js.map +1 -0
- package/dist/fetch/http-client.d.ts +15 -0
- package/dist/fetch/http-client.d.ts.map +1 -0
- package/dist/fetch/http-client.js +146 -0
- package/dist/fetch/http-client.js.map +1 -0
- package/dist/fetch/router.d.ts +45 -0
- package/dist/fetch/router.d.ts.map +1 -0
- package/dist/fetch/router.js +89 -0
- package/dist/fetch/router.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +10 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +39 -0
- package/dist/logger.js.map +1 -0
- package/dist/search/dedup.d.ts +10 -0
- package/dist/search/dedup.d.ts.map +1 -0
- package/dist/search/dedup.js +35 -0
- package/dist/search/dedup.js.map +1 -0
- package/dist/search/engines/bing.d.ts +7 -0
- package/dist/search/engines/bing.d.ts.map +1 -0
- package/dist/search/engines/bing.js +48 -0
- package/dist/search/engines/bing.js.map +1 -0
- package/dist/search/engines/duckduckgo.d.ts +7 -0
- package/dist/search/engines/duckduckgo.d.ts.map +1 -0
- package/dist/search/engines/duckduckgo.js +50 -0
- package/dist/search/engines/duckduckgo.js.map +1 -0
- package/dist/search/engines/startpage.d.ts +7 -0
- package/dist/search/engines/startpage.d.ts.map +1 -0
- package/dist/search/engines/startpage.js +50 -0
- package/dist/search/engines/startpage.js.map +1 -0
- package/dist/search/filters.d.ts +16 -0
- package/dist/search/filters.d.ts.map +1 -0
- package/dist/search/filters.js +63 -0
- package/dist/search/filters.js.map +1 -0
- package/dist/search/flashrank.d.ts +12 -0
- package/dist/search/flashrank.d.ts.map +1 -0
- package/dist/search/flashrank.js +63 -0
- package/dist/search/flashrank.js.map +1 -0
- package/dist/search/query.d.ts +2 -0
- package/dist/search/query.d.ts.map +1 -0
- package/dist/search/query.js +41 -0
- package/dist/search/query.js.map +1 -0
- package/dist/search/rerank.d.ts +3 -0
- package/dist/search/rerank.d.ts.map +1 -0
- package/dist/search/rerank.js +40 -0
- package/dist/search/rerank.js.map +1 -0
- package/dist/search/searxng.d.ts +8 -0
- package/dist/search/searxng.d.ts.map +1 -0
- package/dist/search/searxng.js +87 -0
- package/dist/search/searxng.js.map +1 -0
- package/dist/search/validator.d.ts +6 -0
- package/dist/search/validator.d.ts.map +1 -0
- package/dist/search/validator.js +35 -0
- package/dist/search/validator.js.map +1 -0
- package/dist/searxng/bootstrap.d.ts +18 -0
- package/dist/searxng/bootstrap.d.ts.map +1 -0
- package/dist/searxng/bootstrap.js +136 -0
- package/dist/searxng/bootstrap.js.map +1 -0
- package/dist/searxng/docker.d.ts +9 -0
- package/dist/searxng/docker.d.ts.map +1 -0
- package/dist/searxng/docker.js +67 -0
- package/dist/searxng/docker.js.map +1 -0
- package/dist/searxng/process.d.ts +23 -0
- package/dist/searxng/process.d.ts.map +1 -0
- package/dist/searxng/process.js +188 -0
- package/dist/searxng/process.js.map +1 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +311 -0
- package/dist/server.js.map +1 -0
- package/dist/tools/cache.d.ts +3 -0
- package/dist/tools/cache.d.ts.map +1 -0
- package/dist/tools/cache.js +50 -0
- package/dist/tools/cache.js.map +1 -0
- package/dist/tools/crawl.d.ts +6 -0
- package/dist/tools/crawl.d.ts.map +1 -0
- package/dist/tools/crawl.js +97 -0
- package/dist/tools/crawl.js.map +1 -0
- package/dist/tools/extract.d.ts +4 -0
- package/dist/tools/extract.d.ts.map +1 -0
- package/dist/tools/extract.js +69 -0
- package/dist/tools/extract.js.map +1 -0
- package/dist/tools/fetch.d.ts +4 -0
- package/dist/tools/fetch.d.ts.map +1 -0
- package/dist/tools/fetch.js +76 -0
- package/dist/tools/fetch.js.map +1 -0
- package/dist/tools/search.d.ts +4 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +160 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/types.d.ts +222 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +61 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ExtractionResult, Extractor } from '../types.js';
|
|
2
|
+
/** Optional settings accepted by `extractContent`. */
export interface ExtractionOptions {
    /** When set and the final markdown is longer than this, it is cut to this many characters. */
    maxChars?: number;
    /** When set, the markdown is replaced by the content `extractSection` returns for this section. */
    section?: string;
    /** Passed to `extractSection` together with `section`; treated as 0 when omitted. */
    sectionIndex?: number;
    /** The value `'application/pdf'` routes the call through the PDF branch instead of the HTML extractors. */
    contentType?: string;
    /** PDF bytes handed to `pdf-parse`; only read when `contentType` is `'application/pdf'`. */
    pdfBuffer?: Buffer;
}
|
|
9
|
+
/** Appends `extractor` to the list of site-specific extractors that `extractContent` consults. */
export declare function registerExtractor(extractor: Extractor): void;
|
|
10
|
+
/**
 * Extracts title, markdown, links and images from a fetched document.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL the HTML was fetched from; passed on to the individual extractors.
 * @param options - Optional section selection, truncation and PDF inputs (defaults to `{}`).
 * @returns A promise resolving to the extraction result.
 */
export declare function extractContent(html: string, url: string, options?: ExtractionOptions): Promise<ExtractionResult>;
|
|
11
|
+
//# sourceMappingURL=pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAU/D,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AASD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,SAAS,GAAG,IAAI,CAE5D;AAED,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,gBAAgB,CAAC,CAmE3B"}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { defuddleExtract } from './defuddle.js';
|
|
2
|
+
import { readabilityExtract } from './readability.js';
|
|
3
|
+
import { trafilaturaExtract, isTrafilaturaAvailable } from './trafilatura.js';
|
|
4
|
+
import { htmlToMarkdown, extractSection, extractLinksAndImages } from './markdown.js';
|
|
5
|
+
import { githubExtractor } from './site-extractors/github.js';
|
|
6
|
+
import { stackoverflowExtractor } from './site-extractors/stackoverflow.js';
|
|
7
|
+
import { mdnExtractor } from './site-extractors/mdn.js';
|
|
8
|
+
import { docsGenericExtractor } from './site-extractors/docs-generic.js';
|
|
9
|
+
import { createLogger } from '../logger.js';
|
|
10
|
+
import { getConfig } from '../config.js';
|
|
11
|
+
// Logger for this module, created with the 'extract' tag.
const log = createLogger('extract');
// Site-specific extractors. extractContent() picks the FIRST entry whose
// canHandle() is truthy, so order matters; registerExtractor() appends here.
const siteExtractors = [githubExtractor, stackoverflowExtractor, mdnExtractor, docsGenericExtractor];
|
|
18
|
+
/**
 * Registers an additional site-specific extractor.
 *
 * The extractor is appended to the end of `siteExtractors`, so it is only
 * reached when none of the earlier entries claims the page.
 *
 * @param extractor - Object used by the pipeline via `canHandle(url, html)` and `extract(html, url)`.
 * @returns {void}
 */
export function registerExtractor(extractor) {
    siteExtractors.push(extractor);
}
|
|
21
|
+
/**
 * Runs the extraction pipeline for one document and post-processes the result.
 *
 * Order of attempts:
 *   1. PDF branch when `options.contentType === 'application/pdf'` (text via `pdf-parse`).
 *   2. The first site-specific extractor whose `canHandle(url, html)` is truthy.
 *   3. `defuddleExtract`.
 *   4. `trafilaturaExtract`, unless config says `'never'` or it is unavailable.
 *   5. `readabilityExtract`.
 *   6. Plain `htmlToMarkdown` conversion as the last resort.
 *
 * @param html - Raw HTML of the page (not read in the PDF branch).
 * @param url - Source URL, forwarded to the extractors and used in log records.
 * @param options - See `ExtractionOptions`; defaults to `{}`.
 * @returns The extraction result after `applyPostProcessing`.
 */
export async function extractContent(html, url, options = {}) {
    if (options.contentType === 'application/pdf') {
        let pdfText = '';
        if (options.pdfBuffer) {
            try {
                // Imported lazily so the dependency is only loaded when a PDF is actually processed.
                const pdfParse = (await import('pdf-parse')).default;
                const parsed = await pdfParse(options.pdfBuffer);
                pdfText = parsed.text ?? '';
            }
            catch (err) {
                // Best effort: a broken PDF yields empty markdown rather than an exception.
                log.warn('pdf-parse failed', { url, error: String(err) });
            }
        }
        return applyPostProcessing({
            title: '',
            markdown: pdfText,
            metadata: {},
            links: [],
            images: [],
            extractor: 'turndown',
        }, options);
    }
    let result = null;
    try {
        const siteExtractor = siteExtractors.find((e) => e.canHandle(url, html));
        if (siteExtractor) {
            // `await` is a no-op for synchronous extractors and makes async ones work too.
            result = (await siteExtractor.extract(html, url)) || null;
        }
    }
    catch (err) {
        // A faulty site extractor must not abort the request: fall through to the generic chain.
        log.warn('Site extractor failed, falling back to generic extraction', { url, error: String(err) });
        result = null;
    }
    if (result) {
        return applyPostProcessing(result, options);
    }
    result = await defuddleExtract(html, url);
    // Short-circuiting keeps getConfig()/isTrafilaturaAvailable() from running when defuddle succeeded.
    if (!result && getConfig().trafilatura !== 'never' && (await isTrafilaturaAvailable())) {
        result = await trafilaturaExtract(html, url);
        if (result) {
            log.info('Trafilatura extraction succeeded', { url, chars: result.markdown.length });
        }
    }
    if (!result) {
        result = readabilityExtract(html, url);
    }
    if (!result) {
        // Last resort: convert the whole document without any content detection.
        result = {
            title: '',
            markdown: htmlToMarkdown(html),
            metadata: {},
            links: [],
            images: [],
            extractor: 'turndown',
        };
    }
    return applyPostProcessing(result, options);
}
|
|
83
|
+
/**
 * Applies section selection and truncation to an extraction result.
 *
 * Links and images are recomputed from the (section-scoped) markdown BEFORE
 * truncation, so they describe the full selected text rather than the cut one.
 *
 * @param result - Extraction result; not mutated, a shallow copy is returned.
 * @param options - Reads `section`, `sectionIndex` and `maxChars`.
 * @returns Copy of `result` with `markdown`, `links` and `images` replaced.
 */
function applyPostProcessing(result, options) {
    const scoped = options.section
        ? extractSection(result.markdown, options.section, options.sectionIndex ?? 0).content
        : result.markdown;
    const { links, images } = extractLinksAndImages(scoped);
    const overLimit = Boolean(options.maxChars) && scoped.length > options.maxChars;
    const markdown = overLimit ? scoped.slice(0, options.maxChars) : scoped;
    return { ...result, markdown, links, images };
}
|
|
95
|
+
//# sourceMappingURL=pipeline.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC9E,OAAO,EAAE,cAAc,EAAE,cAAc,EAAE,qBAAqB,EAAE,MAAM,eAAe,CAAC;AAEtF,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,sBAAsB,EAAE,MAAM,oCAAoC,CAAC;AAC5E,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEzC,MAAM,GAAG,GAAG,YAAY,CAAC,SAAS,CAAC,CAAC;AAUpC,MAAM,cAAc,GAAgB;IAClC,eAAe;IACf,sBAAsB;IACtB,YAAY;IACZ,oBAAoB;CACrB,CAAC;AAEF,MAAM,UAAU,iBAAiB,CAAC,SAAoB;IACpD,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;AACjC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAAY,EACZ,GAAW,EACX,UAA6B,EAAE;IAE/B,IAAI,MAAM,GAA4B,IAAI,CAAC;IAE3C,IAAI,OAAO,CAAC,WAAW,KAAK,iBAAiB,EAAE,CAAC;QAC9C,IAAI,OAAO,GAAG,EAAE,CAAC;QACjB,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;YACtB,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,CAAC,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC;gBACrD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;gBACjD,OAAO,GAAG,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;YAC9B,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,GAAG,CAAC,IAAI,CAAC,kBAAkB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC5D,CAAC;QACH,CAAC;QACD,MAAM,GAAG;YACP,KAAK,EAAE,EAAE;YACT,QAAQ,EAAE,OAAO;YACjB,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;QACF,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC9C,CAAC;IAED,MAAM,aAAa,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC;IACzE,IAAI,aAAa,EAAE,CAAC;QAClB,MAAM,SAAS,GAAG,aAAa,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QACnD,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,GAAG,SAAS,CAAC;YACnB,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,MAAM,GAAG,MAAM,eAAe,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAE1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,MAAM,GAAG,SAAS
,EAAE,CAAC;QAC3B,IAAI,MAAM,CAAC,WAAW,KAAK,OAAO,EAAE,CAAC;YACnC,MAAM,aAAa,GAAG,MAAM,sBAAsB,EAAE,CAAC;YACrD,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBAC7C,IAAI,MAAM,EAAE,CAAC;oBACX,GAAG,CAAC,IAAI,CAAC,kCAAkC,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;oBACrF,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;gBAC9C,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,GAAG,kBAAkB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IACzC,CAAC;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,QAAQ,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QACtC,MAAM,GAAG;YACP,KAAK,EAAE,EAAE;YACT,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;IACJ,CAAC;IAED,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,mBAAmB,CAC1B,MAAwB,EACxB,OAA0B;IAE1B,IAAI,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;IAE/B,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACpB,MAAM,EAAE,OAAO,EAAE,GAAG,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,OAAO,EAAE,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,CAAC;QACzF,QAAQ,GAAG,OAAO,CAAC;IACrB,CAAC;IAED,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAE1D,IAAI,OAAO,CAAC,QAAQ,IAAI,QAAQ,CAAC,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;QAC3D,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IACjD,CAAC;IAED,OAAO,EAAE,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;AAChD,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"readability.d.ts","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAIpD,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CA0BrF"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { Readability } from '@mozilla/readability';
|
|
2
|
+
import { parseHTML } from 'linkedom';
|
|
3
|
+
import TurndownService from 'turndown';
|
|
4
|
+
// Markdown shorter than this many characters is treated as a failed extraction.
const MIN_CONTENT_THRESHOLD = 100;
/**
 * Extracts the main article with Readability and converts it to markdown.
 *
 * @param html - Raw HTML, parsed with linkedom.
 * @param url - Not used by this extractor.
 * @returns The extraction result, or `null` when Readability finds no content,
 *          the markdown is below `MIN_CONTENT_THRESHOLD`, or anything throws.
 */
export function readabilityExtract(html, url) {
    try {
        const article = new Readability(parseHTML(html).document).parse();
        if (!article?.content) {
            return null;
        }
        const converter = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
        const markdown = converter.turndown(article.content);
        if (markdown.length < MIN_CONTENT_THRESHOLD) {
            return null;
        }
        const { title, byline, lang } = article;
        return {
            title: title ?? '',
            markdown,
            metadata: {
                // Empty strings are normalised to undefined.
                author: byline || undefined,
                language: lang || undefined,
            },
            links: [],
            images: [],
            extractor: 'readability',
        };
    }
    catch {
        // Deliberate best effort: any parsing failure means "no result" for the caller.
        return null;
    }
}
|
|
32
|
+
//# sourceMappingURL=readability.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"readability.js","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,GAAW;IAC1D,IAAI,CAAC;QACH,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;QAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAC/B,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE9C,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;QACxF,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAEpD,IAAI,QAAQ,CAAC,MAAM,GAAG,qBAAqB;YAAE,OAAO,IAAI,CAAC;QAEzD,OAAO;YACL,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,EAAE;YAC1B,QAAQ;YACR,QAAQ,EAAE;gBACR,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,QAAQ,EAAE,OAAO,CAAC,IAAI,IAAI,SAAS;aACpC;YACD,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,aAAa;SACzB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAGA,MAAM,WAAW,UAAU;IACzB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;IACxC,KAAK,CAAC,EAAE,UAAU,CAAC;CACpB;AAED,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,UAAU,GACjB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAmBzB"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import { extractJsonLd, matchJsonLdToSchema } from './jsonld.js';
|
|
3
|
+
/**
 * Extracts the fields described by `schema.properties` from an HTML document.
 *
 * JSON-LD matches take precedence; every field JSON-LD did not supply is then
 * looked up heuristically in the DOM via `findFieldValue`.
 *
 * @param html - Raw HTML; an empty value yields `{}`.
 * @param schema - Schema object; without `properties` the result is `{}`.
 * @returns Object with the JSON-LD values plus the heuristic values.
 */
export function extractWithSchema(html, schema) {
    if (!html || !schema.properties) {
        return {};
    }
    const jsonLdResult = matchJsonLdToSchema(extractJsonLd(html), schema);
    const { document: doc } = parseHTML(html);
    const heuristicResult = {};
    for (const [fieldName, fieldSchema] of Object.entries(schema.properties)) {
        // Own-property check: a plain lookup would also see inherited members such as
        // `toString` or `constructor` and wrongly treat those fields as already resolved.
        const resolvedByJsonLd = Object.prototype.hasOwnProperty.call(jsonLdResult, fieldName) &&
            jsonLdResult[fieldName] !== undefined;
        if (resolvedByJsonLd) {
            continue;
        }
        const value = findFieldValue(doc, fieldName, fieldSchema);
        if (value !== undefined) {
            heuristicResult[fieldName] = value;
        }
    }
    return { ...jsonLdResult, ...heuristicResult };
}
|
|
20
|
+
/**
 * Looks up one schema field in the DOM by trying several spellings of its name.
 *
 * Variants, in order: the name as given, lower-case with `_` turned into `-`,
 * and lower-case with `_` removed. Duplicates are dropped so an all-lower-case
 * name without underscores is queried once instead of three times.
 *
 * @param doc - Parsed document to search.
 * @param fieldName - Property name from the schema.
 * @param schema - Field schema; `type === 'array'` selects the list lookup.
 * @returns Whatever `findArrayValues` / `findSingleValue` returns for the variants.
 */
function findFieldValue(doc, fieldName, schema) {
    const lowered = fieldName.toLowerCase();
    // Set keeps insertion order, so the original priority of the variants is preserved.
    const variants = [...new Set([
            fieldName,
            lowered.replace(/_/g, '-'),
            lowered.replace(/_/g, ''),
        ])];
    if (schema.type === 'array') {
        return findArrayValues(doc, variants);
    }
    return findSingleValue(doc, variants);
}
|
|
29
|
+
/**
 * Escapes a string so it can be used as a CSS identifier (e.g. after `#`).
 *
 * Follows the CSSOM "serialize an identifier" rules: a leading digit, a digit
 * right after a leading `-`, and control characters are written as hex escapes;
 * a lone `-` and other ASCII punctuation get a backslash; letters, digits,
 * `-`, `_` and non-ASCII characters pass through unchanged.
 *
 * @param value - Raw identifier text.
 * @returns The escaped identifier.
 */
function cssEscape(value) {
    // Iterate by code point so astral characters are never split into surrogate halves.
    const chars = Array.from(String(value));
    let escaped = '';
    chars.forEach((ch, index) => {
        const code = ch.codePointAt(0);
        if (code === 0) {
            // NUL cannot be represented in CSS; it is replaced by U+FFFD.
            escaped += '\uFFFD';
            return;
        }
        const isDigit = code >= 0x30 && code <= 0x39;
        const isControl = (code >= 0x01 && code <= 0x1f) || code === 0x7f;
        const digitNeedsEscape = isDigit && (index === 0 || (index === 1 && chars[0] === '-'));
        if (isControl || digitNeedsEscape) {
            // Hex escape; the trailing space terminates the escape sequence.
            escaped += `\\${code.toString(16)} `;
            return;
        }
        if (ch === '-' && chars.length === 1) {
            // A single dash on its own is not a valid identifier.
            escaped += '\\-';
            return;
        }
        const isLetter = (code >= 0x41 && code <= 0x5a) || (code >= 0x61 && code <= 0x7a);
        if (code >= 0x80 || ch === '-' || ch === '_' || isDigit || isLetter) {
            escaped += ch;
            return;
        }
        escaped += `\\${ch}`;
    });
    return escaped;
}
|
|
32
|
+
/**
 * Heuristically finds a single text value for a field in the DOM.
 *
 * For each name variant, in order, the first non-empty hit wins:
 *   1. `[itemprop="name"]` — its `content` attribute, else its text.
 *   2. `[class*="name"]` — its text.
 *   3. an element whose `aria-label` (lower-cased, whitespace runs -> `-`) equals the name.
 *   4. `#name` — its text.
 *   5. `[data-name]` — the attribute value, else its text.
 *
 * Names are escaped before being placed into selectors, so names containing
 * quotes, backslashes, spaces or other punctuation no longer make
 * `querySelector` throw.
 *
 * @param doc - Parsed document to search.
 * @param variants - Candidate spellings of the field name, in priority order.
 * @returns The first non-empty text found, or `undefined`.
 */
function findSingleValue(doc, variants) {
    // Escape for use inside a double-quoted CSS string.
    const quote = (raw) => raw
        .replace(/["\\]/g, '\\$&')
        .replace(/[\n\r\f]/g, (ch) => `\\${ch.charCodeAt(0).toString(16)} `);
    // The aria-label candidates do not depend on the variant: query lazily, once.
    let ariaElements = null;
    for (const name of variants) {
        if (name === '') {
            // An empty name would produce the invalid selectors `#` and `[data-]`.
            continue;
        }
        const quoted = quote(name);
        const byItemprop = doc.querySelector(`[itemprop="${quoted}"]`);
        if (byItemprop) {
            // `||` so an empty content="" falls back to the element text.
            const text = byItemprop.getAttribute('content') || byItemprop.textContent?.trim();
            if (text) {
                return text;
            }
        }
        // Substring match is intentional — heuristic best-effort for partial class names
        const byClass = doc.querySelector(`[class*="${quoted}"]`);
        if (byClass) {
            const text = byClass.textContent?.trim();
            if (text) {
                return text;
            }
        }
        if (ariaElements === null) {
            ariaElements = doc.querySelectorAll('[aria-label]');
        }
        const wantedLabel = name.toLowerCase();
        for (const el of ariaElements) {
            const label = el.getAttribute('aria-label')?.toLowerCase().replace(/\s+/g, '-') ?? '';
            if (label === wantedLabel) {
                const text = el.textContent?.trim();
                if (text) {
                    return text;
                }
            }
        }
        const identifier = cssEscape(name);
        const byId = doc.querySelector(`#${identifier}`);
        if (byId) {
            const text = byId.textContent?.trim();
            if (text) {
                return text;
            }
        }
        const byData = doc.querySelector(`[data-${identifier}]`);
        if (byData) {
            // A bare `data-name` attribute has the value ""; use the element text then,
            // and keep trying the remaining variants when both are empty.
            const text = byData.getAttribute(`data-${name}`) || byData.textContent?.trim();
            if (text) {
                return text;
            }
        }
    }
    return undefined;
}
|
|
69
|
+
/**
 * Heuristically finds a list of text values for an array-typed field.
 *
 * For each name variant, in order:
 *   1. a container matching `[class*="name"]` whose `li` / `[class*="item"]`
 *      descendants supply the values;
 *   2. otherwise, all elements matching `[class*="singular"]` (name without a
 *      trailing `s`), provided there is more than one.
 *
 * Names are escaped before being placed into the quoted attribute selector,
 * so names containing `"` or `\` no longer make `querySelector` throw.
 *
 * @param doc - Parsed document to search.
 * @param variants - Candidate spellings of the field name, in priority order.
 * @returns Array of trimmed, non-empty texts, or `undefined` when nothing matched.
 */
function findArrayValues(doc, variants) {
    // Escape for use inside a double-quoted CSS string.
    const quote = (raw) => raw
        .replace(/["\\]/g, '\\$&')
        .replace(/[\n\r\f]/g, (ch) => `\\${ch.charCodeAt(0).toString(16)} `);
    // Trimmed text of every node, with empty entries dropped.
    const textsOf = (nodes) => Array.from(nodes)
        .map((el) => (el.textContent ?? '').trim())
        .filter(Boolean);
    for (const name of variants) {
        const container = doc.querySelector(`[class*="${quote(name)}"]`);
        if (container) {
            const items = container.querySelectorAll('li, [class*="item"]');
            if (items.length > 0) {
                return textsOf(items);
            }
        }
        // Fallback: repeated sibling elements named after the singular form.
        const singular = name.replace(/s$/, '');
        const elements = doc.querySelectorAll(`[class*="${quote(singular)}"]`);
        if (elements.length > 1) {
            return textsOf(elements);
        }
    }
    return undefined;
}
|
|
86
|
+
//# sourceMappingURL=schema.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,aAAa,CAAC;AAQjE,MAAM,UAAU,iBAAiB,CAC/B,IAAY,EACZ,MAAkB;IAElB,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU;QAAE,OAAO,EAAE,CAAC;IAE3C,MAAM,YAAY,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACzC,MAAM,YAAY,GAAG,mBAAmB,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC;IAE/D,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,eAAe,GAA4B,EAAE,CAAC;IAEpD,KAAK,MAAM,CAAC,SAAS,EAAE,WAAW,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;QACzE,IAAI,YAAY,CAAC,SAAS,CAAC,KAAK,SAAS;YAAE,SAAS;QAEpD,MAAM,KAAK,GAAG,cAAc,CAAC,GAAG,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;QAC1D,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACxB,eAAe,CAAC,SAAS,CAAC,GAAG,KAAK,CAAC;QACrC,CAAC;IACH,CAAC;IAED,OAAO,EAAE,GAAG,YAAY,EAAE,GAAG,eAAe,EAAE,CAAC;AACjD,CAAC;AAED,SAAS,cAAc,CACrB,GAAa,EACb,SAAiB,EACjB,MAAkB;IAElB,MAAM,cAAc,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAClE,MAAM,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAC9D,MAAM,QAAQ,GAAG,CAAC,SAAS,EAAE,cAAc,EAAE,WAAW,CAAC,CAAC;IAE1D,IAAI,MAAM,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;QAC5B,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IACxC,CAAC;IAED,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;AACxC,CAAC;AAED,SAAS,SAAS,CAAC,KAAa;IAC9B,OAAO,KAAK,CAAC,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;AAC5C,CAAC;AAED,SAAS,eAAe,CAAC,GAAa,EAAE,QAAkB;IACxD,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,UAAU,GAAG,GAAG,CAAC,aAAa,CAAC,cAAc,IAAI,IAAI,CAAC,CAAC;QAC7D,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,UAAU,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,UAAU,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YAClF,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,iFAAiF;QACjF,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QACxD,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,IAAI,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACzC,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,MAAM,WAAW,GAAG,GAAG,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;QACzD,KAAK,M
AAM,EAAE,IAAI,WAAW,EAAE,CAAC;YAC7B,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,CAAC,EAAE,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC;YACtF,IAAI,KAAK,KAAK,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gBACjC,MAAM,IAAI,GAAG,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;gBACpC,IAAI,IAAI;oBAAE,OAAO,IAAI,CAAC;YACxB,CAAC;QACH,CAAC;QAED,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,CAAC,IAAI,SAAS,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACtD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACtC,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,MAAM,MAAM,GAAG,GAAG,CAAC,aAAa,CAAC,SAAS,IAAI,GAAG,CAAC,CAAC;QACnD,IAAI,MAAM,EAAE,CAAC;YACX,OAAO,MAAM,CAAC,YAAY,CAAC,QAAQ,IAAI,EAAE,CAAC,IAAI,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,SAAS,CAAC;QACxF,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,eAAe,CAAC,GAAa,EAAE,QAAkB;IACxD,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,SAAS,GAAG,GAAG,CAAC,aAAa,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QAC1D,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,KAAK,GAAG,SAAS,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAAC;YAChE,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;YACtF,CAAC;QACH,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACxC,MAAM,QAAQ,GAAG,GAAG,CAAC,gBAAgB,CAAC,YAAY,QAAQ,IAAI,CAAC,CAAC;QAChE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,OAAO,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzF,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docs-generic.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/docs-generic.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAqFlE,eAAO,MAAM,oBAAoB,EAAE,SAqClC,CAAC"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import TurndownService from 'turndown';
|
|
3
|
+
// Shared HTML -> markdown converter configured for ATX headings and fenced code blocks.
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
// Selectors removed from the content element by buildResult() before conversion.
const STRIP_SELECTORS = [
    'nav', '.docs-sidebar', '.sidebar',
    '.toc-wrapper', '.table-of-contents',
    '.version-picker', '.pagination-nav',
    'header', 'footer',
];
|
|
15
|
+
/**
 * Guesses the documentation framework from marker substrings in the raw HTML.
 *
 * Rules are evaluated top to bottom and the first match wins.
 *
 * @param html - Raw HTML text.
 * @returns `'docusaurus'`, `'mkdocs'`, `'sphinx'`, `'gitbook'`, or `null` when no marker is present.
 */
function detectFramework(html) {
    const has = (marker) => html.includes(marker);
    const rules = [
        ['docusaurus', () => has('docs-sidebar') || has('data-docusaurus-page')],
        ['mkdocs', () => has('md-content')],
        ['sphinx', () => has('class="document"') ||
                has("class='document'") ||
                (has('class="body"') && has('highlight'))],
        ['gitbook', () => has('page-body')],
    ];
    const matched = rules.find(([, applies]) => applies());
    return matched ? matched[0] : null;
}
|
|
31
|
+
/**
 * Returns the first element matched by any of the given selectors.
 *
 * Selectors are tried in array order and the search stops at the first hit.
 *
 * @param document - Document to query.
 * @param contentSelectors - CSS selectors in priority order.
 * @returns The first matching element, or `null` when none matches.
 */
function extractWithSelectors(document, contentSelectors) {
    for (let i = 0; i < contentSelectors.length; i += 1) {
        const match = document.querySelector(contentSelectors[i]);
        if (match) {
            return match;
        }
    }
    return null;
}
|
|
39
|
+
/**
 * Removes every descendant of `root` that matches one of the selectors.
 *
 * Selectors are processed one after another; the matches of each selector are
 * snapshotted into an array before removal starts.
 *
 * @param root - Element whose subtree is cleaned in place.
 * @param selectors - CSS selectors of the elements to remove.
 * @returns {void}
 */
function stripElements(root, selectors) {
    selectors.forEach((selector) => {
        const doomed = Array.from(root.querySelectorAll(selector));
        doomed.forEach((node) => {
            const parent = node.parentNode;
            // Nodes without a parent are skipped.
            if (parent != null) {
                parent.removeChild(node);
            }
        });
    });
}
|
|
46
|
+
/**
 * Turns a located content element into an extraction result.
 *
 * The element is first cleaned with `STRIP_SELECTORS` (mutating it in place).
 * The title comes from the first `h1` inside the content, else the document's
 * first `h1`, else `<title>`; only the part before the first `|` is kept.
 *
 * @param document - Document the content element belongs to.
 * @param contentEl - Element holding the page's main content.
 * @returns The result object, or `null` when the title or the markdown is empty.
 */
function buildResult(document, contentEl) {
    stripElements(contentEl, STRIP_SELECTORS);
    const heading = contentEl.querySelector('h1') ??
        document.querySelector('h1') ??
        document.querySelector('title');
    const rawTitle = heading?.textContent?.trim() ?? '';
    // rawTitle is already trimmed, so without a '|' this is just rawTitle again.
    const [beforePipe] = rawTitle.split('|');
    const title = beforePipe.trim();
    if (!title) {
        return null;
    }
    const markdown = turndown.turndown(contentEl.innerHTML).trim();
    if (!markdown) {
        return null;
    }
    return { title, markdown, metadata: {}, links: [], images: [], extractor: 'site-specific' };
}
|
|
69
|
+
/**
 * Site extractor for documentation pages generated by common frameworks
 * (as recognised by `detectFramework`).
 */
export const docsGenericExtractor = {
    name: 'docs-generic',
    /**
     * @param _url - Not used.
     * @param html - Raw HTML of the page.
     * @returns `true` when `html` is non-empty and a framework is detected.
     */
    canHandle(_url, html) {
        return html ? detectFramework(html) !== null : false;
    },
    /**
     * @param html - Raw HTML of the page.
     * @param _url - Not used.
     * @returns The result from `buildResult`, or `null` when no framework is
     *          detected or none of the framework's content selectors matches.
     */
    extract(html, _url) {
        const framework = html ? detectFramework(html) : null;
        if (!framework) {
            return null;
        }
        const { document } = parseHTML(html);
        // Content containers per framework, in priority order.
        const selectorsByFramework = {
            docusaurus: ['.markdown', 'article', 'main'],
            mkdocs: ['.md-content'],
            sphinx: ['.document', '.body'],
            gitbook: ['.page-body'],
        };
        const contentEl = extractWithSelectors(document, selectorsByFramework[framework]);
        return contentEl ? buildResult(document, contentEl) : null;
    },
};
|
|
104
|
+
//# sourceMappingURL=docs-generic.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docs-generic.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/docs-generic.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AAIxF,MAAM,eAAe,GAAG;IACtB,KAAK;IACL,eAAe;IACf,UAAU;IACV,cAAc;IACd,oBAAoB;IACpB,iBAAiB;IACjB,iBAAiB;IACjB,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF,SAAS,eAAe,CAAC,IAAY;IACnC,IAAI,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,sBAAsB,CAAC,EAAE,CAAC;QAC3E,OAAO,YAAY,CAAC;IACtB,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;QAChC,OAAO,QAAQ,CAAC;IAClB,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,kBAAkB,CAAC;QACtE,CAAC,IAAI,CAAC,QAAQ,CAAC,cAAc,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,EAAE,CAAC;QAClE,OAAO,QAAQ,CAAC;IAClB,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;QAC/B,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,oBAAoB,CAC3B,QAAkB,EAClB,gBAA0B;IAE1B,KAAK,MAAM,QAAQ,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,EAAE,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;QAC5C,IAAI,EAAE;YAAE,OAAO,EAAa,CAAC;IAC/B,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,aAAa,CAAC,IAAa,EAAE,SAAmB;IACvD,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;YAC7D,EAAE,CAAC,UAAU,EAAE,WAAW,CAAC,EAAE,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAClB,QAAkB,EAClB,SAAkB;IAElB,aAAa,CAAC,SAAS,EAAE,eAAe,CAAC,CAAC;IAE1C,MAAM,OAAO,GACX,SAAS,CAAC,aAAa,CAAC,IAAI,CAAC;QAC7B,QAAQ,CAAC,aAAa,CAAC,IAAI,CAAC;QAC5B,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAElC,MAAM,QAAQ,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IACpD,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;QAClC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE;QAChC,CAAC,CAAC,QAAQ,CAAC;IAEb,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;I
AC/D,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,MAAM,oBAAoB,GAAc;IAC7C,IAAI,EAAE,cAAc;IAEpB,SAAS,CAAC,IAAY,EAAE,IAAa;QACnC,IAAI,CAAC,IAAI;YAAE,OAAO,KAAK,CAAC;QACxB,OAAO,eAAe,CAAC,IAAI,CAAC,KAAK,IAAI,CAAC;IACxC,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,IAAY;QAChC,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,SAAS,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC;QACxC,IAAI,CAAC,SAAS;YAAE,OAAO,IAAI,CAAC;QAE5B,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,IAAI,gBAA0B,CAAC;QAC/B,QAAQ,SAAS,EAAE,CAAC;YAClB,KAAK,YAAY;gBACf,gBAAgB,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;gBACpD,MAAM;YACR,KAAK,QAAQ;gBACX,gBAAgB,GAAG,CAAC,aAAa,CAAC,CAAC;gBACnC,MAAM;YACR,KAAK,QAAQ;gBACX,gBAAgB,GAAG,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;gBAC1C,MAAM;YACR,KAAK,SAAS;gBACZ,gBAAgB,GAAG,CAAC,YAAY,CAAC,CAAC;gBAClC,MAAM;QACV,CAAC;QAED,MAAM,SAAS,GAAG,oBAAoB,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;QACnE,IAAI,CAAC,SAAS;YAAE,OAAO,IAAI,CAAC;QAE5B,OAAO,WAAW,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC1C,CAAC;CACF,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/github.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAoGlE,eAAO,MAAM,eAAe,EAAE,SA2B7B,CAAC"}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import TurndownService from 'turndown';
|
|
3
|
+
// Module-level HTML-to-Markdown converter used by the extract* helpers below,
// configured for ATX-style headings and fenced code blocks.
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
|
4
|
+
/**
 * Tells whether a URL points at an issue or pull-request page, i.e. its path
 * starts with `/<owner>/<repo>/issues/<number>` or `/<owner>/<repo>/pull/<number>`.
 *
 * Only the path is inspected, anchored at the repository root, so a file or
 * directory that merely contains `issues/<n>` (for example under `/blob/` or
 * `/tree/`) and query strings mentioning `/issues/1` are not misclassified.
 *
 * @param {string} url - Absolute URL; a bare path is also accepted.
 * @returns {boolean}
 */
function isIssueOrPR(url) {
    let path;
    try {
        path = new URL(url).pathname;
    }
    catch {
        // Not an absolute URL: treat the input itself as the path.
        path = String(url);
    }
    return /^\/[^/]+\/[^/]+\/(?:issues|pull)\/\d+(?:[/?#]|$)/.test(path);
}
|
|
7
|
+
/**
 * Tells whether a URL points at a file view, i.e. its path starts with
 * `/<owner>/<repo>/blob/`.
 *
 * Only the path is inspected, anchored at the repository root, so a directory
 * named `blob` deeper in the tree or a query string containing `/blob/` does
 * not count.
 *
 * @param {string} url - Absolute URL; a bare path is also accepted.
 * @returns {boolean}
 */
function isBlob(url) {
    let path;
    try {
        path = new URL(url).pathname;
    }
    catch {
        // Not an absolute URL: treat the input itself as the path.
        path = String(url);
    }
    return /^\/[^/]+\/[^/]+\/blob\//.test(path);
}
|
|
10
|
+
/**
 * Extracts an issue or pull-request conversation as Markdown.
 *
 * The title comes from `.js-issue-title`, falling back to `.gh-header-title`.
 * Labels (`.IssueLabel`) are emitted as a leading `**Labels:**` line, followed
 * by every non-empty comment body (`.d-block.comment-body`) separated by
 * horizontal rules.
 *
 * @param {object} document - Parsed page document.
 * @param {string} url - Page URL; accepted for interface compatibility and not read here.
 * @returns {object|null} The result object, or `null` when there is no title
 *   element, no comment body, or every comment body converts to empty Markdown.
 */
function extractIssue(document, url) {
    const titleEl = document.querySelector('.js-issue-title') ?? document.querySelector('.gh-header-title');
    if (!titleEl) {
        return null;
    }
    const title = titleEl.textContent?.trim() ?? '';
    const labels = Array.from(document.querySelectorAll('.IssueLabel'))
        .map((el) => el.textContent?.trim() ?? '')
        .filter(Boolean);
    const commentBodies = Array.from(document.querySelectorAll('.d-block.comment-body'));
    if (commentBodies.length === 0) {
        return null;
    }
    const comments = commentBodies
        .map((body) => turndown.turndown(body.innerHTML).trim())
        .filter(Boolean);
    // Like the README/blob extractors, report "nothing extracted" rather than a
    // result with no actual content.
    if (comments.length === 0) {
        return null;
    }
    const sections = [];
    if (labels.length > 0) {
        sections.push(`**Labels:** ${labels.join(', ')}\n`);
    }
    // Separators go between rendered comments only, so a skipped (empty) first
    // comment can no longer leave a dangling leading rule.
    sections.push(comments.join('\n\n---\n\n'));
    const markdown = sections.join('\n\n');
    return {
        title,
        markdown,
        metadata: {},
        links: [],
        images: [],
        extractor: 'site-specific',
    };
}
|
|
43
|
+
/**
 * Extracts the rendered README as Markdown.
 *
 * The title is the part of the page `<title>` before the first `:`. The body
 * comes from `#readme .markdown-body`, falling back to the first
 * `.markdown-body` in the document.
 *
 * @param {object} document - Parsed page document.
 * @returns {object|null} The result object, or `null` when no README body is
 *   found or it converts to empty Markdown.
 */
function extractReadme(document) {
    const pageTitle = (document.querySelector('title')?.textContent ?? '').trim();
    const colonAt = pageTitle.indexOf(':');
    const title = (colonAt === -1 ? pageTitle : pageTitle.slice(0, colonAt)).trim();

    let body = document.querySelector('#readme .markdown-body');
    if (body == null) {
        body = document.querySelector('.markdown-body');
    }
    if (!body) {
        return null;
    }

    const markdown = turndown.turndown(body.innerHTML).trim();
    if (markdown) {
        return {
            title,
            markdown,
            metadata: {},
            links: [],
            images: [],
            extractor: 'site-specific',
        };
    }
    return null;
}
|
|
63
|
+
/**
 * Extracts a file view as Markdown.
 *
 * The title is the trimmed page `<title>`. The content element is the first
 * match among `.blob-code-content`, `.highlight` and `.markdown-body`, tried
 * in that order.
 *
 * @param {object} document - Parsed page document.
 * @returns {object|null} The result object, or `null` when no content element
 *   is found or it converts to empty Markdown.
 */
function extractBlob(document) {
    const title = (document.querySelector('title')?.textContent ?? '').trim();

    let source = null;
    for (const candidate of ['.blob-code-content', '.highlight', '.markdown-body']) {
        source = document.querySelector(candidate);
        if (source != null) {
            break;
        }
    }
    if (!source) {
        return null;
    }

    const markdown = turndown.turndown(source.innerHTML).trim();
    if (markdown) {
        return {
            title,
            markdown,
            metadata: {},
            links: [],
            images: [],
            extractor: 'site-specific',
        };
    }
    return null;
}
|
|
83
|
+
/**
 * Extractor for github.com pages: dispatches to the issue/PR, file-view or
 * README extraction helper depending on the URL.
 */
export const githubExtractor = {
    name: 'github',
    /**
     * @param {string} url - Page URL.
     * @returns {boolean} `true` for `github.com` and any `*.github.com` host;
     *   `false` otherwise, including when the URL cannot be parsed.
     */
    canHandle(url) {
        let host;
        try {
            host = new URL(url).hostname;
        }
        catch {
            return false;
        }
        return host === 'github.com' || host.endsWith('.github.com');
    },
    /**
     * @param {string} html - Raw page HTML.
     * @param {string} url - Page URL, used to choose the extraction helper.
     * @returns {object|null} The chosen helper's result, or `null` for empty HTML.
     */
    extract(html, url) {
        if (!html) {
            return null;
        }
        const { document } = parseHTML(html);
        // Issue/PR detection takes precedence over the file-view check.
        if (isIssueOrPR(url)) {
            return extractIssue(document, url);
        }
        return isBlob(url) ? extractBlob(document) : extractReadme(document);
    },
};
|
|
107
|
+
//# sourceMappingURL=github.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/github.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AAExF,SAAS,WAAW,CAAC,GAAW;IAC9B,OAAO,2BAA2B,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC/C,CAAC;AAED,SAAS,MAAM,CAAC,GAAW;IACzB,OAAO,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC9B,CAAC;AAED,SAAS,YAAY,CAAC,QAAkB,EAAE,GAAW;IACnD,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,iBAAiB,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,kBAAkB,CAAC,CAAC;IACxG,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAC;IAE1B,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAEhD,MAAM,QAAQ,GAAG,QAAQ,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC;IAC1D,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC;SAChC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;SACzC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEnB,MAAM,aAAa,GAAG,QAAQ,CAAC,gBAAgB,CAAC,uBAAuB,CAAC,CAAC;IACzE,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAE5C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,QAAQ,CAAC,IAAI,CAAC,eAAe,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QAC5C,MAAM,IAAI,GAAI,IAAgB,CAAC,SAAS,CAAC;QACzC,MAAM,EAAE,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1C,IAAI,EAAE,EAAE,CAAC;YACP,QAAQ,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAEvC,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,SAAS,aAAa,CAAC,QAAkB;IACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IACpD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,QAAQ,CAAC;IAEzD,MAAM,
UAAU,GACd,QAAQ,CAAC,aAAa,CAAC,wBAAwB,CAAC;QAChD,QAAQ,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;IAE3C,IAAI,CAAC,UAAU;QAAE,OAAO,IAAI,CAAC;IAE7B,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAE,UAAsB,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;IAC7E,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,SAAS,WAAW,CAAC,QAAkB;IACrC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAChD,MAAM,KAAK,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAEjD,MAAM,SAAS,GACb,QAAQ,CAAC,aAAa,CAAC,oBAAoB,CAAC;QAC5C,QAAQ,CAAC,aAAa,CAAC,YAAY,CAAC;QACpC,QAAQ,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;IAE3C,IAAI,CAAC,SAAS;QAAE,OAAO,IAAI,CAAC;IAE5B,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAE,SAAqB,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;IAC5E,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAC;IAE3B,OAAO;QACL,KAAK;QACL,QAAQ;QACR,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,SAAS,EAAE,eAAe;KAC3B,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,MAAM,eAAe,GAAc;IACxC,IAAI,EAAE,QAAQ;IAEd,SAAS,CAAC,GAAW;QACnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,OAAO,QAAQ,KAAK,YAAY,IAAI,QAAQ,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC;QACvE,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,GAAW;QAC/B,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,IAAI,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC;YACrB,OAAO,YAAY,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QACrC,CAAC;QAED,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC;YAChB,OAAO,WAAW,CAAC,QAAQ,CAAC,CAAC;QAC/B,CAAC;QAED,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAC;IACjC,CAAC;CACF,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mdn.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/mdn.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAalE,eAAO,MAAM,YAAY,EAAE,SAqD1B,CAAC"}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import TurndownService from 'turndown';
|
|
3
|
+
// Module-level HTML-to-Markdown converter used by mdnExtractor.extract,
// configured for ATX-style headings and fenced code blocks.
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
|
4
|
+
// CSS selectors for the elements mdnExtractor.extract removes from the
// article before converting it to Markdown.
const STRIP_SELECTORS = ['nav', '.sidebar', 'header', 'footer', '.bc-head', '.metadata'];
|
|
12
|
+
/**
 * Extractor for developer.mozilla.org reference pages.
 */
export const mdnExtractor = {
    name: 'mdn',
    /**
     * @param {string} url - Page URL.
     * @returns {boolean} `true` only when the host is exactly
     *   `developer.mozilla.org`; `false` otherwise, including unparsable URLs.
     */
    canHandle(url) {
        try {
            const hostname = new URL(url).hostname;
            return hostname === 'developer.mozilla.org';
        }
        catch {
            return false;
        }
    },
    /**
     * Converts the main article of the page to Markdown.
     *
     * The article is the first match among `article.main-page-content`,
     * `.section-content` and `article`. Elements matching `STRIP_SELECTORS`
     * are removed from it before conversion.
     *
     * Title lookup order: an `h1` still inside the article after stripping,
     * then the `h1` the article held before stripping, then the document's
     * `<title>`. Only the text before the first `|` is kept.
     *
     * @param {string} html - Raw page HTML.
     * @param {string} url - Page URL; not read here.
     * @returns {object|null} The result object, or `null` when the HTML is
     *   empty, no article is found, or the title or Markdown is empty.
     */
    extract(html, url) {
        if (!html) {
            return null;
        }
        const { document } = parseHTML(html);
        const article = document.querySelector('article.main-page-content') ??
            document.querySelector('.section-content') ??
            document.querySelector('article');
        if (!article) {
            return null;
        }
        // `header` is one of the stripped selectors, so a heading wrapped in a
        // <header> would vanish before the title lookup. Keep a reference: a
        // detached element still exposes its textContent.
        const headingBeforeStrip = article.querySelector('h1');
        for (const selector of STRIP_SELECTORS) {
            for (const el of Array.from(article.querySelectorAll(selector))) {
                el.parentNode?.removeChild(el);
            }
        }
        const titleEl = article.querySelector('h1') ??
            headingBeforeStrip ??
            document.querySelector('title');
        const rawTitle = titleEl?.textContent?.trim() ?? '';
        const title = rawTitle.includes('|')
            ? rawTitle.split('|')[0].trim()
            : rawTitle;
        if (!title) {
            return null;
        }
        const markdown = turndown.turndown(article.innerHTML).trim();
        if (!markdown) {
            return null;
        }
        return {
            title,
            markdown,
            metadata: {},
            links: [],
            images: [],
            extractor: 'site-specific',
        };
    },
};
|
|
58
|
+
//# sourceMappingURL=mdn.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mdn.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/mdn.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AAExF,MAAM,eAAe,GAAG;IACtB,KAAK;IACL,UAAU;IACV,QAAQ;IACR,QAAQ;IACR,UAAU;IACV,WAAW;CACZ,CAAC;AAEF,MAAM,CAAC,MAAM,YAAY,GAAc;IACrC,IAAI,EAAE,KAAK;IAEX,SAAS,CAAC,GAAW;QACnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,OAAO,QAAQ,KAAK,uBAAuB,CAAC;QAC9C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,GAAW;QAC/B,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,MAAM,OAAO,GACX,QAAQ,CAAC,aAAa,CAAC,2BAA2B,CAAC;YACnD,QAAQ,CAAC,aAAa,CAAC,kBAAkB,CAAC;YAC1C,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;QAEpC,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE1B,KAAK,MAAM,QAAQ,IAAI,eAAe,EAAE,CAAC;YACvC,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;gBAChE,EAAE,CAAC,UAAU,EAAE,WAAW,CAAC,EAAE,CAAC,CAAC;YACjC,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GACX,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC;YAC3B,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAElC,MAAM,QAAQ,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACpD,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;YAClC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE;YAChC,CAAC,CAAC,QAAQ,CAAC;QAEb,IAAI,CAAC,KAAK;YAAE,OAAO,IAAI,CAAC;QAExB,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAE,OAAmB,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;QAC1E,IAAI,CAAC,QAAQ;YAAE,OAAO,IAAI,CAAC;QAE3B,OAAO;YACL,KAAK;YACL,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,eAAe;SAC3B,CAAC;IACJ,CAAC;CACF,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"stackoverflow.d.ts","sourceRoot":"","sources":["../../../src/extraction/site-extractors/stackoverflow.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAoB,MAAM,gBAAgB,CAAC;AAgElE,eAAO,MAAM,sBAAsB,EAAE,SAkDpC,CAAC"}
|