@staticn0va/wigolo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +74 -0
- package/README.md +272 -0
- package/dist/cache/db.d.ts +5 -0
- package/dist/cache/db.d.ts.map +1 -0
- package/dist/cache/db.js +97 -0
- package/dist/cache/db.js.map +1 -0
- package/dist/cache/store.d.ts +26 -0
- package/dist/cache/store.d.ts.map +1 -0
- package/dist/cache/store.js +214 -0
- package/dist/cache/store.js.map +1 -0
- package/dist/cli/daemon.d.ts +2 -0
- package/dist/cli/daemon.d.ts.map +1 -0
- package/dist/cli/daemon.js +5 -0
- package/dist/cli/daemon.js.map +1 -0
- package/dist/cli/health.d.ts +2 -0
- package/dist/cli/health.d.ts.map +1 -0
- package/dist/cli/health.js +5 -0
- package/dist/cli/health.js.map +1 -0
- package/dist/cli/index.d.ts +7 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +9 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/warmup.d.ts +11 -0
- package/dist/cli/warmup.d.ts.map +1 -0
- package/dist/cli/warmup.js +107 -0
- package/dist/cli/warmup.js.map +1 -0
- package/dist/config.d.ts +41 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +66 -0
- package/dist/config.js.map +1 -0
- package/dist/crawl/crawler.d.ts +18 -0
- package/dist/crawl/crawler.d.ts.map +1 -0
- package/dist/crawl/crawler.js +228 -0
- package/dist/crawl/crawler.js.map +1 -0
- package/dist/crawl/dedup.d.ts +15 -0
- package/dist/crawl/dedup.d.ts.map +1 -0
- package/dist/crawl/dedup.js +93 -0
- package/dist/crawl/dedup.js.map +1 -0
- package/dist/crawl/mapper.d.ts +17 -0
- package/dist/crawl/mapper.d.ts.map +1 -0
- package/dist/crawl/mapper.js +178 -0
- package/dist/crawl/mapper.js.map +1 -0
- package/dist/crawl/rate-limiter.d.ts +10 -0
- package/dist/crawl/rate-limiter.d.ts.map +1 -0
- package/dist/crawl/rate-limiter.js +72 -0
- package/dist/crawl/rate-limiter.js.map +1 -0
- package/dist/crawl/robots.d.ts +9 -0
- package/dist/crawl/robots.d.ts.map +1 -0
- package/dist/crawl/robots.js +63 -0
- package/dist/crawl/robots.js.map +1 -0
- package/dist/crawl/sitemap.d.ts +4 -0
- package/dist/crawl/sitemap.d.ts.map +1 -0
- package/dist/crawl/sitemap.js +38 -0
- package/dist/crawl/sitemap.js.map +1 -0
- package/dist/crawl/url-utils.d.ts +3 -0
- package/dist/crawl/url-utils.d.ts.map +1 -0
- package/dist/crawl/url-utils.js +41 -0
- package/dist/crawl/url-utils.js.map +1 -0
- package/dist/extraction/defuddle.d.ts +3 -0
- package/dist/extraction/defuddle.d.ts.map +1 -0
- package/dist/extraction/defuddle.js +26 -0
- package/dist/extraction/defuddle.js.map +1 -0
- package/dist/extraction/extract.d.ts +5 -0
- package/dist/extraction/extract.d.ts.map +1 -0
- package/dist/extraction/extract.js +83 -0
- package/dist/extraction/extract.js.map +1 -0
- package/dist/extraction/jsonld.d.ts +4 -0
- package/dist/extraction/jsonld.d.ts.map +1 -0
- package/dist/extraction/jsonld.js +64 -0
- package/dist/extraction/jsonld.js.map +1 -0
- package/dist/extraction/markdown.d.ts +10 -0
- package/dist/extraction/markdown.d.ts.map +1 -0
- package/dist/extraction/markdown.js +107 -0
- package/dist/extraction/markdown.js.map +1 -0
- package/dist/extraction/pipeline.d.ts +11 -0
- package/dist/extraction/pipeline.d.ts.map +1 -0
- package/dist/extraction/pipeline.js +95 -0
- package/dist/extraction/pipeline.js.map +1 -0
- package/dist/extraction/readability.d.ts +3 -0
- package/dist/extraction/readability.d.ts.map +1 -0
- package/dist/extraction/readability.js +32 -0
- package/dist/extraction/readability.js.map +1 -0
- package/dist/extraction/schema.d.ts +7 -0
- package/dist/extraction/schema.d.ts.map +1 -0
- package/dist/extraction/schema.js +86 -0
- package/dist/extraction/schema.js.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.js +104 -0
- package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
- package/dist/extraction/site-extractors/github.d.ts +3 -0
- package/dist/extraction/site-extractors/github.d.ts.map +1 -0
- package/dist/extraction/site-extractors/github.js +107 -0
- package/dist/extraction/site-extractors/github.js.map +1 -0
- package/dist/extraction/site-extractors/mdn.d.ts +3 -0
- package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
- package/dist/extraction/site-extractors/mdn.js +58 -0
- package/dist/extraction/site-extractors/mdn.js.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.js +88 -0
- package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
- package/dist/extraction/trafilatura.d.ts +6 -0
- package/dist/extraction/trafilatura.d.ts.map +1 -0
- package/dist/extraction/trafilatura.js +105 -0
- package/dist/extraction/trafilatura.js.map +1 -0
- package/dist/fetch/auth.d.ts +8 -0
- package/dist/fetch/auth.d.ts.map +1 -0
- package/dist/fetch/auth.js +32 -0
- package/dist/fetch/auth.js.map +1 -0
- package/dist/fetch/browser-pool.d.ts +28 -0
- package/dist/fetch/browser-pool.d.ts.map +1 -0
- package/dist/fetch/browser-pool.js +138 -0
- package/dist/fetch/browser-pool.js.map +1 -0
- package/dist/fetch/content-check.d.ts +2 -0
- package/dist/fetch/content-check.d.ts.map +1 -0
- package/dist/fetch/content-check.js +62 -0
- package/dist/fetch/content-check.js.map +1 -0
- package/dist/fetch/http-client.d.ts +15 -0
- package/dist/fetch/http-client.d.ts.map +1 -0
- package/dist/fetch/http-client.js +146 -0
- package/dist/fetch/http-client.js.map +1 -0
- package/dist/fetch/router.d.ts +45 -0
- package/dist/fetch/router.d.ts.map +1 -0
- package/dist/fetch/router.js +89 -0
- package/dist/fetch/router.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +10 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +39 -0
- package/dist/logger.js.map +1 -0
- package/dist/search/dedup.d.ts +10 -0
- package/dist/search/dedup.d.ts.map +1 -0
- package/dist/search/dedup.js +35 -0
- package/dist/search/dedup.js.map +1 -0
- package/dist/search/engines/bing.d.ts +7 -0
- package/dist/search/engines/bing.d.ts.map +1 -0
- package/dist/search/engines/bing.js +48 -0
- package/dist/search/engines/bing.js.map +1 -0
- package/dist/search/engines/duckduckgo.d.ts +7 -0
- package/dist/search/engines/duckduckgo.d.ts.map +1 -0
- package/dist/search/engines/duckduckgo.js +50 -0
- package/dist/search/engines/duckduckgo.js.map +1 -0
- package/dist/search/engines/startpage.d.ts +7 -0
- package/dist/search/engines/startpage.d.ts.map +1 -0
- package/dist/search/engines/startpage.js +50 -0
- package/dist/search/engines/startpage.js.map +1 -0
- package/dist/search/filters.d.ts +16 -0
- package/dist/search/filters.d.ts.map +1 -0
- package/dist/search/filters.js +63 -0
- package/dist/search/filters.js.map +1 -0
- package/dist/search/flashrank.d.ts +12 -0
- package/dist/search/flashrank.d.ts.map +1 -0
- package/dist/search/flashrank.js +63 -0
- package/dist/search/flashrank.js.map +1 -0
- package/dist/search/query.d.ts +2 -0
- package/dist/search/query.d.ts.map +1 -0
- package/dist/search/query.js +41 -0
- package/dist/search/query.js.map +1 -0
- package/dist/search/rerank.d.ts +3 -0
- package/dist/search/rerank.d.ts.map +1 -0
- package/dist/search/rerank.js +40 -0
- package/dist/search/rerank.js.map +1 -0
- package/dist/search/searxng.d.ts +8 -0
- package/dist/search/searxng.d.ts.map +1 -0
- package/dist/search/searxng.js +87 -0
- package/dist/search/searxng.js.map +1 -0
- package/dist/search/validator.d.ts +6 -0
- package/dist/search/validator.d.ts.map +1 -0
- package/dist/search/validator.js +35 -0
- package/dist/search/validator.js.map +1 -0
- package/dist/searxng/bootstrap.d.ts +18 -0
- package/dist/searxng/bootstrap.d.ts.map +1 -0
- package/dist/searxng/bootstrap.js +136 -0
- package/dist/searxng/bootstrap.js.map +1 -0
- package/dist/searxng/docker.d.ts +9 -0
- package/dist/searxng/docker.d.ts.map +1 -0
- package/dist/searxng/docker.js +67 -0
- package/dist/searxng/docker.js.map +1 -0
- package/dist/searxng/process.d.ts +23 -0
- package/dist/searxng/process.d.ts.map +1 -0
- package/dist/searxng/process.js +188 -0
- package/dist/searxng/process.js.map +1 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +311 -0
- package/dist/server.js.map +1 -0
- package/dist/tools/cache.d.ts +3 -0
- package/dist/tools/cache.d.ts.map +1 -0
- package/dist/tools/cache.js +50 -0
- package/dist/tools/cache.js.map +1 -0
- package/dist/tools/crawl.d.ts +6 -0
- package/dist/tools/crawl.d.ts.map +1 -0
- package/dist/tools/crawl.js +97 -0
- package/dist/tools/crawl.js.map +1 -0
- package/dist/tools/extract.d.ts +4 -0
- package/dist/tools/extract.d.ts.map +1 -0
- package/dist/tools/extract.js +69 -0
- package/dist/tools/extract.js.map +1 -0
- package/dist/tools/fetch.d.ts +4 -0
- package/dist/tools/fetch.d.ts.map +1 -0
- package/dist/tools/fetch.js +76 -0
- package/dist/tools/fetch.js.map +1 -0
- package/dist/tools/search.d.ts +4 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +160 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/types.d.ts +222 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +61 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { isPrivateUrl } from './url-utils.js';
|
|
2
|
+
import { getConfig } from '../config.js';
|
|
3
|
+
/**
 * Per-hostname crawl rate limiter.
 *
 * For every hostname it tracks how many requests are in flight and when the
 * latest request started (or is scheduled to start). Callers are held back so
 * that neither the concurrency cap nor the minimum delay between request
 * starts is exceeded.
 *
 * A slot and its start time are reserved synchronously, before any waiting
 * happens, so concurrent callers cannot all pass the concurrency check at once
 * and cannot all wake up at the same instant.
 */
export class RateLimiter {
  /** Hostname -> { activeCount, lastRequestTime, queue, maxConcurrency, delayMs }. */
  domains = new Map();

  /** Hostname -> crawl delay taken from robots.txt, in milliseconds. */
  robotsDelays = new Map();

  /**
   * Records the robots.txt crawl delay for a hostname.
   *
   * @param {string} domain - Hostname the delay applies to.
   * @param {number} delaySeconds - Delay in seconds; values that are not
   *   finite, non-negative numbers are ignored so they cannot turn the delay
   *   arithmetic into NaN (which would disable throttling altogether).
   */
  setRobotsCrawlDelay(domain, delaySeconds) {
    const delayMs = Number(delaySeconds) * 1000;
    if (!Number.isFinite(delayMs) || delayMs < 0) {
      return;
    }
    this.robotsDelays.set(domain, delayMs);
  }

  /**
   * Waits until a request to `url` may start.
   *
   * @param {string} url - Absolute URL about to be requested.
   * @returns {Promise<() => void>} Resolves with a release callback that must
   *   be called when the request finishes. Calling it more than once is a no-op.
   * @throws {TypeError} If `url` cannot be parsed by `new URL`.
   */
  async acquire(url) {
    const domain = new URL(url).hostname;
    const state = this.getOrCreateState(url, domain);

    if (state.activeCount >= state.maxConcurrency) {
      // No free slot: park the caller until a release hands one over.
      return new Promise((resolve) => {
        state.queue.push(resolve);
      });
    }

    // Reserve before awaiting so later callers see the slot as taken.
    const { release, waitMs } = this.reserveSlot(state);
    if (waitMs > 0) {
      await new Promise((r) => setTimeout(r, waitMs));
    }
    return release;
  }

  /**
   * Returns the limiter state for a hostname, creating it from the config on
   * first use, and raises its delay if a larger robots.txt delay is known.
   */
  getOrCreateState(url, domain) {
    if (!this.domains.has(domain)) {
      const config = getConfig();
      const isPrivate = isPrivateUrl(url);
      const configDelay = isPrivate ? config.crawlPrivateDelayMs : config.crawlDelayMs;
      // Use robots.txt delay if it's higher than configured delay
      const robotsDelay = this.robotsDelays.get(domain) ?? 0;
      this.domains.set(domain, {
        activeCount: 0,
        lastRequestTime: 0,
        queue: [],
        maxConcurrency: isPrivate ? config.crawlPrivateConcurrency : config.crawlConcurrency,
        delayMs: Math.max(configDelay, robotsDelay),
      });
    }

    const state = this.domains.get(domain);
    // Update delay if robots delay was set after state creation
    const robotsDelay = this.robotsDelays.get(domain);
    if (robotsDelay !== undefined && robotsDelay > state.delayMs) {
      state.delayMs = robotsDelay;
    }
    return state;
  }

  /**
   * Claims a slot and the next permitted start time in one synchronous step.
   *
   * @returns {{ release: () => void, waitMs: number }} The release callback
   *   and how long the caller still has to wait before starting.
   */
  reserveSlot(state) {
    const now = Date.now();
    const delayMs = Number.isFinite(state.delayMs) ? state.delayMs : 0;
    // lastRequestTime === 0 means no request has been made yet: start at once.
    const earliest = state.lastRequestTime > 0 ? state.lastRequestTime + delayMs : now;
    const startTime = Math.max(now, earliest);
    return { release: this.startRequest(state, startTime), waitMs: startTime - now };
  }

  /**
   * Marks a request as started and returns its release callback.
   *
   * @param {object} state - Hostname state from `getOrCreateState`.
   * @param {number} [startTime] - Timestamp (ms) recorded as the request start;
   *   defaults to now. A future value reserves that start time.
   */
  startRequest(state, startTime = Date.now()) {
    state.activeCount++;
    state.lastRequestTime = startTime;
    let released = false;
    return () => {
      // Guard against double release, which would free a slot twice.
      if (released) {
        return;
      }
      released = true;
      state.activeCount--;
      this.processQueue(state);
    };
  }

  /**
   * Hands free slots to queued callers, honouring the delay between starts.
   * The slot is reserved at dequeue time so a new `acquire` cannot take it
   * while the queued caller is still waiting out the delay.
   */
  processQueue(state) {
    while (state.queue.length > 0 && state.activeCount < state.maxConcurrency) {
      const resolve = state.queue.shift();
      const { release, waitMs } = this.reserveSlot(state);
      if (waitMs <= 0) {
        resolve(release);
      } else {
        setTimeout(() => resolve(release), waitMs);
      }
    }
  }
}
|
|
72
|
+
//# sourceMappingURL=rate-limiter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rate-limiter.js","sourceRoot":"","sources":["../../src/crawl/rate-limiter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAUzC,MAAM,OAAO,WAAW;IACd,OAAO,GAAG,IAAI,GAAG,EAAuB,CAAC;IACzC,YAAY,GAAG,IAAI,GAAG,EAAkB,CAAC;IAEjD,mBAAmB,CAAC,MAAc,EAAE,YAAoB;QACtD,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,EAAE,YAAY,GAAG,IAAI,CAAC,CAAC;IACrD,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,GAAW;QACvB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;QACrC,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;QAEjD,IAAI,KAAK,CAAC,WAAW,GAAG,KAAK,CAAC,cAAc,EAAE,CAAC;YAC7C,kDAAkD;YAClD,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,eAAe,CAAC;YACnD,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,GAAG,OAAO,CAAC;YAC1C,IAAI,SAAS,GAAG,CAAC,IAAI,KAAK,CAAC,eAAe,GAAG,CAAC,EAAE,CAAC;gBAC/C,MAAM,IAAI,OAAO,CAAO,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC,CAAC;YAC3D,CAAC;YACD,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QAClC,CAAC;QAED,gBAAgB;QAChB,OAAO,IAAI,OAAO,CAAa,CAAC,OAAO,EAAE,EAAE;YACzC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC5D,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,gBAAgB,CAAC,GAAW,EAAE,MAAc;QAClD,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;YAC9B,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;YACpC,MAAM,WAAW,GAAG,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,mBAAmB,CAAC,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC;YAEjF,4DAA4D;YAC5D,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YACvD,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,WAAW,CAAC,CAAC;YAE1D,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE;gBACvB,WAAW,EAAE,CAAC;gBACd,eAAe,EAAE,CAAC;gBAClB,KAAK,EAAE,EAAE;gBACT,cAAc,EAAE,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,uBAAuB,CAAC,CAAC,CAAC,MAAM,CAAC,gBAAgB;gBACpF,OAAO,EAAE,cAAc;aACxB,CAAC,CAAC;QACL,CAAC;QAED,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAE,CAAC;QACxC,4DAA4D;QAC5D,MAAM,WAAW,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAC
lD,IAAI,WAAW,KAAK,SAAS,IAAI,WAAW,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC;YAC7D,KAAK,CAAC,OAAO,GAAG,WAAW,CAAC;QAC9B,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,YAAY,CAAC,KAAkB;QACrC,KAAK,CAAC,WAAW,EAAE,CAAC;QACpB,KAAK,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAEnC,OAAO,GAAG,EAAE;YACV,KAAK,CAAC,WAAW,EAAE,CAAC;YACpB,IAAI,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;QAC3B,CAAC,CAAC;IACJ,CAAC;IAEO,YAAY,CAAC,KAAkB;QACrC,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,IAAI,KAAK,CAAC,WAAW,IAAI,KAAK,CAAC,cAAc;YAAE,OAAO;QAElF,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAG,CAAC;QAClC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,eAAe,CAAC;QACnD,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,GAAG,OAAO,CAAC;QAE1C,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;YACnB,IAAI,EAAE,CAAC;QACT,CAAC;aAAM,CAAC;YACN,UAAU,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots.d.ts","sourceRoot":"","sources":["../../src/crawl/robots.ts"],"names":[],"mappings":"AAKA,qBAAa,YAAY;IACvB,OAAO,CAAC,KAAK,CAAoB;IACjC,OAAO,CAAC,UAAU,CAAuB;gBAE7B,SAAS,EAAE,MAAM;IAI7B,OAAO,CAAC,KAAK;IA4Cb,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO;IAiBhC,aAAa,IAAI,MAAM,GAAG,IAAI;CAG/B"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
 * Minimal robots.txt parser that keeps only the rules addressed to the
 * wildcard user agent (`User-agent: *`).
 *
 * - `rules` holds `{ type: 'allow' | 'disallow', path }` entries.
 * - `crawlDelay` is the last `Crawl-delay` value (seconds) seen in a wildcard
 *   group, or `null` when there is none.
 */
export class RobotsParser {
  rules = [];
  crawlDelay = null;

  /**
   * @param {string} robotsTxt - Raw robots.txt content; null/undefined is
   *   treated as an empty file (everything allowed).
   */
  constructor(robotsTxt) {
    this.parse(robotsTxt);
  }

  /**
   * Parses robots.txt text into `rules` and `crawlDelay`.
   *
   * Trailing `#` comments are stripped, and consecutive `User-agent` lines are
   * treated as one group, so a group naming `*` together with other agents
   * still applies.
   */
  parse(text) {
    let inWildcardAgent = false;
    let previousWasUserAgent = false;

    for (const rawLine of String(text ?? '').split(/\r\n|\r|\n/)) {
      const commentStart = rawLine.indexOf('#');
      const line = (commentStart === -1 ? rawLine : rawLine.slice(0, commentStart)).trim();
      if (!line) {
        continue;
      }

      const agentMatch = line.match(/^user-agent:\s*(.*)$/i);
      if (agentMatch) {
        const isWildcard = agentMatch[1].trim() === '*';
        // Adjacent User-agent lines extend the current group instead of
        // starting a new one.
        inWildcardAgent = previousWasUserAgent ? inWildcardAgent || isWildcard : isWildcard;
        previousWasUserAgent = true;
        continue;
      }

      const ruleMatch = line.match(/^(allow|disallow):\s*(.*)$/i);
      if (ruleMatch) {
        previousWasUserAgent = false;
        if (!inWildcardAgent) {
          continue;
        }
        const path = ruleMatch[2].trim();
        // An empty path ("Disallow:") restricts nothing, so it is skipped.
        if (path) {
          this.rules.push({ type: ruleMatch[1].toLowerCase(), path });
        }
        continue;
      }

      if (/^crawl-delay:/i.test(line)) {
        previousWasUserAgent = false;
        const delayMatch = line.match(/^crawl-delay:\s*(\d+(?:\.\d+)?)/i);
        if (delayMatch && inWildcardAgent) {
          this.crawlDelay = Number.parseFloat(delayMatch[1]);
        }
      }
    }
  }

  /**
   * Tests a URL path against one rule pattern. `*` matches any run of
   * characters and a trailing `$` anchors the pattern to the end of the path;
   * a pattern without either behaves as a plain prefix match.
   *
   * Implemented with indexOf scanning rather than a generated RegExp so a
   * hostile robots.txt cannot trigger pathological backtracking.
   */
  static matchesPath(pattern, path) {
    const anchored = pattern.endsWith('$');
    const segments = (anchored ? pattern.slice(0, -1) : pattern).split('*');

    if (!path.startsWith(segments[0])) {
      return false;
    }
    let position = segments[0].length;
    if (segments.length === 1) {
      return anchored ? position === path.length : true;
    }

    // Middle segments: leftmost match leaves the most room for later ones.
    for (let i = 1; i < segments.length - 1; i++) {
      const found = path.indexOf(segments[i], position);
      if (found === -1) {
        return false;
      }
      position = found + segments[i].length;
    }

    const last = segments[segments.length - 1];
    if (anchored) {
      return path.length - last.length >= position && path.endsWith(last);
    }
    return path.indexOf(last, position) !== -1;
  }

  /**
   * Decides whether `path` may be crawled.
   *
   * The longest matching rule wins; on equal length an `allow` rule wins.
   *
   * @param {string} path - URL path to test.
   * @returns {boolean} `true` when no rule matches or the winning rule is an
   *   `allow`.
   */
  isAllowed(path) {
    let bestMatch = null;
    let bestLength = -1;

    for (const rule of this.rules) {
      if (!RobotsParser.matchesPath(rule.path, path)) {
        continue;
      }
      const length = rule.path.length;
      if (length > bestLength || (length === bestLength && rule.type === 'allow')) {
        bestMatch = rule;
        bestLength = length;
      }
    }

    return bestMatch === null || bestMatch.type === 'allow';
  }

  /** @returns {number | null} Crawl delay in seconds, or `null` if none was given. */
  getCrawlDelay() {
    return this.crawlDelay;
  }
}
|
|
63
|
+
//# sourceMappingURL=robots.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots.js","sourceRoot":"","sources":["../../src/crawl/robots.ts"],"names":[],"mappings":"AAKA,MAAM,OAAO,YAAY;IACf,KAAK,GAAiB,EAAE,CAAC;IACzB,UAAU,GAAkB,IAAI,CAAC;IAEzC,YAAY,SAAiB;QAC3B,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IACxB,CAAC;IAEO,KAAK,CAAC,IAAY;QACxB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,eAAe,GAAG,KAAK,CAAC;QAE5B,KAAK,MAAM,OAAO,IAAI,KAAK,EAAE,CAAC;YAC5B,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;YAE5B,IAAI,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,EAAE,CAAC;gBACrC,eAAe,GAAG,IAAI,CAAC;gBACvB,SAAS;YACX,CAAC;YAED,IAAI,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,EAAE,CAAC;gBACrE,eAAe,GAAG,KAAK,CAAC;gBACxB,SAAS;YACX,CAAC;YAED,IAAI,CAAC,eAAe;gBAAE,SAAS;YAE/B,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC;YACvD,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,IAAI,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACrC,IAAI,IAAI,EAAE,CAAC;oBACT,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC9C,CAAC;gBACD,SAAS;YACX,CAAC;YAED,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;YACjD,IAAI,UAAU,EAAE,CAAC;gBACf,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAClC,IAAI,IAAI,EAAE,CAAC;oBACT,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;gBAC3C,CAAC;gBACD,SAAS;YACX,CAAC;YAED,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;YAClE,IAAI,UAAU,EAAE,CAAC;gBACf,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9C,CAAC;QACH,CAAC;IACH,CAAC;IAED,SAAS,CAAC,IAAY;QACpB,IAAI,SAAS,GAAsB,IAAI,CAAC;QACxC,IAAI,UAAU,GAAG,CAAC,CAAC,CAAC;QAEpB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC/B,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,UAAU,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,KAAK,UAAU,IAAI,IAAI,CAAC,IAAI,KAAK,OAAO,CAAC,EAAE,CAAC;oBAChG,SAAS,GAAG,IAAI,CAAC;oBACjB,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;gBAChC,CAAC;YACH,CAAC;QACH,CAAC;QAED,IAAI,CAAC,SAAS;YAAE,OAAO,IAAI,CAAC;QAC5B,OAAO,SAAS,CAAC,IAAI,KAAK,OAAO,CAAC;IAC
pC,CAAC;IAED,aAAa;QACX,OAAO,IAAI,CAAC,UAAU,CAAC;IACzB,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap.d.ts","sourceRoot":"","sources":["../../src/crawl/sitemap.ts"],"names":[],"mappings":"AAAA,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE,CAgBlD;AAED,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE,CASvD;AAED,wBAAgB,2BAA2B,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,EAAE,CAYvE"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
 * Replaces the five predefined XML entities and numeric character references
 * with the characters they stand for. Unknown or out-of-range references are
 * left untouched.
 */
function decodeXmlEntities(text) {
  const named = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
  return text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (whole, ref) => {
    if (ref[0] !== '#') {
      return named[ref];
    }
    const isHex = ref[1] === 'x' || ref[1] === 'X';
    const codePoint = isHex ? Number.parseInt(ref.slice(2), 16) : Number.parseInt(ref.slice(1), 10);
    return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
  });
}

/**
 * Collects the non-empty values of every `<loc>` element. Plain text values
 * are entity-decoded (sitemap URLs must escape `&` as `&amp;`); CDATA-wrapped
 * values are taken literally.
 */
function extractLocValues(xml) {
  const values = [];
  const locPattern = /<loc>\s*(?:<!\[CDATA\[([\s\S]*?)\]\]>|([^<]+?))\s*<\/loc>/g;
  for (const match of xml.matchAll(locPattern)) {
    const raw = match[1] !== undefined ? match[1] : decodeXmlEntities(match[2]);
    const value = raw.trim();
    if (value) {
      values.push(value);
    }
  }
  return values;
}

/**
 * Extracts page URLs from a `<urlset>` sitemap document.
 *
 * @param {string} xml - Sitemap XML text.
 * @returns {string[]} URLs in document order; empty for sitemap index
 *   documents and for text that contains neither `<urlset` nor `<loc>`.
 */
export function parseSitemap(xml) {
  // A sitemapindex document should be parsed with parseSitemapIndex, not here
  if (xml.includes('<sitemapindex')) {
    return [];
  }
  if (!xml.includes('<urlset') && !xml.includes('<loc>')) {
    return [];
  }
  return extractLocValues(xml);
}

/**
 * Extracts child sitemap URLs from a `<sitemapindex>` document.
 *
 * @param {string} xml - Sitemap index XML text.
 * @returns {string[]} Child sitemap URLs in document order; empty when the
 *   text is not a sitemap index.
 */
export function parseSitemapIndex(xml) {
  if (!xml.includes('<sitemapindex')) {
    return [];
  }
  return extractLocValues(xml);
}
|
|
27
|
+
/**
 * Collects the URLs declared by `Sitemap:` lines in a robots.txt file.
 *
 * Lines may be indented and may end with a `#` comment (only stripped when
 * the `#` follows whitespace, so a URL fragment is left intact).
 *
 * @param {string} robotsTxt - Raw robots.txt content.
 * @returns {string[]} Sitemap URLs in file order; empty when none are declared.
 */
export function extractSitemapUrlFromRobots(robotsTxt) {
  const urls = [];
  for (const line of robotsTxt.split(/\r\n|\r|\n/)) {
    const match = line.match(/^\s*sitemap:\s*(.+)$/i);
    if (!match) {
      continue;
    }
    const url = match[1].replace(/\s+#.*$/, '').trim();
    if (url) {
      urls.push(url);
    }
  }
  return urls;
}
|
|
38
|
+
//# sourceMappingURL=sitemap.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sitemap.js","sourceRoot":"","sources":["../../src/crawl/sitemap.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,YAAY,CAAC,GAAW;IACtC,4EAA4E;IAC5E,IAAI,GAAG,CAAC,QAAQ,CAAC,eAAe,CAAC;QAAE,OAAO,EAAE,CAAC;IAE7C,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC;QAAE,OAAO,EAAE,CAAC;IAElE,MAAM,IAAI,GAAa,EAAE,CAAC;IAC1B,MAAM,UAAU,GAAG,GAAG,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC;IAC/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;QAC/B,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,GAAG,EAAE,CAAC;YACR,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACjB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,GAAW;IAC3C,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,eAAe,CAAC;QAAE,OAAO,EAAE,CAAC;IAE9C,MAAM,IAAI,GAAa,EAAE,CAAC;IAC1B,MAAM,UAAU,GAAG,GAAG,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC;IAC/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;QAC/B,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IAC7B,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,2BAA2B,CAAC,SAAiB;IAC3D,MAAM,IAAI,GAAa,EAAE,CAAC;IAC1B,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEpC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;QAC9C,IAAI,KAAK,EAAE,CAAC;YACV,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QAC7B,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"url-utils.d.ts","sourceRoot":"","sources":["../../src/crawl/url-utils.ts"],"names":[],"mappings":"AAAA,wBAAgB,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAgCjD;AAED,wBAAgB,eAAe,CAC7B,GAAG,EAAE,MAAM,EACX,eAAe,EAAE,MAAM,EAAE,GAAG,SAAS,EACrC,eAAe,EAAE,MAAM,EAAE,GAAG,SAAS,GACpC,OAAO,CAYT"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
 * Parses a dotted-quad IPv4 literal.
 * Returns the four octets, or null when the text is not a valid IPv4 address.
 */
function parseIpv4Octets(hostname) {
  const match = hostname.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
  if (!match) {
    return null;
  }
  const octets = match.slice(1).map((part) => Number.parseInt(part, 10));
  return octets.every((octet) => octet <= 255) ? octets : null;
}

/** True for "this network", loopback, RFC 1918 private and link-local IPv4 ranges. */
function isPrivateIpv4Octets([first, second]) {
  return (
    first === 0 ||
    first === 10 ||
    first === 127 ||
    (first === 169 && second === 254) ||
    (first === 172 && second >= 16 && second <= 31) ||
    (first === 192 && second === 168)
  );
}

/** True for unspecified, loopback, unique-local, link-local and private IPv4-mapped IPv6. */
function isPrivateIpv6(hostname) {
  if (hostname === '::' || hostname === '::1') {
    return true;
  }
  // fc00::/7 (unique local) and fe80::/10 (link-local).
  if (/^f[cd][0-9a-f]{2}:/.test(hostname) || /^fe[89ab][0-9a-f]:/.test(hostname)) {
    return true;
  }
  // The URL parser serialises ::ffff:a.b.c.d as ::ffff:hhhh:hhhh.
  const mapped = hostname.match(/^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/);
  if (mapped) {
    const high = Number.parseInt(mapped[1], 16);
    const low = Number.parseInt(mapped[2], 16);
    return isPrivateIpv4Octets([high >> 8, high & 0xff, low >> 8, low & 0xff]);
  }
  return false;
}

/**
 * Reports whether a URL points at a local or private-network host.
 *
 * IP literals are parsed rather than prefix-matched, so public domain names
 * that merely start with digits (for example `10.example.com`) are not
 * treated as private.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} `true` for localhost names, `.local` / `.localhost`
 *   names, and loopback, private or link-local IP addresses.
 * @throws {TypeError} If `url` cannot be parsed by `new URL`.
 */
export function isPrivateUrl(url) {
  const hostname = new URL(url).hostname
    .replace(/^\[|\]$/g, '') // strip IPv6 brackets
    .replace(/\.$/, '') // ignore a trailing root dot
    .toLowerCase();

  if (hostname === 'localhost' || hostname.endsWith('.localhost') || hostname.endsWith('.local')) {
    return true;
  }
  if (hostname.includes(':')) {
    return isPrivateIpv6(hostname);
  }
  const octets = parseIpv4Octets(hostname);
  return octets !== null && isPrivateIpv4Octets(octets);
}
|
|
28
|
+
/**
 * Applies include/exclude regular-expression filters to a URL.
 *
 * @param {string} url - URL to test.
 * @param {string[] | undefined} includePatterns - Regex sources; when
 *   non-empty, at least one must match.
 * @param {string[] | undefined} excludePatterns - Regex sources; when
 *   non-empty, none may match.
 * @returns {boolean} `true` when the URL passes both filters.
 * @throws {SyntaxError} If a pattern that gets evaluated is not a valid regex.
 */
export function matchesPatterns(url, includePatterns, excludePatterns) {
  const hits = (pattern) => new RegExp(pattern).test(url);

  const hasIncludes = Boolean(includePatterns) && includePatterns.length > 0;
  if (hasIncludes && !includePatterns.some(hits)) {
    return false;
  }

  const hasExcludes = Boolean(excludePatterns) && excludePatterns.length > 0;
  return !(hasExcludes && excludePatterns.some(hits));
}
|
|
41
|
+
//# sourceMappingURL=url-utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"url-utils.js","sourceRoot":"","sources":["../../src/crawl/url-utils.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,YAAY,CAAC,GAAW;IACtC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;IAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,CAAC,sBAAsB;IAEhF,IAAI,QAAQ,KAAK,WAAW,IAAI,QAAQ,KAAK,WAAW,IAAI,QAAQ,KAAK,KAAK,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;QACzG,OAAO,IAAI,CAAC;IACd,CAAC;IAED,IAAI,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QAChC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,WAAW;IACX,IAAI,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,IAAI,CAAC;IACd,CAAC;IAED,cAAc;IACd,IAAI,QAAQ,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QACpC,OAAO,IAAI,CAAC;IACd,CAAC;IAED,0CAA0C;IAC1C,IAAI,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;QAChC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAClC,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACtC,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,IAAI,EAAE,EAAE,CAAC;YACjC,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,GAAW,EACX,eAAqC,EACrC,eAAqC;IAErC,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,MAAM,OAAO,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QACrE,IAAI,CAAC,OAAO;YAAE,OAAO,KAAK,CAAC;IAC7B,CAAC;IAED,IAAI,eAAe,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;QACtE,IAAI,QAAQ;YAAE,OAAO,KAAK,CAAC;IAC7B,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"defuddle.d.ts","sourceRoot":"","sources":["../../src/extraction/defuddle.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAIpD,wBAAsB,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC,CAoBjG"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { Defuddle } from 'defuddle/node';
|
|
2
|
+
// Extractions whose content is shorter than this many characters are discarded.
const MIN_CONTENT_THRESHOLD = 100;

/**
 * Runs Defuddle (markdown mode) over a page and maps its output to the
 * extraction result shape.
 *
 * @param {string} html - Page HTML.
 * @param {string} url - URL the HTML was fetched from.
 * @returns {Promise<object | null>} `{ title, markdown, metadata, links,
 *   images, extractor: 'defuddle' }`, or `null` when Defuddle throws or the
 *   content is missing or shorter than MIN_CONTENT_THRESHOLD.
 */
export async function defuddleExtract(html, url) {
  // Empty or missing metadata fields are reported as undefined.
  const orUndefined = (value) => (value ? value : undefined);

  try {
    const extracted = await Defuddle(html, url, { markdown: true });
    const body = extracted.content;
    if (!body) {
      return null;
    }
    if (body.length < MIN_CONTENT_THRESHOLD) {
      return null;
    }

    const metadata = {
      description: orUndefined(extracted.description),
      author: orUndefined(extracted.author),
      date: orUndefined(extracted.published),
      language: orUndefined(extracted.language),
    };
    return {
      title: extracted.title ?? '',
      markdown: body,
      metadata,
      links: [],
      images: [],
      extractor: 'defuddle',
    };
  } catch {
    // Any failure just means this extractor produced nothing.
    return null;
  }
}
|
|
26
|
+
//# sourceMappingURL=defuddle.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"defuddle.js","sourceRoot":"","sources":["../../src/extraction/defuddle.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAGzC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,IAAY,EAAE,GAAW;IAC7D,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QAC7D,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,qBAAqB;YAAE,OAAO,IAAI,CAAC;QAClF,OAAO;YACL,KAAK,EAAE,MAAM,CAAC,KAAK,IAAI,EAAE;YACzB,QAAQ,EAAE,MAAM,CAAC,OAAO;YACxB,QAAQ,EAAE;gBACR,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,SAAS;gBAC5C,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,SAAS;gBAClC,IAAI,EAAE,MAAM,CAAC,SAAS,IAAI,SAAS;gBACnC,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,SAAS;aACvC;YACD,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { MetadataData, TableData } from '../types.js';
|
|
2
|
+
/** Parses the HTML and returns whichever of title, description, author, date, keywords and og_image are present. */
export declare function extractMetadata(html: string): MetadataData;
|
|
3
|
+
/** Returns the trimmed text of elements matching `selector`: every match as an array when `multiple` is true, otherwise the first match's text ('' when nothing matches). */
export declare function extractSelector(html: string, selector: string, multiple: boolean): string | string[];
|
|
4
|
+
/** Converts every `<table>` in the HTML into `{ caption, headers, rows }`, with each row keyed by header; returns [] when there are no tables. */
export declare function extractTables(html: string): TableData[];
|
|
5
|
+
//# sourceMappingURL=extract.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract.d.ts","sourceRoot":"","sources":["../../src/extraction/extract.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAS3D,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CA2B1D;AAED,wBAAgB,eAAe,CAC7B,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,OAAO,GAChB,MAAM,GAAG,MAAM,EAAE,CAUnB;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,EAAE,CA6CvD"}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
/**
 * Reads the `content` attribute of a `<meta>` tag, looked up first by `name`
 * and, if no such element exists, by `property`.
 *
 * @param {Document} doc - Parsed document.
 * @param {string} nameOrProperty - Value of the `name` / `property` attribute.
 * @returns {string | undefined} The content, or `undefined` when the tag or
 *   its `content` attribute is missing.
 */
function getMetaContent(doc, nameOrProperty) {
  for (const attribute of ['name', 'property']) {
    const element = doc.querySelector(`meta[${attribute}="${nameOrProperty}"]`);
    if (element != null) {
      // The first element found decides the result, even without content.
      return element.getAttribute('content') ?? undefined;
    }
  }
  return undefined;
}
|
|
7
|
+
/**
 * Extracts page-level metadata from an HTML document.
 *
 * Only fields with a non-empty value are set on the result:
 * - `title` from `<title>`
 * - `description` from the `description` meta tag, else `og:description`
 * - `author` from the `author` meta tag
 * - `date` from the `date` meta tag, else `article:published_time`
 * - `keywords` from the comma-separated `keywords` meta tag
 * - `og_image` from `og:image`
 *
 * @param {string} html - Page HTML.
 * @returns {object} Metadata object containing the fields that were found.
 */
export function extractMetadata(html) {
  const { document: doc } = parseHTML(html);
  const result = {};

  const title = doc.querySelector('title')?.textContent?.trim();
  if (title) {
    result.title = title;
  }

  // `||` rather than `??`: a primary tag with content="" must still fall
  // back to the Open Graph value instead of dropping the field.
  const description = getMetaContent(doc, 'description') || getMetaContent(doc, 'og:description');
  if (description) {
    result.description = description;
  }

  const author = getMetaContent(doc, 'author');
  if (author) {
    result.author = author;
  }

  const date = getMetaContent(doc, 'date') || getMetaContent(doc, 'article:published_time');
  if (date) {
    result.date = date;
  }

  const keywords = getMetaContent(doc, 'keywords');
  if (keywords) {
    result.keywords = keywords.split(',').map((k) => k.trim()).filter(Boolean);
  }

  const ogImage = getMetaContent(doc, 'og:image');
  if (ogImage) {
    result.og_image = ogImage;
  }

  return result;
}
|
|
31
|
+
/**
 * Returns the trimmed text content of elements matching a CSS selector.
 *
 * @param {string} html - Page HTML.
 * @param {string} selector - CSS selector to query.
 * @param {boolean} multiple - When true, return the text of every match;
 *   otherwise only the first match.
 * @returns {string | string[]} An array of texts when `multiple` is true;
 *   otherwise a single string, '' when nothing matches.
 */
export function extractSelector(html, selector, multiple) {
  const trimmedText = (node) => (node.textContent ?? '').trim();
  const { document: doc } = parseHTML(html);

  if (!multiple) {
    const first = doc.querySelector(selector);
    return first ? trimmedText(first) : '';
  }
  return Array.from(doc.querySelectorAll(selector), (node) => trimmedText(node));
}
|
|
40
|
+
/**
 * Converts every `<table>` in an HTML document into structured data.
 *
 * Headers come from `thead th` when present, otherwise from the `th` cells of
 * the first row, otherwise they are synthesised as `col_1`, `col_2`, ...
 * from the first row's `td` count (and that first row is kept as data).
 *
 * @param {string} html - Page HTML.
 * @returns {Array<{ caption: string | undefined, headers: string[], rows: object[] }>}
 *   One entry per table, each row keyed by header; [] when there are no tables.
 */
export function extractTables(html) {
  const { document: doc } = parseHTML(html);
  const tables = doc.querySelectorAll('table');
  if (tables.length === 0) {
    return [];
  }

  return Array.from(tables).map((table) => {
    const caption = table.querySelector('caption')?.textContent?.trim() || undefined;
    const thElements = table.querySelectorAll('thead th');
    let headers;
    let bodyRows;

    if (thElements.length > 0) {
      headers = Array.from(thElements).map((th) => (th.textContent ?? '').trim());
      bodyRows = Array.from(table.querySelectorAll('tbody tr'));
      if (bodyRows.length === 0) {
        // No <tbody>: treat every row after the header row as data.
        const allRows = Array.from(table.querySelectorAll('tr'));
        bodyRows = allRows.slice(1);
      }
    } else {
      const allRows = Array.from(table.querySelectorAll('tr'));
      const firstRow = allRows[0];
      const firstRowThs = firstRow ? Array.from(firstRow.querySelectorAll('th')) : [];
      if (firstRowThs.length > 0) {
        headers = firstRowThs.map((th) => (th.textContent ?? '').trim());
        bodyRows = allRows.slice(1);
      } else {
        const cellCount = firstRow ? firstRow.querySelectorAll('td').length : 0;
        headers = Array.from({ length: cellCount }, (_, i) => `col_${i + 1}`);
        bodyRows = allRows;
      }
    }

    const rows = bodyRows.map((row) => {
      const dataCells = Array.from(row.querySelectorAll('td'));
      // A row with fewer <td> cells than headers usually starts with a <th>
      // row header; include it so the values stay under the right columns.
      const cells =
        dataCells.length < headers.length ? Array.from(row.querySelectorAll('td, th')) : dataCells;
      const obj = {};
      headers.forEach((header, i) => {
        obj[header] = (cells[i]?.textContent ?? '').trim();
      });
      return obj;
    });

    return { caption, headers, rows };
  });
}
|
|
83
|
+
//# sourceMappingURL=extract.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extract.js","sourceRoot":"","sources":["../../src/extraction/extract.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC,SAAS,cAAc,CAAC,GAAa,EAAE,cAAsB;IAC3D,MAAM,EAAE,GACN,GAAG,CAAC,aAAa,CAAC,cAAc,cAAc,IAAI,CAAC;QACnD,GAAG,CAAC,aAAa,CAAC,kBAAkB,cAAc,IAAI,CAAC,CAAC;IAC1D,OAAO,EAAE,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,SAAS,CAAC;AAClD,CAAC;AAED,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,MAAM,GAAiB,EAAE,CAAC;IAEhC,MAAM,KAAK,GAAG,GAAG,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;IAC9D,IAAI,KAAK;QAAE,MAAM,CAAC,KAAK,GAAG,KAAK,CAAC;IAEhC,MAAM,WAAW,GACf,cAAc,CAAC,GAAG,EAAE,aAAa,CAAC,IAAI,cAAc,CAAC,GAAG,EAAE,gBAAgB,CAAC,CAAC;IAC9E,IAAI,WAAW;QAAE,MAAM,CAAC,WAAW,GAAG,WAAW,CAAC;IAElD,MAAM,MAAM,GAAG,cAAc,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IAC7C,IAAI,MAAM;QAAE,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC;IAEnC,MAAM,IAAI,GACR,cAAc,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,cAAc,CAAC,GAAG,EAAE,wBAAwB,CAAC,CAAC;IAC/E,IAAI,IAAI;QAAE,MAAM,CAAC,IAAI,GAAG,IAAI,CAAC;IAE7B,MAAM,QAAQ,GAAG,cAAc,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;IACjD,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAC7E,CAAC;IAED,MAAM,OAAO,GAAG,cAAc,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;IAChD,IAAI,OAAO;QAAE,MAAM,CAAC,QAAQ,GAAG,OAAO,CAAC;IAEvC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,IAAY,EACZ,QAAgB,EAChB,QAAiB;IAEjB,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAE1C,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,QAAQ,GAAG,GAAG,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;QAChD,OAAO,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IACzE,CAAC;IAED,MAAM,EAAE,GAAG,GAAG,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;IACvC,OAAO,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;AACjD,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,EAAE,QAAQ,EAAE,GAAG,
EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,MAAM,GAAG,GAAG,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;IAC7C,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEnC,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE;QACtC,MAAM,OAAO,GAAG,KAAK,CAAC,aAAa,CAAC,SAAS,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,SAAS,CAAC;QAEjF,MAAM,UAAU,GAAG,KAAK,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC;QACtD,IAAI,OAAiB,CAAC;QACtB,IAAI,QAAmB,CAAC;QAExB,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAC5E,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,CAAC;YAC1D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC1B,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;gBACzD,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;aAAM,CAAC;YACN,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;YACzD,MAAM,QAAQ,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;YAC5B,MAAM,WAAW,GAAG,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAEhF,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC3B,OAAO,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;gBACjE,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,MAAM,SAAS,GAAG,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxE,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBACtE,QAAQ,GAAG,OAAO,CAAC;YACrB,CAAC;QACH,CAAC;QAED,MAAM,IAAI,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;YAChC,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;YACrD,MAAM,GAAG,GAA2B,EAAE,CAAC;YACvC,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;gBAC5B,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,WAAW,IAAI,EAAE,CAAC,
CAAC,IAAI,EAAE,CAAC;YACrD,CAAC,CAAC,CAAC;YACH,OAAO,GAAG,CAAC;QACb,CAAC,CAAC,CAAC;QAEH,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;IACpC,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { JsonSchema } from './schema.js';
|
|
2
|
+
/**
 * Collect the JSON-LD values found in the document's
 * `<script type="application/ld+json">` elements. A top-level array or an
 * `@graph` array contributes each of its members; script contents that fail
 * to parse are skipped.
 */
export declare function extractJsonLd(html: string): Record<string, unknown>[];
|
|
3
|
+
/**
 * Pick, for each property name declared in `schema.properties`, the value of
 * the same-named key from the flattened JSON-LD blocks. Returns `{}` when the
 * schema has no properties or no blocks are given.
 */
export declare function matchJsonLdToSchema(jsonLdBlocks: Record<string, unknown>[], schema: JsonSchema): Record<string, unknown>;
|
|
4
|
+
//# sourceMappingURL=jsonld.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"jsonld.d.ts","sourceRoot":"","sources":["../../src/extraction/jsonld.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAI9C,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,CAyBrE;AAED,wBAAgB,mBAAmB,CACjC,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,EACvC,MAAM,EAAE,UAAU,GACjB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAazB"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import { createLogger } from '../logger.js';
|
|
3
|
+
// Module-level logger created under the 'jsonld' name; used to report JSON-LD parse failures.
const log = createLogger('jsonld');
|
|
4
|
+
/**
 * Collect the JSON-LD objects embedded in an HTML document.
 *
 * Every `<script type="application/ld+json">` element is parsed with
 * `JSON.parse`. A top-level array contributes each of its members, an object
 * with an `@graph` array contributes each graph member, and any other object
 * is taken as-is. Script contents that are empty or fail to parse are skipped
 * (parse failures are logged at debug level).
 *
 * Only plain objects are returned: valid-JSON scalars, `null` and nested arrays
 * are dropped so that consumers iterating the result with `Object.entries`
 * never receive a non-object.
 *
 * @param {string} html - HTML source passed to `parseHTML`.
 * @returns {Object<string, unknown>[]} JSON-LD objects in document order.
 */
export function extractJsonLd(html) {
    const { document: doc } = parseHTML(html);
    const scripts = doc.querySelectorAll('script[type="application/ld+json"]');
    const isJsonObject = (value) => typeof value === 'object' && value !== null && !Array.isArray(value);
    const results = [];

    for (const script of scripts) {
        const text = script.textContent?.trim();
        if (!text)
            continue;

        let parsed;
        try {
            parsed = JSON.parse(text);
        }
        catch (err) {
            log.debug('Failed to parse JSON-LD block', { error: String(err) });
            continue;
        }

        let candidates;
        if (Array.isArray(parsed)) {
            candidates = parsed;
        }
        else if (isJsonObject(parsed) && Array.isArray(parsed['@graph'])) {
            candidates = parsed['@graph'];
        }
        else {
            candidates = [parsed];
        }

        // Push one by one: spreading a very large array into push() can exceed
        // the engine's argument limit.
        for (const candidate of candidates) {
            if (isJsonObject(candidate)) {
                results.push(candidate);
            }
        }
    }

    return results;
}
|
|
30
|
+
/**
 * Fill schema fields from JSON-LD data by exact key name.
 *
 * The blocks are flattened into a single key/value map and, for every property
 * name declared in `schema.properties`, the same-named flattened value (if any)
 * is copied into the result.
 *
 * @param {Object<string, unknown>[]} jsonLdBlocks - JSON-LD objects to search.
 * @param {{properties?: Object<string, unknown>}} schema - Schema whose property names are looked up.
 * @returns {Object<string, unknown>} Matched fields; `{}` when the schema is missing,
 *   has no `properties`, or `jsonLdBlocks` is empty.
 */
export function matchJsonLdToSchema(jsonLdBlocks, schema) {
    const properties = schema?.properties;
    if (!properties || jsonLdBlocks.length === 0)
        return {};

    const flattened = flattenJsonLd(jsonLdBlocks);
    const result = {};
    for (const fieldName of Object.keys(properties)) {
        // Own-property check: a schema field named like an Object.prototype member
        // ('constructor', 'toString', ...) must not pick up the inherited function.
        const isOwn = Object.prototype.hasOwnProperty.call(flattened, fieldName);
        if (isOwn && flattened[fieldName] !== undefined) {
            result[fieldName] = flattened[fieldName];
        }
    }
    return result;
}
|
|
42
|
+
/**
 * Merge a list of JSON-LD blocks into one flat key/value object by feeding
 * each block, in order, through `flattenObject` with a shared accumulator.
 *
 * @param {Iterable<Object<string, unknown>>} blocks - JSON-LD blocks to merge.
 * @returns {Object<string, unknown>} The accumulated flat map.
 */
function flattenJsonLd(blocks) {
    const merged = {};
    Array.from(blocks).forEach((entry) => {
        flattenObject(entry, merged);
    });
    return merged;
}
|
|
49
|
+
/**
 * Copy the leaf values of a (possibly nested) JSON-LD object into `target`.
 *
 * Keys starting with '@' are ignored. Nested plain objects are not stored under
 * their own key; their contents are hoisted into `target` instead. Arrays and
 * scalars (including `null`) are stored as-is.
 *
 * First-wins: a key already present in `target` is never overwritten, so
 * earlier calls take priority over later ones. Within one call the object is
 * walked level by level, so a shallower key always beats a deeper key of the
 * same name regardless of property order.
 *
 * @param {unknown} obj - Object to flatten; anything that is not a plain object is ignored.
 * @param {Object<string, unknown>} target - Accumulator, mutated in place.
 * @returns {void}
 */
function flattenObject(obj, target) {
    const isPlainObject = (value) => typeof value === 'object' && value !== null && !Array.isArray(value);
    // JSON-LD arrays may legally contain null/scalars; Object.entries(null) would throw.
    if (!isPlainObject(obj))
        return;

    let level = [obj];
    while (level.length > 0) {
        const nextLevel = [];
        for (const node of level) {
            for (const [key, value] of Object.entries(node)) {
                if (key.startsWith('@') || key in target)
                    continue;
                if (isPlainObject(value)) {
                    // Defer nested objects until every key of the current depth is stored.
                    nextLevel.push(value);
                }
                else {
                    target[key] = value;
                }
            }
        }
        level = nextLevel;
    }
}
|
|
64
|
+
//# sourceMappingURL=jsonld.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"jsonld.js","sourceRoot":"","sources":["../../src/extraction/jsonld.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAG5C,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;AAEnC,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,OAAO,GAAG,GAAG,CAAC,gBAAgB,CAAC,oCAAoC,CAAC,CAAC;IAC3E,MAAM,OAAO,GAA8B,EAAE,CAAC;IAE9C,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACxC,IAAI,CAAC,IAAI;gBAAE,SAAS;YAEpB,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAEhC,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC1B,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;YAC1B,CAAC;iBAAM,IAAI,MAAM,CAAC,QAAQ,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;YACpC,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACvB,CAAC;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,GAAG,CAAC,KAAK,CAAC,+BAA+B,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACrE,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,YAAuC,EACvC,MAAkB;IAElB,IAAI,CAAC,MAAM,CAAC,UAAU,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAE/D,MAAM,MAAM,GAA4B,EAAE,CAAC;IAC3C,MAAM,SAAS,GAAG,aAAa,CAAC,YAAY,CAAC,CAAC;IAE9C,KAAK,MAAM,SAAS,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;QACvD,IAAI,SAAS,CAAC,SAAS,CAAC,KAAK,SAAS,EAAE,CAAC;YACvC,MAAM,CAAC,SAAS,CAAC,GAAG,SAAS,CAAC,SAAS,CAAC,CAAC;QAC3C,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,aAAa,CACpB,MAAiC;IAEjC,MAAM,IAAI,GAA4B,EAAE,CAAC;IAEzC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,aAAa,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAC7B,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,aAAa,CACpB,GAA4B,EAC5B,MAA+B;IAE/B,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QAC/C,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,SAAS;QAElC,8DAA8D;QAC9D,IAAI,CAAC,CAAC,GAAG,IAAI,MAAM,CAAC,EAAE,C
AAC;YACrB,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;gBACzE,aAAa,CAAC,KAAgC,EAAE,MAAM,CAAC,CAAC;YAC1D,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;YACtB,CAAC;QACH,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Convert an HTML string to Markdown. Returns '' when `html` is empty.
 */
export declare function htmlToMarkdown(html: string): string;
|
|
2
|
+
/**
 * Return the part of a Markdown document that starts at the heading matching
 * `section` (case-insensitive; exact heading text first, substring as fallback)
 * and runs up to the next heading of the same or a higher level.
 * `sectionIndex` selects among several matching headings (default 0).
 * When nothing matches, `content` is the whole input and `matched` is false.
 */
export declare function extractSection(markdown: string, section: string, sectionIndex?: number): {
|
|
3
|
+
content: string;
|
|
4
|
+
matched: boolean;
|
|
5
|
+
};
|
|
6
|
+
/**
 * Collect the distinct targets of inline Markdown links (`[text](target)`) and
 * images (`![alt](target)`), each list in order of first appearance.
 */
export declare function extractLinksAndImages(markdown: string): {
|
|
7
|
+
links: string[];
|
|
8
|
+
images: string[];
|
|
9
|
+
};
|
|
10
|
+
//# sourceMappingURL=markdown.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.d.ts","sourceRoot":"","sources":["../../src/extraction/markdown.ts"],"names":[],"mappings":"AAkDA,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAGnD;AAmCD,wBAAgB,cAAc,CAC5B,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,EACf,YAAY,SAAI,GACf;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,OAAO,EAAE,OAAO,CAAA;CAAE,CA2BvC;AAED,wBAAgB,qBAAqB,CAAC,QAAQ,EAAE,MAAM,GAAG;IAAE,KAAK,EAAE,MAAM,EAAE,CAAC;IAAC,MAAM,EAAE,MAAM,EAAE,CAAA;CAAE,CAoB7F"}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
|
|
2
|
+
/**
 * Create the TurndownService used for HTML -> Markdown conversion.
 *
 * Configuration: ATX headings, fenced code blocks, `<script>`/`<style>` removed,
 * and a custom rule that renders a whole `<table>` as a pipe table. The first
 * `<tr>` always becomes the header line (a pipe table requires one), followed
 * by a `---` delimiter row sized to that first row.
 *
 * @returns {TurndownService} The configured converter.
 */
function buildTurndown() {
    const td = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });

    // Remove script and style tags entirely
    td.remove(['script', 'style']);

    // Custom rule: convert <table> to markdown table
    td.addRule('table', {
        filter: 'table',
        replacement(_content, node) {
            const rows = Array.from(node.querySelectorAll('tr'));
            if (rows.length === 0)
                return '';

            const cellsOf = (row) => Array.from(row.querySelectorAll('th, td'));
            // Newlines would end the table row, and an unescaped '|' would be read
            // as a column boundary, so flatten the former and escape the latter.
            const cellText = (cell) => (cell.textContent ?? '')
                .replace(/\n/g, ' ')
                .replace(/\|/g, '\\|')
                .trim();
            const renderRow = (row) => `| ${cellsOf(row).map(cellText).join(' | ')} |`;

            const [firstRow, ...remainingRows] = rows;
            const separator = `| ${cellsOf(firstRow).map(() => '---').join(' | ')} |`;
            const lines = [renderRow(firstRow), separator, ...remainingRows.map(renderRow)];
            return `\n\n${lines.join('\n')}\n\n`;
        },
    });

    // Suppress thead/tbody/tr/th/td individually since table rule handles the whole node
    td.addRule('tableCell', {
        filter: ['thead', 'tbody', 'tfoot', 'tr', 'th', 'td'],
        replacement(content) {
            return content;
        },
    });

    return td;
}
|
|
40
|
+
// Converter instance built once when the module loads and reused by htmlToMarkdown.
const turndown = buildTurndown();
|
|
41
|
+
/**
 * Convert an HTML string to Markdown with the module's shared converter.
 *
 * @param {string} html - HTML to convert.
 * @returns {string} The Markdown output, or '' when `html` is falsy.
 */
export function htmlToMarkdown(html) {
    return html ? turndown.turndown(html) : '';
}
|
|
46
|
+
/**
 * Locate the ATX headings (`#` .. `######`) in a Markdown document.
 *
 * Lines inside fenced code blocks (``` or ~~~) are ignored, so a shell comment
 * such as `# install deps` inside a code sample is not mistaken for a heading.
 *
 * @param {string[]} lines - The document split into lines.
 * @returns {{level: number, text: string, lineIndex: number}[]} Headings in
 *   document order: `level` is the number of `#`, `text` the trimmed heading
 *   text, `lineIndex` the index into `lines`.
 */
function parseHeadings(lines) {
    const headings = [];
    // Marker (e.g. '```' or '~~~~') of the fence currently open, or null outside code.
    let openFence = null;

    lines.forEach((line, lineIndex) => {
        const fence = line.match(/^ {0,3}(`{3,}|~{3,})/);
        if (fence) {
            const marker = fence[1];
            const rest = line.slice(fence[0].length);
            if (openFence === null) {
                // A backtick fence's info string may not contain backticks;
                // otherwise the line is inline code, not a fence.
                if (marker[0] !== '`' || !rest.includes('`')) {
                    openFence = marker;
                    return;
                }
            }
            else {
                // A closing fence uses the same character, is at least as long as
                // the opener, and carries no info string.
                if (marker[0] === openFence[0] && marker.length >= openFence.length && rest.trim() === '') {
                    openFence = null;
                }
                return;
            }
        }
        if (openFence !== null)
            return;

        const match = line.match(/^(#{1,6})\s+(.+)/);
        if (match) {
            headings.push({ level: match[1].length, text: match[2].trim(), lineIndex });
        }
    });

    return headings;
}
|
|
56
|
+
/**
 * Return the text of one section: the heading line at `headings[headingIdx]`
 * plus every following line up to (not including) the next heading whose level
 * is the same or higher (i.e. an equal or smaller `#` count). When no such
 * heading follows, the section runs to the end of the document.
 *
 * @param {string[]} lines - The document split into lines.
 * @param {{level: number, lineIndex: number}[]} headings - Headings of the document, in order.
 * @param {number} headingIdx - Index into `headings` of the section's heading.
 * @returns {string} The section's lines joined with '\n'.
 */
function extractFromHeading(lines, headings, headingIdx) {
    const { level, lineIndex: firstLine } = headings[headingIdx];
    const boundary = headings
        .slice(headingIdx + 1)
        .find((candidate) => candidate.level <= level);
    const stopLine = boundary ? boundary.lineIndex : lines.length;
    return lines.slice(firstLine, stopLine).join('\n');
}
|
|
69
|
+
/**
 * Extract one section of a Markdown document by heading text.
 *
 * Matching is case-insensitive. Headings whose text equals `section` are
 * preferred; if there are fewer exact matches than `sectionIndex` requires,
 * headings whose text merely contains `section` are used instead (that list
 * includes the exact ones). The section spans from the chosen heading up to
 * the next heading of the same or a higher level.
 *
 * @param {string} markdown - The Markdown document.
 * @param {string} section - Heading text to look for.
 * @param {number} [sectionIndex=0] - Zero-based index among the matching headings.
 * @returns {{content: string, matched: boolean}} The section text with
 *   `matched: true`, or the whole document with `matched: false` when no
 *   heading qualifies or `sectionIndex` is not a non-negative integer.
 */
export function extractSection(markdown, section, sectionIndex = 0) {
    // A negative or fractional index would select `undefined` below and make the
    // destructuring throw; treat it as "no such section" instead.
    if (!Number.isInteger(sectionIndex) || sectionIndex < 0) {
        return { content: markdown, matched: false };
    }

    const lines = markdown.split('\n');
    const headings = parseHeadings(lines);
    if (headings.length === 0)
        return { content: markdown, matched: false };

    const lower = section.toLowerCase();
    const indexed = headings.map((h, i) => ({ h, i }));

    // Exact matches win when they can satisfy the requested index...
    const exactMatches = indexed.filter(({ h }) => h.text.toLowerCase() === lower);
    // ...otherwise fall back to substring matches (exact headings included).
    const candidates = sectionIndex < exactMatches.length
        ? exactMatches
        : indexed.filter(({ h }) => h.text.toLowerCase().includes(lower));

    const hit = candidates[sectionIndex];
    if (!hit) {
        return { content: markdown, matched: false };
    }
    return { content: extractFromHeading(lines, headings, hit.i), matched: true };
}
|
|
91
|
+
/**
 * Collect the distinct targets of inline Markdown links and images.
 *
 * Handles the inline forms `[text](url)`, `[text](url "title")` and
 * `![alt](url)`:
 *   - an optional quoted title after the URL is not part of the result;
 *   - URLs may contain one level of balanced parentheses (`.../Foo_(bar)`) or
 *     backslash-escaped parentheses, which are unescaped in the result;
 *   - for a linked image `[![alt](img)](href)`, `img` is reported as an image
 *     and `href` as a link.
 *
 * @param {string} markdown - The Markdown text to scan.
 * @returns {{links: string[], images: string[]}} De-duplicated targets, each
 *   list in order of first appearance.
 */
export function extractLinksAndImages(markdown) {
    // "(" url [whitespace + quoted title] ")". The url part is lazy so that it
    // stops at the first ")" that can close the destination, and it accepts
    // escaped characters and one level of balanced parentheses.
    const destination = String.raw`\(\s*((?:\\.|[^()\\]|\([^()]*\))+?)(?:\s+(?:"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'))?\s*\)`;
    // Link text may contain escaped characters and one nested [...] group, which
    // is what an image used as link text looks like.
    const linkText = String.raw`\[(?:\\.|[^\[\]\\]|\[(?:\\.|[^\[\]\\])*\])*\]`;
    const imagePattern = new RegExp(String.raw`!\[[^\]]*\]` + destination, 'g');
    // The lookbehind keeps "![alt](src)" from also being counted as a link.
    const linkPattern = new RegExp(`(?<!!)${linkText}${destination}`, 'g');

    const unescapeParens = (url) => url.replace(/\\([()])/g, '$1');
    const collect = (pattern) => {
        const found = new Set();
        for (const match of markdown.matchAll(pattern)) {
            found.add(unescapeParens(match[1]));
        }
        return Array.from(found);
    };

    const images = collect(imagePattern);
    const links = collect(linkPattern);
    return { links, images };
}
|
|
107
|
+
//# sourceMappingURL=markdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../src/extraction/markdown.ts"],"names":[],"mappings":"AAAA,OAAO,eAAe,MAAM,UAAU,CAAC;AAEvC,SAAS,aAAa;IACpB,MAAM,EAAE,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;IAElF,wCAAwC;IACxC,EAAE,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;IAE/B,iDAAiD;IACjD,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE;QAClB,MAAM,EAAE,OAAO;QACf,WAAW,CAAC,QAAQ,EAAE,IAAI;YACxB,MAAM,EAAE,GAAG,IAAe,CAAC;YAC3B,MAAM,IAAI,GAAc,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;YAC9D,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO,EAAE,CAAC;YAEjC,MAAM,SAAS,GAAG,CAAC,GAAY,EAAU,EAAE;gBACzC,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC;gBACzD,OAAO,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;YACnG,CAAC,CAAC;YAEF,MAAM,SAAS,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,WAAW,GAAG,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YAChE,MAAM,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC;YACrE,MAAM,SAAS,GAAG,IAAI,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;YAEzE,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAC/B,MAAM,KAAK,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,GAAG,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC;gBAC5E,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;YAC5C,CAAC;YAED,MAAM,KAAK,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC;YACjF,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;QAC5C,CAAC;KACF,CAAC,CAAC;IAEH,qFAAqF;IACrF,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE;QACtB,MAAM,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;QACrD,WAAW,CAAC,OAAO;YACjB,OAAO,OAAO,CAAC;QACjB,CAAC;KACF,CAAC,CAAC;IAEH,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,QAAQ,GAAG,aAAa,EAAE,CAAC;A
AEjC,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IACrB,OAAO,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC;AAQD,SAAS,aAAa,CAAC,KAAe;IACpC,MAAM,QAAQ,GAAc,EAAE,CAAC;IAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACjD,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;QACjF,CAAC;IACH,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,SAAS,kBAAkB,CAAC,KAAe,EAAE,QAAmB,EAAE,UAAkB;IAClF,MAAM,OAAO,GAAG,QAAQ,CAAC,UAAU,CAAC,CAAC;IACrC,MAAM,KAAK,GAAG,OAAO,CAAC,SAAS,CAAC;IAEhC,0EAA0E;IAC1E,IAAI,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC;IACvB,KAAK,IAAI,CAAC,GAAG,UAAU,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtD,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC,KAAK,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YACvC,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC5B,MAAM;QACR,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,cAAc,CAC5B,QAAgB,EAChB,OAAe,EACf,YAAY,GAAG,CAAC;IAEhB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,QAAQ,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;IAEtC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;IAExE,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IACpC,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;IAEnD,8BAA8B;IAC9B,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,KAAK,CAAC,CAAC;IAE/E,yDAAyD;IACzD,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC;QAClE,MAAM,EAAE,CAAC,EAAE,GAAG,YAAY,CAAC,YAAY,CAAC,CAAC;QACzC,OAAO,EAAE,OAAO,EAAE,kBAAkB,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;IAC5E,CAAC;IAED,4EAA4E;IAC5E,MAAM,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,EAA
E,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IAEzF,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,IAAI,YAAY,IAAI,gBAAgB,CAAC,MAAM,EAAE,CAAC;QAC7E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;IAC/C,CAAC;IAED,MAAM,EAAE,CAAC,EAAE,GAAG,gBAAgB,CAAC,YAAY,CAAC,CAAC;IAC7C,OAAO,EAAE,OAAO,EAAE,kBAAkB,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;AAC5E,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,QAAgB;IACpD,MAAM,YAAY,GAAG,yBAAyB,CAAC;IAC/C,MAAM,WAAW,GAAG,8BAA8B,CAAC;IAEnD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAU,CAAC;IACjC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;IAEhC,IAAI,KAA6B,CAAC;IAElC,uBAAuB;IACvB,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACtD,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,4BAA4B;IAC5B,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACrD,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACtB,CAAC;IAED,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;AAClE,CAAC"}
|