@staticn0va/wigolo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +74 -0
- package/README.md +272 -0
- package/dist/cache/db.d.ts +5 -0
- package/dist/cache/db.d.ts.map +1 -0
- package/dist/cache/db.js +97 -0
- package/dist/cache/db.js.map +1 -0
- package/dist/cache/store.d.ts +26 -0
- package/dist/cache/store.d.ts.map +1 -0
- package/dist/cache/store.js +214 -0
- package/dist/cache/store.js.map +1 -0
- package/dist/cli/daemon.d.ts +2 -0
- package/dist/cli/daemon.d.ts.map +1 -0
- package/dist/cli/daemon.js +5 -0
- package/dist/cli/daemon.js.map +1 -0
- package/dist/cli/health.d.ts +2 -0
- package/dist/cli/health.d.ts.map +1 -0
- package/dist/cli/health.js +5 -0
- package/dist/cli/health.js.map +1 -0
- package/dist/cli/index.d.ts +7 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +9 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/warmup.d.ts +11 -0
- package/dist/cli/warmup.d.ts.map +1 -0
- package/dist/cli/warmup.js +107 -0
- package/dist/cli/warmup.js.map +1 -0
- package/dist/config.d.ts +41 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +66 -0
- package/dist/config.js.map +1 -0
- package/dist/crawl/crawler.d.ts +18 -0
- package/dist/crawl/crawler.d.ts.map +1 -0
- package/dist/crawl/crawler.js +228 -0
- package/dist/crawl/crawler.js.map +1 -0
- package/dist/crawl/dedup.d.ts +15 -0
- package/dist/crawl/dedup.d.ts.map +1 -0
- package/dist/crawl/dedup.js +93 -0
- package/dist/crawl/dedup.js.map +1 -0
- package/dist/crawl/mapper.d.ts +17 -0
- package/dist/crawl/mapper.d.ts.map +1 -0
- package/dist/crawl/mapper.js +178 -0
- package/dist/crawl/mapper.js.map +1 -0
- package/dist/crawl/rate-limiter.d.ts +10 -0
- package/dist/crawl/rate-limiter.d.ts.map +1 -0
- package/dist/crawl/rate-limiter.js +72 -0
- package/dist/crawl/rate-limiter.js.map +1 -0
- package/dist/crawl/robots.d.ts +9 -0
- package/dist/crawl/robots.d.ts.map +1 -0
- package/dist/crawl/robots.js +63 -0
- package/dist/crawl/robots.js.map +1 -0
- package/dist/crawl/sitemap.d.ts +4 -0
- package/dist/crawl/sitemap.d.ts.map +1 -0
- package/dist/crawl/sitemap.js +38 -0
- package/dist/crawl/sitemap.js.map +1 -0
- package/dist/crawl/url-utils.d.ts +3 -0
- package/dist/crawl/url-utils.d.ts.map +1 -0
- package/dist/crawl/url-utils.js +41 -0
- package/dist/crawl/url-utils.js.map +1 -0
- package/dist/extraction/defuddle.d.ts +3 -0
- package/dist/extraction/defuddle.d.ts.map +1 -0
- package/dist/extraction/defuddle.js +26 -0
- package/dist/extraction/defuddle.js.map +1 -0
- package/dist/extraction/extract.d.ts +5 -0
- package/dist/extraction/extract.d.ts.map +1 -0
- package/dist/extraction/extract.js +83 -0
- package/dist/extraction/extract.js.map +1 -0
- package/dist/extraction/jsonld.d.ts +4 -0
- package/dist/extraction/jsonld.d.ts.map +1 -0
- package/dist/extraction/jsonld.js +64 -0
- package/dist/extraction/jsonld.js.map +1 -0
- package/dist/extraction/markdown.d.ts +10 -0
- package/dist/extraction/markdown.d.ts.map +1 -0
- package/dist/extraction/markdown.js +107 -0
- package/dist/extraction/markdown.js.map +1 -0
- package/dist/extraction/pipeline.d.ts +11 -0
- package/dist/extraction/pipeline.d.ts.map +1 -0
- package/dist/extraction/pipeline.js +95 -0
- package/dist/extraction/pipeline.js.map +1 -0
- package/dist/extraction/readability.d.ts +3 -0
- package/dist/extraction/readability.d.ts.map +1 -0
- package/dist/extraction/readability.js +32 -0
- package/dist/extraction/readability.js.map +1 -0
- package/dist/extraction/schema.d.ts +7 -0
- package/dist/extraction/schema.d.ts.map +1 -0
- package/dist/extraction/schema.js +86 -0
- package/dist/extraction/schema.js.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts +3 -0
- package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -0
- package/dist/extraction/site-extractors/docs-generic.js +104 -0
- package/dist/extraction/site-extractors/docs-generic.js.map +1 -0
- package/dist/extraction/site-extractors/github.d.ts +3 -0
- package/dist/extraction/site-extractors/github.d.ts.map +1 -0
- package/dist/extraction/site-extractors/github.js +107 -0
- package/dist/extraction/site-extractors/github.js.map +1 -0
- package/dist/extraction/site-extractors/mdn.d.ts +3 -0
- package/dist/extraction/site-extractors/mdn.d.ts.map +1 -0
- package/dist/extraction/site-extractors/mdn.js +58 -0
- package/dist/extraction/site-extractors/mdn.js.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts +3 -0
- package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -0
- package/dist/extraction/site-extractors/stackoverflow.js +88 -0
- package/dist/extraction/site-extractors/stackoverflow.js.map +1 -0
- package/dist/extraction/trafilatura.d.ts +6 -0
- package/dist/extraction/trafilatura.d.ts.map +1 -0
- package/dist/extraction/trafilatura.js +105 -0
- package/dist/extraction/trafilatura.js.map +1 -0
- package/dist/fetch/auth.d.ts +8 -0
- package/dist/fetch/auth.d.ts.map +1 -0
- package/dist/fetch/auth.js +32 -0
- package/dist/fetch/auth.js.map +1 -0
- package/dist/fetch/browser-pool.d.ts +28 -0
- package/dist/fetch/browser-pool.d.ts.map +1 -0
- package/dist/fetch/browser-pool.js +138 -0
- package/dist/fetch/browser-pool.js.map +1 -0
- package/dist/fetch/content-check.d.ts +2 -0
- package/dist/fetch/content-check.d.ts.map +1 -0
- package/dist/fetch/content-check.js +62 -0
- package/dist/fetch/content-check.js.map +1 -0
- package/dist/fetch/http-client.d.ts +15 -0
- package/dist/fetch/http-client.d.ts.map +1 -0
- package/dist/fetch/http-client.js +146 -0
- package/dist/fetch/http-client.js.map +1 -0
- package/dist/fetch/router.d.ts +45 -0
- package/dist/fetch/router.d.ts.map +1 -0
- package/dist/fetch/router.js +89 -0
- package/dist/fetch/router.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +10 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +39 -0
- package/dist/logger.js.map +1 -0
- package/dist/search/dedup.d.ts +10 -0
- package/dist/search/dedup.d.ts.map +1 -0
- package/dist/search/dedup.js +35 -0
- package/dist/search/dedup.js.map +1 -0
- package/dist/search/engines/bing.d.ts +7 -0
- package/dist/search/engines/bing.d.ts.map +1 -0
- package/dist/search/engines/bing.js +48 -0
- package/dist/search/engines/bing.js.map +1 -0
- package/dist/search/engines/duckduckgo.d.ts +7 -0
- package/dist/search/engines/duckduckgo.d.ts.map +1 -0
- package/dist/search/engines/duckduckgo.js +50 -0
- package/dist/search/engines/duckduckgo.js.map +1 -0
- package/dist/search/engines/startpage.d.ts +7 -0
- package/dist/search/engines/startpage.d.ts.map +1 -0
- package/dist/search/engines/startpage.js +50 -0
- package/dist/search/engines/startpage.js.map +1 -0
- package/dist/search/filters.d.ts +16 -0
- package/dist/search/filters.d.ts.map +1 -0
- package/dist/search/filters.js +63 -0
- package/dist/search/filters.js.map +1 -0
- package/dist/search/flashrank.d.ts +12 -0
- package/dist/search/flashrank.d.ts.map +1 -0
- package/dist/search/flashrank.js +63 -0
- package/dist/search/flashrank.js.map +1 -0
- package/dist/search/query.d.ts +2 -0
- package/dist/search/query.d.ts.map +1 -0
- package/dist/search/query.js +41 -0
- package/dist/search/query.js.map +1 -0
- package/dist/search/rerank.d.ts +3 -0
- package/dist/search/rerank.d.ts.map +1 -0
- package/dist/search/rerank.js +40 -0
- package/dist/search/rerank.js.map +1 -0
- package/dist/search/searxng.d.ts +8 -0
- package/dist/search/searxng.d.ts.map +1 -0
- package/dist/search/searxng.js +87 -0
- package/dist/search/searxng.js.map +1 -0
- package/dist/search/validator.d.ts +6 -0
- package/dist/search/validator.d.ts.map +1 -0
- package/dist/search/validator.js +35 -0
- package/dist/search/validator.js.map +1 -0
- package/dist/searxng/bootstrap.d.ts +18 -0
- package/dist/searxng/bootstrap.d.ts.map +1 -0
- package/dist/searxng/bootstrap.js +136 -0
- package/dist/searxng/bootstrap.js.map +1 -0
- package/dist/searxng/docker.d.ts +9 -0
- package/dist/searxng/docker.d.ts.map +1 -0
- package/dist/searxng/docker.js +67 -0
- package/dist/searxng/docker.js.map +1 -0
- package/dist/searxng/process.d.ts +23 -0
- package/dist/searxng/process.d.ts.map +1 -0
- package/dist/searxng/process.js +188 -0
- package/dist/searxng/process.js.map +1 -0
- package/dist/server.d.ts +2 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +311 -0
- package/dist/server.js.map +1 -0
- package/dist/tools/cache.d.ts +3 -0
- package/dist/tools/cache.d.ts.map +1 -0
- package/dist/tools/cache.js +50 -0
- package/dist/tools/cache.js.map +1 -0
- package/dist/tools/crawl.d.ts +6 -0
- package/dist/tools/crawl.d.ts.map +1 -0
- package/dist/tools/crawl.js +97 -0
- package/dist/tools/crawl.js.map +1 -0
- package/dist/tools/extract.d.ts +4 -0
- package/dist/tools/extract.d.ts.map +1 -0
- package/dist/tools/extract.js +69 -0
- package/dist/tools/extract.js.map +1 -0
- package/dist/tools/fetch.d.ts +4 -0
- package/dist/tools/fetch.d.ts.map +1 -0
- package/dist/tools/fetch.js +76 -0
- package/dist/tools/fetch.js.map +1 -0
- package/dist/tools/search.d.ts +4 -0
- package/dist/tools/search.d.ts.map +1 -0
- package/dist/tools/search.js +160 -0
- package/dist/tools/search.js.map +1 -0
- package/dist/types.d.ts +222 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +61 -0
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
import { matchesPatterns } from './url-utils.js';
|
|
2
|
+
import { RateLimiter } from './rate-limiter.js';
|
|
3
|
+
import { RobotsParser } from './robots.js';
|
|
4
|
+
import { parseSitemap, parseSitemapIndex, extractSitemapUrlFromRobots } from './sitemap.js';
|
|
5
|
+
import { getConfig } from '../config.js';
|
|
6
|
+
import { createLogger } from '../logger.js';
|
|
7
|
+
// Module-level logger shared by the crawler; 'crawl' is the name handed to createLogger.
const log = createLogger('crawl');
|
|
8
|
+
/**
 * Crawls a site starting from a seed URL, either by following links
 * (breadth-first or depth-first) or by fetching the URLs listed in the
 * site's sitemap.
 *
 * All page fetches go through the shared rate limiter, and robots.txt is
 * honoured when `getConfig().respectRobotsTxt` is set.
 */
export class Crawler {
    /**
     * Page fetcher. The result is read as `{ url, title, markdown, links, error }`;
     * a truthy `error` marks a failed page.
     */
    fetchFn;
    /**
     * Raw fetcher used for robots.txt and sitemap documents. The result is
     * read as `{ statusCode, html }`.
     */
    rawFetchFn;
    /** Rate limiter; `acquire(url)` resolves to a release callback. */
    rateLimiter = new RateLimiter();
    /** Body of the robots.txt fetched for the current crawl, or null. */
    robotsTxtContent = null;

    /**
     * @param fetchFn    fetches and extracts a page (see `fetchFn` field)
     * @param rawFetchFn fetches a raw document (see `rawFetchFn` field)
     */
    constructor(fetchFn, rawFetchFn) {
        this.fetchFn = fetchFn;
        this.rawFetchFn = rawFetchFn;
    }

    /**
     * Entry point: crawl starting at `input.url`.
     *
     * Reads `input.strategy` (default 'bfs'; 'map' is treated as 'bfs'),
     * `input.max_depth` (default 2) and `input.max_pages` (default 20).
     *
     * @returns `{ pages, total_found, crawled }`, plus `links` when
     *          `input.extract_links` is truthy
     * @throws TypeError when `input.url` is not a valid absolute URL
     */
    async crawl(input) {
        const strategy = input.strategy ?? 'bfs';
        const maxDepth = input.max_depth ?? 2;
        const maxPages = input.max_pages ?? 20;
        const seedOrigin = new URL(input.url).origin;

        // Forget robots.txt content left over from a previous crawl() on this
        // instance; otherwise a crawl of a different origin whose robots.txt
        // is missing (or not requested) would discover sitemaps from the old one.
        this.robotsTxtContent = null;

        let robotsParser = null;
        if (getConfig().respectRobotsTxt) {
            robotsParser = await this.fetchRobots(seedOrigin);
        }

        if (strategy === 'sitemap') {
            return this.crawlSitemap(input, seedOrigin, maxPages, robotsParser);
        }
        const traversalStrategy = strategy === 'map' ? 'bfs' : strategy;
        return this.crawlTraversal(input, seedOrigin, maxDepth, maxPages, traversalStrategy, robotsParser);
    }

    /**
     * Fetch and parse `${origin}/robots.txt`.
     *
     * On a 200 response with a body, the body is remembered in
     * `robotsTxtContent`, any crawl delay reported by the parser is registered
     * with the rate limiter under the origin's hostname, and the parser is
     * returned. Any other outcome (including a thrown fetch) yields null.
     */
    async fetchRobots(origin) {
        try {
            const result = await this.rawFetchFn(`${origin}/robots.txt`);
            if (result.statusCode === 200 && result.html) {
                this.robotsTxtContent = result.html;
                const parser = new RobotsParser(result.html);
                const crawlDelay = parser.getCrawlDelay();
                if (crawlDelay !== null) {
                    this.rateLimiter.setRobotsCrawlDelay(new URL(origin).hostname, crawlDelay);
                }
                return parser;
            }
        }
        catch {
            log.debug('Could not fetch robots.txt', { origin });
        }
        return null;
    }

    /**
     * Link-following crawl from `input.url`.
     *
     * @param strategy 'dfs' uses the pending list as a stack; anything else
     *                 consumes it as a FIFO queue
     * @returns pages in fetch order; `total_found` is the number of unique
     *          URLs discovered, including ones never fetched
     */
    async crawlTraversal(input, seedOrigin, maxDepth, maxPages, strategy, robotsParser) {
        const depthFirst = strategy === 'dfs';
        const visited = new Set([input.url]);
        const pages = [];
        const allLinks = [];
        // Pending entries are [url, depth] pairs.
        const pending = [[input.url, 0]];

        while (pending.length > 0 && pages.length < maxPages) {
            const [url, depth] = depthFirst ? pending.pop() : pending.shift();

            if (robotsParser && !robotsParser.isAllowed(new URL(url).pathname)) {
                log.debug('Blocked by robots.txt', { url });
                continue;
            }

            const release = await this.rateLimiter.acquire(url);
            let fetchResult;
            try {
                fetchResult = await this.fetchFn(url);
            }
            catch (err) {
                log.warn('Fetch failed during crawl', { url, error: String(err) });
                continue;
            }
            finally {
                // Exactly one release per acquire, on success and on failure.
                release();
            }

            if (fetchResult.error) {
                log.warn('Fetch returned error', { url, error: fetchResult.error });
                continue;
            }

            pages.push({
                url: fetchResult.url,
                title: fetchResult.title,
                markdown: fetchResult.markdown,
                depth,
            });

            const links = fetchResult.links ?? [];

            // Record link edges for every fetched page, including pages at the
            // maximum depth (matches crawlSitemap, and makes max_depth = 0 useful).
            if (input.extract_links) {
                for (const link of links) {
                    allLinks.push({ from: url, to: link });
                }
            }

            if (depth >= maxDepth) {
                continue;
            }

            const newLinks = this.filterLinks(links, seedOrigin, visited, input.include_patterns, input.exclude_patterns, robotsParser);
            // filterLinks puts documentation pages first. A stack pops from the
            // end, so push in reverse for DFS to keep that priority.
            const ordered = depthFirst ? [...newLinks].reverse() : newLinks;
            for (const link of ordered) {
                visited.add(link);
                pending.push([link, depth + 1]);
            }
        }

        return {
            pages,
            total_found: visited.size,
            crawled: pages.length,
            ...(input.extract_links ? { links: allLinks } : {}),
        };
    }

    /**
     * Select the links worth enqueueing: same origin as the seed, not yet
     * visited, accepted by the include/exclude patterns and allowed by
     * robots.txt. Unparseable links are dropped, and a link that occurs
     * several times in `links` is returned once.
     *
     * @returns a new array with documentation pages (per `isDocPage`) first;
     *          relative order is otherwise preserved
     */
    filterLinks(links, seedOrigin, visited, includePatterns, excludePatterns, robotsParser) {
        const accepted = new Set();
        for (const link of links) {
            try {
                const parsed = new URL(link);
                if (parsed.origin !== seedOrigin)
                    continue;
                if (visited.has(link) || accepted.has(link))
                    continue;
                if (!matchesPatterns(link, includePatterns, excludePatterns))
                    continue;
                if (robotsParser && !robotsParser.isAllowed(parsed.pathname))
                    continue;
                accepted.add(link);
            }
            catch {
                // Not a valid absolute URL (or a filter rejected it by throwing): skip.
            }
        }
        // Prioritize documentation pages over marketing/nav pages.
        return [...accepted].sort((a, b) => Number(!isDocPage(a)) - Number(!isDocPage(b)));
    }

    /**
     * Sitemap-driven crawl: fetch up to `maxPages` URLs listed in the site's
     * sitemap(s). Falls back to a BFS traversal when no sitemap URL is found.
     *
     * `total_found` counts the sitemap URLs that pass the include/exclude
     * patterns (before the robots.txt filter). Every page has depth 0.
     */
    async crawlSitemap(input, seedOrigin, maxPages, robotsParser) {
        const discovered = await this.discoverSitemapUrls(seedOrigin, this.robotsTxtContent);
        if (discovered.length === 0) {
            log.info('No sitemap found, falling back to BFS');
            return this.crawlTraversal(input, seedOrigin, input.max_depth ?? 2, maxPages, 'bfs', robotsParser);
        }

        const sitemapUrls = discovered.filter((url) => matchesPatterns(url, input.include_patterns, input.exclude_patterns));
        const totalFound = sitemapUrls.length;

        // Apply robots.txt before taking the page budget so blocked URLs do
        // not use up slots. A malformed entry is skipped instead of rejecting
        // the whole crawl.
        const allowedByRobots = (url) => {
            if (!robotsParser)
                return true;
            try {
                return robotsParser.isAllowed(new URL(url).pathname);
            }
            catch {
                log.debug('Skipping malformed sitemap URL', { url });
                return false;
            }
        };
        const toFetch = sitemapUrls.filter(allowedByRobots).slice(0, maxPages);

        const pages = [];
        const allLinks = [];
        for (const url of toFetch) {
            const release = await this.rateLimiter.acquire(url);
            let result;
            try {
                result = await this.fetchFn(url);
            }
            catch (err) {
                log.warn('Sitemap fetch failed', { url, error: String(err) });
                continue;
            }
            finally {
                // Exactly one release per acquire, on success and on failure.
                release();
            }

            if (result.error) {
                log.warn('Fetch returned error', { url, error: result.error });
                continue;
            }

            pages.push({ url: result.url, title: result.title, markdown: result.markdown, depth: 0 });
            if (input.extract_links) {
                for (const link of result.links ?? []) {
                    allLinks.push({ from: url, to: link });
                }
            }
        }

        return {
            pages,
            total_found: totalFound,
            crawled: pages.length,
            ...(input.extract_links ? { links: allLinks } : {}),
        };
    }

    /**
     * Collect page URLs from the site's sitemaps.
     *
     * Sitemap locations come from `robotsTxt` when it lists any; otherwise
     * `${origin}/sitemap.xml` is tried. A sitemap index is expanded one level.
     * Documents that fail to download or do not answer 200 are skipped.
     *
     * @param robotsTxt robots.txt body already fetched for this crawl, or null
     * @returns unique page URLs in discovery order
     */
    async discoverSitemapUrls(origin, robotsTxt) {
        const sitemapLocations = robotsTxt ? [...extractSitemapUrlFromRobots(robotsTxt)] : [];
        if (sitemapLocations.length === 0) {
            sitemapLocations.push(`${origin}/sitemap.xml`);
        }

        // A Set removes URLs listed in several sitemaps; adding one by one
        // avoids spreading a very large sitemap into a single call.
        const found = new Set();
        const collect = (xml) => {
            for (const pageUrl of parseSitemap(xml)) {
                found.add(pageUrl);
            }
        };

        for (const sitemapUrl of sitemapLocations) {
            try {
                const result = await this.rawFetchFn(sitemapUrl);
                if (result.statusCode !== 200)
                    continue;

                const indexUrls = parseSitemapIndex(result.html);
                if (indexUrls.length === 0) {
                    collect(result.html);
                    continue;
                }

                // Sitemap index: fetch each referenced sub-sitemap.
                for (const subUrl of indexUrls) {
                    try {
                        const subResult = await this.rawFetchFn(subUrl);
                        if (subResult.statusCode === 200) {
                            collect(subResult.html);
                        }
                    }
                    catch (err) {
                        log.debug('Could not fetch sub-sitemap', { url: subUrl, error: String(err) });
                    }
                }
            }
            catch (err) {
                log.debug('Could not fetch sitemap', { url: sitemapUrl, error: String(err) });
            }
        }
        return [...found];
    }
}
|
|
223
|
+
/** Path fragments (lower-case, slash-delimited) that mark a URL as documentation. */
const DOC_PATH_PATTERNS = ['/docs/', '/guide/', '/api/', '/reference/'];

/**
 * Report whether a URL looks like a documentation page, i.e. whether its path
 * contains one of the DOC_PATH_PATTERNS segments (case-insensitive).
 *
 * The path is matched with a trailing slash appended, so a section root such
 * as `/docs` counts as well as `/docs/...`.
 *
 * @param url absolute URL string
 * @returns true for documentation-looking paths; false otherwise, including
 *          when `url` cannot be parsed
 */
function isDocPage(url) {
    let path;
    try {
        path = new URL(url).pathname.toLowerCase();
    }
    catch {
        // Used inside a sort comparator: an unparseable URL is simply "not docs".
        return false;
    }
    const normalized = path.endsWith('/') ? path : `${path}/`;
    return DOC_PATH_PATTERNS.some((pattern) => normalized.includes(pattern));
}
|
|
228
|
+
//# sourceMappingURL=crawler.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawler.js","sourceRoot":"","sources":["../../src/crawl/crawler.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,YAAY,EAAE,iBAAiB,EAAE,2BAA2B,EAAE,MAAM,cAAc,CAAC;AAC5F,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAE5C,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;AAKlC,MAAM,OAAO,OAAO;IACV,OAAO,CAAU;IACjB,UAAU,CAAa;IACvB,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;IAExC,YAAY,OAAgB,EAAE,UAAsB;QAClD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,KAAiB;QAC3B,MAAM,QAAQ,GAAG,KAAK,CAAC,QAAQ,IAAI,KAAK,CAAC;QACzC,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,CAAC,CAAC;QACtC,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,EAAE,CAAC;QAEvC,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC;QAE7C,2CAA2C;QAC3C,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAwB,IAAI,CAAC;QAC7C,IAAI,MAAM,CAAC,gBAAgB,EAAE,CAAC;YAC5B,YAAY,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;QACpD,CAAC;QAED,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC3B,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,YAAY,CAAC,CAAC;QACtE,CAAC;QAED,MAAM,iBAAiB,GAAG,QAAQ,KAAK,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC;QAChE,OAAO,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,YAAY,CAAC,CAAC;IACrG,CAAC;IAEO,gBAAgB,GAAkB,IAAI,CAAC;IAEvC,KAAK,CAAC,WAAW,CAAC,MAAc;QACtC,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,GAAG,MAAM,aAAa,CAAC,CAAC;YAC7D,IAAI,MAAM,CAAC,UAAU,KAAK,GAAG,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;gBAC7C,IAAI,CAAC,gBAAgB,GAAG,MAAM,CAAC,IAAI,CAAC;gBACpC,MAAM,MAAM,GAAG,IAAI,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;gBAC7C,MAAM,UAAU,GAAG,MAAM,CAAC,aAAa,EAAE,CAAC;gBAC1C,IAAI,UAAU,KAAK,IAAI,EAAE,CAAC;oBACxB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC;oBACxC,IAAI,CAAC,WAAW,CAAC,mBAAmB,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;gBAC3D,CAAC;gBACD,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,GAAG,CAAC
,KAAK,CAAC,4BAA4B,EAAE,EAAE,MAAM,EAAE,CAAC,CAAC;QACtD,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,KAAK,CAAC,cAAc,CAC1B,KAAiB,EACjB,UAAkB,EAClB,QAAgB,EAChB,QAAgB,EAChB,QAAuB,EACvB,YAAiC;QAEjC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;QAClC,MAAM,KAAK,GAAsB,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAe,EAAE,CAAC;QAEhC,sBAAsB;QACtB,MAAM,KAAK,GAA4B,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QACxD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAEvB,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YACnD,MAAM,IAAI,GAAG,QAAQ,KAAK,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,EAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,EAAG,CAAC;YAChE,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,IAAI,CAAC;YAE1B,mBAAmB;YACnB,IAAI,YAAY,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACnE,GAAG,CAAC,KAAK,CAAC,uBAAuB,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;gBAC5C,SAAS;YACX,CAAC;YAED,aAAa;YACb,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAEpD,IAAI,WAAwB,CAAC;YAC7B,IAAI,CAAC;gBACH,WAAW,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YACxC,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,GAAG,CAAC,IAAI,CAAC,2BAA2B,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBACnE,OAAO,EAAE,CAAC;gBACV,SAAS;YACX,CAAC;YAED,OAAO,EAAE,CAAC;YAEV,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;gBACtB,GAAG,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC;gBACpE,SAAS;YACX,CAAC;YAED,KAAK,CAAC,IAAI,CAAC;gBACT,GAAG,EAAE,WAAW,CAAC,GAAG;gBACpB,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ;gBAC9B,KAAK;aACN,CAAC,CAAC;YAEH,+BAA+B;YAC/B,IAAI,KAAK,GAAG,QAAQ,EAAE,CAAC;gBACrB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC,KAAK,EAAE,UAAU,EAAE,OAAO,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,EAAE,YAAY,CAAC,CAAC;gBAExI,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;oBAC5B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC;gBAChC,CAAC;gBAED,IAAI,KAAK,CAAC,aAAa,EAAE,CAAC;oBACxB,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;wBACrC,QAAQ,CAAC,IAAI,CAAC,EAAE,
IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;oBACzC,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,0FAA0F;QAC1F,OAAO;YACL,KAAK;YACL,WAAW,EAAE,OAAO,CAAC,IAAI;YACzB,OAAO,EAAE,KAAK,CAAC,MAAM;YACrB,GAAG,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACpD,CAAC;IACJ,CAAC;IAEO,WAAW,CACjB,KAAe,EACf,UAAkB,EAClB,OAAoB,EACpB,eAAqC,EACrC,eAAqC,EACrC,YAAiC;QAEjC,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE;YACrC,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC7B,IAAI,MAAM,CAAC,MAAM,KAAK,UAAU;oBAAE,OAAO,KAAK,CAAC;gBAC/C,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC;oBAAE,OAAO,KAAK,CAAC;gBACpC,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,eAAe,EAAE,eAAe,CAAC;oBAAE,OAAO,KAAK,CAAC;gBAC3E,IAAI,YAAY,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC;oBAAE,OAAO,KAAK,CAAC;gBAC3E,OAAO,IAAI,CAAC;YACd,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,0DAA0D;QAC1D,OAAO,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAC5B,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAClC,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAClC,OAAO,IAAI,GAAG,IAAI,CAAC;QACrB,CAAC,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,YAAY,CAChB,KAAiB,EACjB,UAAkB,EAClB,QAAgB,EAChB,YAAiC;QAEjC,kEAAkE;QAClE,IAAI,WAAW,GAAG,MAAM,IAAI,CAAC,mBAAmB,CAAC,UAAU,EAAE,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAEpF,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,GAAG,CAAC,IAAI,CAAC,uCAAuC,CAAC,CAAC;YAClD,OAAO,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,UAAU,EAAE,KAAK,CAAC,SAAS,IAAI,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,YAAY,CAAC,CAAC;QACrG,CAAC;QAED,qBAAqB;QACrB,WAAW,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CACvC,eAAe,CAAC,GAAG,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,CAAC,CACrE,CAAC;QAEF,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,CAAC;QACtC,MAAM,OAAO,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAC/C,MAAM,KAAK,GAAsB,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAe,EAAE,CAAC;QAEhC,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;YAC1B,IAAI,KAAK,CAAC,MAAM,IAAI,QAAQ;gBAAE,MAAM;YAEpC,IA
AI,YAAY,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;gBAAE,SAAS;YAE7E,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAEpD,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;gBACvC,OAAO,EAAE,CAAC;gBAEV,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;oBAE1F,IAAI,KAAK,CAAC,aAAa,EAAE,CAAC;wBACxB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;4BAChC,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;wBACzC,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,EAAE,CAAC;gBACV,GAAG,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAChE,CAAC;QACH,CAAC;QAED,OAAO;YACL,KAAK;YACL,WAAW,EAAE,UAAU;YACvB,OAAO,EAAE,KAAK,CAAC,MAAM;YACrB,GAAG,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACpD,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,mBAAmB,CAAC,MAAc,EAAE,SAAoC;QACpF,MAAM,gBAAgB,GAAa,EAAE,CAAC;QAEtC,2EAA2E;QAC3E,IAAI,SAAS,EAAE,CAAC;YACd,gBAAgB,CAAC,IAAI,CAAC,GAAG,2BAA2B,CAAC,SAAS,CAAC,CAAC,CAAC;QACnE,CAAC;QAED,uBAAuB;QACvB,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAClC,gBAAgB,CAAC,IAAI,CAAC,GAAG,MAAM,cAAc,CAAC,CAAC;QACjD,CAAC;QAED,MAAM,OAAO,GAAa,EAAE,CAAC;QAE7B,KAAK,MAAM,UAAU,IAAI,gBAAgB,EAAE,CAAC;YAC1C,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC;gBACjD,IAAI,MAAM,CAAC,UAAU,KAAK,GAAG;oBAAE,SAAS;gBAExC,gCAAgC;gBAChC,MAAM,SAAS,GAAG,iBAAiB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;gBACjD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACzB,yBAAyB;oBACzB,KAAK,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;wBAC/B,IAAI,CAAC;4BACH,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;4BAChD,IAAI,SAAS,CAAC,UAAU,KAAK,GAAG,EAAE,CAAC;gCACjC,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;4BAChD,CAAC;wBACH,CAAC;wBAAC,MAAM,CAAC;4BACP,2BAA2B;wBAC7B,CAAC;oBACH,CAAC;gBACH,CAAC;qBAAM,CAA
C;oBACN,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC7C,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,8BAA8B;YAChC,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;CACF;AAED,MAAM,iBAAiB,GAAG,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC;AAExE,SAAS,SAAS,CAAC,GAAW;IAC5B,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;IACjD,OAAO,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;AACvD,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
 * Splits markdown into trimmed, non-empty text blocks: heading-delimited
 * sections when the document contains `#` headings, otherwise
 * blank-line-separated paragraphs. Blank input yields an empty array.
 */
export declare function splitIntoBlocks(markdown: string): string[];
|
|
2
|
+
/** Lower-cases `text`, collapses every run of whitespace to one space, and trims the ends. */
export declare function normalizeBlockText(text: string): string;
|
|
3
|
+
/** A crawled page handed to `deduplicatePages`: its URL and its markdown content. */
interface PageInput {
|
|
4
|
+
url: string;
|
|
5
|
+
markdown: string;
|
|
6
|
+
}
|
|
7
|
+
/** A page returned by `deduplicatePages`: the same URL with its filtered markdown. */
interface PageOutput {
|
|
7
|
+
url: string;
|
|
8
|
+
markdown: string;
|
|
9
|
+
}
|
|
11
|
+
/**
 * Removes boilerplate blocks from a set of pages. A block is boilerplate when
 * its normalized hash occurs on more than half of `pages`, or was previously
 * stored for `domain`. With zero or one page the markdown is returned unchanged.
 * When `domain` is given, its stored hashes are read and then rewritten.
 */
export declare function deduplicatePages(pages: PageInput[], domain?: string): PageOutput[];
|
|
12
|
+
/** Returns the block hashes stored in the `domain_boilerplate` table for `domain`. */
export declare function getStoredBoilerplate(domain: string): string[];
|
|
13
|
+
/** Replaces the block hashes stored in the `domain_boilerplate` table for `domain` with `hashes`. */
export declare function storeBoilerplate(domain: string, hashes: string[]): void;
|
|
14
|
+
export {};
|
|
15
|
+
//# sourceMappingURL=dedup.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup.d.ts","sourceRoot":"","sources":["../../src/crawl/dedup.ts"],"names":[],"mappings":"AAGA,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,EAAE,CA2B1D;AAED,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEvD;AAMD,UAAU,SAAS;IACjB,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,UAAU;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,UAAU,EAAE,CA+ClF;AAED,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAI7D;AAED,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,IAAI,CAavE"}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import { getDatabase } from '../cache/db.js';
|
|
3
|
+
/**
 * Split a markdown document into trimmed, non-empty text blocks.
 *
 * - Blank input yields an empty array.
 * - Without headings, the text is split into paragraphs on blank lines.
 * - With headings, each heading line (`#` to `######` followed by whitespace)
 *   starts a block that runs up to the next heading of any level. Text before
 *   the first heading is kept as a block of its own rather than discarded.
 *
 * Lines inside fenced code blocks (``` or ~~~) are never treated as headings,
 * so a `# comment` in a code sample does not split the sample.
 *
 * @param markdown markdown source text
 * @returns the blocks, in document order
 */
export function splitIntoBlocks(markdown) {
    if (!markdown.trim())
        return [];

    const lines = markdown.split('\n');
    const headingStarts = [];
    // Marker of the currently open code fence, or '' when outside a fence.
    let openFence = '';
    lines.forEach((line, idx) => {
        const fence = /^ {0,3}(`{3,}|~{3,})/.exec(line);
        if (fence) {
            const marker = fence[1];
            if (openFence === '') {
                openFence = marker;
            }
            else if (marker[0] === openFence[0] && marker.length >= openFence.length) {
                // A fence closes only on the same character, at least as long.
                openFence = '';
            }
            return;
        }
        if (openFence === '' && /^#{1,6}\s+/.test(line)) {
            headingStarts.push(idx);
        }
    });

    // If no headings, split by double-newline (paragraph blocks).
    if (headingStarts.length === 0) {
        return markdown.split(/\n\n+/).map((b) => b.trim()).filter(Boolean);
    }

    // Non-overlapping split. The leading 0 makes the text before the first
    // heading a block too; it is empty (and dropped) when line 0 is a heading.
    const boundaries = [0, ...headingStarts, lines.length];
    const blocks = [];
    for (let i = 0; i + 1 < boundaries.length; i++) {
        const text = lines.slice(boundaries[i], boundaries[i + 1]).join('\n').trim();
        if (text) {
            blocks.push(text);
        }
    }
    return blocks;
}
|
|
27
|
+
/**
 * Canonical form of a block used for comparison: lower-cased, with every run
 * of whitespace collapsed to a single space and no leading/trailing space.
 */
export function normalizeBlockText(text) {
    const words = text.toLowerCase().split(/\s+/).filter(Boolean);
    return words.join(' ');
}
|
|
30
|
+
/** Hex-encoded SHA-256 digest of the normalized form of `text`. */
function hashBlock(text) {
    const hasher = createHash('sha256');
    hasher.update(normalizeBlockText(text));
    return hasher.digest('hex');
}
|
|
33
|
+
/**
 * Strip boilerplate blocks (navigation, footers, ...) shared across pages.
 *
 * Each page's markdown is split into blocks and every block is hashed. A hash
 * is boilerplate when it occurs on more than half of the pages, or when it is
 * already stored for `domain`. Boilerplate blocks are removed and the
 * remaining blocks are re-joined with a blank line between them.
 *
 * With zero or one page nothing can be compared, so the pages are returned
 * with their markdown untouched and no stored data is read or written.
 *
 * @param pages  pages to clean; only `url` and `markdown` are read
 * @param domain when given, stored hashes for it are loaded first and the
 *               combined set is written back
 * @returns one `{ url, markdown }` per input page, in input order
 */
export function deduplicatePages(pages, domain) {
    if (pages.length <= 1) {
        return pages.map(({ url, markdown }) => ({ url, markdown }));
    }

    // Start from the hashes already known for this domain, if any.
    const boilerplateHashes = new Set(domain ? getStoredBoilerplate(domain) : []);

    // Split each page and hash every block once; the hash is reused below for
    // both counting and filtering.
    const hashedPages = pages.map((page) => ({
        url: page.url,
        blocks: splitIntoBlocks(page.markdown).map((text) => ({ text, hash: hashBlock(text) })),
    }));

    // Number of pages each hash occurs on (a hash counts once per page).
    const pagesPerHash = new Map();
    for (const { blocks } of hashedPages) {
        for (const hash of new Set(blocks.map((block) => block.hash))) {
            pagesPerHash.set(hash, (pagesPerHash.get(hash) ?? 0) + 1);
        }
    }

    // Anything present on more than 50% of the pages is boilerplate.
    const threshold = pages.length / 2;
    pagesPerHash.forEach((count, hash) => {
        if (count > threshold) {
            boilerplateHashes.add(hash);
        }
    });

    if (domain) {
        storeBoilerplate(domain, Array.from(boilerplateHashes));
    }

    return hashedPages.map(({ url, blocks }) => {
        const kept = [];
        for (const block of blocks) {
            if (!boilerplateHashes.has(block.hash)) {
                kept.push(block.text);
            }
        }
        return { url, markdown: kept.join('\n\n') };
    });
}
|
|
76
|
+
/**
 * Loads the boilerplate block hashes recorded for a domain.
 *
 * @param domain Value matched against the `domain` column of `domain_boilerplate`.
 * @returns The `block_hash` of every matching row (empty when none match).
 */
export function getStoredBoilerplate(domain) {
    const selectHashes = getDatabase().prepare('SELECT block_hash FROM domain_boilerplate WHERE domain = ?');
    const hashes = [];
    for (const row of selectHashes.all(domain)) {
        hashes.push(row.block_hash);
    }
    return hashes;
}
|
|
81
|
+
/**
 * Replaces the stored boilerplate hashes of a domain with `hashes`.
 *
 * Existing rows for the domain are deleted, then one row per hash is inserted with a
 * null `sample_text`. Both steps run inside a single `db.transaction` callback
 * (presumably applied together — confirm against the database driver in use).
 *
 * @param domain Domain whose rows are replaced.
 * @param hashes Block hashes to store; duplicates are skipped by `INSERT OR IGNORE`.
 */
export function storeBoilerplate(domain, hashes) {
    const database = getDatabase();
    const clearDomain = database.prepare('DELETE FROM domain_boilerplate WHERE domain = ?');
    const addHash = database.prepare('INSERT OR IGNORE INTO domain_boilerplate (domain, block_hash, sample_text) VALUES (?, ?, ?)');
    const replaceAll = database.transaction((items) => {
        clearDomain.run(domain);
        for (const blockHash of items) {
            addHash.run(domain, blockHash, null);
        }
    });
    replaceAll(hashes);
}
|
|
93
|
+
//# sourceMappingURL=dedup.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dedup.js","sourceRoot":"","sources":["../../src/crawl/dedup.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAE7C,MAAM,UAAU,eAAe,CAAC,QAAgB;IAC9C,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAEhC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,cAAc,GAAyC,EAAE,CAAC;IAEhE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QAC7C,IAAI,KAAK,EAAE,CAAC;YACV,cAAc,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;QAC9D,CAAC;IACH,CAAC;IAED,6DAA6D;IAC7D,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAChC,OAAO,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IACtE,CAAC;IAED,kGAAkG;IAClG,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/C,MAAM,KAAK,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;QACxC,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;QACzF,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;IACzD,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;AAChC,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,IAAY;IAC7C,OAAO,IAAI,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AACxD,CAAC;AAED,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC7E,CAAC;AAYD,MAAM,UAAU,gBAAgB,CAAC,KAAkB,EAAE,MAAe;IAClE,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;IAEvF,qDAAqD;IACrD,MAAM,YAAY,GAAG,MAAM,CAAC,CAAC,CAAC,oBAAoB,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAChE,MAAM,iBAAi
B,GAAG,IAAI,GAAG,CAAS,YAAY,CAAC,CAAC;IAExD,4CAA4C;IAC5C,MAAM,UAAU,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACtC,GAAG,EAAE,IAAI,CAAC,GAAG;QACb,MAAM,EAAE,eAAe,CAAC,IAAI,CAAC,QAAQ,CAAC;KACvC,CAAC,CAAC,CAAC;IAEJ,kDAAkD;IAClD,MAAM,aAAa,GAAG,IAAI,GAAG,EAAkB,CAAC;IAChD,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAC;QACrC,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,CAAC,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC;YAC3B,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;gBACvB,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;gBAClB,aAAa,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;IACH,CAAC;IAED,wDAAwD;IACxD,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;IACnC,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,aAAa,EAAE,CAAC;QAC1C,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;YACtB,iBAAiB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,mDAAmD;IACnD,IAAI,MAAM,EAAE,CAAC;QACX,gBAAgB,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC,CAAC;IAC1D,CAAC;IAED,0CAA0C;IAC1C,OAAO,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,iBAAiB,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACzF,OAAO;YACL,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,QAAQ,EAAE,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC;SAChC,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC;AAED,MAAM,UAAU,oBAAoB,CAAC,MAAc;IACjD,MAAM,EAAE,GAAG,WAAW,EAAE,CAAC;IACzB,MAAM,IAAI,GAAG,EAAE,CAAC,OAAO,CAAC,4DAA4D,CAAC,CAAC,GAAG,CAAC,MAAM,CAA6B,CAAC;IAC9H,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;AACrC,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,MAAc,EAAE,MAAgB;IAC/D,MAAM,EAAE,GAAG,WAAW,EAAE,CAAC;IACzB,MAAM,GAAG,GAAG,EAAE,CAAC,OAAO,CAAC,iDAAiD,CAAC,CAAC;IAC1E,MAAM,MAAM,GAAG,EAAE,CAAC,OAAO,CACvB,6FAA6F,CAC9F,CAAC;IACF,MAAM,EAAE,GAAG,EAAE,CAAC,WAAW,CAAC,CAAC,KAAe,EAAE,EAAE;QAC5C,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAChB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,CAAC,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC;QACj
C,CAAC;IACH,CAAC,CAAC,CAAC;IACH,EAAE,CAAC,MAAM,CAAC,CAAC;AACb,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { MapOutput } from '../types.js';
|
|
2
|
+
/**
 * Options accepted by mapUrls.
 *
 * - `url`: seed URL; its origin bounds link traversal.
 * - `max_depth`: maximum link depth to traverse (mapUrls defaults it to 3).
 * - `max_pages`: cap on the number of discovered URLs (mapUrls defaults it to 200).
 * - `include_patterns` / `exclude_patterns`: passed to matchesPatterns to filter
 *   discovered URLs.
 */
interface MapInput {
|
|
3
|
+
url: string;
|
|
4
|
+
max_depth?: number;
|
|
5
|
+
max_pages?: number;
|
|
6
|
+
include_patterns?: string[];
|
|
7
|
+
exclude_patterns?: string[];
|
|
8
|
+
}
|
|
9
|
+
/**
 * Fetcher injected into mapUrls. It is called for pages as well as for
 * `/robots.txt` and sitemap documents, so `html` carries the raw response body
 * regardless of content type.
 * NOTE(review): `finalUrl` is presumably the URL after redirects and `statusCode`
 * the HTTP status — confirm against the fetch implementation.
 */
export type LightFetchFn = (url: string) => Promise<{
|
|
10
|
+
html: string;
|
|
11
|
+
finalUrl: string;
|
|
12
|
+
statusCode: number;
|
|
13
|
+
}>;
|
|
14
|
+
/**
 * Collects the distinct same-origin links found in `<a href>` elements.
 *
 * @param html Markup to scan; empty or whitespace-only input yields `[]`.
 * @param origin URL used both as the base for resolving relative hrefs and as the
 *   origin every returned link must share.
 * @returns Absolute URLs with the fragment removed, in first-seen order; `[]` if parsing fails.
 */
export declare function extractLinks(html: string, origin: string): string[];
|
|
15
|
+
/**
 * Discovers URLs of a site by combining sitemap lookup with breadth-first link traversal.
 *
 * @param input Seed URL plus optional depth/page limits and include/exclude patterns.
 * @param fetchFn Fetcher used for robots.txt, sitemaps and pages.
 * @returns The discovered URLs, their count, whether a sitemap yielded URLs, and an
 *   `error` string when the seed URL is invalid or its fetch failed.
 */
export declare function mapUrls(input: MapInput, fetchFn: LightFetchFn): Promise<MapOutput>;
|
|
16
|
+
export {};
|
|
17
|
+
//# sourceMappingURL=mapper.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mapper.d.ts","sourceRoot":"","sources":["../../src/crawl/mapper.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAI7C,UAAU,QAAQ;IAChB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC5B,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;CAC7B;AAED,MAAM,MAAM,YAAY,GAAG,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,UAAU,EAAE,MAAM,CAAA;CAAE,CAAC,CAAC;AAI5G,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CA+CnE;AAED,wBAAsB,OAAO,CAAC,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,GAAG,OAAO,CAAC,SAAS,CAAC,CAqFxF"}
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import { matchesPatterns } from './url-utils.js';
|
|
3
|
+
import { parseSitemap, parseSitemapIndex, extractSitemapUrlFromRobots } from './sitemap.js';
|
|
4
|
+
import { createLogger } from '../logger.js';
|
|
5
|
+
// Module-level logger created with the 'crawl' label.
const log = createLogger('crawl');
|
|
6
|
+
// Scheme prefixes that extractLinks skips before attempting URL resolution
// (compared against the lower-cased href).
const IGNORED_PROTOCOLS = ['javascript:', 'mailto:', 'tel:', 'data:', 'blob:', 'ftp:'];
|
|
7
|
+
/**
 * Collects the distinct same-origin links found in `<a href>` elements.
 *
 * @param html Markup to scan; empty or whitespace-only input yields `[]`.
 * @param origin URL used both as the base for resolving relative hrefs and as the
 *   origin every returned link must share.
 * @returns Absolute URLs with the fragment removed, in first-seen order; `[]` when
 *   parsing the markup (or `origin`) throws.
 */
export function extractLinks(html, origin) {
    if (!(html && html.trim()))
        return [];
    try {
        const { document: dom } = parseHTML(html);
        const anchorNodes = dom.querySelectorAll('a[href]');
        // A Set both de-duplicates and remembers first-seen order.
        const unique = new Set();
        const allowedOrigin = new URL(origin).origin;
        for (const node of Array.from(anchorNodes)) {
            const rawHref = node.getAttribute('href');
            const candidate = rawHref ? rawHref.trim() : '';
            // Skip empty hrefs and fragment-only links
            if (candidate === '' || candidate.startsWith('#'))
                continue;
            // Skip non-http protocols
            const isIgnoredScheme = IGNORED_PROTOCOLS.some((scheme) => candidate.toLowerCase().startsWith(scheme));
            if (isIgnoredScheme)
                continue;
            try {
                const target = new URL(candidate, origin);
                // Keep same-origin targets only; drop the fragment, keep path + query
                if (target.origin === allowedOrigin) {
                    target.hash = '';
                    unique.add(target.href);
                }
            }
            catch {
                log.debug('Failed to resolve URL', { href: candidate, origin });
            }
        }
        return Array.from(unique);
    }
    catch (err) {
        log.debug('Failed to parse HTML for link extraction', { error: String(err) });
        return [];
    }
}
|
|
53
|
+
/**
 * Returns the first candidate that is a parseable URL sharing `origin`;
 * falls back to `origin` itself when none qualifies.
 */
function pickSameOriginBase(candidates, origin) {
    for (const candidate of candidates) {
        if (typeof candidate !== 'string' || candidate === '')
            continue;
        try {
            if (new URL(candidate).origin === origin)
                return candidate;
        }
        catch {
            // Unparseable candidate: try the next one.
        }
    }
    return origin;
}
/**
 * Discovers URLs of a site: sitemap lookup first, then breadth-first link traversal.
 *
 * @param input `{ url, max_depth?, max_pages?, include_patterns?, exclude_patterns? }`;
 *   `max_depth` defaults to 3 and `max_pages` to 200.
 * @param fetchFn Async fetcher resolving to `{ html, finalUrl, statusCode }`.
 * @returns `{ urls, total_found, sitemap_found }`, plus `error` when the seed URL is
 *   invalid or fetching the seed failed. Failures on other pages are logged and skipped.
 */
export async function mapUrls(input, fetchFn) {
    const maxDepth = input.max_depth ?? 3;
    const maxPages = input.max_pages ?? 200;
    let origin;
    try {
        origin = new URL(input.url).origin;
    }
    catch (err) {
        return {
            urls: [],
            total_found: 0,
            sitemap_found: false,
            error: `Invalid seed URL: ${String(err)}`,
        };
    }
    // `discovered` is the result set; `queued` prevents enqueueing a URL twice.
    const discovered = new Set([input.url]);
    const queued = new Set([input.url]);
    const queue = [{ url: input.url, depth: 0 }];
    let sitemapFound = false;
    // Phase 1: Sitemap discovery (best-effort, errors are non-fatal)
    try {
        const sitemapUrls = await discoverSitemapUrls(origin, fetchFn);
        if (sitemapUrls.length > 0) {
            sitemapFound = true;
            for (const sitemapUrl of sitemapUrls) {
                if (discovered.size >= maxPages)
                    break;
                if (matchesPatterns(sitemapUrl, input.include_patterns, input.exclude_patterns)) {
                    discovered.add(sitemapUrl);
                    // Also queue sitemap URLs for BFS traversal so their links are explored
                    if (!queued.has(sitemapUrl)) {
                        queued.add(sitemapUrl);
                        queue.push({ url: sitemapUrl, depth: 0 });
                    }
                }
            }
        }
    }
    catch (err) {
        log.debug('Sitemap discovery failed, continuing with BFS only', { error: String(err) });
    }
    // Phase 2: BFS link traversal
    let seedError;
    while (queue.length > 0 && discovered.size < maxPages) {
        const current = queue.shift();
        if (current.depth > maxDepth)
            continue;
        try {
            const { html, finalUrl } = await fetchFn(current.url);
            // Relative hrefs must resolve against the page they appear on, not the
            // site root. Prefer the post-redirect URL, then the requested URL; either
            // is used only when same-origin with the seed, so extractLinks' own
            // same-origin filter still scopes results to the seed's origin.
            const baseUrl = pickSameOriginBase([finalUrl, current.url], origin);
            const links = extractLinks(html, baseUrl);
            for (const link of links) {
                if (discovered.size >= maxPages)
                    break;
                if (discovered.has(link))
                    continue;
                if (!matchesPatterns(link, input.include_patterns, input.exclude_patterns))
                    continue;
                discovered.add(link);
                // Only queue for further traversal if we haven't hit max depth
                if (current.depth + 1 <= maxDepth && !queued.has(link)) {
                    queued.add(link);
                    queue.push({ url: link, depth: current.depth + 1 });
                }
            }
        }
        catch (err) {
            // Only a failure on the seed itself is surfaced to the caller.
            if (current.url === input.url && current.depth === 0) {
                seedError = String(err);
                log.warn('Seed URL fetch failed during map', { url: current.url, error: String(err) });
            }
            else {
                log.debug('Child page fetch failed during map, skipping', { url: current.url, error: String(err) });
            }
        }
    }
    const urls = Array.from(discovered);
    return {
        urls,
        total_found: urls.length,
        sitemap_found: sitemapFound,
        ...(seedError !== undefined ? { error: seedError } : {}),
    };
}
|
|
136
|
+
/**
 * Gathers page URLs advertised by the site's sitemaps (best-effort).
 *
 * Sitemap locations come from `Sitemap` directives in `/robots.txt`; when none are
 * found, `/sitemap.xml` is tried. A sitemap index is expanded one level deep. Every
 * fetch failure is logged at debug level and skipped.
 *
 * @param origin Site origin the well-known paths are appended to.
 * @param fetchFn Async fetcher resolving to an object whose `html` holds the body.
 * @returns All URLs parsed from the reachable sitemaps (may be empty).
 */
async function discoverSitemapUrls(origin, fetchFn) {
    const locations = [];
    // Try robots.txt first for Sitemap directives
    try {
        const robotsResponse = await fetchFn(`${origin}/robots.txt`);
        for (const declared of extractSitemapUrlFromRobots(robotsResponse.html)) {
            locations.push(declared);
        }
    }
    catch {
        log.debug('robots.txt not found, trying default sitemap location');
    }
    // Fallback to default /sitemap.xml
    if (!locations.length) {
        locations.push(`${origin}/sitemap.xml`);
    }
    const collected = [];
    for (const location of locations) {
        try {
            const { html: xml } = await fetchFn(location);
            // A sitemap index lists sub-sitemaps instead of pages
            const children = parseSitemapIndex(xml);
            const isIndex = children.length > 0;
            if (!isIndex) {
                collected.push(...parseSitemap(xml));
                continue;
            }
            for (const child of children) {
                try {
                    const { html: childXml } = await fetchFn(child);
                    collected.push(...parseSitemap(childXml));
                }
                catch {
                    log.debug('Failed to fetch sub-sitemap', { url: child });
                }
            }
        }
        catch {
            log.debug('Sitemap fetch failed', { url: location });
        }
    }
    return collected;
}
|
|
178
|
+
//# sourceMappingURL=mapper.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mapper.js","sourceRoot":"","sources":["../../src/crawl/mapper.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,YAAY,EAAE,iBAAiB,EAAE,2BAA2B,EAAE,MAAM,cAAc,CAAC;AAC5F,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAG5C,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;AAYlC,MAAM,iBAAiB,GAAG,CAAC,aAAa,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC;AAEvF,MAAM,UAAU,YAAY,CAAC,IAAY,EAAE,MAAc;IACvD,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAErC,IAAI,CAAC;QACH,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAC1C,MAAM,OAAO,GAAG,GAAG,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;QAChD,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;QAC/B,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;QAE5C,KAAK,MAAM,MAAM,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;YACzC,MAAM,IAAI,GAAG,MAAM,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YACzC,IAAI,CAAC,IAAI;gBAAE,SAAS;YAEpB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;YAC5B,IAAI,CAAC,OAAO;gBAAE,SAAS;YAEvB,2BAA2B;YAC3B,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAEtC,0BAA0B;YAC1B,IAAI,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;gBAAE,SAAS;YAEjF,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;gBAE1C,oBAAoB;gBACpB,IAAI,QAAQ,CAAC,MAAM,KAAK,YAAY;oBAAE,SAAS;gBAE/C,oCAAoC;gBACpC,QAAQ,CAAC,IAAI,GAAG,EAAE,CAAC;gBACnB,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,CAAC;gBAEjC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;oBAC1B,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;oBACrB,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBACzB,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,GAAG,CAAC,KAAK,CAAC,uBAAuB,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;YAChE,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,KAAK,CAAC,0CAA0C,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAC9E,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,KAAe,EAAE,OAAqB;
IAClE,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,CAAC,CAAC;IACtC,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,GAAG,CAAC;IAExC,IAAI,MAAc,CAAC;IACnB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC;IACrC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO;YACL,IAAI,EAAE,EAAE;YACR,WAAW,EAAE,CAAC;YACd,aAAa,EAAE,KAAK;YACpB,KAAK,EAAE,qBAAqB,MAAM,CAAC,GAAG,CAAC,EAAE;SAC1C,CAAC;IACJ,CAAC;IAED,MAAM,UAAU,GAAG,IAAI,GAAG,CAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;IAChD,MAAM,MAAM,GAAG,IAAI,GAAG,CAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5C,MAAM,KAAK,GAA0C,CAAC,EAAE,GAAG,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;IACpF,IAAI,YAAY,GAAG,KAAK,CAAC;IAEzB,iEAAiE;IACjE,IAAI,CAAC;QACH,MAAM,WAAW,GAAG,MAAM,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC/D,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3B,YAAY,GAAG,IAAI,CAAC;YACpB,KAAK,MAAM,UAAU,IAAI,WAAW,EAAE,CAAC;gBACrC,IAAI,UAAU,CAAC,IAAI,IAAI,QAAQ;oBAAE,MAAM;gBACvC,IAAI,eAAe,CAAC,UAAU,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,CAAC,EAAE,CAAC;oBAChF,UAAU,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;oBAC3B,wEAAwE;oBACxE,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;wBAC5B,MAAM,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;wBACvB,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,UAAU,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;oBAC5C,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,GAAG,CAAC,KAAK,CAAC,oDAAoD,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAC1F,CAAC;IAED,8BAA8B;IAC9B,IAAI,SAA6B,CAAC;IAElC,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,CAAC,IAAI,GAAG,QAAQ,EAAE,CAAC;QACtD,MAAM,OAAO,GAAG,KAAK,CAAC,KAAK,EAAG,CAAC;QAE/B,IAAI,OAAO,CAAC,KAAK,GAAG,QAAQ;YAAE,SAAS;QAEvC,IAAI,CAAC;YACH,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAC5C,MAAM,KAAK,GAAG,YAAY,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAEzC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACzB,IAAI,UAAU,CAAC,IAAI,IAAI,QAAQ;oBAAE,MAAM;gBACvC,IAAI,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC;oBAAE,SAAS;gBACnC,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,CAAC;oBAAE,SAAS;gBAErF,UAAU,CAAC,GAAG,CAAC,I
AAI,CAAC,CAAC;gBAErB,+DAA+D;gBAC/D,IAAI,OAAO,CAAC,KAAK,GAAG,CAAC,IAAI,QAAQ,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBACvD,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;oBACjB,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;gBACtD,CAAC;YACH,CAAC;QACH,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,IAAI,OAAO,CAAC,GAAG,KAAK,KAAK,CAAC,GAAG,IAAI,OAAO,CAAC,KAAK,KAAK,CAAC,EAAE,CAAC;gBACrD,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC;gBACxB,GAAG,CAAC,IAAI,CAAC,kCAAkC,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACzF,CAAC;iBAAM,CAAC;gBACN,GAAG,CAAC,KAAK,CAAC,8CAA8C,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACtG,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAEpC,OAAO;QACL,IAAI;QACJ,WAAW,EAAE,IAAI,CAAC,MAAM;QACxB,aAAa,EAAE,YAAY;QAC3B,GAAG,CAAC,SAAS,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KACzD,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,mBAAmB,CAAC,MAAc,EAAE,OAAqB;IACtE,MAAM,gBAAgB,GAAa,EAAE,CAAC;IAEtC,8CAA8C;IAC9C,IAAI,CAAC;QACH,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,MAAM,OAAO,CAAC,GAAG,MAAM,aAAa,CAAC,CAAC;QAClE,MAAM,UAAU,GAAG,2BAA2B,CAAC,SAAS,CAAC,CAAC;QAC1D,gBAAgB,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;IACvC,CAAC;IAAC,MAAM,CAAC;QACP,GAAG,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;IACrE,CAAC;IAED,mCAAmC;IACnC,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClC,gBAAgB,CAAC,IAAI,CAAC,GAAG,MAAM,cAAc,CAAC,CAAC;IACjD,CAAC;IAED,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,KAAK,MAAM,UAAU,IAAI,gBAAgB,EAAE,CAAC;QAC1C,IAAI,CAAC;YACH,MAAM,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;YAEvD,wDAAwD;YACxD,MAAM,SAAS,GAAG,iBAAiB,CAAC,UAAU,CAAC,CAAC;YAChD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzB,KAAK,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;oBAC/B,IAAI,CAAC;wBACH,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,MAAM,OAAO,CAAC,MAAM,CAAC,CAAC;wBAC/C,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC;oBACxC,CAAC;oBAAC,MAAM,CAAC;wBACP,GAAG,CAAC,KAAK,CAAC,6BAA6B,EAAE,EAAE,
GAAG,EAAE,MAAM,EAAE,CAAC,CAAC;oBAC5D,CAAC;gBACH,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC;YAC5C,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,GAAG,CAAC,KAAK,CAAC,sBAAsB,EAAE,EAAE,GAAG,EAAE,UAAU,EAAE,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Rate limiter used by the crawler.
 *
 * NOTE(review): only the declaration is visible here; the notes below are inferred
 * from names and signatures — confirm against src/crawl/rate-limiter.ts.
 * - `setRobotsCrawlDelay(domain, delaySeconds)`: presumably records a robots.txt
 *   Crawl-delay (in seconds) for a domain.
 * - `acquire(url)`: resolves to a `() => void` callback, presumably to be invoked
 *   when the request finishes so a slot is released.
 */
export declare class RateLimiter {
|
|
2
|
+
private domains;
|
|
3
|
+
private robotsDelays;
|
|
4
|
+
setRobotsCrawlDelay(domain: string, delaySeconds: number): void;
|
|
5
|
+
acquire(url: string): Promise<() => void>;
|
|
6
|
+
private getOrCreateState;
|
|
7
|
+
private startRequest;
|
|
8
|
+
private processQueue;
|
|
9
|
+
}
|
|
10
|
+
//# sourceMappingURL=rate-limiter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rate-limiter.d.ts","sourceRoot":"","sources":["../../src/crawl/rate-limiter.ts"],"names":[],"mappings":"AAWA,qBAAa,WAAW;IACtB,OAAO,CAAC,OAAO,CAAkC;IACjD,OAAO,CAAC,YAAY,CAA6B;IAEjD,mBAAmB,CAAC,MAAM,EAAE,MAAM,EAAE,YAAY,EAAE,MAAM,GAAG,IAAI;IAIzD,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,IAAI,CAAC;IAoB/C,OAAO,CAAC,gBAAgB;IA6BxB,OAAO,CAAC,YAAY;IAUpB,OAAO,CAAC,YAAY;CAarB"}
|