recker 1.0.93 → 1.0.94-next.132e096
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/index.d.ts +3 -1
- package/dist/browser/scrape/index.js +2 -0
- package/dist/browser/scrape/spider.d.ts +35 -2
- package/dist/browser/scrape/spider.js +185 -53
- package/dist/browser/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/browser/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/browser/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/browser/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -0
- package/dist/browser/seo/seo-spider.js +17 -3
- package/dist/cli/commands/spider-runner.js +2 -1
- package/dist/cli/index.js +0 -3
- package/dist/cli/tui/jobs/spider-job.js +2 -1
- package/dist/mcp/prompts/index.js +15 -6
- package/dist/scrape/index.d.ts +3 -1
- package/dist/scrape/index.js +2 -0
- package/dist/scrape/spider.d.ts +35 -2
- package/dist/scrape/spider.js +185 -53
- package/dist/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/seo/seo-spider.d.ts +2 -0
- package/dist/seo/seo-spider.js +17 -3
- package/dist/version.js +1 -1
- package/package.json +7 -1
|
@@ -3,10 +3,12 @@ export type { Options as ParserOptions } from './parser/index.js';
|
|
|
3
3
|
export { ScrapeDocument } from './document.js';
|
|
4
4
|
export { ScrapeElement } from './element.js';
|
|
5
5
|
export { Spider, spider } from './spider.js';
|
|
6
|
-
export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
|
|
6
|
+
export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
|
|
7
7
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
8
|
+
export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
|
|
8
9
|
export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
9
10
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
11
|
+
export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
|
|
10
12
|
export type { CrawlStorageAdapter } from './crawl-storage.js';
|
|
11
13
|
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
12
14
|
export type { ProxyAdapter } from './proxy-adapter.js';
|
|
@@ -3,6 +3,8 @@ export { ScrapeDocument } from './document.js';
|
|
|
3
3
|
export { ScrapeElement } from './element.js';
|
|
4
4
|
export { Spider, spider } from './spider.js';
|
|
5
5
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
6
|
+
export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
|
|
6
7
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
8
|
+
export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
|
|
7
9
|
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
8
10
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { ScrapeDocument } from './document.js';
|
|
1
2
|
import type { ExtractedLink, ExtractionSchema } from './types.js';
|
|
2
3
|
import type { Options as ParserOptions } from './parser/index.js';
|
|
3
4
|
import { type SitemapUrl } from '../seo/validators/sitemap.js';
|
|
@@ -31,8 +32,7 @@ export interface SpiderOptions {
|
|
|
31
32
|
proxy?: string | string[] | ProxyAdapter;
|
|
32
33
|
transport?: SpiderTransport;
|
|
33
34
|
preferCurlFirst?: boolean;
|
|
34
|
-
onPage?: (
|
|
35
|
-
onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
|
|
35
|
+
onPage?: (event: SpiderPageEvent) => void | Promise<void>;
|
|
36
36
|
onCaptchaDetected?: (result: {
|
|
37
37
|
url: string;
|
|
38
38
|
status: number;
|
|
@@ -40,9 +40,31 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
+
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
+
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
+
onRetry?: (info: {
|
|
46
|
+
url: string;
|
|
47
|
+
attempt: number;
|
|
48
|
+
maxAttempts: number;
|
|
49
|
+
reason?: string;
|
|
50
|
+
delay: number;
|
|
51
|
+
transport: SpiderTransport;
|
|
52
|
+
previousStatus: number;
|
|
53
|
+
timings?: SpiderPageResult['timings'];
|
|
54
|
+
}) => void | Promise<void>;
|
|
55
|
+
onRedirect?: (info: {
|
|
56
|
+
from: string;
|
|
57
|
+
to: string;
|
|
58
|
+
status: number;
|
|
59
|
+
}) => void | Promise<void>;
|
|
43
60
|
onProgress?: (progress: SpiderProgress) => void;
|
|
44
61
|
extract?: string[] | ExtractionSchema;
|
|
45
62
|
parserOptions?: Partial<ParserOptions>;
|
|
63
|
+
domainRateLimit?: {
|
|
64
|
+
maxPerSecond?: number;
|
|
65
|
+
};
|
|
66
|
+
deduplicateContent?: boolean;
|
|
67
|
+
resume?: boolean;
|
|
46
68
|
crawlQueue?: CrawlQueueAdapter;
|
|
47
69
|
crawlStorage?: CrawlStorageAdapter;
|
|
48
70
|
}
|
|
@@ -102,6 +124,14 @@ export interface SpiderPageResult {
|
|
|
102
124
|
stylesheets: number;
|
|
103
125
|
};
|
|
104
126
|
extracted?: Record<string, unknown>;
|
|
127
|
+
contentHash?: string;
|
|
128
|
+
isDuplicate?: boolean;
|
|
129
|
+
duplicateOf?: string;
|
|
130
|
+
}
|
|
131
|
+
/**
 * Payload handed to `SpiderOptions.onPage` for each page the spider records.
 */
export interface SpiderPageEvent {
    /** The stored result for the page (successful, non-HTML, or error). */
    result: SpiderPageResult;

    /**
     * Raw response body, when one is available.
     * Omitted for error results and when the body is empty.
     */
    html?: string;

    /**
     * Accessor for a parsed `ScrapeDocument` of `html`.
     * Only supplied together with `html`; call it only if the parsed DOM is needed.
     */
    document?: () => Promise<ScrapeDocument>;
}
|
|
106
136
|
export interface SpiderProgress {
|
|
107
137
|
crawled: number;
|
|
@@ -163,6 +193,8 @@ export declare class Spider {
|
|
|
163
193
|
private running;
|
|
164
194
|
private aborted;
|
|
165
195
|
private pendingCount;
|
|
196
|
+
private domainRequestTimestamps;
|
|
197
|
+
private contentHashes;
|
|
166
198
|
private blockedDomains;
|
|
167
199
|
private curlTransport;
|
|
168
200
|
private curlAvailable;
|
|
@@ -172,6 +204,7 @@ export declare class Spider {
|
|
|
172
204
|
private robotsData;
|
|
173
205
|
private sitemapValidation;
|
|
174
206
|
private robotsValidation;
|
|
207
|
+
private waitForDomainRateLimit;
|
|
175
208
|
private toHeaderRecord;
|
|
176
209
|
constructor(options?: SpiderOptions);
|
|
177
210
|
crawl(startUrl: string): Promise<SpiderResult>;
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
1
2
|
import { performance } from 'node:perf_hooks';
|
|
2
3
|
import { createClient } from '../core/client.js';
|
|
3
4
|
import { ScrapeDocument } from './document.js';
|
|
@@ -108,7 +109,8 @@ function shouldCrawl(url, baseHost, options) {
|
|
|
108
109
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
109
110
|
return false;
|
|
110
111
|
}
|
|
111
|
-
|
|
112
|
+
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
113
|
+
if (options.sameDomain !== false && hostname !== baseHost) {
|
|
112
114
|
return false;
|
|
113
115
|
}
|
|
114
116
|
const skipExtensions = [
|
|
@@ -189,6 +191,8 @@ export class Spider {
|
|
|
189
191
|
running = false;
|
|
190
192
|
aborted = false;
|
|
191
193
|
pendingCount = 0;
|
|
194
|
+
domainRequestTimestamps = new Map();
|
|
195
|
+
contentHashes = new Map();
|
|
192
196
|
blockedDomains = new Set();
|
|
193
197
|
curlTransport = null;
|
|
194
198
|
curlAvailable = false;
|
|
@@ -198,6 +202,31 @@ export class Spider {
|
|
|
198
202
|
robotsData = null;
|
|
199
203
|
sitemapValidation = null;
|
|
200
204
|
robotsValidation = null;
|
|
205
|
+
async waitForDomainRateLimit(hostname) {
|
|
206
|
+
const limit = this.options.domainRateLimit?.maxPerSecond;
|
|
207
|
+
if (!limit || limit <= 0)
|
|
208
|
+
return;
|
|
209
|
+
const now = Date.now();
|
|
210
|
+
const window = 1000;
|
|
211
|
+
let timestamps = this.domainRequestTimestamps.get(hostname);
|
|
212
|
+
if (!timestamps) {
|
|
213
|
+
timestamps = [];
|
|
214
|
+
this.domainRequestTimestamps.set(hostname, timestamps);
|
|
215
|
+
}
|
|
216
|
+
while (timestamps.length > 0 && timestamps[0] <= now - window) {
|
|
217
|
+
timestamps.shift();
|
|
218
|
+
}
|
|
219
|
+
if (timestamps.length >= limit) {
|
|
220
|
+
const waitMs = timestamps[0] + window - now;
|
|
221
|
+
if (waitMs > 0)
|
|
222
|
+
await sleep(waitMs);
|
|
223
|
+
const afterWait = Date.now();
|
|
224
|
+
while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
|
|
225
|
+
timestamps.shift();
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
timestamps.push(Date.now());
|
|
229
|
+
}
|
|
201
230
|
toHeaderRecord(headers) {
|
|
202
231
|
const headerRecord = {};
|
|
203
232
|
headers.forEach((value, key) => {
|
|
@@ -239,11 +268,17 @@ export class Spider {
|
|
|
239
268
|
exclude: options.exclude,
|
|
240
269
|
include: options.include,
|
|
241
270
|
onPage: options.onPage,
|
|
242
|
-
onPageWithHtml: options.onPageWithHtml,
|
|
243
271
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
272
|
+
onBlocked: options.onBlocked,
|
|
273
|
+
onError: options.onError,
|
|
274
|
+
onRetry: options.onRetry,
|
|
275
|
+
onRedirect: options.onRedirect,
|
|
244
276
|
onProgress: options.onProgress,
|
|
245
277
|
extract: extractSchema,
|
|
246
278
|
parserOptions: options.parserOptions,
|
|
279
|
+
domainRateLimit: options.domainRateLimit,
|
|
280
|
+
deduplicateContent: options.deduplicateContent ?? false,
|
|
281
|
+
resume: options.resume ?? false,
|
|
247
282
|
};
|
|
248
283
|
if (options.proxy) {
|
|
249
284
|
if (typeof options.proxy === 'string') {
|
|
@@ -281,12 +316,20 @@ export class Spider {
|
|
|
281
316
|
const startTimestamp = Date.now();
|
|
282
317
|
const normalizedStart = normalizeUrl(startUrl);
|
|
283
318
|
const baseUrl = new URL(normalizedStart).origin;
|
|
284
|
-
this.baseHost = new URL(normalizedStart).hostname;
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
319
|
+
this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
|
|
320
|
+
if (!this.options.resume) {
|
|
321
|
+
await this.crawlQueue.clear();
|
|
322
|
+
await this.crawlStorage.clear();
|
|
323
|
+
this._visitedCount = 0;
|
|
324
|
+
this._queueSize = 0;
|
|
325
|
+
this._resultCount = 0;
|
|
326
|
+
this.domainRequestTimestamps.clear();
|
|
327
|
+
this.contentHashes.clear();
|
|
328
|
+
}
|
|
329
|
+
else {
|
|
330
|
+
this._queueSize = await this.crawlQueue.size();
|
|
331
|
+
this._resultCount = await this.crawlStorage.getResultCount();
|
|
332
|
+
}
|
|
290
333
|
this.running = true;
|
|
291
334
|
this.aborted = false;
|
|
292
335
|
this.pendingCount = 0;
|
|
@@ -295,8 +338,18 @@ export class Spider {
|
|
|
295
338
|
this.robotsData = null;
|
|
296
339
|
this.sitemapValidation = null;
|
|
297
340
|
this.robotsValidation = null;
|
|
298
|
-
this.
|
|
299
|
-
|
|
341
|
+
if (!this.options.resume) {
|
|
342
|
+
this.blockedDomains.clear();
|
|
343
|
+
this.domainStates.clear();
|
|
344
|
+
}
|
|
345
|
+
if (this.options.resume && this.options.deduplicateContent) {
|
|
346
|
+
const existingResults = await this.crawlStorage.getResults();
|
|
347
|
+
for (const r of existingResults) {
|
|
348
|
+
if (r.contentHash) {
|
|
349
|
+
this.contentHashes.set(r.contentHash, r.url);
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
}
|
|
300
353
|
if (this.options.transport !== 'undici') {
|
|
301
354
|
this.curlAvailable = await hasImpersonate();
|
|
302
355
|
if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
|
|
@@ -312,10 +365,12 @@ export class Spider {
|
|
|
312
365
|
const pending = new Map();
|
|
313
366
|
const scheduleUrl = async (item) => {
|
|
314
367
|
const normalized = normalizeUrl(item.url);
|
|
315
|
-
if (await this.crawlQueue.hasVisited(normalized))
|
|
316
|
-
return;
|
|
317
368
|
if (pending.has(normalized))
|
|
318
369
|
return;
|
|
370
|
+
if (await this.crawlQueue.hasVisited(normalized))
|
|
371
|
+
return;
|
|
372
|
+
await this.crawlQueue.markVisited(normalized);
|
|
373
|
+
this._visitedCount++;
|
|
319
374
|
if (item.depth > this.options.maxDepth)
|
|
320
375
|
return;
|
|
321
376
|
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
@@ -331,8 +386,6 @@ export class Spider {
|
|
|
331
386
|
return;
|
|
332
387
|
}
|
|
333
388
|
}
|
|
334
|
-
await this.crawlQueue.markVisited(normalized);
|
|
335
|
-
this._visitedCount++;
|
|
336
389
|
this.pendingCount++;
|
|
337
390
|
const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
|
|
338
391
|
.finally(() => {
|
|
@@ -341,16 +394,18 @@ export class Spider {
|
|
|
341
394
|
});
|
|
342
395
|
pending.set(normalized, promise);
|
|
343
396
|
};
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
397
|
+
if (!this.options.resume) {
|
|
398
|
+
await scheduleUrl({ url: normalizedStart, depth: 0 });
|
|
399
|
+
if (this.options.useSitemap && this.sitemapUrls.length > 0) {
|
|
400
|
+
for (const sitemapUrl of this.sitemapUrls) {
|
|
401
|
+
try {
|
|
402
|
+
const urlHost = new URL(sitemapUrl.loc).hostname;
|
|
403
|
+
if (urlHost === this.baseHost) {
|
|
404
|
+
await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
catch {
|
|
351
408
|
}
|
|
352
|
-
}
|
|
353
|
-
catch {
|
|
354
409
|
}
|
|
355
410
|
}
|
|
356
411
|
}
|
|
@@ -372,14 +427,33 @@ export class Spider {
|
|
|
372
427
|
if (pending.size > 0) {
|
|
373
428
|
await Promise.all(pending.values());
|
|
374
429
|
}
|
|
430
|
+
while (!this.aborted && this._resultCount < this.options.maxPages) {
|
|
431
|
+
const remaining = await this.crawlQueue.size();
|
|
432
|
+
if (remaining === 0 && pending.size === 0)
|
|
433
|
+
break;
|
|
434
|
+
this._queueSize = remaining;
|
|
435
|
+
let nextItem = await this.crawlQueue.pop();
|
|
436
|
+
while (nextItem && !this.aborted) {
|
|
437
|
+
this._queueSize = Math.max(0, this._queueSize - 1);
|
|
438
|
+
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
439
|
+
break;
|
|
440
|
+
await scheduleUrl(nextItem);
|
|
441
|
+
nextItem = await this.crawlQueue.pop();
|
|
442
|
+
}
|
|
443
|
+
if (pending.size > 0) {
|
|
444
|
+
await Promise.all(pending.values());
|
|
445
|
+
}
|
|
446
|
+
}
|
|
375
447
|
this.running = false;
|
|
376
448
|
const pages = await this.crawlStorage.getResults();
|
|
377
449
|
const errors = await this.crawlStorage.getErrors();
|
|
378
450
|
const sitemapAnalysis = this.buildSitemapAnalysis(pages);
|
|
379
451
|
const robotsAnalysis = this.buildRobotsAnalysis();
|
|
380
452
|
const visited = this.crawlQueue instanceof InMemoryCrawlQueue
|
|
381
|
-
? this.crawlQueue.getVisited()
|
|
453
|
+
? new Set(this.crawlQueue.getVisited())
|
|
382
454
|
: new Set(pages.map(r => r.url));
|
|
455
|
+
await this.crawlQueue.close?.();
|
|
456
|
+
await this.crawlStorage.close?.();
|
|
383
457
|
return {
|
|
384
458
|
startUrl: normalizedStart,
|
|
385
459
|
pages,
|
|
@@ -611,6 +685,9 @@ export class Spider {
|
|
|
611
685
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
612
686
|
const response = await clientForRequest.get(url, {
|
|
613
687
|
headers: this.buildRequestHeaders(url, false),
|
|
688
|
+
beforeRedirect: this.options.onRedirect
|
|
689
|
+
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
690
|
+
: undefined,
|
|
614
691
|
});
|
|
615
692
|
const contentType = response.headers.get('content-type') || '';
|
|
616
693
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -643,6 +720,7 @@ export class Spider {
|
|
|
643
720
|
};
|
|
644
721
|
};
|
|
645
722
|
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
723
|
+
await this.waitForDomainRateLimit(hostname);
|
|
646
724
|
await this.waitForDomainPenalty(hostname);
|
|
647
725
|
const useCurl = this.shouldUseCurlForHost(hostname, hasCurl, forcedTransport);
|
|
648
726
|
const transportForAttempt = useCurl ? 'curl' : 'undici';
|
|
@@ -732,6 +810,18 @@ export class Spider {
|
|
|
732
810
|
forcedTransport = 'curl';
|
|
733
811
|
}
|
|
734
812
|
}
|
|
813
|
+
if (this.options.onRetry) {
|
|
814
|
+
await this.options.onRetry({
|
|
815
|
+
url,
|
|
816
|
+
attempt: attempt + 1,
|
|
817
|
+
maxAttempts,
|
|
818
|
+
reason: attemptReason,
|
|
819
|
+
delay: waitMs,
|
|
820
|
+
transport: forcedTransport ?? transportForAttempt,
|
|
821
|
+
previousStatus: response.status,
|
|
822
|
+
timings,
|
|
823
|
+
});
|
|
824
|
+
}
|
|
735
825
|
await sleep(waitMs);
|
|
736
826
|
continue;
|
|
737
827
|
}
|
|
@@ -867,7 +957,21 @@ export class Spider {
|
|
|
867
957
|
};
|
|
868
958
|
await this.crawlStorage.saveResult(nonHtmlResult);
|
|
869
959
|
this._resultCount++;
|
|
870
|
-
this.options.onPage
|
|
960
|
+
if (this.options.onPage) {
|
|
961
|
+
let cachedDoc = null;
|
|
962
|
+
await this.options.onPage({
|
|
963
|
+
result: nonHtmlResult,
|
|
964
|
+
html: html || undefined,
|
|
965
|
+
document: html ? () => {
|
|
966
|
+
if (cachedDoc)
|
|
967
|
+
return Promise.resolve(cachedDoc);
|
|
968
|
+
return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
|
|
969
|
+
} : undefined,
|
|
970
|
+
});
|
|
971
|
+
}
|
|
972
|
+
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
973
|
+
await this.options.onBlocked(nonHtmlResult);
|
|
974
|
+
}
|
|
871
975
|
return;
|
|
872
976
|
}
|
|
873
977
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -898,6 +1002,21 @@ export class Spider {
|
|
|
898
1002
|
catch {
|
|
899
1003
|
}
|
|
900
1004
|
}
|
|
1005
|
+
let isDuplicate = false;
|
|
1006
|
+
let duplicateOf;
|
|
1007
|
+
let contentHash;
|
|
1008
|
+
if (this.options.deduplicateContent) {
|
|
1009
|
+
const bodyText = doc.text('body');
|
|
1010
|
+
contentHash = createHash('md5').update(bodyText).digest('hex');
|
|
1011
|
+
const existingUrl = this.contentHashes.get(contentHash);
|
|
1012
|
+
if (existingUrl) {
|
|
1013
|
+
isDuplicate = true;
|
|
1014
|
+
duplicateOf = existingUrl;
|
|
1015
|
+
}
|
|
1016
|
+
else {
|
|
1017
|
+
this.contentHashes.set(contentHash, item.url);
|
|
1018
|
+
}
|
|
1019
|
+
}
|
|
901
1020
|
const result = {
|
|
902
1021
|
url: item.url,
|
|
903
1022
|
status,
|
|
@@ -926,38 +1045,46 @@ export class Spider {
|
|
|
926
1045
|
timings,
|
|
927
1046
|
fetchedAt,
|
|
928
1047
|
extracted,
|
|
1048
|
+
contentHash,
|
|
1049
|
+
isDuplicate: isDuplicate || undefined,
|
|
1050
|
+
duplicateOf,
|
|
929
1051
|
};
|
|
930
1052
|
await this.crawlStorage.saveResult(result);
|
|
931
1053
|
this._resultCount++;
|
|
932
|
-
this.options.onPage
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
for (const link of links) {
|
|
939
|
-
if (!link.href)
|
|
940
|
-
continue;
|
|
941
|
-
const normalized = normalizeUrl(link.href);
|
|
942
|
-
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
943
|
-
continue;
|
|
944
|
-
candidateUrls.push(normalized);
|
|
945
|
-
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
1054
|
+
if (this.options.onPage) {
|
|
1055
|
+
await this.options.onPage({
|
|
1056
|
+
result,
|
|
1057
|
+
html,
|
|
1058
|
+
document: () => Promise.resolve(doc),
|
|
1059
|
+
});
|
|
946
1060
|
}
|
|
947
|
-
if (
|
|
948
|
-
const
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
1061
|
+
if (!isDuplicate) {
|
|
1062
|
+
const candidates = [];
|
|
1063
|
+
const candidateUrls = [];
|
|
1064
|
+
for (const link of links) {
|
|
1065
|
+
if (!link.href)
|
|
1066
|
+
continue;
|
|
1067
|
+
const normalized = normalizeUrl(link.href);
|
|
1068
|
+
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
1069
|
+
continue;
|
|
1070
|
+
candidateUrls.push(normalized);
|
|
1071
|
+
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
1072
|
+
}
|
|
1073
|
+
if (candidates.length > 0) {
|
|
1074
|
+
const visitedSet = this.crawlQueue.hasVisitedBatch
|
|
1075
|
+
? await this.crawlQueue.hasVisitedBatch(candidateUrls)
|
|
1076
|
+
: new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
|
|
1077
|
+
const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
|
|
1078
|
+
if (newItems.length > 0) {
|
|
1079
|
+
if (this.crawlQueue.pushBatch) {
|
|
1080
|
+
await this.crawlQueue.pushBatch(newItems);
|
|
1081
|
+
}
|
|
1082
|
+
else {
|
|
1083
|
+
for (const newItem of newItems)
|
|
1084
|
+
await this.crawlQueue.push(newItem);
|
|
1085
|
+
}
|
|
1086
|
+
this._queueSize += newItems.length;
|
|
959
1087
|
}
|
|
960
|
-
this._queueSize += newItems.length;
|
|
961
1088
|
}
|
|
962
1089
|
}
|
|
963
1090
|
}
|
|
@@ -1026,7 +1153,12 @@ export class Spider {
|
|
|
1026
1153
|
await this.crawlStorage.saveResult(errorResult);
|
|
1027
1154
|
this._resultCount++;
|
|
1028
1155
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1029
|
-
this.options.onPage
|
|
1156
|
+
if (this.options.onPage) {
|
|
1157
|
+
await this.options.onPage({ result: errorResult });
|
|
1158
|
+
}
|
|
1159
|
+
if (this.options.onError) {
|
|
1160
|
+
await this.options.onError(errorResult);
|
|
1161
|
+
}
|
|
1030
1162
|
}
|
|
1031
1163
|
}
|
|
1032
1164
|
getOrCreateDomainState(hostname) {
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
2
|
+
/**
 * SQLite-backed implementation of `CrawlQueueAdapter`.
 *
 * Instances are obtained through {@link SqliteCrawlQueue.create}; the
 * constructor is private.
 */
export declare class SqliteCrawlQueue implements CrawlQueueAdapter {
    private db;
    private stmts;

    private constructor();
    private ensureDb;

    /**
     * Opens (creating if necessary) the database and returns a ready queue.
     * When `dbPath` is omitted a randomly named file in the OS temp directory is used.
     */
    static create(opts?: { dbPath?: string }): Promise<SqliteCrawlQueue>;

    /** Returns the underlying database handle. */
    getDb(): any;

    /** Releases the database connection. */
    close(): Promise<void>;

    /** Removes every queued item, visited URL and metadata entry. */
    clear(): Promise<void>;

    // --- queue -----------------------------------------------------------

    /** Appends one item to the queue. */
    push(item: CrawlQueueItem): Promise<void>;

    /** Appends several items inside a single transaction. */
    pushBatch(items: CrawlQueueItem[]): Promise<void>;

    /**
     * Removes and returns the next item: lowest `priority` first (items
     * without a priority last), insertion order within equal priority.
     * Resolves to `null` when the queue is empty.
     */
    pop(): Promise<CrawlQueueItem | null>;

    /** Number of items currently queued. */
    size(): Promise<number>;

    // --- visited set -----------------------------------------------------

    /** Records `url` as visited; recording the same URL again is a no-op. */
    markVisited(url: string): Promise<void>;

    /** Whether `url` has been recorded as visited. */
    hasVisited(url: string): Promise<boolean>;

    /** Returns the subset of `urls` that has been recorded as visited. */
    hasVisitedBatch(urls: string[]): Promise<Set<string>>;

    /** Snapshot of every visited URL. */
    getVisitedSet(): Set<string>;

    // --- metadata --------------------------------------------------------

    /** Stores `value` under `key`, replacing any previous value. */
    saveMetadata(key: string, value: string): void;

    /** Reads the value stored under `key`, or `undefined` if absent. */
    getMetadata(key: string): string | undefined;

    /** Returns all stored metadata as a plain object. */
    getAllMetadata(): Record<string, string>;
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import * as path from 'node:path';
|
|
2
|
+
import * as os from 'node:os';
|
|
3
|
+
import * as crypto from 'node:crypto';
|
|
4
|
+
export class SqliteCrawlQueue {
|
|
5
|
+
db;
|
|
6
|
+
stmts;
|
|
7
|
+
constructor() { }
|
|
8
|
+
static async create(opts) {
|
|
9
|
+
const instance = new SqliteCrawlQueue();
|
|
10
|
+
const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-crawl-${crypto.randomUUID().slice(0, 8)}.db`);
|
|
11
|
+
await instance.ensureDb(dbPath);
|
|
12
|
+
return instance;
|
|
13
|
+
}
|
|
14
|
+
async ensureDb(dbPath) {
|
|
15
|
+
const BetterSqlite3 = (await import('better-sqlite3')).default;
|
|
16
|
+
this.db = new BetterSqlite3(dbPath);
|
|
17
|
+
this.db.pragma('journal_mode = WAL');
|
|
18
|
+
this.db.exec(`
|
|
19
|
+
CREATE TABLE IF NOT EXISTS queue (
|
|
20
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
21
|
+
url TEXT NOT NULL,
|
|
22
|
+
depth INTEGER NOT NULL,
|
|
23
|
+
priority INTEGER,
|
|
24
|
+
discovered_from TEXT
|
|
25
|
+
);
|
|
26
|
+
CREATE TABLE IF NOT EXISTS visited (url TEXT PRIMARY KEY);
|
|
27
|
+
CREATE TABLE IF NOT EXISTS crawl_metadata (key TEXT PRIMARY KEY, value TEXT NOT NULL);
|
|
28
|
+
CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(priority ASC, id ASC);
|
|
29
|
+
`);
|
|
30
|
+
this.stmts = {
|
|
31
|
+
push: this.db.prepare('INSERT INTO queue (url, depth, priority, discovered_from) VALUES (?, ?, ?, ?)'),
|
|
32
|
+
pop: this.db.prepare('SELECT id, url, depth, priority, discovered_from FROM queue ORDER BY priority ASC NULLS LAST, id ASC LIMIT 1'),
|
|
33
|
+
deletePop: this.db.prepare('DELETE FROM queue WHERE id = ?'),
|
|
34
|
+
hasVisited: this.db.prepare('SELECT 1 FROM visited WHERE url = ?'),
|
|
35
|
+
markVisited: this.db.prepare('INSERT OR IGNORE INTO visited (url) VALUES (?)'),
|
|
36
|
+
size: this.db.prepare('SELECT COUNT(*) AS cnt FROM queue'),
|
|
37
|
+
clearQueue: this.db.prepare('DELETE FROM queue'),
|
|
38
|
+
clearVisited: this.db.prepare('DELETE FROM visited'),
|
|
39
|
+
clearMetadata: this.db.prepare('DELETE FROM crawl_metadata'),
|
|
40
|
+
allVisited: this.db.prepare('SELECT url FROM visited'),
|
|
41
|
+
saveMeta: this.db.prepare('INSERT OR REPLACE INTO crawl_metadata (key, value) VALUES (?, ?)'),
|
|
42
|
+
getMeta: this.db.prepare('SELECT value FROM crawl_metadata WHERE key = ?'),
|
|
43
|
+
allMeta: this.db.prepare('SELECT key, value FROM crawl_metadata'),
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
getDb() {
|
|
47
|
+
return this.db;
|
|
48
|
+
}
|
|
49
|
+
async push(item) {
|
|
50
|
+
this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
|
|
51
|
+
}
|
|
52
|
+
async pushBatch(items) {
|
|
53
|
+
const insert = this.db.transaction((rows) => {
|
|
54
|
+
for (const item of rows) {
|
|
55
|
+
this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
|
|
56
|
+
}
|
|
57
|
+
});
|
|
58
|
+
insert(items);
|
|
59
|
+
}
|
|
60
|
+
async pop() {
|
|
61
|
+
const row = this.stmts.pop.get();
|
|
62
|
+
if (!row)
|
|
63
|
+
return null;
|
|
64
|
+
this.stmts.deletePop.run(row.id);
|
|
65
|
+
return {
|
|
66
|
+
url: row.url,
|
|
67
|
+
depth: row.depth,
|
|
68
|
+
priority: row.priority ?? undefined,
|
|
69
|
+
discoveredFrom: row.discovered_from ?? undefined,
|
|
70
|
+
};
|
|
71
|
+
}
|
|
72
|
+
async hasVisited(url) {
|
|
73
|
+
return this.stmts.hasVisited.get(url) !== undefined;
|
|
74
|
+
}
|
|
75
|
+
async hasVisitedBatch(urls) {
|
|
76
|
+
const result = new Set();
|
|
77
|
+
for (const url of urls) {
|
|
78
|
+
if (this.stmts.hasVisited.get(url) !== undefined) {
|
|
79
|
+
result.add(url);
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
return result;
|
|
83
|
+
}
|
|
84
|
+
async markVisited(url) {
|
|
85
|
+
this.stmts.markVisited.run(url);
|
|
86
|
+
}
|
|
87
|
+
async size() {
|
|
88
|
+
const row = this.stmts.size.get();
|
|
89
|
+
return row.cnt;
|
|
90
|
+
}
|
|
91
|
+
async clear() {
|
|
92
|
+
this.stmts.clearQueue.run();
|
|
93
|
+
this.stmts.clearVisited.run();
|
|
94
|
+
this.stmts.clearMetadata.run();
|
|
95
|
+
}
|
|
96
|
+
async close() {
|
|
97
|
+
this.db.close();
|
|
98
|
+
}
|
|
99
|
+
getVisitedSet() {
|
|
100
|
+
const rows = this.stmts.allVisited.all();
|
|
101
|
+
return new Set(rows.map((r) => r.url));
|
|
102
|
+
}
|
|
103
|
+
saveMetadata(key, value) {
|
|
104
|
+
this.stmts.saveMeta.run(key, value);
|
|
105
|
+
}
|
|
106
|
+
getMetadata(key) {
|
|
107
|
+
const row = this.stmts.getMeta.get(key);
|
|
108
|
+
return row?.value;
|
|
109
|
+
}
|
|
110
|
+
getAllMetadata() {
|
|
111
|
+
const rows = this.stmts.allMeta.all();
|
|
112
|
+
const result = {};
|
|
113
|
+
for (const row of rows) {
|
|
114
|
+
result[row.key] = row.value;
|
|
115
|
+
}
|
|
116
|
+
return result;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type { CrawlStorageAdapter } from './crawl-storage.js';
|
|
2
|
+
import type { SpiderPageResult } from './spider.js';
|
|
3
|
+
/**
 * SQLite-backed implementation of `CrawlStorageAdapter`.
 *
 * Instances are obtained through {@link SqliteCrawlStorage.create}; the
 * constructor is private.
 *
 * NOTE(review): the implementation is not visible alongside this
 * declaration; the member notes below restate the signatures only.
 */
export declare class SqliteCrawlStorage implements CrawlStorageAdapter {
    private db;
    // NOTE(review): presumably records whether this instance opened the
    // connection itself (as opposed to receiving `opts.db`) — confirm.
    private ownsDb;
    private stmts;

    private constructor();
    private init;

    /**
     * Creates a storage instance from either a database path or an
     * existing database handle.
     */
    static create(opts?: { dbPath?: string; db?: any }): Promise<SqliteCrawlStorage>;

    /** Persists one page result. */
    saveResult(result: SpiderPageResult): Promise<void>;

    /** Persists one crawl error. */
    saveError(error: { url: string; error: string }): Promise<void>;

    /** Resolves to the number of stored page results. */
    getResultCount(): Promise<number>;

    /** Resolves to the stored page results. */
    getResults(): Promise<SpiderPageResult[]>;

    /** Resolves to the stored crawl errors. */
    getErrors(): Promise<{ url: string; error: string }[]>;

    /** Removes stored data. */
    clear(): Promise<void>;

    /** Releases resources held by this storage. */
    close(): Promise<void>;
}
|