recker 1.0.93 → 1.0.94-next.83dffd9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/index.d.ts +3 -1
- package/dist/browser/scrape/index.js +2 -0
- package/dist/browser/scrape/spider.d.ts +36 -2
- package/dist/browser/scrape/spider.js +209 -58
- package/dist/browser/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/browser/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/browser/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/browser/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -0
- package/dist/browser/seo/seo-spider.js +17 -3
- package/dist/cli/commands/spider-runner.js +2 -1
- package/dist/cli/index.js +0 -3
- package/dist/cli/tui/jobs/spider-job.js +2 -1
- package/dist/mcp/prompts/index.js +15 -6
- package/dist/scrape/index.d.ts +3 -1
- package/dist/scrape/index.js +2 -0
- package/dist/scrape/spider.d.ts +36 -2
- package/dist/scrape/spider.js +209 -58
- package/dist/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/seo/seo-spider.d.ts +2 -0
- package/dist/seo/seo-spider.js +17 -3
- package/dist/version.js +1 -1
- package/package.json +7 -1
package/dist/scrape/spider.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
1
2
|
import { performance } from 'node:perf_hooks';
|
|
2
3
|
import { createClient } from '../core/client.js';
|
|
3
4
|
import { ScrapeDocument } from './document.js';
|
|
@@ -39,10 +40,15 @@ const SEC_CH_PLATFORM_HINTS = [
|
|
|
39
40
|
/**
 * Extract the host name component of an absolute URL.
 *
 * @param url Absolute URL string; the `URL` constructor throws a `TypeError`
 *            when it cannot be parsed.
 * @returns The `hostname` of the parsed URL (no port, no credentials).
 */
function getHostname(url) {
    const { hostname } = new URL(url);
    return hostname;
}
|
|
42
|
-
function sleep(ms) {
|
|
43
|
+
/**
 * Resolve after `ms` milliseconds, or earlier if `signal` is aborted.
 *
 * The promise never rejects: an abort simply ends the wait early, so callers
 * that care must check their own abort state afterwards.
 *
 * @param ms     Delay in milliseconds; values `<= 0` resolve immediately.
 * @param signal Optional `AbortSignal`; when already aborted the call
 *               resolves immediately without scheduling a timer.
 * @returns A promise that resolves with `undefined`.
 */
function sleep(ms, signal) {
    if (ms <= 0 || signal?.aborted)
        return Promise.resolve();
    return new Promise(resolve => {
        const onAbort = () => {
            clearTimeout(timer);
            resolve();
        };
        const timer = setTimeout(() => {
            // The same signal is reused for every sleep of a crawl; drop the
            // listener once the timer wins so listeners do not accumulate on it.
            signal?.removeEventListener('abort', onAbort);
            resolve();
        }, ms);
        signal?.addEventListener('abort', onAbort, { once: true });
    });
}
|
|
47
53
|
function getRetryAfterDelay(response) {
|
|
48
54
|
const retryAfter = response.headers.get('retry-after');
|
|
@@ -108,7 +114,8 @@ function shouldCrawl(url, baseHost, options) {
|
|
|
108
114
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
109
115
|
return false;
|
|
110
116
|
}
|
|
111
|
-
|
|
117
|
+
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
118
|
+
if (options.sameDomain !== false && hostname !== baseHost) {
|
|
112
119
|
return false;
|
|
113
120
|
}
|
|
114
121
|
const skipExtensions = [
|
|
@@ -188,7 +195,10 @@ export class Spider {
|
|
|
188
195
|
baseHost = '';
|
|
189
196
|
running = false;
|
|
190
197
|
aborted = false;
|
|
198
|
+
abortController = new AbortController();
|
|
191
199
|
pendingCount = 0;
|
|
200
|
+
domainRequestTimestamps = new Map();
|
|
201
|
+
contentHashes = new Map();
|
|
192
202
|
blockedDomains = new Set();
|
|
193
203
|
curlTransport = null;
|
|
194
204
|
curlAvailable = false;
|
|
@@ -198,6 +208,33 @@ export class Spider {
|
|
|
198
208
|
robotsData = null;
|
|
199
209
|
sitemapValidation = null;
|
|
200
210
|
robotsValidation = null;
|
|
211
|
+
async waitForDomainRateLimit(hostname) {
|
|
212
|
+
const limit = this.options.domainRateLimit?.maxPerSecond;
|
|
213
|
+
if (!limit || limit <= 0)
|
|
214
|
+
return;
|
|
215
|
+
const now = Date.now();
|
|
216
|
+
const window = 1000;
|
|
217
|
+
let timestamps = this.domainRequestTimestamps.get(hostname);
|
|
218
|
+
if (!timestamps) {
|
|
219
|
+
timestamps = [];
|
|
220
|
+
this.domainRequestTimestamps.set(hostname, timestamps);
|
|
221
|
+
}
|
|
222
|
+
while (timestamps.length > 0 && timestamps[0] <= now - window) {
|
|
223
|
+
timestamps.shift();
|
|
224
|
+
}
|
|
225
|
+
if (timestamps.length >= limit) {
|
|
226
|
+
const waitMs = timestamps[0] + window - now;
|
|
227
|
+
if (waitMs > 0)
|
|
228
|
+
await sleep(waitMs, this.abortController.signal);
|
|
229
|
+
if (this.aborted)
|
|
230
|
+
return;
|
|
231
|
+
const afterWait = Date.now();
|
|
232
|
+
while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
|
|
233
|
+
timestamps.shift();
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
timestamps.push(Date.now());
|
|
237
|
+
}
|
|
201
238
|
toHeaderRecord(headers) {
|
|
202
239
|
const headerRecord = {};
|
|
203
240
|
headers.forEach((value, key) => {
|
|
@@ -239,11 +276,17 @@ export class Spider {
|
|
|
239
276
|
exclude: options.exclude,
|
|
240
277
|
include: options.include,
|
|
241
278
|
onPage: options.onPage,
|
|
242
|
-
onPageWithHtml: options.onPageWithHtml,
|
|
243
279
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
280
|
+
onBlocked: options.onBlocked,
|
|
281
|
+
onError: options.onError,
|
|
282
|
+
onRetry: options.onRetry,
|
|
283
|
+
onRedirect: options.onRedirect,
|
|
244
284
|
onProgress: options.onProgress,
|
|
245
285
|
extract: extractSchema,
|
|
246
286
|
parserOptions: options.parserOptions,
|
|
287
|
+
domainRateLimit: options.domainRateLimit,
|
|
288
|
+
deduplicateContent: options.deduplicateContent ?? false,
|
|
289
|
+
resume: options.resume ?? false,
|
|
247
290
|
};
|
|
248
291
|
if (options.proxy) {
|
|
249
292
|
if (typeof options.proxy === 'string') {
|
|
@@ -281,22 +324,41 @@ export class Spider {
|
|
|
281
324
|
const startTimestamp = Date.now();
|
|
282
325
|
const normalizedStart = normalizeUrl(startUrl);
|
|
283
326
|
const baseUrl = new URL(normalizedStart).origin;
|
|
284
|
-
this.baseHost = new URL(normalizedStart).hostname;
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
327
|
+
this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
|
|
328
|
+
if (!this.options.resume) {
|
|
329
|
+
await this.crawlQueue.clear();
|
|
330
|
+
await this.crawlStorage.clear();
|
|
331
|
+
this._visitedCount = 0;
|
|
332
|
+
this._queueSize = 0;
|
|
333
|
+
this._resultCount = 0;
|
|
334
|
+
this.domainRequestTimestamps.clear();
|
|
335
|
+
this.contentHashes.clear();
|
|
336
|
+
}
|
|
337
|
+
else {
|
|
338
|
+
this._queueSize = await this.crawlQueue.size();
|
|
339
|
+
this._resultCount = await this.crawlStorage.getResultCount();
|
|
340
|
+
}
|
|
290
341
|
this.running = true;
|
|
291
342
|
this.aborted = false;
|
|
343
|
+
this.abortController = new AbortController();
|
|
292
344
|
this.pendingCount = 0;
|
|
293
345
|
this.sitemapUrls = [];
|
|
294
346
|
this.sitemapUrlSet.clear();
|
|
295
347
|
this.robotsData = null;
|
|
296
348
|
this.sitemapValidation = null;
|
|
297
349
|
this.robotsValidation = null;
|
|
298
|
-
this.
|
|
299
|
-
|
|
350
|
+
if (!this.options.resume) {
|
|
351
|
+
this.blockedDomains.clear();
|
|
352
|
+
this.domainStates.clear();
|
|
353
|
+
}
|
|
354
|
+
if (this.options.resume && this.options.deduplicateContent) {
|
|
355
|
+
const existingResults = await this.crawlStorage.getResults();
|
|
356
|
+
for (const r of existingResults) {
|
|
357
|
+
if (r.contentHash) {
|
|
358
|
+
this.contentHashes.set(r.contentHash, r.url);
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
}
|
|
300
362
|
if (this.options.transport !== 'undici') {
|
|
301
363
|
this.curlAvailable = await hasImpersonate();
|
|
302
364
|
if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
|
|
@@ -312,10 +374,12 @@ export class Spider {
|
|
|
312
374
|
const pending = new Map();
|
|
313
375
|
const scheduleUrl = async (item) => {
|
|
314
376
|
const normalized = normalizeUrl(item.url);
|
|
315
|
-
if (await this.crawlQueue.hasVisited(normalized))
|
|
316
|
-
return;
|
|
317
377
|
if (pending.has(normalized))
|
|
318
378
|
return;
|
|
379
|
+
if (await this.crawlQueue.hasVisited(normalized))
|
|
380
|
+
return;
|
|
381
|
+
await this.crawlQueue.markVisited(normalized);
|
|
382
|
+
this._visitedCount++;
|
|
319
383
|
if (item.depth > this.options.maxDepth)
|
|
320
384
|
return;
|
|
321
385
|
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
@@ -331,8 +395,6 @@ export class Spider {
|
|
|
331
395
|
return;
|
|
332
396
|
}
|
|
333
397
|
}
|
|
334
|
-
await this.crawlQueue.markVisited(normalized);
|
|
335
|
-
this._visitedCount++;
|
|
336
398
|
this.pendingCount++;
|
|
337
399
|
const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
|
|
338
400
|
.finally(() => {
|
|
@@ -341,16 +403,18 @@ export class Spider {
|
|
|
341
403
|
});
|
|
342
404
|
pending.set(normalized, promise);
|
|
343
405
|
};
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
406
|
+
if (!this.options.resume) {
|
|
407
|
+
await scheduleUrl({ url: normalizedStart, depth: 0 });
|
|
408
|
+
if (this.options.useSitemap && this.sitemapUrls.length > 0) {
|
|
409
|
+
for (const sitemapUrl of this.sitemapUrls) {
|
|
410
|
+
try {
|
|
411
|
+
const urlHost = new URL(sitemapUrl.loc).hostname;
|
|
412
|
+
if (urlHost === this.baseHost) {
|
|
413
|
+
await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
catch {
|
|
351
417
|
}
|
|
352
|
-
}
|
|
353
|
-
catch {
|
|
354
418
|
}
|
|
355
419
|
}
|
|
356
420
|
}
|
|
@@ -372,14 +436,33 @@ export class Spider {
|
|
|
372
436
|
if (pending.size > 0) {
|
|
373
437
|
await Promise.all(pending.values());
|
|
374
438
|
}
|
|
439
|
+
while (!this.aborted && this._resultCount < this.options.maxPages) {
|
|
440
|
+
const remaining = await this.crawlQueue.size();
|
|
441
|
+
if (remaining === 0 && pending.size === 0)
|
|
442
|
+
break;
|
|
443
|
+
this._queueSize = remaining;
|
|
444
|
+
let nextItem = await this.crawlQueue.pop();
|
|
445
|
+
while (nextItem && !this.aborted) {
|
|
446
|
+
this._queueSize = Math.max(0, this._queueSize - 1);
|
|
447
|
+
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
448
|
+
break;
|
|
449
|
+
await scheduleUrl(nextItem);
|
|
450
|
+
nextItem = await this.crawlQueue.pop();
|
|
451
|
+
}
|
|
452
|
+
if (pending.size > 0) {
|
|
453
|
+
await Promise.all(pending.values());
|
|
454
|
+
}
|
|
455
|
+
}
|
|
375
456
|
this.running = false;
|
|
376
457
|
const pages = await this.crawlStorage.getResults();
|
|
377
458
|
const errors = await this.crawlStorage.getErrors();
|
|
378
459
|
const sitemapAnalysis = this.buildSitemapAnalysis(pages);
|
|
379
460
|
const robotsAnalysis = this.buildRobotsAnalysis();
|
|
380
461
|
const visited = this.crawlQueue instanceof InMemoryCrawlQueue
|
|
381
|
-
? this.crawlQueue.getVisited()
|
|
462
|
+
? new Set(this.crawlQueue.getVisited())
|
|
382
463
|
: new Set(pages.map(r => r.url));
|
|
464
|
+
await this.crawlQueue.close?.();
|
|
465
|
+
await this.crawlStorage.close?.();
|
|
383
466
|
return {
|
|
384
467
|
startUrl: normalizedStart,
|
|
385
468
|
pages,
|
|
@@ -571,6 +654,8 @@ export class Spider {
|
|
|
571
654
|
let lastRetryAfterMs = 0;
|
|
572
655
|
const proxyUrl = this.proxyAdapter ? await this.proxyAdapter.getProxy() : null;
|
|
573
656
|
const executeRequest = async (useCurl) => {
|
|
657
|
+
if (this.aborted)
|
|
658
|
+
throw new Error('Crawl aborted');
|
|
574
659
|
if (useCurl && this.curlTransport) {
|
|
575
660
|
const curlForRequest = proxyUrl
|
|
576
661
|
? new CurlTransport(proxyUrl)
|
|
@@ -611,6 +696,10 @@ export class Spider {
|
|
|
611
696
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
612
697
|
const response = await clientForRequest.get(url, {
|
|
613
698
|
headers: this.buildRequestHeaders(url, false),
|
|
699
|
+
signal: this.abortController.signal,
|
|
700
|
+
beforeRedirect: this.options.onRedirect
|
|
701
|
+
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
702
|
+
: undefined,
|
|
614
703
|
});
|
|
615
704
|
const contentType = response.headers.get('content-type') || '';
|
|
616
705
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -643,6 +732,7 @@ export class Spider {
|
|
|
643
732
|
};
|
|
644
733
|
};
|
|
645
734
|
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
735
|
+
await this.waitForDomainRateLimit(hostname);
|
|
646
736
|
await this.waitForDomainPenalty(hostname);
|
|
647
737
|
const useCurl = this.shouldUseCurlForHost(hostname, hasCurl, forcedTransport);
|
|
648
738
|
const transportForAttempt = useCurl ? 'curl' : 'undici';
|
|
@@ -732,7 +822,21 @@ export class Spider {
|
|
|
732
822
|
forcedTransport = 'curl';
|
|
733
823
|
}
|
|
734
824
|
}
|
|
735
|
-
|
|
825
|
+
if (this.options.onRetry) {
|
|
826
|
+
await this.options.onRetry({
|
|
827
|
+
url,
|
|
828
|
+
attempt: attempt + 1,
|
|
829
|
+
maxAttempts,
|
|
830
|
+
reason: attemptReason,
|
|
831
|
+
delay: waitMs,
|
|
832
|
+
transport: forcedTransport ?? transportForAttempt,
|
|
833
|
+
previousStatus: response.status,
|
|
834
|
+
timings,
|
|
835
|
+
});
|
|
836
|
+
}
|
|
837
|
+
if (this.aborted)
|
|
838
|
+
break;
|
|
839
|
+
await sleep(waitMs, this.abortController.signal);
|
|
736
840
|
continue;
|
|
737
841
|
}
|
|
738
842
|
catch (error) {
|
|
@@ -763,8 +867,10 @@ export class Spider {
|
|
|
763
867
|
forcedTransport = 'curl';
|
|
764
868
|
}
|
|
765
869
|
}
|
|
870
|
+
if (this.aborted)
|
|
871
|
+
break;
|
|
766
872
|
const waitMs = this.getRetryWait(hostname, attempt + 1);
|
|
767
|
-
await sleep(waitMs);
|
|
873
|
+
await sleep(waitMs, this.abortController.signal);
|
|
768
874
|
}
|
|
769
875
|
}
|
|
770
876
|
if (lastResponse) {
|
|
@@ -867,7 +973,21 @@ export class Spider {
|
|
|
867
973
|
};
|
|
868
974
|
await this.crawlStorage.saveResult(nonHtmlResult);
|
|
869
975
|
this._resultCount++;
|
|
870
|
-
this.options.onPage
|
|
976
|
+
if (this.options.onPage) {
|
|
977
|
+
let cachedDoc = null;
|
|
978
|
+
await this.options.onPage({
|
|
979
|
+
result: nonHtmlResult,
|
|
980
|
+
html: html || undefined,
|
|
981
|
+
document: html ? () => {
|
|
982
|
+
if (cachedDoc)
|
|
983
|
+
return Promise.resolve(cachedDoc);
|
|
984
|
+
return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
|
|
985
|
+
} : undefined,
|
|
986
|
+
});
|
|
987
|
+
}
|
|
988
|
+
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
989
|
+
await this.options.onBlocked(nonHtmlResult);
|
|
990
|
+
}
|
|
871
991
|
return;
|
|
872
992
|
}
|
|
873
993
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -898,6 +1018,21 @@ export class Spider {
|
|
|
898
1018
|
catch {
|
|
899
1019
|
}
|
|
900
1020
|
}
|
|
1021
|
+
let isDuplicate = false;
|
|
1022
|
+
let duplicateOf;
|
|
1023
|
+
let contentHash;
|
|
1024
|
+
if (this.options.deduplicateContent) {
|
|
1025
|
+
const bodyText = doc.text('body');
|
|
1026
|
+
contentHash = createHash('md5').update(bodyText).digest('hex');
|
|
1027
|
+
const existingUrl = this.contentHashes.get(contentHash);
|
|
1028
|
+
if (existingUrl) {
|
|
1029
|
+
isDuplicate = true;
|
|
1030
|
+
duplicateOf = existingUrl;
|
|
1031
|
+
}
|
|
1032
|
+
else {
|
|
1033
|
+
this.contentHashes.set(contentHash, item.url);
|
|
1034
|
+
}
|
|
1035
|
+
}
|
|
901
1036
|
const result = {
|
|
902
1037
|
url: item.url,
|
|
903
1038
|
status,
|
|
@@ -926,42 +1061,52 @@ export class Spider {
|
|
|
926
1061
|
timings,
|
|
927
1062
|
fetchedAt,
|
|
928
1063
|
extracted,
|
|
1064
|
+
contentHash,
|
|
1065
|
+
isDuplicate: isDuplicate || undefined,
|
|
1066
|
+
duplicateOf,
|
|
929
1067
|
};
|
|
930
1068
|
await this.crawlStorage.saveResult(result);
|
|
931
1069
|
this._resultCount++;
|
|
932
|
-
this.options.onPage
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
for (const link of links) {
|
|
939
|
-
if (!link.href)
|
|
940
|
-
continue;
|
|
941
|
-
const normalized = normalizeUrl(link.href);
|
|
942
|
-
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
943
|
-
continue;
|
|
944
|
-
candidateUrls.push(normalized);
|
|
945
|
-
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
1070
|
+
if (this.options.onPage) {
|
|
1071
|
+
await this.options.onPage({
|
|
1072
|
+
result,
|
|
1073
|
+
html,
|
|
1074
|
+
document: () => Promise.resolve(doc),
|
|
1075
|
+
});
|
|
946
1076
|
}
|
|
947
|
-
if (
|
|
948
|
-
const
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
1077
|
+
if (!isDuplicate) {
|
|
1078
|
+
const candidates = [];
|
|
1079
|
+
const candidateUrls = [];
|
|
1080
|
+
for (const link of links) {
|
|
1081
|
+
if (!link.href)
|
|
1082
|
+
continue;
|
|
1083
|
+
const normalized = normalizeUrl(link.href);
|
|
1084
|
+
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
1085
|
+
continue;
|
|
1086
|
+
candidateUrls.push(normalized);
|
|
1087
|
+
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
1088
|
+
}
|
|
1089
|
+
if (candidates.length > 0) {
|
|
1090
|
+
const visitedSet = this.crawlQueue.hasVisitedBatch
|
|
1091
|
+
? await this.crawlQueue.hasVisitedBatch(candidateUrls)
|
|
1092
|
+
: new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
|
|
1093
|
+
const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
|
|
1094
|
+
if (newItems.length > 0) {
|
|
1095
|
+
if (this.crawlQueue.pushBatch) {
|
|
1096
|
+
await this.crawlQueue.pushBatch(newItems);
|
|
1097
|
+
}
|
|
1098
|
+
else {
|
|
1099
|
+
for (const newItem of newItems)
|
|
1100
|
+
await this.crawlQueue.push(newItem);
|
|
1101
|
+
}
|
|
1102
|
+
this._queueSize += newItems.length;
|
|
959
1103
|
}
|
|
960
|
-
this._queueSize += newItems.length;
|
|
961
1104
|
}
|
|
962
1105
|
}
|
|
963
1106
|
}
|
|
964
1107
|
catch (error) {
|
|
1108
|
+
if (this.aborted)
|
|
1109
|
+
return;
|
|
965
1110
|
const errAttempts = typeof error?.attempts === 'number' ? error.attempts : 1;
|
|
966
1111
|
const errRetryCount = typeof error?.retryCount === 'number' ? error.retryCount : Math.max(0, errAttempts - 1);
|
|
967
1112
|
const errRetryAfterMs = typeof error?.retryAfterMs === 'number' ? error?.retryAfterMs : 0;
|
|
@@ -1026,7 +1171,12 @@ export class Spider {
|
|
|
1026
1171
|
await this.crawlStorage.saveResult(errorResult);
|
|
1027
1172
|
this._resultCount++;
|
|
1028
1173
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1029
|
-
this.options.onPage
|
|
1174
|
+
if (this.options.onPage) {
|
|
1175
|
+
await this.options.onPage({ result: errorResult });
|
|
1176
|
+
}
|
|
1177
|
+
if (this.options.onError) {
|
|
1178
|
+
await this.options.onError(errorResult);
|
|
1179
|
+
}
|
|
1030
1180
|
}
|
|
1031
1181
|
}
|
|
1032
1182
|
getOrCreateDomainState(hostname) {
|
|
@@ -1146,7 +1296,7 @@ export class Spider {
|
|
|
1146
1296
|
const now = Date.now();
|
|
1147
1297
|
const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
|
|
1148
1298
|
if (delay > 0) {
|
|
1149
|
-
await sleep(delay);
|
|
1299
|
+
await sleep(delay, this.abortController.signal);
|
|
1150
1300
|
}
|
|
1151
1301
|
}
|
|
1152
1302
|
registerDomainBlock(hostname) {
|
|
@@ -1217,6 +1367,7 @@ export class Spider {
|
|
|
1217
1367
|
}
|
|
1218
1368
|
abort() {
|
|
1219
1369
|
this.aborted = true;
|
|
1370
|
+
this.abortController.abort();
|
|
1220
1371
|
}
|
|
1221
1372
|
isRunning() {
|
|
1222
1373
|
return this.running;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
2
|
+
/**
 * Crawl queue adapter that keeps pending items, visited URLs and key/value
 * crawl metadata in a SQLite database.
 *
 * The constructor is private; obtain instances via {@link SqliteCrawlQueue.create}.
 */
export declare class SqliteCrawlQueue implements CrawlQueueAdapter {
    private db;
    private stmts;
    private constructor();
    /**
     * Open (or create) the backing database and return a ready queue.
     *
     * @param opts.dbPath Database file to use. When omitted, a randomly named
     *                    `recker-crawl-*.db` file in the OS temp directory is used.
     */
    static create(opts?: { dbPath?: string }): Promise<SqliteCrawlQueue>;
    private ensureDb;
    /** Return the underlying database handle. */
    getDb(): any;
    /** Append one item to the queue. */
    push(item: CrawlQueueItem): Promise<void>;
    /** Append several items inside a single transaction. */
    pushBatch(items: CrawlQueueItem[]): Promise<void>;
    /**
     * Remove and return the next item: lowest `priority` first (items without
     * a priority last), insertion order within equal priority.
     * Resolves to `null` when the queue is empty.
     */
    pop(): Promise<CrawlQueueItem | null>;
    /** Whether `url` has been recorded through {@link markVisited}. */
    hasVisited(url: string): Promise<boolean>;
    /** Return the subset of `urls` that has already been marked visited. */
    hasVisitedBatch(urls: string[]): Promise<Set<string>>;
    /** Record `url` as visited; marking the same URL again is a no-op. */
    markVisited(url: string): Promise<void>;
    /** Number of items currently waiting in the queue. */
    size(): Promise<number>;
    /** Delete all queued items, visited URLs and metadata. */
    clear(): Promise<void>;
    /** Close the database connection. */
    close(): Promise<void>;
    /** Load every visited URL into a new `Set`. */
    getVisitedSet(): Set<string>;
    /** Store `value` under `key`, replacing any previous value. */
    saveMetadata(key: string, value: string): void;
    /** Read the value stored under `key`, or `undefined` when absent. */
    getMetadata(key: string): string | undefined;
    /** Return all stored metadata as a plain object. */
    getAllMetadata(): Record<string, string>;
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import * as path from 'node:path';
|
|
2
|
+
import * as os from 'node:os';
|
|
3
|
+
import * as crypto from 'node:crypto';
|
|
4
|
+
/**
 * Crawl queue persisted in SQLite through `better-sqlite3`.
 *
 * Three tables are maintained: `queue` (pending items), `visited` (URLs
 * already scheduled) and `crawl_metadata` (free-form key/value pairs).
 * Every method is `async` to satisfy the adapter interface, but the
 * underlying database calls are synchronous.
 */
export class SqliteCrawlQueue {
    /** Open `better-sqlite3` connection; assigned by `ensureDb`. */
    db;
    /** Prepared statements keyed by operation; assigned by `ensureDb`. */
    stmts;
    constructor() { }
    /**
     * Create a queue backed by `opts.dbPath`, or by a randomly named
     * `recker-crawl-*.db` file in the OS temp directory when no path is given.
     *
     * @param opts Optional `{ dbPath }`.
     * @returns A queue whose schema and prepared statements are ready.
     */
    static async create(opts) {
        const instance = new SqliteCrawlQueue();
        const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-crawl-${crypto.randomUUID().slice(0, 8)}.db`);
        await instance.ensureDb(dbPath);
        return instance;
    }
    /**
     * Open the database, create missing tables/indexes and prepare statements.
     * `better-sqlite3` is imported lazily so it is only required when this
     * adapter is actually used.
     *
     * @param dbPath Database file path handed to `better-sqlite3`.
     */
    async ensureDb(dbPath) {
        const BetterSqlite3 = (await import('better-sqlite3')).default;
        this.db = new BetterSqlite3(dbPath);
        this.db.pragma('journal_mode = WAL');
        this.db.exec(`
            CREATE TABLE IF NOT EXISTS queue (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL,
                depth INTEGER NOT NULL,
                priority INTEGER,
                discovered_from TEXT
            );
            CREATE TABLE IF NOT EXISTS visited (url TEXT PRIMARY KEY);
            CREATE TABLE IF NOT EXISTS crawl_metadata (key TEXT PRIMARY KEY, value TEXT NOT NULL);
            CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(priority ASC, id ASC);
        `);
        this.stmts = {
            push: this.db.prepare('INSERT INTO queue (url, depth, priority, discovered_from) VALUES (?, ?, ?, ?)'),
            pop: this.db.prepare('SELECT id, url, depth, priority, discovered_from FROM queue ORDER BY priority ASC NULLS LAST, id ASC LIMIT 1'),
            deletePop: this.db.prepare('DELETE FROM queue WHERE id = ?'),
            hasVisited: this.db.prepare('SELECT 1 FROM visited WHERE url = ?'),
            markVisited: this.db.prepare('INSERT OR IGNORE INTO visited (url) VALUES (?)'),
            size: this.db.prepare('SELECT COUNT(*) AS cnt FROM queue'),
            clearQueue: this.db.prepare('DELETE FROM queue'),
            clearVisited: this.db.prepare('DELETE FROM visited'),
            clearMetadata: this.db.prepare('DELETE FROM crawl_metadata'),
            allVisited: this.db.prepare('SELECT url FROM visited'),
            saveMeta: this.db.prepare('INSERT OR REPLACE INTO crawl_metadata (key, value) VALUES (?, ?)'),
            getMeta: this.db.prepare('SELECT value FROM crawl_metadata WHERE key = ?'),
            allMeta: this.db.prepare('SELECT key, value FROM crawl_metadata'),
        };
    }
    /** Return the underlying `better-sqlite3` database handle. */
    getDb() {
        return this.db;
    }
    /**
     * Append one item to the queue.
     *
     * @param item `{ url, depth, priority?, discoveredFrom? }`; missing
     *             optional fields are stored as SQL NULL.
     */
    async push(item) {
        this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
    }
    /**
     * Append several items in one transaction (all or nothing).
     *
     * @param items Items to enqueue; an empty array is a no-op.
     */
    async pushBatch(items) {
        if (items.length === 0)
            return;
        const insertAll = this.db.transaction((rows) => {
            for (const item of rows) {
                this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
            }
        });
        insertAll(items);
    }
    /**
     * Remove and return the next item: lowest priority first (NULL priority
     * last), then insertion order.
     *
     * @returns The next item, or `null` when the queue is empty.
     */
    async pop() {
        // Select and delete under one write transaction so another connection
        // to the same database file cannot claim the same row in between.
        const takeNext = this.db.transaction(() => {
            const next = this.stmts.pop.get();
            if (next)
                this.stmts.deletePop.run(next.id);
            return next;
        });
        const row = takeNext.immediate();
        if (!row)
            return null;
        return {
            url: row.url,
            depth: row.depth,
            priority: row.priority ?? undefined,
            discoveredFrom: row.discovered_from ?? undefined,
        };
    }
    /** Whether `url` is present in the `visited` table. */
    async hasVisited(url) {
        return this.stmts.hasVisited.get(url) !== undefined;
    }
    /**
     * Check many URLs at once.
     *
     * @param urls URLs to look up.
     * @returns The subset of `urls` that is already marked visited.
     */
    async hasVisitedBatch(urls) {
        const result = new Set();
        for (const url of urls) {
            if (this.stmts.hasVisited.get(url) !== undefined) {
                result.add(url);
            }
        }
        return result;
    }
    /** Record `url` as visited (`INSERT OR IGNORE`, so repeats are harmless). */
    async markVisited(url) {
        this.stmts.markVisited.run(url);
    }
    /** Number of rows currently in the `queue` table. */
    async size() {
        const row = this.stmts.size.get();
        return row.cnt;
    }
    /** Delete all queued items, visited URLs and metadata. */
    async clear() {
        // One transaction: a failure part-way must not leave, for example, an
        // emptied queue next to a still-populated visited table.
        const wipe = this.db.transaction(() => {
            this.stmts.clearQueue.run();
            this.stmts.clearVisited.run();
            this.stmts.clearMetadata.run();
        });
        wipe();
    }
    /** Close the database connection. */
    async close() {
        this.db.close();
    }
    /** Load every visited URL into a new `Set`. */
    getVisitedSet() {
        const rows = this.stmts.allVisited.all();
        return new Set(rows.map((r) => r.url));
    }
    /** Store `value` under `key`, replacing any existing entry. */
    saveMetadata(key, value) {
        this.stmts.saveMeta.run(key, value);
    }
    /** Read the value stored under `key`; `undefined` when there is none. */
    getMetadata(key) {
        const row = this.stmts.getMeta.get(key);
        return row?.value;
    }
    /** Return all metadata rows as a plain `{ key: value }` object. */
    getAllMetadata() {
        const rows = this.stmts.allMeta.all();
        const result = {};
        for (const row of rows) {
            result[row.key] = row.value;
        }
        return result;
    }
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type { CrawlStorageAdapter } from './crawl-storage.js';
|
|
2
|
+
import type { SpiderPageResult } from './spider.js';
|
|
3
|
+
/**
 * SQLite-backed implementation of {@link CrawlStorageAdapter} for crawl
 * results and errors.
 *
 * The constructor is private; obtain instances via {@link SqliteCrawlStorage.create}.
 */
export declare class SqliteCrawlStorage implements CrawlStorageAdapter {
    private db;
    private ownsDb;
    private stmts;
    private constructor();
    /**
     * Create a storage instance.
     *
     * @param opts.dbPath Database file path.
     * @param opts.db     Existing database handle to use.
     *                    NOTE(review): presumably an externally supplied handle
     *                    is not closed by this class (see `ownsDb`) — confirm
     *                    against the implementation.
     */
    static create(opts?: { dbPath?: string; db?: any }): Promise<SqliteCrawlStorage>;
    private init;
    /** Persist one page result. */
    saveResult(result: SpiderPageResult): Promise<void>;
    /** Persist one crawl error for `url`. */
    saveError(error: { url: string; error: string }): Promise<void>;
    /** Number of stored page results. */
    getResultCount(): Promise<number>;
    /** Load all stored page results. */
    getResults(): Promise<SpiderPageResult[]>;
    /** Load all stored errors. */
    getErrors(): Promise<{ url: string; error: string }[]>;
    /** Remove stored data. */
    clear(): Promise<void>;
    /** Release the storage's resources. */
    close(): Promise<void>;
}
|