recker 1.0.93 → 1.0.94-next.fcefd58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/index.d.ts +3 -1
- package/dist/browser/scrape/index.js +2 -0
- package/dist/browser/scrape/spider.d.ts +35 -2
- package/dist/browser/scrape/spider.js +146 -44
- package/dist/browser/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/browser/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/browser/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/browser/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -0
- package/dist/browser/seo/seo-spider.js +17 -3
- package/dist/cli/commands/spider-runner.js +2 -1
- package/dist/cli/index.js +0 -3
- package/dist/cli/tui/jobs/spider-job.js +2 -1
- package/dist/mcp/prompts/index.js +15 -6
- package/dist/scrape/index.d.ts +3 -1
- package/dist/scrape/index.js +2 -0
- package/dist/scrape/spider.d.ts +35 -2
- package/dist/scrape/spider.js +146 -44
- package/dist/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/seo/seo-spider.d.ts +2 -0
- package/dist/seo/seo-spider.js +17 -3
- package/dist/version.js +1 -1
- package/package.json +7 -1
|
@@ -3,10 +3,12 @@ export type { Options as ParserOptions } from './parser/index.js';
|
|
|
3
3
|
export { ScrapeDocument } from './document.js';
|
|
4
4
|
export { ScrapeElement } from './element.js';
|
|
5
5
|
export { Spider, spider } from './spider.js';
|
|
6
|
-
export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
|
|
6
|
+
export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
|
|
7
7
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
8
|
+
export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
|
|
8
9
|
export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
9
10
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
11
|
+
export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
|
|
10
12
|
export type { CrawlStorageAdapter } from './crawl-storage.js';
|
|
11
13
|
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
12
14
|
export type { ProxyAdapter } from './proxy-adapter.js';
|
|
@@ -3,6 +3,8 @@ export { ScrapeDocument } from './document.js';
|
|
|
3
3
|
export { ScrapeElement } from './element.js';
|
|
4
4
|
export { Spider, spider } from './spider.js';
|
|
5
5
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
6
|
+
export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
|
|
6
7
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
8
|
+
export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
|
|
7
9
|
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
8
10
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { ScrapeDocument } from './document.js';
|
|
1
2
|
import type { ExtractedLink, ExtractionSchema } from './types.js';
|
|
2
3
|
import type { Options as ParserOptions } from './parser/index.js';
|
|
3
4
|
import { type SitemapUrl } from '../seo/validators/sitemap.js';
|
|
@@ -31,8 +32,7 @@ export interface SpiderOptions {
|
|
|
31
32
|
proxy?: string | string[] | ProxyAdapter;
|
|
32
33
|
transport?: SpiderTransport;
|
|
33
34
|
preferCurlFirst?: boolean;
|
|
34
|
-
onPage?: (
|
|
35
|
-
onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
|
|
35
|
+
onPage?: (event: SpiderPageEvent) => void | Promise<void>;
|
|
36
36
|
onCaptchaDetected?: (result: {
|
|
37
37
|
url: string;
|
|
38
38
|
status: number;
|
|
@@ -40,9 +40,31 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
+
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
+
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
+
onRetry?: (info: {
|
|
46
|
+
url: string;
|
|
47
|
+
attempt: number;
|
|
48
|
+
maxAttempts: number;
|
|
49
|
+
reason?: string;
|
|
50
|
+
delay: number;
|
|
51
|
+
transport: SpiderTransport;
|
|
52
|
+
previousStatus: number;
|
|
53
|
+
timings?: SpiderPageResult['timings'];
|
|
54
|
+
}) => void | Promise<void>;
|
|
55
|
+
onRedirect?: (info: {
|
|
56
|
+
from: string;
|
|
57
|
+
to: string;
|
|
58
|
+
status: number;
|
|
59
|
+
}) => void | Promise<void>;
|
|
43
60
|
onProgress?: (progress: SpiderProgress) => void;
|
|
44
61
|
extract?: string[] | ExtractionSchema;
|
|
45
62
|
parserOptions?: Partial<ParserOptions>;
|
|
63
|
+
domainRateLimit?: {
|
|
64
|
+
maxPerSecond?: number;
|
|
65
|
+
};
|
|
66
|
+
deduplicateContent?: boolean;
|
|
67
|
+
resume?: boolean;
|
|
46
68
|
crawlQueue?: CrawlQueueAdapter;
|
|
47
69
|
crawlStorage?: CrawlStorageAdapter;
|
|
48
70
|
}
|
|
@@ -102,6 +124,14 @@ export interface SpiderPageResult {
|
|
|
102
124
|
stylesheets: number;
|
|
103
125
|
};
|
|
104
126
|
extracted?: Record<string, unknown>;
|
|
127
|
+
contentHash?: string;
|
|
128
|
+
isDuplicate?: boolean;
|
|
129
|
+
duplicateOf?: string;
|
|
130
|
+
}
|
|
131
|
+
/**
 * Payload handed to `SpiderOptions.onPage` for every page the spider records.
 */
export interface SpiderPageEvent {
    /** The stored crawl result for the page (successful, non-HTML, or error). */
    result: SpiderPageResult;
    /**
     * Raw response body, when one is available. The spider omits it for
     * error results and when the body is empty.
     */
    html?: string;
    /**
     * Lazy accessor for the parsed document. Only provided when `html` is
     * present; resolves to a `ScrapeDocument` built from that body.
     */
    document?: () => Promise<ScrapeDocument>;
}
|
|
106
136
|
export interface SpiderProgress {
|
|
107
137
|
crawled: number;
|
|
@@ -163,6 +193,8 @@ export declare class Spider {
|
|
|
163
193
|
private running;
|
|
164
194
|
private aborted;
|
|
165
195
|
private pendingCount;
|
|
196
|
+
private domainRequestTimestamps;
|
|
197
|
+
private contentHashes;
|
|
166
198
|
private blockedDomains;
|
|
167
199
|
private curlTransport;
|
|
168
200
|
private curlAvailable;
|
|
@@ -172,6 +204,7 @@ export declare class Spider {
|
|
|
172
204
|
private robotsData;
|
|
173
205
|
private sitemapValidation;
|
|
174
206
|
private robotsValidation;
|
|
207
|
+
private waitForDomainRateLimit;
|
|
175
208
|
private toHeaderRecord;
|
|
176
209
|
constructor(options?: SpiderOptions);
|
|
177
210
|
crawl(startUrl: string): Promise<SpiderResult>;
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
1
2
|
import { performance } from 'node:perf_hooks';
|
|
2
3
|
import { createClient } from '../core/client.js';
|
|
3
4
|
import { ScrapeDocument } from './document.js';
|
|
@@ -189,6 +190,8 @@ export class Spider {
|
|
|
189
190
|
running = false;
|
|
190
191
|
aborted = false;
|
|
191
192
|
pendingCount = 0;
|
|
193
|
+
domainRequestTimestamps = new Map();
|
|
194
|
+
contentHashes = new Map();
|
|
192
195
|
blockedDomains = new Set();
|
|
193
196
|
curlTransport = null;
|
|
194
197
|
curlAvailable = false;
|
|
@@ -198,6 +201,31 @@ export class Spider {
|
|
|
198
201
|
robotsData = null;
|
|
199
202
|
sitemapValidation = null;
|
|
200
203
|
robotsValidation = null;
|
|
204
|
+
async waitForDomainRateLimit(hostname) {
|
|
205
|
+
const limit = this.options.domainRateLimit?.maxPerSecond;
|
|
206
|
+
if (!limit || limit <= 0)
|
|
207
|
+
return;
|
|
208
|
+
const now = Date.now();
|
|
209
|
+
const window = 1000;
|
|
210
|
+
let timestamps = this.domainRequestTimestamps.get(hostname);
|
|
211
|
+
if (!timestamps) {
|
|
212
|
+
timestamps = [];
|
|
213
|
+
this.domainRequestTimestamps.set(hostname, timestamps);
|
|
214
|
+
}
|
|
215
|
+
while (timestamps.length > 0 && timestamps[0] <= now - window) {
|
|
216
|
+
timestamps.shift();
|
|
217
|
+
}
|
|
218
|
+
if (timestamps.length >= limit) {
|
|
219
|
+
const waitMs = timestamps[0] + window - now;
|
|
220
|
+
if (waitMs > 0)
|
|
221
|
+
await sleep(waitMs);
|
|
222
|
+
const afterWait = Date.now();
|
|
223
|
+
while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
|
|
224
|
+
timestamps.shift();
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
timestamps.push(Date.now());
|
|
228
|
+
}
|
|
201
229
|
toHeaderRecord(headers) {
|
|
202
230
|
const headerRecord = {};
|
|
203
231
|
headers.forEach((value, key) => {
|
|
@@ -239,11 +267,17 @@ export class Spider {
|
|
|
239
267
|
exclude: options.exclude,
|
|
240
268
|
include: options.include,
|
|
241
269
|
onPage: options.onPage,
|
|
242
|
-
onPageWithHtml: options.onPageWithHtml,
|
|
243
270
|
onCaptchaDetected: options.onCaptchaDetected,
|
|
271
|
+
onBlocked: options.onBlocked,
|
|
272
|
+
onError: options.onError,
|
|
273
|
+
onRetry: options.onRetry,
|
|
274
|
+
onRedirect: options.onRedirect,
|
|
244
275
|
onProgress: options.onProgress,
|
|
245
276
|
extract: extractSchema,
|
|
246
277
|
parserOptions: options.parserOptions,
|
|
278
|
+
domainRateLimit: options.domainRateLimit,
|
|
279
|
+
deduplicateContent: options.deduplicateContent ?? false,
|
|
280
|
+
resume: options.resume ?? false,
|
|
247
281
|
};
|
|
248
282
|
if (options.proxy) {
|
|
249
283
|
if (typeof options.proxy === 'string') {
|
|
@@ -282,11 +316,19 @@ export class Spider {
|
|
|
282
316
|
const normalizedStart = normalizeUrl(startUrl);
|
|
283
317
|
const baseUrl = new URL(normalizedStart).origin;
|
|
284
318
|
this.baseHost = new URL(normalizedStart).hostname;
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
319
|
+
if (!this.options.resume) {
|
|
320
|
+
await this.crawlQueue.clear();
|
|
321
|
+
await this.crawlStorage.clear();
|
|
322
|
+
this._visitedCount = 0;
|
|
323
|
+
this._queueSize = 0;
|
|
324
|
+
this._resultCount = 0;
|
|
325
|
+
this.domainRequestTimestamps.clear();
|
|
326
|
+
this.contentHashes.clear();
|
|
327
|
+
}
|
|
328
|
+
else {
|
|
329
|
+
this._queueSize = await this.crawlQueue.size();
|
|
330
|
+
this._resultCount = await this.crawlStorage.getResultCount();
|
|
331
|
+
}
|
|
290
332
|
this.running = true;
|
|
291
333
|
this.aborted = false;
|
|
292
334
|
this.pendingCount = 0;
|
|
@@ -341,16 +383,18 @@ export class Spider {
|
|
|
341
383
|
});
|
|
342
384
|
pending.set(normalized, promise);
|
|
343
385
|
};
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
386
|
+
if (!this.options.resume) {
|
|
387
|
+
await scheduleUrl({ url: normalizedStart, depth: 0 });
|
|
388
|
+
if (this.options.useSitemap && this.sitemapUrls.length > 0) {
|
|
389
|
+
for (const sitemapUrl of this.sitemapUrls) {
|
|
390
|
+
try {
|
|
391
|
+
const urlHost = new URL(sitemapUrl.loc).hostname;
|
|
392
|
+
if (urlHost === this.baseHost) {
|
|
393
|
+
await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
catch {
|
|
351
397
|
}
|
|
352
|
-
}
|
|
353
|
-
catch {
|
|
354
398
|
}
|
|
355
399
|
}
|
|
356
400
|
}
|
|
@@ -611,6 +655,9 @@ export class Spider {
|
|
|
611
655
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
612
656
|
const response = await clientForRequest.get(url, {
|
|
613
657
|
headers: this.buildRequestHeaders(url, false),
|
|
658
|
+
beforeRedirect: this.options.onRedirect
|
|
659
|
+
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
660
|
+
: undefined,
|
|
614
661
|
});
|
|
615
662
|
const contentType = response.headers.get('content-type') || '';
|
|
616
663
|
const shouldReadUndiciBody = !contentType ||
|
|
@@ -643,6 +690,7 @@ export class Spider {
|
|
|
643
690
|
};
|
|
644
691
|
};
|
|
645
692
|
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
693
|
+
await this.waitForDomainRateLimit(hostname);
|
|
646
694
|
await this.waitForDomainPenalty(hostname);
|
|
647
695
|
const useCurl = this.shouldUseCurlForHost(hostname, hasCurl, forcedTransport);
|
|
648
696
|
const transportForAttempt = useCurl ? 'curl' : 'undici';
|
|
@@ -732,6 +780,18 @@ export class Spider {
|
|
|
732
780
|
forcedTransport = 'curl';
|
|
733
781
|
}
|
|
734
782
|
}
|
|
783
|
+
if (this.options.onRetry) {
|
|
784
|
+
await this.options.onRetry({
|
|
785
|
+
url,
|
|
786
|
+
attempt: attempt + 1,
|
|
787
|
+
maxAttempts,
|
|
788
|
+
reason: attemptReason,
|
|
789
|
+
delay: waitMs,
|
|
790
|
+
transport: forcedTransport ?? transportForAttempt,
|
|
791
|
+
previousStatus: response.status,
|
|
792
|
+
timings,
|
|
793
|
+
});
|
|
794
|
+
}
|
|
735
795
|
await sleep(waitMs);
|
|
736
796
|
continue;
|
|
737
797
|
}
|
|
@@ -867,7 +927,21 @@ export class Spider {
|
|
|
867
927
|
};
|
|
868
928
|
await this.crawlStorage.saveResult(nonHtmlResult);
|
|
869
929
|
this._resultCount++;
|
|
870
|
-
this.options.onPage
|
|
930
|
+
if (this.options.onPage) {
|
|
931
|
+
let cachedDoc = null;
|
|
932
|
+
await this.options.onPage({
|
|
933
|
+
result: nonHtmlResult,
|
|
934
|
+
html: html || undefined,
|
|
935
|
+
document: html ? () => {
|
|
936
|
+
if (cachedDoc)
|
|
937
|
+
return Promise.resolve(cachedDoc);
|
|
938
|
+
return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
|
|
939
|
+
} : undefined,
|
|
940
|
+
});
|
|
941
|
+
}
|
|
942
|
+
if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
|
|
943
|
+
await this.options.onBlocked(nonHtmlResult);
|
|
944
|
+
}
|
|
871
945
|
return;
|
|
872
946
|
}
|
|
873
947
|
const doc = await ScrapeDocument.create(html, {
|
|
@@ -898,6 +972,21 @@ export class Spider {
|
|
|
898
972
|
catch {
|
|
899
973
|
}
|
|
900
974
|
}
|
|
975
|
+
let isDuplicate = false;
|
|
976
|
+
let duplicateOf;
|
|
977
|
+
let contentHash;
|
|
978
|
+
if (this.options.deduplicateContent) {
|
|
979
|
+
const bodyText = doc.text('body');
|
|
980
|
+
contentHash = createHash('md5').update(bodyText).digest('hex');
|
|
981
|
+
const existingUrl = this.contentHashes.get(contentHash);
|
|
982
|
+
if (existingUrl) {
|
|
983
|
+
isDuplicate = true;
|
|
984
|
+
duplicateOf = existingUrl;
|
|
985
|
+
}
|
|
986
|
+
else {
|
|
987
|
+
this.contentHashes.set(contentHash, item.url);
|
|
988
|
+
}
|
|
989
|
+
}
|
|
901
990
|
const result = {
|
|
902
991
|
url: item.url,
|
|
903
992
|
status,
|
|
@@ -926,38 +1015,46 @@ export class Spider {
|
|
|
926
1015
|
timings,
|
|
927
1016
|
fetchedAt,
|
|
928
1017
|
extracted,
|
|
1018
|
+
contentHash,
|
|
1019
|
+
isDuplicate: isDuplicate || undefined,
|
|
1020
|
+
duplicateOf,
|
|
929
1021
|
};
|
|
930
1022
|
await this.crawlStorage.saveResult(result);
|
|
931
1023
|
this._resultCount++;
|
|
932
|
-
this.options.onPage
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
for (const link of links) {
|
|
939
|
-
if (!link.href)
|
|
940
|
-
continue;
|
|
941
|
-
const normalized = normalizeUrl(link.href);
|
|
942
|
-
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
943
|
-
continue;
|
|
944
|
-
candidateUrls.push(normalized);
|
|
945
|
-
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
1024
|
+
if (this.options.onPage) {
|
|
1025
|
+
await this.options.onPage({
|
|
1026
|
+
result,
|
|
1027
|
+
html,
|
|
1028
|
+
document: () => Promise.resolve(doc),
|
|
1029
|
+
});
|
|
946
1030
|
}
|
|
947
|
-
if (
|
|
948
|
-
const
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
1031
|
+
if (!isDuplicate) {
|
|
1032
|
+
const candidates = [];
|
|
1033
|
+
const candidateUrls = [];
|
|
1034
|
+
for (const link of links) {
|
|
1035
|
+
if (!link.href)
|
|
1036
|
+
continue;
|
|
1037
|
+
const normalized = normalizeUrl(link.href);
|
|
1038
|
+
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
1039
|
+
continue;
|
|
1040
|
+
candidateUrls.push(normalized);
|
|
1041
|
+
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
1042
|
+
}
|
|
1043
|
+
if (candidates.length > 0) {
|
|
1044
|
+
const visitedSet = this.crawlQueue.hasVisitedBatch
|
|
1045
|
+
? await this.crawlQueue.hasVisitedBatch(candidateUrls)
|
|
1046
|
+
: new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
|
|
1047
|
+
const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
|
|
1048
|
+
if (newItems.length > 0) {
|
|
1049
|
+
if (this.crawlQueue.pushBatch) {
|
|
1050
|
+
await this.crawlQueue.pushBatch(newItems);
|
|
1051
|
+
}
|
|
1052
|
+
else {
|
|
1053
|
+
for (const newItem of newItems)
|
|
1054
|
+
await this.crawlQueue.push(newItem);
|
|
1055
|
+
}
|
|
1056
|
+
this._queueSize += newItems.length;
|
|
959
1057
|
}
|
|
960
|
-
this._queueSize += newItems.length;
|
|
961
1058
|
}
|
|
962
1059
|
}
|
|
963
1060
|
}
|
|
@@ -1026,7 +1123,12 @@ export class Spider {
|
|
|
1026
1123
|
await this.crawlStorage.saveResult(errorResult);
|
|
1027
1124
|
this._resultCount++;
|
|
1028
1125
|
await this.crawlStorage.saveError({ url: item.url, error: message });
|
|
1029
|
-
this.options.onPage
|
|
1126
|
+
if (this.options.onPage) {
|
|
1127
|
+
await this.options.onPage({ result: errorResult });
|
|
1128
|
+
}
|
|
1129
|
+
if (this.options.onError) {
|
|
1130
|
+
await this.options.onError(errorResult);
|
|
1131
|
+
}
|
|
1030
1132
|
}
|
|
1031
1133
|
}
|
|
1032
1134
|
getOrCreateDomainState(hostname) {
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
2
|
+
/**
 * Crawl queue persisted in a SQLite database (via `better-sqlite3`).
 *
 * Besides the pending-URL queue it also stores the visited-URL set and a
 * small key/value metadata table in the same database.
 */
export declare class SqliteCrawlQueue implements CrawlQueueAdapter {
    private db;
    private stmts;
    private constructor();
    /**
     * Open the queue database and prepare its schema.
     *
     * @param opts.dbPath Database file to use. When omitted, a randomly named
     *   file in the OS temp directory is used.
     */
    static create(opts?: { dbPath?: string }): Promise<SqliteCrawlQueue>;
    private ensureDb;
    /** Underlying database handle opened by `create()`. */
    getDb(): any;
    /** Append one item to the queue. */
    push(item: CrawlQueueItem): Promise<void>;
    /** Append several items inside a single transaction. */
    pushBatch(items: CrawlQueueItem[]): Promise<void>;
    /**
     * Remove and return the next item: lowest `priority` first (items without
     * a priority come last), ties broken by insertion order. Resolves to
     * `null` when the queue is empty.
     */
    pop(): Promise<CrawlQueueItem | null>;
    /** Whether `url` has been marked as visited. */
    hasVisited(url: string): Promise<boolean>;
    /** Subset of `urls` that have been marked as visited. */
    hasVisitedBatch(urls: string[]): Promise<Set<string>>;
    /** Mark `url` as visited; marking the same URL twice is a no-op. */
    markVisited(url: string): Promise<void>;
    /** Number of items currently waiting in the queue. */
    size(): Promise<number>;
    /** Delete all queued items, visited URLs and metadata. */
    clear(): Promise<void>;
    /** Close the database handle. */
    close(): Promise<void>;
    /** Load every visited URL into a new `Set`. */
    getVisitedSet(): Set<string>;
    /** Store `value` under `key`, replacing any previous value. */
    saveMetadata(key: string, value: string): void;
    /** Value stored under `key`, or `undefined` when absent. */
    getMetadata(key: string): string | undefined;
    /** All stored metadata as a plain key/value object. */
    getAllMetadata(): Record<string, string>;
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import * as path from 'node:path';
|
|
2
|
+
import * as os from 'node:os';
|
|
3
|
+
import * as crypto from 'node:crypto';
|
|
4
|
+
/**
 * Crawl queue persisted in SQLite through `better-sqlite3`.
 *
 * Three tables live in the same database: `queue` (pending items),
 * `visited` (URLs already seen) and `crawl_metadata` (key/value pairs).
 * Instances are created with the async `create()` factory because the
 * `better-sqlite3` module is loaded with a dynamic import.
 */
export class SqliteCrawlQueue {
    db;
    stmts;
    constructor() { }
    /**
     * Open the database and prepare schema and statements.
     *
     * @param opts Optional settings; `opts.dbPath` selects the database file.
     *   When omitted, a randomly named file in the OS temp directory is used.
     * @returns A ready-to-use queue.
     */
    static async create(opts) {
        const instance = new SqliteCrawlQueue();
        const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-crawl-${crypto.randomUUID().slice(0, 8)}.db`);
        await instance.ensureDb(dbPath);
        return instance;
    }
    /**
     * Load `better-sqlite3`, open `dbPath`, create the schema if needed and
     * prepare every statement used by the other methods.
     */
    async ensureDb(dbPath) {
        const BetterSqlite3 = (await import('better-sqlite3')).default;
        const db = new BetterSqlite3(dbPath);
        try {
            db.pragma('journal_mode = WAL');
            db.exec(`
CREATE TABLE IF NOT EXISTS queue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL,
depth INTEGER NOT NULL,
priority INTEGER,
discovered_from TEXT
);
CREATE TABLE IF NOT EXISTS visited (url TEXT PRIMARY KEY);
CREATE TABLE IF NOT EXISTS crawl_metadata (key TEXT PRIMARY KEY, value TEXT NOT NULL);
CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(priority ASC, id ASC);
`);
            const push = db.prepare('INSERT INTO queue (url, depth, priority, discovered_from) VALUES (?, ?, ?, ?)');
            const clearQueue = db.prepare('DELETE FROM queue');
            const clearVisited = db.prepare('DELETE FROM visited');
            const clearMetadata = db.prepare('DELETE FROM crawl_metadata');
            this.stmts = {
                push,
                pop: db.prepare('SELECT id, url, depth, priority, discovered_from FROM queue ORDER BY priority ASC NULLS LAST, id ASC LIMIT 1'),
                deletePop: db.prepare('DELETE FROM queue WHERE id = ?'),
                hasVisited: db.prepare('SELECT 1 FROM visited WHERE url = ?'),
                markVisited: db.prepare('INSERT OR IGNORE INTO visited (url) VALUES (?)'),
                size: db.prepare('SELECT COUNT(*) AS cnt FROM queue'),
                clearQueue,
                clearVisited,
                clearMetadata,
                allVisited: db.prepare('SELECT url FROM visited'),
                saveMeta: db.prepare('INSERT OR REPLACE INTO crawl_metadata (key, value) VALUES (?, ?)'),
                getMeta: db.prepare('SELECT value FROM crawl_metadata WHERE key = ?'),
                allMeta: db.prepare('SELECT key, value FROM crawl_metadata'),
                // Transaction wrappers are built once here instead of on every call.
                pushMany: db.transaction((rows) => {
                    for (const item of rows) {
                        push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
                    }
                }),
                // All three tables are emptied together or not at all.
                clearAll: db.transaction(() => {
                    clearQueue.run();
                    clearVisited.run();
                    clearMetadata.run();
                }),
            };
            this.db = db;
        }
        catch (err) {
            // Do not leak the open handle when schema setup fails.
            db.close();
            throw err;
        }
    }
    /** Return the underlying database handle. */
    getDb() {
        return this.db;
    }
    /** Append one item to the queue. */
    async push(item) {
        this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
    }
    /** Append several items inside a single transaction. */
    async pushBatch(items) {
        if (items.length === 0)
            return;
        this.stmts.pushMany(items);
    }
    /**
     * Remove and return the next item, or `null` when the queue is empty.
     * Ordering is decided by the `pop` statement: ascending priority with
     * NULL priorities last, then insertion order.
     */
    async pop() {
        const row = this.stmts.pop.get();
        if (!row)
            return null;
        this.stmts.deletePop.run(row.id);
        return {
            url: row.url,
            depth: row.depth,
            // NULL columns are mapped back to absent (undefined) fields.
            priority: row.priority ?? undefined,
            discoveredFrom: row.discovered_from ?? undefined,
        };
    }
    /** Whether `url` is present in the visited table. */
    async hasVisited(url) {
        return this.stmts.hasVisited.get(url) !== undefined;
    }
    /** Return the subset of `urls` present in the visited table. */
    async hasVisitedBatch(urls) {
        const result = new Set();
        for (const url of urls) {
            if (this.stmts.hasVisited.get(url) !== undefined) {
                result.add(url);
            }
        }
        return result;
    }
    /** Record `url` as visited; duplicates are ignored. */
    async markVisited(url) {
        this.stmts.markVisited.run(url);
    }
    /** Number of items currently in the queue table. */
    async size() {
        const row = this.stmts.size.get();
        return row.cnt;
    }
    /** Atomically delete all queued items, visited URLs and metadata. */
    async clear() {
        this.stmts.clearAll();
    }
    /** Close the database handle. */
    async close() {
        this.db.close();
    }
    /** Load every visited URL into a new `Set`. */
    getVisitedSet() {
        const rows = this.stmts.allVisited.all();
        return new Set(rows.map((r) => r.url));
    }
    /** Store `value` under `key`, replacing any existing value. */
    saveMetadata(key, value) {
        this.stmts.saveMeta.run(key, value);
    }
    /** Value stored under `key`, or `undefined` when there is none. */
    getMetadata(key) {
        const row = this.stmts.getMeta.get(key);
        return row?.value;
    }
    /** All metadata rows as a plain key/value object. */
    getAllMetadata() {
        const rows = this.stmts.allMeta.all();
        // fromEntries defines own properties, so a key such as "__proto__" is
        // kept as data instead of being swallowed by the prototype setter.
        return Object.fromEntries(rows.map((row) => [row.key, row.value]));
    }
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type { CrawlStorageAdapter } from './crawl-storage.js';
|
|
2
|
+
import type { SpiderPageResult } from './spider.js';
|
|
3
|
+
/**
 * Crawl result/error storage persisted in a SQLite database
 * (via `better-sqlite3`).
 */
export declare class SqliteCrawlStorage implements CrawlStorageAdapter {
    private db;
    private ownsDb;
    private stmts;
    private constructor();
    /**
     * Create the storage and its tables.
     *
     * @param opts.db Existing database handle to reuse. When given, the
     *   storage does not close it in `close()`.
     * @param opts.dbPath Database file to open when no `db` is given. When
     *   both are omitted, a randomly named file in the OS temp directory is
     *   used.
     */
    static create(opts?: { dbPath?: string; db?: any }): Promise<SqliteCrawlStorage>;
    private init;
    /** Persist one page result (stored as JSON alongside its URL and status). */
    saveResult(result: SpiderPageResult): Promise<void>;
    /** Persist one crawl error. */
    saveError(error: { url: string; error: string }): Promise<void>;
    /** Number of stored page results. */
    getResultCount(): Promise<number>;
    /** All stored page results, parsed back from JSON. */
    getResults(): Promise<SpiderPageResult[]>;
    /** All stored crawl errors. */
    getErrors(): Promise<Array<{ url: string; error: string }>>;
    /** Delete all stored results and errors. */
    clear(): Promise<void>;
    /** Close the database handle if this storage opened it. */
    close(): Promise<void>;
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import * as path from 'node:path';
|
|
2
|
+
import * as os from 'node:os';
|
|
3
|
+
import * as crypto from 'node:crypto';
|
|
4
|
+
/**
 * Crawl result/error storage persisted in SQLite through `better-sqlite3`.
 *
 * Page results are stored as JSON text in the `results` table; crawl errors
 * go to the `errors` table. The storage can either open its own database or
 * reuse a handle supplied by the caller.
 */
export class SqliteCrawlStorage {
    db;
    ownsDb;
    stmts;
    /**
     * @param db Open database handle.
     * @param ownsDb Whether `close()` should close `db`.
     */
    constructor(db, ownsDb) {
        this.db = db;
        this.ownsDb = ownsDb;
    }
    /**
     * Create the storage and its tables.
     *
     * @param opts Optional settings. `opts.db` reuses an existing handle (it is
     *   then left open by `close()`); otherwise `opts.dbPath`, or a randomly
     *   named file in the OS temp directory, is opened in WAL mode.
     * @returns A ready-to-use storage.
     */
    static async create(opts) {
        let db = opts?.db;
        let ownsDb = false;
        if (!db) {
            const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-storage-${crypto.randomUUID().slice(0, 8)}.db`);
            const BetterSqlite3 = (await import('better-sqlite3')).default;
            db = new BetterSqlite3(dbPath);
            ownsDb = true;
        }
        try {
            if (ownsDb) {
                db.pragma('journal_mode = WAL');
            }
            const instance = new SqliteCrawlStorage(db, ownsDb);
            instance.init();
            return instance;
        }
        catch (err) {
            // Only close handles opened here; a caller-supplied handle stays open.
            if (ownsDb) {
                db.close();
            }
            throw err;
        }
    }
    /** Create the tables if needed and prepare the statements. */
    init() {
        this.db.exec(`
CREATE TABLE IF NOT EXISTS results (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL,
status INTEGER NOT NULL,
data TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS errors (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL,
error TEXT NOT NULL
);
`);
        const clearResults = this.db.prepare('DELETE FROM results');
        const clearErrors = this.db.prepare('DELETE FROM errors');
        this.stmts = {
            saveResult: this.db.prepare('INSERT INTO results (url, status, data) VALUES (?, ?, ?)'),
            saveError: this.db.prepare('INSERT INTO errors (url, error) VALUES (?, ?)'),
            resultCount: this.db.prepare('SELECT COUNT(*) AS cnt FROM results'),
            allResults: this.db.prepare('SELECT data FROM results'),
            allErrors: this.db.prepare('SELECT url, error FROM errors'),
            clearResults,
            clearErrors,
            // Results and errors are emptied together or not at all.
            clearAll: this.db.transaction(() => {
                clearResults.run();
                clearErrors.run();
            }),
        };
    }
    /** Persist one page result; the whole object is stored as JSON. */
    async saveResult(result) {
        this.stmts.saveResult.run(result.url, result.status, JSON.stringify(result));
    }
    /** Persist one crawl error. */
    async saveError(error) {
        this.stmts.saveError.run(error.url, error.error);
    }
    /** Number of rows in the results table. */
    async getResultCount() {
        const row = this.stmts.resultCount.get();
        return row.cnt;
    }
    /** All stored page results, parsed back from their JSON form. */
    async getResults() {
        const rows = this.stmts.allResults.all();
        return rows.map((r) => JSON.parse(r.data));
    }
    /** All stored crawl errors as `{ url, error }` rows. */
    async getErrors() {
        return this.stmts.allErrors.all();
    }
    /** Atomically delete all stored results and errors. */
    async clear() {
        this.stmts.clearAll();
    }
    /** Close the database handle, but only if this storage opened it. */
    async close() {
        if (this.ownsDb) {
            this.db.close();
        }
    }
}
|
|
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
|
|
|
5
5
|
seo?: boolean;
|
|
6
6
|
output?: string;
|
|
7
7
|
onSeoAnalysis?: (result: SeoPageResult) => void;
|
|
8
|
+
onBlocked?: (result: SeoPageResult) => void | Promise<void>;
|
|
9
|
+
onError?: (result: SeoPageResult) => void | Promise<void>;
|
|
8
10
|
focusCategories?: string[];
|
|
9
11
|
focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
|
|
10
12
|
}
|
|
@@ -32,11 +32,25 @@ export class SeoSpider {
|
|
|
32
32
|
}
|
|
33
33
|
constructor(options = {}) {
|
|
34
34
|
this.options = options;
|
|
35
|
+
const userOnPage = options.onPage;
|
|
35
36
|
this.spider = new Spider({
|
|
36
37
|
...options,
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
await this.analyzePageDuringCrawl(
|
|
38
|
+
onPage: async (event) => {
|
|
39
|
+
if (this.options.seo && event.html) {
|
|
40
|
+
await this.analyzePageDuringCrawl(event.result, event.html);
|
|
41
|
+
}
|
|
42
|
+
if (userOnPage) {
|
|
43
|
+
await userOnPage(event);
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
onBlocked: this.options.onBlocked
|
|
47
|
+
? async (pageResult) => {
|
|
48
|
+
await this.options.onBlocked({ ...pageResult });
|
|
49
|
+
}
|
|
50
|
+
: undefined,
|
|
51
|
+
onError: this.options.onError
|
|
52
|
+
? async (pageResult) => {
|
|
53
|
+
await this.options.onError({ ...pageResult });
|
|
40
54
|
}
|
|
41
55
|
: undefined,
|
|
42
56
|
});
|