recker 1.0.93 → 1.0.94-next.fcefd58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,12 @@ export type { Options as ParserOptions } from './parser/index.js';
3
3
  export { ScrapeDocument } from './document.js';
4
4
  export { ScrapeElement } from './element.js';
5
5
  export { Spider, spider } from './spider.js';
6
- export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
6
+ export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
7
7
  export { InMemoryCrawlQueue } from './crawl-queue.js';
8
+ export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
8
9
  export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
9
10
  export { InMemoryCrawlStorage } from './crawl-storage.js';
11
+ export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
10
12
  export type { CrawlStorageAdapter } from './crawl-storage.js';
11
13
  export { ListProxyAdapter } from './proxy-adapter.js';
12
14
  export type { ProxyAdapter } from './proxy-adapter.js';
@@ -3,6 +3,8 @@ export { ScrapeDocument } from './document.js';
3
3
  export { ScrapeElement } from './element.js';
4
4
  export { Spider, spider } from './spider.js';
5
5
  export { InMemoryCrawlQueue } from './crawl-queue.js';
6
+ export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
6
7
  export { InMemoryCrawlStorage } from './crawl-storage.js';
8
+ export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
7
9
  export { ListProxyAdapter } from './proxy-adapter.js';
8
10
  export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
@@ -1,3 +1,4 @@
1
+ import { ScrapeDocument } from './document.js';
1
2
  import type { ExtractedLink, ExtractionSchema } from './types.js';
2
3
  import type { Options as ParserOptions } from './parser/index.js';
3
4
  import { type SitemapUrl } from '../seo/validators/sitemap.js';
@@ -31,8 +32,7 @@ export interface SpiderOptions {
31
32
  proxy?: string | string[] | ProxyAdapter;
32
33
  transport?: SpiderTransport;
33
34
  preferCurlFirst?: boolean;
34
- onPage?: (result: SpiderPageResult) => void;
35
- onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
35
+ onPage?: (event: SpiderPageEvent) => void | Promise<void>;
36
36
  onCaptchaDetected?: (result: {
37
37
  url: string;
38
38
  status: number;
@@ -40,9 +40,31 @@ export interface SpiderOptions {
40
40
  provider?: CaptchaProvider;
41
41
  usedCurl: boolean;
42
42
  }) => void | Promise<void>;
43
+ onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
44
+ onError?: (result: SpiderPageResult) => void | Promise<void>;
45
+ onRetry?: (info: {
46
+ url: string;
47
+ attempt: number;
48
+ maxAttempts: number;
49
+ reason?: string;
50
+ delay: number;
51
+ transport: SpiderTransport;
52
+ previousStatus: number;
53
+ timings?: SpiderPageResult['timings'];
54
+ }) => void | Promise<void>;
55
+ onRedirect?: (info: {
56
+ from: string;
57
+ to: string;
58
+ status: number;
59
+ }) => void | Promise<void>;
43
60
  onProgress?: (progress: SpiderProgress) => void;
44
61
  extract?: string[] | ExtractionSchema;
45
62
  parserOptions?: Partial<ParserOptions>;
63
+ domainRateLimit?: {
64
+ maxPerSecond?: number;
65
+ };
66
+ deduplicateContent?: boolean;
67
+ resume?: boolean;
46
68
  crawlQueue?: CrawlQueueAdapter;
47
69
  crawlStorage?: CrawlStorageAdapter;
48
70
  }
@@ -102,6 +124,14 @@ export interface SpiderPageResult {
102
124
  stylesheets: number;
103
125
  };
104
126
  extracted?: Record<string, unknown>;
127
+ contentHash?: string;
128
+ isDuplicate?: boolean;
129
+ duplicateOf?: string;
130
+ }
131
/** Payload passed to the `onPage` callback of `SpiderOptions` for each processed page. */
+ export interface SpiderPageEvent {
132
/** Crawl result recorded for the page. */
+ result: SpiderPageResult;
133
/** Response body text; omitted from the event emitted for a failed fetch or when the body is empty. */
+ html?: string;
134
/** Resolves to the parsed document for the page; only supplied when `html` is present. */
+ document?: () => Promise<ScrapeDocument>;
105
135
  }
106
136
  export interface SpiderProgress {
107
137
  crawled: number;
@@ -163,6 +193,8 @@ export declare class Spider {
163
193
  private running;
164
194
  private aborted;
165
195
  private pendingCount;
196
+ private domainRequestTimestamps;
197
+ private contentHashes;
166
198
  private blockedDomains;
167
199
  private curlTransport;
168
200
  private curlAvailable;
@@ -172,6 +204,7 @@ export declare class Spider {
172
204
  private robotsData;
173
205
  private sitemapValidation;
174
206
  private robotsValidation;
207
+ private waitForDomainRateLimit;
175
208
  private toHeaderRecord;
176
209
  constructor(options?: SpiderOptions);
177
210
  crawl(startUrl: string): Promise<SpiderResult>;
@@ -1,3 +1,4 @@
1
+ import { createHash } from 'node:crypto';
1
2
  import { performance } from 'node:perf_hooks';
2
3
  import { createClient } from '../core/client.js';
3
4
  import { ScrapeDocument } from './document.js';
@@ -189,6 +190,8 @@ export class Spider {
189
190
  running = false;
190
191
  aborted = false;
191
192
  pendingCount = 0;
193
+ domainRequestTimestamps = new Map();
194
+ contentHashes = new Map();
192
195
  blockedDomains = new Set();
193
196
  curlTransport = null;
194
197
  curlAvailable = false;
@@ -198,6 +201,31 @@ export class Spider {
198
201
  robotsData = null;
199
202
  sitemapValidation = null;
200
203
  robotsValidation = null;
204
+ async waitForDomainRateLimit(hostname) {
205
+ const limit = this.options.domainRateLimit?.maxPerSecond;
206
+ if (!limit || limit <= 0)
207
+ return;
208
+ const now = Date.now();
209
+ const window = 1000;
210
+ let timestamps = this.domainRequestTimestamps.get(hostname);
211
+ if (!timestamps) {
212
+ timestamps = [];
213
+ this.domainRequestTimestamps.set(hostname, timestamps);
214
+ }
215
+ while (timestamps.length > 0 && timestamps[0] <= now - window) {
216
+ timestamps.shift();
217
+ }
218
+ if (timestamps.length >= limit) {
219
+ const waitMs = timestamps[0] + window - now;
220
+ if (waitMs > 0)
221
+ await sleep(waitMs);
222
+ const afterWait = Date.now();
223
+ while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
224
+ timestamps.shift();
225
+ }
226
+ }
227
+ timestamps.push(Date.now());
228
+ }
201
229
  toHeaderRecord(headers) {
202
230
  const headerRecord = {};
203
231
  headers.forEach((value, key) => {
@@ -239,11 +267,17 @@ export class Spider {
239
267
  exclude: options.exclude,
240
268
  include: options.include,
241
269
  onPage: options.onPage,
242
- onPageWithHtml: options.onPageWithHtml,
243
270
  onCaptchaDetected: options.onCaptchaDetected,
271
+ onBlocked: options.onBlocked,
272
+ onError: options.onError,
273
+ onRetry: options.onRetry,
274
+ onRedirect: options.onRedirect,
244
275
  onProgress: options.onProgress,
245
276
  extract: extractSchema,
246
277
  parserOptions: options.parserOptions,
278
+ domainRateLimit: options.domainRateLimit,
279
+ deduplicateContent: options.deduplicateContent ?? false,
280
+ resume: options.resume ?? false,
247
281
  };
248
282
  if (options.proxy) {
249
283
  if (typeof options.proxy === 'string') {
@@ -282,11 +316,19 @@ export class Spider {
282
316
  const normalizedStart = normalizeUrl(startUrl);
283
317
  const baseUrl = new URL(normalizedStart).origin;
284
318
  this.baseHost = new URL(normalizedStart).hostname;
285
- await this.crawlQueue.clear();
286
- await this.crawlStorage.clear();
287
- this._visitedCount = 0;
288
- this._queueSize = 0;
289
- this._resultCount = 0;
319
+ if (!this.options.resume) {
320
+ await this.crawlQueue.clear();
321
+ await this.crawlStorage.clear();
322
+ this._visitedCount = 0;
323
+ this._queueSize = 0;
324
+ this._resultCount = 0;
325
+ this.domainRequestTimestamps.clear();
326
+ this.contentHashes.clear();
327
+ }
328
+ else {
329
+ this._queueSize = await this.crawlQueue.size();
330
+ this._resultCount = await this.crawlStorage.getResultCount();
331
+ }
290
332
  this.running = true;
291
333
  this.aborted = false;
292
334
  this.pendingCount = 0;
@@ -341,16 +383,18 @@ export class Spider {
341
383
  });
342
384
  pending.set(normalized, promise);
343
385
  };
344
- await scheduleUrl({ url: normalizedStart, depth: 0 });
345
- if (this.options.useSitemap && this.sitemapUrls.length > 0) {
346
- for (const sitemapUrl of this.sitemapUrls) {
347
- try {
348
- const urlHost = new URL(sitemapUrl.loc).hostname;
349
- if (urlHost === this.baseHost) {
350
- await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
386
+ if (!this.options.resume) {
387
+ await scheduleUrl({ url: normalizedStart, depth: 0 });
388
+ if (this.options.useSitemap && this.sitemapUrls.length > 0) {
389
+ for (const sitemapUrl of this.sitemapUrls) {
390
+ try {
391
+ const urlHost = new URL(sitemapUrl.loc).hostname;
392
+ if (urlHost === this.baseHost) {
393
+ await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
394
+ }
395
+ }
396
+ catch {
351
397
  }
352
- }
353
- catch {
354
398
  }
355
399
  }
356
400
  }
@@ -611,6 +655,9 @@ export class Spider {
611
655
  const clientForRequest = this.getClientForProxy(proxyUrl);
612
656
  const response = await clientForRequest.get(url, {
613
657
  headers: this.buildRequestHeaders(url, false),
658
+ beforeRedirect: this.options.onRedirect
659
+ ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
660
+ : undefined,
614
661
  });
615
662
  const contentType = response.headers.get('content-type') || '';
616
663
  const shouldReadUndiciBody = !contentType ||
@@ -643,6 +690,7 @@ export class Spider {
643
690
  };
644
691
  };
645
692
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
693
+ await this.waitForDomainRateLimit(hostname);
646
694
  await this.waitForDomainPenalty(hostname);
647
695
  const useCurl = this.shouldUseCurlForHost(hostname, hasCurl, forcedTransport);
648
696
  const transportForAttempt = useCurl ? 'curl' : 'undici';
@@ -732,6 +780,18 @@ export class Spider {
732
780
  forcedTransport = 'curl';
733
781
  }
734
782
  }
783
+ if (this.options.onRetry) {
784
+ await this.options.onRetry({
785
+ url,
786
+ attempt: attempt + 1,
787
+ maxAttempts,
788
+ reason: attemptReason,
789
+ delay: waitMs,
790
+ transport: forcedTransport ?? transportForAttempt,
791
+ previousStatus: response.status,
792
+ timings,
793
+ });
794
+ }
735
795
  await sleep(waitMs);
736
796
  continue;
737
797
  }
@@ -867,7 +927,21 @@ export class Spider {
867
927
  };
868
928
  await this.crawlStorage.saveResult(nonHtmlResult);
869
929
  this._resultCount++;
870
- this.options.onPage?.(nonHtmlResult);
930
+ if (this.options.onPage) {
931
+ let cachedDoc = null;
932
+ await this.options.onPage({
933
+ result: nonHtmlResult,
934
+ html: html || undefined,
935
+ document: html ? () => {
936
+ if (cachedDoc)
937
+ return Promise.resolve(cachedDoc);
938
+ return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
939
+ } : undefined,
940
+ });
941
+ }
942
+ if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
943
+ await this.options.onBlocked(nonHtmlResult);
944
+ }
871
945
  return;
872
946
  }
873
947
  const doc = await ScrapeDocument.create(html, {
@@ -898,6 +972,21 @@ export class Spider {
898
972
  catch {
899
973
  }
900
974
  }
975
+ let isDuplicate = false;
976
+ let duplicateOf;
977
+ let contentHash;
978
+ if (this.options.deduplicateContent) {
979
+ const bodyText = doc.text('body');
980
+ contentHash = createHash('md5').update(bodyText).digest('hex');
981
+ const existingUrl = this.contentHashes.get(contentHash);
982
+ if (existingUrl) {
983
+ isDuplicate = true;
984
+ duplicateOf = existingUrl;
985
+ }
986
+ else {
987
+ this.contentHashes.set(contentHash, item.url);
988
+ }
989
+ }
901
990
  const result = {
902
991
  url: item.url,
903
992
  status,
@@ -926,38 +1015,46 @@ export class Spider {
926
1015
  timings,
927
1016
  fetchedAt,
928
1017
  extracted,
1018
+ contentHash,
1019
+ isDuplicate: isDuplicate || undefined,
1020
+ duplicateOf,
929
1021
  };
930
1022
  await this.crawlStorage.saveResult(result);
931
1023
  this._resultCount++;
932
- this.options.onPage?.(result);
933
- if (this.options.onPageWithHtml) {
934
- await this.options.onPageWithHtml(result, html);
935
- }
936
- const candidates = [];
937
- const candidateUrls = [];
938
- for (const link of links) {
939
- if (!link.href)
940
- continue;
941
- const normalized = normalizeUrl(link.href);
942
- if (!shouldCrawl(normalized, this.baseHost, this.options))
943
- continue;
944
- candidateUrls.push(normalized);
945
- candidates.push({ url: normalized, depth: item.depth + 1 });
1024
+ if (this.options.onPage) {
1025
+ await this.options.onPage({
1026
+ result,
1027
+ html,
1028
+ document: () => Promise.resolve(doc),
1029
+ });
946
1030
  }
947
- if (candidates.length > 0) {
948
- const visitedSet = this.crawlQueue.hasVisitedBatch
949
- ? await this.crawlQueue.hasVisitedBatch(candidateUrls)
950
- : new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
951
- const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
952
- if (newItems.length > 0) {
953
- if (this.crawlQueue.pushBatch) {
954
- await this.crawlQueue.pushBatch(newItems);
955
- }
956
- else {
957
- for (const newItem of newItems)
958
- await this.crawlQueue.push(newItem);
1031
+ if (!isDuplicate) {
1032
+ const candidates = [];
1033
+ const candidateUrls = [];
1034
+ for (const link of links) {
1035
+ if (!link.href)
1036
+ continue;
1037
+ const normalized = normalizeUrl(link.href);
1038
+ if (!shouldCrawl(normalized, this.baseHost, this.options))
1039
+ continue;
1040
+ candidateUrls.push(normalized);
1041
+ candidates.push({ url: normalized, depth: item.depth + 1 });
1042
+ }
1043
+ if (candidates.length > 0) {
1044
+ const visitedSet = this.crawlQueue.hasVisitedBatch
1045
+ ? await this.crawlQueue.hasVisitedBatch(candidateUrls)
1046
+ : new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
1047
+ const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
1048
+ if (newItems.length > 0) {
1049
+ if (this.crawlQueue.pushBatch) {
1050
+ await this.crawlQueue.pushBatch(newItems);
1051
+ }
1052
+ else {
1053
+ for (const newItem of newItems)
1054
+ await this.crawlQueue.push(newItem);
1055
+ }
1056
+ this._queueSize += newItems.length;
959
1057
  }
960
- this._queueSize += newItems.length;
961
1058
  }
962
1059
  }
963
1060
  }
@@ -1026,7 +1123,12 @@ export class Spider {
1026
1123
  await this.crawlStorage.saveResult(errorResult);
1027
1124
  this._resultCount++;
1028
1125
  await this.crawlStorage.saveError({ url: item.url, error: message });
1029
- this.options.onPage?.(errorResult);
1126
+ if (this.options.onPage) {
1127
+ await this.options.onPage({ result: errorResult });
1128
+ }
1129
+ if (this.options.onError) {
1130
+ await this.options.onError(errorResult);
1131
+ }
1030
1132
  }
1031
1133
  }
1032
1134
  getOrCreateDomainState(hostname) {
@@ -0,0 +1,24 @@
1
+ import type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
2
/** Crawl queue adapter that keeps the queue, the visited set and crawl metadata in a SQLite database. */
+ export declare class SqliteCrawlQueue implements CrawlQueueAdapter {
3
+ private db;
4
+ private stmts;
5
/** Instances are obtained through `SqliteCrawlQueue.create()`. */
+ private constructor();
6
/**
 * Opens the database and creates the tables if they do not exist yet.
 * When `dbPath` is omitted a randomly named file in the OS temp directory is used.
 */
+ static create(opts?: {
7
+ dbPath?: string;
8
+ }): Promise<SqliteCrawlQueue>;
9
+ private ensureDb;
10
/** Returns the underlying database handle. */
+ getDb(): any;
11
/** Appends one item to the queue. */
+ push(item: CrawlQueueItem): Promise<void>;
12
/** Appends all items to the queue inside a single transaction. */
+ pushBatch(items: CrawlQueueItem[]): Promise<void>;
13
/**
 * Removes and returns the next item: lowest `priority` first, items without a
 * priority last, ties in insertion order. Resolves to `null` when the queue is empty.
 */
+ pop(): Promise<CrawlQueueItem | null>;
14
/** Resolves to `true` when the URL has been marked as visited. */
+ hasVisited(url: string): Promise<boolean>;
15
/** Resolves to the subset of `urls` that have been marked as visited. */
+ hasVisitedBatch(urls: string[]): Promise<Set<string>>;
16
/** Records the URL as visited; marking the same URL again has no effect. */
+ markVisited(url: string): Promise<void>;
17
/** Resolves to the number of items currently waiting in the queue. */
+ size(): Promise<number>;
18
/** Deletes all queued items, all visited URLs and all stored metadata. */
+ clear(): Promise<void>;
19
/** Closes the database handle. */
+ close(): Promise<void>;
20
/** Returns every visited URL as a set. */
+ getVisitedSet(): Set<string>;
21
/** Stores a metadata value, replacing any existing value for the same key. */
+ saveMetadata(key: string, value: string): void;
22
/** Returns the metadata value stored under `key`, or `undefined` when there is none. */
+ getMetadata(key: string): string | undefined;
23
/** Returns all stored metadata as a key/value record. */
+ getAllMetadata(): Record<string, string>;
24
+ }
@@ -0,0 +1,118 @@
1
+ import * as path from 'node:path';
2
+ import * as os from 'node:os';
3
+ import * as crypto from 'node:crypto';
4
+ export class SqliteCrawlQueue {
5
+ db;
6
+ stmts;
7
+ constructor() { }
8
+ static async create(opts) {
9
+ const instance = new SqliteCrawlQueue();
10
+ const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-crawl-${crypto.randomUUID().slice(0, 8)}.db`);
11
+ await instance.ensureDb(dbPath);
12
+ return instance;
13
+ }
14
+ async ensureDb(dbPath) {
15
+ const BetterSqlite3 = (await import('better-sqlite3')).default;
16
+ this.db = new BetterSqlite3(dbPath);
17
+ this.db.pragma('journal_mode = WAL');
18
+ this.db.exec(`
19
+ CREATE TABLE IF NOT EXISTS queue (
20
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
21
+ url TEXT NOT NULL,
22
+ depth INTEGER NOT NULL,
23
+ priority INTEGER,
24
+ discovered_from TEXT
25
+ );
26
+ CREATE TABLE IF NOT EXISTS visited (url TEXT PRIMARY KEY);
27
+ CREATE TABLE IF NOT EXISTS crawl_metadata (key TEXT PRIMARY KEY, value TEXT NOT NULL);
28
+ CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(priority ASC, id ASC);
29
+ `);
30
+ this.stmts = {
31
+ push: this.db.prepare('INSERT INTO queue (url, depth, priority, discovered_from) VALUES (?, ?, ?, ?)'),
32
+ pop: this.db.prepare('SELECT id, url, depth, priority, discovered_from FROM queue ORDER BY priority ASC NULLS LAST, id ASC LIMIT 1'),
33
+ deletePop: this.db.prepare('DELETE FROM queue WHERE id = ?'),
34
+ hasVisited: this.db.prepare('SELECT 1 FROM visited WHERE url = ?'),
35
+ markVisited: this.db.prepare('INSERT OR IGNORE INTO visited (url) VALUES (?)'),
36
+ size: this.db.prepare('SELECT COUNT(*) AS cnt FROM queue'),
37
+ clearQueue: this.db.prepare('DELETE FROM queue'),
38
+ clearVisited: this.db.prepare('DELETE FROM visited'),
39
+ clearMetadata: this.db.prepare('DELETE FROM crawl_metadata'),
40
+ allVisited: this.db.prepare('SELECT url FROM visited'),
41
+ saveMeta: this.db.prepare('INSERT OR REPLACE INTO crawl_metadata (key, value) VALUES (?, ?)'),
42
+ getMeta: this.db.prepare('SELECT value FROM crawl_metadata WHERE key = ?'),
43
+ allMeta: this.db.prepare('SELECT key, value FROM crawl_metadata'),
44
+ };
45
+ }
46
+ getDb() {
47
+ return this.db;
48
+ }
49
+ async push(item) {
50
+ this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
51
+ }
52
+ async pushBatch(items) {
53
+ const insert = this.db.transaction((rows) => {
54
+ for (const item of rows) {
55
+ this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
56
+ }
57
+ });
58
+ insert(items);
59
+ }
60
+ async pop() {
61
+ const row = this.stmts.pop.get();
62
+ if (!row)
63
+ return null;
64
+ this.stmts.deletePop.run(row.id);
65
+ return {
66
+ url: row.url,
67
+ depth: row.depth,
68
+ priority: row.priority ?? undefined,
69
+ discoveredFrom: row.discovered_from ?? undefined,
70
+ };
71
+ }
72
+ async hasVisited(url) {
73
+ return this.stmts.hasVisited.get(url) !== undefined;
74
+ }
75
+ async hasVisitedBatch(urls) {
76
+ const result = new Set();
77
+ for (const url of urls) {
78
+ if (this.stmts.hasVisited.get(url) !== undefined) {
79
+ result.add(url);
80
+ }
81
+ }
82
+ return result;
83
+ }
84
+ async markVisited(url) {
85
+ this.stmts.markVisited.run(url);
86
+ }
87
+ async size() {
88
+ const row = this.stmts.size.get();
89
+ return row.cnt;
90
+ }
91
+ async clear() {
92
+ this.stmts.clearQueue.run();
93
+ this.stmts.clearVisited.run();
94
+ this.stmts.clearMetadata.run();
95
+ }
96
+ async close() {
97
+ this.db.close();
98
+ }
99
+ getVisitedSet() {
100
+ const rows = this.stmts.allVisited.all();
101
+ return new Set(rows.map((r) => r.url));
102
+ }
103
+ saveMetadata(key, value) {
104
+ this.stmts.saveMeta.run(key, value);
105
+ }
106
+ getMetadata(key) {
107
+ const row = this.stmts.getMeta.get(key);
108
+ return row?.value;
109
+ }
110
+ getAllMetadata() {
111
+ const rows = this.stmts.allMeta.all();
112
+ const result = {};
113
+ for (const row of rows) {
114
+ result[row.key] = row.value;
115
+ }
116
+ return result;
117
+ }
118
+ }
@@ -0,0 +1,26 @@
1
+ import type { CrawlStorageAdapter } from './crawl-storage.js';
2
+ import type { SpiderPageResult } from './spider.js';
3
/** Crawl storage adapter that keeps page results and errors in a SQLite database. */
+ export declare class SqliteCrawlStorage implements CrawlStorageAdapter {
4
+ private db;
5
+ private ownsDb;
6
+ private stmts;
7
/** Instances are obtained through `SqliteCrawlStorage.create()`. */
+ private constructor();
8
/**
 * Creates the storage and its tables.
 * Uses `opts.db` when supplied; otherwise opens `opts.dbPath`, or a randomly
 * named file in the OS temp directory when no path is given.
 */
+ static create(opts?: {
9
+ dbPath?: string;
10
+ db?: any;
11
+ }): Promise<SqliteCrawlStorage>;
12
+ private init;
13
/** Stores the result's URL and status together with the JSON-serialized result. */
+ saveResult(result: SpiderPageResult): Promise<void>;
14
/** Stores an error message for a URL. */
+ saveError(error: {
15
+ url: string;
16
+ error: string;
17
+ }): Promise<void>;
18
/** Resolves to the number of stored results. */
+ getResultCount(): Promise<number>;
19
/** Resolves to every stored result, parsed back from its JSON form. */
+ getResults(): Promise<SpiderPageResult[]>;
20
/** Resolves to every stored error. */
+ getErrors(): Promise<Array<{
21
+ url: string;
22
+ error: string;
23
+ }>>;
24
/** Deletes all stored results and errors. */
+ clear(): Promise<void>;
25
/** Closes the database handle, but only when this instance opened it itself. */
+ close(): Promise<void>;
26
+ }
@@ -0,0 +1,76 @@
1
+ import * as path from 'node:path';
2
+ import * as os from 'node:os';
3
+ import * as crypto from 'node:crypto';
4
+ export class SqliteCrawlStorage {
5
+ db;
6
+ ownsDb;
7
+ stmts;
8
+ constructor(db, ownsDb) {
9
+ this.db = db;
10
+ this.ownsDb = ownsDb;
11
+ }
12
+ static async create(opts) {
13
+ let db = opts?.db;
14
+ let ownsDb = false;
15
+ if (!db) {
16
+ const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-storage-${crypto.randomUUID().slice(0, 8)}.db`);
17
+ const BetterSqlite3 = (await import('better-sqlite3')).default;
18
+ db = new BetterSqlite3(dbPath);
19
+ db.pragma('journal_mode = WAL');
20
+ ownsDb = true;
21
+ }
22
+ const instance = new SqliteCrawlStorage(db, ownsDb);
23
+ instance.init();
24
+ return instance;
25
+ }
26
+ init() {
27
+ this.db.exec(`
28
+ CREATE TABLE IF NOT EXISTS results (
29
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
30
+ url TEXT NOT NULL,
31
+ status INTEGER NOT NULL,
32
+ data TEXT NOT NULL
33
+ );
34
+ CREATE TABLE IF NOT EXISTS errors (
35
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
36
+ url TEXT NOT NULL,
37
+ error TEXT NOT NULL
38
+ );
39
+ `);
40
+ this.stmts = {
41
+ saveResult: this.db.prepare('INSERT INTO results (url, status, data) VALUES (?, ?, ?)'),
42
+ saveError: this.db.prepare('INSERT INTO errors (url, error) VALUES (?, ?)'),
43
+ resultCount: this.db.prepare('SELECT COUNT(*) AS cnt FROM results'),
44
+ allResults: this.db.prepare('SELECT data FROM results'),
45
+ allErrors: this.db.prepare('SELECT url, error FROM errors'),
46
+ clearResults: this.db.prepare('DELETE FROM results'),
47
+ clearErrors: this.db.prepare('DELETE FROM errors'),
48
+ };
49
+ }
50
+ async saveResult(result) {
51
+ this.stmts.saveResult.run(result.url, result.status, JSON.stringify(result));
52
+ }
53
+ async saveError(error) {
54
+ this.stmts.saveError.run(error.url, error.error);
55
+ }
56
+ async getResultCount() {
57
+ const row = this.stmts.resultCount.get();
58
+ return row.cnt;
59
+ }
60
+ async getResults() {
61
+ const rows = this.stmts.allResults.all();
62
+ return rows.map((r) => JSON.parse(r.data));
63
+ }
64
+ async getErrors() {
65
+ return this.stmts.allErrors.all();
66
+ }
67
+ async clear() {
68
+ this.stmts.clearResults.run();
69
+ this.stmts.clearErrors.run();
70
+ }
71
+ async close() {
72
+ if (this.ownsDb) {
73
+ this.db.close();
74
+ }
75
+ }
76
+ }
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
5
5
  seo?: boolean;
6
6
  output?: string;
7
7
  onSeoAnalysis?: (result: SeoPageResult) => void;
8
+ onBlocked?: (result: SeoPageResult) => void | Promise<void>;
9
+ onError?: (result: SeoPageResult) => void | Promise<void>;
8
10
  focusCategories?: string[];
9
11
  focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
10
12
  }
@@ -32,11 +32,25 @@ export class SeoSpider {
32
32
  }
33
33
  constructor(options = {}) {
34
34
  this.options = options;
35
+ const userOnPage = options.onPage;
35
36
  this.spider = new Spider({
36
37
  ...options,
37
- onPageWithHtml: this.options.seo
38
- ? async (pageResult, html) => {
39
- await this.analyzePageDuringCrawl(pageResult, html);
38
+ onPage: async (event) => {
39
+ if (this.options.seo && event.html) {
40
+ await this.analyzePageDuringCrawl(event.result, event.html);
41
+ }
42
+ if (userOnPage) {
43
+ await userOnPage(event);
44
+ }
45
+ },
46
+ onBlocked: this.options.onBlocked
47
+ ? async (pageResult) => {
48
+ await this.options.onBlocked({ ...pageResult });
49
+ }
50
+ : undefined,
51
+ onError: this.options.onError
52
+ ? async (pageResult) => {
53
+ await this.options.onError({ ...pageResult });
40
54
  }
41
55
  : undefined,
42
56
  });