recker 1.0.93 → 1.0.94-next.132e096

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,12 @@ export type { Options as ParserOptions } from './parser/index.js';
3
3
  export { ScrapeDocument } from './document.js';
4
4
  export { ScrapeElement } from './element.js';
5
5
  export { Spider, spider } from './spider.js';
6
- export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
6
+ export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
7
7
  export { InMemoryCrawlQueue } from './crawl-queue.js';
8
+ export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
8
9
  export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
9
10
  export { InMemoryCrawlStorage } from './crawl-storage.js';
11
+ export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
10
12
  export type { CrawlStorageAdapter } from './crawl-storage.js';
11
13
  export { ListProxyAdapter } from './proxy-adapter.js';
12
14
  export type { ProxyAdapter } from './proxy-adapter.js';
@@ -3,6 +3,8 @@ export { ScrapeDocument } from './document.js';
3
3
  export { ScrapeElement } from './element.js';
4
4
  export { Spider, spider } from './spider.js';
5
5
  export { InMemoryCrawlQueue } from './crawl-queue.js';
6
+ export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
6
7
  export { InMemoryCrawlStorage } from './crawl-storage.js';
8
+ export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
7
9
  export { ListProxyAdapter } from './proxy-adapter.js';
8
10
  export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
@@ -1,3 +1,4 @@
1
+ import { ScrapeDocument } from './document.js';
1
2
  import type { ExtractedLink, ExtractionSchema } from './types.js';
2
3
  import type { Options as ParserOptions } from './parser/index.js';
3
4
  import { type SitemapUrl } from '../seo/validators/sitemap.js';
@@ -31,8 +32,7 @@ export interface SpiderOptions {
31
32
  proxy?: string | string[] | ProxyAdapter;
32
33
  transport?: SpiderTransport;
33
34
  preferCurlFirst?: boolean;
34
- onPage?: (result: SpiderPageResult) => void;
35
- onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
35
+ onPage?: (event: SpiderPageEvent) => void | Promise<void>;
36
36
  onCaptchaDetected?: (result: {
37
37
  url: string;
38
38
  status: number;
@@ -40,9 +40,31 @@ export interface SpiderOptions {
40
40
  provider?: CaptchaProvider;
41
41
  usedCurl: boolean;
42
42
  }) => void | Promise<void>;
43
+ onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
44
+ onError?: (result: SpiderPageResult) => void | Promise<void>;
45
+ onRetry?: (info: {
46
+ url: string;
47
+ attempt: number;
48
+ maxAttempts: number;
49
+ reason?: string;
50
+ delay: number;
51
+ transport: SpiderTransport;
52
+ previousStatus: number;
53
+ timings?: SpiderPageResult['timings'];
54
+ }) => void | Promise<void>;
55
+ onRedirect?: (info: {
56
+ from: string;
57
+ to: string;
58
+ status: number;
59
+ }) => void | Promise<void>;
43
60
  onProgress?: (progress: SpiderProgress) => void;
44
61
  extract?: string[] | ExtractionSchema;
45
62
  parserOptions?: Partial<ParserOptions>;
63
+ domainRateLimit?: {
64
+ maxPerSecond?: number;
65
+ };
66
+ deduplicateContent?: boolean;
67
+ resume?: boolean;
46
68
  crawlQueue?: CrawlQueueAdapter;
47
69
  crawlStorage?: CrawlStorageAdapter;
48
70
  }
@@ -102,6 +124,14 @@ export interface SpiderPageResult {
102
124
  stylesheets: number;
103
125
  };
104
126
  extracted?: Record<string, unknown>;
127
+ contentHash?: string;
128
+ isDuplicate?: boolean;
129
+ duplicateOf?: string;
130
+ }
131
/**
 * Payload delivered to `SpiderOptions.onPage` each time the spider records a
 * page result.
 */
export interface SpiderPageEvent {
    /** The result that was just saved for the page (HTML, non-HTML or error). */
    result: SpiderPageResult;
    /**
     * Raw response body when one was read. Omitted for error results and for
     * responses whose body was empty.
     */
    html?: string;
    /**
     * Resolves to a parsed `ScrapeDocument` for `html`. Only provided when
     * there is a body to parse.
     */
    document?: () => Promise<ScrapeDocument>;
}
106
136
  export interface SpiderProgress {
107
137
  crawled: number;
@@ -163,6 +193,8 @@ export declare class Spider {
163
193
  private running;
164
194
  private aborted;
165
195
  private pendingCount;
196
+ private domainRequestTimestamps;
197
+ private contentHashes;
166
198
  private blockedDomains;
167
199
  private curlTransport;
168
200
  private curlAvailable;
@@ -172,6 +204,7 @@ export declare class Spider {
172
204
  private robotsData;
173
205
  private sitemapValidation;
174
206
  private robotsValidation;
207
+ private waitForDomainRateLimit;
175
208
  private toHeaderRecord;
176
209
  constructor(options?: SpiderOptions);
177
210
  crawl(startUrl: string): Promise<SpiderResult>;
@@ -1,3 +1,4 @@
1
+ import { createHash } from 'node:crypto';
1
2
  import { performance } from 'node:perf_hooks';
2
3
  import { createClient } from '../core/client.js';
3
4
  import { ScrapeDocument } from './document.js';
@@ -108,7 +109,8 @@ function shouldCrawl(url, baseHost, options) {
108
109
  if (!['http:', 'https:'].includes(parsed.protocol)) {
109
110
  return false;
110
111
  }
111
- if (options.sameDomain !== false && parsed.hostname !== baseHost) {
112
+ const hostname = parsed.hostname.replace(/^www\./, '');
113
+ if (options.sameDomain !== false && hostname !== baseHost) {
112
114
  return false;
113
115
  }
114
116
  const skipExtensions = [
@@ -189,6 +191,8 @@ export class Spider {
189
191
  running = false;
190
192
  aborted = false;
191
193
  pendingCount = 0;
194
+ domainRequestTimestamps = new Map();
195
+ contentHashes = new Map();
192
196
  blockedDomains = new Set();
193
197
  curlTransport = null;
194
198
  curlAvailable = false;
@@ -198,6 +202,31 @@ export class Spider {
198
202
  robotsData = null;
199
203
  sitemapValidation = null;
200
204
  robotsValidation = null;
205
+ async waitForDomainRateLimit(hostname) {
206
+ const limit = this.options.domainRateLimit?.maxPerSecond;
207
+ if (!limit || limit <= 0)
208
+ return;
209
+ const now = Date.now();
210
+ const window = 1000;
211
+ let timestamps = this.domainRequestTimestamps.get(hostname);
212
+ if (!timestamps) {
213
+ timestamps = [];
214
+ this.domainRequestTimestamps.set(hostname, timestamps);
215
+ }
216
+ while (timestamps.length > 0 && timestamps[0] <= now - window) {
217
+ timestamps.shift();
218
+ }
219
+ if (timestamps.length >= limit) {
220
+ const waitMs = timestamps[0] + window - now;
221
+ if (waitMs > 0)
222
+ await sleep(waitMs);
223
+ const afterWait = Date.now();
224
+ while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
225
+ timestamps.shift();
226
+ }
227
+ }
228
+ timestamps.push(Date.now());
229
+ }
201
230
  toHeaderRecord(headers) {
202
231
  const headerRecord = {};
203
232
  headers.forEach((value, key) => {
@@ -239,11 +268,17 @@ export class Spider {
239
268
  exclude: options.exclude,
240
269
  include: options.include,
241
270
  onPage: options.onPage,
242
- onPageWithHtml: options.onPageWithHtml,
243
271
  onCaptchaDetected: options.onCaptchaDetected,
272
+ onBlocked: options.onBlocked,
273
+ onError: options.onError,
274
+ onRetry: options.onRetry,
275
+ onRedirect: options.onRedirect,
244
276
  onProgress: options.onProgress,
245
277
  extract: extractSchema,
246
278
  parserOptions: options.parserOptions,
279
+ domainRateLimit: options.domainRateLimit,
280
+ deduplicateContent: options.deduplicateContent ?? false,
281
+ resume: options.resume ?? false,
247
282
  };
248
283
  if (options.proxy) {
249
284
  if (typeof options.proxy === 'string') {
@@ -281,12 +316,20 @@ export class Spider {
281
316
  const startTimestamp = Date.now();
282
317
  const normalizedStart = normalizeUrl(startUrl);
283
318
  const baseUrl = new URL(normalizedStart).origin;
284
- this.baseHost = new URL(normalizedStart).hostname;
285
- await this.crawlQueue.clear();
286
- await this.crawlStorage.clear();
287
- this._visitedCount = 0;
288
- this._queueSize = 0;
289
- this._resultCount = 0;
319
+ this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
320
+ if (!this.options.resume) {
321
+ await this.crawlQueue.clear();
322
+ await this.crawlStorage.clear();
323
+ this._visitedCount = 0;
324
+ this._queueSize = 0;
325
+ this._resultCount = 0;
326
+ this.domainRequestTimestamps.clear();
327
+ this.contentHashes.clear();
328
+ }
329
+ else {
330
+ this._queueSize = await this.crawlQueue.size();
331
+ this._resultCount = await this.crawlStorage.getResultCount();
332
+ }
290
333
  this.running = true;
291
334
  this.aborted = false;
292
335
  this.pendingCount = 0;
@@ -295,8 +338,18 @@ export class Spider {
295
338
  this.robotsData = null;
296
339
  this.sitemapValidation = null;
297
340
  this.robotsValidation = null;
298
- this.blockedDomains.clear();
299
- this.domainStates.clear();
341
+ if (!this.options.resume) {
342
+ this.blockedDomains.clear();
343
+ this.domainStates.clear();
344
+ }
345
+ if (this.options.resume && this.options.deduplicateContent) {
346
+ const existingResults = await this.crawlStorage.getResults();
347
+ for (const r of existingResults) {
348
+ if (r.contentHash) {
349
+ this.contentHashes.set(r.contentHash, r.url);
350
+ }
351
+ }
352
+ }
300
353
  if (this.options.transport !== 'undici') {
301
354
  this.curlAvailable = await hasImpersonate();
302
355
  if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
@@ -312,10 +365,12 @@ export class Spider {
312
365
  const pending = new Map();
313
366
  const scheduleUrl = async (item) => {
314
367
  const normalized = normalizeUrl(item.url);
315
- if (await this.crawlQueue.hasVisited(normalized))
316
- return;
317
368
  if (pending.has(normalized))
318
369
  return;
370
+ if (await this.crawlQueue.hasVisited(normalized))
371
+ return;
372
+ await this.crawlQueue.markVisited(normalized);
373
+ this._visitedCount++;
319
374
  if (item.depth > this.options.maxDepth)
320
375
  return;
321
376
  if (this._resultCount + pending.size >= this.options.maxPages)
@@ -331,8 +386,6 @@ export class Spider {
331
386
  return;
332
387
  }
333
388
  }
334
- await this.crawlQueue.markVisited(normalized);
335
- this._visitedCount++;
336
389
  this.pendingCount++;
337
390
  const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
338
391
  .finally(() => {
@@ -341,16 +394,18 @@ export class Spider {
341
394
  });
342
395
  pending.set(normalized, promise);
343
396
  };
344
- await scheduleUrl({ url: normalizedStart, depth: 0 });
345
- if (this.options.useSitemap && this.sitemapUrls.length > 0) {
346
- for (const sitemapUrl of this.sitemapUrls) {
347
- try {
348
- const urlHost = new URL(sitemapUrl.loc).hostname;
349
- if (urlHost === this.baseHost) {
350
- await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
397
+ if (!this.options.resume) {
398
+ await scheduleUrl({ url: normalizedStart, depth: 0 });
399
+ if (this.options.useSitemap && this.sitemapUrls.length > 0) {
400
+ for (const sitemapUrl of this.sitemapUrls) {
401
+ try {
402
+ const urlHost = new URL(sitemapUrl.loc).hostname;
403
+ if (urlHost === this.baseHost) {
404
+ await scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
405
+ }
406
+ }
407
+ catch {
351
408
  }
352
- }
353
- catch {
354
409
  }
355
410
  }
356
411
  }
@@ -372,14 +427,33 @@ export class Spider {
372
427
  if (pending.size > 0) {
373
428
  await Promise.all(pending.values());
374
429
  }
430
+ while (!this.aborted && this._resultCount < this.options.maxPages) {
431
+ const remaining = await this.crawlQueue.size();
432
+ if (remaining === 0 && pending.size === 0)
433
+ break;
434
+ this._queueSize = remaining;
435
+ let nextItem = await this.crawlQueue.pop();
436
+ while (nextItem && !this.aborted) {
437
+ this._queueSize = Math.max(0, this._queueSize - 1);
438
+ if (this._resultCount + pending.size >= this.options.maxPages)
439
+ break;
440
+ await scheduleUrl(nextItem);
441
+ nextItem = await this.crawlQueue.pop();
442
+ }
443
+ if (pending.size > 0) {
444
+ await Promise.all(pending.values());
445
+ }
446
+ }
375
447
  this.running = false;
376
448
  const pages = await this.crawlStorage.getResults();
377
449
  const errors = await this.crawlStorage.getErrors();
378
450
  const sitemapAnalysis = this.buildSitemapAnalysis(pages);
379
451
  const robotsAnalysis = this.buildRobotsAnalysis();
380
452
  const visited = this.crawlQueue instanceof InMemoryCrawlQueue
381
- ? this.crawlQueue.getVisited()
453
+ ? new Set(this.crawlQueue.getVisited())
382
454
  : new Set(pages.map(r => r.url));
455
+ await this.crawlQueue.close?.();
456
+ await this.crawlStorage.close?.();
383
457
  return {
384
458
  startUrl: normalizedStart,
385
459
  pages,
@@ -611,6 +685,9 @@ export class Spider {
611
685
  const clientForRequest = this.getClientForProxy(proxyUrl);
612
686
  const response = await clientForRequest.get(url, {
613
687
  headers: this.buildRequestHeaders(url, false),
688
+ beforeRedirect: this.options.onRedirect
689
+ ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
690
+ : undefined,
614
691
  });
615
692
  const contentType = response.headers.get('content-type') || '';
616
693
  const shouldReadUndiciBody = !contentType ||
@@ -643,6 +720,7 @@ export class Spider {
643
720
  };
644
721
  };
645
722
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
723
+ await this.waitForDomainRateLimit(hostname);
646
724
  await this.waitForDomainPenalty(hostname);
647
725
  const useCurl = this.shouldUseCurlForHost(hostname, hasCurl, forcedTransport);
648
726
  const transportForAttempt = useCurl ? 'curl' : 'undici';
@@ -732,6 +810,18 @@ export class Spider {
732
810
  forcedTransport = 'curl';
733
811
  }
734
812
  }
813
+ if (this.options.onRetry) {
814
+ await this.options.onRetry({
815
+ url,
816
+ attempt: attempt + 1,
817
+ maxAttempts,
818
+ reason: attemptReason,
819
+ delay: waitMs,
820
+ transport: forcedTransport ?? transportForAttempt,
821
+ previousStatus: response.status,
822
+ timings,
823
+ });
824
+ }
735
825
  await sleep(waitMs);
736
826
  continue;
737
827
  }
@@ -867,7 +957,21 @@ export class Spider {
867
957
  };
868
958
  await this.crawlStorage.saveResult(nonHtmlResult);
869
959
  this._resultCount++;
870
- this.options.onPage?.(nonHtmlResult);
960
+ if (this.options.onPage) {
961
+ let cachedDoc = null;
962
+ await this.options.onPage({
963
+ result: nonHtmlResult,
964
+ html: html || undefined,
965
+ document: html ? () => {
966
+ if (cachedDoc)
967
+ return Promise.resolve(cachedDoc);
968
+ return ScrapeDocument.create(html, { baseUrl: item.url, parserOptions: this.options.parserOptions }).then(d => { cachedDoc = d; return d; });
969
+ } : undefined,
970
+ });
971
+ }
972
+ if (this.options.onBlocked && (detection.blocked || hasCaptcha)) {
973
+ await this.options.onBlocked(nonHtmlResult);
974
+ }
871
975
  return;
872
976
  }
873
977
  const doc = await ScrapeDocument.create(html, {
@@ -898,6 +1002,21 @@ export class Spider {
898
1002
  catch {
899
1003
  }
900
1004
  }
1005
+ let isDuplicate = false;
1006
+ let duplicateOf;
1007
+ let contentHash;
1008
+ if (this.options.deduplicateContent) {
1009
+ const bodyText = doc.text('body');
1010
+ contentHash = createHash('md5').update(bodyText).digest('hex');
1011
+ const existingUrl = this.contentHashes.get(contentHash);
1012
+ if (existingUrl) {
1013
+ isDuplicate = true;
1014
+ duplicateOf = existingUrl;
1015
+ }
1016
+ else {
1017
+ this.contentHashes.set(contentHash, item.url);
1018
+ }
1019
+ }
901
1020
  const result = {
902
1021
  url: item.url,
903
1022
  status,
@@ -926,38 +1045,46 @@ export class Spider {
926
1045
  timings,
927
1046
  fetchedAt,
928
1047
  extracted,
1048
+ contentHash,
1049
+ isDuplicate: isDuplicate || undefined,
1050
+ duplicateOf,
929
1051
  };
930
1052
  await this.crawlStorage.saveResult(result);
931
1053
  this._resultCount++;
932
- this.options.onPage?.(result);
933
- if (this.options.onPageWithHtml) {
934
- await this.options.onPageWithHtml(result, html);
935
- }
936
- const candidates = [];
937
- const candidateUrls = [];
938
- for (const link of links) {
939
- if (!link.href)
940
- continue;
941
- const normalized = normalizeUrl(link.href);
942
- if (!shouldCrawl(normalized, this.baseHost, this.options))
943
- continue;
944
- candidateUrls.push(normalized);
945
- candidates.push({ url: normalized, depth: item.depth + 1 });
1054
+ if (this.options.onPage) {
1055
+ await this.options.onPage({
1056
+ result,
1057
+ html,
1058
+ document: () => Promise.resolve(doc),
1059
+ });
946
1060
  }
947
- if (candidates.length > 0) {
948
- const visitedSet = this.crawlQueue.hasVisitedBatch
949
- ? await this.crawlQueue.hasVisitedBatch(candidateUrls)
950
- : new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
951
- const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
952
- if (newItems.length > 0) {
953
- if (this.crawlQueue.pushBatch) {
954
- await this.crawlQueue.pushBatch(newItems);
955
- }
956
- else {
957
- for (const newItem of newItems)
958
- await this.crawlQueue.push(newItem);
1061
+ if (!isDuplicate) {
1062
+ const candidates = [];
1063
+ const candidateUrls = [];
1064
+ for (const link of links) {
1065
+ if (!link.href)
1066
+ continue;
1067
+ const normalized = normalizeUrl(link.href);
1068
+ if (!shouldCrawl(normalized, this.baseHost, this.options))
1069
+ continue;
1070
+ candidateUrls.push(normalized);
1071
+ candidates.push({ url: normalized, depth: item.depth + 1 });
1072
+ }
1073
+ if (candidates.length > 0) {
1074
+ const visitedSet = this.crawlQueue.hasVisitedBatch
1075
+ ? await this.crawlQueue.hasVisitedBatch(candidateUrls)
1076
+ : new Set(await Promise.all(candidateUrls.map(async (u) => (await this.crawlQueue.hasVisited(u)) ? u : null)).then(r => r.filter(Boolean)));
1077
+ const newItems = candidates.filter((_, i) => !visitedSet.has(candidateUrls[i]));
1078
+ if (newItems.length > 0) {
1079
+ if (this.crawlQueue.pushBatch) {
1080
+ await this.crawlQueue.pushBatch(newItems);
1081
+ }
1082
+ else {
1083
+ for (const newItem of newItems)
1084
+ await this.crawlQueue.push(newItem);
1085
+ }
1086
+ this._queueSize += newItems.length;
959
1087
  }
960
- this._queueSize += newItems.length;
961
1088
  }
962
1089
  }
963
1090
  }
@@ -1026,7 +1153,12 @@ export class Spider {
1026
1153
  await this.crawlStorage.saveResult(errorResult);
1027
1154
  this._resultCount++;
1028
1155
  await this.crawlStorage.saveError({ url: item.url, error: message });
1029
- this.options.onPage?.(errorResult);
1156
+ if (this.options.onPage) {
1157
+ await this.options.onPage({ result: errorResult });
1158
+ }
1159
+ if (this.options.onError) {
1160
+ await this.options.onError(errorResult);
1161
+ }
1030
1162
  }
1031
1163
  }
1032
1164
  getOrCreateDomainState(hostname) {
@@ -0,0 +1,24 @@
1
+ import type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
2
/**
 * Crawl queue persisted in a SQLite database (via `better-sqlite3`).
 *
 * Stores the pending queue, the set of visited URLs and free-form crawl
 * metadata. Instances are obtained through {@link SqliteCrawlQueue.create}.
 */
export declare class SqliteCrawlQueue implements CrawlQueueAdapter {
    private db;
    private stmts;
    private constructor();
    /**
     * Opens (or creates) the database and its tables.
     *
     * @param opts.dbPath Database file path. When omitted, a randomly named
     *   `recker-crawl-*.db` file in the OS temp directory is used.
     */
    static create(opts?: {
        dbPath?: string;
    }): Promise<SqliteCrawlQueue>;
    private ensureDb;
    /** Returns the underlying database handle. */
    getDb(): any;
    /** Appends one item to the queue. */
    push(item: CrawlQueueItem): Promise<void>;
    /** Appends all items inside a single transaction. */
    pushBatch(items: CrawlQueueItem[]): Promise<void>;
    /**
     * Removes and returns the next item: lowest `priority` first, items
     * without a priority last, ties broken by insertion order. Resolves to
     * `null` when the queue is empty.
     */
    pop(): Promise<CrawlQueueItem | null>;
    /** Resolves to `true` when `url` has been marked visited. */
    hasVisited(url: string): Promise<boolean>;
    /** Resolves to the subset of `urls` that have been marked visited. */
    hasVisitedBatch(urls: string[]): Promise<Set<string>>;
    /** Marks `url` as visited; marking an already visited URL is a no-op. */
    markVisited(url: string): Promise<void>;
    /** Resolves to the number of items currently queued. */
    size(): Promise<number>;
    /** Deletes all queued items, visited URLs and metadata. */
    clear(): Promise<void>;
    /** Closes the database connection. */
    close(): Promise<void>;
    /** Returns every visited URL. */
    getVisitedSet(): Set<string>;
    /** Stores `value` under `key`, replacing any previous value. */
    saveMetadata(key: string, value: string): void;
    /** Returns the value stored under `key`, or `undefined` when absent. */
    getMetadata(key: string): string | undefined;
    /** Returns all stored metadata as a plain object. */
    getAllMetadata(): Record<string, string>;
}
@@ -0,0 +1,118 @@
1
+ import * as path from 'node:path';
2
+ import * as os from 'node:os';
3
+ import * as crypto from 'node:crypto';
4
/**
 * Crawl queue persisted in a SQLite database through `better-sqlite3`.
 *
 * Keeps three tables: `queue` (pending items), `visited` (URLs already
 * claimed) and `crawl_metadata` (free-form key/value pairs). Use
 * `SqliteCrawlQueue.create()` to obtain an instance; the constructor alone
 * does not open a database.
 */
export class SqliteCrawlQueue {
    db;
    stmts;
    constructor() { }
    /**
     * Opens (or creates) the database and prepares all statements.
     *
     * @param opts Optional settings; `opts.dbPath` selects the database file.
     *   When omitted, a randomly named file in the OS temp directory is used.
     * @returns A ready-to-use queue.
     */
    static async create(opts) {
        const instance = new SqliteCrawlQueue();
        const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-crawl-${crypto.randomUUID().slice(0, 8)}.db`);
        await instance.ensureDb(dbPath);
        return instance;
    }
    /**
     * Loads `better-sqlite3` lazily, opens `dbPath` in WAL mode, creates the
     * schema if missing and prepares the statements and transactions used by
     * the other methods.
     */
    async ensureDb(dbPath) {
        const BetterSqlite3 = (await import('better-sqlite3')).default;
        this.db = new BetterSqlite3(dbPath);
        this.db.pragma('journal_mode = WAL');
        this.db.exec(`
            CREATE TABLE IF NOT EXISTS queue (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT NOT NULL,
            depth INTEGER NOT NULL,
            priority INTEGER,
            discovered_from TEXT
            );
            CREATE TABLE IF NOT EXISTS visited (url TEXT PRIMARY KEY);
            CREATE TABLE IF NOT EXISTS crawl_metadata (key TEXT PRIMARY KEY, value TEXT NOT NULL);
            CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(priority ASC, id ASC);
        `);
        const push = this.db.prepare('INSERT INTO queue (url, depth, priority, discovered_from) VALUES (?, ?, ?, ?)');
        const pop = this.db.prepare('SELECT id, url, depth, priority, discovered_from FROM queue ORDER BY priority ASC NULLS LAST, id ASC LIMIT 1');
        const deletePop = this.db.prepare('DELETE FROM queue WHERE id = ?');
        const clearQueue = this.db.prepare('DELETE FROM queue');
        const clearVisited = this.db.prepare('DELETE FROM visited');
        const clearMetadata = this.db.prepare('DELETE FROM crawl_metadata');
        this.stmts = {
            push,
            pop,
            deletePop,
            hasVisited: this.db.prepare('SELECT 1 FROM visited WHERE url = ?'),
            markVisited: this.db.prepare('INSERT OR IGNORE INTO visited (url) VALUES (?)'),
            size: this.db.prepare('SELECT COUNT(*) AS cnt FROM queue'),
            clearQueue,
            clearVisited,
            clearMetadata,
            allVisited: this.db.prepare('SELECT url FROM visited'),
            saveMeta: this.db.prepare('INSERT OR REPLACE INTO crawl_metadata (key, value) VALUES (?, ?)'),
            getMeta: this.db.prepare('SELECT value FROM crawl_metadata WHERE key = ?'),
            allMeta: this.db.prepare('SELECT key, value FROM crawl_metadata'),
            // Transaction wrappers are built once here instead of per call.
            pushMany: this.db.transaction((rows) => {
                for (const item of rows) {
                    push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
                }
            }),
            // Select and delete as one unit so the same row cannot be
            // handed out twice.
            popNext: this.db.transaction(() => {
                const row = pop.get();
                if (row)
                    deletePop.run(row.id);
                return row;
            }),
            // All three tables are emptied together or not at all.
            clearAll: this.db.transaction(() => {
                clearQueue.run();
                clearVisited.run();
                clearMetadata.run();
            }),
        };
    }
    /** Returns the underlying database handle. */
    getDb() {
        return this.db;
    }
    /** Appends one item to the queue. */
    async push(item) {
        this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
    }
    /** Appends all items in a single transaction; an empty batch is a no-op. */
    async pushBatch(items) {
        if (items.length === 0)
            return;
        this.stmts.pushMany(items);
    }
    /**
     * Removes and returns the next item: lowest `priority` first, items
     * without a priority last, ties broken by insertion order.
     *
     * @returns The next item, or `null` when the queue is empty.
     */
    async pop() {
        // BEGIN IMMEDIATE takes the write lock before the SELECT, so another
        // connection to the same file cannot read the row in between.
        const row = this.stmts.popNext.immediate();
        if (!row)
            return null;
        return {
            url: row.url,
            depth: row.depth,
            priority: row.priority ?? undefined,
            discoveredFrom: row.discovered_from ?? undefined,
        };
    }
    /** Resolves to `true` when `url` is present in the visited table. */
    async hasVisited(url) {
        return this.stmts.hasVisited.get(url) !== undefined;
    }
    /** Resolves to the subset of `urls` present in the visited table. */
    async hasVisitedBatch(urls) {
        const result = new Set();
        for (const url of urls) {
            if (this.stmts.hasVisited.get(url) !== undefined) {
                result.add(url);
            }
        }
        return result;
    }
    /** Records `url` as visited; an already recorded URL is ignored. */
    async markVisited(url) {
        this.stmts.markVisited.run(url);
    }
    /** Resolves to the number of rows currently in the queue table. */
    async size() {
        const row = this.stmts.size.get();
        return row.cnt;
    }
    /** Deletes all queued items, visited URLs and metadata atomically. */
    async clear() {
        this.stmts.clearAll();
    }
    /** Closes the database connection. */
    async close() {
        this.db.close();
    }
    /** Returns every visited URL as a set. */
    getVisitedSet() {
        const rows = this.stmts.allVisited.all();
        return new Set(rows.map((r) => r.url));
    }
    /** Stores `value` under `key`, replacing any previous value. */
    saveMetadata(key, value) {
        this.stmts.saveMeta.run(key, value);
    }
    /** Returns the value stored under `key`, or `undefined` when absent. */
    getMetadata(key) {
        const row = this.stmts.getMeta.get(key);
        return row?.value;
    }
    /** Returns all metadata rows as a plain `{ key: value }` object. */
    getAllMetadata() {
        const rows = this.stmts.allMeta.all();
        const result = {};
        for (const row of rows) {
            result[row.key] = row.value;
        }
        return result;
    }
}
@@ -0,0 +1,26 @@
1
+ import type { CrawlStorageAdapter } from './crawl-storage.js';
2
+ import type { SpiderPageResult } from './spider.js';
3
/**
 * `CrawlStorageAdapter` backed by SQLite. Instances are obtained through
 * {@link SqliteCrawlStorage.create}.
 *
 * NOTE(review): the implementation is not visible here; the member comments
 * below describe only what the signatures state.
 */
export declare class SqliteCrawlStorage implements CrawlStorageAdapter {
    private db;
    private ownsDb;
    private stmts;
    private constructor();
    /**
     * Creates a storage instance.
     *
     * @param opts.dbPath Path of the database file to use.
     * @param opts.db An existing database handle — presumably lets the
     *   storage share a connection (e.g. with a crawl queue); confirm against
     *   the implementation.
     */
    static create(opts?: {
        dbPath?: string;
        db?: any;
    }): Promise<SqliteCrawlStorage>;
    private init;
    /** Persists one page result. */
    saveResult(result: SpiderPageResult): Promise<void>;
    /** Persists one crawl error for `error.url`. */
    saveError(error: {
        url: string;
        error: string;
    }): Promise<void>;
    /** Resolves to the number of stored page results. */
    getResultCount(): Promise<number>;
    /** Resolves to all stored page results. */
    getResults(): Promise<SpiderPageResult[]>;
    /** Resolves to all stored crawl errors. */
    getErrors(): Promise<Array<{
        url: string;
        error: string;
    }>>;
    /** Removes stored data. */
    clear(): Promise<void>;
    /** Releases the storage's resources. */
    close(): Promise<void>;
}