recker 1.0.93 → 1.0.94-next.83dffd9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ import * as path from 'node:path';
2
+ import * as os from 'node:os';
3
+ import * as crypto from 'node:crypto';
4
+ export class SqliteCrawlQueue {
5
+ db;
6
+ stmts;
7
+ constructor() { }
8
+ static async create(opts) {
9
+ const instance = new SqliteCrawlQueue();
10
+ const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-crawl-${crypto.randomUUID().slice(0, 8)}.db`);
11
+ await instance.ensureDb(dbPath);
12
+ return instance;
13
+ }
14
+ async ensureDb(dbPath) {
15
+ const BetterSqlite3 = (await import('better-sqlite3')).default;
16
+ this.db = new BetterSqlite3(dbPath);
17
+ this.db.pragma('journal_mode = WAL');
18
+ this.db.exec(`
19
+ CREATE TABLE IF NOT EXISTS queue (
20
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
21
+ url TEXT NOT NULL,
22
+ depth INTEGER NOT NULL,
23
+ priority INTEGER,
24
+ discovered_from TEXT
25
+ );
26
+ CREATE TABLE IF NOT EXISTS visited (url TEXT PRIMARY KEY);
27
+ CREATE TABLE IF NOT EXISTS crawl_metadata (key TEXT PRIMARY KEY, value TEXT NOT NULL);
28
+ CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(priority ASC, id ASC);
29
+ `);
30
+ this.stmts = {
31
+ push: this.db.prepare('INSERT INTO queue (url, depth, priority, discovered_from) VALUES (?, ?, ?, ?)'),
32
+ pop: this.db.prepare('SELECT id, url, depth, priority, discovered_from FROM queue ORDER BY priority ASC NULLS LAST, id ASC LIMIT 1'),
33
+ deletePop: this.db.prepare('DELETE FROM queue WHERE id = ?'),
34
+ hasVisited: this.db.prepare('SELECT 1 FROM visited WHERE url = ?'),
35
+ markVisited: this.db.prepare('INSERT OR IGNORE INTO visited (url) VALUES (?)'),
36
+ size: this.db.prepare('SELECT COUNT(*) AS cnt FROM queue'),
37
+ clearQueue: this.db.prepare('DELETE FROM queue'),
38
+ clearVisited: this.db.prepare('DELETE FROM visited'),
39
+ clearMetadata: this.db.prepare('DELETE FROM crawl_metadata'),
40
+ allVisited: this.db.prepare('SELECT url FROM visited'),
41
+ saveMeta: this.db.prepare('INSERT OR REPLACE INTO crawl_metadata (key, value) VALUES (?, ?)'),
42
+ getMeta: this.db.prepare('SELECT value FROM crawl_metadata WHERE key = ?'),
43
+ allMeta: this.db.prepare('SELECT key, value FROM crawl_metadata'),
44
+ };
45
+ }
46
+ getDb() {
47
+ return this.db;
48
+ }
49
+ async push(item) {
50
+ this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
51
+ }
52
+ async pushBatch(items) {
53
+ const insert = this.db.transaction((rows) => {
54
+ for (const item of rows) {
55
+ this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
56
+ }
57
+ });
58
+ insert(items);
59
+ }
60
+ async pop() {
61
+ const row = this.stmts.pop.get();
62
+ if (!row)
63
+ return null;
64
+ this.stmts.deletePop.run(row.id);
65
+ return {
66
+ url: row.url,
67
+ depth: row.depth,
68
+ priority: row.priority ?? undefined,
69
+ discoveredFrom: row.discovered_from ?? undefined,
70
+ };
71
+ }
72
+ async hasVisited(url) {
73
+ return this.stmts.hasVisited.get(url) !== undefined;
74
+ }
75
+ async hasVisitedBatch(urls) {
76
+ const result = new Set();
77
+ for (const url of urls) {
78
+ if (this.stmts.hasVisited.get(url) !== undefined) {
79
+ result.add(url);
80
+ }
81
+ }
82
+ return result;
83
+ }
84
+ async markVisited(url) {
85
+ this.stmts.markVisited.run(url);
86
+ }
87
+ async size() {
88
+ const row = this.stmts.size.get();
89
+ return row.cnt;
90
+ }
91
+ async clear() {
92
+ this.stmts.clearQueue.run();
93
+ this.stmts.clearVisited.run();
94
+ this.stmts.clearMetadata.run();
95
+ }
96
+ async close() {
97
+ this.db.close();
98
+ }
99
+ getVisitedSet() {
100
+ const rows = this.stmts.allVisited.all();
101
+ return new Set(rows.map((r) => r.url));
102
+ }
103
+ saveMetadata(key, value) {
104
+ this.stmts.saveMeta.run(key, value);
105
+ }
106
+ getMetadata(key) {
107
+ const row = this.stmts.getMeta.get(key);
108
+ return row?.value;
109
+ }
110
+ getAllMetadata() {
111
+ const rows = this.stmts.allMeta.all();
112
+ const result = {};
113
+ for (const row of rows) {
114
+ result[row.key] = row.value;
115
+ }
116
+ return result;
117
+ }
118
+ }
@@ -0,0 +1,26 @@
1
+ import type { CrawlStorageAdapter } from './crawl-storage.js';
2
+ import type { SpiderPageResult } from './spider.js';
3
/**
 * {@link CrawlStorageAdapter} that keeps crawl results and errors in SQLite
 * tables. Instances are obtained through {@link SqliteCrawlStorage.create}.
 */
export declare class SqliteCrawlStorage implements CrawlStorageAdapter {
    private db;
    private ownsDb;
    private stmts;
    private constructor();
    /**
     * Creates a storage instance.
     *
     * @param opts Optional settings. `db` is an already-open database handle to
     *   reuse; when it is absent a database is opened at `dbPath`, or at a
     *   generated file in the OS temp directory if `dbPath` is also absent.
     *   The handle is typed `unknown` because it is an opaque third-party
     *   object that callers only pass through.
     */
    static create(opts?: {
        dbPath?: string;
        db?: unknown;
    }): Promise<SqliteCrawlStorage>;
    private init;
    /** Stores one page result (serialised as JSON alongside its url and status). */
    saveResult(result: SpiderPageResult): Promise<void>;
    /** Stores one crawl error. */
    saveError(error: {
        url: string;
        error: string;
    }): Promise<void>;
    /** Resolves to the number of stored results. */
    getResultCount(): Promise<number>;
    /** Resolves to every stored result, parsed back from its JSON form. */
    getResults(): Promise<SpiderPageResult[]>;
    /** Resolves to every stored error. */
    getErrors(): Promise<Array<{
        url: string;
        error: string;
    }>>;
    /** Deletes all stored results and errors. */
    clear(): Promise<void>;
    /** Closes the database, but only when this instance opened it itself. */
    close(): Promise<void>;
}
@@ -0,0 +1,76 @@
1
+ import * as path from 'node:path';
2
+ import * as os from 'node:os';
3
+ import * as crypto from 'node:crypto';
4
+ export class SqliteCrawlStorage {
5
+ db;
6
+ ownsDb;
7
+ stmts;
8
+ constructor(db, ownsDb) {
9
+ this.db = db;
10
+ this.ownsDb = ownsDb;
11
+ }
12
+ static async create(opts) {
13
+ let db = opts?.db;
14
+ let ownsDb = false;
15
+ if (!db) {
16
+ const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-storage-${crypto.randomUUID().slice(0, 8)}.db`);
17
+ const BetterSqlite3 = (await import('better-sqlite3')).default;
18
+ db = new BetterSqlite3(dbPath);
19
+ db.pragma('journal_mode = WAL');
20
+ ownsDb = true;
21
+ }
22
+ const instance = new SqliteCrawlStorage(db, ownsDb);
23
+ instance.init();
24
+ return instance;
25
+ }
26
+ init() {
27
+ this.db.exec(`
28
+ CREATE TABLE IF NOT EXISTS results (
29
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
30
+ url TEXT NOT NULL,
31
+ status INTEGER NOT NULL,
32
+ data TEXT NOT NULL
33
+ );
34
+ CREATE TABLE IF NOT EXISTS errors (
35
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
36
+ url TEXT NOT NULL,
37
+ error TEXT NOT NULL
38
+ );
39
+ `);
40
+ this.stmts = {
41
+ saveResult: this.db.prepare('INSERT INTO results (url, status, data) VALUES (?, ?, ?)'),
42
+ saveError: this.db.prepare('INSERT INTO errors (url, error) VALUES (?, ?)'),
43
+ resultCount: this.db.prepare('SELECT COUNT(*) AS cnt FROM results'),
44
+ allResults: this.db.prepare('SELECT data FROM results'),
45
+ allErrors: this.db.prepare('SELECT url, error FROM errors'),
46
+ clearResults: this.db.prepare('DELETE FROM results'),
47
+ clearErrors: this.db.prepare('DELETE FROM errors'),
48
+ };
49
+ }
50
+ async saveResult(result) {
51
+ this.stmts.saveResult.run(result.url, result.status, JSON.stringify(result));
52
+ }
53
+ async saveError(error) {
54
+ this.stmts.saveError.run(error.url, error.error);
55
+ }
56
+ async getResultCount() {
57
+ const row = this.stmts.resultCount.get();
58
+ return row.cnt;
59
+ }
60
+ async getResults() {
61
+ const rows = this.stmts.allResults.all();
62
+ return rows.map((r) => JSON.parse(r.data));
63
+ }
64
+ async getErrors() {
65
+ return this.stmts.allErrors.all();
66
+ }
67
+ async clear() {
68
+ this.stmts.clearResults.run();
69
+ this.stmts.clearErrors.run();
70
+ }
71
+ async close() {
72
+ if (this.ownsDb) {
73
+ this.db.close();
74
+ }
75
+ }
76
+ }
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
5
5
  seo?: boolean;
6
6
  output?: string;
7
7
  onSeoAnalysis?: (result: SeoPageResult) => void;
8
+ onBlocked?: (result: SeoPageResult) => void | Promise<void>;
9
+ onError?: (result: SeoPageResult) => void | Promise<void>;
8
10
  focusCategories?: string[];
9
11
  focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
10
12
  }
@@ -32,11 +32,25 @@ export class SeoSpider {
32
32
  }
33
33
  constructor(options = {}) {
34
34
  this.options = options;
35
+ const userOnPage = options.onPage;
35
36
  this.spider = new Spider({
36
37
  ...options,
37
- onPageWithHtml: this.options.seo
38
- ? async (pageResult, html) => {
39
- await this.analyzePageDuringCrawl(pageResult, html);
38
+ onPage: async (event) => {
39
+ if (this.options.seo && event.html) {
40
+ await this.analyzePageDuringCrawl(event.result, event.html);
41
+ }
42
+ if (userOnPage) {
43
+ await userOnPage(event);
44
+ }
45
+ },
46
+ onBlocked: this.options.onBlocked
47
+ ? async (pageResult) => {
48
+ await this.options.onBlocked({ ...pageResult });
49
+ }
50
+ : undefined,
51
+ onError: this.options.onError
52
+ ? async (pageResult) => {
53
+ await this.options.onError({ ...pageResult });
40
54
  }
41
55
  : undefined,
42
56
  });
@@ -102,7 +102,8 @@ export class SpiderRunner extends CommandEmitter {
102
102
  extract,
103
103
  include: include?.map(p => new RegExp(p)),
104
104
  exclude: exclude?.map(p => new RegExp(p)),
105
- onPage: (page) => {
105
+ onPage: (event) => {
106
+ const page = event.result;
106
107
  collectPageMetrics(page);
107
108
  pages.push({
108
109
  url: page.url,
package/dist/cli/index.js CHANGED
@@ -51,9 +51,6 @@ async function main() {
51
51
  const { handleRequest } = await import('./handler.js');
52
52
  const { resolvePreset } = await import('./presets.js');
53
53
  const presets = await import('../presets/index.js');
54
- import('../utils/binary-manager.js')
55
- .then(({ ensureCurlImpersonate }) => ensureCurlImpersonate(console))
56
- .catch(() => { });
57
54
  const version = await getVersion();
58
55
  function parseMixedArgs(args, initialClientOptions = {}) {
59
56
  const headers = { ...initialClientOptions.headers };
@@ -57,7 +57,8 @@ export class SpiderJob {
57
57
  errors: 0,
58
58
  });
59
59
  },
60
- onPage: (result) => {
60
+ onPage: (event) => {
61
+ const result = event.result;
61
62
  if (result.error) {
62
63
  const currentProgress = this.job.progress;
63
64
  this.manager.updateProgress(this.job.id, {
@@ -546,18 +546,27 @@ ${targetType === 'ecommerce' ? `
546
546
  \`\`\`typescript
547
547
  import { Spider } from 'recker';
548
548
 
549
- const spider = new Spider('https://example.com', {
549
+ const spider = new Spider({
550
550
  maxPages: 100,
551
551
  maxDepth: 3,
552
552
  respectRobotsTxt: true,
553
553
  delay: 1000,
554
+ onPage: async ({ result, html, document }) => {
555
+ console.log('Scraped:', result.url);
556
+ if (document) {
557
+ const doc = await document();
558
+ console.log('Title:', doc.selectFirst('title').text());
559
+ }
560
+ },
561
+ onBlocked: (result) => {
562
+ console.warn('Blocked:', result.url, result.security?.reason);
563
+ },
564
+ onError: (result) => {
565
+ console.error('Error:', result.url, result.error);
566
+ },
554
567
  });
555
568
 
556
- spider.on('page', ({ url, doc }) => {
557
- console.log('Scraped:', url);
558
- });
559
-
560
- await spider.crawl();
569
+ await spider.crawl('https://example.com');
561
570
  \`\`\`
562
571
 
563
572
  Please provide a complete workflow including:
@@ -3,10 +3,12 @@ export type { Options as ParserOptions } from './parser/index.js';
3
3
  export { ScrapeDocument } from './document.js';
4
4
  export { ScrapeElement } from './element.js';
5
5
  export { Spider, spider } from './spider.js';
6
- export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
6
+ export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
7
7
  export { InMemoryCrawlQueue } from './crawl-queue.js';
8
+ export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
8
9
  export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
9
10
  export { InMemoryCrawlStorage } from './crawl-storage.js';
11
+ export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
10
12
  export type { CrawlStorageAdapter } from './crawl-storage.js';
11
13
  export { ListProxyAdapter } from './proxy-adapter.js';
12
14
  export type { ProxyAdapter } from './proxy-adapter.js';
@@ -3,6 +3,8 @@ export { ScrapeDocument } from './document.js';
3
3
  export { ScrapeElement } from './element.js';
4
4
  export { Spider, spider } from './spider.js';
5
5
  export { InMemoryCrawlQueue } from './crawl-queue.js';
6
+ export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
6
7
  export { InMemoryCrawlStorage } from './crawl-storage.js';
8
+ export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
7
9
  export { ListProxyAdapter } from './proxy-adapter.js';
8
10
  export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
@@ -1,3 +1,4 @@
1
+ import { ScrapeDocument } from './document.js';
1
2
  import type { ExtractedLink, ExtractionSchema } from './types.js';
2
3
  import type { Options as ParserOptions } from './parser/index.js';
3
4
  import { type SitemapUrl } from '../seo/validators/sitemap.js';
@@ -31,8 +32,7 @@ export interface SpiderOptions {
31
32
  proxy?: string | string[] | ProxyAdapter;
32
33
  transport?: SpiderTransport;
33
34
  preferCurlFirst?: boolean;
34
- onPage?: (result: SpiderPageResult) => void;
35
- onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
35
+ onPage?: (event: SpiderPageEvent) => void | Promise<void>;
36
36
  onCaptchaDetected?: (result: {
37
37
  url: string;
38
38
  status: number;
@@ -40,9 +40,31 @@ export interface SpiderOptions {
40
40
  provider?: CaptchaProvider;
41
41
  usedCurl: boolean;
42
42
  }) => void | Promise<void>;
43
+ onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
44
+ onError?: (result: SpiderPageResult) => void | Promise<void>;
45
+ onRetry?: (info: {
46
+ url: string;
47
+ attempt: number;
48
+ maxAttempts: number;
49
+ reason?: string;
50
+ delay: number;
51
+ transport: SpiderTransport;
52
+ previousStatus: number;
53
+ timings?: SpiderPageResult['timings'];
54
+ }) => void | Promise<void>;
55
+ onRedirect?: (info: {
56
+ from: string;
57
+ to: string;
58
+ status: number;
59
+ }) => void | Promise<void>;
43
60
  onProgress?: (progress: SpiderProgress) => void;
44
61
  extract?: string[] | ExtractionSchema;
45
62
  parserOptions?: Partial<ParserOptions>;
63
+ domainRateLimit?: {
64
+ maxPerSecond?: number;
65
+ };
66
+ deduplicateContent?: boolean;
67
+ resume?: boolean;
46
68
  crawlQueue?: CrawlQueueAdapter;
47
69
  crawlStorage?: CrawlStorageAdapter;
48
70
  }
@@ -102,6 +124,14 @@ export interface SpiderPageResult {
102
124
  stylesheets: number;
103
125
  };
104
126
  extracted?: Record<string, unknown>;
127
+ contentHash?: string;
128
+ isDuplicate?: boolean;
129
+ duplicateOf?: string;
130
+ }
131
+ export interface SpiderPageEvent {
132
+ result: SpiderPageResult;
133
+ html?: string;
134
+ document?: () => Promise<ScrapeDocument>;
105
135
  }
106
136
  export interface SpiderProgress {
107
137
  crawled: number;
@@ -162,7 +192,10 @@ export declare class Spider {
162
192
  private baseHost;
163
193
  private running;
164
194
  private aborted;
195
+ private abortController;
165
196
  private pendingCount;
197
+ private domainRequestTimestamps;
198
+ private contentHashes;
166
199
  private blockedDomains;
167
200
  private curlTransport;
168
201
  private curlAvailable;
@@ -172,6 +205,7 @@ export declare class Spider {
172
205
  private robotsData;
173
206
  private sitemapValidation;
174
207
  private robotsValidation;
208
+ private waitForDomainRateLimit;
175
209
  private toHeaderRecord;
176
210
  constructor(options?: SpiderOptions);
177
211
  crawl(startUrl: string): Promise<SpiderResult>;