recker 1.0.93 → 1.0.94-next.83dffd9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/index.d.ts +3 -1
- package/dist/browser/scrape/index.js +2 -0
- package/dist/browser/scrape/spider.d.ts +36 -2
- package/dist/browser/scrape/spider.js +209 -58
- package/dist/browser/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/browser/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/browser/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/browser/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -0
- package/dist/browser/seo/seo-spider.js +17 -3
- package/dist/cli/commands/spider-runner.js +2 -1
- package/dist/cli/index.js +0 -3
- package/dist/cli/tui/jobs/spider-job.js +2 -1
- package/dist/mcp/prompts/index.js +15 -6
- package/dist/scrape/index.d.ts +3 -1
- package/dist/scrape/index.js +2 -0
- package/dist/scrape/spider.d.ts +36 -2
- package/dist/scrape/spider.js +209 -58
- package/dist/scrape/sqlite-crawl-queue.d.ts +24 -0
- package/dist/scrape/sqlite-crawl-queue.js +118 -0
- package/dist/scrape/sqlite-crawl-storage.d.ts +26 -0
- package/dist/scrape/sqlite-crawl-storage.js +76 -0
- package/dist/seo/seo-spider.d.ts +2 -0
- package/dist/seo/seo-spider.js +17 -3
- package/dist/version.js +1 -1
- package/package.json +7 -1
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import * as path from 'node:path';
|
|
2
|
+
import * as os from 'node:os';
|
|
3
|
+
import * as crypto from 'node:crypto';
|
|
4
|
+
export class SqliteCrawlQueue {
|
|
5
|
+
db;
|
|
6
|
+
stmts;
|
|
7
|
+
constructor() { }
|
|
8
|
+
static async create(opts) {
|
|
9
|
+
const instance = new SqliteCrawlQueue();
|
|
10
|
+
const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-crawl-${crypto.randomUUID().slice(0, 8)}.db`);
|
|
11
|
+
await instance.ensureDb(dbPath);
|
|
12
|
+
return instance;
|
|
13
|
+
}
|
|
14
|
+
async ensureDb(dbPath) {
|
|
15
|
+
const BetterSqlite3 = (await import('better-sqlite3')).default;
|
|
16
|
+
this.db = new BetterSqlite3(dbPath);
|
|
17
|
+
this.db.pragma('journal_mode = WAL');
|
|
18
|
+
this.db.exec(`
|
|
19
|
+
CREATE TABLE IF NOT EXISTS queue (
|
|
20
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
21
|
+
url TEXT NOT NULL,
|
|
22
|
+
depth INTEGER NOT NULL,
|
|
23
|
+
priority INTEGER,
|
|
24
|
+
discovered_from TEXT
|
|
25
|
+
);
|
|
26
|
+
CREATE TABLE IF NOT EXISTS visited (url TEXT PRIMARY KEY);
|
|
27
|
+
CREATE TABLE IF NOT EXISTS crawl_metadata (key TEXT PRIMARY KEY, value TEXT NOT NULL);
|
|
28
|
+
CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(priority ASC, id ASC);
|
|
29
|
+
`);
|
|
30
|
+
this.stmts = {
|
|
31
|
+
push: this.db.prepare('INSERT INTO queue (url, depth, priority, discovered_from) VALUES (?, ?, ?, ?)'),
|
|
32
|
+
pop: this.db.prepare('SELECT id, url, depth, priority, discovered_from FROM queue ORDER BY priority ASC NULLS LAST, id ASC LIMIT 1'),
|
|
33
|
+
deletePop: this.db.prepare('DELETE FROM queue WHERE id = ?'),
|
|
34
|
+
hasVisited: this.db.prepare('SELECT 1 FROM visited WHERE url = ?'),
|
|
35
|
+
markVisited: this.db.prepare('INSERT OR IGNORE INTO visited (url) VALUES (?)'),
|
|
36
|
+
size: this.db.prepare('SELECT COUNT(*) AS cnt FROM queue'),
|
|
37
|
+
clearQueue: this.db.prepare('DELETE FROM queue'),
|
|
38
|
+
clearVisited: this.db.prepare('DELETE FROM visited'),
|
|
39
|
+
clearMetadata: this.db.prepare('DELETE FROM crawl_metadata'),
|
|
40
|
+
allVisited: this.db.prepare('SELECT url FROM visited'),
|
|
41
|
+
saveMeta: this.db.prepare('INSERT OR REPLACE INTO crawl_metadata (key, value) VALUES (?, ?)'),
|
|
42
|
+
getMeta: this.db.prepare('SELECT value FROM crawl_metadata WHERE key = ?'),
|
|
43
|
+
allMeta: this.db.prepare('SELECT key, value FROM crawl_metadata'),
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
getDb() {
|
|
47
|
+
return this.db;
|
|
48
|
+
}
|
|
49
|
+
async push(item) {
|
|
50
|
+
this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
|
|
51
|
+
}
|
|
52
|
+
async pushBatch(items) {
|
|
53
|
+
const insert = this.db.transaction((rows) => {
|
|
54
|
+
for (const item of rows) {
|
|
55
|
+
this.stmts.push.run(item.url, item.depth, item.priority ?? null, item.discoveredFrom ?? null);
|
|
56
|
+
}
|
|
57
|
+
});
|
|
58
|
+
insert(items);
|
|
59
|
+
}
|
|
60
|
+
async pop() {
|
|
61
|
+
const row = this.stmts.pop.get();
|
|
62
|
+
if (!row)
|
|
63
|
+
return null;
|
|
64
|
+
this.stmts.deletePop.run(row.id);
|
|
65
|
+
return {
|
|
66
|
+
url: row.url,
|
|
67
|
+
depth: row.depth,
|
|
68
|
+
priority: row.priority ?? undefined,
|
|
69
|
+
discoveredFrom: row.discovered_from ?? undefined,
|
|
70
|
+
};
|
|
71
|
+
}
|
|
72
|
+
async hasVisited(url) {
|
|
73
|
+
return this.stmts.hasVisited.get(url) !== undefined;
|
|
74
|
+
}
|
|
75
|
+
async hasVisitedBatch(urls) {
|
|
76
|
+
const result = new Set();
|
|
77
|
+
for (const url of urls) {
|
|
78
|
+
if (this.stmts.hasVisited.get(url) !== undefined) {
|
|
79
|
+
result.add(url);
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
return result;
|
|
83
|
+
}
|
|
84
|
+
async markVisited(url) {
|
|
85
|
+
this.stmts.markVisited.run(url);
|
|
86
|
+
}
|
|
87
|
+
async size() {
|
|
88
|
+
const row = this.stmts.size.get();
|
|
89
|
+
return row.cnt;
|
|
90
|
+
}
|
|
91
|
+
async clear() {
|
|
92
|
+
this.stmts.clearQueue.run();
|
|
93
|
+
this.stmts.clearVisited.run();
|
|
94
|
+
this.stmts.clearMetadata.run();
|
|
95
|
+
}
|
|
96
|
+
async close() {
|
|
97
|
+
this.db.close();
|
|
98
|
+
}
|
|
99
|
+
getVisitedSet() {
|
|
100
|
+
const rows = this.stmts.allVisited.all();
|
|
101
|
+
return new Set(rows.map((r) => r.url));
|
|
102
|
+
}
|
|
103
|
+
saveMetadata(key, value) {
|
|
104
|
+
this.stmts.saveMeta.run(key, value);
|
|
105
|
+
}
|
|
106
|
+
getMetadata(key) {
|
|
107
|
+
const row = this.stmts.getMeta.get(key);
|
|
108
|
+
return row?.value;
|
|
109
|
+
}
|
|
110
|
+
getAllMetadata() {
|
|
111
|
+
const rows = this.stmts.allMeta.all();
|
|
112
|
+
const result = {};
|
|
113
|
+
for (const row of rows) {
|
|
114
|
+
result[row.key] = row.value;
|
|
115
|
+
}
|
|
116
|
+
return result;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type { CrawlStorageAdapter } from './crawl-storage.js';
|
|
2
|
+
import type { SpiderPageResult } from './spider.js';
|
|
3
|
+
/**
 * SQLite-backed implementation of {@link CrawlStorageAdapter} that persists
 * crawled page results and crawl errors.
 *
 * Instances are obtained through {@link SqliteCrawlStorage.create}; the
 * constructor is private.
 */
export declare class SqliteCrawlStorage implements CrawlStorageAdapter {
    private db;
    private ownsDb;
    private stmts;
    private constructor();
    /**
     * Builds a storage instance.
     *
     * @param opts Either an already opened database handle (`db`) to reuse, or
     *   a `dbPath` for a database this instance opens itself. Both are optional.
     */
    static create(opts?: { dbPath?: string; db?: any }): Promise<SqliteCrawlStorage>;
    private init;
    /** Persists one crawled page result. */
    saveResult(result: SpiderPageResult): Promise<void>;
    /** Persists one crawl error (the URL and its error message). */
    saveError(error: { url: string; error: string }): Promise<void>;
    /** Resolves with the number of stored page results. */
    getResultCount(): Promise<number>;
    /** Resolves with every stored page result. */
    getResults(): Promise<Array<SpiderPageResult>>;
    /** Resolves with every stored crawl error. */
    getErrors(): Promise<{ url: string; error: string }[]>;
    /** Removes all stored results and errors. */
    clear(): Promise<void>;
    /** Releases the storage; see the implementation for handle ownership. */
    close(): Promise<void>;
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import * as path from 'node:path';
|
|
2
|
+
import * as os from 'node:os';
|
|
3
|
+
import * as crypto from 'node:crypto';
|
|
4
|
+
export class SqliteCrawlStorage {
|
|
5
|
+
db;
|
|
6
|
+
ownsDb;
|
|
7
|
+
stmts;
|
|
8
|
+
constructor(db, ownsDb) {
|
|
9
|
+
this.db = db;
|
|
10
|
+
this.ownsDb = ownsDb;
|
|
11
|
+
}
|
|
12
|
+
static async create(opts) {
|
|
13
|
+
let db = opts?.db;
|
|
14
|
+
let ownsDb = false;
|
|
15
|
+
if (!db) {
|
|
16
|
+
const dbPath = opts?.dbPath ?? path.join(os.tmpdir(), `recker-storage-${crypto.randomUUID().slice(0, 8)}.db`);
|
|
17
|
+
const BetterSqlite3 = (await import('better-sqlite3')).default;
|
|
18
|
+
db = new BetterSqlite3(dbPath);
|
|
19
|
+
db.pragma('journal_mode = WAL');
|
|
20
|
+
ownsDb = true;
|
|
21
|
+
}
|
|
22
|
+
const instance = new SqliteCrawlStorage(db, ownsDb);
|
|
23
|
+
instance.init();
|
|
24
|
+
return instance;
|
|
25
|
+
}
|
|
26
|
+
init() {
|
|
27
|
+
this.db.exec(`
|
|
28
|
+
CREATE TABLE IF NOT EXISTS results (
|
|
29
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
30
|
+
url TEXT NOT NULL,
|
|
31
|
+
status INTEGER NOT NULL,
|
|
32
|
+
data TEXT NOT NULL
|
|
33
|
+
);
|
|
34
|
+
CREATE TABLE IF NOT EXISTS errors (
|
|
35
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
36
|
+
url TEXT NOT NULL,
|
|
37
|
+
error TEXT NOT NULL
|
|
38
|
+
);
|
|
39
|
+
`);
|
|
40
|
+
this.stmts = {
|
|
41
|
+
saveResult: this.db.prepare('INSERT INTO results (url, status, data) VALUES (?, ?, ?)'),
|
|
42
|
+
saveError: this.db.prepare('INSERT INTO errors (url, error) VALUES (?, ?)'),
|
|
43
|
+
resultCount: this.db.prepare('SELECT COUNT(*) AS cnt FROM results'),
|
|
44
|
+
allResults: this.db.prepare('SELECT data FROM results'),
|
|
45
|
+
allErrors: this.db.prepare('SELECT url, error FROM errors'),
|
|
46
|
+
clearResults: this.db.prepare('DELETE FROM results'),
|
|
47
|
+
clearErrors: this.db.prepare('DELETE FROM errors'),
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
async saveResult(result) {
|
|
51
|
+
this.stmts.saveResult.run(result.url, result.status, JSON.stringify(result));
|
|
52
|
+
}
|
|
53
|
+
async saveError(error) {
|
|
54
|
+
this.stmts.saveError.run(error.url, error.error);
|
|
55
|
+
}
|
|
56
|
+
async getResultCount() {
|
|
57
|
+
const row = this.stmts.resultCount.get();
|
|
58
|
+
return row.cnt;
|
|
59
|
+
}
|
|
60
|
+
async getResults() {
|
|
61
|
+
const rows = this.stmts.allResults.all();
|
|
62
|
+
return rows.map((r) => JSON.parse(r.data));
|
|
63
|
+
}
|
|
64
|
+
async getErrors() {
|
|
65
|
+
return this.stmts.allErrors.all();
|
|
66
|
+
}
|
|
67
|
+
async clear() {
|
|
68
|
+
this.stmts.clearResults.run();
|
|
69
|
+
this.stmts.clearErrors.run();
|
|
70
|
+
}
|
|
71
|
+
async close() {
|
|
72
|
+
if (this.ownsDb) {
|
|
73
|
+
this.db.close();
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
@@ -5,6 +5,8 @@ export interface SeoSpiderOptions extends SpiderOptions {
|
|
|
5
5
|
seo?: boolean;
|
|
6
6
|
output?: string;
|
|
7
7
|
onSeoAnalysis?: (result: SeoPageResult) => void;
|
|
8
|
+
onBlocked?: (result: SeoPageResult) => void | Promise<void>;
|
|
9
|
+
onError?: (result: SeoPageResult) => void | Promise<void>;
|
|
8
10
|
focusCategories?: string[];
|
|
9
11
|
focusMode?: 'all' | 'links' | 'duplicates' | 'security' | 'ai' | 'resources';
|
|
10
12
|
}
|
|
@@ -32,11 +32,25 @@ export class SeoSpider {
|
|
|
32
32
|
}
|
|
33
33
|
constructor(options = {}) {
|
|
34
34
|
this.options = options;
|
|
35
|
+
const userOnPage = options.onPage;
|
|
35
36
|
this.spider = new Spider({
|
|
36
37
|
...options,
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
await this.analyzePageDuringCrawl(
|
|
38
|
+
onPage: async (event) => {
|
|
39
|
+
if (this.options.seo && event.html) {
|
|
40
|
+
await this.analyzePageDuringCrawl(event.result, event.html);
|
|
41
|
+
}
|
|
42
|
+
if (userOnPage) {
|
|
43
|
+
await userOnPage(event);
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
onBlocked: this.options.onBlocked
|
|
47
|
+
? async (pageResult) => {
|
|
48
|
+
await this.options.onBlocked({ ...pageResult });
|
|
49
|
+
}
|
|
50
|
+
: undefined,
|
|
51
|
+
onError: this.options.onError
|
|
52
|
+
? async (pageResult) => {
|
|
53
|
+
await this.options.onError({ ...pageResult });
|
|
40
54
|
}
|
|
41
55
|
: undefined,
|
|
42
56
|
});
|
|
@@ -102,7 +102,8 @@ export class SpiderRunner extends CommandEmitter {
|
|
|
102
102
|
extract,
|
|
103
103
|
include: include?.map(p => new RegExp(p)),
|
|
104
104
|
exclude: exclude?.map(p => new RegExp(p)),
|
|
105
|
-
onPage: (
|
|
105
|
+
onPage: (event) => {
|
|
106
|
+
const page = event.result;
|
|
106
107
|
collectPageMetrics(page);
|
|
107
108
|
pages.push({
|
|
108
109
|
url: page.url,
|
package/dist/cli/index.js
CHANGED
|
@@ -51,9 +51,6 @@ async function main() {
|
|
|
51
51
|
const { handleRequest } = await import('./handler.js');
|
|
52
52
|
const { resolvePreset } = await import('./presets.js');
|
|
53
53
|
const presets = await import('../presets/index.js');
|
|
54
|
-
import('../utils/binary-manager.js')
|
|
55
|
-
.then(({ ensureCurlImpersonate }) => ensureCurlImpersonate(console))
|
|
56
|
-
.catch(() => { });
|
|
57
54
|
const version = await getVersion();
|
|
58
55
|
function parseMixedArgs(args, initialClientOptions = {}) {
|
|
59
56
|
const headers = { ...initialClientOptions.headers };
|
|
@@ -546,18 +546,27 @@ ${targetType === 'ecommerce' ? `
|
|
|
546
546
|
\`\`\`typescript
|
|
547
547
|
import { Spider } from 'recker';
|
|
548
548
|
|
|
549
|
-
const spider = new Spider(
|
|
549
|
+
const spider = new Spider({
|
|
550
550
|
maxPages: 100,
|
|
551
551
|
maxDepth: 3,
|
|
552
552
|
respectRobotsTxt: true,
|
|
553
553
|
delay: 1000,
|
|
554
|
+
onPage: async ({ result, html, document }) => {
|
|
555
|
+
console.log('Scraped:', result.url);
|
|
556
|
+
if (document) {
|
|
557
|
+
const doc = await document();
|
|
558
|
+
console.log('Title:', doc.selectFirst('title').text());
|
|
559
|
+
}
|
|
560
|
+
},
|
|
561
|
+
onBlocked: (result) => {
|
|
562
|
+
console.warn('Blocked:', result.url, result.security?.reason);
|
|
563
|
+
},
|
|
564
|
+
onError: (result) => {
|
|
565
|
+
console.error('Error:', result.url, result.error);
|
|
566
|
+
},
|
|
554
567
|
});
|
|
555
568
|
|
|
556
|
-
spider.
|
|
557
|
-
console.log('Scraped:', url);
|
|
558
|
-
});
|
|
559
|
-
|
|
560
|
-
await spider.crawl();
|
|
569
|
+
await spider.crawl('https://example.com');
|
|
561
570
|
\`\`\`
|
|
562
571
|
|
|
563
572
|
Please provide a complete workflow including:
|
package/dist/scrape/index.d.ts
CHANGED
|
@@ -3,10 +3,12 @@ export type { Options as ParserOptions } from './parser/index.js';
|
|
|
3
3
|
export { ScrapeDocument } from './document.js';
|
|
4
4
|
export { ScrapeElement } from './element.js';
|
|
5
5
|
export { Spider, spider } from './spider.js';
|
|
6
|
-
export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
|
|
6
|
+
export type { SpiderOptions, SpiderPageResult, SpiderPageEvent, SpiderProgress, SpiderResult, } from './spider.js';
|
|
7
7
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
8
|
+
export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
|
|
8
9
|
export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
|
|
9
10
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
11
|
+
export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
|
|
10
12
|
export type { CrawlStorageAdapter } from './crawl-storage.js';
|
|
11
13
|
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
12
14
|
export type { ProxyAdapter } from './proxy-adapter.js';
|
package/dist/scrape/index.js
CHANGED
|
@@ -3,6 +3,8 @@ export { ScrapeDocument } from './document.js';
|
|
|
3
3
|
export { ScrapeElement } from './element.js';
|
|
4
4
|
export { Spider, spider } from './spider.js';
|
|
5
5
|
export { InMemoryCrawlQueue } from './crawl-queue.js';
|
|
6
|
+
export { SqliteCrawlQueue } from './sqlite-crawl-queue.js';
|
|
6
7
|
export { InMemoryCrawlStorage } from './crawl-storage.js';
|
|
8
|
+
export { SqliteCrawlStorage } from './sqlite-crawl-storage.js';
|
|
7
9
|
export { ListProxyAdapter } from './proxy-adapter.js';
|
|
8
10
|
export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
package/dist/scrape/spider.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { ScrapeDocument } from './document.js';
|
|
1
2
|
import type { ExtractedLink, ExtractionSchema } from './types.js';
|
|
2
3
|
import type { Options as ParserOptions } from './parser/index.js';
|
|
3
4
|
import { type SitemapUrl } from '../seo/validators/sitemap.js';
|
|
@@ -31,8 +32,7 @@ export interface SpiderOptions {
|
|
|
31
32
|
proxy?: string | string[] | ProxyAdapter;
|
|
32
33
|
transport?: SpiderTransport;
|
|
33
34
|
preferCurlFirst?: boolean;
|
|
34
|
-
onPage?: (
|
|
35
|
-
onPageWithHtml?: (result: SpiderPageResult, html: string) => void | Promise<void>;
|
|
35
|
+
onPage?: (event: SpiderPageEvent) => void | Promise<void>;
|
|
36
36
|
onCaptchaDetected?: (result: {
|
|
37
37
|
url: string;
|
|
38
38
|
status: number;
|
|
@@ -40,9 +40,31 @@ export interface SpiderOptions {
|
|
|
40
40
|
provider?: CaptchaProvider;
|
|
41
41
|
usedCurl: boolean;
|
|
42
42
|
}) => void | Promise<void>;
|
|
43
|
+
onBlocked?: (result: SpiderPageResult) => void | Promise<void>;
|
|
44
|
+
onError?: (result: SpiderPageResult) => void | Promise<void>;
|
|
45
|
+
onRetry?: (info: {
|
|
46
|
+
url: string;
|
|
47
|
+
attempt: number;
|
|
48
|
+
maxAttempts: number;
|
|
49
|
+
reason?: string;
|
|
50
|
+
delay: number;
|
|
51
|
+
transport: SpiderTransport;
|
|
52
|
+
previousStatus: number;
|
|
53
|
+
timings?: SpiderPageResult['timings'];
|
|
54
|
+
}) => void | Promise<void>;
|
|
55
|
+
onRedirect?: (info: {
|
|
56
|
+
from: string;
|
|
57
|
+
to: string;
|
|
58
|
+
status: number;
|
|
59
|
+
}) => void | Promise<void>;
|
|
43
60
|
onProgress?: (progress: SpiderProgress) => void;
|
|
44
61
|
extract?: string[] | ExtractionSchema;
|
|
45
62
|
parserOptions?: Partial<ParserOptions>;
|
|
63
|
+
domainRateLimit?: {
|
|
64
|
+
maxPerSecond?: number;
|
|
65
|
+
};
|
|
66
|
+
deduplicateContent?: boolean;
|
|
67
|
+
resume?: boolean;
|
|
46
68
|
crawlQueue?: CrawlQueueAdapter;
|
|
47
69
|
crawlStorage?: CrawlStorageAdapter;
|
|
48
70
|
}
|
|
@@ -102,6 +124,14 @@ export interface SpiderPageResult {
|
|
|
102
124
|
stylesheets: number;
|
|
103
125
|
};
|
|
104
126
|
extracted?: Record<string, unknown>;
|
|
127
|
+
contentHash?: string;
|
|
128
|
+
isDuplicate?: boolean;
|
|
129
|
+
duplicateOf?: string;
|
|
130
|
+
}
|
|
131
|
+
export interface SpiderPageEvent {
|
|
132
|
+
result: SpiderPageResult;
|
|
133
|
+
html?: string;
|
|
134
|
+
document?: () => Promise<ScrapeDocument>;
|
|
105
135
|
}
|
|
106
136
|
export interface SpiderProgress {
|
|
107
137
|
crawled: number;
|
|
@@ -162,7 +192,10 @@ export declare class Spider {
|
|
|
162
192
|
private baseHost;
|
|
163
193
|
private running;
|
|
164
194
|
private aborted;
|
|
195
|
+
private abortController;
|
|
165
196
|
private pendingCount;
|
|
197
|
+
private domainRequestTimestamps;
|
|
198
|
+
private contentHashes;
|
|
166
199
|
private blockedDomains;
|
|
167
200
|
private curlTransport;
|
|
168
201
|
private curlAvailable;
|
|
@@ -172,6 +205,7 @@ export declare class Spider {
|
|
|
172
205
|
private robotsData;
|
|
173
206
|
private sitemapValidation;
|
|
174
207
|
private robotsValidation;
|
|
208
|
+
private waitForDomainRateLimit;
|
|
175
209
|
private toHeaderRecord;
|
|
176
210
|
constructor(options?: SpiderOptions);
|
|
177
211
|
crawl(startUrl: string): Promise<SpiderResult>;
|