s3db.js 18.0.11-next.1534f717 → 18.0.11-next.e8e71b5b
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/clients/recker-http-handler.js +56 -8
- package/dist/clients/recker-http-handler.js.map +1 -1
- package/dist/concerns/high-performance-inserter.js +6 -34
- package/dist/concerns/high-performance-inserter.js.map +1 -1
- package/dist/concerns/id/alphabets.js +150 -0
- package/dist/concerns/id/alphabets.js.map +1 -0
- package/dist/concerns/id/entropy.js +243 -0
- package/dist/concerns/id/entropy.js.map +1 -0
- package/dist/concerns/id/generators/nanoid.js +74 -0
- package/dist/concerns/id/generators/nanoid.js.map +1 -0
- package/dist/concerns/id/generators/sid.js +73 -0
- package/dist/concerns/id/generators/sid.js.map +1 -0
- package/dist/concerns/id/generators/ulid.js +208 -0
- package/dist/concerns/id/generators/ulid.js.map +1 -0
- package/dist/concerns/id/generators/uuid-v7.js +150 -0
- package/dist/concerns/id/generators/uuid-v7.js.map +1 -0
- package/dist/concerns/id/index.js +74 -0
- package/dist/concerns/id/index.js.map +1 -0
- package/dist/concerns/plugin-storage.js +114 -0
- package/dist/concerns/plugin-storage.js.map +1 -1
- package/dist/concerns/s3-errors.js +72 -0
- package/dist/concerns/s3-errors.js.map +1 -0
- package/dist/concerns/s3-key.js +54 -0
- package/dist/concerns/s3-key.js.map +1 -0
- package/dist/concerns/safe-merge.js +47 -0
- package/dist/concerns/safe-merge.js.map +1 -0
- package/dist/core/resource-config-validator.js +12 -2
- package/dist/core/resource-config-validator.js.map +1 -1
- package/dist/core/resource-partitions.class.js +12 -1
- package/dist/core/resource-partitions.class.js.map +1 -1
- package/dist/core/resource-persistence.class.js +41 -12
- package/dist/core/resource-persistence.class.js.map +1 -1
- package/dist/core/resource-query.class.js +21 -47
- package/dist/core/resource-query.class.js.map +1 -1
- package/dist/database/database-connection.class.js +3 -6
- package/dist/database/database-connection.class.js.map +1 -1
- package/dist/database/database-plugins.class.js +7 -13
- package/dist/database/database-plugins.class.js.map +1 -1
- package/dist/plugins/concerns/s3-mutex.class.js +155 -0
- package/dist/plugins/concerns/s3-mutex.class.js.map +1 -0
- package/dist/plugins/eventual-consistency/consolidation.js +4 -7
- package/dist/plugins/eventual-consistency/consolidation.js.map +1 -1
- package/dist/plugins/eventual-consistency/garbage-collection.js +3 -6
- package/dist/plugins/eventual-consistency/garbage-collection.js.map +1 -1
- package/dist/plugins/queue-consumer.plugin.js +10 -16
- package/dist/plugins/queue-consumer.plugin.js.map +1 -1
- package/dist/plugins/recon/managers/scheduler-manager.js +3 -5
- package/dist/plugins/recon/managers/scheduler-manager.js.map +1 -1
- package/dist/plugins/recon/stages/recker-asn-stage.js +279 -0
- package/dist/plugins/recon/stages/recker-asn-stage.js.map +1 -0
- package/dist/plugins/recon/stages/recker-dns-stage.js +227 -0
- package/dist/plugins/recon/stages/recker-dns-stage.js.map +1 -0
- package/dist/plugins/recon/stages/recker-scrape-stage.js +369 -0
- package/dist/plugins/recon/stages/recker-scrape-stage.js.map +1 -0
- package/dist/plugins/replicator.plugin.js +13 -31
- package/dist/plugins/replicator.plugin.js.map +1 -1
- package/dist/plugins/replicators/base-replicator.class.js +10 -23
- package/dist/plugins/replicators/base-replicator.class.js.map +1 -1
- package/dist/plugins/spider/recker-link-discoverer.js +544 -0
- package/dist/plugins/spider/recker-link-discoverer.js.map +1 -0
- package/dist/plugins/spider/recker-llms-validator.js +334 -0
- package/dist/plugins/spider/recker-llms-validator.js.map +1 -0
- package/dist/plugins/spider/recker-robots-validator.js +336 -0
- package/dist/plugins/spider/recker-robots-validator.js.map +1 -0
- package/dist/plugins/spider/recker-security-adapter.js +325 -0
- package/dist/plugins/spider/recker-security-adapter.js.map +1 -0
- package/dist/plugins/spider/recker-seo-adapter.js +399 -0
- package/dist/plugins/spider/recker-seo-adapter.js.map +1 -0
- package/dist/plugins/spider/recker-sitemap-validator.js +406 -0
- package/dist/plugins/spider/recker-sitemap-validator.js.map +1 -0
- package/dist/resource.class.js +2 -0
- package/dist/resource.class.js.map +1 -1
- package/dist/s3db.cjs +444 -219
- package/dist/s3db.cjs.map +1 -1
- package/dist/s3db.es.js +445 -220
- package/dist/s3db.es.js.map +1 -1
- package/dist/stream/resource-reader.class.js +5 -7
- package/dist/stream/resource-reader.class.js.map +1 -1
- package/dist/stream/resource-writer.class.js +5 -7
- package/dist/stream/resource-writer.class.js.map +1 -1
- package/dist/tasks/tasks-pool.class.js +31 -0
- package/dist/tasks/tasks-pool.class.js.map +1 -1
- package/dist/types/clients/recker-http-handler.d.ts +1 -0
- package/dist/types/clients/recker-http-handler.d.ts.map +1 -1
- package/dist/types/clients/types.d.ts +14 -0
- package/dist/types/clients/types.d.ts.map +1 -1
- package/dist/types/concerns/high-performance-inserter.d.ts.map +1 -1
- package/dist/types/concerns/id/alphabets.d.ts +125 -0
- package/dist/types/concerns/id/alphabets.d.ts.map +1 -0
- package/dist/types/concerns/id/entropy.d.ts +84 -0
- package/dist/types/concerns/id/entropy.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/nanoid.d.ts +46 -0
- package/dist/types/concerns/id/generators/nanoid.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/sid.d.ts +45 -0
- package/dist/types/concerns/id/generators/sid.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/ulid.d.ts +71 -0
- package/dist/types/concerns/id/generators/ulid.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/uuid-v7.d.ts +60 -0
- package/dist/types/concerns/id/generators/uuid-v7.d.ts.map +1 -0
- package/dist/types/concerns/id/index.d.ts +51 -0
- package/dist/types/concerns/id/index.d.ts.map +1 -0
- package/dist/types/concerns/plugin-storage.d.ts +25 -0
- package/dist/types/concerns/plugin-storage.d.ts.map +1 -1
- package/dist/types/concerns/s3-errors.d.ts +20 -0
- package/dist/types/concerns/s3-errors.d.ts.map +1 -0
- package/dist/types/concerns/s3-key.d.ts +30 -0
- package/dist/types/concerns/s3-key.d.ts.map +1 -0
- package/dist/types/concerns/safe-merge.d.ts +22 -0
- package/dist/types/concerns/safe-merge.d.ts.map +1 -0
- package/dist/types/core/resource-config-validator.d.ts.map +1 -1
- package/dist/types/core/resource-partitions.class.d.ts.map +1 -1
- package/dist/types/core/resource-persistence.class.d.ts.map +1 -1
- package/dist/types/core/resource-query.class.d.ts.map +1 -1
- package/dist/types/database/database-connection.class.d.ts.map +1 -1
- package/dist/types/database/database-plugins.class.d.ts.map +1 -1
- package/dist/types/plugins/concerns/s3-mutex.class.d.ts +30 -0
- package/dist/types/plugins/concerns/s3-mutex.class.d.ts.map +1 -0
- package/dist/types/plugins/eventual-consistency/consolidation.d.ts.map +1 -1
- package/dist/types/plugins/eventual-consistency/garbage-collection.d.ts.map +1 -1
- package/dist/types/plugins/queue-consumer.plugin.d.ts.map +1 -1
- package/dist/types/plugins/recon/managers/scheduler-manager.d.ts.map +1 -1
- package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts +90 -0
- package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts.map +1 -0
- package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts +125 -0
- package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts.map +1 -0
- package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts +96 -0
- package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts.map +1 -0
- package/dist/types/plugins/replicator.plugin.d.ts.map +1 -1
- package/dist/types/plugins/replicators/base-replicator.class.d.ts.map +1 -1
- package/dist/types/plugins/spider/recker-link-discoverer.d.ts +54 -0
- package/dist/types/plugins/spider/recker-link-discoverer.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-llms-validator.d.ts +105 -0
- package/dist/types/plugins/spider/recker-llms-validator.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-robots-validator.d.ts +92 -0
- package/dist/types/plugins/spider/recker-robots-validator.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-security-adapter.d.ts +83 -0
- package/dist/types/plugins/spider/recker-security-adapter.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-seo-adapter.d.ts +187 -0
- package/dist/types/plugins/spider/recker-seo-adapter.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-sitemap-validator.d.ts +121 -0
- package/dist/types/plugins/spider/recker-sitemap-validator.d.ts.map +1 -0
- package/dist/types/resource.class.d.ts.map +1 -1
- package/dist/types/stream/resource-reader.class.d.ts.map +1 -1
- package/dist/types/stream/resource-writer.class.d.ts.map +1 -1
- package/dist/types/tasks/tasks-pool.class.d.ts +23 -0
- package/dist/types/tasks/tasks-pool.class.d.ts.map +1 -1
- package/mcp/prompts/index.ts +275 -0
- package/mcp/resources/index.ts +322 -0
- package/mcp/tools/plugins.ts +1137 -0
- package/mcp/tools/streams.ts +340 -0
- package/package.json +20 -22
- package/src/clients/recker-http-handler.ts +74 -8
- package/src/clients/types.ts +14 -0
- package/src/concerns/high-performance-inserter.ts +18 -57
- package/src/concerns/id/alphabets.ts +175 -0
- package/src/concerns/id/entropy.ts +286 -0
- package/src/concerns/id/generators/sid.ts +90 -0
- package/src/concerns/id/generators/ulid.ts +249 -0
- package/src/concerns/id/generators/uuid-v7.ts +179 -0
- package/src/concerns/id/index.ts +167 -0
- package/src/concerns/plugin-storage.ts +144 -0
- package/src/concerns/s3-errors.ts +97 -0
- package/src/concerns/s3-key.ts +62 -0
- package/src/concerns/safe-merge.ts +60 -0
- package/src/core/resource-config-validator.ts +9 -2
- package/src/core/resource-partitions.class.ts +14 -1
- package/src/core/resource-persistence.class.ts +47 -13
- package/src/core/resource-query.class.ts +21 -46
- package/src/database/database-connection.class.ts +7 -6
- package/src/database/database-plugins.class.ts +15 -13
- package/src/plugins/concerns/s3-mutex.class.ts +228 -0
- package/src/plugins/eventual-consistency/consolidation.ts +8 -7
- package/src/plugins/eventual-consistency/garbage-collection.ts +7 -6
- package/src/plugins/queue-consumer.plugin.ts +21 -19
- package/src/plugins/recon/managers/scheduler-manager.ts +7 -5
- package/src/plugins/recon/stages/recker-asn-stage.ts +385 -0
- package/src/plugins/recon/stages/recker-dns-stage.ts +360 -0
- package/src/plugins/recon/stages/recker-scrape-stage.ts +509 -0
- package/src/plugins/replicator.plugin.ts +41 -35
- package/src/plugins/replicators/base-replicator.class.ts +17 -23
- package/src/plugins/spider/recker-link-discoverer.ts +645 -0
- package/src/plugins/spider/recker-llms-validator.ts +500 -0
- package/src/plugins/spider/recker-robots-validator.ts +473 -0
- package/src/plugins/spider/recker-security-adapter.ts +489 -0
- package/src/plugins/spider/recker-seo-adapter.ts +605 -0
- package/src/plugins/spider/recker-sitemap-validator.ts +621 -0
- package/src/resource.class.ts +2 -0
- package/src/stream/resource-reader.class.ts +10 -8
- package/src/stream/resource-writer.class.ts +10 -8
- package/src/tasks/tasks-pool.class.ts +46 -0
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
import { createHttpClient } from '#src/concerns/http-client.js';
|
|
2
|
+
import type { CrawlContext } from './crawl-context.js';
|
|
3
|
+
import type { RobotsCheckResult, RobotsParserConfig, CacheStats, ParsedRules } from './robots-parser.js';
|
|
4
|
+
|
|
5
|
+
// Result shape returned by recker's parseRobotsTxt(): a structured view of a
// robots.txt document plus summary flags computed by the parser.
type ReckerRobotsParseResult = {
  valid: boolean;
  // Parse problems, each tied to the robots.txt line they were found on.
  errors: Array<{ line: number; message: string }>;
  warnings: Array<{ line: number; message: string }>;
  // Flat list of every directive encountered, in document order.
  directives: Array<{
    type: 'user-agent' | 'allow' | 'disallow' | 'sitemap' | 'crawl-delay' | 'host' | 'clean-param';
    value: string;
    line: number;
  }>;
  // Directives grouped per user-agent section; one block may name several agents.
  userAgentBlocks: Array<{
    userAgents: string[];
    rules: Array<{ type: 'allow' | 'disallow'; path: string; line: number }>;
    // In seconds (consumers multiply by 1000 to get ms) — confirm against recker docs.
    crawlDelay?: number;
  }>;
  sitemaps: string[];
  host?: string;
  blocksAllRobots: boolean;
  blocksImportantPaths: boolean;
  // Presumably the size of the robots.txt content — verify against recker.
  size: number;
};
|
|
25
|
+
|
|
26
|
+
// A single finding produced by recker's robots.txt validation.
type ReckerRobotsValidationIssue = {
  type: 'error' | 'warning' | 'info';
  // Machine-readable issue identifier.
  code: string;
  message: string;
  // robots.txt line the issue refers to, when applicable.
  line?: number;
  // Suggested fix, when recker provides one.
  recommendation?: string;
};
|
|
33
|
+
|
|
34
|
+
// Aggregate validation outcome: overall verdict, individual issues, and the
// parse result the validation was derived from.
type ReckerRobotsValidationResult = {
  valid: boolean;
  issues: ReckerRobotsValidationIssue[];
  parseResult: ReckerRobotsParseResult;
};
|
|
39
|
+
|
|
40
|
+
// Signatures of the four helpers dynamically imported from
// 'recker/seo/validators/robots'; all stay null until the probe succeeds.
type ReckerParseRobotsTxt = (content: string) => ReckerRobotsParseResult;
type ReckerValidateRobotsTxt = (content: string, baseUrl?: string) => ReckerRobotsValidationResult;
type ReckerIsPathAllowed = (parseResult: ReckerRobotsParseResult, path: string, userAgent?: string) => boolean;
type ReckerFetchAndValidate = (url: string, fetcher?: (url: string) => Promise<{ status: number; text: string }>) => Promise<ReckerRobotsValidationResult & { exists: boolean; status?: number }>;
|
|
44
|
+
|
|
45
|
+
// One cached robots.txt lookup for a domain. Both results are null when the
// fetch failed (treated as "no robots.txt"); `timestamp` (epoch ms) drives
// expiry against config.cacheTimeout.
interface CacheEntry {
  parseResult: ReckerRobotsParseResult | null;
  validationResult: ReckerRobotsValidationResult | null;
  timestamp: number;
}
|
|
50
|
+
|
|
51
|
+
// Minimal surface of the client produced by createHttpClient() that this
// module relies on.
interface HttpClient {
  get(url: string): Promise<HttpResponse>;
}
|
|
54
|
+
|
|
55
|
+
// Minimal fetch-like response shape consumed when downloading robots.txt.
interface HttpResponse {
  ok: boolean;
  status: number;
  text(): Promise<string>;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Condensed validation report exposed by ReckerRobotsValidator.validate():
 * the recker validation verdict plus the parser's summary flags.
 */
export interface RobotsValidationDetails {
  valid: boolean;
  issues: ReckerRobotsValidationIssue[];
  blocksAllRobots: boolean;
  blocksImportantPaths: boolean;
  host?: string;
  // Size of the robots.txt content as reported by the parser.
  size: number;
}
|
|
69
|
+
|
|
70
|
+
export class ReckerRobotsValidator {
|
|
71
|
+
private config: Required<Omit<RobotsParserConfig, 'context' | 'fetcher'>> & {
|
|
72
|
+
context: CrawlContext | null;
|
|
73
|
+
fetcher: ((url: string) => Promise<string>) | null;
|
|
74
|
+
};
|
|
75
|
+
private _context: CrawlContext | null;
|
|
76
|
+
private cache: Map<string, CacheEntry>;
|
|
77
|
+
private fetcher: ((url: string) => Promise<string>) | null;
|
|
78
|
+
private _httpClient: HttpClient | null;
|
|
79
|
+
|
|
80
|
+
private reckerAvailable: boolean | null = null;
|
|
81
|
+
private parseRobotsTxt: ReckerParseRobotsTxt | null = null;
|
|
82
|
+
private validateRobotsTxt: ReckerValidateRobotsTxt | null = null;
|
|
83
|
+
private isPathAllowed: ReckerIsPathAllowed | null = null;
|
|
84
|
+
private fetchAndValidateRobotsTxt: ReckerFetchAndValidate | null = null;
|
|
85
|
+
private fallbackParser: import('./robots-parser.js').RobotsParser | null = null;
|
|
86
|
+
|
|
87
|
+
constructor(config: RobotsParserConfig = {}) {
|
|
88
|
+
this.config = {
|
|
89
|
+
userAgent: config.userAgent || 's3db-spider',
|
|
90
|
+
defaultAllow: config.defaultAllow !== false,
|
|
91
|
+
cacheTimeout: config.cacheTimeout || 3600000,
|
|
92
|
+
fetchTimeout: config.fetchTimeout || 10000,
|
|
93
|
+
context: config.context || null,
|
|
94
|
+
fetcher: config.fetcher || null
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
this._context = this.config.context;
|
|
98
|
+
this.cache = new Map();
|
|
99
|
+
this.fetcher = this.config.fetcher;
|
|
100
|
+
this._httpClient = null;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
private async _checkReckerAvailability(): Promise<boolean> {
|
|
104
|
+
if (this.reckerAvailable !== null) {
|
|
105
|
+
return this.reckerAvailable;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
try {
|
|
109
|
+
const robotsModule = await import('recker/seo/validators/robots');
|
|
110
|
+
this.parseRobotsTxt = robotsModule.parseRobotsTxt;
|
|
111
|
+
this.validateRobotsTxt = robotsModule.validateRobotsTxt;
|
|
112
|
+
this.isPathAllowed = robotsModule.isPathAllowed;
|
|
113
|
+
this.fetchAndValidateRobotsTxt = robotsModule.fetchAndValidateRobotsTxt;
|
|
114
|
+
this.reckerAvailable = true;
|
|
115
|
+
return true;
|
|
116
|
+
} catch {
|
|
117
|
+
this.reckerAvailable = false;
|
|
118
|
+
return false;
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
private async _getFallbackParser(): Promise<import('./robots-parser.js').RobotsParser> {
|
|
123
|
+
if (!this.fallbackParser) {
|
|
124
|
+
const { RobotsParser } = await import('./robots-parser.js');
|
|
125
|
+
this.fallbackParser = new RobotsParser(this.config);
|
|
126
|
+
}
|
|
127
|
+
return this.fallbackParser;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
setFetcher(fetcher: (url: string) => Promise<string>): void {
|
|
131
|
+
this.fetcher = fetcher;
|
|
132
|
+
if (this.fallbackParser) {
|
|
133
|
+
this.fallbackParser.setFetcher(fetcher);
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
async isAllowed(url: string): Promise<RobotsCheckResult> {
|
|
138
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
139
|
+
|
|
140
|
+
if (!isReckerAvailable) {
|
|
141
|
+
const fallback = await this._getFallbackParser();
|
|
142
|
+
return fallback.isAllowed(url);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
try {
|
|
146
|
+
const urlObj = new URL(url);
|
|
147
|
+
const domain = `${urlObj.protocol}//${urlObj.host}`;
|
|
148
|
+
const path = urlObj.pathname + urlObj.search;
|
|
149
|
+
|
|
150
|
+
const cached = await this._getCachedOrFetch(domain);
|
|
151
|
+
|
|
152
|
+
if (!cached.parseResult) {
|
|
153
|
+
return { allowed: this.config.defaultAllow, source: 'no-robots-txt' };
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
const allowed = this.isPathAllowed!(cached.parseResult, path, this.config.userAgent);
|
|
157
|
+
const crawlDelay = this._getCrawlDelayFromParseResult(cached.parseResult);
|
|
158
|
+
const matchedRule = this._findMatchedRule(cached.parseResult, path);
|
|
159
|
+
|
|
160
|
+
return {
|
|
161
|
+
allowed,
|
|
162
|
+
crawlDelay,
|
|
163
|
+
source: 'robots-txt',
|
|
164
|
+
matchedRule
|
|
165
|
+
};
|
|
166
|
+
|
|
167
|
+
} catch (error) {
|
|
168
|
+
return {
|
|
169
|
+
allowed: this.config.defaultAllow,
|
|
170
|
+
source: 'error',
|
|
171
|
+
error: (error as Error).message
|
|
172
|
+
};
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
private async _getCachedOrFetch(domain: string): Promise<CacheEntry> {
|
|
177
|
+
const cached = this.cache.get(domain);
|
|
178
|
+
if (cached && Date.now() - cached.timestamp < this.config.cacheTimeout) {
|
|
179
|
+
return cached;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
const robotsUrl = `${domain}/robots.txt`;
|
|
183
|
+
let content: string | null = null;
|
|
184
|
+
|
|
185
|
+
try {
|
|
186
|
+
if (this.fetcher) {
|
|
187
|
+
content = await this.fetcher(robotsUrl);
|
|
188
|
+
} else {
|
|
189
|
+
content = await this._fetchRobotsTxt(robotsUrl);
|
|
190
|
+
}
|
|
191
|
+
} catch {
|
|
192
|
+
const entry: CacheEntry = {
|
|
193
|
+
parseResult: null,
|
|
194
|
+
validationResult: null,
|
|
195
|
+
timestamp: Date.now()
|
|
196
|
+
};
|
|
197
|
+
this.cache.set(domain, entry);
|
|
198
|
+
return entry;
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
const parseResult = this.parseRobotsTxt!(content);
|
|
202
|
+
const validationResult = this.validateRobotsTxt!(content, domain);
|
|
203
|
+
|
|
204
|
+
const entry: CacheEntry = {
|
|
205
|
+
parseResult,
|
|
206
|
+
validationResult,
|
|
207
|
+
timestamp: Date.now()
|
|
208
|
+
};
|
|
209
|
+
|
|
210
|
+
this.cache.set(domain, entry);
|
|
211
|
+
return entry;
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
private async _getHttpClient(): Promise<HttpClient> {
|
|
215
|
+
if (!this._httpClient) {
|
|
216
|
+
const baseConfig = this._context
|
|
217
|
+
? this._context.getHttpClientConfig('https://example.com')
|
|
218
|
+
: {
|
|
219
|
+
headers: {
|
|
220
|
+
'User-Agent': this.config.userAgent
|
|
221
|
+
}
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
this._httpClient = await createHttpClient({
|
|
225
|
+
...baseConfig,
|
|
226
|
+
timeout: this.config.fetchTimeout,
|
|
227
|
+
retry: {
|
|
228
|
+
maxAttempts: 2,
|
|
229
|
+
delay: 500,
|
|
230
|
+
backoff: 'exponential',
|
|
231
|
+
retryAfter: true,
|
|
232
|
+
retryOn: [429, 500, 502, 503, 504]
|
|
233
|
+
}
|
|
234
|
+
}) as unknown as HttpClient;
|
|
235
|
+
}
|
|
236
|
+
return this._httpClient;
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
private async _fetchRobotsTxt(url: string): Promise<string> {
|
|
240
|
+
const client = await this._getHttpClient();
|
|
241
|
+
const response = await client.get(url);
|
|
242
|
+
|
|
243
|
+
if (this._context) {
|
|
244
|
+
this._context.processResponse(
|
|
245
|
+
response as unknown as Parameters<typeof this._context.processResponse>[0],
|
|
246
|
+
url
|
|
247
|
+
);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
if (!response.ok) {
|
|
251
|
+
throw new Error(`HTTP ${response.status}`);
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
return await response.text();
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
private _getCrawlDelayFromParseResult(parseResult: ReckerRobotsParseResult): number | null {
|
|
258
|
+
const userAgentLower = this.config.userAgent.toLowerCase();
|
|
259
|
+
|
|
260
|
+
for (const block of parseResult.userAgentBlocks) {
|
|
261
|
+
const agents = block.userAgents.map(a => a.toLowerCase());
|
|
262
|
+
|
|
263
|
+
if (agents.includes(userAgentLower)) {
|
|
264
|
+
return block.crawlDelay ? block.crawlDelay * 1000 : null;
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
for (const block of parseResult.userAgentBlocks) {
|
|
269
|
+
const agents = block.userAgents.map(a => a.toLowerCase());
|
|
270
|
+
|
|
271
|
+
for (const agent of agents) {
|
|
272
|
+
if (agent !== '*' && (agent.includes(userAgentLower) || userAgentLower.includes(agent))) {
|
|
273
|
+
return block.crawlDelay ? block.crawlDelay * 1000 : null;
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
for (const block of parseResult.userAgentBlocks) {
|
|
279
|
+
if (block.userAgents.map(a => a.toLowerCase()).includes('*')) {
|
|
280
|
+
return block.crawlDelay ? block.crawlDelay * 1000 : null;
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
return null;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
private _findMatchedRule(parseResult: ReckerRobotsParseResult, path: string): string | undefined {
|
|
288
|
+
const userAgentLower = this.config.userAgent.toLowerCase();
|
|
289
|
+
|
|
290
|
+
let targetBlock: ReckerRobotsParseResult['userAgentBlocks'][0] | null = null;
|
|
291
|
+
|
|
292
|
+
for (const block of parseResult.userAgentBlocks) {
|
|
293
|
+
const agents = block.userAgents.map(a => a.toLowerCase());
|
|
294
|
+
if (agents.includes(userAgentLower)) {
|
|
295
|
+
targetBlock = block;
|
|
296
|
+
break;
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
if (!targetBlock) {
|
|
301
|
+
for (const block of parseResult.userAgentBlocks) {
|
|
302
|
+
const agents = block.userAgents.map(a => a.toLowerCase());
|
|
303
|
+
for (const agent of agents) {
|
|
304
|
+
if (agent !== '*' && (agent.includes(userAgentLower) || userAgentLower.includes(agent))) {
|
|
305
|
+
targetBlock = block;
|
|
306
|
+
break;
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
if (targetBlock) break;
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
if (!targetBlock) {
|
|
314
|
+
for (const block of parseResult.userAgentBlocks) {
|
|
315
|
+
if (block.userAgents.map(a => a.toLowerCase()).includes('*')) {
|
|
316
|
+
targetBlock = block;
|
|
317
|
+
break;
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
if (!targetBlock) return undefined;
|
|
323
|
+
|
|
324
|
+
const sortedRules = [...targetBlock.rules].sort((a, b) => {
|
|
325
|
+
const lenA = a.path.replace(/\*/g, '').length;
|
|
326
|
+
const lenB = b.path.replace(/\*/g, '').length;
|
|
327
|
+
return lenB - lenA;
|
|
328
|
+
});
|
|
329
|
+
|
|
330
|
+
for (const rule of sortedRules) {
|
|
331
|
+
if (this._pathMatches(path, rule.path)) {
|
|
332
|
+
return rule.path;
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
return undefined;
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
private _pathMatches(path: string, pattern: string): boolean {
|
|
340
|
+
let escaped = pattern.replace(/[.+?^{}()|[\]\\]/g, '\\$&');
|
|
341
|
+
escaped = escaped.replace(/\*/g, '.*');
|
|
342
|
+
|
|
343
|
+
if (escaped.endsWith('$')) {
|
|
344
|
+
escaped = escaped.slice(0, -1) + '$';
|
|
345
|
+
} else {
|
|
346
|
+
escaped = escaped + '.*';
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
const regex = new RegExp(`^${escaped}$`, 'i');
|
|
350
|
+
return regex.test(path);
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
async getSitemaps(domain: string): Promise<string[]> {
|
|
354
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
355
|
+
|
|
356
|
+
if (!isReckerAvailable) {
|
|
357
|
+
const fallback = await this._getFallbackParser();
|
|
358
|
+
return fallback.getSitemaps(domain);
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
const cached = await this._getCachedOrFetch(domain);
|
|
362
|
+
return cached.parseResult?.sitemaps || [];
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
async getCrawlDelay(domain: string): Promise<number | null> {
|
|
366
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
367
|
+
|
|
368
|
+
if (!isReckerAvailable) {
|
|
369
|
+
const fallback = await this._getFallbackParser();
|
|
370
|
+
return fallback.getCrawlDelay(domain);
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
const cached = await this._getCachedOrFetch(domain);
|
|
374
|
+
if (!cached.parseResult) return null;
|
|
375
|
+
|
|
376
|
+
return this._getCrawlDelayFromParseResult(cached.parseResult);
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
async preload(domain: string): Promise<void> {
|
|
380
|
+
await this._getCachedOrFetch(domain);
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
clearCache(domain?: string): void {
|
|
384
|
+
if (domain) {
|
|
385
|
+
this.cache.delete(domain);
|
|
386
|
+
} else {
|
|
387
|
+
this.cache.clear();
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
getCacheStats(): CacheStats {
|
|
392
|
+
return {
|
|
393
|
+
size: this.cache.size,
|
|
394
|
+
domains: [...this.cache.keys()]
|
|
395
|
+
};
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
async validate(url: string): Promise<RobotsValidationDetails | null> {
|
|
399
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
400
|
+
|
|
401
|
+
if (!isReckerAvailable) {
|
|
402
|
+
return null;
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
try {
|
|
406
|
+
const urlObj = new URL(url);
|
|
407
|
+
const domain = `${urlObj.protocol}//${urlObj.host}`;
|
|
408
|
+
const cached = await this._getCachedOrFetch(domain);
|
|
409
|
+
|
|
410
|
+
if (!cached.validationResult || !cached.parseResult) {
|
|
411
|
+
return null;
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
return {
|
|
415
|
+
valid: cached.validationResult.valid,
|
|
416
|
+
issues: cached.validationResult.issues,
|
|
417
|
+
blocksAllRobots: cached.parseResult.blocksAllRobots,
|
|
418
|
+
blocksImportantPaths: cached.parseResult.blocksImportantPaths,
|
|
419
|
+
host: cached.parseResult.host,
|
|
420
|
+
size: cached.parseResult.size
|
|
421
|
+
};
|
|
422
|
+
} catch {
|
|
423
|
+
return null;
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
async validateContent(content: string, baseUrl?: string): Promise<ReckerRobotsValidationResult | null> {
|
|
428
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
429
|
+
|
|
430
|
+
if (!isReckerAvailable || !this.validateRobotsTxt) {
|
|
431
|
+
return null;
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
return this.validateRobotsTxt(content, baseUrl);
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
parseContent(content: string): ReckerRobotsParseResult | null {
|
|
438
|
+
if (!this.reckerAvailable || !this.parseRobotsTxt) {
|
|
439
|
+
return null;
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
return this.parseRobotsTxt(content);
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
getBlockingStatus(domain: string): {
|
|
446
|
+
blocksAllRobots: boolean;
|
|
447
|
+
blocksImportantPaths: boolean;
|
|
448
|
+
} | null {
|
|
449
|
+
const cached = this.cache.get(domain);
|
|
450
|
+
if (!cached?.parseResult) return null;
|
|
451
|
+
|
|
452
|
+
return {
|
|
453
|
+
blocksAllRobots: cached.parseResult.blocksAllRobots,
|
|
454
|
+
blocksImportantPaths: cached.parseResult.blocksImportantPaths
|
|
455
|
+
};
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
getHost(domain: string): string | null {
|
|
459
|
+
const cached = this.cache.get(domain);
|
|
460
|
+
return cached?.parseResult?.host || null;
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
async getValidationIssues(domain: string): Promise<ReckerRobotsValidationIssue[]> {
|
|
464
|
+
const cached = await this._getCachedOrFetch(domain);
|
|
465
|
+
return cached.validationResult?.issues || [];
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
isReckerEnabled(): boolean {
|
|
469
|
+
return this.reckerAvailable === true;
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
// Default export mirrors the named export for import-style flexibility.
export default ReckerRobotsValidator;
|