s3db.js 18.0.11-next.1534f717 → 18.0.11-next.e8e71b5b

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/dist/clients/recker-http-handler.js +56 -8
  2. package/dist/clients/recker-http-handler.js.map +1 -1
  3. package/dist/concerns/high-performance-inserter.js +6 -34
  4. package/dist/concerns/high-performance-inserter.js.map +1 -1
  5. package/dist/concerns/id/alphabets.js +150 -0
  6. package/dist/concerns/id/alphabets.js.map +1 -0
  7. package/dist/concerns/id/entropy.js +243 -0
  8. package/dist/concerns/id/entropy.js.map +1 -0
  9. package/dist/concerns/id/generators/nanoid.js +74 -0
  10. package/dist/concerns/id/generators/nanoid.js.map +1 -0
  11. package/dist/concerns/id/generators/sid.js +73 -0
  12. package/dist/concerns/id/generators/sid.js.map +1 -0
  13. package/dist/concerns/id/generators/ulid.js +208 -0
  14. package/dist/concerns/id/generators/ulid.js.map +1 -0
  15. package/dist/concerns/id/generators/uuid-v7.js +150 -0
  16. package/dist/concerns/id/generators/uuid-v7.js.map +1 -0
  17. package/dist/concerns/id/index.js +74 -0
  18. package/dist/concerns/id/index.js.map +1 -0
  19. package/dist/concerns/plugin-storage.js +114 -0
  20. package/dist/concerns/plugin-storage.js.map +1 -1
  21. package/dist/concerns/s3-errors.js +72 -0
  22. package/dist/concerns/s3-errors.js.map +1 -0
  23. package/dist/concerns/s3-key.js +54 -0
  24. package/dist/concerns/s3-key.js.map +1 -0
  25. package/dist/concerns/safe-merge.js +47 -0
  26. package/dist/concerns/safe-merge.js.map +1 -0
  27. package/dist/core/resource-config-validator.js +12 -2
  28. package/dist/core/resource-config-validator.js.map +1 -1
  29. package/dist/core/resource-partitions.class.js +12 -1
  30. package/dist/core/resource-partitions.class.js.map +1 -1
  31. package/dist/core/resource-persistence.class.js +41 -12
  32. package/dist/core/resource-persistence.class.js.map +1 -1
  33. package/dist/core/resource-query.class.js +21 -47
  34. package/dist/core/resource-query.class.js.map +1 -1
  35. package/dist/database/database-connection.class.js +3 -6
  36. package/dist/database/database-connection.class.js.map +1 -1
  37. package/dist/database/database-plugins.class.js +7 -13
  38. package/dist/database/database-plugins.class.js.map +1 -1
  39. package/dist/plugins/concerns/s3-mutex.class.js +155 -0
  40. package/dist/plugins/concerns/s3-mutex.class.js.map +1 -0
  41. package/dist/plugins/eventual-consistency/consolidation.js +4 -7
  42. package/dist/plugins/eventual-consistency/consolidation.js.map +1 -1
  43. package/dist/plugins/eventual-consistency/garbage-collection.js +3 -6
  44. package/dist/plugins/eventual-consistency/garbage-collection.js.map +1 -1
  45. package/dist/plugins/queue-consumer.plugin.js +10 -16
  46. package/dist/plugins/queue-consumer.plugin.js.map +1 -1
  47. package/dist/plugins/recon/managers/scheduler-manager.js +3 -5
  48. package/dist/plugins/recon/managers/scheduler-manager.js.map +1 -1
  49. package/dist/plugins/recon/stages/recker-asn-stage.js +279 -0
  50. package/dist/plugins/recon/stages/recker-asn-stage.js.map +1 -0
  51. package/dist/plugins/recon/stages/recker-dns-stage.js +227 -0
  52. package/dist/plugins/recon/stages/recker-dns-stage.js.map +1 -0
  53. package/dist/plugins/recon/stages/recker-scrape-stage.js +369 -0
  54. package/dist/plugins/recon/stages/recker-scrape-stage.js.map +1 -0
  55. package/dist/plugins/replicator.plugin.js +13 -31
  56. package/dist/plugins/replicator.plugin.js.map +1 -1
  57. package/dist/plugins/replicators/base-replicator.class.js +10 -23
  58. package/dist/plugins/replicators/base-replicator.class.js.map +1 -1
  59. package/dist/plugins/spider/recker-link-discoverer.js +544 -0
  60. package/dist/plugins/spider/recker-link-discoverer.js.map +1 -0
  61. package/dist/plugins/spider/recker-llms-validator.js +334 -0
  62. package/dist/plugins/spider/recker-llms-validator.js.map +1 -0
  63. package/dist/plugins/spider/recker-robots-validator.js +336 -0
  64. package/dist/plugins/spider/recker-robots-validator.js.map +1 -0
  65. package/dist/plugins/spider/recker-security-adapter.js +325 -0
  66. package/dist/plugins/spider/recker-security-adapter.js.map +1 -0
  67. package/dist/plugins/spider/recker-seo-adapter.js +399 -0
  68. package/dist/plugins/spider/recker-seo-adapter.js.map +1 -0
  69. package/dist/plugins/spider/recker-sitemap-validator.js +406 -0
  70. package/dist/plugins/spider/recker-sitemap-validator.js.map +1 -0
  71. package/dist/resource.class.js +2 -0
  72. package/dist/resource.class.js.map +1 -1
  73. package/dist/s3db.cjs +444 -219
  74. package/dist/s3db.cjs.map +1 -1
  75. package/dist/s3db.es.js +445 -220
  76. package/dist/s3db.es.js.map +1 -1
  77. package/dist/stream/resource-reader.class.js +5 -7
  78. package/dist/stream/resource-reader.class.js.map +1 -1
  79. package/dist/stream/resource-writer.class.js +5 -7
  80. package/dist/stream/resource-writer.class.js.map +1 -1
  81. package/dist/tasks/tasks-pool.class.js +31 -0
  82. package/dist/tasks/tasks-pool.class.js.map +1 -1
  83. package/dist/types/clients/recker-http-handler.d.ts +1 -0
  84. package/dist/types/clients/recker-http-handler.d.ts.map +1 -1
  85. package/dist/types/clients/types.d.ts +14 -0
  86. package/dist/types/clients/types.d.ts.map +1 -1
  87. package/dist/types/concerns/high-performance-inserter.d.ts.map +1 -1
  88. package/dist/types/concerns/id/alphabets.d.ts +125 -0
  89. package/dist/types/concerns/id/alphabets.d.ts.map +1 -0
  90. package/dist/types/concerns/id/entropy.d.ts +84 -0
  91. package/dist/types/concerns/id/entropy.d.ts.map +1 -0
  92. package/dist/types/concerns/id/generators/nanoid.d.ts +46 -0
  93. package/dist/types/concerns/id/generators/nanoid.d.ts.map +1 -0
  94. package/dist/types/concerns/id/generators/sid.d.ts +45 -0
  95. package/dist/types/concerns/id/generators/sid.d.ts.map +1 -0
  96. package/dist/types/concerns/id/generators/ulid.d.ts +71 -0
  97. package/dist/types/concerns/id/generators/ulid.d.ts.map +1 -0
  98. package/dist/types/concerns/id/generators/uuid-v7.d.ts +60 -0
  99. package/dist/types/concerns/id/generators/uuid-v7.d.ts.map +1 -0
  100. package/dist/types/concerns/id/index.d.ts +51 -0
  101. package/dist/types/concerns/id/index.d.ts.map +1 -0
  102. package/dist/types/concerns/plugin-storage.d.ts +25 -0
  103. package/dist/types/concerns/plugin-storage.d.ts.map +1 -1
  104. package/dist/types/concerns/s3-errors.d.ts +20 -0
  105. package/dist/types/concerns/s3-errors.d.ts.map +1 -0
  106. package/dist/types/concerns/s3-key.d.ts +30 -0
  107. package/dist/types/concerns/s3-key.d.ts.map +1 -0
  108. package/dist/types/concerns/safe-merge.d.ts +22 -0
  109. package/dist/types/concerns/safe-merge.d.ts.map +1 -0
  110. package/dist/types/core/resource-config-validator.d.ts.map +1 -1
  111. package/dist/types/core/resource-partitions.class.d.ts.map +1 -1
  112. package/dist/types/core/resource-persistence.class.d.ts.map +1 -1
  113. package/dist/types/core/resource-query.class.d.ts.map +1 -1
  114. package/dist/types/database/database-connection.class.d.ts.map +1 -1
  115. package/dist/types/database/database-plugins.class.d.ts.map +1 -1
  116. package/dist/types/plugins/concerns/s3-mutex.class.d.ts +30 -0
  117. package/dist/types/plugins/concerns/s3-mutex.class.d.ts.map +1 -0
  118. package/dist/types/plugins/eventual-consistency/consolidation.d.ts.map +1 -1
  119. package/dist/types/plugins/eventual-consistency/garbage-collection.d.ts.map +1 -1
  120. package/dist/types/plugins/queue-consumer.plugin.d.ts.map +1 -1
  121. package/dist/types/plugins/recon/managers/scheduler-manager.d.ts.map +1 -1
  122. package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts +90 -0
  123. package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts.map +1 -0
  124. package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts +125 -0
  125. package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts.map +1 -0
  126. package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts +96 -0
  127. package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts.map +1 -0
  128. package/dist/types/plugins/replicator.plugin.d.ts.map +1 -1
  129. package/dist/types/plugins/replicators/base-replicator.class.d.ts.map +1 -1
  130. package/dist/types/plugins/spider/recker-link-discoverer.d.ts +54 -0
  131. package/dist/types/plugins/spider/recker-link-discoverer.d.ts.map +1 -0
  132. package/dist/types/plugins/spider/recker-llms-validator.d.ts +105 -0
  133. package/dist/types/plugins/spider/recker-llms-validator.d.ts.map +1 -0
  134. package/dist/types/plugins/spider/recker-robots-validator.d.ts +92 -0
  135. package/dist/types/plugins/spider/recker-robots-validator.d.ts.map +1 -0
  136. package/dist/types/plugins/spider/recker-security-adapter.d.ts +83 -0
  137. package/dist/types/plugins/spider/recker-security-adapter.d.ts.map +1 -0
  138. package/dist/types/plugins/spider/recker-seo-adapter.d.ts +187 -0
  139. package/dist/types/plugins/spider/recker-seo-adapter.d.ts.map +1 -0
  140. package/dist/types/plugins/spider/recker-sitemap-validator.d.ts +121 -0
  141. package/dist/types/plugins/spider/recker-sitemap-validator.d.ts.map +1 -0
  142. package/dist/types/resource.class.d.ts.map +1 -1
  143. package/dist/types/stream/resource-reader.class.d.ts.map +1 -1
  144. package/dist/types/stream/resource-writer.class.d.ts.map +1 -1
  145. package/dist/types/tasks/tasks-pool.class.d.ts +23 -0
  146. package/dist/types/tasks/tasks-pool.class.d.ts.map +1 -1
  147. package/mcp/prompts/index.ts +275 -0
  148. package/mcp/resources/index.ts +322 -0
  149. package/mcp/tools/plugins.ts +1137 -0
  150. package/mcp/tools/streams.ts +340 -0
  151. package/package.json +20 -22
  152. package/src/clients/recker-http-handler.ts +74 -8
  153. package/src/clients/types.ts +14 -0
  154. package/src/concerns/high-performance-inserter.ts +18 -57
  155. package/src/concerns/id/alphabets.ts +175 -0
  156. package/src/concerns/id/entropy.ts +286 -0
  157. package/src/concerns/id/generators/sid.ts +90 -0
  158. package/src/concerns/id/generators/ulid.ts +249 -0
  159. package/src/concerns/id/generators/uuid-v7.ts +179 -0
  160. package/src/concerns/id/index.ts +167 -0
  161. package/src/concerns/plugin-storage.ts +144 -0
  162. package/src/concerns/s3-errors.ts +97 -0
  163. package/src/concerns/s3-key.ts +62 -0
  164. package/src/concerns/safe-merge.ts +60 -0
  165. package/src/core/resource-config-validator.ts +9 -2
  166. package/src/core/resource-partitions.class.ts +14 -1
  167. package/src/core/resource-persistence.class.ts +47 -13
  168. package/src/core/resource-query.class.ts +21 -46
  169. package/src/database/database-connection.class.ts +7 -6
  170. package/src/database/database-plugins.class.ts +15 -13
  171. package/src/plugins/concerns/s3-mutex.class.ts +228 -0
  172. package/src/plugins/eventual-consistency/consolidation.ts +8 -7
  173. package/src/plugins/eventual-consistency/garbage-collection.ts +7 -6
  174. package/src/plugins/queue-consumer.plugin.ts +21 -19
  175. package/src/plugins/recon/managers/scheduler-manager.ts +7 -5
  176. package/src/plugins/recon/stages/recker-asn-stage.ts +385 -0
  177. package/src/plugins/recon/stages/recker-dns-stage.ts +360 -0
  178. package/src/plugins/recon/stages/recker-scrape-stage.ts +509 -0
  179. package/src/plugins/replicator.plugin.ts +41 -35
  180. package/src/plugins/replicators/base-replicator.class.ts +17 -23
  181. package/src/plugins/spider/recker-link-discoverer.ts +645 -0
  182. package/src/plugins/spider/recker-llms-validator.ts +500 -0
  183. package/src/plugins/spider/recker-robots-validator.ts +473 -0
  184. package/src/plugins/spider/recker-security-adapter.ts +489 -0
  185. package/src/plugins/spider/recker-seo-adapter.ts +605 -0
  186. package/src/plugins/spider/recker-sitemap-validator.ts +621 -0
  187. package/src/resource.class.ts +2 -0
  188. package/src/stream/resource-reader.class.ts +10 -8
  189. package/src/stream/resource-writer.class.ts +10 -8
  190. package/src/tasks/tasks-pool.class.ts +46 -0
@@ -0,0 +1,473 @@
1
+ import { createHttpClient } from '#src/concerns/http-client.js';
2
+ import type { CrawlContext } from './crawl-context.js';
3
+ import type { RobotsCheckResult, RobotsParserConfig, CacheStats, ParsedRules } from './robots-parser.js';
4
+
5
/**
 * Structural description of the value returned by the robots.txt parse
 * function that this module loads from `recker` at runtime.
 *
 * The shape is declared locally, so it has to be kept in step with the
 * library by hand.
 */
interface ReckerRobotsParseResult {
  valid: boolean;
  /** Problems reported by the parser, each tied to a source line. */
  errors: { line: number; message: string }[];
  warnings: { line: number; message: string }[];
  /** Flat list of every recognised directive together with its line. */
  directives: {
    type: 'user-agent' | 'allow' | 'disallow' | 'sitemap' | 'crawl-delay' | 'host' | 'clean-param';
    value: string;
    line: number;
  }[];
  /** Directives grouped by the user agents they apply to. */
  userAgentBlocks: {
    userAgents: string[];
    rules: { type: 'allow' | 'disallow'; path: string; line: number }[];
    /** The validator class in this file multiplies this value by 1000 before returning it. */
    crawlDelay?: number;
  }[];
  sitemaps: string[];
  host?: string;
  blocksAllRobots: boolean;
  blocksImportantPaths: boolean;
  size: number;
}
25
+
26
/** Severity levels an issue can carry. */
type ReckerRobotsIssueSeverity = 'error' | 'warning' | 'info';

/**
 * One finding inside a validation result: a severity, a code, a
 * human-readable message and, optionally, the source line and a
 * recommendation.
 */
interface ReckerRobotsValidationIssue {
  type: ReckerRobotsIssueSeverity;
  code: string;
  message: string;
  line?: number;
  recommendation?: string;
}
33
+
34
/**
 * Value returned by the validation function loaded from `recker`: an
 * overall verdict, the individual issues, and the parse result the
 * verdict was derived from.
 */
interface ReckerRobotsValidationResult {
  valid: boolean;
  issues: ReckerRobotsValidationIssue[];
  parseResult: ReckerRobotsParseResult;
}
39
+
40
/** Parses raw robots.txt text. */
type ReckerParseRobotsTxt = (content: string) => ReckerRobotsParseResult;

/** Validates raw robots.txt text; `baseUrl` is optional. */
type ReckerValidateRobotsTxt = (
  content: string,
  baseUrl?: string
) => ReckerRobotsValidationResult;

/** Decides whether `path` may be fetched according to an existing parse result. */
type ReckerIsPathAllowed = (
  parseResult: ReckerRobotsParseResult,
  path: string,
  userAgent?: string
) => boolean;

/** Optional transport accepted by the fetch-and-validate function. */
type ReckerRobotsFetcher = (url: string) => Promise<{ status: number; text: string }>;

/** Fetches a robots.txt URL and validates it in one step. */
type ReckerFetchAndValidate = (
  url: string,
  fetcher?: ReckerRobotsFetcher
) => Promise<ReckerRobotsValidationResult & { exists: boolean; status?: number }>;
44
+
45
/**
 * One cache slot of the validator, keyed by origin.
 *
 * Both result fields are `null` when the robots.txt file could not be
 * fetched; `timestamp` is the `Date.now()` value taken when the slot was
 * written and is compared against the configured cache timeout.
 */
interface CacheEntry {
  timestamp: number;
  parseResult: ReckerRobotsParseResult | null;
  validationResult: ReckerRobotsValidationResult | null;
}
50
+
51
/** The subset of an HTTP response this module reads. */
interface HttpResponse {
  status: number;
  ok: boolean;
  /** Resolves with the response body as text. */
  text(): Promise<string>;
}

/** The subset of the HTTP client this module uses: a plain GET by URL. */
interface HttpClient {
  get(url: string): Promise<HttpResponse>;
}
60
+
61
/**
 * Flattened validation summary returned by `ReckerRobotsValidator.validate()`.
 *
 * `valid` and `issues` are copied from the validation result; the
 * remaining members are copied from the parse result and therefore share
 * its member types.
 */
export interface RobotsValidationDetails
  extends Pick<ReckerRobotsParseResult, 'blocksAllRobots' | 'blocksImportantPaths' | 'host' | 'size'> {
  valid: boolean;
  issues: ReckerRobotsValidationIssue[];
}
69
+
70
/**
 * robots.txt checker and validator.
 *
 * When the optional `recker/seo/validators/robots` module can be imported,
 * its parse/validate/match functions are used and results are cached per
 * origin string (`protocol//host`) for `cacheTimeout` milliseconds.
 * Otherwise `isAllowed`, `getSitemaps` and `getCrawlDelay` delegate to the
 * local `RobotsParser` from `./robots-parser.js`, and the validation-only
 * methods report "no data" (`null` / `[]`).
 *
 * NOTE(review): `clearCache`, `getCacheStats`, `getBlockingStatus` and
 * `getHost` only look at this instance's own cache; whatever the fallback
 * parser caches is not visible from here.
 */
export class ReckerRobotsValidator {
  private config: Required<Omit<RobotsParserConfig, 'context' | 'fetcher'>> & {
    context: CrawlContext | null;
    fetcher: ((url: string) => Promise<string>) | null;
  };
  private _context: CrawlContext | null;
  private cache: Map<string, CacheEntry>;
  private fetcher: ((url: string) => Promise<string>) | null;
  private _httpClient: HttpClient | null;

  // `null` means "not probed yet"; set once by `_checkReckerAvailability`.
  private reckerAvailable: boolean | null = null;
  private parseRobotsTxt: ReckerParseRobotsTxt | null = null;
  private validateRobotsTxt: ReckerValidateRobotsTxt | null = null;
  private isPathAllowed: ReckerIsPathAllowed | null = null;
  // Loaded alongside the other functions but not called anywhere in this class.
  private fetchAndValidateRobotsTxt: ReckerFetchAndValidate | null = null;
  private fallbackParser: import('./robots-parser.js').RobotsParser | null = null;

  /**
   * @param config Optional settings. Falsy values fall back to the defaults
   *   `userAgent: 's3db-spider'`, `cacheTimeout: 3600000` (ms) and
   *   `fetchTimeout: 10000` (ms); `defaultAllow` is `true` unless it is
   *   explicitly `false`.
   */
  constructor(config: RobotsParserConfig = {}) {
    this.config = {
      userAgent: config.userAgent || 's3db-spider',
      defaultAllow: config.defaultAllow !== false,
      cacheTimeout: config.cacheTimeout || 3600000,
      fetchTimeout: config.fetchTimeout || 10000,
      context: config.context || null,
      fetcher: config.fetcher || null
    };

    this._context = this.config.context;
    this.cache = new Map();
    this.fetcher = this.config.fetcher;
    this._httpClient = null;
  }

  /**
   * Probes for the optional recker robots module once and remembers the
   * outcome. On success the module's functions are stored on the instance.
   *
   * @returns `true` when the module was imported, `false` when the import threw.
   */
  private async _checkReckerAvailability(): Promise<boolean> {
    if (this.reckerAvailable !== null) {
      return this.reckerAvailable;
    }

    try {
      const robotsModule = await import('recker/seo/validators/robots');
      this.parseRobotsTxt = robotsModule.parseRobotsTxt;
      this.validateRobotsTxt = robotsModule.validateRobotsTxt;
      this.isPathAllowed = robotsModule.isPathAllowed;
      this.fetchAndValidateRobotsTxt = robotsModule.fetchAndValidateRobotsTxt;
      this.reckerAvailable = true;
      return true;
    } catch {
      // Any import failure is treated as "recker is not installed".
      this.reckerAvailable = false;
      return false;
    }
  }

  /** Lazily creates the local `RobotsParser`, configured like this instance. */
  private async _getFallbackParser(): Promise<import('./robots-parser.js').RobotsParser> {
    if (!this.fallbackParser) {
      const { RobotsParser } = await import('./robots-parser.js');
      this.fallbackParser = new RobotsParser(this.config);
    }
    return this.fallbackParser;
  }

  /**
   * Replaces the function used to download robots.txt and forwards it to
   * the fallback parser if one has already been created.
   */
  setFetcher(fetcher: (url: string) => Promise<string>): void {
    this.fetcher = fetcher;
    if (this.fallbackParser) {
      this.fallbackParser.setFetcher(fetcher);
    }
  }

  /**
   * Checks whether `url` may be crawled by the configured user agent.
   *
   * @returns The decision plus its source. When robots.txt could not be
   *   fetched the result is `defaultAllow` with source `'no-robots-txt'`;
   *   any exception (including an unparsable `url`) yields `defaultAllow`
   *   with source `'error'` and the error message.
   */
  async isAllowed(url: string): Promise<RobotsCheckResult> {
    if (!(await this._checkReckerAvailability())) {
      const fallback = await this._getFallbackParser();
      return fallback.isAllowed(url);
    }

    try {
      const urlObj = new URL(url);
      const domain = `${urlObj.protocol}//${urlObj.host}`;
      const path = urlObj.pathname + urlObj.search;

      const cached = await this._getCachedOrFetch(domain);
      const parseResult = cached.parseResult;

      if (!parseResult) {
        return { allowed: this.config.defaultAllow, source: 'no-robots-txt' };
      }

      const isPathAllowed = this.isPathAllowed;
      if (!isPathAllowed) {
        throw new Error('recker isPathAllowed is not available');
      }

      return {
        allowed: isPathAllowed(parseResult, path, this.config.userAgent),
        crawlDelay: this._getCrawlDelayFromParseResult(parseResult),
        source: 'robots-txt',
        matchedRule: this._findMatchedRule(parseResult, path)
      };
    } catch (error) {
      return {
        allowed: this.config.defaultAllow,
        source: 'error',
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }

  /**
   * Returns the cache entry for `domain`, downloading and parsing
   * `${domain}/robots.txt` when there is no entry or it is older than
   * `cacheTimeout`.
   *
   * A failed download is cached as an entry whose results are `null`. When
   * the recker functions are not loaded, an uncached empty entry is returned
   * instead of dereferencing the missing parser, so callers that skip the
   * availability check (`preload`, `getValidationIssues`) no longer throw.
   */
  private async _getCachedOrFetch(domain: string): Promise<CacheEntry> {
    const cached = this.cache.get(domain);
    if (cached && Date.now() - cached.timestamp < this.config.cacheTimeout) {
      return cached;
    }

    const available = await this._checkReckerAvailability();
    const parse = this.parseRobotsTxt;
    const validate = this.validateRobotsTxt;
    if (!available || !parse || !validate) {
      return { parseResult: null, validationResult: null, timestamp: Date.now() };
    }

    const robotsUrl = `${domain}/robots.txt`;
    let content: string;

    try {
      content = this.fetcher
        ? await this.fetcher(robotsUrl)
        : await this._fetchRobotsTxt(robotsUrl);
    } catch {
      // Remember the failure so the origin is not re-requested on every call.
      const missing: CacheEntry = {
        parseResult: null,
        validationResult: null,
        timestamp: Date.now()
      };
      this.cache.set(domain, missing);
      return missing;
    }

    const entry: CacheEntry = {
      parseResult: parse(content),
      validationResult: validate(content, domain),
      timestamp: Date.now()
    };

    this.cache.set(domain, entry);
    return entry;
  }

  /**
   * Lazily builds the HTTP client used when no custom fetcher is set.
   *
   * NOTE(review): the crawl context is asked for its client config with a
   * fixed placeholder URL; presumably that config does not depend on the
   * URL — confirm against `CrawlContext.getHttpClientConfig`.
   */
  private async _getHttpClient(): Promise<HttpClient> {
    if (!this._httpClient) {
      const baseConfig = this._context
        ? this._context.getHttpClientConfig('https://example.com')
        : {
            headers: {
              'User-Agent': this.config.userAgent
            }
          };

      this._httpClient = await createHttpClient({
        ...baseConfig,
        timeout: this.config.fetchTimeout,
        retry: {
          maxAttempts: 2,
          delay: 500,
          backoff: 'exponential',
          retryAfter: true,
          retryOn: [429, 500, 502, 503, 504]
        }
      }) as unknown as HttpClient;
    }
    return this._httpClient;
  }

  /**
   * Downloads `url` with the internal HTTP client.
   *
   * The response is handed to the crawl context (when one is configured)
   * before the status check, so the context sees failed responses too.
   *
   * @throws Error `HTTP <status>` when the response is not `ok`.
   */
  private async _fetchRobotsTxt(url: string): Promise<string> {
    const client = await this._getHttpClient();
    const response = await client.get(url);

    if (this._context) {
      this._context.processResponse(
        response as unknown as Parameters<typeof this._context.processResponse>[0],
        url
      );
    }

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    return await response.text();
  }

  /**
   * Picks the user-agent block that applies to the configured user agent.
   *
   * Precedence, first hit wins within each tier (all comparisons are
   * lower-cased):
   *   1. a block naming the user agent exactly,
   *   2. a block with a non-wildcard agent token that contains, or is
   *      contained in, the user agent,
   *   3. a block listing `*`.
   *
   * Empty agent tokens are ignored in tier 2: every string "contains" the
   * empty string, so a malformed `User-agent:` line would otherwise match
   * every crawler ahead of the wildcard block.
   */
  private _selectUserAgentBlock(
    parseResult: ReckerRobotsParseResult
  ): ReckerRobotsParseResult['userAgentBlocks'][number] | null {
    const userAgentLower = this.config.userAgent.toLowerCase();
    const candidates = parseResult.userAgentBlocks.map(block => ({
      block,
      agents: block.userAgents.map(agent => agent.toLowerCase())
    }));

    const exact = candidates.find(({ agents }) => agents.includes(userAgentLower));
    if (exact) return exact.block;

    const partial = candidates.find(({ agents }) =>
      agents.some(agent =>
        agent !== '*' &&
        agent !== '' &&
        (agent.includes(userAgentLower) || userAgentLower.includes(agent))
      )
    );
    if (partial) return partial.block;

    const wildcard = candidates.find(({ agents }) => agents.includes('*'));
    return wildcard ? wildcard.block : null;
  }

  /**
   * Crawl delay of the applicable block multiplied by 1000, or `null` when
   * no block applies or its delay is missing or zero.
   */
  private _getCrawlDelayFromParseResult(parseResult: ReckerRobotsParseResult): number | null {
    const block = this._selectUserAgentBlock(parseResult);
    if (!block) return null;
    return block.crawlDelay ? block.crawlDelay * 1000 : null;
  }

  /**
   * Finds the rule path of the applicable block that matches `path`,
   * preferring the longest pattern (wildcards are not counted towards the
   * length).
   *
   * @returns The matching rule's path, or `undefined` when nothing matches.
   */
  private _findMatchedRule(parseResult: ReckerRobotsParseResult, path: string): string | undefined {
    const targetBlock = this._selectUserAgentBlock(parseResult);
    if (!targetBlock) return undefined;

    const specificity = (pattern: string): number => pattern.replace(/\*/g, '').length;
    const sortedRules = [...targetBlock.rules].sort(
      (a, b) => specificity(b.path) - specificity(a.path)
    );

    return sortedRules.find(rule => this._pathMatches(path, rule.path))?.path;
  }

  /**
   * Tests `path` against a robots.txt path pattern.
   *
   * `*` matches any run of characters. A `$` is an end-of-path anchor only
   * when it is the last character of the pattern; anywhere else it is now
   * matched literally (it used to leak into the regular expression as an
   * anchor, which made such rules unmatchable). Patterns without a trailing
   * `$` are prefix matches.
   *
   * NOTE(review): matching is case-insensitive, as before. RFC 9309 describes
   * path matching as case-sensitive — confirm against the matcher used for
   * the actual allow/deny decision before changing this.
   */
  private _pathMatches(path: string, pattern: string): boolean {
    const anchored = pattern.endsWith('$');
    const body = anchored ? pattern.slice(0, -1) : pattern;

    const source = body
      .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
      .replace(/\*/g, '.*');

    const regex = new RegExp(`^${source}${anchored ? '' : '.*'}$`, 'i');
    return regex.test(path);
  }

  /**
   * Sitemap URLs listed in the robots.txt of `domain` (an origin string
   * such as `https://example.com`); empty when none are available.
   */
  async getSitemaps(domain: string): Promise<string[]> {
    if (!(await this._checkReckerAvailability())) {
      const fallback = await this._getFallbackParser();
      return fallback.getSitemaps(domain);
    }

    const cached = await this._getCachedOrFetch(domain);
    return cached.parseResult?.sitemaps || [];
  }

  /** Crawl delay for `domain`, or `null` when it is not available. */
  async getCrawlDelay(domain: string): Promise<number | null> {
    if (!(await this._checkReckerAvailability())) {
      const fallback = await this._getFallbackParser();
      return fallback.getCrawlDelay(domain);
    }

    const cached = await this._getCachedOrFetch(domain);
    if (!cached.parseResult) return null;

    return this._getCrawlDelayFromParseResult(cached.parseResult);
  }

  /**
   * Warms this instance's cache for `domain`. Does nothing when the recker
   * module is unavailable.
   */
  async preload(domain: string): Promise<void> {
    await this._getCachedOrFetch(domain);
  }

  /** Drops the cache entry for `domain`, or every entry when omitted. */
  clearCache(domain?: string): void {
    if (domain) {
      this.cache.delete(domain);
    } else {
      this.cache.clear();
    }
  }

  /** Number of cached origins and their keys. */
  getCacheStats(): CacheStats {
    return {
      size: this.cache.size,
      domains: [...this.cache.keys()]
    };
  }

  /**
   * Validation summary for the robots.txt of the origin of `url`.
   *
   * @returns `null` when recker is unavailable, the file could not be
   *   fetched, or anything throws (including an unparsable `url`).
   */
  async validate(url: string): Promise<RobotsValidationDetails | null> {
    if (!(await this._checkReckerAvailability())) {
      return null;
    }

    try {
      const urlObj = new URL(url);
      const domain = `${urlObj.protocol}//${urlObj.host}`;
      const { validationResult, parseResult } = await this._getCachedOrFetch(domain);

      if (!validationResult || !parseResult) {
        return null;
      }

      return {
        valid: validationResult.valid,
        issues: validationResult.issues,
        blocksAllRobots: parseResult.blocksAllRobots,
        blocksImportantPaths: parseResult.blocksImportantPaths,
        host: parseResult.host,
        size: parseResult.size
      };
    } catch {
      return null;
    }
  }

  /** Validates raw robots.txt text; `null` when recker is unavailable. */
  async validateContent(content: string, baseUrl?: string): Promise<ReckerRobotsValidationResult | null> {
    const isReckerAvailable = await this._checkReckerAvailability();

    if (!isReckerAvailable || !this.validateRobotsTxt) {
      return null;
    }

    return this.validateRobotsTxt(content, baseUrl);
  }

  /**
   * Parses raw robots.txt text synchronously.
   *
   * Because it cannot await the availability probe, this returns `null`
   * until one of the async methods has loaded recker.
   */
  parseContent(content: string): ReckerRobotsParseResult | null {
    if (!this.reckerAvailable || !this.parseRobotsTxt) {
      return null;
    }

    return this.parseRobotsTxt(content);
  }

  /**
   * Blocking flags from the cached parse result of `domain`. Never fetches;
   * returns `null` when nothing usable is cached.
   */
  getBlockingStatus(domain: string): {
    blocksAllRobots: boolean;
    blocksImportantPaths: boolean;
  } | null {
    const parseResult = this.cache.get(domain)?.parseResult;
    if (!parseResult) return null;

    return {
      blocksAllRobots: parseResult.blocksAllRobots,
      blocksImportantPaths: parseResult.blocksImportantPaths
    };
  }

  /** `host` value from the cached parse result of `domain`, or `null`. Never fetches. */
  getHost(domain: string): string | null {
    return this.cache.get(domain)?.parseResult?.host || null;
  }

  /**
   * Validation issues for `domain`, fetching when needed. Empty when recker
   * is unavailable or the file could not be fetched.
   */
  async getValidationIssues(domain: string): Promise<ReckerRobotsValidationIssue[]> {
    const cached = await this._getCachedOrFetch(domain);
    return cached.validationResult?.issues || [];
  }

  /** `true` only after the recker module has been probed and imported successfully. */
  isReckerEnabled(): boolean {
    return this.reckerAvailable === true;
  }
}
+
473
+ export default ReckerRobotsValidator; // The class is exposed both as a named export and as the module's default export.