s3db.js 18.0.11-next.1534f717 → 18.0.11-next.e8e71b5b
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/clients/recker-http-handler.js +56 -8
- package/dist/clients/recker-http-handler.js.map +1 -1
- package/dist/concerns/high-performance-inserter.js +6 -34
- package/dist/concerns/high-performance-inserter.js.map +1 -1
- package/dist/concerns/id/alphabets.js +150 -0
- package/dist/concerns/id/alphabets.js.map +1 -0
- package/dist/concerns/id/entropy.js +243 -0
- package/dist/concerns/id/entropy.js.map +1 -0
- package/dist/concerns/id/generators/nanoid.js +74 -0
- package/dist/concerns/id/generators/nanoid.js.map +1 -0
- package/dist/concerns/id/generators/sid.js +73 -0
- package/dist/concerns/id/generators/sid.js.map +1 -0
- package/dist/concerns/id/generators/ulid.js +208 -0
- package/dist/concerns/id/generators/ulid.js.map +1 -0
- package/dist/concerns/id/generators/uuid-v7.js +150 -0
- package/dist/concerns/id/generators/uuid-v7.js.map +1 -0
- package/dist/concerns/id/index.js +74 -0
- package/dist/concerns/id/index.js.map +1 -0
- package/dist/concerns/plugin-storage.js +114 -0
- package/dist/concerns/plugin-storage.js.map +1 -1
- package/dist/concerns/s3-errors.js +72 -0
- package/dist/concerns/s3-errors.js.map +1 -0
- package/dist/concerns/s3-key.js +54 -0
- package/dist/concerns/s3-key.js.map +1 -0
- package/dist/concerns/safe-merge.js +47 -0
- package/dist/concerns/safe-merge.js.map +1 -0
- package/dist/core/resource-config-validator.js +12 -2
- package/dist/core/resource-config-validator.js.map +1 -1
- package/dist/core/resource-partitions.class.js +12 -1
- package/dist/core/resource-partitions.class.js.map +1 -1
- package/dist/core/resource-persistence.class.js +41 -12
- package/dist/core/resource-persistence.class.js.map +1 -1
- package/dist/core/resource-query.class.js +21 -47
- package/dist/core/resource-query.class.js.map +1 -1
- package/dist/database/database-connection.class.js +3 -6
- package/dist/database/database-connection.class.js.map +1 -1
- package/dist/database/database-plugins.class.js +7 -13
- package/dist/database/database-plugins.class.js.map +1 -1
- package/dist/plugins/concerns/s3-mutex.class.js +155 -0
- package/dist/plugins/concerns/s3-mutex.class.js.map +1 -0
- package/dist/plugins/eventual-consistency/consolidation.js +4 -7
- package/dist/plugins/eventual-consistency/consolidation.js.map +1 -1
- package/dist/plugins/eventual-consistency/garbage-collection.js +3 -6
- package/dist/plugins/eventual-consistency/garbage-collection.js.map +1 -1
- package/dist/plugins/queue-consumer.plugin.js +10 -16
- package/dist/plugins/queue-consumer.plugin.js.map +1 -1
- package/dist/plugins/recon/managers/scheduler-manager.js +3 -5
- package/dist/plugins/recon/managers/scheduler-manager.js.map +1 -1
- package/dist/plugins/recon/stages/recker-asn-stage.js +279 -0
- package/dist/plugins/recon/stages/recker-asn-stage.js.map +1 -0
- package/dist/plugins/recon/stages/recker-dns-stage.js +227 -0
- package/dist/plugins/recon/stages/recker-dns-stage.js.map +1 -0
- package/dist/plugins/recon/stages/recker-scrape-stage.js +369 -0
- package/dist/plugins/recon/stages/recker-scrape-stage.js.map +1 -0
- package/dist/plugins/replicator.plugin.js +13 -31
- package/dist/plugins/replicator.plugin.js.map +1 -1
- package/dist/plugins/replicators/base-replicator.class.js +10 -23
- package/dist/plugins/replicators/base-replicator.class.js.map +1 -1
- package/dist/plugins/spider/recker-link-discoverer.js +544 -0
- package/dist/plugins/spider/recker-link-discoverer.js.map +1 -0
- package/dist/plugins/spider/recker-llms-validator.js +334 -0
- package/dist/plugins/spider/recker-llms-validator.js.map +1 -0
- package/dist/plugins/spider/recker-robots-validator.js +336 -0
- package/dist/plugins/spider/recker-robots-validator.js.map +1 -0
- package/dist/plugins/spider/recker-security-adapter.js +325 -0
- package/dist/plugins/spider/recker-security-adapter.js.map +1 -0
- package/dist/plugins/spider/recker-seo-adapter.js +399 -0
- package/dist/plugins/spider/recker-seo-adapter.js.map +1 -0
- package/dist/plugins/spider/recker-sitemap-validator.js +406 -0
- package/dist/plugins/spider/recker-sitemap-validator.js.map +1 -0
- package/dist/resource.class.js +2 -0
- package/dist/resource.class.js.map +1 -1
- package/dist/s3db.cjs +444 -219
- package/dist/s3db.cjs.map +1 -1
- package/dist/s3db.es.js +445 -220
- package/dist/s3db.es.js.map +1 -1
- package/dist/stream/resource-reader.class.js +5 -7
- package/dist/stream/resource-reader.class.js.map +1 -1
- package/dist/stream/resource-writer.class.js +5 -7
- package/dist/stream/resource-writer.class.js.map +1 -1
- package/dist/tasks/tasks-pool.class.js +31 -0
- package/dist/tasks/tasks-pool.class.js.map +1 -1
- package/dist/types/clients/recker-http-handler.d.ts +1 -0
- package/dist/types/clients/recker-http-handler.d.ts.map +1 -1
- package/dist/types/clients/types.d.ts +14 -0
- package/dist/types/clients/types.d.ts.map +1 -1
- package/dist/types/concerns/high-performance-inserter.d.ts.map +1 -1
- package/dist/types/concerns/id/alphabets.d.ts +125 -0
- package/dist/types/concerns/id/alphabets.d.ts.map +1 -0
- package/dist/types/concerns/id/entropy.d.ts +84 -0
- package/dist/types/concerns/id/entropy.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/nanoid.d.ts +46 -0
- package/dist/types/concerns/id/generators/nanoid.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/sid.d.ts +45 -0
- package/dist/types/concerns/id/generators/sid.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/ulid.d.ts +71 -0
- package/dist/types/concerns/id/generators/ulid.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/uuid-v7.d.ts +60 -0
- package/dist/types/concerns/id/generators/uuid-v7.d.ts.map +1 -0
- package/dist/types/concerns/id/index.d.ts +51 -0
- package/dist/types/concerns/id/index.d.ts.map +1 -0
- package/dist/types/concerns/plugin-storage.d.ts +25 -0
- package/dist/types/concerns/plugin-storage.d.ts.map +1 -1
- package/dist/types/concerns/s3-errors.d.ts +20 -0
- package/dist/types/concerns/s3-errors.d.ts.map +1 -0
- package/dist/types/concerns/s3-key.d.ts +30 -0
- package/dist/types/concerns/s3-key.d.ts.map +1 -0
- package/dist/types/concerns/safe-merge.d.ts +22 -0
- package/dist/types/concerns/safe-merge.d.ts.map +1 -0
- package/dist/types/core/resource-config-validator.d.ts.map +1 -1
- package/dist/types/core/resource-partitions.class.d.ts.map +1 -1
- package/dist/types/core/resource-persistence.class.d.ts.map +1 -1
- package/dist/types/core/resource-query.class.d.ts.map +1 -1
- package/dist/types/database/database-connection.class.d.ts.map +1 -1
- package/dist/types/database/database-plugins.class.d.ts.map +1 -1
- package/dist/types/plugins/concerns/s3-mutex.class.d.ts +30 -0
- package/dist/types/plugins/concerns/s3-mutex.class.d.ts.map +1 -0
- package/dist/types/plugins/eventual-consistency/consolidation.d.ts.map +1 -1
- package/dist/types/plugins/eventual-consistency/garbage-collection.d.ts.map +1 -1
- package/dist/types/plugins/queue-consumer.plugin.d.ts.map +1 -1
- package/dist/types/plugins/recon/managers/scheduler-manager.d.ts.map +1 -1
- package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts +90 -0
- package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts.map +1 -0
- package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts +125 -0
- package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts.map +1 -0
- package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts +96 -0
- package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts.map +1 -0
- package/dist/types/plugins/replicator.plugin.d.ts.map +1 -1
- package/dist/types/plugins/replicators/base-replicator.class.d.ts.map +1 -1
- package/dist/types/plugins/spider/recker-link-discoverer.d.ts +54 -0
- package/dist/types/plugins/spider/recker-link-discoverer.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-llms-validator.d.ts +105 -0
- package/dist/types/plugins/spider/recker-llms-validator.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-robots-validator.d.ts +92 -0
- package/dist/types/plugins/spider/recker-robots-validator.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-security-adapter.d.ts +83 -0
- package/dist/types/plugins/spider/recker-security-adapter.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-seo-adapter.d.ts +187 -0
- package/dist/types/plugins/spider/recker-seo-adapter.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-sitemap-validator.d.ts +121 -0
- package/dist/types/plugins/spider/recker-sitemap-validator.d.ts.map +1 -0
- package/dist/types/resource.class.d.ts.map +1 -1
- package/dist/types/stream/resource-reader.class.d.ts.map +1 -1
- package/dist/types/stream/resource-writer.class.d.ts.map +1 -1
- package/dist/types/tasks/tasks-pool.class.d.ts +23 -0
- package/dist/types/tasks/tasks-pool.class.d.ts.map +1 -1
- package/mcp/prompts/index.ts +275 -0
- package/mcp/resources/index.ts +322 -0
- package/mcp/tools/plugins.ts +1137 -0
- package/mcp/tools/streams.ts +340 -0
- package/package.json +20 -22
- package/src/clients/recker-http-handler.ts +74 -8
- package/src/clients/types.ts +14 -0
- package/src/concerns/high-performance-inserter.ts +18 -57
- package/src/concerns/id/alphabets.ts +175 -0
- package/src/concerns/id/entropy.ts +286 -0
- package/src/concerns/id/generators/sid.ts +90 -0
- package/src/concerns/id/generators/ulid.ts +249 -0
- package/src/concerns/id/generators/uuid-v7.ts +179 -0
- package/src/concerns/id/index.ts +167 -0
- package/src/concerns/plugin-storage.ts +144 -0
- package/src/concerns/s3-errors.ts +97 -0
- package/src/concerns/s3-key.ts +62 -0
- package/src/concerns/safe-merge.ts +60 -0
- package/src/core/resource-config-validator.ts +9 -2
- package/src/core/resource-partitions.class.ts +14 -1
- package/src/core/resource-persistence.class.ts +47 -13
- package/src/core/resource-query.class.ts +21 -46
- package/src/database/database-connection.class.ts +7 -6
- package/src/database/database-plugins.class.ts +15 -13
- package/src/plugins/concerns/s3-mutex.class.ts +228 -0
- package/src/plugins/eventual-consistency/consolidation.ts +8 -7
- package/src/plugins/eventual-consistency/garbage-collection.ts +7 -6
- package/src/plugins/queue-consumer.plugin.ts +21 -19
- package/src/plugins/recon/managers/scheduler-manager.ts +7 -5
- package/src/plugins/recon/stages/recker-asn-stage.ts +385 -0
- package/src/plugins/recon/stages/recker-dns-stage.ts +360 -0
- package/src/plugins/recon/stages/recker-scrape-stage.ts +509 -0
- package/src/plugins/replicator.plugin.ts +41 -35
- package/src/plugins/replicators/base-replicator.class.ts +17 -23
- package/src/plugins/spider/recker-link-discoverer.ts +645 -0
- package/src/plugins/spider/recker-llms-validator.ts +500 -0
- package/src/plugins/spider/recker-robots-validator.ts +473 -0
- package/src/plugins/spider/recker-security-adapter.ts +489 -0
- package/src/plugins/spider/recker-seo-adapter.ts +605 -0
- package/src/plugins/spider/recker-sitemap-validator.ts +621 -0
- package/src/resource.class.ts +2 -0
- package/src/stream/resource-reader.class.ts +10 -8
- package/src/stream/resource-writer.class.ts +10 -8
- package/src/tasks/tasks-pool.class.ts +46 -0
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
import { createHttpClient } from '#src/concerns/http-client.js';
|
|
2
|
+
/**
 * Fetches, parses and validates a site's `/llms.txt` file.
 *
 * Parsing and validation are delegated to the optional `recker` package
 * (`recker/seo/validators/llms-txt`), which is loaded lazily on first use.
 * When that import fails the validator degrades to a small built-in parser,
 * so `check()` can still report whether the file exists and what it links to.
 *
 * Results of `check()` are memoised per normalised domain for
 * `config.cacheTimeout` milliseconds.
 */
export class ReckerLlmsTxtValidator {
    config;
    _context;
    cache;
    _httpClient;
    // `null` = not probed yet, `true`/`false` = outcome of the dynamic import.
    reckerAvailable = null;
    parseLlmsTxt = null;
    validateLlmsTxt = null;
    fetchAndValidateLlmsTxt = null;
    generateLlmsTxtTemplate = null;

    /**
     * @param {object} [config]
     * @param {string} [config.userAgent='s3db-spider'] User-Agent header used when no context is supplied.
     * @param {number} [config.fetchTimeout=10000] Timeout handed to the HTTP client.
     * @param {number} [config.cacheTimeout=3600000] Lifetime of a cached `check()` result, in milliseconds.
     * @param {object|null} [config.context=null] Optional crawl context; when present it supplies the
     *   HTTP client configuration and is notified of responses fetched through the recker path.
     */
    constructor(config = {}) {
        this.config = {
            userAgent: config.userAgent || 's3db-spider',
            fetchTimeout: config.fetchTimeout || 10000,
            cacheTimeout: config.cacheTimeout || 3600000,
            context: config.context || null
        };
        this._context = this.config.context;
        this.cache = new Map();
        this._httpClient = null;
    }

    /**
     * Probes for the optional recker llms.txt module and binds its helpers.
     * The outcome is remembered, so the import is attempted at most once.
     *
     * @returns {Promise<boolean>} `true` when the module was imported.
     */
    async _checkReckerAvailability() {
        if (this.reckerAvailable !== null) {
            return this.reckerAvailable;
        }
        try {
            const llmsModule = await import('recker/seo/validators/llms-txt');
            this.parseLlmsTxt = llmsModule.parseLlmsTxt;
            this.validateLlmsTxt = llmsModule.validateLlmsTxt;
            this.fetchAndValidateLlmsTxt = llmsModule.fetchAndValidateLlmsTxt;
            this.generateLlmsTxtTemplate = llmsModule.generateLlmsTxtTemplate;
            this.reckerAvailable = true;
            return true;
        }
        catch {
            // Deliberate best-effort: a failed import simply selects the fallback path.
            this.reckerAvailable = false;
            return false;
        }
    }

    /**
     * Lazily creates the shared HTTP client (timeout plus a two-attempt
     * exponential retry on 429/5xx) and returns it.
     *
     * @returns {Promise<object>} The memoised HTTP client.
     */
    async _getHttpClient() {
        if (!this._httpClient) {
            // NOTE(review): the context is asked for a config for a fixed placeholder URL,
            // not the domain being checked - confirm this is intended.
            const baseConfig = this._context
                ? this._context.getHttpClientConfig('https://example.com')
                : {
                    headers: {
                        'User-Agent': this.config.userAgent
                    }
                };
            this._httpClient = await createHttpClient({
                ...baseConfig,
                timeout: this.config.fetchTimeout,
                retry: {
                    maxAttempts: 2,
                    delay: 500,
                    backoff: 'exponential',
                    retryAfter: true,
                    retryOn: [429, 500, 502, 503, 504]
                }
            });
        }
        return this._httpClient;
    }

    /**
     * Strips a single trailing slash so that `https://a.test` and
     * `https://a.test/` share one cache entry and one llms.txt URL.
     *
     * @param {string} domain Base URL of the site.
     * @returns {string}
     */
    _normalizeDomain(domain) {
        return domain.replace(/\/$/, '');
    }

    /**
     * Returns the cached result for a normalised domain while it is still fresh.
     *
     * @param {string} normalizedDomain
     * @returns {object|undefined}
     */
    _readCache(normalizedDomain) {
        const cached = this.cache.get(normalizedDomain);
        if (cached && Date.now() - cached.timestamp < this.config.cacheTimeout) {
            return cached.result;
        }
        return undefined;
    }

    /**
     * Fetches `<domain>/llms.txt` and reports existence, validity and parsed content.
     * Never rejects on network failure: errors are folded into the returned object.
     *
     * @param {string} domain Base URL of the site, with or without a trailing slash.
     * @returns {Promise<object>} Result with `exists`, `valid`, `sections`, `links`,
     *   `issues`, `errors`, `warnings` and, when a response was received, `status`.
     */
    async check(domain) {
        const normalizedDomain = this._normalizeDomain(domain);
        // Consult the cache before anything else so both the recker and the
        // fallback path benefit from it.
        const cachedResult = this._readCache(normalizedDomain);
        if (cachedResult) {
            return cachedResult;
        }
        const isReckerAvailable = await this._checkReckerAvailability();
        if (!isReckerAvailable) {
            return this._fallbackCheck(domain);
        }
        const llmsUrl = `${normalizedDomain}/llms.txt`;
        try {
            const fetchResult = await this.fetchAndValidateLlmsTxt(llmsUrl, async (url) => {
                const client = await this._getHttpClient();
                const response = await client.get(url);
                if (this._context) {
                    this._context.processResponse(response, url);
                }
                return {
                    status: response.ok ? 200 : response.status,
                    text: response.ok ? await response.text() : ''
                };
            });
            // Guard against a result without `parseResult` (presumably possible when the
            // file is absent - verify against recker) instead of failing with a TypeError
            // that would be misreported as a fetch error.
            const parsed = fetchResult.parseResult ?? {
                sections: [],
                links: [],
                errors: [],
                warnings: []
            };
            const result = {
                exists: fetchResult.exists,
                valid: fetchResult.valid,
                status: fetchResult.status,
                fullVersionExists: fetchResult.fullVersionExists,
                siteName: parsed.siteName,
                siteDescription: parsed.siteDescription,
                sections: parsed.sections,
                links: parsed.links,
                issues: fetchResult.issues,
                errors: parsed.errors,
                warnings: parsed.warnings,
                size: parsed.size
            };
            this.cache.set(normalizedDomain, { result, timestamp: Date.now() });
            return result;
        }
        catch (error) {
            // Failures are reported, not cached, so the next call retries.
            return {
                exists: false,
                valid: false,
                sections: [],
                links: [],
                issues: [{
                        type: 'error',
                        code: 'FETCH_ERROR',
                        message: `Failed to fetch llms.txt: ${error.message}`
                    }],
                errors: [error.message],
                warnings: []
            };
        }
    }

    /**
     * `check()` implementation used when recker could not be imported: fetches the
     * file directly and runs the built-in parser. Results derived from an HTTP
     * response are cached, mirroring the recker path; thrown errors are not.
     *
     * @param {string} domain Base URL of the site.
     * @returns {Promise<object>} Same shape as `check()`.
     */
    async _fallbackCheck(domain) {
        const normalizedDomain = this._normalizeDomain(domain);
        const llmsUrl = `${normalizedDomain}/llms.txt`;
        let result;
        try {
            const client = await this._getHttpClient();
            const response = await client.get(llmsUrl);
            if (!response.ok) {
                result = {
                    exists: false,
                    valid: false,
                    status: response.status,
                    sections: [],
                    links: [],
                    issues: [{
                            type: 'info',
                            code: 'NOT_FOUND',
                            message: 'llms.txt file not found',
                            recommendation: 'Consider adding an llms.txt file for AI SEO'
                        }],
                    errors: [],
                    warnings: []
                };
            }
            else {
                const content = await response.text();
                const parsed = this._simpleParse(content);
                result = {
                    exists: true,
                    // Considered valid as soon as a title or at least one section was found.
                    valid: parsed.sections.length > 0 || !!parsed.siteName,
                    status: 200,
                    siteName: parsed.siteName,
                    siteDescription: parsed.siteDescription,
                    sections: parsed.sections,
                    links: parsed.links,
                    issues: [],
                    errors: [],
                    warnings: ['Recker not available - using basic parsing'],
                    size: content.length
                };
            }
        }
        catch (error) {
            return {
                exists: false,
                valid: false,
                sections: [],
                links: [],
                issues: [],
                errors: [error.message],
                warnings: []
            };
        }
        this.cache.set(normalizedDomain, { result, timestamp: Date.now() });
        return result;
    }

    /**
     * Minimal llms.txt parser used by the fallback path.
     *
     * Recognises the first `# ` heading as the site name, the first `>` line as
     * the description, every `## ` heading as a section, and link lines written
     * either as `[text](url) - description` or as a markdown list item
     * `- [text](url): description`. Any other non-blank, non-heading line is
     * appended to the current section's `content`.
     *
     * @param {string} content Raw llms.txt text.
     * @returns {{siteName: (string|undefined), siteDescription: (string|undefined),
     *   sections: Array<{title: string, content: string, links: object[]}>, links: object[]}}
     */
    _simpleParse(content) {
        const sections = [];
        const links = [];
        let siteName;
        let siteDescription;
        let currentSection = null;
        for (const line of content.split(/\r?\n/)) {
            const trimmed = line.trim();
            if (!trimmed) {
                continue;
            }
            if (trimmed.startsWith('## ')) {
                if (currentSection) {
                    sections.push(currentSection);
                }
                currentSection = {
                    title: trimmed.slice(3).trim(),
                    content: '',
                    links: []
                };
                continue;
            }
            if (trimmed.startsWith('#')) {
                // Headings must be examined BEFORE being discarded; the previous blanket
                // skip of '#' lines made the title and section branches unreachable.
                // Remaining '#' lines (extra H1s, H3+, bare '#') are still ignored.
                if (trimmed.startsWith('# ') && !siteName) {
                    siteName = trimmed.slice(2).trim();
                }
                continue;
            }
            if (trimmed.startsWith('>') && !siteDescription) {
                siteDescription = trimmed.slice(1).trim();
                continue;
            }
            // Optional list marker, then [text](url), then an optional "- desc" or ": desc".
            const linkMatch = trimmed.match(/^(?:[-*+]\s+)?\[([^\]]+)\]\(([^)]+)\)(?:\s*[-:]\s*(.+))?$/);
            if (linkMatch) {
                const link = {
                    text: linkMatch[1],
                    url: linkMatch[2],
                    description: linkMatch[3]?.trim(),
                    section: currentSection?.title
                };
                links.push(link);
                if (currentSection) {
                    currentSection.links.push(link);
                }
                continue;
            }
            if (currentSection) {
                currentSection.content += (currentSection.content ? '\n' : '') + trimmed;
            }
        }
        if (currentSection) {
            sections.push(currentSection);
        }
        return { siteName, siteDescription, sections, links };
    }

    /**
     * Runs recker's validator over the site's llms.txt.
     *
     * @param {string} domain Base URL of the site.
     * @returns {Promise<object|null>} The value returned by recker's `validateLlmsTxt`, or
     *   `null` when recker is unavailable, the file does not exist, or the fetch fails.
     */
    async validate(domain) {
        const isReckerAvailable = await this._checkReckerAvailability();
        if (!isReckerAvailable) {
            return null;
        }
        const checkResult = await this.check(domain);
        if (!checkResult.exists) {
            return null;
        }
        const normalizedDomain = this._normalizeDomain(domain);
        const llmsUrl = `${normalizedDomain}/llms.txt`;
        try {
            const client = await this._getHttpClient();
            const response = await client.get(llmsUrl);
            if (!response.ok) {
                // `check()` may have been served from cache; never validate an error page.
                return null;
            }
            const content = await response.text();
            return this.validateLlmsTxt(content, normalizedDomain);
        }
        catch {
            return null;
        }
    }

    /**
     * Validates already-downloaded content. Synchronous, so it only works after
     * an earlier `check()`/`validate()` call has loaded recker.
     *
     * @param {string} content Raw llms.txt text.
     * @param {string} [baseUrl] Passed through to recker's validator.
     * @returns {object|null} `null` when recker has not been loaded.
     */
    validateContent(content, baseUrl) {
        if (!this.reckerAvailable || !this.validateLlmsTxt) {
            return null;
        }
        return this.validateLlmsTxt(content, baseUrl);
    }

    /**
     * Parses already-downloaded content with recker's parser.
     *
     * @param {string} content Raw llms.txt text.
     * @returns {object|null} `null` when recker has not been loaded.
     */
    parseContent(content) {
        if (!this.reckerAvailable || !this.parseLlmsTxt) {
            return null;
        }
        return this.parseLlmsTxt(content);
    }

    /**
     * Renders an llms.txt document, via recker when loaded and via the built-in
     * renderer otherwise.
     *
     * @param {object} options `siteName`, optional `siteDescription` and optional `sections`
     *   (each with a `title` and a `links` array of `{ text, url, description? }`).
     * @returns {string}
     */
    generateTemplate(options) {
        if (!this.reckerAvailable || !this.generateLlmsTxtTemplate) {
            return this._fallbackGenerateTemplate(options);
        }
        return this.generateLlmsTxtTemplate(options);
    }

    /**
     * Built-in template renderer. Output for fully populated options is unchanged;
     * a missing description no longer renders as "> undefined" and a section
     * without a `links` array no longer throws.
     *
     * @param {object} options See `generateTemplate`.
     * @returns {string}
     */
    _fallbackGenerateTemplate(options) {
        const lines = [];
        lines.push(`# ${options.siteName}`);
        lines.push('');
        if (options.siteDescription) {
            lines.push(`> ${options.siteDescription}`);
            lines.push('');
        }
        for (const section of options.sections ?? []) {
            lines.push(`## ${section.title}`);
            lines.push('');
            for (const link of section.links ?? []) {
                if (link.description) {
                    lines.push(`[${link.text}](${link.url}) - ${link.description}`);
                }
                else {
                    lines.push(`[${link.text}](${link.url})`);
                }
            }
            lines.push('');
        }
        return lines.join('\n');
    }

    /**
     * Checks whether `<domain>/llms-full.txt` is served.
     *
     * @param {string} domain Base URL of the site.
     * @returns {Promise<{exists: boolean, status?: number, size?: number}>}
     *   `size` is the length of the response text; `status` is omitted when the request threw.
     */
    async checkFullVersion(domain) {
        const normalizedDomain = this._normalizeDomain(domain);
        const llmsFullUrl = `${normalizedDomain}/llms-full.txt`;
        try {
            const client = await this._getHttpClient();
            const response = await client.get(llmsFullUrl);
            if (!response.ok) {
                return { exists: false, status: response.status };
            }
            const content = await response.text();
            return {
                exists: true,
                status: 200,
                size: content.length
            };
        }
        catch {
            return { exists: false };
        }
    }

    /**
     * @param {string} domain Base URL of the site.
     * @returns {Promise<object[]>} The `links` of `check(domain)`.
     */
    async getLinks(domain) {
        const result = await this.check(domain);
        return result.links;
    }

    /**
     * @param {string} domain Base URL of the site.
     * @returns {Promise<object[]>} The `sections` of `check(domain)`.
     */
    async getSections(domain) {
        const result = await this.check(domain);
        return result.sections;
    }

    /**
     * Drops the cached result for one domain, or the whole cache when no domain is given.
     *
     * @param {string} [domain]
     */
    clearCache(domain) {
        if (domain) {
            this.cache.delete(this._normalizeDomain(domain));
        }
        else {
            this.cache.clear();
        }
    }

    /**
     * @returns {{size: number, domains: string[]}} Number of cached entries and their keys.
     */
    getCacheStats() {
        return {
            size: this.cache.size,
            domains: [...this.cache.keys()]
        };
    }

    /**
     * @returns {boolean} `true` only after recker has been probed and successfully imported.
     */
    isReckerEnabled() {
        return this.reckerAvailable === true;
    }
}
|
|
333
|
+
export default ReckerLlmsTxtValidator;
|
|
334
|
+
//# sourceMappingURL=recker-llms-validator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"recker-llms-validator.js","sourceRoot":"","sources":["../../../src/plugins/spider/recker-llms-validator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAqGhE,MAAM,OAAO,sBAAsB;IACzB,MAAM,CAEZ;IACM,QAAQ,CAAsB;IAC9B,KAAK,CAA0B;IAC/B,WAAW,CAAoB;IAE/B,eAAe,GAAmB,IAAI,CAAC;IACvC,YAAY,GAA8B,IAAI,CAAC;IAC/C,eAAe,GAAiC,IAAI,CAAC;IACrD,uBAAuB,GAAyC,IAAI,CAAC;IACrE,uBAAuB,GAAyC,IAAI,CAAC;IAE7E,YAAY,SAAiC,EAAE;QAC7C,IAAI,CAAC,MAAM,GAAG;YACZ,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,aAAa;YAC5C,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,KAAK;YAC1C,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,OAAO;YAC5C,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,IAAI;SAChC,CAAC;QAEF,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC;QACpC,IAAI,CAAC,KAAK,GAAG,IAAI,GAAG,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;IAC1B,CAAC;IAEO,KAAK,CAAC,wBAAwB;QACpC,IAAI,IAAI,CAAC,eAAe,KAAK,IAAI,EAAE,CAAC;YAClC,OAAO,IAAI,CAAC,eAAe,CAAC;QAC9B,CAAC;QAED,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,gCAAgC,CAAC,CAAC;YAClE,IAAI,CAAC,YAAY,GAAG,UAAU,CAAC,YAAY,CAAC;YAC5C,IAAI,CAAC,eAAe,GAAG,UAAU,CAAC,eAAe,CAAC;YAClD,IAAI,CAAC,uBAAuB,GAAG,UAAU,CAAC,uBAAuB,CAAC;YAClE,IAAI,CAAC,uBAAuB,GAAG,UAAU,CAAC,uBAAuB,CAAC;YAClE,IAAI,CAAC,eAAe,GAAG,IAAI,CAAC;YAC5B,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,IAAI,CAAC,eAAe,GAAG,KAAK,CAAC;YAC7B,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,cAAc;QAC1B,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;YACtB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ;gBAC9B,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,mBAAmB,CAAC,qBAAqB,CAAC;gBAC1D,CAAC,CAAC;oBACE,OAAO,EAAE;wBACP,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;qBACpC;iBACF,CAAC;YAEN,IAAI,CAAC,WAAW,GAAG,MAAM,gBAAgB,CAAC;gBACxC,GAAG,UAAU;gBACb,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;gBACjC,KAAK,EAAE;oBACL,WAAW,EAAE,CAAC;oBACd,KAAK,EAAE,GAAG;oBACV,OAAO,EAAE,aAAa;oBACtB,UAAU,EAAE,IAAI;oBAChB,OAAO,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC;iBACnC;aACF,CAA0B,CAAC;QAC9B,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,MAAc;QACxB,MAAM,iBAAiB,GAAG,MAAM,IAAI,CAAC,wBAAwB,EAA
E,CAAC;QAEhE,IAAI,CAAC,iBAAiB,EAAE,CAAC;YACvB,OAAO,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;QACrC,CAAC;QAED,MAAM,gBAAgB,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACnD,MAAM,OAAO,GAAG,GAAG,gBAAgB,WAAW,CAAC;QAE/C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;QAChD,IAAI,MAAM,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,MAAM,CAAC,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC;YACvE,OAAO,MAAM,CAAC,MAAM,CAAC;QACvB,CAAC;QAED,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,uBAAwB,CAAC,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;gBAC7E,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;gBAC3C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;gBAEvC,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;oBAClB,IAAI,CAAC,QAAQ,CAAC,eAAe,CAC3B,QAA0E,EAC1E,GAAG,CACJ,CAAC;gBACJ,CAAC;gBAED,OAAO;oBACL,MAAM,EAAE,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM;oBAC3C,IAAI,EAAE,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE;iBAC/C,CAAC;YACJ,CAAC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAuB;gBACjC,MAAM,EAAE,WAAW,CAAC,MAAM;gBAC1B,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,MAAM,EAAE,WAAW,CAAC,MAAM;gBAC1B,iBAAiB,EAAE,WAAW,CAAC,iBAAiB;gBAChD,QAAQ,EAAE,WAAW,CAAC,WAAW,CAAC,QAAQ;gBAC1C,eAAe,EAAE,WAAW,CAAC,WAAW,CAAC,eAAe;gBACxD,QAAQ,EAAE,WAAW,CAAC,WAAW,CAAC,QAAQ;gBAC1C,KAAK,EAAE,WAAW,CAAC,WAAW,CAAC,KAAK;gBACpC,MAAM,EAAE,WAAW,CAAC,MAAM;gBAC1B,MAAM,EAAE,WAAW,CAAC,WAAW,CAAC,MAAM;gBACtC,QAAQ,EAAE,WAAW,CAAC,WAAW,CAAC,QAAQ;gBAC1C,IAAI,EAAE,WAAW,CAAC,WAAW,CAAC,IAAI;aACnC,CAAC;YAEF,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,gBAAgB,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;YAEpE,OAAO,MAAM,CAAC;QAEhB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO;gBACL,MAAM,EAAE,KAAK;gBACb,KAAK,EAAE,KAAK;gBACZ,QAAQ,EAAE,EAAE;gBACZ,KAAK,EAAE,EAAE;gBACT,MAAM,EAAE,CAAC;wBACP,IAAI,EAAE,OAAO;wBACb,IAAI,EAAE,aAAa;wBACnB,OAAO,EAAE,6BAA8B,KAAe,CAAC,OAAO,EAAE;qBACjE,CAAC;gBACF,MAAM,EAAE,CAAE,KAAe,CAAC,OAAO,CAAC;gBAClC,QAAQ,EAAE,EAAE;aACb,CAAC;QACJ,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,cAAc,CAAC,MAAc;QACzC,MAAM,gBAAgB,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE
,EAAE,CAAC,CAAC;QACnD,MAAM,OAAO,GAAG,GAAG,gBAAgB,WAAW,CAAC;QAE/C,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;YAC3C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YAE3C,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,OAAO;oBACL,MAAM,EAAE,KAAK;oBACb,KAAK,EAAE,KAAK;oBACZ,MAAM,EAAE,QAAQ,CAAC,MAAM;oBACvB,QAAQ,EAAE,EAAE;oBACZ,KAAK,EAAE,EAAE;oBACT,MAAM,EAAE,CAAC;4BACP,IAAI,EAAE,MAAM;4BACZ,IAAI,EAAE,WAAW;4BACjB,OAAO,EAAE,yBAAyB;4BAClC,cAAc,EAAE,6CAA6C;yBAC9D,CAAC;oBACF,MAAM,EAAE,EAAE;oBACV,QAAQ,EAAE,EAAE;iBACb,CAAC;YACJ,CAAC;YAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACtC,MAAM,MAAM,GAAG,IAAI,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;YAE1C,OAAO;gBACL,MAAM,EAAE,IAAI;gBACZ,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,QAAQ;gBACtD,MAAM,EAAE,GAAG;gBACX,QAAQ,EAAE,MAAM,CAAC,QAAQ;gBACzB,eAAe,EAAE,MAAM,CAAC,eAAe;gBACvC,QAAQ,EAAE,MAAM,CAAC,QAAQ;gBACzB,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,MAAM,EAAE,EAAE;gBACV,MAAM,EAAE,EAAE;gBACV,QAAQ,EAAE,CAAC,4CAA4C,CAAC;gBACxD,IAAI,EAAE,OAAO,CAAC,MAAM;aACrB,CAAC;QAEJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO;gBACL,MAAM,EAAE,KAAK;gBACb,KAAK,EAAE,KAAK;gBACZ,QAAQ,EAAE,EAAE;gBACZ,KAAK,EAAE,EAAE;gBACT,MAAM,EAAE,EAAE;gBACV,MAAM,EAAE,CAAE,KAAe,CAAC,OAAO,CAAC;gBAClC,QAAQ,EAAE,EAAE;aACb,CAAC;QACJ,CAAC;IACH,CAAC;IAEO,YAAY,CAAC,OAAe;QAMlC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACrC,MAAM,QAAQ,GAAqB,EAAE,CAAC;QACtC,MAAM,KAAK,GAAkB,EAAE,CAAC;QAChC,IAAI,QAA4B,CAAC;QACjC,IAAI,eAAmC,CAAC;QACxC,IAAI,cAAc,GAA0B,IAAI,CAAC;QAEjD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;YAE5B,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAElD,IAAI,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAC1C,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACnC,SAAS;YACX,CAAC;YAED,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;gBAChD,eAAe,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC1C,SAAS;YACX,CAAC;YAED,IAAI,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;
gBAC9B,IAAI,cAAc,EAAE,CAAC;oBACnB,QAAQ,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAChC,CAAC;gBACD,cAAc,GAAG;oBACf,KAAK,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;oBAC9B,OAAO,EAAE,EAAE;oBACX,KAAK,EAAE,EAAE;iBACV,CAAC;gBACF,SAAS;YACX,CAAC;YAED,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,2CAA2C,CAAC,CAAC;YAC7E,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,IAAI,GAAgB;oBACxB,IAAI,EAAE,SAAS,CAAC,CAAC,CAAE;oBACnB,GAAG,EAAE,SAAS,CAAC,CAAC,CAAE;oBAClB,WAAW,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE;oBACjC,OAAO,EAAE,cAAc,EAAE,KAAK;iBAC/B,CAAC;gBACF,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACjB,IAAI,cAAc,EAAE,CAAC;oBACnB,cAAc,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBAClC,CAAC;gBACD,SAAS;YACX,CAAC;YAED,IAAI,cAAc,IAAI,OAAO,EAAE,CAAC;gBAC9B,cAAc,CAAC,OAAO,IAAI,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC;YAC3E,CAAC;QACH,CAAC;QAED,IAAI,cAAc,EAAE,CAAC;YACnB,QAAQ,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAChC,CAAC;QAED,OAAO,EAAE,QAAQ,EAAE,eAAe,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC;IACxD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,MAAc;QAC3B,MAAM,iBAAiB,GAAG,MAAM,IAAI,CAAC,wBAAwB,EAAE,CAAC;QAEhE,IAAI,CAAC,iBAAiB,EAAE,CAAC;YACvB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QAE7C,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,CAAC;YACxB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,gBAAgB,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACnD,MAAM,OAAO,GAAG,GAAG,gBAAgB,WAAW,CAAC;QAE/C,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;YAC3C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YAC3C,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YAEtC,OAAO,IAAI,CAAC,eAAgB,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;QAC1D,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED,eAAe,CAAC,OAAe,EAAE,OAAgB;QAC/C,IAAI,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;YACnD,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAChD,CAAC;IAED,YAAY,CAAC,OAAe;QAC1B,IAAI,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,CAAC;YAChD,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO,IAAI,CAAC,YAA
Y,CAAC,OAAO,CAAC,CAAC;IACpC,CAAC;IAED,gBAAgB,CAAC,OAA+B;QAC9C,IAAI,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,IAAI,CAAC,uBAAuB,EAAE,CAAC;YAC3D,OAAO,IAAI,CAAC,yBAAyB,CAAC,OAAO,CAAC,CAAC;QACjD,CAAC;QAED,OAAO,IAAI,CAAC,uBAAuB,CAAC,OAAO,CAAC,CAAC;IAC/C,CAAC;IAEO,yBAAyB,CAAC,OAA+B;QAC/D,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,CAAC,IAAI,CAAC,KAAK,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,KAAK,OAAO,CAAC,eAAe,EAAE,CAAC,CAAC;QAC3C,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAEf,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;YACrB,KAAK,MAAM,OAAO,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;gBACvC,KAAK,CAAC,IAAI,CAAC,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;gBAClC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,KAAK,MAAM,IAAI,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;oBACjC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;wBACrB,KAAK,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,GAAG,OAAO,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC;oBAClE,CAAC;yBAAM,CAAC;wBACN,KAAK,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,IAAI,KAAK,IAAI,CAAC,GAAG,GAAG,CAAC,CAAC;oBAC5C,CAAC;gBACH,CAAC;gBACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACjB,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAED,KAAK,CAAC,gBAAgB,CAAC,MAAc;QAKnC,MAAM,gBAAgB,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACnD,MAAM,WAAW,GAAG,GAAG,gBAAgB,gBAAgB,CAAC;QAExD,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;YAC3C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;YAE/C,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,QAAQ,CAAC,MAAM,EAAE,CAAC;YACpD,CAAC;YAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACtC,OAAO;gBACL,MAAM,EAAE,IAAI;gBACZ,MAAM,EAAE,GAAG;gBACX,IAAI,EAAE,OAAO,CAAC,MAAM;aACrB,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;QAC3B,CAAC;IACH,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,MAAc;QAC3B,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACxC,OAAO,MAAM,CAAC,KAAK,CAAC;IACtB,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,MAAc;QAC9B,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACxC,OAAO,MAAM,CAAC,QAAQ,
CAAC;IACzB,CAAC;IAED,UAAU,CAAC,MAAe;QACxB,IAAI,MAAM,EAAE,CAAC;YACX,MAAM,gBAAgB,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YACnD,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC;QACtC,CAAC;aAAM,CAAC;YACN,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;QACrB,CAAC;IACH,CAAC;IAED,aAAa;QACX,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACrB,OAAO,EAAE,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;SAChC,CAAC;IACJ,CAAC;IAED,eAAe;QACb,OAAO,IAAI,CAAC,eAAe,KAAK,IAAI,CAAC;IACvC,CAAC;CACF;AAED,eAAe,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
import { createHttpClient } from '#src/concerns/http-client.js';
|
|
2
|
+
/**
 * robots.txt checker that uses the optional `recker` package when it can be
 * imported and otherwise delegates to the local `RobotsParser` fallback.
 *
 * Parsed robots.txt results are cached per domain string in `this.cache`
 * for `config.cacheTimeout` milliseconds.
 */
export class ReckerRobotsValidator {
    config;
    _context;
    cache;
    fetcher;
    _httpClient;
    // null = not probed yet, true/false = result of the dynamic import probe.
    reckerAvailable = null;
    parseRobotsTxt = null;
    validateRobotsTxt = null;
    isPathAllowed = null;
    fetchAndValidateRobotsTxt = null;
    fallbackParser = null;

    /**
     * @param {object} [config]
     * @param {string} [config.userAgent='s3db-spider'] Agent name matched against robots.txt blocks.
     * @param {boolean} [config.defaultAllow=true] Verdict used when robots.txt is missing or an error occurs.
     * @param {number} [config.cacheTimeout=3600000] Cache lifetime in ms; an explicit 0 disables cache reuse.
     * @param {number} [config.fetchTimeout=10000] Timeout handed to the HTTP client.
     * @param {object|null} [config.context] Optional crawl context (must provide getHttpClientConfig/processResponse).
     * @param {Function|null} [config.fetcher] Optional `async (url) => string` used instead of the built-in client.
     */
    constructor(config = {}) {
        this.config = {
            userAgent: config.userAgent || 's3db-spider',
            defaultAllow: config.defaultAllow !== false,
            // `??` rather than `||` so that an explicit 0 is honoured instead of
            // being replaced by the one-hour default.
            cacheTimeout: config.cacheTimeout ?? 3600000,
            fetchTimeout: config.fetchTimeout || 10000,
            context: config.context || null,
            fetcher: config.fetcher || null
        };
        this._context = this.config.context;
        this.cache = new Map();
        this.fetcher = this.config.fetcher;
        this._httpClient = null;
    }

    /**
     * Probes (once) whether `recker/seo/validators/robots` can be imported and,
     * if so, stores its functions on the instance.
     *
     * @returns {Promise<boolean>} true when the recker functions are loaded.
     */
    async _checkReckerAvailability() {
        if (this.reckerAvailable !== null) {
            return this.reckerAvailable;
        }
        try {
            const robotsModule = await import('recker/seo/validators/robots');
            this.parseRobotsTxt = robotsModule.parseRobotsTxt;
            this.validateRobotsTxt = robotsModule.validateRobotsTxt;
            this.isPathAllowed = robotsModule.isPathAllowed;
            this.fetchAndValidateRobotsTxt = robotsModule.fetchAndValidateRobotsTxt;
            this.reckerAvailable = true;
            return true;
        }
        catch {
            // Deliberate best-effort: any import failure means "use the fallback".
            this.reckerAvailable = false;
            return false;
        }
    }

    /**
     * Lazily creates the fallback `RobotsParser`, constructed with this
     * instance's resolved config.
     *
     * @returns {Promise<object>} the shared fallback parser instance.
     */
    async _getFallbackParser() {
        if (!this.fallbackParser) {
            const { RobotsParser } = await import('./robots-parser.js');
            this.fallbackParser = new RobotsParser(this.config);
        }
        return this.fallbackParser;
    }

    /**
     * Replaces the custom fetcher, forwarding it to the fallback parser when
     * one has already been created.
     *
     * @param {Function|null} fetcher
     */
    setFetcher(fetcher) {
        this.fetcher = fetcher;
        if (this.fallbackParser) {
            this.fallbackParser.setFetcher(fetcher);
        }
    }

    /**
     * Decides whether `url` may be fetched according to its site's robots.txt.
     *
     * Never rejects on the recker path: failures are reported as
     * `{ allowed: defaultAllow, source: 'error', error }`.
     *
     * @param {string} url Absolute URL to check.
     * @returns {Promise<object>} `{ allowed, source, crawlDelay?, matchedRule?, error? }`
     *   on the recker path, or whatever the fallback parser returns.
     */
    async isAllowed(url) {
        const isReckerAvailable = await this._checkReckerAvailability();
        if (!isReckerAvailable) {
            const fallback = await this._getFallbackParser();
            return fallback.isAllowed(url);
        }
        try {
            const urlObj = new URL(url);
            const domain = `${urlObj.protocol}//${urlObj.host}`;
            const path = urlObj.pathname + urlObj.search;
            const cached = await this._getCachedOrFetch(domain);
            if (!cached.parseResult) {
                return { allowed: this.config.defaultAllow, source: 'no-robots-txt' };
            }
            const allowed = this.isPathAllowed(cached.parseResult, path, this.config.userAgent);
            const crawlDelay = this._getCrawlDelayFromParseResult(cached.parseResult);
            const matchedRule = this._findMatchedRule(cached.parseResult, path);
            return {
                allowed,
                crawlDelay,
                source: 'robots-txt',
                matchedRule
            };
        }
        catch (error) {
            return {
                allowed: this.config.defaultAllow,
                source: 'error',
                error: error.message
            };
        }
    }

    /**
     * Writes a cache entry for `domain`, stamped with the current time.
     *
     * @returns {object} the stored `{ parseResult, validationResult, timestamp }` entry.
     */
    _storeCacheEntry(domain, parseResult, validationResult) {
        const entry = {
            parseResult,
            validationResult,
            timestamp: Date.now()
        };
        this.cache.set(domain, entry);
        return entry;
    }

    /**
     * Returns the cache entry for `domain`, fetching and parsing
     * `${domain}/robots.txt` when there is no entry younger than
     * `config.cacheTimeout`.
     *
     * A failed fetch (or a fetcher that yields null/undefined) is cached as an
     * entry whose `parseResult` and `validationResult` are null. Requires the
     * recker functions to be loaded; callers check availability first.
     *
     * @param {string} domain Value used as cache key and as prefix of the robots.txt URL.
     * @returns {Promise<object>} `{ parseResult, validationResult, timestamp }`
     */
    async _getCachedOrFetch(domain) {
        const cached = this.cache.get(domain);
        if (cached && Date.now() - cached.timestamp < this.config.cacheTimeout) {
            return cached;
        }
        const robotsUrl = `${domain}/robots.txt`;
        let content = null;
        try {
            content = this.fetcher
                ? await this.fetcher(robotsUrl)
                : await this._fetchRobotsTxt(robotsUrl);
        }
        catch {
            return this._storeCacheEntry(domain, null, null);
        }
        // A custom fetcher may signal "nothing there" with null/undefined; do not
        // hand that to the parser, treat it like a missing robots.txt instead.
        if (content == null) {
            return this._storeCacheEntry(domain, null, null);
        }
        const parseResult = this.parseRobotsTxt(content);
        const validationResult = this.validateRobotsTxt(content, domain);
        return this._storeCacheEntry(domain, parseResult, validationResult);
    }

    /**
     * Lazily builds the HTTP client used when no custom fetcher is set.
     *
     * @returns {Promise<object>} the memoized client.
     */
    async _getHttpClient() {
        if (!this._httpClient) {
            // NOTE(review): the context is always asked for the config of the
            // literal 'https://example.com', not the domain being fetched —
            // confirm that getHttpClientConfig does not vary per URL.
            const baseConfig = this._context
                ? this._context.getHttpClientConfig('https://example.com')
                : {
                    headers: {
                        'User-Agent': this.config.userAgent
                    }
                };
            this._httpClient = await createHttpClient({
                ...baseConfig,
                timeout: this.config.fetchTimeout,
                retry: {
                    maxAttempts: 2,
                    delay: 500,
                    backoff: 'exponential',
                    retryAfter: true,
                    retryOn: [429, 500, 502, 503, 504]
                }
            });
        }
        return this._httpClient;
    }

    /**
     * Downloads a robots.txt body with the built-in HTTP client.
     *
     * @param {string} url
     * @returns {Promise<string>} response body text.
     * @throws {Error} `HTTP <status>` when the response is not ok.
     */
    async _fetchRobotsTxt(url) {
        const client = await this._getHttpClient();
        const response = await client.get(url);
        if (this._context) {
            this._context.processResponse(response, url);
        }
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        return await response.text();
    }

    /**
     * Picks the user-agent block that applies to `config.userAgent`.
     *
     * Priority (all comparisons case-insensitive): a block naming the agent
     * exactly, then the first block with a non-`*` agent that contains, or is
     * contained in, the configured agent, then the first `*` block.
     *
     * @param {object} parseResult Parsed robots.txt exposing `userAgentBlocks`.
     * @returns {object|null} the selected block, or null when none applies.
     */
    _selectUserAgentBlock(parseResult) {
        const wanted = this.config.userAgent.toLowerCase();
        const candidates = (parseResult.userAgentBlocks ?? []).map(block => ({
            block,
            agents: block.userAgents.map(agent => agent.toLowerCase())
        }));
        const isPartial = agent => agent !== '*' && (agent.includes(wanted) || wanted.includes(agent));
        const hit = candidates.find(({ agents }) => agents.includes(wanted))
            ?? candidates.find(({ agents }) => agents.some(isPartial))
            ?? candidates.find(({ agents }) => agents.includes('*'));
        return hit ? hit.block : null;
    }

    /**
     * Crawl delay of the applicable block, multiplied by 1000
     * (presumably seconds to milliseconds — confirm against recker's units).
     *
     * @param {object} parseResult
     * @returns {number|null} null when no block applies or its delay is falsy.
     */
    _getCrawlDelayFromParseResult(parseResult) {
        const block = this._selectUserAgentBlock(parseResult);
        return block && block.crawlDelay ? block.crawlDelay * 1000 : null;
    }

    /**
     * Finds the rule path in the applicable block that matches `path`,
     * preferring the longest pattern (length measured without `*`).
     *
     * @param {object} parseResult
     * @param {string} path Pathname plus query string.
     * @returns {string|undefined} the matching rule's path, if any.
     */
    _findMatchedRule(parseResult, path) {
        const block = this._selectUserAgentBlock(parseResult);
        if (!block) {
            return undefined;
        }
        const literalLength = rule => rule.path.replace(/\*/g, '').length;
        const bySpecificity = [...block.rules].sort((a, b) => literalLength(b) - literalLength(a));
        const match = bySpecificity.find(rule => this._pathMatches(path, rule.path));
        return match ? match.path : undefined;
    }

    /**
     * Tests `path` against a robots.txt path pattern: `*` matches any run of
     * characters, a trailing `$` anchors the end, otherwise the pattern is a
     * prefix. Matching is case-insensitive.
     *
     * @param {string} path
     * @param {string} pattern
     * @returns {boolean}
     */
    _pathMatches(path, pattern) {
        const anchored = pattern.endsWith('$');
        const body = anchored ? pattern.slice(0, -1) : pattern;
        const source = body
            // Collapse runs of '*' so untrusted patterns cannot stack '.*.*.*'.
            .replace(/\*{2,}/g, '*')
            .split('*')
            // Escape every regex metacharacter, including a '$' that is not the
            // final character and therefore must match literally.
            .map(part => part.replace(/[.+?^${}()|[\]\\]/g, '\\$&'))
            .join('.*');
        const regex = new RegExp(`^${source}${anchored ? '' : '.*'}$`, 'i');
        return regex.test(path);
    }

    /**
     * @param {string} domain
     * @returns {Promise<Array>} sitemaps from the parsed robots.txt (empty when
     *   none), or the fallback parser's answer when recker is unavailable.
     */
    async getSitemaps(domain) {
        const isReckerAvailable = await this._checkReckerAvailability();
        if (!isReckerAvailable) {
            const fallback = await this._getFallbackParser();
            return fallback.getSitemaps(domain);
        }
        const cached = await this._getCachedOrFetch(domain);
        return cached.parseResult?.sitemaps || [];
    }

    /**
     * @param {string} domain
     * @returns {Promise<number|null>} crawl delay for the configured agent, or
     *   the fallback parser's answer when recker is unavailable.
     */
    async getCrawlDelay(domain) {
        const isReckerAvailable = await this._checkReckerAvailability();
        if (!isReckerAvailable) {
            const fallback = await this._getFallbackParser();
            return fallback.getCrawlDelay(domain);
        }
        const cached = await this._getCachedOrFetch(domain);
        if (!cached.parseResult) {
            return null;
        }
        return this._getCrawlDelayFromParseResult(cached.parseResult);
    }

    /**
     * Warms the cache for `domain`.
     *
     * When recker is unavailable this instance's cache is not used, so the
     * request is forwarded to the fallback parser instead of calling the
     * (unset) recker parse function.
     *
     * @param {string} domain
     * @returns {Promise<void>}
     */
    async preload(domain) {
        const isReckerAvailable = await this._checkReckerAvailability();
        if (!isReckerAvailable) {
            const fallback = await this._getFallbackParser();
            // NOTE(review): RobotsParser's API is not visible here; only call
            // preload when the fallback actually provides it.
            if (typeof fallback.preload === 'function') {
                await fallback.preload(domain);
            }
            return;
        }
        await this._getCachedOrFetch(domain);
    }

    /**
     * Drops the cache entry for `domain`, or the whole cache when `domain`
     * is falsy.
     *
     * @param {string} [domain]
     */
    clearCache(domain) {
        if (domain) {
            this.cache.delete(domain);
        }
        else {
            this.cache.clear();
        }
    }

    /**
     * @returns {{size: number, domains: string[]}} number of cached entries and their keys.
     */
    getCacheStats() {
        return {
            size: this.cache.size,
            domains: [...this.cache.keys()]
        };
    }

    /**
     * Summarises recker's validation of the robots.txt that governs `url`.
     *
     * @param {string} url Absolute URL on the site to validate.
     * @returns {Promise<object|null>} `{ valid, issues, blocksAllRobots,
     *   blocksImportantPaths, host, size }`, or null when recker is
     *   unavailable, robots.txt could not be loaded, or any error occurs.
     */
    async validate(url) {
        const isReckerAvailable = await this._checkReckerAvailability();
        if (!isReckerAvailable) {
            return null;
        }
        try {
            const urlObj = new URL(url);
            const domain = `${urlObj.protocol}//${urlObj.host}`;
            const cached = await this._getCachedOrFetch(domain);
            if (!cached.validationResult || !cached.parseResult) {
                return null;
            }
            return {
                valid: cached.validationResult.valid,
                issues: cached.validationResult.issues,
                blocksAllRobots: cached.parseResult.blocksAllRobots,
                blocksImportantPaths: cached.parseResult.blocksImportantPaths,
                host: cached.parseResult.host,
                size: cached.parseResult.size
            };
        }
        catch {
            return null;
        }
    }

    /**
     * Validates raw robots.txt text with recker.
     *
     * @param {string} content
     * @param {string} baseUrl
     * @returns {Promise<object|null>} recker's validation result, or null when recker is unavailable.
     */
    async validateContent(content, baseUrl) {
        const isReckerAvailable = await this._checkReckerAvailability();
        if (!isReckerAvailable || !this.validateRobotsTxt) {
            return null;
        }
        return this.validateRobotsTxt(content, baseUrl);
    }

    /**
     * Parses raw robots.txt text with recker.
     *
     * Synchronous, so it cannot trigger the availability probe: it returns null
     * until an async method of this instance has successfully loaded recker.
     *
     * @param {string} content
     * @returns {object|null}
     */
    parseContent(content) {
        if (!this.reckerAvailable || !this.parseRobotsTxt) {
            return null;
        }
        return this.parseRobotsTxt(content);
    }

    /**
     * Reads blocking flags from the cache only (never fetches).
     *
     * @param {string} domain
     * @returns {{blocksAllRobots: *, blocksImportantPaths: *}|null} null when
     *   nothing is cached for `domain` or its robots.txt was not parsed.
     */
    getBlockingStatus(domain) {
        const cached = this.cache.get(domain);
        if (!cached?.parseResult) {
            return null;
        }
        return {
            blocksAllRobots: cached.parseResult.blocksAllRobots,
            blocksImportantPaths: cached.parseResult.blocksImportantPaths
        };
    }

    /**
     * Reads the parsed `host` value from the cache only (never fetches).
     *
     * @param {string} domain
     * @returns {*|null}
     */
    getHost(domain) {
        const cached = this.cache.get(domain);
        return cached?.parseResult?.host || null;
    }

    /**
     * @param {string} domain
     * @returns {Promise<Array>} recker validation issues for the domain's
     *   robots.txt; empty when recker is unavailable or nothing was validated.
     */
    async getValidationIssues(domain) {
        // Validation exists only on the recker path; without this guard the
        // fetch below would call the unset parse function and throw.
        const isReckerAvailable = await this._checkReckerAvailability();
        if (!isReckerAvailable) {
            return [];
        }
        const cached = await this._getCachedOrFetch(domain);
        return cached.validationResult?.issues || [];
    }

    /**
     * @returns {boolean} true only after the probe has succeeded.
     */
    isReckerEnabled() {
        return this.reckerAvailable === true;
    }
}
|
|
335
|
+
export default ReckerRobotsValidator;
|
|
336
|
+
//# sourceMappingURL=recker-robots-validator.js.map
|