@snap-agent/rag-web 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +571 -0
- package/dist/index.d.mts +727 -0
- package/dist/index.d.ts +727 -0
- package/dist/index.js +2144 -0
- package/dist/index.mjs +2107 -0
- package/package.json +71 -0
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,2107 @@
|
|
|
1
|
+
// src/WebRAGPlugin.ts
|
|
2
|
+
import { MongoClient } from "mongodb";
|
|
3
|
+
import OpenAI from "openai";
|
|
4
|
+
import * as cheerio from "cheerio";
|
|
5
|
+
import * as fs from "fs";
|
|
6
|
+
import * as path from "path";
|
|
7
|
+
var WebRAGPlugin = class _WebRAGPlugin {
|
|
8
|
+
name = "web-rag";
|
|
9
|
+
type = "rag";
|
|
10
|
+
priority;
|
|
11
|
+
config;
|
|
12
|
+
client = null;
|
|
13
|
+
db = null;
|
|
14
|
+
openai;
|
|
15
|
+
// Embedding cache
|
|
16
|
+
embeddingCache = /* @__PURE__ */ new Map();
|
|
17
|
+
cacheStats = { hits: 0, misses: 0 };
|
|
18
|
+
constructor(config) {
|
|
19
|
+
this.config = {
|
|
20
|
+
collection: "web_content",
|
|
21
|
+
embeddingModel: "text-embedding-3-small",
|
|
22
|
+
vectorIndexName: "web_vector_index",
|
|
23
|
+
numCandidates: 100,
|
|
24
|
+
limit: 10,
|
|
25
|
+
minScore: 0.7,
|
|
26
|
+
filterableFields: ["type"],
|
|
27
|
+
...config
|
|
28
|
+
};
|
|
29
|
+
this.priority = config.priority ?? 100;
|
|
30
|
+
this.openai = new OpenAI({ apiKey: config.openaiApiKey });
|
|
31
|
+
}
|
|
32
|
+
// ============================================================================
|
|
33
|
+
// MongoDB Connection
|
|
34
|
+
// ============================================================================
|
|
35
|
+
async getCollection() {
|
|
36
|
+
if (!this.client) {
|
|
37
|
+
this.client = new MongoClient(this.config.mongoUri);
|
|
38
|
+
await this.client.connect();
|
|
39
|
+
this.db = this.client.db(this.config.dbName);
|
|
40
|
+
}
|
|
41
|
+
return this.db.collection(this.config.collection);
|
|
42
|
+
}
|
|
43
|
+
async getLedgerCollection() {
|
|
44
|
+
if (!this.client) {
|
|
45
|
+
this.client = new MongoClient(this.config.mongoUri);
|
|
46
|
+
await this.client.connect();
|
|
47
|
+
this.db = this.client.db(this.config.dbName);
|
|
48
|
+
}
|
|
49
|
+
const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
|
|
50
|
+
return this.db.collection(name);
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* List recent crawl ledger rows (for dashboards / pagination in the front).
|
|
54
|
+
*/
|
|
55
|
+
async listCrawlLedger(options = {}) {
|
|
56
|
+
const col = await this.getLedgerCollection();
|
|
57
|
+
const filter = { tenantId: this.config.tenantId };
|
|
58
|
+
filter.agentId = options.agentId ?? "shared";
|
|
59
|
+
if (options.domain) filter.domain = options.domain;
|
|
60
|
+
if (options.status) filter.lastStatus = options.status;
|
|
61
|
+
const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
|
|
62
|
+
const skip = Math.max(options.skip ?? 0, 0);
|
|
63
|
+
return col.find(filter).sort({ lastCrawledAt: -1 }).skip(skip).limit(limit).toArray();
|
|
64
|
+
}
|
|
65
|
+
resolveCrawlLedgerOptions(config) {
|
|
66
|
+
const plugin = this.config.crawlLedger;
|
|
67
|
+
const per = config.crawlLedger;
|
|
68
|
+
const enabled = per?.enabled ?? plugin?.enabled ?? false;
|
|
69
|
+
if (!enabled) return null;
|
|
70
|
+
const ttlMsFailure = per?.ttlMsFailure ?? plugin?.ttlMsFailure ?? 60 * 60 * 1e3;
|
|
71
|
+
return {
|
|
72
|
+
ttlMsIndexed: per?.ttlMsIndexed ?? plugin?.ttlMsIndexed ?? 7 * 24 * 60 * 60 * 1e3,
|
|
73
|
+
ttlMsFailure,
|
|
74
|
+
ttlMsRenderError: per?.ttlMsRenderError ?? plugin?.ttlMsRenderError ?? 5 * 60 * 1e3,
|
|
75
|
+
maxPageStatuses: per?.maxPageStatuses ?? 500,
|
|
76
|
+
stripQuery: config.stripQueryParams ?? false
|
|
77
|
+
};
|
|
78
|
+
}
|
|
79
|
+
normalizeLedgerUrl(url, stripQuery) {
|
|
80
|
+
return this.normalizeWebsiteUrl(url, stripQuery);
|
|
81
|
+
}
|
|
82
|
+
shouldSkipLedger(entry, ttlMsIndexed, ttlMsFailure, ttlMsRenderError, forceRecrawl) {
|
|
83
|
+
if (forceRecrawl || !entry) return false;
|
|
84
|
+
const t = entry.lastCrawledAt instanceof Date ? entry.lastCrawledAt.getTime() : new Date(entry.lastCrawledAt).getTime();
|
|
85
|
+
const age = Date.now() - t;
|
|
86
|
+
if (entry.lastStatus === "indexed" && age < ttlMsIndexed) return true;
|
|
87
|
+
if (entry.lastStatus === "error" && age < ttlMsRenderError) return true;
|
|
88
|
+
if (entry.lastStatus !== "indexed" && entry.lastStatus !== "error" && age < ttlMsFailure) {
|
|
89
|
+
return true;
|
|
90
|
+
}
|
|
91
|
+
return false;
|
|
92
|
+
}
|
|
93
|
+
async findLedgerEntry(urlNormalized, agentId) {
|
|
94
|
+
const col = await this.getLedgerCollection();
|
|
95
|
+
return col.findOne({
|
|
96
|
+
tenantId: this.config.tenantId,
|
|
97
|
+
agentId,
|
|
98
|
+
urlNormalized
|
|
99
|
+
});
|
|
100
|
+
}
|
|
101
|
+
toLedgerStatus(doc, diag) {
|
|
102
|
+
if (doc) return "indexed";
|
|
103
|
+
if (diag?.reason === "non_html") return "non_html";
|
|
104
|
+
if (diag?.reason === "blocked_suspected") return "blocked_suspected";
|
|
105
|
+
if (diag?.reason === "render_error") return "error";
|
|
106
|
+
return "too_small";
|
|
107
|
+
}
|
|
108
|
+
async upsertLedgerRecord(params) {
|
|
109
|
+
const col = await this.getLedgerCollection();
|
|
110
|
+
let domain = "";
|
|
111
|
+
try {
|
|
112
|
+
domain = new URL(params.url).hostname;
|
|
113
|
+
} catch {
|
|
114
|
+
domain = "";
|
|
115
|
+
}
|
|
116
|
+
const now = /* @__PURE__ */ new Date();
|
|
117
|
+
const errMsg = params.errorMessage ?? params.diag?.errorMessage;
|
|
118
|
+
const $set = {
|
|
119
|
+
tenantId: this.config.tenantId,
|
|
120
|
+
agentId: params.agentId,
|
|
121
|
+
urlNormalized: params.urlNormalized,
|
|
122
|
+
url: params.url,
|
|
123
|
+
domain,
|
|
124
|
+
lastStatus: params.status,
|
|
125
|
+
lastCrawledAt: now,
|
|
126
|
+
updatedAt: now
|
|
127
|
+
};
|
|
128
|
+
if (errMsg !== void 0) {
|
|
129
|
+
$set.errorMessage = errMsg;
|
|
130
|
+
} else if (params.status === "indexed" && params.doc) {
|
|
131
|
+
$set.errorMessage = null;
|
|
132
|
+
}
|
|
133
|
+
if (params.doc) {
|
|
134
|
+
$set.modeUsed = params.diag?.modeUsed;
|
|
135
|
+
$set.contentLength = params.doc.content.length;
|
|
136
|
+
$set.title = params.doc.metadata?.title;
|
|
137
|
+
$set.docId = params.doc.id;
|
|
138
|
+
} else {
|
|
139
|
+
$set.modeUsed = params.diag?.modeUsed;
|
|
140
|
+
$set.contentLength = null;
|
|
141
|
+
$set.title = null;
|
|
142
|
+
$set.docId = null;
|
|
143
|
+
}
|
|
144
|
+
await col.updateOne(
|
|
145
|
+
{
|
|
146
|
+
tenantId: this.config.tenantId,
|
|
147
|
+
agentId: params.agentId,
|
|
148
|
+
urlNormalized: params.urlNormalized
|
|
149
|
+
},
|
|
150
|
+
{ $set },
|
|
151
|
+
{ upsert: true }
|
|
152
|
+
);
|
|
153
|
+
}
|
|
154
|
+
pushPageStatus(list, max, entry) {
|
|
155
|
+
list.push(entry);
|
|
156
|
+
while (list.length > max) list.shift();
|
|
157
|
+
}
|
|
158
|
+
async disconnect() {
|
|
159
|
+
if (this.client) {
|
|
160
|
+
await this.client.close();
|
|
161
|
+
this.client = null;
|
|
162
|
+
this.db = null;
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
// ============================================================================
|
|
166
|
+
// RAG Plugin Interface
|
|
167
|
+
// ============================================================================
|
|
168
|
+
/**
|
|
169
|
+
* Retrieve contextual content for a message
|
|
170
|
+
*/
|
|
171
|
+
async retrieveContext(message, options = {}) {
|
|
172
|
+
const queryVector = await this.generateEmbedding(message);
|
|
173
|
+
const hardFilters = {
|
|
174
|
+
tenantId: this.config.tenantId,
|
|
175
|
+
...options.filters
|
|
176
|
+
};
|
|
177
|
+
if (options.agentId) {
|
|
178
|
+
hardFilters.agentId = { $in: ["shared", options.agentId] };
|
|
179
|
+
}
|
|
180
|
+
const results = await this.vectorSearch({
|
|
181
|
+
queryVector,
|
|
182
|
+
hardFilters
|
|
183
|
+
});
|
|
184
|
+
let scoredResults = results;
|
|
185
|
+
if (this.config.typeBoosts) {
|
|
186
|
+
scoredResults = results.map((doc) => ({
|
|
187
|
+
...doc,
|
|
188
|
+
score: doc.score * (this.config.typeBoosts[doc.metadata.type] ?? 1)
|
|
189
|
+
}));
|
|
190
|
+
}
|
|
191
|
+
if (this.config.recencyBoost?.enabled) {
|
|
192
|
+
const { field, decayDays, maxBoost = 1.2 } = this.config.recencyBoost;
|
|
193
|
+
const now = Date.now();
|
|
194
|
+
const decayMs = decayDays * 24 * 60 * 60 * 1e3;
|
|
195
|
+
scoredResults = scoredResults.map((doc) => {
|
|
196
|
+
const dateValue = doc.metadata[field];
|
|
197
|
+
if (!dateValue) return doc;
|
|
198
|
+
const docDate = new Date(dateValue).getTime();
|
|
199
|
+
const age = now - docDate;
|
|
200
|
+
const freshness = Math.max(0, 1 - age / decayMs);
|
|
201
|
+
const boost = 1 + (maxBoost - 1) * freshness;
|
|
202
|
+
return { ...doc, score: doc.score * boost };
|
|
203
|
+
});
|
|
204
|
+
}
|
|
205
|
+
scoredResults.sort((a, b) => b.score - a.score);
|
|
206
|
+
scoredResults = scoredResults.slice(0, this.config.limit);
|
|
207
|
+
const content = this.formatResultsToContext(scoredResults);
|
|
208
|
+
return {
|
|
209
|
+
content,
|
|
210
|
+
metadata: {
|
|
211
|
+
plugin: this.name,
|
|
212
|
+
contentCount: scoredResults.length,
|
|
213
|
+
types: [...new Set(scoredResults.map((d) => d.metadata.type))],
|
|
214
|
+
topResults: scoredResults.slice(0, 5).map((doc) => ({
|
|
215
|
+
id: doc.id,
|
|
216
|
+
type: doc.metadata.type,
|
|
217
|
+
title: doc.metadata.title,
|
|
218
|
+
url: doc.metadata.url,
|
|
219
|
+
score: doc.score
|
|
220
|
+
}))
|
|
221
|
+
}
|
|
222
|
+
};
|
|
223
|
+
}
|
|
224
|
+
/**
|
|
225
|
+
* Format retrieved content for LLM context
|
|
226
|
+
*/
|
|
227
|
+
formatResultsToContext(docs) {
|
|
228
|
+
if (docs.length === 0) {
|
|
229
|
+
return "No relevant content found.";
|
|
230
|
+
}
|
|
231
|
+
const sections = ["## Relevant Content\n"];
|
|
232
|
+
for (const doc of docs) {
|
|
233
|
+
const meta = doc.metadata;
|
|
234
|
+
const header = meta.title || `${meta.type} (${doc.id})`;
|
|
235
|
+
sections.push(`### ${header}`);
|
|
236
|
+
if (meta.type) sections.push(`**Type:** ${meta.type}`);
|
|
237
|
+
if (meta.url) sections.push(`**URL:** ${meta.url}`);
|
|
238
|
+
const skipFields = ["type", "title", "url", "sourceUrl", "fetchedAt"];
|
|
239
|
+
const extraMeta = Object.entries(meta).filter(([key]) => !skipFields.includes(key)).map(([key, value]) => `**${this.formatFieldName(key)}:** ${this.formatFieldValue(value)}`);
|
|
240
|
+
if (extraMeta.length > 0) {
|
|
241
|
+
sections.push(extraMeta.join("\n"));
|
|
242
|
+
}
|
|
243
|
+
sections.push("");
|
|
244
|
+
sections.push(doc.content);
|
|
245
|
+
sections.push("");
|
|
246
|
+
}
|
|
247
|
+
return sections.join("\n");
|
|
248
|
+
}
|
|
249
|
+
formatFieldName(key) {
|
|
250
|
+
return key.replace(/([A-Z])/g, " $1").replace(/^./, (s) => s.toUpperCase());
|
|
251
|
+
}
|
|
252
|
+
formatFieldValue(value) {
|
|
253
|
+
if (Array.isArray(value)) return value.join(", ");
|
|
254
|
+
if (value instanceof Date) return value.toLocaleDateString();
|
|
255
|
+
if (typeof value === "object") return JSON.stringify(value);
|
|
256
|
+
return String(value);
|
|
257
|
+
}
|
|
258
|
+
// ============================================================================
|
|
259
|
+
// Vector Search
|
|
260
|
+
// ============================================================================
|
|
261
|
+
async vectorSearch(options) {
|
|
262
|
+
const collection = await this.getCollection();
|
|
263
|
+
const pipeline = [
|
|
264
|
+
{
|
|
265
|
+
$vectorSearch: {
|
|
266
|
+
index: this.config.vectorIndexName,
|
|
267
|
+
path: "embedding",
|
|
268
|
+
queryVector: options.queryVector,
|
|
269
|
+
numCandidates: this.config.numCandidates,
|
|
270
|
+
limit: this.config.limit * 2,
|
|
271
|
+
// Fetch more for post-filtering
|
|
272
|
+
filter: options.hardFilters
|
|
273
|
+
}
|
|
274
|
+
},
|
|
275
|
+
{
|
|
276
|
+
$addFields: {
|
|
277
|
+
score: { $meta: "vectorSearchScore" }
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
];
|
|
281
|
+
if (this.config.minScore) {
|
|
282
|
+
pipeline.push({
|
|
283
|
+
$match: { score: { $gte: this.config.minScore } }
|
|
284
|
+
});
|
|
285
|
+
}
|
|
286
|
+
pipeline.push({ $limit: this.config.limit * 2 });
|
|
287
|
+
const results = await collection.aggregate(pipeline).toArray();
|
|
288
|
+
return results;
|
|
289
|
+
}
|
|
290
|
+
// ============================================================================
|
|
291
|
+
// Embedding Generation
|
|
292
|
+
// ============================================================================
|
|
293
|
+
async generateEmbedding(text) {
|
|
294
|
+
const cacheConfig = this.config.cache?.embeddings;
|
|
295
|
+
if (cacheConfig?.enabled) {
|
|
296
|
+
const cached = this.embeddingCache.get(text);
|
|
297
|
+
const ttl = cacheConfig.ttl ?? 36e5;
|
|
298
|
+
if (cached && Date.now() - cached.timestamp < ttl) {
|
|
299
|
+
this.cacheStats.hits++;
|
|
300
|
+
return cached.value;
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
this.cacheStats.misses++;
|
|
304
|
+
const response = await this.openai.embeddings.create({
|
|
305
|
+
model: this.config.embeddingModel,
|
|
306
|
+
input: text
|
|
307
|
+
});
|
|
308
|
+
const embedding = response.data[0].embedding;
|
|
309
|
+
if (cacheConfig?.enabled) {
|
|
310
|
+
const maxSize = cacheConfig.maxSize ?? 1e3;
|
|
311
|
+
if (this.embeddingCache.size >= maxSize) {
|
|
312
|
+
const firstKey = this.embeddingCache.keys().next().value;
|
|
313
|
+
if (firstKey) this.embeddingCache.delete(firstKey);
|
|
314
|
+
}
|
|
315
|
+
this.embeddingCache.set(text, { value: embedding, timestamp: Date.now() });
|
|
316
|
+
}
|
|
317
|
+
return embedding;
|
|
318
|
+
}
|
|
319
|
+
async generateEmbeddingsBatch(texts) {
|
|
320
|
+
const embeddings = [];
|
|
321
|
+
for (const text of texts) {
|
|
322
|
+
const embedding = await this.generateEmbedding(text);
|
|
323
|
+
embeddings.push(embedding);
|
|
324
|
+
}
|
|
325
|
+
return embeddings;
|
|
326
|
+
}
|
|
327
|
+
// ============================================================================
|
|
328
|
+
// Document Ingestion
|
|
329
|
+
// ============================================================================
|
|
330
|
+
/**
|
|
331
|
+
* Ingest documents into the CMS RAG system
|
|
332
|
+
*/
|
|
333
|
+
async ingest(documents, options) {
|
|
334
|
+
const collection = await this.getCollection();
|
|
335
|
+
let indexed = 0;
|
|
336
|
+
const errors = [];
|
|
337
|
+
const batchSize = options?.batchSize ?? 10;
|
|
338
|
+
for (let i = 0; i < documents.length; i += batchSize) {
|
|
339
|
+
const batch = documents.slice(i, i + batchSize);
|
|
340
|
+
const embeddings = await this.generateEmbeddingsBatch(
|
|
341
|
+
batch.map((doc) => doc.content)
|
|
342
|
+
);
|
|
343
|
+
const docsToStore = batch.map((doc, idx) => ({
|
|
344
|
+
id: doc.id,
|
|
345
|
+
content: doc.content,
|
|
346
|
+
metadata: {
|
|
347
|
+
type: doc.metadata?.type || "content",
|
|
348
|
+
...doc.metadata
|
|
349
|
+
},
|
|
350
|
+
tenantId: this.config.tenantId,
|
|
351
|
+
// Use 'shared' marker for tenant-wide content, specific agentId for agent-only
|
|
352
|
+
agentId: options?.agentId || "shared",
|
|
353
|
+
embedding: embeddings[idx]
|
|
354
|
+
}));
|
|
355
|
+
for (const doc of docsToStore) {
|
|
356
|
+
try {
|
|
357
|
+
const filter = {
|
|
358
|
+
tenantId: this.config.tenantId,
|
|
359
|
+
id: doc.id,
|
|
360
|
+
// Match by agentId ('shared' for tenant-wide, specific for agent-only)
|
|
361
|
+
agentId: options?.agentId || "shared"
|
|
362
|
+
};
|
|
363
|
+
await collection.updateOne(
|
|
364
|
+
filter,
|
|
365
|
+
{
|
|
366
|
+
$set: { ...doc, updatedAt: /* @__PURE__ */ new Date() },
|
|
367
|
+
$setOnInsert: { createdAt: /* @__PURE__ */ new Date() }
|
|
368
|
+
},
|
|
369
|
+
{ upsert: true }
|
|
370
|
+
);
|
|
371
|
+
indexed++;
|
|
372
|
+
} catch (error) {
|
|
373
|
+
errors.push({
|
|
374
|
+
id: doc.id,
|
|
375
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
376
|
+
});
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
return {
|
|
381
|
+
success: errors.length === 0,
|
|
382
|
+
indexed,
|
|
383
|
+
failed: errors.length,
|
|
384
|
+
errors: errors.length > 0 ? errors : void 0,
|
|
385
|
+
metadata: {
|
|
386
|
+
tenantId: this.config.tenantId,
|
|
387
|
+
collection: this.config.collection
|
|
388
|
+
}
|
|
389
|
+
};
|
|
390
|
+
}
|
|
391
|
+
/**
|
|
392
|
+
* Update a single document
|
|
393
|
+
*/
|
|
394
|
+
async update(id, document, options) {
|
|
395
|
+
const collection = await this.getCollection();
|
|
396
|
+
const update = { updatedAt: /* @__PURE__ */ new Date() };
|
|
397
|
+
if (document.content) {
|
|
398
|
+
const embedding = await this.generateEmbedding(document.content);
|
|
399
|
+
update.content = document.content;
|
|
400
|
+
update.embedding = embedding;
|
|
401
|
+
}
|
|
402
|
+
if (document.metadata) {
|
|
403
|
+
for (const [key, value] of Object.entries(document.metadata)) {
|
|
404
|
+
update[`metadata.${key}`] = value;
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
const filter = {
|
|
408
|
+
tenantId: this.config.tenantId,
|
|
409
|
+
id,
|
|
410
|
+
// Match by agentId ('shared' for tenant-wide, specific for agent-only)
|
|
411
|
+
agentId: options?.agentId || "shared"
|
|
412
|
+
};
|
|
413
|
+
await collection.updateOne(filter, { $set: update });
|
|
414
|
+
}
|
|
415
|
+
/**
|
|
416
|
+
* Delete document(s) by ID
|
|
417
|
+
*/
|
|
418
|
+
async delete(ids, options) {
|
|
419
|
+
const collection = await this.getCollection();
|
|
420
|
+
const idArray = Array.isArray(ids) ? ids : [ids];
|
|
421
|
+
const filter = {
|
|
422
|
+
tenantId: this.config.tenantId,
|
|
423
|
+
id: { $in: idArray },
|
|
424
|
+
// Match by agentId ('shared' for tenant-wide, specific for agent-only)
|
|
425
|
+
agentId: options?.agentId || "shared"
|
|
426
|
+
};
|
|
427
|
+
const result = await collection.deleteMany(filter);
|
|
428
|
+
return result.deletedCount;
|
|
429
|
+
}
|
|
430
|
+
/**
|
|
431
|
+
* Bulk operations
|
|
432
|
+
*/
|
|
433
|
+
async bulk(operations, options) {
|
|
434
|
+
let inserted = 0;
|
|
435
|
+
let updated = 0;
|
|
436
|
+
let deleted = 0;
|
|
437
|
+
let failed = 0;
|
|
438
|
+
const errors = [];
|
|
439
|
+
for (const op of operations) {
|
|
440
|
+
try {
|
|
441
|
+
switch (op.type) {
|
|
442
|
+
case "insert":
|
|
443
|
+
if (op.document) {
|
|
444
|
+
await this.ingest([op.document], options);
|
|
445
|
+
inserted++;
|
|
446
|
+
}
|
|
447
|
+
break;
|
|
448
|
+
case "update":
|
|
449
|
+
if (op.document) {
|
|
450
|
+
await this.update(op.id, op.document, options);
|
|
451
|
+
updated++;
|
|
452
|
+
}
|
|
453
|
+
break;
|
|
454
|
+
case "delete":
|
|
455
|
+
const count = await this.delete(op.id, options);
|
|
456
|
+
deleted += count;
|
|
457
|
+
break;
|
|
458
|
+
}
|
|
459
|
+
} catch (error) {
|
|
460
|
+
failed++;
|
|
461
|
+
errors.push({
|
|
462
|
+
id: op.id,
|
|
463
|
+
operation: op.type,
|
|
464
|
+
error: error.message || "Unknown error"
|
|
465
|
+
});
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
return {
|
|
469
|
+
success: failed === 0,
|
|
470
|
+
inserted,
|
|
471
|
+
updated,
|
|
472
|
+
deleted,
|
|
473
|
+
failed,
|
|
474
|
+
errors: errors.length > 0 ? errors : void 0
|
|
475
|
+
};
|
|
476
|
+
}
|
|
477
|
+
// ============================================================================
|
|
478
|
+
// URL Ingestion
|
|
479
|
+
// ============================================================================
|
|
480
|
+
/**
|
|
481
|
+
* Ingest content from a URL (JSON, CSV, XML, or API)
|
|
482
|
+
*/
|
|
483
|
+
async ingestFromUrl(source, options) {
|
|
484
|
+
try {
|
|
485
|
+
const controller = new AbortController();
|
|
486
|
+
const timeoutId = setTimeout(() => controller.abort(), source.timeout || 3e4);
|
|
487
|
+
const response = await fetch(source.url, {
|
|
488
|
+
headers: {
|
|
489
|
+
...source.headers,
|
|
490
|
+
...source.auth && this.buildAuthHeaders(source.auth)
|
|
491
|
+
},
|
|
492
|
+
signal: controller.signal
|
|
493
|
+
});
|
|
494
|
+
clearTimeout(timeoutId);
|
|
495
|
+
if (!response.ok) {
|
|
496
|
+
throw new Error(`HTTP error: ${response.status} ${response.statusText}`);
|
|
497
|
+
}
|
|
498
|
+
let documents;
|
|
499
|
+
if (source.type === "json" || source.type === "api") {
|
|
500
|
+
const data = await response.json();
|
|
501
|
+
documents = this.transformJsonToDocuments(data, source.transform);
|
|
502
|
+
} else if (source.type === "csv") {
|
|
503
|
+
const data = await response.text();
|
|
504
|
+
documents = this.transformCsvToDocuments(data, source.transform);
|
|
505
|
+
} else if (source.type === "xml") {
|
|
506
|
+
const data = await response.text();
|
|
507
|
+
documents = this.transformXmlToDocuments(data, source.transform);
|
|
508
|
+
} else {
|
|
509
|
+
throw new Error(`Unsupported source type: ${source.type}`);
|
|
510
|
+
}
|
|
511
|
+
documents = documents.map((doc) => ({
|
|
512
|
+
...doc,
|
|
513
|
+
metadata: {
|
|
514
|
+
...doc.metadata,
|
|
515
|
+
...source.metadata,
|
|
516
|
+
sourceUrl: source.url,
|
|
517
|
+
fetchedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
518
|
+
}
|
|
519
|
+
}));
|
|
520
|
+
const ingestResult = await this.ingest(documents, options);
|
|
521
|
+
return {
|
|
522
|
+
...ingestResult,
|
|
523
|
+
sourceUrl: source.url,
|
|
524
|
+
fetchedAt: /* @__PURE__ */ new Date(),
|
|
525
|
+
documentsFetched: documents.length
|
|
526
|
+
};
|
|
527
|
+
} catch (error) {
|
|
528
|
+
return {
|
|
529
|
+
success: false,
|
|
530
|
+
indexed: 0,
|
|
531
|
+
failed: 0,
|
|
532
|
+
sourceUrl: source.url,
|
|
533
|
+
fetchedAt: /* @__PURE__ */ new Date(),
|
|
534
|
+
documentsFetched: 0,
|
|
535
|
+
errors: [{
|
|
536
|
+
id: "fetch",
|
|
537
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
538
|
+
}]
|
|
539
|
+
};
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
buildAuthHeaders(auth) {
|
|
543
|
+
if (!auth) return {};
|
|
544
|
+
switch (auth.type) {
|
|
545
|
+
case "bearer":
|
|
546
|
+
return auth.token ? { Authorization: `Bearer ${auth.token}` } : {};
|
|
547
|
+
case "basic":
|
|
548
|
+
if (auth.username && auth.password) {
|
|
549
|
+
const encoded = Buffer.from(`${auth.username}:${auth.password}`).toString("base64");
|
|
550
|
+
return { Authorization: `Basic ${encoded}` };
|
|
551
|
+
}
|
|
552
|
+
return {};
|
|
553
|
+
case "api-key":
|
|
554
|
+
return auth.header && auth.key ? { [auth.header]: auth.key } : {};
|
|
555
|
+
case "custom":
|
|
556
|
+
return auth.headers || {};
|
|
557
|
+
default:
|
|
558
|
+
return {};
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
transformJsonToDocuments(data, transform) {
|
|
562
|
+
let items = data;
|
|
563
|
+
if (transform?.documentPath) {
|
|
564
|
+
items = this.extractByPath(data, transform.documentPath);
|
|
565
|
+
}
|
|
566
|
+
if (!Array.isArray(items)) {
|
|
567
|
+
items = [items];
|
|
568
|
+
}
|
|
569
|
+
const fieldMapping = transform?.fieldMapping || {};
|
|
570
|
+
return items.map((item, index) => {
|
|
571
|
+
const metadata = {};
|
|
572
|
+
for (const [targetField, sourcePath] of Object.entries(fieldMapping)) {
|
|
573
|
+
if (targetField === "id" || targetField === "content") continue;
|
|
574
|
+
if (typeof sourcePath === "function") {
|
|
575
|
+
metadata[targetField] = sourcePath();
|
|
576
|
+
} else if (sourcePath) {
|
|
577
|
+
metadata[targetField] = this.extractField(item, sourcePath);
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
if (!metadata.type) {
|
|
581
|
+
metadata.type = "content";
|
|
582
|
+
}
|
|
583
|
+
return {
|
|
584
|
+
id: this.extractField(item, fieldMapping.id || "id") || `doc-${index}`,
|
|
585
|
+
content: this.extractField(item, fieldMapping.content || "content") || JSON.stringify(item),
|
|
586
|
+
metadata
|
|
587
|
+
};
|
|
588
|
+
});
|
|
589
|
+
}
|
|
590
|
+
transformCsvToDocuments(csvData, transform) {
|
|
591
|
+
const lines = csvData.trim().split("\n");
|
|
592
|
+
if (lines.length < 2) return [];
|
|
593
|
+
const headers = this.parseCsvLine(lines[0]);
|
|
594
|
+
return lines.slice(1).map((line, index) => {
|
|
595
|
+
const values = this.parseCsvLine(line);
|
|
596
|
+
const item = headers.reduce((acc, header, i) => {
|
|
597
|
+
acc[header] = values[i] || "";
|
|
598
|
+
return acc;
|
|
599
|
+
}, {});
|
|
600
|
+
return this.transformJsonToDocuments([item], transform)[0];
|
|
601
|
+
});
|
|
602
|
+
}
|
|
603
|
+
parseCsvLine(line) {
|
|
604
|
+
const result = [];
|
|
605
|
+
let current = "";
|
|
606
|
+
let inQuotes = false;
|
|
607
|
+
for (const char of line) {
|
|
608
|
+
if (char === '"') {
|
|
609
|
+
inQuotes = !inQuotes;
|
|
610
|
+
} else if (char === "," && !inQuotes) {
|
|
611
|
+
result.push(current.trim());
|
|
612
|
+
current = "";
|
|
613
|
+
} else {
|
|
614
|
+
current += char;
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
result.push(current.trim());
|
|
618
|
+
return result;
|
|
619
|
+
}
|
|
620
|
+
transformXmlToDocuments(xmlData, transform) {
|
|
621
|
+
const items = [];
|
|
622
|
+
const itemPath = transform?.documentPath || "item";
|
|
623
|
+
const itemRegex = new RegExp(`<${itemPath}[^>]*>([\\s\\S]*?)<\\/${itemPath}>`, "gi");
|
|
624
|
+
let match;
|
|
625
|
+
while ((match = itemRegex.exec(xmlData)) !== null) {
|
|
626
|
+
const itemXml = match[1];
|
|
627
|
+
const item = {};
|
|
628
|
+
const tagRegex = /<(\w+)[^>]*>([^<]*)<\/\1>/g;
|
|
629
|
+
let tagMatch;
|
|
630
|
+
while ((tagMatch = tagRegex.exec(itemXml)) !== null) {
|
|
631
|
+
item[tagMatch[1]] = tagMatch[2].trim();
|
|
632
|
+
}
|
|
633
|
+
items.push(item);
|
|
634
|
+
}
|
|
635
|
+
return this.transformJsonToDocuments(items, transform);
|
|
636
|
+
}
|
|
637
|
+
extractByPath(obj, path2) {
|
|
638
|
+
const parts = path2.split(".");
|
|
639
|
+
let current = obj;
|
|
640
|
+
for (const part of parts) {
|
|
641
|
+
if (current == null) return void 0;
|
|
642
|
+
const arrayMatch = part.match(/^(\w+)\[(\d+)\]$/);
|
|
643
|
+
if (arrayMatch) {
|
|
644
|
+
current = current[arrayMatch[1]]?.[parseInt(arrayMatch[2])];
|
|
645
|
+
} else {
|
|
646
|
+
current = current[part];
|
|
647
|
+
}
|
|
648
|
+
}
|
|
649
|
+
return current;
|
|
650
|
+
}
|
|
651
|
+
extractField(item, path2) {
|
|
652
|
+
return this.extractByPath(item, path2);
|
|
653
|
+
}
|
|
654
|
+
// ============================================================================
|
|
655
|
+
// Drupal JSON:API Integration
|
|
656
|
+
// ============================================================================
|
|
657
|
+
/**
|
|
658
|
+
* Ingest content from a Drupal site using JSON:API
|
|
659
|
+
*/
|
|
660
|
+
async ingestFromDrupal(config, options) {
|
|
661
|
+
const results = [];
|
|
662
|
+
for (const contentType of config.contentTypes) {
|
|
663
|
+
const url = `${config.baseUrl}/jsonapi/node/${contentType}`;
|
|
664
|
+
const mapping = config.mappings?.[contentType];
|
|
665
|
+
const result = await this.ingestFromUrl(
|
|
666
|
+
{
|
|
667
|
+
url,
|
|
668
|
+
type: "json",
|
|
669
|
+
auth: config.auth,
|
|
670
|
+
transform: {
|
|
671
|
+
documentPath: "data",
|
|
672
|
+
fieldMapping: {
|
|
673
|
+
id: "id",
|
|
674
|
+
content: mapping?.content || "attributes.body.processed",
|
|
675
|
+
type: () => contentType,
|
|
676
|
+
title: "attributes.title",
|
|
677
|
+
url: "attributes.path.alias",
|
|
678
|
+
...mapping?.fields
|
|
679
|
+
}
|
|
680
|
+
}
|
|
681
|
+
},
|
|
682
|
+
options
|
|
683
|
+
);
|
|
684
|
+
results.push(result);
|
|
685
|
+
}
|
|
686
|
+
return results;
|
|
687
|
+
}
|
|
688
|
+
/**
|
|
689
|
+
* Parse Drupal JSON:API node type (e.g., 'node--project' → 'project')
|
|
690
|
+
*/
|
|
691
|
+
static parseDrupalType(type) {
|
|
692
|
+
return type.replace(/^node--/, "");
|
|
693
|
+
}
|
|
694
|
+
// ============================================================================
|
|
695
|
+
// WordPress REST API Integration
|
|
696
|
+
// ============================================================================
|
|
697
|
+
/**
|
|
698
|
+
* Ingest content from a WordPress site using REST API
|
|
699
|
+
*
|
|
700
|
+
* @example
|
|
701
|
+
* ```typescript
|
|
702
|
+
* await plugin.ingestFromWordPress({
|
|
703
|
+
* baseUrl: 'https://myblog.com',
|
|
704
|
+
* postTypes: ['posts', 'pages'],
|
|
705
|
+
* perPage: 100,
|
|
706
|
+
* });
|
|
707
|
+
* ```
|
|
708
|
+
*/
|
|
709
|
+
async ingestFromWordPress(config, options) {
|
|
710
|
+
const results = [];
|
|
711
|
+
const postTypes = config.postTypes || ["posts", "pages"];
|
|
712
|
+
const perPage = config.perPage || 100;
|
|
713
|
+
const maxPages = config.maxPages || 10;
|
|
714
|
+
for (const postType of postTypes) {
|
|
715
|
+
let page = 1;
|
|
716
|
+
let hasMore = true;
|
|
717
|
+
while (hasMore && page <= maxPages) {
|
|
718
|
+
const url = `${config.baseUrl}/wp-json/wp/v2/${postType}?per_page=${perPage}&page=${page}&_embed`;
|
|
719
|
+
const mapping = config.mappings?.[postType];
|
|
720
|
+
try {
|
|
721
|
+
const result = await this.ingestFromUrl(
|
|
722
|
+
{
|
|
723
|
+
url,
|
|
724
|
+
type: "json",
|
|
725
|
+
auth: config.auth,
|
|
726
|
+
transform: {
|
|
727
|
+
fieldMapping: {
|
|
728
|
+
id: "id",
|
|
729
|
+
content: mapping?.content || "content.rendered",
|
|
730
|
+
type: () => this.normalizeWordPressType(postType),
|
|
731
|
+
title: "title.rendered",
|
|
732
|
+
url: "link",
|
|
733
|
+
slug: "slug",
|
|
734
|
+
publishedAt: "date",
|
|
735
|
+
modifiedAt: "modified",
|
|
736
|
+
author: "_embedded.author.0.name",
|
|
737
|
+
featuredImage: "_embedded.wp:featuredmedia.0.source_url",
|
|
738
|
+
excerpt: "excerpt.rendered",
|
|
739
|
+
categories: "_embedded.wp:term.0",
|
|
740
|
+
tags: "_embedded.wp:term.1",
|
|
741
|
+
...mapping?.fields
|
|
742
|
+
}
|
|
743
|
+
}
|
|
744
|
+
},
|
|
745
|
+
options
|
|
746
|
+
);
|
|
747
|
+
results.push(result);
|
|
748
|
+
hasMore = result.documentsFetched === perPage;
|
|
749
|
+
page++;
|
|
750
|
+
} catch (error) {
|
|
751
|
+
hasMore = false;
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
return results;
|
|
756
|
+
}
|
|
757
|
+
/**
|
|
758
|
+
* Normalize WordPress post type to a cleaner name
|
|
759
|
+
*/
|
|
760
|
+
normalizeWordPressType(postType) {
|
|
761
|
+
if (postType.endsWith("s")) {
|
|
762
|
+
return postType.slice(0, -1);
|
|
763
|
+
}
|
|
764
|
+
return postType;
|
|
765
|
+
}
|
|
766
|
+
// ============================================================================
|
|
767
|
+
// Sanity.io Integration
|
|
768
|
+
// ============================================================================
|
|
769
|
+
/**
|
|
770
|
+
* Ingest content from a Sanity.io project using GROQ queries
|
|
771
|
+
*
|
|
772
|
+
* @example
|
|
773
|
+
* ```typescript
|
|
774
|
+
* await plugin.ingestFromSanity({
|
|
775
|
+
* projectId: 'abc123',
|
|
776
|
+
* dataset: 'production',
|
|
777
|
+
* queries: {
|
|
778
|
+
* post: {
|
|
779
|
+
* query: '*[_type == "post" && !(_id in path("drafts.**"))]',
|
|
780
|
+
* content: 'body',
|
|
781
|
+
* fields: {
|
|
782
|
+
* author: 'author->name',
|
|
783
|
+
* categories: 'categories[]->title',
|
|
784
|
+
* },
|
|
785
|
+
* },
|
|
786
|
+
* },
|
|
787
|
+
* });
|
|
788
|
+
* ```
|
|
789
|
+
*/
|
|
790
|
+
async ingestFromSanity(config, options) {
|
|
791
|
+
const results = [];
|
|
792
|
+
const apiVersion = config.apiVersion || "v2024-01-01";
|
|
793
|
+
const useCdn = config.useCdn !== false;
|
|
794
|
+
const baseUrl = useCdn ? `https://${config.projectId}.apicdn.sanity.io/${apiVersion}` : `https://${config.projectId}.api.sanity.io/${apiVersion}`;
|
|
795
|
+
for (const [contentType, queryConfig] of Object.entries(config.queries)) {
|
|
796
|
+
const encodedQuery = encodeURIComponent(queryConfig.query);
|
|
797
|
+
const url = `${baseUrl}/data/query/${config.dataset}?query=${encodedQuery}`;
|
|
798
|
+
const headers = {};
|
|
799
|
+
if (config.token) {
|
|
800
|
+
headers["Authorization"] = `Bearer ${config.token}`;
|
|
801
|
+
}
|
|
802
|
+
const result = await this.ingestFromUrl(
|
|
803
|
+
{
|
|
804
|
+
url,
|
|
805
|
+
type: "json",
|
|
806
|
+
headers,
|
|
807
|
+
transform: {
|
|
808
|
+
documentPath: "result",
|
|
809
|
+
fieldMapping: {
|
|
810
|
+
id: "_id",
|
|
811
|
+
content: queryConfig.content,
|
|
812
|
+
type: () => contentType,
|
|
813
|
+
title: "title",
|
|
814
|
+
slug: "slug.current",
|
|
815
|
+
publishedAt: "publishedAt",
|
|
816
|
+
updatedAt: "_updatedAt",
|
|
817
|
+
...queryConfig.fields
|
|
818
|
+
}
|
|
819
|
+
}
|
|
820
|
+
},
|
|
821
|
+
options
|
|
822
|
+
);
|
|
823
|
+
results.push(result);
|
|
824
|
+
}
|
|
825
|
+
return results;
|
|
826
|
+
}
|
|
827
|
+
/**
|
|
828
|
+
* Convert Sanity Portable Text blocks to plain text
|
|
829
|
+
* Useful for extracting content from rich text fields
|
|
830
|
+
*/
|
|
831
|
+
static sanityBlocksToText(blocks) {
|
|
832
|
+
if (!Array.isArray(blocks)) return "";
|
|
833
|
+
return blocks.filter((block) => block._type === "block").map((block) => {
|
|
834
|
+
if (!block.children) return "";
|
|
835
|
+
return block.children.map((child) => child.text || "").join("");
|
|
836
|
+
}).join("\n\n");
|
|
837
|
+
}
|
|
838
|
+
// ============================================================================
|
|
839
|
+
// Strapi Integration
|
|
840
|
+
// ============================================================================
|
|
841
|
+
/**
|
|
842
|
+
* Ingest content from a Strapi CMS (v4 by default)
|
|
843
|
+
*
|
|
844
|
+
* @example
|
|
845
|
+
* ```typescript
|
|
846
|
+
* await plugin.ingestFromStrapi({
|
|
847
|
+
* baseUrl: 'https://my-strapi.com',
|
|
848
|
+
* apiToken: process.env.STRAPI_TOKEN,
|
|
849
|
+
* contentTypes: ['articles', 'pages'],
|
|
850
|
+
* mappings: {
|
|
851
|
+
* articles: {
|
|
852
|
+
* content: 'attributes.content',
|
|
853
|
+
* fields: {
|
|
854
|
+
* author: 'attributes.author.data.attributes.name',
|
|
855
|
+
* category: 'attributes.category.data.attributes.name',
|
|
856
|
+
* },
|
|
857
|
+
* },
|
|
858
|
+
* },
|
|
859
|
+
* });
|
|
860
|
+
* ```
|
|
861
|
+
*/
|
|
862
|
+
async ingestFromStrapi(config, options) {
|
|
863
|
+
const results = [];
|
|
864
|
+
const pageSize = config.pageSize || 100;
|
|
865
|
+
const maxPages = config.maxPages || 10;
|
|
866
|
+
for (const contentType of config.contentTypes) {
|
|
867
|
+
let page = 1;
|
|
868
|
+
let hasMore = true;
|
|
869
|
+
const mapping = config.mappings?.[contentType];
|
|
870
|
+
const useAttributes = mapping?.useAttributes !== false;
|
|
871
|
+
while (hasMore && page <= maxPages) {
|
|
872
|
+
const url = `${config.baseUrl}/api/${contentType}?pagination[page]=${page}&pagination[pageSize]=${pageSize}&populate=*`;
|
|
873
|
+
const headers = {};
|
|
874
|
+
if (config.apiToken) {
|
|
875
|
+
headers["Authorization"] = `Bearer ${config.apiToken}`;
|
|
876
|
+
}
|
|
877
|
+
try {
|
|
878
|
+
const result = await this.ingestFromUrl(
|
|
879
|
+
{
|
|
880
|
+
url,
|
|
881
|
+
type: "json",
|
|
882
|
+
headers,
|
|
883
|
+
transform: {
|
|
884
|
+
documentPath: "data",
|
|
885
|
+
fieldMapping: useAttributes ? {
|
|
886
|
+
// Strapi v4 format (with attributes)
|
|
887
|
+
id: "id",
|
|
888
|
+
content: mapping?.content || "attributes.content",
|
|
889
|
+
type: () => this.normalizeStrapiType(contentType),
|
|
890
|
+
title: "attributes.title",
|
|
891
|
+
slug: "attributes.slug",
|
|
892
|
+
publishedAt: "attributes.publishedAt",
|
|
893
|
+
updatedAt: "attributes.updatedAt",
|
|
894
|
+
...mapping?.fields
|
|
895
|
+
} : {
|
|
896
|
+
// Strapi v3 format (flat)
|
|
897
|
+
id: "id",
|
|
898
|
+
content: mapping?.content || "content",
|
|
899
|
+
type: () => this.normalizeStrapiType(contentType),
|
|
900
|
+
title: "title",
|
|
901
|
+
slug: "slug",
|
|
902
|
+
publishedAt: "published_at",
|
|
903
|
+
updatedAt: "updated_at",
|
|
904
|
+
...mapping?.fields
|
|
905
|
+
}
|
|
906
|
+
}
|
|
907
|
+
},
|
|
908
|
+
options
|
|
909
|
+
);
|
|
910
|
+
results.push(result);
|
|
911
|
+
hasMore = result.documentsFetched === pageSize;
|
|
912
|
+
page++;
|
|
913
|
+
} catch (error) {
|
|
914
|
+
hasMore = false;
|
|
915
|
+
}
|
|
916
|
+
}
|
|
917
|
+
}
|
|
918
|
+
return results;
|
|
919
|
+
}
|
|
920
|
+
/**
|
|
921
|
+
* Normalize Strapi collection type to singular form
|
|
922
|
+
*/
|
|
923
|
+
normalizeStrapiType(collectionType) {
|
|
924
|
+
if (collectionType.endsWith("s")) {
|
|
925
|
+
return collectionType.slice(0, -1);
|
|
926
|
+
}
|
|
927
|
+
return collectionType;
|
|
928
|
+
}
|
|
929
|
+
// ============================================================================
|
|
930
|
+
// Web Crawling - Zero Setup for Non-Technical Clients
|
|
931
|
+
// ============================================================================
|
|
932
|
+
/**
|
|
933
|
+
* Ingest content by crawling a website's sitemap
|
|
934
|
+
* Perfect for non-technical clients - just provide the sitemap URL
|
|
935
|
+
*
|
|
936
|
+
* @example
|
|
937
|
+
* ```typescript
|
|
938
|
+
* // Simple usage - just provide the sitemap
|
|
939
|
+
* await plugin.ingestFromSitemap({
|
|
940
|
+
* sitemapUrl: 'https://my-site/sitemap.xml',
|
|
941
|
+
* });
|
|
942
|
+
*
|
|
943
|
+
* // Or auto-discover sitemap from base URL
|
|
944
|
+
* await plugin.ingestFromSitemap({
|
|
945
|
+
* baseUrl: 'https://my-site',
|
|
946
|
+
* });
|
|
947
|
+
*
|
|
948
|
+
* // With content selectors and type inference
|
|
949
|
+
* await plugin.ingestFromSitemap({
|
|
950
|
+
* sitemapUrl: 'https://my-site/sitemap.xml',
|
|
951
|
+
* contentSelector: 'article, .main-content',
|
|
952
|
+
* excludePatterns: ['/cart', '/checkout', '/admin'],
|
|
953
|
+
* typeFromUrl: {
|
|
954
|
+
* '/projects/': 'project',
|
|
955
|
+
* '/perspectives/': 'blog',
|
|
956
|
+
* '/people/': 'team',
|
|
957
|
+
* },
|
|
958
|
+
* });
|
|
959
|
+
* ```
|
|
960
|
+
*/
|
|
961
|
+
async ingestFromSitemap(config, options) {
|
|
962
|
+
const maxPages = config.maxPages ?? 100;
|
|
963
|
+
const concurrency = config.concurrency ?? 3;
|
|
964
|
+
const delayMs = config.delayMs ?? 500;
|
|
965
|
+
let sitemapUrl = config.sitemapUrl;
|
|
966
|
+
if (!sitemapUrl && config.baseUrl) {
|
|
967
|
+
sitemapUrl = `${config.baseUrl.replace(/\/$/, "")}/sitemap.xml`;
|
|
968
|
+
}
|
|
969
|
+
if (!sitemapUrl) {
|
|
970
|
+
return {
|
|
971
|
+
success: false,
|
|
972
|
+
indexed: 0,
|
|
973
|
+
failed: 0,
|
|
974
|
+
urlsCrawled: 0,
|
|
975
|
+
urlsSkipped: 0,
|
|
976
|
+
urlsFailed: 0,
|
|
977
|
+
crawledAt: /* @__PURE__ */ new Date(),
|
|
978
|
+
errors: [{ id: "config", error: "Either sitemapUrl or baseUrl is required" }]
|
|
979
|
+
};
|
|
980
|
+
}
|
|
981
|
+
const urls = await this.parseSitemap(sitemapUrl, config);
|
|
982
|
+
let filteredUrls = urls;
|
|
983
|
+
if (config.includePatterns?.length) {
|
|
984
|
+
filteredUrls = filteredUrls.filter(
|
|
985
|
+
(url) => config.includePatterns.some((pattern) => url.includes(pattern))
|
|
986
|
+
);
|
|
987
|
+
}
|
|
988
|
+
if (config.excludePatterns?.length) {
|
|
989
|
+
filteredUrls = filteredUrls.filter(
|
|
990
|
+
(url) => !config.excludePatterns.some((pattern) => url.includes(pattern))
|
|
991
|
+
);
|
|
992
|
+
}
|
|
993
|
+
const urlsToCrawl = filteredUrls.slice(0, maxPages);
|
|
994
|
+
const urlsSkipped = filteredUrls.length - urlsToCrawl.length;
|
|
995
|
+
const result = await this.crawlUrls(urlsToCrawl, {
|
|
996
|
+
...config,
|
|
997
|
+
concurrency,
|
|
998
|
+
delayMs
|
|
999
|
+
}, options);
|
|
1000
|
+
return {
|
|
1001
|
+
...result,
|
|
1002
|
+
urlsSkipped,
|
|
1003
|
+
crawledAt: /* @__PURE__ */ new Date()
|
|
1004
|
+
};
|
|
1005
|
+
}
|
|
1006
|
+
/**
|
|
1007
|
+
* Ingest content from a website that has no sitemap (or sitemap is incomplete).
|
|
1008
|
+
* Discovers internal links from `baseUrl` (BFS) and then crawls the discovered URLs.
|
|
1009
|
+
*
|
|
1010
|
+
* This uses the same extraction pipeline as `ingestFromSitemap()` (via `crawlPage()`).
|
|
1011
|
+
*/
|
|
1012
|
+
async ingestFromWebsite(config, options) {
|
|
1013
|
+
const maxPages = config.maxPages ?? 100;
|
|
1014
|
+
const maxDepth = config.maxDepth ?? 3;
|
|
1015
|
+
const concurrency = config.concurrency ?? 3;
|
|
1016
|
+
const delayMs = config.delayMs ?? 500;
|
|
1017
|
+
const timeout = config.timeout ?? 3e4;
|
|
1018
|
+
const stripQueryParams = config.stripQueryParams ?? true;
|
|
1019
|
+
if (!config.baseUrl) {
|
|
1020
|
+
return {
|
|
1021
|
+
success: false,
|
|
1022
|
+
indexed: 0,
|
|
1023
|
+
failed: 0,
|
|
1024
|
+
urlsCrawled: 0,
|
|
1025
|
+
urlsSkipped: 0,
|
|
1026
|
+
urlsFailed: 0,
|
|
1027
|
+
crawledAt: /* @__PURE__ */ new Date(),
|
|
1028
|
+
errors: [{ id: "config", error: "baseUrl is required" }]
|
|
1029
|
+
};
|
|
1030
|
+
}
|
|
1031
|
+
const dbg = this.createDebugCollector(config.debug);
|
|
1032
|
+
const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
|
|
1033
|
+
if (!base) {
|
|
1034
|
+
return {
|
|
1035
|
+
success: false,
|
|
1036
|
+
indexed: 0,
|
|
1037
|
+
failed: 0,
|
|
1038
|
+
urlsCrawled: 0,
|
|
1039
|
+
urlsSkipped: 0,
|
|
1040
|
+
urlsFailed: 0,
|
|
1041
|
+
crawledAt: /* @__PURE__ */ new Date(),
|
|
1042
|
+
errors: [{ id: "config", error: "Invalid baseUrl" }]
|
|
1043
|
+
};
|
|
1044
|
+
}
|
|
1045
|
+
const discoveredSitemaps = await this.discoverSitemaps(base, timeout, dbg);
|
|
1046
|
+
dbg.log("discovery.sitemaps", { baseUrl: base, sitemaps: discoveredSitemaps });
|
|
1047
|
+
let urlsToCrawl = [];
|
|
1048
|
+
let urlsSkipped = 0;
|
|
1049
|
+
for (const sm of discoveredSitemaps) {
|
|
1050
|
+
const urls = await this.parseSitemap(sm, {
|
|
1051
|
+
sitemapUrl: sm,
|
|
1052
|
+
timeout
|
|
1053
|
+
});
|
|
1054
|
+
if (urls.length > 0) {
|
|
1055
|
+
dbg.log("discovery.sitemapParsed", { sitemapUrl: sm, urlCount: urls.length });
|
|
1056
|
+
let filteredUrls = urls;
|
|
1057
|
+
if (config.includePatterns?.length) {
|
|
1058
|
+
filteredUrls = filteredUrls.filter((u) => config.includePatterns.some((p) => u.includes(p)));
|
|
1059
|
+
}
|
|
1060
|
+
if (config.excludePatterns?.length) {
|
|
1061
|
+
filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
|
|
1062
|
+
}
|
|
1063
|
+
urlsToCrawl = filteredUrls.slice(0, maxPages);
|
|
1064
|
+
urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
|
|
1065
|
+
break;
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
if (urlsToCrawl.length === 0) {
|
|
1069
|
+
dbg.log("discovery.fallback", { reason: "no_sitemap_urls", method: "link_lookup" });
|
|
1070
|
+
const discovery = await this.discoverInternalUrls({
|
|
1071
|
+
baseUrl: base,
|
|
1072
|
+
maxPages,
|
|
1073
|
+
maxDepth,
|
|
1074
|
+
concurrency,
|
|
1075
|
+
delayMs,
|
|
1076
|
+
timeout,
|
|
1077
|
+
includePatterns: config.includePatterns,
|
|
1078
|
+
excludePatterns: config.excludePatterns,
|
|
1079
|
+
stripQueryParams
|
|
1080
|
+
});
|
|
1081
|
+
urlsToCrawl = discovery.urls;
|
|
1082
|
+
urlsSkipped = discovery.skipped;
|
|
1083
|
+
dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
|
|
1084
|
+
}
|
|
1085
|
+
const result = await this.crawlUrls(urlsToCrawl, {
|
|
1086
|
+
contentSelector: config.contentSelector,
|
|
1087
|
+
titleSelector: config.titleSelector,
|
|
1088
|
+
removeSelectors: config.removeSelectors,
|
|
1089
|
+
concurrency,
|
|
1090
|
+
delayMs,
|
|
1091
|
+
timeout,
|
|
1092
|
+
typeFromUrl: config.typeFromUrl,
|
|
1093
|
+
defaultType: config.defaultType ?? "page",
|
|
1094
|
+
metadata: config.metadata,
|
|
1095
|
+
includePatterns: config.includePatterns,
|
|
1096
|
+
excludePatterns: config.excludePatterns,
|
|
1097
|
+
stripQueryParams,
|
|
1098
|
+
render: config.render,
|
|
1099
|
+
renderOptions: config.renderOptions,
|
|
1100
|
+
debug: config.debug,
|
|
1101
|
+
crawlLedger: config.crawlLedger
|
|
1102
|
+
}, options);
|
|
1103
|
+
return {
|
|
1104
|
+
...result,
|
|
1105
|
+
urlsSkipped,
|
|
1106
|
+
crawledAt: /* @__PURE__ */ new Date(),
|
|
1107
|
+
metadata: {
|
|
1108
|
+
...result.metadata || {},
|
|
1109
|
+
discoveryDebug: dbg.summary()
|
|
1110
|
+
}
|
|
1111
|
+
};
|
|
1112
|
+
}
|
|
1113
|
+
/**
|
|
1114
|
+
* Parse sitemap XML and extract URLs
|
|
1115
|
+
*/
|
|
1116
|
+
async parseSitemap(sitemapUrl, config) {
|
|
1117
|
+
const urls = [];
|
|
1118
|
+
try {
|
|
1119
|
+
const response = await fetch(sitemapUrl, {
|
|
1120
|
+
headers: { "User-Agent": "SnapAgent-CMS-Crawler/1.0" },
|
|
1121
|
+
signal: AbortSignal.timeout(config.timeout || 3e4)
|
|
1122
|
+
});
|
|
1123
|
+
if (!response.ok) {
|
|
1124
|
+
console.error(`Failed to fetch sitemap: ${response.status}`);
|
|
1125
|
+
return urls;
|
|
1126
|
+
}
|
|
1127
|
+
const xml = await response.text();
|
|
1128
|
+
if (xml.includes("<sitemapindex")) {
|
|
1129
|
+
const sitemapUrls = this.extractUrlsFromXml(xml, "sitemap", "loc");
|
|
1130
|
+
for (const subSitemapUrl of sitemapUrls.slice(0, 10)) {
|
|
1131
|
+
const subUrls = await this.parseSitemap(subSitemapUrl, config);
|
|
1132
|
+
urls.push(...subUrls);
|
|
1133
|
+
}
|
|
1134
|
+
} else {
|
|
1135
|
+
const pageUrls = this.extractUrlsFromXml(xml, "url", "loc");
|
|
1136
|
+
urls.push(...pageUrls);
|
|
1137
|
+
}
|
|
1138
|
+
} catch (error) {
|
|
1139
|
+
console.error(`Error parsing sitemap ${sitemapUrl}:`, error);
|
|
1140
|
+
}
|
|
1141
|
+
return urls;
|
|
1142
|
+
}
|
|
1143
|
+
/**
|
|
1144
|
+
* Extract URLs from sitemap XML
|
|
1145
|
+
*/
|
|
1146
|
+
extractUrlsFromXml(xml, parentTag, urlTag) {
|
|
1147
|
+
const urls = [];
|
|
1148
|
+
const regex = new RegExp(`<${parentTag}[^>]*>[\\s\\S]*?<${urlTag}>([^<]+)<\\/${urlTag}>[\\s\\S]*?<\\/${parentTag}>`, "gi");
|
|
1149
|
+
let match;
|
|
1150
|
+
while ((match = regex.exec(xml)) !== null) {
|
|
1151
|
+
const url = match[1].trim();
|
|
1152
|
+
if (url.startsWith("http")) {
|
|
1153
|
+
urls.push(url);
|
|
1154
|
+
}
|
|
1155
|
+
}
|
|
1156
|
+
return urls;
|
|
1157
|
+
}
|
|
1158
|
+
async discoverInternalUrls(input) {
|
|
1159
|
+
const start = this.normalizeWebsiteUrl(input.baseUrl, input.stripQueryParams);
|
|
1160
|
+
if (!start) return { urls: [], skipped: 0 };
|
|
1161
|
+
const startUrl = new URL(start);
|
|
1162
|
+
const visited = /* @__PURE__ */ new Set();
|
|
1163
|
+
const queue = [{ url: startUrl.toString(), depth: 0 }];
|
|
1164
|
+
const discovered = [];
|
|
1165
|
+
let skipped = 0;
|
|
1166
|
+
while (queue.length > 0 && discovered.length < input.maxPages) {
|
|
1167
|
+
const batch = queue.splice(0, input.concurrency);
|
|
1168
|
+
const results = await Promise.allSettled(
|
|
1169
|
+
batch.map(async ({ url, depth }) => {
|
|
1170
|
+
if (visited.has(url)) return { url, depth, links: [] };
|
|
1171
|
+
visited.add(url);
|
|
1172
|
+
if (depth > input.maxDepth) return { url, depth, links: [] };
|
|
1173
|
+
if (input.includePatterns?.length && !input.includePatterns.some((p) => url.includes(p))) {
|
|
1174
|
+
skipped++;
|
|
1175
|
+
return { url, depth, links: [] };
|
|
1176
|
+
}
|
|
1177
|
+
if (input.excludePatterns?.length && input.excludePatterns.some((p) => url.includes(p))) {
|
|
1178
|
+
skipped++;
|
|
1179
|
+
return { url, depth, links: [] };
|
|
1180
|
+
}
|
|
1181
|
+
discovered.push(url);
|
|
1182
|
+
if (discovered.length >= input.maxPages) return { url, depth, links: [] };
|
|
1183
|
+
try {
|
|
1184
|
+
const html = await this.fetchHtml(url, input.timeout);
|
|
1185
|
+
if (!html) return { url, depth, links: [] };
|
|
1186
|
+
const links = this.extractInternalLinks(html, startUrl, input.stripQueryParams);
|
|
1187
|
+
return { url, depth, links };
|
|
1188
|
+
} catch {
|
|
1189
|
+
return { url, depth, links: [] };
|
|
1190
|
+
}
|
|
1191
|
+
})
|
|
1192
|
+
);
|
|
1193
|
+
for (const r of results) {
|
|
1194
|
+
if (r.status !== "fulfilled") continue;
|
|
1195
|
+
const { depth, links } = r.value;
|
|
1196
|
+
const nextDepth = depth + 1;
|
|
1197
|
+
if (nextDepth > input.maxDepth) continue;
|
|
1198
|
+
for (const link of links) {
|
|
1199
|
+
if (discovered.length + queue.length >= input.maxPages * 3) continue;
|
|
1200
|
+
if (visited.has(link)) continue;
|
|
1201
|
+
queue.push({ url: link, depth: nextDepth });
|
|
1202
|
+
}
|
|
1203
|
+
}
|
|
1204
|
+
if (queue.length > 0 && discovered.length < input.maxPages) {
|
|
1205
|
+
await this.delay(input.delayMs);
|
|
1206
|
+
}
|
|
1207
|
+
}
|
|
1208
|
+
if (discovered.length >= input.maxPages) {
|
|
1209
|
+
skipped += queue.length;
|
|
1210
|
+
}
|
|
1211
|
+
return { urls: discovered.slice(0, input.maxPages), skipped };
|
|
1212
|
+
}
|
|
1213
|
+
normalizeWebsiteUrl(inputUrl, stripQueryParams) {
|
|
1214
|
+
try {
|
|
1215
|
+
const u = new URL(inputUrl);
|
|
1216
|
+
u.hash = "";
|
|
1217
|
+
if (stripQueryParams) u.search = "";
|
|
1218
|
+
return u.toString();
|
|
1219
|
+
} catch {
|
|
1220
|
+
return null;
|
|
1221
|
+
}
|
|
1222
|
+
}
|
|
1223
|
+
async fetchHtml(url, timeout) {
|
|
1224
|
+
const response = await fetch(url, {
|
|
1225
|
+
headers: {
|
|
1226
|
+
"User-Agent": "SnapAgent-CMS-Crawler/1.0",
|
|
1227
|
+
"Accept": "text/html,application/xhtml+xml"
|
|
1228
|
+
},
|
|
1229
|
+
signal: AbortSignal.timeout(timeout)
|
|
1230
|
+
});
|
|
1231
|
+
if (!response.ok) return null;
|
|
1232
|
+
const contentType = response.headers.get("content-type") || "";
|
|
1233
|
+
if (!contentType.includes("text/html")) return null;
|
|
1234
|
+
return await response.text();
|
|
1235
|
+
}
|
|
1236
|
+
extractInternalLinks(html, base, stripQueryParams) {
|
|
1237
|
+
const $ = cheerio.load(html);
|
|
1238
|
+
const links = /* @__PURE__ */ new Set();
|
|
1239
|
+
$("a[href]").each((_, el) => {
|
|
1240
|
+
const href = ($(el).attr("href") || "").trim();
|
|
1241
|
+
if (!href) return;
|
|
1242
|
+
if (href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) return;
|
|
1243
|
+
try {
|
|
1244
|
+
const u = new URL(href, base);
|
|
1245
|
+
if (u.origin !== base.origin) return;
|
|
1246
|
+
u.hash = "";
|
|
1247
|
+
if (stripQueryParams) u.search = "";
|
|
1248
|
+
links.add(u.toString());
|
|
1249
|
+
} catch {
|
|
1250
|
+
}
|
|
1251
|
+
});
|
|
1252
|
+
return Array.from(links);
|
|
1253
|
+
}
|
|
1254
|
+
/**
|
|
1255
|
+
* Ingest content from a list of URLs
|
|
1256
|
+
*
|
|
1257
|
+
* @example
|
|
1258
|
+
* ```typescript
|
|
1259
|
+
* await plugin.ingestFromUrls([
|
|
1260
|
+
* 'https://example.com/about',
|
|
1261
|
+
* 'https://example.com/services',
|
|
1262
|
+
* 'https://example.com/contact',
|
|
1263
|
+
* ], {
|
|
1264
|
+
* contentSelector: '.page-content',
|
|
1265
|
+
* type: 'page',
|
|
1266
|
+
* });
|
|
1267
|
+
* ```
|
|
1268
|
+
*/
|
|
1269
|
+
async ingestFromUrls(urls, config = {}, options) {
|
|
1270
|
+
return this.crawlUrls(urls, {
|
|
1271
|
+
contentSelector: config.contentSelector,
|
|
1272
|
+
titleSelector: config.titleSelector,
|
|
1273
|
+
removeSelectors: config.removeSelectors,
|
|
1274
|
+
concurrency: config.concurrency ?? 3,
|
|
1275
|
+
delayMs: config.delayMs ?? 500,
|
|
1276
|
+
timeout: config.timeout ?? 3e4,
|
|
1277
|
+
typeFromUrl: config.typeFromUrl,
|
|
1278
|
+
defaultType: config.type || "page",
|
|
1279
|
+
metadata: config.metadata,
|
|
1280
|
+
stripQueryParams: config.stripQueryParams ?? false,
|
|
1281
|
+
render: config.render,
|
|
1282
|
+
renderOptions: config.renderOptions,
|
|
1283
|
+
debug: config.debug,
|
|
1284
|
+
crawlLedger: config.crawlLedger
|
|
1285
|
+
}, options);
|
|
1286
|
+
}
|
|
1287
|
+
/**
|
|
1288
|
+
* Ingest a single page from a URL (no sitemap discovery, no link lookup).
|
|
1289
|
+
* Uses the same crawl pipeline (static/render/auto) as other web ingestion methods.
|
|
1290
|
+
*/
|
|
1291
|
+
async ingestSinglePageFromUrl(config, options) {
|
|
1292
|
+
if (!config?.url) {
|
|
1293
|
+
return {
|
|
1294
|
+
success: false,
|
|
1295
|
+
indexed: 0,
|
|
1296
|
+
failed: 0,
|
|
1297
|
+
urlsCrawled: 0,
|
|
1298
|
+
urlsSkipped: 0,
|
|
1299
|
+
urlsFailed: 0,
|
|
1300
|
+
crawledAt: /* @__PURE__ */ new Date(),
|
|
1301
|
+
errors: [{ id: "config", error: "url is required" }]
|
|
1302
|
+
};
|
|
1303
|
+
}
|
|
1304
|
+
return this.crawlUrls([config.url], {
|
|
1305
|
+
contentSelector: config.contentSelector,
|
|
1306
|
+
titleSelector: config.titleSelector,
|
|
1307
|
+
removeSelectors: config.removeSelectors,
|
|
1308
|
+
concurrency: 1,
|
|
1309
|
+
delayMs: 0,
|
|
1310
|
+
timeout: config.timeout ?? 3e4,
|
|
1311
|
+
typeFromUrl: config.typeFromUrl,
|
|
1312
|
+
defaultType: config.type || "page",
|
|
1313
|
+
metadata: config.metadata,
|
|
1314
|
+
stripQueryParams: config.stripQueryParams ?? true,
|
|
1315
|
+
render: config.render,
|
|
1316
|
+
renderOptions: config.renderOptions,
|
|
1317
|
+
debug: config.debug,
|
|
1318
|
+
crawlLedger: config.crawlLedger
|
|
1319
|
+
}, options);
|
|
1320
|
+
}
|
|
1321
|
+
/**
|
|
1322
|
+
* Crawl a list of URLs and ingest their content
|
|
1323
|
+
*/
|
|
1324
|
+
async crawlUrls(urls, config, options) {
|
|
1325
|
+
const concurrency = config.concurrency ?? 3;
|
|
1326
|
+
const delayMs = config.delayMs ?? 500;
|
|
1327
|
+
const timeout = config.timeout ?? 3e4;
|
|
1328
|
+
const renderMode = config.render ?? false;
|
|
1329
|
+
const renderOptions = config.renderOptions || {};
|
|
1330
|
+
const minContentLength = renderOptions.minContentLength ?? 200;
|
|
1331
|
+
const dbg = this.createDebugCollector(config.debug);
|
|
1332
|
+
const ledgerOpts = this.resolveCrawlLedgerOptions(config);
|
|
1333
|
+
const forceRecrawl = !!(options && options.forceRecrawl);
|
|
1334
|
+
const agentId = options?.agentId ?? "shared";
|
|
1335
|
+
const stripQ = config.stripQueryParams ?? false;
|
|
1336
|
+
const urlByNorm = /* @__PURE__ */ new Map();
|
|
1337
|
+
for (const u of urls) {
|
|
1338
|
+
const norm = this.normalizeLedgerUrl(u, stripQ) || u;
|
|
1339
|
+
if (!urlByNorm.has(norm)) urlByNorm.set(norm, u);
|
|
1340
|
+
}
|
|
1341
|
+
const uniqueUrls = Array.from(urlByNorm.values());
|
|
1342
|
+
const counters = {
|
|
1343
|
+
staticOk: 0,
|
|
1344
|
+
renderOk: 0,
|
|
1345
|
+
renderFallbacks: 0,
|
|
1346
|
+
nonHtml: 0,
|
|
1347
|
+
tooSmall: 0,
|
|
1348
|
+
blockedSuspected: 0,
|
|
1349
|
+
renderErrors: 0,
|
|
1350
|
+
ledgerSkipped: 0
|
|
1351
|
+
};
|
|
1352
|
+
let indexed = 0;
|
|
1353
|
+
let urlsCrawled = 0;
|
|
1354
|
+
let urlsFailed = 0;
|
|
1355
|
+
const errors = [];
|
|
1356
|
+
const documents = [];
|
|
1357
|
+
const pageStatuses = [];
|
|
1358
|
+
const maxStatuses = ledgerOpts?.maxPageStatuses ?? 500;
|
|
1359
|
+
for (let i = 0; i < uniqueUrls.length; i += concurrency) {
|
|
1360
|
+
const batch = uniqueUrls.slice(i, i + concurrency);
|
|
1361
|
+
const results = await Promise.allSettled(
|
|
1362
|
+
batch.map(async (url) => {
|
|
1363
|
+
const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
|
|
1364
|
+
if (ledgerOpts && !forceRecrawl) {
|
|
1365
|
+
const entry = await this.findLedgerEntry(urlNormalized, agentId);
|
|
1366
|
+
if (this.shouldSkipLedger(
|
|
1367
|
+
entry,
|
|
1368
|
+
ledgerOpts.ttlMsIndexed,
|
|
1369
|
+
ledgerOpts.ttlMsFailure,
|
|
1370
|
+
ledgerOpts.ttlMsRenderError,
|
|
1371
|
+
false
|
|
1372
|
+
)) {
|
|
1373
|
+
counters.ledgerSkipped++;
|
|
1374
|
+
this.pushPageStatus(pageStatuses, maxStatuses, {
|
|
1375
|
+
url,
|
|
1376
|
+
urlNormalized,
|
|
1377
|
+
status: "skipped_ledger",
|
|
1378
|
+
skippedReason: `fresh:${entry?.lastStatus}`,
|
|
1379
|
+
contentLength: entry?.contentLength,
|
|
1380
|
+
title: entry?.title,
|
|
1381
|
+
docId: entry?.docId
|
|
1382
|
+
});
|
|
1383
|
+
dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
|
|
1384
|
+
return { kind: "ledger_skip", url };
|
|
1385
|
+
}
|
|
1386
|
+
}
|
|
1387
|
+
try {
|
|
1388
|
+
const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
|
|
1389
|
+
renderMode,
|
|
1390
|
+
renderOptions,
|
|
1391
|
+
minContentLength,
|
|
1392
|
+
dbg
|
|
1393
|
+
});
|
|
1394
|
+
if (diag?.modeUsed === "static_ok") counters.staticOk++;
|
|
1395
|
+
if (diag?.modeUsed === "render_ok") counters.renderOk++;
|
|
1396
|
+
if (diag?.modeUsed === "render_fallback_ok") counters.renderFallbacks++;
|
|
1397
|
+
if (diag?.reason === "non_html") counters.nonHtml++;
|
|
1398
|
+
if (diag?.reason === "too_small") counters.tooSmall++;
|
|
1399
|
+
if (diag?.reason === "blocked_suspected") counters.blockedSuspected++;
|
|
1400
|
+
if (diag?.reason === "render_error") counters.renderErrors++;
|
|
1401
|
+
const crawlSt = this.toLedgerStatus(doc, diag);
|
|
1402
|
+
if (ledgerOpts) {
|
|
1403
|
+
await this.upsertLedgerRecord({
|
|
1404
|
+
url,
|
|
1405
|
+
urlNormalized,
|
|
1406
|
+
agentId,
|
|
1407
|
+
status: crawlSt,
|
|
1408
|
+
doc,
|
|
1409
|
+
diag
|
|
1410
|
+
});
|
|
1411
|
+
}
|
|
1412
|
+
this.pushPageStatus(pageStatuses, maxStatuses, {
|
|
1413
|
+
url,
|
|
1414
|
+
urlNormalized,
|
|
1415
|
+
status: crawlSt,
|
|
1416
|
+
modeUsed: diag?.modeUsed,
|
|
1417
|
+
contentLength: doc?.content?.length,
|
|
1418
|
+
bodyTextLengthHint,
|
|
1419
|
+
title: doc?.metadata?.title,
|
|
1420
|
+
docId: doc?.id,
|
|
1421
|
+
error: diag?.errorMessage
|
|
1422
|
+
});
|
|
1423
|
+
return { kind: "doc", doc, url };
|
|
1424
|
+
} catch (error) {
|
|
1425
|
+
const msg = error instanceof Error ? error.message : String(error);
|
|
1426
|
+
if (ledgerOpts) {
|
|
1427
|
+
await this.upsertLedgerRecord({
|
|
1428
|
+
url,
|
|
1429
|
+
urlNormalized,
|
|
1430
|
+
agentId,
|
|
1431
|
+
status: "error",
|
|
1432
|
+
errorMessage: msg
|
|
1433
|
+
});
|
|
1434
|
+
}
|
|
1435
|
+
this.pushPageStatus(pageStatuses, maxStatuses, {
|
|
1436
|
+
url,
|
|
1437
|
+
urlNormalized,
|
|
1438
|
+
status: "error",
|
|
1439
|
+
error: msg
|
|
1440
|
+
});
|
|
1441
|
+
throw { url, error };
|
|
1442
|
+
}
|
|
1443
|
+
})
|
|
1444
|
+
);
|
|
1445
|
+
for (const result of results) {
|
|
1446
|
+
if (result.status === "fulfilled") {
|
|
1447
|
+
const v = result.value;
|
|
1448
|
+
if (v && typeof v === "object" && "kind" in v && v.kind === "ledger_skip") {
|
|
1449
|
+
continue;
|
|
1450
|
+
}
|
|
1451
|
+
if (v && typeof v === "object" && "kind" in v && v.kind === "doc" && v.doc) {
|
|
1452
|
+
documents.push(v.doc);
|
|
1453
|
+
urlsCrawled++;
|
|
1454
|
+
}
|
|
1455
|
+
} else if (result.status === "rejected") {
|
|
1456
|
+
urlsFailed++;
|
|
1457
|
+
errors.push({
|
|
1458
|
+
id: result.reason.url || "unknown",
|
|
1459
|
+
error: result.reason.error?.message || "Failed to crawl"
|
|
1460
|
+
});
|
|
1461
|
+
}
|
|
1462
|
+
}
|
|
1463
|
+
if (i + concurrency < uniqueUrls.length) {
|
|
1464
|
+
await this.delay(delayMs);
|
|
1465
|
+
}
|
|
1466
|
+
}
|
|
1467
|
+
if (documents.length > 0) {
|
|
1468
|
+
const ingestResult = await this.ingest(documents, options);
|
|
1469
|
+
indexed = ingestResult.indexed;
|
|
1470
|
+
if (ingestResult.errors) {
|
|
1471
|
+
errors.push(...ingestResult.errors);
|
|
1472
|
+
}
|
|
1473
|
+
}
|
|
1474
|
+
return {
|
|
1475
|
+
success: errors.length === 0,
|
|
1476
|
+
indexed,
|
|
1477
|
+
failed: errors.length,
|
|
1478
|
+
urlsCrawled,
|
|
1479
|
+
urlsSkipped: 0,
|
|
1480
|
+
urlsFailed,
|
|
1481
|
+
crawledAt: /* @__PURE__ */ new Date(),
|
|
1482
|
+
errors: errors.length > 0 ? errors : void 0,
|
|
1483
|
+
metadata: {
|
|
1484
|
+
counters,
|
|
1485
|
+
pageStatuses,
|
|
1486
|
+
debug: dbg.summary()
|
|
1487
|
+
}
|
|
1488
|
+
};
|
|
1489
|
+
}
|
|
1490
|
+
/**
|
|
1491
|
+
* Crawl a single page and extract content
|
|
1492
|
+
*/
|
|
1493
|
+
async crawlPage(url, config, timeout) {
|
|
1494
|
+
const response = await fetch(url, {
|
|
1495
|
+
headers: {
|
|
1496
|
+
"User-Agent": "SnapAgent-CMS-Crawler/1.0",
|
|
1497
|
+
"Accept": "text/html,application/xhtml+xml"
|
|
1498
|
+
},
|
|
1499
|
+
signal: AbortSignal.timeout(timeout)
|
|
1500
|
+
});
|
|
1501
|
+
if (!response.ok) {
|
|
1502
|
+
throw new Error(`HTTP ${response.status}`);
|
|
1503
|
+
}
|
|
1504
|
+
const contentType = response.headers.get("content-type") || "";
|
|
1505
|
+
if (!contentType.includes("text/html")) {
|
|
1506
|
+
return null;
|
|
1507
|
+
}
|
|
1508
|
+
const html = await response.text();
|
|
1509
|
+
return this.extractDocumentFromHtml(url, html, config);
|
|
1510
|
+
}
|
|
1511
|
+
/**
|
|
1512
|
+
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
1513
|
+
* would otherwise hit an empty wrapper.
|
|
1514
|
+
*/
|
|
1515
|
+
static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
1516
|
+
stripNoiseFromDom($, config) {
|
|
1517
|
+
const removeSelectors = config.removeSelectors || [
|
|
1518
|
+
"script",
|
|
1519
|
+
"style",
|
|
1520
|
+
"nav",
|
|
1521
|
+
"header",
|
|
1522
|
+
"footer",
|
|
1523
|
+
".sidebar",
|
|
1524
|
+
".navigation",
|
|
1525
|
+
".menu",
|
|
1526
|
+
".comments",
|
|
1527
|
+
'[role="navigation"]',
|
|
1528
|
+
'[role="banner"]'
|
|
1529
|
+
];
|
|
1530
|
+
removeSelectors.forEach((selector) => $(selector).remove());
|
|
1531
|
+
}
|
|
1532
|
+
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
1533
|
+
extractBestContentText($, config) {
|
|
1534
|
+
const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
|
|
1535
|
+
const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
|
|
1536
|
+
let best = "";
|
|
1537
|
+
for (const sel of selectors) {
|
|
1538
|
+
$(sel).each((_, el) => {
|
|
1539
|
+
const t = this.cleanContent($(el).text().trim());
|
|
1540
|
+
if (t.length > best.length) best = t;
|
|
1541
|
+
});
|
|
1542
|
+
}
|
|
1543
|
+
const bodyText = this.cleanContent($("body").text().trim());
|
|
1544
|
+
if (bodyText.length > best.length) best = bodyText;
|
|
1545
|
+
return best;
|
|
1546
|
+
}
|
|
1547
|
+
bodyTextLengthHint(html, config) {
|
|
1548
|
+
const $ = cheerio.load(html);
|
|
1549
|
+
this.stripNoiseFromDom($, config);
|
|
1550
|
+
return this.cleanContent($("body").text().trim()).length;
|
|
1551
|
+
}
|
|
1552
|
+
extractDocumentFromHtml(url, html, config) {
|
|
1553
|
+
const $ = cheerio.load(html);
|
|
1554
|
+
this.stripNoiseFromDom($, config);
|
|
1555
|
+
const titleSelector = config.titleSelector || "h1, title";
|
|
1556
|
+
let title = $(titleSelector).first().text().trim();
|
|
1557
|
+
if (!title) {
|
|
1558
|
+
title = $("title").text().trim();
|
|
1559
|
+
}
|
|
1560
|
+
const content = this.extractBestContentText($, config);
|
|
1561
|
+
const minChars = config.minExtractedContentLength ?? 50;
|
|
1562
|
+
if (!content || content.length < minChars) return null;
|
|
1563
|
+
let type = config.defaultType || "page";
|
|
1564
|
+
if (config.typeFromUrl) {
|
|
1565
|
+
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
1566
|
+
if (url.includes(pattern)) {
|
|
1567
|
+
type = typeName;
|
|
1568
|
+
break;
|
|
1569
|
+
}
|
|
1570
|
+
}
|
|
1571
|
+
}
|
|
1572
|
+
const id = this.urlToId(url);
|
|
1573
|
+
return {
|
|
1574
|
+
id,
|
|
1575
|
+
content,
|
|
1576
|
+
metadata: {
|
|
1577
|
+
type,
|
|
1578
|
+
title,
|
|
1579
|
+
url,
|
|
1580
|
+
...config.metadata
|
|
1581
|
+
}
|
|
1582
|
+
};
|
|
1583
|
+
}
|
|
1584
|
+
looksLikeDynamicShell(html) {
|
|
1585
|
+
const lower = html.toLowerCase();
|
|
1586
|
+
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
|
1587
|
+
const body = bodyMatch?.[1] ?? html;
|
|
1588
|
+
const textOnly = body.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
|
|
1589
|
+
const scriptCount = (body.match(/<script\b/gi) ?? []).length;
|
|
1590
|
+
const hasEmptyAppMountNode = /<(div|main)[^>]+id=["'](__next|root|app)["'][^>]*>\s*<\/\1>/i.test(body);
|
|
1591
|
+
const hasHydrationData = lower.includes("__next_data__") || lower.includes("__next_f") || lower.includes("window.__initial_state__") || lower.includes("window.__apollo_state__") || lower.includes("data-reactroot");
|
|
1592
|
+
const asksForJavascript = lower.includes("please enable javascript") || lower.includes("enable javascript to run this app") || lower.includes("you need to enable javascript");
|
|
1593
|
+
const hasLoadingHints = /\b(loading|please wait|spinner|initializing|fetching)\b/i.test(lower);
|
|
1594
|
+
const textLength = textOnly.length;
|
|
1595
|
+
const htmlLength = lower.length;
|
|
1596
|
+
const contentDensity = textLength / Math.max(htmlLength, 1);
|
|
1597
|
+
const isMostlyScripts = scriptCount >= 5 && textLength < 500;
|
|
1598
|
+
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
1599
|
+
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
1600
|
+
}
|
|
1601
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
1602
|
+
if (blockedSuspected) {
|
|
1603
|
+
return {
|
|
1604
|
+
doc: null,
|
|
1605
|
+
diag: { modeUsed: modeFailed, reason: "blocked_suspected" }
|
|
1606
|
+
};
|
|
1607
|
+
}
|
|
1608
|
+
if (renderFailure) {
|
|
1609
|
+
return {
|
|
1610
|
+
doc: null,
|
|
1611
|
+
diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure }
|
|
1612
|
+
};
|
|
1613
|
+
}
|
|
1614
|
+
return {
|
|
1615
|
+
doc,
|
|
1616
|
+
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
1617
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint
|
|
1618
|
+
};
|
|
1619
|
+
}
|
|
1620
|
+
async crawlPageSmart(url, config, timeout, ctx) {
|
|
1621
|
+
if (ctx.renderMode === true) {
|
|
1622
|
+
const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
1623
|
+
url,
|
|
1624
|
+
config,
|
|
1625
|
+
timeout,
|
|
1626
|
+
ctx.renderOptions,
|
|
1627
|
+
ctx.dbg
|
|
1628
|
+
);
|
|
1629
|
+
return this.diagFromRenderedAttempt(
|
|
1630
|
+
doc,
|
|
1631
|
+
bodyTextLengthHint,
|
|
1632
|
+
renderFailure,
|
|
1633
|
+
blockedSuspected,
|
|
1634
|
+
"render_ok",
|
|
1635
|
+
"render_failed"
|
|
1636
|
+
);
|
|
1637
|
+
}
|
|
1638
|
+
try {
|
|
1639
|
+
const response = await fetch(url, {
|
|
1640
|
+
headers: {
|
|
1641
|
+
"User-Agent": "SnapAgent-CMS-Crawler/1.0",
|
|
1642
|
+
"Accept": "text/html,application/xhtml+xml"
|
|
1643
|
+
},
|
|
1644
|
+
signal: AbortSignal.timeout(timeout)
|
|
1645
|
+
});
|
|
1646
|
+
if (!response.ok) {
|
|
1647
|
+
const status = response.status;
|
|
1648
|
+
if (status === 403 || status === 429 || status === 503) {
|
|
1649
|
+
ctx.dbg.log("crawl.blocked", { url, status });
|
|
1650
|
+
return { doc: null, diag: { modeUsed: "static_failed", reason: "blocked_suspected" } };
|
|
1651
|
+
}
|
|
1652
|
+
throw new Error(`HTTP ${status}`);
|
|
1653
|
+
}
|
|
1654
|
+
const contentType = response.headers.get("content-type") || "";
|
|
1655
|
+
if (!contentType.includes("text/html")) {
|
|
1656
|
+
return { doc: null, diag: { modeUsed: "static_failed", reason: "non_html" } };
|
|
1657
|
+
}
|
|
1658
|
+
const html = await response.text();
|
|
1659
|
+
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
1660
|
+
const staticHint = !doc ? this.bodyTextLengthHint(html, config) : void 0;
|
|
1661
|
+
if (doc && doc.content.length >= ctx.minContentLength) {
|
|
1662
|
+
return { doc, diag: { modeUsed: "static_ok" } };
|
|
1663
|
+
}
|
|
1664
|
+
if (ctx.renderMode === "auto") {
|
|
1665
|
+
const shouldRender = this.looksLikeDynamicShell(html) || !doc || doc.content.length < ctx.minContentLength;
|
|
1666
|
+
if (shouldRender) {
|
|
1667
|
+
ctx.dbg.log("crawl.renderFallback", {
|
|
1668
|
+
url,
|
|
1669
|
+
reason: !doc ? "no_doc" : "too_small",
|
|
1670
|
+
staticLength: doc?.content?.length ?? 0
|
|
1671
|
+
});
|
|
1672
|
+
const {
|
|
1673
|
+
doc: rendered,
|
|
1674
|
+
bodyTextLengthHint: rHint,
|
|
1675
|
+
renderFailure,
|
|
1676
|
+
blockedSuspected
|
|
1677
|
+
} = await this.crawlPageRendered(
|
|
1678
|
+
url,
|
|
1679
|
+
config,
|
|
1680
|
+
timeout,
|
|
1681
|
+
ctx.renderOptions,
|
|
1682
|
+
ctx.dbg
|
|
1683
|
+
);
|
|
1684
|
+
const mergedHint = rHint ?? staticHint;
|
|
1685
|
+
const fb = this.diagFromRenderedAttempt(
|
|
1686
|
+
rendered,
|
|
1687
|
+
mergedHint,
|
|
1688
|
+
renderFailure,
|
|
1689
|
+
blockedSuspected,
|
|
1690
|
+
"render_fallback_ok",
|
|
1691
|
+
"render_fallback_failed"
|
|
1692
|
+
);
|
|
1693
|
+
if (!rendered && (renderFailure || blockedSuspected)) {
|
|
1694
|
+
fb.bodyTextLengthHint = staticHint ?? rHint;
|
|
1695
|
+
}
|
|
1696
|
+
return fb;
|
|
1697
|
+
}
|
|
1698
|
+
}
|
|
1699
|
+
return {
|
|
1700
|
+
doc: null,
|
|
1701
|
+
diag: { modeUsed: "static_failed", reason: "too_small" },
|
|
1702
|
+
bodyTextLengthHint: staticHint
|
|
1703
|
+
};
|
|
1704
|
+
} catch (e) {
|
|
1705
|
+
throw e;
|
|
1706
|
+
}
|
|
1707
|
+
}
|
|
1708
|
+
async crawlPageRendered(url, config, timeout, renderOptions, dbg) {
|
|
1709
|
+
let playwright;
|
|
1710
|
+
try {
|
|
1711
|
+
playwright = await Function('return import("playwright")')();
|
|
1712
|
+
} catch (e) {
|
|
1713
|
+
dbg.log("render.missingDependency", { url, error: "playwright_not_installed" });
|
|
1714
|
+
throw new Error("playwright is not installed. Add it to dependencies to use crawlPageRendered().");
|
|
1715
|
+
}
|
|
1716
|
+
const waitUntil = renderOptions.waitUntil || "domcontentloaded";
|
|
1717
|
+
const waitForSelector = renderOptions.waitForSelector;
|
|
1718
|
+
const scrollCfg = renderOptions.scroll || {};
|
|
1719
|
+
const doScroll = scrollCfg.enabled ?? false;
|
|
1720
|
+
const maxScrolls = scrollCfg.maxScrolls ?? 10;
|
|
1721
|
+
const scrollDelayMs = scrollCfg.scrollDelayMs ?? 750;
|
|
1722
|
+
const stableIterations = scrollCfg.stableIterations ?? 2;
|
|
1723
|
+
const postRenderDelayMs = renderOptions.postRenderDelayMs ?? 0;
|
|
1724
|
+
const browser = await playwright.chromium.launch({ headless: true });
|
|
1725
|
+
try {
|
|
1726
|
+
const page = await browser.newPage();
|
|
1727
|
+
await page.goto(url, { waitUntil, timeout });
|
|
1728
|
+
if (waitForSelector) {
|
|
1729
|
+
await page.waitForSelector(waitForSelector, { timeout });
|
|
1730
|
+
}
|
|
1731
|
+
if (postRenderDelayMs > 0) {
|
|
1732
|
+
await page.waitForTimeout(postRenderDelayMs);
|
|
1733
|
+
}
|
|
1734
|
+
if (doScroll) {
|
|
1735
|
+
let stable = 0;
|
|
1736
|
+
let lastLen = 0;
|
|
1737
|
+
for (let i = 0; i < maxScrolls; i++) {
|
|
1738
|
+
const len = await page.evaluate("(document.body?.innerText || '').length");
|
|
1739
|
+
if (len <= lastLen + 20) stable++;
|
|
1740
|
+
else stable = 0;
|
|
1741
|
+
lastLen = len;
|
|
1742
|
+
if (stable >= stableIterations) break;
|
|
1743
|
+
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)");
|
|
1744
|
+
await page.waitForTimeout(scrollDelayMs);
|
|
1745
|
+
}
|
|
1746
|
+
}
|
|
1747
|
+
const html = await page.content();
|
|
1748
|
+
const bodyTextLengthHint = this.bodyTextLengthHint(html, config);
|
|
1749
|
+
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
1750
|
+
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
1751
|
+
try {
|
|
1752
|
+
const saveDir = config.debug.saveDir;
|
|
1753
|
+
const safeId = this.urlToId(url) || "page";
|
|
1754
|
+
const outDir = path.join(saveDir, safeId);
|
|
1755
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
1756
|
+
fs.writeFileSync(path.join(outDir, "rendered.html"), html, "utf8");
|
|
1757
|
+
fs.writeFileSync(path.join(outDir, "extracted.txt"), doc?.content || "", "utf8");
|
|
1758
|
+
fs.writeFileSync(path.join(outDir, "meta.json"), JSON.stringify(doc?.metadata || {}, null, 2), "utf8");
|
|
1759
|
+
} catch (e) {
|
|
1760
|
+
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
1761
|
+
}
|
|
1762
|
+
}
|
|
1763
|
+
return { doc, bodyTextLengthHint };
|
|
1764
|
+
} catch (e) {
|
|
1765
|
+
const msg = String(e?.message || e || "render_failed");
|
|
1766
|
+
const lower = msg.toLowerCase();
|
|
1767
|
+
if (lower.includes("captcha") || lower.includes("access denied")) {
|
|
1768
|
+
dbg.log("render.blocked", { url, error: msg });
|
|
1769
|
+
return { doc: null, bodyTextLengthHint: 0, blockedSuspected: true };
|
|
1770
|
+
}
|
|
1771
|
+
dbg.log("render.error", { url, error: msg });
|
|
1772
|
+
return { doc: null, bodyTextLengthHint: 0, renderFailure: msg };
|
|
1773
|
+
} finally {
|
|
1774
|
+
await browser.close();
|
|
1775
|
+
}
|
|
1776
|
+
}
|
|
1777
|
+
async discoverSitemaps(baseUrl, timeout, dbg) {
|
|
1778
|
+
const base = new URL(baseUrl);
|
|
1779
|
+
const robotsUrl = new URL("/robots.txt", base).toString();
|
|
1780
|
+
const found = /* @__PURE__ */ new Set();
|
|
1781
|
+
try {
|
|
1782
|
+
const res = await fetch(robotsUrl, {
|
|
1783
|
+
headers: { "User-Agent": "SnapAgent-CMS-Crawler/1.0" },
|
|
1784
|
+
signal: AbortSignal.timeout(timeout)
|
|
1785
|
+
});
|
|
1786
|
+
if (res.ok) {
|
|
1787
|
+
const txt = await res.text();
|
|
1788
|
+
const rx = /^sitemap:\s*(\S+)/gim;
|
|
1789
|
+
let m;
|
|
1790
|
+
while ((m = rx.exec(txt)) !== null) {
|
|
1791
|
+
const sm = m[1].trim();
|
|
1792
|
+
if (sm.startsWith("http")) found.add(sm);
|
|
1793
|
+
}
|
|
1794
|
+
dbg.log("discovery.robots", { robotsUrl, ok: true, sitemapCount: found.size });
|
|
1795
|
+
} else {
|
|
1796
|
+
dbg.log("discovery.robots", { robotsUrl, ok: false, status: res.status });
|
|
1797
|
+
}
|
|
1798
|
+
} catch (e) {
|
|
1799
|
+
dbg.log("discovery.robots", { robotsUrl, ok: false, error: e instanceof Error ? e.message : "failed" });
|
|
1800
|
+
}
|
|
1801
|
+
if (found.size === 0) {
|
|
1802
|
+
const candidates = [
|
|
1803
|
+
"/sitemap.xml",
|
|
1804
|
+
"/sitemap_index.xml",
|
|
1805
|
+
"/sitemap-index.xml",
|
|
1806
|
+
"/wp-sitemap.xml"
|
|
1807
|
+
].map((p) => new URL(p, base).toString());
|
|
1808
|
+
candidates.forEach((c) => found.add(c));
|
|
1809
|
+
dbg.log("discovery.sitemapCandidates", { count: candidates.length });
|
|
1810
|
+
}
|
|
1811
|
+
return Array.from(found);
|
|
1812
|
+
}
|
|
1813
|
+
createDebugCollector(debug) {
|
|
1814
|
+
const enabled = !!debug?.enabled;
|
|
1815
|
+
const level = debug?.level || "summary";
|
|
1816
|
+
const maxPerUrlLogs = debug?.maxPerUrlLogs ?? 200;
|
|
1817
|
+
const entries = [];
|
|
1818
|
+
return {
|
|
1819
|
+
log: (event, data) => {
|
|
1820
|
+
if (!enabled) return;
|
|
1821
|
+
if (level === "summary" && !event.startsWith("discovery.") && !event.startsWith("crawl.")) return;
|
|
1822
|
+
if (entries.length >= maxPerUrlLogs) return;
|
|
1823
|
+
entries.push({ ts: (/* @__PURE__ */ new Date()).toISOString(), event, data });
|
|
1824
|
+
},
|
|
1825
|
+
summary: () => enabled ? { enabled, level, entries } : void 0
|
|
1826
|
+
};
|
|
1827
|
+
}
|
|
1828
|
+
/**
|
|
1829
|
+
* Clean extracted text content
|
|
1830
|
+
*/
|
|
1831
|
+
cleanContent(text) {
|
|
1832
|
+
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
1833
|
+
}
|
|
1834
|
+
/**
|
|
1835
|
+
* Convert URL to a stable document ID
|
|
1836
|
+
*/
|
|
1837
|
+
urlToId(url) {
|
|
1838
|
+
return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
|
|
1839
|
+
}
|
|
1840
|
+
/**
|
|
1841
|
+
* Delay helper
|
|
1842
|
+
*/
|
|
1843
|
+
delay(ms) {
|
|
1844
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
1845
|
+
}
|
|
1846
|
+
// ============================================================================
|
|
1847
|
+
// RSS/Atom Feed Ingestion
|
|
1848
|
+
// ============================================================================
|
|
1849
|
+
/**
|
|
1850
|
+
* Ingest content from an RSS or Atom feed
|
|
1851
|
+
*
|
|
1852
|
+
* @example
|
|
1853
|
+
* ```typescript
|
|
1854
|
+
* // Simple RSS ingestion
|
|
1855
|
+
* await plugin.ingestFromRSS({
|
|
1856
|
+
* feedUrl: 'https://myblog.com/feed/',
|
|
1857
|
+
* });
|
|
1858
|
+
*
|
|
1859
|
+
* // Fetch full page content for each item
|
|
1860
|
+
* await plugin.ingestFromRSS({
|
|
1861
|
+
* feedUrl: 'https://myblog.com/feed/',
|
|
1862
|
+
* fetchFullContent: true,
|
|
1863
|
+
* contentSelector: 'article',
|
|
1864
|
+
* });
|
|
1865
|
+
* ```
|
|
1866
|
+
*/
|
|
1867
|
+
async ingestFromRSS(config, options) {
|
|
1868
|
+
try {
|
|
1869
|
+
const response = await fetch(config.feedUrl, {
|
|
1870
|
+
headers: { "User-Agent": "SnapAgent-CMS-Crawler/1.0" },
|
|
1871
|
+
signal: AbortSignal.timeout(3e4)
|
|
1872
|
+
});
|
|
1873
|
+
if (!response.ok) {
|
|
1874
|
+
return {
|
|
1875
|
+
success: false,
|
|
1876
|
+
indexed: 0,
|
|
1877
|
+
failed: 1,
|
|
1878
|
+
urlsCrawled: 0,
|
|
1879
|
+
urlsSkipped: 0,
|
|
1880
|
+
urlsFailed: 1,
|
|
1881
|
+
crawledAt: /* @__PURE__ */ new Date(),
|
|
1882
|
+
errors: [{ id: config.feedUrl, error: `HTTP ${response.status}` }]
|
|
1883
|
+
};
|
|
1884
|
+
}
|
|
1885
|
+
const xml = await response.text();
|
|
1886
|
+
const items = this.parseRSSFeed(xml);
|
|
1887
|
+
if (items.length === 0) {
|
|
1888
|
+
return {
|
|
1889
|
+
success: true,
|
|
1890
|
+
indexed: 0,
|
|
1891
|
+
failed: 0,
|
|
1892
|
+
urlsCrawled: 0,
|
|
1893
|
+
urlsSkipped: 0,
|
|
1894
|
+
urlsFailed: 0,
|
|
1895
|
+
crawledAt: /* @__PURE__ */ new Date()
|
|
1896
|
+
};
|
|
1897
|
+
}
|
|
1898
|
+
const documents = [];
|
|
1899
|
+
const type = config.type || "post";
|
|
1900
|
+
let urlsCrawled = 0;
|
|
1901
|
+
let urlsFailed = 0;
|
|
1902
|
+
const errors = [];
|
|
1903
|
+
for (const item of items) {
|
|
1904
|
+
try {
|
|
1905
|
+
let content = item.content || item.description || "";
|
|
1906
|
+
if (config.fetchFullContent && item.link) {
|
|
1907
|
+
try {
|
|
1908
|
+
const doc = await this.crawlPage(item.link, {
|
|
1909
|
+
contentSelector: config.contentSelector,
|
|
1910
|
+
defaultType: type
|
|
1911
|
+
}, 3e4);
|
|
1912
|
+
if (doc) {
|
|
1913
|
+
content = doc.content;
|
|
1914
|
+
}
|
|
1915
|
+
urlsCrawled++;
|
|
1916
|
+
} catch (error) {
|
|
1917
|
+
urlsFailed++;
|
|
1918
|
+
}
|
|
1919
|
+
}
|
|
1920
|
+
content = this.stripHtml(content);
|
|
1921
|
+
if (content.length < 50) continue;
|
|
1922
|
+
documents.push({
|
|
1923
|
+
id: this.urlToId(item.link || item.guid || `rss-${documents.length}`),
|
|
1924
|
+
content,
|
|
1925
|
+
metadata: {
|
|
1926
|
+
type,
|
|
1927
|
+
title: item.title,
|
|
1928
|
+
url: item.link,
|
|
1929
|
+
publishedAt: item.pubDate,
|
|
1930
|
+
author: item.author,
|
|
1931
|
+
categories: item.categories,
|
|
1932
|
+
...config.metadata
|
|
1933
|
+
}
|
|
1934
|
+
});
|
|
1935
|
+
} catch (error) {
|
|
1936
|
+
errors.push({
|
|
1937
|
+
id: item.link || "unknown",
|
|
1938
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
1939
|
+
});
|
|
1940
|
+
}
|
|
1941
|
+
}
|
|
1942
|
+
let indexed = 0;
|
|
1943
|
+
if (documents.length > 0) {
|
|
1944
|
+
const ingestResult = await this.ingest(documents, options);
|
|
1945
|
+
indexed = ingestResult.indexed;
|
|
1946
|
+
}
|
|
1947
|
+
return {
|
|
1948
|
+
success: errors.length === 0,
|
|
1949
|
+
indexed,
|
|
1950
|
+
failed: errors.length,
|
|
1951
|
+
urlsCrawled,
|
|
1952
|
+
urlsSkipped: 0,
|
|
1953
|
+
urlsFailed,
|
|
1954
|
+
crawledAt: /* @__PURE__ */ new Date(),
|
|
1955
|
+
errors: errors.length > 0 ? errors : void 0
|
|
1956
|
+
};
|
|
1957
|
+
} catch (error) {
|
|
1958
|
+
return {
|
|
1959
|
+
success: false,
|
|
1960
|
+
indexed: 0,
|
|
1961
|
+
failed: 1,
|
|
1962
|
+
urlsCrawled: 0,
|
|
1963
|
+
urlsSkipped: 0,
|
|
1964
|
+
urlsFailed: 0,
|
|
1965
|
+
crawledAt: /* @__PURE__ */ new Date(),
|
|
1966
|
+
errors: [{
|
|
1967
|
+
id: config.feedUrl,
|
|
1968
|
+
error: error instanceof Error ? error.message : "Unknown error"
|
|
1969
|
+
}]
|
|
1970
|
+
};
|
|
1971
|
+
}
|
|
1972
|
+
}
|
|
1973
|
+
/**
|
|
1974
|
+
* Parse RSS/Atom feed XML
|
|
1975
|
+
*/
|
|
1976
|
+
parseRSSFeed(xml) {
|
|
1977
|
+
const items = [];
|
|
1978
|
+
const isAtom = xml.includes("<feed") && xml.includes('xmlns="http://www.w3.org/2005/Atom"');
|
|
1979
|
+
if (isAtom) {
|
|
1980
|
+
const entryRegex = /<entry>([\s\S]*?)<\/entry>/gi;
|
|
1981
|
+
let match;
|
|
1982
|
+
while ((match = entryRegex.exec(xml)) !== null) {
|
|
1983
|
+
const entry = match[1];
|
|
1984
|
+
items.push({
|
|
1985
|
+
title: this.extractXmlValue(entry, "title"),
|
|
1986
|
+
link: this.extractAtomLink(entry),
|
|
1987
|
+
guid: this.extractXmlValue(entry, "id"),
|
|
1988
|
+
content: this.extractXmlValue(entry, "content") || this.extractXmlValue(entry, "summary"),
|
|
1989
|
+
pubDate: this.extractXmlValue(entry, "published") || this.extractXmlValue(entry, "updated"),
|
|
1990
|
+
author: this.extractXmlValue(entry, "name"),
|
|
1991
|
+
// Inside <author>
|
|
1992
|
+
categories: this.extractXmlValues(entry, "category", "term")
|
|
1993
|
+
});
|
|
1994
|
+
}
|
|
1995
|
+
} else {
|
|
1996
|
+
const itemRegex = /<item>([\s\S]*?)<\/item>/gi;
|
|
1997
|
+
let match;
|
|
1998
|
+
while ((match = itemRegex.exec(xml)) !== null) {
|
|
1999
|
+
const item = match[1];
|
|
2000
|
+
items.push({
|
|
2001
|
+
title: this.extractXmlValue(item, "title"),
|
|
2002
|
+
link: this.extractXmlValue(item, "link"),
|
|
2003
|
+
guid: this.extractXmlValue(item, "guid"),
|
|
2004
|
+
description: this.extractXmlValue(item, "description"),
|
|
2005
|
+
content: this.extractXmlValue(item, "content:encoded") || this.extractXmlValue(item, "content"),
|
|
2006
|
+
pubDate: this.extractXmlValue(item, "pubDate"),
|
|
2007
|
+
author: this.extractXmlValue(item, "author") || this.extractXmlValue(item, "dc:creator"),
|
|
2008
|
+
categories: this.extractXmlValues(item, "category")
|
|
2009
|
+
});
|
|
2010
|
+
}
|
|
2011
|
+
}
|
|
2012
|
+
return items;
|
|
2013
|
+
}
|
|
2014
|
+
/**
|
|
2015
|
+
* Extract a single value from XML
|
|
2016
|
+
*/
|
|
2017
|
+
extractXmlValue(xml, tag) {
|
|
2018
|
+
const cdataRegex = new RegExp(`<${tag}[^>]*><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/${tag}>`, "i");
|
|
2019
|
+
const cdataMatch = xml.match(cdataRegex);
|
|
2020
|
+
if (cdataMatch) {
|
|
2021
|
+
return cdataMatch[1].trim();
|
|
2022
|
+
}
|
|
2023
|
+
const regex = new RegExp(`<${tag}[^>]*>([^<]*)<\\/${tag}>`, "i");
|
|
2024
|
+
const match = xml.match(regex);
|
|
2025
|
+
return match ? match[1].trim() : void 0;
|
|
2026
|
+
}
|
|
2027
|
+
/**
|
|
2028
|
+
* Extract multiple values from XML
|
|
2029
|
+
*/
|
|
2030
|
+
extractXmlValues(xml, tag, attr) {
|
|
2031
|
+
const values = [];
|
|
2032
|
+
if (attr) {
|
|
2033
|
+
const regex = new RegExp(`<${tag}[^>]*${attr}="([^"]*)"[^>]*/?>`, "gi");
|
|
2034
|
+
let match;
|
|
2035
|
+
while ((match = regex.exec(xml)) !== null) {
|
|
2036
|
+
values.push(match[1]);
|
|
2037
|
+
}
|
|
2038
|
+
} else {
|
|
2039
|
+
const regex = new RegExp(`<${tag}[^>]*>([^<]*)<\\/${tag}>`, "gi");
|
|
2040
|
+
let match;
|
|
2041
|
+
while ((match = regex.exec(xml)) !== null) {
|
|
2042
|
+
values.push(match[1].trim());
|
|
2043
|
+
}
|
|
2044
|
+
}
|
|
2045
|
+
return values;
|
|
2046
|
+
}
|
|
2047
|
+
/**
|
|
2048
|
+
* Extract link from Atom entry
|
|
2049
|
+
*/
|
|
2050
|
+
extractAtomLink(entry) {
|
|
2051
|
+
const alternateMatch = entry.match(/<link[^>]*rel="alternate"[^>]*href="([^"]+)"/i);
|
|
2052
|
+
if (alternateMatch) return alternateMatch[1];
|
|
2053
|
+
const linkMatch = entry.match(/<link[^>]*href="([^"]+)"/i);
|
|
2054
|
+
return linkMatch ? linkMatch[1] : void 0;
|
|
2055
|
+
}
|
|
2056
|
+
/**
|
|
2057
|
+
* Strip HTML tags from content
|
|
2058
|
+
*/
|
|
2059
|
+
stripHtml(html) {
|
|
2060
|
+
return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/ /g, " ").replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, '"').replace(/'/g, "'").replace(/\s+/g, " ").trim();
|
|
2061
|
+
}
|
|
2062
|
+
// ============================================================================
|
|
2063
|
+
// Utility Methods
|
|
2064
|
+
// ============================================================================
|
|
2065
|
+
/**
|
|
2066
|
+
* Get cache statistics
|
|
2067
|
+
*/
|
|
2068
|
+
getCacheStats() {
|
|
2069
|
+
const total = this.cacheStats.hits + this.cacheStats.misses;
|
|
2070
|
+
const hitRate = total > 0 ? (this.cacheStats.hits / total).toFixed(3) : "0.000";
|
|
2071
|
+
return { ...this.cacheStats, hitRate };
|
|
2072
|
+
}
|
|
2073
|
+
/**
|
|
2074
|
+
* Clear embedding cache
|
|
2075
|
+
*/
|
|
2076
|
+
clearCache() {
|
|
2077
|
+
this.embeddingCache.clear();
|
|
2078
|
+
this.cacheStats = { hits: 0, misses: 0 };
|
|
2079
|
+
}
|
|
2080
|
+
/**
|
|
2081
|
+
* Get plugin configuration (for persistence)
|
|
2082
|
+
*/
|
|
2083
|
+
getConfig() {
|
|
2084
|
+
return {
|
|
2085
|
+
name: this.name,
|
|
2086
|
+
mongoUri: "${MONGODB_URI}",
|
|
2087
|
+
// Reference env var
|
|
2088
|
+
dbName: this.config.dbName,
|
|
2089
|
+
collection: this.config.collection,
|
|
2090
|
+
openaiApiKey: "${OPENAI_API_KEY}",
|
|
2091
|
+
// Reference env var
|
|
2092
|
+
embeddingModel: this.config.embeddingModel,
|
|
2093
|
+
tenantId: this.config.tenantId,
|
|
2094
|
+
vectorIndexName: this.config.vectorIndexName,
|
|
2095
|
+
numCandidates: this.config.numCandidates,
|
|
2096
|
+
limit: this.config.limit,
|
|
2097
|
+
minScore: this.config.minScore,
|
|
2098
|
+
filterableFields: this.config.filterableFields,
|
|
2099
|
+
typeBoosts: this.config.typeBoosts,
|
|
2100
|
+
recencyBoost: this.config.recencyBoost,
|
|
2101
|
+
priority: this.priority
|
|
2102
|
+
};
|
|
2103
|
+
}
|
|
2104
|
+
};
|
|
2105
|
+
export {
|
|
2106
|
+
WebRAGPlugin
|
|
2107
|
+
};
|