@snap-agent/rag-web 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,2107 @@
1
+ // src/WebRAGPlugin.ts
2
+ import { MongoClient } from "mongodb";
3
+ import OpenAI from "openai";
4
+ import * as cheerio from "cheerio";
5
+ import * as fs from "fs";
6
+ import * as path from "path";
7
+ var WebRAGPlugin = class _WebRAGPlugin {
8
+ name = "web-rag"; // Plugin identifier; reported as `metadata.plugin` by retrieveContext().
9
+ type = "rag"; // Plugin category marker (presumably read by the host framework - confirm).
10
+ priority; // Assigned in the constructor from config.priority (default 100).
11
+ config; // Built-in defaults merged with the caller-supplied config.
12
+ client = null; // MongoClient instance; created lazily on first collection access.
13
+ db = null; // Database handle derived from `client`; reset to null by disconnect().
14
+ openai; // OpenAI SDK client used for embedding requests.
15
+ // Embedding cache
16
+ embeddingCache = /* @__PURE__ */ new Map(); // Maps input text -> { value, timestamp }.
17
+ cacheStats = { hits: 0, misses: 0 }; // Counters updated by generateEmbedding().
18
+ constructor(config) {
19
+ this.config = {
20
+ collection: "web_content",
21
+ embeddingModel: "text-embedding-3-small",
22
+ vectorIndexName: "web_vector_index",
23
+ numCandidates: 100,
24
+ limit: 10,
25
+ minScore: 0.7,
26
+ filterableFields: ["type"],
27
+ ...config
28
+ };
29
+ this.priority = config.priority ?? 100;
30
+ this.openai = new OpenAI({ apiKey: config.openaiApiKey });
31
+ }
32
+ // ============================================================================
33
+ // MongoDB Connection
34
+ // ============================================================================
35
+ async getCollection() {
36
+ if (!this.client) {
37
+ this.client = new MongoClient(this.config.mongoUri);
38
+ await this.client.connect();
39
+ this.db = this.client.db(this.config.dbName);
40
+ }
41
+ return this.db.collection(this.config.collection);
42
+ }
43
+ async getLedgerCollection() {
44
+ if (!this.client) {
45
+ this.client = new MongoClient(this.config.mongoUri);
46
+ await this.client.connect();
47
+ this.db = this.client.db(this.config.dbName);
48
+ }
49
+ const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
50
+ return this.db.collection(name);
51
+ }
52
+ /**
53
+ * List recent crawl ledger rows (for dashboards / pagination in the front).
54
+ */
55
+ async listCrawlLedger(options = {}) {
56
+ const col = await this.getLedgerCollection();
57
+ const filter = { tenantId: this.config.tenantId };
58
+ filter.agentId = options.agentId ?? "shared";
59
+ if (options.domain) filter.domain = options.domain;
60
+ if (options.status) filter.lastStatus = options.status;
61
+ const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
62
+ const skip = Math.max(options.skip ?? 0, 0);
63
+ return col.find(filter).sort({ lastCrawledAt: -1 }).skip(skip).limit(limit).toArray();
64
+ }
65
+ resolveCrawlLedgerOptions(config) {
66
+ const plugin = this.config.crawlLedger;
67
+ const per = config.crawlLedger;
68
+ const enabled = per?.enabled ?? plugin?.enabled ?? false;
69
+ if (!enabled) return null;
70
+ const ttlMsFailure = per?.ttlMsFailure ?? plugin?.ttlMsFailure ?? 60 * 60 * 1e3;
71
+ return {
72
+ ttlMsIndexed: per?.ttlMsIndexed ?? plugin?.ttlMsIndexed ?? 7 * 24 * 60 * 60 * 1e3,
73
+ ttlMsFailure,
74
+ ttlMsRenderError: per?.ttlMsRenderError ?? plugin?.ttlMsRenderError ?? 5 * 60 * 1e3,
75
+ maxPageStatuses: per?.maxPageStatuses ?? 500,
76
+ stripQuery: config.stripQueryParams ?? false
77
+ };
78
+ }
79
+ normalizeLedgerUrl(url, stripQuery) {
80
+ return this.normalizeWebsiteUrl(url, stripQuery);
81
+ }
82
+ shouldSkipLedger(entry, ttlMsIndexed, ttlMsFailure, ttlMsRenderError, forceRecrawl) {
83
+ if (forceRecrawl || !entry) return false;
84
+ const t = entry.lastCrawledAt instanceof Date ? entry.lastCrawledAt.getTime() : new Date(entry.lastCrawledAt).getTime();
85
+ const age = Date.now() - t;
86
+ if (entry.lastStatus === "indexed" && age < ttlMsIndexed) return true;
87
+ if (entry.lastStatus === "error" && age < ttlMsRenderError) return true;
88
+ if (entry.lastStatus !== "indexed" && entry.lastStatus !== "error" && age < ttlMsFailure) {
89
+ return true;
90
+ }
91
+ return false;
92
+ }
93
+ async findLedgerEntry(urlNormalized, agentId) {
94
+ const col = await this.getLedgerCollection();
95
+ return col.findOne({
96
+ tenantId: this.config.tenantId,
97
+ agentId,
98
+ urlNormalized
99
+ });
100
+ }
101
+ toLedgerStatus(doc, diag) {
102
+ if (doc) return "indexed";
103
+ if (diag?.reason === "non_html") return "non_html";
104
+ if (diag?.reason === "blocked_suspected") return "blocked_suspected";
105
+ if (diag?.reason === "render_error") return "error";
106
+ return "too_small";
107
+ }
108
+ async upsertLedgerRecord(params) {
109
+ const col = await this.getLedgerCollection();
110
+ let domain = "";
111
+ try {
112
+ domain = new URL(params.url).hostname;
113
+ } catch {
114
+ domain = "";
115
+ }
116
+ const now = /* @__PURE__ */ new Date();
117
+ const errMsg = params.errorMessage ?? params.diag?.errorMessage;
118
+ const $set = {
119
+ tenantId: this.config.tenantId,
120
+ agentId: params.agentId,
121
+ urlNormalized: params.urlNormalized,
122
+ url: params.url,
123
+ domain,
124
+ lastStatus: params.status,
125
+ lastCrawledAt: now,
126
+ updatedAt: now
127
+ };
128
+ if (errMsg !== void 0) {
129
+ $set.errorMessage = errMsg;
130
+ } else if (params.status === "indexed" && params.doc) {
131
+ $set.errorMessage = null;
132
+ }
133
+ if (params.doc) {
134
+ $set.modeUsed = params.diag?.modeUsed;
135
+ $set.contentLength = params.doc.content.length;
136
+ $set.title = params.doc.metadata?.title;
137
+ $set.docId = params.doc.id;
138
+ } else {
139
+ $set.modeUsed = params.diag?.modeUsed;
140
+ $set.contentLength = null;
141
+ $set.title = null;
142
+ $set.docId = null;
143
+ }
144
+ await col.updateOne(
145
+ {
146
+ tenantId: this.config.tenantId,
147
+ agentId: params.agentId,
148
+ urlNormalized: params.urlNormalized
149
+ },
150
+ { $set },
151
+ { upsert: true }
152
+ );
153
+ }
154
+ pushPageStatus(list, max, entry) {
155
+ list.push(entry);
156
+ while (list.length > max) list.shift();
157
+ }
158
+ async disconnect() {
159
+ if (this.client) {
160
+ await this.client.close();
161
+ this.client = null;
162
+ this.db = null;
163
+ }
164
+ }
165
+ // ============================================================================
166
+ // RAG Plugin Interface
167
+ // ============================================================================
168
+ /**
169
+ * Retrieve contextual content for a message
170
+ */
171
+ async retrieveContext(message, options = {}) {
172
+ const queryVector = await this.generateEmbedding(message);
173
+ const hardFilters = {
174
+ tenantId: this.config.tenantId,
175
+ ...options.filters
176
+ };
177
+ if (options.agentId) {
178
+ hardFilters.agentId = { $in: ["shared", options.agentId] };
179
+ }
180
+ const results = await this.vectorSearch({
181
+ queryVector,
182
+ hardFilters
183
+ });
184
+ let scoredResults = results;
185
+ if (this.config.typeBoosts) {
186
+ scoredResults = results.map((doc) => ({
187
+ ...doc,
188
+ score: doc.score * (this.config.typeBoosts[doc.metadata.type] ?? 1)
189
+ }));
190
+ }
191
+ if (this.config.recencyBoost?.enabled) {
192
+ const { field, decayDays, maxBoost = 1.2 } = this.config.recencyBoost;
193
+ const now = Date.now();
194
+ const decayMs = decayDays * 24 * 60 * 60 * 1e3;
195
+ scoredResults = scoredResults.map((doc) => {
196
+ const dateValue = doc.metadata[field];
197
+ if (!dateValue) return doc;
198
+ const docDate = new Date(dateValue).getTime();
199
+ const age = now - docDate;
200
+ const freshness = Math.max(0, 1 - age / decayMs);
201
+ const boost = 1 + (maxBoost - 1) * freshness;
202
+ return { ...doc, score: doc.score * boost };
203
+ });
204
+ }
205
+ scoredResults.sort((a, b) => b.score - a.score);
206
+ scoredResults = scoredResults.slice(0, this.config.limit);
207
+ const content = this.formatResultsToContext(scoredResults);
208
+ return {
209
+ content,
210
+ metadata: {
211
+ plugin: this.name,
212
+ contentCount: scoredResults.length,
213
+ types: [...new Set(scoredResults.map((d) => d.metadata.type))],
214
+ topResults: scoredResults.slice(0, 5).map((doc) => ({
215
+ id: doc.id,
216
+ type: doc.metadata.type,
217
+ title: doc.metadata.title,
218
+ url: doc.metadata.url,
219
+ score: doc.score
220
+ }))
221
+ }
222
+ };
223
+ }
224
+ /**
225
+ * Format retrieved content for LLM context
226
+ */
227
+ formatResultsToContext(docs) {
228
+ if (docs.length === 0) {
229
+ return "No relevant content found.";
230
+ }
231
+ const sections = ["## Relevant Content\n"];
232
+ for (const doc of docs) {
233
+ const meta = doc.metadata;
234
+ const header = meta.title || `${meta.type} (${doc.id})`;
235
+ sections.push(`### ${header}`);
236
+ if (meta.type) sections.push(`**Type:** ${meta.type}`);
237
+ if (meta.url) sections.push(`**URL:** ${meta.url}`);
238
+ const skipFields = ["type", "title", "url", "sourceUrl", "fetchedAt"];
239
+ const extraMeta = Object.entries(meta).filter(([key]) => !skipFields.includes(key)).map(([key, value]) => `**${this.formatFieldName(key)}:** ${this.formatFieldValue(value)}`);
240
+ if (extraMeta.length > 0) {
241
+ sections.push(extraMeta.join("\n"));
242
+ }
243
+ sections.push("");
244
+ sections.push(doc.content);
245
+ sections.push("");
246
+ }
247
+ return sections.join("\n");
248
+ }
249
+ formatFieldName(key) {
250
+ return key.replace(/([A-Z])/g, " $1").replace(/^./, (s) => s.toUpperCase());
251
+ }
252
+ formatFieldValue(value) {
253
+ if (Array.isArray(value)) return value.join(", ");
254
+ if (value instanceof Date) return value.toLocaleDateString();
255
+ if (typeof value === "object") return JSON.stringify(value);
256
+ return String(value);
257
+ }
258
+ // ============================================================================
259
+ // Vector Search
260
+ // ============================================================================
261
+ async vectorSearch(options) {
262
+ const collection = await this.getCollection();
263
+ const pipeline = [
264
+ {
265
+ $vectorSearch: {
266
+ index: this.config.vectorIndexName,
267
+ path: "embedding",
268
+ queryVector: options.queryVector,
269
+ numCandidates: this.config.numCandidates,
270
+ limit: this.config.limit * 2,
271
+ // Fetch more for post-filtering
272
+ filter: options.hardFilters
273
+ }
274
+ },
275
+ {
276
+ $addFields: {
277
+ score: { $meta: "vectorSearchScore" }
278
+ }
279
+ }
280
+ ];
281
+ if (this.config.minScore) {
282
+ pipeline.push({
283
+ $match: { score: { $gte: this.config.minScore } }
284
+ });
285
+ }
286
+ pipeline.push({ $limit: this.config.limit * 2 });
287
+ const results = await collection.aggregate(pipeline).toArray();
288
+ return results;
289
+ }
290
+ // ============================================================================
291
+ // Embedding Generation
292
+ // ============================================================================
293
+ async generateEmbedding(text) {
294
+ const cacheConfig = this.config.cache?.embeddings;
295
+ if (cacheConfig?.enabled) {
296
+ const cached = this.embeddingCache.get(text);
297
+ const ttl = cacheConfig.ttl ?? 36e5;
298
+ if (cached && Date.now() - cached.timestamp < ttl) {
299
+ this.cacheStats.hits++;
300
+ return cached.value;
301
+ }
302
+ }
303
+ this.cacheStats.misses++;
304
+ const response = await this.openai.embeddings.create({
305
+ model: this.config.embeddingModel,
306
+ input: text
307
+ });
308
+ const embedding = response.data[0].embedding;
309
+ if (cacheConfig?.enabled) {
310
+ const maxSize = cacheConfig.maxSize ?? 1e3;
311
+ if (this.embeddingCache.size >= maxSize) {
312
+ const firstKey = this.embeddingCache.keys().next().value;
313
+ if (firstKey) this.embeddingCache.delete(firstKey);
314
+ }
315
+ this.embeddingCache.set(text, { value: embedding, timestamp: Date.now() });
316
+ }
317
+ return embedding;
318
+ }
319
+ async generateEmbeddingsBatch(texts) {
320
+ const embeddings = [];
321
+ for (const text of texts) {
322
+ const embedding = await this.generateEmbedding(text);
323
+ embeddings.push(embedding);
324
+ }
325
+ return embeddings;
326
+ }
327
+ // ============================================================================
328
+ // Document Ingestion
329
+ // ============================================================================
330
+ /**
331
+ * Ingest documents into the CMS RAG system
332
+ */
333
+ async ingest(documents, options) {
334
+ const collection = await this.getCollection();
335
+ let indexed = 0;
336
+ const errors = [];
337
+ const batchSize = options?.batchSize ?? 10;
338
+ for (let i = 0; i < documents.length; i += batchSize) {
339
+ const batch = documents.slice(i, i + batchSize);
340
+ const embeddings = await this.generateEmbeddingsBatch(
341
+ batch.map((doc) => doc.content)
342
+ );
343
+ const docsToStore = batch.map((doc, idx) => ({
344
+ id: doc.id,
345
+ content: doc.content,
346
+ metadata: {
347
+ type: doc.metadata?.type || "content",
348
+ ...doc.metadata
349
+ },
350
+ tenantId: this.config.tenantId,
351
+ // Use 'shared' marker for tenant-wide content, specific agentId for agent-only
352
+ agentId: options?.agentId || "shared",
353
+ embedding: embeddings[idx]
354
+ }));
355
+ for (const doc of docsToStore) {
356
+ try {
357
+ const filter = {
358
+ tenantId: this.config.tenantId,
359
+ id: doc.id,
360
+ // Match by agentId ('shared' for tenant-wide, specific for agent-only)
361
+ agentId: options?.agentId || "shared"
362
+ };
363
+ await collection.updateOne(
364
+ filter,
365
+ {
366
+ $set: { ...doc, updatedAt: /* @__PURE__ */ new Date() },
367
+ $setOnInsert: { createdAt: /* @__PURE__ */ new Date() }
368
+ },
369
+ { upsert: true }
370
+ );
371
+ indexed++;
372
+ } catch (error) {
373
+ errors.push({
374
+ id: doc.id,
375
+ error: error instanceof Error ? error.message : "Unknown error"
376
+ });
377
+ }
378
+ }
379
+ }
380
+ return {
381
+ success: errors.length === 0,
382
+ indexed,
383
+ failed: errors.length,
384
+ errors: errors.length > 0 ? errors : void 0,
385
+ metadata: {
386
+ tenantId: this.config.tenantId,
387
+ collection: this.config.collection
388
+ }
389
+ };
390
+ }
391
+ /**
392
+ * Update a single document
393
+ */
394
+ async update(id, document, options) {
395
+ const collection = await this.getCollection();
396
+ const update = { updatedAt: /* @__PURE__ */ new Date() };
397
+ if (document.content) {
398
+ const embedding = await this.generateEmbedding(document.content);
399
+ update.content = document.content;
400
+ update.embedding = embedding;
401
+ }
402
+ if (document.metadata) {
403
+ for (const [key, value] of Object.entries(document.metadata)) {
404
+ update[`metadata.${key}`] = value;
405
+ }
406
+ }
407
+ const filter = {
408
+ tenantId: this.config.tenantId,
409
+ id,
410
+ // Match by agentId ('shared' for tenant-wide, specific for agent-only)
411
+ agentId: options?.agentId || "shared"
412
+ };
413
+ await collection.updateOne(filter, { $set: update });
414
+ }
415
+ /**
416
+ * Delete document(s) by ID
417
+ */
418
+ async delete(ids, options) {
419
+ const collection = await this.getCollection();
420
+ const idArray = Array.isArray(ids) ? ids : [ids];
421
+ const filter = {
422
+ tenantId: this.config.tenantId,
423
+ id: { $in: idArray },
424
+ // Match by agentId ('shared' for tenant-wide, specific for agent-only)
425
+ agentId: options?.agentId || "shared"
426
+ };
427
+ const result = await collection.deleteMany(filter);
428
+ return result.deletedCount;
429
+ }
430
+ /**
431
+ * Bulk operations
432
+ */
433
+ async bulk(operations, options) {
434
+ let inserted = 0;
435
+ let updated = 0;
436
+ let deleted = 0;
437
+ let failed = 0;
438
+ const errors = [];
439
+ for (const op of operations) {
440
+ try {
441
+ switch (op.type) {
442
+ case "insert":
443
+ if (op.document) {
444
+ await this.ingest([op.document], options);
445
+ inserted++;
446
+ }
447
+ break;
448
+ case "update":
449
+ if (op.document) {
450
+ await this.update(op.id, op.document, options);
451
+ updated++;
452
+ }
453
+ break;
454
+ case "delete":
455
+ const count = await this.delete(op.id, options);
456
+ deleted += count;
457
+ break;
458
+ }
459
+ } catch (error) {
460
+ failed++;
461
+ errors.push({
462
+ id: op.id,
463
+ operation: op.type,
464
+ error: error.message || "Unknown error"
465
+ });
466
+ }
467
+ }
468
+ return {
469
+ success: failed === 0,
470
+ inserted,
471
+ updated,
472
+ deleted,
473
+ failed,
474
+ errors: errors.length > 0 ? errors : void 0
475
+ };
476
+ }
477
+ // ============================================================================
478
+ // URL Ingestion
479
+ // ============================================================================
480
+ /**
481
+ * Ingest content from a URL (JSON, CSV, XML, or API)
482
+ */
483
+ async ingestFromUrl(source, options) {
484
+ try {
485
+ const controller = new AbortController();
486
+ const timeoutId = setTimeout(() => controller.abort(), source.timeout || 3e4);
487
+ const response = await fetch(source.url, {
488
+ headers: {
489
+ ...source.headers,
490
+ ...source.auth && this.buildAuthHeaders(source.auth)
491
+ },
492
+ signal: controller.signal
493
+ });
494
+ clearTimeout(timeoutId);
495
+ if (!response.ok) {
496
+ throw new Error(`HTTP error: ${response.status} ${response.statusText}`);
497
+ }
498
+ let documents;
499
+ if (source.type === "json" || source.type === "api") {
500
+ const data = await response.json();
501
+ documents = this.transformJsonToDocuments(data, source.transform);
502
+ } else if (source.type === "csv") {
503
+ const data = await response.text();
504
+ documents = this.transformCsvToDocuments(data, source.transform);
505
+ } else if (source.type === "xml") {
506
+ const data = await response.text();
507
+ documents = this.transformXmlToDocuments(data, source.transform);
508
+ } else {
509
+ throw new Error(`Unsupported source type: ${source.type}`);
510
+ }
511
+ documents = documents.map((doc) => ({
512
+ ...doc,
513
+ metadata: {
514
+ ...doc.metadata,
515
+ ...source.metadata,
516
+ sourceUrl: source.url,
517
+ fetchedAt: (/* @__PURE__ */ new Date()).toISOString()
518
+ }
519
+ }));
520
+ const ingestResult = await this.ingest(documents, options);
521
+ return {
522
+ ...ingestResult,
523
+ sourceUrl: source.url,
524
+ fetchedAt: /* @__PURE__ */ new Date(),
525
+ documentsFetched: documents.length
526
+ };
527
+ } catch (error) {
528
+ return {
529
+ success: false,
530
+ indexed: 0,
531
+ failed: 0,
532
+ sourceUrl: source.url,
533
+ fetchedAt: /* @__PURE__ */ new Date(),
534
+ documentsFetched: 0,
535
+ errors: [{
536
+ id: "fetch",
537
+ error: error instanceof Error ? error.message : "Unknown error"
538
+ }]
539
+ };
540
+ }
541
+ }
542
+ buildAuthHeaders(auth) {
543
+ if (!auth) return {};
544
+ switch (auth.type) {
545
+ case "bearer":
546
+ return auth.token ? { Authorization: `Bearer ${auth.token}` } : {};
547
+ case "basic":
548
+ if (auth.username && auth.password) {
549
+ const encoded = Buffer.from(`${auth.username}:${auth.password}`).toString("base64");
550
+ return { Authorization: `Basic ${encoded}` };
551
+ }
552
+ return {};
553
+ case "api-key":
554
+ return auth.header && auth.key ? { [auth.header]: auth.key } : {};
555
+ case "custom":
556
+ return auth.headers || {};
557
+ default:
558
+ return {};
559
+ }
560
+ }
561
+ transformJsonToDocuments(data, transform) {
562
+ let items = data;
563
+ if (transform?.documentPath) {
564
+ items = this.extractByPath(data, transform.documentPath);
565
+ }
566
+ if (!Array.isArray(items)) {
567
+ items = [items];
568
+ }
569
+ const fieldMapping = transform?.fieldMapping || {};
570
+ return items.map((item, index) => {
571
+ const metadata = {};
572
+ for (const [targetField, sourcePath] of Object.entries(fieldMapping)) {
573
+ if (targetField === "id" || targetField === "content") continue;
574
+ if (typeof sourcePath === "function") {
575
+ metadata[targetField] = sourcePath();
576
+ } else if (sourcePath) {
577
+ metadata[targetField] = this.extractField(item, sourcePath);
578
+ }
579
+ }
580
+ if (!metadata.type) {
581
+ metadata.type = "content";
582
+ }
583
+ return {
584
+ id: this.extractField(item, fieldMapping.id || "id") || `doc-${index}`,
585
+ content: this.extractField(item, fieldMapping.content || "content") || JSON.stringify(item),
586
+ metadata
587
+ };
588
+ });
589
+ }
590
+ transformCsvToDocuments(csvData, transform) {
591
+ const lines = csvData.trim().split("\n");
592
+ if (lines.length < 2) return [];
593
+ const headers = this.parseCsvLine(lines[0]);
594
+ return lines.slice(1).map((line, index) => {
595
+ const values = this.parseCsvLine(line);
596
+ const item = headers.reduce((acc, header, i) => {
597
+ acc[header] = values[i] || "";
598
+ return acc;
599
+ }, {});
600
+ return this.transformJsonToDocuments([item], transform)[0];
601
+ });
602
+ }
603
+ parseCsvLine(line) {
604
+ const result = [];
605
+ let current = "";
606
+ let inQuotes = false;
607
+ for (const char of line) {
608
+ if (char === '"') {
609
+ inQuotes = !inQuotes;
610
+ } else if (char === "," && !inQuotes) {
611
+ result.push(current.trim());
612
+ current = "";
613
+ } else {
614
+ current += char;
615
+ }
616
+ }
617
+ result.push(current.trim());
618
+ return result;
619
+ }
620
+ transformXmlToDocuments(xmlData, transform) {
621
+ const items = [];
622
+ const itemPath = transform?.documentPath || "item";
623
+ const itemRegex = new RegExp(`<${itemPath}[^>]*>([\\s\\S]*?)<\\/${itemPath}>`, "gi");
624
+ let match;
625
+ while ((match = itemRegex.exec(xmlData)) !== null) {
626
+ const itemXml = match[1];
627
+ const item = {};
628
+ const tagRegex = /<(\w+)[^>]*>([^<]*)<\/\1>/g;
629
+ let tagMatch;
630
+ while ((tagMatch = tagRegex.exec(itemXml)) !== null) {
631
+ item[tagMatch[1]] = tagMatch[2].trim();
632
+ }
633
+ items.push(item);
634
+ }
635
+ return this.transformJsonToDocuments(items, transform);
636
+ }
637
+ extractByPath(obj, path2) {
638
+ const parts = path2.split(".");
639
+ let current = obj;
640
+ for (const part of parts) {
641
+ if (current == null) return void 0;
642
+ const arrayMatch = part.match(/^(\w+)\[(\d+)\]$/);
643
+ if (arrayMatch) {
644
+ current = current[arrayMatch[1]]?.[parseInt(arrayMatch[2])];
645
+ } else {
646
+ current = current[part];
647
+ }
648
+ }
649
+ return current;
650
+ }
651
+ extractField(item, path2) {
652
+ return this.extractByPath(item, path2);
653
+ }
654
+ // ============================================================================
655
+ // Drupal JSON:API Integration
656
+ // ============================================================================
657
+ /**
658
+ * Ingest content from a Drupal site using JSON:API
659
+ */
660
+ async ingestFromDrupal(config, options) {
661
+ const results = [];
662
+ for (const contentType of config.contentTypes) {
663
+ const url = `${config.baseUrl}/jsonapi/node/${contentType}`;
664
+ const mapping = config.mappings?.[contentType];
665
+ const result = await this.ingestFromUrl(
666
+ {
667
+ url,
668
+ type: "json",
669
+ auth: config.auth,
670
+ transform: {
671
+ documentPath: "data",
672
+ fieldMapping: {
673
+ id: "id",
674
+ content: mapping?.content || "attributes.body.processed",
675
+ type: () => contentType,
676
+ title: "attributes.title",
677
+ url: "attributes.path.alias",
678
+ ...mapping?.fields
679
+ }
680
+ }
681
+ },
682
+ options
683
+ );
684
+ results.push(result);
685
+ }
686
+ return results;
687
+ }
688
+ /**
689
+ * Parse Drupal JSON:API node type (e.g., 'node--project' → 'project')
690
+ */
691
+ static parseDrupalType(type) {
692
+ return type.replace(/^node--/, "");
693
+ }
694
+ // ============================================================================
695
+ // WordPress REST API Integration
696
+ // ============================================================================
697
+ /**
698
+ * Ingest content from a WordPress site using REST API
699
+ *
700
+ * @example
701
+ * ```typescript
702
+ * await plugin.ingestFromWordPress({
703
+ * baseUrl: 'https://myblog.com',
704
+ * postTypes: ['posts', 'pages'],
705
+ * perPage: 100,
706
+ * });
707
+ * ```
708
+ */
709
+ async ingestFromWordPress(config, options) {
710
+ const results = [];
711
+ const postTypes = config.postTypes || ["posts", "pages"];
712
+ const perPage = config.perPage || 100;
713
+ const maxPages = config.maxPages || 10;
714
+ for (const postType of postTypes) {
715
+ let page = 1;
716
+ let hasMore = true;
717
+ while (hasMore && page <= maxPages) {
718
+ const url = `${config.baseUrl}/wp-json/wp/v2/${postType}?per_page=${perPage}&page=${page}&_embed`;
719
+ const mapping = config.mappings?.[postType];
720
+ try {
721
+ const result = await this.ingestFromUrl(
722
+ {
723
+ url,
724
+ type: "json",
725
+ auth: config.auth,
726
+ transform: {
727
+ fieldMapping: {
728
+ id: "id",
729
+ content: mapping?.content || "content.rendered",
730
+ type: () => this.normalizeWordPressType(postType),
731
+ title: "title.rendered",
732
+ url: "link",
733
+ slug: "slug",
734
+ publishedAt: "date",
735
+ modifiedAt: "modified",
736
+ author: "_embedded.author.0.name",
737
+ featuredImage: "_embedded.wp:featuredmedia.0.source_url",
738
+ excerpt: "excerpt.rendered",
739
+ categories: "_embedded.wp:term.0",
740
+ tags: "_embedded.wp:term.1",
741
+ ...mapping?.fields
742
+ }
743
+ }
744
+ },
745
+ options
746
+ );
747
+ results.push(result);
748
+ hasMore = result.documentsFetched === perPage;
749
+ page++;
750
+ } catch (error) {
751
+ hasMore = false;
752
+ }
753
+ }
754
+ }
755
+ return results;
756
+ }
757
+ /**
758
+ * Normalize WordPress post type to a cleaner name
759
+ */
760
+ normalizeWordPressType(postType) {
761
+ if (postType.endsWith("s")) {
762
+ return postType.slice(0, -1);
763
+ }
764
+ return postType;
765
+ }
766
+ // ============================================================================
767
+ // Sanity.io Integration
768
+ // ============================================================================
769
+ /**
770
+ * Ingest content from a Sanity.io project using GROQ queries
771
+ *
772
+ * @example
773
+ * ```typescript
774
+ * await plugin.ingestFromSanity({
775
+ * projectId: 'abc123',
776
+ * dataset: 'production',
777
+ * queries: {
778
+ * post: {
779
+ * query: '*[_type == "post" && !(_id in path("drafts.**"))]',
780
+ * content: 'body',
781
+ * fields: {
782
+ * author: 'author->name',
783
+ * categories: 'categories[]->title',
784
+ * },
785
+ * },
786
+ * },
787
+ * });
788
+ * ```
789
+ */
790
+ async ingestFromSanity(config, options) {
791
+ const results = [];
792
+ const apiVersion = config.apiVersion || "v2024-01-01";
793
+ const useCdn = config.useCdn !== false;
794
+ const baseUrl = useCdn ? `https://${config.projectId}.apicdn.sanity.io/${apiVersion}` : `https://${config.projectId}.api.sanity.io/${apiVersion}`;
795
+ for (const [contentType, queryConfig] of Object.entries(config.queries)) {
796
+ const encodedQuery = encodeURIComponent(queryConfig.query);
797
+ const url = `${baseUrl}/data/query/${config.dataset}?query=${encodedQuery}`;
798
+ const headers = {};
799
+ if (config.token) {
800
+ headers["Authorization"] = `Bearer ${config.token}`;
801
+ }
802
+ const result = await this.ingestFromUrl(
803
+ {
804
+ url,
805
+ type: "json",
806
+ headers,
807
+ transform: {
808
+ documentPath: "result",
809
+ fieldMapping: {
810
+ id: "_id",
811
+ content: queryConfig.content,
812
+ type: () => contentType,
813
+ title: "title",
814
+ slug: "slug.current",
815
+ publishedAt: "publishedAt",
816
+ updatedAt: "_updatedAt",
817
+ ...queryConfig.fields
818
+ }
819
+ }
820
+ },
821
+ options
822
+ );
823
+ results.push(result);
824
+ }
825
+ return results;
826
+ }
827
+ /**
828
+ * Convert Sanity Portable Text blocks to plain text
829
+ * Useful for extracting content from rich text fields
830
+ */
831
+ static sanityBlocksToText(blocks) {
832
+ if (!Array.isArray(blocks)) return "";
833
+ return blocks.filter((block) => block._type === "block").map((block) => {
834
+ if (!block.children) return "";
835
+ return block.children.map((child) => child.text || "").join("");
836
+ }).join("\n\n");
837
+ }
838
+ // ============================================================================
839
+ // Strapi Integration
840
+ // ============================================================================
841
+ /**
842
+ * Ingest content from a Strapi CMS (v4 by default)
843
+ *
844
+ * @example
845
+ * ```typescript
846
+ * await plugin.ingestFromStrapi({
847
+ * baseUrl: 'https://my-strapi.com',
848
+ * apiToken: process.env.STRAPI_TOKEN,
849
+ * contentTypes: ['articles', 'pages'],
850
+ * mappings: {
851
+ * articles: {
852
+ * content: 'attributes.content',
853
+ * fields: {
854
+ * author: 'attributes.author.data.attributes.name',
855
+ * category: 'attributes.category.data.attributes.name',
856
+ * },
857
+ * },
858
+ * },
859
+ * });
860
+ * ```
861
+ */
862
+ async ingestFromStrapi(config, options) {
863
+ const results = [];
864
+ const pageSize = config.pageSize || 100;
865
+ const maxPages = config.maxPages || 10;
866
+ for (const contentType of config.contentTypes) {
867
+ let page = 1;
868
+ let hasMore = true;
869
+ const mapping = config.mappings?.[contentType];
870
+ const useAttributes = mapping?.useAttributes !== false;
871
+ while (hasMore && page <= maxPages) {
872
+ const url = `${config.baseUrl}/api/${contentType}?pagination[page]=${page}&pagination[pageSize]=${pageSize}&populate=*`;
873
+ const headers = {};
874
+ if (config.apiToken) {
875
+ headers["Authorization"] = `Bearer ${config.apiToken}`;
876
+ }
877
+ try {
878
+ const result = await this.ingestFromUrl(
879
+ {
880
+ url,
881
+ type: "json",
882
+ headers,
883
+ transform: {
884
+ documentPath: "data",
885
+ fieldMapping: useAttributes ? {
886
+ // Strapi v4 format (with attributes)
887
+ id: "id",
888
+ content: mapping?.content || "attributes.content",
889
+ type: () => this.normalizeStrapiType(contentType),
890
+ title: "attributes.title",
891
+ slug: "attributes.slug",
892
+ publishedAt: "attributes.publishedAt",
893
+ updatedAt: "attributes.updatedAt",
894
+ ...mapping?.fields
895
+ } : {
896
+ // Strapi v3 format (flat)
897
+ id: "id",
898
+ content: mapping?.content || "content",
899
+ type: () => this.normalizeStrapiType(contentType),
900
+ title: "title",
901
+ slug: "slug",
902
+ publishedAt: "published_at",
903
+ updatedAt: "updated_at",
904
+ ...mapping?.fields
905
+ }
906
+ }
907
+ },
908
+ options
909
+ );
910
+ results.push(result);
911
+ hasMore = result.documentsFetched === pageSize;
912
+ page++;
913
+ } catch (error) {
914
+ hasMore = false;
915
+ }
916
+ }
917
+ }
918
+ return results;
919
+ }
920
+ /**
921
+ * Normalize Strapi collection type to singular form
922
+ */
923
+ normalizeStrapiType(collectionType) {
924
+ if (collectionType.endsWith("s")) {
925
+ return collectionType.slice(0, -1);
926
+ }
927
+ return collectionType;
928
+ }
929
+ // ============================================================================
930
+ // Web Crawling - Zero Setup for Non-Technical Clients
931
+ // ============================================================================
932
+ /**
933
+ * Ingest content by crawling a website's sitemap
934
+ * Perfect for non-technical clients - just provide the sitemap URL
935
+ *
936
+ * @example
937
+ * ```typescript
938
+ * // Simple usage - just provide the sitemap
939
+ * await plugin.ingestFromSitemap({
940
+ * sitemapUrl: 'https://my-site/sitemap.xml',
941
+ * });
942
+ *
943
+ * // Or auto-discover sitemap from base URL
944
+ * await plugin.ingestFromSitemap({
945
+ * baseUrl: 'https://my-site',
946
+ * });
947
+ *
948
+ * // With content selectors and type inference
949
+ * await plugin.ingestFromSitemap({
950
+ * sitemapUrl: 'https://my-site/sitemap.xml',
951
+ * contentSelector: 'article, .main-content',
952
+ * excludePatterns: ['/cart', '/checkout', '/admin'],
953
+ * typeFromUrl: {
954
+ * '/projects/': 'project',
955
+ * '/perspectives/': 'blog',
956
+ * '/people/': 'team',
957
+ * },
958
+ * });
959
+ * ```
960
+ */
961
+ async ingestFromSitemap(config, options) {
962
+ const maxPages = config.maxPages ?? 100;
963
+ const concurrency = config.concurrency ?? 3;
964
+ const delayMs = config.delayMs ?? 500;
965
+ let sitemapUrl = config.sitemapUrl;
966
+ if (!sitemapUrl && config.baseUrl) {
967
+ sitemapUrl = `${config.baseUrl.replace(/\/$/, "")}/sitemap.xml`;
968
+ }
969
+ if (!sitemapUrl) {
970
+ return {
971
+ success: false,
972
+ indexed: 0,
973
+ failed: 0,
974
+ urlsCrawled: 0,
975
+ urlsSkipped: 0,
976
+ urlsFailed: 0,
977
+ crawledAt: /* @__PURE__ */ new Date(),
978
+ errors: [{ id: "config", error: "Either sitemapUrl or baseUrl is required" }]
979
+ };
980
+ }
981
+ const urls = await this.parseSitemap(sitemapUrl, config);
982
+ let filteredUrls = urls;
983
+ if (config.includePatterns?.length) {
984
+ filteredUrls = filteredUrls.filter(
985
+ (url) => config.includePatterns.some((pattern) => url.includes(pattern))
986
+ );
987
+ }
988
+ if (config.excludePatterns?.length) {
989
+ filteredUrls = filteredUrls.filter(
990
+ (url) => !config.excludePatterns.some((pattern) => url.includes(pattern))
991
+ );
992
+ }
993
+ const urlsToCrawl = filteredUrls.slice(0, maxPages);
994
+ const urlsSkipped = filteredUrls.length - urlsToCrawl.length;
995
+ const result = await this.crawlUrls(urlsToCrawl, {
996
+ ...config,
997
+ concurrency,
998
+ delayMs
999
+ }, options);
1000
+ return {
1001
+ ...result,
1002
+ urlsSkipped,
1003
+ crawledAt: /* @__PURE__ */ new Date()
1004
+ };
1005
+ }
1006
+ /**
1007
+ * Ingest content from a website that has no sitemap (or sitemap is incomplete).
1008
+ * Discovers internal links from `baseUrl` (BFS) and then crawls the discovered URLs.
1009
+ *
1010
+ * This uses the same extraction pipeline as `ingestFromSitemap()` (via `crawlPage()`).
1011
+ */
1012
+ async ingestFromWebsite(config, options) {
1013
+ const maxPages = config.maxPages ?? 100;
1014
+ const maxDepth = config.maxDepth ?? 3;
1015
+ const concurrency = config.concurrency ?? 3;
1016
+ const delayMs = config.delayMs ?? 500;
1017
+ const timeout = config.timeout ?? 3e4;
1018
+ const stripQueryParams = config.stripQueryParams ?? true;
1019
+ if (!config.baseUrl) {
1020
+ return {
1021
+ success: false,
1022
+ indexed: 0,
1023
+ failed: 0,
1024
+ urlsCrawled: 0,
1025
+ urlsSkipped: 0,
1026
+ urlsFailed: 0,
1027
+ crawledAt: /* @__PURE__ */ new Date(),
1028
+ errors: [{ id: "config", error: "baseUrl is required" }]
1029
+ };
1030
+ }
1031
+ const dbg = this.createDebugCollector(config.debug);
1032
+ const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
1033
+ if (!base) {
1034
+ return {
1035
+ success: false,
1036
+ indexed: 0,
1037
+ failed: 0,
1038
+ urlsCrawled: 0,
1039
+ urlsSkipped: 0,
1040
+ urlsFailed: 0,
1041
+ crawledAt: /* @__PURE__ */ new Date(),
1042
+ errors: [{ id: "config", error: "Invalid baseUrl" }]
1043
+ };
1044
+ }
1045
+ const discoveredSitemaps = await this.discoverSitemaps(base, timeout, dbg);
1046
+ dbg.log("discovery.sitemaps", { baseUrl: base, sitemaps: discoveredSitemaps });
1047
+ let urlsToCrawl = [];
1048
+ let urlsSkipped = 0;
1049
+ for (const sm of discoveredSitemaps) {
1050
+ const urls = await this.parseSitemap(sm, {
1051
+ sitemapUrl: sm,
1052
+ timeout
1053
+ });
1054
+ if (urls.length > 0) {
1055
+ dbg.log("discovery.sitemapParsed", { sitemapUrl: sm, urlCount: urls.length });
1056
+ let filteredUrls = urls;
1057
+ if (config.includePatterns?.length) {
1058
+ filteredUrls = filteredUrls.filter((u) => config.includePatterns.some((p) => u.includes(p)));
1059
+ }
1060
+ if (config.excludePatterns?.length) {
1061
+ filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
1062
+ }
1063
+ urlsToCrawl = filteredUrls.slice(0, maxPages);
1064
+ urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
1065
+ break;
1066
+ }
1067
+ }
1068
+ if (urlsToCrawl.length === 0) {
1069
+ dbg.log("discovery.fallback", { reason: "no_sitemap_urls", method: "link_lookup" });
1070
+ const discovery = await this.discoverInternalUrls({
1071
+ baseUrl: base,
1072
+ maxPages,
1073
+ maxDepth,
1074
+ concurrency,
1075
+ delayMs,
1076
+ timeout,
1077
+ includePatterns: config.includePatterns,
1078
+ excludePatterns: config.excludePatterns,
1079
+ stripQueryParams
1080
+ });
1081
+ urlsToCrawl = discovery.urls;
1082
+ urlsSkipped = discovery.skipped;
1083
+ dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
1084
+ }
1085
+ const result = await this.crawlUrls(urlsToCrawl, {
1086
+ contentSelector: config.contentSelector,
1087
+ titleSelector: config.titleSelector,
1088
+ removeSelectors: config.removeSelectors,
1089
+ concurrency,
1090
+ delayMs,
1091
+ timeout,
1092
+ typeFromUrl: config.typeFromUrl,
1093
+ defaultType: config.defaultType ?? "page",
1094
+ metadata: config.metadata,
1095
+ includePatterns: config.includePatterns,
1096
+ excludePatterns: config.excludePatterns,
1097
+ stripQueryParams,
1098
+ render: config.render,
1099
+ renderOptions: config.renderOptions,
1100
+ debug: config.debug,
1101
+ crawlLedger: config.crawlLedger
1102
+ }, options);
1103
+ return {
1104
+ ...result,
1105
+ urlsSkipped,
1106
+ crawledAt: /* @__PURE__ */ new Date(),
1107
+ metadata: {
1108
+ ...result.metadata || {},
1109
+ discoveryDebug: dbg.summary()
1110
+ }
1111
+ };
1112
+ }
1113
+ /**
1114
+ * Parse sitemap XML and extract URLs
1115
+ */
1116
+ async parseSitemap(sitemapUrl, config) {
1117
+ const urls = [];
1118
+ try {
1119
+ const response = await fetch(sitemapUrl, {
1120
+ headers: { "User-Agent": "SnapAgent-CMS-Crawler/1.0" },
1121
+ signal: AbortSignal.timeout(config.timeout || 3e4)
1122
+ });
1123
+ if (!response.ok) {
1124
+ console.error(`Failed to fetch sitemap: ${response.status}`);
1125
+ return urls;
1126
+ }
1127
+ const xml = await response.text();
1128
+ if (xml.includes("<sitemapindex")) {
1129
+ const sitemapUrls = this.extractUrlsFromXml(xml, "sitemap", "loc");
1130
+ for (const subSitemapUrl of sitemapUrls.slice(0, 10)) {
1131
+ const subUrls = await this.parseSitemap(subSitemapUrl, config);
1132
+ urls.push(...subUrls);
1133
+ }
1134
+ } else {
1135
+ const pageUrls = this.extractUrlsFromXml(xml, "url", "loc");
1136
+ urls.push(...pageUrls);
1137
+ }
1138
+ } catch (error) {
1139
+ console.error(`Error parsing sitemap ${sitemapUrl}:`, error);
1140
+ }
1141
+ return urls;
1142
+ }
1143
+ /**
1144
+ * Extract URLs from sitemap XML
1145
+ */
1146
+ extractUrlsFromXml(xml, parentTag, urlTag) {
1147
+ const urls = [];
1148
+ const regex = new RegExp(`<${parentTag}[^>]*>[\\s\\S]*?<${urlTag}>([^<]+)<\\/${urlTag}>[\\s\\S]*?<\\/${parentTag}>`, "gi");
1149
+ let match;
1150
+ while ((match = regex.exec(xml)) !== null) {
1151
+ const url = match[1].trim();
1152
+ if (url.startsWith("http")) {
1153
+ urls.push(url);
1154
+ }
1155
+ }
1156
+ return urls;
1157
+ }
1158
+ async discoverInternalUrls(input) {
1159
+ const start = this.normalizeWebsiteUrl(input.baseUrl, input.stripQueryParams);
1160
+ if (!start) return { urls: [], skipped: 0 };
1161
+ const startUrl = new URL(start);
1162
+ const visited = /* @__PURE__ */ new Set();
1163
+ const queue = [{ url: startUrl.toString(), depth: 0 }];
1164
+ const discovered = [];
1165
+ let skipped = 0;
1166
+ while (queue.length > 0 && discovered.length < input.maxPages) {
1167
+ const batch = queue.splice(0, input.concurrency);
1168
+ const results = await Promise.allSettled(
1169
+ batch.map(async ({ url, depth }) => {
1170
+ if (visited.has(url)) return { url, depth, links: [] };
1171
+ visited.add(url);
1172
+ if (depth > input.maxDepth) return { url, depth, links: [] };
1173
+ if (input.includePatterns?.length && !input.includePatterns.some((p) => url.includes(p))) {
1174
+ skipped++;
1175
+ return { url, depth, links: [] };
1176
+ }
1177
+ if (input.excludePatterns?.length && input.excludePatterns.some((p) => url.includes(p))) {
1178
+ skipped++;
1179
+ return { url, depth, links: [] };
1180
+ }
1181
+ discovered.push(url);
1182
+ if (discovered.length >= input.maxPages) return { url, depth, links: [] };
1183
+ try {
1184
+ const html = await this.fetchHtml(url, input.timeout);
1185
+ if (!html) return { url, depth, links: [] };
1186
+ const links = this.extractInternalLinks(html, startUrl, input.stripQueryParams);
1187
+ return { url, depth, links };
1188
+ } catch {
1189
+ return { url, depth, links: [] };
1190
+ }
1191
+ })
1192
+ );
1193
+ for (const r of results) {
1194
+ if (r.status !== "fulfilled") continue;
1195
+ const { depth, links } = r.value;
1196
+ const nextDepth = depth + 1;
1197
+ if (nextDepth > input.maxDepth) continue;
1198
+ for (const link of links) {
1199
+ if (discovered.length + queue.length >= input.maxPages * 3) continue;
1200
+ if (visited.has(link)) continue;
1201
+ queue.push({ url: link, depth: nextDepth });
1202
+ }
1203
+ }
1204
+ if (queue.length > 0 && discovered.length < input.maxPages) {
1205
+ await this.delay(input.delayMs);
1206
+ }
1207
+ }
1208
+ if (discovered.length >= input.maxPages) {
1209
+ skipped += queue.length;
1210
+ }
1211
+ return { urls: discovered.slice(0, input.maxPages), skipped };
1212
+ }
1213
+ normalizeWebsiteUrl(inputUrl, stripQueryParams) {
1214
+ try {
1215
+ const u = new URL(inputUrl);
1216
+ u.hash = "";
1217
+ if (stripQueryParams) u.search = "";
1218
+ return u.toString();
1219
+ } catch {
1220
+ return null;
1221
+ }
1222
+ }
1223
+ async fetchHtml(url, timeout) {
1224
+ const response = await fetch(url, {
1225
+ headers: {
1226
+ "User-Agent": "SnapAgent-CMS-Crawler/1.0",
1227
+ "Accept": "text/html,application/xhtml+xml"
1228
+ },
1229
+ signal: AbortSignal.timeout(timeout)
1230
+ });
1231
+ if (!response.ok) return null;
1232
+ const contentType = response.headers.get("content-type") || "";
1233
+ if (!contentType.includes("text/html")) return null;
1234
+ return await response.text();
1235
+ }
1236
+ extractInternalLinks(html, base, stripQueryParams) {
1237
+ const $ = cheerio.load(html);
1238
+ const links = /* @__PURE__ */ new Set();
1239
+ $("a[href]").each((_, el) => {
1240
+ const href = ($(el).attr("href") || "").trim();
1241
+ if (!href) return;
1242
+ if (href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) return;
1243
+ try {
1244
+ const u = new URL(href, base);
1245
+ if (u.origin !== base.origin) return;
1246
+ u.hash = "";
1247
+ if (stripQueryParams) u.search = "";
1248
+ links.add(u.toString());
1249
+ } catch {
1250
+ }
1251
+ });
1252
+ return Array.from(links);
1253
+ }
1254
+ /**
1255
+ * Ingest content from a list of URLs
1256
+ *
1257
+ * @example
1258
+ * ```typescript
1259
+ * await plugin.ingestFromUrls([
1260
+ * 'https://example.com/about',
1261
+ * 'https://example.com/services',
1262
+ * 'https://example.com/contact',
1263
+ * ], {
1264
+ * contentSelector: '.page-content',
1265
+ * type: 'page',
1266
+ * });
1267
+ * ```
1268
+ */
1269
+ async ingestFromUrls(urls, config = {}, options) {
1270
+ return this.crawlUrls(urls, {
1271
+ contentSelector: config.contentSelector,
1272
+ titleSelector: config.titleSelector,
1273
+ removeSelectors: config.removeSelectors,
1274
+ concurrency: config.concurrency ?? 3,
1275
+ delayMs: config.delayMs ?? 500,
1276
+ timeout: config.timeout ?? 3e4,
1277
+ typeFromUrl: config.typeFromUrl,
1278
+ defaultType: config.type || "page",
1279
+ metadata: config.metadata,
1280
+ stripQueryParams: config.stripQueryParams ?? false,
1281
+ render: config.render,
1282
+ renderOptions: config.renderOptions,
1283
+ debug: config.debug,
1284
+ crawlLedger: config.crawlLedger
1285
+ }, options);
1286
+ }
1287
+ /**
1288
+ * Ingest a single page from a URL (no sitemap discovery, no link lookup).
1289
+ * Uses the same crawl pipeline (static/render/auto) as other web ingestion methods.
1290
+ */
1291
+ async ingestSinglePageFromUrl(config, options) {
1292
+ if (!config?.url) {
1293
+ return {
1294
+ success: false,
1295
+ indexed: 0,
1296
+ failed: 0,
1297
+ urlsCrawled: 0,
1298
+ urlsSkipped: 0,
1299
+ urlsFailed: 0,
1300
+ crawledAt: /* @__PURE__ */ new Date(),
1301
+ errors: [{ id: "config", error: "url is required" }]
1302
+ };
1303
+ }
1304
+ return this.crawlUrls([config.url], {
1305
+ contentSelector: config.contentSelector,
1306
+ titleSelector: config.titleSelector,
1307
+ removeSelectors: config.removeSelectors,
1308
+ concurrency: 1,
1309
+ delayMs: 0,
1310
+ timeout: config.timeout ?? 3e4,
1311
+ typeFromUrl: config.typeFromUrl,
1312
+ defaultType: config.type || "page",
1313
+ metadata: config.metadata,
1314
+ stripQueryParams: config.stripQueryParams ?? true,
1315
+ render: config.render,
1316
+ renderOptions: config.renderOptions,
1317
+ debug: config.debug,
1318
+ crawlLedger: config.crawlLedger
1319
+ }, options);
1320
+ }
1321
+ /**
1322
+ * Crawl a list of URLs and ingest their content
1323
+ */
1324
+ async crawlUrls(urls, config, options) {
1325
+ const concurrency = config.concurrency ?? 3;
1326
+ const delayMs = config.delayMs ?? 500;
1327
+ const timeout = config.timeout ?? 3e4;
1328
+ const renderMode = config.render ?? false;
1329
+ const renderOptions = config.renderOptions || {};
1330
+ const minContentLength = renderOptions.minContentLength ?? 200;
1331
+ const dbg = this.createDebugCollector(config.debug);
1332
+ const ledgerOpts = this.resolveCrawlLedgerOptions(config);
1333
+ const forceRecrawl = !!(options && options.forceRecrawl);
1334
+ const agentId = options?.agentId ?? "shared";
1335
+ const stripQ = config.stripQueryParams ?? false;
1336
+ const urlByNorm = /* @__PURE__ */ new Map();
1337
+ for (const u of urls) {
1338
+ const norm = this.normalizeLedgerUrl(u, stripQ) || u;
1339
+ if (!urlByNorm.has(norm)) urlByNorm.set(norm, u);
1340
+ }
1341
+ const uniqueUrls = Array.from(urlByNorm.values());
1342
+ const counters = {
1343
+ staticOk: 0,
1344
+ renderOk: 0,
1345
+ renderFallbacks: 0,
1346
+ nonHtml: 0,
1347
+ tooSmall: 0,
1348
+ blockedSuspected: 0,
1349
+ renderErrors: 0,
1350
+ ledgerSkipped: 0
1351
+ };
1352
+ let indexed = 0;
1353
+ let urlsCrawled = 0;
1354
+ let urlsFailed = 0;
1355
+ const errors = [];
1356
+ const documents = [];
1357
+ const pageStatuses = [];
1358
+ const maxStatuses = ledgerOpts?.maxPageStatuses ?? 500;
1359
+ for (let i = 0; i < uniqueUrls.length; i += concurrency) {
1360
+ const batch = uniqueUrls.slice(i, i + concurrency);
1361
+ const results = await Promise.allSettled(
1362
+ batch.map(async (url) => {
1363
+ const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
1364
+ if (ledgerOpts && !forceRecrawl) {
1365
+ const entry = await this.findLedgerEntry(urlNormalized, agentId);
1366
+ if (this.shouldSkipLedger(
1367
+ entry,
1368
+ ledgerOpts.ttlMsIndexed,
1369
+ ledgerOpts.ttlMsFailure,
1370
+ ledgerOpts.ttlMsRenderError,
1371
+ false
1372
+ )) {
1373
+ counters.ledgerSkipped++;
1374
+ this.pushPageStatus(pageStatuses, maxStatuses, {
1375
+ url,
1376
+ urlNormalized,
1377
+ status: "skipped_ledger",
1378
+ skippedReason: `fresh:${entry?.lastStatus}`,
1379
+ contentLength: entry?.contentLength,
1380
+ title: entry?.title,
1381
+ docId: entry?.docId
1382
+ });
1383
+ dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
1384
+ return { kind: "ledger_skip", url };
1385
+ }
1386
+ }
1387
+ try {
1388
+ const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
1389
+ renderMode,
1390
+ renderOptions,
1391
+ minContentLength,
1392
+ dbg
1393
+ });
1394
+ if (diag?.modeUsed === "static_ok") counters.staticOk++;
1395
+ if (diag?.modeUsed === "render_ok") counters.renderOk++;
1396
+ if (diag?.modeUsed === "render_fallback_ok") counters.renderFallbacks++;
1397
+ if (diag?.reason === "non_html") counters.nonHtml++;
1398
+ if (diag?.reason === "too_small") counters.tooSmall++;
1399
+ if (diag?.reason === "blocked_suspected") counters.blockedSuspected++;
1400
+ if (diag?.reason === "render_error") counters.renderErrors++;
1401
+ const crawlSt = this.toLedgerStatus(doc, diag);
1402
+ if (ledgerOpts) {
1403
+ await this.upsertLedgerRecord({
1404
+ url,
1405
+ urlNormalized,
1406
+ agentId,
1407
+ status: crawlSt,
1408
+ doc,
1409
+ diag
1410
+ });
1411
+ }
1412
+ this.pushPageStatus(pageStatuses, maxStatuses, {
1413
+ url,
1414
+ urlNormalized,
1415
+ status: crawlSt,
1416
+ modeUsed: diag?.modeUsed,
1417
+ contentLength: doc?.content?.length,
1418
+ bodyTextLengthHint,
1419
+ title: doc?.metadata?.title,
1420
+ docId: doc?.id,
1421
+ error: diag?.errorMessage
1422
+ });
1423
+ return { kind: "doc", doc, url };
1424
+ } catch (error) {
1425
+ const msg = error instanceof Error ? error.message : String(error);
1426
+ if (ledgerOpts) {
1427
+ await this.upsertLedgerRecord({
1428
+ url,
1429
+ urlNormalized,
1430
+ agentId,
1431
+ status: "error",
1432
+ errorMessage: msg
1433
+ });
1434
+ }
1435
+ this.pushPageStatus(pageStatuses, maxStatuses, {
1436
+ url,
1437
+ urlNormalized,
1438
+ status: "error",
1439
+ error: msg
1440
+ });
1441
+ throw { url, error };
1442
+ }
1443
+ })
1444
+ );
1445
+ for (const result of results) {
1446
+ if (result.status === "fulfilled") {
1447
+ const v = result.value;
1448
+ if (v && typeof v === "object" && "kind" in v && v.kind === "ledger_skip") {
1449
+ continue;
1450
+ }
1451
+ if (v && typeof v === "object" && "kind" in v && v.kind === "doc" && v.doc) {
1452
+ documents.push(v.doc);
1453
+ urlsCrawled++;
1454
+ }
1455
+ } else if (result.status === "rejected") {
1456
+ urlsFailed++;
1457
+ errors.push({
1458
+ id: result.reason.url || "unknown",
1459
+ error: result.reason.error?.message || "Failed to crawl"
1460
+ });
1461
+ }
1462
+ }
1463
+ if (i + concurrency < uniqueUrls.length) {
1464
+ await this.delay(delayMs);
1465
+ }
1466
+ }
1467
+ if (documents.length > 0) {
1468
+ const ingestResult = await this.ingest(documents, options);
1469
+ indexed = ingestResult.indexed;
1470
+ if (ingestResult.errors) {
1471
+ errors.push(...ingestResult.errors);
1472
+ }
1473
+ }
1474
+ return {
1475
+ success: errors.length === 0,
1476
+ indexed,
1477
+ failed: errors.length,
1478
+ urlsCrawled,
1479
+ urlsSkipped: 0,
1480
+ urlsFailed,
1481
+ crawledAt: /* @__PURE__ */ new Date(),
1482
+ errors: errors.length > 0 ? errors : void 0,
1483
+ metadata: {
1484
+ counters,
1485
+ pageStatuses,
1486
+ debug: dbg.summary()
1487
+ }
1488
+ };
1489
+ }
1490
+ /**
1491
+ * Crawl a single page and extract content
1492
+ */
1493
+ async crawlPage(url, config, timeout) {
1494
+ const response = await fetch(url, {
1495
+ headers: {
1496
+ "User-Agent": "SnapAgent-CMS-Crawler/1.0",
1497
+ "Accept": "text/html,application/xhtml+xml"
1498
+ },
1499
+ signal: AbortSignal.timeout(timeout)
1500
+ });
1501
+ if (!response.ok) {
1502
+ throw new Error(`HTTP ${response.status}`);
1503
+ }
1504
+ const contentType = response.headers.get("content-type") || "";
1505
+ if (!contentType.includes("text/html")) {
1506
+ return null;
1507
+ }
1508
+ const html = await response.text();
1509
+ return this.extractDocumentFromHtml(url, html, config);
1510
+ }
1511
+ /**
1512
+ * Default chain works for many WordPress / Elementor / block themes where `.first()`
1513
+ * would otherwise hit an empty wrapper.
1514
+ */
1515
+ static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
1516
+ stripNoiseFromDom($, config) {
1517
+ const removeSelectors = config.removeSelectors || [
1518
+ "script",
1519
+ "style",
1520
+ "nav",
1521
+ "header",
1522
+ "footer",
1523
+ ".sidebar",
1524
+ ".navigation",
1525
+ ".menu",
1526
+ ".comments",
1527
+ '[role="navigation"]',
1528
+ '[role="banner"]'
1529
+ ];
1530
+ removeSelectors.forEach((selector) => $(selector).remove());
1531
+ }
1532
+ /** Longest cleaned text among selector matches and full body (after noise strip). */
1533
+ extractBestContentText($, config) {
1534
+ const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
1535
+ const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
1536
+ let best = "";
1537
+ for (const sel of selectors) {
1538
+ $(sel).each((_, el) => {
1539
+ const t = this.cleanContent($(el).text().trim());
1540
+ if (t.length > best.length) best = t;
1541
+ });
1542
+ }
1543
+ const bodyText = this.cleanContent($("body").text().trim());
1544
+ if (bodyText.length > best.length) best = bodyText;
1545
+ return best;
1546
+ }
1547
+ bodyTextLengthHint(html, config) {
1548
+ const $ = cheerio.load(html);
1549
+ this.stripNoiseFromDom($, config);
1550
+ return this.cleanContent($("body").text().trim()).length;
1551
+ }
1552
+ extractDocumentFromHtml(url, html, config) {
1553
+ const $ = cheerio.load(html);
1554
+ this.stripNoiseFromDom($, config);
1555
+ const titleSelector = config.titleSelector || "h1, title";
1556
+ let title = $(titleSelector).first().text().trim();
1557
+ if (!title) {
1558
+ title = $("title").text().trim();
1559
+ }
1560
+ const content = this.extractBestContentText($, config);
1561
+ const minChars = config.minExtractedContentLength ?? 50;
1562
+ if (!content || content.length < minChars) return null;
1563
+ let type = config.defaultType || "page";
1564
+ if (config.typeFromUrl) {
1565
+ for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
1566
+ if (url.includes(pattern)) {
1567
+ type = typeName;
1568
+ break;
1569
+ }
1570
+ }
1571
+ }
1572
+ const id = this.urlToId(url);
1573
+ return {
1574
+ id,
1575
+ content,
1576
+ metadata: {
1577
+ type,
1578
+ title,
1579
+ url,
1580
+ ...config.metadata
1581
+ }
1582
+ };
1583
+ }
1584
+ looksLikeDynamicShell(html) {
1585
+ const lower = html.toLowerCase();
1586
+ const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
1587
+ const body = bodyMatch?.[1] ?? html;
1588
+ const textOnly = body.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
1589
+ const scriptCount = (body.match(/<script\b/gi) ?? []).length;
1590
+ const hasEmptyAppMountNode = /<(div|main)[^>]+id=["'](__next|root|app)["'][^>]*>\s*<\/\1>/i.test(body);
1591
+ const hasHydrationData = lower.includes("__next_data__") || lower.includes("__next_f") || lower.includes("window.__initial_state__") || lower.includes("window.__apollo_state__") || lower.includes("data-reactroot");
1592
+ const asksForJavascript = lower.includes("please enable javascript") || lower.includes("enable javascript to run this app") || lower.includes("you need to enable javascript");
1593
+ const hasLoadingHints = /\b(loading|please wait|spinner|initializing|fetching)\b/i.test(lower);
1594
+ const textLength = textOnly.length;
1595
+ const htmlLength = lower.length;
1596
+ const contentDensity = textLength / Math.max(htmlLength, 1);
1597
+ const isMostlyScripts = scriptCount >= 5 && textLength < 500;
1598
+ const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
1599
+ return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
1600
+ }
1601
+ diagFromRenderedAttempt(doc, bodyTextLengthHint, renderFailure, blockedSuspected, modeOk, modeFailed) {
1602
+ if (blockedSuspected) {
1603
+ return {
1604
+ doc: null,
1605
+ diag: { modeUsed: modeFailed, reason: "blocked_suspected" }
1606
+ };
1607
+ }
1608
+ if (renderFailure) {
1609
+ return {
1610
+ doc: null,
1611
+ diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure }
1612
+ };
1613
+ }
1614
+ return {
1615
+ doc,
1616
+ diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
1617
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint
1618
+ };
1619
+ }
1620
+ async crawlPageSmart(url, config, timeout, ctx) {
1621
+ if (ctx.renderMode === true) {
1622
+ const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
1623
+ url,
1624
+ config,
1625
+ timeout,
1626
+ ctx.renderOptions,
1627
+ ctx.dbg
1628
+ );
1629
+ return this.diagFromRenderedAttempt(
1630
+ doc,
1631
+ bodyTextLengthHint,
1632
+ renderFailure,
1633
+ blockedSuspected,
1634
+ "render_ok",
1635
+ "render_failed"
1636
+ );
1637
+ }
1638
+ try {
1639
+ const response = await fetch(url, {
1640
+ headers: {
1641
+ "User-Agent": "SnapAgent-CMS-Crawler/1.0",
1642
+ "Accept": "text/html,application/xhtml+xml"
1643
+ },
1644
+ signal: AbortSignal.timeout(timeout)
1645
+ });
1646
+ if (!response.ok) {
1647
+ const status = response.status;
1648
+ if (status === 403 || status === 429 || status === 503) {
1649
+ ctx.dbg.log("crawl.blocked", { url, status });
1650
+ return { doc: null, diag: { modeUsed: "static_failed", reason: "blocked_suspected" } };
1651
+ }
1652
+ throw new Error(`HTTP ${status}`);
1653
+ }
1654
+ const contentType = response.headers.get("content-type") || "";
1655
+ if (!contentType.includes("text/html")) {
1656
+ return { doc: null, diag: { modeUsed: "static_failed", reason: "non_html" } };
1657
+ }
1658
+ const html = await response.text();
1659
+ const doc = this.extractDocumentFromHtml(url, html, config);
1660
+ const staticHint = !doc ? this.bodyTextLengthHint(html, config) : void 0;
1661
+ if (doc && doc.content.length >= ctx.minContentLength) {
1662
+ return { doc, diag: { modeUsed: "static_ok" } };
1663
+ }
1664
+ if (ctx.renderMode === "auto") {
1665
+ const shouldRender = this.looksLikeDynamicShell(html) || !doc || doc.content.length < ctx.minContentLength;
1666
+ if (shouldRender) {
1667
+ ctx.dbg.log("crawl.renderFallback", {
1668
+ url,
1669
+ reason: !doc ? "no_doc" : "too_small",
1670
+ staticLength: doc?.content?.length ?? 0
1671
+ });
1672
+ const {
1673
+ doc: rendered,
1674
+ bodyTextLengthHint: rHint,
1675
+ renderFailure,
1676
+ blockedSuspected
1677
+ } = await this.crawlPageRendered(
1678
+ url,
1679
+ config,
1680
+ timeout,
1681
+ ctx.renderOptions,
1682
+ ctx.dbg
1683
+ );
1684
+ const mergedHint = rHint ?? staticHint;
1685
+ const fb = this.diagFromRenderedAttempt(
1686
+ rendered,
1687
+ mergedHint,
1688
+ renderFailure,
1689
+ blockedSuspected,
1690
+ "render_fallback_ok",
1691
+ "render_fallback_failed"
1692
+ );
1693
+ if (!rendered && (renderFailure || blockedSuspected)) {
1694
+ fb.bodyTextLengthHint = staticHint ?? rHint;
1695
+ }
1696
+ return fb;
1697
+ }
1698
+ }
1699
+ return {
1700
+ doc: null,
1701
+ diag: { modeUsed: "static_failed", reason: "too_small" },
1702
+ bodyTextLengthHint: staticHint
1703
+ };
1704
+ } catch (e) {
1705
+ throw e;
1706
+ }
1707
+ }
1708
+ async crawlPageRendered(url, config, timeout, renderOptions, dbg) {
1709
+ let playwright;
1710
+ try {
1711
+ playwright = await Function('return import("playwright")')();
1712
+ } catch (e) {
1713
+ dbg.log("render.missingDependency", { url, error: "playwright_not_installed" });
1714
+ throw new Error("playwright is not installed. Add it to dependencies to use crawlPageRendered().");
1715
+ }
1716
+ const waitUntil = renderOptions.waitUntil || "domcontentloaded";
1717
+ const waitForSelector = renderOptions.waitForSelector;
1718
+ const scrollCfg = renderOptions.scroll || {};
1719
+ const doScroll = scrollCfg.enabled ?? false;
1720
+ const maxScrolls = scrollCfg.maxScrolls ?? 10;
1721
+ const scrollDelayMs = scrollCfg.scrollDelayMs ?? 750;
1722
+ const stableIterations = scrollCfg.stableIterations ?? 2;
1723
+ const postRenderDelayMs = renderOptions.postRenderDelayMs ?? 0;
1724
+ const browser = await playwright.chromium.launch({ headless: true });
1725
+ try {
1726
+ const page = await browser.newPage();
1727
+ await page.goto(url, { waitUntil, timeout });
1728
+ if (waitForSelector) {
1729
+ await page.waitForSelector(waitForSelector, { timeout });
1730
+ }
1731
+ if (postRenderDelayMs > 0) {
1732
+ await page.waitForTimeout(postRenderDelayMs);
1733
+ }
1734
+ if (doScroll) {
1735
+ let stable = 0;
1736
+ let lastLen = 0;
1737
+ for (let i = 0; i < maxScrolls; i++) {
1738
+ const len = await page.evaluate("(document.body?.innerText || '').length");
1739
+ if (len <= lastLen + 20) stable++;
1740
+ else stable = 0;
1741
+ lastLen = len;
1742
+ if (stable >= stableIterations) break;
1743
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)");
1744
+ await page.waitForTimeout(scrollDelayMs);
1745
+ }
1746
+ }
1747
+ const html = await page.content();
1748
+ const bodyTextLengthHint = this.bodyTextLengthHint(html, config);
1749
+ const doc = this.extractDocumentFromHtml(url, html, config);
1750
+ if (config.debug?.saveDir && config.debug?.enabled) {
1751
+ try {
1752
+ const saveDir = config.debug.saveDir;
1753
+ const safeId = this.urlToId(url) || "page";
1754
+ const outDir = path.join(saveDir, safeId);
1755
+ fs.mkdirSync(outDir, { recursive: true });
1756
+ fs.writeFileSync(path.join(outDir, "rendered.html"), html, "utf8");
1757
+ fs.writeFileSync(path.join(outDir, "extracted.txt"), doc?.content || "", "utf8");
1758
+ fs.writeFileSync(path.join(outDir, "meta.json"), JSON.stringify(doc?.metadata || {}, null, 2), "utf8");
1759
+ } catch (e) {
1760
+ dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
1761
+ }
1762
+ }
1763
+ return { doc, bodyTextLengthHint };
1764
+ } catch (e) {
1765
+ const msg = String(e?.message || e || "render_failed");
1766
+ const lower = msg.toLowerCase();
1767
+ if (lower.includes("captcha") || lower.includes("access denied")) {
1768
+ dbg.log("render.blocked", { url, error: msg });
1769
+ return { doc: null, bodyTextLengthHint: 0, blockedSuspected: true };
1770
+ }
1771
+ dbg.log("render.error", { url, error: msg });
1772
+ return { doc: null, bodyTextLengthHint: 0, renderFailure: msg };
1773
+ } finally {
1774
+ await browser.close();
1775
+ }
1776
+ }
1777
+ async discoverSitemaps(baseUrl, timeout, dbg) {
1778
+ const base = new URL(baseUrl);
1779
+ const robotsUrl = new URL("/robots.txt", base).toString();
1780
+ const found = /* @__PURE__ */ new Set();
1781
+ try {
1782
+ const res = await fetch(robotsUrl, {
1783
+ headers: { "User-Agent": "SnapAgent-CMS-Crawler/1.0" },
1784
+ signal: AbortSignal.timeout(timeout)
1785
+ });
1786
+ if (res.ok) {
1787
+ const txt = await res.text();
1788
+ const rx = /^sitemap:\s*(\S+)/gim;
1789
+ let m;
1790
+ while ((m = rx.exec(txt)) !== null) {
1791
+ const sm = m[1].trim();
1792
+ if (sm.startsWith("http")) found.add(sm);
1793
+ }
1794
+ dbg.log("discovery.robots", { robotsUrl, ok: true, sitemapCount: found.size });
1795
+ } else {
1796
+ dbg.log("discovery.robots", { robotsUrl, ok: false, status: res.status });
1797
+ }
1798
+ } catch (e) {
1799
+ dbg.log("discovery.robots", { robotsUrl, ok: false, error: e instanceof Error ? e.message : "failed" });
1800
+ }
1801
+ if (found.size === 0) {
1802
+ const candidates = [
1803
+ "/sitemap.xml",
1804
+ "/sitemap_index.xml",
1805
+ "/sitemap-index.xml",
1806
+ "/wp-sitemap.xml"
1807
+ ].map((p) => new URL(p, base).toString());
1808
+ candidates.forEach((c) => found.add(c));
1809
+ dbg.log("discovery.sitemapCandidates", { count: candidates.length });
1810
+ }
1811
+ return Array.from(found);
1812
+ }
1813
+ createDebugCollector(debug) {
1814
+ const enabled = !!debug?.enabled;
1815
+ const level = debug?.level || "summary";
1816
+ const maxPerUrlLogs = debug?.maxPerUrlLogs ?? 200;
1817
+ const entries = [];
1818
+ return {
1819
+ log: (event, data) => {
1820
+ if (!enabled) return;
1821
+ if (level === "summary" && !event.startsWith("discovery.") && !event.startsWith("crawl.")) return;
1822
+ if (entries.length >= maxPerUrlLogs) return;
1823
+ entries.push({ ts: (/* @__PURE__ */ new Date()).toISOString(), event, data });
1824
+ },
1825
+ summary: () => enabled ? { enabled, level, entries } : void 0
1826
+ };
1827
+ }
1828
+ /**
1829
+ * Clean extracted text content
1830
+ */
1831
+ cleanContent(text) {
1832
+ return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
1833
+ }
1834
+ /**
1835
+ * Convert URL to a stable document ID
1836
+ */
1837
+ urlToId(url) {
1838
+ return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
1839
+ }
1840
+ /**
1841
+ * Delay helper
1842
+ */
1843
+ delay(ms) {
1844
+ return new Promise((resolve) => setTimeout(resolve, ms));
1845
+ }
1846
+ // ============================================================================
1847
+ // RSS/Atom Feed Ingestion
1848
+ // ============================================================================
1849
+ /**
1850
+ * Ingest content from an RSS or Atom feed
1851
+ *
1852
+ * @example
1853
+ * ```typescript
1854
+ * // Simple RSS ingestion
1855
+ * await plugin.ingestFromRSS({
1856
+ * feedUrl: 'https://myblog.com/feed/',
1857
+ * });
1858
+ *
1859
+ * // Fetch full page content for each item
1860
+ * await plugin.ingestFromRSS({
1861
+ * feedUrl: 'https://myblog.com/feed/',
1862
+ * fetchFullContent: true,
1863
+ * contentSelector: 'article',
1864
+ * });
1865
+ * ```
1866
+ */
1867
+ async ingestFromRSS(config, options) {
1868
+ try {
1869
+ const response = await fetch(config.feedUrl, {
1870
+ headers: { "User-Agent": "SnapAgent-CMS-Crawler/1.0" },
1871
+ signal: AbortSignal.timeout(3e4)
1872
+ });
1873
+ if (!response.ok) {
1874
+ return {
1875
+ success: false,
1876
+ indexed: 0,
1877
+ failed: 1,
1878
+ urlsCrawled: 0,
1879
+ urlsSkipped: 0,
1880
+ urlsFailed: 1,
1881
+ crawledAt: /* @__PURE__ */ new Date(),
1882
+ errors: [{ id: config.feedUrl, error: `HTTP ${response.status}` }]
1883
+ };
1884
+ }
1885
+ const xml = await response.text();
1886
+ const items = this.parseRSSFeed(xml);
1887
+ if (items.length === 0) {
1888
+ return {
1889
+ success: true,
1890
+ indexed: 0,
1891
+ failed: 0,
1892
+ urlsCrawled: 0,
1893
+ urlsSkipped: 0,
1894
+ urlsFailed: 0,
1895
+ crawledAt: /* @__PURE__ */ new Date()
1896
+ };
1897
+ }
1898
+ const documents = [];
1899
+ const type = config.type || "post";
1900
+ let urlsCrawled = 0;
1901
+ let urlsFailed = 0;
1902
+ const errors = [];
1903
+ for (const item of items) {
1904
+ try {
1905
+ let content = item.content || item.description || "";
1906
+ if (config.fetchFullContent && item.link) {
1907
+ try {
1908
+ const doc = await this.crawlPage(item.link, {
1909
+ contentSelector: config.contentSelector,
1910
+ defaultType: type
1911
+ }, 3e4);
1912
+ if (doc) {
1913
+ content = doc.content;
1914
+ }
1915
+ urlsCrawled++;
1916
+ } catch (error) {
1917
+ urlsFailed++;
1918
+ }
1919
+ }
1920
+ content = this.stripHtml(content);
1921
+ if (content.length < 50) continue;
1922
+ documents.push({
1923
+ id: this.urlToId(item.link || item.guid || `rss-${documents.length}`),
1924
+ content,
1925
+ metadata: {
1926
+ type,
1927
+ title: item.title,
1928
+ url: item.link,
1929
+ publishedAt: item.pubDate,
1930
+ author: item.author,
1931
+ categories: item.categories,
1932
+ ...config.metadata
1933
+ }
1934
+ });
1935
+ } catch (error) {
1936
+ errors.push({
1937
+ id: item.link || "unknown",
1938
+ error: error instanceof Error ? error.message : "Unknown error"
1939
+ });
1940
+ }
1941
+ }
1942
+ let indexed = 0;
1943
+ if (documents.length > 0) {
1944
+ const ingestResult = await this.ingest(documents, options);
1945
+ indexed = ingestResult.indexed;
1946
+ }
1947
+ return {
1948
+ success: errors.length === 0,
1949
+ indexed,
1950
+ failed: errors.length,
1951
+ urlsCrawled,
1952
+ urlsSkipped: 0,
1953
+ urlsFailed,
1954
+ crawledAt: /* @__PURE__ */ new Date(),
1955
+ errors: errors.length > 0 ? errors : void 0
1956
+ };
1957
+ } catch (error) {
1958
+ return {
1959
+ success: false,
1960
+ indexed: 0,
1961
+ failed: 1,
1962
+ urlsCrawled: 0,
1963
+ urlsSkipped: 0,
1964
+ urlsFailed: 0,
1965
+ crawledAt: /* @__PURE__ */ new Date(),
1966
+ errors: [{
1967
+ id: config.feedUrl,
1968
+ error: error instanceof Error ? error.message : "Unknown error"
1969
+ }]
1970
+ };
1971
+ }
1972
+ }
1973
+ /**
1974
+ * Parse RSS/Atom feed XML
1975
+ */
1976
+ parseRSSFeed(xml) {
1977
+ const items = [];
1978
+ const isAtom = xml.includes("<feed") && xml.includes('xmlns="http://www.w3.org/2005/Atom"');
1979
+ if (isAtom) {
1980
+ const entryRegex = /<entry>([\s\S]*?)<\/entry>/gi;
1981
+ let match;
1982
+ while ((match = entryRegex.exec(xml)) !== null) {
1983
+ const entry = match[1];
1984
+ items.push({
1985
+ title: this.extractXmlValue(entry, "title"),
1986
+ link: this.extractAtomLink(entry),
1987
+ guid: this.extractXmlValue(entry, "id"),
1988
+ content: this.extractXmlValue(entry, "content") || this.extractXmlValue(entry, "summary"),
1989
+ pubDate: this.extractXmlValue(entry, "published") || this.extractXmlValue(entry, "updated"),
1990
+ author: this.extractXmlValue(entry, "name"),
1991
+ // Inside <author>
1992
+ categories: this.extractXmlValues(entry, "category", "term")
1993
+ });
1994
+ }
1995
+ } else {
1996
+ const itemRegex = /<item>([\s\S]*?)<\/item>/gi;
1997
+ let match;
1998
+ while ((match = itemRegex.exec(xml)) !== null) {
1999
+ const item = match[1];
2000
+ items.push({
2001
+ title: this.extractXmlValue(item, "title"),
2002
+ link: this.extractXmlValue(item, "link"),
2003
+ guid: this.extractXmlValue(item, "guid"),
2004
+ description: this.extractXmlValue(item, "description"),
2005
+ content: this.extractXmlValue(item, "content:encoded") || this.extractXmlValue(item, "content"),
2006
+ pubDate: this.extractXmlValue(item, "pubDate"),
2007
+ author: this.extractXmlValue(item, "author") || this.extractXmlValue(item, "dc:creator"),
2008
+ categories: this.extractXmlValues(item, "category")
2009
+ });
2010
+ }
2011
+ }
2012
+ return items;
2013
+ }
2014
+ /**
2015
+ * Extract a single value from XML
2016
+ */
2017
+ extractXmlValue(xml, tag) {
2018
+ const cdataRegex = new RegExp(`<${tag}[^>]*><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/${tag}>`, "i");
2019
+ const cdataMatch = xml.match(cdataRegex);
2020
+ if (cdataMatch) {
2021
+ return cdataMatch[1].trim();
2022
+ }
2023
+ const regex = new RegExp(`<${tag}[^>]*>([^<]*)<\\/${tag}>`, "i");
2024
+ const match = xml.match(regex);
2025
+ return match ? match[1].trim() : void 0;
2026
+ }
2027
+ /**
2028
+ * Extract multiple values from XML
2029
+ */
2030
+ extractXmlValues(xml, tag, attr) {
2031
+ const values = [];
2032
+ if (attr) {
2033
+ const regex = new RegExp(`<${tag}[^>]*${attr}="([^"]*)"[^>]*/?>`, "gi");
2034
+ let match;
2035
+ while ((match = regex.exec(xml)) !== null) {
2036
+ values.push(match[1]);
2037
+ }
2038
+ } else {
2039
+ const regex = new RegExp(`<${tag}[^>]*>([^<]*)<\\/${tag}>`, "gi");
2040
+ let match;
2041
+ while ((match = regex.exec(xml)) !== null) {
2042
+ values.push(match[1].trim());
2043
+ }
2044
+ }
2045
+ return values;
2046
+ }
2047
+ /**
2048
+ * Extract link from Atom entry
2049
+ */
2050
+ extractAtomLink(entry) {
2051
+ const alternateMatch = entry.match(/<link[^>]*rel="alternate"[^>]*href="([^"]+)"/i);
2052
+ if (alternateMatch) return alternateMatch[1];
2053
+ const linkMatch = entry.match(/<link[^>]*href="([^"]+)"/i);
2054
+ return linkMatch ? linkMatch[1] : void 0;
2055
+ }
2056
+ /**
2057
+ * Strip HTML tags from content
2058
+ */
2059
+ stripHtml(html) {
2060
+ return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, '"').replace(/&#39;/g, "'").replace(/\s+/g, " ").trim();
2061
+ }
2062
+ // ============================================================================
2063
+ // Utility Methods
2064
+ // ============================================================================
2065
+ /**
2066
+ * Get cache statistics
2067
+ */
2068
+ getCacheStats() {
2069
+ const total = this.cacheStats.hits + this.cacheStats.misses;
2070
+ const hitRate = total > 0 ? (this.cacheStats.hits / total).toFixed(3) : "0.000";
2071
+ return { ...this.cacheStats, hitRate };
2072
+ }
2073
+ /**
2074
+ * Clear embedding cache
2075
+ */
2076
+ clearCache() {
2077
+ this.embeddingCache.clear();
2078
+ this.cacheStats = { hits: 0, misses: 0 };
2079
+ }
2080
+ /**
2081
+ * Get plugin configuration (for persistence)
2082
+ */
2083
+ getConfig() {
2084
+ return {
2085
+ name: this.name,
2086
+ mongoUri: "${MONGODB_URI}",
2087
+ // Reference env var
2088
+ dbName: this.config.dbName,
2089
+ collection: this.config.collection,
2090
+ openaiApiKey: "${OPENAI_API_KEY}",
2091
+ // Reference env var
2092
+ embeddingModel: this.config.embeddingModel,
2093
+ tenantId: this.config.tenantId,
2094
+ vectorIndexName: this.config.vectorIndexName,
2095
+ numCandidates: this.config.numCandidates,
2096
+ limit: this.config.limit,
2097
+ minScore: this.config.minScore,
2098
+ filterableFields: this.config.filterableFields,
2099
+ typeBoosts: this.config.typeBoosts,
2100
+ recencyBoost: this.config.recencyBoost,
2101
+ priority: this.priority
2102
+ };
2103
+ }
2104
+ };
2105
+ export {
2106
+ WebRAGPlugin
2107
+ };