@snap-agent/rag-web 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,2144 @@
1
+ "use strict";
2
// Short aliases for the reflection primitives used by the interop helpers below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Install every entry of `all` on `target` as an enumerable getter.
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

// Mirror the own properties of `from` onto `to` as live getters, skipping
// keys `to` already owns and the single key named by `except`.
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (isCopyable) {
    for (const key of __getOwnPropNames(from)) {
      if (__hasOwnProp.call(to, key) || key === except) {
        continue;
      }
      desc = __getOwnPropDesc(from, key);
      __defProp(to, key, {
        get: () => from[key],
        enumerable: !desc || desc.enumerable
      });
    }
  }
  return to;
};

// Wrap a required module so it can be consumed like an ES module namespace.
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  const base = needsDefault
    ? __defProp(target, "default", { value: mod, enumerable: true })
    : target;
  return __copyProps(base, mod);
};

// Produce the CommonJS export object: an "__esModule"-flagged copy of `mod`.
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
29
+
30
+ // src/index.ts
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ WebRAGPlugin: () => WebRAGPlugin
34
+ });
35
+ module.exports = __toCommonJS(index_exports);
36
+
37
+ // src/WebRAGPlugin.ts
38
+ var import_mongodb = require("mongodb");
39
+ var import_openai = __toESM(require("openai"));
40
+ var cheerio = __toESM(require("cheerio"));
41
+ var fs = __toESM(require("fs"));
42
+ var path = __toESM(require("path"));
43
+ var WebRAGPlugin = class _WebRAGPlugin {
44
+ name = "web-rag";
45
+ type = "rag";
46
+ priority;
47
+ config;
48
+ client = null;
49
+ db = null;
50
+ openai;
51
+ // Embedding cache
52
+ embeddingCache = /* @__PURE__ */ new Map();
53
+ cacheStats = { hits: 0, misses: 0 };
54
+ constructor(config) {
55
+ this.config = {
56
+ collection: "web_content",
57
+ embeddingModel: "text-embedding-3-small",
58
+ vectorIndexName: "web_vector_index",
59
+ numCandidates: 100,
60
+ limit: 10,
61
+ minScore: 0.7,
62
+ filterableFields: ["type"],
63
+ ...config
64
+ };
65
+ this.priority = config.priority ?? 100;
66
+ this.openai = new import_openai.default({ apiKey: config.openaiApiKey });
67
+ }
68
+ // ============================================================================
69
+ // MongoDB Connection
70
+ // ============================================================================
71
+ async getCollection() {
72
+ if (!this.client) {
73
+ this.client = new import_mongodb.MongoClient(this.config.mongoUri);
74
+ await this.client.connect();
75
+ this.db = this.client.db(this.config.dbName);
76
+ }
77
+ return this.db.collection(this.config.collection);
78
+ }
79
+ async getLedgerCollection() {
80
+ if (!this.client) {
81
+ this.client = new import_mongodb.MongoClient(this.config.mongoUri);
82
+ await this.client.connect();
83
+ this.db = this.client.db(this.config.dbName);
84
+ }
85
+ const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
86
+ return this.db.collection(name);
87
+ }
88
+ /**
89
+ * List recent crawl ledger rows (for dashboards / pagination in the front).
90
+ */
91
+ async listCrawlLedger(options = {}) {
92
+ const col = await this.getLedgerCollection();
93
+ const filter = { tenantId: this.config.tenantId };
94
+ filter.agentId = options.agentId ?? "shared";
95
+ if (options.domain) filter.domain = options.domain;
96
+ if (options.status) filter.lastStatus = options.status;
97
+ const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
98
+ const skip = Math.max(options.skip ?? 0, 0);
99
+ return col.find(filter).sort({ lastCrawledAt: -1 }).skip(skip).limit(limit).toArray();
100
+ }
101
+ resolveCrawlLedgerOptions(config) {
102
+ const plugin = this.config.crawlLedger;
103
+ const per = config.crawlLedger;
104
+ const enabled = per?.enabled ?? plugin?.enabled ?? false;
105
+ if (!enabled) return null;
106
+ const ttlMsFailure = per?.ttlMsFailure ?? plugin?.ttlMsFailure ?? 60 * 60 * 1e3;
107
+ return {
108
+ ttlMsIndexed: per?.ttlMsIndexed ?? plugin?.ttlMsIndexed ?? 7 * 24 * 60 * 60 * 1e3,
109
+ ttlMsFailure,
110
+ ttlMsRenderError: per?.ttlMsRenderError ?? plugin?.ttlMsRenderError ?? 5 * 60 * 1e3,
111
+ maxPageStatuses: per?.maxPageStatuses ?? 500,
112
+ stripQuery: config.stripQueryParams ?? false
113
+ };
114
+ }
115
+ normalizeLedgerUrl(url, stripQuery) {
116
+ return this.normalizeWebsiteUrl(url, stripQuery);
117
+ }
118
+ shouldSkipLedger(entry, ttlMsIndexed, ttlMsFailure, ttlMsRenderError, forceRecrawl) {
119
+ if (forceRecrawl || !entry) return false;
120
+ const t = entry.lastCrawledAt instanceof Date ? entry.lastCrawledAt.getTime() : new Date(entry.lastCrawledAt).getTime();
121
+ const age = Date.now() - t;
122
+ if (entry.lastStatus === "indexed" && age < ttlMsIndexed) return true;
123
+ if (entry.lastStatus === "error" && age < ttlMsRenderError) return true;
124
+ if (entry.lastStatus !== "indexed" && entry.lastStatus !== "error" && age < ttlMsFailure) {
125
+ return true;
126
+ }
127
+ return false;
128
+ }
129
+ async findLedgerEntry(urlNormalized, agentId) {
130
+ const col = await this.getLedgerCollection();
131
+ return col.findOne({
132
+ tenantId: this.config.tenantId,
133
+ agentId,
134
+ urlNormalized
135
+ });
136
+ }
137
+ toLedgerStatus(doc, diag) {
138
+ if (doc) return "indexed";
139
+ if (diag?.reason === "non_html") return "non_html";
140
+ if (diag?.reason === "blocked_suspected") return "blocked_suspected";
141
+ if (diag?.reason === "render_error") return "error";
142
+ return "too_small";
143
+ }
144
+ async upsertLedgerRecord(params) {
145
+ const col = await this.getLedgerCollection();
146
+ let domain = "";
147
+ try {
148
+ domain = new URL(params.url).hostname;
149
+ } catch {
150
+ domain = "";
151
+ }
152
+ const now = /* @__PURE__ */ new Date();
153
+ const errMsg = params.errorMessage ?? params.diag?.errorMessage;
154
+ const $set = {
155
+ tenantId: this.config.tenantId,
156
+ agentId: params.agentId,
157
+ urlNormalized: params.urlNormalized,
158
+ url: params.url,
159
+ domain,
160
+ lastStatus: params.status,
161
+ lastCrawledAt: now,
162
+ updatedAt: now
163
+ };
164
+ if (errMsg !== void 0) {
165
+ $set.errorMessage = errMsg;
166
+ } else if (params.status === "indexed" && params.doc) {
167
+ $set.errorMessage = null;
168
+ }
169
+ if (params.doc) {
170
+ $set.modeUsed = params.diag?.modeUsed;
171
+ $set.contentLength = params.doc.content.length;
172
+ $set.title = params.doc.metadata?.title;
173
+ $set.docId = params.doc.id;
174
+ } else {
175
+ $set.modeUsed = params.diag?.modeUsed;
176
+ $set.contentLength = null;
177
+ $set.title = null;
178
+ $set.docId = null;
179
+ }
180
+ await col.updateOne(
181
+ {
182
+ tenantId: this.config.tenantId,
183
+ agentId: params.agentId,
184
+ urlNormalized: params.urlNormalized
185
+ },
186
+ { $set },
187
+ { upsert: true }
188
+ );
189
+ }
190
+ pushPageStatus(list, max, entry) {
191
+ list.push(entry);
192
+ while (list.length > max) list.shift();
193
+ }
194
+ async disconnect() {
195
+ if (this.client) {
196
+ await this.client.close();
197
+ this.client = null;
198
+ this.db = null;
199
+ }
200
+ }
201
+ // ============================================================================
202
+ // RAG Plugin Interface
203
+ // ============================================================================
204
+ /**
205
+ * Retrieve contextual content for a message
206
+ */
207
+ async retrieveContext(message, options = {}) {
208
+ const queryVector = await this.generateEmbedding(message);
209
+ const hardFilters = {
210
+ tenantId: this.config.tenantId,
211
+ ...options.filters
212
+ };
213
+ if (options.agentId) {
214
+ hardFilters.agentId = { $in: ["shared", options.agentId] };
215
+ }
216
+ const results = await this.vectorSearch({
217
+ queryVector,
218
+ hardFilters
219
+ });
220
+ let scoredResults = results;
221
+ if (this.config.typeBoosts) {
222
+ scoredResults = results.map((doc) => ({
223
+ ...doc,
224
+ score: doc.score * (this.config.typeBoosts[doc.metadata.type] ?? 1)
225
+ }));
226
+ }
227
+ if (this.config.recencyBoost?.enabled) {
228
+ const { field, decayDays, maxBoost = 1.2 } = this.config.recencyBoost;
229
+ const now = Date.now();
230
+ const decayMs = decayDays * 24 * 60 * 60 * 1e3;
231
+ scoredResults = scoredResults.map((doc) => {
232
+ const dateValue = doc.metadata[field];
233
+ if (!dateValue) return doc;
234
+ const docDate = new Date(dateValue).getTime();
235
+ const age = now - docDate;
236
+ const freshness = Math.max(0, 1 - age / decayMs);
237
+ const boost = 1 + (maxBoost - 1) * freshness;
238
+ return { ...doc, score: doc.score * boost };
239
+ });
240
+ }
241
+ scoredResults.sort((a, b) => b.score - a.score);
242
+ scoredResults = scoredResults.slice(0, this.config.limit);
243
+ const content = this.formatResultsToContext(scoredResults);
244
+ return {
245
+ content,
246
+ metadata: {
247
+ plugin: this.name,
248
+ contentCount: scoredResults.length,
249
+ types: [...new Set(scoredResults.map((d) => d.metadata.type))],
250
+ topResults: scoredResults.slice(0, 5).map((doc) => ({
251
+ id: doc.id,
252
+ type: doc.metadata.type,
253
+ title: doc.metadata.title,
254
+ url: doc.metadata.url,
255
+ score: doc.score
256
+ }))
257
+ }
258
+ };
259
+ }
260
+ /**
261
+ * Format retrieved content for LLM context
262
+ */
263
+ formatResultsToContext(docs) {
264
+ if (docs.length === 0) {
265
+ return "No relevant content found.";
266
+ }
267
+ const sections = ["## Relevant Content\n"];
268
+ for (const doc of docs) {
269
+ const meta = doc.metadata;
270
+ const header = meta.title || `${meta.type} (${doc.id})`;
271
+ sections.push(`### ${header}`);
272
+ if (meta.type) sections.push(`**Type:** ${meta.type}`);
273
+ if (meta.url) sections.push(`**URL:** ${meta.url}`);
274
+ const skipFields = ["type", "title", "url", "sourceUrl", "fetchedAt"];
275
+ const extraMeta = Object.entries(meta).filter(([key]) => !skipFields.includes(key)).map(([key, value]) => `**${this.formatFieldName(key)}:** ${this.formatFieldValue(value)}`);
276
+ if (extraMeta.length > 0) {
277
+ sections.push(extraMeta.join("\n"));
278
+ }
279
+ sections.push("");
280
+ sections.push(doc.content);
281
+ sections.push("");
282
+ }
283
+ return sections.join("\n");
284
+ }
285
+ formatFieldName(key) {
286
+ return key.replace(/([A-Z])/g, " $1").replace(/^./, (s) => s.toUpperCase());
287
+ }
288
+ formatFieldValue(value) {
289
+ if (Array.isArray(value)) return value.join(", ");
290
+ if (value instanceof Date) return value.toLocaleDateString();
291
+ if (typeof value === "object") return JSON.stringify(value);
292
+ return String(value);
293
+ }
294
+ // ============================================================================
295
+ // Vector Search
296
+ // ============================================================================
297
+ async vectorSearch(options) {
298
+ const collection = await this.getCollection();
299
+ const pipeline = [
300
+ {
301
+ $vectorSearch: {
302
+ index: this.config.vectorIndexName,
303
+ path: "embedding",
304
+ queryVector: options.queryVector,
305
+ numCandidates: this.config.numCandidates,
306
+ limit: this.config.limit * 2,
307
+ // Fetch more for post-filtering
308
+ filter: options.hardFilters
309
+ }
310
+ },
311
+ {
312
+ $addFields: {
313
+ score: { $meta: "vectorSearchScore" }
314
+ }
315
+ }
316
+ ];
317
+ if (this.config.minScore) {
318
+ pipeline.push({
319
+ $match: { score: { $gte: this.config.minScore } }
320
+ });
321
+ }
322
+ pipeline.push({ $limit: this.config.limit * 2 });
323
+ const results = await collection.aggregate(pipeline).toArray();
324
+ return results;
325
+ }
326
+ // ============================================================================
327
+ // Embedding Generation
328
+ // ============================================================================
329
+ async generateEmbedding(text) {
330
+ const cacheConfig = this.config.cache?.embeddings;
331
+ if (cacheConfig?.enabled) {
332
+ const cached = this.embeddingCache.get(text);
333
+ const ttl = cacheConfig.ttl ?? 36e5;
334
+ if (cached && Date.now() - cached.timestamp < ttl) {
335
+ this.cacheStats.hits++;
336
+ return cached.value;
337
+ }
338
+ }
339
+ this.cacheStats.misses++;
340
+ const response = await this.openai.embeddings.create({
341
+ model: this.config.embeddingModel,
342
+ input: text
343
+ });
344
+ const embedding = response.data[0].embedding;
345
+ if (cacheConfig?.enabled) {
346
+ const maxSize = cacheConfig.maxSize ?? 1e3;
347
+ if (this.embeddingCache.size >= maxSize) {
348
+ const firstKey = this.embeddingCache.keys().next().value;
349
+ if (firstKey) this.embeddingCache.delete(firstKey);
350
+ }
351
+ this.embeddingCache.set(text, { value: embedding, timestamp: Date.now() });
352
+ }
353
+ return embedding;
354
+ }
355
+ async generateEmbeddingsBatch(texts) {
356
+ const embeddings = [];
357
+ for (const text of texts) {
358
+ const embedding = await this.generateEmbedding(text);
359
+ embeddings.push(embedding);
360
+ }
361
+ return embeddings;
362
+ }
363
+ // ============================================================================
364
+ // Document Ingestion
365
+ // ============================================================================
366
+ /**
367
+ * Ingest documents into the CMS RAG system
368
+ */
369
+ async ingest(documents, options) {
370
+ const collection = await this.getCollection();
371
+ let indexed = 0;
372
+ const errors = [];
373
+ const batchSize = options?.batchSize ?? 10;
374
+ for (let i = 0; i < documents.length; i += batchSize) {
375
+ const batch = documents.slice(i, i + batchSize);
376
+ const embeddings = await this.generateEmbeddingsBatch(
377
+ batch.map((doc) => doc.content)
378
+ );
379
+ const docsToStore = batch.map((doc, idx) => ({
380
+ id: doc.id,
381
+ content: doc.content,
382
+ metadata: {
383
+ type: doc.metadata?.type || "content",
384
+ ...doc.metadata
385
+ },
386
+ tenantId: this.config.tenantId,
387
+ // Use 'shared' marker for tenant-wide content, specific agentId for agent-only
388
+ agentId: options?.agentId || "shared",
389
+ embedding: embeddings[idx]
390
+ }));
391
+ for (const doc of docsToStore) {
392
+ try {
393
+ const filter = {
394
+ tenantId: this.config.tenantId,
395
+ id: doc.id,
396
+ // Match by agentId ('shared' for tenant-wide, specific for agent-only)
397
+ agentId: options?.agentId || "shared"
398
+ };
399
+ await collection.updateOne(
400
+ filter,
401
+ {
402
+ $set: { ...doc, updatedAt: /* @__PURE__ */ new Date() },
403
+ $setOnInsert: { createdAt: /* @__PURE__ */ new Date() }
404
+ },
405
+ { upsert: true }
406
+ );
407
+ indexed++;
408
+ } catch (error) {
409
+ errors.push({
410
+ id: doc.id,
411
+ error: error instanceof Error ? error.message : "Unknown error"
412
+ });
413
+ }
414
+ }
415
+ }
416
+ return {
417
+ success: errors.length === 0,
418
+ indexed,
419
+ failed: errors.length,
420
+ errors: errors.length > 0 ? errors : void 0,
421
+ metadata: {
422
+ tenantId: this.config.tenantId,
423
+ collection: this.config.collection
424
+ }
425
+ };
426
+ }
427
+ /**
428
+ * Update a single document
429
+ */
430
+ async update(id, document, options) {
431
+ const collection = await this.getCollection();
432
+ const update = { updatedAt: /* @__PURE__ */ new Date() };
433
+ if (document.content) {
434
+ const embedding = await this.generateEmbedding(document.content);
435
+ update.content = document.content;
436
+ update.embedding = embedding;
437
+ }
438
+ if (document.metadata) {
439
+ for (const [key, value] of Object.entries(document.metadata)) {
440
+ update[`metadata.${key}`] = value;
441
+ }
442
+ }
443
+ const filter = {
444
+ tenantId: this.config.tenantId,
445
+ id,
446
+ // Match by agentId ('shared' for tenant-wide, specific for agent-only)
447
+ agentId: options?.agentId || "shared"
448
+ };
449
+ await collection.updateOne(filter, { $set: update });
450
+ }
451
+ /**
452
+ * Delete document(s) by ID
453
+ */
454
+ async delete(ids, options) {
455
+ const collection = await this.getCollection();
456
+ const idArray = Array.isArray(ids) ? ids : [ids];
457
+ const filter = {
458
+ tenantId: this.config.tenantId,
459
+ id: { $in: idArray },
460
+ // Match by agentId ('shared' for tenant-wide, specific for agent-only)
461
+ agentId: options?.agentId || "shared"
462
+ };
463
+ const result = await collection.deleteMany(filter);
464
+ return result.deletedCount;
465
+ }
466
+ /**
467
+ * Bulk operations
468
+ */
469
+ async bulk(operations, options) {
470
+ let inserted = 0;
471
+ let updated = 0;
472
+ let deleted = 0;
473
+ let failed = 0;
474
+ const errors = [];
475
+ for (const op of operations) {
476
+ try {
477
+ switch (op.type) {
478
+ case "insert":
479
+ if (op.document) {
480
+ await this.ingest([op.document], options);
481
+ inserted++;
482
+ }
483
+ break;
484
+ case "update":
485
+ if (op.document) {
486
+ await this.update(op.id, op.document, options);
487
+ updated++;
488
+ }
489
+ break;
490
+ case "delete":
491
+ const count = await this.delete(op.id, options);
492
+ deleted += count;
493
+ break;
494
+ }
495
+ } catch (error) {
496
+ failed++;
497
+ errors.push({
498
+ id: op.id,
499
+ operation: op.type,
500
+ error: error.message || "Unknown error"
501
+ });
502
+ }
503
+ }
504
+ return {
505
+ success: failed === 0,
506
+ inserted,
507
+ updated,
508
+ deleted,
509
+ failed,
510
+ errors: errors.length > 0 ? errors : void 0
511
+ };
512
+ }
513
+ // ============================================================================
514
+ // URL Ingestion
515
+ // ============================================================================
516
+ /**
517
+ * Ingest content from a URL (JSON, CSV, XML, or API)
518
+ */
519
+ async ingestFromUrl(source, options) {
520
+ try {
521
+ const controller = new AbortController();
522
+ const timeoutId = setTimeout(() => controller.abort(), source.timeout || 3e4);
523
+ const response = await fetch(source.url, {
524
+ headers: {
525
+ ...source.headers,
526
+ ...source.auth && this.buildAuthHeaders(source.auth)
527
+ },
528
+ signal: controller.signal
529
+ });
530
+ clearTimeout(timeoutId);
531
+ if (!response.ok) {
532
+ throw new Error(`HTTP error: ${response.status} ${response.statusText}`);
533
+ }
534
+ let documents;
535
+ if (source.type === "json" || source.type === "api") {
536
+ const data = await response.json();
537
+ documents = this.transformJsonToDocuments(data, source.transform);
538
+ } else if (source.type === "csv") {
539
+ const data = await response.text();
540
+ documents = this.transformCsvToDocuments(data, source.transform);
541
+ } else if (source.type === "xml") {
542
+ const data = await response.text();
543
+ documents = this.transformXmlToDocuments(data, source.transform);
544
+ } else {
545
+ throw new Error(`Unsupported source type: ${source.type}`);
546
+ }
547
+ documents = documents.map((doc) => ({
548
+ ...doc,
549
+ metadata: {
550
+ ...doc.metadata,
551
+ ...source.metadata,
552
+ sourceUrl: source.url,
553
+ fetchedAt: (/* @__PURE__ */ new Date()).toISOString()
554
+ }
555
+ }));
556
+ const ingestResult = await this.ingest(documents, options);
557
+ return {
558
+ ...ingestResult,
559
+ sourceUrl: source.url,
560
+ fetchedAt: /* @__PURE__ */ new Date(),
561
+ documentsFetched: documents.length
562
+ };
563
+ } catch (error) {
564
+ return {
565
+ success: false,
566
+ indexed: 0,
567
+ failed: 0,
568
+ sourceUrl: source.url,
569
+ fetchedAt: /* @__PURE__ */ new Date(),
570
+ documentsFetched: 0,
571
+ errors: [{
572
+ id: "fetch",
573
+ error: error instanceof Error ? error.message : "Unknown error"
574
+ }]
575
+ };
576
+ }
577
+ }
578
+ buildAuthHeaders(auth) {
579
+ if (!auth) return {};
580
+ switch (auth.type) {
581
+ case "bearer":
582
+ return auth.token ? { Authorization: `Bearer ${auth.token}` } : {};
583
+ case "basic":
584
+ if (auth.username && auth.password) {
585
+ const encoded = Buffer.from(`${auth.username}:${auth.password}`).toString("base64");
586
+ return { Authorization: `Basic ${encoded}` };
587
+ }
588
+ return {};
589
+ case "api-key":
590
+ return auth.header && auth.key ? { [auth.header]: auth.key } : {};
591
+ case "custom":
592
+ return auth.headers || {};
593
+ default:
594
+ return {};
595
+ }
596
+ }
597
+ transformJsonToDocuments(data, transform) {
598
+ let items = data;
599
+ if (transform?.documentPath) {
600
+ items = this.extractByPath(data, transform.documentPath);
601
+ }
602
+ if (!Array.isArray(items)) {
603
+ items = [items];
604
+ }
605
+ const fieldMapping = transform?.fieldMapping || {};
606
+ return items.map((item, index) => {
607
+ const metadata = {};
608
+ for (const [targetField, sourcePath] of Object.entries(fieldMapping)) {
609
+ if (targetField === "id" || targetField === "content") continue;
610
+ if (typeof sourcePath === "function") {
611
+ metadata[targetField] = sourcePath();
612
+ } else if (sourcePath) {
613
+ metadata[targetField] = this.extractField(item, sourcePath);
614
+ }
615
+ }
616
+ if (!metadata.type) {
617
+ metadata.type = "content";
618
+ }
619
+ return {
620
+ id: this.extractField(item, fieldMapping.id || "id") || `doc-${index}`,
621
+ content: this.extractField(item, fieldMapping.content || "content") || JSON.stringify(item),
622
+ metadata
623
+ };
624
+ });
625
+ }
626
+ transformCsvToDocuments(csvData, transform) {
627
+ const lines = csvData.trim().split("\n");
628
+ if (lines.length < 2) return [];
629
+ const headers = this.parseCsvLine(lines[0]);
630
+ return lines.slice(1).map((line, index) => {
631
+ const values = this.parseCsvLine(line);
632
+ const item = headers.reduce((acc, header, i) => {
633
+ acc[header] = values[i] || "";
634
+ return acc;
635
+ }, {});
636
+ return this.transformJsonToDocuments([item], transform)[0];
637
+ });
638
+ }
639
+ parseCsvLine(line) {
640
+ const result = [];
641
+ let current = "";
642
+ let inQuotes = false;
643
+ for (const char of line) {
644
+ if (char === '"') {
645
+ inQuotes = !inQuotes;
646
+ } else if (char === "," && !inQuotes) {
647
+ result.push(current.trim());
648
+ current = "";
649
+ } else {
650
+ current += char;
651
+ }
652
+ }
653
+ result.push(current.trim());
654
+ return result;
655
+ }
656
+ transformXmlToDocuments(xmlData, transform) {
657
+ const items = [];
658
+ const itemPath = transform?.documentPath || "item";
659
+ const itemRegex = new RegExp(`<${itemPath}[^>]*>([\\s\\S]*?)<\\/${itemPath}>`, "gi");
660
+ let match;
661
+ while ((match = itemRegex.exec(xmlData)) !== null) {
662
+ const itemXml = match[1];
663
+ const item = {};
664
+ const tagRegex = /<(\w+)[^>]*>([^<]*)<\/\1>/g;
665
+ let tagMatch;
666
+ while ((tagMatch = tagRegex.exec(itemXml)) !== null) {
667
+ item[tagMatch[1]] = tagMatch[2].trim();
668
+ }
669
+ items.push(item);
670
+ }
671
+ return this.transformJsonToDocuments(items, transform);
672
+ }
673
+ extractByPath(obj, path2) {
674
+ const parts = path2.split(".");
675
+ let current = obj;
676
+ for (const part of parts) {
677
+ if (current == null) return void 0;
678
+ const arrayMatch = part.match(/^(\w+)\[(\d+)\]$/);
679
+ if (arrayMatch) {
680
+ current = current[arrayMatch[1]]?.[parseInt(arrayMatch[2])];
681
+ } else {
682
+ current = current[part];
683
+ }
684
+ }
685
+ return current;
686
+ }
687
+ extractField(item, path2) {
688
+ return this.extractByPath(item, path2);
689
+ }
690
+ // ============================================================================
691
+ // Drupal JSON:API Integration
692
+ // ============================================================================
693
+ /**
694
+ * Ingest content from a Drupal site using JSON:API
695
+ */
696
+ async ingestFromDrupal(config, options) {
697
+ const results = [];
698
+ for (const contentType of config.contentTypes) {
699
+ const url = `${config.baseUrl}/jsonapi/node/${contentType}`;
700
+ const mapping = config.mappings?.[contentType];
701
+ const result = await this.ingestFromUrl(
702
+ {
703
+ url,
704
+ type: "json",
705
+ auth: config.auth,
706
+ transform: {
707
+ documentPath: "data",
708
+ fieldMapping: {
709
+ id: "id",
710
+ content: mapping?.content || "attributes.body.processed",
711
+ type: () => contentType,
712
+ title: "attributes.title",
713
+ url: "attributes.path.alias",
714
+ ...mapping?.fields
715
+ }
716
+ }
717
+ },
718
+ options
719
+ );
720
+ results.push(result);
721
+ }
722
+ return results;
723
+ }
724
+ /**
725
+ * Parse Drupal JSON:API node type (e.g., 'node--project' → 'project')
726
+ */
727
+ static parseDrupalType(type) {
728
+ return type.replace(/^node--/, "");
729
+ }
730
+ // ============================================================================
731
+ // WordPress REST API Integration
732
+ // ============================================================================
733
+ /**
734
+ * Ingest content from a WordPress site using REST API
735
+ *
736
+ * @example
737
+ * ```typescript
738
+ * await plugin.ingestFromWordPress({
739
+ * baseUrl: 'https://myblog.com',
740
+ * postTypes: ['posts', 'pages'],
741
+ * perPage: 100,
742
+ * });
743
+ * ```
744
+ */
745
+ async ingestFromWordPress(config, options) {
746
+ const results = [];
747
+ const postTypes = config.postTypes || ["posts", "pages"];
748
+ const perPage = config.perPage || 100;
749
+ const maxPages = config.maxPages || 10;
750
+ for (const postType of postTypes) {
751
+ let page = 1;
752
+ let hasMore = true;
753
+ while (hasMore && page <= maxPages) {
754
+ const url = `${config.baseUrl}/wp-json/wp/v2/${postType}?per_page=${perPage}&page=${page}&_embed`;
755
+ const mapping = config.mappings?.[postType];
756
+ try {
757
+ const result = await this.ingestFromUrl(
758
+ {
759
+ url,
760
+ type: "json",
761
+ auth: config.auth,
762
+ transform: {
763
+ fieldMapping: {
764
+ id: "id",
765
+ content: mapping?.content || "content.rendered",
766
+ type: () => this.normalizeWordPressType(postType),
767
+ title: "title.rendered",
768
+ url: "link",
769
+ slug: "slug",
770
+ publishedAt: "date",
771
+ modifiedAt: "modified",
772
+ author: "_embedded.author.0.name",
773
+ featuredImage: "_embedded.wp:featuredmedia.0.source_url",
774
+ excerpt: "excerpt.rendered",
775
+ categories: "_embedded.wp:term.0",
776
+ tags: "_embedded.wp:term.1",
777
+ ...mapping?.fields
778
+ }
779
+ }
780
+ },
781
+ options
782
+ );
783
+ results.push(result);
784
+ hasMore = result.documentsFetched === perPage;
785
+ page++;
786
+ } catch (error) {
787
+ hasMore = false;
788
+ }
789
+ }
790
+ }
791
+ return results;
792
+ }
793
+ /**
794
+ * Normalize WordPress post type to a cleaner name
795
+ */
796
+ normalizeWordPressType(postType) {
797
+ if (postType.endsWith("s")) {
798
+ return postType.slice(0, -1);
799
+ }
800
+ return postType;
801
+ }
802
+ // ============================================================================
803
+ // Sanity.io Integration
804
+ // ============================================================================
805
+ /**
806
+ * Ingest content from a Sanity.io project using GROQ queries
807
+ *
808
+ * @example
809
+ * ```typescript
810
+ * await plugin.ingestFromSanity({
811
+ * projectId: 'abc123',
812
+ * dataset: 'production',
813
+ * queries: {
814
+ * post: {
815
+ * query: '*[_type == "post" && !(_id in path("drafts.**"))]',
816
+ * content: 'body',
817
+ * fields: {
818
+ * author: 'author->name',
819
+ * categories: 'categories[]->title',
820
+ * },
821
+ * },
822
+ * },
823
+ * });
824
+ * ```
825
+ */
826
+ async ingestFromSanity(config, options) {
827
+ const results = [];
828
+ const apiVersion = config.apiVersion || "v2024-01-01";
829
+ const useCdn = config.useCdn !== false;
830
+ const baseUrl = useCdn ? `https://${config.projectId}.apicdn.sanity.io/${apiVersion}` : `https://${config.projectId}.api.sanity.io/${apiVersion}`;
831
+ for (const [contentType, queryConfig] of Object.entries(config.queries)) {
832
+ const encodedQuery = encodeURIComponent(queryConfig.query);
833
+ const url = `${baseUrl}/data/query/${config.dataset}?query=${encodedQuery}`;
834
+ const headers = {};
835
+ if (config.token) {
836
+ headers["Authorization"] = `Bearer ${config.token}`;
837
+ }
838
+ const result = await this.ingestFromUrl(
839
+ {
840
+ url,
841
+ type: "json",
842
+ headers,
843
+ transform: {
844
+ documentPath: "result",
845
+ fieldMapping: {
846
+ id: "_id",
847
+ content: queryConfig.content,
848
+ type: () => contentType,
849
+ title: "title",
850
+ slug: "slug.current",
851
+ publishedAt: "publishedAt",
852
+ updatedAt: "_updatedAt",
853
+ ...queryConfig.fields
854
+ }
855
+ }
856
+ },
857
+ options
858
+ );
859
+ results.push(result);
860
+ }
861
+ return results;
862
+ }
863
+ /**
864
+ * Convert Sanity Portable Text blocks to plain text
865
+ * Useful for extracting content from rich text fields
866
+ */
867
+ static sanityBlocksToText(blocks) {
868
+ if (!Array.isArray(blocks)) return "";
869
+ return blocks.filter((block) => block._type === "block").map((block) => {
870
+ if (!block.children) return "";
871
+ return block.children.map((child) => child.text || "").join("");
872
+ }).join("\n\n");
873
+ }
874
+ // ============================================================================
875
+ // Strapi Integration
876
+ // ============================================================================
877
+ /**
878
+ * Ingest content from a Strapi CMS (v4 by default)
879
+ *
880
+ * @example
881
+ * ```typescript
882
+ * await plugin.ingestFromStrapi({
883
+ * baseUrl: 'https://my-strapi.com',
884
+ * apiToken: process.env.STRAPI_TOKEN,
885
+ * contentTypes: ['articles', 'pages'],
886
+ * mappings: {
887
+ * articles: {
888
+ * content: 'attributes.content',
889
+ * fields: {
890
+ * author: 'attributes.author.data.attributes.name',
891
+ * category: 'attributes.category.data.attributes.name',
892
+ * },
893
+ * },
894
+ * },
895
+ * });
896
+ * ```
897
+ */
898
+ async ingestFromStrapi(config, options) {
899
+ const results = [];
900
+ const pageSize = config.pageSize || 100;
901
+ const maxPages = config.maxPages || 10;
902
+ for (const contentType of config.contentTypes) {
903
+ let page = 1;
904
+ let hasMore = true;
905
+ const mapping = config.mappings?.[contentType];
906
+ const useAttributes = mapping?.useAttributes !== false;
907
+ while (hasMore && page <= maxPages) {
908
+ const url = `${config.baseUrl}/api/${contentType}?pagination[page]=${page}&pagination[pageSize]=${pageSize}&populate=*`;
909
+ const headers = {};
910
+ if (config.apiToken) {
911
+ headers["Authorization"] = `Bearer ${config.apiToken}`;
912
+ }
913
+ try {
914
+ const result = await this.ingestFromUrl(
915
+ {
916
+ url,
917
+ type: "json",
918
+ headers,
919
+ transform: {
920
+ documentPath: "data",
921
+ fieldMapping: useAttributes ? {
922
+ // Strapi v4 format (with attributes)
923
+ id: "id",
924
+ content: mapping?.content || "attributes.content",
925
+ type: () => this.normalizeStrapiType(contentType),
926
+ title: "attributes.title",
927
+ slug: "attributes.slug",
928
+ publishedAt: "attributes.publishedAt",
929
+ updatedAt: "attributes.updatedAt",
930
+ ...mapping?.fields
931
+ } : {
932
+ // Strapi v3 format (flat)
933
+ id: "id",
934
+ content: mapping?.content || "content",
935
+ type: () => this.normalizeStrapiType(contentType),
936
+ title: "title",
937
+ slug: "slug",
938
+ publishedAt: "published_at",
939
+ updatedAt: "updated_at",
940
+ ...mapping?.fields
941
+ }
942
+ }
943
+ },
944
+ options
945
+ );
946
+ results.push(result);
947
+ hasMore = result.documentsFetched === pageSize;
948
+ page++;
949
+ } catch (error) {
950
+ hasMore = false;
951
+ }
952
+ }
953
+ }
954
+ return results;
955
+ }
956
+ /**
957
+ * Normalize Strapi collection type to singular form
958
+ */
959
+ normalizeStrapiType(collectionType) {
960
+ if (collectionType.endsWith("s")) {
961
+ return collectionType.slice(0, -1);
962
+ }
963
+ return collectionType;
964
+ }
965
+ // ============================================================================
966
+ // Web Crawling - Zero Setup for Non-Technical Clients
967
+ // ============================================================================
968
+ /**
969
+ * Ingest content by crawling a website's sitemap
970
+ * Perfect for non-technical clients - just provide the sitemap URL
971
+ *
972
+ * @example
973
+ * ```typescript
974
+ * // Simple usage - just provide the sitemap
975
+ * await plugin.ingestFromSitemap({
976
+ * sitemapUrl: 'https://my-site/sitemap.xml',
977
+ * });
978
+ *
979
+ * // Or auto-discover sitemap from base URL
980
+ * await plugin.ingestFromSitemap({
981
+ * baseUrl: 'https://my-site',
982
+ * });
983
+ *
984
+ * // With content selectors and type inference
985
+ * await plugin.ingestFromSitemap({
986
+ * sitemapUrl: 'https://my-site/sitemap.xml',
987
+ * contentSelector: 'article, .main-content',
988
+ * excludePatterns: ['/cart', '/checkout', '/admin'],
989
+ * typeFromUrl: {
990
+ * '/projects/': 'project',
991
+ * '/perspectives/': 'blog',
992
+ * '/people/': 'team',
993
+ * },
994
+ * });
995
+ * ```
996
+ */
997
+ async ingestFromSitemap(config, options) {
998
+ const maxPages = config.maxPages ?? 100;
999
+ const concurrency = config.concurrency ?? 3;
1000
+ const delayMs = config.delayMs ?? 500;
1001
+ let sitemapUrl = config.sitemapUrl;
1002
+ if (!sitemapUrl && config.baseUrl) {
1003
+ sitemapUrl = `${config.baseUrl.replace(/\/$/, "")}/sitemap.xml`;
1004
+ }
1005
+ if (!sitemapUrl) {
1006
+ return {
1007
+ success: false,
1008
+ indexed: 0,
1009
+ failed: 0,
1010
+ urlsCrawled: 0,
1011
+ urlsSkipped: 0,
1012
+ urlsFailed: 0,
1013
+ crawledAt: /* @__PURE__ */ new Date(),
1014
+ errors: [{ id: "config", error: "Either sitemapUrl or baseUrl is required" }]
1015
+ };
1016
+ }
1017
+ const urls = await this.parseSitemap(sitemapUrl, config);
1018
+ let filteredUrls = urls;
1019
+ if (config.includePatterns?.length) {
1020
+ filteredUrls = filteredUrls.filter(
1021
+ (url) => config.includePatterns.some((pattern) => url.includes(pattern))
1022
+ );
1023
+ }
1024
+ if (config.excludePatterns?.length) {
1025
+ filteredUrls = filteredUrls.filter(
1026
+ (url) => !config.excludePatterns.some((pattern) => url.includes(pattern))
1027
+ );
1028
+ }
1029
+ const urlsToCrawl = filteredUrls.slice(0, maxPages);
1030
+ const urlsSkipped = filteredUrls.length - urlsToCrawl.length;
1031
+ const result = await this.crawlUrls(urlsToCrawl, {
1032
+ ...config,
1033
+ concurrency,
1034
+ delayMs
1035
+ }, options);
1036
+ return {
1037
+ ...result,
1038
+ urlsSkipped,
1039
+ crawledAt: /* @__PURE__ */ new Date()
1040
+ };
1041
+ }
1042
+ /**
1043
+ * Ingest content from a website that has no sitemap (or sitemap is incomplete).
1044
+ * Discovers internal links from `baseUrl` (BFS) and then crawls the discovered URLs.
1045
+ *
1046
+ * This uses the same extraction pipeline as `ingestFromSitemap()` (via `crawlPage()`).
1047
+ */
1048
+ async ingestFromWebsite(config, options) {
1049
+ const maxPages = config.maxPages ?? 100;
1050
+ const maxDepth = config.maxDepth ?? 3;
1051
+ const concurrency = config.concurrency ?? 3;
1052
+ const delayMs = config.delayMs ?? 500;
1053
+ const timeout = config.timeout ?? 3e4;
1054
+ const stripQueryParams = config.stripQueryParams ?? true;
1055
+ if (!config.baseUrl) {
1056
+ return {
1057
+ success: false,
1058
+ indexed: 0,
1059
+ failed: 0,
1060
+ urlsCrawled: 0,
1061
+ urlsSkipped: 0,
1062
+ urlsFailed: 0,
1063
+ crawledAt: /* @__PURE__ */ new Date(),
1064
+ errors: [{ id: "config", error: "baseUrl is required" }]
1065
+ };
1066
+ }
1067
+ const dbg = this.createDebugCollector(config.debug);
1068
+ const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
1069
+ if (!base) {
1070
+ return {
1071
+ success: false,
1072
+ indexed: 0,
1073
+ failed: 0,
1074
+ urlsCrawled: 0,
1075
+ urlsSkipped: 0,
1076
+ urlsFailed: 0,
1077
+ crawledAt: /* @__PURE__ */ new Date(),
1078
+ errors: [{ id: "config", error: "Invalid baseUrl" }]
1079
+ };
1080
+ }
1081
+ const discoveredSitemaps = await this.discoverSitemaps(base, timeout, dbg);
1082
+ dbg.log("discovery.sitemaps", { baseUrl: base, sitemaps: discoveredSitemaps });
1083
+ let urlsToCrawl = [];
1084
+ let urlsSkipped = 0;
1085
+ for (const sm of discoveredSitemaps) {
1086
+ const urls = await this.parseSitemap(sm, {
1087
+ sitemapUrl: sm,
1088
+ timeout
1089
+ });
1090
+ if (urls.length > 0) {
1091
+ dbg.log("discovery.sitemapParsed", { sitemapUrl: sm, urlCount: urls.length });
1092
+ let filteredUrls = urls;
1093
+ if (config.includePatterns?.length) {
1094
+ filteredUrls = filteredUrls.filter((u) => config.includePatterns.some((p) => u.includes(p)));
1095
+ }
1096
+ if (config.excludePatterns?.length) {
1097
+ filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
1098
+ }
1099
+ urlsToCrawl = filteredUrls.slice(0, maxPages);
1100
+ urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
1101
+ break;
1102
+ }
1103
+ }
1104
+ if (urlsToCrawl.length === 0) {
1105
+ dbg.log("discovery.fallback", { reason: "no_sitemap_urls", method: "link_lookup" });
1106
+ const discovery = await this.discoverInternalUrls({
1107
+ baseUrl: base,
1108
+ maxPages,
1109
+ maxDepth,
1110
+ concurrency,
1111
+ delayMs,
1112
+ timeout,
1113
+ includePatterns: config.includePatterns,
1114
+ excludePatterns: config.excludePatterns,
1115
+ stripQueryParams
1116
+ });
1117
+ urlsToCrawl = discovery.urls;
1118
+ urlsSkipped = discovery.skipped;
1119
+ dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
1120
+ }
1121
+ const result = await this.crawlUrls(urlsToCrawl, {
1122
+ contentSelector: config.contentSelector,
1123
+ titleSelector: config.titleSelector,
1124
+ removeSelectors: config.removeSelectors,
1125
+ concurrency,
1126
+ delayMs,
1127
+ timeout,
1128
+ typeFromUrl: config.typeFromUrl,
1129
+ defaultType: config.defaultType ?? "page",
1130
+ metadata: config.metadata,
1131
+ includePatterns: config.includePatterns,
1132
+ excludePatterns: config.excludePatterns,
1133
+ stripQueryParams,
1134
+ render: config.render,
1135
+ renderOptions: config.renderOptions,
1136
+ debug: config.debug,
1137
+ crawlLedger: config.crawlLedger
1138
+ }, options);
1139
+ return {
1140
+ ...result,
1141
+ urlsSkipped,
1142
+ crawledAt: /* @__PURE__ */ new Date(),
1143
+ metadata: {
1144
+ ...result.metadata || {},
1145
+ discoveryDebug: dbg.summary()
1146
+ }
1147
+ };
1148
+ }
1149
+ /**
1150
+ * Parse sitemap XML and extract URLs
1151
+ */
1152
+ async parseSitemap(sitemapUrl, config) {
1153
+ const urls = [];
1154
+ try {
1155
+ const response = await fetch(sitemapUrl, {
1156
+ headers: { "User-Agent": "SnapAgent-CMS-Crawler/1.0" },
1157
+ signal: AbortSignal.timeout(config.timeout || 3e4)
1158
+ });
1159
+ if (!response.ok) {
1160
+ console.error(`Failed to fetch sitemap: ${response.status}`);
1161
+ return urls;
1162
+ }
1163
+ const xml = await response.text();
1164
+ if (xml.includes("<sitemapindex")) {
1165
+ const sitemapUrls = this.extractUrlsFromXml(xml, "sitemap", "loc");
1166
+ for (const subSitemapUrl of sitemapUrls.slice(0, 10)) {
1167
+ const subUrls = await this.parseSitemap(subSitemapUrl, config);
1168
+ urls.push(...subUrls);
1169
+ }
1170
+ } else {
1171
+ const pageUrls = this.extractUrlsFromXml(xml, "url", "loc");
1172
+ urls.push(...pageUrls);
1173
+ }
1174
+ } catch (error) {
1175
+ console.error(`Error parsing sitemap ${sitemapUrl}:`, error);
1176
+ }
1177
+ return urls;
1178
+ }
1179
+ /**
1180
+ * Extract URLs from sitemap XML
1181
+ */
1182
+ extractUrlsFromXml(xml, parentTag, urlTag) {
1183
+ const urls = [];
1184
+ const regex = new RegExp(`<${parentTag}[^>]*>[\\s\\S]*?<${urlTag}>([^<]+)<\\/${urlTag}>[\\s\\S]*?<\\/${parentTag}>`, "gi");
1185
+ let match;
1186
+ while ((match = regex.exec(xml)) !== null) {
1187
+ const url = match[1].trim();
1188
+ if (url.startsWith("http")) {
1189
+ urls.push(url);
1190
+ }
1191
+ }
1192
+ return urls;
1193
+ }
1194
+ async discoverInternalUrls(input) {
1195
+ const start = this.normalizeWebsiteUrl(input.baseUrl, input.stripQueryParams);
1196
+ if (!start) return { urls: [], skipped: 0 };
1197
+ const startUrl = new URL(start);
1198
+ const visited = /* @__PURE__ */ new Set();
1199
+ const queue = [{ url: startUrl.toString(), depth: 0 }];
1200
+ const discovered = [];
1201
+ let skipped = 0;
1202
+ while (queue.length > 0 && discovered.length < input.maxPages) {
1203
+ const batch = queue.splice(0, input.concurrency);
1204
+ const results = await Promise.allSettled(
1205
+ batch.map(async ({ url, depth }) => {
1206
+ if (visited.has(url)) return { url, depth, links: [] };
1207
+ visited.add(url);
1208
+ if (depth > input.maxDepth) return { url, depth, links: [] };
1209
+ if (input.includePatterns?.length && !input.includePatterns.some((p) => url.includes(p))) {
1210
+ skipped++;
1211
+ return { url, depth, links: [] };
1212
+ }
1213
+ if (input.excludePatterns?.length && input.excludePatterns.some((p) => url.includes(p))) {
1214
+ skipped++;
1215
+ return { url, depth, links: [] };
1216
+ }
1217
+ discovered.push(url);
1218
+ if (discovered.length >= input.maxPages) return { url, depth, links: [] };
1219
+ try {
1220
+ const html = await this.fetchHtml(url, input.timeout);
1221
+ if (!html) return { url, depth, links: [] };
1222
+ const links = this.extractInternalLinks(html, startUrl, input.stripQueryParams);
1223
+ return { url, depth, links };
1224
+ } catch {
1225
+ return { url, depth, links: [] };
1226
+ }
1227
+ })
1228
+ );
1229
+ for (const r of results) {
1230
+ if (r.status !== "fulfilled") continue;
1231
+ const { depth, links } = r.value;
1232
+ const nextDepth = depth + 1;
1233
+ if (nextDepth > input.maxDepth) continue;
1234
+ for (const link of links) {
1235
+ if (discovered.length + queue.length >= input.maxPages * 3) continue;
1236
+ if (visited.has(link)) continue;
1237
+ queue.push({ url: link, depth: nextDepth });
1238
+ }
1239
+ }
1240
+ if (queue.length > 0 && discovered.length < input.maxPages) {
1241
+ await this.delay(input.delayMs);
1242
+ }
1243
+ }
1244
+ if (discovered.length >= input.maxPages) {
1245
+ skipped += queue.length;
1246
+ }
1247
+ return { urls: discovered.slice(0, input.maxPages), skipped };
1248
+ }
1249
+ normalizeWebsiteUrl(inputUrl, stripQueryParams) {
1250
+ try {
1251
+ const u = new URL(inputUrl);
1252
+ u.hash = "";
1253
+ if (stripQueryParams) u.search = "";
1254
+ return u.toString();
1255
+ } catch {
1256
+ return null;
1257
+ }
1258
+ }
1259
+ async fetchHtml(url, timeout) {
1260
+ const response = await fetch(url, {
1261
+ headers: {
1262
+ "User-Agent": "SnapAgent-CMS-Crawler/1.0",
1263
+ "Accept": "text/html,application/xhtml+xml"
1264
+ },
1265
+ signal: AbortSignal.timeout(timeout)
1266
+ });
1267
+ if (!response.ok) return null;
1268
+ const contentType = response.headers.get("content-type") || "";
1269
+ if (!contentType.includes("text/html")) return null;
1270
+ return await response.text();
1271
+ }
1272
+ extractInternalLinks(html, base, stripQueryParams) {
1273
+ const $ = cheerio.load(html);
1274
+ const links = /* @__PURE__ */ new Set();
1275
+ $("a[href]").each((_, el) => {
1276
+ const href = ($(el).attr("href") || "").trim();
1277
+ if (!href) return;
1278
+ if (href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) return;
1279
+ try {
1280
+ const u = new URL(href, base);
1281
+ if (u.origin !== base.origin) return;
1282
+ u.hash = "";
1283
+ if (stripQueryParams) u.search = "";
1284
+ links.add(u.toString());
1285
+ } catch {
1286
+ }
1287
+ });
1288
+ return Array.from(links);
1289
+ }
1290
+ /**
1291
+ * Ingest content from a list of URLs
1292
+ *
1293
+ * @example
1294
+ * ```typescript
1295
+ * await plugin.ingestFromUrls([
1296
+ * 'https://example.com/about',
1297
+ * 'https://example.com/services',
1298
+ * 'https://example.com/contact',
1299
+ * ], {
1300
+ * contentSelector: '.page-content',
1301
+ * type: 'page',
1302
+ * });
1303
+ * ```
1304
+ */
1305
+ async ingestFromUrls(urls, config = {}, options) {
1306
+ return this.crawlUrls(urls, {
1307
+ contentSelector: config.contentSelector,
1308
+ titleSelector: config.titleSelector,
1309
+ removeSelectors: config.removeSelectors,
1310
+ concurrency: config.concurrency ?? 3,
1311
+ delayMs: config.delayMs ?? 500,
1312
+ timeout: config.timeout ?? 3e4,
1313
+ typeFromUrl: config.typeFromUrl,
1314
+ defaultType: config.type || "page",
1315
+ metadata: config.metadata,
1316
+ stripQueryParams: config.stripQueryParams ?? false,
1317
+ render: config.render,
1318
+ renderOptions: config.renderOptions,
1319
+ debug: config.debug,
1320
+ crawlLedger: config.crawlLedger
1321
+ }, options);
1322
+ }
1323
+ /**
1324
+ * Ingest a single page from a URL (no sitemap discovery, no link lookup).
1325
+ * Uses the same crawl pipeline (static/render/auto) as other web ingestion methods.
1326
+ */
1327
+ async ingestSinglePageFromUrl(config, options) {
1328
+ if (!config?.url) {
1329
+ return {
1330
+ success: false,
1331
+ indexed: 0,
1332
+ failed: 0,
1333
+ urlsCrawled: 0,
1334
+ urlsSkipped: 0,
1335
+ urlsFailed: 0,
1336
+ crawledAt: /* @__PURE__ */ new Date(),
1337
+ errors: [{ id: "config", error: "url is required" }]
1338
+ };
1339
+ }
1340
+ return this.crawlUrls([config.url], {
1341
+ contentSelector: config.contentSelector,
1342
+ titleSelector: config.titleSelector,
1343
+ removeSelectors: config.removeSelectors,
1344
+ concurrency: 1,
1345
+ delayMs: 0,
1346
+ timeout: config.timeout ?? 3e4,
1347
+ typeFromUrl: config.typeFromUrl,
1348
+ defaultType: config.type || "page",
1349
+ metadata: config.metadata,
1350
+ stripQueryParams: config.stripQueryParams ?? true,
1351
+ render: config.render,
1352
+ renderOptions: config.renderOptions,
1353
+ debug: config.debug,
1354
+ crawlLedger: config.crawlLedger
1355
+ }, options);
1356
+ }
1357
+ /**
1358
+ * Crawl a list of URLs and ingest their content
1359
+ */
1360
+ async crawlUrls(urls, config, options) {
1361
+ const concurrency = config.concurrency ?? 3;
1362
+ const delayMs = config.delayMs ?? 500;
1363
+ const timeout = config.timeout ?? 3e4;
1364
+ const renderMode = config.render ?? false;
1365
+ const renderOptions = config.renderOptions || {};
1366
+ const minContentLength = renderOptions.minContentLength ?? 200;
1367
+ const dbg = this.createDebugCollector(config.debug);
1368
+ const ledgerOpts = this.resolveCrawlLedgerOptions(config);
1369
+ const forceRecrawl = !!(options && options.forceRecrawl);
1370
+ const agentId = options?.agentId ?? "shared";
1371
+ const stripQ = config.stripQueryParams ?? false;
1372
+ const urlByNorm = /* @__PURE__ */ new Map();
1373
+ for (const u of urls) {
1374
+ const norm = this.normalizeLedgerUrl(u, stripQ) || u;
1375
+ if (!urlByNorm.has(norm)) urlByNorm.set(norm, u);
1376
+ }
1377
+ const uniqueUrls = Array.from(urlByNorm.values());
1378
+ const counters = {
1379
+ staticOk: 0,
1380
+ renderOk: 0,
1381
+ renderFallbacks: 0,
1382
+ nonHtml: 0,
1383
+ tooSmall: 0,
1384
+ blockedSuspected: 0,
1385
+ renderErrors: 0,
1386
+ ledgerSkipped: 0
1387
+ };
1388
+ let indexed = 0;
1389
+ let urlsCrawled = 0;
1390
+ let urlsFailed = 0;
1391
+ const errors = [];
1392
+ const documents = [];
1393
+ const pageStatuses = [];
1394
+ const maxStatuses = ledgerOpts?.maxPageStatuses ?? 500;
1395
+ for (let i = 0; i < uniqueUrls.length; i += concurrency) {
1396
+ const batch = uniqueUrls.slice(i, i + concurrency);
1397
+ const results = await Promise.allSettled(
1398
+ batch.map(async (url) => {
1399
+ const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
1400
+ if (ledgerOpts && !forceRecrawl) {
1401
+ const entry = await this.findLedgerEntry(urlNormalized, agentId);
1402
+ if (this.shouldSkipLedger(
1403
+ entry,
1404
+ ledgerOpts.ttlMsIndexed,
1405
+ ledgerOpts.ttlMsFailure,
1406
+ ledgerOpts.ttlMsRenderError,
1407
+ false
1408
+ )) {
1409
+ counters.ledgerSkipped++;
1410
+ this.pushPageStatus(pageStatuses, maxStatuses, {
1411
+ url,
1412
+ urlNormalized,
1413
+ status: "skipped_ledger",
1414
+ skippedReason: `fresh:${entry?.lastStatus}`,
1415
+ contentLength: entry?.contentLength,
1416
+ title: entry?.title,
1417
+ docId: entry?.docId
1418
+ });
1419
+ dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
1420
+ return { kind: "ledger_skip", url };
1421
+ }
1422
+ }
1423
+ try {
1424
+ const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
1425
+ renderMode,
1426
+ renderOptions,
1427
+ minContentLength,
1428
+ dbg
1429
+ });
1430
+ if (diag?.modeUsed === "static_ok") counters.staticOk++;
1431
+ if (diag?.modeUsed === "render_ok") counters.renderOk++;
1432
+ if (diag?.modeUsed === "render_fallback_ok") counters.renderFallbacks++;
1433
+ if (diag?.reason === "non_html") counters.nonHtml++;
1434
+ if (diag?.reason === "too_small") counters.tooSmall++;
1435
+ if (diag?.reason === "blocked_suspected") counters.blockedSuspected++;
1436
+ if (diag?.reason === "render_error") counters.renderErrors++;
1437
+ const crawlSt = this.toLedgerStatus(doc, diag);
1438
+ if (ledgerOpts) {
1439
+ await this.upsertLedgerRecord({
1440
+ url,
1441
+ urlNormalized,
1442
+ agentId,
1443
+ status: crawlSt,
1444
+ doc,
1445
+ diag
1446
+ });
1447
+ }
1448
+ this.pushPageStatus(pageStatuses, maxStatuses, {
1449
+ url,
1450
+ urlNormalized,
1451
+ status: crawlSt,
1452
+ modeUsed: diag?.modeUsed,
1453
+ contentLength: doc?.content?.length,
1454
+ bodyTextLengthHint,
1455
+ title: doc?.metadata?.title,
1456
+ docId: doc?.id,
1457
+ error: diag?.errorMessage
1458
+ });
1459
+ return { kind: "doc", doc, url };
1460
+ } catch (error) {
1461
+ const msg = error instanceof Error ? error.message : String(error);
1462
+ if (ledgerOpts) {
1463
+ await this.upsertLedgerRecord({
1464
+ url,
1465
+ urlNormalized,
1466
+ agentId,
1467
+ status: "error",
1468
+ errorMessage: msg
1469
+ });
1470
+ }
1471
+ this.pushPageStatus(pageStatuses, maxStatuses, {
1472
+ url,
1473
+ urlNormalized,
1474
+ status: "error",
1475
+ error: msg
1476
+ });
1477
+ throw { url, error };
1478
+ }
1479
+ })
1480
+ );
1481
+ for (const result of results) {
1482
+ if (result.status === "fulfilled") {
1483
+ const v = result.value;
1484
+ if (v && typeof v === "object" && "kind" in v && v.kind === "ledger_skip") {
1485
+ continue;
1486
+ }
1487
+ if (v && typeof v === "object" && "kind" in v && v.kind === "doc" && v.doc) {
1488
+ documents.push(v.doc);
1489
+ urlsCrawled++;
1490
+ }
1491
+ } else if (result.status === "rejected") {
1492
+ urlsFailed++;
1493
+ errors.push({
1494
+ id: result.reason.url || "unknown",
1495
+ error: result.reason.error?.message || "Failed to crawl"
1496
+ });
1497
+ }
1498
+ }
1499
+ if (i + concurrency < uniqueUrls.length) {
1500
+ await this.delay(delayMs);
1501
+ }
1502
+ }
1503
+ if (documents.length > 0) {
1504
+ const ingestResult = await this.ingest(documents, options);
1505
+ indexed = ingestResult.indexed;
1506
+ if (ingestResult.errors) {
1507
+ errors.push(...ingestResult.errors);
1508
+ }
1509
+ }
1510
+ return {
1511
+ success: errors.length === 0,
1512
+ indexed,
1513
+ failed: errors.length,
1514
+ urlsCrawled,
1515
+ urlsSkipped: 0,
1516
+ urlsFailed,
1517
+ crawledAt: /* @__PURE__ */ new Date(),
1518
+ errors: errors.length > 0 ? errors : void 0,
1519
+ metadata: {
1520
+ counters,
1521
+ pageStatuses,
1522
+ debug: dbg.summary()
1523
+ }
1524
+ };
1525
+ }
1526
+ /**
1527
+ * Crawl a single page and extract content
1528
+ */
1529
+ async crawlPage(url, config, timeout) {
1530
+ const response = await fetch(url, {
1531
+ headers: {
1532
+ "User-Agent": "SnapAgent-CMS-Crawler/1.0",
1533
+ "Accept": "text/html,application/xhtml+xml"
1534
+ },
1535
+ signal: AbortSignal.timeout(timeout)
1536
+ });
1537
+ if (!response.ok) {
1538
+ throw new Error(`HTTP ${response.status}`);
1539
+ }
1540
+ const contentType = response.headers.get("content-type") || "";
1541
+ if (!contentType.includes("text/html")) {
1542
+ return null;
1543
+ }
1544
+ const html = await response.text();
1545
+ return this.extractDocumentFromHtml(url, html, config);
1546
+ }
1547
+ /**
1548
+ * Default chain works for many WordPress / Elementor / block themes where `.first()`
1549
+ * would otherwise hit an empty wrapper.
1550
+ */
1551
+ static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
1552
+ stripNoiseFromDom($, config) {
1553
+ const removeSelectors = config.removeSelectors || [
1554
+ "script",
1555
+ "style",
1556
+ "nav",
1557
+ "header",
1558
+ "footer",
1559
+ ".sidebar",
1560
+ ".navigation",
1561
+ ".menu",
1562
+ ".comments",
1563
+ '[role="navigation"]',
1564
+ '[role="banner"]'
1565
+ ];
1566
+ removeSelectors.forEach((selector) => $(selector).remove());
1567
+ }
1568
+ /** Longest cleaned text among selector matches and full body (after noise strip). */
1569
+ extractBestContentText($, config) {
1570
+ const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
1571
+ const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
1572
+ let best = "";
1573
+ for (const sel of selectors) {
1574
+ $(sel).each((_, el) => {
1575
+ const t = this.cleanContent($(el).text().trim());
1576
+ if (t.length > best.length) best = t;
1577
+ });
1578
+ }
1579
+ const bodyText = this.cleanContent($("body").text().trim());
1580
+ if (bodyText.length > best.length) best = bodyText;
1581
+ return best;
1582
+ }
1583
+ bodyTextLengthHint(html, config) {
1584
+ const $ = cheerio.load(html);
1585
+ this.stripNoiseFromDom($, config);
1586
+ return this.cleanContent($("body").text().trim()).length;
1587
+ }
1588
+ extractDocumentFromHtml(url, html, config) {
1589
+ const $ = cheerio.load(html);
1590
+ this.stripNoiseFromDom($, config);
1591
+ const titleSelector = config.titleSelector || "h1, title";
1592
+ let title = $(titleSelector).first().text().trim();
1593
+ if (!title) {
1594
+ title = $("title").text().trim();
1595
+ }
1596
+ const content = this.extractBestContentText($, config);
1597
+ const minChars = config.minExtractedContentLength ?? 50;
1598
+ if (!content || content.length < minChars) return null;
1599
+ let type = config.defaultType || "page";
1600
+ if (config.typeFromUrl) {
1601
+ for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
1602
+ if (url.includes(pattern)) {
1603
+ type = typeName;
1604
+ break;
1605
+ }
1606
+ }
1607
+ }
1608
+ const id = this.urlToId(url);
1609
+ return {
1610
+ id,
1611
+ content,
1612
+ metadata: {
1613
+ type,
1614
+ title,
1615
+ url,
1616
+ ...config.metadata
1617
+ }
1618
+ };
1619
+ }
1620
+ looksLikeDynamicShell(html) {
1621
+ const lower = html.toLowerCase();
1622
+ const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
1623
+ const body = bodyMatch?.[1] ?? html;
1624
+ const textOnly = body.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
1625
+ const scriptCount = (body.match(/<script\b/gi) ?? []).length;
1626
+ const hasEmptyAppMountNode = /<(div|main)[^>]+id=["'](__next|root|app)["'][^>]*>\s*<\/\1>/i.test(body);
1627
+ const hasHydrationData = lower.includes("__next_data__") || lower.includes("__next_f") || lower.includes("window.__initial_state__") || lower.includes("window.__apollo_state__") || lower.includes("data-reactroot");
1628
+ const asksForJavascript = lower.includes("please enable javascript") || lower.includes("enable javascript to run this app") || lower.includes("you need to enable javascript");
1629
+ const hasLoadingHints = /\b(loading|please wait|spinner|initializing|fetching)\b/i.test(lower);
1630
+ const textLength = textOnly.length;
1631
+ const htmlLength = lower.length;
1632
+ const contentDensity = textLength / Math.max(htmlLength, 1);
1633
+ const isMostlyScripts = scriptCount >= 5 && textLength < 500;
1634
+ const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
1635
+ return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
1636
+ }
1637
+ diagFromRenderedAttempt(doc, bodyTextLengthHint, renderFailure, blockedSuspected, modeOk, modeFailed) {
1638
+ if (blockedSuspected) {
1639
+ return {
1640
+ doc: null,
1641
+ diag: { modeUsed: modeFailed, reason: "blocked_suspected" }
1642
+ };
1643
+ }
1644
+ if (renderFailure) {
1645
+ return {
1646
+ doc: null,
1647
+ diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure }
1648
+ };
1649
+ }
1650
+ return {
1651
+ doc,
1652
+ diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
1653
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint
1654
+ };
1655
+ }
1656
+ async crawlPageSmart(url, config, timeout, ctx) {
1657
+ if (ctx.renderMode === true) {
1658
+ const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
1659
+ url,
1660
+ config,
1661
+ timeout,
1662
+ ctx.renderOptions,
1663
+ ctx.dbg
1664
+ );
1665
+ return this.diagFromRenderedAttempt(
1666
+ doc,
1667
+ bodyTextLengthHint,
1668
+ renderFailure,
1669
+ blockedSuspected,
1670
+ "render_ok",
1671
+ "render_failed"
1672
+ );
1673
+ }
1674
+ try {
1675
+ const response = await fetch(url, {
1676
+ headers: {
1677
+ "User-Agent": "SnapAgent-CMS-Crawler/1.0",
1678
+ "Accept": "text/html,application/xhtml+xml"
1679
+ },
1680
+ signal: AbortSignal.timeout(timeout)
1681
+ });
1682
+ if (!response.ok) {
1683
+ const status = response.status;
1684
+ if (status === 403 || status === 429 || status === 503) {
1685
+ ctx.dbg.log("crawl.blocked", { url, status });
1686
+ return { doc: null, diag: { modeUsed: "static_failed", reason: "blocked_suspected" } };
1687
+ }
1688
+ throw new Error(`HTTP ${status}`);
1689
+ }
1690
+ const contentType = response.headers.get("content-type") || "";
1691
+ if (!contentType.includes("text/html")) {
1692
+ return { doc: null, diag: { modeUsed: "static_failed", reason: "non_html" } };
1693
+ }
1694
+ const html = await response.text();
1695
+ const doc = this.extractDocumentFromHtml(url, html, config);
1696
+ const staticHint = !doc ? this.bodyTextLengthHint(html, config) : void 0;
1697
+ if (doc && doc.content.length >= ctx.minContentLength) {
1698
+ return { doc, diag: { modeUsed: "static_ok" } };
1699
+ }
1700
+ if (ctx.renderMode === "auto") {
1701
+ const shouldRender = this.looksLikeDynamicShell(html) || !doc || doc.content.length < ctx.minContentLength;
1702
+ if (shouldRender) {
1703
+ ctx.dbg.log("crawl.renderFallback", {
1704
+ url,
1705
+ reason: !doc ? "no_doc" : "too_small",
1706
+ staticLength: doc?.content?.length ?? 0
1707
+ });
1708
+ const {
1709
+ doc: rendered,
1710
+ bodyTextLengthHint: rHint,
1711
+ renderFailure,
1712
+ blockedSuspected
1713
+ } = await this.crawlPageRendered(
1714
+ url,
1715
+ config,
1716
+ timeout,
1717
+ ctx.renderOptions,
1718
+ ctx.dbg
1719
+ );
1720
+ const mergedHint = rHint ?? staticHint;
1721
+ const fb = this.diagFromRenderedAttempt(
1722
+ rendered,
1723
+ mergedHint,
1724
+ renderFailure,
1725
+ blockedSuspected,
1726
+ "render_fallback_ok",
1727
+ "render_fallback_failed"
1728
+ );
1729
+ if (!rendered && (renderFailure || blockedSuspected)) {
1730
+ fb.bodyTextLengthHint = staticHint ?? rHint;
1731
+ }
1732
+ return fb;
1733
+ }
1734
+ }
1735
+ return {
1736
+ doc: null,
1737
+ diag: { modeUsed: "static_failed", reason: "too_small" },
1738
+ bodyTextLengthHint: staticHint
1739
+ };
1740
+ } catch (e) {
1741
+ throw e;
1742
+ }
1743
+ }
1744
+ async crawlPageRendered(url, config, timeout, renderOptions, dbg) {
1745
+ let playwright;
1746
+ try {
1747
+ playwright = await Function('return import("playwright")')();
1748
+ } catch (e) {
1749
+ dbg.log("render.missingDependency", { url, error: "playwright_not_installed" });
1750
+ throw new Error("playwright is not installed. Add it to dependencies to use crawlPageRendered().");
1751
+ }
1752
+ const waitUntil = renderOptions.waitUntil || "domcontentloaded";
1753
+ const waitForSelector = renderOptions.waitForSelector;
1754
+ const scrollCfg = renderOptions.scroll || {};
1755
+ const doScroll = scrollCfg.enabled ?? false;
1756
+ const maxScrolls = scrollCfg.maxScrolls ?? 10;
1757
+ const scrollDelayMs = scrollCfg.scrollDelayMs ?? 750;
1758
+ const stableIterations = scrollCfg.stableIterations ?? 2;
1759
+ const postRenderDelayMs = renderOptions.postRenderDelayMs ?? 0;
1760
+ const browser = await playwright.chromium.launch({ headless: true });
1761
+ try {
1762
+ const page = await browser.newPage();
1763
+ await page.goto(url, { waitUntil, timeout });
1764
+ if (waitForSelector) {
1765
+ await page.waitForSelector(waitForSelector, { timeout });
1766
+ }
1767
+ if (postRenderDelayMs > 0) {
1768
+ await page.waitForTimeout(postRenderDelayMs);
1769
+ }
1770
+ if (doScroll) {
1771
+ let stable = 0;
1772
+ let lastLen = 0;
1773
+ for (let i = 0; i < maxScrolls; i++) {
1774
+ const len = await page.evaluate("(document.body?.innerText || '').length");
1775
+ if (len <= lastLen + 20) stable++;
1776
+ else stable = 0;
1777
+ lastLen = len;
1778
+ if (stable >= stableIterations) break;
1779
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)");
1780
+ await page.waitForTimeout(scrollDelayMs);
1781
+ }
1782
+ }
1783
+ const html = await page.content();
1784
+ const bodyTextLengthHint = this.bodyTextLengthHint(html, config);
1785
+ const doc = this.extractDocumentFromHtml(url, html, config);
1786
+ if (config.debug?.saveDir && config.debug?.enabled) {
1787
+ try {
1788
+ const saveDir = config.debug.saveDir;
1789
+ const safeId = this.urlToId(url) || "page";
1790
+ const outDir = path.join(saveDir, safeId);
1791
+ fs.mkdirSync(outDir, { recursive: true });
1792
+ fs.writeFileSync(path.join(outDir, "rendered.html"), html, "utf8");
1793
+ fs.writeFileSync(path.join(outDir, "extracted.txt"), doc?.content || "", "utf8");
1794
+ fs.writeFileSync(path.join(outDir, "meta.json"), JSON.stringify(doc?.metadata || {}, null, 2), "utf8");
1795
+ } catch (e) {
1796
+ dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
1797
+ }
1798
+ }
1799
+ return { doc, bodyTextLengthHint };
1800
+ } catch (e) {
1801
+ const msg = String(e?.message || e || "render_failed");
1802
+ const lower = msg.toLowerCase();
1803
+ if (lower.includes("captcha") || lower.includes("access denied")) {
1804
+ dbg.log("render.blocked", { url, error: msg });
1805
+ return { doc: null, bodyTextLengthHint: 0, blockedSuspected: true };
1806
+ }
1807
+ dbg.log("render.error", { url, error: msg });
1808
+ return { doc: null, bodyTextLengthHint: 0, renderFailure: msg };
1809
+ } finally {
1810
+ await browser.close();
1811
+ }
1812
+ }
1813
+ async discoverSitemaps(baseUrl, timeout, dbg) {
1814
+ const base = new URL(baseUrl);
1815
+ const robotsUrl = new URL("/robots.txt", base).toString();
1816
+ const found = /* @__PURE__ */ new Set();
1817
+ try {
1818
+ const res = await fetch(robotsUrl, {
1819
+ headers: { "User-Agent": "SnapAgent-CMS-Crawler/1.0" },
1820
+ signal: AbortSignal.timeout(timeout)
1821
+ });
1822
+ if (res.ok) {
1823
+ const txt = await res.text();
1824
+ const rx = /^sitemap:\s*(\S+)/gim;
1825
+ let m;
1826
+ while ((m = rx.exec(txt)) !== null) {
1827
+ const sm = m[1].trim();
1828
+ if (sm.startsWith("http")) found.add(sm);
1829
+ }
1830
+ dbg.log("discovery.robots", { robotsUrl, ok: true, sitemapCount: found.size });
1831
+ } else {
1832
+ dbg.log("discovery.robots", { robotsUrl, ok: false, status: res.status });
1833
+ }
1834
+ } catch (e) {
1835
+ dbg.log("discovery.robots", { robotsUrl, ok: false, error: e instanceof Error ? e.message : "failed" });
1836
+ }
1837
+ if (found.size === 0) {
1838
+ const candidates = [
1839
+ "/sitemap.xml",
1840
+ "/sitemap_index.xml",
1841
+ "/sitemap-index.xml",
1842
+ "/wp-sitemap.xml"
1843
+ ].map((p) => new URL(p, base).toString());
1844
+ candidates.forEach((c) => found.add(c));
1845
+ dbg.log("discovery.sitemapCandidates", { count: candidates.length });
1846
+ }
1847
+ return Array.from(found);
1848
+ }
1849
+ createDebugCollector(debug) {
1850
+ const enabled = !!debug?.enabled;
1851
+ const level = debug?.level || "summary";
1852
+ const maxPerUrlLogs = debug?.maxPerUrlLogs ?? 200;
1853
+ const entries = [];
1854
+ return {
1855
+ log: (event, data) => {
1856
+ if (!enabled) return;
1857
+ if (level === "summary" && !event.startsWith("discovery.") && !event.startsWith("crawl.")) return;
1858
+ if (entries.length >= maxPerUrlLogs) return;
1859
+ entries.push({ ts: (/* @__PURE__ */ new Date()).toISOString(), event, data });
1860
+ },
1861
+ summary: () => enabled ? { enabled, level, entries } : void 0
1862
+ };
1863
+ }
1864
+ /**
1865
+ * Clean extracted text content
1866
+ */
1867
+ cleanContent(text) {
1868
+ return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
1869
+ }
1870
+ /**
1871
+ * Convert URL to a stable document ID
1872
+ */
1873
+ urlToId(url) {
1874
+ return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
1875
+ }
1876
+ /**
1877
+ * Delay helper
1878
+ */
1879
+ delay(ms) {
1880
+ return new Promise((resolve) => setTimeout(resolve, ms));
1881
+ }
1882
+ // ============================================================================
1883
+ // RSS/Atom Feed Ingestion
1884
+ // ============================================================================
1885
+ /**
1886
+ * Ingest content from an RSS or Atom feed
1887
+ *
1888
+ * @example
1889
+ * ```typescript
1890
+ * // Simple RSS ingestion
1891
+ * await plugin.ingestFromRSS({
1892
+ * feedUrl: 'https://myblog.com/feed/',
1893
+ * });
1894
+ *
1895
+ * // Fetch full page content for each item
1896
+ * await plugin.ingestFromRSS({
1897
+ * feedUrl: 'https://myblog.com/feed/',
1898
+ * fetchFullContent: true,
1899
+ * contentSelector: 'article',
1900
+ * });
1901
+ * ```
1902
+ */
1903
+ async ingestFromRSS(config, options) {
1904
+ try {
1905
+ const response = await fetch(config.feedUrl, {
1906
+ headers: { "User-Agent": "SnapAgent-CMS-Crawler/1.0" },
1907
+ signal: AbortSignal.timeout(3e4)
1908
+ });
1909
+ if (!response.ok) {
1910
+ return {
1911
+ success: false,
1912
+ indexed: 0,
1913
+ failed: 1,
1914
+ urlsCrawled: 0,
1915
+ urlsSkipped: 0,
1916
+ urlsFailed: 1,
1917
+ crawledAt: /* @__PURE__ */ new Date(),
1918
+ errors: [{ id: config.feedUrl, error: `HTTP ${response.status}` }]
1919
+ };
1920
+ }
1921
+ const xml = await response.text();
1922
+ const items = this.parseRSSFeed(xml);
1923
+ if (items.length === 0) {
1924
+ return {
1925
+ success: true,
1926
+ indexed: 0,
1927
+ failed: 0,
1928
+ urlsCrawled: 0,
1929
+ urlsSkipped: 0,
1930
+ urlsFailed: 0,
1931
+ crawledAt: /* @__PURE__ */ new Date()
1932
+ };
1933
+ }
1934
+ const documents = [];
1935
+ const type = config.type || "post";
1936
+ let urlsCrawled = 0;
1937
+ let urlsFailed = 0;
1938
+ const errors = [];
1939
+ for (const item of items) {
1940
+ try {
1941
+ let content = item.content || item.description || "";
1942
+ if (config.fetchFullContent && item.link) {
1943
+ try {
1944
+ const doc = await this.crawlPage(item.link, {
1945
+ contentSelector: config.contentSelector,
1946
+ defaultType: type
1947
+ }, 3e4);
1948
+ if (doc) {
1949
+ content = doc.content;
1950
+ }
1951
+ urlsCrawled++;
1952
+ } catch (error) {
1953
+ urlsFailed++;
1954
+ }
1955
+ }
1956
+ content = this.stripHtml(content);
1957
+ if (content.length < 50) continue;
1958
+ documents.push({
1959
+ id: this.urlToId(item.link || item.guid || `rss-${documents.length}`),
1960
+ content,
1961
+ metadata: {
1962
+ type,
1963
+ title: item.title,
1964
+ url: item.link,
1965
+ publishedAt: item.pubDate,
1966
+ author: item.author,
1967
+ categories: item.categories,
1968
+ ...config.metadata
1969
+ }
1970
+ });
1971
+ } catch (error) {
1972
+ errors.push({
1973
+ id: item.link || "unknown",
1974
+ error: error instanceof Error ? error.message : "Unknown error"
1975
+ });
1976
+ }
1977
+ }
1978
+ let indexed = 0;
1979
+ if (documents.length > 0) {
1980
+ const ingestResult = await this.ingest(documents, options);
1981
+ indexed = ingestResult.indexed;
1982
+ }
1983
+ return {
1984
+ success: errors.length === 0,
1985
+ indexed,
1986
+ failed: errors.length,
1987
+ urlsCrawled,
1988
+ urlsSkipped: 0,
1989
+ urlsFailed,
1990
+ crawledAt: /* @__PURE__ */ new Date(),
1991
+ errors: errors.length > 0 ? errors : void 0
1992
+ };
1993
+ } catch (error) {
1994
+ return {
1995
+ success: false,
1996
+ indexed: 0,
1997
+ failed: 1,
1998
+ urlsCrawled: 0,
1999
+ urlsSkipped: 0,
2000
+ urlsFailed: 0,
2001
+ crawledAt: /* @__PURE__ */ new Date(),
2002
+ errors: [{
2003
+ id: config.feedUrl,
2004
+ error: error instanceof Error ? error.message : "Unknown error"
2005
+ }]
2006
+ };
2007
+ }
2008
+ }
2009
+ /**
2010
+ * Parse RSS/Atom feed XML
2011
+ */
2012
+ parseRSSFeed(xml) {
2013
+ const items = [];
2014
+ const isAtom = xml.includes("<feed") && xml.includes('xmlns="http://www.w3.org/2005/Atom"');
2015
+ if (isAtom) {
2016
+ const entryRegex = /<entry>([\s\S]*?)<\/entry>/gi;
2017
+ let match;
2018
+ while ((match = entryRegex.exec(xml)) !== null) {
2019
+ const entry = match[1];
2020
+ items.push({
2021
+ title: this.extractXmlValue(entry, "title"),
2022
+ link: this.extractAtomLink(entry),
2023
+ guid: this.extractXmlValue(entry, "id"),
2024
+ content: this.extractXmlValue(entry, "content") || this.extractXmlValue(entry, "summary"),
2025
+ pubDate: this.extractXmlValue(entry, "published") || this.extractXmlValue(entry, "updated"),
2026
+ author: this.extractXmlValue(entry, "name"),
2027
+ // Inside <author>
2028
+ categories: this.extractXmlValues(entry, "category", "term")
2029
+ });
2030
+ }
2031
+ } else {
2032
+ const itemRegex = /<item>([\s\S]*?)<\/item>/gi;
2033
+ let match;
2034
+ while ((match = itemRegex.exec(xml)) !== null) {
2035
+ const item = match[1];
2036
+ items.push({
2037
+ title: this.extractXmlValue(item, "title"),
2038
+ link: this.extractXmlValue(item, "link"),
2039
+ guid: this.extractXmlValue(item, "guid"),
2040
+ description: this.extractXmlValue(item, "description"),
2041
+ content: this.extractXmlValue(item, "content:encoded") || this.extractXmlValue(item, "content"),
2042
+ pubDate: this.extractXmlValue(item, "pubDate"),
2043
+ author: this.extractXmlValue(item, "author") || this.extractXmlValue(item, "dc:creator"),
2044
+ categories: this.extractXmlValues(item, "category")
2045
+ });
2046
+ }
2047
+ }
2048
+ return items;
2049
+ }
2050
+ /**
2051
+ * Extract a single value from XML
2052
+ */
2053
+ extractXmlValue(xml, tag) {
2054
+ const cdataRegex = new RegExp(`<${tag}[^>]*><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/${tag}>`, "i");
2055
+ const cdataMatch = xml.match(cdataRegex);
2056
+ if (cdataMatch) {
2057
+ return cdataMatch[1].trim();
2058
+ }
2059
+ const regex = new RegExp(`<${tag}[^>]*>([^<]*)<\\/${tag}>`, "i");
2060
+ const match = xml.match(regex);
2061
+ return match ? match[1].trim() : void 0;
2062
+ }
2063
+ /**
2064
+ * Extract multiple values from XML
2065
+ */
2066
+ extractXmlValues(xml, tag, attr) {
2067
+ const values = [];
2068
+ if (attr) {
2069
+ const regex = new RegExp(`<${tag}[^>]*${attr}="([^"]*)"[^>]*/?>`, "gi");
2070
+ let match;
2071
+ while ((match = regex.exec(xml)) !== null) {
2072
+ values.push(match[1]);
2073
+ }
2074
+ } else {
2075
+ const regex = new RegExp(`<${tag}[^>]*>([^<]*)<\\/${tag}>`, "gi");
2076
+ let match;
2077
+ while ((match = regex.exec(xml)) !== null) {
2078
+ values.push(match[1].trim());
2079
+ }
2080
+ }
2081
+ return values;
2082
+ }
2083
+ /**
2084
+ * Extract link from Atom entry
2085
+ */
2086
+ extractAtomLink(entry) {
2087
+ const alternateMatch = entry.match(/<link[^>]*rel="alternate"[^>]*href="([^"]+)"/i);
2088
+ if (alternateMatch) return alternateMatch[1];
2089
+ const linkMatch = entry.match(/<link[^>]*href="([^"]+)"/i);
2090
+ return linkMatch ? linkMatch[1] : void 0;
2091
+ }
2092
+ /**
2093
+ * Strip HTML tags from content
2094
+ */
2095
+ stripHtml(html) {
2096
+ return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, '"').replace(/&#39;/g, "'").replace(/\s+/g, " ").trim();
2097
+ }
2098
+ // ============================================================================
2099
+ // Utility Methods
2100
+ // ============================================================================
2101
+ /**
2102
+ * Get cache statistics
2103
+ */
2104
+ getCacheStats() {
2105
+ const total = this.cacheStats.hits + this.cacheStats.misses;
2106
+ const hitRate = total > 0 ? (this.cacheStats.hits / total).toFixed(3) : "0.000";
2107
+ return { ...this.cacheStats, hitRate };
2108
+ }
2109
+ /**
2110
+ * Clear embedding cache
2111
+ */
2112
+ clearCache() {
2113
+ this.embeddingCache.clear();
2114
+ this.cacheStats = { hits: 0, misses: 0 };
2115
+ }
2116
+ /**
2117
+ * Get plugin configuration (for persistence)
2118
+ */
2119
+ getConfig() {
2120
+ return {
2121
+ name: this.name,
2122
+ mongoUri: "${MONGODB_URI}",
2123
+ // Reference env var
2124
+ dbName: this.config.dbName,
2125
+ collection: this.config.collection,
2126
+ openaiApiKey: "${OPENAI_API_KEY}",
2127
+ // Reference env var
2128
+ embeddingModel: this.config.embeddingModel,
2129
+ tenantId: this.config.tenantId,
2130
+ vectorIndexName: this.config.vectorIndexName,
2131
+ numCandidates: this.config.numCandidates,
2132
+ limit: this.config.limit,
2133
+ minScore: this.config.minScore,
2134
+ filterableFields: this.config.filterableFields,
2135
+ typeBoosts: this.config.typeBoosts,
2136
+ recencyBoost: this.config.recencyBoost,
2137
+ priority: this.priority
2138
+ };
2139
+ }
2140
+ };
2141
+ // Annotate the CommonJS export names for ESM import in node:
2142
+ 0 && (module.exports = {
2143
+ WebRAGPlugin
2144
+ });