@snap-agent/rag-web 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -30,17 +30,338 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
- WebRAGPlugin: () => WebRAGPlugin
33
+ WebRAGPlugin: () => WebRAGPlugin,
34
+ bodyTextLengthHint: () => bodyTextLengthHint,
35
+ extractPageFromHtml: () => extractPageFromHtml,
36
+ extractProductMetadata: () => extractProductMetadata,
37
+ normalizeAvailability: () => normalizeAvailability,
38
+ normalizeCurrency: () => normalizeCurrency,
39
+ parsePrice: () => parsePrice,
40
+ urlToDocumentId: () => urlToDocumentId
34
41
  });
35
42
  module.exports = __toCommonJS(index_exports);
36
43
 
37
44
  // src/WebRAGPlugin.ts
38
45
  var import_mongodb = require("mongodb");
39
46
  var import_openai = __toESM(require("openai"));
40
- var cheerio = __toESM(require("cheerio"));
47
+ var cheerio3 = __toESM(require("cheerio"));
41
48
  var fs = __toESM(require("fs"));
42
49
  var path = __toESM(require("path"));
43
- var WebRAGPlugin = class _WebRAGPlugin {
50
+
51
+ // src/htmlPageExtract.ts
52
+ var cheerio2 = __toESM(require("cheerio"));
53
+
54
+ // src/productMetadata.ts
55
+ var cheerio = __toESM(require("cheerio"));
56
/**
 * Extract price, currency and availability for a product page.
 *
 * Three sources are consulted in priority order: JSON-LD, Open Graph meta
 * tags, then microdata. For each field, the first source that yields a
 * non-nullish value wins. Fields with no usable value are omitted.
 *
 * @param {string} html - Raw page HTML.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractProductMetadata(html) {
  const $ = cheerio.load(html);
  const sources = [extractFromJsonLd($), extractFromOpenGraph($), extractFromMicrodata($)];

  // Mirrors a `a ?? b ?? c` chain across the three sources.
  const firstNonNullish = (field) => {
    for (const source of sources) {
      if (source[field] != null) return source[field];
    }
    return void 0;
  };

  const merged = {};

  const price = firstNonNullish("price");
  if (price != null) merged.price = price;

  const currency = firstNonNullish("currency");
  if (currency) merged.currency = currency;

  const availability = firstNonNullish("availability");
  if (availability) merged.availability = availability;

  return merged;
}
70
/**
 * Read price, currency and availability from the page's JSON-LD script blocks.
 *
 * Each `application/ld+json` script is parsed in document order. Within it,
 * every Product-typed node contributes the fields that are still missing,
 * taken from the offer chosen by `pickOffer`. Scanning stops once all three
 * fields are known.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromJsonLd($) {
  const found = {};
  const isComplete = () => found.price != null && found.currency && found.availability;

  $('script[type="application/ld+json"]').each((_, scriptEl) => {
    // Returning false stops cheerio's iteration early.
    if (isComplete()) return false;

    const text = $(scriptEl).html()?.trim();
    if (!text) return;

    let data;
    try {
      data = JSON.parse(text);
    } catch {
      // Malformed JSON-LD is skipped; later script blocks are still inspected.
      return;
    }

    const offers = collectJsonLdNodes(data)
      .filter((node) => isProductType(node))
      .map((node) => pickOffer(node))
      .filter(Boolean);

    for (const offer of offers) {
      if (found.price == null) {
        const price = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
        if (price != null) found.price = price;
      }
      if (!found.currency) {
        const currency = normalizeCurrency(offer.priceCurrency);
        if (currency) found.currency = currency;
      }
      if (!found.availability) {
        const availability = normalizeAvailability(offer.availability);
        if (availability) found.availability = availability;
      }
    }
  });

  return found;
}
102
/**
 * Read price, currency and availability from Open Graph style meta tags.
 *
 * For each field the `product:*` property is tried first and the `og:*`
 * property second. Values are then run through the shared normalisers.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromOpenGraph($) {
  // Equivalent to `attr(a) || attr(b)`: stops at the first truthy content,
  // otherwise returns whatever the last lookup produced.
  const metaContent = (...properties) => {
    let content;
    for (const property of properties) {
      content = $(`meta[property="${property}"]`).attr("content");
      if (content) break;
    }
    return content;
  };

  const extracted = {};

  const price = parsePrice(metaContent("product:price:amount", "og:price:amount"));
  if (price != null) extracted.price = price;

  const currency = normalizeCurrency(metaContent("product:price:currency", "og:price:currency"));
  if (currency) extracted.currency = currency;

  const availability = normalizeAvailability(metaContent("product:availability", "og:availability"));
  if (availability) extracted.availability = availability;

  return extracted;
}
117
/**
 * Locate the first element carrying the given `itemprop`.
 *
 * When the page has an element whose `itemtype` mentions schema.org Product,
 * the search is limited to the first such element; otherwise the whole
 * document is searched.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @param {string} itemprop - Microdata property name, e.g. "price".
 * @returns {object} Cheerio selection (possibly empty).
 */
function microdataField($, itemprop) {
  const propSelector = `[itemprop="${itemprop}"]`;
  const productScope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
  if (productScope.length > 0) {
    return productScope.find(propSelector).first();
  }
  return $(propSelector).first();
}
121
/**
 * Read price, currency and availability from microdata (`itemprop`) markup.
 *
 * For each property the element's `content` attribute is preferred
 * (availability also tries `href`), and the element text is the fallback.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromMicrodata($) {
  // First truthy attribute wins; the element's text is the fallback.
  const rawValue = (itemprop, attributes) => {
    const el = microdataField($, itemprop);
    for (const attribute of attributes) {
      const value = el.attr(attribute);
      if (value) return value;
    }
    return el.text();
  };

  const extracted = {};

  const price = parsePrice(rawValue("price", ["content"]));
  if (price != null) extracted.price = price;

  const currency = normalizeCurrency(rawValue("priceCurrency", ["content"]));
  if (currency) extracted.currency = currency;

  const availability = normalizeAvailability(rawValue("availability", ["content", "href"]));
  if (availability) extracted.availability = availability;

  return extracted;
}
136
/**
 * Flatten a parsed JSON-LD payload into a list of candidate nodes.
 *
 * Arrays are walked element by element. Every object is emitted (pre-order)
 * and then its `@graph` and `mainEntity` members are walked as well, so a
 * Product wrapped in a WebPage/ItemPage `mainEntity` is discovered just like
 * one listed in `@graph`. Other nested properties are deliberately not
 * followed, so related or similar products are not mistaken for the page's
 * own product.
 *
 * @param {unknown} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Nodes in discovery order (outer nodes first).
 */
function collectJsonLdNodes(data) {
  const nodes = [];
  const visit = (value) => {
    if (value == null) return;
    if (Array.isArray(value)) {
      value.forEach(visit);
      return;
    }
    if (typeof value !== "object") return;
    nodes.push(value);
    if (value["@graph"]) visit(value["@graph"]);
    if (value.mainEntity) visit(value.mainEntity);
  };
  visit(data);
  return nodes;
}
152
/**
 * Tell whether a JSON-LD node declares itself a schema.org Product.
 *
 * `@type` may be a single value or an array. A type matches,
 * case-insensitively, when it is the bare term (`Product`), a full IRI ending
 * in `/Product` (e.g. `https://schema.org/Product`), or a compact IRI ending
 * in `:Product` (e.g. `schema:Product`).
 *
 * @param {object} node - JSON-LD node.
 * @returns {boolean}
 */
function isProductType(node) {
  const declared = node["@type"];
  let types;
  if (Array.isArray(declared)) {
    types = declared;
  } else {
    types = declared != null ? [declared] : [];
  }
  return types.some((t) => {
    const s = String(t).toLowerCase();
    return s === "product" || s.endsWith("/product") || s.endsWith(":product");
  });
}
160
/**
 * Choose the offer object to read pricing from.
 *
 * `offers` may be a single object or an array. For an array the first
 * non-null object entry is used. Anything else yields `null`.
 *
 * @param {object} product - JSON-LD Product node.
 * @returns {object|null}
 */
function pickOffer(product) {
  const { offers } = product;
  if (Array.isArray(offers)) {
    for (const candidate of offers) {
      if (candidate && typeof candidate === "object") return candidate;
    }
    return null;
  }
  return offers != null && typeof offers === "object" ? offers : null;
}
170
/**
 * Parse a price from a number or a loosely formatted string.
 *
 * Finite numbers are returned unchanged. Strings are reduced to their first
 * numeric token before the separators are interpreted, so punctuation that
 * belongs to surrounding text ("Rs. 1,299.00", "$12.99, free shipping")
 * cannot be mistaken for a decimal or thousands separator.
 *
 * Separator rules:
 *  - both "," and "." present: whichever comes last is the decimal mark;
 *  - only ",": decimal mark when it is a single comma followed by at most
 *    two digits, otherwise a thousands separator;
 *  - only ".": decimal mark, unless the token is dot-grouped thousands with
 *    two or more groups ("1.234.567").
 *
 * @param {unknown} value - Raw price value.
 * @returns {number|undefined} Parsed price, or `undefined` if none is found.
 */
function parsePrice(value) {
  if (value == null || value === "") return void 0;
  if (typeof value === "number" && Number.isFinite(value)) return value;

  const stripped = String(value).replace(/[^\d.,\-]/g, "");

  let s;
  if (/^-?\.\d+$/.test(stripped)) {
    // Bare leading-dot decimal such as ".5" — keep as is.
    s = stripped;
  } else {
    // Take the first digit-led run, then drop separators left dangling by
    // sentence punctuation ("12.99," or "12.99.").
    const token = stripped.match(/-?\d[\d.,]*/);
    if (!token) return void 0;
    s = token[0].replace(/[.,]+$/, "");
  }

  const lastComma = s.lastIndexOf(",");
  const lastDot = s.lastIndexOf(".");
  if (lastComma > -1 && lastDot > -1) {
    s = lastComma > lastDot
      ? s.replace(/\./g, "").replace(",", ".")
      : s.replace(/,/g, "");
  } else if (lastComma > -1) {
    const parts = s.split(",");
    s = parts.length === 2 && parts[1].length <= 2
      ? `${parts[0]}.${parts[1]}`
      : s.replace(/,/g, "");
  } else if (/^-?\d{1,3}(?:\.\d{3}){2,}$/.test(s)) {
    // Dot-grouped thousands; a single group ("1.234") stays a decimal.
    s = s.replace(/\./g, "");
  }

  const num = Number.parseFloat(s);
  return Number.isFinite(num) ? num : void 0;
}
196
/**
 * Normalise a currency value to an upper-case code or short symbol.
 *
 * Resolution order:
 *  1. a stand-alone three-letter run (not part of a longer word), e.g. the
 *     "USD" in "Price in USD";
 *  2. for inputs of at most four characters, the first three letters if
 *     present ("EURO" -> "EUR"), otherwise the input itself ("€", "US$").
 * Longer inputs without a stand-alone code (e.g. "US DOLLARS") yield
 * `undefined` rather than a fabricated code such as "DOL".
 *
 * @param {unknown} value - Raw currency value.
 * @returns {string|undefined}
 */
function normalizeCurrency(value) {
  if (value == null) return void 0;
  const s = String(value).trim().toUpperCase();
  if (!s) return void 0;

  const isolated = s.match(/(?<![A-Z])[A-Z]{3}(?![A-Z])/);
  if (isolated) return isolated[0];

  if (s.length > 4) return void 0;
  const prefix = s.match(/[A-Z]{3}/);
  return prefix ? prefix[0] : s;
}
203
/**
 * Normalise an availability value to a compact token.
 *
 * URL-style values (e.g. "https://schema.org/InStock") are reduced to their
 * last path segment. Trailing slashes are dropped first so that
 * "https://schema.org/InStock/" also yields "InStock". All whitespace is then
 * removed ("in stock" -> "instock"). Letter case is left untouched.
 *
 * @param {unknown} value - Raw availability value.
 * @returns {string|undefined}
 */
function normalizeAvailability(value) {
  if (value == null) return void 0;
  const trimmed = String(value).trim().replace(/\/+$/, "");
  if (!trimmed) return void 0;
  // With no trailing slash left, the text after the last "/" is never empty.
  const lastSegment = trimmed.slice(trimmed.lastIndexOf("/") + 1);
  return lastSegment.replace(/\s+/g, "") || void 0;
}
218
+
219
+ // src/htmlPageExtract.ts
220
// Candidate main-content containers, tried when the caller supplies no
// `contentSelector`. Kept as one comma-separated selector string because
// consumers split it on ",".
var DEFAULT_CONTENT_SELECTOR = [
  "article",
  "main",
  '[role="main"]',
  "#content",
  "#primary",
  "#main",
  ".content",
  ".post-content",
  ".entry-content",
  ".elementor-location-content",
  ".elementor-widget-theme-post-content",
  ".wp-block-group",
  ".site-content",
  ".ast-single-post",
  ".ast-page"
].join(", ");

// Elements removed from the DOM before text extraction when the caller
// supplies no `removeSelectors`.
var DEFAULT_REMOVE_SELECTORS = [
  ...["script", "style", "nav", "header", "footer"],
  ...[".sidebar", ".navigation", ".menu", ".comments"],
  ...['[role="navigation"]', '[role="banner"]']
];
234
/**
 * Derive a document id from a URL.
 *
 * The http(s) scheme is dropped, every run of non-alphanumeric characters
 * becomes a single "-", one leading and one trailing "-" are trimmed, and
 * the result is cut to at most 100 characters.
 *
 * @param {string} url - Page URL.
 * @returns {string}
 */
function urlToDocumentId(url) {
  const withoutScheme = url.replace(/^https?:\/\//, "");
  // "-" is itself non-alphanumeric, so one pass collapses mixed runs too.
  const slug = withoutScheme.replace(/[^a-zA-Z0-9]+/g, "-");
  const trimmed = slug.replace(/^-|-$/g, "");
  return trimmed.substring(0, 100);
}
237
/**
 * Collapse every run of whitespace (spaces, tabs, newlines) into a single
 * space and trim the ends.
 *
 * The earlier version chained two further replaces (blank-line and tab
 * normalisation) after the `\s+` collapse. They could never match, because
 * no newline or tab survives the first pass, so they are removed. The
 * output is unchanged.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string}
 */
function cleanContent(text) {
  return text.replace(/\s+/g, " ").trim();
}
240
/**
 * Measure how much cleaned text the page body holds once noise elements are
 * removed.
 *
 * @param {string} html - Raw page HTML.
 * @param {object} [options] - Extraction options; `removeSelectors` is honoured.
 * @returns {number} Length of the cleaned body text.
 */
function bodyTextLengthHint(html, options = {}) {
  const $ = cheerio2.load(html);
  stripNoiseFromDom($, options);
  const rawBodyText = $("body").text().trim();
  const cleaned = cleanContent(rawBodyText);
  return cleaned.length;
}
245
/**
 * Turn a fetched HTML page into an indexable document description.
 *
 * Steps: strip noise elements, pick a title, pick the longest content
 * candidate, resolve a representative image, read the description meta tag,
 * derive the document type from URL patterns, and merge product metadata
 * (price / currency / availability) parsed from the untouched HTML.
 *
 * Caller-supplied `options.metadata` is merged last and overrides extracted
 * fields. Function-valued entries are skipped: the crawl config's `metadata`
 * also carries callbacks (e.g. `onCrawlProgress`), and those must not end up
 * in the stored document metadata.
 *
 * @param {string} url - Page URL (also the base for relative image URLs).
 * @param {string} html - Raw page HTML.
 * @param {object} [options] - Extraction options (`titleSelector`,
 *   `contentSelector`, `removeSelectors`, `minExtractedContentLength`,
 *   `defaultType`, `typeFromUrl`, `metadata`).
 * @returns {{id: string, metadata: object, content: string, indexable: boolean, contentPreview: string}}
 */
function extractPageFromHtml(url, html, options = {}) {
  const $ = cheerio2.load(html);
  stripNoiseFromDom($, options);

  const titleSelector = options.titleSelector || "h1, title";
  let title = $(titleSelector).first().text().trim();
  if (!title) {
    title = $("title").text().trim();
  }

  const content = extractBestContentText($, options);
  const minChars = options.minExtractedContentLength ?? 50;
  const indexable = Boolean(content && content.length >= minChars);

  // Social/product meta tags first, then product markup, then a content image.
  const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || extractHeroImage($, url) || void 0;
  let imageUrl;
  if (image) {
    try {
      imageUrl = new URL(image, url).href;
    } catch {
      // Unresolvable against the page URL: keep the raw value.
      imageUrl = image;
    }
  }

  const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;

  let type = options.defaultType || "page";
  if (options.typeFromUrl) {
    // First pattern contained in the URL wins.
    for (const [pattern, typeName] of Object.entries(options.typeFromUrl)) {
      if (url.includes(pattern)) {
        type = typeName;
        break;
      }
    }
  }

  // Parsed from the raw HTML because the noise strip above removed <script>
  // elements, including JSON-LD blocks, from `$`.
  const productMeta = extractProductMetadata(html);

  const callerMetadata = Object.fromEntries(
    Object.entries(options.metadata ?? {}).filter(([, value]) => typeof value !== "function")
  );

  const metadata = {
    type,
    ...title ? { title } : {},
    url,
    ...imageUrl ? { imageUrl } : {},
    ...description ? { description } : {},
    ...productMeta.price != null ? { price: productMeta.price } : {},
    ...productMeta.currency ? { currency: productMeta.currency } : {},
    ...productMeta.availability ? { availability: productMeta.availability } : {},
    ...callerMetadata
  };

  const previewLen = 400;
  const contentPreview = content.length > previewLen ? `${content.slice(0, previewLen)}\u2026` : content;

  return {
    id: urlToDocumentId(url),
    metadata,
    content,
    indexable,
    contentPreview
  };
}
297
/**
 * Remove non-content elements from the DOM in place.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @param {object} options - `removeSelectors` (string[]) overrides the
 *   default list when it is not null/undefined.
 */
function stripNoiseFromDom($, options) {
  const selectors = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
  for (const selector of selectors) {
    $(selector).remove();
  }
}
301
/**
 * Pick the longest cleaned text among all content-selector matches and the
 * full body.
 *
 * The selector string is split on "," and each part is queried separately.
 * On equal length the earliest candidate is kept, and the body text is
 * considered last.
 *
 * @param {Function} $ - Cheerio root (noise already stripped by the caller).
 * @param {object} options - `contentSelector` overrides the default chain.
 * @returns {string} Longest candidate, or "" when nothing has text.
 */
function extractBestContentText($, options) {
  const selectorList = (options.contentSelector || DEFAULT_CONTENT_SELECTOR)
    .split(",")
    .map((part) => part.trim())
    .filter(Boolean);

  const candidates = [];
  for (const selector of selectorList) {
    $(selector).each((_, el) => {
      candidates.push(cleanContent($(el).text().trim()));
    });
  }
  candidates.push(cleanContent($("body").text().trim()));

  return candidates.reduce(
    (longest, text) => (text.length > longest.length ? text : longest),
    ""
  );
}
315
/**
 * Fallback image pick: the first meaningful `<img>` in the main content area
 * (or in `<body>` when no content container exists).
 *
 * Skipped images:
 *  - empty or whitespace-only `src` (lazy-load placeholders); previously such
 *    an element was accepted and ended the scan, hiding the real image
 *    further down;
 *  - a declared width or height below 80;
 *  - `src`/`alt` that look like logos, icons, avatars, badges or spinners;
 *  - data URIs and `.svg` files.
 *
 * For Next.js optimiser URLs (`/_next/image?url=...`) the underlying image
 * URL is returned, resolved against the page URL when relative.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @param {string} pageUrl - Page URL, used to resolve Next.js image URLs.
 * @returns {string|undefined} Image URL (possibly relative), or `undefined`.
 */
function extractHeroImage($, pageUrl) {
  const containers = $('main, article, [role="main"], #content, .content');
  const scope = containers.length > 0 ? containers : $("body");
  let best;
  scope.find("img[src]").each((_, el) => {
    const src = $(el).attr("src") || "";
    if (!src.trim()) return;

    const alt = ($(el).attr("alt") || "").toLowerCase();
    const width = Number.parseInt($(el).attr("width") || "0", 10);
    const height = Number.parseInt($(el).attr("height") || "0", 10);
    if ((width > 0 && width < 80) || (height > 0 && height < 80)) return;
    if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
    if (src.startsWith("data:") || src.endsWith(".svg")) return;

    if (src.includes("/_next/image")) {
      try {
        const realUrl = new URL(src, pageUrl).searchParams.get("url");
        if (realUrl) {
          best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
          return false;
        }
      } catch {
        // Unparseable optimiser URL: fall through and use `src` as is.
      }
    }

    best = src;
    // Returning false stops the iteration at the first accepted image.
    return false;
  });
  return best;
}
344
+
345
+ // src/WebRAGPlugin.ts
346
/**
 * Find the URL a bulk operation refers to, for progress reporting.
 *
 * Looks at `op.document.metadata.url` first and `.source` second, and
 * returns the first one that is a non-blank string, trimmed.
 *
 * @param {object} op - Bulk operation.
 * @returns {string|undefined}
 */
function bulkOpCurrentUrl(op) {
  const meta = op.document?.metadata;
  for (const key of ["url", "source"]) {
    const candidate = meta?.[key];
    if (typeof candidate === "string" && candidate.trim()) {
      return candidate.trim();
    }
  }
  return void 0;
}
352
/**
 * Tell whether an insert document is a "URL listing": its metadata has
 * `type: "url"` and a `url` that parses as an http(s) URL.
 *
 * @param {object} document - Document from a bulk insert operation.
 * @returns {boolean}
 */
function isUrlListingInsert(document) {
  const meta = document.metadata;
  if (meta?.type !== "url" || typeof meta.url !== "string") return false;

  const candidate = meta.url.trim();
  if (!candidate) return false;

  let protocol;
  try {
    ({ protocol } = new URL(candidate));
  } catch {
    // Not parseable as an absolute URL.
    return false;
  }
  return ["http:", "https:"].includes(protocol);
}
364
+ var WebRAGPlugin = class {
44
365
  name = "web-rag";
45
366
  type = "rag";
46
367
  priority;
@@ -78,6 +399,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
78
399
  }
79
400
  return this.db.collection(this.config.collection);
80
401
  }
402
+ ledgerIndexesEnsured = false;
81
403
  async getLedgerCollection() {
82
404
  if (!this.client) {
83
405
  this.client = new import_mongodb.MongoClient(this.config.mongoUri);
@@ -85,7 +407,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
85
407
  this.db = this.client.db(this.config.dbName);
86
408
  }
87
409
  const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
88
- return this.db.collection(name);
410
+ const col = this.db.collection(name);
411
+ if (!this.ledgerIndexesEnsured) {
412
+ this.ledgerIndexesEnsured = true;
413
+ await col.createIndex(
414
+ { tenantId: 1, agentId: 1, urlNormalized: 1 },
415
+ { unique: true }
416
+ );
417
+ await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
418
+ }
419
+ return col;
89
420
  }
90
421
  /**
91
422
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -95,6 +426,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
95
426
  const filter = { tenantId: this.config.tenantId };
96
427
  filter.agentId = options.agentId ?? "shared";
97
428
  if (options.domain) filter.domain = options.domain;
429
+ if (options.ingestionId) filter.ingestionId = options.ingestionId;
98
430
  if (options.status) filter.lastStatus = options.status;
99
431
  const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
100
432
  const skip = Math.max(options.skip ?? 0, 0);
@@ -163,6 +495,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
163
495
  lastCrawledAt: now,
164
496
  updatedAt: now
165
497
  };
498
+ if (params.ingestionId) {
499
+ $set.ingestionId = params.ingestionId;
500
+ }
166
501
  if (errMsg !== void 0) {
167
502
  $set.errorMessage = errMsg;
168
503
  } else if (params.status === "indexed" && params.doc) {
@@ -175,9 +510,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
175
510
  $set.docId = params.doc.id;
176
511
  } else {
177
512
  $set.modeUsed = params.diag?.modeUsed;
178
- $set.contentLength = null;
179
- $set.title = null;
180
- $set.docId = null;
513
+ $set.contentLength = params.contentLength ?? null;
514
+ $set.title = params.title ?? null;
515
+ $set.docId = params.docId ?? null;
181
516
  }
182
517
  await col.updateOne(
183
518
  {
@@ -256,6 +591,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
256
591
  url: doc.metadata.url,
257
592
  imageUrl: doc.metadata.imageUrl,
258
593
  description: doc.metadata.description,
594
+ ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
595
+ ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
596
+ ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
259
597
  score: doc.score
260
598
  }))
261
599
  }
@@ -421,9 +759,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
421
759
  let indexed = 0;
422
760
  const errors = [];
423
761
  const agentId = options?.agentId || "shared";
424
- for (const doc of documents) {
762
+ const onCrawlProgress = options?.metadata?.onCrawlProgress;
763
+ const indexingTotal = documents.length;
764
+ const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
765
+ const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
766
+ let chunksProcessed = 0;
767
+ if (onCrawlProgress && indexingTotal > 0) {
768
+ this.emitCrawlProgress(
769
+ { metadata: options?.metadata },
770
+ {
771
+ phase: "indexing",
772
+ urlsScheduled: indexingTotal,
773
+ pagesProcessed: 0,
774
+ chunksTotal,
775
+ chunksProcessed: 0
776
+ }
777
+ );
778
+ }
779
+ for (let docIndex = 0; docIndex < documents.length; docIndex++) {
780
+ const doc = documents[docIndex];
781
+ const chunks = chunkPlan[docIndex];
425
782
  try {
426
- const chunks = this.chunkContent(doc.content);
427
783
  const isChunked = chunks.length > 1;
428
784
  if (isChunked) {
429
785
  await collection.deleteMany({
@@ -458,6 +814,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
458
814
  },
459
815
  { upsert: true }
460
816
  );
817
+ chunksProcessed++;
818
+ if (onCrawlProgress) {
819
+ this.emitCrawlProgress(
820
+ { metadata: options?.metadata },
821
+ {
822
+ phase: "indexing",
823
+ urlsScheduled: indexingTotal,
824
+ pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
825
+ chunksTotal,
826
+ chunksProcessed
827
+ }
828
+ );
829
+ }
461
830
  }
462
831
  indexed++;
463
832
  } catch (error) {
@@ -537,23 +906,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
537
906
  let deleted = 0;
538
907
  let failed = 0;
539
908
  const errors = [];
909
+ const opsTotal = operations.length;
910
+ let opsDone = 0;
911
+ const ingestOptions = options ?? {};
912
+ this.emitBulkProgress(ingestOptions, {
913
+ phase: "processing",
914
+ opsTotal,
915
+ opsDone: 0
916
+ });
540
917
  for (const op of operations) {
918
+ const currentUrl = bulkOpCurrentUrl(op);
541
919
  try {
542
920
  switch (op.type) {
543
921
  case "insert":
544
922
  if (op.document) {
545
- await this.ingest([op.document], options);
546
- inserted++;
923
+ if (isUrlListingInsert(op.document)) {
924
+ const url = bulkOpCurrentUrl(op);
925
+ const crawlResult = await this.ingestSinglePageFromUrl(
926
+ {
927
+ url,
928
+ metadata: {
929
+ ...op.document.metadata ?? {},
930
+ url
931
+ }
932
+ },
933
+ ingestOptions
934
+ );
935
+ if (crawlResult.indexed > 0) {
936
+ inserted++;
937
+ } else {
938
+ failed++;
939
+ const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
940
+ errors.push({
941
+ id: op.id,
942
+ operation: op.type,
943
+ error: err
944
+ });
945
+ }
946
+ } else {
947
+ await this.ingest([op.document], ingestOptions);
948
+ inserted++;
949
+ }
547
950
  }
548
951
  break;
549
952
  case "update":
550
953
  if (op.document) {
551
- await this.update(op.id, op.document, options);
954
+ await this.update(op.id, op.document, ingestOptions);
552
955
  updated++;
553
956
  }
554
957
  break;
555
958
  case "delete":
556
- const count = await this.delete(op.id, options);
959
+ const count = await this.delete(op.id, ingestOptions);
557
960
  deleted += count;
558
961
  break;
559
962
  }
@@ -564,6 +967,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
564
967
  operation: op.type,
565
968
  error: error.message || "Unknown error"
566
969
  });
970
+ } finally {
971
+ opsDone++;
972
+ this.emitBulkProgress(ingestOptions, {
973
+ phase: "processing",
974
+ opsTotal,
975
+ opsDone,
976
+ currentOpType: op.type,
977
+ ...currentUrl ? { currentUrl } : {}
978
+ });
567
979
  }
568
980
  }
569
981
  return {
@@ -1130,6 +1542,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1130
1542
  };
1131
1543
  }
1132
1544
  const dbg = this.createDebugCollector(config.debug);
1545
+ this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
1133
1546
  const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
1134
1547
  if (!base) {
1135
1548
  return {
@@ -1161,6 +1574,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
1161
1574
  if (config.excludePatterns?.length) {
1162
1575
  filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
1163
1576
  }
1577
+ this.emitCrawlProgress(config, {
1578
+ phase: "discovering",
1579
+ urlsDiscovered: filteredUrls.length
1580
+ });
1164
1581
  urlsToCrawl = filteredUrls.slice(0, maxPages);
1165
1582
  urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
1166
1583
  break;
@@ -1182,7 +1599,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
1182
1599
  urlsToCrawl = discovery.urls;
1183
1600
  urlsSkipped = discovery.skipped;
1184
1601
  dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
1602
+ this.emitCrawlProgress(config, {
1603
+ phase: "discovering",
1604
+ urlsDiscovered: urlsToCrawl.length
1605
+ });
1185
1606
  }
1607
+ this.emitCrawlProgress(config, {
1608
+ phase: "crawling",
1609
+ urlsDiscovered: urlsToCrawl.length,
1610
+ urlsScheduled: urlsToCrawl.length
1611
+ });
1186
1612
  const result = await this.crawlUrls(urlsToCrawl, {
1187
1613
  contentSelector: config.contentSelector,
1188
1614
  titleSelector: config.titleSelector,
@@ -1204,9 +1630,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1204
1630
  return {
1205
1631
  ...result,
1206
1632
  urlsSkipped,
1633
+ /** URLs selected for this crawl (≤ maxPages); use as the denominator in progress UIs. */
1634
+ urlsScheduled: urlsToCrawl.length,
1207
1635
  crawledAt: /* @__PURE__ */ new Date(),
1208
1636
  metadata: {
1209
1637
  ...result.metadata || {},
1638
+ urlsScheduled: urlsToCrawl.length,
1210
1639
  discoveryDebug: dbg.summary()
1211
1640
  }
1212
1641
  };
@@ -1335,7 +1764,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1335
1764
  return await response.text();
1336
1765
  }
1337
1766
  extractInternalLinks(html, base, stripQueryParams) {
1338
- const $ = cheerio.load(html);
1767
+ const $ = cheerio3.load(html);
1339
1768
  const links = /* @__PURE__ */ new Set();
1340
1769
  $("a[href]").each((_, el) => {
1341
1770
  const href = ($(el).attr("href") || "").trim();
@@ -1434,6 +1863,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1434
1863
  const forceRecrawl = !!(options && options.forceRecrawl);
1435
1864
  const agentId = options?.agentId ?? "shared";
1436
1865
  const stripQ = config.stripQueryParams ?? false;
1866
+ const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
1437
1867
  const urlByNorm = /* @__PURE__ */ new Map();
1438
1868
  for (const u of urls) {
1439
1869
  const norm = this.normalizeLedgerUrl(u, stripQ) || u;
@@ -1462,6 +1892,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1462
1892
  const results = await Promise.allSettled(
1463
1893
  batch.map(async (url) => {
1464
1894
  const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
1895
+ this.emitCrawlPage(config, { url, event: "start" });
1465
1896
  if (ledgerOpts && !forceRecrawl) {
1466
1897
  const entry = await this.findLedgerEntry(urlNormalized, agentId);
1467
1898
  if (this.shouldSkipLedger(
@@ -1482,11 +1913,24 @@ var WebRAGPlugin = class _WebRAGPlugin {
1482
1913
  docId: entry?.docId
1483
1914
  });
1484
1915
  dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
1916
+ if (ledgerOpts) {
1917
+ await this.upsertLedgerRecord({
1918
+ url,
1919
+ urlNormalized,
1920
+ agentId,
1921
+ ingestionId,
1922
+ status: "skipped_ledger",
1923
+ title: entry?.title,
1924
+ docId: entry?.docId,
1925
+ contentLength: entry?.contentLength
1926
+ });
1927
+ }
1928
+ this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
1485
1929
  return { kind: "ledger_skip", url };
1486
1930
  }
1487
1931
  }
1488
1932
  try {
1489
- const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
1933
+ const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
1490
1934
  renderMode,
1491
1935
  renderOptions,
1492
1936
  minContentLength,
@@ -1505,6 +1949,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1505
1949
  url,
1506
1950
  urlNormalized,
1507
1951
  agentId,
1952
+ ingestionId,
1508
1953
  status: crawlSt,
1509
1954
  doc,
1510
1955
  diag
@@ -1516,11 +1961,17 @@ var WebRAGPlugin = class _WebRAGPlugin {
1516
1961
  status: crawlSt,
1517
1962
  modeUsed: diag?.modeUsed,
1518
1963
  contentLength: doc?.content?.length,
1519
- bodyTextLengthHint,
1964
+ bodyTextLengthHint: bodyTextLengthHint2,
1520
1965
  title: doc?.metadata?.title,
1521
1966
  docId: doc?.id,
1522
1967
  error: diag?.errorMessage
1523
1968
  });
1969
+ this.emitCrawlPage(config, {
1970
+ url,
1971
+ event: "done",
1972
+ status: crawlSt,
1973
+ error: diag?.errorMessage
1974
+ });
1524
1975
  return { kind: "doc", doc, url };
1525
1976
  } catch (error) {
1526
1977
  const msg = error instanceof Error ? error.message : String(error);
@@ -1529,6 +1980,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1529
1980
  url,
1530
1981
  urlNormalized,
1531
1982
  agentId,
1983
+ ingestionId,
1532
1984
  status: "error",
1533
1985
  errorMessage: msg
1534
1986
  });
@@ -1539,6 +1991,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1539
1991
  status: "error",
1540
1992
  error: msg
1541
1993
  });
1994
+ this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
1542
1995
  throw { url, error };
1543
1996
  }
1544
1997
  })
@@ -1561,12 +2014,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
1561
2014
  });
1562
2015
  }
1563
2016
  }
2017
+ this.emitCrawlProgress(config, {
2018
+ phase: "crawling",
2019
+ urlsScheduled: uniqueUrls.length,
2020
+ pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
2021
+ });
1564
2022
  if (i + concurrency < uniqueUrls.length) {
1565
2023
  await this.delay(delayMs);
1566
2024
  }
1567
2025
  }
1568
2026
  if (documents.length > 0) {
1569
- const ingestResult = await this.ingest(documents, options);
2027
+ const ingestResult = await this.ingest(documents, {
2028
+ ...options,
2029
+ metadata: {
2030
+ ...options?.metadata ?? {},
2031
+ onCrawlProgress: config.metadata?.onCrawlProgress
2032
+ }
2033
+ });
1570
2034
  indexed = ingestResult.indexed;
1571
2035
  if (ingestResult.errors) {
1572
2036
  errors.push(...ingestResult.errors);
@@ -1609,125 +2073,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
1609
2073
  const html = await response.text();
1610
2074
  return this.extractDocumentFromHtml(url, html, config);
1611
2075
  }
1612
- /**
1613
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
1614
- * would otherwise hit an empty wrapper.
1615
- */
1616
- static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
1617
- stripNoiseFromDom($, config) {
1618
- const removeSelectors = config.removeSelectors || [
1619
- "script",
1620
- "style",
1621
- "nav",
1622
- "header",
1623
- "footer",
1624
- ".sidebar",
1625
- ".navigation",
1626
- ".menu",
1627
- ".comments",
1628
- '[role="navigation"]',
1629
- '[role="banner"]'
1630
- ];
1631
- removeSelectors.forEach((selector) => $(selector).remove());
1632
- }
1633
- /** Longest cleaned text among selector matches and full body (after noise strip). */
1634
- extractBestContentText($, config) {
1635
- const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
1636
- const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
1637
- let best = "";
1638
- for (const sel of selectors) {
1639
- $(sel).each((_, el) => {
1640
- const t = this.cleanContent($(el).text().trim());
1641
- if (t.length > best.length) best = t;
1642
- });
1643
- }
1644
- const bodyText = this.cleanContent($("body").text().trim());
1645
- if (bodyText.length > best.length) best = bodyText;
1646
- return best;
1647
- }
1648
2076
  bodyTextLengthHint(html, config) {
1649
- const $ = cheerio.load(html);
1650
- this.stripNoiseFromDom($, config);
1651
- return this.cleanContent($("body").text().trim()).length;
2077
+ return bodyTextLengthHint(html, config);
1652
2078
  }
1653
2079
  extractDocumentFromHtml(url, html, config) {
1654
- const $ = cheerio.load(html);
1655
- this.stripNoiseFromDom($, config);
1656
- const titleSelector = config.titleSelector || "h1, title";
1657
- let title = $(titleSelector).first().text().trim();
1658
- if (!title) {
1659
- title = $("title").text().trim();
1660
- }
1661
- const content = this.extractBestContentText($, config);
1662
- const minChars = config.minExtractedContentLength ?? 50;
1663
- if (!content || content.length < minChars) return null;
1664
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1665
- this.extractHeroImage($, url) || void 0;
1666
- let imageUrl;
1667
- if (image) {
1668
- try {
1669
- imageUrl = new URL(image, url).href;
1670
- } catch {
1671
- imageUrl = image;
1672
- }
1673
- }
1674
- const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
1675
- let type = config.defaultType || "page";
1676
- if (config.typeFromUrl) {
1677
- for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
1678
- if (url.includes(pattern)) {
1679
- type = typeName;
1680
- break;
1681
- }
1682
- }
1683
- }
1684
- const id = this.urlToId(url);
2080
+ const extracted = extractPageFromHtml(url, html, config);
2081
+ if (!extracted.indexable) return null;
1685
2082
  return {
1686
- id,
1687
- content,
1688
- metadata: {
1689
- type,
1690
- title,
1691
- url,
1692
- ...imageUrl ? { imageUrl } : {},
1693
- ...description ? { description } : {},
1694
- ...config.metadata
1695
- }
2083
+ id: extracted.id,
2084
+ content: extracted.content,
2085
+ metadata: extracted.metadata
1696
2086
  };
1697
2087
  }
1698
- /**
1699
- * Fallback image extraction: finds the first meaningful image in the content area.
1700
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
1701
- */
1702
- extractHeroImage($, pageUrl) {
1703
- const containers = $('main, article, [role="main"], #content, .content');
1704
- const scope = containers.length > 0 ? containers : $("body");
1705
- let best;
1706
- scope.find("img[src]").each((_, el) => {
1707
- if (best) return false;
1708
- const src = $(el).attr("src") || "";
1709
- const alt = ($(el).attr("alt") || "").toLowerCase();
1710
- const width = parseInt($(el).attr("width") || "0", 10);
1711
- const height = parseInt($(el).attr("height") || "0", 10);
1712
- if (width > 0 && width < 80 || height > 0 && height < 80) return;
1713
- if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1714
- if (src.startsWith("data:") || src.endsWith(".svg")) return;
1715
- if (src.includes("/_next/image")) {
1716
- try {
1717
- const nextUrl = new URL(src, pageUrl);
1718
- const realUrl = nextUrl.searchParams.get("url");
1719
- if (realUrl) {
1720
- best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1721
- return false;
1722
- }
1723
- } catch {
1724
- }
1725
- }
1726
- best = src;
1727
- return false;
1728
- });
1729
- return best;
1730
- }
1731
2088
  looksLikeDynamicShell(html) {
1732
2089
  const lower = html.toLowerCase();
1733
2090
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
@@ -1745,7 +2102,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1745
2102
  const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
1746
2103
  return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
1747
2104
  }
1748
- diagFromRenderedAttempt(doc, bodyTextLengthHint, renderFailure, blockedSuspected, modeOk, modeFailed) {
2105
+ diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
1749
2106
  if (blockedSuspected) {
1750
2107
  return {
1751
2108
  doc: null,
@@ -1761,12 +2118,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1761
2118
  return {
1762
2119
  doc,
1763
2120
  diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
1764
- bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint
2121
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
1765
2122
  };
1766
2123
  }
1767
2124
  async crawlPageSmart(url, config, timeout, ctx) {
1768
2125
  if (ctx.renderMode === true) {
1769
- const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
2126
+ const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
1770
2127
  url,
1771
2128
  config,
1772
2129
  timeout,
@@ -1775,7 +2132,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1775
2132
  );
1776
2133
  return this.diagFromRenderedAttempt(
1777
2134
  doc,
1778
- bodyTextLengthHint,
2135
+ bodyTextLengthHint2,
1779
2136
  renderFailure,
1780
2137
  blockedSuspected,
1781
2138
  "render_ok",
@@ -1892,7 +2249,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1892
2249
  }
1893
2250
  }
1894
2251
  const html = await page.content();
1895
- const bodyTextLengthHint = this.bodyTextLengthHint(html, config);
2252
+ const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
1896
2253
  const doc = this.extractDocumentFromHtml(url, html, config);
1897
2254
  if (config.debug?.saveDir && config.debug?.enabled) {
1898
2255
  try {
@@ -1907,7 +2264,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1907
2264
  dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
1908
2265
  }
1909
2266
  }
1910
- return { doc, bodyTextLengthHint };
2267
+ return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
1911
2268
  } catch (e) {
1912
2269
  const msg = String(e?.message || e || "render_failed");
1913
2270
  const lower = msg.toLowerCase();
@@ -1957,6 +2314,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
1957
2314
  }
1958
2315
  return Array.from(found);
1959
2316
  }
2317
+ emitBulkProgress(options, update) {
2318
+ const fn = options?.metadata?.onBulkProgress;
2319
+ if (!fn) return;
2320
+ try {
2321
+ fn(update);
2322
+ } catch {
2323
+ }
2324
+ }
2325
+ emitCrawlProgress(config, update) {
2326
+ const fn = config.metadata?.onCrawlProgress;
2327
+ if (!fn) return;
2328
+ try {
2329
+ fn(update);
2330
+ } catch {
2331
+ }
2332
+ }
2333
+ emitCrawlPage(config, event) {
2334
+ const fn = config.metadata?.onCrawlPage;
2335
+ if (!fn) return;
2336
+ try {
2337
+ fn(event);
2338
+ } catch {
2339
+ }
2340
+ }
1960
2341
  createDebugCollector(debug) {
1961
2342
  const enabled = !!debug?.enabled;
1962
2343
  const level = debug?.level || "summary";
@@ -1975,14 +2356,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
1975
2356
  /**
1976
2357
  * Clean extracted text content
1977
2358
  */
1978
- cleanContent(text) {
1979
- return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
1980
- }
1981
- /**
1982
- * Convert URL to a stable document ID
1983
- */
1984
2359
  urlToId(url) {
1985
- return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
2360
+ return urlToDocumentId(url);
1986
2361
  }
1987
2362
  /**
1988
2363
  * Delay helper
@@ -2245,11 +2620,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
2245
2620
  filterableFields: this.config.filterableFields,
2246
2621
  typeBoosts: this.config.typeBoosts,
2247
2622
  recencyBoost: this.config.recencyBoost,
2623
+ crawlLedger: this.config.crawlLedger,
2248
2624
  priority: this.priority
2249
2625
  };
2250
2626
  }
2251
2627
  };
2252
2628
  // Annotate the CommonJS export names for ESM import in node:
2253
2629
  0 && (module.exports = {
2254
- WebRAGPlugin
2630
+ WebRAGPlugin,
2631
+ bodyTextLengthHint,
2632
+ extractPageFromHtml,
2633
+ extractProductMetadata,
2634
+ normalizeAvailability,
2635
+ normalizeCurrency,
2636
+ parsePrice,
2637
+ urlToDocumentId
2255
2638
  });