@snap-agent/rag-web 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,10 +1,324 @@
1
1
  // src/WebRAGPlugin.ts
2
2
  import { MongoClient } from "mongodb";
3
3
  import OpenAI from "openai";
4
- import * as cheerio from "cheerio";
4
+ import * as cheerio3 from "cheerio";
5
5
  import * as fs from "fs";
6
6
  import * as path from "path";
7
- var WebRAGPlugin = class _WebRAGPlugin {
7
+
8
+ // src/htmlPageExtract.ts
9
+ import * as cheerio2 from "cheerio";
10
+
11
+ // src/productMetadata.ts
12
+ import * as cheerio from "cheerio";
13
/**
 * Pulls price, currency and availability out of a page's HTML.
 *
 * Three sources are read (JSON-LD, Open Graph meta tags, microdata) and, for
 * each field, the first source that supplies a non-nullish value wins, in
 * that order. Fields that no source provides are omitted from the result.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractProductMetadata(html) {
  const $ = cheerio.load(html);
  // Order matters: earlier sources take precedence over later ones.
  const sources = [extractFromJsonLd($), extractFromOpenGraph($), extractFromMicrodata($)];
  const firstDefined = (field) => {
    for (const source of sources) {
      if (source[field] != null) return source[field];
    }
    return void 0;
  };

  const merged = {};
  const price = firstDefined("price");
  if (price != null) {
    merged.price = price;
  }
  const currency = firstDefined("currency");
  if (currency) {
    merged.currency = currency;
  }
  const availability = firstDefined("availability");
  if (availability) {
    merged.availability = availability;
  }
  return merged;
}
27
/**
 * Reads product offer data from `<script type="application/ld+json">` blocks.
 *
 * Scripts are scanned in document order; scanning stops once price, currency
 * and availability have all been found. Scripts that are empty or contain
 * invalid JSON are skipped. Only nodes typed as a product contribute, and
 * only through the offer selected by `pickOffer`.
 *
 * @param {Function} $ - Loaded cheerio instance for the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromJsonLd($) {
  const found = {};
  const isComplete = () => found.price != null && found.currency && found.availability;

  $('script[type="application/ld+json"]').each((_, scriptEl) => {
    if (isComplete()) return false;

    const text = $(scriptEl).html()?.trim();
    if (!text) return void 0;

    let data;
    try {
      data = JSON.parse(text);
    } catch {
      // Malformed JSON-LD is common in the wild; ignore this block.
      return void 0;
    }

    const offers = collectJsonLdNodes(data)
      .filter((node) => isProductType(node))
      .map((node) => pickOffer(node))
      .filter((offer) => Boolean(offer));

    for (const offer of offers) {
      if (found.price == null) {
        const price = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
        if (price != null) found.price = price;
      }
      if (!found.currency) {
        const currency = normalizeCurrency(offer.priceCurrency);
        if (currency) found.currency = currency;
      }
      if (!found.availability) {
        const availability = normalizeAvailability(offer.availability);
        if (availability) found.availability = availability;
      }
    }
    return void 0;
  });

  return found;
}
59
/**
 * Reads product data from Open Graph style `<meta property="...">` tags.
 *
 * For each field the `product:*` property is tried first and the `og:*`
 * property is used when the former is missing or empty.
 *
 * @param {Function} $ - Loaded cheerio instance for the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromOpenGraph($) {
  // Returns the content of the first property that yields a truthy value.
  const readMeta = (preferred, fallback) =>
    $(`meta[property="${preferred}"]`).attr("content") ||
    $(`meta[property="${fallback}"]`).attr("content");

  const found = {};

  const price = parsePrice(readMeta("product:price:amount", "og:price:amount"));
  if (price != null) {
    found.price = price;
  }

  const currency = normalizeCurrency(readMeta("product:price:currency", "og:price:currency"));
  if (currency) {
    found.currency = currency;
  }

  const availability = normalizeAvailability(readMeta("product:availability", "og:availability"));
  if (availability) {
    found.availability = availability;
  }

  return found;
}
74
/**
 * Locates the first element carrying the given `itemprop`.
 *
 * When the page has an element whose `itemtype` mentions schema.org Product,
 * the lookup is confined to the first such element; otherwise the whole
 * document is searched.
 *
 * @param {Function} $ - Loaded cheerio instance for the page.
 * @param {string} itemprop - Microdata property name to look for.
 * @returns {object} Cheerio selection (possibly empty).
 */
function microdataField($, itemprop) {
  const productScope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
  const propSelector = `[itemprop="${itemprop}"]`;
  if (productScope.length > 0) {
    return productScope.find(propSelector).first();
  }
  return $(propSelector).first();
}
78
/**
 * Reads product data from schema.org microdata (`itemprop` attributes).
 *
 * Each value is taken from the element's `content` attribute when present,
 * otherwise from its text; availability additionally considers `href`
 * before falling back to text.
 *
 * @param {Function} $ - Loaded cheerio instance for the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromMicrodata($) {
  const found = {};

  const priceNode = microdataField($, "price");
  const price = parsePrice(priceNode.attr("content") || priceNode.text());
  if (price != null) {
    found.price = price;
  }

  const currencyNode = microdataField($, "priceCurrency");
  const currency = normalizeCurrency(currencyNode.attr("content") || currencyNode.text());
  if (currency) {
    found.currency = currency;
  }

  const availabilityNode = microdataField($, "availability");
  const rawAvailability =
    availabilityNode.attr("content") || availabilityNode.attr("href") || availabilityNode.text();
  const availability = normalizeAvailability(rawAvailability);
  if (availability) {
    found.availability = availability;
  }

  return found;
}
93
/**
 * Flattens a parsed JSON-LD payload into a list of object nodes.
 *
 * Arrays are expanded in order, every plain object is collected, and an
 * object's `@graph` member (when truthy) is expanded right after the object
 * itself. Other nested properties are not descended into. Primitives and
 * null/undefined are ignored.
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Collected nodes in pre-order.
 */
function collectJsonLdNodes(data) {
  const collected = [];
  // Explicit stack; items are pushed in reverse so they pop in document order.
  const pending = [data];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current == null || typeof current !== "object") {
      continue;
    }
    if (Array.isArray(current)) {
      for (let idx = current.length - 1; idx >= 0; idx--) {
        pending.push(current[idx]);
      }
      continue;
    }
    collected.push(current);
    if (current["@graph"]) {
      pending.push(current["@graph"]);
    }
  }
  return collected;
}
109
/**
 * Tells whether a JSON-LD node declares itself as a product.
 *
 * `@type` may be a single value or an array. A value matches when, compared
 * case-insensitively, it equals "product" or ends with "/product".
 *
 * @param {object} node - JSON-LD node.
 * @returns {boolean}
 */
function isProductType(node) {
  const declared = node["@type"];
  let candidates;
  if (Array.isArray(declared)) {
    candidates = declared;
  } else if (declared != null) {
    candidates = [declared];
  } else {
    candidates = [];
  }
  for (const candidate of candidates) {
    const lowered = String(candidate).toLowerCase();
    if (lowered === "product" || lowered.endsWith("/product")) {
      return true;
    }
  }
  return false;
}
117
/**
 * Selects the offer to read pricing from on a product node.
 *
 * A single offer object is returned as-is; for an array the first entry that
 * is a non-null object is returned. Anything else yields `null`.
 *
 * @param {object} product - JSON-LD product node.
 * @returns {object|null}
 */
function pickOffer(product) {
  const { offers } = product;
  if (offers == null || typeof offers !== "object") {
    return null;
  }
  if (!Array.isArray(offers)) {
    return offers;
  }
  for (const candidate of offers) {
    if (candidate && typeof candidate === "object") {
      return candidate;
    }
  }
  return null;
}
127
/**
 * Converts a loosely formatted price into a number.
 *
 * Accepts finite numbers as-is and strings such as "$1,234.56",
 * "1.234,56 €", "12,5", "1.234.567" or "Rs. 500". When both "," and "." are
 * present, whichever appears last is treated as the decimal separator. A
 * lone comma is a decimal separator only when followed by at most two
 * digits; repeated dots without a comma are treated as grouping separators.
 *
 * @param {*} value - Raw price (number, string, or anything stringifiable).
 * @returns {number|undefined} Parsed price, or `undefined` when no number
 *   can be extracted.
 */
function parsePrice(value) {
  if (value == null || value === "") return void 0;
  if (typeof value === "number" && Number.isFinite(value)) return value;
  let s = String(value).trim();
  if (!s) return void 0;
  // Punctuation that directly follows a letter belongs to a currency
  // abbreviation ("Rs. 500", "kr.199"), not to the number. Left in place it
  // would become a leading decimal point and turn 500 into 0.5.
  s = s.replace(/(?<=\p{L})[.,]/gu, "");
  s = s.replace(/[^\d.,\-]/g, "");
  if (!s || s === "-" || s === ".") return void 0;
  const lastComma = s.lastIndexOf(",");
  const lastDot = s.lastIndexOf(".");
  if (lastComma > -1 && lastDot > -1) {
    if (lastComma > lastDot) {
      // European style: dots group thousands, comma is the decimal mark.
      s = s.replace(/\./g, "").replace(",", ".");
    } else {
      s = s.replace(/,/g, "");
    }
  } else if (lastComma > -1) {
    const parts = s.split(",");
    if (parts.length === 2 && parts[1].length <= 2) {
      s = parts[0].replace(/\./g, "") + "." + parts[1];
    } else {
      s = s.replace(/,/g, "");
    }
  } else if (lastDot > -1 && s.indexOf(".") !== lastDot) {
    // Several dots and no comma ("1.234.567"): a number has at most one
    // decimal point, so these can only be grouping separators.
    s = s.replace(/\./g, "");
  }
  const num = parseFloat(s);
  return Number.isFinite(num) ? num : void 0;
}
153
/**
 * Normalizes a currency value to an upper-case code or short symbol.
 *
 * A standalone three-letter token ("USD" in "price in usd") is preferred.
 * If none exists, the first run of three letters is used, so "EURO" still
 * yields "EUR". Input without any such run is returned upper-cased when it
 * is at most four characters long (e.g. "$", "US$"); longer input yields
 * `undefined`.
 *
 * @param {*} value - Raw currency text.
 * @returns {string|undefined}
 */
function normalizeCurrency(value) {
  if (value == null) return void 0;
  const s = String(value).trim().toUpperCase();
  if (!s) return void 0;
  // Try a whole-word code first: an unanchored match would return the first
  // three letters of any longer word ("PRICE IN USD" -> "PRI").
  const iso = s.match(/\b[A-Z]{3}\b/) ?? s.match(/[A-Z]{3}/);
  if (iso) return iso[0];
  return s.length <= 4 ? s : void 0;
}
160
/**
 * Reduces an availability value to a compact token.
 *
 * schema.org URLs such as "https://schema.org/InStock" (with or without a
 * trailing slash) become "InStock"; for any slash-separated value the last
 * non-empty segment is used. Whitespace is removed, so "In Stock" becomes
 * "InStock". Letter case is preserved.
 *
 * @param {*} value - Raw availability text or URL.
 * @returns {string|undefined} Token, or `undefined` when nothing usable
 *   remains.
 */
function normalizeAvailability(value) {
  if (value == null) return void 0;
  const s = String(value).trim();
  if (!s) return void 0;
  // Drop "scheme://host/" so a bare host never ends up as the token when a
  // path follows it.
  const path = s.replace(/^https?:\/\/[^/]+\//, "");
  // Empty segments (from trailing or doubled slashes) are discarded so that
  // "…/InStock/" resolves to "InStock" rather than "InStock/".
  const segments = path
    .split("/")
    .map((segment) => segment.replace(/\s+/g, ""))
    .filter((segment) => segment.length > 0);
  return segments.length > 0 ? segments[segments.length - 1] : void 0;
}
175
+
176
+ // src/htmlPageExtract.ts
177
// Selectors tried when the caller supplies no `contentSelector`. The list
// includes generic landmarks plus class names that look specific to
// WordPress / Elementor / Astra themes.
var DEFAULT_CONTENT_SELECTOR = [
  "article",
  "main",
  '[role="main"]',
  "#content",
  "#primary",
  "#main",
  ".content",
  ".post-content",
  ".entry-content",
  ".elementor-location-content",
  ".elementor-widget-theme-post-content",
  ".wp-block-group",
  ".site-content",
  ".ast-single-post",
  ".ast-page"
].join(", ");

// Elements removed from the DOM before text extraction when the caller
// supplies no `removeSelectors`.
var DEFAULT_REMOVE_SELECTORS = [
  // Non-content tags
  "script", "style", "nav", "header", "footer",
  // Common chrome classes
  ".sidebar", ".navigation", ".menu", ".comments",
  // ARIA landmarks
  '[role="navigation"]', '[role="banner"]'
];
191
/**
 * Derives a document id from a URL.
 *
 * The http(s) scheme is dropped, every non-alphanumeric character becomes a
 * hyphen, hyphen runs are collapsed, one leading and one trailing hyphen are
 * trimmed, and the result is cut to 100 characters.
 *
 * @param {string} url - Page URL.
 * @returns {string} Identifier of at most 100 characters.
 */
function urlToDocumentId(url) {
  const withoutScheme = url.replace(/^https?:\/\//, "");
  const hyphenated = withoutScheme.replace(/[^a-zA-Z0-9]/g, "-");
  const collapsed = hyphenated.replace(/-+/g, "-");
  const trimmed = collapsed.replace(/^-|-$/g, "");
  return trimmed.substring(0, 100);
}
194
/**
 * Collapses every run of whitespace (spaces, tabs, newlines) into a single
 * space and trims both ends.
 *
 * Paragraph breaks are NOT preserved: the first replacement already removes
 * all newlines and tabs. The former follow-up replacements for blank lines
 * and tabs could therefore never match and have been dropped; output is
 * unchanged.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Single-line, single-spaced text.
 */
function cleanContent(text) {
  return text.replace(/\s+/g, " ").trim();
}
197
/**
 * Measures how much cleaned text the `<body>` holds once noise elements
 * have been removed.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {object} [options] - Extraction options; passed to
 *   `stripNoiseFromDom`.
 * @returns {number} Character count of the cleaned body text.
 */
function bodyTextLengthHint(html, options = {}) {
  const $ = cheerio2.load(html);
  stripNoiseFromDom($, options);
  const rawBodyText = $("body").text().trim();
  const cleaned = cleanContent(rawBodyText);
  return cleaned.length;
}
202
/**
 * Builds an indexable document description from a page's HTML.
 *
 * Steps: strip noise elements, resolve the title, pick the longest content
 * candidate, find a representative image and description, derive the
 * document type from the URL, and merge in product metadata. Caller-supplied
 * `options.metadata` is applied last and overrides derived keys.
 *
 * @param {string} url - Page URL; also used to resolve relative image URLs.
 * @param {string} html - Raw HTML of the page.
 * @param {object} [options] - Extraction options (`titleSelector`,
 *   `minExtractedContentLength`, `defaultType`, `typeFromUrl`, `metadata`,
 *   plus those read by the helpers this function calls).
 * @returns {{id: string, metadata: object, content: string,
 *   indexable: boolean, contentPreview: string}} `indexable` is true when
 *   the content reaches `minExtractedContentLength` (default 50).
 */
function extractPageFromHtml(url, html, options = {}) {
  const $ = cheerio2.load(html);
  stripNoiseFromDom($, options);

  const metaContent = (selector) => $(selector).attr("content");

  let title = $(options.titleSelector || "h1, title").first().text().trim();
  if (!title) {
    title = $("title").text().trim();
  }

  const content = extractBestContentText($, options);
  const minChars = options.minExtractedContentLength ?? 50;
  const indexable = Boolean(content && content.length >= minChars);

  // Image candidates, most specific first; evaluated lazily so later (more
  // expensive) lookups only run when earlier ones yield nothing.
  const imageLookups = [
    () => metaContent('meta[property="og:image"]'),
    () => metaContent('meta[name="twitter:image"]'),
    () => metaContent('meta[property="product:image"]'),
    () => $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src"),
    () => extractHeroImage($, url)
  ];
  let image;
  for (const lookup of imageLookups) {
    image = lookup();
    if (image) break;
  }

  let imageUrl;
  if (image) {
    try {
      imageUrl = new URL(image, url).href;
    } catch {
      // Unresolvable reference: keep the raw value.
      imageUrl = image;
    }
  }

  const description =
    metaContent('meta[property="og:description"]') ||
    metaContent('meta[name="description"]') ||
    void 0;

  let type = options.defaultType || "page";
  if (options.typeFromUrl) {
    const hit = Object.entries(options.typeFromUrl).find(([pattern]) => url.includes(pattern));
    if (hit) {
      type = hit[1];
    }
  }

  const productMeta = extractProductMetadata(html);

  // Key insertion order is kept stable: type, title, url, imageUrl,
  // description, price, currency, availability, then caller overrides.
  const derived = { type };
  if (title) derived.title = title;
  derived.url = url;
  if (imageUrl) derived.imageUrl = imageUrl;
  if (description) derived.description = description;
  if (productMeta.price != null) derived.price = productMeta.price;
  if (productMeta.currency) derived.currency = productMeta.currency;
  if (productMeta.availability) derived.availability = productMeta.availability;
  const metadata = { ...derived, ...options.metadata };

  const previewLen = 400;
  let contentPreview = content;
  if (content.length > previewLen) {
    contentPreview = `${content.slice(0, previewLen)}\u2026`;
  }

  return {
    id: urlToDocumentId(url),
    metadata,
    content,
    indexable,
    contentPreview
  };
}
254
/**
 * Removes non-content elements from the loaded document, in place.
 *
 * @param {Function} $ - Loaded cheerio instance; mutated by this call.
 * @param {object} options - Extraction options; `removeSelectors` replaces
 *   the default list when it is not null/undefined.
 * @returns {void}
 */
function stripNoiseFromDom($, options) {
  const selectorsToDrop = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
  selectorsToDrop.forEach((selector) => {
    $(selector).remove();
  });
}
258
/**
 * Returns the longest cleaned text found among all elements matching the
 * content selectors and the full `<body>`.
 *
 * The selector list is split on commas and each selector is queried
 * separately. On equal length the earlier candidate is kept.
 *
 * @param {Function} $ - Loaded cheerio instance for the page.
 * @param {object} options - Extraction options; `contentSelector` replaces
 *   the default selector list when truthy.
 * @returns {string} Longest cleaned text, or "" when nothing has text.
 */
function extractBestContentText($, options) {
  const selectorList = (options.contentSelector || DEFAULT_CONTENT_SELECTOR)
    .split(",")
    .map((part) => part.trim())
    .filter(Boolean);

  let longest = "";
  const consider = (rawText) => {
    const cleaned = cleanContent(rawText.trim());
    if (cleaned.length > longest.length) {
      longest = cleaned;
    }
  };

  selectorList.forEach((selector) => {
    $(selector).each((_, el) => {
      consider($(el).text());
    });
  });
  // The whole body competes too, in case no selector captured the bulk.
  consider($("body").text());

  return longest;
}
272
/**
 * Fallback image lookup: returns the first plausible content image.
 *
 * Images are searched inside the main content containers when any exist,
 * otherwise inside `<body>`. An image is skipped when it has an empty
 * `src`, a declared width or height below 80, a `src`/`alt` hinting at a
 * logo, icon, avatar or loading indicator, a `data:` URI, or an SVG path
 * (query string, fragment and letter case are ignored for that check).
 * Next.js `/_next/image` proxy URLs are unwrapped to the underlying image
 * URL.
 *
 * @param {Function} $ - Loaded cheerio instance for the page.
 * @param {string} pageUrl - Page URL; base for resolving Next.js image URLs.
 * @returns {string|undefined} Image URL (may be relative), or `undefined`
 *   when no image qualifies.
 */
function extractHeroImage($, pageUrl) {
  const containers = $('main, article, [role="main"], #content, .content');
  const scope = containers.length > 0 ? containers : $("body");
  let best;
  scope.find("img[src]").each((_, el) => {
    const img = $(el);
    const src = img.attr("src") || "";
    // An empty src must not end the scan; previously it was accepted as the
    // result and hid every later image.
    if (!src) return;
    const alt = img.attr("alt") || "";
    const width = Number.parseInt(img.attr("width") || "0", 10);
    const height = Number.parseInt(img.attr("height") || "0", 10);
    if ((width > 0 && width < 80) || (height > 0 && height < 80)) return;
    if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(`${src} ${alt}`)) return;
    // Compare only the path part so "x.svg?ver=2" and "X.SVG" are caught.
    const pathOnly = src.split(/[?#]/, 1)[0].toLowerCase();
    if (src.startsWith("data:") || pathOnly.endsWith(".svg")) return;
    if (src.includes("/_next/image")) {
      try {
        const nextUrl = new URL(src, pageUrl);
        const realUrl = nextUrl.searchParams.get("url");
        if (realUrl) {
          best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
          return false;
        }
      } catch {
        // Unparseable proxy URL: fall through and use the raw src.
      }
    }
    best = src;
    // Returning false stops cheerio's iteration at the first accepted image.
    return false;
  });
  return best;
}
301
+
302
+ // src/WebRAGPlugin.ts
303
/**
 * Resolves the URL a bulk operation refers to, for progress reporting.
 *
 * `metadata.url` is preferred and `metadata.source` is the fallback; a
 * field counts only when it is a string that is non-empty after trimming.
 *
 * @param {object} op - Bulk operation; may carry `document.metadata`.
 * @returns {string|undefined} Trimmed URL, or `undefined` when absent.
 */
function bulkOpCurrentUrl(op) {
  const meta = op.document?.metadata;
  for (const field of ["url", "source"]) {
    const candidate = meta?.[field];
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return void 0;
}
309
/**
 * Tells whether an inserted document is a bare URL listing, i.e. its
 * metadata has `type === "url"` and a `url` that parses as an absolute
 * http or https URL.
 *
 * @param {object} document - Document from a bulk insert operation.
 * @returns {boolean}
 */
function isUrlListingInsert(document) {
  const { metadata } = document;
  if (!metadata || metadata.type !== "url") {
    return false;
  }
  if (typeof metadata.url !== "string") {
    return false;
  }
  const candidate = metadata.url.trim();
  if (candidate === "") {
    return false;
  }
  let protocol;
  try {
    protocol = new URL(candidate).protocol;
  } catch {
    // Not an absolute URL.
    return false;
  }
  return protocol === "http:" || protocol === "https:";
}
321
+ var WebRAGPlugin = class {
8
322
  name = "web-rag";
9
323
  type = "rag";
10
324
  priority;
@@ -42,6 +356,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
42
356
  }
43
357
  return this.db.collection(this.config.collection);
44
358
  }
359
+ ledgerIndexesEnsured = false;
45
360
  async getLedgerCollection() {
46
361
  if (!this.client) {
47
362
  this.client = new MongoClient(this.config.mongoUri);
@@ -49,7 +364,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
49
364
  this.db = this.client.db(this.config.dbName);
50
365
  }
51
366
  const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
52
- return this.db.collection(name);
367
+ const col = this.db.collection(name);
368
+ if (!this.ledgerIndexesEnsured) {
369
+ this.ledgerIndexesEnsured = true;
370
+ await col.createIndex(
371
+ { tenantId: 1, agentId: 1, urlNormalized: 1 },
372
+ { unique: true }
373
+ );
374
+ await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
375
+ }
376
+ return col;
53
377
  }
54
378
  /**
55
379
  * List recent crawl ledger rows (for dashboards / pagination in the front).
@@ -59,6 +383,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
59
383
  const filter = { tenantId: this.config.tenantId };
60
384
  filter.agentId = options.agentId ?? "shared";
61
385
  if (options.domain) filter.domain = options.domain;
386
+ if (options.ingestionId) filter.ingestionId = options.ingestionId;
62
387
  if (options.status) filter.lastStatus = options.status;
63
388
  const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
64
389
  const skip = Math.max(options.skip ?? 0, 0);
@@ -127,6 +452,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
127
452
  lastCrawledAt: now,
128
453
  updatedAt: now
129
454
  };
455
+ if (params.ingestionId) {
456
+ $set.ingestionId = params.ingestionId;
457
+ }
130
458
  if (errMsg !== void 0) {
131
459
  $set.errorMessage = errMsg;
132
460
  } else if (params.status === "indexed" && params.doc) {
@@ -139,9 +467,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
139
467
  $set.docId = params.doc.id;
140
468
  } else {
141
469
  $set.modeUsed = params.diag?.modeUsed;
142
- $set.contentLength = null;
143
- $set.title = null;
144
- $set.docId = null;
470
+ $set.contentLength = params.contentLength ?? null;
471
+ $set.title = params.title ?? null;
472
+ $set.docId = params.docId ?? null;
145
473
  }
146
474
  await col.updateOne(
147
475
  {
@@ -220,6 +548,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
220
548
  url: doc.metadata.url,
221
549
  imageUrl: doc.metadata.imageUrl,
222
550
  description: doc.metadata.description,
551
+ ...doc.metadata.price != null ? { price: doc.metadata.price } : {},
552
+ ...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
553
+ ...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
223
554
  score: doc.score
224
555
  }))
225
556
  }
@@ -385,9 +716,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
385
716
  let indexed = 0;
386
717
  const errors = [];
387
718
  const agentId = options?.agentId || "shared";
388
- for (const doc of documents) {
719
+ const onCrawlProgress = options?.metadata?.onCrawlProgress;
720
+ const indexingTotal = documents.length;
721
+ const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
722
+ const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
723
+ let chunksProcessed = 0;
724
+ if (onCrawlProgress && indexingTotal > 0) {
725
+ this.emitCrawlProgress(
726
+ { metadata: options?.metadata },
727
+ {
728
+ phase: "indexing",
729
+ urlsScheduled: indexingTotal,
730
+ pagesProcessed: 0,
731
+ chunksTotal,
732
+ chunksProcessed: 0
733
+ }
734
+ );
735
+ }
736
+ for (let docIndex = 0; docIndex < documents.length; docIndex++) {
737
+ const doc = documents[docIndex];
738
+ const chunks = chunkPlan[docIndex];
389
739
  try {
390
- const chunks = this.chunkContent(doc.content);
391
740
  const isChunked = chunks.length > 1;
392
741
  if (isChunked) {
393
742
  await collection.deleteMany({
@@ -422,6 +771,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
422
771
  },
423
772
  { upsert: true }
424
773
  );
774
+ chunksProcessed++;
775
+ if (onCrawlProgress) {
776
+ this.emitCrawlProgress(
777
+ { metadata: options?.metadata },
778
+ {
779
+ phase: "indexing",
780
+ urlsScheduled: indexingTotal,
781
+ pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
782
+ chunksTotal,
783
+ chunksProcessed
784
+ }
785
+ );
786
+ }
425
787
  }
426
788
  indexed++;
427
789
  } catch (error) {
@@ -501,23 +863,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
501
863
  let deleted = 0;
502
864
  let failed = 0;
503
865
  const errors = [];
866
+ const opsTotal = operations.length;
867
+ let opsDone = 0;
868
+ const ingestOptions = options ?? {};
869
+ this.emitBulkProgress(ingestOptions, {
870
+ phase: "processing",
871
+ opsTotal,
872
+ opsDone: 0
873
+ });
504
874
  for (const op of operations) {
875
+ const currentUrl = bulkOpCurrentUrl(op);
505
876
  try {
506
877
  switch (op.type) {
507
878
  case "insert":
508
879
  if (op.document) {
509
- await this.ingest([op.document], options);
510
- inserted++;
880
+ if (isUrlListingInsert(op.document)) {
881
+ const url = bulkOpCurrentUrl(op);
882
+ const crawlResult = await this.ingestSinglePageFromUrl(
883
+ {
884
+ url,
885
+ metadata: {
886
+ ...op.document.metadata ?? {},
887
+ url
888
+ }
889
+ },
890
+ ingestOptions
891
+ );
892
+ if (crawlResult.indexed > 0) {
893
+ inserted++;
894
+ } else {
895
+ failed++;
896
+ const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
897
+ errors.push({
898
+ id: op.id,
899
+ operation: op.type,
900
+ error: err
901
+ });
902
+ }
903
+ } else {
904
+ await this.ingest([op.document], ingestOptions);
905
+ inserted++;
906
+ }
511
907
  }
512
908
  break;
513
909
  case "update":
514
910
  if (op.document) {
515
- await this.update(op.id, op.document, options);
911
+ await this.update(op.id, op.document, ingestOptions);
516
912
  updated++;
517
913
  }
518
914
  break;
519
915
  case "delete":
520
- const count = await this.delete(op.id, options);
916
+ const count = await this.delete(op.id, ingestOptions);
521
917
  deleted += count;
522
918
  break;
523
919
  }
@@ -528,6 +924,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
528
924
  operation: op.type,
529
925
  error: error.message || "Unknown error"
530
926
  });
927
+ } finally {
928
+ opsDone++;
929
+ this.emitBulkProgress(ingestOptions, {
930
+ phase: "processing",
931
+ opsTotal,
932
+ opsDone,
933
+ currentOpType: op.type,
934
+ ...currentUrl ? { currentUrl } : {}
935
+ });
531
936
  }
532
937
  }
533
938
  return {
@@ -1094,6 +1499,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1094
1499
  };
1095
1500
  }
1096
1501
  const dbg = this.createDebugCollector(config.debug);
1502
+ this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
1097
1503
  const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
1098
1504
  if (!base) {
1099
1505
  return {
@@ -1125,6 +1531,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
1125
1531
  if (config.excludePatterns?.length) {
1126
1532
  filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
1127
1533
  }
1534
+ this.emitCrawlProgress(config, {
1535
+ phase: "discovering",
1536
+ urlsDiscovered: filteredUrls.length
1537
+ });
1128
1538
  urlsToCrawl = filteredUrls.slice(0, maxPages);
1129
1539
  urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
1130
1540
  break;
@@ -1146,7 +1556,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
1146
1556
  urlsToCrawl = discovery.urls;
1147
1557
  urlsSkipped = discovery.skipped;
1148
1558
  dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
1559
+ this.emitCrawlProgress(config, {
1560
+ phase: "discovering",
1561
+ urlsDiscovered: urlsToCrawl.length
1562
+ });
1149
1563
  }
1564
+ this.emitCrawlProgress(config, {
1565
+ phase: "crawling",
1566
+ urlsDiscovered: urlsToCrawl.length,
1567
+ urlsScheduled: urlsToCrawl.length
1568
+ });
1150
1569
  const result = await this.crawlUrls(urlsToCrawl, {
1151
1570
  contentSelector: config.contentSelector,
1152
1571
  titleSelector: config.titleSelector,
@@ -1168,9 +1587,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1168
1587
  return {
1169
1588
  ...result,
1170
1589
  urlsSkipped,
1590
+ /** URLs selected for this crawl (≤ maxPages); use as the denominator for progress UI. */
1591
+ urlsScheduled: urlsToCrawl.length,
1171
1592
  crawledAt: /* @__PURE__ */ new Date(),
1172
1593
  metadata: {
1173
1594
  ...result.metadata || {},
1595
+ urlsScheduled: urlsToCrawl.length,
1174
1596
  discoveryDebug: dbg.summary()
1175
1597
  }
1176
1598
  };
@@ -1299,7 +1721,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1299
1721
  return await response.text();
1300
1722
  }
1301
1723
  extractInternalLinks(html, base, stripQueryParams) {
1302
- const $ = cheerio.load(html);
1724
+ const $ = cheerio3.load(html);
1303
1725
  const links = /* @__PURE__ */ new Set();
1304
1726
  $("a[href]").each((_, el) => {
1305
1727
  const href = ($(el).attr("href") || "").trim();
@@ -1398,6 +1820,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1398
1820
  const forceRecrawl = !!(options && options.forceRecrawl);
1399
1821
  const agentId = options?.agentId ?? "shared";
1400
1822
  const stripQ = config.stripQueryParams ?? false;
1823
+ const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
1401
1824
  const urlByNorm = /* @__PURE__ */ new Map();
1402
1825
  for (const u of urls) {
1403
1826
  const norm = this.normalizeLedgerUrl(u, stripQ) || u;
@@ -1426,6 +1849,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1426
1849
  const results = await Promise.allSettled(
1427
1850
  batch.map(async (url) => {
1428
1851
  const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
1852
+ this.emitCrawlPage(config, { url, event: "start" });
1429
1853
  if (ledgerOpts && !forceRecrawl) {
1430
1854
  const entry = await this.findLedgerEntry(urlNormalized, agentId);
1431
1855
  if (this.shouldSkipLedger(
@@ -1446,11 +1870,24 @@ var WebRAGPlugin = class _WebRAGPlugin {
1446
1870
  docId: entry?.docId
1447
1871
  });
1448
1872
  dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
1873
+ if (ledgerOpts) {
1874
+ await this.upsertLedgerRecord({
1875
+ url,
1876
+ urlNormalized,
1877
+ agentId,
1878
+ ingestionId,
1879
+ status: "skipped_ledger",
1880
+ title: entry?.title,
1881
+ docId: entry?.docId,
1882
+ contentLength: entry?.contentLength
1883
+ });
1884
+ }
1885
+ this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
1449
1886
  return { kind: "ledger_skip", url };
1450
1887
  }
1451
1888
  }
1452
1889
  try {
1453
- const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
1890
+ const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
1454
1891
  renderMode,
1455
1892
  renderOptions,
1456
1893
  minContentLength,
@@ -1469,6 +1906,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1469
1906
  url,
1470
1907
  urlNormalized,
1471
1908
  agentId,
1909
+ ingestionId,
1472
1910
  status: crawlSt,
1473
1911
  doc,
1474
1912
  diag
@@ -1480,11 +1918,17 @@ var WebRAGPlugin = class _WebRAGPlugin {
1480
1918
  status: crawlSt,
1481
1919
  modeUsed: diag?.modeUsed,
1482
1920
  contentLength: doc?.content?.length,
1483
- bodyTextLengthHint,
1921
+ bodyTextLengthHint: bodyTextLengthHint2,
1484
1922
  title: doc?.metadata?.title,
1485
1923
  docId: doc?.id,
1486
1924
  error: diag?.errorMessage
1487
1925
  });
1926
+ this.emitCrawlPage(config, {
1927
+ url,
1928
+ event: "done",
1929
+ status: crawlSt,
1930
+ error: diag?.errorMessage
1931
+ });
1488
1932
  return { kind: "doc", doc, url };
1489
1933
  } catch (error) {
1490
1934
  const msg = error instanceof Error ? error.message : String(error);
@@ -1493,6 +1937,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1493
1937
  url,
1494
1938
  urlNormalized,
1495
1939
  agentId,
1940
+ ingestionId,
1496
1941
  status: "error",
1497
1942
  errorMessage: msg
1498
1943
  });
@@ -1503,6 +1948,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1503
1948
  status: "error",
1504
1949
  error: msg
1505
1950
  });
1951
+ this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
1506
1952
  throw { url, error };
1507
1953
  }
1508
1954
  })
@@ -1525,12 +1971,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
1525
1971
  });
1526
1972
  }
1527
1973
  }
1974
+ this.emitCrawlProgress(config, {
1975
+ phase: "crawling",
1976
+ urlsScheduled: uniqueUrls.length,
1977
+ pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
1978
+ });
1528
1979
  if (i + concurrency < uniqueUrls.length) {
1529
1980
  await this.delay(delayMs);
1530
1981
  }
1531
1982
  }
1532
1983
  if (documents.length > 0) {
1533
- const ingestResult = await this.ingest(documents, options);
1984
+ const ingestResult = await this.ingest(documents, {
1985
+ ...options,
1986
+ metadata: {
1987
+ ...options?.metadata ?? {},
1988
+ onCrawlProgress: config.metadata?.onCrawlProgress
1989
+ }
1990
+ });
1534
1991
  indexed = ingestResult.indexed;
1535
1992
  if (ingestResult.errors) {
1536
1993
  errors.push(...ingestResult.errors);
@@ -1573,125 +2030,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
1573
2030
  const html = await response.text();
1574
2031
  return this.extractDocumentFromHtml(url, html, config);
1575
2032
  }
1576
- /**
1577
- * Default chain works for many WordPress / Elementor / block themes where `.first()`
1578
- * would otherwise hit an empty wrapper.
1579
- */
1580
- static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
1581
- stripNoiseFromDom($, config) {
1582
- const removeSelectors = config.removeSelectors || [
1583
- "script",
1584
- "style",
1585
- "nav",
1586
- "header",
1587
- "footer",
1588
- ".sidebar",
1589
- ".navigation",
1590
- ".menu",
1591
- ".comments",
1592
- '[role="navigation"]',
1593
- '[role="banner"]'
1594
- ];
1595
- removeSelectors.forEach((selector) => $(selector).remove());
1596
- }
1597
- /** Longest cleaned text among selector matches and full body (after noise strip). */
1598
- extractBestContentText($, config) {
1599
- const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
1600
- const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
1601
- let best = "";
1602
- for (const sel of selectors) {
1603
- $(sel).each((_, el) => {
1604
- const t = this.cleanContent($(el).text().trim());
1605
- if (t.length > best.length) best = t;
1606
- });
1607
- }
1608
- const bodyText = this.cleanContent($("body").text().trim());
1609
- if (bodyText.length > best.length) best = bodyText;
1610
- return best;
1611
- }
1612
2033
  bodyTextLengthHint(html, config) {
1613
- const $ = cheerio.load(html);
1614
- this.stripNoiseFromDom($, config);
1615
- return this.cleanContent($("body").text().trim()).length;
2034
+ return bodyTextLengthHint(html, config);
1616
2035
  }
1617
2036
  extractDocumentFromHtml(url, html, config) {
1618
- const $ = cheerio.load(html);
1619
- this.stripNoiseFromDom($, config);
1620
- const titleSelector = config.titleSelector || "h1, title";
1621
- let title = $(titleSelector).first().text().trim();
1622
- if (!title) {
1623
- title = $("title").text().trim();
1624
- }
1625
- const content = this.extractBestContentText($, config);
1626
- const minChars = config.minExtractedContentLength ?? 50;
1627
- if (!content || content.length < minChars) return null;
1628
- const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
1629
- this.extractHeroImage($, url) || void 0;
1630
- let imageUrl;
1631
- if (image) {
1632
- try {
1633
- imageUrl = new URL(image, url).href;
1634
- } catch {
1635
- imageUrl = image;
1636
- }
1637
- }
1638
- const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
1639
- let type = config.defaultType || "page";
1640
- if (config.typeFromUrl) {
1641
- for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
1642
- if (url.includes(pattern)) {
1643
- type = typeName;
1644
- break;
1645
- }
1646
- }
1647
- }
1648
- const id = this.urlToId(url);
2037
+ const extracted = extractPageFromHtml(url, html, config);
2038
+ if (!extracted.indexable) return null;
1649
2039
  return {
1650
- id,
1651
- content,
1652
- metadata: {
1653
- type,
1654
- title,
1655
- url,
1656
- ...imageUrl ? { imageUrl } : {},
1657
- ...description ? { description } : {},
1658
- ...config.metadata
1659
- }
2040
+ id: extracted.id,
2041
+ content: extracted.content,
2042
+ metadata: extracted.metadata
1660
2043
  };
1661
2044
  }
1662
- /**
1663
- * Fallback image extraction: finds the first meaningful image in the content area.
1664
- * Skips icons, avatars, and tiny assets by filtering on common patterns.
1665
- */
1666
- extractHeroImage($, pageUrl) {
1667
- const containers = $('main, article, [role="main"], #content, .content');
1668
- const scope = containers.length > 0 ? containers : $("body");
1669
- let best;
1670
- scope.find("img[src]").each((_, el) => {
1671
- if (best) return false;
1672
- const src = $(el).attr("src") || "";
1673
- const alt = ($(el).attr("alt") || "").toLowerCase();
1674
- const width = parseInt($(el).attr("width") || "0", 10);
1675
- const height = parseInt($(el).attr("height") || "0", 10);
1676
- if (width > 0 && width < 80 || height > 0 && height < 80) return;
1677
- if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
1678
- if (src.startsWith("data:") || src.endsWith(".svg")) return;
1679
- if (src.includes("/_next/image")) {
1680
- try {
1681
- const nextUrl = new URL(src, pageUrl);
1682
- const realUrl = nextUrl.searchParams.get("url");
1683
- if (realUrl) {
1684
- best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
1685
- return false;
1686
- }
1687
- } catch {
1688
- }
1689
- }
1690
- best = src;
1691
- return false;
1692
- });
1693
- return best;
1694
- }
1695
2045
  looksLikeDynamicShell(html) {
1696
2046
  const lower = html.toLowerCase();
1697
2047
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
@@ -1709,7 +2059,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1709
2059
  const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
1710
2060
  return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
1711
2061
  }
1712
- diagFromRenderedAttempt(doc, bodyTextLengthHint, renderFailure, blockedSuspected, modeOk, modeFailed) {
2062
+ diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
1713
2063
  if (blockedSuspected) {
1714
2064
  return {
1715
2065
  doc: null,
@@ -1725,12 +2075,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
1725
2075
  return {
1726
2076
  doc,
1727
2077
  diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
1728
- bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint
2078
+ bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
1729
2079
  };
1730
2080
  }
1731
2081
  async crawlPageSmart(url, config, timeout, ctx) {
1732
2082
  if (ctx.renderMode === true) {
1733
- const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
2083
+ const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
1734
2084
  url,
1735
2085
  config,
1736
2086
  timeout,
@@ -1739,7 +2089,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1739
2089
  );
1740
2090
  return this.diagFromRenderedAttempt(
1741
2091
  doc,
1742
- bodyTextLengthHint,
2092
+ bodyTextLengthHint2,
1743
2093
  renderFailure,
1744
2094
  blockedSuspected,
1745
2095
  "render_ok",
@@ -1856,7 +2206,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1856
2206
  }
1857
2207
  }
1858
2208
  const html = await page.content();
1859
- const bodyTextLengthHint = this.bodyTextLengthHint(html, config);
2209
+ const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
1860
2210
  const doc = this.extractDocumentFromHtml(url, html, config);
1861
2211
  if (config.debug?.saveDir && config.debug?.enabled) {
1862
2212
  try {
@@ -1871,7 +2221,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
1871
2221
  dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
1872
2222
  }
1873
2223
  }
1874
- return { doc, bodyTextLengthHint };
2224
+ return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
1875
2225
  } catch (e) {
1876
2226
  const msg = String(e?.message || e || "render_failed");
1877
2227
  const lower = msg.toLowerCase();
@@ -1921,6 +2271,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
1921
2271
  }
1922
2272
  return Array.from(found);
1923
2273
  }
2274
+ emitBulkProgress(options, update) {
2275
+ const fn = options?.metadata?.onBulkProgress;
2276
+ if (!fn) return;
2277
+ try {
2278
+ fn(update);
2279
+ } catch {
2280
+ }
2281
+ }
2282
+ emitCrawlProgress(config, update) {
2283
+ const fn = config.metadata?.onCrawlProgress;
2284
+ if (!fn) return;
2285
+ try {
2286
+ fn(update);
2287
+ } catch {
2288
+ }
2289
+ }
2290
+ emitCrawlPage(config, event) {
2291
+ const fn = config.metadata?.onCrawlPage;
2292
+ if (!fn) return;
2293
+ try {
2294
+ fn(event);
2295
+ } catch {
2296
+ }
2297
+ }
1924
2298
  createDebugCollector(debug) {
1925
2299
  const enabled = !!debug?.enabled;
1926
2300
  const level = debug?.level || "summary";
@@ -1939,14 +2313,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
1939
2313
  /**
1940
2314
  * Clean extracted text content
1941
2315
  */
1942
- cleanContent(text) {
1943
- return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
1944
- }
1945
- /**
1946
- * Convert URL to a stable document ID
1947
- */
1948
2316
  urlToId(url) {
1949
- return url.replace(/^https?:\/\//, "").replace(/[^a-zA-Z0-9]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").substring(0, 100);
2317
+ return urlToDocumentId(url);
1950
2318
  }
1951
2319
  /**
1952
2320
  * Delay helper
@@ -2209,10 +2577,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
2209
2577
  filterableFields: this.config.filterableFields,
2210
2578
  typeBoosts: this.config.typeBoosts,
2211
2579
  recencyBoost: this.config.recencyBoost,
2580
+ crawlLedger: this.config.crawlLedger,
2212
2581
  priority: this.priority
2213
2582
  };
2214
2583
  }
2215
2584
  };
2216
2585
  export {
2217
- WebRAGPlugin
2586
+ WebRAGPlugin,
2587
+ bodyTextLengthHint,
2588
+ extractPageFromHtml,
2589
+ extractProductMetadata,
2590
+ normalizeAvailability,
2591
+ normalizeCurrency,
2592
+ parsePrice,
2593
+ urlToDocumentId
2218
2594
  };