@snap-agent/rag-web 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +83 -18
- package/dist/index.d.ts +83 -18
- package/dist/index.js +527 -144
- package/dist/index.mjs +519 -143
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -30,17 +30,338 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/index.ts
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
|
-
WebRAGPlugin: () => WebRAGPlugin
|
|
33
|
+
WebRAGPlugin: () => WebRAGPlugin,
|
|
34
|
+
bodyTextLengthHint: () => bodyTextLengthHint,
|
|
35
|
+
extractPageFromHtml: () => extractPageFromHtml,
|
|
36
|
+
extractProductMetadata: () => extractProductMetadata,
|
|
37
|
+
normalizeAvailability: () => normalizeAvailability,
|
|
38
|
+
normalizeCurrency: () => normalizeCurrency,
|
|
39
|
+
parsePrice: () => parsePrice,
|
|
40
|
+
urlToDocumentId: () => urlToDocumentId
|
|
34
41
|
});
|
|
35
42
|
module.exports = __toCommonJS(index_exports);
|
|
36
43
|
|
|
37
44
|
// src/WebRAGPlugin.ts
|
|
38
45
|
var import_mongodb = require("mongodb");
|
|
39
46
|
var import_openai = __toESM(require("openai"));
|
|
40
|
-
var
|
|
47
|
+
var cheerio3 = __toESM(require("cheerio"));
|
|
41
48
|
var fs = __toESM(require("fs"));
|
|
42
49
|
var path = __toESM(require("path"));
|
|
43
|
-
|
|
50
|
+
|
|
51
|
+
// src/htmlPageExtract.ts
|
|
52
|
+
var cheerio2 = __toESM(require("cheerio"));
|
|
53
|
+
|
|
54
|
+
// src/productMetadata.ts
|
|
55
|
+
var cheerio = __toESM(require("cheerio"));
|
|
56
|
+
/**
 * Extracts product pricing metadata from an HTML document.
 *
 * Three sources are read — JSON-LD, Open Graph meta tags and microdata — and
 * for every field the first source (in that order) holding a non-nullish
 * value wins.
 *
 * @param {string} html - Raw page HTML.
 * @returns {{price?: number, currency?: string, availability?: string}}
 *   Object containing only the fields that could be resolved.
 */
function extractProductMetadata(html) {
  const $ = cheerio.load(html);
  // Order matters: it is the precedence used when several sources disagree.
  const sources = [extractFromJsonLd($), extractFromOpenGraph($), extractFromMicrodata($)];
  const firstDefined = (field) => {
    for (const source of sources) {
      if (source[field] != null) return source[field];
    }
    return undefined;
  };

  const metadata = {};
  const price = firstDefined("price");
  if (price != null) metadata.price = price;
  const currency = firstDefined("currency");
  if (currency) metadata.currency = currency;
  const availability = firstDefined("availability");
  if (availability) metadata.availability = availability;
  return metadata;
}
|
|
70
|
+
/**
 * Reads price, currency and availability from JSON-LD `<script>` blocks.
 *
 * Each script is parsed independently; scripts that are empty or contain
 * invalid JSON are ignored. Every Product node with a usable offer may fill
 * fields that are still missing, so the three fields can originate from
 * different nodes. Scanning of further scripts stops once all three are set.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromJsonLd($) {
  const found = {};
  const isComplete = () =>
    found.price != null && Boolean(found.currency) && Boolean(found.availability);

  $('script[type="application/ld+json"]').each((_, scriptEl) => {
    // Returning false stops cheerio's iteration.
    if (isComplete()) return false;

    const jsonText = $(scriptEl).html()?.trim();
    if (!jsonText) return;

    let data;
    try {
      data = JSON.parse(jsonText);
    } catch {
      // Malformed JSON-LD is skipped on purpose; other scripts may be valid.
      return;
    }

    for (const node of collectJsonLdNodes(data)) {
      const offer = isProductType(node) ? pickOffer(node) : null;
      if (!offer) continue;

      if (found.price == null) {
        const price = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
        if (price != null) found.price = price;
      }
      if (!found.currency) {
        const currency = normalizeCurrency(offer.priceCurrency);
        if (currency) found.currency = currency;
      }
      if (!found.availability) {
        const availability = normalizeAvailability(offer.availability);
        if (availability) found.availability = availability;
      }
    }
  });

  return found;
}
|
|
102
|
+
/**
 * Reads price, currency and availability from Open Graph style meta tags.
 *
 * For each field the `product:*` property is tried first and the `og:*`
 * property second; the first truthy `content` attribute is used.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromOpenGraph($) {
  // Mirrors `a || b`: later properties are only queried when earlier ones are falsy.
  const metaContent = (...properties) => {
    let content;
    for (const property of properties) {
      content = $(`meta[property="${property}"]`).attr("content");
      if (content) break;
    }
    return content;
  };

  const metadata = {};

  const price = parsePrice(metaContent("product:price:amount", "og:price:amount"));
  if (price != null) metadata.price = price;

  const currency = normalizeCurrency(metaContent("product:price:currency", "og:price:currency"));
  if (currency) metadata.currency = currency;

  const availability = normalizeAvailability(metaContent("product:availability", "og:availability"));
  if (availability) metadata.availability = availability;

  return metadata;
}
|
|
117
|
+
/**
 * Finds the first element carrying the given `itemprop`.
 *
 * When the document has an element whose `itemtype` mentions
 * schema.org/Product, the lookup is restricted to descendants of the first
 * such element; otherwise the whole document is searched.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {string} itemprop - Exact `itemprop` attribute value to match.
 * @returns {object} Cheerio selection (possibly empty).
 */
function microdataField($, itemprop) {
  const fieldSelector = `[itemprop="${itemprop}"]`;
  const productScope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
  if (productScope.length > 0) {
    return productScope.find(fieldSelector).first();
  }
  return $(fieldSelector).first();
}
|
|
121
|
+
/**
 * Reads price, currency and availability from microdata (`itemprop`) markup.
 *
 * For each field the listed attributes are tried in order and the element's
 * text is the final fallback. Availability additionally considers `href`.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromMicrodata($) {
  const rawValue = (itemprop, attributes) => {
    const el = microdataField($, itemprop);
    for (const attribute of attributes) {
      const value = el.attr(attribute);
      if (value) return value;
    }
    return el.text();
  };

  const metadata = {};

  const price = parsePrice(rawValue("price", ["content"]));
  if (price != null) metadata.price = price;

  const currency = normalizeCurrency(rawValue("priceCurrency", ["content"]));
  if (currency) metadata.currency = currency;

  const availability = normalizeAvailability(rawValue("availability", ["content", "href"]));
  if (availability) metadata.availability = availability;

  return metadata;
}
|
|
136
|
+
/**
 * Flattens a parsed JSON-LD payload into a list of object nodes.
 *
 * Arrays are expanded, and an object's `@graph` member is descended into;
 * no other property is followed. Nodes come out in pre-order: an object
 * precedes the contents of its own `@graph`, which precede its later siblings.
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Every plain (non-array) object encountered.
 */
function collectJsonLdNodes(data) {
  const collected = [];
  // Explicit stack instead of recursion; items are pushed in reverse so they
  // pop in document order.
  const pending = [data];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current == null || typeof current !== "object") continue;
    if (Array.isArray(current)) {
      for (let i = current.length - 1; i >= 0; i--) pending.push(current[i]);
      continue;
    }
    collected.push(current);
    const graph = current["@graph"];
    if (graph) pending.push(graph);
  }
  return collected;
}
|
|
152
|
+
/**
 * Tells whether a JSON-LD node declares itself a Product.
 *
 * `@type` may be a single value or an array. A value matches when, compared
 * case-insensitively, it is exactly "product" or ends with "/product"
 * (e.g. a full schema.org IRI).
 *
 * @param {object} node - JSON-LD node.
 * @returns {boolean}
 */
function isProductType(node) {
  const declared = node["@type"];
  if (declared == null) return false;
  const candidates = Array.isArray(declared) ? declared : [declared];
  for (const candidate of candidates) {
    const lowered = String(candidate).toLowerCase();
    if (lowered === "product" || lowered.endsWith("/product")) return true;
  }
  return false;
}
|
|
160
|
+
/**
 * Selects the offer object to read pricing from.
 *
 * @param {object} product - JSON-LD Product node.
 * @returns {object|null} `offers` itself when it is a single object, the
 *   first object entry when it is an array, otherwise `null`.
 */
function pickOffer(product) {
  const { offers } = product;
  if (offers == null || typeof offers !== "object") return null;
  if (!Array.isArray(offers)) return offers;
  return offers.find((candidate) => candidate && typeof candidate === "object") ?? null;
}
|
|
170
|
+
/**
 * Parses a price from a number or a loosely formatted string.
 *
 * Everything except digits, `.`, `,` and `-` is discarded, then separators
 * are interpreted as follows:
 * - both `.` and `,` present: whichever occurs last is the decimal mark;
 * - only `,`: a single comma followed by at most two digits is a decimal
 *   mark, otherwise commas are thousands separators;
 * - only `.`: strictly dot-grouped thousands such as "1.234.567" are joined;
 *   a single dot is kept as the decimal mark (so "1.299" parses as 1.299).
 *
 * @param {*} value - Raw price (number, string, or anything stringifiable).
 * @returns {number|undefined} Finite number, or `undefined` when no number
 *   can be read.
 */
function parsePrice(value) {
  if (value == null || value === "") return undefined;
  if (typeof value === "number" && Number.isFinite(value)) return value;

  let s = String(value).trim();
  if (!s) return undefined;

  // Drop currency symbols, letters and whitespace.
  s = s.replace(/[^\d.,\-]/g, "");
  if (!s || s === "-" || s === ".") return undefined;

  const lastComma = s.lastIndexOf(",");
  const lastDot = s.lastIndexOf(".");

  if (lastComma > -1 && lastDot > -1) {
    if (lastComma > lastDot) {
      // "1.234,56" — dots group thousands, comma is the decimal mark.
      s = s.replace(/\./g, "").replace(",", ".");
    } else {
      // "1,234.56" — commas group thousands.
      s = s.replace(/,/g, "");
    }
  } else if (lastComma > -1) {
    const parts = s.split(",");
    if (parts.length === 2 && parts[1].length <= 2) {
      s = parts[0].replace(/\./g, "") + "." + parts[1];
    } else {
      s = s.replace(/,/g, "");
    }
  } else if (/^-?\d{1,3}(?:\.\d{3}){2,}$/.test(s)) {
    // "1.234.567" — two or more dot-separated groups of three digits can only
    // be thousands separators; parseFloat alone would stop at "1.234".
    s = s.replace(/\./g, "");
  }

  const num = Number.parseFloat(s);
  return Number.isFinite(num) ? num : undefined;
}
|
|
196
|
+
/**
 * Normalizes a currency value to an upper-case code or short symbol.
 *
 * The input is trimmed and upper-cased. The first run of exactly three
 * letters that is not part of a longer word is returned (e.g. "USD" from
 * "price in usd" or "USD12"). Without such a run, inputs of at most four
 * characters (symbols like "$" or "US$") are returned as-is; anything longer
 * yields `undefined`.
 *
 * @param {*} value - Raw currency value.
 * @returns {string|undefined}
 */
function normalizeCurrency(value) {
  if (value == null) return undefined;
  const s = String(value).trim().toUpperCase();
  if (!s) return undefined;
  // Letter-boundary lookarounds keep the match from slicing a three-letter
  // prefix out of a longer word ("DOLLARS" must not become "DOL").
  const iso = s.match(/(?<![A-Z])[A-Z]{3}(?![A-Z])/);
  if (iso) return iso[0];
  return s.length <= 4 ? s : undefined;
}
|
|
203
|
+
/**
 * Normalizes an availability value to a compact token such as "InStock".
 *
 * schema.org URLs and other slash-separated values are reduced to their last
 * non-empty path segment, and all whitespace is removed ("In Stock" →
 * "InStock").
 *
 * @param {*} value - Raw availability (URL, enum name or free text).
 * @returns {string|undefined} The token, or `undefined` when nothing is left.
 */
function normalizeAvailability(value) {
  if (value == null) return undefined;
  let s = String(value).trim();
  if (!s) return undefined;

  // Empty segments are ignored so a trailing slash
  // ("https://schema.org/InStock/") still resolves to "InStock".
  const lastSegment = (text) => text.split("/").filter(Boolean).pop() || text;

  if (s.includes("schema.org/")) {
    s = lastSegment(s);
  }
  // Strip scheme and host from any remaining absolute URL.
  s = s.replace(/^https?:\/\/[^/]+\//, "");
  if (s.includes("/")) {
    s = lastSegment(s);
  }
  return s.replace(/\s+/g, "") || undefined;
}
|
|
218
|
+
|
|
219
|
+
// src/htmlPageExtract.ts
|
|
220
|
+
// Comma-separated selector list used for content extraction when the caller
// supplies no `contentSelector`.
var DEFAULT_CONTENT_SELECTOR = [
  "article",
  "main",
  '[role="main"]',
  "#content",
  "#primary",
  "#main",
  ".content",
  ".post-content",
  ".entry-content",
  ".elementor-location-content",
  ".elementor-widget-theme-post-content",
  ".wp-block-group",
  ".site-content",
  ".ast-single-post",
  ".ast-page"
].join(", ");

// Selectors removed from the DOM before text extraction when the caller
// supplies no `removeSelectors`.
var DEFAULT_REMOVE_SELECTORS = [
  // elements
  "script", "style", "nav", "header", "footer",
  // classes
  ".sidebar", ".navigation", ".menu", ".comments",
  // ARIA roles
  '[role="navigation"]', '[role="banner"]'
];
|
|
234
|
+
/**
 * Derives a document id from a URL.
 *
 * The http(s) scheme is dropped, every run of non-alphanumeric characters
 * becomes a single "-", one leading and one trailing "-" are removed, and
 * the result is cut to 100 characters.
 *
 * @param {string} url - Page URL.
 * @returns {string} Slug of at most 100 characters.
 */
function urlToDocumentId(url) {
  const withoutScheme = url.replace(/^https?:\/\//, "");
  const slug = withoutScheme.replace(/[^a-zA-Z0-9]+/g, "-");
  const trimmed = slug.replace(/^-|-$/g, "");
  return trimmed.slice(0, 100);
}
|
|
237
|
+
/**
 * Normalizes whitespace in extracted text.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with every whitespace run collapsed to one space
 *   and both ends trimmed.
 */
function cleanContent(text) {
  const collapsed = text.replace(/\s+/g, " ");
  // NOTE(review): after the collapse above no newline or tab remains, so the
  // next two passes never match; paragraph breaks are not preserved.
  const paragraphs = collapsed.replace(/\n\s*\n/g, "\n\n");
  const untabbed = paragraphs.replace(/\t/g, " ");
  return untabbed.trim();
}
|
|
240
|
+
/**
 * Measures how much text a page's `<body>` holds once noise is removed.
 *
 * @param {string} html - Raw page HTML.
 * @param {object} [options] - Extraction options; `removeSelectors` is
 *   forwarded to the noise-stripping step.
 * @returns {number} Length of the cleaned body text.
 */
function bodyTextLengthHint(html, options = {}) {
  const dom = cheerio2.load(html);
  stripNoiseFromDom(dom, options);
  const bodyText = dom("body").text().trim();
  return cleanContent(bodyText).length;
}
|
|
245
|
+
/**
 * Builds an indexable document description from a page's HTML.
 *
 * Noise elements are stripped first; title, main content, hero image,
 * description, page type and product metadata are then collected.
 *
 * @param {string} url - Page URL; also the base for resolving relative image URLs.
 * @param {string} html - Raw page HTML.
 * @param {object} [options] - Options read here: `titleSelector`,
 *   `minExtractedContentLength` (default 50), `defaultType` (default "page"),
 *   `typeFromUrl` (substring → type map, first match wins) and `metadata`
 *   (merged last, overriding extracted fields). The object is also passed to
 *   the noise-stripping and content-extraction helpers.
 * @returns {{id: string, metadata: object, content: string, indexable: boolean, contentPreview: string}}
 */
function extractPageFromHtml(url, html, options = {}) {
  const $ = cheerio2.load(html);
  stripNoiseFromDom($, options);

  const metaContent = (selector) => $(selector).attr("content");

  // Title: preferred selector first, <title> as the fallback.
  const preferredTitle = $(options.titleSelector || "h1, title").first().text().trim();
  const title = preferredTitle || $("title").text().trim();

  const content = extractBestContentText($, options);
  const minChars = options.minExtractedContentLength ?? 50;
  const indexable = Boolean(content) && content.length >= minChars;

  // Image: candidates are evaluated lazily, first truthy value wins.
  const imageCandidates = [
    () => metaContent('meta[property="og:image"]'),
    () => metaContent('meta[name="twitter:image"]'),
    () => metaContent('meta[property="product:image"]'),
    () => $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src"),
    () => extractHeroImage($, url)
  ];
  let image;
  for (const candidate of imageCandidates) {
    image = candidate();
    if (image) break;
  }

  let imageUrl;
  if (image) {
    try {
      imageUrl = new URL(image, url).href;
    } catch {
      // Unresolvable values are kept verbatim.
      imageUrl = image;
    }
  }

  const description =
    metaContent('meta[property="og:description"]') ||
    metaContent('meta[name="description"]') ||
    undefined;

  // Page type: first URL substring match overrides the default.
  let type = options.defaultType || "page";
  if (options.typeFromUrl) {
    const match = Object.entries(options.typeFromUrl).find(([pattern]) => url.includes(pattern));
    if (match) type = match[1];
  }

  // Product metadata is read from the untouched HTML because noise stripping
  // may have removed <script> tags from `$`.
  const productMeta = extractProductMetadata(html);

  // Keys are added in a fixed order; caller metadata is spread last.
  const extracted = { type };
  if (title) extracted.title = title;
  extracted.url = url;
  if (imageUrl) extracted.imageUrl = imageUrl;
  if (description) extracted.description = description;
  if (productMeta.price != null) extracted.price = productMeta.price;
  if (productMeta.currency) extracted.currency = productMeta.currency;
  if (productMeta.availability) extracted.availability = productMeta.availability;
  const metadata = { ...extracted, ...options.metadata };

  const previewLen = 400;
  let contentPreview = content;
  if (content.length > previewLen) {
    contentPreview = `${content.slice(0, previewLen)}\u2026`;
  }

  return {
    id: urlToDocumentId(url),
    metadata,
    content,
    indexable,
    contentPreview
  };
}
|
|
297
|
+
/**
 * Removes non-content elements from a loaded document, in place.
 *
 * @param {Function} $ - Loaded cheerio document (mutated).
 * @param {object} options - `removeSelectors` replaces the default selector
 *   list when it is neither null nor undefined.
 * @returns {void}
 */
function stripNoiseFromDom($, options) {
  const dropMatches = (selector) => {
    $(selector).remove();
  };
  (options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS).forEach(dropMatches);
}
|
|
301
|
+
/**
 * Picks the longest cleaned text among the content-selector matches and the
 * whole `<body>`.
 *
 * The selector list is split on commas and each part is queried separately,
 * so every matching element competes individually. On equal length the
 * earlier candidate is kept; the body is considered last.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {object} options - `contentSelector` overrides the default list.
 * @returns {string} Longest cleaned text, or "" when nothing has text.
 */
function extractBestContentText($, options) {
  const selectorList = (options.contentSelector || DEFAULT_CONTENT_SELECTOR)
    .split(",")
    .map((part) => part.trim())
    .filter(Boolean);

  let longest = "";
  const consider = (rawText) => {
    const cleaned = cleanContent(rawText.trim());
    if (cleaned.length > longest.length) longest = cleaned;
  };

  for (const selector of selectorList) {
    $(selector).each((_, el) => {
      consider($(el).text());
    });
  }
  consider($("body").text());

  return longest;
}
|
|
315
|
+
/**
 * Finds the first "meaningful" image to use as a page's hero image.
 *
 * Images are searched inside the main content containers when any exist,
 * otherwise inside `<body>`. An image is skipped when it:
 * - has an empty `src`;
 * - declares a width or height between 1 and 79;
 * - has "logo", "icon", "avatar", "favicon", "badge", "spinner" or
 *   "loading" in its `src` or `alt`;
 * - is a `data:` URI or an SVG (also with a query string or fragment).
 *
 * For Next.js optimizer URLs (`/_next/image?url=…`) the wrapped `url`
 * parameter is returned, resolved against `pageUrl` when relative.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {string} pageUrl - Page URL, used as the base for resolution.
 * @returns {string|undefined} Image URL (possibly relative), or `undefined`.
 */
function extractHeroImage($, pageUrl) {
  const containers = $('main, article, [role="main"], #content, .content');
  const scope = containers.length > 0 ? containers : $("body");
  let best;

  scope.find("img[src]").each((_, el) => {
    const img = $(el);
    const src = img.attr("src") || "";
    // `img[src]` also matches src="" — that can never be a usable hero image.
    if (!src) return;

    const alt = img.attr("alt") || "";
    const width = Number.parseInt(img.attr("width") || "0", 10);
    const height = Number.parseInt(img.attr("height") || "0", 10);
    if ((width > 0 && width < 80) || (height > 0 && height < 80)) return;
    if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(`${src} ${alt}`)) return;
    // Match ".svg" at the end of the path even when "?v=2" or "#id" follows.
    if (src.startsWith("data:") || /\.svg(?:[?#]|$)/i.test(src)) return;

    if (src.includes("/_next/image")) {
      try {
        const realUrl = new URL(src, pageUrl).searchParams.get("url");
        if (realUrl) {
          best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
          return false;
        }
      } catch {
        // Unparseable optimizer URL: fall through and use the raw src.
      }
    }

    best = src;
    // Returning false stops the iteration at the first accepted image.
    return false;
  });

  return best;
}
|
|
344
|
+
|
|
345
|
+
// src/WebRAGPlugin.ts
|
|
346
|
+
/**
 * Resolves the URL a bulk operation refers to.
 *
 * @param {object} op - Bulk operation; `op.document.metadata` is inspected.
 * @returns {string|undefined} Trimmed `metadata.url`, else trimmed
 *   `metadata.source`, when that value is a non-blank string; otherwise
 *   `undefined`.
 */
function bulkOpCurrentUrl(op) {
  const metadata = op.document?.metadata;
  for (const key of ["url", "source"]) {
    const candidate = metadata?.[key];
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return undefined;
}
|
|
352
|
+
/**
 * Tells whether a document is a bare URL listing, i.e. its metadata has
 * `type === "url"` and a `url` that parses as an absolute http(s) URL.
 *
 * @param {object} document - Document whose `metadata` is inspected.
 * @returns {boolean}
 */
function isUrlListingInsert(document) {
  const metadata = document.metadata;
  if (metadata?.type !== "url" || typeof metadata.url !== "string") return false;

  const candidate = metadata.url.trim();
  if (candidate === "") return false;

  let protocol;
  try {
    protocol = new URL(candidate).protocol;
  } catch {
    // Not an absolute URL.
    return false;
  }
  return ["http:", "https:"].includes(protocol);
}
|
|
364
|
+
var WebRAGPlugin = class {
|
|
44
365
|
name = "web-rag";
|
|
45
366
|
type = "rag";
|
|
46
367
|
priority;
|
|
@@ -78,6 +399,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
78
399
|
}
|
|
79
400
|
return this.db.collection(this.config.collection);
|
|
80
401
|
}
|
|
402
|
+
ledgerIndexesEnsured = false;
|
|
81
403
|
async getLedgerCollection() {
|
|
82
404
|
if (!this.client) {
|
|
83
405
|
this.client = new import_mongodb.MongoClient(this.config.mongoUri);
|
|
@@ -85,7 +407,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
85
407
|
this.db = this.client.db(this.config.dbName);
|
|
86
408
|
}
|
|
87
409
|
const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
|
|
88
|
-
|
|
410
|
+
const col = this.db.collection(name);
|
|
411
|
+
if (!this.ledgerIndexesEnsured) {
|
|
412
|
+
this.ledgerIndexesEnsured = true;
|
|
413
|
+
await col.createIndex(
|
|
414
|
+
{ tenantId: 1, agentId: 1, urlNormalized: 1 },
|
|
415
|
+
{ unique: true }
|
|
416
|
+
);
|
|
417
|
+
await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
|
|
418
|
+
}
|
|
419
|
+
return col;
|
|
89
420
|
}
|
|
90
421
|
/**
|
|
91
422
|
* List recent crawl ledger rows (for dashboards / pagination in the front).
|
|
@@ -95,6 +426,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
95
426
|
const filter = { tenantId: this.config.tenantId };
|
|
96
427
|
filter.agentId = options.agentId ?? "shared";
|
|
97
428
|
if (options.domain) filter.domain = options.domain;
|
|
429
|
+
if (options.ingestionId) filter.ingestionId = options.ingestionId;
|
|
98
430
|
if (options.status) filter.lastStatus = options.status;
|
|
99
431
|
const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
|
|
100
432
|
const skip = Math.max(options.skip ?? 0, 0);
|
|
@@ -163,6 +495,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
163
495
|
lastCrawledAt: now,
|
|
164
496
|
updatedAt: now
|
|
165
497
|
};
|
|
498
|
+
if (params.ingestionId) {
|
|
499
|
+
$set.ingestionId = params.ingestionId;
|
|
500
|
+
}
|
|
166
501
|
if (errMsg !== void 0) {
|
|
167
502
|
$set.errorMessage = errMsg;
|
|
168
503
|
} else if (params.status === "indexed" && params.doc) {
|
|
@@ -175,9 +510,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
175
510
|
$set.docId = params.doc.id;
|
|
176
511
|
} else {
|
|
177
512
|
$set.modeUsed = params.diag?.modeUsed;
|
|
178
|
-
$set.contentLength = null;
|
|
179
|
-
$set.title = null;
|
|
180
|
-
$set.docId = null;
|
|
513
|
+
$set.contentLength = params.contentLength ?? null;
|
|
514
|
+
$set.title = params.title ?? null;
|
|
515
|
+
$set.docId = params.docId ?? null;
|
|
181
516
|
}
|
|
182
517
|
await col.updateOne(
|
|
183
518
|
{
|
|
@@ -256,6 +591,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
256
591
|
url: doc.metadata.url,
|
|
257
592
|
imageUrl: doc.metadata.imageUrl,
|
|
258
593
|
description: doc.metadata.description,
|
|
594
|
+
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
595
|
+
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
596
|
+
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
259
597
|
score: doc.score
|
|
260
598
|
}))
|
|
261
599
|
}
|
|
@@ -421,9 +759,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
421
759
|
let indexed = 0;
|
|
422
760
|
const errors = [];
|
|
423
761
|
const agentId = options?.agentId || "shared";
|
|
424
|
-
|
|
762
|
+
const onCrawlProgress = options?.metadata?.onCrawlProgress;
|
|
763
|
+
const indexingTotal = documents.length;
|
|
764
|
+
const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
|
|
765
|
+
const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
|
|
766
|
+
let chunksProcessed = 0;
|
|
767
|
+
if (onCrawlProgress && indexingTotal > 0) {
|
|
768
|
+
this.emitCrawlProgress(
|
|
769
|
+
{ metadata: options?.metadata },
|
|
770
|
+
{
|
|
771
|
+
phase: "indexing",
|
|
772
|
+
urlsScheduled: indexingTotal,
|
|
773
|
+
pagesProcessed: 0,
|
|
774
|
+
chunksTotal,
|
|
775
|
+
chunksProcessed: 0
|
|
776
|
+
}
|
|
777
|
+
);
|
|
778
|
+
}
|
|
779
|
+
for (let docIndex = 0; docIndex < documents.length; docIndex++) {
|
|
780
|
+
const doc = documents[docIndex];
|
|
781
|
+
const chunks = chunkPlan[docIndex];
|
|
425
782
|
try {
|
|
426
|
-
const chunks = this.chunkContent(doc.content);
|
|
427
783
|
const isChunked = chunks.length > 1;
|
|
428
784
|
if (isChunked) {
|
|
429
785
|
await collection.deleteMany({
|
|
@@ -458,6 +814,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
458
814
|
},
|
|
459
815
|
{ upsert: true }
|
|
460
816
|
);
|
|
817
|
+
chunksProcessed++;
|
|
818
|
+
if (onCrawlProgress) {
|
|
819
|
+
this.emitCrawlProgress(
|
|
820
|
+
{ metadata: options?.metadata },
|
|
821
|
+
{
|
|
822
|
+
phase: "indexing",
|
|
823
|
+
urlsScheduled: indexingTotal,
|
|
824
|
+
pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
|
|
825
|
+
chunksTotal,
|
|
826
|
+
chunksProcessed
|
|
827
|
+
}
|
|
828
|
+
);
|
|
829
|
+
}
|
|
461
830
|
}
|
|
462
831
|
indexed++;
|
|
463
832
|
} catch (error) {
|
|
@@ -537,23 +906,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
537
906
|
let deleted = 0;
|
|
538
907
|
let failed = 0;
|
|
539
908
|
const errors = [];
|
|
909
|
+
const opsTotal = operations.length;
|
|
910
|
+
let opsDone = 0;
|
|
911
|
+
const ingestOptions = options ?? {};
|
|
912
|
+
this.emitBulkProgress(ingestOptions, {
|
|
913
|
+
phase: "processing",
|
|
914
|
+
opsTotal,
|
|
915
|
+
opsDone: 0
|
|
916
|
+
});
|
|
540
917
|
for (const op of operations) {
|
|
918
|
+
const currentUrl = bulkOpCurrentUrl(op);
|
|
541
919
|
try {
|
|
542
920
|
switch (op.type) {
|
|
543
921
|
case "insert":
|
|
544
922
|
if (op.document) {
|
|
545
|
-
|
|
546
|
-
|
|
923
|
+
if (isUrlListingInsert(op.document)) {
|
|
924
|
+
const url = bulkOpCurrentUrl(op);
|
|
925
|
+
const crawlResult = await this.ingestSinglePageFromUrl(
|
|
926
|
+
{
|
|
927
|
+
url,
|
|
928
|
+
metadata: {
|
|
929
|
+
...op.document.metadata ?? {},
|
|
930
|
+
url
|
|
931
|
+
}
|
|
932
|
+
},
|
|
933
|
+
ingestOptions
|
|
934
|
+
);
|
|
935
|
+
if (crawlResult.indexed > 0) {
|
|
936
|
+
inserted++;
|
|
937
|
+
} else {
|
|
938
|
+
failed++;
|
|
939
|
+
const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
|
|
940
|
+
errors.push({
|
|
941
|
+
id: op.id,
|
|
942
|
+
operation: op.type,
|
|
943
|
+
error: err
|
|
944
|
+
});
|
|
945
|
+
}
|
|
946
|
+
} else {
|
|
947
|
+
await this.ingest([op.document], ingestOptions);
|
|
948
|
+
inserted++;
|
|
949
|
+
}
|
|
547
950
|
}
|
|
548
951
|
break;
|
|
549
952
|
case "update":
|
|
550
953
|
if (op.document) {
|
|
551
|
-
await this.update(op.id, op.document,
|
|
954
|
+
await this.update(op.id, op.document, ingestOptions);
|
|
552
955
|
updated++;
|
|
553
956
|
}
|
|
554
957
|
break;
|
|
555
958
|
case "delete":
|
|
556
|
-
const count = await this.delete(op.id,
|
|
959
|
+
const count = await this.delete(op.id, ingestOptions);
|
|
557
960
|
deleted += count;
|
|
558
961
|
break;
|
|
559
962
|
}
|
|
@@ -564,6 +967,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
564
967
|
operation: op.type,
|
|
565
968
|
error: error.message || "Unknown error"
|
|
566
969
|
});
|
|
970
|
+
} finally {
|
|
971
|
+
opsDone++;
|
|
972
|
+
this.emitBulkProgress(ingestOptions, {
|
|
973
|
+
phase: "processing",
|
|
974
|
+
opsTotal,
|
|
975
|
+
opsDone,
|
|
976
|
+
currentOpType: op.type,
|
|
977
|
+
...currentUrl ? { currentUrl } : {}
|
|
978
|
+
});
|
|
567
979
|
}
|
|
568
980
|
}
|
|
569
981
|
return {
|
|
@@ -1130,6 +1542,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1130
1542
|
};
|
|
1131
1543
|
}
|
|
1132
1544
|
const dbg = this.createDebugCollector(config.debug);
|
|
1545
|
+
this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
|
|
1133
1546
|
const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
|
|
1134
1547
|
if (!base) {
|
|
1135
1548
|
return {
|
|
@@ -1161,6 +1574,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1161
1574
|
if (config.excludePatterns?.length) {
|
|
1162
1575
|
filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
|
|
1163
1576
|
}
|
|
1577
|
+
this.emitCrawlProgress(config, {
|
|
1578
|
+
phase: "discovering",
|
|
1579
|
+
urlsDiscovered: filteredUrls.length
|
|
1580
|
+
});
|
|
1164
1581
|
urlsToCrawl = filteredUrls.slice(0, maxPages);
|
|
1165
1582
|
urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
|
|
1166
1583
|
break;
|
|
@@ -1182,7 +1599,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1182
1599
|
urlsToCrawl = discovery.urls;
|
|
1183
1600
|
urlsSkipped = discovery.skipped;
|
|
1184
1601
|
dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
|
|
1602
|
+
this.emitCrawlProgress(config, {
|
|
1603
|
+
phase: "discovering",
|
|
1604
|
+
urlsDiscovered: urlsToCrawl.length
|
|
1605
|
+
});
|
|
1185
1606
|
}
|
|
1607
|
+
this.emitCrawlProgress(config, {
|
|
1608
|
+
phase: "crawling",
|
|
1609
|
+
urlsDiscovered: urlsToCrawl.length,
|
|
1610
|
+
urlsScheduled: urlsToCrawl.length
|
|
1611
|
+
});
|
|
1186
1612
|
const result = await this.crawlUrls(urlsToCrawl, {
|
|
1187
1613
|
contentSelector: config.contentSelector,
|
|
1188
1614
|
titleSelector: config.titleSelector,
|
|
@@ -1204,9 +1630,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1204
1630
|
return {
|
|
1205
1631
|
...result,
|
|
1206
1632
|
urlsSkipped,
|
|
1633
|
+
/** URLs selected for this crawl (≤ maxPages); use as the denominator for progress UIs. */
|
|
1634
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1207
1635
|
crawledAt: /* @__PURE__ */ new Date(),
|
|
1208
1636
|
metadata: {
|
|
1209
1637
|
...result.metadata || {},
|
|
1638
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1210
1639
|
discoveryDebug: dbg.summary()
|
|
1211
1640
|
}
|
|
1212
1641
|
};
|
|
@@ -1335,7 +1764,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1335
1764
|
return await response.text();
|
|
1336
1765
|
}
|
|
1337
1766
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1338
|
-
const $ =
|
|
1767
|
+
const $ = cheerio3.load(html);
|
|
1339
1768
|
const links = /* @__PURE__ */ new Set();
|
|
1340
1769
|
$("a[href]").each((_, el) => {
|
|
1341
1770
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -1434,6 +1863,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1434
1863
|
const forceRecrawl = !!(options && options.forceRecrawl);
|
|
1435
1864
|
const agentId = options?.agentId ?? "shared";
|
|
1436
1865
|
const stripQ = config.stripQueryParams ?? false;
|
|
1866
|
+
const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
|
|
1437
1867
|
const urlByNorm = /* @__PURE__ */ new Map();
|
|
1438
1868
|
for (const u of urls) {
|
|
1439
1869
|
const norm = this.normalizeLedgerUrl(u, stripQ) || u;
|
|
@@ -1462,6 +1892,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1462
1892
|
const results = await Promise.allSettled(
|
|
1463
1893
|
batch.map(async (url) => {
|
|
1464
1894
|
const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
|
|
1895
|
+
this.emitCrawlPage(config, { url, event: "start" });
|
|
1465
1896
|
if (ledgerOpts && !forceRecrawl) {
|
|
1466
1897
|
const entry = await this.findLedgerEntry(urlNormalized, agentId);
|
|
1467
1898
|
if (this.shouldSkipLedger(
|
|
@@ -1482,11 +1913,24 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1482
1913
|
docId: entry?.docId
|
|
1483
1914
|
});
|
|
1484
1915
|
dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
|
|
1916
|
+
if (ledgerOpts) {
|
|
1917
|
+
await this.upsertLedgerRecord({
|
|
1918
|
+
url,
|
|
1919
|
+
urlNormalized,
|
|
1920
|
+
agentId,
|
|
1921
|
+
ingestionId,
|
|
1922
|
+
status: "skipped_ledger",
|
|
1923
|
+
title: entry?.title,
|
|
1924
|
+
docId: entry?.docId,
|
|
1925
|
+
contentLength: entry?.contentLength
|
|
1926
|
+
});
|
|
1927
|
+
}
|
|
1928
|
+
this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
|
|
1485
1929
|
return { kind: "ledger_skip", url };
|
|
1486
1930
|
}
|
|
1487
1931
|
}
|
|
1488
1932
|
try {
|
|
1489
|
-
const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
|
|
1933
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
1490
1934
|
renderMode,
|
|
1491
1935
|
renderOptions,
|
|
1492
1936
|
minContentLength,
|
|
@@ -1505,6 +1949,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1505
1949
|
url,
|
|
1506
1950
|
urlNormalized,
|
|
1507
1951
|
agentId,
|
|
1952
|
+
ingestionId,
|
|
1508
1953
|
status: crawlSt,
|
|
1509
1954
|
doc,
|
|
1510
1955
|
diag
|
|
@@ -1516,11 +1961,17 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1516
1961
|
status: crawlSt,
|
|
1517
1962
|
modeUsed: diag?.modeUsed,
|
|
1518
1963
|
contentLength: doc?.content?.length,
|
|
1519
|
-
bodyTextLengthHint,
|
|
1964
|
+
bodyTextLengthHint: bodyTextLengthHint2,
|
|
1520
1965
|
title: doc?.metadata?.title,
|
|
1521
1966
|
docId: doc?.id,
|
|
1522
1967
|
error: diag?.errorMessage
|
|
1523
1968
|
});
|
|
1969
|
+
this.emitCrawlPage(config, {
|
|
1970
|
+
url,
|
|
1971
|
+
event: "done",
|
|
1972
|
+
status: crawlSt,
|
|
1973
|
+
error: diag?.errorMessage
|
|
1974
|
+
});
|
|
1524
1975
|
return { kind: "doc", doc, url };
|
|
1525
1976
|
} catch (error) {
|
|
1526
1977
|
const msg = error instanceof Error ? error.message : String(error);
|
|
@@ -1529,6 +1980,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1529
1980
|
url,
|
|
1530
1981
|
urlNormalized,
|
|
1531
1982
|
agentId,
|
|
1983
|
+
ingestionId,
|
|
1532
1984
|
status: "error",
|
|
1533
1985
|
errorMessage: msg
|
|
1534
1986
|
});
|
|
@@ -1539,6 +1991,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1539
1991
|
status: "error",
|
|
1540
1992
|
error: msg
|
|
1541
1993
|
});
|
|
1994
|
+
this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
|
|
1542
1995
|
throw { url, error };
|
|
1543
1996
|
}
|
|
1544
1997
|
})
|
|
@@ -1561,12 +2014,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1561
2014
|
});
|
|
1562
2015
|
}
|
|
1563
2016
|
}
|
|
2017
|
+
this.emitCrawlProgress(config, {
|
|
2018
|
+
phase: "crawling",
|
|
2019
|
+
urlsScheduled: uniqueUrls.length,
|
|
2020
|
+
pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
|
|
2021
|
+
});
|
|
1564
2022
|
if (i + concurrency < uniqueUrls.length) {
|
|
1565
2023
|
await this.delay(delayMs);
|
|
1566
2024
|
}
|
|
1567
2025
|
}
|
|
1568
2026
|
if (documents.length > 0) {
|
|
1569
|
-
const ingestResult = await this.ingest(documents,
|
|
2027
|
+
const ingestResult = await this.ingest(documents, {
|
|
2028
|
+
...options,
|
|
2029
|
+
metadata: {
|
|
2030
|
+
...options?.metadata ?? {},
|
|
2031
|
+
onCrawlProgress: config.metadata?.onCrawlProgress
|
|
2032
|
+
}
|
|
2033
|
+
});
|
|
1570
2034
|
indexed = ingestResult.indexed;
|
|
1571
2035
|
if (ingestResult.errors) {
|
|
1572
2036
|
errors.push(...ingestResult.errors);
|
|
@@ -1609,125 +2073,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1609
2073
|
const html = await response.text();
|
|
1610
2074
|
return this.extractDocumentFromHtml(url, html, config);
|
|
1611
2075
|
}
|
|
1612
|
-
/**
|
|
1613
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
1614
|
-
* would otherwise hit an empty wrapper.
|
|
1615
|
-
*/
|
|
1616
|
-
static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
1617
|
-
stripNoiseFromDom($, config) {
|
|
1618
|
-
const removeSelectors = config.removeSelectors || [
|
|
1619
|
-
"script",
|
|
1620
|
-
"style",
|
|
1621
|
-
"nav",
|
|
1622
|
-
"header",
|
|
1623
|
-
"footer",
|
|
1624
|
-
".sidebar",
|
|
1625
|
-
".navigation",
|
|
1626
|
-
".menu",
|
|
1627
|
-
".comments",
|
|
1628
|
-
'[role="navigation"]',
|
|
1629
|
-
'[role="banner"]'
|
|
1630
|
-
];
|
|
1631
|
-
removeSelectors.forEach((selector) => $(selector).remove());
|
|
1632
|
-
}
|
|
1633
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
1634
|
-
extractBestContentText($, config) {
|
|
1635
|
-
const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
|
|
1636
|
-
const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
|
|
1637
|
-
let best = "";
|
|
1638
|
-
for (const sel of selectors) {
|
|
1639
|
-
$(sel).each((_, el) => {
|
|
1640
|
-
const t = this.cleanContent($(el).text().trim());
|
|
1641
|
-
if (t.length > best.length) best = t;
|
|
1642
|
-
});
|
|
1643
|
-
}
|
|
1644
|
-
const bodyText = this.cleanContent($("body").text().trim());
|
|
1645
|
-
if (bodyText.length > best.length) best = bodyText;
|
|
1646
|
-
return best;
|
|
1647
|
-
}
|
|
1648
2076
|
bodyTextLengthHint(html, config) {
|
|
1649
|
-
|
|
1650
|
-
this.stripNoiseFromDom($, config);
|
|
1651
|
-
return this.cleanContent($("body").text().trim()).length;
|
|
2077
|
+
return bodyTextLengthHint(html, config);
|
|
1652
2078
|
}
|
|
1653
2079
|
extractDocumentFromHtml(url, html, config) {
|
|
1654
|
-
const
|
|
1655
|
-
|
|
1656
|
-
const titleSelector = config.titleSelector || "h1, title";
|
|
1657
|
-
let title = $(titleSelector).first().text().trim();
|
|
1658
|
-
if (!title) {
|
|
1659
|
-
title = $("title").text().trim();
|
|
1660
|
-
}
|
|
1661
|
-
const content = this.extractBestContentText($, config);
|
|
1662
|
-
const minChars = config.minExtractedContentLength ?? 50;
|
|
1663
|
-
if (!content || content.length < minChars) return null;
|
|
1664
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1665
|
-
this.extractHeroImage($, url) || void 0;
|
|
1666
|
-
let imageUrl;
|
|
1667
|
-
if (image) {
|
|
1668
|
-
try {
|
|
1669
|
-
imageUrl = new URL(image, url).href;
|
|
1670
|
-
} catch {
|
|
1671
|
-
imageUrl = image;
|
|
1672
|
-
}
|
|
1673
|
-
}
|
|
1674
|
-
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
1675
|
-
let type = config.defaultType || "page";
|
|
1676
|
-
if (config.typeFromUrl) {
|
|
1677
|
-
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
1678
|
-
if (url.includes(pattern)) {
|
|
1679
|
-
type = typeName;
|
|
1680
|
-
break;
|
|
1681
|
-
}
|
|
1682
|
-
}
|
|
1683
|
-
}
|
|
1684
|
-
const id = this.urlToId(url);
|
|
2080
|
+
const extracted = extractPageFromHtml(url, html, config);
|
|
2081
|
+
if (!extracted.indexable) return null;
|
|
1685
2082
|
return {
|
|
1686
|
-
id,
|
|
1687
|
-
content,
|
|
1688
|
-
metadata:
|
|
1689
|
-
type,
|
|
1690
|
-
title,
|
|
1691
|
-
url,
|
|
1692
|
-
...imageUrl ? { imageUrl } : {},
|
|
1693
|
-
...description ? { description } : {},
|
|
1694
|
-
...config.metadata
|
|
1695
|
-
}
|
|
2083
|
+
id: extracted.id,
|
|
2084
|
+
content: extracted.content,
|
|
2085
|
+
metadata: extracted.metadata
|
|
1696
2086
|
};
|
|
1697
2087
|
}
|
|
1698
|
-
/**
|
|
1699
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1700
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1701
|
-
*/
|
|
1702
|
-
extractHeroImage($, pageUrl) {
|
|
1703
|
-
const containers = $('main, article, [role="main"], #content, .content');
|
|
1704
|
-
const scope = containers.length > 0 ? containers : $("body");
|
|
1705
|
-
let best;
|
|
1706
|
-
scope.find("img[src]").each((_, el) => {
|
|
1707
|
-
if (best) return false;
|
|
1708
|
-
const src = $(el).attr("src") || "";
|
|
1709
|
-
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1710
|
-
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1711
|
-
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1712
|
-
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1713
|
-
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1714
|
-
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1715
|
-
if (src.includes("/_next/image")) {
|
|
1716
|
-
try {
|
|
1717
|
-
const nextUrl = new URL(src, pageUrl);
|
|
1718
|
-
const realUrl = nextUrl.searchParams.get("url");
|
|
1719
|
-
if (realUrl) {
|
|
1720
|
-
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1721
|
-
return false;
|
|
1722
|
-
}
|
|
1723
|
-
} catch {
|
|
1724
|
-
}
|
|
1725
|
-
}
|
|
1726
|
-
best = src;
|
|
1727
|
-
return false;
|
|
1728
|
-
});
|
|
1729
|
-
return best;
|
|
1730
|
-
}
|
|
1731
2088
|
looksLikeDynamicShell(html) {
|
|
1732
2089
|
const lower = html.toLowerCase();
|
|
1733
2090
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
|
@@ -1745,7 +2102,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1745
2102
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
1746
2103
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
1747
2104
|
}
|
|
1748
|
-
diagFromRenderedAttempt(doc,
|
|
2105
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
1749
2106
|
if (blockedSuspected) {
|
|
1750
2107
|
return {
|
|
1751
2108
|
doc: null,
|
|
@@ -1761,12 +2118,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1761
2118
|
return {
|
|
1762
2119
|
doc,
|
|
1763
2120
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
1764
|
-
bodyTextLengthHint: doc ? void 0 :
|
|
2121
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
1765
2122
|
};
|
|
1766
2123
|
}
|
|
1767
2124
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
1768
2125
|
if (ctx.renderMode === true) {
|
|
1769
|
-
const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2126
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
1770
2127
|
url,
|
|
1771
2128
|
config,
|
|
1772
2129
|
timeout,
|
|
@@ -1775,7 +2132,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1775
2132
|
);
|
|
1776
2133
|
return this.diagFromRenderedAttempt(
|
|
1777
2134
|
doc,
|
|
1778
|
-
|
|
2135
|
+
bodyTextLengthHint2,
|
|
1779
2136
|
renderFailure,
|
|
1780
2137
|
blockedSuspected,
|
|
1781
2138
|
"render_ok",
|
|
@@ -1892,7 +2249,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1892
2249
|
}
|
|
1893
2250
|
}
|
|
1894
2251
|
const html = await page.content();
|
|
1895
|
-
const
|
|
2252
|
+
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
1896
2253
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
1897
2254
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
1898
2255
|
try {
|
|
@@ -1907,7 +2264,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1907
2264
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
1908
2265
|
}
|
|
1909
2266
|
}
|
|
1910
|
-
return { doc, bodyTextLengthHint };
|
|
2267
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
1911
2268
|
} catch (e) {
|
|
1912
2269
|
const msg = String(e?.message || e || "render_failed");
|
|
1913
2270
|
const lower = msg.toLowerCase();
|
|
@@ -1957,6 +2314,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1957
2314
|
}
|
|
1958
2315
|
return Array.from(found);
|
|
1959
2316
|
}
|
|
2317
|
+
emitBulkProgress(options, update) {
|
|
2318
|
+
const fn = options?.metadata?.onBulkProgress;
|
|
2319
|
+
if (!fn) return;
|
|
2320
|
+
try {
|
|
2321
|
+
fn(update);
|
|
2322
|
+
} catch {
|
|
2323
|
+
}
|
|
2324
|
+
}
|
|
2325
|
+
emitCrawlProgress(config, update) {
|
|
2326
|
+
const fn = config.metadata?.onCrawlProgress;
|
|
2327
|
+
if (!fn) return;
|
|
2328
|
+
try {
|
|
2329
|
+
fn(update);
|
|
2330
|
+
} catch {
|
|
2331
|
+
}
|
|
2332
|
+
}
|
|
2333
|
+
emitCrawlPage(config, event) {
|
|
2334
|
+
const fn = config.metadata?.onCrawlPage;
|
|
2335
|
+
if (!fn) return;
|
|
2336
|
+
try {
|
|
2337
|
+
fn(event);
|
|
2338
|
+
} catch {
|
|
2339
|
+
}
|
|
2340
|
+
}
|
|
1960
2341
|
createDebugCollector(debug) {
|
|
1961
2342
|
const enabled = !!debug?.enabled;
|
|
1962
2343
|
const level = debug?.level || "summary";
|
|
@@ -1975,14 +2356,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1975
2356
|
/**
|
|
1976
2357
|
* Clean extracted text content
|
|
1977
2358
|
*/
|
|
1978
|
-
cleanContent(text) {
|
|
1979
|
-
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
1980
|
-
}
|
|
1981
|
-
/**
|
|
1982
|
-
* Convert URL to a stable document ID
|
|
1983
|
-
*/
|
|
1984
2359
|
urlToId(url) {
|
|
1985
|
-
return url
|
|
2360
|
+
return urlToDocumentId(url);
|
|
1986
2361
|
}
|
|
1987
2362
|
/**
|
|
1988
2363
|
* Delay helper
|
|
@@ -2245,11 +2620,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2245
2620
|
filterableFields: this.config.filterableFields,
|
|
2246
2621
|
typeBoosts: this.config.typeBoosts,
|
|
2247
2622
|
recencyBoost: this.config.recencyBoost,
|
|
2623
|
+
crawlLedger: this.config.crawlLedger,
|
|
2248
2624
|
priority: this.priority
|
|
2249
2625
|
};
|
|
2250
2626
|
}
|
|
2251
2627
|
};
|
|
2252
2628
|
// Annotate the CommonJS export names for ESM import in node:
|
|
2253
2629
|
0 && (module.exports = {
|
|
2254
|
-
WebRAGPlugin
|
|
2630
|
+
WebRAGPlugin,
|
|
2631
|
+
bodyTextLengthHint,
|
|
2632
|
+
extractPageFromHtml,
|
|
2633
|
+
extractProductMetadata,
|
|
2634
|
+
normalizeAvailability,
|
|
2635
|
+
normalizeCurrency,
|
|
2636
|
+
parsePrice,
|
|
2637
|
+
urlToDocumentId
|
|
2255
2638
|
});
|