@snap-agent/rag-web 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +83 -18
- package/dist/index.d.ts +83 -18
- package/dist/index.js +527 -144
- package/dist/index.mjs +519 -143
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -1,10 +1,324 @@
|
|
|
1
1
|
// src/WebRAGPlugin.ts
|
|
2
2
|
import { MongoClient } from "mongodb";
|
|
3
3
|
import OpenAI from "openai";
|
|
4
|
-
import * as
|
|
4
|
+
import * as cheerio3 from "cheerio";
|
|
5
5
|
import * as fs from "fs";
|
|
6
6
|
import * as path from "path";
|
|
7
|
-
|
|
7
|
+
|
|
8
|
+
// src/htmlPageExtract.ts
|
|
9
|
+
import * as cheerio2 from "cheerio";
|
|
10
|
+
|
|
11
|
+
// src/productMetadata.ts
|
|
12
|
+
import * as cheerio from "cheerio";
|
|
13
|
+
/**
 * Extract product price, currency and availability from a page's HTML.
 *
 * Three sources are consulted: JSON-LD, Open Graph meta tags and microdata.
 * For each field, the first source in that order holding a non-nullish
 * value wins.
 *
 * @param {string} html - Raw page HTML.
 * @returns {{price?: number, currency?: string, availability?: string}}
 *   Object containing only the fields that were found.
 */
function extractProductMetadata(html) {
  const $ = cheerio.load(html);
  // Precedence order: JSON-LD, then Open Graph, then microdata.
  const sources = [extractFromJsonLd($), extractFromOpenGraph($), extractFromMicrodata($)];
  const firstDefined = (field) => {
    for (const source of sources) {
      if (source[field] != null) return source[field];
    }
    return void 0;
  };

  const metadata = {};
  const price = firstDefined("price");
  if (price != null) metadata.price = price;
  const currency = firstDefined("currency");
  if (currency) metadata.currency = currency;
  const availability = firstDefined("availability");
  if (availability) metadata.availability = availability;
  return metadata;
}
|
|
27
|
+
/**
 * Read price, currency and availability from JSON-LD `<script>` blocks.
 *
 * Each `application/ld+json` script is parsed; scripts with invalid JSON are
 * ignored. For every Product-typed node with an offer, fields that are still
 * missing are filled in. Scanning stops once all three fields are known.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromJsonLd($) {
  const found = {};
  const isComplete = () => found.price != null && Boolean(found.currency) && Boolean(found.availability);

  // Copy any still-missing field from a single offer object.
  const absorbOffer = (offer) => {
    if (found.price == null) {
      const amount = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
      if (amount != null) found.price = amount;
    }
    if (!found.currency) {
      const code = normalizeCurrency(offer.priceCurrency);
      if (code) found.currency = code;
    }
    if (!found.availability) {
      const stock = normalizeAvailability(offer.availability);
      if (stock) found.availability = stock;
    }
  };

  $('script[type="application/ld+json"]').each((_, scriptEl) => {
    if (isComplete()) return false; // returning false stops the cheerio iteration
    const json = $(scriptEl).html()?.trim();
    if (!json) return;
    let data;
    try {
      data = JSON.parse(json);
    } catch {
      // Malformed JSON-LD is skipped; other scripts may still be valid.
      return;
    }
    for (const node of collectJsonLdNodes(data)) {
      if (!isProductType(node)) continue;
      const offer = pickOffer(node);
      if (offer) absorbOffer(offer);
    }
  });
  return found;
}
|
|
59
|
+
/**
 * Read price, currency and availability from Open Graph style meta tags.
 *
 * For each field the `product:*` property is tried first and the `og:*`
 * property second.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromOpenGraph($) {
  // Content of the first of the two meta properties that has a truthy value.
  const metaContent = (primary, fallback) =>
    $(`meta[property="${primary}"]`).attr("content") || $(`meta[property="${fallback}"]`).attr("content");

  const found = {};

  const amount = parsePrice(metaContent("product:price:amount", "og:price:amount"));
  if (amount != null) found.price = amount;

  const code = normalizeCurrency(metaContent("product:price:currency", "og:price:currency"));
  if (code) found.currency = code;

  const stock = normalizeAvailability(metaContent("product:availability", "og:availability"));
  if (stock) found.availability = stock;

  return found;
}
|
|
74
|
+
/**
 * Locate the first element carrying the given `itemprop`.
 *
 * When the document has an element whose `itemtype` mentions
 * schema.org/Product, the search is limited to the first such element;
 * otherwise the whole document is searched.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {string} itemprop - Microdata property name, e.g. "price".
 * @returns {object} Cheerio selection (possibly empty).
 */
function microdataField($, itemprop) {
  const propSelector = `[itemprop="${itemprop}"]`;
  const productScope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
  if (productScope.length > 0) {
    return productScope.find(propSelector).first();
  }
  return $(propSelector).first();
}
|
|
78
|
+
/**
 * Read price, currency and availability from microdata (`itemprop`) markup.
 *
 * For each property the listed attributes are tried in order and the
 * element's text is used as the last resort.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromMicrodata($) {
  // First truthy attribute value of the itemprop element, else its text.
  const rawValue = (itemprop, attributes) => {
    const el = microdataField($, itemprop);
    for (const attribute of attributes) {
      const attrValue = el.attr(attribute);
      if (attrValue) return attrValue;
    }
    return el.text();
  };

  const found = {};

  const amount = parsePrice(rawValue("price", ["content"]));
  if (amount != null) found.price = amount;

  const code = normalizeCurrency(rawValue("priceCurrency", ["content"]));
  if (code) found.currency = code;

  // Availability may also be expressed as a link to a schema.org URL.
  const stock = normalizeAvailability(rawValue("availability", ["content", "href"]));
  if (stock) found.availability = stock;

  return found;
}
|
|
93
|
+
/**
 * Flatten a parsed JSON-LD payload into a list of candidate nodes.
 *
 * Arrays are expanded, every object is collected, and an object's `@graph`
 * member is descended into. Nodes appear in depth-first, document order.
 * Primitives and nullish values are ignored.
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Collected node objects.
 */
function collectJsonLdNodes(data) {
  const collected = [];
  // Explicit stack; array items are pushed in reverse so they pop in order.
  const pending = [data];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current == null || typeof current !== "object") continue;
    if (Array.isArray(current)) {
      for (let i = current.length - 1; i >= 0; i--) {
        pending.push(current[i]);
      }
      continue;
    }
    collected.push(current);
    if (current["@graph"]) pending.push(current["@graph"]);
  }
  return collected;
}
|
|
109
|
+
/**
 * Tell whether a JSON-LD node declares itself a Product.
 *
 * `@type` may be a single value or an array. A value matches,
 * case-insensitively, when it equals "product" or ends with "/product"
 * (for example a full schema.org URL).
 *
 * @param {object} node - JSON-LD node.
 * @returns {boolean}
 */
function isProductType(node) {
  const declared = node["@type"];
  if (declared == null) return false;
  const candidates = Array.isArray(declared) ? declared : [declared];
  for (const candidate of candidates) {
    const lowered = String(candidate).toLowerCase();
    if (lowered === "product" || lowered.endsWith("/product")) return true;
  }
  return false;
}
|
|
117
|
+
/**
 * Choose the offer object to read pricing from.
 *
 * @param {object} product - JSON-LD Product node.
 * @returns {object|null} The `offers` object itself, the first object entry
 *   when `offers` is an array, or null when no usable offer exists.
 */
function pickOffer(product) {
  const { offers } = product;
  // Rejects missing values and primitives; arrays pass because they are objects.
  if (offers == null || typeof offers !== "object") return null;
  if (!Array.isArray(offers)) return offers;
  return offers.find((candidate) => candidate && typeof candidate === "object") ?? null;
}
|
|
127
|
+
/**
 * Parse a price in common human or machine formats into a number.
 *
 * Finite numbers are returned as-is. Strings are stripped of everything
 * except digits, ".", "," and "-", then interpreted as follows:
 *   - both "," and "." present: the one that appears last is the decimal
 *     separator, and every earlier separator is treated as grouping;
 *   - only ",": a single comma followed by at most two digits is a decimal
 *     separator ("12,5"), otherwise commas are grouping ("1,234");
 *   - only ".": a single dot is a decimal separator, while several dots are
 *     grouping ("1.234.567").
 *
 * @param {*} value - Raw price (number, string, or anything stringifiable).
 * @returns {number|undefined} Parsed price, or undefined when none is found.
 */
function parsePrice(value) {
  if (value == null || value === "") return void 0;
  if (typeof value === "number" && Number.isFinite(value)) return value;
  let s = String(value).trim();
  if (!s) return void 0;
  // Drop currency symbols, letters and whitespace.
  s = s.replace(/[^\d.,\-]/g, "");
  if (!s || s === "-" || s === ".") return void 0;
  const lastComma = s.lastIndexOf(",");
  const lastDot = s.lastIndexOf(".");
  if (lastComma > -1 && lastDot > -1) {
    // The later separator marks the decimals; all earlier ones are grouping.
    const decimalAt = Math.max(lastComma, lastDot);
    s = `${s.slice(0, decimalAt).replace(/[.,]/g, "")}.${s.slice(decimalAt + 1)}`;
  } else if (lastComma > -1) {
    const parts = s.split(",");
    if (parts.length === 2 && parts[1].length <= 2) {
      s = `${parts[0]}.${parts[1]}`;
    } else {
      s = s.replace(/,/g, "");
    }
  } else if (lastDot !== s.indexOf(".")) {
    // Several dots and no comma: dots can only be thousands separators.
    s = s.replace(/\./g, "");
  }
  const num = parseFloat(s);
  return Number.isFinite(num) ? num : void 0;
}
|
|
153
|
+
/**
 * Normalize a currency value to an upper-case code.
 *
 * Resolution order:
 *   1. a standalone run of exactly three letters anywhere in the value
 *      ("price in usd" -> "USD");
 *   2. for short values (at most 4 characters), the first three-letter run
 *      ("EURO" -> "EUR") or, failing that, the value itself ("$", "US$");
 *   3. otherwise undefined. Longer free text without a standalone code is
 *      rejected rather than truncated to its first three letters.
 *
 * @param {*} value - Raw currency value.
 * @returns {string|undefined}
 */
function normalizeCurrency(value) {
  if (value == null) return void 0;
  const s = String(value).trim().toUpperCase();
  if (!s) return void 0;
  // Exactly three letters, not embedded in a longer word.
  const standalone = s.match(/(?<![A-Z])[A-Z]{3}(?![A-Z])/);
  if (standalone) return standalone[0];
  if (s.length > 4) return void 0;
  const run = s.match(/[A-Z]{3}/);
  return run ? run[0] : s;
}
|
|
160
|
+
/**
 * Normalize an availability value to a compact token such as "InStock".
 *
 * Trailing slashes are dropped. If the value still contains a "/" (for
 * example a schema.org URL), only the last path segment is kept. All
 * whitespace is then removed.
 *
 * @param {*} value - Raw availability (URL, token, or free text).
 * @returns {string|undefined} Normalized token, or undefined when empty.
 */
function normalizeAvailability(value) {
  if (value == null) return void 0;
  // Strip trailing slashes first so "…/InStock/" resolves to "InStock".
  let s = String(value).trim().replace(/\/+$/, "");
  if (!s) return void 0;
  const lastSlash = s.lastIndexOf("/");
  if (lastSlash > -1) {
    s = s.slice(lastSlash + 1);
  }
  return s.replace(/\s+/g, "") || void 0;
}
|
|
175
|
+
|
|
176
|
+
// src/htmlPageExtract.ts
|
|
177
|
+
/**
 * Comma-separated selectors tried when the caller supplies no
 * `contentSelector`.
 */
const DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';

/**
 * Selectors removed from the DOM before text extraction when the caller
 * supplies no `removeSelectors`.
 */
const DEFAULT_REMOVE_SELECTORS = [
  "script",
  "style",
  "nav",
  "header",
  "footer",
  ".sidebar",
  ".navigation",
  ".menu",
  ".comments",
  '[role="navigation"]',
  '[role="banner"]'
];
|
|
191
|
+
/**
 * Derive a document id from a URL.
 *
 * The http(s) scheme is removed, every non-alphanumeric character becomes
 * "-", runs of "-" are collapsed, one leading and one trailing "-" are
 * trimmed, and the result is cut to at most 100 characters.
 *
 * @param {string} url - Page URL.
 * @returns {string} Slug-like identifier.
 */
function urlToDocumentId(url) {
  const withoutScheme = url.replace(/^https?:\/\//, "");
  const dashed = withoutScheme.replace(/[^a-zA-Z0-9]/g, "-");
  const collapsed = dashed.replace(/-+/g, "-");
  const trimmed = collapsed.replace(/^-|-$/g, "");
  return trimmed.substring(0, 100);
}
|
|
194
|
+
/**
 * Collapse every run of whitespace (spaces, tabs, newlines) into a single
 * space and trim the ends.
 *
 * Paragraph breaks are not preserved. Earlier follow-up replacements for
 * blank lines and tabs could never match once whitespace had been collapsed,
 * so they are omitted here; the result is unchanged.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Single-line, whitespace-normalized text.
 */
function cleanContent(text) {
  return text.replace(/\s+/g, " ").trim();
}
|
|
197
|
+
/**
 * Measure how much text the page body holds after noise removal.
 *
 * @param {string} html - Raw page HTML.
 * @param {object} [options] - Extraction options; forwarded to `stripNoiseFromDom`.
 * @returns {number} Length of the cleaned `<body>` text.
 */
function bodyTextLengthHint(html, options = {}) {
  const dom = cheerio2.load(html);
  stripNoiseFromDom(dom, options);
  const bodyText = dom("body").text().trim();
  return cleanContent(bodyText).length;
}
|
|
202
|
+
/**
 * Build an indexable document description from a page's HTML.
 *
 * Steps: strip noise elements, pick a title, extract the main text, resolve
 * a representative image and description, derive the document type from the
 * URL, and attach product metadata (price, currency, availability).
 *
 * @param {string} url - Page URL; also the base for resolving relative images.
 * @param {string} html - Raw page HTML.
 * @param {object} [options] - Extraction options read here: `titleSelector`,
 *   `minExtractedContentLength` (default 50), `defaultType` (default "page"),
 *   `typeFromUrl` (substring -> type map) and `metadata` (merged last, so it
 *   overrides extracted fields). The object is also forwarded to
 *   `stripNoiseFromDom` and `extractBestContentText`.
 * @returns {{id: string, metadata: object, content: string, indexable: boolean, contentPreview: string}}
 *   `indexable` is false when the content is shorter than the minimum.
 */
function extractPageFromHtml(url, html, options = {}) {
  const $ = cheerio2.load(html);
  stripNoiseFromDom($, options);

  const metaContent = (selector) => $(selector).attr("content");

  // Title: configured selector first, then the <title> element.
  let title = $(options.titleSelector || "h1, title").first().text().trim();
  if (!title) {
    title = $("title").text().trim();
  }

  const content = extractBestContentText($, options);
  const minChars = options.minExtractedContentLength ?? 50;
  const indexable = Boolean(content && content.length >= minChars);

  // Image: social meta tags, then product markup, then a hero image.
  const rawImage =
    metaContent('meta[property="og:image"]') ||
    metaContent('meta[name="twitter:image"]') ||
    metaContent('meta[property="product:image"]') ||
    $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") ||
    extractHeroImage($, url) ||
    void 0;
  let imageUrl;
  if (rawImage) {
    try {
      imageUrl = new URL(rawImage, url).href;
    } catch {
      // Keep the raw value when it cannot be resolved against the page URL.
      imageUrl = rawImage;
    }
  }

  const description =
    metaContent('meta[property="og:description"]') ||
    metaContent('meta[name="description"]') ||
    void 0;

  // Type: first `typeFromUrl` pattern contained in the URL wins.
  let type = options.defaultType || "page";
  if (options.typeFromUrl) {
    const matched = Object.entries(options.typeFromUrl).find(([pattern]) => url.includes(pattern));
    if (matched) type = matched[1];
  }

  // Product metadata is read from the raw HTML, not from the stripped DOM.
  const productMeta = extractProductMetadata(html);

  const extracted = { type };
  if (title) extracted.title = title;
  extracted.url = url;
  if (imageUrl) extracted.imageUrl = imageUrl;
  if (description) extracted.description = description;
  if (productMeta.price != null) extracted.price = productMeta.price;
  if (productMeta.currency) extracted.currency = productMeta.currency;
  if (productMeta.availability) extracted.availability = productMeta.availability;
  const metadata = { ...extracted, ...options.metadata };

  const previewLen = 400;
  let contentPreview = content;
  if (content.length > previewLen) {
    contentPreview = `${content.slice(0, previewLen)}\u2026`;
  }

  return {
    id: urlToDocumentId(url),
    metadata,
    content,
    indexable,
    contentPreview
  };
}
|
|
254
|
+
/**
 * Remove non-content elements from the document in place.
 *
 * @param {Function} $ - Loaded cheerio document (mutated).
 * @param {object} options - Uses `removeSelectors` when provided (an empty
 *   array removes nothing); otherwise `DEFAULT_REMOVE_SELECTORS`.
 * @returns {void}
 */
function stripNoiseFromDom($, options) {
  const selectors = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
  for (const selector of selectors) {
    $(selector).remove();
  }
}
|
|
258
|
+
/**
 * Return the longest cleaned text among all content-selector matches and
 * the full `<body>`.
 *
 * The selector list (`options.contentSelector` or the default) is split on
 * commas and each part is queried separately.
 *
 * NOTE(review): the body normally contains every matched element, so its
 * text will usually be the longest candidate. Confirm this is intended.
 *
 * @param {Function} $ - Loaded cheerio document, typically already noise-stripped.
 * @param {object} options - Uses `contentSelector` when provided.
 * @returns {string} Longest cleaned text found, or "" if none.
 */
function extractBestContentText($, options) {
  const selectorList = (options.contentSelector || DEFAULT_CONTENT_SELECTOR)
    .split(",")
    .map((part) => part.trim())
    .filter(Boolean);

  let longest = "";
  // Keep the candidate only when it is strictly longer than the current best.
  const consider = (rawText) => {
    const cleaned = cleanContent(rawText.trim());
    if (cleaned.length > longest.length) longest = cleaned;
  };

  for (const selector of selectorList) {
    $(selector).each((_, el) => {
      consider($(el).text());
    });
  }
  consider($("body").text());
  return longest;
}
|
|
272
|
+
/**
 * Pick the first plausible content image on the page.
 *
 * Images are searched inside the main content containers, or the whole
 * body when none exist. An image is skipped when a declared width or height
 * is below 80, when its src or alt looks like a logo/icon/avatar/etc., or
 * when it is a data URI or ends with ".svg". For Next.js optimizer URLs
 * ("/_next/image") the original image from the `url` query parameter is
 * preferred.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {string} pageUrl - Page URL used to resolve relative image URLs.
 * @returns {string|undefined} Image URL (possibly relative), or undefined.
 */
function extractHeroImage($, pageUrl) {
  const mainAreas = $('main, article, [role="main"], #content, .content');
  const searchRoot = mainAreas.length > 0 ? mainAreas : $("body");
  const noisePattern = /logo|icon|avatar|favicon|badge|spinner|loading/i;
  const isTooSmall = (dimension) => dimension > 0 && dimension < 80;

  // Resolve the real image behind a Next.js optimizer URL, if it has one.
  const unwrapNextImage = (src) => {
    try {
      const inner = new URL(src, pageUrl).searchParams.get("url");
      if (!inner) return void 0;
      return inner.startsWith("http") ? inner : new URL(inner, pageUrl).href;
    } catch {
      return void 0;
    }
  };

  let hero;
  searchRoot.find("img[src]").each((_, img) => {
    const $img = $(img);
    const src = $img.attr("src") || "";
    const alt = ($img.attr("alt") || "").toLowerCase();
    const width = parseInt($img.attr("width") || "0", 10);
    const height = parseInt($img.attr("height") || "0", 10);

    if (isTooSmall(width) || isTooSmall(height)) return;
    if (noisePattern.test(src + " " + alt)) return;
    if (src.startsWith("data:") || src.endsWith(".svg")) return;

    hero = (src.includes("/_next/image") && unwrapNextImage(src)) || src;
    return false; // first acceptable image wins; stop iterating
  });
  return hero;
}
|
|
301
|
+
|
|
302
|
+
// src/WebRAGPlugin.ts
|
|
303
|
+
/**
 * Find the URL a bulk operation refers to, for progress reporting.
 *
 * @param {object} op - Bulk operation; `op.document.metadata` is inspected.
 * @returns {string|undefined} Trimmed `metadata.url`, else trimmed
 *   `metadata.source`, else undefined. Only non-blank strings count.
 */
function bulkOpCurrentUrl(op) {
  const metadata = op.document?.metadata;
  for (const key of ["url", "source"]) {
    const candidate = metadata?.[key];
    if (typeof candidate !== "string") continue;
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return void 0;
}
|
|
309
|
+
/**
 * Tell whether a document to insert is a URL listing, i.e. a pointer to a
 * page rather than ready-made content.
 *
 * @param {object} document - Document from a bulk "insert" operation.
 * @returns {boolean} True when `metadata.type` is "url" and `metadata.url`
 *   is a parseable absolute http(s) URL.
 */
function isUrlListingInsert(document) {
  const metadata = document.metadata;
  if (metadata?.type !== "url") return false;
  if (typeof metadata.url !== "string") return false;
  const candidate = metadata.url.trim();
  if (candidate === "") return false;
  let protocol;
  try {
    protocol = new URL(candidate).protocol;
  } catch {
    // Not an absolute URL.
    return false;
  }
  return ["http:", "https:"].includes(protocol);
}
|
|
321
|
+
var WebRAGPlugin = class {
|
|
8
322
|
name = "web-rag";
|
|
9
323
|
type = "rag";
|
|
10
324
|
priority;
|
|
@@ -42,6 +356,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
42
356
|
}
|
|
43
357
|
return this.db.collection(this.config.collection);
|
|
44
358
|
}
|
|
359
|
+
ledgerIndexesEnsured = false;
|
|
45
360
|
async getLedgerCollection() {
|
|
46
361
|
if (!this.client) {
|
|
47
362
|
this.client = new MongoClient(this.config.mongoUri);
|
|
@@ -49,7 +364,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
49
364
|
this.db = this.client.db(this.config.dbName);
|
|
50
365
|
}
|
|
51
366
|
const name = this.config.crawlLedger?.collection ?? "web_crawl_ledger";
|
|
52
|
-
|
|
367
|
+
const col = this.db.collection(name);
|
|
368
|
+
if (!this.ledgerIndexesEnsured) {
|
|
369
|
+
this.ledgerIndexesEnsured = true;
|
|
370
|
+
await col.createIndex(
|
|
371
|
+
{ tenantId: 1, agentId: 1, urlNormalized: 1 },
|
|
372
|
+
{ unique: true }
|
|
373
|
+
);
|
|
374
|
+
await col.createIndex({ tenantId: 1, agentId: 1, ingestionId: 1, lastCrawledAt: -1 });
|
|
375
|
+
}
|
|
376
|
+
return col;
|
|
53
377
|
}
|
|
54
378
|
/**
|
|
55
379
|
* List recent crawl ledger rows (for dashboards / pagination in the front).
|
|
@@ -59,6 +383,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
59
383
|
const filter = { tenantId: this.config.tenantId };
|
|
60
384
|
filter.agentId = options.agentId ?? "shared";
|
|
61
385
|
if (options.domain) filter.domain = options.domain;
|
|
386
|
+
if (options.ingestionId) filter.ingestionId = options.ingestionId;
|
|
62
387
|
if (options.status) filter.lastStatus = options.status;
|
|
63
388
|
const limit = Math.min(Math.max(options.limit ?? 50, 1), 500);
|
|
64
389
|
const skip = Math.max(options.skip ?? 0, 0);
|
|
@@ -127,6 +452,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
127
452
|
lastCrawledAt: now,
|
|
128
453
|
updatedAt: now
|
|
129
454
|
};
|
|
455
|
+
if (params.ingestionId) {
|
|
456
|
+
$set.ingestionId = params.ingestionId;
|
|
457
|
+
}
|
|
130
458
|
if (errMsg !== void 0) {
|
|
131
459
|
$set.errorMessage = errMsg;
|
|
132
460
|
} else if (params.status === "indexed" && params.doc) {
|
|
@@ -139,9 +467,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
139
467
|
$set.docId = params.doc.id;
|
|
140
468
|
} else {
|
|
141
469
|
$set.modeUsed = params.diag?.modeUsed;
|
|
142
|
-
$set.contentLength = null;
|
|
143
|
-
$set.title = null;
|
|
144
|
-
$set.docId = null;
|
|
470
|
+
$set.contentLength = params.contentLength ?? null;
|
|
471
|
+
$set.title = params.title ?? null;
|
|
472
|
+
$set.docId = params.docId ?? null;
|
|
145
473
|
}
|
|
146
474
|
await col.updateOne(
|
|
147
475
|
{
|
|
@@ -220,6 +548,9 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
220
548
|
url: doc.metadata.url,
|
|
221
549
|
imageUrl: doc.metadata.imageUrl,
|
|
222
550
|
description: doc.metadata.description,
|
|
551
|
+
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
552
|
+
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
553
|
+
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
223
554
|
score: doc.score
|
|
224
555
|
}))
|
|
225
556
|
}
|
|
@@ -385,9 +716,27 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
385
716
|
let indexed = 0;
|
|
386
717
|
const errors = [];
|
|
387
718
|
const agentId = options?.agentId || "shared";
|
|
388
|
-
|
|
719
|
+
const onCrawlProgress = options?.metadata?.onCrawlProgress;
|
|
720
|
+
const indexingTotal = documents.length;
|
|
721
|
+
const chunkPlan = documents.map((doc) => this.chunkContent(doc.content));
|
|
722
|
+
const chunksTotal = chunkPlan.reduce((sum, chunks) => sum + chunks.length, 0);
|
|
723
|
+
let chunksProcessed = 0;
|
|
724
|
+
if (onCrawlProgress && indexingTotal > 0) {
|
|
725
|
+
this.emitCrawlProgress(
|
|
726
|
+
{ metadata: options?.metadata },
|
|
727
|
+
{
|
|
728
|
+
phase: "indexing",
|
|
729
|
+
urlsScheduled: indexingTotal,
|
|
730
|
+
pagesProcessed: 0,
|
|
731
|
+
chunksTotal,
|
|
732
|
+
chunksProcessed: 0
|
|
733
|
+
}
|
|
734
|
+
);
|
|
735
|
+
}
|
|
736
|
+
for (let docIndex = 0; docIndex < documents.length; docIndex++) {
|
|
737
|
+
const doc = documents[docIndex];
|
|
738
|
+
const chunks = chunkPlan[docIndex];
|
|
389
739
|
try {
|
|
390
|
-
const chunks = this.chunkContent(doc.content);
|
|
391
740
|
const isChunked = chunks.length > 1;
|
|
392
741
|
if (isChunked) {
|
|
393
742
|
await collection.deleteMany({
|
|
@@ -422,6 +771,19 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
422
771
|
},
|
|
423
772
|
{ upsert: true }
|
|
424
773
|
);
|
|
774
|
+
chunksProcessed++;
|
|
775
|
+
if (onCrawlProgress) {
|
|
776
|
+
this.emitCrawlProgress(
|
|
777
|
+
{ metadata: options?.metadata },
|
|
778
|
+
{
|
|
779
|
+
phase: "indexing",
|
|
780
|
+
urlsScheduled: indexingTotal,
|
|
781
|
+
pagesProcessed: docIndex + (i + 1 === chunks.length ? 1 : 0),
|
|
782
|
+
chunksTotal,
|
|
783
|
+
chunksProcessed
|
|
784
|
+
}
|
|
785
|
+
);
|
|
786
|
+
}
|
|
425
787
|
}
|
|
426
788
|
indexed++;
|
|
427
789
|
} catch (error) {
|
|
@@ -501,23 +863,57 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
501
863
|
let deleted = 0;
|
|
502
864
|
let failed = 0;
|
|
503
865
|
const errors = [];
|
|
866
|
+
const opsTotal = operations.length;
|
|
867
|
+
let opsDone = 0;
|
|
868
|
+
const ingestOptions = options ?? {};
|
|
869
|
+
this.emitBulkProgress(ingestOptions, {
|
|
870
|
+
phase: "processing",
|
|
871
|
+
opsTotal,
|
|
872
|
+
opsDone: 0
|
|
873
|
+
});
|
|
504
874
|
for (const op of operations) {
|
|
875
|
+
const currentUrl = bulkOpCurrentUrl(op);
|
|
505
876
|
try {
|
|
506
877
|
switch (op.type) {
|
|
507
878
|
case "insert":
|
|
508
879
|
if (op.document) {
|
|
509
|
-
|
|
510
|
-
|
|
880
|
+
if (isUrlListingInsert(op.document)) {
|
|
881
|
+
const url = bulkOpCurrentUrl(op);
|
|
882
|
+
const crawlResult = await this.ingestSinglePageFromUrl(
|
|
883
|
+
{
|
|
884
|
+
url,
|
|
885
|
+
metadata: {
|
|
886
|
+
...op.document.metadata ?? {},
|
|
887
|
+
url
|
|
888
|
+
}
|
|
889
|
+
},
|
|
890
|
+
ingestOptions
|
|
891
|
+
);
|
|
892
|
+
if (crawlResult.indexed > 0) {
|
|
893
|
+
inserted++;
|
|
894
|
+
} else {
|
|
895
|
+
failed++;
|
|
896
|
+
const err = crawlResult.errors?.[0]?.error ?? `Failed to crawl ${url}`;
|
|
897
|
+
errors.push({
|
|
898
|
+
id: op.id,
|
|
899
|
+
operation: op.type,
|
|
900
|
+
error: err
|
|
901
|
+
});
|
|
902
|
+
}
|
|
903
|
+
} else {
|
|
904
|
+
await this.ingest([op.document], ingestOptions);
|
|
905
|
+
inserted++;
|
|
906
|
+
}
|
|
511
907
|
}
|
|
512
908
|
break;
|
|
513
909
|
case "update":
|
|
514
910
|
if (op.document) {
|
|
515
|
-
await this.update(op.id, op.document,
|
|
911
|
+
await this.update(op.id, op.document, ingestOptions);
|
|
516
912
|
updated++;
|
|
517
913
|
}
|
|
518
914
|
break;
|
|
519
915
|
case "delete":
|
|
520
|
-
const count = await this.delete(op.id,
|
|
916
|
+
const count = await this.delete(op.id, ingestOptions);
|
|
521
917
|
deleted += count;
|
|
522
918
|
break;
|
|
523
919
|
}
|
|
@@ -528,6 +924,15 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
528
924
|
operation: op.type,
|
|
529
925
|
error: error.message || "Unknown error"
|
|
530
926
|
});
|
|
927
|
+
} finally {
|
|
928
|
+
opsDone++;
|
|
929
|
+
this.emitBulkProgress(ingestOptions, {
|
|
930
|
+
phase: "processing",
|
|
931
|
+
opsTotal,
|
|
932
|
+
opsDone,
|
|
933
|
+
currentOpType: op.type,
|
|
934
|
+
...currentUrl ? { currentUrl } : {}
|
|
935
|
+
});
|
|
531
936
|
}
|
|
532
937
|
}
|
|
533
938
|
return {
|
|
@@ -1094,6 +1499,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1094
1499
|
};
|
|
1095
1500
|
}
|
|
1096
1501
|
const dbg = this.createDebugCollector(config.debug);
|
|
1502
|
+
this.emitCrawlProgress(config, { phase: "discovering", urlsDiscovered: 0 });
|
|
1097
1503
|
const base = this.normalizeWebsiteUrl(config.baseUrl, stripQueryParams);
|
|
1098
1504
|
if (!base) {
|
|
1099
1505
|
return {
|
|
@@ -1125,6 +1531,10 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1125
1531
|
if (config.excludePatterns?.length) {
|
|
1126
1532
|
filteredUrls = filteredUrls.filter((u) => !config.excludePatterns.some((p) => u.includes(p)));
|
|
1127
1533
|
}
|
|
1534
|
+
this.emitCrawlProgress(config, {
|
|
1535
|
+
phase: "discovering",
|
|
1536
|
+
urlsDiscovered: filteredUrls.length
|
|
1537
|
+
});
|
|
1128
1538
|
urlsToCrawl = filteredUrls.slice(0, maxPages);
|
|
1129
1539
|
urlsSkipped = Math.max(0, filteredUrls.length - urlsToCrawl.length);
|
|
1130
1540
|
break;
|
|
@@ -1146,7 +1556,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1146
1556
|
urlsToCrawl = discovery.urls;
|
|
1147
1557
|
urlsSkipped = discovery.skipped;
|
|
1148
1558
|
dbg.log("discovery.linkLookup", { discovered: urlsToCrawl.length, skipped: urlsSkipped });
|
|
1559
|
+
this.emitCrawlProgress(config, {
|
|
1560
|
+
phase: "discovering",
|
|
1561
|
+
urlsDiscovered: urlsToCrawl.length
|
|
1562
|
+
});
|
|
1149
1563
|
}
|
|
1564
|
+
this.emitCrawlProgress(config, {
|
|
1565
|
+
phase: "crawling",
|
|
1566
|
+
urlsDiscovered: urlsToCrawl.length,
|
|
1567
|
+
urlsScheduled: urlsToCrawl.length
|
|
1568
|
+
});
|
|
1150
1569
|
const result = await this.crawlUrls(urlsToCrawl, {
|
|
1151
1570
|
contentSelector: config.contentSelector,
|
|
1152
1571
|
titleSelector: config.titleSelector,
|
|
@@ -1168,9 +1587,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1168
1587
|
return {
|
|
1169
1588
|
...result,
|
|
1170
1589
|
urlsSkipped,
|
|
1590
|
+
/** URLs selected for this crawl (≤ maxPages); use as the denominator for progress UI. */
|
|
1591
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1171
1592
|
crawledAt: /* @__PURE__ */ new Date(),
|
|
1172
1593
|
metadata: {
|
|
1173
1594
|
...result.metadata || {},
|
|
1595
|
+
urlsScheduled: urlsToCrawl.length,
|
|
1174
1596
|
discoveryDebug: dbg.summary()
|
|
1175
1597
|
}
|
|
1176
1598
|
};
|
|
@@ -1299,7 +1721,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1299
1721
|
return await response.text();
|
|
1300
1722
|
}
|
|
1301
1723
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1302
|
-
const $ =
|
|
1724
|
+
const $ = cheerio3.load(html);
|
|
1303
1725
|
const links = /* @__PURE__ */ new Set();
|
|
1304
1726
|
$("a[href]").each((_, el) => {
|
|
1305
1727
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -1398,6 +1820,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1398
1820
|
const forceRecrawl = !!(options && options.forceRecrawl);
|
|
1399
1821
|
const agentId = options?.agentId ?? "shared";
|
|
1400
1822
|
const stripQ = config.stripQueryParams ?? false;
|
|
1823
|
+
const ingestionId = typeof config.metadata?.ingestionId === "string" && config.metadata.ingestionId.trim() ? config.metadata.ingestionId.trim() : void 0;
|
|
1401
1824
|
const urlByNorm = /* @__PURE__ */ new Map();
|
|
1402
1825
|
for (const u of urls) {
|
|
1403
1826
|
const norm = this.normalizeLedgerUrl(u, stripQ) || u;
|
|
@@ -1426,6 +1849,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1426
1849
|
const results = await Promise.allSettled(
|
|
1427
1850
|
batch.map(async (url) => {
|
|
1428
1851
|
const urlNormalized = this.normalizeLedgerUrl(url, stripQ) || url;
|
|
1852
|
+
this.emitCrawlPage(config, { url, event: "start" });
|
|
1429
1853
|
if (ledgerOpts && !forceRecrawl) {
|
|
1430
1854
|
const entry = await this.findLedgerEntry(urlNormalized, agentId);
|
|
1431
1855
|
if (this.shouldSkipLedger(
|
|
@@ -1446,11 +1870,24 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1446
1870
|
docId: entry?.docId
|
|
1447
1871
|
});
|
|
1448
1872
|
dbg.log("crawl.ledgerSkip", { url, urlNormalized, lastStatus: entry?.lastStatus });
|
|
1873
|
+
if (ledgerOpts) {
|
|
1874
|
+
await this.upsertLedgerRecord({
|
|
1875
|
+
url,
|
|
1876
|
+
urlNormalized,
|
|
1877
|
+
agentId,
|
|
1878
|
+
ingestionId,
|
|
1879
|
+
status: "skipped_ledger",
|
|
1880
|
+
title: entry?.title,
|
|
1881
|
+
docId: entry?.docId,
|
|
1882
|
+
contentLength: entry?.contentLength
|
|
1883
|
+
});
|
|
1884
|
+
}
|
|
1885
|
+
this.emitCrawlPage(config, { url, event: "done", status: "skipped_ledger" });
|
|
1449
1886
|
return { kind: "ledger_skip", url };
|
|
1450
1887
|
}
|
|
1451
1888
|
}
|
|
1452
1889
|
try {
|
|
1453
|
-
const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
|
|
1890
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
1454
1891
|
renderMode,
|
|
1455
1892
|
renderOptions,
|
|
1456
1893
|
minContentLength,
|
|
@@ -1469,6 +1906,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1469
1906
|
url,
|
|
1470
1907
|
urlNormalized,
|
|
1471
1908
|
agentId,
|
|
1909
|
+
ingestionId,
|
|
1472
1910
|
status: crawlSt,
|
|
1473
1911
|
doc,
|
|
1474
1912
|
diag
|
|
@@ -1480,11 +1918,17 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1480
1918
|
status: crawlSt,
|
|
1481
1919
|
modeUsed: diag?.modeUsed,
|
|
1482
1920
|
contentLength: doc?.content?.length,
|
|
1483
|
-
bodyTextLengthHint,
|
|
1921
|
+
bodyTextLengthHint: bodyTextLengthHint2,
|
|
1484
1922
|
title: doc?.metadata?.title,
|
|
1485
1923
|
docId: doc?.id,
|
|
1486
1924
|
error: diag?.errorMessage
|
|
1487
1925
|
});
|
|
1926
|
+
this.emitCrawlPage(config, {
|
|
1927
|
+
url,
|
|
1928
|
+
event: "done",
|
|
1929
|
+
status: crawlSt,
|
|
1930
|
+
error: diag?.errorMessage
|
|
1931
|
+
});
|
|
1488
1932
|
return { kind: "doc", doc, url };
|
|
1489
1933
|
} catch (error) {
|
|
1490
1934
|
const msg = error instanceof Error ? error.message : String(error);
|
|
@@ -1493,6 +1937,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1493
1937
|
url,
|
|
1494
1938
|
urlNormalized,
|
|
1495
1939
|
agentId,
|
|
1940
|
+
ingestionId,
|
|
1496
1941
|
status: "error",
|
|
1497
1942
|
errorMessage: msg
|
|
1498
1943
|
});
|
|
@@ -1503,6 +1948,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1503
1948
|
status: "error",
|
|
1504
1949
|
error: msg
|
|
1505
1950
|
});
|
|
1951
|
+
this.emitCrawlPage(config, { url, event: "done", status: "error", error: msg });
|
|
1506
1952
|
throw { url, error };
|
|
1507
1953
|
}
|
|
1508
1954
|
})
|
|
@@ -1525,12 +1971,23 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1525
1971
|
});
|
|
1526
1972
|
}
|
|
1527
1973
|
}
|
|
1974
|
+
this.emitCrawlProgress(config, {
|
|
1975
|
+
phase: "crawling",
|
|
1976
|
+
urlsScheduled: uniqueUrls.length,
|
|
1977
|
+
pagesProcessed: Math.min(i + batch.length, uniqueUrls.length)
|
|
1978
|
+
});
|
|
1528
1979
|
if (i + concurrency < uniqueUrls.length) {
|
|
1529
1980
|
await this.delay(delayMs);
|
|
1530
1981
|
}
|
|
1531
1982
|
}
|
|
1532
1983
|
if (documents.length > 0) {
|
|
1533
|
-
const ingestResult = await this.ingest(documents,
|
|
1984
|
+
const ingestResult = await this.ingest(documents, {
|
|
1985
|
+
...options,
|
|
1986
|
+
metadata: {
|
|
1987
|
+
...options?.metadata ?? {},
|
|
1988
|
+
onCrawlProgress: config.metadata?.onCrawlProgress
|
|
1989
|
+
}
|
|
1990
|
+
});
|
|
1534
1991
|
indexed = ingestResult.indexed;
|
|
1535
1992
|
if (ingestResult.errors) {
|
|
1536
1993
|
errors.push(...ingestResult.errors);
|
|
@@ -1573,125 +2030,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1573
2030
|
const html = await response.text();
|
|
1574
2031
|
return this.extractDocumentFromHtml(url, html, config);
|
|
1575
2032
|
}
|
|
1576
|
-
/**
|
|
1577
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
1578
|
-
* would otherwise hit an empty wrapper.
|
|
1579
|
-
*/
|
|
1580
|
-
static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
1581
|
-
stripNoiseFromDom($, config) {
|
|
1582
|
-
const removeSelectors = config.removeSelectors || [
|
|
1583
|
-
"script",
|
|
1584
|
-
"style",
|
|
1585
|
-
"nav",
|
|
1586
|
-
"header",
|
|
1587
|
-
"footer",
|
|
1588
|
-
".sidebar",
|
|
1589
|
-
".navigation",
|
|
1590
|
-
".menu",
|
|
1591
|
-
".comments",
|
|
1592
|
-
'[role="navigation"]',
|
|
1593
|
-
'[role="banner"]'
|
|
1594
|
-
];
|
|
1595
|
-
removeSelectors.forEach((selector) => $(selector).remove());
|
|
1596
|
-
}
|
|
1597
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
1598
|
-
extractBestContentText($, config) {
|
|
1599
|
-
const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
|
|
1600
|
-
const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
|
|
1601
|
-
let best = "";
|
|
1602
|
-
for (const sel of selectors) {
|
|
1603
|
-
$(sel).each((_, el) => {
|
|
1604
|
-
const t = this.cleanContent($(el).text().trim());
|
|
1605
|
-
if (t.length > best.length) best = t;
|
|
1606
|
-
});
|
|
1607
|
-
}
|
|
1608
|
-
const bodyText = this.cleanContent($("body").text().trim());
|
|
1609
|
-
if (bodyText.length > best.length) best = bodyText;
|
|
1610
|
-
return best;
|
|
1611
|
-
}
|
|
1612
2033
|
bodyTextLengthHint(html, config) {
|
|
1613
|
-
|
|
1614
|
-
this.stripNoiseFromDom($, config);
|
|
1615
|
-
return this.cleanContent($("body").text().trim()).length;
|
|
2034
|
+
return bodyTextLengthHint(html, config);
|
|
1616
2035
|
}
|
|
1617
2036
|
extractDocumentFromHtml(url, html, config) {
|
|
1618
|
-
const
|
|
1619
|
-
|
|
1620
|
-
const titleSelector = config.titleSelector || "h1, title";
|
|
1621
|
-
let title = $(titleSelector).first().text().trim();
|
|
1622
|
-
if (!title) {
|
|
1623
|
-
title = $("title").text().trim();
|
|
1624
|
-
}
|
|
1625
|
-
const content = this.extractBestContentText($, config);
|
|
1626
|
-
const minChars = config.minExtractedContentLength ?? 50;
|
|
1627
|
-
if (!content || content.length < minChars) return null;
|
|
1628
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1629
|
-
this.extractHeroImage($, url) || void 0;
|
|
1630
|
-
let imageUrl;
|
|
1631
|
-
if (image) {
|
|
1632
|
-
try {
|
|
1633
|
-
imageUrl = new URL(image, url).href;
|
|
1634
|
-
} catch {
|
|
1635
|
-
imageUrl = image;
|
|
1636
|
-
}
|
|
1637
|
-
}
|
|
1638
|
-
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
1639
|
-
let type = config.defaultType || "page";
|
|
1640
|
-
if (config.typeFromUrl) {
|
|
1641
|
-
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
1642
|
-
if (url.includes(pattern)) {
|
|
1643
|
-
type = typeName;
|
|
1644
|
-
break;
|
|
1645
|
-
}
|
|
1646
|
-
}
|
|
1647
|
-
}
|
|
1648
|
-
const id = this.urlToId(url);
|
|
2037
|
+
const extracted = extractPageFromHtml(url, html, config);
|
|
2038
|
+
if (!extracted.indexable) return null;
|
|
1649
2039
|
return {
|
|
1650
|
-
id,
|
|
1651
|
-
content,
|
|
1652
|
-
metadata:
|
|
1653
|
-
type,
|
|
1654
|
-
title,
|
|
1655
|
-
url,
|
|
1656
|
-
...imageUrl ? { imageUrl } : {},
|
|
1657
|
-
...description ? { description } : {},
|
|
1658
|
-
...config.metadata
|
|
1659
|
-
}
|
|
2040
|
+
id: extracted.id,
|
|
2041
|
+
content: extracted.content,
|
|
2042
|
+
metadata: extracted.metadata
|
|
1660
2043
|
};
|
|
1661
2044
|
}
|
|
1662
|
-
/**
|
|
1663
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1664
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1665
|
-
*/
|
|
1666
|
-
extractHeroImage($, pageUrl) {
|
|
1667
|
-
const containers = $('main, article, [role="main"], #content, .content');
|
|
1668
|
-
const scope = containers.length > 0 ? containers : $("body");
|
|
1669
|
-
let best;
|
|
1670
|
-
scope.find("img[src]").each((_, el) => {
|
|
1671
|
-
if (best) return false;
|
|
1672
|
-
const src = $(el).attr("src") || "";
|
|
1673
|
-
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1674
|
-
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1675
|
-
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1676
|
-
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1677
|
-
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1678
|
-
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1679
|
-
if (src.includes("/_next/image")) {
|
|
1680
|
-
try {
|
|
1681
|
-
const nextUrl = new URL(src, pageUrl);
|
|
1682
|
-
const realUrl = nextUrl.searchParams.get("url");
|
|
1683
|
-
if (realUrl) {
|
|
1684
|
-
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1685
|
-
return false;
|
|
1686
|
-
}
|
|
1687
|
-
} catch {
|
|
1688
|
-
}
|
|
1689
|
-
}
|
|
1690
|
-
best = src;
|
|
1691
|
-
return false;
|
|
1692
|
-
});
|
|
1693
|
-
return best;
|
|
1694
|
-
}
|
|
1695
2045
|
looksLikeDynamicShell(html) {
|
|
1696
2046
|
const lower = html.toLowerCase();
|
|
1697
2047
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
|
@@ -1709,7 +2059,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1709
2059
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
1710
2060
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
1711
2061
|
}
|
|
1712
|
-
diagFromRenderedAttempt(doc,
|
|
2062
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
1713
2063
|
if (blockedSuspected) {
|
|
1714
2064
|
return {
|
|
1715
2065
|
doc: null,
|
|
@@ -1725,12 +2075,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1725
2075
|
return {
|
|
1726
2076
|
doc,
|
|
1727
2077
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
1728
|
-
bodyTextLengthHint: doc ? void 0 :
|
|
2078
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
1729
2079
|
};
|
|
1730
2080
|
}
|
|
1731
2081
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
1732
2082
|
if (ctx.renderMode === true) {
|
|
1733
|
-
const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2083
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
1734
2084
|
url,
|
|
1735
2085
|
config,
|
|
1736
2086
|
timeout,
|
|
@@ -1739,7 +2089,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1739
2089
|
);
|
|
1740
2090
|
return this.diagFromRenderedAttempt(
|
|
1741
2091
|
doc,
|
|
1742
|
-
|
|
2092
|
+
bodyTextLengthHint2,
|
|
1743
2093
|
renderFailure,
|
|
1744
2094
|
blockedSuspected,
|
|
1745
2095
|
"render_ok",
|
|
@@ -1856,7 +2206,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1856
2206
|
}
|
|
1857
2207
|
}
|
|
1858
2208
|
const html = await page.content();
|
|
1859
|
-
const
|
|
2209
|
+
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
1860
2210
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
1861
2211
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
1862
2212
|
try {
|
|
@@ -1871,7 +2221,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1871
2221
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
1872
2222
|
}
|
|
1873
2223
|
}
|
|
1874
|
-
return { doc, bodyTextLengthHint };
|
|
2224
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
1875
2225
|
} catch (e) {
|
|
1876
2226
|
const msg = String(e?.message || e || "render_failed");
|
|
1877
2227
|
const lower = msg.toLowerCase();
|
|
@@ -1921,6 +2271,30 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1921
2271
|
}
|
|
1922
2272
|
return Array.from(found);
|
|
1923
2273
|
}
|
|
2274
|
+
emitBulkProgress(options, update) {
|
|
2275
|
+
const fn = options?.metadata?.onBulkProgress;
|
|
2276
|
+
if (!fn) return;
|
|
2277
|
+
try {
|
|
2278
|
+
fn(update);
|
|
2279
|
+
} catch {
|
|
2280
|
+
}
|
|
2281
|
+
}
|
|
2282
|
+
emitCrawlProgress(config, update) {
|
|
2283
|
+
const fn = config.metadata?.onCrawlProgress;
|
|
2284
|
+
if (!fn) return;
|
|
2285
|
+
try {
|
|
2286
|
+
fn(update);
|
|
2287
|
+
} catch {
|
|
2288
|
+
}
|
|
2289
|
+
}
|
|
2290
|
+
emitCrawlPage(config, event) {
|
|
2291
|
+
const fn = config.metadata?.onCrawlPage;
|
|
2292
|
+
if (!fn) return;
|
|
2293
|
+
try {
|
|
2294
|
+
fn(event);
|
|
2295
|
+
} catch {
|
|
2296
|
+
}
|
|
2297
|
+
}
|
|
1924
2298
|
createDebugCollector(debug) {
|
|
1925
2299
|
const enabled = !!debug?.enabled;
|
|
1926
2300
|
const level = debug?.level || "summary";
|
|
@@ -1939,14 +2313,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1939
2313
|
/**
|
|
1940
2314
|
* Clean extracted text content
|
|
1941
2315
|
*/
|
|
1942
|
-
cleanContent(text) {
|
|
1943
|
-
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
1944
|
-
}
|
|
1945
|
-
/**
|
|
1946
|
-
* Convert URL to a stable document ID
|
|
1947
|
-
*/
|
|
1948
2316
|
urlToId(url) {
|
|
1949
|
-
return url
|
|
2317
|
+
return urlToDocumentId(url);
|
|
1950
2318
|
}
|
|
1951
2319
|
/**
|
|
1952
2320
|
* Delay helper
|
|
@@ -2209,10 +2577,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2209
2577
|
filterableFields: this.config.filterableFields,
|
|
2210
2578
|
typeBoosts: this.config.typeBoosts,
|
|
2211
2579
|
recencyBoost: this.config.recencyBoost,
|
|
2580
|
+
crawlLedger: this.config.crawlLedger,
|
|
2212
2581
|
priority: this.priority
|
|
2213
2582
|
};
|
|
2214
2583
|
}
|
|
2215
2584
|
};
|
|
2216
2585
|
export {
|
|
2217
|
-
WebRAGPlugin
|
|
2586
|
+
WebRAGPlugin,
|
|
2587
|
+
bodyTextLengthHint,
|
|
2588
|
+
extractPageFromHtml,
|
|
2589
|
+
extractProductMetadata,
|
|
2590
|
+
normalizeAvailability,
|
|
2591
|
+
normalizeCurrency,
|
|
2592
|
+
parsePrice,
|
|
2593
|
+
urlToDocumentId
|
|
2218
2594
|
};
|