@snap-agent/rag-web 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +67 -18
- package/dist/index.d.ts +67 -18
- package/dist/index.js +590 -134
- package/dist/index.mjs +578 -133
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -30,16 +30,566 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/index.ts
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
|
-
WebRAGPlugin: () => WebRAGPlugin
|
|
33
|
+
WebRAGPlugin: () => WebRAGPlugin,
|
|
34
|
+
bodyTextLengthHint: () => bodyTextLengthHint,
|
|
35
|
+
extractPageFromHtml: () => extractPageFromHtml,
|
|
36
|
+
extractProductMetadata: () => extractProductMetadata,
|
|
37
|
+
hardExcludePage: () => hardExcludePage,
|
|
38
|
+
inferTypeFromUrl: () => inferTypeFromUrl,
|
|
39
|
+
normalizeAvailability: () => normalizeAvailability,
|
|
40
|
+
normalizeCurrency: () => normalizeCurrency,
|
|
41
|
+
normalizeDisplayTitle: () => normalizeDisplayTitle,
|
|
42
|
+
parsePrice: () => parsePrice,
|
|
43
|
+
resolvePageCardMetadata: () => resolvePageCardMetadata,
|
|
44
|
+
urlToDocumentId: () => urlToDocumentId
|
|
34
45
|
});
|
|
35
46
|
module.exports = __toCommonJS(index_exports);
|
|
36
47
|
|
|
37
48
|
// src/WebRAGPlugin.ts
|
|
38
49
|
var import_mongodb = require("mongodb");
|
|
39
50
|
var import_openai = __toESM(require("openai"));
|
|
40
|
-
var
|
|
51
|
+
var cheerio4 = __toESM(require("cheerio"));
|
|
41
52
|
var fs = __toESM(require("fs"));
|
|
42
53
|
var path = __toESM(require("path"));
|
|
54
|
+
|
|
55
|
+
// src/htmlPageExtract.ts
|
|
56
|
+
var cheerio3 = __toESM(require("cheerio"));
|
|
57
|
+
|
|
58
|
+
// src/productMetadata.ts
|
|
59
|
+
var cheerio = __toESM(require("cheerio"));
|
|
60
|
+
/**
 * Collects price, currency and availability for a page.
 *
 * Three extractors are consulted (JSON-LD, Open Graph meta tags, microdata);
 * for each field the first source, in that order, that produced a value wins.
 * Fields that no source produced are left off the result.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractProductMetadata(html) {
  const $ = cheerio.load(html);
  const sources = [
    extractFromJsonLd($),
    extractFromOpenGraph($),
    extractFromMicrodata($)
  ];
  // First non-nullish value for a field, following source precedence.
  const pick = (field) => sources.map((source) => source[field]).find((candidate) => candidate != null);

  const merged = {};
  const price = pick("price");
  if (price != null) {
    merged.price = price;
  }
  const currency = pick("currency");
  if (currency) {
    merged.currency = currency;
  }
  const availability = pick("availability");
  if (availability) {
    merged.availability = availability;
  }
  return merged;
}
|
|
74
|
+
/**
 * Reads price, currency and availability from JSON-LD script tags.
 *
 * Every `application/ld+json` script is parsed (invalid JSON is skipped);
 * nodes typed as Product contribute the fields of the offer chosen by
 * `pickOffer`. Scanning stops as soon as all three fields are known.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromJsonLd($) {
  const found = {};
  const isComplete = () => found.price != null && found.currency && found.availability;

  for (const scriptEl of $('script[type="application/ld+json"]').toArray()) {
    if (isComplete()) break;

    const jsonText = $(scriptEl).html()?.trim();
    if (!jsonText) continue;

    let data;
    try {
      data = JSON.parse(jsonText);
    } catch {
      // Malformed JSON-LD is common in the wild; ignore this script tag.
      continue;
    }

    for (const node of collectJsonLdNodes(data)) {
      if (!isProductType(node)) continue;
      const offer = pickOffer(node);
      if (!offer) continue;

      if (found.price == null) {
        const amount = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
        if (amount != null) found.price = amount;
      }
      if (!found.currency) {
        const code = normalizeCurrency(offer.priceCurrency);
        if (code) found.currency = code;
      }
      if (!found.availability) {
        const stock = normalizeAvailability(offer.availability);
        if (stock) found.availability = stock;
      }
    }
  }
  return found;
}
|
|
106
|
+
/**
 * Reads price, currency and availability from Open Graph style meta tags.
 *
 * For each field the `product:*` property is tried first and the `og:*`
 * property second.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromOpenGraph($) {
  // Content of the first of the two meta properties that has a truthy value.
  const metaContent = (primary, secondary) =>
    $(`meta[property="${primary}"]`).attr("content") || $(`meta[property="${secondary}"]`).attr("content");

  const found = {};

  const amount = parsePrice(metaContent("product:price:amount", "og:price:amount"));
  if (amount != null) {
    found.price = amount;
  }

  const code = normalizeCurrency(metaContent("product:price:currency", "og:price:currency"));
  if (code) {
    found.currency = code;
  }

  const stock = normalizeAvailability(metaContent("product:availability", "og:availability"));
  if (stock) {
    found.availability = stock;
  }

  return found;
}
|
|
121
|
+
/**
 * Finds the first element carrying the given `itemprop`.
 *
 * When the document has an element whose `itemtype` mentions
 * schema.org/Product, the search is limited to the first such element;
 * otherwise the whole document is searched.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {string} itemprop - Microdata property name to look for.
 * @returns {object} Cheerio selection (possibly empty).
 */
function microdataField($, itemprop) {
  const propSelector = `[itemprop="${itemprop}"]`;
  const productScope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
  if (productScope.length > 0) {
    return productScope.find(propSelector).first();
  }
  return $(propSelector).first();
}
|
|
125
|
+
/**
 * Reads price, currency and availability from microdata (`itemprop`) markup.
 *
 * For each field the listed attributes are tried in order and the element's
 * text is the last resort.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromMicrodata($) {
  // First truthy attribute value of the itemprop element, else its text.
  const readField = (itemprop, attributeNames) => {
    const element = microdataField($, itemprop);
    for (const attributeName of attributeNames) {
      const attributeValue = element.attr(attributeName);
      if (attributeValue) return attributeValue;
    }
    return element.text();
  };

  const found = {};

  const amount = parsePrice(readField("price", ["content"]));
  if (amount != null) {
    found.price = amount;
  }

  const code = normalizeCurrency(readField("priceCurrency", ["content"]));
  if (code) {
    found.currency = code;
  }

  const stock = normalizeAvailability(readField("availability", ["content", "href"]));
  if (stock) {
    found.availability = stock;
  }

  return found;
}
|
|
140
|
+
/**
 * Flattens a parsed JSON-LD payload into a list of node objects.
 *
 * Arrays are walked element by element; every object encountered is recorded
 * and its `@graph` member (if truthy) is walked as well. Other nested
 * properties are not followed. Nodes appear in pre-order.
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Collected node objects.
 */
function collectJsonLdNodes(data) {
  const collected = [];

  (function walk(item) {
    if (item == null) return;
    if (Array.isArray(item)) {
      for (const child of item) {
        walk(child);
      }
      return;
    }
    if (typeof item !== "object") return;
    collected.push(item);
    const graph = item["@graph"];
    if (graph) {
      walk(graph);
    }
  })(data);

  return collected;
}
|
|
156
|
+
/**
 * Tells whether a JSON-LD node declares itself as a Product.
 *
 * `@type` may be a single value or an array; a value counts when, lowercased,
 * it is exactly "product" or ends with "/product".
 *
 * @param {object} node - JSON-LD node.
 * @returns {boolean}
 */
function isProductType(node) {
  const declared = node["@type"];

  let candidates;
  if (Array.isArray(declared)) {
    candidates = declared;
  } else if (declared != null) {
    candidates = [declared];
  } else {
    candidates = [];
  }

  for (const candidate of candidates) {
    const lowered = String(candidate).toLowerCase();
    if (lowered === "product" || lowered.endsWith("/product")) {
      return true;
    }
  }
  return false;
}
|
|
164
|
+
/**
 * Chooses the offer object to read pricing from.
 *
 * `offers` may be a single object or an array; for arrays the first truthy
 * object entry is used.
 *
 * @param {object} product - JSON-LD Product node.
 * @returns {object|null} The chosen offer, or null when none is usable.
 */
function pickOffer(product) {
  const { offers } = product;
  if (offers == null || typeof offers !== "object") {
    return null;
  }
  if (!Array.isArray(offers)) {
    return offers;
  }
  for (const candidate of offers) {
    if (candidate && typeof candidate === "object") {
      return candidate;
    }
  }
  return null;
}
|
|
174
|
+
/**
 * Parses a price from a number or a loosely formatted string.
 *
 * Finite numbers are returned unchanged. Strings are stripped of everything
 * except digits, ".", "," and "-", then the separators are interpreted:
 *  - both "," and "." present: whichever occurs last is the decimal mark;
 *  - only ",": decimal mark when there is exactly one comma followed by at
 *    most two digits ("12,5"), otherwise grouping ("1,234", "1,234,567");
 *  - several "." and no ",": the dots are grouping separators ("1.234.567");
 *    a final group that is not three digits long is kept as decimals.
 *
 * @param {*} value - Raw price (number, string, or anything stringifiable).
 * @returns {number|undefined} Parsed amount, or undefined when unparseable.
 */
function parsePrice(value) {
  if (value == null || value === "") return void 0;
  if (typeof value === "number" && Number.isFinite(value)) return value;
  let s = String(value).trim();
  if (!s) return void 0;
  s = s.replace(/[^\d.,\-]/g, "");
  if (!s || s === "-" || s === ".") return void 0;
  const lastComma = s.lastIndexOf(",");
  const lastDot = s.lastIndexOf(".");
  if (lastComma > -1 && lastDot > -1) {
    if (lastComma > lastDot) {
      // "1.234,56": dots group thousands, comma is the decimal mark.
      s = s.replace(/\./g, "").replace(",", ".");
    } else {
      // "1,234.56": commas group thousands.
      s = s.replace(/,/g, "");
    }
  } else if (lastComma > -1) {
    const parts = s.split(",");
    if (parts.length === 2 && parts[1].length <= 2) {
      s = parts[0].replace(/\./g, "") + "." + parts[1];
    } else {
      s = s.replace(/,/g, "");
    }
  } else if (lastDot > -1 && s.indexOf(".") !== lastDot) {
    // Several dots cannot all be decimal marks; without this branch
    // parseFloat would stop at the second dot ("1.234.567" -> 1.234).
    const groups = s.split(".");
    const tail = groups[groups.length - 1];
    s = tail.length === 3 ? groups.join("") : groups.slice(0, -1).join("") + "." + tail;
  }
  const num = parseFloat(s);
  return Number.isFinite(num) ? num : void 0;
}
|
|
200
|
+
/**
 * Normalizes a currency value to an upper-case code or short symbol.
 *
 * A run of exactly three letters that is not part of a longer word is taken
 * as the currency code ("usd" -> "USD", "Price in USD" -> "USD"). Matching
 * only standalone runs avoids inventing codes from ordinary words
 * ("DOLLARS" must not become "DOL"). When no such run exists, inputs of at
 * most four characters (e.g. "$", "US$") are returned as-is; anything longer
 * is rejected.
 *
 * @param {*} value - Raw currency text.
 * @returns {string|undefined} Normalized currency, or undefined.
 */
function normalizeCurrency(value) {
  if (value == null) return void 0;
  const s = String(value).trim().toUpperCase();
  if (!s) return void 0;
  // Exactly three letters, not preceded or followed by another letter.
  const iso = s.match(/(?<![A-Z])[A-Z]{3}(?![A-Z])/);
  if (iso) return iso[0];
  return s.length <= 4 ? s : void 0;
}
|
|
207
|
+
/**
 * Reduces an availability value to a compact token.
 *
 * schema.org URLs and other URL/path-like values are reduced to their last
 * path segment ("https://schema.org/InStock" -> "InStock"); all whitespace
 * is then removed ("in stock" -> "instock").
 *
 * @param {*} value - Raw availability text or URL.
 * @returns {string|undefined} Normalized token, or undefined when empty.
 */
function normalizeAvailability(value) {
  if (value == null) return void 0;

  // Text after the last "/", or the input itself when that would be empty.
  const lastSegment = (text) => text.slice(text.lastIndexOf("/") + 1) || text;

  let token = String(value).trim();
  if (!token) return void 0;

  if (token.includes("schema.org/")) {
    token = lastSegment(token);
  }
  token = token.replace(/^https?:\/\/[^/]+\//, "");
  if (token.includes("/")) {
    token = lastSegment(token);
  }

  const compact = token.replace(/\s+/g, "");
  return compact || void 0;
}
|
|
222
|
+
|
|
223
|
+
// src/pageCardMetadata.ts
|
|
224
|
+
var cheerio2 = __toESM(require("cheerio"));
|
|
225
|
+
// URL segments marking pages that are never shown as cards (auth, cart, legal, taxonomy, ...).
var HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;

// Page titles marking the same kind of system pages.
var HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;

// Blog / news sections.
var BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;

// Promotional sections, and slugs that end in a promotional suffix.
var PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
var PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;

// Paths of the form /<section>/<slug> that identify a single entity (project, person, ...).
var ENTITY_DETAIL_PATH_RES = [
  "projects",
  "project",
  "perspectives",
  "perspective",
  "portfolio",
  "case-stud(?:y|ies)",
  "insights?",
  "people",
  "person",
  "team-members?",
  "members?",
  "staff",
  "experts?",
  "authors?",
  "leadership",
  "biograph(?:y|ies)"
].map((section) => new RegExp(`\\/${section}\\/[^/]+`, "i"));

// Section names used to classify a URL as detail, listing, amenity or contact.
var DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
var LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
var AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
var CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;

// Title suffixes introduced by an en/em dash or a pipe (typically the site name).
var EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
var PIPE_SUFFIX_RE = /\s+\|\s+.+$/;

// Card ranking weight per page type.
var CARD_PRIORITY = {
  detail: 10,
  listing: 6,
  amenity: 5,
  promotion: 2,
  contact: 1,
  content: 1,
  blog: 0,
  system: 0,
  page: 3
};

// Whether a page type may be rendered as a card by default.
var CARD_ELIGIBLE_DEFAULT = {
  detail: true,
  listing: true,
  amenity: true,
  promotion: false,
  contact: false,
  content: false,
  blog: false,
  system: false,
  page: false
};

// Lower-cased schema.org type name -> page type.
var SCHEMA_TYPE_MAP = {
  product: "detail",
  service: "amenity",
  hotelroom: "detail",
  room: "detail",
  apartment: "detail",
  lodgingroom: "detail",
  course: "detail",
  event: "detail",
  offer: "promotion",
  person: "detail",
  employee: "detail",
  profilepage: "detail",
  article: "detail",
  newsarticle: "detail",
  blogposting: "detail",
  creativework: "detail"
};
|
|
294
|
+
/**
 * Strips trailing " – suffix" / " | suffix" parts from a page title.
 *
 * At most two suffixes are removed. A dash suffix is only cut when at least
 * 4 characters precede it, a pipe suffix only when at least 8 do; the dash
 * rule is tried before the pipe rule on every pass. Blank or missing titles
 * are returned unchanged.
 *
 * @param {string|undefined} title - Raw title.
 * @returns {string|undefined} Cleaned title.
 */
function normalizeDisplayTitle(title) {
  if (!title?.trim()) return title;

  const trimmedInput = title.trim();
  // [pattern, minimum number of characters that must remain before the suffix]
  const suffixRules = [
    [EN_DASH_SUFFIX_RE, 4],
    [PIPE_SUFFIX_RE, 8]
  ];

  let current = trimmedInput;
  for (let pass = 0; pass < 2; pass += 1) {
    let cutAt = -1;
    for (const [pattern, minPrefix] of suffixRules) {
      const hit = current.match(pattern);
      if (hit && hit.index !== void 0 && hit.index >= minPrefix) {
        cutAt = hit.index;
        break;
      }
    }
    if (cutAt < 0) break;
    current = current.slice(0, cutAt).trim();
  }

  return current || trimmedInput;
}
|
|
312
|
+
/**
 * Decides whether a page must never be offered as a card.
 *
 * A page is excluded when its path matches the system-page or blog URL
 * patterns, when its title matches the system-page title pattern, or when it
 * is the site root.
 *
 * The URL patterns are applied to the pathname only. Matching the whole URL
 * would let the hostname or query string trigger an exclusion (for example
 * every page under "https://archive.org/" or "https://jobs.example.com/").
 * Strings that cannot be parsed as absolute URLs are matched as given.
 *
 * @param {string} url - Page URL (absolute) or path.
 * @param {string} [title] - Page title, if known.
 * @returns {boolean} True when the page is excluded.
 */
function hardExcludePage(url, title) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL (e.g. a bare path); fall back to the raw string.
    pathname = void 0;
  }
  if (pathname === "/" || pathname === "") return true;

  const path2 = (pathname ?? url).toLowerCase();
  if (HARD_EXCLUDE_URL_RE.test(path2)) return true;
  if (BLOG_URL_RE.test(path2)) return true;
  // The title pattern is case-insensitive, so no lower-casing is needed.
  if (title && HARD_EXCLUDE_TITLE_RE.test(title)) return true;
  return false;
}
|
|
324
|
+
/**
 * Guesses a page type from the URL path.
 *
 * Rules are checked in this order: promotion, contact, entity detail path,
 * amenity, detail, listing, blog. The first match wins.
 *
 * Only the pathname is inspected, so a hostname such as "deals.example.com"
 * or a query string cannot decide the type. Strings that cannot be parsed as
 * absolute URLs are matched as given.
 *
 * @param {string} url - Page URL (absolute) or path.
 * @returns {string|undefined} Inferred type, or undefined when nothing matches.
 */
function inferTypeFromUrl(url) {
  let path2;
  try {
    path2 = new URL(url).pathname;
  } catch {
    // Not an absolute URL (e.g. a bare path); classify the raw string.
    path2 = url;
  }
  path2 = path2.toLowerCase();

  if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
  if (CONTACT_URL_RE.test(path2)) return "contact";
  if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
  if (AMENITY_URL_RE.test(path2)) return "amenity";
  if (DETAIL_URL_RE.test(path2)) return "detail";
  if (LISTING_URL_RE.test(path2)) return "listing";
  if (BLOG_URL_RE.test(path2)) return "blog";
  return void 0;
}
|
|
335
|
+
/**
 * Flattens a parsed JSON-LD payload into a list of node objects.
 *
 * Arrays are walked element by element; every object encountered is recorded
 * and its `@graph` member (if truthy) is walked as well. Other nested
 * properties are not followed. Nodes appear in pre-order.
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Collected node objects.
 */
function collectJsonLdNodes2(data) {
  const collected = [];

  (function walk(item) {
    if (item == null) return;
    if (Array.isArray(item)) {
      for (const child of item) {
        walk(child);
      }
      return;
    }
    if (typeof item !== "object") return;
    collected.push(item);
    const graph = item["@graph"];
    if (graph) {
      walk(graph);
    }
  })(data);

  return collected;
}
|
|
351
|
+
/**
 * Returns the lower-cased short name of a JSON-LD node's type.
 *
 * Only the first entry is considered when `@type` is an array. Anything up
 * to and including the last "/" is dropped, so
 * "https://schema.org/Product" becomes "product".
 *
 * @param {object} node - JSON-LD node.
 * @returns {string} Short type name, or "" when the node has no type.
 */
function schemaTypeName(node) {
  const declared = node["@type"];
  const primary = Array.isArray(declared) ? declared[0] : declared;
  if (primary == null) {
    return "";
  }
  return String(primary).toLowerCase().split("/").pop();
}
|
|
360
|
+
/**
 * Infers a page type from structured data embedded in the HTML.
 *
 * JSON-LD nodes are inspected in document order: the first node whose short
 * type name is listed in SCHEMA_TYPE_MAP decides the type, and a node that
 * carries `offers` is treated as a detail page. When JSON-LD gives no
 * answer, an Open Graph type of "product" also yields "detail".
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {string|undefined} Inferred type, or undefined.
 */
function inferTypeFromSchema(html) {
  const $ = cheerio2.load(html);
  for (const el of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(el).html()?.trim();
    if (!raw) continue;
    let parsed;
    try {
      parsed = JSON.parse(raw);
    } catch {
      // Malformed JSON-LD: skip this script tag and keep looking.
      continue;
    }
    for (const node of collectJsonLdNodes2(parsed)) {
      const name = schemaTypeName(node);
      // Own-property check: the type name comes from page content, and a
      // plain lookup would resolve names such as "constructor" through
      // Object.prototype and return a non-string value.
      if (Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, name)) {
        return SCHEMA_TYPE_MAP[name];
      }
      // "product" is already covered by SCHEMA_TYPE_MAP above.
      if (node.offers != null) return "detail";
    }
  }
  const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
  if (ogType === "product") return "detail";
  return void 0;
}
|
|
379
|
+
/**
 * Maps a caller-supplied page type onto the known type vocabulary.
 *
 * Known names are returned lower-cased; "room"/"product" become "detail" and
 * "offer"/"sale" become "promotion" (all compared case-insensitively). A
 * missing value becomes "page"; any other value is returned unchanged.
 *
 * @param {string|undefined} raw - Type as configured by the caller.
 * @returns {string} Normalized type.
 */
function normalizePageType(raw) {
  if (!raw) return "page";

  const lowered = raw.toLowerCase();
  switch (lowered) {
    case "detail":
    case "listing":
    case "amenity":
    case "promotion":
    case "contact":
    case "content":
    case "blog":
    case "system":
    case "page":
      return lowered;
    case "room":
    case "product":
      return "detail";
    case "offer":
    case "sale":
      return "promotion";
    default:
      // Unknown custom types pass through with their original casing.
      return raw;
  }
}
|
|
398
|
+
/**
 * Picks the title shown on a card: the page heading when it is non-blank,
 * otherwise the page title, in both cases passed through
 * `normalizeDisplayTitle`.
 *
 * @param {{headingTitle?: string, title?: string}} input
 * @returns {string|undefined}
 */
function resolveDisplayTitle(input) {
  const trimmedHeading = input.headingTitle?.trim();
  const source = trimmedHeading ? trimmedHeading : input.title;
  return normalizeDisplayTitle(source);
}
|
|
403
|
+
/**
 * Computes the card-related metadata of a page.
 *
 * Hard-excluded pages are reported as type "system" and are never card
 * eligible. Otherwise the type starts from `input.type` and, while it is
 * still the generic "page", is refined from structured data in the HTML,
 * then from the URL, then from the presence of a product price. Eligibility
 * and priority come from the per-type tables; a page whose URL matches the
 * promotion pattern is never eligible.
 *
 * @param {object} input
 * @param {string} input.url - Page URL.
 * @param {string} [input.title] - Document title.
 * @param {string} [input.headingTitle] - Main heading text.
 * @param {string} [input.description] - Page description.
 * @param {string} [input.imageUrl] - Representative image URL.
 * @param {string} [input.html] - Raw HTML, used for structured-data inference.
 * @param {string} [input.type] - Caller-supplied page type.
 * @param {boolean} [input.hasProductPrice] - Whether a price was extracted.
 * @returns {{type: string, cardEligible: boolean, cardPriority: number,
 *   displayTitle: (string|undefined), displayDescription: (string|undefined),
 *   displayImageUrl: (string|undefined)}}
 */
function resolvePageCardMetadata(input) {
  const title = input.title?.trim();
  const url = input.url;
  const displayTitle = resolveDisplayTitle(input);
  // Normalized once so both return paths treat blank descriptions alike.
  const displayDescription = input.description?.trim() || void 0;
  const displayImageUrl = input.imageUrl;

  if (hardExcludePage(url, title)) {
    return {
      type: "system",
      cardEligible: false,
      cardPriority: 0,
      displayTitle,
      displayDescription,
      displayImageUrl
    };
  }

  let type = normalizePageType(input.type);
  if (type === "page" && input.html) {
    type = inferTypeFromSchema(input.html) || type;
  }
  if (type === "page") {
    type = inferTypeFromUrl(url) || type;
  }
  if (type === "page" && input.hasProductPrice) {
    type = "detail";
  }

  // Own-property lookup: custom type names such as "constructor" must not
  // resolve through Object.prototype and yield a non-boolean/non-number.
  const lookup = (table, key, fallback) =>
    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : fallback;
  const typeKey = String(type).toLowerCase();
  let cardEligible = lookup(CARD_ELIGIBLE_DEFAULT, typeKey, false);
  const cardPriority = lookup(CARD_PRIORITY, typeKey, 3);

  // NOTE(review): this tests the full URL, hostname included — confirm that
  // is intended rather than the pathname only.
  if (cardEligible && PROMOTION_URL_RE.test(url.toLowerCase())) {
    cardEligible = false;
  }

  return {
    type,
    cardEligible,
    cardPriority,
    displayTitle,
    displayDescription,
    displayImageUrl
  };
}
|
|
444
|
+
|
|
445
|
+
// src/htmlPageExtract.ts
|
|
446
|
+
// Candidate containers for the main content, joined into one selector list.
var DEFAULT_CONTENT_SELECTOR = [
  "article",
  "main",
  '[role="main"]',
  "#content",
  "#primary",
  "#main",
  ".content",
  ".post-content",
  ".entry-content",
  ".elementor-location-content",
  ".elementor-widget-theme-post-content",
  ".wp-block-group",
  ".site-content",
  ".ast-single-post",
  ".ast-page"
].join(", ");

// Elements removed from the DOM before any text is extracted.
var DEFAULT_REMOVE_SELECTORS = [
  "script",
  "style",
  "nav",
  "header",
  "footer",
  ".sidebar",
  ".navigation",
  ".menu",
  ".comments",
  '[role="navigation"]',
  '[role="banner"]'
];
|
|
460
|
+
/**
 * Derives a document id from a URL.
 *
 * The http(s) scheme is dropped, every run of non-alphanumeric characters
 * becomes a single "-", one leading and one trailing "-" are removed, and the
 * result is cut to 100 characters.
 *
 * @param {string} url - Page URL.
 * @returns {string} Id of at most 100 characters.
 */
function urlToDocumentId(url) {
  const withoutScheme = url.replace(/^https?:\/\//, "");
  const dashed = withoutScheme.replace(/[^a-zA-Z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-|-$/g, "");
  return trimmed.slice(0, 100);
}
|
|
463
|
+
/**
 * Collapses every run of whitespace (spaces, tabs, newlines) into a single
 * space and trims the result.
 *
 * The previous implementation chained two more replacements (blank-line and
 * tab handling) after the collapse; they could never match because no
 * newline or tab survives the first step, so they are removed. Output is
 * unchanged.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Single-line, single-spaced text.
 */
function cleanContent(text) {
  return text.replace(/\s+/g, " ").trim();
}
|
|
466
|
+
/**
 * Measures how much text the page body holds once noise elements are removed.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {object} [options] - Extraction options; `removeSelectors` overrides
 *   the default list of elements to strip.
 * @returns {number} Length of the cleaned body text.
 */
function bodyTextLengthHint(html, options = {}) {
  const $ = cheerio3.load(html);
  stripNoiseFromDom($, options);
  const bodyText = $("body").text().trim();
  return cleanContent(bodyText).length;
}
|
|
471
|
+
/**
 * Turns a page's HTML into an indexable document.
 *
 * Noise elements are stripped first. The function then resolves the title,
 * main text, representative image, description and page type, and merges in
 * product metadata and card metadata. Keys of `options.metadata` are applied
 * last and override computed ones.
 *
 * @param {string} url - Page URL; also used to resolve relative image URLs.
 * @param {string} html - Raw HTML of the page.
 * @param {object} [options] - Extraction options (`titleSelector`,
 *   `contentSelector`, `removeSelectors`, `minExtractedContentLength`,
 *   `defaultType`, `typeFromUrl`, `metadata`).
 * @returns {{id: string, metadata: object, content: string,
 *   indexable: boolean, contentPreview: string}} `indexable` is false when
 *   the text is shorter than `minExtractedContentLength` (default 50).
 */
function extractPageFromHtml(url, html, options = {}) {
  const $ = cheerio3.load(html);
  stripNoiseFromDom($, options);

  // Title: explicit selector if configured, else <title> then <h1>; when that
  // comes up empty, fall back to <h1> then <title>.
  const h1Title = $("h1").first().text().trim();
  const docTitle = $("title").text().trim();
  const preferredTitle = options.titleSelector
    ? $(options.titleSelector).first().text().trim()
    : docTitle || h1Title;
  const title = preferredTitle || h1Title || docTitle;

  const content = extractBestContentText($, options);
  const minChars = options.minExtractedContentLength ?? 50;
  const indexable = Boolean(content && content.length >= minChars);

  // Image: social meta tags, then product markup, then a hero image from the content.
  const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || extractHeroImage($, url) || void 0;
  let imageUrl;
  if (image) {
    try {
      imageUrl = new URL(image, url).href;
    } catch {
      // Unresolvable reference: keep it verbatim.
      imageUrl = image;
    }
  }

  const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;

  // Type: the first `typeFromUrl` pattern contained in the URL wins, else the default.
  const typeRule = options.typeFromUrl
    ? Object.entries(options.typeFromUrl).find(([pattern]) => url.includes(pattern))
    : void 0;
  const type = typeRule ? typeRule[1] : options.defaultType || "page";

  const productMeta = extractProductMetadata(html);
  const cardMeta = resolvePageCardMetadata({
    url,
    title,
    headingTitle: h1Title || void 0,
    description,
    imageUrl,
    html,
    type,
    hasProductPrice: productMeta.price != null
  });

  const metadata = {
    type: cardMeta.type,
    cardEligible: cardMeta.cardEligible,
    cardPriority: cardMeta.cardPriority,
    ...(title ? { title } : {}),
    ...(cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {}),
    url,
    ...(imageUrl ? { imageUrl } : {}),
    ...(cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {}),
    ...(description ? { description } : {}),
    ...(cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {}),
    ...(productMeta.price != null ? { price: productMeta.price } : {}),
    ...(productMeta.currency ? { currency: productMeta.currency } : {}),
    ...(productMeta.availability ? { availability: productMeta.availability } : {}),
    ...options.metadata
  };

  const previewLen = 400;
  const contentPreview = content.length > previewLen ? `${content.slice(0, previewLen)}\u2026` : content;

  return {
    id: urlToDocumentId(url),
    metadata,
    content,
    indexable,
    contentPreview
  };
}
|
|
544
|
+
/**
 * Removes non-content elements from the loaded document, in place.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {object} options - `removeSelectors` replaces the default list when
 *   it is not null/undefined.
 */
function stripNoiseFromDom($, options) {
  const selectorsToRemove = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
  for (const selector of selectorsToRemove) {
    $(selector).remove();
  }
}
|
|
548
|
+
/**
 * Returns the longest cleaned text found among the content candidates.
 *
 * Candidates are every element matched by each comma-separated part of the
 * content selector, plus the whole body. A later candidate replaces the
 * current best only when it is strictly longer.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {object} options - `contentSelector` overrides the default selector list.
 * @returns {string} Best text, or "" when the document has none.
 */
function extractBestContentText($, options) {
  const selectorList = (options.contentSelector || DEFAULT_CONTENT_SELECTOR)
    .split(",")
    .map((part) => part.trim())
    .filter(Boolean);

  let longest = "";
  const consider = (rawText) => {
    const cleaned = cleanContent(rawText.trim());
    if (cleaned.length > longest.length) {
      longest = cleaned;
    }
  };

  for (const selector of selectorList) {
    $(selector).each((_, el) => {
      consider($(el).text());
    });
  }
  consider($("body").text());

  return longest;
}
|
|
562
|
+
/**
 * Fallback image lookup: the first meaningful `<img>` in the content area.
 *
 * Images are searched inside the main content containers (or the body when
 * none exist). Skipped: images with a declared width or height below 80,
 * images whose src/alt look like logos, icons or loaders, data URIs and
 * ".svg" sources. For Next.js optimizer URLs ("/_next/image") the underlying
 * `url` query parameter is returned, resolved against the page URL.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {string} pageUrl - URL of the page, used as base for resolution.
 * @returns {string|undefined} Image reference, or undefined when none qualifies.
 */
function extractHeroImage($, pageUrl) {
  const mainAreas = $('main, article, [role="main"], #content, .content');
  const searchRoot = mainAreas.length > 0 ? mainAreas : $("body");
  const decorativePattern = /logo|icon|avatar|favicon|badge|spinner|loading/i;

  // Underlying image behind a Next.js optimizer URL, if it can be read.
  const unwrapNextImage = (src) => {
    try {
      const inner = new URL(src, pageUrl).searchParams.get("url");
      if (!inner) return void 0;
      return inner.startsWith("http") ? inner : new URL(inner, pageUrl).href;
    } catch {
      return void 0;
    }
  };

  let chosen;
  searchRoot.find("img[src]").each((_, el) => {
    const src = $(el).attr("src") || "";
    const alt = ($(el).attr("alt") || "").toLowerCase();
    const width = parseInt($(el).attr("width") || "0", 10);
    const height = parseInt($(el).attr("height") || "0", 10);

    const tooSmall = (width > 0 && width < 80) || (height > 0 && height < 80);
    if (tooSmall) return;
    if (decorativePattern.test(src + " " + alt)) return;
    if (src.startsWith("data:") || src.endsWith(".svg")) return;

    chosen = (src.includes("/_next/image") && unwrapNextImage(src)) || src;
    // First acceptable image wins: stop iterating.
    return false;
  });
  return chosen;
}
|
|
591
|
+
|
|
592
|
+
// src/WebRAGPlugin.ts
|
|
43
593
|
function bulkOpCurrentUrl(op) {
|
|
44
594
|
const meta = op.document?.metadata;
|
|
45
595
|
if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
|
|
@@ -58,7 +608,7 @@ function isUrlListingInsert(document) {
|
|
|
58
608
|
return false;
|
|
59
609
|
}
|
|
60
610
|
}
|
|
61
|
-
var WebRAGPlugin = class
|
|
611
|
+
var WebRAGPlugin = class {
|
|
62
612
|
name = "web-rag";
|
|
63
613
|
type = "rag";
|
|
64
614
|
priority;
|
|
@@ -281,13 +831,21 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
281
831
|
plugin: this.name,
|
|
282
832
|
contentCount: scoredResults.length,
|
|
283
833
|
types: [...new Set(scoredResults.map((d) => d.metadata.type))],
|
|
284
|
-
topResults: scoredResults.slice(0,
|
|
834
|
+
topResults: scoredResults.slice(0, 16).map((doc) => ({
|
|
285
835
|
id: doc.id,
|
|
286
836
|
type: doc.metadata.type,
|
|
287
837
|
title: doc.metadata.title,
|
|
288
838
|
url: doc.metadata.url,
|
|
289
839
|
imageUrl: doc.metadata.imageUrl,
|
|
290
840
|
description: doc.metadata.description,
|
|
841
|
+
cardEligible: doc.metadata.cardEligible,
|
|
842
|
+
cardPriority: doc.metadata.cardPriority,
|
|
843
|
+
displayTitle: doc.metadata.displayTitle,
|
|
844
|
+
displayDescription: doc.metadata.displayDescription,
|
|
845
|
+
displayImageUrl: doc.metadata.displayImageUrl,
|
|
846
|
+
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
847
|
+
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
848
|
+
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
291
849
|
score: doc.score
|
|
292
850
|
}))
|
|
293
851
|
}
|
|
@@ -1458,7 +2016,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1458
2016
|
return await response.text();
|
|
1459
2017
|
}
|
|
1460
2018
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1461
|
-
const $ =
|
|
2019
|
+
const $ = cheerio4.load(html);
|
|
1462
2020
|
const links = /* @__PURE__ */ new Set();
|
|
1463
2021
|
$("a[href]").each((_, el) => {
|
|
1464
2022
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -1624,7 +2182,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1624
2182
|
}
|
|
1625
2183
|
}
|
|
1626
2184
|
try {
|
|
1627
|
-
const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
|
|
2185
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
1628
2186
|
renderMode,
|
|
1629
2187
|
renderOptions,
|
|
1630
2188
|
minContentLength,
|
|
@@ -1655,7 +2213,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1655
2213
|
status: crawlSt,
|
|
1656
2214
|
modeUsed: diag?.modeUsed,
|
|
1657
2215
|
contentLength: doc?.content?.length,
|
|
1658
|
-
bodyTextLengthHint,
|
|
2216
|
+
bodyTextLengthHint: bodyTextLengthHint2,
|
|
1659
2217
|
title: doc?.metadata?.title,
|
|
1660
2218
|
docId: doc?.id,
|
|
1661
2219
|
error: diag?.errorMessage
|
|
@@ -1767,125 +2325,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1767
2325
|
const html = await response.text();
|
|
1768
2326
|
return this.extractDocumentFromHtml(url, html, config);
|
|
1769
2327
|
}
|
|
1770
|
-
/**
|
|
1771
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
1772
|
-
* would otherwise hit an empty wrapper.
|
|
1773
|
-
*/
|
|
1774
|
-
static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
1775
|
-
stripNoiseFromDom($, config) {
|
|
1776
|
-
const removeSelectors = config.removeSelectors || [
|
|
1777
|
-
"script",
|
|
1778
|
-
"style",
|
|
1779
|
-
"nav",
|
|
1780
|
-
"header",
|
|
1781
|
-
"footer",
|
|
1782
|
-
".sidebar",
|
|
1783
|
-
".navigation",
|
|
1784
|
-
".menu",
|
|
1785
|
-
".comments",
|
|
1786
|
-
'[role="navigation"]',
|
|
1787
|
-
'[role="banner"]'
|
|
1788
|
-
];
|
|
1789
|
-
removeSelectors.forEach((selector) => $(selector).remove());
|
|
1790
|
-
}
|
|
1791
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
1792
|
-
extractBestContentText($, config) {
|
|
1793
|
-
const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
|
|
1794
|
-
const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
|
|
1795
|
-
let best = "";
|
|
1796
|
-
for (const sel of selectors) {
|
|
1797
|
-
$(sel).each((_, el) => {
|
|
1798
|
-
const t = this.cleanContent($(el).text().trim());
|
|
1799
|
-
if (t.length > best.length) best = t;
|
|
1800
|
-
});
|
|
1801
|
-
}
|
|
1802
|
-
const bodyText = this.cleanContent($("body").text().trim());
|
|
1803
|
-
if (bodyText.length > best.length) best = bodyText;
|
|
1804
|
-
return best;
|
|
1805
|
-
}
|
|
1806
2328
|
bodyTextLengthHint(html, config) {
|
|
1807
|
-
|
|
1808
|
-
this.stripNoiseFromDom($, config);
|
|
1809
|
-
return this.cleanContent($("body").text().trim()).length;
|
|
2329
|
+
return bodyTextLengthHint(html, config);
|
|
1810
2330
|
}
|
|
1811
2331
|
extractDocumentFromHtml(url, html, config) {
|
|
1812
|
-
const
|
|
1813
|
-
|
|
1814
|
-
const titleSelector = config.titleSelector || "h1, title";
|
|
1815
|
-
let title = $(titleSelector).first().text().trim();
|
|
1816
|
-
if (!title) {
|
|
1817
|
-
title = $("title").text().trim();
|
|
1818
|
-
}
|
|
1819
|
-
const content = this.extractBestContentText($, config);
|
|
1820
|
-
const minChars = config.minExtractedContentLength ?? 50;
|
|
1821
|
-
if (!content || content.length < minChars) return null;
|
|
1822
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1823
|
-
this.extractHeroImage($, url) || void 0;
|
|
1824
|
-
let imageUrl;
|
|
1825
|
-
if (image) {
|
|
1826
|
-
try {
|
|
1827
|
-
imageUrl = new URL(image, url).href;
|
|
1828
|
-
} catch {
|
|
1829
|
-
imageUrl = image;
|
|
1830
|
-
}
|
|
1831
|
-
}
|
|
1832
|
-
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
1833
|
-
let type = config.defaultType || "page";
|
|
1834
|
-
if (config.typeFromUrl) {
|
|
1835
|
-
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
1836
|
-
if (url.includes(pattern)) {
|
|
1837
|
-
type = typeName;
|
|
1838
|
-
break;
|
|
1839
|
-
}
|
|
1840
|
-
}
|
|
1841
|
-
}
|
|
1842
|
-
const id = this.urlToId(url);
|
|
2332
|
+
const extracted = extractPageFromHtml(url, html, config);
|
|
2333
|
+
if (!extracted.indexable) return null;
|
|
1843
2334
|
return {
|
|
1844
|
-
id,
|
|
1845
|
-
content,
|
|
1846
|
-
metadata:
|
|
1847
|
-
type,
|
|
1848
|
-
title,
|
|
1849
|
-
url,
|
|
1850
|
-
...imageUrl ? { imageUrl } : {},
|
|
1851
|
-
...description ? { description } : {},
|
|
1852
|
-
...config.metadata
|
|
1853
|
-
}
|
|
2335
|
+
id: extracted.id,
|
|
2336
|
+
content: extracted.content,
|
|
2337
|
+
metadata: extracted.metadata
|
|
1854
2338
|
};
|
|
1855
2339
|
}
|
|
1856
|
-
/**
|
|
1857
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1858
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1859
|
-
*/
|
|
1860
|
-
extractHeroImage($, pageUrl) {
|
|
1861
|
-
const containers = $('main, article, [role="main"], #content, .content');
|
|
1862
|
-
const scope = containers.length > 0 ? containers : $("body");
|
|
1863
|
-
let best;
|
|
1864
|
-
scope.find("img[src]").each((_, el) => {
|
|
1865
|
-
if (best) return false;
|
|
1866
|
-
const src = $(el).attr("src") || "";
|
|
1867
|
-
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1868
|
-
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1869
|
-
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1870
|
-
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1871
|
-
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1872
|
-
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1873
|
-
if (src.includes("/_next/image")) {
|
|
1874
|
-
try {
|
|
1875
|
-
const nextUrl = new URL(src, pageUrl);
|
|
1876
|
-
const realUrl = nextUrl.searchParams.get("url");
|
|
1877
|
-
if (realUrl) {
|
|
1878
|
-
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1879
|
-
return false;
|
|
1880
|
-
}
|
|
1881
|
-
} catch {
|
|
1882
|
-
}
|
|
1883
|
-
}
|
|
1884
|
-
best = src;
|
|
1885
|
-
return false;
|
|
1886
|
-
});
|
|
1887
|
-
return best;
|
|
1888
|
-
}
|
|
1889
2340
|
looksLikeDynamicShell(html) {
|
|
1890
2341
|
const lower = html.toLowerCase();
|
|
1891
2342
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
|
@@ -1903,7 +2354,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1903
2354
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
1904
2355
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
1905
2356
|
}
|
|
1906
|
-
diagFromRenderedAttempt(doc,
|
|
2357
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
1907
2358
|
if (blockedSuspected) {
|
|
1908
2359
|
return {
|
|
1909
2360
|
doc: null,
|
|
@@ -1919,12 +2370,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1919
2370
|
return {
|
|
1920
2371
|
doc,
|
|
1921
2372
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
1922
|
-
bodyTextLengthHint: doc ? void 0 :
|
|
2373
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
1923
2374
|
};
|
|
1924
2375
|
}
|
|
1925
2376
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
1926
2377
|
if (ctx.renderMode === true) {
|
|
1927
|
-
const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2378
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
1928
2379
|
url,
|
|
1929
2380
|
config,
|
|
1930
2381
|
timeout,
|
|
@@ -1933,7 +2384,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1933
2384
|
);
|
|
1934
2385
|
return this.diagFromRenderedAttempt(
|
|
1935
2386
|
doc,
|
|
1936
|
-
|
|
2387
|
+
bodyTextLengthHint2,
|
|
1937
2388
|
renderFailure,
|
|
1938
2389
|
blockedSuspected,
|
|
1939
2390
|
"render_ok",
|
|
@@ -2050,7 +2501,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2050
2501
|
}
|
|
2051
2502
|
}
|
|
2052
2503
|
const html = await page.content();
|
|
2053
|
-
const
|
|
2504
|
+
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
2054
2505
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2055
2506
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
2056
2507
|
try {
|
|
@@ -2065,7 +2516,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2065
2516
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
2066
2517
|
}
|
|
2067
2518
|
}
|
|
2068
|
-
return { doc, bodyTextLengthHint };
|
|
2519
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
2069
2520
|
} catch (e) {
|
|
2070
2521
|
const msg = String(e?.message || e || "render_failed");
|
|
2071
2522
|
const lower = msg.toLowerCase();
|
|
@@ -2157,14 +2608,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2157
2608
|
/**
|
|
2158
2609
|
* Clean extracted text content
|
|
2159
2610
|
*/
|
|
2160
|
-
cleanContent(text) {
|
|
2161
|
-
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
2162
|
-
}
|
|
2163
|
-
/**
|
|
2164
|
-
* Convert URL to a stable document ID
|
|
2165
|
-
*/
|
|
2166
2611
|
urlToId(url) {
|
|
2167
|
-
return url
|
|
2612
|
+
return urlToDocumentId(url);
|
|
2168
2613
|
}
|
|
2169
2614
|
/**
|
|
2170
2615
|
* Delay helper
|
|
@@ -2434,5 +2879,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2434
2879
|
};
|
|
2435
2880
|
// Annotate the CommonJS export names for ESM import in node:
|
|
2436
2881
|
0 && (module.exports = {
|
|
2437
|
-
WebRAGPlugin
|
|
2882
|
+
WebRAGPlugin,
|
|
2883
|
+
bodyTextLengthHint,
|
|
2884
|
+
extractPageFromHtml,
|
|
2885
|
+
extractProductMetadata,
|
|
2886
|
+
hardExcludePage,
|
|
2887
|
+
inferTypeFromUrl,
|
|
2888
|
+
normalizeAvailability,
|
|
2889
|
+
normalizeCurrency,
|
|
2890
|
+
normalizeDisplayTitle,
|
|
2891
|
+
parsePrice,
|
|
2892
|
+
resolvePageCardMetadata,
|
|
2893
|
+
urlToDocumentId
|
|
2438
2894
|
});
|