@snap-agent/rag-web 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +67 -18
- package/dist/index.d.ts +67 -18
- package/dist/index.js +590 -134
- package/dist/index.mjs +578 -133
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -1,9 +1,548 @@
|
|
|
1
1
|
// src/WebRAGPlugin.ts
|
|
2
2
|
import { MongoClient } from "mongodb";
|
|
3
3
|
import OpenAI from "openai";
|
|
4
|
-
import * as
|
|
4
|
+
import * as cheerio4 from "cheerio";
|
|
5
5
|
import * as fs from "fs";
|
|
6
6
|
import * as path from "path";
|
|
7
|
+
|
|
8
|
+
// src/htmlPageExtract.ts
|
|
9
|
+
import * as cheerio3 from "cheerio";
|
|
10
|
+
|
|
11
|
+
// src/productMetadata.ts
|
|
12
|
+
import * as cheerio from "cheerio";
|
|
13
|
+
/**
 * Collects price, currency and availability for a page.
 *
 * Three extractors are run over the same parsed document (JSON-LD, Open Graph
 * meta tags, microdata). For each field the first extractor, in that order,
 * that produced a non-nullish value wins.
 *
 * @param {string} html - Raw page HTML.
 * @returns {{price?: number, currency?: string, availability?: string}}
 *   Only the fields that were found are present on the result.
 */
function extractProductMetadata(html) {
  const $ = cheerio.load(html);
  // Priority order matters: JSON-LD first, then Open Graph, then microdata.
  const sources = [
    extractFromJsonLd($),
    extractFromOpenGraph($),
    extractFromMicrodata($)
  ];
  const firstDefined = (field) =>
    sources.find((source) => source[field] != null)?.[field];

  const merged = {};

  const price = firstDefined("price");
  if (price != null) merged.price = price;

  const currency = firstDefined("currency");
  if (currency) merged.currency = currency;

  const availability = firstDefined("availability");
  if (availability) merged.availability = availability;

  return merged;
}
|
|
27
|
+
/**
 * Reads price, currency and availability from JSON-LD Product offers.
 *
 * Every `<script type="application/ld+json">` is parsed in document order;
 * scripts that are empty or fail to parse are skipped. Each field is filled by
 * the first product offer that yields a usable value, and scanning stops once
 * all three fields are known.
 *
 * @param {Function} $ - Cheerio root loaded with the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromJsonLd($) {
  const found = {};
  const isComplete = () =>
    found.price != null && Boolean(found.currency) && Boolean(found.availability);

  for (const scriptEl of $('script[type="application/ld+json"]').toArray()) {
    if (isComplete()) break;

    const jsonText = $(scriptEl).html()?.trim();
    if (!jsonText) continue;

    let document;
    try {
      document = JSON.parse(jsonText);
    } catch {
      // Malformed JSON-LD is ignored; later scripts may still be valid.
      continue;
    }

    for (const node of collectJsonLdNodes(document)) {
      const offer = isProductType(node) ? pickOffer(node) : null;
      if (!offer) continue;

      if (found.price == null) {
        // Falls back to the aggregate-offer bounds when no plain price exists.
        const price = parsePrice(offer.price ?? offer.lowPrice ?? offer.highPrice);
        if (price != null) found.price = price;
      }
      if (!found.currency) {
        const currency = normalizeCurrency(offer.priceCurrency);
        if (currency) found.currency = currency;
      }
      if (!found.availability) {
        const availability = normalizeAvailability(offer.availability);
        if (availability) found.availability = availability;
      }
    }
  }

  return found;
}
|
|
59
|
+
/**
 * Reads price, currency and availability from Open Graph style meta tags.
 *
 * For each field the `product:*` property is consulted first and the matching
 * `og:*` property is used when the former has no (non-empty) content.
 *
 * @param {Function} $ - Cheerio root loaded with the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromOpenGraph($) {
  const metaContent = (property) => $(`meta[property="${property}"]`).attr("content");
  const preferProduct = (suffix) =>
    metaContent(`product:${suffix}`) || metaContent(`og:${suffix}`);

  const found = {};

  const price = parsePrice(preferProduct("price:amount"));
  if (price != null) found.price = price;

  const currency = normalizeCurrency(preferProduct("price:currency"));
  if (currency) found.currency = currency;

  const availability = normalizeAvailability(preferProduct("availability"));
  if (availability) found.availability = availability;

  return found;
}
|
|
74
|
+
/**
 * Finds the first element carrying the given `itemprop`.
 *
 * When the page has an element whose `itemtype` mentions schema.org Product,
 * the search is limited to the first such element; otherwise the whole
 * document is searched.
 *
 * @param {Function} $ - Cheerio root loaded with the page.
 * @param {string} itemprop - Microdata property name, e.g. "price".
 * @returns {object} Cheerio selection (possibly empty).
 */
function microdataField($, itemprop) {
  const fieldSelector = `[itemprop="${itemprop}"]`;
  const productScope = $('[itemtype*="schema.org/Product"], [itemtype*="schema.org/product"]').first();
  const matches = productScope.length > 0 ? productScope.find(fieldSelector) : $(fieldSelector);
  return matches.first();
}
|
|
78
|
+
/**
 * Reads price, currency and availability from schema.org microdata markup.
 *
 * Each field prefers the element's attribute value (`content`, and for
 * availability also `href`) and falls back to the element's text.
 *
 * @param {Function} $ - Cheerio root loaded with the page.
 * @returns {{price?: number, currency?: string, availability?: string}}
 */
function extractFromMicrodata($) {
  // First non-empty attribute wins; the element text is the last resort.
  const readField = (itemprop, attributes) => {
    const el = microdataField($, itemprop);
    for (const attribute of attributes) {
      const value = el.attr(attribute);
      if (value) return value;
    }
    return el.text();
  };

  const found = {};

  const price = parsePrice(readField("price", ["content"]));
  if (price != null) found.price = price;

  const currency = normalizeCurrency(readField("priceCurrency", ["content"]));
  if (currency) found.currency = currency;

  const availability = normalizeAvailability(readField("availability", ["content", "href"]));
  if (availability) found.availability = availability;

  return found;
}
|
|
93
|
+
/**
 * Flattens a parsed JSON-LD payload into a list of object nodes.
 *
 * Arrays are expanded in order, every plain object is emitted, and an object's
 * `@graph` member (when truthy) is expanded right after the object itself.
 * Primitives and null are dropped. Other nested properties are not traversed.
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Nodes in depth-first, document order.
 */
function collectJsonLdNodes(data) {
  const expand = (value) => {
    if (value == null || typeof value !== "object") return [];
    if (Array.isArray(value)) return value.flatMap((entry) => expand(entry));
    return value["@graph"] ? [value, ...expand(value["@graph"])] : [value];
  };
  return expand(data);
}
|
|
109
|
+
/**
 * Tells whether a JSON-LD node declares itself as a Product.
 *
 * `@type` may be a single value or an array; the comparison is
 * case-insensitive and accepts both the bare name and a value ending in
 * "/product" (e.g. a full schema.org URL).
 *
 * @param {object} node - JSON-LD node.
 * @returns {boolean}
 */
function isProductType(node) {
  const declared = node["@type"];
  if (declared == null) return false;

  const candidates = Array.isArray(declared) ? declared : [declared];
  for (const candidate of candidates) {
    const lowered = String(candidate).toLowerCase();
    if (lowered === "product" || lowered.endsWith("/product")) return true;
  }
  return false;
}
|
|
117
|
+
/**
 * Selects the offer object to read pricing from.
 *
 * @param {object} product - JSON-LD product node.
 * @returns {object|null} The `offers` object itself, the first object entry
 *   when `offers` is an array, or null when there is nothing usable.
 */
function pickOffer(product) {
  const { offers } = product;
  if (offers == null || typeof offers !== "object") return null;
  if (!Array.isArray(offers)) return offers;

  const firstObject = offers.find((entry) => entry && typeof entry === "object");
  return firstObject ?? null;
}
|
|
127
|
+
/**
 * Parses a price written in common US or European notation.
 *
 * Finite numbers are returned untouched. Strings are stripped of everything
 * except digits, ".", "," and "-", then the decimal separator is inferred:
 *  - both "," and "." present: whichever comes last is the decimal mark;
 *  - only ",": a single comma followed by at most two digits is a decimal
 *    mark, otherwise commas are thousands separators;
 *  - only ".": two or more groups of exactly three digits ("1.234.567") are
 *    thousands separators; anything else is left to `parseFloat`, so a lone
 *    "1.234" keeps being read as a decimal.
 *
 * @param {*} value - Raw price (number, string, or anything stringifiable).
 * @returns {number|undefined} Parsed price, or undefined when none was found.
 */
function parsePrice(value) {
  if (value == null || value === "") return undefined;
  if (typeof value === "number" && Number.isFinite(value)) return value;

  let s = String(value).trim();
  if (!s) return undefined;

  // Drop currency symbols, letters and whitespace.
  s = s.replace(/[^\d.,\-]/g, "");
  if (!s || s === "-" || s === ".") return undefined;

  const lastComma = s.lastIndexOf(",");
  const lastDot = s.lastIndexOf(".");

  if (lastComma > -1 && lastDot > -1) {
    if (lastComma > lastDot) {
      // "1.234,56": dots group thousands, the comma is the decimal mark.
      s = s.replace(/\./g, "").replace(",", ".");
    } else {
      // "1,234.56": commas group thousands.
      s = s.replace(/,/g, "");
    }
  } else if (lastComma > -1) {
    const parts = s.split(",");
    if (parts.length === 2 && parts[1].length <= 2) {
      s = `${parts[0].replace(/\./g, "")}.${parts[1]}`;
    } else {
      s = s.replace(/,/g, "");
    }
  } else if (/^-?\d{1,3}(?:\.\d{3}){2,}$/.test(s)) {
    // "1.234.567": parseFloat would stop at the second dot and yield 1.234.
    s = s.replace(/\./g, "");
  }

  const num = Number.parseFloat(s);
  return Number.isFinite(num) ? num : undefined;
}
|
|
153
|
+
/**
 * Normalizes a currency value to an upper-case code.
 *
 * A standalone three-letter token is preferred ("price in usd" -> "USD").
 * When there is none, the first run of three letters is used, which keeps
 * inputs such as "euro" mapping to "EUR". Inputs without any three-letter run
 * are returned as-is when they are at most four characters long (symbols like
 * "$" or "US$"); longer ones yield undefined.
 *
 * @param {*} value - Raw currency value.
 * @returns {string|undefined}
 */
function normalizeCurrency(value) {
  if (value == null) return undefined;

  const s = String(value).trim().toUpperCase();
  if (!s) return undefined;

  // Whole-token match first, so a longer leading word does not win by accident.
  const iso = s.match(/\b[A-Z]{3}\b/) ?? s.match(/[A-Z]{3}/);
  if (iso) return iso[0];

  return s.length <= 4 ? s : undefined;
}
|
|
160
|
+
/**
 * Reduces an availability value to a compact token such as "InStock".
 *
 * URL-like values (for example "https://schema.org/InStock") are cut down to
 * their last path segment and all whitespace is removed from the result.
 *
 * @param {*} value - Raw availability value.
 * @returns {string|undefined} Token, or undefined when nothing is left.
 */
function normalizeAvailability(value) {
  if (value == null) return undefined;

  let token = String(value).trim();
  if (!token) return undefined;

  // Last "/"-separated piece, or the input itself when that piece is empty.
  const lastSegment = (text) => {
    const pieces = text.split("/");
    return pieces[pieces.length - 1] || text;
  };

  if (token.includes("schema.org/")) token = lastSegment(token);
  token = token.replace(/^https?:\/\/[^/]+\//, "");
  if (token.includes("/")) token = lastSegment(token);

  const compact = token.replace(/\s+/g, "");
  return compact || undefined;
}
|
|
175
|
+
|
|
176
|
+
// src/pageCardMetadata.ts
|
|
177
|
+
import * as cheerio2 from "cheerio";
|
|
178
|
+
// URL segments that mark a page as never card-worthy (auth, checkout, legal, taxonomy, ...).
const HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
// Page titles that mark a page as never card-worthy.
const HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
// Blog / news style sections.
const BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;
// Promotion sections (segment starts with the keyword).
const PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
// Promotion slugs (segment ends with "-sale", "-offer", ...).
const PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
// "<collection>/<slug>" paths that identify a single entity page.
const ENTITY_DETAIL_PATH_RES = [
  /\/projects\/[^/]+/i,
  /\/project\/[^/]+/i,
  /\/perspectives\/[^/]+/i,
  /\/perspective\/[^/]+/i,
  /\/portfolio\/[^/]+/i,
  /\/case-stud(?:y|ies)\/[^/]+/i,
  /\/insights?\/[^/]+/i,
  /\/people\/[^/]+/i,
  /\/person\/[^/]+/i,
  /\/team-members?\/[^/]+/i,
  /\/members?\/[^/]+/i,
  /\/staff\/[^/]+/i,
  /\/experts?\/[^/]+/i,
  /\/authors?\/[^/]+/i,
  /\/leadership\/[^/]+/i,
  /\/biograph(?:y|ies)\/[^/]+/i
];
// Segments that usually denote a single item page.
const DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
// Segments that usually denote an index / listing page.
const LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
// Amenity / activity / experience sections.
const AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
// Contact / about / help sections.
const CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;
// " – Site name" style title suffix (en dash or em dash).
const EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
// " | Site name" style title suffix.
const PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
// Card ranking weight per page type.
const CARD_PRIORITY = {
  detail: 10,
  listing: 6,
  amenity: 5,
  promotion: 2,
  contact: 1,
  content: 1,
  blog: 0,
  system: 0,
  page: 3
};
// Whether a page type may be shown as a card by default.
const CARD_ELIGIBLE_DEFAULT = {
  detail: true,
  listing: true,
  amenity: true,
  promotion: false,
  contact: false,
  content: false,
  blog: false,
  system: false,
  page: false
};
// Lower-cased schema.org type name -> page type.
const SCHEMA_TYPE_MAP = {
  product: "detail",
  service: "amenity",
  hotelroom: "detail",
  room: "detail",
  apartment: "detail",
  lodgingroom: "detail",
  course: "detail",
  event: "detail",
  offer: "promotion",
  person: "detail",
  employee: "detail",
  profilepage: "detail",
  article: "detail",
  newsarticle: "detail",
  blogposting: "detail",
  creativework: "detail"
};
|
|
247
|
+
/**
 * Strips trailing " – Site" / " | Site" style suffixes from a page title.
 *
 * At most two suffixes are removed. A dash suffix is only removed when at
 * least 4 characters precede it, a pipe suffix when at least 8 do, so very
 * short leading parts are kept intact.
 *
 * @param {string|undefined} title - Raw title.
 * @returns {string|undefined} Cleaned title; blank or missing input is
 *   returned unchanged.
 */
function normalizeDisplayTitle(title) {
  if (!title?.trim()) return title;

  const original = title.trim();

  // Returns the text without its suffix, or null when nothing qualifies.
  const stripOneSuffix = (text) => {
    const dashAt = text.match(EN_DASH_SUFFIX_RE)?.index;
    if (dashAt !== undefined && dashAt >= 4) return text.slice(0, dashAt).trim();

    const pipeAt = text.match(PIPE_SUFFIX_RE)?.index;
    if (pipeAt !== undefined && pipeAt >= 8) return text.slice(0, pipeAt).trim();

    return null;
  };

  let current = original;
  for (let pass = 0; pass < 2; pass += 1) {
    const shortened = stripOneSuffix(current);
    if (shortened === null) break;
    current = shortened;
  }

  return current || original;
}
|
|
265
|
+
/**
 * Decides whether a page must never be surfaced as a card.
 *
 * A page is excluded when its path matches the hard-exclude or blog patterns,
 * when its title matches the hard-exclude title pattern, or when it is the
 * site root.
 *
 * The URL patterns are applied to the pathname only. Matching the whole URL
 * let the host leak into the check: "https://account.example.com/rooms"
 * matched "/account." and excluded every page of that site. Strings that do
 * not parse as absolute URLs are matched as given.
 *
 * @param {string} url - Page URL.
 * @param {string} [title] - Page title, if any.
 * @returns {boolean} True when the page is excluded.
 */
function hardExcludePage(url, title) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Relative or malformed URL: fall back to the raw string below.
    pathname = undefined;
  }

  const path2 = (pathname ?? url).toLowerCase();
  if (HARD_EXCLUDE_URL_RE.test(path2)) return true;
  if (BLOG_URL_RE.test(path2)) return true;
  if (title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase())) return true;

  // The home page is never a card.
  return pathname === "/" || pathname === "";
}
|
|
277
|
+
/**
 * Guesses a page type from its URL path.
 *
 * Checks run in a fixed order and the first hit wins: promotion, contact,
 * entity detail path, amenity, detail, listing, blog.
 *
 * The patterns are applied to the pathname only. Matching the whole URL let
 * the host leak into the check: "https://deals.example.com/rooms/1" matched
 * "/deals." and was typed as a promotion. Strings that do not parse as
 * absolute URLs are matched as given.
 *
 * @param {string} url - Page URL.
 * @returns {string|undefined} Inferred type, or undefined when nothing matches.
 */
function inferTypeFromUrl(url) {
  let target = url;
  try {
    target = new URL(url).pathname;
  } catch {
    // Relative or malformed URL: match against the raw string.
  }

  const path2 = target.toLowerCase();
  if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
  if (CONTACT_URL_RE.test(path2)) return "contact";
  if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
  if (AMENITY_URL_RE.test(path2)) return "amenity";
  if (DETAIL_URL_RE.test(path2)) return "detail";
  if (LISTING_URL_RE.test(path2)) return "listing";
  if (BLOG_URL_RE.test(path2)) return "blog";
  return undefined;
}
|
|
288
|
+
/**
 * Flattens a parsed JSON-LD payload into a list of object nodes.
 *
 * Arrays are expanded in order, every plain object is emitted, and an object's
 * `@graph` member (when truthy) is expanded right after the object itself.
 * Primitives and null are dropped. Other nested properties are not traversed.
 *
 * @param {*} data - Result of `JSON.parse` on a JSON-LD script.
 * @returns {object[]} Nodes in depth-first, document order.
 */
function collectJsonLdNodes2(data) {
  const nodes = [];
  // Explicit stack; entries are pushed in reverse so they pop in document order.
  const pending = [data];

  while (pending.length > 0) {
    const current = pending.pop();
    if (current == null || typeof current !== "object") continue;

    if (Array.isArray(current)) {
      for (let i = current.length - 1; i >= 0; i -= 1) {
        pending.push(current[i]);
      }
      continue;
    }

    nodes.push(current);
    if (current["@graph"]) pending.push(current["@graph"]);
  }

  return nodes;
}
|
|
304
|
+
/**
 * Returns the lower-cased short name of a node's first declared `@type`.
 *
 * Only the first entry is considered when `@type` is an array, and anything up
 * to the last "/" is dropped ("https://schema.org/Product" -> "product").
 *
 * @param {object} node - JSON-LD node.
 * @returns {string} Short type name, or "" when no type is declared.
 */
function schemaTypeName(node) {
  const declared = node["@type"];
  const primary = Array.isArray(declared) ? declared[0] : declared;
  if (primary == null) return "";

  const lowered = String(primary).toLowerCase();
  // lastIndexOf yields -1 when there is no slash, making this slice(0).
  return lowered.slice(lowered.lastIndexOf("/") + 1);
}
|
|
313
|
+
/**
 * Guesses a page type from structured data embedded in the HTML.
 *
 * JSON-LD scripts are scanned in document order. The first node whose type
 * name is a key of SCHEMA_TYPE_MAP decides the result; a node that carries
 * `offers` counts as "detail". When JSON-LD gives no answer, an Open Graph
 * `og:type` of "product" also yields "detail".
 *
 * The map lookup is restricted to own properties: `@type` comes from the
 * crawled page, and a value such as "constructor" would otherwise resolve to
 * an inherited member of the map object and be returned as the type.
 *
 * @param {string} html - Raw page HTML.
 * @returns {string|undefined} Inferred type, or undefined.
 */
function inferTypeFromSchema(html) {
  const $ = cheerio2.load(html);

  for (const el of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(el).html()?.trim();
    if (!raw) continue;

    let parsed;
    try {
      parsed = JSON.parse(raw);
    } catch {
      // Malformed JSON-LD is ignored; later scripts may still be valid.
      continue;
    }

    for (const node of collectJsonLdNodes2(parsed)) {
      const name = schemaTypeName(node);
      if (Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, name)) {
        return SCHEMA_TYPE_MAP[name];
      }
      if (node.offers != null) return "detail";
    }
  }

  const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
  if (ogType === "product") return "detail";
  return undefined;
}
|
|
332
|
+
/**
 * Maps a configured page type onto the known set of card types.
 *
 * Known types are returned lower-cased, a few aliases are translated
 * ("room"/"product" -> "detail", "offer"/"sale" -> "promotion"), and any
 * other value is passed through unchanged. Missing input becomes "page".
 *
 * @param {string|undefined} raw - Configured type.
 * @returns {string}
 */
function normalizePageType(raw) {
  if (!raw) return "page";

  const key = raw.toLowerCase();
  switch (key) {
    case "detail":
    case "listing":
    case "amenity":
    case "promotion":
    case "contact":
    case "content":
    case "blog":
    case "system":
    case "page":
      return key;
    case "room":
    case "product":
      return "detail";
    case "offer":
    case "sale":
      return "promotion";
    default:
      // Custom types keep their original casing.
      return raw;
  }
}
|
|
351
|
+
/**
 * Chooses the title shown on a card.
 *
 * The page heading is preferred when it is non-blank; otherwise the document
 * title is used. Either way the value goes through normalizeDisplayTitle.
 *
 * @param {{headingTitle?: string, title?: string}} input
 * @returns {string|undefined}
 */
function resolveDisplayTitle(input) {
  const heading = input.headingTitle?.trim();
  const preferred = heading ? heading : input.title;
  return normalizeDisplayTitle(preferred);
}
|
|
356
|
+
/**
 * Derives card metadata (type, eligibility, priority, display fields) for a page.
 *
 * Hard-excluded pages are returned as type "system" and are never eligible.
 * Otherwise the configured type is normalized and, while it is still the
 * generic "page", refined from structured data, then from the URL, then from
 * the presence of a product price. Pages whose URL matches the promotion
 * pattern are never eligible, whatever their type.
 *
 * Eligibility and priority are looked up as own properties only, so a type
 * such as "constructor" falls back to the defaults (false / 3) instead of
 * resolving to an inherited member of the lookup tables.
 *
 * @param {object} input
 * @param {string} input.url - Page URL.
 * @param {string} [input.title] - Document title.
 * @param {string} [input.headingTitle] - Main heading text.
 * @param {string} [input.description] - Page description.
 * @param {string} [input.imageUrl] - Resolved image URL.
 * @param {string} [input.html] - Raw HTML, used for structured-data inference.
 * @param {string} [input.type] - Configured page type.
 * @param {boolean} [input.hasProductPrice] - Whether a price was extracted.
 * @returns {{type: string, cardEligible: boolean, cardPriority: number,
 *   displayTitle: (string|undefined), displayDescription: (string|undefined),
 *   displayImageUrl: (string|undefined)}}
 */
function resolvePageCardMetadata(input) {
  const title = input.title?.trim();
  const url = input.url;
  const displayTitle = resolveDisplayTitle(input);
  // Normalized once so both return paths agree on blank descriptions.
  const displayDescription = input.description?.trim() || undefined;

  if (hardExcludePage(url, title)) {
    return {
      type: "system",
      cardEligible: false,
      cardPriority: 0,
      displayTitle,
      displayDescription,
      displayImageUrl: input.imageUrl
    };
  }

  let type = normalizePageType(input.type);
  if (type === "page" && input.html) {
    const fromSchema = inferTypeFromSchema(input.html);
    if (fromSchema) type = fromSchema;
  }
  if (type === "page") {
    const fromUrl = inferTypeFromUrl(url);
    if (fromUrl) type = fromUrl;
  }
  if (input.hasProductPrice && type === "page") {
    type = "detail";
  }

  const typeKey = String(type).toLowerCase();
  const ownValue = (table, key) =>
    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;

  let cardEligible = ownValue(CARD_ELIGIBLE_DEFAULT, typeKey) ?? false;
  const cardPriority = ownValue(CARD_PRIORITY, typeKey) ?? 3;
  if (cardEligible && PROMOTION_URL_RE.test(url.toLowerCase())) {
    cardEligible = false;
  }

  return {
    type,
    cardEligible,
    cardPriority,
    displayTitle,
    displayDescription,
    displayImageUrl: input.imageUrl
  };
}
|
|
397
|
+
|
|
398
|
+
// src/htmlPageExtract.ts
|
|
399
|
+
// Comma-separated candidate containers for the main page content; used when
// options.contentSelector is not provided.
const DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
// Elements removed from the DOM before text extraction; used when
// options.removeSelectors is not provided.
const DEFAULT_REMOVE_SELECTORS = [
  "script",
  "style",
  "nav",
  "header",
  "footer",
  ".sidebar",
  ".navigation",
  ".menu",
  ".comments",
  '[role="navigation"]',
  '[role="banner"]'
];
|
|
413
|
+
/**
 * Builds a document id from a URL.
 *
 * The http(s) scheme is dropped, every non-alphanumeric character becomes a
 * dash, dash runs are collapsed, one leading and one trailing dash are
 * removed, and the result is cut to 100 characters.
 *
 * @param {string} url - Page URL.
 * @returns {string} Id of at most 100 characters.
 */
function urlToDocumentId(url) {
  const withoutScheme = url.replace(/^https?:\/\//, "");
  const dashed = withoutScheme.replace(/[^a-zA-Z0-9]/g, "-");
  const collapsed = dashed.replace(/-+/g, "-");
  const trimmed = collapsed.replace(/^-|-$/g, "");
  return trimmed.substring(0, 100);
}
|
|
416
|
+
/**
 * Collapses every run of whitespace (spaces, tabs, line breaks) into a single
 * space and trims the result.
 *
 * The former follow-up replacements for blank lines and tabs could never match
 * once all whitespace had been collapsed, so they were dropped; the output is
 * unchanged.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Single-line, single-spaced text.
 */
function cleanContent(text) {
  return text.replace(/\s+/g, " ").trim();
}
|
|
419
|
+
/**
 * Measures how much text the page body holds once noise elements are removed.
 *
 * @param {string} html - Raw page HTML.
 * @param {object} [options] - Extraction options, forwarded to stripNoiseFromDom.
 * @returns {number} Length of the cleaned body text.
 */
function bodyTextLengthHint(html, options = {}) {
  const dom = cheerio3.load(html);
  stripNoiseFromDom(dom, options);
  const bodyText = dom("body").text().trim();
  return cleanContent(bodyText).length;
}
|
|
424
|
+
/**
 * Turns a crawled HTML page into an indexable document.
 *
 * Noise elements are stripped first, then title, main text, image,
 * description, page type, product metadata and card metadata are derived.
 *
 * @param {string} url - Page URL; also used to resolve relative image URLs.
 * @param {string} html - Raw page HTML.
 * @param {object} [options]
 * @param {string} [options.titleSelector] - Selector whose first match supplies the title.
 * @param {string} [options.contentSelector] - Overrides the default content selectors.
 * @param {string[]} [options.removeSelectors] - Overrides the default noise selectors.
 * @param {number} [options.minExtractedContentLength=50] - Minimum content length for `indexable`.
 * @param {string} [options.defaultType="page"] - Type used when no URL pattern matches.
 * @param {Object<string, string>} [options.typeFromUrl] - URL substring -> type; first match wins.
 * @param {object} [options.metadata] - Extra metadata, spread last so it overrides derived fields.
 * @returns {{id: string, metadata: object, content: string, indexable: boolean, contentPreview: string}}
 */
function extractPageFromHtml(url, html, options = {}) {
  const $ = cheerio3.load(html);
  stripNoiseFromDom($, options);

  const h1Title = $("h1").first().text().trim();
  // Only the first <title>: .text() joins the text of every match, and inline
  // SVGs may carry <title> elements of their own.
  const docTitle = $("title").first().text().trim();
  let title = "";
  if (options.titleSelector) {
    title = $(options.titleSelector).first().text().trim();
  } else {
    title = docTitle || h1Title;
  }
  if (!title) {
    title = h1Title || docTitle;
  }

  const content = extractBestContentText($, options);
  const minChars = options.minExtractedContentLength ?? 50;
  const indexable = Boolean(content && content.length >= minChars);

  // Social meta tags first, then product markup, then the first meaningful
  // image found in the content area.
  const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || extractHeroImage($, url) || undefined;
  let imageUrl;
  if (image) {
    try {
      imageUrl = new URL(image, url).href;
    } catch {
      // Not resolvable against the page URL: keep the raw value.
      imageUrl = image;
    }
  }

  const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || undefined;

  let type = options.defaultType || "page";
  if (options.typeFromUrl) {
    for (const [pattern, typeName] of Object.entries(options.typeFromUrl)) {
      if (url.includes(pattern)) {
        type = typeName;
        break;
      }
    }
  }

  // Product metadata is read from the raw HTML: JSON-LD lives in <script>
  // tags, which the default noise selectors remove from `$`.
  const productMeta = extractProductMetadata(html);
  const cardMeta = resolvePageCardMetadata({
    url,
    title,
    headingTitle: h1Title || undefined,
    description,
    imageUrl,
    html,
    type,
    hasProductPrice: productMeta.price != null
  });

  const metadata = {
    type: cardMeta.type,
    cardEligible: cardMeta.cardEligible,
    cardPriority: cardMeta.cardPriority,
    ...title ? { title } : {},
    ...cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {},
    url,
    ...imageUrl ? { imageUrl } : {},
    ...cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {},
    ...description ? { description } : {},
    ...cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {},
    ...productMeta.price != null ? { price: productMeta.price } : {},
    ...productMeta.currency ? { currency: productMeta.currency } : {},
    ...productMeta.availability ? { availability: productMeta.availability } : {},
    ...options.metadata
  };

  const previewLen = 400;
  const contentPreview = content.length > previewLen ? `${content.slice(0, previewLen)}\u2026` : content;

  return {
    id: urlToDocumentId(url),
    metadata,
    content,
    indexable,
    contentPreview
  };
}
|
|
497
|
+
/**
 * Removes non-content elements from the parsed document, in place.
 *
 * @param {Function} $ - Cheerio root to mutate.
 * @param {{removeSelectors?: string[]}} options - `removeSelectors` replaces
 *   the default selector list when it is neither null nor undefined.
 * @returns {void}
 */
function stripNoiseFromDom($, options) {
  const selectorsToDrop = options.removeSelectors ?? DEFAULT_REMOVE_SELECTORS;
  selectorsToDrop.forEach((selector) => {
    $(selector).remove();
  });
}
|
|
501
|
+
/**
 * Picks the longest cleaned text among the content-selector matches and the
 * whole body.
 *
 * Each comma-separated selector is queried on its own and every match is a
 * candidate; the body text is the final candidate. On equal length the
 * earlier candidate is kept.
 *
 * @param {Function} $ - Cheerio root (normally already stripped of noise).
 * @param {{contentSelector?: string}} options - `contentSelector` replaces the
 *   default selector list when set.
 * @returns {string} Longest candidate text, or "" when everything is empty.
 */
function extractBestContentText($, options) {
  const selectorList = (options.contentSelector || DEFAULT_CONTENT_SELECTOR)
    .split(",")
    .map((part) => part.trim())
    .filter(Boolean);

  const candidates = [];
  for (const selector of selectorList) {
    $(selector).each((_, el) => {
      candidates.push(cleanContent($(el).text().trim()));
    });
  }
  candidates.push(cleanContent($("body").text().trim()));

  // Strictly-greater comparison keeps the first of equally long candidates.
  return candidates.reduce(
    (longest, candidate) => (candidate.length > longest.length ? candidate : longest),
    ""
  );
}
|
|
515
|
+
/**
 * Finds the first meaningful image in the main content area.
 *
 * Images are looked up inside the main/article style containers, or inside
 * the body when no such container exists. An image is skipped when a declared
 * width or height is below 80, when its src or alt looks like a logo, icon or
 * similar asset, or when it is a data URI or an .svg file. For Next.js image
 * proxy URLs the original URL from the `url` query parameter is returned.
 *
 * @param {Function} $ - Cheerio root loaded with the page.
 * @param {string} pageUrl - Page URL, used to resolve proxied image URLs.
 * @returns {string|undefined} Image URL as found (possibly relative), or undefined.
 */
function extractHeroImage($, pageUrl) {
  const containers = $('main, article, [role="main"], #content, .content');
  const searchRoot = containers.length > 0 ? containers : $("body");
  // An undeclared dimension parses to 0 (or NaN) and is never "too small".
  const isTooSmall = (size) => size > 0 && size < 80;

  for (const imgEl of searchRoot.find("img[src]").toArray()) {
    const img = $(imgEl);
    const src = img.attr("src") || "";
    const alt = (img.attr("alt") || "").toLowerCase();
    const width = parseInt(img.attr("width") || "0", 10);
    const height = parseInt(img.attr("height") || "0", 10);

    if (isTooSmall(width) || isTooSmall(height)) continue;
    if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) continue;
    if (src.startsWith("data:") || src.endsWith(".svg")) continue;

    if (src.includes("/_next/image")) {
      try {
        const proxied = new URL(src, pageUrl).searchParams.get("url");
        if (proxied) {
          return proxied.startsWith("http") ? proxied : new URL(proxied, pageUrl).href;
        }
      } catch {
        // Unparseable proxy URL: fall back to the raw src below.
      }
    }

    return src;
  }

  return undefined;
}
|
|
544
|
+
|
|
545
|
+
// src/WebRAGPlugin.ts
|
|
7
546
|
function bulkOpCurrentUrl(op) {
|
|
8
547
|
const meta = op.document?.metadata;
|
|
9
548
|
if (typeof meta?.url === "string" && meta.url.trim()) return meta.url.trim();
|
|
@@ -22,7 +561,7 @@ function isUrlListingInsert(document) {
|
|
|
22
561
|
return false;
|
|
23
562
|
}
|
|
24
563
|
}
|
|
25
|
-
var WebRAGPlugin = class
|
|
564
|
+
var WebRAGPlugin = class {
|
|
26
565
|
name = "web-rag";
|
|
27
566
|
type = "rag";
|
|
28
567
|
priority;
|
|
@@ -245,13 +784,21 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
245
784
|
plugin: this.name,
|
|
246
785
|
contentCount: scoredResults.length,
|
|
247
786
|
types: [...new Set(scoredResults.map((d) => d.metadata.type))],
|
|
248
|
-
topResults: scoredResults.slice(0,
|
|
787
|
+
topResults: scoredResults.slice(0, 16).map((doc) => ({
|
|
249
788
|
id: doc.id,
|
|
250
789
|
type: doc.metadata.type,
|
|
251
790
|
title: doc.metadata.title,
|
|
252
791
|
url: doc.metadata.url,
|
|
253
792
|
imageUrl: doc.metadata.imageUrl,
|
|
254
793
|
description: doc.metadata.description,
|
|
794
|
+
cardEligible: doc.metadata.cardEligible,
|
|
795
|
+
cardPriority: doc.metadata.cardPriority,
|
|
796
|
+
displayTitle: doc.metadata.displayTitle,
|
|
797
|
+
displayDescription: doc.metadata.displayDescription,
|
|
798
|
+
displayImageUrl: doc.metadata.displayImageUrl,
|
|
799
|
+
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
800
|
+
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
801
|
+
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
255
802
|
score: doc.score
|
|
256
803
|
}))
|
|
257
804
|
}
|
|
@@ -1422,7 +1969,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1422
1969
|
return await response.text();
|
|
1423
1970
|
}
|
|
1424
1971
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1425
|
-
const $ =
|
|
1972
|
+
const $ = cheerio4.load(html);
|
|
1426
1973
|
const links = /* @__PURE__ */ new Set();
|
|
1427
1974
|
$("a[href]").each((_, el) => {
|
|
1428
1975
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -1588,7 +2135,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1588
2135
|
}
|
|
1589
2136
|
}
|
|
1590
2137
|
try {
|
|
1591
|
-
const { doc, diag, bodyTextLengthHint } = await this.crawlPageSmart(url, config, timeout, {
|
|
2138
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
1592
2139
|
renderMode,
|
|
1593
2140
|
renderOptions,
|
|
1594
2141
|
minContentLength,
|
|
@@ -1619,7 +2166,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1619
2166
|
status: crawlSt,
|
|
1620
2167
|
modeUsed: diag?.modeUsed,
|
|
1621
2168
|
contentLength: doc?.content?.length,
|
|
1622
|
-
bodyTextLengthHint,
|
|
2169
|
+
bodyTextLengthHint: bodyTextLengthHint2,
|
|
1623
2170
|
title: doc?.metadata?.title,
|
|
1624
2171
|
docId: doc?.id,
|
|
1625
2172
|
error: diag?.errorMessage
|
|
@@ -1731,125 +2278,18 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1731
2278
|
const html = await response.text();
|
|
1732
2279
|
return this.extractDocumentFromHtml(url, html, config);
|
|
1733
2280
|
}
|
|
1734
|
-
/**
|
|
1735
|
-
* Default chain works for many WordPress / Elementor / block themes where `.first()`
|
|
1736
|
-
* would otherwise hit an empty wrapper.
|
|
1737
|
-
*/
|
|
1738
|
-
static DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
1739
|
-
stripNoiseFromDom($, config) {
|
|
1740
|
-
const removeSelectors = config.removeSelectors || [
|
|
1741
|
-
"script",
|
|
1742
|
-
"style",
|
|
1743
|
-
"nav",
|
|
1744
|
-
"header",
|
|
1745
|
-
"footer",
|
|
1746
|
-
".sidebar",
|
|
1747
|
-
".navigation",
|
|
1748
|
-
".menu",
|
|
1749
|
-
".comments",
|
|
1750
|
-
'[role="navigation"]',
|
|
1751
|
-
'[role="banner"]'
|
|
1752
|
-
];
|
|
1753
|
-
removeSelectors.forEach((selector) => $(selector).remove());
|
|
1754
|
-
}
|
|
1755
|
-
/** Longest cleaned text among selector matches and full body (after noise strip). */
|
|
1756
|
-
extractBestContentText($, config) {
|
|
1757
|
-
const contentSelector = config.contentSelector || _WebRAGPlugin.DEFAULT_CONTENT_SELECTOR;
|
|
1758
|
-
const selectors = contentSelector.split(",").map((s) => s.trim()).filter(Boolean);
|
|
1759
|
-
let best = "";
|
|
1760
|
-
for (const sel of selectors) {
|
|
1761
|
-
$(sel).each((_, el) => {
|
|
1762
|
-
const t = this.cleanContent($(el).text().trim());
|
|
1763
|
-
if (t.length > best.length) best = t;
|
|
1764
|
-
});
|
|
1765
|
-
}
|
|
1766
|
-
const bodyText = this.cleanContent($("body").text().trim());
|
|
1767
|
-
if (bodyText.length > best.length) best = bodyText;
|
|
1768
|
-
return best;
|
|
1769
|
-
}
|
|
1770
2281
|
bodyTextLengthHint(html, config) {
|
|
1771
|
-
|
|
1772
|
-
this.stripNoiseFromDom($, config);
|
|
1773
|
-
return this.cleanContent($("body").text().trim()).length;
|
|
2282
|
+
return bodyTextLengthHint(html, config);
|
|
1774
2283
|
}
|
|
1775
2284
|
extractDocumentFromHtml(url, html, config) {
|
|
1776
|
-
const
|
|
1777
|
-
|
|
1778
|
-
const titleSelector = config.titleSelector || "h1, title";
|
|
1779
|
-
let title = $(titleSelector).first().text().trim();
|
|
1780
|
-
if (!title) {
|
|
1781
|
-
title = $("title").text().trim();
|
|
1782
|
-
}
|
|
1783
|
-
const content = this.extractBestContentText($, config);
|
|
1784
|
-
const minChars = config.minExtractedContentLength ?? 50;
|
|
1785
|
-
if (!content || content.length < minChars) return null;
|
|
1786
|
-
const image = $('meta[property="og:image"]').attr("content") || $('meta[name="twitter:image"]').attr("content") || $('meta[property="product:image"]').attr("content") || $('[itemtype*="schema.org/Product"] img, .product img, .product-image img, #product-image img').first().attr("src") || // Fallback: largest/first meaningful image in main content area
|
|
1787
|
-
this.extractHeroImage($, url) || void 0;
|
|
1788
|
-
let imageUrl;
|
|
1789
|
-
if (image) {
|
|
1790
|
-
try {
|
|
1791
|
-
imageUrl = new URL(image, url).href;
|
|
1792
|
-
} catch {
|
|
1793
|
-
imageUrl = image;
|
|
1794
|
-
}
|
|
1795
|
-
}
|
|
1796
|
-
const description = $('meta[property="og:description"]').attr("content") || $('meta[name="description"]').attr("content") || void 0;
|
|
1797
|
-
let type = config.defaultType || "page";
|
|
1798
|
-
if (config.typeFromUrl) {
|
|
1799
|
-
for (const [pattern, typeName] of Object.entries(config.typeFromUrl)) {
|
|
1800
|
-
if (url.includes(pattern)) {
|
|
1801
|
-
type = typeName;
|
|
1802
|
-
break;
|
|
1803
|
-
}
|
|
1804
|
-
}
|
|
1805
|
-
}
|
|
1806
|
-
const id = this.urlToId(url);
|
|
2285
|
+
const extracted = extractPageFromHtml(url, html, config);
|
|
2286
|
+
if (!extracted.indexable) return null;
|
|
1807
2287
|
return {
|
|
1808
|
-
id,
|
|
1809
|
-
content,
|
|
1810
|
-
metadata:
|
|
1811
|
-
type,
|
|
1812
|
-
title,
|
|
1813
|
-
url,
|
|
1814
|
-
...imageUrl ? { imageUrl } : {},
|
|
1815
|
-
...description ? { description } : {},
|
|
1816
|
-
...config.metadata
|
|
1817
|
-
}
|
|
2288
|
+
id: extracted.id,
|
|
2289
|
+
content: extracted.content,
|
|
2290
|
+
metadata: extracted.metadata
|
|
1818
2291
|
};
|
|
1819
2292
|
}
|
|
1820
|
-
/**
|
|
1821
|
-
* Fallback image extraction: finds the first meaningful image in the content area.
|
|
1822
|
-
* Skips icons, avatars, and tiny assets by filtering on common patterns.
|
|
1823
|
-
*/
|
|
1824
|
-
extractHeroImage($, pageUrl) {
|
|
1825
|
-
const containers = $('main, article, [role="main"], #content, .content');
|
|
1826
|
-
const scope = containers.length > 0 ? containers : $("body");
|
|
1827
|
-
let best;
|
|
1828
|
-
scope.find("img[src]").each((_, el) => {
|
|
1829
|
-
if (best) return false;
|
|
1830
|
-
const src = $(el).attr("src") || "";
|
|
1831
|
-
const alt = ($(el).attr("alt") || "").toLowerCase();
|
|
1832
|
-
const width = parseInt($(el).attr("width") || "0", 10);
|
|
1833
|
-
const height = parseInt($(el).attr("height") || "0", 10);
|
|
1834
|
-
if (width > 0 && width < 80 || height > 0 && height < 80) return;
|
|
1835
|
-
if (/logo|icon|avatar|favicon|badge|spinner|loading/i.test(src + " " + alt)) return;
|
|
1836
|
-
if (src.startsWith("data:") || src.endsWith(".svg")) return;
|
|
1837
|
-
if (src.includes("/_next/image")) {
|
|
1838
|
-
try {
|
|
1839
|
-
const nextUrl = new URL(src, pageUrl);
|
|
1840
|
-
const realUrl = nextUrl.searchParams.get("url");
|
|
1841
|
-
if (realUrl) {
|
|
1842
|
-
best = realUrl.startsWith("http") ? realUrl : new URL(realUrl, pageUrl).href;
|
|
1843
|
-
return false;
|
|
1844
|
-
}
|
|
1845
|
-
} catch {
|
|
1846
|
-
}
|
|
1847
|
-
}
|
|
1848
|
-
best = src;
|
|
1849
|
-
return false;
|
|
1850
|
-
});
|
|
1851
|
-
return best;
|
|
1852
|
-
}
|
|
1853
2293
|
looksLikeDynamicShell(html) {
|
|
1854
2294
|
const lower = html.toLowerCase();
|
|
1855
2295
|
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
|
|
@@ -1867,7 +2307,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1867
2307
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
1868
2308
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
1869
2309
|
}
|
|
1870
|
-
diagFromRenderedAttempt(doc,
|
|
2310
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
1871
2311
|
if (blockedSuspected) {
|
|
1872
2312
|
return {
|
|
1873
2313
|
doc: null,
|
|
@@ -1883,12 +2323,12 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1883
2323
|
return {
|
|
1884
2324
|
doc,
|
|
1885
2325
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
1886
|
-
bodyTextLengthHint: doc ? void 0 :
|
|
2326
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
1887
2327
|
};
|
|
1888
2328
|
}
|
|
1889
2329
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
1890
2330
|
if (ctx.renderMode === true) {
|
|
1891
|
-
const { doc, bodyTextLengthHint, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2331
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
1892
2332
|
url,
|
|
1893
2333
|
config,
|
|
1894
2334
|
timeout,
|
|
@@ -1897,7 +2337,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
1897
2337
|
);
|
|
1898
2338
|
return this.diagFromRenderedAttempt(
|
|
1899
2339
|
doc,
|
|
1900
|
-
|
|
2340
|
+
bodyTextLengthHint2,
|
|
1901
2341
|
renderFailure,
|
|
1902
2342
|
blockedSuspected,
|
|
1903
2343
|
"render_ok",
|
|
@@ -2014,7 +2454,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2014
2454
|
}
|
|
2015
2455
|
}
|
|
2016
2456
|
const html = await page.content();
|
|
2017
|
-
const
|
|
2457
|
+
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
2018
2458
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2019
2459
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
2020
2460
|
try {
|
|
@@ -2029,7 +2469,7 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2029
2469
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
2030
2470
|
}
|
|
2031
2471
|
}
|
|
2032
|
-
return { doc, bodyTextLengthHint };
|
|
2472
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
2033
2473
|
} catch (e) {
|
|
2034
2474
|
const msg = String(e?.message || e || "render_failed");
|
|
2035
2475
|
const lower = msg.toLowerCase();
|
|
@@ -2121,14 +2561,8 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2121
2561
|
/**
|
|
2122
2562
|
* Clean extracted text content
|
|
2123
2563
|
*/
|
|
2124
|
-
cleanContent(text) {
|
|
2125
|
-
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
2126
|
-
}
|
|
2127
|
-
/**
|
|
2128
|
-
* Convert URL to a stable document ID
|
|
2129
|
-
*/
|
|
2130
2564
|
urlToId(url) {
|
|
2131
|
-
return url
|
|
2565
|
+
return urlToDocumentId(url);
|
|
2132
2566
|
}
|
|
2133
2567
|
/**
|
|
2134
2568
|
* Delay helper
|
|
@@ -2397,5 +2831,16 @@ var WebRAGPlugin = class _WebRAGPlugin {
|
|
|
2397
2831
|
}
|
|
2398
2832
|
};
|
|
2399
2833
|
export {
|
|
2400
|
-
WebRAGPlugin
|
|
2834
|
+
WebRAGPlugin,
|
|
2835
|
+
bodyTextLengthHint,
|
|
2836
|
+
extractPageFromHtml,
|
|
2837
|
+
extractProductMetadata,
|
|
2838
|
+
hardExcludePage,
|
|
2839
|
+
inferTypeFromUrl,
|
|
2840
|
+
normalizeAvailability,
|
|
2841
|
+
normalizeCurrency,
|
|
2842
|
+
normalizeDisplayTitle,
|
|
2843
|
+
parsePrice,
|
|
2844
|
+
resolvePageCardMetadata,
|
|
2845
|
+
urlToDocumentId
|
|
2401
2846
|
};
|