@snap-agent/rag-web 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +28 -1
- package/dist/index.d.ts +28 -1
- package/dist/index.js +266 -10
- package/dist/index.mjs +262 -10
- package/package.json +1 -1
package/dist/index.d.mts
CHANGED
|
@@ -805,4 +805,31 @@ declare function parsePrice(value: unknown): number | undefined;
|
|
|
805
805
|
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
806
|
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
807
|
|
|
808
|
-
|
|
808
|
+
/** Abstract page roles — vertical-agnostic. */
type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
/** Everything {@link resolvePageCardMetadata} needs to classify one page. */
interface PageCardMetadataInput {
    /** Page URL; used for the hard-exclusion check and URL-based type inference. */
    url: string;
    /** Page title; checked against the exclusion title patterns and used as the displayTitle fallback. */
    title?: string;
    /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
    headingTitle?: string;
    /** Page description; surfaced as displayDescription. */
    description?: string;
    /** Page image URL; surfaced unchanged as displayImageUrl. */
    imageUrl?: string;
    /** Raw page HTML; only consulted (JSON-LD / og:type) while the type is still the generic 'page'. */
    html?: string;
    /** Type already resolved from typeFromUrl / defaultType. */
    type?: string;
    /** When true, a page that is still typed 'page' after inference is promoted to 'detail'. */
    hasProductPrice?: boolean;
}
/** Classification and display fields produced by {@link resolvePageCardMetadata}. */
interface PageCardMetadataResult {
    /**
     * Resolved page role. Unrecognised caller-supplied types are passed through,
     * hence the open string; `string & {}` keeps the known literals visible to
     * tooling instead of collapsing the union to plain `string`.
     */
    type: PageCardType | (string & {});
    /** Whether the page may be offered as a card. */
    cardEligible: boolean;
    /** Numeric priority associated with the resolved type. */
    cardPriority: number;
    displayTitle?: string;
    displayDescription?: string;
    displayImageUrl?: string;
}
/**
 * Trim a title and strip trailing " – suffix" / " | suffix" segments (at most two passes).
 * Blank or missing input is returned unchanged.
 */
declare function normalizeDisplayTitle(title?: string): string | undefined;
/** True when the URL or title matches the built-in exclusion patterns, or the URL is the site root. */
declare function hardExcludePage(url: string, title?: string): boolean;
/** Infer a page role from URL patterns alone; `undefined` when nothing matches. */
declare function inferTypeFromUrl(url: string): PageCardType | undefined;
/** Resolve type, card eligibility, card priority and display fields for one page. */
declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
|
|
834
|
+
|
|
835
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
|
package/dist/index.d.ts
CHANGED
|
@@ -805,4 +805,31 @@ declare function parsePrice(value: unknown): number | undefined;
|
|
|
805
805
|
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
806
|
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
807
|
|
|
808
|
-
|
|
808
|
+
/** Abstract page roles — vertical-agnostic. */
type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
/** Everything {@link resolvePageCardMetadata} needs to classify one page. */
interface PageCardMetadataInput {
    /** Page URL; used for the hard-exclusion check and URL-based type inference. */
    url: string;
    /** Page title; checked against the exclusion title patterns and used as the displayTitle fallback. */
    title?: string;
    /** Primary heading (h1) — preferred for displayTitle over the document title tag. */
    headingTitle?: string;
    /** Page description; surfaced as displayDescription. */
    description?: string;
    /** Page image URL; surfaced unchanged as displayImageUrl. */
    imageUrl?: string;
    /** Raw page HTML; only consulted (JSON-LD / og:type) while the type is still the generic 'page'. */
    html?: string;
    /** Type already resolved from typeFromUrl / defaultType. */
    type?: string;
    /** When true, a page that is still typed 'page' after inference is promoted to 'detail'. */
    hasProductPrice?: boolean;
}
/** Classification and display fields produced by {@link resolvePageCardMetadata}. */
interface PageCardMetadataResult {
    /**
     * Resolved page role. Unrecognised caller-supplied types are passed through,
     * hence the open string; `string & {}` keeps the known literals visible to
     * tooling instead of collapsing the union to plain `string`.
     */
    type: PageCardType | (string & {});
    /** Whether the page may be offered as a card. */
    cardEligible: boolean;
    /** Numeric priority associated with the resolved type. */
    cardPriority: number;
    displayTitle?: string;
    displayDescription?: string;
    displayImageUrl?: string;
}
/**
 * Trim a title and strip trailing " – suffix" / " | suffix" segments (at most two passes).
 * Blank or missing input is returned unchanged.
 */
declare function normalizeDisplayTitle(title?: string): string | undefined;
/** True when the URL or title matches the built-in exclusion patterns, or the URL is the site root. */
declare function hardExcludePage(url: string, title?: string): boolean;
/** Infer a page role from URL patterns alone; `undefined` when nothing matches. */
declare function inferTypeFromUrl(url: string): PageCardType | undefined;
/** Resolve type, card eligibility, card priority and display fields for one page. */
declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
|
|
834
|
+
|
|
835
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
|
package/dist/index.js
CHANGED
|
@@ -34,9 +34,13 @@ __export(index_exports, {
|
|
|
34
34
|
bodyTextLengthHint: () => bodyTextLengthHint,
|
|
35
35
|
extractPageFromHtml: () => extractPageFromHtml,
|
|
36
36
|
extractProductMetadata: () => extractProductMetadata,
|
|
37
|
+
hardExcludePage: () => hardExcludePage,
|
|
38
|
+
inferTypeFromUrl: () => inferTypeFromUrl,
|
|
37
39
|
normalizeAvailability: () => normalizeAvailability,
|
|
38
40
|
normalizeCurrency: () => normalizeCurrency,
|
|
41
|
+
normalizeDisplayTitle: () => normalizeDisplayTitle,
|
|
39
42
|
parsePrice: () => parsePrice,
|
|
43
|
+
resolvePageCardMetadata: () => resolvePageCardMetadata,
|
|
40
44
|
urlToDocumentId: () => urlToDocumentId
|
|
41
45
|
});
|
|
42
46
|
module.exports = __toCommonJS(index_exports);
|
|
@@ -44,12 +48,12 @@ module.exports = __toCommonJS(index_exports);
|
|
|
44
48
|
// src/WebRAGPlugin.ts
|
|
45
49
|
var import_mongodb = require("mongodb");
|
|
46
50
|
var import_openai = __toESM(require("openai"));
|
|
47
|
-
var
|
|
51
|
+
var cheerio4 = __toESM(require("cheerio"));
|
|
48
52
|
var fs = __toESM(require("fs"));
|
|
49
53
|
var path = __toESM(require("path"));
|
|
50
54
|
|
|
51
55
|
// src/htmlPageExtract.ts
|
|
52
|
-
var
|
|
56
|
+
var cheerio3 = __toESM(require("cheerio"));
|
|
53
57
|
|
|
54
58
|
// src/productMetadata.ts
|
|
55
59
|
var cheerio = __toESM(require("cheerio"));
|
|
@@ -216,6 +220,228 @@ function normalizeAvailability(value) {
|
|
|
216
220
|
return s.replace(/\s+/g, "") || void 0;
|
|
217
221
|
}
|
|
218
222
|
|
|
223
|
+
// src/pageCardMetadata.ts
|
|
224
|
+
var cheerio2 = __toESM(require("cheerio"));
|
|
225
|
+
// Path segments that mark a page as never card-worthy (auth, checkout, legal, taxonomy, ...).
var HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
// Title phrases that mark a page as never card-worthy.
var HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
// Blog / news style path segments.
var BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;
// Promotion path segments, and slugs that merely end in a promotion word.
var PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
var PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
// "<collection>/<slug>" shapes that identify a single entity page.
var ENTITY_DETAIL_PATH_RES = [
  /\/projects\/[^/]+/i,
  /\/project\/[^/]+/i,
  /\/perspectives\/[^/]+/i,
  /\/perspective\/[^/]+/i,
  /\/portfolio\/[^/]+/i,
  /\/case-stud(?:y|ies)\/[^/]+/i,
  /\/insights?\/[^/]+/i,
  /\/people\/[^/]+/i,
  /\/person\/[^/]+/i,
  /\/team-members?\/[^/]+/i,
  /\/members?\/[^/]+/i,
  /\/staff\/[^/]+/i,
  /\/experts?\/[^/]+/i,
  /\/authors?\/[^/]+/i,
  /\/leadership\/[^/]+/i,
  /\/biograph(?:y|ies)\/[^/]+/i
];
// Single path segments mapped to the detail / listing / amenity / contact roles.
var DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
var LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
var AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
var CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;
// Trailing " – suffix" / " | suffix" segments stripped from display titles.
var EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
var PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
// The three lookup tables below are indexed with strings that originate outside this
// module (caller-supplied types, JSON-LD "@type" values). They are built on a
// null-prototype object so keys such as "constructor" or "toString" resolve to
// undefined instead of an inherited Object.prototype member.
// Card priority per page role.
var CARD_PRIORITY = Object.assign(/* @__PURE__ */ Object.create(null), {
  detail: 10,
  listing: 6,
  amenity: 5,
  promotion: 2,
  contact: 1,
  content: 1,
  blog: 0,
  system: 0,
  page: 3
});
// Default card eligibility per page role.
var CARD_ELIGIBLE_DEFAULT = Object.assign(/* @__PURE__ */ Object.create(null), {
  detail: true,
  listing: true,
  amenity: true,
  promotion: false,
  contact: false,
  content: false,
  blog: false,
  system: false,
  page: false
});
// Lower-cased schema.org type name -> page role.
var SCHEMA_TYPE_MAP = Object.assign(/* @__PURE__ */ Object.create(null), {
  product: "detail",
  service: "amenity",
  hotelroom: "detail",
  room: "detail",
  apartment: "detail",
  lodgingroom: "detail",
  course: "detail",
  event: "detail",
  offer: "promotion",
  person: "detail",
  employee: "detail",
  profilepage: "detail",
  article: "detail",
  newsarticle: "detail",
  blogposting: "detail",
  creativework: "detail"
});
|
|
294
|
+
/**
 * Clean a page title for display by dropping trailing " – suffix" / " | suffix"
 * segments (typically a site name). At most two segments are removed.
 *
 * A dash suffix is only cut when at least 4 characters precede it, a pipe suffix
 * when at least 8 do, so very short leading fragments are never left on their own.
 *
 * @param title Raw title; may be undefined or blank.
 * @returns The cleaned, trimmed title. Blank or missing input is returned unchanged.
 */
function normalizeDisplayTitle(title) {
  const trimmed = title?.trim();
  if (!trimmed) return title;
  // Index at which `pattern` starts in `text`, or -1 when absent or too close to the start.
  const cutIndex = (text, pattern, minLead) => {
    const hit = pattern.exec(text);
    return hit !== null && hit.index >= minLead ? hit.index : -1;
  };
  let result = trimmed;
  let passes = 0;
  while (passes < 2) {
    let cut = cutIndex(result, EN_DASH_SUFFIX_RE, 4);
    if (cut < 0) cut = cutIndex(result, PIPE_SUFFIX_RE, 8);
    if (cut < 0) break;
    result = result.slice(0, cut).trim();
    passes += 1;
  }
  return result || trimmed;
}
|
|
312
|
+
/**
 * Decide whether a page must never be surfaced as a card.
 *
 * A page is excluded when its path matches the hard-exclusion or blog patterns,
 * when its title matches the exclusion title pattern, or when an absolute URL
 * points at the site root.
 *
 * The URL patterns are applied to the path only. Matching the whole URL let the
 * host and query string take part, so e.g. `https://account.example.com/rooms/suite`
 * or `/rooms?next=/login` were excluded by accident.
 *
 * @param url   Absolute URL, or a relative path.
 * @param title Optional page title.
 * @returns true when the page is hard-excluded.
 */
function hardExcludePage(url, title) {
  let pathname;
  let isSiteRoot = false;
  try {
    pathname = new URL(url).pathname;
    isSiteRoot = pathname === "/" || pathname === "";
  } catch {
    // Not an absolute URL: treat the input as a path and drop any query/fragment.
    // The site-root rule only applies to URLs that parse, so it is skipped here.
    pathname = url.split(/[?#]/, 1)[0];
  }
  const path2 = pathname.toLowerCase();
  if (HARD_EXCLUDE_URL_RE.test(path2)) return true;
  if (BLOG_URL_RE.test(path2)) return true;
  if (title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase())) return true;
  return isSiteRoot;
}
|
|
324
|
+
/**
 * Infer a page role from URL patterns alone.
 *
 * Patterns are tried in a fixed order and the first match wins: promotion,
 * contact, entity detail ("<collection>/<slug>"), amenity, detail, listing, blog.
 *
 * Only the path is matched. Matching the whole URL let the host and query string
 * take part, so e.g. `https://deals.example.com/rooms/suite` was typed "promotion".
 *
 * @param url Absolute URL, or a relative path.
 * @returns The inferred role, or undefined when no pattern matches.
 */
function inferTypeFromUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL: treat the input as a path and drop any query/fragment.
    pathname = url.split(/[?#]/, 1)[0];
  }
  const path2 = pathname.toLowerCase();
  if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
  if (CONTACT_URL_RE.test(path2)) return "contact";
  if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
  if (AMENITY_URL_RE.test(path2)) return "amenity";
  if (DETAIL_URL_RE.test(path2)) return "detail";
  if (LISTING_URL_RE.test(path2)) return "listing";
  if (BLOG_URL_RE.test(path2)) return "blog";
  return void 0;
}
|
|
335
|
+
/**
 * Flatten a parsed JSON-LD payload into a list of node objects.
 *
 * Arrays are walked element by element; every object is emitted, immediately
 * followed by whatever its "@graph" member contains. Primitives and null are
 * ignored. No other members are descended into.
 *
 * @param data Parsed JSON value (object, array, primitive or null).
 * @returns Node objects in depth-first, document order.
 */
function collectJsonLdNodes2(data) {
  const walk = (value) => {
    if (value === null || value === void 0) return [];
    if (Array.isArray(value)) return value.flatMap((item) => walk(item));
    if (typeof value !== "object") return [];
    const graph = value["@graph"];
    return graph ? [value, ...walk(graph)] : [value];
  };
  return walk(data);
}
|
|
351
|
+
/**
 * Return the lower-cased local name of a JSON-LD node's "@type".
 *
 * When "@type" is an array only its first entry is considered. Any namespace is
 * stripped, whether written as a full IRI ("https://schema.org/Product"), a
 * fragment IRI ("…/vocab#Product") or a compact IRI ("schema:Product"); all three
 * yield "product". Previously only "/" was treated as a separator, so compact and
 * fragment forms never matched the type table.
 *
 * @param node A JSON-LD node object.
 * @returns The local type name in lower case, or "" when the node has no type.
 */
function schemaTypeName(node) {
  const declared = node["@type"];
  const first = Array.isArray(declared) ? declared[0] : declared;
  if (first == null) return "";
  const lowered = String(first).toLowerCase();
  // The local name starts after the last namespace separator, whichever kind it is.
  const cut = Math.max(
    lowered.lastIndexOf("/"),
    lowered.lastIndexOf("#"),
    lowered.lastIndexOf(":")
  );
  return cut >= 0 ? lowered.slice(cut + 1) : lowered;
}
|
|
360
|
+
/**
 * Infer a page role from structured data embedded in the HTML.
 *
 * Every `application/ld+json` script is parsed in document order. The first node
 * whose type is listed in SCHEMA_TYPE_MAP decides the role; a node of any other
 * type that carries an `offers` member counts as "detail". If JSON-LD yields
 * nothing, an Open Graph `og:type` of "product" also counts as "detail".
 *
 * @param html Raw page HTML.
 * @returns The inferred role, or undefined when the markup gives no hint.
 */
function inferTypeFromSchema(html) {
  const $ = cheerio2.load(html);
  for (const el of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(el).html()?.trim();
    if (!raw) continue;
    let parsed;
    try {
      parsed = JSON.parse(raw);
    } catch {
      // Malformed JSON-LD is common in the wild; skip this script and try the next.
      continue;
    }
    for (const node of collectJsonLdNodes2(parsed)) {
      const name = schemaTypeName(node);
      // Own-property check: the type name comes from the page, and a plain lookup
      // would return inherited members for names such as "constructor".
      if (Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, name)) {
        return SCHEMA_TYPE_MAP[name];
      }
      if (node.offers != null) return "detail";
    }
  }
  const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
  if (ogType === "product") return "detail";
  return void 0;
}
|
|
379
|
+
/**
 * Map a caller-supplied page type onto the abstract page roles.
 *
 * Matching is case-insensitive. Known roles come back in lower case, the legacy
 * names "room"/"product" become "detail" and "offer"/"sale" become "promotion".
 * Anything else is returned exactly as given; a missing or empty type is "page".
 *
 * @param raw Type string, possibly undefined.
 * @returns The normalised role, or the original string when unrecognised.
 */
function normalizePageType(raw) {
  if (!raw) return "page";
  const key = raw.toLowerCase();
  switch (key) {
    case "detail":
    case "listing":
    case "amenity":
    case "promotion":
    case "contact":
    case "content":
    case "blog":
    case "system":
    case "page":
      return key;
    case "room":
    case "product":
      return "detail";
    case "offer":
    case "sale":
      return "promotion";
    default:
      return raw;
  }
}
|
|
398
|
+
/**
 * Pick the title to display for a page: the trimmed h1 heading when it is
 * non-empty, otherwise the document title, in both cases passed through
 * normalizeDisplayTitle.
 *
 * @param input Object carrying optional `headingTitle` and `title` strings.
 * @returns The normalised display title.
 */
function resolveDisplayTitle(input) {
  const preferred = input.headingTitle?.trim() || input.title;
  return normalizeDisplayTitle(preferred);
}
|
|
403
|
+
/**
 * Resolve the card metadata (role, eligibility, priority, display fields) for a page.
 *
 * Hard-excluded pages are reported as type "system", never eligible, priority 0.
 * Otherwise the caller-supplied type is normalised and, while it is still the
 * generic "page", refined from structured data in the HTML, then from the URL,
 * and finally promoted to "detail" when the page carries a product price.
 * Eligibility and priority come from the per-role tables; an eligible page whose
 * path matches the promotion pattern is made ineligible.
 *
 * Changes from the previous revision:
 * - the description is trimmed (blank becomes undefined) on the hard-exclude path
 *   too, matching the normal path;
 * - table lookups are own-property checks, so a type such as "constructor" falls
 *   back to the defaults instead of picking up an inherited Object member;
 * - the promotion override matches the path only, not the host or query string.
 *
 * @param input Page facts: url, title, headingTitle, description, imageUrl, html,
 *              type, hasProductPrice.
 * @returns type, cardEligible, cardPriority, displayTitle, displayDescription,
 *          displayImageUrl.
 */
function resolvePageCardMetadata(input) {
  const title = input.title?.trim();
  const url = input.url;
  const displayTitle = resolveDisplayTitle(input);
  const displayDescription = input.description?.trim() || void 0;
  if (hardExcludePage(url, title)) {
    return {
      type: "system",
      cardEligible: false,
      cardPriority: 0,
      displayTitle,
      displayDescription,
      displayImageUrl: input.imageUrl
    };
  }
  let type = normalizePageType(input.type);
  if (type === "page" && input.html) {
    const fromSchema = inferTypeFromSchema(input.html);
    if (fromSchema) type = fromSchema;
  }
  if (type === "page") {
    const fromUrl = inferTypeFromUrl(url);
    if (fromUrl) type = fromUrl;
  }
  if (input.hasProductPrice && type === "page") {
    type = "detail";
  }
  const typeKey = String(type).toLowerCase();
  const listed = (table) => Object.prototype.hasOwnProperty.call(table, typeKey);
  let cardEligible = listed(CARD_ELIGIBLE_DEFAULT) ? CARD_ELIGIBLE_DEFAULT[typeKey] : false;
  const cardPriority = listed(CARD_PRIORITY) ? CARD_PRIORITY[typeKey] : 3;
  if (cardEligible) {
    let pathname;
    try {
      pathname = new URL(url).pathname;
    } catch {
      // Not an absolute URL: treat the input as a path and drop any query/fragment.
      pathname = url.split(/[?#]/, 1)[0];
    }
    if (PROMOTION_URL_RE.test(pathname.toLowerCase())) {
      cardEligible = false;
    }
  }
  return {
    type,
    cardEligible,
    cardPriority,
    displayTitle,
    displayDescription,
    displayImageUrl: input.imageUrl
  };
}
|
|
444
|
+
|
|
219
445
|
// src/htmlPageExtract.ts
|
|
220
446
|
var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
221
447
|
var DEFAULT_REMOVE_SELECTORS = [
|
|
@@ -238,17 +464,23 @@ function cleanContent(text) {
|
|
|
238
464
|
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
239
465
|
}
|
|
240
466
|
function bodyTextLengthHint(html, options = {}) {
|
|
241
|
-
const $ =
|
|
467
|
+
const $ = cheerio3.load(html);
|
|
242
468
|
stripNoiseFromDom($, options);
|
|
243
469
|
return cleanContent($("body").text().trim()).length;
|
|
244
470
|
}
|
|
245
471
|
function extractPageFromHtml(url, html, options = {}) {
|
|
246
|
-
const $ =
|
|
472
|
+
const $ = cheerio3.load(html);
|
|
247
473
|
stripNoiseFromDom($, options);
|
|
248
|
-
const
|
|
249
|
-
|
|
474
|
+
const h1Title = $("h1").first().text().trim();
|
|
475
|
+
const docTitle = $("title").text().trim();
|
|
476
|
+
let title = "";
|
|
477
|
+
if (options.titleSelector) {
|
|
478
|
+
title = $(options.titleSelector).first().text().trim();
|
|
479
|
+
} else {
|
|
480
|
+
title = docTitle || h1Title;
|
|
481
|
+
}
|
|
250
482
|
if (!title) {
|
|
251
|
-
title =
|
|
483
|
+
title = h1Title || docTitle;
|
|
252
484
|
}
|
|
253
485
|
const content = extractBestContentText($, options);
|
|
254
486
|
const minChars = options.minExtractedContentLength ?? 50;
|
|
@@ -273,12 +505,27 @@ function extractPageFromHtml(url, html, options = {}) {
|
|
|
273
505
|
}
|
|
274
506
|
}
|
|
275
507
|
const productMeta = extractProductMetadata(html);
|
|
276
|
-
const
|
|
508
|
+
const cardMeta = resolvePageCardMetadata({
|
|
509
|
+
url,
|
|
510
|
+
title,
|
|
511
|
+
headingTitle: h1Title || void 0,
|
|
512
|
+
description,
|
|
513
|
+
imageUrl,
|
|
514
|
+
html,
|
|
277
515
|
type,
|
|
516
|
+
hasProductPrice: productMeta.price != null
|
|
517
|
+
});
|
|
518
|
+
const metadata = {
|
|
519
|
+
type: cardMeta.type,
|
|
520
|
+
cardEligible: cardMeta.cardEligible,
|
|
521
|
+
cardPriority: cardMeta.cardPriority,
|
|
278
522
|
...title ? { title } : {},
|
|
523
|
+
...cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {},
|
|
279
524
|
url,
|
|
280
525
|
...imageUrl ? { imageUrl } : {},
|
|
526
|
+
...cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {},
|
|
281
527
|
...description ? { description } : {},
|
|
528
|
+
...cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {},
|
|
282
529
|
...productMeta.price != null ? { price: productMeta.price } : {},
|
|
283
530
|
...productMeta.currency ? { currency: productMeta.currency } : {},
|
|
284
531
|
...productMeta.availability ? { availability: productMeta.availability } : {},
|
|
@@ -584,13 +831,18 @@ var WebRAGPlugin = class {
|
|
|
584
831
|
plugin: this.name,
|
|
585
832
|
contentCount: scoredResults.length,
|
|
586
833
|
types: [...new Set(scoredResults.map((d) => d.metadata.type))],
|
|
587
|
-
topResults: scoredResults.slice(0,
|
|
834
|
+
topResults: scoredResults.slice(0, 16).map((doc) => ({
|
|
588
835
|
id: doc.id,
|
|
589
836
|
type: doc.metadata.type,
|
|
590
837
|
title: doc.metadata.title,
|
|
591
838
|
url: doc.metadata.url,
|
|
592
839
|
imageUrl: doc.metadata.imageUrl,
|
|
593
840
|
description: doc.metadata.description,
|
|
841
|
+
cardEligible: doc.metadata.cardEligible,
|
|
842
|
+
cardPriority: doc.metadata.cardPriority,
|
|
843
|
+
displayTitle: doc.metadata.displayTitle,
|
|
844
|
+
displayDescription: doc.metadata.displayDescription,
|
|
845
|
+
displayImageUrl: doc.metadata.displayImageUrl,
|
|
594
846
|
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
595
847
|
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
596
848
|
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
@@ -1764,7 +2016,7 @@ var WebRAGPlugin = class {
|
|
|
1764
2016
|
return await response.text();
|
|
1765
2017
|
}
|
|
1766
2018
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1767
|
-
const $ =
|
|
2019
|
+
const $ = cheerio4.load(html);
|
|
1768
2020
|
const links = /* @__PURE__ */ new Set();
|
|
1769
2021
|
$("a[href]").each((_, el) => {
|
|
1770
2022
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -2631,8 +2883,12 @@ var WebRAGPlugin = class {
|
|
|
2631
2883
|
bodyTextLengthHint,
|
|
2632
2884
|
extractPageFromHtml,
|
|
2633
2885
|
extractProductMetadata,
|
|
2886
|
+
hardExcludePage,
|
|
2887
|
+
inferTypeFromUrl,
|
|
2634
2888
|
normalizeAvailability,
|
|
2635
2889
|
normalizeCurrency,
|
|
2890
|
+
normalizeDisplayTitle,
|
|
2636
2891
|
parsePrice,
|
|
2892
|
+
resolvePageCardMetadata,
|
|
2637
2893
|
urlToDocumentId
|
|
2638
2894
|
});
|
package/dist/index.mjs
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
// src/WebRAGPlugin.ts
|
|
2
2
|
import { MongoClient } from "mongodb";
|
|
3
3
|
import OpenAI from "openai";
|
|
4
|
-
import * as
|
|
4
|
+
import * as cheerio4 from "cheerio";
|
|
5
5
|
import * as fs from "fs";
|
|
6
6
|
import * as path from "path";
|
|
7
7
|
|
|
8
8
|
// src/htmlPageExtract.ts
|
|
9
|
-
import * as
|
|
9
|
+
import * as cheerio3 from "cheerio";
|
|
10
10
|
|
|
11
11
|
// src/productMetadata.ts
|
|
12
12
|
import * as cheerio from "cheerio";
|
|
@@ -173,6 +173,228 @@ function normalizeAvailability(value) {
|
|
|
173
173
|
return s.replace(/\s+/g, "") || void 0;
|
|
174
174
|
}
|
|
175
175
|
|
|
176
|
+
// src/pageCardMetadata.ts
|
|
177
|
+
import * as cheerio2 from "cheerio";
|
|
178
|
+
// Path segments that mark a page as never card-worthy (auth, checkout, legal, taxonomy, ...).
var HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
// Title phrases that mark a page as never card-worthy.
var HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
// Blog / news style path segments.
var BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;
// Promotion path segments, and slugs that merely end in a promotion word.
var PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
var PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
// "<collection>/<slug>" shapes that identify a single entity page.
var ENTITY_DETAIL_PATH_RES = [
  /\/projects\/[^/]+/i,
  /\/project\/[^/]+/i,
  /\/perspectives\/[^/]+/i,
  /\/perspective\/[^/]+/i,
  /\/portfolio\/[^/]+/i,
  /\/case-stud(?:y|ies)\/[^/]+/i,
  /\/insights?\/[^/]+/i,
  /\/people\/[^/]+/i,
  /\/person\/[^/]+/i,
  /\/team-members?\/[^/]+/i,
  /\/members?\/[^/]+/i,
  /\/staff\/[^/]+/i,
  /\/experts?\/[^/]+/i,
  /\/authors?\/[^/]+/i,
  /\/leadership\/[^/]+/i,
  /\/biograph(?:y|ies)\/[^/]+/i
];
// Single path segments mapped to the detail / listing / amenity / contact roles.
var DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
var LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
var AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
var CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;
// Trailing " – suffix" / " | suffix" segments stripped from display titles.
var EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
var PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
// The three lookup tables below are indexed with strings that originate outside this
// module (caller-supplied types, JSON-LD "@type" values). They are built on a
// null-prototype object so keys such as "constructor" or "toString" resolve to
// undefined instead of an inherited Object.prototype member.
// Card priority per page role.
var CARD_PRIORITY = Object.assign(/* @__PURE__ */ Object.create(null), {
  detail: 10,
  listing: 6,
  amenity: 5,
  promotion: 2,
  contact: 1,
  content: 1,
  blog: 0,
  system: 0,
  page: 3
});
// Default card eligibility per page role.
var CARD_ELIGIBLE_DEFAULT = Object.assign(/* @__PURE__ */ Object.create(null), {
  detail: true,
  listing: true,
  amenity: true,
  promotion: false,
  contact: false,
  content: false,
  blog: false,
  system: false,
  page: false
});
// Lower-cased schema.org type name -> page role.
var SCHEMA_TYPE_MAP = Object.assign(/* @__PURE__ */ Object.create(null), {
  product: "detail",
  service: "amenity",
  hotelroom: "detail",
  room: "detail",
  apartment: "detail",
  lodgingroom: "detail",
  course: "detail",
  event: "detail",
  offer: "promotion",
  person: "detail",
  employee: "detail",
  profilepage: "detail",
  article: "detail",
  newsarticle: "detail",
  blogposting: "detail",
  creativework: "detail"
});
|
|
247
|
+
/**
 * Clean a page title for display by dropping trailing " – suffix" / " | suffix"
 * segments (typically a site name). At most two segments are removed.
 *
 * A dash suffix is only cut when at least 4 characters precede it, a pipe suffix
 * when at least 8 do, so very short leading fragments are never left on their own.
 *
 * @param title Raw title; may be undefined or blank.
 * @returns The cleaned, trimmed title. Blank or missing input is returned unchanged.
 */
function normalizeDisplayTitle(title) {
  const trimmed = title?.trim();
  if (!trimmed) return title;
  // Index at which `pattern` starts in `text`, or -1 when absent or too close to the start.
  const cutIndex = (text, pattern, minLead) => {
    const hit = pattern.exec(text);
    return hit !== null && hit.index >= minLead ? hit.index : -1;
  };
  let result = trimmed;
  let passes = 0;
  while (passes < 2) {
    let cut = cutIndex(result, EN_DASH_SUFFIX_RE, 4);
    if (cut < 0) cut = cutIndex(result, PIPE_SUFFIX_RE, 8);
    if (cut < 0) break;
    result = result.slice(0, cut).trim();
    passes += 1;
  }
  return result || trimmed;
}
|
|
265
|
+
/**
 * Decide whether a page must never be surfaced as a card.
 *
 * A page is excluded when its path matches the hard-exclusion or blog patterns,
 * when its title matches the exclusion title pattern, or when an absolute URL
 * points at the site root.
 *
 * The URL patterns are applied to the path only. Matching the whole URL let the
 * host and query string take part, so e.g. `https://account.example.com/rooms/suite`
 * or `/rooms?next=/login` were excluded by accident.
 *
 * @param url   Absolute URL, or a relative path.
 * @param title Optional page title.
 * @returns true when the page is hard-excluded.
 */
function hardExcludePage(url, title) {
  let pathname;
  let isSiteRoot = false;
  try {
    pathname = new URL(url).pathname;
    isSiteRoot = pathname === "/" || pathname === "";
  } catch {
    // Not an absolute URL: treat the input as a path and drop any query/fragment.
    // The site-root rule only applies to URLs that parse, so it is skipped here.
    pathname = url.split(/[?#]/, 1)[0];
  }
  const path2 = pathname.toLowerCase();
  if (HARD_EXCLUDE_URL_RE.test(path2)) return true;
  if (BLOG_URL_RE.test(path2)) return true;
  if (title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase())) return true;
  return isSiteRoot;
}
|
|
277
|
+
/**
 * Infer a page role from URL patterns alone.
 *
 * Patterns are tried in a fixed order and the first match wins: promotion,
 * contact, entity detail ("<collection>/<slug>"), amenity, detail, listing, blog.
 *
 * Only the path is matched. Matching the whole URL let the host and query string
 * take part, so e.g. `https://deals.example.com/rooms/suite` was typed "promotion".
 *
 * @param url Absolute URL, or a relative path.
 * @returns The inferred role, or undefined when no pattern matches.
 */
function inferTypeFromUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL: treat the input as a path and drop any query/fragment.
    pathname = url.split(/[?#]/, 1)[0];
  }
  const path2 = pathname.toLowerCase();
  if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
  if (CONTACT_URL_RE.test(path2)) return "contact";
  if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
  if (AMENITY_URL_RE.test(path2)) return "amenity";
  if (DETAIL_URL_RE.test(path2)) return "detail";
  if (LISTING_URL_RE.test(path2)) return "listing";
  if (BLOG_URL_RE.test(path2)) return "blog";
  return void 0;
}
|
|
288
|
+
/**
 * Flatten a parsed JSON-LD payload into a list of node objects.
 *
 * Arrays are walked element by element; every object is emitted, immediately
 * followed by whatever its "@graph" member contains. Primitives and null are
 * ignored. No other members are descended into.
 *
 * @param data Parsed JSON value (object, array, primitive or null).
 * @returns Node objects in depth-first, document order.
 */
function collectJsonLdNodes2(data) {
  const walk = (value) => {
    if (value === null || value === void 0) return [];
    if (Array.isArray(value)) return value.flatMap((item) => walk(item));
    if (typeof value !== "object") return [];
    const graph = value["@graph"];
    return graph ? [value, ...walk(graph)] : [value];
  };
  return walk(data);
}
|
|
304
|
+
/**
 * Return the lower-cased local name of a JSON-LD node's "@type".
 *
 * When "@type" is an array only its first entry is considered. Any namespace is
 * stripped, whether written as a full IRI ("https://schema.org/Product"), a
 * fragment IRI ("…/vocab#Product") or a compact IRI ("schema:Product"); all three
 * yield "product". Previously only "/" was treated as a separator, so compact and
 * fragment forms never matched the type table.
 *
 * @param node A JSON-LD node object.
 * @returns The local type name in lower case, or "" when the node has no type.
 */
function schemaTypeName(node) {
  const declared = node["@type"];
  const first = Array.isArray(declared) ? declared[0] : declared;
  if (first == null) return "";
  const lowered = String(first).toLowerCase();
  // The local name starts after the last namespace separator, whichever kind it is.
  const cut = Math.max(
    lowered.lastIndexOf("/"),
    lowered.lastIndexOf("#"),
    lowered.lastIndexOf(":")
  );
  return cut >= 0 ? lowered.slice(cut + 1) : lowered;
}
|
|
313
|
+
/**
 * Infer a page role from structured data embedded in the HTML.
 *
 * Every `application/ld+json` script is parsed in document order. The first node
 * whose type is listed in SCHEMA_TYPE_MAP decides the role; a node of any other
 * type that carries an `offers` member counts as "detail". If JSON-LD yields
 * nothing, an Open Graph `og:type` of "product" also counts as "detail".
 *
 * @param html Raw page HTML.
 * @returns The inferred role, or undefined when the markup gives no hint.
 */
function inferTypeFromSchema(html) {
  const $ = cheerio2.load(html);
  for (const el of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(el).html()?.trim();
    if (!raw) continue;
    let parsed;
    try {
      parsed = JSON.parse(raw);
    } catch {
      // Malformed JSON-LD is common in the wild; skip this script and try the next.
      continue;
    }
    for (const node of collectJsonLdNodes2(parsed)) {
      const name = schemaTypeName(node);
      // Own-property check: the type name comes from the page, and a plain lookup
      // would return inherited members for names such as "constructor".
      if (Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, name)) {
        return SCHEMA_TYPE_MAP[name];
      }
      if (node.offers != null) return "detail";
    }
  }
  const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
  if (ogType === "product") return "detail";
  return void 0;
}
|
|
332
|
+
/**
 * Map a caller-supplied page type onto the abstract page roles.
 *
 * Matching is case-insensitive. Known roles come back in lower case, the legacy
 * names "room"/"product" become "detail" and "offer"/"sale" become "promotion".
 * Anything else is returned exactly as given; a missing or empty type is "page".
 *
 * @param raw Type string, possibly undefined.
 * @returns The normalised role, or the original string when unrecognised.
 */
function normalizePageType(raw) {
  if (!raw) return "page";
  const key = raw.toLowerCase();
  switch (key) {
    case "detail":
    case "listing":
    case "amenity":
    case "promotion":
    case "contact":
    case "content":
    case "blog":
    case "system":
    case "page":
      return key;
    case "room":
    case "product":
      return "detail";
    case "offer":
    case "sale":
      return "promotion";
    default:
      return raw;
  }
}
|
|
351
|
+
/**
 * Pick the title to display for a page: the trimmed h1 heading when it is
 * non-empty, otherwise the document title, in both cases passed through
 * normalizeDisplayTitle.
 *
 * @param input Object carrying optional `headingTitle` and `title` strings.
 * @returns The normalised display title.
 */
function resolveDisplayTitle(input) {
  const preferred = input.headingTitle?.trim() || input.title;
  return normalizeDisplayTitle(preferred);
}
|
|
356
|
+
/**
 * Resolve the card metadata (role, eligibility, priority, display fields) for a page.
 *
 * Hard-excluded pages are reported as type "system", never eligible, priority 0.
 * Otherwise the caller-supplied type is normalised and, while it is still the
 * generic "page", refined from structured data in the HTML, then from the URL,
 * and finally promoted to "detail" when the page carries a product price.
 * Eligibility and priority come from the per-role tables; an eligible page whose
 * path matches the promotion pattern is made ineligible.
 *
 * Changes from the previous revision:
 * - the description is trimmed (blank becomes undefined) on the hard-exclude path
 *   too, matching the normal path;
 * - table lookups are own-property checks, so a type such as "constructor" falls
 *   back to the defaults instead of picking up an inherited Object member;
 * - the promotion override matches the path only, not the host or query string.
 *
 * @param input Page facts: url, title, headingTitle, description, imageUrl, html,
 *              type, hasProductPrice.
 * @returns type, cardEligible, cardPriority, displayTitle, displayDescription,
 *          displayImageUrl.
 */
function resolvePageCardMetadata(input) {
  const title = input.title?.trim();
  const url = input.url;
  const displayTitle = resolveDisplayTitle(input);
  const displayDescription = input.description?.trim() || void 0;
  if (hardExcludePage(url, title)) {
    return {
      type: "system",
      cardEligible: false,
      cardPriority: 0,
      displayTitle,
      displayDescription,
      displayImageUrl: input.imageUrl
    };
  }
  let type = normalizePageType(input.type);
  if (type === "page" && input.html) {
    const fromSchema = inferTypeFromSchema(input.html);
    if (fromSchema) type = fromSchema;
  }
  if (type === "page") {
    const fromUrl = inferTypeFromUrl(url);
    if (fromUrl) type = fromUrl;
  }
  if (input.hasProductPrice && type === "page") {
    type = "detail";
  }
  const typeKey = String(type).toLowerCase();
  const listed = (table) => Object.prototype.hasOwnProperty.call(table, typeKey);
  let cardEligible = listed(CARD_ELIGIBLE_DEFAULT) ? CARD_ELIGIBLE_DEFAULT[typeKey] : false;
  const cardPriority = listed(CARD_PRIORITY) ? CARD_PRIORITY[typeKey] : 3;
  if (cardEligible) {
    let pathname;
    try {
      pathname = new URL(url).pathname;
    } catch {
      // Not an absolute URL: treat the input as a path and drop any query/fragment.
      pathname = url.split(/[?#]/, 1)[0];
    }
    if (PROMOTION_URL_RE.test(pathname.toLowerCase())) {
      cardEligible = false;
    }
  }
  return {
    type,
    cardEligible,
    cardPriority,
    displayTitle,
    displayDescription,
    displayImageUrl: input.imageUrl
  };
}
|
|
397
|
+
|
|
176
398
|
// src/htmlPageExtract.ts
|
|
177
399
|
var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
178
400
|
var DEFAULT_REMOVE_SELECTORS = [
|
|
@@ -195,17 +417,23 @@ function cleanContent(text) {
|
|
|
195
417
|
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
196
418
|
}
|
|
197
419
|
/**
 * Report how many characters of body text remain once the document has been
 * loaded, passed through `stripNoiseFromDom`, and cleaned with `cleanContent`.
 *
 * @param html Raw HTML markup to load.
 * @param options Forwarded unchanged to `stripNoiseFromDom`; defaults to `{}`.
 * @returns Length of the cleaned `<body>` text.
 */
function bodyTextLengthHint(html, options = {}) {
  const dom = cheerio3.load(html);
  stripNoiseFromDom(dom, options);
  const bodyText = dom("body").text().trim();
  const cleaned = cleanContent(bodyText);
  return cleaned.length;
}
|
|
202
424
|
function extractPageFromHtml(url, html, options = {}) {
|
|
203
|
-
const $ =
|
|
425
|
+
const $ = cheerio3.load(html);
|
|
204
426
|
stripNoiseFromDom($, options);
|
|
205
|
-
const
|
|
206
|
-
|
|
427
|
+
const h1Title = $("h1").first().text().trim();
|
|
428
|
+
const docTitle = $("title").text().trim();
|
|
429
|
+
let title = "";
|
|
430
|
+
if (options.titleSelector) {
|
|
431
|
+
title = $(options.titleSelector).first().text().trim();
|
|
432
|
+
} else {
|
|
433
|
+
title = docTitle || h1Title;
|
|
434
|
+
}
|
|
207
435
|
if (!title) {
|
|
208
|
-
title =
|
|
436
|
+
title = h1Title || docTitle;
|
|
209
437
|
}
|
|
210
438
|
const content = extractBestContentText($, options);
|
|
211
439
|
const minChars = options.minExtractedContentLength ?? 50;
|
|
@@ -230,12 +458,27 @@ function extractPageFromHtml(url, html, options = {}) {
|
|
|
230
458
|
}
|
|
231
459
|
}
|
|
232
460
|
const productMeta = extractProductMetadata(html);
|
|
233
|
-
const
|
|
461
|
+
const cardMeta = resolvePageCardMetadata({
|
|
462
|
+
url,
|
|
463
|
+
title,
|
|
464
|
+
headingTitle: h1Title || void 0,
|
|
465
|
+
description,
|
|
466
|
+
imageUrl,
|
|
467
|
+
html,
|
|
234
468
|
type,
|
|
469
|
+
hasProductPrice: productMeta.price != null
|
|
470
|
+
});
|
|
471
|
+
const metadata = {
|
|
472
|
+
type: cardMeta.type,
|
|
473
|
+
cardEligible: cardMeta.cardEligible,
|
|
474
|
+
cardPriority: cardMeta.cardPriority,
|
|
235
475
|
...title ? { title } : {},
|
|
476
|
+
...cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {},
|
|
236
477
|
url,
|
|
237
478
|
...imageUrl ? { imageUrl } : {},
|
|
479
|
+
...cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {},
|
|
238
480
|
...description ? { description } : {},
|
|
481
|
+
...cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {},
|
|
239
482
|
...productMeta.price != null ? { price: productMeta.price } : {},
|
|
240
483
|
...productMeta.currency ? { currency: productMeta.currency } : {},
|
|
241
484
|
...productMeta.availability ? { availability: productMeta.availability } : {},
|
|
@@ -541,13 +784,18 @@ var WebRAGPlugin = class {
|
|
|
541
784
|
plugin: this.name,
|
|
542
785
|
contentCount: scoredResults.length,
|
|
543
786
|
types: [...new Set(scoredResults.map((d) => d.metadata.type))],
|
|
544
|
-
topResults: scoredResults.slice(0,
|
|
787
|
+
topResults: scoredResults.slice(0, 16).map((doc) => ({
|
|
545
788
|
id: doc.id,
|
|
546
789
|
type: doc.metadata.type,
|
|
547
790
|
title: doc.metadata.title,
|
|
548
791
|
url: doc.metadata.url,
|
|
549
792
|
imageUrl: doc.metadata.imageUrl,
|
|
550
793
|
description: doc.metadata.description,
|
|
794
|
+
cardEligible: doc.metadata.cardEligible,
|
|
795
|
+
cardPriority: doc.metadata.cardPriority,
|
|
796
|
+
displayTitle: doc.metadata.displayTitle,
|
|
797
|
+
displayDescription: doc.metadata.displayDescription,
|
|
798
|
+
displayImageUrl: doc.metadata.displayImageUrl,
|
|
551
799
|
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
552
800
|
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
553
801
|
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
@@ -1721,7 +1969,7 @@ var WebRAGPlugin = class {
|
|
|
1721
1969
|
return await response.text();
|
|
1722
1970
|
}
|
|
1723
1971
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1724
|
-
const $ =
|
|
1972
|
+
const $ = cheerio4.load(html);
|
|
1725
1973
|
const links = /* @__PURE__ */ new Set();
|
|
1726
1974
|
$("a[href]").each((_, el) => {
|
|
1727
1975
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -2587,8 +2835,12 @@ export {
|
|
|
2587
2835
|
bodyTextLengthHint,
|
|
2588
2836
|
extractPageFromHtml,
|
|
2589
2837
|
extractProductMetadata,
|
|
2838
|
+
hardExcludePage,
|
|
2839
|
+
inferTypeFromUrl,
|
|
2590
2840
|
normalizeAvailability,
|
|
2591
2841
|
normalizeCurrency,
|
|
2842
|
+
normalizeDisplayTitle,
|
|
2592
2843
|
parsePrice,
|
|
2844
|
+
resolvePageCardMetadata,
|
|
2593
2845
|
urlToDocumentId
|
|
2594
2846
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@snap-agent/rag-web",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.7",
|
|
4
4
|
"description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|