@snap-agent/rag-web 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +52 -1
- package/dist/index.d.ts +52 -1
- package/dist/index.js +312 -31
- package/dist/index.mjs +308 -31
- package/package.json +1 -2
package/dist/index.d.mts
CHANGED
|
@@ -114,6 +114,12 @@ interface CrawlPageStatusEntry {
|
|
|
114
114
|
httpStatus?: number;
|
|
115
115
|
error?: string;
|
|
116
116
|
skippedReason?: string;
|
|
117
|
+
/**
|
|
118
|
+
* Same-origin internal links found on this page, populated only when `extractLinks` is set on
|
|
119
|
+
* the crawl config. Enables resumable recursive (BFS) crawling: the caller feeds these back into
|
|
120
|
+
* its own frontier instead of the SDK doing a separate link-discovery fetch.
|
|
121
|
+
*/
|
|
122
|
+
links?: string[];
|
|
117
123
|
}
|
|
118
124
|
interface CrawlLedgerDocument {
|
|
119
125
|
tenantId: string;
|
|
@@ -258,6 +264,10 @@ interface SitemapConfig {
|
|
|
258
264
|
*/
|
|
259
265
|
debug?: DebugOptions;
|
|
260
266
|
crawlLedger?: CrawlLedgerOptions;
|
|
267
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
268
|
+
extractLinks?: boolean;
|
|
269
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
270
|
+
maxLinksPerPage?: number;
|
|
261
271
|
}
|
|
262
272
|
/**
|
|
263
273
|
* Direct URL list crawling configuration
|
|
@@ -277,6 +287,10 @@ interface UrlListConfig {
|
|
|
277
287
|
debug?: DebugOptions;
|
|
278
288
|
stripQueryParams?: boolean;
|
|
279
289
|
crawlLedger?: CrawlLedgerOptions;
|
|
290
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
291
|
+
extractLinks?: boolean;
|
|
292
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
293
|
+
maxLinksPerPage?: number;
|
|
280
294
|
}
|
|
281
295
|
/**
|
|
282
296
|
* Single page ingestion (no discovery)
|
|
@@ -321,6 +335,10 @@ interface WebsiteCrawlConfig {
|
|
|
321
335
|
renderOptions?: RenderOptions;
|
|
322
336
|
debug?: DebugOptions;
|
|
323
337
|
crawlLedger?: CrawlLedgerOptions;
|
|
338
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
339
|
+
extractLinks?: boolean;
|
|
340
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
341
|
+
maxLinksPerPage?: number;
|
|
324
342
|
}
|
|
325
343
|
interface RenderOptions {
|
|
326
344
|
/**
|
|
@@ -661,6 +679,12 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
661
679
|
private normalizeWebsiteUrl;
|
|
662
680
|
private fetchHtml;
|
|
663
681
|
private extractInternalLinks;
|
|
682
|
+
/**
|
|
683
|
+
* When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
|
|
684
|
+
* caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
|
|
685
|
+
* undefined when disabled or on any parse error (link extraction must never fail a crawl).
|
|
686
|
+
*/
|
|
687
|
+
private extractLinksIfEnabled;
|
|
664
688
|
/**
|
|
665
689
|
* Ingest content from a list of URLs
|
|
666
690
|
*
|
|
@@ -805,4 +829,31 @@ declare function parsePrice(value: unknown): number | undefined;
|
|
|
805
829
|
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
830
|
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
831
|
|
|
808
|
-
|
|
832
|
+
/** Abstract page roles — vertical-agnostic. */
|
|
833
|
+
type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
|
|
834
|
+
interface PageCardMetadataInput {
|
|
835
|
+
url: string;
|
|
836
|
+
title?: string;
|
|
837
|
+
/** Primary heading (h1) — preferred for displayTitle over the document title tag. */
|
|
838
|
+
headingTitle?: string;
|
|
839
|
+
description?: string;
|
|
840
|
+
imageUrl?: string;
|
|
841
|
+
html?: string;
|
|
842
|
+
/** Type already resolved from typeFromUrl / defaultType. */
|
|
843
|
+
type?: string;
|
|
844
|
+
hasProductPrice?: boolean;
|
|
845
|
+
}
|
|
846
|
+
interface PageCardMetadataResult {
|
|
847
|
+
type: PageCardType | string;
|
|
848
|
+
cardEligible: boolean;
|
|
849
|
+
cardPriority: number;
|
|
850
|
+
displayTitle?: string;
|
|
851
|
+
displayDescription?: string;
|
|
852
|
+
displayImageUrl?: string;
|
|
853
|
+
}
|
|
854
|
+
declare function normalizeDisplayTitle(title?: string): string | undefined;
|
|
855
|
+
declare function hardExcludePage(url: string, title?: string): boolean;
|
|
856
|
+
declare function inferTypeFromUrl(url: string): PageCardType | undefined;
|
|
857
|
+
declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
|
|
858
|
+
|
|
859
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
|
package/dist/index.d.ts
CHANGED
|
@@ -114,6 +114,12 @@ interface CrawlPageStatusEntry {
|
|
|
114
114
|
httpStatus?: number;
|
|
115
115
|
error?: string;
|
|
116
116
|
skippedReason?: string;
|
|
117
|
+
/**
|
|
118
|
+
* Same-origin internal links found on this page, populated only when `extractLinks` is set on
|
|
119
|
+
* the crawl config. Enables resumable recursive (BFS) crawling: the caller feeds these back into
|
|
120
|
+
* its own frontier instead of the SDK doing a separate link-discovery fetch.
|
|
121
|
+
*/
|
|
122
|
+
links?: string[];
|
|
117
123
|
}
|
|
118
124
|
interface CrawlLedgerDocument {
|
|
119
125
|
tenantId: string;
|
|
@@ -258,6 +264,10 @@ interface SitemapConfig {
|
|
|
258
264
|
*/
|
|
259
265
|
debug?: DebugOptions;
|
|
260
266
|
crawlLedger?: CrawlLedgerOptions;
|
|
267
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
268
|
+
extractLinks?: boolean;
|
|
269
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
270
|
+
maxLinksPerPage?: number;
|
|
261
271
|
}
|
|
262
272
|
/**
|
|
263
273
|
* Direct URL list crawling configuration
|
|
@@ -277,6 +287,10 @@ interface UrlListConfig {
|
|
|
277
287
|
debug?: DebugOptions;
|
|
278
288
|
stripQueryParams?: boolean;
|
|
279
289
|
crawlLedger?: CrawlLedgerOptions;
|
|
290
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
291
|
+
extractLinks?: boolean;
|
|
292
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
293
|
+
maxLinksPerPage?: number;
|
|
280
294
|
}
|
|
281
295
|
/**
|
|
282
296
|
* Single page ingestion (no discovery)
|
|
@@ -321,6 +335,10 @@ interface WebsiteCrawlConfig {
|
|
|
321
335
|
renderOptions?: RenderOptions;
|
|
322
336
|
debug?: DebugOptions;
|
|
323
337
|
crawlLedger?: CrawlLedgerOptions;
|
|
338
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
339
|
+
extractLinks?: boolean;
|
|
340
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
341
|
+
maxLinksPerPage?: number;
|
|
324
342
|
}
|
|
325
343
|
interface RenderOptions {
|
|
326
344
|
/**
|
|
@@ -661,6 +679,12 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
661
679
|
private normalizeWebsiteUrl;
|
|
662
680
|
private fetchHtml;
|
|
663
681
|
private extractInternalLinks;
|
|
682
|
+
/**
|
|
683
|
+
* When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
|
|
684
|
+
* caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
|
|
685
|
+
* undefined when disabled or on any parse error (link extraction must never fail a crawl).
|
|
686
|
+
*/
|
|
687
|
+
private extractLinksIfEnabled;
|
|
664
688
|
/**
|
|
665
689
|
* Ingest content from a list of URLs
|
|
666
690
|
*
|
|
@@ -805,4 +829,31 @@ declare function parsePrice(value: unknown): number | undefined;
|
|
|
805
829
|
declare function normalizeCurrency(value: unknown): string | undefined;
|
|
806
830
|
declare function normalizeAvailability(value: unknown): string | undefined;
|
|
807
831
|
|
|
808
|
-
|
|
832
|
+
/** Abstract page roles — vertical-agnostic. */
|
|
833
|
+
type PageCardType = 'detail' | 'listing' | 'amenity' | 'promotion' | 'contact' | 'content' | 'blog' | 'system' | 'page';
|
|
834
|
+
interface PageCardMetadataInput {
|
|
835
|
+
url: string;
|
|
836
|
+
title?: string;
|
|
837
|
+
/** Primary heading (h1) — preferred for displayTitle over the document title tag. */
|
|
838
|
+
headingTitle?: string;
|
|
839
|
+
description?: string;
|
|
840
|
+
imageUrl?: string;
|
|
841
|
+
html?: string;
|
|
842
|
+
/** Type already resolved from typeFromUrl / defaultType. */
|
|
843
|
+
type?: string;
|
|
844
|
+
hasProductPrice?: boolean;
|
|
845
|
+
}
|
|
846
|
+
interface PageCardMetadataResult {
|
|
847
|
+
type: PageCardType | string;
|
|
848
|
+
cardEligible: boolean;
|
|
849
|
+
cardPriority: number;
|
|
850
|
+
displayTitle?: string;
|
|
851
|
+
displayDescription?: string;
|
|
852
|
+
displayImageUrl?: string;
|
|
853
|
+
}
|
|
854
|
+
declare function normalizeDisplayTitle(title?: string): string | undefined;
|
|
855
|
+
declare function hardExcludePage(url: string, title?: string): boolean;
|
|
856
|
+
declare function inferTypeFromUrl(url: string): PageCardType | undefined;
|
|
857
|
+
declare function resolvePageCardMetadata(input: PageCardMetadataInput): PageCardMetadataResult;
|
|
858
|
+
|
|
859
|
+
export { type BulkProgressCallback, type BulkProgressPhase, type BulkProgressUpdate, type CrawlLedgerDocument, type CrawlLedgerOptions, type CrawlLedgerPluginConfig, type CrawlLedgerStatus, type CrawlPageCallback, type CrawlPageEvent, type CrawlPageStatusEntry, type CrawlProgressCallback, type CrawlProgressPhase, type CrawlProgressUpdate, type CrawlResult, type DataTransform, type DebugOptions, type DrupalConfig, type HtmlPageExtractOptions, type HtmlPageExtractResult, type PageCardMetadataInput, type PageCardMetadataResult, type PageCardType, type ProductMetadata, type RSSConfig, type RenderOptions, type SanityConfig, type SinglePageConfig, type SitemapConfig, type StoredWebDocument, type StrapiConfig, type URLSource, type URLSourceAuth, type UrlListConfig, type WebDocument, type WebIngestResult, type WebRAGConfig, WebRAGPlugin, type WebURLIngestResult, type WebsiteCrawlConfig, type WordPressConfig, bodyTextLengthHint, extractPageFromHtml, extractProductMetadata, hardExcludePage, inferTypeFromUrl, normalizeAvailability, normalizeCurrency, normalizeDisplayTitle, parsePrice, resolvePageCardMetadata, urlToDocumentId };
|
package/dist/index.js
CHANGED
|
@@ -34,9 +34,13 @@ __export(index_exports, {
|
|
|
34
34
|
bodyTextLengthHint: () => bodyTextLengthHint,
|
|
35
35
|
extractPageFromHtml: () => extractPageFromHtml,
|
|
36
36
|
extractProductMetadata: () => extractProductMetadata,
|
|
37
|
+
hardExcludePage: () => hardExcludePage,
|
|
38
|
+
inferTypeFromUrl: () => inferTypeFromUrl,
|
|
37
39
|
normalizeAvailability: () => normalizeAvailability,
|
|
38
40
|
normalizeCurrency: () => normalizeCurrency,
|
|
41
|
+
normalizeDisplayTitle: () => normalizeDisplayTitle,
|
|
39
42
|
parsePrice: () => parsePrice,
|
|
43
|
+
resolvePageCardMetadata: () => resolvePageCardMetadata,
|
|
40
44
|
urlToDocumentId: () => urlToDocumentId
|
|
41
45
|
});
|
|
42
46
|
module.exports = __toCommonJS(index_exports);
|
|
@@ -44,12 +48,12 @@ module.exports = __toCommonJS(index_exports);
|
|
|
44
48
|
// src/WebRAGPlugin.ts
|
|
45
49
|
var import_mongodb = require("mongodb");
|
|
46
50
|
var import_openai = __toESM(require("openai"));
|
|
47
|
-
var
|
|
51
|
+
var cheerio4 = __toESM(require("cheerio"));
|
|
48
52
|
var fs = __toESM(require("fs"));
|
|
49
53
|
var path = __toESM(require("path"));
|
|
50
54
|
|
|
51
55
|
// src/htmlPageExtract.ts
|
|
52
|
-
var
|
|
56
|
+
var cheerio3 = __toESM(require("cheerio"));
|
|
53
57
|
|
|
54
58
|
// src/productMetadata.ts
|
|
55
59
|
var cheerio = __toESM(require("cheerio"));
|
|
@@ -216,6 +220,228 @@ function normalizeAvailability(value) {
|
|
|
216
220
|
return s.replace(/\s+/g, "") || void 0;
|
|
217
221
|
}
|
|
218
222
|
|
|
223
|
+
// src/pageCardMetadata.ts
|
|
224
|
+
var cheerio2 = __toESM(require("cheerio"));
|
|
225
|
+
var HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
|
|
226
|
+
var HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
|
|
227
|
+
var BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;
|
|
228
|
+
var PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
|
|
229
|
+
var PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
|
|
230
|
+
var ENTITY_DETAIL_PATH_RES = [
|
|
231
|
+
/\/projects\/[^/]+/i,
|
|
232
|
+
/\/project\/[^/]+/i,
|
|
233
|
+
/\/perspectives\/[^/]+/i,
|
|
234
|
+
/\/perspective\/[^/]+/i,
|
|
235
|
+
/\/portfolio\/[^/]+/i,
|
|
236
|
+
/\/case-stud(?:y|ies)\/[^/]+/i,
|
|
237
|
+
/\/insights?\/[^/]+/i,
|
|
238
|
+
/\/people\/[^/]+/i,
|
|
239
|
+
/\/person\/[^/]+/i,
|
|
240
|
+
/\/team-members?\/[^/]+/i,
|
|
241
|
+
/\/members?\/[^/]+/i,
|
|
242
|
+
/\/staff\/[^/]+/i,
|
|
243
|
+
/\/experts?\/[^/]+/i,
|
|
244
|
+
/\/authors?\/[^/]+/i,
|
|
245
|
+
/\/leadership\/[^/]+/i,
|
|
246
|
+
/\/biograph(?:y|ies)\/[^/]+/i
|
|
247
|
+
];
|
|
248
|
+
var DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
|
|
249
|
+
var LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
|
|
250
|
+
var AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
|
|
251
|
+
var CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;
|
|
252
|
+
var EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
|
|
253
|
+
var PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
|
|
254
|
+
var CARD_PRIORITY = {
|
|
255
|
+
detail: 10,
|
|
256
|
+
listing: 6,
|
|
257
|
+
amenity: 5,
|
|
258
|
+
promotion: 2,
|
|
259
|
+
contact: 1,
|
|
260
|
+
content: 1,
|
|
261
|
+
blog: 0,
|
|
262
|
+
system: 0,
|
|
263
|
+
page: 3
|
|
264
|
+
};
|
|
265
|
+
var CARD_ELIGIBLE_DEFAULT = {
|
|
266
|
+
detail: true,
|
|
267
|
+
listing: true,
|
|
268
|
+
amenity: true,
|
|
269
|
+
promotion: false,
|
|
270
|
+
contact: false,
|
|
271
|
+
content: false,
|
|
272
|
+
blog: false,
|
|
273
|
+
system: false,
|
|
274
|
+
page: false
|
|
275
|
+
};
|
|
276
|
+
var SCHEMA_TYPE_MAP = {
|
|
277
|
+
product: "detail",
|
|
278
|
+
service: "amenity",
|
|
279
|
+
hotelroom: "detail",
|
|
280
|
+
room: "detail",
|
|
281
|
+
apartment: "detail",
|
|
282
|
+
lodgingroom: "detail",
|
|
283
|
+
course: "detail",
|
|
284
|
+
event: "detail",
|
|
285
|
+
offer: "promotion",
|
|
286
|
+
person: "detail",
|
|
287
|
+
employee: "detail",
|
|
288
|
+
profilepage: "detail",
|
|
289
|
+
article: "detail",
|
|
290
|
+
newsarticle: "detail",
|
|
291
|
+
blogposting: "detail",
|
|
292
|
+
creativework: "detail"
|
|
293
|
+
};
|
|
294
|
+
/**
 * Strip trailing site-name style suffixes from a title so it can be shown on a card.
 *
 * Runs at most two passes. Each pass cuts everything from the first match of
 * EN_DASH_SUFFIX_RE (spaced en/em dash) to the end of the string, or, when that
 * does not apply, from the first match of PIPE_SUFFIX_RE (spaced pipe). A cut is
 * only made when at least 4 (dash) or 8 (pipe) characters precede the separator,
 * so a very short leading segment is never left on its own.
 *
 * @param title Raw title; may be undefined, empty, or whitespace-only.
 * @returns The cleaned, trimmed title, or undefined when there is no visible text.
 */
function normalizeDisplayTitle(title) {
  const trimmed = title?.trim();
  // Blank input yields undefined instead of echoing whitespace back: a string
  // such as "   " is truthy and would otherwise pass callers' truthiness checks.
  if (!trimmed) return void 0;
  let t = trimmed;
  for (let pass = 0; pass < 2; pass++) {
    const dash = t.match(EN_DASH_SUFFIX_RE);
    if (dash && dash.index !== void 0 && dash.index >= 4) {
      t = t.slice(0, dash.index).trim();
      continue;
    }
    const pipe = t.match(PIPE_SUFFIX_RE);
    if (pipe && pipe.index !== void 0 && pipe.index >= 8) {
      t = t.slice(0, pipe.index).trim();
      continue;
    }
    // Neither separator applies any more; nothing left to strip.
    break;
  }
  return t || trimmed;
}
|
|
312
|
+
/**
 * Decide whether a page must never be offered as a card.
 *
 * A page is excluded when its URL path matches HARD_EXCLUDE_URL_RE or
 * BLOG_URL_RE, when its title matches HARD_EXCLUDE_TITLE_RE, or when the URL
 * points at the site root.
 *
 * The URL patterns are applied to the pathname only. Matching the whole URL
 * would let the host or query string trigger an exclusion, e.g.
 * `https://admin.example.com/rooms/suite` or `...?next=/login`.
 *
 * @param url Page URL. Absolute URLs are parsed; anything unparseable is
 *   matched as given, minus any query string or fragment.
 * @param title Optional page title.
 * @returns true when the page should be excluded.
 */
function hardExcludePage(url, title) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Relative or malformed URL: there is no parsed pathname to inspect.
    pathname = void 0;
  }
  // Site root (home page).
  if (pathname === "/" || pathname === "") return true;
  const path2 = (pathname ?? url.split(/[?#]/, 1)[0]).toLowerCase();
  if (HARD_EXCLUDE_URL_RE.test(path2)) return true;
  if (BLOG_URL_RE.test(path2)) return true;
  if (title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase())) return true;
  return false;
}
|
|
324
|
+
/**
 * Infer an abstract page type from the URL path alone.
 *
 * Patterns are tried in a fixed order and the first hit wins: promotion,
 * contact, entity detail paths, amenity, detail, listing, blog.
 *
 * Only the pathname is matched. Testing the whole URL would let the host or
 * query string decide the type, e.g. `https://deals.example.com/rooms/suite`
 * would be classified as a promotion.
 *
 * @param url Page URL. Absolute URLs are parsed; anything unparseable is
 *   matched as given, minus any query string or fragment.
 * @returns The inferred type, or undefined when no pattern matches.
 */
function inferTypeFromUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Relative or malformed URL: fall back to the raw string without ?query/#hash.
    pathname = url.split(/[?#]/, 1)[0];
  }
  const path2 = pathname.toLowerCase();
  if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
  if (CONTACT_URL_RE.test(path2)) return "contact";
  if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
  if (AMENITY_URL_RE.test(path2)) return "amenity";
  if (DETAIL_URL_RE.test(path2)) return "detail";
  if (LISTING_URL_RE.test(path2)) return "listing";
  if (BLOG_URL_RE.test(path2)) return "blog";
  return void 0;
}
|
|
335
|
+
/**
 * Flatten a parsed JSON-LD payload into a list of its object nodes.
 *
 * Arrays are expanded in order, every plain object is collected, and an
 * object's `@graph` member is expanded right after the object itself.
 * Null/undefined and primitive values are ignored. No other properties are
 * descended into.
 *
 * @param data Parsed JSON-LD value (object, array, or anything else).
 * @returns Collected object nodes in document (pre-order) sequence.
 */
function collectJsonLdNodes2(data) {
  const collected = [];
  // Explicit stack instead of recursion; children are pushed in reverse so
  // they are popped in their original order.
  const stack = [data];
  while (stack.length > 0) {
    const current = stack.pop();
    if (current == null || typeof current !== "object") continue;
    if (Array.isArray(current)) {
      for (let i = current.length - 1; i >= 0; i--) {
        stack.push(current[i]);
      }
      continue;
    }
    collected.push(current);
    const graph = current["@graph"];
    if (graph) stack.push(graph);
  }
  return collected;
}
|
|
351
|
+
/**
 * Return the lower-cased short name of a JSON-LD node's `@type`.
 *
 * When `@type` is an array only its first entry is used. Anything up to and
 * including the last "/" is dropped, so `https://schema.org/Product` becomes
 * `product`.
 *
 * @param node JSON-LD object node.
 * @returns The short type name, or "" when the node declares no type.
 */
function schemaTypeName(node) {
  const declared = node["@type"];
  const first = Array.isArray(declared) ? declared[0] : declared;
  if (first == null) return "";
  const lowered = String(first).toLowerCase();
  // lastIndexOf yields -1 when there is no slash, which makes the slice start at 0.
  return lowered.slice(lowered.lastIndexOf("/") + 1);
}
|
|
360
|
+
/**
 * Infer an abstract page type from structured data embedded in the HTML.
 *
 * Scans every `<script type="application/ld+json">` block in document order.
 * The first node whose schema type is listed in SCHEMA_TYPE_MAP decides the
 * result; a node carrying an `offers` member counts as "detail". If no JSON-LD
 * node decides, an `og:type` of "product" also yields "detail".
 *
 * @param html Raw page HTML (untrusted input).
 * @returns The inferred type, or undefined when nothing conclusive is found.
 */
function inferTypeFromSchema(html) {
  const $ = cheerio2.load(html);
  for (const el of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(el).html()?.trim();
    if (!raw) continue;
    try {
      const parsed = JSON.parse(raw);
      for (const node of collectJsonLdNodes2(parsed)) {
        const name = schemaTypeName(node);
        // Own-property check: the type name comes from page content, and a plain
        // index lookup would resolve names such as "constructor" through the
        // prototype chain and return a function as the page type.
        if (Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, name)) {
          return SCHEMA_TYPE_MAP[name];
        }
        // "product" is already covered by SCHEMA_TYPE_MAP, so only the offers
        // heuristic remains here.
        if (node.offers != null) return "detail";
      }
    } catch {
      // Malformed JSON-LD is common in the wild; skip this block and keep scanning.
    }
  }
  const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
  if (ogType === "product") return "detail";
  return void 0;
}
|
|
379
|
+
/**
 * Map a caller-supplied type label onto one of the abstract page types.
 *
 * Known abstract types are returned lower-cased, a few legacy aliases are
 * translated, and any other label is passed through exactly as given.
 *
 * @param raw Type label, possibly undefined or empty.
 * @returns "page" for missing input, otherwise the normalised or original label.
 */
function normalizePageType(raw) {
  if (!raw) return "page";
  const lowered = raw.toLowerCase();
  switch (lowered) {
    case "detail":
    case "listing":
    case "amenity":
    case "promotion":
    case "contact":
    case "content":
    case "blog":
    case "system":
    case "page":
      return lowered;
    // Legacy aliases.
    case "room":
    case "product":
      return "detail";
    case "offer":
    case "sale":
      return "promotion";
    default:
      // Unknown labels are preserved verbatim, including their original casing.
      return raw;
  }
}
|
|
398
|
+
/**
 * Pick the title shown on a card: the page's primary heading when it has
 * visible text, otherwise the document title. The chosen value is passed
 * through normalizeDisplayTitle.
 *
 * @param input Card metadata input; reads `headingTitle` and `title`.
 * @returns Whatever normalizeDisplayTitle returns for the chosen title.
 */
function resolveDisplayTitle(input) {
  const heading = input.headingTitle?.trim();
  return normalizeDisplayTitle(heading || input.title);
}
|
|
403
|
+
/**
 * Resolve the card-related metadata (type, eligibility, priority, display
 * fields) for a crawled page.
 *
 * Resolution order:
 *   1. Pages rejected by hardExcludePage become type "system" and are never eligible.
 *   2. The supplied `type` is normalised; while it is still the generic "page",
 *      it is refined from embedded schema data (when `html` is given), then
 *      from the URL, then from the presence of a product price.
 *   3. Eligibility and priority come from CARD_ELIGIBLE_DEFAULT / CARD_PRIORITY,
 *      defaulting to not eligible / priority 3 for unlisted types.
 *   4. An otherwise eligible page whose URL path looks promotional is demoted.
 *
 * @param input Page facts gathered during extraction.
 * @returns The resolved card metadata.
 */
function resolvePageCardMetadata(input) {
  const title = input.title?.trim();
  const url = input.url;
  const displayTitle = resolveDisplayTitle(input);
  // Trimmed once so both return paths expose the same value; blank becomes undefined.
  const displayDescription = input.description?.trim() || void 0;
  if (hardExcludePage(url, title)) {
    return {
      type: "system",
      cardEligible: false,
      cardPriority: 0,
      displayTitle,
      displayDescription,
      displayImageUrl: input.imageUrl
    };
  }
  let type = normalizePageType(input.type);
  if (type === "page" && input.html) {
    const fromSchema = inferTypeFromSchema(input.html);
    if (fromSchema) type = fromSchema;
  }
  if (type === "page") {
    const fromUrl = inferTypeFromUrl(url);
    if (fromUrl) type = fromUrl;
  }
  if (input.hasProductPrice && type === "page") {
    type = "detail";
  }
  const typeKey = String(type).toLowerCase();
  // Own-property lookups only: `type` may be an arbitrary caller-supplied label,
  // and a plain index would resolve keys such as "constructor" or "__proto__"
  // through the prototype chain, yielding non-boolean / non-numeric results.
  const hasEntry = (table) => Object.prototype.hasOwnProperty.call(table, typeKey);
  let cardEligible = hasEntry(CARD_ELIGIBLE_DEFAULT) ? CARD_ELIGIBLE_DEFAULT[typeKey] : false;
  const cardPriority = hasEntry(CARD_PRIORITY) ? CARD_PRIORITY[typeKey] : 3;
  if (cardEligible) {
    // Match the promotion pattern against the path only, so a host such as
    // "offers.example.com" or a query string cannot demote every page.
    let promoPath;
    try {
      promoPath = new URL(url).pathname;
    } catch {
      promoPath = url.split(/[?#]/, 1)[0];
    }
    if (PROMOTION_URL_RE.test(promoPath.toLowerCase())) {
      cardEligible = false;
    }
  }
  return {
    type,
    cardEligible,
    cardPriority,
    displayTitle,
    displayDescription,
    displayImageUrl: input.imageUrl
  };
}
|
|
444
|
+
|
|
219
445
|
// src/htmlPageExtract.ts
|
|
220
446
|
var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
221
447
|
var DEFAULT_REMOVE_SELECTORS = [
|
|
@@ -238,17 +464,23 @@ function cleanContent(text) {
|
|
|
238
464
|
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
239
465
|
}
|
|
240
466
|
function bodyTextLengthHint(html, options = {}) {
|
|
241
|
-
const $ =
|
|
467
|
+
const $ = cheerio3.load(html);
|
|
242
468
|
stripNoiseFromDom($, options);
|
|
243
469
|
return cleanContent($("body").text().trim()).length;
|
|
244
470
|
}
|
|
245
471
|
function extractPageFromHtml(url, html, options = {}) {
|
|
246
|
-
const $ =
|
|
472
|
+
const $ = cheerio3.load(html);
|
|
247
473
|
stripNoiseFromDom($, options);
|
|
248
|
-
const
|
|
249
|
-
|
|
474
|
+
const h1Title = $("h1").first().text().trim();
|
|
475
|
+
const docTitle = $("title").text().trim();
|
|
476
|
+
let title = "";
|
|
477
|
+
if (options.titleSelector) {
|
|
478
|
+
title = $(options.titleSelector).first().text().trim();
|
|
479
|
+
} else {
|
|
480
|
+
title = docTitle || h1Title;
|
|
481
|
+
}
|
|
250
482
|
if (!title) {
|
|
251
|
-
title =
|
|
483
|
+
title = h1Title || docTitle;
|
|
252
484
|
}
|
|
253
485
|
const content = extractBestContentText($, options);
|
|
254
486
|
const minChars = options.minExtractedContentLength ?? 50;
|
|
@@ -273,12 +505,27 @@ function extractPageFromHtml(url, html, options = {}) {
|
|
|
273
505
|
}
|
|
274
506
|
}
|
|
275
507
|
const productMeta = extractProductMetadata(html);
|
|
276
|
-
const
|
|
508
|
+
const cardMeta = resolvePageCardMetadata({
|
|
509
|
+
url,
|
|
510
|
+
title,
|
|
511
|
+
headingTitle: h1Title || void 0,
|
|
512
|
+
description,
|
|
513
|
+
imageUrl,
|
|
514
|
+
html,
|
|
277
515
|
type,
|
|
516
|
+
hasProductPrice: productMeta.price != null
|
|
517
|
+
});
|
|
518
|
+
const metadata = {
|
|
519
|
+
type: cardMeta.type,
|
|
520
|
+
cardEligible: cardMeta.cardEligible,
|
|
521
|
+
cardPriority: cardMeta.cardPriority,
|
|
278
522
|
...title ? { title } : {},
|
|
523
|
+
...cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {},
|
|
279
524
|
url,
|
|
280
525
|
...imageUrl ? { imageUrl } : {},
|
|
526
|
+
...cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {},
|
|
281
527
|
...description ? { description } : {},
|
|
528
|
+
...cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {},
|
|
282
529
|
...productMeta.price != null ? { price: productMeta.price } : {},
|
|
283
530
|
...productMeta.currency ? { currency: productMeta.currency } : {},
|
|
284
531
|
...productMeta.availability ? { availability: productMeta.availability } : {},
|
|
@@ -584,13 +831,18 @@ var WebRAGPlugin = class {
|
|
|
584
831
|
plugin: this.name,
|
|
585
832
|
contentCount: scoredResults.length,
|
|
586
833
|
types: [...new Set(scoredResults.map((d) => d.metadata.type))],
|
|
587
|
-
topResults: scoredResults.slice(0,
|
|
834
|
+
topResults: scoredResults.slice(0, 16).map((doc) => ({
|
|
588
835
|
id: doc.id,
|
|
589
836
|
type: doc.metadata.type,
|
|
590
837
|
title: doc.metadata.title,
|
|
591
838
|
url: doc.metadata.url,
|
|
592
839
|
imageUrl: doc.metadata.imageUrl,
|
|
593
840
|
description: doc.metadata.description,
|
|
841
|
+
cardEligible: doc.metadata.cardEligible,
|
|
842
|
+
cardPriority: doc.metadata.cardPriority,
|
|
843
|
+
displayTitle: doc.metadata.displayTitle,
|
|
844
|
+
displayDescription: doc.metadata.displayDescription,
|
|
845
|
+
displayImageUrl: doc.metadata.displayImageUrl,
|
|
594
846
|
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
595
847
|
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
596
848
|
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
@@ -1625,7 +1877,9 @@ var WebRAGPlugin = class {
|
|
|
1625
1877
|
render: config.render,
|
|
1626
1878
|
renderOptions: config.renderOptions,
|
|
1627
1879
|
debug: config.debug,
|
|
1628
|
-
crawlLedger: config.crawlLedger
|
|
1880
|
+
crawlLedger: config.crawlLedger,
|
|
1881
|
+
extractLinks: config.extractLinks,
|
|
1882
|
+
maxLinksPerPage: config.maxLinksPerPage
|
|
1629
1883
|
}, options);
|
|
1630
1884
|
return {
|
|
1631
1885
|
...result,
|
|
@@ -1764,7 +2018,7 @@ var WebRAGPlugin = class {
|
|
|
1764
2018
|
return await response.text();
|
|
1765
2019
|
}
|
|
1766
2020
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1767
|
-
const $ =
|
|
2021
|
+
const $ = cheerio4.load(html);
|
|
1768
2022
|
const links = /* @__PURE__ */ new Set();
|
|
1769
2023
|
$("a[href]").each((_, el) => {
|
|
1770
2024
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -1781,6 +2035,22 @@ var WebRAGPlugin = class {
|
|
|
1781
2035
|
});
|
|
1782
2036
|
return Array.from(links);
|
|
1783
2037
|
}
|
|
2038
|
+
/**
|
|
2039
|
+
* When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
|
|
2040
|
+
* caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
|
|
2041
|
+
* undefined when disabled or on any parse error (link extraction must never fail a crawl).
|
|
2042
|
+
*/
|
|
2043
|
+
extractLinksIfEnabled(url, html, config) {
|
|
2044
|
+
if (!config.extractLinks) return void 0;
|
|
2045
|
+
try {
|
|
2046
|
+
const base = new URL(url);
|
|
2047
|
+
const links = this.extractInternalLinks(html, base, config.stripQueryParams ?? false);
|
|
2048
|
+
const cap = config.maxLinksPerPage ?? 200;
|
|
2049
|
+
return links.length > cap ? links.slice(0, cap) : links;
|
|
2050
|
+
} catch {
|
|
2051
|
+
return void 0;
|
|
2052
|
+
}
|
|
2053
|
+
}
|
|
1784
2054
|
/**
|
|
1785
2055
|
* Ingest content from a list of URLs
|
|
1786
2056
|
*
|
|
@@ -1811,7 +2081,9 @@ var WebRAGPlugin = class {
|
|
|
1811
2081
|
render: config.render,
|
|
1812
2082
|
renderOptions: config.renderOptions,
|
|
1813
2083
|
debug: config.debug,
|
|
1814
|
-
crawlLedger: config.crawlLedger
|
|
2084
|
+
crawlLedger: config.crawlLedger,
|
|
2085
|
+
extractLinks: config.extractLinks,
|
|
2086
|
+
maxLinksPerPage: config.maxLinksPerPage
|
|
1815
2087
|
}, options);
|
|
1816
2088
|
}
|
|
1817
2089
|
/**
|
|
@@ -1930,7 +2202,7 @@ var WebRAGPlugin = class {
|
|
|
1930
2202
|
}
|
|
1931
2203
|
}
|
|
1932
2204
|
try {
|
|
1933
|
-
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
2205
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2, links } = await this.crawlPageSmart(url, config, timeout, {
|
|
1934
2206
|
renderMode,
|
|
1935
2207
|
renderOptions,
|
|
1936
2208
|
minContentLength,
|
|
@@ -1964,7 +2236,8 @@ var WebRAGPlugin = class {
|
|
|
1964
2236
|
bodyTextLengthHint: bodyTextLengthHint2,
|
|
1965
2237
|
title: doc?.metadata?.title,
|
|
1966
2238
|
docId: doc?.id,
|
|
1967
|
-
error: diag?.errorMessage
|
|
2239
|
+
error: diag?.errorMessage,
|
|
2240
|
+
...links ? { links } : {}
|
|
1968
2241
|
});
|
|
1969
2242
|
this.emitCrawlPage(config, {
|
|
1970
2243
|
url,
|
|
@@ -2102,41 +2375,39 @@ var WebRAGPlugin = class {
|
|
|
2102
2375
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
2103
2376
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
2104
2377
|
}
|
|
2105
|
-
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
2378
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed, links) {
|
|
2106
2379
|
if (blockedSuspected) {
|
|
2107
2380
|
return {
|
|
2108
2381
|
doc: null,
|
|
2109
|
-
diag: { modeUsed: modeFailed, reason: "blocked_suspected" }
|
|
2382
|
+
diag: { modeUsed: modeFailed, reason: "blocked_suspected" },
|
|
2383
|
+
links
|
|
2110
2384
|
};
|
|
2111
2385
|
}
|
|
2112
2386
|
if (renderFailure) {
|
|
2113
2387
|
return {
|
|
2114
2388
|
doc: null,
|
|
2115
|
-
diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure }
|
|
2389
|
+
diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure },
|
|
2390
|
+
links
|
|
2116
2391
|
};
|
|
2117
2392
|
}
|
|
2118
2393
|
return {
|
|
2119
2394
|
doc,
|
|
2120
2395
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
2121
|
-
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
2396
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2,
|
|
2397
|
+
links
|
|
2122
2398
|
};
|
|
2123
2399
|
}
|
|
2124
2400
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
2125
2401
|
if (ctx.renderMode === true) {
|
|
2126
|
-
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2127
|
-
url,
|
|
2128
|
-
config,
|
|
2129
|
-
timeout,
|
|
2130
|
-
ctx.renderOptions,
|
|
2131
|
-
ctx.dbg
|
|
2132
|
-
);
|
|
2402
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected, links } = await this.crawlPageRendered(url, config, timeout, ctx.renderOptions, ctx.dbg);
|
|
2133
2403
|
return this.diagFromRenderedAttempt(
|
|
2134
2404
|
doc,
|
|
2135
2405
|
bodyTextLengthHint2,
|
|
2136
2406
|
renderFailure,
|
|
2137
2407
|
blockedSuspected,
|
|
2138
2408
|
"render_ok",
|
|
2139
|
-
"render_failed"
|
|
2409
|
+
"render_failed",
|
|
2410
|
+
links
|
|
2140
2411
|
);
|
|
2141
2412
|
}
|
|
2142
2413
|
try {
|
|
@@ -2162,8 +2433,9 @@ var WebRAGPlugin = class {
|
|
|
2162
2433
|
const html = await response.text();
|
|
2163
2434
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2164
2435
|
const staticHint = !doc ? this.bodyTextLengthHint(html, config) : void 0;
|
|
2436
|
+
const staticLinks = this.extractLinksIfEnabled(url, html, config);
|
|
2165
2437
|
if (doc && doc.content.length >= ctx.minContentLength) {
|
|
2166
|
-
return { doc, diag: { modeUsed: "static_ok" } };
|
|
2438
|
+
return { doc, diag: { modeUsed: "static_ok" }, links: staticLinks };
|
|
2167
2439
|
}
|
|
2168
2440
|
if (ctx.renderMode === "auto") {
|
|
2169
2441
|
const shouldRender = this.looksLikeDynamicShell(html) || !doc || doc.content.length < ctx.minContentLength;
|
|
@@ -2177,7 +2449,8 @@ var WebRAGPlugin = class {
|
|
|
2177
2449
|
doc: rendered,
|
|
2178
2450
|
bodyTextLengthHint: rHint,
|
|
2179
2451
|
renderFailure,
|
|
2180
|
-
blockedSuspected
|
|
2452
|
+
blockedSuspected,
|
|
2453
|
+
links: renderedLinks
|
|
2181
2454
|
} = await this.crawlPageRendered(
|
|
2182
2455
|
url,
|
|
2183
2456
|
config,
|
|
@@ -2192,7 +2465,9 @@ var WebRAGPlugin = class {
|
|
|
2192
2465
|
renderFailure,
|
|
2193
2466
|
blockedSuspected,
|
|
2194
2467
|
"render_fallback_ok",
|
|
2195
|
-
"render_fallback_failed"
|
|
2468
|
+
"render_fallback_failed",
|
|
2469
|
+
// Prefer links from the rendered DOM; fall back to the static HTML's links.
|
|
2470
|
+
renderedLinks ?? staticLinks
|
|
2196
2471
|
);
|
|
2197
2472
|
if (!rendered && (renderFailure || blockedSuspected)) {
|
|
2198
2473
|
fb.bodyTextLengthHint = staticHint ?? rHint;
|
|
@@ -2203,7 +2478,8 @@ var WebRAGPlugin = class {
|
|
|
2203
2478
|
return {
|
|
2204
2479
|
doc: null,
|
|
2205
2480
|
diag: { modeUsed: "static_failed", reason: "too_small" },
|
|
2206
|
-
bodyTextLengthHint: staticHint
|
|
2481
|
+
bodyTextLengthHint: staticHint,
|
|
2482
|
+
links: staticLinks
|
|
2207
2483
|
};
|
|
2208
2484
|
} catch (e) {
|
|
2209
2485
|
throw e;
|
|
@@ -2251,6 +2527,7 @@ var WebRAGPlugin = class {
|
|
|
2251
2527
|
const html = await page.content();
|
|
2252
2528
|
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
2253
2529
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2530
|
+
const links = this.extractLinksIfEnabled(url, html, config);
|
|
2254
2531
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
2255
2532
|
try {
|
|
2256
2533
|
const saveDir = config.debug.saveDir;
|
|
@@ -2264,7 +2541,7 @@ var WebRAGPlugin = class {
|
|
|
2264
2541
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
2265
2542
|
}
|
|
2266
2543
|
}
|
|
2267
|
-
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
2544
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2, links };
|
|
2268
2545
|
} catch (e) {
|
|
2269
2546
|
const msg = String(e?.message || e || "render_failed");
|
|
2270
2547
|
const lower = msg.toLowerCase();
|
|
@@ -2631,8 +2908,12 @@ var WebRAGPlugin = class {
|
|
|
2631
2908
|
bodyTextLengthHint,
|
|
2632
2909
|
extractPageFromHtml,
|
|
2633
2910
|
extractProductMetadata,
|
|
2911
|
+
hardExcludePage,
|
|
2912
|
+
inferTypeFromUrl,
|
|
2634
2913
|
normalizeAvailability,
|
|
2635
2914
|
normalizeCurrency,
|
|
2915
|
+
normalizeDisplayTitle,
|
|
2636
2916
|
parsePrice,
|
|
2917
|
+
resolvePageCardMetadata,
|
|
2637
2918
|
urlToDocumentId
|
|
2638
2919
|
});
|
package/dist/index.mjs
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
// src/WebRAGPlugin.ts
|
|
2
2
|
import { MongoClient } from "mongodb";
|
|
3
3
|
import OpenAI from "openai";
|
|
4
|
-
import * as
|
|
4
|
+
import * as cheerio4 from "cheerio";
|
|
5
5
|
import * as fs from "fs";
|
|
6
6
|
import * as path from "path";
|
|
7
7
|
|
|
8
8
|
// src/htmlPageExtract.ts
|
|
9
|
-
import * as
|
|
9
|
+
import * as cheerio3 from "cheerio";
|
|
10
10
|
|
|
11
11
|
// src/productMetadata.ts
|
|
12
12
|
import * as cheerio from "cheerio";
|
|
@@ -173,6 +173,228 @@ function normalizeAvailability(value) {
|
|
|
173
173
|
return s.replace(/\s+/g, "") || void 0;
|
|
174
174
|
}
|
|
175
175
|
|
|
176
|
+
// src/pageCardMetadata.ts
|
|
177
|
+
import * as cheerio2 from "cheerio";
|
|
178
|
+
var HARD_EXCLUDE_URL_RE = /(?:^|\/)(?:login|signin|sign-in|signup|sign-up|register|account|cart|checkout|admin|wp-admin|privacy|terms|legal|cookies|gdpr|thank|gracias|confirm|success|receipt|404|tag|tags|category|categories|author|archive|newsletter|careers|jobs)(?:\/|$|-|\.)/i;
|
|
179
|
+
var HARD_EXCLUDE_TITLE_RE = /\b(?:login|sign\s*in|sign\s*up|privacy\s*policy|terms\s*(?:of\s*)?service|thank\s*you|gracias\s*por|admin|404|not\s*found)\b/i;
|
|
180
|
+
var BLOG_URL_RE = /(?:^|\/)(?:blog|news|press|article|posts?)(?:\/|$)/i;
|
|
181
|
+
var PROMOTION_URL_RE = /(?:^|\/)(?:offer|offers|sale|sales|promo|promotion|deal|deals|coupon|special-offer|buster)(?:\/|$|-|\.)/i;
|
|
182
|
+
var PROMOTION_SLUG_RE = /(?:^|\/)[^/]*(?:-sale|-offer|-promo|-deal|-buster)(?:\/|$)/i;
|
|
183
|
+
var ENTITY_DETAIL_PATH_RES = [
|
|
184
|
+
/\/projects\/[^/]+/i,
|
|
185
|
+
/\/project\/[^/]+/i,
|
|
186
|
+
/\/perspectives\/[^/]+/i,
|
|
187
|
+
/\/perspective\/[^/]+/i,
|
|
188
|
+
/\/portfolio\/[^/]+/i,
|
|
189
|
+
/\/case-stud(?:y|ies)\/[^/]+/i,
|
|
190
|
+
/\/insights?\/[^/]+/i,
|
|
191
|
+
/\/people\/[^/]+/i,
|
|
192
|
+
/\/person\/[^/]+/i,
|
|
193
|
+
/\/team-members?\/[^/]+/i,
|
|
194
|
+
/\/members?\/[^/]+/i,
|
|
195
|
+
/\/staff\/[^/]+/i,
|
|
196
|
+
/\/experts?\/[^/]+/i,
|
|
197
|
+
/\/authors?\/[^/]+/i,
|
|
198
|
+
/\/leadership\/[^/]+/i,
|
|
199
|
+
/\/biograph(?:y|ies)\/[^/]+/i
|
|
200
|
+
];
|
|
201
|
+
var DETAIL_URL_RE = /(?:^|\/)(?:product|products|item|items|p|room|rooms|suite|suites|habitacion|plan|plans|space|spaces|tour|tours|menu|project|perspective|person|team-member|team-members|staff|expert|case-study|author|biography)(?:\/|$)/i;
|
|
202
|
+
var LISTING_URL_RE = /(?:^|\/)(?:catalog|catalogue|collection|collections|category|categories|shop|store|habitaciones|rooms|products|projects|perspectives|portfolio|people|team|members|insights|case-studies|thought-leadership)(?:\/|$)/i;
|
|
203
|
+
var AMENITY_URL_RE = /(?:^|\/)(?:amenity|amenities|activity|activities|experience|experiences|service-page)(?:\/|$)/i;
|
|
204
|
+
var CONTACT_URL_RE = /(?:^|\/)(?:contact|contacto|about|nosotros|faq|help|support)(?:\/|$)/i;
|
|
205
|
+
var EN_DASH_SUFFIX_RE = /\s+[–—]\s+.+$/;
|
|
206
|
+
var PIPE_SUFFIX_RE = /\s+\|\s+.+$/;
|
|
207
|
+
var CARD_PRIORITY = {
|
|
208
|
+
detail: 10,
|
|
209
|
+
listing: 6,
|
|
210
|
+
amenity: 5,
|
|
211
|
+
promotion: 2,
|
|
212
|
+
contact: 1,
|
|
213
|
+
content: 1,
|
|
214
|
+
blog: 0,
|
|
215
|
+
system: 0,
|
|
216
|
+
page: 3
|
|
217
|
+
};
|
|
218
|
+
var CARD_ELIGIBLE_DEFAULT = {
|
|
219
|
+
detail: true,
|
|
220
|
+
listing: true,
|
|
221
|
+
amenity: true,
|
|
222
|
+
promotion: false,
|
|
223
|
+
contact: false,
|
|
224
|
+
content: false,
|
|
225
|
+
blog: false,
|
|
226
|
+
system: false,
|
|
227
|
+
page: false
|
|
228
|
+
};
|
|
229
|
+
var SCHEMA_TYPE_MAP = {
|
|
230
|
+
product: "detail",
|
|
231
|
+
service: "amenity",
|
|
232
|
+
hotelroom: "detail",
|
|
233
|
+
room: "detail",
|
|
234
|
+
apartment: "detail",
|
|
235
|
+
lodgingroom: "detail",
|
|
236
|
+
course: "detail",
|
|
237
|
+
event: "detail",
|
|
238
|
+
offer: "promotion",
|
|
239
|
+
person: "detail",
|
|
240
|
+
employee: "detail",
|
|
241
|
+
profilepage: "detail",
|
|
242
|
+
article: "detail",
|
|
243
|
+
newsarticle: "detail",
|
|
244
|
+
blogposting: "detail",
|
|
245
|
+
creativework: "detail"
|
|
246
|
+
};
|
|
247
|
+
function normalizeDisplayTitle(title) {
|
|
248
|
+
if (!title?.trim()) return title;
|
|
249
|
+
let t = title.trim();
|
|
250
|
+
for (let i = 0; i < 2; i++) {
|
|
251
|
+
const dash = t.match(EN_DASH_SUFFIX_RE);
|
|
252
|
+
if (dash && dash.index !== void 0 && dash.index >= 4) {
|
|
253
|
+
t = t.slice(0, dash.index).trim();
|
|
254
|
+
continue;
|
|
255
|
+
}
|
|
256
|
+
const pipe = t.match(PIPE_SUFFIX_RE);
|
|
257
|
+
if (pipe && pipe.index !== void 0 && pipe.index >= 8) {
|
|
258
|
+
t = t.slice(0, pipe.index).trim();
|
|
259
|
+
continue;
|
|
260
|
+
}
|
|
261
|
+
break;
|
|
262
|
+
}
|
|
263
|
+
return t || title.trim();
|
|
264
|
+
}
|
|
265
|
+
function hardExcludePage(url, title) {
|
|
266
|
+
const path2 = url.toLowerCase();
|
|
267
|
+
if (HARD_EXCLUDE_URL_RE.test(path2)) return true;
|
|
268
|
+
if (BLOG_URL_RE.test(path2)) return true;
|
|
269
|
+
if (title && HARD_EXCLUDE_TITLE_RE.test(title.toLowerCase())) return true;
|
|
270
|
+
try {
|
|
271
|
+
const u = new URL(url);
|
|
272
|
+
if (u.pathname === "/" || u.pathname === "") return true;
|
|
273
|
+
} catch {
|
|
274
|
+
}
|
|
275
|
+
return false;
|
|
276
|
+
}
|
|
277
|
+
function inferTypeFromUrl(url) {
|
|
278
|
+
const path2 = url.toLowerCase();
|
|
279
|
+
if (PROMOTION_URL_RE.test(path2) || PROMOTION_SLUG_RE.test(path2)) return "promotion";
|
|
280
|
+
if (CONTACT_URL_RE.test(path2)) return "contact";
|
|
281
|
+
if (ENTITY_DETAIL_PATH_RES.some((re) => re.test(path2))) return "detail";
|
|
282
|
+
if (AMENITY_URL_RE.test(path2)) return "amenity";
|
|
283
|
+
if (DETAIL_URL_RE.test(path2)) return "detail";
|
|
284
|
+
if (LISTING_URL_RE.test(path2)) return "listing";
|
|
285
|
+
if (BLOG_URL_RE.test(path2)) return "blog";
|
|
286
|
+
return void 0;
|
|
287
|
+
}
|
|
288
|
+
function collectJsonLdNodes2(data) {
|
|
289
|
+
const nodes = [];
|
|
290
|
+
const visit = (value) => {
|
|
291
|
+
if (value == null) return;
|
|
292
|
+
if (Array.isArray(value)) {
|
|
293
|
+
value.forEach(visit);
|
|
294
|
+
return;
|
|
295
|
+
}
|
|
296
|
+
if (typeof value !== "object") return;
|
|
297
|
+
const obj = value;
|
|
298
|
+
nodes.push(obj);
|
|
299
|
+
if (obj["@graph"]) visit(obj["@graph"]);
|
|
300
|
+
};
|
|
301
|
+
visit(data);
|
|
302
|
+
return nodes;
|
|
303
|
+
}
|
|
304
|
+
function schemaTypeName(node) {
|
|
305
|
+
const type = node["@type"];
|
|
306
|
+
const types = Array.isArray(type) ? type : type != null ? [type] : [];
|
|
307
|
+
const raw = types[0];
|
|
308
|
+
if (raw == null) return "";
|
|
309
|
+
const s = String(raw).toLowerCase();
|
|
310
|
+
const slash = s.lastIndexOf("/");
|
|
311
|
+
return slash >= 0 ? s.slice(slash + 1) : s;
|
|
312
|
+
}
|
|
313
|
+
function inferTypeFromSchema(html) {
|
|
314
|
+
const $ = cheerio2.load(html);
|
|
315
|
+
for (const el of $('script[type="application/ld+json"]').toArray()) {
|
|
316
|
+
const raw = $(el).html()?.trim();
|
|
317
|
+
if (!raw) continue;
|
|
318
|
+
try {
|
|
319
|
+
const parsed = JSON.parse(raw);
|
|
320
|
+
for (const node of collectJsonLdNodes2(parsed)) {
|
|
321
|
+
const name = schemaTypeName(node);
|
|
322
|
+
if (SCHEMA_TYPE_MAP[name]) return SCHEMA_TYPE_MAP[name];
|
|
323
|
+
if (name === "product" || node.offers != null) return "detail";
|
|
324
|
+
}
|
|
325
|
+
} catch {
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
const ogType = $('meta[property="og:type"]').attr("content")?.toLowerCase();
|
|
329
|
+
if (ogType === "product") return "detail";
|
|
330
|
+
return void 0;
|
|
331
|
+
}
|
|
332
|
+
function normalizePageType(raw) {
|
|
333
|
+
if (!raw) return "page";
|
|
334
|
+
const lower = raw.toLowerCase();
|
|
335
|
+
const known = [
|
|
336
|
+
"detail",
|
|
337
|
+
"listing",
|
|
338
|
+
"amenity",
|
|
339
|
+
"promotion",
|
|
340
|
+
"contact",
|
|
341
|
+
"content",
|
|
342
|
+
"blog",
|
|
343
|
+
"system",
|
|
344
|
+
"page"
|
|
345
|
+
];
|
|
346
|
+
if (known.includes(lower)) return lower;
|
|
347
|
+
if (lower === "room" || lower === "product") return "detail";
|
|
348
|
+
if (lower === "offer" || lower === "sale") return "promotion";
|
|
349
|
+
return raw;
|
|
350
|
+
}
|
|
351
|
+
function resolveDisplayTitle(input) {
|
|
352
|
+
const heading = input.headingTitle?.trim();
|
|
353
|
+
if (heading) return normalizeDisplayTitle(heading);
|
|
354
|
+
return normalizeDisplayTitle(input.title);
|
|
355
|
+
}
|
|
356
|
+
function resolvePageCardMetadata(input) {
|
|
357
|
+
const title = input.title?.trim();
|
|
358
|
+
const url = input.url;
|
|
359
|
+
const displayTitle = resolveDisplayTitle(input);
|
|
360
|
+
if (hardExcludePage(url, title)) {
|
|
361
|
+
return {
|
|
362
|
+
type: "system",
|
|
363
|
+
cardEligible: false,
|
|
364
|
+
cardPriority: 0,
|
|
365
|
+
displayTitle,
|
|
366
|
+
displayDescription: input.description,
|
|
367
|
+
displayImageUrl: input.imageUrl
|
|
368
|
+
};
|
|
369
|
+
}
|
|
370
|
+
let type = normalizePageType(input.type);
|
|
371
|
+
if (type === "page" && input.html) {
|
|
372
|
+
const fromSchema = inferTypeFromSchema(input.html);
|
|
373
|
+
if (fromSchema) type = fromSchema;
|
|
374
|
+
}
|
|
375
|
+
if (type === "page") {
|
|
376
|
+
const fromUrl = inferTypeFromUrl(url);
|
|
377
|
+
if (fromUrl) type = fromUrl;
|
|
378
|
+
}
|
|
379
|
+
if (input.hasProductPrice && type === "page") {
|
|
380
|
+
type = "detail";
|
|
381
|
+
}
|
|
382
|
+
const typeKey = String(type).toLowerCase();
|
|
383
|
+
let cardEligible = CARD_ELIGIBLE_DEFAULT[typeKey] ?? false;
|
|
384
|
+
let cardPriority = CARD_PRIORITY[typeKey] ?? 3;
|
|
385
|
+
if (cardEligible && PROMOTION_URL_RE.test(url.toLowerCase())) {
|
|
386
|
+
cardEligible = false;
|
|
387
|
+
}
|
|
388
|
+
return {
|
|
389
|
+
type,
|
|
390
|
+
cardEligible,
|
|
391
|
+
cardPriority,
|
|
392
|
+
displayTitle,
|
|
393
|
+
displayDescription: input.description?.trim() || void 0,
|
|
394
|
+
displayImageUrl: input.imageUrl
|
|
395
|
+
};
|
|
396
|
+
}
|
|
397
|
+
|
|
176
398
|
// src/htmlPageExtract.ts
|
|
177
399
|
var DEFAULT_CONTENT_SELECTOR = 'article, main, [role="main"], #content, #primary, #main, .content, .post-content, .entry-content, .elementor-location-content, .elementor-widget-theme-post-content, .wp-block-group, .site-content, .ast-single-post, .ast-page';
|
|
178
400
|
var DEFAULT_REMOVE_SELECTORS = [
|
|
@@ -195,17 +417,23 @@ function cleanContent(text) {
|
|
|
195
417
|
return text.replace(/\s+/g, " ").replace(/\n\s*\n/g, "\n\n").replace(/\t/g, " ").trim();
|
|
196
418
|
}
|
|
197
419
|
function bodyTextLengthHint(html, options = {}) {
|
|
198
|
-
const $ =
|
|
420
|
+
const $ = cheerio3.load(html);
|
|
199
421
|
stripNoiseFromDom($, options);
|
|
200
422
|
return cleanContent($("body").text().trim()).length;
|
|
201
423
|
}
|
|
202
424
|
function extractPageFromHtml(url, html, options = {}) {
|
|
203
|
-
const $ =
|
|
425
|
+
const $ = cheerio3.load(html);
|
|
204
426
|
stripNoiseFromDom($, options);
|
|
205
|
-
const
|
|
206
|
-
|
|
427
|
+
const h1Title = $("h1").first().text().trim();
|
|
428
|
+
const docTitle = $("title").text().trim();
|
|
429
|
+
let title = "";
|
|
430
|
+
if (options.titleSelector) {
|
|
431
|
+
title = $(options.titleSelector).first().text().trim();
|
|
432
|
+
} else {
|
|
433
|
+
title = docTitle || h1Title;
|
|
434
|
+
}
|
|
207
435
|
if (!title) {
|
|
208
|
-
title =
|
|
436
|
+
title = h1Title || docTitle;
|
|
209
437
|
}
|
|
210
438
|
const content = extractBestContentText($, options);
|
|
211
439
|
const minChars = options.minExtractedContentLength ?? 50;
|
|
@@ -230,12 +458,27 @@ function extractPageFromHtml(url, html, options = {}) {
|
|
|
230
458
|
}
|
|
231
459
|
}
|
|
232
460
|
const productMeta = extractProductMetadata(html);
|
|
233
|
-
const
|
|
461
|
+
const cardMeta = resolvePageCardMetadata({
|
|
462
|
+
url,
|
|
463
|
+
title,
|
|
464
|
+
headingTitle: h1Title || void 0,
|
|
465
|
+
description,
|
|
466
|
+
imageUrl,
|
|
467
|
+
html,
|
|
234
468
|
type,
|
|
469
|
+
hasProductPrice: productMeta.price != null
|
|
470
|
+
});
|
|
471
|
+
const metadata = {
|
|
472
|
+
type: cardMeta.type,
|
|
473
|
+
cardEligible: cardMeta.cardEligible,
|
|
474
|
+
cardPriority: cardMeta.cardPriority,
|
|
235
475
|
...title ? { title } : {},
|
|
476
|
+
...cardMeta.displayTitle ? { displayTitle: cardMeta.displayTitle } : {},
|
|
236
477
|
url,
|
|
237
478
|
...imageUrl ? { imageUrl } : {},
|
|
479
|
+
...cardMeta.displayImageUrl ? { displayImageUrl: cardMeta.displayImageUrl } : {},
|
|
238
480
|
...description ? { description } : {},
|
|
481
|
+
...cardMeta.displayDescription ? { displayDescription: cardMeta.displayDescription } : {},
|
|
239
482
|
...productMeta.price != null ? { price: productMeta.price } : {},
|
|
240
483
|
...productMeta.currency ? { currency: productMeta.currency } : {},
|
|
241
484
|
...productMeta.availability ? { availability: productMeta.availability } : {},
|
|
@@ -541,13 +784,18 @@ var WebRAGPlugin = class {
|
|
|
541
784
|
plugin: this.name,
|
|
542
785
|
contentCount: scoredResults.length,
|
|
543
786
|
types: [...new Set(scoredResults.map((d) => d.metadata.type))],
|
|
544
|
-
topResults: scoredResults.slice(0,
|
|
787
|
+
topResults: scoredResults.slice(0, 16).map((doc) => ({
|
|
545
788
|
id: doc.id,
|
|
546
789
|
type: doc.metadata.type,
|
|
547
790
|
title: doc.metadata.title,
|
|
548
791
|
url: doc.metadata.url,
|
|
549
792
|
imageUrl: doc.metadata.imageUrl,
|
|
550
793
|
description: doc.metadata.description,
|
|
794
|
+
cardEligible: doc.metadata.cardEligible,
|
|
795
|
+
cardPriority: doc.metadata.cardPriority,
|
|
796
|
+
displayTitle: doc.metadata.displayTitle,
|
|
797
|
+
displayDescription: doc.metadata.displayDescription,
|
|
798
|
+
displayImageUrl: doc.metadata.displayImageUrl,
|
|
551
799
|
...doc.metadata.price != null ? { price: doc.metadata.price } : {},
|
|
552
800
|
...doc.metadata.currency ? { currency: doc.metadata.currency } : {},
|
|
553
801
|
...doc.metadata.availability ? { availability: doc.metadata.availability } : {},
|
|
@@ -1582,7 +1830,9 @@ var WebRAGPlugin = class {
|
|
|
1582
1830
|
render: config.render,
|
|
1583
1831
|
renderOptions: config.renderOptions,
|
|
1584
1832
|
debug: config.debug,
|
|
1585
|
-
crawlLedger: config.crawlLedger
|
|
1833
|
+
crawlLedger: config.crawlLedger,
|
|
1834
|
+
extractLinks: config.extractLinks,
|
|
1835
|
+
maxLinksPerPage: config.maxLinksPerPage
|
|
1586
1836
|
}, options);
|
|
1587
1837
|
return {
|
|
1588
1838
|
...result,
|
|
@@ -1721,7 +1971,7 @@ var WebRAGPlugin = class {
|
|
|
1721
1971
|
return await response.text();
|
|
1722
1972
|
}
|
|
1723
1973
|
extractInternalLinks(html, base, stripQueryParams) {
|
|
1724
|
-
const $ =
|
|
1974
|
+
const $ = cheerio4.load(html);
|
|
1725
1975
|
const links = /* @__PURE__ */ new Set();
|
|
1726
1976
|
$("a[href]").each((_, el) => {
|
|
1727
1977
|
const href = ($(el).attr("href") || "").trim();
|
|
@@ -1738,6 +1988,22 @@ var WebRAGPlugin = class {
|
|
|
1738
1988
|
});
|
|
1739
1989
|
return Array.from(links);
|
|
1740
1990
|
}
|
|
1991
|
+
/**
|
|
1992
|
+
* When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
|
|
1993
|
+
* caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
|
|
1994
|
+
* undefined when disabled or on any parse error (link extraction must never fail a crawl).
|
|
1995
|
+
*/
|
|
1996
|
+
extractLinksIfEnabled(url, html, config) {
|
|
1997
|
+
if (!config.extractLinks) return void 0;
|
|
1998
|
+
try {
|
|
1999
|
+
const base = new URL(url);
|
|
2000
|
+
const links = this.extractInternalLinks(html, base, config.stripQueryParams ?? false);
|
|
2001
|
+
const cap = config.maxLinksPerPage ?? 200;
|
|
2002
|
+
return links.length > cap ? links.slice(0, cap) : links;
|
|
2003
|
+
} catch {
|
|
2004
|
+
return void 0;
|
|
2005
|
+
}
|
|
2006
|
+
}
|
|
1741
2007
|
/**
|
|
1742
2008
|
* Ingest content from a list of URLs
|
|
1743
2009
|
*
|
|
@@ -1768,7 +2034,9 @@ var WebRAGPlugin = class {
|
|
|
1768
2034
|
render: config.render,
|
|
1769
2035
|
renderOptions: config.renderOptions,
|
|
1770
2036
|
debug: config.debug,
|
|
1771
|
-
crawlLedger: config.crawlLedger
|
|
2037
|
+
crawlLedger: config.crawlLedger,
|
|
2038
|
+
extractLinks: config.extractLinks,
|
|
2039
|
+
maxLinksPerPage: config.maxLinksPerPage
|
|
1772
2040
|
}, options);
|
|
1773
2041
|
}
|
|
1774
2042
|
/**
|
|
@@ -1887,7 +2155,7 @@ var WebRAGPlugin = class {
|
|
|
1887
2155
|
}
|
|
1888
2156
|
}
|
|
1889
2157
|
try {
|
|
1890
|
-
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
2158
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2, links } = await this.crawlPageSmart(url, config, timeout, {
|
|
1891
2159
|
renderMode,
|
|
1892
2160
|
renderOptions,
|
|
1893
2161
|
minContentLength,
|
|
@@ -1921,7 +2189,8 @@ var WebRAGPlugin = class {
|
|
|
1921
2189
|
bodyTextLengthHint: bodyTextLengthHint2,
|
|
1922
2190
|
title: doc?.metadata?.title,
|
|
1923
2191
|
docId: doc?.id,
|
|
1924
|
-
error: diag?.errorMessage
|
|
2192
|
+
error: diag?.errorMessage,
|
|
2193
|
+
...links ? { links } : {}
|
|
1925
2194
|
});
|
|
1926
2195
|
this.emitCrawlPage(config, {
|
|
1927
2196
|
url,
|
|
@@ -2059,41 +2328,39 @@ var WebRAGPlugin = class {
|
|
|
2059
2328
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
2060
2329
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
2061
2330
|
}
|
|
2062
|
-
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
2331
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed, links) {
|
|
2063
2332
|
if (blockedSuspected) {
|
|
2064
2333
|
return {
|
|
2065
2334
|
doc: null,
|
|
2066
|
-
diag: { modeUsed: modeFailed, reason: "blocked_suspected" }
|
|
2335
|
+
diag: { modeUsed: modeFailed, reason: "blocked_suspected" },
|
|
2336
|
+
links
|
|
2067
2337
|
};
|
|
2068
2338
|
}
|
|
2069
2339
|
if (renderFailure) {
|
|
2070
2340
|
return {
|
|
2071
2341
|
doc: null,
|
|
2072
|
-
diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure }
|
|
2342
|
+
diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure },
|
|
2343
|
+
links
|
|
2073
2344
|
};
|
|
2074
2345
|
}
|
|
2075
2346
|
return {
|
|
2076
2347
|
doc,
|
|
2077
2348
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
2078
|
-
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
2349
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2,
|
|
2350
|
+
links
|
|
2079
2351
|
};
|
|
2080
2352
|
}
|
|
2081
2353
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
2082
2354
|
if (ctx.renderMode === true) {
|
|
2083
|
-
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2084
|
-
url,
|
|
2085
|
-
config,
|
|
2086
|
-
timeout,
|
|
2087
|
-
ctx.renderOptions,
|
|
2088
|
-
ctx.dbg
|
|
2089
|
-
);
|
|
2355
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected, links } = await this.crawlPageRendered(url, config, timeout, ctx.renderOptions, ctx.dbg);
|
|
2090
2356
|
return this.diagFromRenderedAttempt(
|
|
2091
2357
|
doc,
|
|
2092
2358
|
bodyTextLengthHint2,
|
|
2093
2359
|
renderFailure,
|
|
2094
2360
|
blockedSuspected,
|
|
2095
2361
|
"render_ok",
|
|
2096
|
-
"render_failed"
|
|
2362
|
+
"render_failed",
|
|
2363
|
+
links
|
|
2097
2364
|
);
|
|
2098
2365
|
}
|
|
2099
2366
|
try {
|
|
@@ -2119,8 +2386,9 @@ var WebRAGPlugin = class {
|
|
|
2119
2386
|
const html = await response.text();
|
|
2120
2387
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2121
2388
|
const staticHint = !doc ? this.bodyTextLengthHint(html, config) : void 0;
|
|
2389
|
+
const staticLinks = this.extractLinksIfEnabled(url, html, config);
|
|
2122
2390
|
if (doc && doc.content.length >= ctx.minContentLength) {
|
|
2123
|
-
return { doc, diag: { modeUsed: "static_ok" } };
|
|
2391
|
+
return { doc, diag: { modeUsed: "static_ok" }, links: staticLinks };
|
|
2124
2392
|
}
|
|
2125
2393
|
if (ctx.renderMode === "auto") {
|
|
2126
2394
|
const shouldRender = this.looksLikeDynamicShell(html) || !doc || doc.content.length < ctx.minContentLength;
|
|
@@ -2134,7 +2402,8 @@ var WebRAGPlugin = class {
|
|
|
2134
2402
|
doc: rendered,
|
|
2135
2403
|
bodyTextLengthHint: rHint,
|
|
2136
2404
|
renderFailure,
|
|
2137
|
-
blockedSuspected
|
|
2405
|
+
blockedSuspected,
|
|
2406
|
+
links: renderedLinks
|
|
2138
2407
|
} = await this.crawlPageRendered(
|
|
2139
2408
|
url,
|
|
2140
2409
|
config,
|
|
@@ -2149,7 +2418,9 @@ var WebRAGPlugin = class {
|
|
|
2149
2418
|
renderFailure,
|
|
2150
2419
|
blockedSuspected,
|
|
2151
2420
|
"render_fallback_ok",
|
|
2152
|
-
"render_fallback_failed"
|
|
2421
|
+
"render_fallback_failed",
|
|
2422
|
+
// Prefer links from the rendered DOM; fall back to the static HTML's links.
|
|
2423
|
+
renderedLinks ?? staticLinks
|
|
2153
2424
|
);
|
|
2154
2425
|
if (!rendered && (renderFailure || blockedSuspected)) {
|
|
2155
2426
|
fb.bodyTextLengthHint = staticHint ?? rHint;
|
|
@@ -2160,7 +2431,8 @@ var WebRAGPlugin = class {
|
|
|
2160
2431
|
return {
|
|
2161
2432
|
doc: null,
|
|
2162
2433
|
diag: { modeUsed: "static_failed", reason: "too_small" },
|
|
2163
|
-
bodyTextLengthHint: staticHint
|
|
2434
|
+
bodyTextLengthHint: staticHint,
|
|
2435
|
+
links: staticLinks
|
|
2164
2436
|
};
|
|
2165
2437
|
} catch (e) {
|
|
2166
2438
|
throw e;
|
|
@@ -2208,6 +2480,7 @@ var WebRAGPlugin = class {
|
|
|
2208
2480
|
const html = await page.content();
|
|
2209
2481
|
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
2210
2482
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2483
|
+
const links = this.extractLinksIfEnabled(url, html, config);
|
|
2211
2484
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
2212
2485
|
try {
|
|
2213
2486
|
const saveDir = config.debug.saveDir;
|
|
@@ -2221,7 +2494,7 @@ var WebRAGPlugin = class {
|
|
|
2221
2494
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
2222
2495
|
}
|
|
2223
2496
|
}
|
|
2224
|
-
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
2497
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2, links };
|
|
2225
2498
|
} catch (e) {
|
|
2226
2499
|
const msg = String(e?.message || e || "render_failed");
|
|
2227
2500
|
const lower = msg.toLowerCase();
|
|
@@ -2587,8 +2860,12 @@ export {
|
|
|
2587
2860
|
bodyTextLengthHint,
|
|
2588
2861
|
extractPageFromHtml,
|
|
2589
2862
|
extractProductMetadata,
|
|
2863
|
+
hardExcludePage,
|
|
2864
|
+
inferTypeFromUrl,
|
|
2590
2865
|
normalizeAvailability,
|
|
2591
2866
|
normalizeCurrency,
|
|
2867
|
+
normalizeDisplayTitle,
|
|
2592
2868
|
parsePrice,
|
|
2869
|
+
resolvePageCardMetadata,
|
|
2593
2870
|
urlToDocumentId
|
|
2594
2871
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@snap-agent/rag-web",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.8",
|
|
4
4
|
"description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|
|
@@ -68,4 +68,3 @@
|
|
|
68
68
|
"url": "https://github.com/vilo-hq/snap-agent/issues"
|
|
69
69
|
}
|
|
70
70
|
}
|
|
71
|
-
|