@snap-agent/rag-web 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +24 -0
- package/dist/index.d.ts +24 -0
- package/dist/index.js +46 -21
- package/dist/index.mjs +46 -21
- package/package.json +1 -2
package/dist/index.d.mts
CHANGED
|
@@ -114,6 +114,12 @@ interface CrawlPageStatusEntry {
|
|
|
114
114
|
httpStatus?: number;
|
|
115
115
|
error?: string;
|
|
116
116
|
skippedReason?: string;
|
|
117
|
+
/**
|
|
118
|
+
* Same-origin internal links found on this page, populated only when `extractLinks` is set on
|
|
119
|
+
* the crawl config. Enables resumable recursive (BFS) crawling: the caller feeds these back into
|
|
120
|
+
* its own frontier instead of the SDK doing a separate link-discovery fetch.
|
|
121
|
+
*/
|
|
122
|
+
links?: string[];
|
|
117
123
|
}
|
|
118
124
|
interface CrawlLedgerDocument {
|
|
119
125
|
tenantId: string;
|
|
@@ -258,6 +264,10 @@ interface SitemapConfig {
|
|
|
258
264
|
*/
|
|
259
265
|
debug?: DebugOptions;
|
|
260
266
|
crawlLedger?: CrawlLedgerOptions;
|
|
267
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
268
|
+
extractLinks?: boolean;
|
|
269
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
270
|
+
maxLinksPerPage?: number;
|
|
261
271
|
}
|
|
262
272
|
/**
|
|
263
273
|
* Direct URL list crawling configuration
|
|
@@ -277,6 +287,10 @@ interface UrlListConfig {
|
|
|
277
287
|
debug?: DebugOptions;
|
|
278
288
|
stripQueryParams?: boolean;
|
|
279
289
|
crawlLedger?: CrawlLedgerOptions;
|
|
290
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
291
|
+
extractLinks?: boolean;
|
|
292
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
293
|
+
maxLinksPerPage?: number;
|
|
280
294
|
}
|
|
281
295
|
/**
|
|
282
296
|
* Single page ingestion (no discovery)
|
|
@@ -321,6 +335,10 @@ interface WebsiteCrawlConfig {
|
|
|
321
335
|
renderOptions?: RenderOptions;
|
|
322
336
|
debug?: DebugOptions;
|
|
323
337
|
crawlLedger?: CrawlLedgerOptions;
|
|
338
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
339
|
+
extractLinks?: boolean;
|
|
340
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
341
|
+
maxLinksPerPage?: number;
|
|
324
342
|
}
|
|
325
343
|
interface RenderOptions {
|
|
326
344
|
/**
|
|
@@ -661,6 +679,12 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
661
679
|
private normalizeWebsiteUrl;
|
|
662
680
|
private fetchHtml;
|
|
663
681
|
private extractInternalLinks;
|
|
682
|
+
/**
|
|
683
|
+
* When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
|
|
684
|
+
* caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
|
|
685
|
+
* undefined when disabled or on any parse error (link extraction must never fail a crawl).
|
|
686
|
+
*/
|
|
687
|
+
private extractLinksIfEnabled;
|
|
664
688
|
/**
|
|
665
689
|
* Ingest content from a list of URLs
|
|
666
690
|
*
|
package/dist/index.d.ts
CHANGED
|
@@ -114,6 +114,12 @@ interface CrawlPageStatusEntry {
|
|
|
114
114
|
httpStatus?: number;
|
|
115
115
|
error?: string;
|
|
116
116
|
skippedReason?: string;
|
|
117
|
+
/**
|
|
118
|
+
* Same-origin internal links found on this page, populated only when `extractLinks` is set on
|
|
119
|
+
* the crawl config. Enables resumable recursive (BFS) crawling: the caller feeds these back into
|
|
120
|
+
* its own frontier instead of the SDK doing a separate link-discovery fetch.
|
|
121
|
+
*/
|
|
122
|
+
links?: string[];
|
|
117
123
|
}
|
|
118
124
|
interface CrawlLedgerDocument {
|
|
119
125
|
tenantId: string;
|
|
@@ -258,6 +264,10 @@ interface SitemapConfig {
|
|
|
258
264
|
*/
|
|
259
265
|
debug?: DebugOptions;
|
|
260
266
|
crawlLedger?: CrawlLedgerOptions;
|
|
267
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
268
|
+
extractLinks?: boolean;
|
|
269
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
270
|
+
maxLinksPerPage?: number;
|
|
261
271
|
}
|
|
262
272
|
/**
|
|
263
273
|
* Direct URL list crawling configuration
|
|
@@ -277,6 +287,10 @@ interface UrlListConfig {
|
|
|
277
287
|
debug?: DebugOptions;
|
|
278
288
|
stripQueryParams?: boolean;
|
|
279
289
|
crawlLedger?: CrawlLedgerOptions;
|
|
290
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
291
|
+
extractLinks?: boolean;
|
|
292
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
293
|
+
maxLinksPerPage?: number;
|
|
280
294
|
}
|
|
281
295
|
/**
|
|
282
296
|
* Single page ingestion (no discovery)
|
|
@@ -321,6 +335,10 @@ interface WebsiteCrawlConfig {
|
|
|
321
335
|
renderOptions?: RenderOptions;
|
|
322
336
|
debug?: DebugOptions;
|
|
323
337
|
crawlLedger?: CrawlLedgerOptions;
|
|
338
|
+
/** Extract same-origin internal links per page into `pageStatuses[].links` (for caller-driven BFS). */
|
|
339
|
+
extractLinks?: boolean;
|
|
340
|
+
/** Max links kept per page when `extractLinks` is set (default: 200). */
|
|
341
|
+
maxLinksPerPage?: number;
|
|
324
342
|
}
|
|
325
343
|
interface RenderOptions {
|
|
326
344
|
/**
|
|
@@ -661,6 +679,12 @@ declare class WebRAGPlugin implements RAGPlugin {
|
|
|
661
679
|
private normalizeWebsiteUrl;
|
|
662
680
|
private fetchHtml;
|
|
663
681
|
private extractInternalLinks;
|
|
682
|
+
/**
|
|
683
|
+
* When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
|
|
684
|
+
* caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
|
|
685
|
+
* undefined when disabled or on any parse error (link extraction must never fail a crawl).
|
|
686
|
+
*/
|
|
687
|
+
private extractLinksIfEnabled;
|
|
664
688
|
/**
|
|
665
689
|
* Ingest content from a list of URLs
|
|
666
690
|
*
|
package/dist/index.js
CHANGED
|
@@ -1877,7 +1877,9 @@ var WebRAGPlugin = class {
|
|
|
1877
1877
|
render: config.render,
|
|
1878
1878
|
renderOptions: config.renderOptions,
|
|
1879
1879
|
debug: config.debug,
|
|
1880
|
-
crawlLedger: config.crawlLedger
|
|
1880
|
+
crawlLedger: config.crawlLedger,
|
|
1881
|
+
extractLinks: config.extractLinks,
|
|
1882
|
+
maxLinksPerPage: config.maxLinksPerPage
|
|
1881
1883
|
}, options);
|
|
1882
1884
|
return {
|
|
1883
1885
|
...result,
|
|
@@ -2033,6 +2035,22 @@ var WebRAGPlugin = class {
|
|
|
2033
2035
|
});
|
|
2034
2036
|
return Array.from(links);
|
|
2035
2037
|
}
|
|
2038
|
+
/**
|
|
2039
|
+
* When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
|
|
2040
|
+
* caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
|
|
2041
|
+
* undefined when disabled or on any parse error (link extraction must never fail a crawl).
|
|
2042
|
+
*/
|
|
2043
|
+
extractLinksIfEnabled(url, html, config) {
|
|
2044
|
+
if (!config.extractLinks) return void 0;
|
|
2045
|
+
try {
|
|
2046
|
+
const base = new URL(url);
|
|
2047
|
+
const links = this.extractInternalLinks(html, base, config.stripQueryParams ?? false);
|
|
2048
|
+
const cap = config.maxLinksPerPage ?? 200;
|
|
2049
|
+
return links.length > cap ? links.slice(0, cap) : links;
|
|
2050
|
+
} catch {
|
|
2051
|
+
return void 0;
|
|
2052
|
+
}
|
|
2053
|
+
}
|
|
2036
2054
|
/**
|
|
2037
2055
|
* Ingest content from a list of URLs
|
|
2038
2056
|
*
|
|
@@ -2063,7 +2081,9 @@ var WebRAGPlugin = class {
|
|
|
2063
2081
|
render: config.render,
|
|
2064
2082
|
renderOptions: config.renderOptions,
|
|
2065
2083
|
debug: config.debug,
|
|
2066
|
-
crawlLedger: config.crawlLedger
|
|
2084
|
+
crawlLedger: config.crawlLedger,
|
|
2085
|
+
extractLinks: config.extractLinks,
|
|
2086
|
+
maxLinksPerPage: config.maxLinksPerPage
|
|
2067
2087
|
}, options);
|
|
2068
2088
|
}
|
|
2069
2089
|
/**
|
|
@@ -2182,7 +2202,7 @@ var WebRAGPlugin = class {
|
|
|
2182
2202
|
}
|
|
2183
2203
|
}
|
|
2184
2204
|
try {
|
|
2185
|
-
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
2205
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2, links } = await this.crawlPageSmart(url, config, timeout, {
|
|
2186
2206
|
renderMode,
|
|
2187
2207
|
renderOptions,
|
|
2188
2208
|
minContentLength,
|
|
@@ -2216,7 +2236,8 @@ var WebRAGPlugin = class {
|
|
|
2216
2236
|
bodyTextLengthHint: bodyTextLengthHint2,
|
|
2217
2237
|
title: doc?.metadata?.title,
|
|
2218
2238
|
docId: doc?.id,
|
|
2219
|
-
error: diag?.errorMessage
|
|
2239
|
+
error: diag?.errorMessage,
|
|
2240
|
+
...links ? { links } : {}
|
|
2220
2241
|
});
|
|
2221
2242
|
this.emitCrawlPage(config, {
|
|
2222
2243
|
url,
|
|
@@ -2354,41 +2375,39 @@ var WebRAGPlugin = class {
|
|
|
2354
2375
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
2355
2376
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
2356
2377
|
}
|
|
2357
|
-
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
2378
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed, links) {
|
|
2358
2379
|
if (blockedSuspected) {
|
|
2359
2380
|
return {
|
|
2360
2381
|
doc: null,
|
|
2361
|
-
diag: { modeUsed: modeFailed, reason: "blocked_suspected" }
|
|
2382
|
+
diag: { modeUsed: modeFailed, reason: "blocked_suspected" },
|
|
2383
|
+
links
|
|
2362
2384
|
};
|
|
2363
2385
|
}
|
|
2364
2386
|
if (renderFailure) {
|
|
2365
2387
|
return {
|
|
2366
2388
|
doc: null,
|
|
2367
|
-
diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure }
|
|
2389
|
+
diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure },
|
|
2390
|
+
links
|
|
2368
2391
|
};
|
|
2369
2392
|
}
|
|
2370
2393
|
return {
|
|
2371
2394
|
doc,
|
|
2372
2395
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
2373
|
-
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
2396
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2,
|
|
2397
|
+
links
|
|
2374
2398
|
};
|
|
2375
2399
|
}
|
|
2376
2400
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
2377
2401
|
if (ctx.renderMode === true) {
|
|
2378
|
-
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2379
|
-
url,
|
|
2380
|
-
config,
|
|
2381
|
-
timeout,
|
|
2382
|
-
ctx.renderOptions,
|
|
2383
|
-
ctx.dbg
|
|
2384
|
-
);
|
|
2402
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected, links } = await this.crawlPageRendered(url, config, timeout, ctx.renderOptions, ctx.dbg);
|
|
2385
2403
|
return this.diagFromRenderedAttempt(
|
|
2386
2404
|
doc,
|
|
2387
2405
|
bodyTextLengthHint2,
|
|
2388
2406
|
renderFailure,
|
|
2389
2407
|
blockedSuspected,
|
|
2390
2408
|
"render_ok",
|
|
2391
|
-
"render_failed"
|
|
2409
|
+
"render_failed",
|
|
2410
|
+
links
|
|
2392
2411
|
);
|
|
2393
2412
|
}
|
|
2394
2413
|
try {
|
|
@@ -2414,8 +2433,9 @@ var WebRAGPlugin = class {
|
|
|
2414
2433
|
const html = await response.text();
|
|
2415
2434
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2416
2435
|
const staticHint = !doc ? this.bodyTextLengthHint(html, config) : void 0;
|
|
2436
|
+
const staticLinks = this.extractLinksIfEnabled(url, html, config);
|
|
2417
2437
|
if (doc && doc.content.length >= ctx.minContentLength) {
|
|
2418
|
-
return { doc, diag: { modeUsed: "static_ok" } };
|
|
2438
|
+
return { doc, diag: { modeUsed: "static_ok" }, links: staticLinks };
|
|
2419
2439
|
}
|
|
2420
2440
|
if (ctx.renderMode === "auto") {
|
|
2421
2441
|
const shouldRender = this.looksLikeDynamicShell(html) || !doc || doc.content.length < ctx.minContentLength;
|
|
@@ -2429,7 +2449,8 @@ var WebRAGPlugin = class {
|
|
|
2429
2449
|
doc: rendered,
|
|
2430
2450
|
bodyTextLengthHint: rHint,
|
|
2431
2451
|
renderFailure,
|
|
2432
|
-
blockedSuspected
|
|
2452
|
+
blockedSuspected,
|
|
2453
|
+
links: renderedLinks
|
|
2433
2454
|
} = await this.crawlPageRendered(
|
|
2434
2455
|
url,
|
|
2435
2456
|
config,
|
|
@@ -2444,7 +2465,9 @@ var WebRAGPlugin = class {
|
|
|
2444
2465
|
renderFailure,
|
|
2445
2466
|
blockedSuspected,
|
|
2446
2467
|
"render_fallback_ok",
|
|
2447
|
-
"render_fallback_failed"
|
|
2468
|
+
"render_fallback_failed",
|
|
2469
|
+
// Prefer links from the rendered DOM; fall back to the static HTML's links.
|
|
2470
|
+
renderedLinks ?? staticLinks
|
|
2448
2471
|
);
|
|
2449
2472
|
if (!rendered && (renderFailure || blockedSuspected)) {
|
|
2450
2473
|
fb.bodyTextLengthHint = staticHint ?? rHint;
|
|
@@ -2455,7 +2478,8 @@ var WebRAGPlugin = class {
|
|
|
2455
2478
|
return {
|
|
2456
2479
|
doc: null,
|
|
2457
2480
|
diag: { modeUsed: "static_failed", reason: "too_small" },
|
|
2458
|
-
bodyTextLengthHint: staticHint
|
|
2481
|
+
bodyTextLengthHint: staticHint,
|
|
2482
|
+
links: staticLinks
|
|
2459
2483
|
};
|
|
2460
2484
|
} catch (e) {
|
|
2461
2485
|
throw e;
|
|
@@ -2503,6 +2527,7 @@ var WebRAGPlugin = class {
|
|
|
2503
2527
|
const html = await page.content();
|
|
2504
2528
|
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
2505
2529
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2530
|
+
const links = this.extractLinksIfEnabled(url, html, config);
|
|
2506
2531
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
2507
2532
|
try {
|
|
2508
2533
|
const saveDir = config.debug.saveDir;
|
|
@@ -2516,7 +2541,7 @@ var WebRAGPlugin = class {
|
|
|
2516
2541
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
2517
2542
|
}
|
|
2518
2543
|
}
|
|
2519
|
-
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
2544
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2, links };
|
|
2520
2545
|
} catch (e) {
|
|
2521
2546
|
const msg = String(e?.message || e || "render_failed");
|
|
2522
2547
|
const lower = msg.toLowerCase();
|
package/dist/index.mjs
CHANGED
|
@@ -1830,7 +1830,9 @@ var WebRAGPlugin = class {
|
|
|
1830
1830
|
render: config.render,
|
|
1831
1831
|
renderOptions: config.renderOptions,
|
|
1832
1832
|
debug: config.debug,
|
|
1833
|
-
crawlLedger: config.crawlLedger
|
|
1833
|
+
crawlLedger: config.crawlLedger,
|
|
1834
|
+
extractLinks: config.extractLinks,
|
|
1835
|
+
maxLinksPerPage: config.maxLinksPerPage
|
|
1834
1836
|
}, options);
|
|
1835
1837
|
return {
|
|
1836
1838
|
...result,
|
|
@@ -1986,6 +1988,22 @@ var WebRAGPlugin = class {
|
|
|
1986
1988
|
});
|
|
1987
1989
|
return Array.from(links);
|
|
1988
1990
|
}
|
|
1991
|
+
/**
|
|
1992
|
+
* When `config.extractLinks` is set, parse same-origin internal links from a page's HTML so the
|
|
1993
|
+
* caller can drive a resumable recursive (BFS) crawl without a separate discovery fetch. Returns
|
|
1994
|
+
* undefined when disabled or on any parse error (link extraction must never fail a crawl).
|
|
1995
|
+
*/
|
|
1996
|
+
extractLinksIfEnabled(url, html, config) {
|
|
1997
|
+
if (!config.extractLinks) return void 0;
|
|
1998
|
+
try {
|
|
1999
|
+
const base = new URL(url);
|
|
2000
|
+
const links = this.extractInternalLinks(html, base, config.stripQueryParams ?? false);
|
|
2001
|
+
const cap = config.maxLinksPerPage ?? 200;
|
|
2002
|
+
return links.length > cap ? links.slice(0, cap) : links;
|
|
2003
|
+
} catch {
|
|
2004
|
+
return void 0;
|
|
2005
|
+
}
|
|
2006
|
+
}
|
|
1989
2007
|
/**
|
|
1990
2008
|
* Ingest content from a list of URLs
|
|
1991
2009
|
*
|
|
@@ -2016,7 +2034,9 @@ var WebRAGPlugin = class {
|
|
|
2016
2034
|
render: config.render,
|
|
2017
2035
|
renderOptions: config.renderOptions,
|
|
2018
2036
|
debug: config.debug,
|
|
2019
|
-
crawlLedger: config.crawlLedger
|
|
2037
|
+
crawlLedger: config.crawlLedger,
|
|
2038
|
+
extractLinks: config.extractLinks,
|
|
2039
|
+
maxLinksPerPage: config.maxLinksPerPage
|
|
2020
2040
|
}, options);
|
|
2021
2041
|
}
|
|
2022
2042
|
/**
|
|
@@ -2135,7 +2155,7 @@ var WebRAGPlugin = class {
|
|
|
2135
2155
|
}
|
|
2136
2156
|
}
|
|
2137
2157
|
try {
|
|
2138
|
-
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2 } = await this.crawlPageSmart(url, config, timeout, {
|
|
2158
|
+
const { doc, diag, bodyTextLengthHint: bodyTextLengthHint2, links } = await this.crawlPageSmart(url, config, timeout, {
|
|
2139
2159
|
renderMode,
|
|
2140
2160
|
renderOptions,
|
|
2141
2161
|
minContentLength,
|
|
@@ -2169,7 +2189,8 @@ var WebRAGPlugin = class {
|
|
|
2169
2189
|
bodyTextLengthHint: bodyTextLengthHint2,
|
|
2170
2190
|
title: doc?.metadata?.title,
|
|
2171
2191
|
docId: doc?.id,
|
|
2172
|
-
error: diag?.errorMessage
|
|
2192
|
+
error: diag?.errorMessage,
|
|
2193
|
+
...links ? { links } : {}
|
|
2173
2194
|
});
|
|
2174
2195
|
this.emitCrawlPage(config, {
|
|
2175
2196
|
url,
|
|
@@ -2307,41 +2328,39 @@ var WebRAGPlugin = class {
|
|
|
2307
2328
|
const isSmallShellLike = htmlLength < 5e4 && textLength < 500 && contentDensity < 0.02;
|
|
2308
2329
|
return asksForJavascript || hasEmptyAppMountNode || hasHydrationData || isMostlyScripts || isSmallShellLike || hasLoadingHints && textLength < 1e3 && contentDensity < 0.05;
|
|
2309
2330
|
}
|
|
2310
|
-
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed) {
|
|
2331
|
+
diagFromRenderedAttempt(doc, bodyTextLengthHint2, renderFailure, blockedSuspected, modeOk, modeFailed, links) {
|
|
2311
2332
|
if (blockedSuspected) {
|
|
2312
2333
|
return {
|
|
2313
2334
|
doc: null,
|
|
2314
|
-
diag: { modeUsed: modeFailed, reason: "blocked_suspected" }
|
|
2335
|
+
diag: { modeUsed: modeFailed, reason: "blocked_suspected" },
|
|
2336
|
+
links
|
|
2315
2337
|
};
|
|
2316
2338
|
}
|
|
2317
2339
|
if (renderFailure) {
|
|
2318
2340
|
return {
|
|
2319
2341
|
doc: null,
|
|
2320
|
-
diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure }
|
|
2342
|
+
diag: { modeUsed: modeFailed, reason: "render_error", errorMessage: renderFailure },
|
|
2343
|
+
links
|
|
2321
2344
|
};
|
|
2322
2345
|
}
|
|
2323
2346
|
return {
|
|
2324
2347
|
doc,
|
|
2325
2348
|
diag: doc ? { modeUsed: modeOk } : { modeUsed: modeFailed, reason: "too_small" },
|
|
2326
|
-
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2
|
|
2349
|
+
bodyTextLengthHint: doc ? void 0 : bodyTextLengthHint2,
|
|
2350
|
+
links
|
|
2327
2351
|
};
|
|
2328
2352
|
}
|
|
2329
2353
|
async crawlPageSmart(url, config, timeout, ctx) {
|
|
2330
2354
|
if (ctx.renderMode === true) {
|
|
2331
|
-
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected } = await this.crawlPageRendered(
|
|
2332
|
-
url,
|
|
2333
|
-
config,
|
|
2334
|
-
timeout,
|
|
2335
|
-
ctx.renderOptions,
|
|
2336
|
-
ctx.dbg
|
|
2337
|
-
);
|
|
2355
|
+
const { doc, bodyTextLengthHint: bodyTextLengthHint2, renderFailure, blockedSuspected, links } = await this.crawlPageRendered(url, config, timeout, ctx.renderOptions, ctx.dbg);
|
|
2338
2356
|
return this.diagFromRenderedAttempt(
|
|
2339
2357
|
doc,
|
|
2340
2358
|
bodyTextLengthHint2,
|
|
2341
2359
|
renderFailure,
|
|
2342
2360
|
blockedSuspected,
|
|
2343
2361
|
"render_ok",
|
|
2344
|
-
"render_failed"
|
|
2362
|
+
"render_failed",
|
|
2363
|
+
links
|
|
2345
2364
|
);
|
|
2346
2365
|
}
|
|
2347
2366
|
try {
|
|
@@ -2367,8 +2386,9 @@ var WebRAGPlugin = class {
|
|
|
2367
2386
|
const html = await response.text();
|
|
2368
2387
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2369
2388
|
const staticHint = !doc ? this.bodyTextLengthHint(html, config) : void 0;
|
|
2389
|
+
const staticLinks = this.extractLinksIfEnabled(url, html, config);
|
|
2370
2390
|
if (doc && doc.content.length >= ctx.minContentLength) {
|
|
2371
|
-
return { doc, diag: { modeUsed: "static_ok" } };
|
|
2391
|
+
return { doc, diag: { modeUsed: "static_ok" }, links: staticLinks };
|
|
2372
2392
|
}
|
|
2373
2393
|
if (ctx.renderMode === "auto") {
|
|
2374
2394
|
const shouldRender = this.looksLikeDynamicShell(html) || !doc || doc.content.length < ctx.minContentLength;
|
|
@@ -2382,7 +2402,8 @@ var WebRAGPlugin = class {
|
|
|
2382
2402
|
doc: rendered,
|
|
2383
2403
|
bodyTextLengthHint: rHint,
|
|
2384
2404
|
renderFailure,
|
|
2385
|
-
blockedSuspected
|
|
2405
|
+
blockedSuspected,
|
|
2406
|
+
links: renderedLinks
|
|
2386
2407
|
} = await this.crawlPageRendered(
|
|
2387
2408
|
url,
|
|
2388
2409
|
config,
|
|
@@ -2397,7 +2418,9 @@ var WebRAGPlugin = class {
|
|
|
2397
2418
|
renderFailure,
|
|
2398
2419
|
blockedSuspected,
|
|
2399
2420
|
"render_fallback_ok",
|
|
2400
|
-
"render_fallback_failed"
|
|
2421
|
+
"render_fallback_failed",
|
|
2422
|
+
// Prefer links from the rendered DOM; fall back to the static HTML's links.
|
|
2423
|
+
renderedLinks ?? staticLinks
|
|
2401
2424
|
);
|
|
2402
2425
|
if (!rendered && (renderFailure || blockedSuspected)) {
|
|
2403
2426
|
fb.bodyTextLengthHint = staticHint ?? rHint;
|
|
@@ -2408,7 +2431,8 @@ var WebRAGPlugin = class {
|
|
|
2408
2431
|
return {
|
|
2409
2432
|
doc: null,
|
|
2410
2433
|
diag: { modeUsed: "static_failed", reason: "too_small" },
|
|
2411
|
-
bodyTextLengthHint: staticHint
|
|
2434
|
+
bodyTextLengthHint: staticHint,
|
|
2435
|
+
links: staticLinks
|
|
2412
2436
|
};
|
|
2413
2437
|
} catch (e) {
|
|
2414
2438
|
throw e;
|
|
@@ -2456,6 +2480,7 @@ var WebRAGPlugin = class {
|
|
|
2456
2480
|
const html = await page.content();
|
|
2457
2481
|
const bodyTextLengthHint2 = this.bodyTextLengthHint(html, config);
|
|
2458
2482
|
const doc = this.extractDocumentFromHtml(url, html, config);
|
|
2483
|
+
const links = this.extractLinksIfEnabled(url, html, config);
|
|
2459
2484
|
if (config.debug?.saveDir && config.debug?.enabled) {
|
|
2460
2485
|
try {
|
|
2461
2486
|
const saveDir = config.debug.saveDir;
|
|
@@ -2469,7 +2494,7 @@ var WebRAGPlugin = class {
|
|
|
2469
2494
|
dbg.log("debug.saveFailed", { url, error: e instanceof Error ? e.message : "save_failed" });
|
|
2470
2495
|
}
|
|
2471
2496
|
}
|
|
2472
|
-
return { doc, bodyTextLengthHint: bodyTextLengthHint2 };
|
|
2497
|
+
return { doc, bodyTextLengthHint: bodyTextLengthHint2, links };
|
|
2473
2498
|
} catch (e) {
|
|
2474
2499
|
const msg = String(e?.message || e || "render_failed");
|
|
2475
2500
|
const lower = msg.toLowerCase();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@snap-agent/rag-web",
|
|
3
|
-
"version": "0.1.7",
|
|
3
|
+
"version": "0.1.8",
|
|
4
4
|
"description": "Web RAG plugin for SnapAgent SDK - Schema-agnostic content search via web crawling, CMS APIs, sitemaps, and RSS feeds",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|
|
@@ -68,4 +68,3 @@
|
|
|
68
68
|
"url": "https://github.com/vilo-hq/snap-agent/issues"
|
|
69
69
|
}
|
|
70
70
|
}
|
|
71
|
-
|