@botpress/runtime 1.6.5 → 1.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/definition.js +167 -30
- package/dist/definition.js.map +4 -4
- package/dist/internal.js +167 -30
- package/dist/internal.js.map +4 -4
- package/dist/library.js +167 -30
- package/dist/library.js.map +4 -4
- package/dist/primitives/data-sources/html-fetch.d.ts +57 -0
- package/dist/primitives/data-sources/html-fetch.d.ts.map +1 -0
- package/dist/primitives/data-sources/source-website.d.ts +69 -4
- package/dist/primitives/data-sources/source-website.d.ts.map +1 -1
- package/dist/runtime.js +167 -30
- package/dist/runtime.js.map +4 -4
- package/package.json +1 -1
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML fetching and metadata extraction utilities
|
|
3
|
+
*/
|
|
4
|
+
export type HtmlMetadata = {
|
|
5
|
+
title?: string;
|
|
6
|
+
description?: string;
|
|
7
|
+
favicon?: string;
|
|
8
|
+
};
|
|
9
|
+
export type FetchHtmlResult = {
|
|
10
|
+
url: string;
|
|
11
|
+
contentType: string;
|
|
12
|
+
content: string;
|
|
13
|
+
metadata?: HtmlMetadata;
|
|
14
|
+
};
|
|
15
|
+
/**
|
|
16
|
+
* Extract metadata from HTML content using regex patterns
|
|
17
|
+
*
|
|
18
|
+
* @param html - The HTML content to parse
|
|
19
|
+
* @returns Extracted metadata including title, description, and favicon
|
|
20
|
+
*/
|
|
21
|
+
export declare function extractHtmlMetadata(html: string): HtmlMetadata;
|
|
22
|
+
/**
|
|
23
|
+
* Resolve a potentially relative URL to an absolute URL
|
|
24
|
+
*
|
|
25
|
+
* @param url - The URL to resolve (may be relative)
|
|
26
|
+
* @param baseUrl - The base URL to resolve against
|
|
27
|
+
* @returns The absolute URL, or the original URL if resolution fails
|
|
28
|
+
*/
|
|
29
|
+
export declare function resolveUrl(url: string, baseUrl: string): string;
|
|
30
|
+
/**
|
|
31
|
+
* Fetch content from a URL and extract metadata if HTML
|
|
32
|
+
*
|
|
33
|
+
* This function safely handles both HTML and non-HTML content (XML, JSON, text, etc.).
|
|
34
|
+
* Metadata extraction only occurs for HTML content types. For other content types
|
|
35
|
+
* (like sitemap.xml, robots.txt, RSS feeds), it returns the raw content without
|
|
36
|
+
* attempting metadata extraction.
|
|
37
|
+
*
|
|
38
|
+
* @param url - The URL to fetch
|
|
39
|
+
* @param options - Optional fetch options
|
|
40
|
+
* @returns Fetch result with content and extracted metadata (HTML only)
|
|
41
|
+
*
|
|
42
|
+
* @example
|
|
43
|
+
* // Fetching HTML - extracts metadata
|
|
44
|
+
* const html = await fetchHtml('https://example.com')
|
|
45
|
+
* console.log(html.metadata?.title) // "Example Domain"
|
|
46
|
+
*
|
|
47
|
+
* @example
|
|
48
|
+
* // Fetching XML - no metadata extraction
|
|
49
|
+
* const xml = await fetchHtml('https://example.com/sitemap.xml')
|
|
50
|
+
* console.log(xml.content) // Raw XML content
|
|
51
|
+
* console.log(xml.metadata) // undefined
|
|
52
|
+
*/
|
|
53
|
+
export declare function fetchHtml(url: string, options?: {
|
|
54
|
+
userAgent?: string;
|
|
55
|
+
timeout?: number;
|
|
56
|
+
}): Promise<FetchHtmlResult>;
|
|
57
|
+
//# sourceMappingURL=html-fetch.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-fetch.d.ts","sourceRoot":"","sources":["../../../src/primitives/data-sources/html-fetch.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,MAAM,YAAY,GAAG;IACzB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,OAAO,CAAC,EAAE,MAAM,CAAA;CACjB,CAAA;AAED,MAAM,MAAM,eAAe,GAAG;IAC5B,GAAG,EAAE,MAAM,CAAA;IACX,WAAW,EAAE,MAAM,CAAA;IACnB,OAAO,EAAE,MAAM,CAAA;IACf,QAAQ,CAAC,EAAE,YAAY,CAAA;CACxB,CAAA;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAoD9D;AAED;;;;;;GAMG;AACH,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAa/D;AAED;;;;;;;;;;;;;;;;;;;;;;GAsBG;AACH,wBAAsB,SAAS,CAC7B,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE;IACR,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,OAAO,CAAC,EAAE,MAAM,CAAA;CACjB,GACA,OAAO,CAAC,eAAe,CAAC,CA+C1B"}
|
|
@@ -16,16 +16,68 @@ type FetchResult = {
|
|
|
16
16
|
[WellKnownMetadata.knowledge.FAVICON]?: string;
|
|
17
17
|
};
|
|
18
18
|
};
|
|
19
|
+
/**
|
|
20
|
+
* Fetch strategy for retrieving web content
|
|
21
|
+
*
|
|
22
|
+
* - 'node:fetch': Uses Node's built-in fetch (fast, no dependencies, works for static HTML)
|
|
23
|
+
* - 'integration:browser': Uses browser integration (slower, requires browser integration, handles JavaScript/SPAs)
|
|
24
|
+
*/
|
|
25
|
+
type FetchStrategy = 'node:fetch' | 'integration:browser';
|
|
26
|
+
/**
|
|
27
|
+
* Fetch option can be:
|
|
28
|
+
* - A strategy string: 'node:fetch' or 'integration:browser'
|
|
29
|
+
* - A custom function: for special authentication, headers, or processing
|
|
30
|
+
*/
|
|
31
|
+
type FetchOption = FetchStrategy | ((url: string) => Promise<FetchResult> | FetchResult);
|
|
19
32
|
type WebsiteSourceOptions = {
|
|
20
33
|
id?: string;
|
|
21
34
|
filter?: (context: SitemapFilterContext) => boolean;
|
|
22
|
-
|
|
35
|
+
/**
|
|
36
|
+
* Fetch method to use for retrieving web pages
|
|
37
|
+
*
|
|
38
|
+
* Options:
|
|
39
|
+
* - 'node:fetch': Fast, uses Node's built-in fetch (best for static HTML sites) **[DEFAULT]**
|
|
40
|
+
* - 'integration:browser': Slower, uses browser integration (best for JavaScript/SPAs)
|
|
41
|
+
* - Custom function: Provide your own fetch implementation (for auth, special headers, etc.)
|
|
42
|
+
* - undefined: Defaults to 'node:fetch'
|
|
43
|
+
*
|
|
44
|
+
* @default 'node:fetch'
|
|
45
|
+
*
|
|
46
|
+
* @example
|
|
47
|
+
* // Use Node's built-in fetch (default, can be omitted)
|
|
48
|
+
* { fetch: 'node:fetch' }
|
|
49
|
+
*
|
|
50
|
+
* @example
|
|
51
|
+
* // Use browser integration for JavaScript-heavy sites
|
|
52
|
+
* { fetch: 'integration:browser' }
|
|
53
|
+
*
|
|
54
|
+
* @example
|
|
55
|
+
* // Custom fetch with authentication
|
|
56
|
+
* {
|
|
57
|
+
* fetch: async (url) => {
|
|
58
|
+
* const response = await fetch(url, {
|
|
59
|
+
* headers: { Authorization: 'Bearer token' }
|
|
60
|
+
* })
|
|
61
|
+
* return {
|
|
62
|
+
* url,
|
|
63
|
+
* contentType: 'text/html',
|
|
64
|
+
* content: await response.text()
|
|
65
|
+
* }
|
|
66
|
+
* }
|
|
67
|
+
* }
|
|
68
|
+
*/
|
|
69
|
+
fetch?: FetchOption;
|
|
23
70
|
maxPages?: number;
|
|
24
71
|
maxDepth?: number;
|
|
25
72
|
};
|
|
26
73
|
type UrlsSourceOptions = {
|
|
27
74
|
id?: string;
|
|
28
|
-
|
|
75
|
+
/**
|
|
76
|
+
* Fetch method to use for retrieving web pages
|
|
77
|
+
*
|
|
78
|
+
* See WebsiteSourceOptions.fetch for detailed documentation
|
|
79
|
+
*/
|
|
80
|
+
fetch?: FetchOption;
|
|
29
81
|
};
|
|
30
82
|
export declare class WebsiteSource extends DataSource {
|
|
31
83
|
private mode;
|
|
@@ -34,19 +86,32 @@ export declare class WebsiteSource extends DataSource {
|
|
|
34
86
|
private urls;
|
|
35
87
|
private filterFn;
|
|
36
88
|
private customFetch;
|
|
89
|
+
private fetchStrategy;
|
|
37
90
|
private maxPages;
|
|
38
91
|
private maxDepth;
|
|
39
92
|
private transformFn;
|
|
40
93
|
private constructor();
|
|
41
94
|
private isBrowserIntegrationAvailable;
|
|
42
95
|
/**
|
|
43
|
-
*
|
|
96
|
+
* Convert HtmlMetadata to FetchResult metadata format
|
|
97
|
+
*/
|
|
98
|
+
private convertMetadata;
|
|
99
|
+
/**
|
|
100
|
+
* Default fetch implementation using Node's built-in fetch
|
|
101
|
+
*/
|
|
102
|
+
private defaultFetch;
|
|
103
|
+
/**
|
|
104
|
+
* Fetch content from a URL for sitemap parsing (raw content needed)
|
|
44
105
|
*/
|
|
45
106
|
private fetchSitemap;
|
|
46
107
|
/**
|
|
47
|
-
* Fetch content from a URL with
|
|
108
|
+
* Fetch content from a URL for indexing (with metadata extraction)
|
|
48
109
|
*/
|
|
49
110
|
private fetchUrl;
|
|
111
|
+
/**
|
|
112
|
+
* Fetch content using the browser integration
|
|
113
|
+
*/
|
|
114
|
+
private fetchWithBrowserIntegration;
|
|
50
115
|
/**
|
|
51
116
|
* Parse sitemap XML content
|
|
52
117
|
*/
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"source-website.d.ts","sourceRoot":"","sources":["../../../src/primitives/data-sources/source-website.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAA4B,MAAM,eAAe,CAAA;AAEpE,OAAO,EAAiB,iBAAiB,EAAE,MAAM,iBAAiB,CAAA;
|
|
1
|
+
{"version":3,"file":"source-website.d.ts","sourceRoot":"","sources":["../../../src/primitives/data-sources/source-website.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAA4B,MAAM,eAAe,CAAA;AAEpE,OAAO,EAAiB,iBAAiB,EAAE,MAAM,iBAAiB,CAAA;AAMlE,KAAK,oBAAoB,GAAG;IAC1B,GAAG,EAAE,MAAM,CAAA;IACX,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAA;CAClB,CAAA;AAED,KAAK,WAAW,GAAG;IACjB,GAAG,EAAE,MAAM,CAAA;IACX,WAAW,EAAE,MAAM,CAAA;IACnB,OAAO,EAAE,MAAM,CAAA;IACf,QAAQ,CAAC,EAAE;QACT,CAAC,iBAAiB,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAA;QAC5C,CAAC,iBAAiB,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,EAAE,MAAM,CAAA;QAClD,CAAC,iBAAiB,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,EAAE,MAAM,CAAA;KAC/C,CAAA;CACF,CAAA;AAED;;;;;GAKG;AACH,KAAK,aAAa,GAAG,YAAY,GAAG,qBAAqB,CAAA;AAEzD;;;;GAIG;AACH,KAAK,WAAW,GAAG,aAAa,GAAG,CAAC,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,WAAW,CAAC,GAAG,WAAW,CAAC,CAAA;AAExF,KAAK,oBAAoB,GAAG;IAC1B,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,oBAAoB,KAAK,OAAO,CAAA;IACnD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;OAiCG;IACH,KAAK,CAAC,EAAE,WAAW,CAAA;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAA;CAClB,CAAA;AAED,KAAK,iBAAiB,GAAG;IACvB,EAAE,CAAC,EAAE,MAAM,CAAA;IACX;;;;OAIG;IACH,KAAK,CAAC,EAAE,WAAW,CAAA;CACpB,CAAA;AAsCD,qBAAa,aAAc,SAAQ,UAAU;IAC3C,OAAO,CAAC,IAAI,CAAmB;IAC/B,OAAO,CAAC,OAAO,CAAoB;IACnC,OAAO,CAAC,UAAU,CAAoB;IACtC,OAAO,CAAC,IAAI,CAAsB;IAClC,OAAO,CAAC,QAAQ,CAA0D;IAC1E,OAAO,CAAC,WAAW,CAAmE;IACtF,OAAO,CAAC,aAAa,CAAe;IACpC,OAAO,CAAC,QAAQ,CAAQ;IACxB,OAAO,CAAC,QAAQ,CAAQ;IACxB,OAAO,CAAC,WAAW,CAAuF;IAE1G,OAAO;IAmCP,OAAO,CAAC,6BAA6B;IAIrC;;OAEG;IACH,OAAO,CAAC,eAAe;IAgBvB;;OAEG;YACW,YAAY;IAsB1B;;OAEG;YACW,YAAY;IAkB1B;;OAEG;YACW,QAAQ;IAkBtB;;OAEG;YACW,2BAA2B;IAgDzC;;OAEG;IACH,OAAO,CAAC,eAAe;IAqDvB;;OAEG;IACH,OAAO,CAAC,eAAe;IAcvB;;OAEG;YACW,uBAAuB;IAoDrC;;OAEG;YACW,YAAY;IAiH1B,IAAW,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;uDAiMtB;IAED,
MAAM,CAAC,WAAW,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,GAAE,oBAAyB,GAAG,aAAa;IAKtF,MAAM,CAAC,WAAW,CAAC,UAAU,EAAE,MAAM,EAAE,OAAO,GAAE,oBAAyB,GAAG,aAAa;IAKzF,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,OAAO,GAAE,iBAAsB,GAAG,aAAa;CAIhF"}
|
package/dist/runtime.js
CHANGED
|
@@ -48,7 +48,7 @@ var init_define_BUILD = __esm({
|
|
|
48
48
|
var define_PACKAGE_VERSIONS_default;
|
|
49
49
|
var init_define_PACKAGE_VERSIONS = __esm({
|
|
50
50
|
"<define:__PACKAGE_VERSIONS__>"() {
|
|
51
|
-
define_PACKAGE_VERSIONS_default = { runtime: "1.6.
|
|
51
|
+
define_PACKAGE_VERSIONS_default = { runtime: "1.6.7", adk: "not-installed", sdk: "4.17.3", llmz: "0.0.27", zai: "2.4.0", cognitive: "0.2.0" };
|
|
52
52
|
}
|
|
53
53
|
});
|
|
54
54
|
|
|
@@ -25291,7 +25291,7 @@ var require_follow_redirects = __commonJS({
|
|
|
25291
25291
|
var currentUrlParts = parseUrl(this._currentUrl);
|
|
25292
25292
|
var currentHost = currentHostHeader || currentUrlParts.host;
|
|
25293
25293
|
var currentUrl = /^\w+:/.test(location) ? this._currentUrl : url2.format(Object.assign(currentUrlParts, { host: currentHost }));
|
|
25294
|
-
var redirectUrl =
|
|
25294
|
+
var redirectUrl = resolveUrl2(location, currentUrl);
|
|
25295
25295
|
debug("redirecting to", redirectUrl.href);
|
|
25296
25296
|
this._isRedirect = true;
|
|
25297
25297
|
spreadUrlObject(redirectUrl, this._options);
|
|
@@ -25375,7 +25375,7 @@ var require_follow_redirects = __commonJS({
|
|
|
25375
25375
|
}
|
|
25376
25376
|
return parsed;
|
|
25377
25377
|
}
|
|
25378
|
-
function
|
|
25378
|
+
function resolveUrl2(relative, base) {
|
|
25379
25379
|
return useNativeURL ? new URL2(relative, base) : parseUrl(url2.resolve(base, relative));
|
|
25380
25380
|
}
|
|
25381
25381
|
function validateUrl(input) {
|
|
@@ -42113,6 +42113,95 @@ var XMLParser = class {
|
|
|
42113
42113
|
}
|
|
42114
42114
|
};
|
|
42115
42115
|
|
|
42116
|
+
// src/primitives/data-sources/html-fetch.ts
|
|
42117
|
+
init_define_BUILD();
|
|
42118
|
+
init_define_PACKAGE_VERSIONS();
|
|
42119
|
+
/**
 * Extract title, description and favicon from an HTML document using regex scans.
 *
 * Tags are located first and their attributes are then read individually, so the
 * result does not depend on attribute order (`content` before `name`), on extra
 * attributes sitting between the ones of interest, or on mixing single and double
 * quotes within one tag. The first matching tag in document order wins.
 *
 * @param html - The HTML content to scan
 * @returns An object with optional `title` and `description`, and a `favicon` that
 *   falls back to "/favicon.ico" when no icon link with an href is found
 */
function extractHtmlMetadata(html) {
const metadata = {};

// One whole start tag. Quoted attribute values are consumed as units so that a
// ">" inside a value (e.g. content="a > b") does not end the tag early.
const metaTagPattern = /<meta(?=[\s\/>])(?:[^>"']|"[^"]*"|'[^']*')*>/gi;
const linkTagPattern = /<link(?=[\s\/>])(?:[^>"']|"[^"]*"|'[^']*')*>/gi;

// Read name=value pairs from a single tag into a Map keyed by lower-cased
// attribute name. The first occurrence of a duplicated attribute is kept.
const readAttributes = (tag) => {
const attributes = new Map();
const attributePattern = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;
for (const match of tag.matchAll(attributePattern)) {
const name = match[1].toLowerCase();
if (!attributes.has(name)) {
attributes.set(name, match[2] ?? match[3] ?? match[4] ?? "");
}
}
return attributes;
};

const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
if (titleMatch && titleMatch[1]) {
metadata.title = titleMatch[1].trim();
}

// Standard description or its Open Graph equivalent, whichever appears first.
for (const [tag] of html.matchAll(metaTagPattern)) {
const attributes = readAttributes(tag);
const keys = [attributes.get("name"), attributes.get("property")].map(
(value) => (value ?? "").trim().toLowerCase()
);
if (!keys.includes("description") && !keys.includes("og:description")) {
continue;
}
const content = (attributes.get("content") ?? "").trim();
if (content) {
metadata.description = content;
break;
}
}

const iconRelations = new Set(["icon", "shortcut icon", "apple-touch-icon"]);
for (const [tag] of html.matchAll(linkTagPattern)) {
const attributes = readAttributes(tag);
// Collapse internal whitespace so rel="shortcut  icon" still matches.
const rel = (attributes.get("rel") ?? "").trim().toLowerCase().replace(/\s+/g, " ");
const href = (attributes.get("href") ?? "").trim();
if (iconRelations.has(rel) && href) {
metadata.favicon = href;
break;
}
}

// Conventional location used when the page declares no icon.
if (!metadata.favicon) {
metadata.favicon = "/favicon.ico";
}
return metadata;
}
|
|
42158
|
+
/**
 * Resolve a potentially relative URL to an absolute URL.
 *
 * The reference is resolved against the full base URL (including its path), so
 * path-relative references such as "favicon.ico" or "../img/icon.png" land next
 * to the page that declared them rather than at the site root. Root-relative
 * ("/favicon.ico") and protocol-relative ("//cdn.example.com/x") references
 * resolve exactly as before.
 *
 * @param url2 - The URL to resolve (may be relative)
 * @param baseUrl - The base URL to resolve against
 * @returns The absolute URL, or the original URL if resolution fails
 */
function resolveUrl(url2, baseUrl) {
// Already absolute http(s): return the caller's exact string without normalising it.
if (url2.startsWith("http://") || url2.startsWith("https://")) {
return url2;
}
try {
return new URL(url2, baseUrl).href;
} catch {
// An unparsable base or reference: best effort, give the input back unchanged.
return url2;
}
}
|
|
42169
|
+
/**
 * Fetch a URL and, when the response is HTML, extract page metadata from it.
 *
 * Non-HTML responses (XML, JSON, plain text, ...) are returned with their raw
 * content and no `metadata` field.
 *
 * @param url2 - The URL to fetch
 * @param options - Optional `userAgent` header override and `timeout` in milliseconds
 * @returns The requested URL, the response content type, the body text and, for
 *   HTML only, the extracted metadata with the favicon made absolute
 * @throws Error when the response status is not OK
 */
async function fetchHtml(url2, options) {
const userAgent = options?.userAgent || "Mozilla/5.0 (compatible; BotpressBot/1.0)";
const fetchOptions = {
headers: {
"User-Agent": userAgent
}
};
if (options?.timeout) {
fetchOptions.signal = AbortSignal.timeout(options.timeout);
}
const response = await fetch(url2, fetchOptions);
if (!response.ok) {
throw new Error(`Failed to fetch ${url2}: ${response.status} ${response.statusText}`);
}
// A missing header is treated as HTML, matching the function's primary use.
const contentType = response.headers.get("content-type") || "text/html";
const content = await response.text();
// Media types are case-insensitive, so compare on a lower-cased copy while
// returning the header value exactly as the server sent it.
const mediaType = contentType.toLowerCase();
const isHtml = mediaType.includes("text/html") || mediaType.includes("application/xhtml");
if (!isHtml) {
return {
url: url2,
contentType,
content
};
}
const extracted = extractHtmlMetadata(content);
if (extracted.favicon) {
// After redirects the document lives at response.url; relative icon paths
// belong to that location. Fall back to the requested URL when it is empty.
extracted.favicon = resolveUrl(extracted.favicon, response.url || url2);
}
return {
url: url2,
contentType,
content,
metadata: extracted
};
}
|
|
42204
|
+
|
|
42116
42205
|
// src/primitives/data-sources/source-website.ts
|
|
42117
42206
|
var State = z6.object({
|
|
42118
42207
|
urls: z6.array(
|
|
@@ -42132,6 +42221,7 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
42132
42221
|
urls;
|
|
42133
42222
|
filterFn;
|
|
42134
42223
|
customFetch;
|
|
42224
|
+
fetchStrategy;
|
|
42135
42225
|
maxPages;
|
|
42136
42226
|
maxDepth;
|
|
42137
42227
|
transformFn;
|
|
@@ -42142,7 +42232,16 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
42142
42232
|
this.sitemapUrl = options.sitemapUrl ?? void 0;
|
|
42143
42233
|
this.urls = options.urls ?? void 0;
|
|
42144
42234
|
this.filterFn = "filter" in options ? options.filter : void 0;
|
|
42145
|
-
|
|
42235
|
+
if (typeof options.fetch === "string") {
|
|
42236
|
+
this.fetchStrategy = options.fetch;
|
|
42237
|
+
this.customFetch = void 0;
|
|
42238
|
+
} else if (typeof options.fetch === "function") {
|
|
42239
|
+
this.customFetch = options.fetch;
|
|
42240
|
+
this.fetchStrategy = "node:fetch";
|
|
42241
|
+
} else {
|
|
42242
|
+
this.fetchStrategy = "node:fetch";
|
|
42243
|
+
this.customFetch = void 0;
|
|
42244
|
+
}
|
|
42146
42245
|
this.maxPages = Math.max(1, Math.min(("maxPages" in options ? options.maxPages : void 0) ?? 5e4, 5e4));
|
|
42147
42246
|
this.maxDepth = Math.max(1, Math.min(("maxDepth" in options ? options.maxDepth : void 0) ?? 20, 20));
|
|
42148
42247
|
}
|
|
@@ -42150,51 +42249,82 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
42150
42249
|
return !!adk.project.integrations.get("browser");
|
|
42151
42250
|
}
|
|
42152
42251
|
/**
|
|
42153
|
-
*
|
|
42252
|
+
* Convert HtmlMetadata to FetchResult metadata format
|
|
42253
|
+
*/
|
|
42254
|
+
convertMetadata(metadata) {
|
|
42255
|
+
const result = {};
|
|
42256
|
+
if (metadata.title) {
|
|
42257
|
+
result[WellKnownMetadata.knowledge.TITLE] = metadata.title;
|
|
42258
|
+
}
|
|
42259
|
+
if (metadata.description) {
|
|
42260
|
+
result[WellKnownMetadata.knowledge.DESCRIPTION] = metadata.description;
|
|
42261
|
+
}
|
|
42262
|
+
if (metadata.favicon) {
|
|
42263
|
+
result[WellKnownMetadata.knowledge.FAVICON] = metadata.favicon;
|
|
42264
|
+
}
|
|
42265
|
+
return result;
|
|
42266
|
+
}
|
|
42267
|
+
/**
|
|
42268
|
+
* Default fetch implementation using Node's built-in fetch
|
|
42269
|
+
*/
|
|
42270
|
+
async defaultFetch(url2) {
|
|
42271
|
+
const result = await fetchHtml(url2, {
|
|
42272
|
+
timeout: 3e4
|
|
42273
|
+
});
|
|
42274
|
+
if (!result.metadata) {
|
|
42275
|
+
return {
|
|
42276
|
+
url: result.url,
|
|
42277
|
+
contentType: result.contentType,
|
|
42278
|
+
content: result.content
|
|
42279
|
+
};
|
|
42280
|
+
}
|
|
42281
|
+
return {
|
|
42282
|
+
url: result.url,
|
|
42283
|
+
contentType: result.contentType,
|
|
42284
|
+
content: result.content,
|
|
42285
|
+
metadata: this.convertMetadata(result.metadata)
|
|
42286
|
+
};
|
|
42287
|
+
}
|
|
42288
|
+
/**
|
|
42289
|
+
* Fetch content from a URL for sitemap parsing (raw content needed)
|
|
42154
42290
|
*/
|
|
42155
42291
|
async fetchSitemap(url2) {
|
|
42156
42292
|
if (this.customFetch) {
|
|
42157
42293
|
try {
|
|
42158
42294
|
return await this.customFetch(url2);
|
|
42159
42295
|
} catch (err) {
|
|
42160
|
-
console.warn(`Custom fetch failed for ${url2}, falling back...`);
|
|
42296
|
+
console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
|
|
42161
42297
|
}
|
|
42162
42298
|
}
|
|
42163
|
-
if (
|
|
42164
|
-
|
|
42165
|
-
|
|
42166
|
-
);
|
|
42167
|
-
}
|
|
42168
|
-
const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
|
|
42169
|
-
urls: [url2],
|
|
42170
|
-
timeout: 3e4,
|
|
42171
|
-
waitFor: 500
|
|
42172
|
-
});
|
|
42173
|
-
const result = output2?.results[0];
|
|
42174
|
-
if (!result || !result.content) {
|
|
42175
|
-
throw new Error(`Failed to fetch content from ${url2}`);
|
|
42299
|
+
if (this.fetchStrategy === "integration:browser") {
|
|
42300
|
+
return this.fetchWithBrowserIntegration(url2, { raw: true });
|
|
42301
|
+
} else {
|
|
42302
|
+
return this.defaultFetch(url2);
|
|
42176
42303
|
}
|
|
42177
|
-
return {
|
|
42178
|
-
url: result.url,
|
|
42179
|
-
contentType: "application/html",
|
|
42180
|
-
content: result.raw
|
|
42181
|
-
};
|
|
42182
42304
|
}
|
|
42183
42305
|
/**
|
|
42184
|
-
* Fetch content from a URL with
|
|
42306
|
+
* Fetch content from a URL for indexing (with metadata extraction)
|
|
42185
42307
|
*/
|
|
42186
42308
|
async fetchUrl(url2) {
|
|
42187
42309
|
if (this.customFetch) {
|
|
42188
42310
|
try {
|
|
42189
42311
|
return await this.customFetch(url2);
|
|
42190
42312
|
} catch (err) {
|
|
42191
|
-
console.warn(`Custom fetch failed for ${url2}, falling back...`);
|
|
42313
|
+
console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
|
|
42192
42314
|
}
|
|
42193
42315
|
}
|
|
42316
|
+
if (this.fetchStrategy === "integration:browser") {
|
|
42317
|
+
return this.fetchWithBrowserIntegration(url2, { raw: false });
|
|
42318
|
+
} else {
|
|
42319
|
+
return this.defaultFetch(url2);
|
|
42320
|
+
}
|
|
42321
|
+
}
|
|
42322
|
+
/**
|
|
42323
|
+
* Fetch content using the browser integration
|
|
42324
|
+
*/
|
|
42325
|
+
async fetchWithBrowserIntegration(url2, options) {
|
|
42194
42326
|
if (!this.isBrowserIntegrationAvailable()) {
|
|
42195
|
-
throw new Error(
|
|
42196
|
-
`The 'browser' integration is not installed and is required for crawling website. Please provide a custom fetch function or install the 'browser' integration.`
|
|
42197
|
-
);
|
|
42327
|
+
throw new Error(`The 'browser' integration is not installed. Please install it or use fetch: 'node:fetch'.`);
|
|
42198
42328
|
}
|
|
42199
42329
|
const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
|
|
42200
42330
|
urls: [url2],
|
|
@@ -42205,6 +42335,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
42205
42335
|
if (!result || !result.content) {
|
|
42206
42336
|
throw new Error(`Failed to fetch content from ${url2}`);
|
|
42207
42337
|
}
|
|
42338
|
+
if (options.raw && result.raw) {
|
|
42339
|
+
return {
|
|
42340
|
+
url: result.url,
|
|
42341
|
+
contentType: "application/html",
|
|
42342
|
+
content: result.raw
|
|
42343
|
+
};
|
|
42344
|
+
}
|
|
42208
42345
|
return {
|
|
42209
42346
|
url: result.url,
|
|
42210
42347
|
contentType: "text/markdown",
|
|
@@ -44381,7 +44518,7 @@ var adk = {
|
|
|
44381
44518
|
get zai() {
|
|
44382
44519
|
return new Zai({
|
|
44383
44520
|
client: context2.get("cognitive"),
|
|
44384
|
-
modelId: Array.isArray(adk.project.config.defaultModels.
|
|
44521
|
+
modelId: Array.isArray(adk.project.config.defaultModels.zai) ? adk.project.config.defaultModels.zai[0] ?? "auto" : adk.project.config.defaultModels.zai
|
|
44385
44522
|
});
|
|
44386
44523
|
},
|
|
44387
44524
|
get project() {
|