@botpress/runtime 1.6.4 → 1.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ /**
2
+ * HTML fetching and metadata extraction utilities
3
+ */
4
+ export type HtmlMetadata = {
5
+ title?: string;
6
+ description?: string;
7
+ favicon?: string;
8
+ };
9
+ export type FetchHtmlResult = {
10
+ url: string;
11
+ contentType: string;
12
+ content: string;
13
+ metadata?: HtmlMetadata;
14
+ };
15
+ /**
16
+ * Extract metadata from HTML content using regex patterns
17
+ *
18
+ * @param html - The HTML content to parse
19
+ * @returns Extracted metadata including title, description, and favicon
20
+ */
21
+ export declare function extractHtmlMetadata(html: string): HtmlMetadata;
22
+ /**
23
+ * Resolve a potentially relative URL to an absolute URL
24
+ *
25
+ * @param url - The URL to resolve (may be relative)
26
+ * @param baseUrl - The base URL to resolve against
27
+ * @returns The absolute URL, or the original URL if resolution fails
28
+ */
29
+ export declare function resolveUrl(url: string, baseUrl: string): string;
30
+ /**
31
+ * Fetch content from a URL and extract metadata if HTML
32
+ *
33
+ * This function safely handles both HTML and non-HTML content (XML, JSON, text, etc.).
34
+ * Metadata extraction only occurs for HTML content types. For other content types
35
+ * (like sitemap.xml, robots.txt, RSS feeds), it returns the raw content without
36
+ * attempting metadata extraction.
37
+ *
38
+ * @param url - The URL to fetch
39
+ * @param options - Optional fetch options
40
+ * @returns Fetch result with content and extracted metadata (HTML only)
41
+ *
42
+ * @example
43
+ * // Fetching HTML - extracts metadata
44
+ * const html = await fetchHtml('https://example.com')
45
+ * console.log(html.metadata?.title) // "Example Domain"
46
+ *
47
+ * @example
48
+ * // Fetching XML - no metadata extraction
49
+ * const xml = await fetchHtml('https://example.com/sitemap.xml')
50
+ * console.log(xml.content) // Raw XML content
51
+ * console.log(xml.metadata) // undefined
52
+ */
53
+ export declare function fetchHtml(url: string, options?: {
54
+ userAgent?: string;
55
+ timeout?: number;
56
+ }): Promise<FetchHtmlResult>;
57
+ //# sourceMappingURL=html-fetch.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-fetch.d.ts","sourceRoot":"","sources":["../../../src/primitives/data-sources/html-fetch.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,MAAM,YAAY,GAAG;IACzB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,OAAO,CAAC,EAAE,MAAM,CAAA;CACjB,CAAA;AAED,MAAM,MAAM,eAAe,GAAG;IAC5B,GAAG,EAAE,MAAM,CAAA;IACX,WAAW,EAAE,MAAM,CAAA;IACnB,OAAO,EAAE,MAAM,CAAA;IACf,QAAQ,CAAC,EAAE,YAAY,CAAA;CACxB,CAAA;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAoD9D;AAED;;;;;;GAMG;AACH,wBAAgB,UAAU,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAa/D;AAED;;;;;;;;;;;;;;;;;;;;;;GAsBG;AACH,wBAAsB,SAAS,CAC7B,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE;IACR,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,OAAO,CAAC,EAAE,MAAM,CAAA;CACjB,GACA,OAAO,CAAC,eAAe,CAAC,CA+C1B"}
@@ -16,16 +16,68 @@ type FetchResult = {
16
16
  [WellKnownMetadata.knowledge.FAVICON]?: string;
17
17
  };
18
18
  };
19
+ /**
20
+ * Fetch strategy for retrieving web content
21
+ *
22
+ * - 'node:fetch': Uses Node's built-in fetch (fast, no dependencies, works for static HTML)
23
+ * - 'integration:browser': Uses browser integration (slower, requires browser integration, handles JavaScript/SPAs)
24
+ */
25
+ type FetchStrategy = 'node:fetch' | 'integration:browser';
26
+ /**
27
+ * Fetch option can be:
28
+ * - A strategy string: 'node:fetch' or 'integration:browser'
29
+ * - A custom function: for special authentication, headers, or processing
30
+ */
31
+ type FetchOption = FetchStrategy | ((url: string) => Promise<FetchResult> | FetchResult);
19
32
  type WebsiteSourceOptions = {
20
33
  id?: string;
21
34
  filter?: (context: SitemapFilterContext) => boolean;
22
- fetch?: (url: string) => Promise<FetchResult> | FetchResult;
35
+ /**
36
+ * Fetch method to use for retrieving web pages
37
+ *
38
+ * Options:
39
+ * - 'node:fetch': Fast, uses Node's built-in fetch (best for static HTML sites) **[DEFAULT]**
40
+ * - 'integration:browser': Slower, uses browser integration (best for JavaScript/SPAs)
41
+ * - Custom function: Provide your own fetch implementation (for auth, special headers, etc.)
42
+ * - undefined: Defaults to 'node:fetch'
43
+ *
44
+ * @default 'node:fetch'
45
+ *
46
+ * @example
47
+ * // Use Node's built-in fetch (default, can be omitted)
48
+ * { fetch: 'node:fetch' }
49
+ *
50
+ * @example
51
+ * // Use browser integration for JavaScript-heavy sites
52
+ * { fetch: 'integration:browser' }
53
+ *
54
+ * @example
55
+ * // Custom fetch with authentication
56
+ * {
57
+ * fetch: async (url) => {
58
+ * const response = await fetch(url, {
59
+ * headers: { Authorization: 'Bearer token' }
60
+ * })
61
+ * return {
62
+ * url,
63
+ * contentType: 'text/html',
64
+ * content: await response.text()
65
+ * }
66
+ * }
67
+ * }
68
+ */
69
+ fetch?: FetchOption;
23
70
  maxPages?: number;
24
71
  maxDepth?: number;
25
72
  };
26
73
  type UrlsSourceOptions = {
27
74
  id?: string;
28
- fetch?: (url: string) => Promise<FetchResult> | FetchResult;
75
+ /**
76
+ * Fetch method to use for retrieving web pages
77
+ *
78
+ * See WebsiteSourceOptions.fetch for detailed documentation
79
+ */
80
+ fetch?: FetchOption;
29
81
  };
30
82
  export declare class WebsiteSource extends DataSource {
31
83
  private mode;
@@ -34,19 +86,32 @@ export declare class WebsiteSource extends DataSource {
34
86
  private urls;
35
87
  private filterFn;
36
88
  private customFetch;
89
+ private fetchStrategy;
37
90
  private maxPages;
38
91
  private maxDepth;
39
92
  private transformFn;
40
93
  private constructor();
41
94
  private isBrowserIntegrationAvailable;
42
95
  /**
43
- * Fetch content from a URL with fallback strategy
96
+ * Convert HtmlMetadata to FetchResult metadata format
97
+ */
98
+ private convertMetadata;
99
+ /**
100
+ * Default fetch implementation using Node's built-in fetch
101
+ */
102
+ private defaultFetch;
103
+ /**
104
+ * Fetch content from a URL for sitemap parsing (raw content needed)
44
105
  */
45
106
  private fetchSitemap;
46
107
  /**
47
- * Fetch content from a URL with fallback strategy
108
+ * Fetch content from a URL for indexing (with metadata extraction)
48
109
  */
49
110
  private fetchUrl;
111
+ /**
112
+ * Fetch content using the browser integration
113
+ */
114
+ private fetchWithBrowserIntegration;
50
115
  /**
51
116
  * Parse sitemap XML content
52
117
  */
@@ -1 +1 @@
1
- {"version":3,"file":"source-website.d.ts","sourceRoot":"","sources":["../../../src/primitives/data-sources/source-website.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAA4B,MAAM,eAAe,CAAA;AAEpE,OAAO,EAAiB,iBAAiB,EAAE,MAAM,iBAAiB,CAAA;AAKlE,KAAK,oBAAoB,GAAG;IAC1B,GAAG,EAAE,MAAM,CAAA;IACX,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAA;CAClB,CAAA;AAED,KAAK,WAAW,GAAG;IACjB,GAAG,EAAE,MAAM,CAAA;IACX,WAAW,EAAE,MAAM,CAAA;IACnB,OAAO,EAAE,MAAM,CAAA;IACf,QAAQ,CAAC,EAAE;QACT,CAAC,iBAAiB,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAA;QAC5C,CAAC,iBAAiB,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,EAAE,MAAM,CAAA;QAClD,CAAC,iBAAiB,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,EAAE,MAAM,CAAA;KAC/C,CAAA;CACF,CAAA;AAED,KAAK,oBAAoB,GAAG;IAC1B,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,oBAAoB,KAAK,OAAO,CAAA;IACnD,KAAK,CAAC,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,WAAW,CAAC,GAAG,WAAW,CAAA;IAC3D,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAA;CAClB,CAAA;AAED,KAAK,iBAAiB,GAAG;IACvB,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,KAAK,CAAC,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,WAAW,CAAC,GAAG,WAAW,CAAA;CAC5D,CAAA;AAsCD,qBAAa,aAAc,SAAQ,UAAU;IAC3C,OAAO,CAAC,IAAI,CAAmB;IAC/B,OAAO,CAAC,OAAO,CAAoB;IACnC,OAAO,CAAC,UAAU,CAAoB;IACtC,OAAO,CAAC,IAAI,CAAsB;IAClC,OAAO,CAAC,QAAQ,CAA0D;IAC1E,OAAO,CAAC,WAAW,CAAmE;IACtF,OAAO,CAAC,QAAQ,CAAQ;IACxB,OAAO,CAAC,QAAQ,CAAQ;IACxB,OAAO,CAAC,WAAW,CAAuF;IAE1G,OAAO;IAoBP,OAAO,CAAC,6BAA6B;IAIrC;;OAEG;YACW,YAAY;IA8C1B;;OAEG;YACW,QAAQ;IAkDtB;;OAEG;IACH,OAAO,CAAC,eAAe;IAqDvB;;OAEG;IACH,OAAO,CAAC,eAAe;IAcvB;;OAEG;YACW,uBAAuB;IAoDrC;;OAEG;YACW,YAAY;IAiH1B,IAAW,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;uDAyLtB;IAED,MAAM,CAAC,WAAW,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,GAAE,oBAAyB,GAAG,aAAa;IAKtF,MAAM,CAAC,WAAW,CAAC,UAAU,EAAE,MAAM,EAAE,OAAO,GAAE,oBAAyB,GAAG,aAAa;IAKzF,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,OAAO,GAAE,iBAAsB,GAAG,aAAa;CAIhF"}
1
+ {"version":3,"file":"source-website.d.ts","sourceRoot":"","sources":["../../../src/primitives/data-sources/source-website.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAA4B,MAAM,eAAe,CAAA;AAEpE,OAAO,EAAiB,iBAAiB,EAAE,MAAM,iBAAiB,CAAA;AAMlE,KAAK,oBAAoB,GAAG;IAC1B,GAAG,EAAE,MAAM,CAAA;IACX,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAA;CAClB,CAAA;AAED,KAAK,WAAW,GAAG;IACjB,GAAG,EAAE,MAAM,CAAA;IACX,WAAW,EAAE,MAAM,CAAA;IACnB,OAAO,EAAE,MAAM,CAAA;IACf,QAAQ,CAAC,EAAE;QACT,CAAC,iBAAiB,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAA;QAC5C,CAAC,iBAAiB,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,EAAE,MAAM,CAAA;QAClD,CAAC,iBAAiB,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,EAAE,MAAM,CAAA;KAC/C,CAAA;CACF,CAAA;AAED;;;;;GAKG;AACH,KAAK,aAAa,GAAG,YAAY,GAAG,qBAAqB,CAAA;AAEzD;;;;GAIG;AACH,KAAK,WAAW,GAAG,aAAa,GAAG,CAAC,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC,WAAW,CAAC,GAAG,WAAW,CAAC,CAAA;AAExF,KAAK,oBAAoB,GAAG;IAC1B,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,oBAAoB,KAAK,OAAO,CAAA;IACnD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;OAiCG;IACH,KAAK,CAAC,EAAE,WAAW,CAAA;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAA;CAClB,CAAA;AAED,KAAK,iBAAiB,GAAG;IACvB,EAAE,CAAC,EAAE,MAAM,CAAA;IACX;;;;OAIG;IACH,KAAK,CAAC,EAAE,WAAW,CAAA;CACpB,CAAA;AAsCD,qBAAa,aAAc,SAAQ,UAAU;IAC3C,OAAO,CAAC,IAAI,CAAmB;IAC/B,OAAO,CAAC,OAAO,CAAoB;IACnC,OAAO,CAAC,UAAU,CAAoB;IACtC,OAAO,CAAC,IAAI,CAAsB;IAClC,OAAO,CAAC,QAAQ,CAA0D;IAC1E,OAAO,CAAC,WAAW,CAAmE;IACtF,OAAO,CAAC,aAAa,CAAe;IACpC,OAAO,CAAC,QAAQ,CAAQ;IACxB,OAAO,CAAC,QAAQ,CAAQ;IACxB,OAAO,CAAC,WAAW,CAAuF;IAE1G,OAAO;IAmCP,OAAO,CAAC,6BAA6B;IAIrC;;OAEG;IACH,OAAO,CAAC,eAAe;IAgBvB;;OAEG;YACW,YAAY;IAsB1B;;OAEG;YACW,YAAY;IAkB1B;;OAEG;YACW,QAAQ;IAkBtB;;OAEG;YACW,2BAA2B;IAgDzC;;OAEG;IACH,OAAO,CAAC,eAAe;IAqDvB;;OAEG;IACH,OAAO,CAAC,eAAe;IAcvB;;OAEG;YACW,uBAAuB;IAoDrC;;OAEG;YACW,YAAY;IAiH1B,IAAW,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;uDAiMtB;IAE
D,MAAM,CAAC,WAAW,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,GAAE,oBAAyB,GAAG,aAAa;IAKtF,MAAM,CAAC,WAAW,CAAC,UAAU,EAAE,MAAM,EAAE,OAAO,GAAE,oBAAyB,GAAG,aAAa;IAKzF,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,OAAO,GAAE,iBAAsB,GAAG,aAAa;CAIhF"}
package/dist/runtime.js CHANGED
@@ -48,7 +48,7 @@ var init_define_BUILD = __esm({
48
48
  var define_PACKAGE_VERSIONS_default;
49
49
  var init_define_PACKAGE_VERSIONS = __esm({
50
50
  "<define:__PACKAGE_VERSIONS__>"() {
51
- define_PACKAGE_VERSIONS_default = { runtime: "1.6.4", adk: "not-installed", sdk: "4.17.3", llmz: "0.0.27", zai: "2.4.0", cognitive: "0.2.0" };
51
+ define_PACKAGE_VERSIONS_default = { runtime: "1.6.6", adk: "not-installed", sdk: "4.17.3", llmz: "0.0.27", zai: "2.4.0", cognitive: "0.2.0" };
52
52
  }
53
53
  });
54
54
 
@@ -25291,7 +25291,7 @@ var require_follow_redirects = __commonJS({
25291
25291
  var currentUrlParts = parseUrl(this._currentUrl);
25292
25292
  var currentHost = currentHostHeader || currentUrlParts.host;
25293
25293
  var currentUrl = /^\w+:/.test(location) ? this._currentUrl : url2.format(Object.assign(currentUrlParts, { host: currentHost }));
25294
- var redirectUrl = resolveUrl(location, currentUrl);
25294
+ var redirectUrl = resolveUrl2(location, currentUrl);
25295
25295
  debug("redirecting to", redirectUrl.href);
25296
25296
  this._isRedirect = true;
25297
25297
  spreadUrlObject(redirectUrl, this._options);
@@ -25375,7 +25375,7 @@ var require_follow_redirects = __commonJS({
25375
25375
  }
25376
25376
  return parsed;
25377
25377
  }
25378
- function resolveUrl(relative, base) {
25378
+ function resolveUrl2(relative, base) {
25379
25379
  return useNativeURL ? new URL2(relative, base) : parseUrl(url2.resolve(base, relative));
25380
25380
  }
25381
25381
  function validateUrl(input) {
@@ -42113,6 +42113,95 @@ var XMLParser = class {
42113
42113
  }
42114
42114
  };
42115
42115
 
42116
+ // src/primitives/data-sources/html-fetch.ts
42117
+ init_define_BUILD();
42118
+ init_define_PACKAGE_VERSIONS();
/**
 * Extract metadata from HTML content using regex patterns.
 *
 * Extraction runs in two passes for the description and the favicon:
 *  1. The strict patterns (attribute order and quote style fixed) are tried
 *     first, so documents that already matched keep yielding the same values.
 *  2. If nothing matched, every `<meta>` / `<link>` tag is scanned and its
 *     attributes are parsed individually, which tolerates any attribute order,
 *     extra attributes in between, mixed quote styles and unquoted values.
 *
 * @param html - The HTML content to parse
 * @returns Extracted metadata. `title` and `description` are only present when
 *   found; `favicon` falls back to "/favicon.ico" when no icon link is declared.
 */
function extractHtmlMetadata(html) {
  // Parse the attribute text of one tag into a Map keyed by lower-cased attribute name.
  // The first occurrence of a duplicated attribute wins, as it does in HTML parsers.
  const parseAttributes = (attributeText) => {
    const attributes = new Map();
    const attributePattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
    for (const match of attributeText.matchAll(attributePattern)) {
      const name = match[1].toLowerCase();
      if (!attributes.has(name)) {
        attributes.set(name, match[2] ?? match[3] ?? match[4] ?? "");
      }
    }
    return attributes;
  };
  // Collect the parsed attributes of every `<tagName ...>` tag in document order.
  // Quoted values are consumed as a unit so a ">" inside a value does not end the tag.
  const collectTags = (tagName) => {
    const tagPattern = new RegExp(`<${tagName}(?=[\\s/>])((?:"[^"]*"|'[^']*'|[^'">])*)>`, "gi");
    return Array.from(html.matchAll(tagPattern), (match) => parseAttributes(match[1]));
  };

  let title;
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  if (titleMatch && titleMatch[1]) {
    title = titleMatch[1].trim();
  }

  let description;
  // Pass 1: original strict patterns (name/property immediately followed by content).
  const descriptionMatch = html.match(
    /<meta\s+(?:name|property)=["'](?:description|og:description)["']\s+content="([^"]+)"/i
  ) ?? html.match(
    /<meta\s+(?:name|property)=["'](?:description|og:description)["']\s+content='([^']+)'/i
  );
  if (descriptionMatch && descriptionMatch[1]) {
    description = descriptionMatch[1].trim();
  } else {
    // Pass 2: order-independent scan of all <meta> tags.
    for (const attributes of collectTags("meta")) {
      const identifiers = [attributes.get("name"), attributes.get("property")].map(
        (value) => (value ?? "").trim().toLowerCase()
      );
      const content = (attributes.get("content") ?? "").trim();
      if (content && identifiers.some((id) => id === "description" || id === "og:description")) {
        description = content;
        break;
      }
    }
  }

  let favicon;
  const faviconPatterns = [
    // rel first, double quotes
    /<link\s+[^>]*rel="(?:icon|shortcut icon|apple-touch-icon)"[^>]*href="([^"]+)"/i,
    // rel first, single quotes
    /<link\s+[^>]*rel='(?:icon|shortcut icon|apple-touch-icon)'[^>]*href='([^']+)'/i,
    // href first, double quotes
    /<link\s+[^>]*href="([^"]+)"[^>]*rel="(?:icon|shortcut icon|apple-touch-icon)"/i,
    // href first, single quotes
    /<link\s+[^>]*href='([^']+)'[^>]*rel='(?:icon|shortcut icon|apple-touch-icon)'/i
  ];
  // Pass 1: original strict patterns, tried in their original priority order.
  for (const pattern of faviconPatterns) {
    const faviconMatch = html.match(pattern);
    if (faviconMatch && faviconMatch[1]) {
      favicon = faviconMatch[1].trim();
      break;
    }
  }
  if (!favicon) {
    // Pass 2: any <link> whose rel token list contains an icon relation.
    for (const attributes of collectTags("link")) {
      const relTokens = (attributes.get("rel") ?? "").toLowerCase().split(/\s+/);
      const href = (attributes.get("href") ?? "").trim();
      if (href && (relTokens.includes("icon") || relTokens.includes("apple-touch-icon"))) {
        favicon = href;
        break;
      }
    }
  }
  if (!favicon) {
    // Conventional default location when the page declares no icon.
    favicon = "/favicon.ico";
  }

  // Keys are emitted in the same order as before: title, description, favicon.
  return {
    ...(title !== undefined ? { title } : {}),
    ...(description !== undefined ? { description } : {}),
    favicon
  };
}
/**
 * Resolve a potentially relative URL to an absolute URL.
 *
 * Absolute http(s) URLs are returned untouched. Everything else is resolved
 * against the full base URL (including its path), so page-relative references
 * such as "icon.png" or "../img/icon.png" resolve the way a browser would
 * resolve them. Root-relative ("/favicon.ico") and protocol-relative
 * ("//cdn.example.com/x.png") references resolve against the base origin/scheme.
 *
 * @param url2 - The URL to resolve (may be relative)
 * @param baseUrl - The base URL to resolve against
 * @returns The absolute URL, or the original URL if resolution fails
 */
function resolveUrl(url2, baseUrl) {
  if (url2.startsWith("http://") || url2.startsWith("https://")) {
    return url2;
  }
  try {
    // Resolving against the whole base URL (not just its origin) keeps the
    // base path in play for path-relative references.
    return new URL(url2, baseUrl).href;
  } catch {
    // Invalid base or unparsable reference: hand the input back unchanged.
    return url2;
  }
}
/**
 * Fetch content from a URL and extract metadata if the response is HTML.
 *
 * Non-HTML responses (XML, JSON, plain text, ...) are returned as raw content
 * without a `metadata` field. A response without a Content-Type header is
 * treated as "text/html".
 *
 * @param url2 - The URL to fetch
 * @param options - Optional settings: `userAgent` overrides the default
 *   User-Agent header; `timeout` (milliseconds) aborts the request when exceeded
 * @returns The requested URL, the response content type, the body text and,
 *   for HTML only, the extracted metadata with an absolute favicon URL
 * @throws {Error} When the response status is not in the 2xx range
 */
async function fetchHtml(url2, options) {
  const userAgent = options?.userAgent || "Mozilla/5.0 (compatible; BotpressBot/1.0)";
  const fetchOptions = {
    headers: {
      "User-Agent": userAgent
    },
    // Only attach an abort signal when a (non-zero) timeout was requested.
    ...(options?.timeout ? { signal: AbortSignal.timeout(options.timeout) } : {})
  };
  const response = await fetch(url2, fetchOptions);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url2}: ${response.status} ${response.statusText}`);
  }
  const contentType = response.headers.get("content-type") || "text/html";
  const content = await response.text();
  // Media types are case-insensitive ("Text/HTML; Charset=UTF-8" is valid),
  // so compare against a lower-cased copy while returning the header as sent.
  const normalizedType = contentType.toLowerCase();
  const isHtml = normalizedType.includes("text/html") || normalizedType.includes("application/xhtml");
  if (!isHtml) {
    return {
      url: url2,
      contentType,
      content
    };
  }
  const extracted = extractHtmlMetadata(content);
  if (extracted.favicon) {
    // Relative references belong to the document that was actually served,
    // i.e. the final URL after redirects; fall back to the request URL when
    // the runtime does not report one.
    extracted.favicon = resolveUrl(extracted.favicon, response.url || url2);
  }
  return {
    url: url2,
    contentType,
    content,
    metadata: extracted
  };
}
42204
+
42116
42205
  // src/primitives/data-sources/source-website.ts
42117
42206
  var State = z6.object({
42118
42207
  urls: z6.array(
@@ -42132,6 +42221,7 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
42132
42221
  urls;
42133
42222
  filterFn;
42134
42223
  customFetch;
42224
+ fetchStrategy;
42135
42225
  maxPages;
42136
42226
  maxDepth;
42137
42227
  transformFn;
@@ -42142,7 +42232,16 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
42142
42232
  this.sitemapUrl = options.sitemapUrl ?? void 0;
42143
42233
  this.urls = options.urls ?? void 0;
42144
42234
  this.filterFn = "filter" in options ? options.filter : void 0;
42145
- this.customFetch = options.fetch ?? void 0;
42235
+ if (typeof options.fetch === "string") {
42236
+ this.fetchStrategy = options.fetch;
42237
+ this.customFetch = void 0;
42238
+ } else if (typeof options.fetch === "function") {
42239
+ this.customFetch = options.fetch;
42240
+ this.fetchStrategy = "node:fetch";
42241
+ } else {
42242
+ this.fetchStrategy = "node:fetch";
42243
+ this.customFetch = void 0;
42244
+ }
42146
42245
  this.maxPages = Math.max(1, Math.min(("maxPages" in options ? options.maxPages : void 0) ?? 5e4, 5e4));
42147
42246
  this.maxDepth = Math.max(1, Math.min(("maxDepth" in options ? options.maxDepth : void 0) ?? 20, 20));
42148
42247
  }
@@ -42150,51 +42249,82 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
42150
42249
  return !!adk.project.integrations.get("browser");
42151
42250
  }
42152
42251
  /**
42153
- * Fetch content from a URL with fallback strategy
42252
+ * Convert HtmlMetadata to FetchResult metadata format
42253
+ */
42254
+ convertMetadata(metadata) {
42255
+ const result = {};
42256
+ if (metadata.title) {
42257
+ result[WellKnownMetadata.knowledge.TITLE] = metadata.title;
42258
+ }
42259
+ if (metadata.description) {
42260
+ result[WellKnownMetadata.knowledge.DESCRIPTION] = metadata.description;
42261
+ }
42262
+ if (metadata.favicon) {
42263
+ result[WellKnownMetadata.knowledge.FAVICON] = metadata.favicon;
42264
+ }
42265
+ return result;
42266
+ }
42267
+ /**
42268
+ * Default fetch implementation using Node's built-in fetch
42269
+ */
42270
+ async defaultFetch(url2) {
42271
+ const result = await fetchHtml(url2, {
42272
+ timeout: 3e4
42273
+ });
42274
+ if (!result.metadata) {
42275
+ return {
42276
+ url: result.url,
42277
+ contentType: result.contentType,
42278
+ content: result.content
42279
+ };
42280
+ }
42281
+ return {
42282
+ url: result.url,
42283
+ contentType: result.contentType,
42284
+ content: result.content,
42285
+ metadata: this.convertMetadata(result.metadata)
42286
+ };
42287
+ }
42288
+ /**
42289
+ * Fetch content from a URL for sitemap parsing (raw content needed)
42154
42290
  */
42155
42291
  async fetchSitemap(url2) {
42156
42292
  if (this.customFetch) {
42157
42293
  try {
42158
42294
  return await this.customFetch(url2);
42159
42295
  } catch (err) {
42160
- console.warn(`Custom fetch failed for ${url2}, falling back...`);
42296
+ console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
42161
42297
  }
42162
42298
  }
42163
- if (!this.isBrowserIntegrationAvailable()) {
42164
- throw new Error(
42165
- `The 'browser' integration is not installed and is required for crawling website. Please provide a custom fetch function or install the 'browser' integration.`
42166
- );
42167
- }
42168
- const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
42169
- urls: [url2],
42170
- timeout: 3e4,
42171
- waitFor: 500
42172
- });
42173
- const result = output2?.results[0];
42174
- if (!result || !result.content) {
42175
- throw new Error(`Failed to fetch content from ${url2}`);
42299
+ if (this.fetchStrategy === "integration:browser") {
42300
+ return this.fetchWithBrowserIntegration(url2, { raw: true });
42301
+ } else {
42302
+ return this.defaultFetch(url2);
42176
42303
  }
42177
- return {
42178
- url: result.url,
42179
- contentType: "application/html",
42180
- content: result.raw
42181
- };
42182
42304
  }
42183
42305
  /**
42184
- * Fetch content from a URL with fallback strategy
42306
+ * Fetch content from a URL for indexing (with metadata extraction)
42185
42307
  */
42186
42308
  async fetchUrl(url2) {
42187
42309
  if (this.customFetch) {
42188
42310
  try {
42189
42311
  return await this.customFetch(url2);
42190
42312
  } catch (err) {
42191
- console.warn(`Custom fetch failed for ${url2}, falling back...`);
42313
+ console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
42192
42314
  }
42193
42315
  }
42316
+ if (this.fetchStrategy === "integration:browser") {
42317
+ return this.fetchWithBrowserIntegration(url2, { raw: false });
42318
+ } else {
42319
+ return this.defaultFetch(url2);
42320
+ }
42321
+ }
42322
+ /**
42323
+ * Fetch content using the browser integration
42324
+ */
42325
+ async fetchWithBrowserIntegration(url2, options) {
42194
42326
  if (!this.isBrowserIntegrationAvailable()) {
42195
- throw new Error(
42196
- `The 'browser' integration is not installed and is required for crawling website. Please provide a custom fetch function or install the 'browser' integration.`
42197
- );
42327
+ throw new Error(`The 'browser' integration is not installed. Please install it or use fetch: 'node:fetch'.`);
42198
42328
  }
42199
42329
  const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
42200
42330
  urls: [url2],
@@ -42205,6 +42335,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
42205
42335
  if (!result || !result.content) {
42206
42336
  throw new Error(`Failed to fetch content from ${url2}`);
42207
42337
  }
42338
+ if (options.raw && result.raw) {
42339
+ return {
42340
+ url: result.url,
42341
+ contentType: "application/html",
42342
+ content: result.raw
42343
+ };
42344
+ }
42208
42345
  return {
42209
42346
  url: result.url,
42210
42347
  contentType: "text/markdown",
@@ -42433,6 +42570,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
42433
42570
  const toRemove = existingFiles.filter(
42434
42571
  (f) => !discoveredUrls.find((u) => u.loc === f.metadata?.[WellKnownMetadata.knowledge.URL])
42435
42572
  );
42573
+ if (existingFiles.length > 0 && toRemove.length >= existingFiles.length * 0.8) {
42574
+ console.error(
42575
+ `Warning: All existing files (${existingFiles.length}) are scheduled for removal. Please check if the sitemap URL is correct and the website is accessible. We will try again in 5 minutes.`
42576
+ );
42577
+ await step2.sleep("retry wait", 5 * 60 * 1e3);
42578
+ throw new Error("Aborting sync due to potential misconfiguration (all files to be removed)");
42579
+ }
42436
42580
  const toFetch = [];
42437
42581
  let skippedUnchanged = 0;
42438
42582
  for (const url2 of discoveredUrls) {