@botpress/runtime 1.6.5 → 1.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/internal.js CHANGED
@@ -48,7 +48,7 @@ var init_define_BUILD = __esm({
48
48
  // Version table for the bundled packages, assigned by the initializer below.
  // The only change in this hunk is the `runtime` entry, bumped from "1.6.5" to "1.6.6";
  // the adk/sdk/llmz/zai/cognitive entries are identical on both sides of the diff.
  // NOTE(review): presumably __esm defers the assignment until init_define_PACKAGE_VERSIONS()
  // is called — confirm against the __esm helper, which is not visible here.
  var define_PACKAGE_VERSIONS_default;
49
49
  var init_define_PACKAGE_VERSIONS = __esm({
50
50
  "<define:__PACKAGE_VERSIONS__>"() {
51
- define_PACKAGE_VERSIONS_default = { runtime: "1.6.5", adk: "not-installed", sdk: "4.17.3", llmz: "0.0.27", zai: "2.4.0", cognitive: "0.2.0" };
51
+ define_PACKAGE_VERSIONS_default = { runtime: "1.6.6", adk: "not-installed", sdk: "4.17.3", llmz: "0.0.27", zai: "2.4.0", cognitive: "0.2.0" };
52
52
  }
53
53
  });
54
54
 
@@ -25291,7 +25291,7 @@ var require_follow_redirects = __commonJS({
25291
25291
  var currentUrlParts = parseUrl(this._currentUrl);
25292
25292
  var currentHost = currentHostHeader || currentUrlParts.host;
25293
25293
  var currentUrl = /^\w+:/.test(location) ? this._currentUrl : url2.format(Object.assign(currentUrlParts, { host: currentHost }));
25294
- var redirectUrl = resolveUrl(location, currentUrl);
25294
+ var redirectUrl = resolveUrl2(location, currentUrl);
25295
25295
  debug("redirecting to", redirectUrl.href);
25296
25296
  this._isRedirect = true;
25297
25297
  spreadUrlObject(redirectUrl, this._options);
@@ -25375,7 +25375,7 @@ var require_follow_redirects = __commonJS({
25375
25375
  }
25376
25376
  return parsed;
25377
25377
  }
25378
// Resolves `relative` against `base`: uses the URL2 constructor when useNativeURL is truthy,
// otherwise falls back to url2.resolve(base, relative) passed through parseUrl.
// This hunk only renames the helper from resolveUrl to resolveUrl2; the body is unchanged.
// NOTE(review): presumably renamed to avoid clashing with the resolveUrl added further down
// in the same bundle — confirm against the bundler output.
- function resolveUrl(relative, base) {
25378
+ function resolveUrl2(relative, base) {
25379
25379
  return useNativeURL ? new URL2(relative, base) : parseUrl(url2.resolve(base, relative));
25380
25380
  }
25381
25381
  function validateUrl(input) {
@@ -44688,6 +44688,95 @@ var XMLParser = class {
44688
44688
  }
44689
44689
  };
44690
44690
 
44691
+ // src/primitives/data-sources/html-fetch.ts
44692
+ init_define_BUILD();
44693
+ init_define_PACKAGE_VERSIONS();
44694
/**
 * Extract the title, description and favicon reference from an HTML document.
 *
 * This is a best-effort regex scan, not a DOM parse. `<meta>` and `<link>` tags
 * are inspected in document order and their attributes are read independently
 * of attribute order, quoting style (double, single or unquoted) and any
 * unrelated attributes in between.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {{ title?: string, description?: string, favicon: string }}
 *   `title` and `description` are present only when found; `favicon` is the
 *   first icon link's `href` exactly as written in the page (possibly
 *   relative), or "/favicon.ico" when the page declares none.
 */
function extractHtmlMetadata(html) {
  const metadata = {};

  // Collects a tag's attributes into a Map keyed by lower-cased attribute name.
  // The first occurrence of a duplicated attribute wins.
  const readAttributes = (tag) => {
    const attributes = new Map();
    const attributePattern = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;
    for (const [, name, doubleQuoted, singleQuoted, bare] of tag.matchAll(attributePattern)) {
      const key = name.toLowerCase();
      if (!attributes.has(key)) {
        attributes.set(key, doubleQuoted ?? singleQuoted ?? bare ?? "");
      }
    }
    return attributes;
  };

  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  if (titleMatch && titleMatch[1]) {
    metadata.title = titleMatch[1].trim();
  }

  // First <meta> whose name/property is "description" or "og:description"
  // and which carries non-empty content.
  const descriptionKeys = new Set(["description", "og:description"]);
  for (const [tag] of html.matchAll(/<meta\b[^>]*>/gi)) {
    const attributes = readAttributes(tag);
    const isDescription = ["name", "property"].some((attr) =>
      descriptionKeys.has((attributes.get(attr) ?? "").trim().toLowerCase())
    );
    const content = (attributes.get("content") ?? "").trim();
    if (isDescription && content) {
      metadata.description = content;
      break;
    }
  }

  // First <link> whose rel is one of the recognised icon relations and which has an href.
  const iconRels = new Set(["icon", "shortcut icon", "apple-touch-icon"]);
  for (const [tag] of html.matchAll(/<link\b[^>]*>/gi)) {
    const attributes = readAttributes(tag);
    const rel = (attributes.get("rel") ?? "").trim().toLowerCase().replace(/\s+/g, " ");
    const href = (attributes.get("href") ?? "").trim();
    if (iconRels.has(rel) && href) {
      metadata.favicon = href;
      break;
    }
  }

  // Fall back to the conventional root-level favicon path.
  if (!metadata.favicon) {
    metadata.favicon = "/favicon.ico";
  }
  return metadata;
}
44733
/**
 * Turn a possibly-relative URL reference into an absolute URL.
 *
 * @param {string} url2 - URL reference as found in the page (absolute, root-relative,
 *   path-relative or protocol-relative).
 * @param {string} baseUrl - Absolute URL of the document the reference came from.
 * @returns {string} `url2` unchanged when it already starts with http:// or https://;
 *   otherwise the reference resolved against `baseUrl`; or `url2` unchanged when
 *   resolution fails (e.g. `baseUrl` is not a valid absolute URL).
 */
function resolveUrl(url2, baseUrl) {
  if (url2.startsWith("http://") || url2.startsWith("https://")) {
    return url2;
  }
  try {
    // Resolve against the full document URL, not just its origin, so that
    // path-relative references ("icon.png", "../img/icon.png") keep the
    // document's directory.
    return new URL(url2, baseUrl).href;
  } catch {
    // Best effort: hand the reference back untouched rather than failing the fetch.
    return url2;
  }
}
44744
/**
 * Fetch a URL with the global `fetch` and, for HTML responses, extract page metadata.
 *
 * @param {string} url2 - URL to fetch.
 * @param {{ userAgent?: string, timeout?: number }} [options] - Optional User-Agent
 *   override and timeout in milliseconds (a falsy timeout means no abort signal).
 * @returns {Promise<{ url: string, contentType: string, content: string, metadata?: object }>}
 *   `url` is the requested URL, `contentType` the response header value as received
 *   ("text/html" when the header is missing), `content` the response body text.
 *   `metadata` is present only for HTML/XHTML responses and comes from
 *   extractHtmlMetadata, with the favicon made absolute via resolveUrl.
 * @throws {Error} When the response status is not OK. Network and abort errors from
 *   `fetch` propagate unchanged.
 */
async function fetchHtml(url2, options) {
  const userAgent = options?.userAgent || "Mozilla/5.0 (compatible; BotpressBot/1.0)";
  const fetchOptions = {
    headers: {
      "User-Agent": userAgent
    }
  };
  if (options?.timeout) {
    fetchOptions.signal = AbortSignal.timeout(options.timeout);
  }
  const response = await fetch(url2, fetchOptions);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url2}: ${response.status} ${response.statusText}`);
  }
  const contentType = response.headers.get("content-type") || "text/html";
  const content = await response.text();
  // Media types are case-insensitive, so compare on a lower-cased copy while
  // returning the header value exactly as received.
  const normalizedType = contentType.toLowerCase();
  const isHtml = normalizedType.includes("text/html") || normalizedType.includes("application/xhtml");
  if (!isHtml) {
    return {
      url: url2,
      contentType,
      content
    };
  }
  const extracted = extractHtmlMetadata(content);
  if (extracted.favicon) {
    // Relative references belong to the document that was actually served, which
    // after redirects is response.url; fall back to the requested URL when the
    // response does not expose one.
    extracted.favicon = resolveUrl(extracted.favicon, response.url || url2);
  }
  return {
    url: url2,
    contentType,
    content,
    metadata: extracted
  };
}
44779
+
44691
44780
  // src/primitives/data-sources/source-website.ts
44692
44781
  var State = z20.object({
44693
44782
  urls: z20.array(
@@ -44707,6 +44796,7 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
44707
44796
  urls;
44708
44797
  filterFn;
44709
44798
  customFetch;
44799
+ fetchStrategy;
44710
44800
  maxPages;
44711
44801
  maxDepth;
44712
44802
  transformFn;
@@ -44717,7 +44807,16 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
44717
44807
  this.sitemapUrl = options.sitemapUrl ?? void 0;
44718
44808
  this.urls = options.urls ?? void 0;
44719
44809
  this.filterFn = "filter" in options ? options.filter : void 0;
44720
- this.customFetch = options.fetch ?? void 0;
44810
+ if (typeof options.fetch === "string") {
44811
+ this.fetchStrategy = options.fetch;
44812
+ this.customFetch = void 0;
44813
+ } else if (typeof options.fetch === "function") {
44814
+ this.customFetch = options.fetch;
44815
+ this.fetchStrategy = "node:fetch";
44816
+ } else {
44817
+ this.fetchStrategy = "node:fetch";
44818
+ this.customFetch = void 0;
44819
+ }
44721
44820
  this.maxPages = Math.max(1, Math.min(("maxPages" in options ? options.maxPages : void 0) ?? 5e4, 5e4));
44722
44821
  this.maxDepth = Math.max(1, Math.min(("maxDepth" in options ? options.maxDepth : void 0) ?? 20, 20));
44723
44822
  }
@@ -44725,51 +44824,82 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
44725
44824
  return !!adk.project.integrations.get("browser");
44726
44825
  }
44727
44826
  /**
44728
- * Fetch content from a URL with fallback strategy
44827
+ * Convert HtmlMetadata to FetchResult metadata format
44828
+ */
44829
+ convertMetadata(metadata) {
44830
+ const result = {};
44831
+ if (metadata.title) {
44832
+ result[WellKnownMetadata.knowledge.TITLE] = metadata.title;
44833
+ }
44834
+ if (metadata.description) {
44835
+ result[WellKnownMetadata.knowledge.DESCRIPTION] = metadata.description;
44836
+ }
44837
+ if (metadata.favicon) {
44838
+ result[WellKnownMetadata.knowledge.FAVICON] = metadata.favicon;
44839
+ }
44840
+ return result;
44841
+ }
44842
+ /**
44843
+ * Default fetch implementation using Node's built-in fetch
44844
+ */
44845
+ async defaultFetch(url2) {
44846
+ const result = await fetchHtml(url2, {
44847
+ timeout: 3e4
44848
+ });
44849
+ if (!result.metadata) {
44850
+ return {
44851
+ url: result.url,
44852
+ contentType: result.contentType,
44853
+ content: result.content
44854
+ };
44855
+ }
44856
+ return {
44857
+ url: result.url,
44858
+ contentType: result.contentType,
44859
+ content: result.content,
44860
+ metadata: this.convertMetadata(result.metadata)
44861
+ };
44862
+ }
44863
+ /**
44864
+ * Fetch content from a URL for sitemap parsing (raw content needed)
44729
44865
  */
44730
44866
  async fetchSitemap(url2) {
44731
44867
  if (this.customFetch) {
44732
44868
  try {
44733
44869
  return await this.customFetch(url2);
44734
44870
  } catch (err) {
44735
- console.warn(`Custom fetch failed for ${url2}, falling back...`);
44871
+ console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
44736
44872
  }
44737
44873
  }
44738
- if (!this.isBrowserIntegrationAvailable()) {
44739
- throw new Error(
44740
- `The 'browser' integration is not installed and is required for crawling website. Please provide a custom fetch function or install the 'browser' integration.`
44741
- );
44742
- }
44743
- const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
44744
- urls: [url2],
44745
- timeout: 3e4,
44746
- waitFor: 500
44747
- });
44748
- const result = output2?.results[0];
44749
- if (!result || !result.content) {
44750
- throw new Error(`Failed to fetch content from ${url2}`);
44874
+ if (this.fetchStrategy === "integration:browser") {
44875
+ return this.fetchWithBrowserIntegration(url2, { raw: true });
44876
+ } else {
44877
+ return this.defaultFetch(url2);
44751
44878
  }
44752
- return {
44753
- url: result.url,
44754
- contentType: "application/html",
44755
- content: result.raw
44756
- };
44757
44879
  }
44758
44880
  /**
44759
- * Fetch content from a URL with fallback strategy
44881
+ * Fetch content from a URL for indexing (with metadata extraction)
44760
44882
  */
44761
44883
  async fetchUrl(url2) {
44762
44884
  if (this.customFetch) {
44763
44885
  try {
44764
44886
  return await this.customFetch(url2);
44765
44887
  } catch (err) {
44766
- console.warn(`Custom fetch failed for ${url2}, falling back...`);
44888
+ console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
44767
44889
  }
44768
44890
  }
44891
+ if (this.fetchStrategy === "integration:browser") {
44892
+ return this.fetchWithBrowserIntegration(url2, { raw: false });
44893
+ } else {
44894
+ return this.defaultFetch(url2);
44895
+ }
44896
+ }
44897
+ /**
44898
+ * Fetch content using the browser integration
44899
+ */
44900
+ async fetchWithBrowserIntegration(url2, options) {
44769
44901
  if (!this.isBrowserIntegrationAvailable()) {
44770
- throw new Error(
44771
- `The 'browser' integration is not installed and is required for crawling website. Please provide a custom fetch function or install the 'browser' integration.`
44772
- );
44902
+ throw new Error(`The 'browser' integration is not installed. Please install it or use fetch: 'node:fetch'.`);
44773
44903
  }
44774
44904
  const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
44775
44905
  urls: [url2],
@@ -44780,6 +44910,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
44780
44910
  if (!result || !result.content) {
44781
44911
  throw new Error(`Failed to fetch content from ${url2}`);
44782
44912
  }
44913
+ if (options.raw && result.raw) {
44914
+ return {
44915
+ url: result.url,
44916
+ contentType: "application/html",
44917
+ content: result.raw
44918
+ };
44919
+ }
44783
44920
  return {
44784
44921
  url: result.url,
44785
44922
  contentType: "text/markdown",