@botpress/runtime 1.6.4 → 1.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/library.js CHANGED
@@ -48,7 +48,7 @@ var init_define_BUILD = __esm({
48
48
  var define_PACKAGE_VERSIONS_default;
49
49
  var init_define_PACKAGE_VERSIONS = __esm({
50
50
  "<define:__PACKAGE_VERSIONS__>"() {
51
- define_PACKAGE_VERSIONS_default = { runtime: "1.6.4", adk: "not-installed", sdk: "4.17.3", llmz: "0.0.27", zai: "2.4.0", cognitive: "0.2.0" };
51
+ define_PACKAGE_VERSIONS_default = { runtime: "1.6.6", adk: "not-installed", sdk: "4.17.3", llmz: "0.0.27", zai: "2.4.0", cognitive: "0.2.0" };
52
52
  }
53
53
  });
54
54
 
@@ -25291,7 +25291,7 @@ var require_follow_redirects = __commonJS({
25291
25291
  var currentUrlParts = parseUrl(this._currentUrl);
25292
25292
  var currentHost = currentHostHeader || currentUrlParts.host;
25293
25293
  var currentUrl = /^\w+:/.test(location) ? this._currentUrl : url2.format(Object.assign(currentUrlParts, { host: currentHost }));
25294
- var redirectUrl = resolveUrl(location, currentUrl);
25294
+ var redirectUrl = resolveUrl2(location, currentUrl);
25295
25295
  debug("redirecting to", redirectUrl.href);
25296
25296
  this._isRedirect = true;
25297
25297
  spreadUrlObject(redirectUrl, this._options);
@@ -25375,7 +25375,7 @@ var require_follow_redirects = __commonJS({
25375
25375
  }
25376
25376
  return parsed;
25377
25377
  }
25378
- function resolveUrl(relative, base) {
25378
+ function resolveUrl2(relative, base) {
25379
25379
  return useNativeURL ? new URL2(relative, base) : parseUrl(url2.resolve(base, relative));
25380
25380
  }
25381
25381
  function validateUrl(input) {
@@ -44720,6 +44720,95 @@ var XMLParser = class {
44720
44720
  }
44721
44721
  };
44722
44722
 
44723
+ // src/primitives/data-sources/html-fetch.ts
44724
+ init_define_BUILD();
44725
+ init_define_PACKAGE_VERSIONS();
44726
/**
 * Extract the title, description and favicon reference from an HTML document.
 *
 * Tags are located with regular expressions (no DOM parser is involved), but
 * their attributes are read independently of order and quoting style, so
 * `<meta content="..." name="description">` and
 * `<link href='/a.png' rel="icon">` are both recognized.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {{ title?: string, description?: string, favicon: string }}
 *   `title` and `description` are present only when found; `favicon` is the
 *   href exactly as written in the document (possibly relative) and falls
 *   back to "/favicon.ico" when no icon link is declared.
 */
function extractHtmlMetadata(html) {
  // Builds a matcher for one opening tag; quoted attribute values may contain ">".
  const tagPattern = (tagName) => new RegExp(`<${tagName}\\s(?:"[^"]*"|'[^']*'|[^>"'])*>`, "gi");

  // Reads name=value pairs of a single tag into a Map keyed by lower-cased
  // attribute name. The first occurrence of a duplicated attribute wins.
  const readAttributes = (tag) => {
    const attributes = new Map();
    const attributePattern = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;
    for (const [, rawName, doubleQuoted, singleQuoted, bare] of tag.matchAll(attributePattern)) {
      const name = rawName.toLowerCase();
      if (!attributes.has(name)) {
        attributes.set(name, doubleQuoted ?? singleQuoted ?? bare ?? "");
      }
    }
    return attributes;
  };

  const isDescriptionKey = (value) => {
    const key = (value ?? "").trim().toLowerCase();
    return key === "description" || key === "og:description";
  };

  const metadata = {};

  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  if (titleMatch && titleMatch[1]) {
    metadata.title = titleMatch[1].trim();
  }

  // First <meta> whose name/property is (og:)description and whose content is non-empty.
  for (const [tag] of html.matchAll(tagPattern("meta"))) {
    const attributes = readAttributes(tag);
    const content = attributes.get("content")?.trim();
    if (content && (isDescriptionKey(attributes.get("name")) || isDescriptionKey(attributes.get("property")))) {
      metadata.description = content;
      break;
    }
  }

  // First <link> whose rel token list names an icon ("icon", "shortcut icon",
  // "apple-touch-icon", in any token order) and that carries a non-empty href.
  for (const [tag] of html.matchAll(tagPattern("link"))) {
    const attributes = readAttributes(tag);
    const href = attributes.get("href")?.trim();
    const relTokens = (attributes.get("rel") ?? "").toLowerCase().split(/\s+/);
    if (href && relTokens.some((token) => token === "icon" || token === "apple-touch-icon")) {
      metadata.favicon = href;
      break;
    }
  }

  if (!metadata.favicon) {
    metadata.favicon = "/favicon.ico";
  }
  return metadata;
}
44765
/**
 * Resolve a possibly-relative URL against the URL of the page it was found on.
 *
 * Absolute http(s) URLs are returned untouched. Everything else is resolved
 * against the full page URL (not just its origin), so a path-relative
 * reference such as "icons/fav.png" found on "https://a.com/blog/post.html"
 * becomes "https://a.com/blog/icons/fav.png". Root-relative ("/x") and
 * protocol-relative ("//host/x") references resolve as usual.
 *
 * @param {string} url2 - The URL or reference to resolve.
 * @param {string} baseUrl - URL of the document containing the reference.
 * @returns {string} The resolved absolute URL, or `url2` unchanged when
 *   resolution fails (for example when `baseUrl` is not a valid URL).
 */
function resolveUrl(url2, baseUrl) {
  if (url2.startsWith("http://") || url2.startsWith("https://")) {
    return url2;
  }
  try {
    return new URL(url2, baseUrl).href;
  } catch {
    // Best-effort: an unparsable base leaves the reference as written.
    return url2;
  }
}
44776
/**
 * Fetch a URL with the global `fetch` and, for HTML responses, extract page metadata.
 *
 * @param {string} url2 - URL to request.
 * @param {{ userAgent?: string, timeout?: number }} [options]
 *   `userAgent` overrides the default bot User-Agent header; `timeout`
 *   (milliseconds, passed to `AbortSignal.timeout`) aborts the request when
 *   set to a truthy value.
 * @returns {Promise<{ url: string, contentType: string, content: string, metadata?: object }>}
 *   The response body as text plus the Content-Type header as received
 *   ("text/html" when the header is absent). `metadata` is present only for
 *   HTML/XHTML responses, with its favicon resolved against `url2`.
 * @throws {Error} When the response status is not OK.
 */
async function fetchHtml(url2, options) {
  const userAgent = options?.userAgent || "Mozilla/5.0 (compatible; BotpressBot/1.0)";
  const fetchOptions = {
    headers: {
      "User-Agent": userAgent
    }
  };
  if (options?.timeout) {
    fetchOptions.signal = AbortSignal.timeout(options.timeout);
  }

  const response = await fetch(url2, fetchOptions);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url2}: ${response.status} ${response.statusText}`);
  }

  const contentType = response.headers.get("content-type") || "text/html";
  const content = await response.text();

  // Media types are case-insensitive, so compare a lower-cased copy while
  // returning the header value exactly as the server sent it.
  const mediaType = contentType.toLowerCase();
  const isHtml = mediaType.includes("text/html") || mediaType.includes("application/xhtml");
  if (!isHtml) {
    return {
      url: url2,
      contentType,
      content
    };
  }

  const extracted = extractHtmlMetadata(content);
  if (extracted.favicon) {
    extracted.favicon = resolveUrl(extracted.favicon, url2);
  }
  return {
    url: url2,
    contentType,
    content,
    metadata: extracted
  };
}
44811
+
44723
44812
  // src/primitives/data-sources/source-website.ts
44724
44813
  var State = z19.object({
44725
44814
  urls: z19.array(
@@ -44739,6 +44828,7 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
44739
44828
  urls;
44740
44829
  filterFn;
44741
44830
  customFetch;
44831
+ fetchStrategy;
44742
44832
  maxPages;
44743
44833
  maxDepth;
44744
44834
  transformFn;
@@ -44749,7 +44839,16 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
44749
44839
  this.sitemapUrl = options.sitemapUrl ?? void 0;
44750
44840
  this.urls = options.urls ?? void 0;
44751
44841
  this.filterFn = "filter" in options ? options.filter : void 0;
44752
- this.customFetch = options.fetch ?? void 0;
44842
+ if (typeof options.fetch === "string") {
44843
+ this.fetchStrategy = options.fetch;
44844
+ this.customFetch = void 0;
44845
+ } else if (typeof options.fetch === "function") {
44846
+ this.customFetch = options.fetch;
44847
+ this.fetchStrategy = "node:fetch";
44848
+ } else {
44849
+ this.fetchStrategy = "node:fetch";
44850
+ this.customFetch = void 0;
44851
+ }
44753
44852
  this.maxPages = Math.max(1, Math.min(("maxPages" in options ? options.maxPages : void 0) ?? 5e4, 5e4));
44754
44853
  this.maxDepth = Math.max(1, Math.min(("maxDepth" in options ? options.maxDepth : void 0) ?? 20, 20));
44755
44854
  }
@@ -44757,51 +44856,82 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
44757
44856
  return !!adk.project.integrations.get("browser");
44758
44857
  }
44759
44858
  /**
44760
- * Fetch content from a URL with fallback strategy
44859
+ * Convert HtmlMetadata to FetchResult metadata format
44860
+ */
44861
+ convertMetadata(metadata) {
44862
+ const result = {};
44863
+ if (metadata.title) {
44864
+ result[WellKnownMetadata.knowledge.TITLE] = metadata.title;
44865
+ }
44866
+ if (metadata.description) {
44867
+ result[WellKnownMetadata.knowledge.DESCRIPTION] = metadata.description;
44868
+ }
44869
+ if (metadata.favicon) {
44870
+ result[WellKnownMetadata.knowledge.FAVICON] = metadata.favicon;
44871
+ }
44872
+ return result;
44873
+ }
44874
+ /**
44875
+ * Default fetch implementation using Node's built-in fetch
44876
+ */
44877
+ async defaultFetch(url2) {
44878
+ const result = await fetchHtml(url2, {
44879
+ timeout: 3e4
44880
+ });
44881
+ if (!result.metadata) {
44882
+ return {
44883
+ url: result.url,
44884
+ contentType: result.contentType,
44885
+ content: result.content
44886
+ };
44887
+ }
44888
+ return {
44889
+ url: result.url,
44890
+ contentType: result.contentType,
44891
+ content: result.content,
44892
+ metadata: this.convertMetadata(result.metadata)
44893
+ };
44894
+ }
44895
+ /**
44896
+ * Fetch content from a URL for sitemap parsing (raw content needed)
44761
44897
  */
44762
44898
  async fetchSitemap(url2) {
44763
44899
  if (this.customFetch) {
44764
44900
  try {
44765
44901
  return await this.customFetch(url2);
44766
44902
  } catch (err) {
44767
- console.warn(`Custom fetch failed for ${url2}, falling back...`);
44903
+ console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
44768
44904
  }
44769
44905
  }
44770
- if (!this.isBrowserIntegrationAvailable()) {
44771
- throw new Error(
44772
- `The 'browser' integration is not installed and is required for crawling website. Please provide a custom fetch function or install the 'browser' integration.`
44773
- );
44774
- }
44775
- const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
44776
- urls: [url2],
44777
- timeout: 3e4,
44778
- waitFor: 500
44779
- });
44780
- const result = output2?.results[0];
44781
- if (!result || !result.content) {
44782
- throw new Error(`Failed to fetch content from ${url2}`);
44906
+ if (this.fetchStrategy === "integration:browser") {
44907
+ return this.fetchWithBrowserIntegration(url2, { raw: true });
44908
+ } else {
44909
+ return this.defaultFetch(url2);
44783
44910
  }
44784
- return {
44785
- url: result.url,
44786
- contentType: "application/html",
44787
- content: result.raw
44788
- };
44789
44911
  }
44790
44912
  /**
44791
- * Fetch content from a URL with fallback strategy
44913
+ * Fetch content from a URL for indexing (with metadata extraction)
44792
44914
  */
44793
44915
  async fetchUrl(url2) {
44794
44916
  if (this.customFetch) {
44795
44917
  try {
44796
44918
  return await this.customFetch(url2);
44797
44919
  } catch (err) {
44798
- console.warn(`Custom fetch failed for ${url2}, falling back...`);
44920
+ console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
44799
44921
  }
44800
44922
  }
44923
+ if (this.fetchStrategy === "integration:browser") {
44924
+ return this.fetchWithBrowserIntegration(url2, { raw: false });
44925
+ } else {
44926
+ return this.defaultFetch(url2);
44927
+ }
44928
+ }
44929
+ /**
44930
+ * Fetch content using the browser integration
44931
+ */
44932
+ async fetchWithBrowserIntegration(url2, options) {
44801
44933
  if (!this.isBrowserIntegrationAvailable()) {
44802
- throw new Error(
44803
- `The 'browser' integration is not installed and is required for crawling website. Please provide a custom fetch function or install the 'browser' integration.`
44804
- );
44934
+ throw new Error(`The 'browser' integration is not installed. Please install it or use fetch: 'node:fetch'.`);
44805
44935
  }
44806
44936
  const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
44807
44937
  urls: [url2],
@@ -44812,6 +44942,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
44812
44942
  if (!result || !result.content) {
44813
44943
  throw new Error(`Failed to fetch content from ${url2}`);
44814
44944
  }
44945
+ if (options.raw && result.raw) {
44946
+ return {
44947
+ url: result.url,
44948
+ contentType: "application/html",
44949
+ content: result.raw
44950
+ };
44951
+ }
44815
44952
  return {
44816
44953
  url: result.url,
44817
44954
  contentType: "text/markdown",
@@ -45040,6 +45177,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
45040
45177
  const toRemove = existingFiles.filter(
45041
45178
  (f) => !discoveredUrls.find((u) => u.loc === f.metadata?.[WellKnownMetadata.knowledge.URL])
45042
45179
  );
45180
+ if (existingFiles.length > 0 && toRemove.length >= existingFiles.length * 0.8) {
45181
+ console.error(
45182
+ `Warning: All existing files (${existingFiles.length}) are scheduled for removal. Please check if the sitemap URL is correct and the website is accessible. We will try again in 5 minutes.`
45183
+ );
45184
+ await step2.sleep("retry wait", 5 * 60 * 1e3);
45185
+ throw new Error("Aborting sync due to potential misconfiguration (all files to be removed)");
45186
+ }
45043
45187
  const toFetch = [];
45044
45188
  let skippedUnchanged = 0;
45045
45189
  for (const url2 of discoveredUrls) {