@botpress/runtime 1.6.4 → 1.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/definition.js +173 -29
- package/dist/definition.js.map +4 -4
- package/dist/internal.js +173 -29
- package/dist/internal.js.map +4 -4
- package/dist/library.js +173 -29
- package/dist/library.js.map +4 -4
- package/dist/primitives/data-sources/html-fetch.d.ts +57 -0
- package/dist/primitives/data-sources/html-fetch.d.ts.map +1 -0
- package/dist/primitives/data-sources/source-website.d.ts +69 -4
- package/dist/primitives/data-sources/source-website.d.ts.map +1 -1
- package/dist/runtime.js +173 -29
- package/dist/runtime.js.map +4 -4
- package/package.json +1 -1
package/dist/internal.js
CHANGED
|
@@ -48,7 +48,7 @@ var init_define_BUILD = __esm({
|
|
|
48
48
|
var define_PACKAGE_VERSIONS_default;
|
|
49
49
|
var init_define_PACKAGE_VERSIONS = __esm({
|
|
50
50
|
"<define:__PACKAGE_VERSIONS__>"() {
|
|
51
|
-
define_PACKAGE_VERSIONS_default = { runtime: "1.6.
|
|
51
|
+
define_PACKAGE_VERSIONS_default = { runtime: "1.6.6", adk: "not-installed", sdk: "4.17.3", llmz: "0.0.27", zai: "2.4.0", cognitive: "0.2.0" };
|
|
52
52
|
}
|
|
53
53
|
});
|
|
54
54
|
|
|
@@ -25291,7 +25291,7 @@ var require_follow_redirects = __commonJS({
|
|
|
25291
25291
|
var currentUrlParts = parseUrl(this._currentUrl);
|
|
25292
25292
|
var currentHost = currentHostHeader || currentUrlParts.host;
|
|
25293
25293
|
var currentUrl = /^\w+:/.test(location) ? this._currentUrl : url2.format(Object.assign(currentUrlParts, { host: currentHost }));
|
|
25294
|
-
var redirectUrl =
|
|
25294
|
+
var redirectUrl = resolveUrl2(location, currentUrl);
|
|
25295
25295
|
debug("redirecting to", redirectUrl.href);
|
|
25296
25296
|
this._isRedirect = true;
|
|
25297
25297
|
spreadUrlObject(redirectUrl, this._options);
|
|
@@ -25375,7 +25375,7 @@ var require_follow_redirects = __commonJS({
|
|
|
25375
25375
|
}
|
|
25376
25376
|
return parsed;
|
|
25377
25377
|
}
|
|
25378
|
-
function
|
|
25378
|
+
function resolveUrl2(relative, base) {
  // When useNativeURL is set, build the result with the URL2 constructor;
  // otherwise resolve through url2.resolve() and parse the resulting string.
  if (useNativeURL) {
    return new URL2(relative, base);
  }
  return parseUrl(url2.resolve(base, relative));
}
|
|
25381
25381
|
function validateUrl(input) {
|
|
@@ -44688,6 +44688,95 @@ var XMLParser = class {
|
|
|
44688
44688
|
}
|
|
44689
44689
|
};
|
|
44690
44690
|
|
|
44691
|
+
// src/primitives/data-sources/html-fetch.ts
|
|
44692
|
+
init_define_BUILD();
|
|
44693
|
+
init_define_PACKAGE_VERSIONS();
|
|
44694
|
+
/**
 * Extract the title, description and favicon reference from an HTML document.
 *
 * `<meta>` and `<link>` tags are parsed attribute by attribute, so matching
 * does not depend on attribute order, on which quote character each attribute
 * uses, or on other attributes sitting between the ones we care about.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {{ title?: string, description?: string, favicon: string }}
 *   The metadata found. `favicon` is returned exactly as written in the
 *   markup (it is not resolved here) and falls back to "/favicon.ico" when
 *   the document declares no icon link.
 */
function extractHtmlMetadata(html) {
  const metadata = {};

  // Collect every start tag with the given name. Quoted attribute values are
  // consumed as a unit so that a ">" inside a value does not end the tag early.
  const listTags = (tagName) => html.match(new RegExp(`<${tagName}\\b(?:"[^"]*"|'[^']*'|[^'">])*>`, "gi")) ?? [];

  // Read the attributes of one start tag into a Map keyed by lower-cased name.
  // The first occurrence of a duplicated attribute wins.
  const readAttributes = (tag) => {
    const attributes = new Map();
    const attributePattern = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;
    for (const [, name, doubleQuoted, singleQuoted, unquoted] of tag.matchAll(attributePattern)) {
      const key = name.toLowerCase();
      if (!attributes.has(key)) {
        attributes.set(key, (doubleQuoted ?? singleQuoted ?? unquoted ?? "").trim());
      }
    }
    return attributes;
  };

  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  const title = titleMatch?.[1]?.trim();
  if (title) {
    metadata.title = title;
  }

  // First <meta> in document order that is a (og:)description with non-empty content.
  const descriptionNames = new Set(["description", "og:description"]);
  for (const tag of listTags("meta")) {
    const attributes = readAttributes(tag);
    const isDescription = [attributes.get("name"), attributes.get("property")].some(
      (value) => value !== void 0 && descriptionNames.has(value.toLowerCase())
    );
    const content = attributes.get("content");
    if (isDescription && content) {
      metadata.description = content;
      break;
    }
  }

  // First <link> in document order whose rel tokens include an icon type.
  // Token matching accepts "icon", "shortcut icon" and "apple-touch-icon"
  // while still rejecting unrelated values such as "mask-icon".
  const iconRelTokens = new Set(["icon", "apple-touch-icon"]);
  for (const tag of listTags("link")) {
    const attributes = readAttributes(tag);
    const relTokens = (attributes.get("rel") ?? "").toLowerCase().split(/\s+/);
    const href = attributes.get("href");
    if (href && relTokens.some((token) => iconRelTokens.has(token))) {
      metadata.favicon = href;
      break;
    }
  }
  if (!metadata.favicon) {
    metadata.favicon = "/favicon.ico";
  }
  return metadata;
}
|
|
44733
|
+
/**
 * Resolve a URL reference found in a page against that page's URL.
 *
 * Absolute http(s) URLs are returned untouched. Everything else is resolved
 * against the full `baseUrl` (not just its origin), so document-relative
 * references such as "icons/f.png" or "../f.ico" land next to the page that
 * declared them, while root-relative ("/f.ico") and protocol-relative
 * ("//cdn/f.ico") references behave as before.
 *
 * @param {string} url2 - The reference to resolve.
 * @param {string} baseUrl - Absolute URL of the document containing the reference.
 * @returns {string} The absolute URL, or `url2` unchanged when it cannot be
 *   resolved (for example when `baseUrl` is not a valid absolute URL).
 */
function resolveUrl(url2, baseUrl) {
  if (url2.startsWith("http://") || url2.startsWith("https://")) {
    return url2;
  }
  try {
    return new URL(url2, baseUrl).href;
  } catch {
    // Best effort: an unparsable base leaves the reference as written.
    return url2;
  }
}
|
|
44744
|
+
/**
 * Fetch a URL with the global `fetch` and, for HTML responses, extract page
 * metadata (title, description, favicon).
 *
 * @param {string} url2 - URL to request.
 * @param {{ userAgent?: string, timeout?: number }} [options] - Optional
 *   User-Agent override and a timeout in milliseconds (passed to
 *   `AbortSignal.timeout`; a falsy value means no timeout).
 * @returns {Promise<{ url: string, contentType: string, content: string, metadata?: object }>}
 *   The requested URL, the response content type (defaulting to "text/html"
 *   when the header is absent), the body text and, for HTML/XHTML bodies
 *   only, the extracted metadata.
 * @throws {Error} When the response status is not OK.
 */
async function fetchHtml(url2, options) {
  const userAgent = options?.userAgent || "Mozilla/5.0 (compatible; BotpressBot/1.0)";
  const fetchOptions = {
    headers: {
      "User-Agent": userAgent
    }
  };
  if (options?.timeout) {
    fetchOptions.signal = AbortSignal.timeout(options.timeout);
  }
  const response = await fetch(url2, fetchOptions);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url2}: ${response.status} ${response.statusText}`);
  }
  const contentType = response.headers.get("content-type") || "text/html";
  const content = await response.text();
  // Media types are case-insensitive, so compare on a lower-cased copy while
  // returning the header value exactly as the server sent it.
  const normalizedType = contentType.toLowerCase();
  const isHtml = normalizedType.includes("text/html") || normalizedType.includes("application/xhtml");
  if (!isHtml) {
    return {
      url: url2,
      contentType,
      content
    };
  }
  const extracted = extractHtmlMetadata(content);
  if (extracted.favicon) {
    // Relative references belong to the document that was actually served,
    // which after redirects is response.url rather than the requested URL.
    extracted.favicon = resolveUrl(extracted.favicon, response.url || url2);
  }
  return {
    url: url2,
    contentType,
    content,
    metadata: extracted
  };
}
|
|
44779
|
+
|
|
44691
44780
|
// src/primitives/data-sources/source-website.ts
|
|
44692
44781
|
var State = z20.object({
|
|
44693
44782
|
urls: z20.array(
|
|
@@ -44707,6 +44796,7 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
44707
44796
|
urls;
|
|
44708
44797
|
filterFn;
|
|
44709
44798
|
customFetch;
|
|
44799
|
+
fetchStrategy;
|
|
44710
44800
|
maxPages;
|
|
44711
44801
|
maxDepth;
|
|
44712
44802
|
transformFn;
|
|
@@ -44717,7 +44807,16 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
44717
44807
|
this.sitemapUrl = options.sitemapUrl ?? void 0;
|
|
44718
44808
|
this.urls = options.urls ?? void 0;
|
|
44719
44809
|
this.filterFn = "filter" in options ? options.filter : void 0;
|
|
44720
|
-
|
|
44810
|
+
if (typeof options.fetch === "string") {
|
|
44811
|
+
this.fetchStrategy = options.fetch;
|
|
44812
|
+
this.customFetch = void 0;
|
|
44813
|
+
} else if (typeof options.fetch === "function") {
|
|
44814
|
+
this.customFetch = options.fetch;
|
|
44815
|
+
this.fetchStrategy = "node:fetch";
|
|
44816
|
+
} else {
|
|
44817
|
+
this.fetchStrategy = "node:fetch";
|
|
44818
|
+
this.customFetch = void 0;
|
|
44819
|
+
}
|
|
44721
44820
|
this.maxPages = Math.max(1, Math.min(("maxPages" in options ? options.maxPages : void 0) ?? 5e4, 5e4));
|
|
44722
44821
|
this.maxDepth = Math.max(1, Math.min(("maxDepth" in options ? options.maxDepth : void 0) ?? 20, 20));
|
|
44723
44822
|
}
|
|
@@ -44725,51 +44824,82 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
44725
44824
|
return !!adk.project.integrations.get("browser");
|
|
44726
44825
|
}
|
|
44727
44826
|
/**
|
|
44728
|
-
*
|
|
44827
|
+
* Convert HtmlMetadata to FetchResult metadata format
|
|
44828
|
+
*/
|
|
44829
|
+
convertMetadata(metadata) {
|
|
44830
|
+
const result = {};
|
|
44831
|
+
if (metadata.title) {
|
|
44832
|
+
result[WellKnownMetadata.knowledge.TITLE] = metadata.title;
|
|
44833
|
+
}
|
|
44834
|
+
if (metadata.description) {
|
|
44835
|
+
result[WellKnownMetadata.knowledge.DESCRIPTION] = metadata.description;
|
|
44836
|
+
}
|
|
44837
|
+
if (metadata.favicon) {
|
|
44838
|
+
result[WellKnownMetadata.knowledge.FAVICON] = metadata.favicon;
|
|
44839
|
+
}
|
|
44840
|
+
return result;
|
|
44841
|
+
}
|
|
44842
|
+
/**
|
|
44843
|
+
* Default fetch implementation using Node's built-in fetch
|
|
44844
|
+
*/
|
|
44845
|
+
async defaultFetch(url2) {
|
|
44846
|
+
const result = await fetchHtml(url2, {
|
|
44847
|
+
timeout: 3e4
|
|
44848
|
+
});
|
|
44849
|
+
if (!result.metadata) {
|
|
44850
|
+
return {
|
|
44851
|
+
url: result.url,
|
|
44852
|
+
contentType: result.contentType,
|
|
44853
|
+
content: result.content
|
|
44854
|
+
};
|
|
44855
|
+
}
|
|
44856
|
+
return {
|
|
44857
|
+
url: result.url,
|
|
44858
|
+
contentType: result.contentType,
|
|
44859
|
+
content: result.content,
|
|
44860
|
+
metadata: this.convertMetadata(result.metadata)
|
|
44861
|
+
};
|
|
44862
|
+
}
|
|
44863
|
+
/**
|
|
44864
|
+
* Fetch content from a URL for sitemap parsing (raw content needed)
|
|
44729
44865
|
*/
|
|
44730
44866
|
async fetchSitemap(url2) {
|
|
44731
44867
|
if (this.customFetch) {
|
|
44732
44868
|
try {
|
|
44733
44869
|
return await this.customFetch(url2);
|
|
44734
44870
|
} catch (err) {
|
|
44735
|
-
console.warn(`Custom fetch failed for ${url2}, falling back...`);
|
|
44871
|
+
console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
|
|
44736
44872
|
}
|
|
44737
44873
|
}
|
|
44738
|
-
if (
|
|
44739
|
-
|
|
44740
|
-
|
|
44741
|
-
);
|
|
44742
|
-
}
|
|
44743
|
-
const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
|
|
44744
|
-
urls: [url2],
|
|
44745
|
-
timeout: 3e4,
|
|
44746
|
-
waitFor: 500
|
|
44747
|
-
});
|
|
44748
|
-
const result = output2?.results[0];
|
|
44749
|
-
if (!result || !result.content) {
|
|
44750
|
-
throw new Error(`Failed to fetch content from ${url2}`);
|
|
44874
|
+
if (this.fetchStrategy === "integration:browser") {
|
|
44875
|
+
return this.fetchWithBrowserIntegration(url2, { raw: true });
|
|
44876
|
+
} else {
|
|
44877
|
+
return this.defaultFetch(url2);
|
|
44751
44878
|
}
|
|
44752
|
-
return {
|
|
44753
|
-
url: result.url,
|
|
44754
|
-
contentType: "application/html",
|
|
44755
|
-
content: result.raw
|
|
44756
|
-
};
|
|
44757
44879
|
}
|
|
44758
44880
|
/**
|
|
44759
|
-
* Fetch content from a URL with
|
|
44881
|
+
* Fetch content from a URL for indexing (with metadata extraction)
|
|
44760
44882
|
*/
|
|
44761
44883
|
async fetchUrl(url2) {
|
|
44762
44884
|
if (this.customFetch) {
|
|
44763
44885
|
try {
|
|
44764
44886
|
return await this.customFetch(url2);
|
|
44765
44887
|
} catch (err) {
|
|
44766
|
-
console.warn(`Custom fetch failed for ${url2}, falling back...`);
|
|
44888
|
+
console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
|
|
44767
44889
|
}
|
|
44768
44890
|
}
|
|
44891
|
+
if (this.fetchStrategy === "integration:browser") {
|
|
44892
|
+
return this.fetchWithBrowserIntegration(url2, { raw: false });
|
|
44893
|
+
} else {
|
|
44894
|
+
return this.defaultFetch(url2);
|
|
44895
|
+
}
|
|
44896
|
+
}
|
|
44897
|
+
/**
|
|
44898
|
+
* Fetch content using the browser integration
|
|
44899
|
+
*/
|
|
44900
|
+
async fetchWithBrowserIntegration(url2, options) {
|
|
44769
44901
|
if (!this.isBrowserIntegrationAvailable()) {
|
|
44770
|
-
throw new Error(
|
|
44771
|
-
`The 'browser' integration is not installed and is required for crawling website. Please provide a custom fetch function or install the 'browser' integration.`
|
|
44772
|
-
);
|
|
44902
|
+
throw new Error(`The 'browser' integration is not installed. Please install it or use fetch: 'node:fetch'.`);
|
|
44773
44903
|
}
|
|
44774
44904
|
const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
|
|
44775
44905
|
urls: [url2],
|
|
@@ -44780,6 +44910,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
44780
44910
|
if (!result || !result.content) {
|
|
44781
44911
|
throw new Error(`Failed to fetch content from ${url2}`);
|
|
44782
44912
|
}
|
|
44913
|
+
if (options.raw && result.raw) {
|
|
44914
|
+
return {
|
|
44915
|
+
url: result.url,
|
|
44916
|
+
contentType: "application/html",
|
|
44917
|
+
content: result.raw
|
|
44918
|
+
};
|
|
44919
|
+
}
|
|
44783
44920
|
return {
|
|
44784
44921
|
url: result.url,
|
|
44785
44922
|
contentType: "text/markdown",
|
|
@@ -45008,6 +45145,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
45008
45145
|
const toRemove = existingFiles.filter(
|
|
45009
45146
|
(f) => !discoveredUrls.find((u) => u.loc === f.metadata?.[WellKnownMetadata.knowledge.URL])
|
|
45010
45147
|
);
|
|
45148
|
+
if (existingFiles.length > 0 && toRemove.length >= existingFiles.length * 0.8) {
|
|
45149
|
+
console.error(
|
|
45150
|
+
`Warning: All existing files (${existingFiles.length}) are scheduled for removal. Please check if the sitemap URL is correct and the website is accessible. We will try again in 5 minutes.`
|
|
45151
|
+
);
|
|
45152
|
+
await step2.sleep("retry wait", 5 * 60 * 1e3);
|
|
45153
|
+
throw new Error("Aborting sync due to potential misconfiguration (all files to be removed)");
|
|
45154
|
+
}
|
|
45011
45155
|
const toFetch = [];
|
|
45012
45156
|
let skippedUnchanged = 0;
|
|
45013
45157
|
for (const url2 of discoveredUrls) {
|