@botpress/runtime 1.6.4 → 1.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/definition.js +173 -29
- package/dist/definition.js.map +4 -4
- package/dist/internal.js +173 -29
- package/dist/internal.js.map +4 -4
- package/dist/library.js +173 -29
- package/dist/library.js.map +4 -4
- package/dist/primitives/data-sources/html-fetch.d.ts +57 -0
- package/dist/primitives/data-sources/html-fetch.d.ts.map +1 -0
- package/dist/primitives/data-sources/source-website.d.ts +69 -4
- package/dist/primitives/data-sources/source-website.d.ts.map +1 -1
- package/dist/runtime.js +173 -29
- package/dist/runtime.js.map +4 -4
- package/package.json +1 -1
package/dist/library.js
CHANGED
|
@@ -48,7 +48,7 @@ var init_define_BUILD = __esm({
|
|
|
48
48
|
var define_PACKAGE_VERSIONS_default;
|
|
49
49
|
var init_define_PACKAGE_VERSIONS = __esm({
|
|
50
50
|
"<define:__PACKAGE_VERSIONS__>"() {
|
|
51
|
-
define_PACKAGE_VERSIONS_default = { runtime: "1.6.
|
|
51
|
+
define_PACKAGE_VERSIONS_default = { runtime: "1.6.6", adk: "not-installed", sdk: "4.17.3", llmz: "0.0.27", zai: "2.4.0", cognitive: "0.2.0" };
|
|
52
52
|
}
|
|
53
53
|
});
|
|
54
54
|
|
|
@@ -25291,7 +25291,7 @@ var require_follow_redirects = __commonJS({
|
|
|
25291
25291
|
var currentUrlParts = parseUrl(this._currentUrl);
|
|
25292
25292
|
var currentHost = currentHostHeader || currentUrlParts.host;
|
|
25293
25293
|
var currentUrl = /^\w+:/.test(location) ? this._currentUrl : url2.format(Object.assign(currentUrlParts, { host: currentHost }));
|
|
25294
|
-
var redirectUrl =
|
|
25294
|
+
var redirectUrl = resolveUrl2(location, currentUrl);
|
|
25295
25295
|
debug("redirecting to", redirectUrl.href);
|
|
25296
25296
|
this._isRedirect = true;
|
|
25297
25297
|
spreadUrlObject(redirectUrl, this._options);
|
|
@@ -25375,7 +25375,7 @@ var require_follow_redirects = __commonJS({
|
|
|
25375
25375
|
}
|
|
25376
25376
|
return parsed;
|
|
25377
25377
|
}
|
|
25378
|
-
function
|
|
25378
|
+
function resolveUrl2(relative, base) {
  // When useNativeURL is truthy, build the result directly with the URL2 constructor.
  if (useNativeURL) {
    return new URL2(relative, base);
  }
  // Otherwise resolve through url2.resolve and hand the resulting string to parseUrl.
  const resolved = url2.resolve(base, relative);
  return parseUrl(resolved);
}
|
|
25381
25381
|
function validateUrl(input) {
|
|
@@ -44720,6 +44720,95 @@ var XMLParser = class {
|
|
|
44720
44720
|
}
|
|
44721
44721
|
};
|
|
44722
44722
|
|
|
44723
|
+
// src/primitives/data-sources/html-fetch.ts
|
|
44724
|
+
init_define_BUILD();
|
|
44725
|
+
init_define_PACKAGE_VERSIONS();
|
|
44726
|
+
/**
 * Extract basic page metadata from a raw HTML string.
 *
 * - `title`: trimmed text of the first `<title>` element, when present.
 * - `description`: trimmed `content` of the first `<meta>` tag whose `name` or
 *   `property` is `description` or `og:description`, when present.
 * - `favicon`: trimmed `href` of the first `<link>` tag whose `rel` token list
 *   contains `icon` or `apple-touch-icon`; falls back to "/favicon.ico".
 *
 * Attributes are parsed individually, so attribute order, extra attributes and
 * the quoting style (double, single, unquoted, or mixed) do not matter.
 *
 * @param {string} html - Raw HTML document text.
 * @returns {{ title?: string, description?: string, favicon: string }}
 */
function extractHtmlMetadata(html) {
  const metadata = {};

  // One attribute: a name, optionally followed by a double-quoted,
  // single-quoted or unquoted value.
  const attributePattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;

  // Parse the attribute section of a tag into a Map keyed by lower-cased name.
  // The first occurrence of a duplicated attribute wins.
  const parseAttributes = (attributeText) => {
    const attributes = new Map();
    for (const [, name, doubleQuoted, singleQuoted, unquoted] of attributeText.matchAll(attributePattern)) {
      const key = name.toLowerCase();
      if (!attributes.has(key)) {
        attributes.set(key, doubleQuoted ?? singleQuoted ?? unquoted ?? "");
      }
    }
    return attributes;
  };

  // Walk every `<tagName ...>` in document order and return the first truthy
  // value produced by `pick`. Quoted values are consumed as a unit so a ">"
  // inside an attribute value does not end the tag early.
  const findInTags = (tagName, pick) => {
    const tagPattern = new RegExp(`<${tagName}(?=[\\s/>])((?:"[^"]*"|'[^']*'|[^'">])*)>`, "gi");
    for (const [, attributeText] of html.matchAll(tagPattern)) {
      const value = pick(parseAttributes(attributeText));
      if (value) {
        return value;
      }
    }
    return undefined;
  };

  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  if (titleMatch && titleMatch[1]) {
    metadata.title = titleMatch[1].trim();
  }

  const descriptionKeys = ["description", "og:description"];
  const description = findInTags("meta", (attributes) => {
    const isDescription = [attributes.get("name"), attributes.get("property")].some(
      (value) => value !== undefined && descriptionKeys.includes(value.trim().toLowerCase())
    );
    return isDescription ? attributes.get("content")?.trim() : undefined;
  });
  if (description) {
    metadata.description = description;
  }

  const favicon = findInTags("link", (attributes) => {
    // `rel` is a whitespace-separated token list ("shortcut icon" contains "icon").
    const relTokens = (attributes.get("rel") ?? "").toLowerCase().split(/\s+/);
    const isIcon = relTokens.includes("icon") || relTokens.includes("apple-touch-icon");
    return isIcon ? attributes.get("href")?.trim() : undefined;
  });
  // Conventional default location when the page declares no icon.
  metadata.favicon = favicon || "/favicon.ico";

  return metadata;
}
|
|
44765
|
+
/**
 * Resolve a possibly-relative URL reference against the URL of the page it
 * was found on.
 *
 * Absolute http(s) URLs are returned untouched. Everything else (root-relative,
 * directory-relative, protocol-relative) is resolved against the full
 * `baseUrl`, so "img/icon.png" found on "https://site/blog/post" becomes
 * "https://site/blog/img/icon.png" rather than being re-rooted at the origin.
 *
 * @param {string} url2 - URL reference to resolve.
 * @param {string} baseUrl - Absolute URL of the document containing the reference.
 * @returns {string} The resolved absolute URL, or `url2` unchanged when
 *   resolution is not possible (e.g. `baseUrl` is not a valid URL).
 */
function resolveUrl(url2, baseUrl) {
  if (url2.startsWith("http://") || url2.startsWith("https://")) {
    return url2;
  }
  try {
    return new URL(url2, baseUrl).href;
  } catch {
    // Best effort: an unparsable base leaves the reference as it was written.
    return url2;
  }
}
|
|
44776
|
+
/**
 * Fetch a URL with the global `fetch` and, for HTML responses, extract page
 * metadata via `extractHtmlMetadata`.
 *
 * @param {string} url2 - URL to fetch.
 * @param {{ userAgent?: string, timeout?: number }} [options] - Optional
 *   User-Agent override and a timeout in milliseconds (passed to
 *   `AbortSignal.timeout`).
 * @returns {Promise<{ url: string, contentType: string, content: string, metadata?: object }>}
 *   `url` is the requested URL; `metadata` is only present for HTML responses.
 * @throws {Error} When the response status is not OK.
 */
async function fetchHtml(url2, options) {
  const userAgent = options?.userAgent || "Mozilla/5.0 (compatible; BotpressBot/1.0)";
  const fetchOptions = {
    headers: {
      "User-Agent": userAgent
    }
  };
  if (options?.timeout) {
    fetchOptions.signal = AbortSignal.timeout(options.timeout);
  }
  const response = await fetch(url2, fetchOptions);
  if (!response.ok) {
    // Best effort: discard the unread body so the connection is released
    // promptly; a failure to cancel must not mask the HTTP error below.
    await response.body?.cancel?.().catch(() => {});
    throw new Error(`Failed to fetch ${url2}: ${response.status} ${response.statusText}`);
  }
  const contentType = response.headers.get("content-type") || "text/html";
  const content = await response.text();
  // Media types are case-insensitive; compare on a lower-cased copy but return
  // the header value as received.
  const normalizedType = contentType.toLowerCase();
  const isHtml = normalizedType.includes("text/html") || normalizedType.includes("application/xhtml");
  if (!isHtml) {
    return {
      url: url2,
      contentType,
      content
    };
  }
  const extracted = extractHtmlMetadata(content);
  if (extracted.favicon) {
    // Relative icon paths belong to the document actually served, which after
    // redirects is `response.url` rather than the requested URL.
    extracted.favicon = resolveUrl(extracted.favicon, response.url || url2);
  }
  return {
    url: url2,
    contentType,
    content,
    metadata: extracted
  };
}
|
|
44811
|
+
|
|
44723
44812
|
// src/primitives/data-sources/source-website.ts
|
|
44724
44813
|
var State = z19.object({
|
|
44725
44814
|
urls: z19.array(
|
|
@@ -44739,6 +44828,7 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
44739
44828
|
urls;
|
|
44740
44829
|
filterFn;
|
|
44741
44830
|
customFetch;
|
|
44831
|
+
fetchStrategy;
|
|
44742
44832
|
maxPages;
|
|
44743
44833
|
maxDepth;
|
|
44744
44834
|
transformFn;
|
|
@@ -44749,7 +44839,16 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
44749
44839
|
this.sitemapUrl = options.sitemapUrl ?? void 0;
|
|
44750
44840
|
this.urls = options.urls ?? void 0;
|
|
44751
44841
|
this.filterFn = "filter" in options ? options.filter : void 0;
|
|
44752
|
-
|
|
44842
|
+
if (typeof options.fetch === "string") {
|
|
44843
|
+
this.fetchStrategy = options.fetch;
|
|
44844
|
+
this.customFetch = void 0;
|
|
44845
|
+
} else if (typeof options.fetch === "function") {
|
|
44846
|
+
this.customFetch = options.fetch;
|
|
44847
|
+
this.fetchStrategy = "node:fetch";
|
|
44848
|
+
} else {
|
|
44849
|
+
this.fetchStrategy = "node:fetch";
|
|
44850
|
+
this.customFetch = void 0;
|
|
44851
|
+
}
|
|
44753
44852
|
this.maxPages = Math.max(1, Math.min(("maxPages" in options ? options.maxPages : void 0) ?? 5e4, 5e4));
|
|
44754
44853
|
this.maxDepth = Math.max(1, Math.min(("maxDepth" in options ? options.maxDepth : void 0) ?? 20, 20));
|
|
44755
44854
|
}
|
|
@@ -44757,51 +44856,82 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
44757
44856
|
return !!adk.project.integrations.get("browser");
|
|
44758
44857
|
}
|
|
44759
44858
|
/**
|
|
44760
|
-
*
|
|
44859
|
+
* Convert HtmlMetadata to FetchResult metadata format
|
|
44860
|
+
*/
|
|
44861
|
+
convertMetadata(metadata) {
|
|
44862
|
+
const result = {};
|
|
44863
|
+
if (metadata.title) {
|
|
44864
|
+
result[WellKnownMetadata.knowledge.TITLE] = metadata.title;
|
|
44865
|
+
}
|
|
44866
|
+
if (metadata.description) {
|
|
44867
|
+
result[WellKnownMetadata.knowledge.DESCRIPTION] = metadata.description;
|
|
44868
|
+
}
|
|
44869
|
+
if (metadata.favicon) {
|
|
44870
|
+
result[WellKnownMetadata.knowledge.FAVICON] = metadata.favicon;
|
|
44871
|
+
}
|
|
44872
|
+
return result;
|
|
44873
|
+
}
|
|
44874
|
+
/**
|
|
44875
|
+
* Default fetch implementation using Node's built-in fetch
|
|
44876
|
+
*/
|
|
44877
|
+
async defaultFetch(url2) {
|
|
44878
|
+
const result = await fetchHtml(url2, {
|
|
44879
|
+
timeout: 3e4
|
|
44880
|
+
});
|
|
44881
|
+
if (!result.metadata) {
|
|
44882
|
+
return {
|
|
44883
|
+
url: result.url,
|
|
44884
|
+
contentType: result.contentType,
|
|
44885
|
+
content: result.content
|
|
44886
|
+
};
|
|
44887
|
+
}
|
|
44888
|
+
return {
|
|
44889
|
+
url: result.url,
|
|
44890
|
+
contentType: result.contentType,
|
|
44891
|
+
content: result.content,
|
|
44892
|
+
metadata: this.convertMetadata(result.metadata)
|
|
44893
|
+
};
|
|
44894
|
+
}
|
|
44895
|
+
/**
|
|
44896
|
+
* Fetch content from a URL for sitemap parsing (raw content needed)
|
|
44761
44897
|
*/
|
|
44762
44898
|
async fetchSitemap(url2) {
|
|
44763
44899
|
if (this.customFetch) {
|
|
44764
44900
|
try {
|
|
44765
44901
|
return await this.customFetch(url2);
|
|
44766
44902
|
} catch (err) {
|
|
44767
|
-
console.warn(`Custom fetch failed for ${url2}, falling back...`);
|
|
44903
|
+
console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
|
|
44768
44904
|
}
|
|
44769
44905
|
}
|
|
44770
|
-
if (
|
|
44771
|
-
|
|
44772
|
-
|
|
44773
|
-
);
|
|
44774
|
-
}
|
|
44775
|
-
const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
|
|
44776
|
-
urls: [url2],
|
|
44777
|
-
timeout: 3e4,
|
|
44778
|
-
waitFor: 500
|
|
44779
|
-
});
|
|
44780
|
-
const result = output2?.results[0];
|
|
44781
|
-
if (!result || !result.content) {
|
|
44782
|
-
throw new Error(`Failed to fetch content from ${url2}`);
|
|
44906
|
+
if (this.fetchStrategy === "integration:browser") {
|
|
44907
|
+
return this.fetchWithBrowserIntegration(url2, { raw: true });
|
|
44908
|
+
} else {
|
|
44909
|
+
return this.defaultFetch(url2);
|
|
44783
44910
|
}
|
|
44784
|
-
return {
|
|
44785
|
-
url: result.url,
|
|
44786
|
-
contentType: "application/html",
|
|
44787
|
-
content: result.raw
|
|
44788
|
-
};
|
|
44789
44911
|
}
|
|
44790
44912
|
/**
|
|
44791
|
-
* Fetch content from a URL with
|
|
44913
|
+
* Fetch content from a URL for indexing (with metadata extraction)
|
|
44792
44914
|
*/
|
|
44793
44915
|
async fetchUrl(url2) {
|
|
44794
44916
|
if (this.customFetch) {
|
|
44795
44917
|
try {
|
|
44796
44918
|
return await this.customFetch(url2);
|
|
44797
44919
|
} catch (err) {
|
|
44798
|
-
console.warn(`Custom fetch failed for ${url2}, falling back...`);
|
|
44920
|
+
console.warn(`Custom fetch failed for ${url2}, falling back to ${this.fetchStrategy}...`);
|
|
44799
44921
|
}
|
|
44800
44922
|
}
|
|
44923
|
+
if (this.fetchStrategy === "integration:browser") {
|
|
44924
|
+
return this.fetchWithBrowserIntegration(url2, { raw: false });
|
|
44925
|
+
} else {
|
|
44926
|
+
return this.defaultFetch(url2);
|
|
44927
|
+
}
|
|
44928
|
+
}
|
|
44929
|
+
/**
|
|
44930
|
+
* Fetch content using the browser integration
|
|
44931
|
+
*/
|
|
44932
|
+
async fetchWithBrowserIntegration(url2, options) {
|
|
44801
44933
|
if (!this.isBrowserIntegrationAvailable()) {
|
|
44802
|
-
throw new Error(
|
|
44803
|
-
`The 'browser' integration is not installed and is required for crawling website. Please provide a custom fetch function or install the 'browser' integration.`
|
|
44804
|
-
);
|
|
44934
|
+
throw new Error(`The 'browser' integration is not installed. Please install it or use fetch: 'node:fetch'.`);
|
|
44805
44935
|
}
|
|
44806
44936
|
const output2 = await adk.project.integrations.get("browser")?.actions.browsePages({
|
|
44807
44937
|
urls: [url2],
|
|
@@ -44812,6 +44942,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
44812
44942
|
if (!result || !result.content) {
|
|
44813
44943
|
throw new Error(`Failed to fetch content from ${url2}`);
|
|
44814
44944
|
}
|
|
44945
|
+
if (options.raw && result.raw) {
|
|
44946
|
+
return {
|
|
44947
|
+
url: result.url,
|
|
44948
|
+
contentType: "application/html",
|
|
44949
|
+
content: result.raw
|
|
44950
|
+
};
|
|
44951
|
+
}
|
|
44815
44952
|
return {
|
|
44816
44953
|
url: result.url,
|
|
44817
44954
|
contentType: "text/markdown",
|
|
@@ -45040,6 +45177,13 @@ var WebsiteSource = class _WebsiteSource extends DataSource {
|
|
|
45040
45177
|
const toRemove = existingFiles.filter(
|
|
45041
45178
|
(f) => !discoveredUrls.find((u) => u.loc === f.metadata?.[WellKnownMetadata.knowledge.URL])
|
|
45042
45179
|
);
|
|
45180
|
+
if (existingFiles.length > 0 && toRemove.length >= existingFiles.length * 0.8) {
|
|
45181
|
+
console.error(
|
|
45182
|
+
`Warning: All existing files (${existingFiles.length}) are scheduled for removal. Please check if the sitemap URL is correct and the website is accessible. We will try again in 5 minutes.`
|
|
45183
|
+
);
|
|
45184
|
+
await step2.sleep("retry wait", 5 * 60 * 1e3);
|
|
45185
|
+
throw new Error("Aborting sync due to potential misconfiguration (all files to be removed)");
|
|
45186
|
+
}
|
|
45043
45187
|
const toFetch = [];
|
|
45044
45188
|
let skippedUnchanged = 0;
|
|
45045
45189
|
for (const url2 of discoveredUrls) {
|