@promptbook/website-crawler 0.71.0-13 → 0.71.0-15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/index.es.js +61 -75
- package/esm/index.es.js.map +1 -1
- package/esm/typings/src/_packages/node.index.d.ts +2 -0
- package/esm/typings/src/_packages/types.index.d.ts +2 -0
- package/esm/typings/src/collection/constructors/createCollectionFromDirectory.d.ts +2 -2
- package/esm/typings/src/conversion/pipelineStringToJson.d.ts +1 -1
- package/esm/typings/src/execution/ExecutionTools.d.ts +12 -4
- package/esm/typings/src/execution/FilesystemTools.d.ts +9 -0
- package/esm/typings/src/execution/translation/automatic-translate/translateMessages.d.ts +1 -0
- package/esm/typings/src/llm-providers/_common/register/$provideLlmToolsFromEnv.d.ts +0 -1
- package/esm/typings/src/prepare/preparePipeline.d.ts +1 -1
- package/esm/typings/src/prepare/prepareTemplates.d.ts +1 -1
- package/esm/typings/src/scrapers/_common/prepareKnowledgePieces.d.ts +1 -1
- package/esm/typings/src/scrapers/_common/register/$provideFilesystemForNode.d.ts +11 -0
- package/esm/typings/src/scrapers/_common/register/$provideScrapersForNode.d.ts +1 -1
- package/esm/typings/src/scrapers/_common/utils/getScraperIntermediateSource.d.ts +1 -0
- package/esm/typings/src/scrapers/_common/utils/makeKnowledgeSourceHandler.d.ts +2 -4
- package/esm/typings/src/scrapers/document/DocumentScraper.d.ts +1 -1
- package/esm/typings/src/scrapers/document-legacy/LegacyDocumentScraper.d.ts +1 -1
- package/esm/typings/src/scrapers/website/WebsiteScraper.d.ts +1 -1
- package/esm/typings/src/storage/file-cache-storage/FileCacheStorage.d.ts +3 -1
- package/esm/typings/src/utils/files/{$isDirectoryExisting.d.ts → isDirectoryExisting.d.ts} +3 -4
- package/esm/typings/src/utils/files/isFileExisting.d.ts +13 -0
- package/esm/typings/src/utils/files/{$listAllFiles.d.ts → listAllFiles.d.ts} +3 -4
- package/package.json +2 -2
- package/umd/index.umd.js +60 -74
- package/umd/index.umd.js.map +1 -1
- package/esm/typings/src/utils/files/$isFileExisting.d.ts +0 -14
- /package/esm/typings/src/utils/files/{$isDirectoryExisting.test.d.ts → isDirectoryExisting.test.d.ts} +0 -0
- /package/esm/typings/src/utils/files/{$isFileExisting.test.d.ts → isFileExisting.test.d.ts} +0 -0
- /package/esm/typings/src/utils/files/{$listAllFiles.test.d.ts → listAllFiles.test.d.ts} +0 -0
package/esm/index.es.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import spaceTrim$1, { spaceTrim } from 'spacetrim';
|
|
2
2
|
import { Readability } from '@mozilla/readability';
|
|
3
|
-
import { mkdir, rm,
|
|
3
|
+
import { mkdir, rm, readFile, writeFile } from 'fs/promises';
|
|
4
4
|
import { JSDOM } from 'jsdom';
|
|
5
5
|
import { forTime } from 'waitasecond';
|
|
6
6
|
import { SHA256 } from 'crypto-js';
|
|
@@ -16,7 +16,7 @@ import { Converter } from 'showdown';
|
|
|
16
16
|
/**
|
|
17
17
|
* The version of the Promptbook library
|
|
18
18
|
*/
|
|
19
|
-
var PROMPTBOOK_VERSION = '0.71.0-
|
|
19
|
+
var PROMPTBOOK_VERSION = '0.71.0-14';
|
|
20
20
|
// TODO: [main] !!!! List here all the versions and annotate + put into script
|
|
21
21
|
|
|
22
22
|
/*! *****************************************************************************
|
|
@@ -656,6 +656,35 @@ var KnowledgeScrapeError = /** @class */ (function (_super) {
|
|
|
656
656
|
return KnowledgeScrapeError;
|
|
657
657
|
}(Error));
|
|
658
658
|
|
|
659
|
+
/**
|
|
660
|
+
* Tests if given string is valid URL.
|
|
661
|
+
*
|
|
662
|
+
* Note: Data URLs are considered perfectly valid.
|
|
663
|
+
* Note: There are two similar functions:
|
|
664
|
+
* - `isValidUrl` *(this one)* which tests any URL
|
|
665
|
+
* - `isValidPipelineUrl` which tests just promptbook URL
|
|
666
|
+
*
|
|
667
|
+
* @public exported from `@promptbook/utils`
|
|
668
|
+
*/
|
|
669
|
+
function isValidUrl(url) {
|
|
670
|
+
if (typeof url !== 'string') {
|
|
671
|
+
return false;
|
|
672
|
+
}
|
|
673
|
+
try {
|
|
674
|
+
if (url.startsWith('blob:')) {
|
|
675
|
+
url = url.replace(/^blob:/, '');
|
|
676
|
+
}
|
|
677
|
+
var urlObject = new URL(url /* because fail is handled */);
|
|
678
|
+
if (!['http:', 'https:', 'data:'].includes(urlObject.protocol)) {
|
|
679
|
+
return false;
|
|
680
|
+
}
|
|
681
|
+
return true;
|
|
682
|
+
}
|
|
683
|
+
catch (error) {
|
|
684
|
+
return false;
|
|
685
|
+
}
|
|
686
|
+
}
|
|
687
|
+
|
|
659
688
|
var defaultDiacriticsRemovalMap = [
|
|
660
689
|
{
|
|
661
690
|
base: 'A',
|
|
@@ -976,35 +1005,6 @@ function normalizeToKebabCase(text) {
|
|
|
976
1005
|
return normalizedName;
|
|
977
1006
|
}
|
|
978
1007
|
|
|
979
|
-
/**
|
|
980
|
-
* Tests if given string is valid URL.
|
|
981
|
-
*
|
|
982
|
-
* Note: Dataurl are considered perfectly valid.
|
|
983
|
-
* Note: There are two simmilar functions:
|
|
984
|
-
* - `isValidUrl` which tests any URL
|
|
985
|
-
* - `isValidPipelineUrl` *(this one)* which tests just promptbook URL
|
|
986
|
-
*
|
|
987
|
-
* @public exported from `@promptbook/utils`
|
|
988
|
-
*/
|
|
989
|
-
function isValidUrl(url) {
|
|
990
|
-
if (typeof url !== 'string') {
|
|
991
|
-
return false;
|
|
992
|
-
}
|
|
993
|
-
try {
|
|
994
|
-
if (url.startsWith('blob:')) {
|
|
995
|
-
url = url.replace(/^blob:/, '');
|
|
996
|
-
}
|
|
997
|
-
var urlObject = new URL(url /* because fail is handled */);
|
|
998
|
-
if (!['http:', 'https:', 'data:'].includes(urlObject.protocol)) {
|
|
999
|
-
return false;
|
|
1000
|
-
}
|
|
1001
|
-
return true;
|
|
1002
|
-
}
|
|
1003
|
-
catch (error) {
|
|
1004
|
-
return false;
|
|
1005
|
-
}
|
|
1006
|
-
}
|
|
1007
|
-
|
|
1008
1008
|
/**
|
|
1009
1009
|
* Removes emojis from a string and fix whitespaces
|
|
1010
1010
|
*
|
|
@@ -1167,6 +1167,7 @@ function getScraperIntermediateSource(source, options) {
|
|
|
1167
1167
|
* 1) Need to store more than serialized JSONs
|
|
1168
1168
|
* 2) Need to switch between a `rootDirname` and `cacheDirname` <- TODO: !!!!
|
|
1169
1169
|
* TODO: [🐱🐉][🧠] Make some smart crop
|
|
1170
|
+
* Note: [🟢] Code in this file should never be released in packages that could be imported into browser environment
|
|
1170
1171
|
*/
|
|
1171
1172
|
|
|
1172
1173
|
var PipelineCollection = [{title:"Prepare Knowledge from Markdown",pipelineUrl:"https://promptbook.studio/promptbook/prepare-knowledge-from-markdown.ptbk.md",parameters:[{name:"knowledgeContent",description:"Markdown document content",isInput:true,isOutput:false},{name:"knowledgePieces",description:"The knowledge JSON object",isInput:false,isOutput:true}],templates:[{templateType:"PROMPT_TEMPLATE",name:"knowledge",title:"Knowledge",content:"You are experienced data researcher, extract the important knowledge from the document.\n\n# Rules\n\n- Make pieces of information concise, clear, and easy to understand\n- One piece of information should be approximately 1 paragraph\n- Divide the paragraphs by markdown horizontal lines ---\n- Omit irrelevant information\n- Group redundant information\n- Write just extracted information, nothing else\n\n# The document\n\nTake information from this document:\n\n> {knowledgeContent}",resultingParameterName:"knowledgePieces",dependentParameterNames:["knowledgeContent"]}],knowledgeSources:[],knowledgePieces:[],personas:[],preparations:[],sourceFile:"./promptbook-collection/prepare-knowledge-from-markdown.ptbk.md"},{title:"Prepare Keywords",pipelineUrl:"https://promptbook.studio/promptbook/prepare-knowledge-keywords.ptbk.md",parameters:[{name:"knowledgePieceContent",description:"The content",isInput:true,isOutput:false},{name:"keywords",description:"Keywords separated by comma",isInput:false,isOutput:true}],templates:[{templateType:"PROMPT_TEMPLATE",name:"knowledge",title:"Knowledge",content:"You are experienced data researcher, detect the important keywords in the document.\n\n# Rules\n\n- Write just keywords separated by comma\n\n# The document\n\nTake information from this document:\n\n> 
{knowledgePieceContent}",resultingParameterName:"keywords",dependentParameterNames:["knowledgePieceContent"]}],knowledgeSources:[],knowledgePieces:[],personas:[],preparations:[],sourceFile:"./promptbook-collection/prepare-knowledge-keywords.ptbk.md"},{title:"Prepare Title",pipelineUrl:"https://promptbook.studio/promptbook/prepare-knowledge-title.ptbk.md",parameters:[{name:"knowledgePieceContent",description:"The content",isInput:true,isOutput:false},{name:"title",description:"The title of the document",isInput:false,isOutput:true}],templates:[{templateType:"PROMPT_TEMPLATE",name:"knowledge",title:"Knowledge",content:"You are experienced content creator, write best title for the document.\n\n# Rules\n\n- Write just title, nothing else\n- Title should be concise and clear\n- Write maximum 5 words for the title\n\n# The document\n\n> {knowledgePieceContent}",resultingParameterName:"title",expectations:{words:{min:1,max:8}},dependentParameterNames:["knowledgePieceContent"]}],knowledgeSources:[],knowledgePieces:[],personas:[],preparations:[],sourceFile:"./promptbook-collection/prepare-knowledge-title.ptbk.md"},{title:"Prepare Keywords",pipelineUrl:"https://promptbook.studio/promptbook/prepare-persona.ptbk.md",parameters:[{name:"availableModelNames",description:"List of available model names separated by comma (,)",isInput:true,isOutput:false},{name:"personaDescription",description:"Description of the persona",isInput:true,isOutput:false},{name:"modelRequirements",description:"Specific requirements for the model",isInput:false,isOutput:true}],templates:[{templateType:"PROMPT_TEMPLATE",name:"make-model-requirements",title:"Make modelRequirements",content:"You are experienced AI engineer, you need to create virtual assistant.\nWrite\n\n## Sample\n\n```json\n{\n\"modelName\": \"gpt-4o\",\n\"systemMessage\": \"You are experienced AI engineer and helpfull assistant.\",\n\"temperature\": 0.7\n}\n```\n\n## Instructions\n\n- Your output format is JSON object\n- Write just the 
JSON object, no other text should be present\n- It contains the following keys:\n - `modelName`: The name of the model to use\n - `systemMessage`: The system message to provide context to the model\n - `temperature`: The sampling temperature to use\n\n### Key `modelName`\n\nPick from the following models:\n\n- {availableModelNames}\n\n### Key `systemMessage`\n\nThe system message is used to communicate instructions or provide context to the model at the beginning of a conversation. It is displayed in a different format compared to user messages, helping the model understand its role in the conversation. The system message typically guides the model's behavior, sets the tone, or specifies desired output from the model. By utilizing the system message effectively, users can steer the model towards generating more accurate and relevant responses.\n\nFor example:\n\n> You are an experienced AI engineer and helpful assistant.\n\n> You are a friendly and knowledgeable chatbot.\n\n### Key `temperature`\n\nThe sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.\n\nYou can pick a value between 0 and 2. 
For example:\n\n- `0.1`: Low temperature, extremely conservative and deterministic\n- `0.5`: Medium temperature, balanced between conservative and creative\n- `1.0`: High temperature, creative and bit random\n- `1.5`: Very high temperature, extremely creative and often chaotic and unpredictable\n- `2.0`: Maximum temperature, completely random and unpredictable, for some extreme creative use cases\n\n# The assistant\n\nTake this description of the persona:\n\n> {personaDescription}",resultingParameterName:"modelRequirements",format:"JSON",dependentParameterNames:["availableModelNames","personaDescription"]}],knowledgeSources:[],knowledgePieces:[],personas:[],preparations:[],sourceFile:"./promptbook-collection/prepare-persona.ptbk.md"}];
|
|
@@ -3039,39 +3040,45 @@ function sourceContentToName(sourceContent) {
|
|
|
3039
3040
|
*/
|
|
3040
3041
|
|
|
3041
3042
|
/**
|
|
3042
|
-
*
|
|
3043
|
+
* Convert file extension to mime type
|
|
3043
3044
|
*
|
|
3044
|
-
*
|
|
3045
|
+
* @private within the repository
|
|
3046
|
+
*/
|
|
3047
|
+
function extensionToMimeType(value) {
|
|
3048
|
+
return lookup(value) || 'application/octet-stream';
|
|
3049
|
+
}
|
|
3050
|
+
|
|
3051
|
+
/**
|
|
3052
|
+
* Get the file extension from a file name
|
|
3045
3053
|
*
|
|
3046
|
-
* @
|
|
3054
|
+
* @private within the repository
|
|
3047
3055
|
*/
|
|
3048
|
-
|
|
3056
|
+
function getFileExtension(value) {
|
|
3057
|
+
var match = value.match(/\.([0-9a-z]+)(?:[?#]|$)/i);
|
|
3058
|
+
return match ? match[1].toLowerCase() : null;
|
|
3059
|
+
}
|
|
3049
3060
|
|
|
3050
3061
|
/**
|
|
3051
3062
|
* Checks if the file exists
|
|
3052
3063
|
*
|
|
3053
|
-
* Note: `$` is used to indicate that this function is not a pure function - it looks at the filesystem
|
|
3054
|
-
*
|
|
3055
3064
|
* @private within the repository
|
|
3056
3065
|
*/
|
|
3057
|
-
function
|
|
3066
|
+
function isFileExisting(filename, fs) {
|
|
3058
3067
|
return __awaiter(this, void 0, void 0, function () {
|
|
3059
3068
|
var isReadAccessAllowed, isFile;
|
|
3060
3069
|
return __generator(this, function (_a) {
|
|
3061
3070
|
switch (_a.label) {
|
|
3062
|
-
case 0:
|
|
3063
|
-
|
|
3064
|
-
|
|
3065
|
-
|
|
3066
|
-
return [4 /*yield*/, access(filename, constants.R_OK)
|
|
3067
|
-
.then(function () { return true; })
|
|
3068
|
-
.catch(function () { return false; })];
|
|
3071
|
+
case 0: return [4 /*yield*/, fs
|
|
3072
|
+
.access(filename, fs.constants.R_OK)
|
|
3073
|
+
.then(function () { return true; })
|
|
3074
|
+
.catch(function () { return false; })];
|
|
3069
3075
|
case 1:
|
|
3070
3076
|
isReadAccessAllowed = _a.sent();
|
|
3071
3077
|
if (!isReadAccessAllowed) {
|
|
3072
3078
|
return [2 /*return*/, false];
|
|
3073
3079
|
}
|
|
3074
|
-
return [4 /*yield*/,
|
|
3080
|
+
return [4 /*yield*/, fs
|
|
3081
|
+
.stat(filename)
|
|
3075
3082
|
.then(function (fileStat) { return fileStat.isFile(); })
|
|
3076
3083
|
.catch(function () { return false; })];
|
|
3077
3084
|
case 2:
|
|
@@ -3082,36 +3089,17 @@ function $isFileExisting(filename) {
|
|
|
3082
3089
|
});
|
|
3083
3090
|
}
|
|
3084
3091
|
/**
|
|
3085
|
-
* Note: [
|
|
3092
|
+
* Note: Not [~🟢~] because it is not directly dependent on `fs`
|
|
3086
3093
|
* TODO: [🐠] This can be a validator - with variants that return true/false and variants that throw errors with meaningful messages
|
|
3087
3094
|
* TODO: [🖇] What about symlinks?
|
|
3088
3095
|
*/
|
|
3089
3096
|
|
|
3090
|
-
/**
|
|
3091
|
-
* Convert file extension to mime type
|
|
3092
|
-
*
|
|
3093
|
-
* @private within the repository
|
|
3094
|
-
*/
|
|
3095
|
-
function extensionToMimeType(value) {
|
|
3096
|
-
return lookup(value) || 'application/octet-stream';
|
|
3097
|
-
}
|
|
3098
|
-
|
|
3099
|
-
/**
|
|
3100
|
-
* Get the file extension from a file name
|
|
3101
|
-
*
|
|
3102
|
-
* @private within the repository
|
|
3103
|
-
*/
|
|
3104
|
-
function getFileExtension(value) {
|
|
3105
|
-
var match = value.match(/\.([0-9a-z]+)(?:[?#]|$)/i);
|
|
3106
|
-
return match ? match[1].toLowerCase() : null;
|
|
3107
|
-
}
|
|
3108
|
-
|
|
3109
3097
|
/**
|
|
3110
3098
|
* @@@
|
|
3111
3099
|
*
|
|
3112
3100
|
* @private for scraper utilities
|
|
3113
3101
|
*/
|
|
3114
|
-
function makeKnowledgeSourceHandler(knowledgeSource, options) {
|
|
3102
|
+
function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
3115
3103
|
var _a;
|
|
3116
3104
|
return __awaiter(this, void 0, void 0, function () {
|
|
3117
3105
|
var sourceContent, name, _b, _c, rootDirname, _d,
|
|
@@ -3180,8 +3168,9 @@ function makeKnowledgeSourceHandler(knowledgeSource, options) {
|
|
|
3180
3168
|
}];
|
|
3181
3169
|
case 2:
|
|
3182
3170
|
if (!(isValidFilePath(sourceContent) || /\.[a-z]{1,10}$/i.exec(sourceContent))) return [3 /*break*/, 4];
|
|
3183
|
-
if (
|
|
3184
|
-
throw new EnvironmentMismatchError('
|
|
3171
|
+
if (tools.fs === undefined) {
|
|
3172
|
+
throw new EnvironmentMismatchError('Can not import file knowledge without filesystem tools');
|
|
3173
|
+
// <- TODO: [🧠] What is the best error type here?
|
|
3185
3174
|
}
|
|
3186
3175
|
if (rootDirname === null) {
|
|
3187
3176
|
throw new EnvironmentMismatchError('Can not import file knowledge in non-file pipeline');
|
|
@@ -3190,7 +3179,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, options) {
|
|
|
3190
3179
|
filename_1 = join(rootDirname, sourceContent).split('\\').join('/');
|
|
3191
3180
|
fileExtension = getFileExtension(filename_1);
|
|
3192
3181
|
mimeType_1 = extensionToMimeType(fileExtension || '');
|
|
3193
|
-
return [4 /*yield*/,
|
|
3182
|
+
return [4 /*yield*/, isFileExisting(filename_1, tools.fs)];
|
|
3194
3183
|
case 3:
|
|
3195
3184
|
if (!(_e.sent())) {
|
|
3196
3185
|
throw new NotFoundError(spaceTrim$1(function (block) { return "\n Can not make source handler for file which does not exist:\n\n File:\n ".concat(block(filename_1), "\n "); }));
|
|
@@ -3206,7 +3195,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, options) {
|
|
|
3206
3195
|
var content;
|
|
3207
3196
|
return __generator(this, function (_a) {
|
|
3208
3197
|
switch (_a.label) {
|
|
3209
|
-
case 0: return [4 /*yield*/, readFile(filename_1)];
|
|
3198
|
+
case 0: return [4 /*yield*/, tools.fs.readFile(filename_1)];
|
|
3210
3199
|
case 1:
|
|
3211
3200
|
content = _a.sent();
|
|
3212
3201
|
return [2 /*return*/, new Blob([
|
|
@@ -3260,9 +3249,6 @@ function makeKnowledgeSourceHandler(knowledgeSource, options) {
|
|
|
3260
3249
|
});
|
|
3261
3250
|
});
|
|
3262
3251
|
}
|
|
3263
|
-
/**
|
|
3264
|
-
* TODO: !!!!!!! Maybe constrain to @promptbook/node bundle
|
|
3265
|
-
*/
|
|
3266
3252
|
|
|
3267
3253
|
/**
|
|
3268
3254
|
* Prepares the knowle
|
|
@@ -3286,7 +3272,7 @@ function prepareKnowledgePieces(knowledgeSources, tools, options) {
|
|
|
3286
3272
|
switch (_d.label) {
|
|
3287
3273
|
case 0:
|
|
3288
3274
|
partialPieces = null;
|
|
3289
|
-
return [4 /*yield*/, makeKnowledgeSourceHandler(knowledgeSource, { rootDirname: rootDirname, isVerbose: isVerbose })];
|
|
3275
|
+
return [4 /*yield*/, makeKnowledgeSourceHandler(knowledgeSource, tools, { rootDirname: rootDirname, isVerbose: isVerbose })];
|
|
3290
3276
|
case 1:
|
|
3291
3277
|
sourceHandler = _d.sent();
|
|
3292
3278
|
_d.label = 2;
|