@promptbook/website-crawler 0.71.0-13 → 0.71.0-15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. package/esm/index.es.js +61 -75
  2. package/esm/index.es.js.map +1 -1
  3. package/esm/typings/src/_packages/node.index.d.ts +2 -0
  4. package/esm/typings/src/_packages/types.index.d.ts +2 -0
  5. package/esm/typings/src/collection/constructors/createCollectionFromDirectory.d.ts +2 -2
  6. package/esm/typings/src/conversion/pipelineStringToJson.d.ts +1 -1
  7. package/esm/typings/src/execution/ExecutionTools.d.ts +12 -4
  8. package/esm/typings/src/execution/FilesystemTools.d.ts +9 -0
  9. package/esm/typings/src/execution/translation/automatic-translate/translateMessages.d.ts +1 -0
  10. package/esm/typings/src/llm-providers/_common/register/$provideLlmToolsFromEnv.d.ts +0 -1
  11. package/esm/typings/src/prepare/preparePipeline.d.ts +1 -1
  12. package/esm/typings/src/prepare/prepareTemplates.d.ts +1 -1
  13. package/esm/typings/src/scrapers/_common/prepareKnowledgePieces.d.ts +1 -1
  14. package/esm/typings/src/scrapers/_common/register/$provideFilesystemForNode.d.ts +11 -0
  15. package/esm/typings/src/scrapers/_common/register/$provideScrapersForNode.d.ts +1 -1
  16. package/esm/typings/src/scrapers/_common/utils/getScraperIntermediateSource.d.ts +1 -0
  17. package/esm/typings/src/scrapers/_common/utils/makeKnowledgeSourceHandler.d.ts +2 -4
  18. package/esm/typings/src/scrapers/document/DocumentScraper.d.ts +1 -1
  19. package/esm/typings/src/scrapers/document-legacy/LegacyDocumentScraper.d.ts +1 -1
  20. package/esm/typings/src/scrapers/website/WebsiteScraper.d.ts +1 -1
  21. package/esm/typings/src/storage/file-cache-storage/FileCacheStorage.d.ts +3 -1
  22. package/esm/typings/src/utils/files/{$isDirectoryExisting.d.ts → isDirectoryExisting.d.ts} +3 -4
  23. package/esm/typings/src/utils/files/isFileExisting.d.ts +13 -0
  24. package/esm/typings/src/utils/files/{$listAllFiles.d.ts → listAllFiles.d.ts} +3 -4
  25. package/package.json +2 -2
  26. package/umd/index.umd.js +60 -74
  27. package/umd/index.umd.js.map +1 -1
  28. package/esm/typings/src/utils/files/$isFileExisting.d.ts +0 -14
  29. /package/esm/typings/src/utils/files/{$isDirectoryExisting.test.d.ts → isDirectoryExisting.test.d.ts} +0 -0
  30. /package/esm/typings/src/utils/files/{$isFileExisting.test.d.ts → isFileExisting.test.d.ts} +0 -0
  31. /package/esm/typings/src/utils/files/{$listAllFiles.test.d.ts → listAllFiles.test.d.ts} +0 -0
package/esm/index.es.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import spaceTrim$1, { spaceTrim } from 'spacetrim';
2
2
  import { Readability } from '@mozilla/readability';
3
- import { mkdir, rm, stat, access, constants, readFile, writeFile } from 'fs/promises';
3
+ import { mkdir, rm, readFile, writeFile } from 'fs/promises';
4
4
  import { JSDOM } from 'jsdom';
5
5
  import { forTime } from 'waitasecond';
6
6
  import { SHA256 } from 'crypto-js';
@@ -16,7 +16,7 @@ import { Converter } from 'showdown';
16
16
  /**
17
17
  * The version of the Promptbook library
18
18
  */
19
- var PROMPTBOOK_VERSION = '0.71.0-12';
19
+ var PROMPTBOOK_VERSION = '0.71.0-14';
20
20
  // TODO: [main] !!!! List here all the versions and annotate + put into script
21
21
 
22
22
  /*! *****************************************************************************
@@ -656,6 +656,35 @@ var KnowledgeScrapeError = /** @class */ (function (_super) {
656
656
  return KnowledgeScrapeError;
657
657
  }(Error));
658
658
 
659
+ /**
660
+ * Tests if given string is valid URL.
661
+ *
662
+ * Note: Data URLs are considered perfectly valid.
663
+ * Note: There are two similar functions:
664
+ * - `isValidUrl` *(this one)* which tests any URL
665
+ * - `isValidPipelineUrl` which tests just promptbook URL
666
+ *
667
+ * @public exported from `@promptbook/utils`
668
+ */
669
+ function isValidUrl(url) {
670
+ if (typeof url !== 'string') {
671
+ return false;
672
+ }
673
+ try {
674
+ if (url.startsWith('blob:')) {
675
+ url = url.replace(/^blob:/, '');
676
+ }
677
+ var urlObject = new URL(url /* because fail is handled */);
678
+ if (!['http:', 'https:', 'data:'].includes(urlObject.protocol)) {
679
+ return false;
680
+ }
681
+ return true;
682
+ }
683
+ catch (error) {
684
+ return false;
685
+ }
686
+ }
687
+
659
688
  var defaultDiacriticsRemovalMap = [
660
689
  {
661
690
  base: 'A',
@@ -976,35 +1005,6 @@ function normalizeToKebabCase(text) {
976
1005
  return normalizedName;
977
1006
  }
978
1007
 
979
- /**
980
- * Tests if given string is valid URL.
981
- *
982
- * Note: Data URLs are considered perfectly valid.
983
- * Note: There are two similar functions:
984
- * - `isValidUrl` *(this one)* which tests any URL
985
- * - `isValidPipelineUrl` which tests just promptbook URL
986
- *
987
- * @public exported from `@promptbook/utils`
988
- */
989
- function isValidUrl(url) {
990
- if (typeof url !== 'string') {
991
- return false;
992
- }
993
- try {
994
- if (url.startsWith('blob:')) {
995
- url = url.replace(/^blob:/, '');
996
- }
997
- var urlObject = new URL(url /* because fail is handled */);
998
- if (!['http:', 'https:', 'data:'].includes(urlObject.protocol)) {
999
- return false;
1000
- }
1001
- return true;
1002
- }
1003
- catch (error) {
1004
- return false;
1005
- }
1006
- }
1007
-
1008
1008
  /**
1009
1009
  * Removes emojis from a string and fix whitespaces
1010
1010
  *
@@ -1167,6 +1167,7 @@ function getScraperIntermediateSource(source, options) {
1167
1167
  * 1) Need to store more than serialized JSONs
1168
1168
  * 2) Need to switch between a `rootDirname` and `cacheDirname` <- TODO: !!!!
1169
1169
  * TODO: [🐱‍🐉][🧠] Make some smart crop
1170
+ * Note: [🟢] Code in this file should never be released in packages that could be imported into a browser environment
1170
1171
  */
1171
1172
 
1172
1173
  var PipelineCollection = [{title:"Prepare Knowledge from Markdown",pipelineUrl:"https://promptbook.studio/promptbook/prepare-knowledge-from-markdown.ptbk.md",parameters:[{name:"knowledgeContent",description:"Markdown document content",isInput:true,isOutput:false},{name:"knowledgePieces",description:"The knowledge JSON object",isInput:false,isOutput:true}],templates:[{templateType:"PROMPT_TEMPLATE",name:"knowledge",title:"Knowledge",content:"You are experienced data researcher, extract the important knowledge from the document.\n\n# Rules\n\n- Make pieces of information concise, clear, and easy to understand\n- One piece of information should be approximately 1 paragraph\n- Divide the paragraphs by markdown horizontal lines ---\n- Omit irrelevant information\n- Group redundant information\n- Write just extracted information, nothing else\n\n# The document\n\nTake information from this document:\n\n> {knowledgeContent}",resultingParameterName:"knowledgePieces",dependentParameterNames:["knowledgeContent"]}],knowledgeSources:[],knowledgePieces:[],personas:[],preparations:[],sourceFile:"./promptbook-collection/prepare-knowledge-from-markdown.ptbk.md"},{title:"Prepare Keywords",pipelineUrl:"https://promptbook.studio/promptbook/prepare-knowledge-keywords.ptbk.md",parameters:[{name:"knowledgePieceContent",description:"The content",isInput:true,isOutput:false},{name:"keywords",description:"Keywords separated by comma",isInput:false,isOutput:true}],templates:[{templateType:"PROMPT_TEMPLATE",name:"knowledge",title:"Knowledge",content:"You are experienced data researcher, detect the important keywords in the document.\n\n# Rules\n\n- Write just keywords separated by comma\n\n# The document\n\nTake information from this document:\n\n> 
{knowledgePieceContent}",resultingParameterName:"keywords",dependentParameterNames:["knowledgePieceContent"]}],knowledgeSources:[],knowledgePieces:[],personas:[],preparations:[],sourceFile:"./promptbook-collection/prepare-knowledge-keywords.ptbk.md"},{title:"Prepare Title",pipelineUrl:"https://promptbook.studio/promptbook/prepare-knowledge-title.ptbk.md",parameters:[{name:"knowledgePieceContent",description:"The content",isInput:true,isOutput:false},{name:"title",description:"The title of the document",isInput:false,isOutput:true}],templates:[{templateType:"PROMPT_TEMPLATE",name:"knowledge",title:"Knowledge",content:"You are experienced content creator, write best title for the document.\n\n# Rules\n\n- Write just title, nothing else\n- Title should be concise and clear\n- Write maximum 5 words for the title\n\n# The document\n\n> {knowledgePieceContent}",resultingParameterName:"title",expectations:{words:{min:1,max:8}},dependentParameterNames:["knowledgePieceContent"]}],knowledgeSources:[],knowledgePieces:[],personas:[],preparations:[],sourceFile:"./promptbook-collection/prepare-knowledge-title.ptbk.md"},{title:"Prepare Keywords",pipelineUrl:"https://promptbook.studio/promptbook/prepare-persona.ptbk.md",parameters:[{name:"availableModelNames",description:"List of available model names separated by comma (,)",isInput:true,isOutput:false},{name:"personaDescription",description:"Description of the persona",isInput:true,isOutput:false},{name:"modelRequirements",description:"Specific requirements for the model",isInput:false,isOutput:true}],templates:[{templateType:"PROMPT_TEMPLATE",name:"make-model-requirements",title:"Make modelRequirements",content:"You are experienced AI engineer, you need to create virtual assistant.\nWrite\n\n## Sample\n\n```json\n{\n\"modelName\": \"gpt-4o\",\n\"systemMessage\": \"You are experienced AI engineer and helpfull assistant.\",\n\"temperature\": 0.7\n}\n```\n\n## Instructions\n\n- Your output format is JSON object\n- Write just the 
JSON object, no other text should be present\n- It contains the following keys:\n - `modelName`: The name of the model to use\n - `systemMessage`: The system message to provide context to the model\n - `temperature`: The sampling temperature to use\n\n### Key `modelName`\n\nPick from the following models:\n\n- {availableModelNames}\n\n### Key `systemMessage`\n\nThe system message is used to communicate instructions or provide context to the model at the beginning of a conversation. It is displayed in a different format compared to user messages, helping the model understand its role in the conversation. The system message typically guides the model's behavior, sets the tone, or specifies desired output from the model. By utilizing the system message effectively, users can steer the model towards generating more accurate and relevant responses.\n\nFor example:\n\n> You are an experienced AI engineer and helpful assistant.\n\n> You are a friendly and knowledgeable chatbot.\n\n### Key `temperature`\n\nThe sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.\n\nYou can pick a value between 0 and 2. 
For example:\n\n- `0.1`: Low temperature, extremely conservative and deterministic\n- `0.5`: Medium temperature, balanced between conservative and creative\n- `1.0`: High temperature, creative and bit random\n- `1.5`: Very high temperature, extremely creative and often chaotic and unpredictable\n- `2.0`: Maximum temperature, completely random and unpredictable, for some extreme creative use cases\n\n# The assistant\n\nTake this description of the persona:\n\n> {personaDescription}",resultingParameterName:"modelRequirements",format:"JSON",dependentParameterNames:["availableModelNames","personaDescription"]}],knowledgeSources:[],knowledgePieces:[],personas:[],preparations:[],sourceFile:"./promptbook-collection/prepare-persona.ptbk.md"}];
@@ -3039,39 +3040,45 @@ function sourceContentToName(sourceContent) {
3039
3040
  */
3040
3041
 
3041
3042
  /**
3042
- * Detects if the code is running in a Node.js environment
3043
+ * Convert file extension to mime type
3043
3044
  *
3044
- * Note: `$` is used to indicate that this function is not a pure function - it looks at the global object to determine the environment
3045
+ * @private within the repository
3046
+ */
3047
+ function extensionToMimeType(value) {
3048
+ return lookup(value) || 'application/octet-stream';
3049
+ }
3050
+
3051
+ /**
3052
+ * Get the file extension from a file name
3045
3053
  *
3046
- * @public exported from `@promptbook/utils`
3054
+ * @private within the repository
3047
3055
  */
3048
- var $isRunningInNode = new Function("\n try {\n return this === global;\n } catch (e) {\n return false;\n }\n");
3056
+ function getFileExtension(value) {
3057
+ var match = value.match(/\.([0-9a-z]+)(?:[?#]|$)/i);
3058
+ return match ? match[1].toLowerCase() : null;
3059
+ }
3049
3060
 
3050
3061
  /**
3051
3062
  * Checks if the file exists
3052
3063
  *
3053
- * Note: `$` is used to indicate that this function is not a pure function - it looks at the filesystem
3054
- *
3055
3064
  * @private within the repository
3056
3065
  */
3057
- function $isFileExisting(filename) {
3066
+ function isFileExisting(filename, fs) {
3058
3067
  return __awaiter(this, void 0, void 0, function () {
3059
3068
  var isReadAccessAllowed, isFile;
3060
3069
  return __generator(this, function (_a) {
3061
3070
  switch (_a.label) {
3062
- case 0:
3063
- if (!$isRunningInNode()) {
3064
- throw new EnvironmentMismatchError('Function `$isFileExisting` works only in Node environment.js');
3065
- }
3066
- return [4 /*yield*/, access(filename, constants.R_OK)
3067
- .then(function () { return true; })
3068
- .catch(function () { return false; })];
3071
+ case 0: return [4 /*yield*/, fs
3072
+ .access(filename, fs.constants.R_OK)
3073
+ .then(function () { return true; })
3074
+ .catch(function () { return false; })];
3069
3075
  case 1:
3070
3076
  isReadAccessAllowed = _a.sent();
3071
3077
  if (!isReadAccessAllowed) {
3072
3078
  return [2 /*return*/, false];
3073
3079
  }
3074
- return [4 /*yield*/, stat(filename)
3080
+ return [4 /*yield*/, fs
3081
+ .stat(filename)
3075
3082
  .then(function (fileStat) { return fileStat.isFile(); })
3076
3083
  .catch(function () { return false; })];
3077
3084
  case 2:
@@ -3082,36 +3089,17 @@ function $isFileExisting(filename) {
3082
3089
  });
3083
3090
  }
3084
3091
  /**
3085
- * Note: [🟢 !!!!!! After fix makeKnowledgeSourceHandler] Code in this file should never be published outside of `@promptbook/node` and `@promptbook/cli`
3092
+ * Note: Not [~🟢~] because it is not directly dependent on `fs`
3086
3093
  * TODO: [🐠] This can be a validator - with variants that return true/false and variants that throw errors with meaningful messages
3087
3094
  * TODO: [🖇] What about symlinks?
3088
3095
  */
3089
3096
 
3090
- /**
3091
- * Convert file extension to mime type
3092
- *
3093
- * @private within the repository
3094
- */
3095
- function extensionToMimeType(value) {
3096
- return lookup(value) || 'application/octet-stream';
3097
- }
3098
-
3099
- /**
3100
- * Get the file extension from a file name
3101
- *
3102
- * @private within the repository
3103
- */
3104
- function getFileExtension(value) {
3105
- var match = value.match(/\.([0-9a-z]+)(?:[?#]|$)/i);
3106
- return match ? match[1].toLowerCase() : null;
3107
- }
3108
-
3109
3097
  /**
3110
3098
  * @@@
3111
3099
  *
3112
3100
  * @private for scraper utilities
3113
3101
  */
3114
- function makeKnowledgeSourceHandler(knowledgeSource, options) {
3102
+ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3115
3103
  var _a;
3116
3104
  return __awaiter(this, void 0, void 0, function () {
3117
3105
  var sourceContent, name, _b, _c, rootDirname, _d,
@@ -3180,8 +3168,9 @@ function makeKnowledgeSourceHandler(knowledgeSource, options) {
3180
3168
  }];
3181
3169
  case 2:
3182
3170
  if (!(isValidFilePath(sourceContent) || /\.[a-z]{1,10}$/i.exec(sourceContent))) return [3 /*break*/, 4];
3183
- if (!$isRunningInNode()) {
3184
- throw new EnvironmentMismatchError('Importing knowledge source file works only in Node.js environment');
3171
+ if (tools.fs === undefined) {
3172
+ throw new EnvironmentMismatchError('Can not import file knowledge without filesystem tools');
3173
+ // <- TODO: [🧠] What is the best error type here?
3185
3174
  }
3186
3175
  if (rootDirname === null) {
3187
3176
  throw new EnvironmentMismatchError('Can not import file knowledge in non-file pipeline');
@@ -3190,7 +3179,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, options) {
3190
3179
  filename_1 = join(rootDirname, sourceContent).split('\\').join('/');
3191
3180
  fileExtension = getFileExtension(filename_1);
3192
3181
  mimeType_1 = extensionToMimeType(fileExtension || '');
3193
- return [4 /*yield*/, $isFileExisting(filename_1)];
3182
+ return [4 /*yield*/, isFileExisting(filename_1, tools.fs)];
3194
3183
  case 3:
3195
3184
  if (!(_e.sent())) {
3196
3185
  throw new NotFoundError(spaceTrim$1(function (block) { return "\n Can not make source handler for file which does not exist:\n\n File:\n ".concat(block(filename_1), "\n "); }));
@@ -3206,7 +3195,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, options) {
3206
3195
  var content;
3207
3196
  return __generator(this, function (_a) {
3208
3197
  switch (_a.label) {
3209
- case 0: return [4 /*yield*/, readFile(filename_1)];
3198
+ case 0: return [4 /*yield*/, tools.fs.readFile(filename_1)];
3210
3199
  case 1:
3211
3200
  content = _a.sent();
3212
3201
  return [2 /*return*/, new Blob([
@@ -3260,9 +3249,6 @@ function makeKnowledgeSourceHandler(knowledgeSource, options) {
3260
3249
  });
3261
3250
  });
3262
3251
  }
3263
- /**
3264
- * TODO: !!!!!!! Maybe constrain to @promptbook/node bundle
3265
- */
3266
3252
 
3267
3253
  /**
3268
3254
  * Prepares the knowle
@@ -3286,7 +3272,7 @@ function prepareKnowledgePieces(knowledgeSources, tools, options) {
3286
3272
  switch (_d.label) {
3287
3273
  case 0:
3288
3274
  partialPieces = null;
3289
- return [4 /*yield*/, makeKnowledgeSourceHandler(knowledgeSource, { rootDirname: rootDirname, isVerbose: isVerbose })];
3275
+ return [4 /*yield*/, makeKnowledgeSourceHandler(knowledgeSource, tools, { rootDirname: rootDirname, isVerbose: isVerbose })];
3290
3276
  case 1:
3291
3277
  sourceHandler = _d.sent();
3292
3278
  _d.label = 2;