@promptbook/website-crawler 0.75.3 → 0.75.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/esm/index.es.js CHANGED
@@ -24,7 +24,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
24
24
  *
25
25
  * @see https://github.com/webgptorg/promptbook
26
26
  */
27
- var PROMPTBOOK_ENGINE_VERSION = '0.75.2';
27
+ var PROMPTBOOK_ENGINE_VERSION = '0.75.4';
28
28
  /**
29
29
  * TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
30
30
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -2092,6 +2092,42 @@ var PipelineExecutionError = /** @class */ (function (_super) {
2092
2092
  return PipelineExecutionError;
2093
2093
  }(Error));
2094
2094
 
2095
+ /**
2096
+ * This error indicates problems parsing the format value
2097
+ *
2098
+ * For example, when the format value is not a valid JSON or CSV
2099
+ * This is not thrown directly but in extended classes
2100
+ *
2101
+ * @public exported from `@promptbook/core`
2102
+ */
2103
+ var AbstractFormatError = /** @class */ (function (_super) {
2104
+ __extends(AbstractFormatError, _super);
2105
+ // Note: To allow instanceof do not put here error `name`
2106
+ // public readonly name = 'AbstractFormatError';
2107
+ function AbstractFormatError(message) {
2108
+ var _this = _super.call(this, message) || this;
2109
+ Object.setPrototypeOf(_this, AbstractFormatError.prototype);
2110
+ return _this;
2111
+ }
2112
+ return AbstractFormatError;
2113
+ }(Error));
2114
+
2115
+ /**
2116
+ * This error indicates problem with parsing of CSV
2117
+ *
2118
+ * @public exported from `@promptbook/core`
2119
+ */
2120
+ var CsvFormatError = /** @class */ (function (_super) {
2121
+ __extends(CsvFormatError, _super);
2122
+ function CsvFormatError(message) {
2123
+ var _this = _super.call(this, message) || this;
2124
+ _this.name = 'CsvFormatError';
2125
+ Object.setPrototypeOf(_this, CsvFormatError.prototype);
2126
+ return _this;
2127
+ }
2128
+ return CsvFormatError;
2129
+ }(AbstractFormatError));
2130
+
2095
2131
  /**
2096
2132
  * This error indicates that the pipeline collection cannot be propperly loaded
2097
2133
  *
@@ -2149,10 +2185,14 @@ var LimitReachedError = /** @class */ (function (_super) {
2149
2185
  * @public exported from `@promptbook/core`
2150
2186
  */
2151
2187
  var ERRORS = {
2152
- ExpectError: ExpectError,
2188
+ AbstractFormatError: AbstractFormatError,
2189
+ CsvFormatError: CsvFormatError,
2153
2190
  CollectionError: CollectionError,
2154
2191
  EnvironmentMismatchError: EnvironmentMismatchError,
2192
+ ExpectError: ExpectError,
2193
+ KnowledgeScrapeError: KnowledgeScrapeError,
2155
2194
  LimitReachedError: LimitReachedError,
2195
+ MissingToolsError: MissingToolsError,
2156
2196
  NotFoundError: NotFoundError,
2157
2197
  NotYetImplementedError: NotYetImplementedError,
2158
2198
  ParseError: ParseError,
@@ -2972,8 +3012,8 @@ var $scrapersRegister = new $Register('scraper_constructors');
2972
3012
  *
2973
3013
  * @private internal function of `createScrapersFromConfiguration` and `createScrapersFromEnv`
2974
3014
  */
2975
- function $registeredScrapersMessage() {
2976
- var e_1, _a, e_2, _b;
3015
+ function $registeredScrapersMessage(availableScrapers) {
3016
+ var e_1, _a, e_2, _b, e_3, _c;
2977
3017
  /**
2978
3018
  * Mixes registered scrapers from $scrapersMetadataRegister and $scrapersRegister
2979
3019
  */
@@ -2985,15 +3025,15 @@ function $registeredScrapersMessage() {
2985
3025
  all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
2986
3026
  };
2987
3027
  try {
2988
- for (var _c = __values($scrapersMetadataRegister.list()), _d = _c.next(); !_d.done; _d = _c.next()) {
2989
- var _e = _d.value, packageName = _e.packageName, className = _e.className, mimeTypes = _e.mimeTypes, documentationUrl = _e.documentationUrl, isAvilableInBrowser = _e.isAvilableInBrowser;
3028
+ for (var _d = __values($scrapersMetadataRegister.list()), _e = _d.next(); !_e.done; _e = _d.next()) {
3029
+ var _f = _e.value, packageName = _f.packageName, className = _f.className, mimeTypes = _f.mimeTypes, documentationUrl = _f.documentationUrl, isAvilableInBrowser = _f.isAvilableInBrowser;
2990
3030
  _loop_1(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
2991
3031
  }
2992
3032
  }
2993
3033
  catch (e_1_1) { e_1 = { error: e_1_1 }; }
2994
3034
  finally {
2995
3035
  try {
2996
- if (_d && !_d.done && (_a = _c.return)) _a.call(_c);
3036
+ if (_e && !_e.done && (_a = _d.return)) _a.call(_d);
2997
3037
  }
2998
3038
  finally { if (e_1) throw e_1.error; }
2999
3039
  }
@@ -3004,18 +3044,31 @@ function $registeredScrapersMessage() {
3004
3044
  all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
3005
3045
  };
3006
3046
  try {
3007
- for (var _f = __values($scrapersRegister.list()), _g = _f.next(); !_g.done; _g = _f.next()) {
3008
- var _h = _g.value, packageName = _h.packageName, className = _h.className, mimeTypes = _h.mimeTypes, documentationUrl = _h.documentationUrl, isAvilableInBrowser = _h.isAvilableInBrowser;
3047
+ for (var _g = __values($scrapersRegister.list()), _h = _g.next(); !_h.done; _h = _g.next()) {
3048
+ var _j = _h.value, packageName = _j.packageName, className = _j.className, mimeTypes = _j.mimeTypes, documentationUrl = _j.documentationUrl, isAvilableInBrowser = _j.isAvilableInBrowser;
3009
3049
  _loop_2(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
3010
3050
  }
3011
3051
  }
3012
3052
  catch (e_2_1) { e_2 = { error: e_2_1 }; }
3013
3053
  finally {
3014
3054
  try {
3015
- if (_g && !_g.done && (_b = _f.return)) _b.call(_f);
3055
+ if (_h && !_h.done && (_b = _g.return)) _b.call(_g);
3016
3056
  }
3017
3057
  finally { if (e_2) throw e_2.error; }
3018
3058
  }
3059
+ try {
3060
+ for (var availableScrapers_1 = __values(availableScrapers), availableScrapers_1_1 = availableScrapers_1.next(); !availableScrapers_1_1.done; availableScrapers_1_1 = availableScrapers_1.next()) {
3061
+ var metadata_1 = availableScrapers_1_1.value.metadata;
3062
+ all.push(metadata_1);
3063
+ }
3064
+ }
3065
+ catch (e_3_1) { e_3 = { error: e_3_1 }; }
3066
+ finally {
3067
+ try {
3068
+ if (availableScrapers_1_1 && !availableScrapers_1_1.done && (_c = availableScrapers_1.return)) _c.call(availableScrapers_1);
3069
+ }
3070
+ finally { if (e_3) throw e_3.error; }
3071
+ }
3019
3072
  var metadata = all.map(function (metadata) {
3020
3073
  var isMetadataAviailable = $scrapersMetadataRegister
3021
3074
  .list()
@@ -3029,42 +3082,44 @@ function $registeredScrapersMessage() {
3029
3082
  var packageName = _a.packageName, className = _a.className;
3030
3083
  return metadata.packageName === packageName && metadata.className === className;
3031
3084
  });
3032
- return __assign(__assign({}, metadata), { isMetadataAviailable: isMetadataAviailable, isInstalled: isInstalled });
3085
+ var isAvilableInTools = availableScrapers.some(function (_a) {
3086
+ var _b = _a.metadata, packageName = _b.packageName, className = _b.className;
3087
+ return metadata.packageName === packageName && metadata.className === className;
3088
+ });
3089
+ return __assign(__assign({}, metadata), { isMetadataAviailable: isMetadataAviailable, isInstalled: isInstalled, isAvilableInTools: isAvilableInTools });
3033
3090
  });
3034
3091
  if (metadata.length === 0) {
3035
- return "No scrapers are available";
3092
+ return spaceTrim$1("\n **No scrapers are available**\n\n This is a unexpected behavior, you are probably using some broken version of Promptbook\n At least there should be available the metadata of the scrapers\n ");
3036
3093
  }
3037
3094
  return spaceTrim$1(function (block) { return "\n Available scrapers are:\n ".concat(block(metadata
3038
3095
  .map(function (_a, i) {
3039
- var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled, mimeTypes = _a.mimeTypes, isAvilableInBrowser = _a.isAvilableInBrowser;
3040
- var more;
3041
- // TODO: Use documentationUrl
3042
- if (just(false)) {
3043
- more = '';
3044
- }
3045
- else if (!isMetadataAviailable && !isInstalled) {
3046
- // TODO: [�][�] Maybe do allow to do auto-install if package not registered and not found
3047
- more = "*(not installed and no metadata, looks like a unexpected behavior)*";
3048
- }
3049
- else if (isMetadataAviailable && !isInstalled) {
3050
- // TODO: [�][�]
3051
- more = "*(not installed)*";
3052
- }
3053
- else if (!isMetadataAviailable && isInstalled) {
3054
- more = "*(no metadata, looks like a unexpected behavior)*";
3055
- }
3056
- else if (isMetadataAviailable && isInstalled) {
3057
- more = "(installed)";
3058
- }
3059
- else {
3060
- more = "*(unknown state, looks like a unexpected behavior)*";
3061
- }
3096
+ var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled, mimeTypes = _a.mimeTypes, isAvilableInBrowser = _a.isAvilableInBrowser, isAvilableInTools = _a.isAvilableInTools;
3097
+ var more = [];
3098
+ // TODO: [🧠] Maybe use `documentationUrl`
3099
+ if (isMetadataAviailable) {
3100
+ more.push("\u2B1C Metadata registered");
3101
+ } // not else
3102
+ if (isInstalled) {
3103
+ more.push("\uD83D\uDFE9 Installed");
3104
+ } // not else
3105
+ if (isAvilableInTools) {
3106
+ more.push("\uD83D\uDFE6 Available in tools");
3107
+ } // not else
3108
+ if (!isMetadataAviailable && isInstalled) {
3109
+ more.push("When no metadata registered but scraper is installed, it is an unexpected behavior");
3110
+ } // not else
3111
+ if (!isInstalled && isAvilableInTools) {
3112
+ more.push("When the scraper is not installed but available in tools, it is an unexpected compatibility behavior");
3113
+ } // not else
3062
3114
  if (!isAvilableInBrowser) {
3063
- more += " *(not available in browser)*";
3115
+ more.push("Not usable in browser");
3064
3116
  }
3065
- return "".concat(i + 1, ") `").concat(className, "` from `").concat(packageName, "` compatible to scrape ").concat(mimeTypes.join(', '), " ").concat(more);
3117
+ var moreText = more.length === 0 ? '' : " *(".concat(more.join('; '), ")*");
3118
+ return "".concat(i + 1, ") `").concat(className, "` from `").concat(packageName, "` compatible to scrape ").concat(mimeTypes
3119
+ .map(function (mimeType) { return "\"".concat(mimeType, "\""); })
3120
+ .join(', ')).concat(moreText);
3066
3121
  })
3067
- .join('\n')), "\n "); });
3122
+ .join('\n')), "\n\n Legend:\n - \u2B1C **Metadata registered** means that Promptbook knows about the scraper, it is similar to registration in some registry\n - \uD83D\uDFE9 **Installed** means that you have imported package with particular scraper\n - \uD83D\uDFE6 **Available in tools** means that you have passed scraper as dependency into prepare or execution process\n\n "); });
3068
3123
  }
3069
3124
  /**
3070
3125
  * TODO: [®] DRY Register logic
@@ -3312,57 +3367,75 @@ function prepareKnowledgePieces(knowledgeSources, tools, options) {
3312
3367
  _a = options.maxParallelCount, maxParallelCount = _a === void 0 ? DEFAULT_MAX_PARALLEL_COUNT : _a, rootDirname = options.rootDirname, _b = options.isVerbose, isVerbose = _b === void 0 ? DEFAULT_IS_VERBOSE : _b;
3313
3368
  knowledgePreparedUnflatten = new Array(knowledgeSources.length);
3314
3369
  return [4 /*yield*/, forEachAsync(knowledgeSources, { maxParallelCount: maxParallelCount }, function (knowledgeSource, index) { return __awaiter(_this, void 0, void 0, function () {
3315
- var partialPieces, sourceHandler, _a, _b, scraper, partialPiecesUnchecked, e_1_1, pieces;
3316
- var e_1, _c;
3317
- return __generator(this, function (_d) {
3318
- switch (_d.label) {
3370
+ var partialPieces, sourceHandler, scrapers, _loop_1, scrapers_1, scrapers_1_1, scraper, state_1, e_1_1, pieces;
3371
+ var e_1, _a;
3372
+ return __generator(this, function (_b) {
3373
+ switch (_b.label) {
3319
3374
  case 0:
3320
3375
  partialPieces = null;
3321
3376
  return [4 /*yield*/, makeKnowledgeSourceHandler(knowledgeSource, tools, { rootDirname: rootDirname, isVerbose: isVerbose })];
3322
3377
  case 1:
3323
- sourceHandler = _d.sent();
3324
- _d.label = 2;
3378
+ sourceHandler = _b.sent();
3379
+ scrapers = arrayableToArray(tools.scrapers);
3380
+ _loop_1 = function (scraper) {
3381
+ var partialPiecesUnchecked;
3382
+ return __generator(this, function (_c) {
3383
+ switch (_c.label) {
3384
+ case 0:
3385
+ if (!scraper.metadata.mimeTypes.includes(sourceHandler.mimeType)
3386
+ // <- TODO: [🦔] Implement mime-type wildcards
3387
+ ) {
3388
+ return [2 /*return*/, "continue"];
3389
+ }
3390
+ return [4 /*yield*/, scraper.scrape(sourceHandler)];
3391
+ case 1:
3392
+ partialPiecesUnchecked = _c.sent();
3393
+ if (partialPiecesUnchecked !== null) {
3394
+ partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
3395
+ return [2 /*return*/, "break"];
3396
+ }
3397
+ console.warn(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge from source despite the scraper `".concat(scraper.metadata.className, "` supports the mime type \"").concat(sourceHandler.mimeType, "\".\n \n The source:\n > ").concat(block(knowledgeSource.sourceContent
3398
+ .split('\n')
3399
+ .map(function (line) { return "> ".concat(line); })
3400
+ .join('\n')), "\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
3401
+ return [2 /*return*/];
3402
+ }
3403
+ });
3404
+ };
3405
+ _b.label = 2;
3325
3406
  case 2:
3326
- _d.trys.push([2, 7, 8, 9]);
3327
- _a = __values(arrayableToArray(tools.scrapers)), _b = _a.next();
3328
- _d.label = 3;
3407
+ _b.trys.push([2, 7, 8, 9]);
3408
+ scrapers_1 = __values(scrapers), scrapers_1_1 = scrapers_1.next();
3409
+ _b.label = 3;
3329
3410
  case 3:
3330
- if (!!_b.done) return [3 /*break*/, 6];
3331
- scraper = _b.value;
3332
- if (!scraper.metadata.mimeTypes.includes(sourceHandler.mimeType)
3333
- // <- TODO: [🦔] Implement mime-type wildcards
3334
- ) {
3335
- return [3 /*break*/, 5];
3336
- }
3337
- return [4 /*yield*/, scraper.scrape(sourceHandler)];
3411
+ if (!!scrapers_1_1.done) return [3 /*break*/, 6];
3412
+ scraper = scrapers_1_1.value;
3413
+ return [5 /*yield**/, _loop_1(scraper)];
3338
3414
  case 4:
3339
- partialPiecesUnchecked = _d.sent();
3340
- if (partialPiecesUnchecked !== null) {
3341
- partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
3342
- // <- TODO: [🪓] Here should be no need for spreading new array, just `partialPieces = partialPiecesUnchecked`
3415
+ state_1 = _b.sent();
3416
+ if (state_1 === "break")
3343
3417
  return [3 /*break*/, 6];
3344
- }
3345
- _d.label = 5;
3418
+ _b.label = 5;
3346
3419
  case 5:
3347
- _b = _a.next();
3420
+ scrapers_1_1 = scrapers_1.next();
3348
3421
  return [3 /*break*/, 3];
3349
3422
  case 6: return [3 /*break*/, 9];
3350
3423
  case 7:
3351
- e_1_1 = _d.sent();
3424
+ e_1_1 = _b.sent();
3352
3425
  e_1 = { error: e_1_1 };
3353
3426
  return [3 /*break*/, 9];
3354
3427
  case 8:
3355
3428
  try {
3356
- if (_b && !_b.done && (_c = _a.return)) _c.call(_a);
3429
+ if (scrapers_1_1 && !scrapers_1_1.done && (_a = scrapers_1.return)) _a.call(scrapers_1);
3357
3430
  }
3358
3431
  finally { if (e_1) throw e_1.error; }
3359
3432
  return [7 /*endfinally*/];
3360
3433
  case 9:
3361
3434
  if (partialPieces === null) {
3362
- throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge from source:\n \n > ".concat(block(knowledgeSource.sourceContent
3435
+ throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge\n \n The source:\n > ".concat(block(knowledgeSource.sourceContent
3363
3436
  .split('\n')
3364
3437
  .map(function (line) { return "> ".concat(line); })
3365
- .join('\n')), "\n\n No scraper found for the mime type \"").concat(sourceHandler.mimeType, "\"\n\n ").concat(block($registeredScrapersMessage()), "\n\n\n "); }));
3438
+ .join('\n')), "\n\n No scraper found for the mime type \"").concat(sourceHandler.mimeType, "\"\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
3366
3439
  }
3367
3440
  pieces = partialPieces.map(function (partialPiece) { return (__assign(__assign({}, partialPiece), { sources: [
3368
3441
  {
@@ -3788,42 +3861,6 @@ function union() {
3788
3861
  return union;
3789
3862
  }
3790
3863
 
3791
- /**
3792
- * This error indicates problems parsing the format value
3793
- *
3794
- * For example, when the format value is not a valid JSON or CSV
3795
- * This is not thrown directly but in extended classes
3796
- *
3797
- * @public exported from `@promptbook/core`
3798
- */
3799
- var AbstractFormatError = /** @class */ (function (_super) {
3800
- __extends(AbstractFormatError, _super);
3801
- // Note: To allow instanceof do not put here error `name`
3802
- // public readonly name = 'AbstractFormatError';
3803
- function AbstractFormatError(message) {
3804
- var _this = _super.call(this, message) || this;
3805
- Object.setPrototypeOf(_this, AbstractFormatError.prototype);
3806
- return _this;
3807
- }
3808
- return AbstractFormatError;
3809
- }(Error));
3810
-
3811
- /**
3812
- * This error indicates problem with parsing of CSV
3813
- *
3814
- * @public exported from `@promptbook/core`
3815
- */
3816
- var CsvFormatError = /** @class */ (function (_super) {
3817
- __extends(CsvFormatError, _super);
3818
- function CsvFormatError(message) {
3819
- var _this = _super.call(this, message) || this;
3820
- _this.name = 'CsvFormatError';
3821
- Object.setPrototypeOf(_this, CsvFormatError.prototype);
3822
- return _this;
3823
- }
3824
- return CsvFormatError;
3825
- }(AbstractFormatError));
3826
-
3827
3864
  /**
3828
3865
  * @@@
3829
3866
  *
@@ -3864,7 +3901,7 @@ var CsvFormatDefinition = {
3864
3901
  case 0:
3865
3902
  csv = parse(value, __assign(__assign({}, settings), MANDATORY_CSV_SETTINGS));
3866
3903
  if (csv.errors.length !== 0) {
3867
- throw new CsvFormatError(spaceTrim$1(function (block) { return "\n CSV parsing error\n\n ".concat(block(csv.errors.map(function (error) { return error.message; }).join('\n\n')), "\n "); }));
3904
+ throw new CsvFormatError(spaceTrim$1(function (block) { return "\n CSV parsing error\n\n Error(s) from CSV parsing:\n ".concat(block(csv.errors.map(function (error) { return error.message; }).join('\n\n')), "\n\n The CSV data:\n ").concat(block(value), "\n "); }));
3868
3905
  }
3869
3906
  return [4 /*yield*/, Promise.all(csv.data.map(function (row, index) { return __awaiter(_this, void 0, void 0, function () {
3870
3907
  var _a, _b;
@@ -3902,7 +3939,7 @@ var CsvFormatDefinition = {
3902
3939
  case 0:
3903
3940
  csv = parse(value, __assign(__assign({}, settings), MANDATORY_CSV_SETTINGS));
3904
3941
  if (csv.errors.length !== 0) {
3905
- throw new CsvFormatError(spaceTrim$1(function (block) { return "\n CSV parsing error\n\n ".concat(block(csv.errors.map(function (error) { return error.message; }).join('\n\n')), "\n "); }));
3942
+ throw new CsvFormatError(spaceTrim$1(function (block) { return "\n CSV parsing error\n\n Error(s) from CSV parsing:\n ".concat(block(csv.errors.map(function (error) { return error.message; }).join('\n\n')), "\n\n The CSV data:\n ").concat(block(value), "\n "); }));
3906
3943
  }
3907
3944
  return [4 /*yield*/, Promise.all(csv.data.map(function (row, rowIndex) { return __awaiter(_this, void 0, void 0, function () {
3908
3945
  var _this = this;