@promptbook/website-crawler 0.75.2 → 0.75.4

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/esm/index.es.js CHANGED
@@ -24,7 +24,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
24
24
  *
25
25
  * @see https://github.com/webgptorg/promptbook
26
26
  */
27
- var PROMPTBOOK_ENGINE_VERSION = '0.75.1';
27
+ var PROMPTBOOK_ENGINE_VERSION = '0.75.3';
28
28
  /**
29
29
  * TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
30
30
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -2972,50 +2972,63 @@ var $scrapersRegister = new $Register('scraper_constructors');
2972
2972
  *
2973
2973
  * @private internal function of `createScrapersFromConfiguration` and `createScrapersFromEnv`
2974
2974
  */
2975
- function $registeredScrapersMessage() {
2976
- var e_1, _a, e_2, _b;
2975
+ function $registeredScrapersMessage(availableScrapers) {
2976
+ var e_1, _a, e_2, _b, e_3, _c;
2977
2977
  /**
2978
2978
  * Mixes registered scrapers from $scrapersMetadataRegister and $scrapersRegister
2979
2979
  */
2980
2980
  var all = [];
2981
- var _loop_1 = function (packageName, className) {
2981
+ var _loop_1 = function (packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser) {
2982
2982
  if (all.some(function (item) { return item.packageName === packageName && item.className === className; })) {
2983
2983
  return "continue";
2984
2984
  }
2985
- all.push({ packageName: packageName, className: className });
2985
+ all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
2986
2986
  };
2987
2987
  try {
2988
- for (var _c = __values($scrapersMetadataRegister.list()), _d = _c.next(); !_d.done; _d = _c.next()) {
2989
- var _e = _d.value, packageName = _e.packageName, className = _e.className;
2990
- _loop_1(packageName, className);
2988
+ for (var _d = __values($scrapersMetadataRegister.list()), _e = _d.next(); !_e.done; _e = _d.next()) {
2989
+ var _f = _e.value, packageName = _f.packageName, className = _f.className, mimeTypes = _f.mimeTypes, documentationUrl = _f.documentationUrl, isAvilableInBrowser = _f.isAvilableInBrowser;
2990
+ _loop_1(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
2991
2991
  }
2992
2992
  }
2993
2993
  catch (e_1_1) { e_1 = { error: e_1_1 }; }
2994
2994
  finally {
2995
2995
  try {
2996
- if (_d && !_d.done && (_a = _c.return)) _a.call(_c);
2996
+ if (_e && !_e.done && (_a = _d.return)) _a.call(_d);
2997
2997
  }
2998
2998
  finally { if (e_1) throw e_1.error; }
2999
2999
  }
3000
- var _loop_2 = function (packageName, className) {
3000
+ var _loop_2 = function (packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser) {
3001
3001
  if (all.some(function (item) { return item.packageName === packageName && item.className === className; })) {
3002
3002
  return "continue";
3003
3003
  }
3004
- all.push({ packageName: packageName, className: className });
3004
+ all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
3005
3005
  };
3006
3006
  try {
3007
- for (var _f = __values($scrapersRegister.list()), _g = _f.next(); !_g.done; _g = _f.next()) {
3008
- var _h = _g.value, packageName = _h.packageName, className = _h.className;
3009
- _loop_2(packageName, className);
3007
+ for (var _g = __values($scrapersRegister.list()), _h = _g.next(); !_h.done; _h = _g.next()) {
3008
+ var _j = _h.value, packageName = _j.packageName, className = _j.className, mimeTypes = _j.mimeTypes, documentationUrl = _j.documentationUrl, isAvilableInBrowser = _j.isAvilableInBrowser;
3009
+ _loop_2(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
3010
3010
  }
3011
3011
  }
3012
3012
  catch (e_2_1) { e_2 = { error: e_2_1 }; }
3013
3013
  finally {
3014
3014
  try {
3015
- if (_g && !_g.done && (_b = _f.return)) _b.call(_f);
3015
+ if (_h && !_h.done && (_b = _g.return)) _b.call(_g);
3016
3016
  }
3017
3017
  finally { if (e_2) throw e_2.error; }
3018
3018
  }
3019
+ try {
3020
+ for (var availableScrapers_1 = __values(availableScrapers), availableScrapers_1_1 = availableScrapers_1.next(); !availableScrapers_1_1.done; availableScrapers_1_1 = availableScrapers_1.next()) {
3021
+ var metadata_1 = availableScrapers_1_1.value.metadata;
3022
+ all.push(metadata_1);
3023
+ }
3024
+ }
3025
+ catch (e_3_1) { e_3 = { error: e_3_1 }; }
3026
+ finally {
3027
+ try {
3028
+ if (availableScrapers_1_1 && !availableScrapers_1_1.done && (_c = availableScrapers_1.return)) _c.call(availableScrapers_1);
3029
+ }
3030
+ finally { if (e_3) throw e_3.error; }
3031
+ }
3019
3032
  var metadata = all.map(function (metadata) {
3020
3033
  var isMetadataAviailable = $scrapersMetadataRegister
3021
3034
  .list()
@@ -3029,38 +3042,44 @@ function $registeredScrapersMessage() {
3029
3042
  var packageName = _a.packageName, className = _a.className;
3030
3043
  return metadata.packageName === packageName && metadata.className === className;
3031
3044
  });
3032
- return __assign(__assign({}, metadata), { isMetadataAviailable: isMetadataAviailable, isInstalled: isInstalled });
3045
+ var isAvilableInTools = availableScrapers.some(function (_a) {
3046
+ var _b = _a.metadata, packageName = _b.packageName, className = _b.className;
3047
+ return metadata.packageName === packageName && metadata.className === className;
3048
+ });
3049
+ return __assign(__assign({}, metadata), { isMetadataAviailable: isMetadataAviailable, isInstalled: isInstalled, isAvilableInTools: isAvilableInTools });
3033
3050
  });
3034
3051
  if (metadata.length === 0) {
3035
- return "No scrapers are available";
3052
+ return spaceTrim$1("\n **No scrapers are available**\n\n This is a unexpected behavior, you are probably using some broken version of Promptbook\n At least there should be available the metadata of the scrapers\n ");
3036
3053
  }
3037
3054
  return spaceTrim$1(function (block) { return "\n Available scrapers are:\n ".concat(block(metadata
3038
3055
  .map(function (_a, i) {
3039
- var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled;
3040
- var more;
3041
- if (just(false)) {
3042
- more = '';
3043
- }
3044
- else if (!isMetadataAviailable && !isInstalled) {
3045
- // TODO: [�][�] Maybe do allow to do auto-install if package not registered and not found
3046
- more = "(not installed and no metadata, looks like a unexpected behavior)";
3047
- }
3048
- else if (isMetadataAviailable && !isInstalled) {
3049
- // TODO: [�][�]
3050
- more = "(not installed)";
3051
- }
3052
- else if (!isMetadataAviailable && isInstalled) {
3053
- more = "(no metadata, looks like a unexpected behavior)";
3054
- }
3055
- else if (isMetadataAviailable && isInstalled) {
3056
- more = "(installed)";
3057
- }
3058
- else {
3059
- more = "(unknown state, looks like a unexpected behavior)";
3056
+ var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled, mimeTypes = _a.mimeTypes, isAvilableInBrowser = _a.isAvilableInBrowser, isAvilableInTools = _a.isAvilableInTools;
3057
+ var more = [];
3058
+ // TODO: [🧠] Maybe use `documentationUrl`
3059
+ if (isMetadataAviailable) {
3060
+ more.push("\u2B1C Metadata registered");
3061
+ } // not else
3062
+ if (isInstalled) {
3063
+ more.push("\uD83D\uDFE9 Installed");
3064
+ } // not else
3065
+ if (isAvilableInTools) {
3066
+ more.push("\uD83D\uDFE6 Available in tools");
3067
+ } // not else
3068
+ if (!isMetadataAviailable && isInstalled) {
3069
+ more.push("When no metadata registered but scraper is installed, it is an unexpected behavior");
3070
+ } // not else
3071
+ if (!isInstalled && isAvilableInTools) {
3072
+ more.push("When the scraper is not installed but available in tools, it is an unexpected compatibility behavior");
3073
+ } // not else
3074
+ if (!isAvilableInBrowser) {
3075
+ more.push("Not usable in browser");
3060
3076
  }
3061
- return "".concat(i + 1, ") `").concat(className, "` from `").concat(packageName, "` ").concat(more);
3077
+ var moreText = more.length === 0 ? '' : " *(".concat(more.join('; '), ")*");
3078
+ return "".concat(i + 1, ") `").concat(className, "` from `").concat(packageName, "` compatible to scrape ").concat(mimeTypes
3079
+ .map(function (mimeType) { return "\"".concat(mimeType, "\""); })
3080
+ .join(', ')).concat(moreText);
3062
3081
  })
3063
- .join('\n')), "\n "); });
3082
+ .join('\n')), "\n\n Legend:\n - \u2B1C **Metadata registered** means that Promptbook knows about the scraper, it is similar to registration in some registry\n - \uD83D\uDFE9 **Installed** means that you have imported package with particular scraper\n - \uD83D\uDFE6 **Available in tools** means that you have passed scraper as dependency into prepare or execution process\n\n "); });
3064
3083
  }
3065
3084
  /**
3066
3085
  * TODO: [®] DRY Register logic
@@ -3308,54 +3327,75 @@ function prepareKnowledgePieces(knowledgeSources, tools, options) {
3308
3327
  _a = options.maxParallelCount, maxParallelCount = _a === void 0 ? DEFAULT_MAX_PARALLEL_COUNT : _a, rootDirname = options.rootDirname, _b = options.isVerbose, isVerbose = _b === void 0 ? DEFAULT_IS_VERBOSE : _b;
3309
3328
  knowledgePreparedUnflatten = new Array(knowledgeSources.length);
3310
3329
  return [4 /*yield*/, forEachAsync(knowledgeSources, { maxParallelCount: maxParallelCount }, function (knowledgeSource, index) { return __awaiter(_this, void 0, void 0, function () {
3311
- var partialPieces, sourceHandler, _a, _b, scraper, partialPiecesUnchecked, e_1_1, pieces;
3312
- var e_1, _c;
3313
- return __generator(this, function (_d) {
3314
- switch (_d.label) {
3330
+ var partialPieces, sourceHandler, scrapers, _loop_1, scrapers_1, scrapers_1_1, scraper, state_1, e_1_1, pieces;
3331
+ var e_1, _a;
3332
+ return __generator(this, function (_b) {
3333
+ switch (_b.label) {
3315
3334
  case 0:
3316
3335
  partialPieces = null;
3317
3336
  return [4 /*yield*/, makeKnowledgeSourceHandler(knowledgeSource, tools, { rootDirname: rootDirname, isVerbose: isVerbose })];
3318
3337
  case 1:
3319
- sourceHandler = _d.sent();
3320
- _d.label = 2;
3338
+ sourceHandler = _b.sent();
3339
+ scrapers = arrayableToArray(tools.scrapers);
3340
+ _loop_1 = function (scraper) {
3341
+ var partialPiecesUnchecked;
3342
+ return __generator(this, function (_c) {
3343
+ switch (_c.label) {
3344
+ case 0:
3345
+ if (!scraper.metadata.mimeTypes.includes(sourceHandler.mimeType)
3346
+ // <- TODO: [🦔] Implement mime-type wildcards
3347
+ ) {
3348
+ return [2 /*return*/, "continue"];
3349
+ }
3350
+ return [4 /*yield*/, scraper.scrape(sourceHandler)];
3351
+ case 1:
3352
+ partialPiecesUnchecked = _c.sent();
3353
+ if (partialPiecesUnchecked !== null) {
3354
+ partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
3355
+ return [2 /*return*/, "break"];
3356
+ }
3357
+ console.warn(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge from source despite the scraper `".concat(scraper.metadata.className, "` supports the mime type \"").concat(sourceHandler.mimeType, "\".\n \n The source:\n > ").concat(block(knowledgeSource.sourceContent
3358
+ .split('\n')
3359
+ .map(function (line) { return "> ".concat(line); })
3360
+ .join('\n')), "\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
3361
+ return [2 /*return*/];
3362
+ }
3363
+ });
3364
+ };
3365
+ _b.label = 2;
3321
3366
  case 2:
3322
- _d.trys.push([2, 7, 8, 9]);
3323
- _a = __values(arrayableToArray(tools.scrapers)), _b = _a.next();
3324
- _d.label = 3;
3367
+ _b.trys.push([2, 7, 8, 9]);
3368
+ scrapers_1 = __values(scrapers), scrapers_1_1 = scrapers_1.next();
3369
+ _b.label = 3;
3325
3370
  case 3:
3326
- if (!!_b.done) return [3 /*break*/, 6];
3327
- scraper = _b.value;
3328
- if (!scraper.metadata.mimeTypes.includes(sourceHandler.mimeType)
3329
- // <- TODO: [🦔] Implement mime-type wildcards
3330
- ) {
3331
- return [3 /*break*/, 5];
3332
- }
3333
- return [4 /*yield*/, scraper.scrape(sourceHandler)];
3371
+ if (!!scrapers_1_1.done) return [3 /*break*/, 6];
3372
+ scraper = scrapers_1_1.value;
3373
+ return [5 /*yield**/, _loop_1(scraper)];
3334
3374
  case 4:
3335
- partialPiecesUnchecked = _d.sent();
3336
- if (partialPiecesUnchecked !== null) {
3337
- partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
3338
- // <- TODO: [🪓] Here should be no need for spreading new array, just `partialPieces = partialPiecesUnchecked`
3375
+ state_1 = _b.sent();
3376
+ if (state_1 === "break")
3339
3377
  return [3 /*break*/, 6];
3340
- }
3341
- _d.label = 5;
3378
+ _b.label = 5;
3342
3379
  case 5:
3343
- _b = _a.next();
3380
+ scrapers_1_1 = scrapers_1.next();
3344
3381
  return [3 /*break*/, 3];
3345
3382
  case 6: return [3 /*break*/, 9];
3346
3383
  case 7:
3347
- e_1_1 = _d.sent();
3384
+ e_1_1 = _b.sent();
3348
3385
  e_1 = { error: e_1_1 };
3349
3386
  return [3 /*break*/, 9];
3350
3387
  case 8:
3351
3388
  try {
3352
- if (_b && !_b.done && (_c = _a.return)) _c.call(_a);
3389
+ if (scrapers_1_1 && !scrapers_1_1.done && (_a = scrapers_1.return)) _a.call(scrapers_1);
3353
3390
  }
3354
3391
  finally { if (e_1) throw e_1.error; }
3355
3392
  return [7 /*endfinally*/];
3356
3393
  case 9:
3357
3394
  if (partialPieces === null) {
3358
- throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge from source: ".concat(knowledgeSource.sourceContent, "\n\n No scraper found for the mime type \"").concat(sourceHandler.mimeType, "\"\n\n ").concat(block($registeredScrapersMessage()), "\n\n\n "); }));
3395
+ throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge\n \n The source:\n > ".concat(block(knowledgeSource.sourceContent
3396
+ .split('\n')
3397
+ .map(function (line) { return "> ".concat(line); })
3398
+ .join('\n')), "\n\n No scraper found for the mime type \"").concat(sourceHandler.mimeType, "\"\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
3359
3399
  }
3360
3400
  pieces = partialPieces.map(function (partialPiece) { return (__assign(__assign({}, partialPiece), { sources: [
3361
3401
  {