@promptbook/website-crawler 0.75.3 → 0.75.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/esm/index.es.js CHANGED
@@ -24,7 +24,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
24
24
  *
25
25
  * @see https://github.com/webgptorg/promptbook
26
26
  */
27
- var PROMPTBOOK_ENGINE_VERSION = '0.75.2';
27
+ var PROMPTBOOK_ENGINE_VERSION = '0.75.3';
28
28
  /**
29
29
  * TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
30
30
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -2972,8 +2972,8 @@ var $scrapersRegister = new $Register('scraper_constructors');
2972
2972
  *
2973
2973
  * @private internal function of `createScrapersFromConfiguration` and `createScrapersFromEnv`
2974
2974
  */
2975
- function $registeredScrapersMessage() {
2976
- var e_1, _a, e_2, _b;
2975
+ function $registeredScrapersMessage(availableScrapers) {
2976
+ var e_1, _a, e_2, _b, e_3, _c;
2977
2977
  /**
2978
2978
  * Mixes registered scrapers from $scrapersMetadataRegister and $scrapersRegister
2979
2979
  */
@@ -2985,15 +2985,15 @@ function $registeredScrapersMessage() {
2985
2985
  all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
2986
2986
  };
2987
2987
  try {
2988
- for (var _c = __values($scrapersMetadataRegister.list()), _d = _c.next(); !_d.done; _d = _c.next()) {
2989
- var _e = _d.value, packageName = _e.packageName, className = _e.className, mimeTypes = _e.mimeTypes, documentationUrl = _e.documentationUrl, isAvilableInBrowser = _e.isAvilableInBrowser;
2988
+ for (var _d = __values($scrapersMetadataRegister.list()), _e = _d.next(); !_e.done; _e = _d.next()) {
2989
+ var _f = _e.value, packageName = _f.packageName, className = _f.className, mimeTypes = _f.mimeTypes, documentationUrl = _f.documentationUrl, isAvilableInBrowser = _f.isAvilableInBrowser;
2990
2990
  _loop_1(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
2991
2991
  }
2992
2992
  }
2993
2993
  catch (e_1_1) { e_1 = { error: e_1_1 }; }
2994
2994
  finally {
2995
2995
  try {
2996
- if (_d && !_d.done && (_a = _c.return)) _a.call(_c);
2996
+ if (_e && !_e.done && (_a = _d.return)) _a.call(_d);
2997
2997
  }
2998
2998
  finally { if (e_1) throw e_1.error; }
2999
2999
  }
@@ -3004,18 +3004,31 @@ function $registeredScrapersMessage() {
3004
3004
  all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
3005
3005
  };
3006
3006
  try {
3007
- for (var _f = __values($scrapersRegister.list()), _g = _f.next(); !_g.done; _g = _f.next()) {
3008
- var _h = _g.value, packageName = _h.packageName, className = _h.className, mimeTypes = _h.mimeTypes, documentationUrl = _h.documentationUrl, isAvilableInBrowser = _h.isAvilableInBrowser;
3007
+ for (var _g = __values($scrapersRegister.list()), _h = _g.next(); !_h.done; _h = _g.next()) {
3008
+ var _j = _h.value, packageName = _j.packageName, className = _j.className, mimeTypes = _j.mimeTypes, documentationUrl = _j.documentationUrl, isAvilableInBrowser = _j.isAvilableInBrowser;
3009
3009
  _loop_2(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
3010
3010
  }
3011
3011
  }
3012
3012
  catch (e_2_1) { e_2 = { error: e_2_1 }; }
3013
3013
  finally {
3014
3014
  try {
3015
- if (_g && !_g.done && (_b = _f.return)) _b.call(_f);
3015
+ if (_h && !_h.done && (_b = _g.return)) _b.call(_g);
3016
3016
  }
3017
3017
  finally { if (e_2) throw e_2.error; }
3018
3018
  }
3019
+ try {
3020
+ for (var availableScrapers_1 = __values(availableScrapers), availableScrapers_1_1 = availableScrapers_1.next(); !availableScrapers_1_1.done; availableScrapers_1_1 = availableScrapers_1.next()) {
3021
+ var metadata_1 = availableScrapers_1_1.value.metadata;
3022
+ all.push(metadata_1);
3023
+ }
3024
+ }
3025
+ catch (e_3_1) { e_3 = { error: e_3_1 }; }
3026
+ finally {
3027
+ try {
3028
+ if (availableScrapers_1_1 && !availableScrapers_1_1.done && (_c = availableScrapers_1.return)) _c.call(availableScrapers_1);
3029
+ }
3030
+ finally { if (e_3) throw e_3.error; }
3031
+ }
3019
3032
  var metadata = all.map(function (metadata) {
3020
3033
  var isMetadataAviailable = $scrapersMetadataRegister
3021
3034
  .list()
@@ -3029,42 +3042,44 @@ function $registeredScrapersMessage() {
3029
3042
  var packageName = _a.packageName, className = _a.className;
3030
3043
  return metadata.packageName === packageName && metadata.className === className;
3031
3044
  });
3032
- return __assign(__assign({}, metadata), { isMetadataAviailable: isMetadataAviailable, isInstalled: isInstalled });
3045
+ var isAvilableInTools = availableScrapers.some(function (_a) {
3046
+ var _b = _a.metadata, packageName = _b.packageName, className = _b.className;
3047
+ return metadata.packageName === packageName && metadata.className === className;
3048
+ });
3049
+ return __assign(__assign({}, metadata), { isMetadataAviailable: isMetadataAviailable, isInstalled: isInstalled, isAvilableInTools: isAvilableInTools });
3033
3050
  });
3034
3051
  if (metadata.length === 0) {
3035
- return "No scrapers are available";
3052
+ return spaceTrim$1("\n **No scrapers are available**\n\n This is a unexpected behavior, you are probably using some broken version of Promptbook\n At least there should be available the metadata of the scrapers\n ");
3036
3053
  }
3037
3054
  return spaceTrim$1(function (block) { return "\n Available scrapers are:\n ".concat(block(metadata
3038
3055
  .map(function (_a, i) {
3039
- var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled, mimeTypes = _a.mimeTypes, isAvilableInBrowser = _a.isAvilableInBrowser;
3040
- var more;
3041
- // TODO: Use documentationUrl
3042
- if (just(false)) {
3043
- more = '';
3044
- }
3045
- else if (!isMetadataAviailable && !isInstalled) {
3046
- // TODO: [�][�] Maybe do allow to do auto-install if package not registered and not found
3047
- more = "*(not installed and no metadata, looks like a unexpected behavior)*";
3048
- }
3049
- else if (isMetadataAviailable && !isInstalled) {
3050
- // TODO: [�][�]
3051
- more = "*(not installed)*";
3052
- }
3053
- else if (!isMetadataAviailable && isInstalled) {
3054
- more = "*(no metadata, looks like a unexpected behavior)*";
3055
- }
3056
- else if (isMetadataAviailable && isInstalled) {
3057
- more = "(installed)";
3058
- }
3059
- else {
3060
- more = "*(unknown state, looks like a unexpected behavior)*";
3061
- }
3056
+ var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled, mimeTypes = _a.mimeTypes, isAvilableInBrowser = _a.isAvilableInBrowser, isAvilableInTools = _a.isAvilableInTools;
3057
+ var more = [];
3058
+ // TODO: [🧠] Maybe use `documentationUrl`
3059
+ if (isMetadataAviailable) {
3060
+ more.push("\u2B1C Metadata registered");
3061
+ } // not else
3062
+ if (isInstalled) {
3063
+ more.push("\uD83D\uDFE9 Installed");
3064
+ } // not else
3065
+ if (isAvilableInTools) {
3066
+ more.push("\uD83D\uDFE6 Available in tools");
3067
+ } // not else
3068
+ if (!isMetadataAviailable && isInstalled) {
3069
+ more.push("When no metadata registered but scraper is installed, it is an unexpected behavior");
3070
+ } // not else
3071
+ if (!isInstalled && isAvilableInTools) {
3072
+ more.push("When the scraper is not installed but available in tools, it is an unexpected compatibility behavior");
3073
+ } // not else
3062
3074
  if (!isAvilableInBrowser) {
3063
- more += " *(not available in browser)*";
3075
+ more.push("Not usable in browser");
3064
3076
  }
3065
- return "".concat(i + 1, ") `").concat(className, "` from `").concat(packageName, "` compatible to scrape ").concat(mimeTypes.join(', '), " ").concat(more);
3077
+ var moreText = more.length === 0 ? '' : " *(".concat(more.join('; '), ")*");
3078
+ return "".concat(i + 1, ") `").concat(className, "` from `").concat(packageName, "` compatible to scrape ").concat(mimeTypes
3079
+ .map(function (mimeType) { return "\"".concat(mimeType, "\""); })
3080
+ .join(', ')).concat(moreText);
3066
3081
  })
3067
- .join('\n')), "\n "); });
3082
+ .join('\n')), "\n\n Legend:\n - \u2B1C **Metadata registered** means that Promptbook knows about the scraper, it is similar to registration in some registry\n - \uD83D\uDFE9 **Installed** means that you have imported package with particular scraper\n - \uD83D\uDFE6 **Available in tools** means that you have passed scraper as dependency into prepare or execution process\n\n "); });
3068
3083
  }
3069
3084
  /**
3070
3085
  * TODO: [®] DRY Register logic
@@ -3312,57 +3327,75 @@ function prepareKnowledgePieces(knowledgeSources, tools, options) {
3312
3327
  _a = options.maxParallelCount, maxParallelCount = _a === void 0 ? DEFAULT_MAX_PARALLEL_COUNT : _a, rootDirname = options.rootDirname, _b = options.isVerbose, isVerbose = _b === void 0 ? DEFAULT_IS_VERBOSE : _b;
3313
3328
  knowledgePreparedUnflatten = new Array(knowledgeSources.length);
3314
3329
  return [4 /*yield*/, forEachAsync(knowledgeSources, { maxParallelCount: maxParallelCount }, function (knowledgeSource, index) { return __awaiter(_this, void 0, void 0, function () {
3315
- var partialPieces, sourceHandler, _a, _b, scraper, partialPiecesUnchecked, e_1_1, pieces;
3316
- var e_1, _c;
3317
- return __generator(this, function (_d) {
3318
- switch (_d.label) {
3330
+ var partialPieces, sourceHandler, scrapers, _loop_1, scrapers_1, scrapers_1_1, scraper, state_1, e_1_1, pieces;
3331
+ var e_1, _a;
3332
+ return __generator(this, function (_b) {
3333
+ switch (_b.label) {
3319
3334
  case 0:
3320
3335
  partialPieces = null;
3321
3336
  return [4 /*yield*/, makeKnowledgeSourceHandler(knowledgeSource, tools, { rootDirname: rootDirname, isVerbose: isVerbose })];
3322
3337
  case 1:
3323
- sourceHandler = _d.sent();
3324
- _d.label = 2;
3338
+ sourceHandler = _b.sent();
3339
+ scrapers = arrayableToArray(tools.scrapers);
3340
+ _loop_1 = function (scraper) {
3341
+ var partialPiecesUnchecked;
3342
+ return __generator(this, function (_c) {
3343
+ switch (_c.label) {
3344
+ case 0:
3345
+ if (!scraper.metadata.mimeTypes.includes(sourceHandler.mimeType)
3346
+ // <- TODO: [🦔] Implement mime-type wildcards
3347
+ ) {
3348
+ return [2 /*return*/, "continue"];
3349
+ }
3350
+ return [4 /*yield*/, scraper.scrape(sourceHandler)];
3351
+ case 1:
3352
+ partialPiecesUnchecked = _c.sent();
3353
+ if (partialPiecesUnchecked !== null) {
3354
+ partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
3355
+ return [2 /*return*/, "break"];
3356
+ }
3357
+ console.warn(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge from source despite the scraper `".concat(scraper.metadata.className, "` supports the mime type \"").concat(sourceHandler.mimeType, "\".\n \n The source:\n > ").concat(block(knowledgeSource.sourceContent
3358
+ .split('\n')
3359
+ .map(function (line) { return "> ".concat(line); })
3360
+ .join('\n')), "\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
3361
+ return [2 /*return*/];
3362
+ }
3363
+ });
3364
+ };
3365
+ _b.label = 2;
3325
3366
  case 2:
3326
- _d.trys.push([2, 7, 8, 9]);
3327
- _a = __values(arrayableToArray(tools.scrapers)), _b = _a.next();
3328
- _d.label = 3;
3367
+ _b.trys.push([2, 7, 8, 9]);
3368
+ scrapers_1 = __values(scrapers), scrapers_1_1 = scrapers_1.next();
3369
+ _b.label = 3;
3329
3370
  case 3:
3330
- if (!!_b.done) return [3 /*break*/, 6];
3331
- scraper = _b.value;
3332
- if (!scraper.metadata.mimeTypes.includes(sourceHandler.mimeType)
3333
- // <- TODO: [🦔] Implement mime-type wildcards
3334
- ) {
3335
- return [3 /*break*/, 5];
3336
- }
3337
- return [4 /*yield*/, scraper.scrape(sourceHandler)];
3371
+ if (!!scrapers_1_1.done) return [3 /*break*/, 6];
3372
+ scraper = scrapers_1_1.value;
3373
+ return [5 /*yield**/, _loop_1(scraper)];
3338
3374
  case 4:
3339
- partialPiecesUnchecked = _d.sent();
3340
- if (partialPiecesUnchecked !== null) {
3341
- partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
3342
- // <- TODO: [🪓] Here should be no need for spreading new array, just `partialPieces = partialPiecesUnchecked`
3375
+ state_1 = _b.sent();
3376
+ if (state_1 === "break")
3343
3377
  return [3 /*break*/, 6];
3344
- }
3345
- _d.label = 5;
3378
+ _b.label = 5;
3346
3379
  case 5:
3347
- _b = _a.next();
3380
+ scrapers_1_1 = scrapers_1.next();
3348
3381
  return [3 /*break*/, 3];
3349
3382
  case 6: return [3 /*break*/, 9];
3350
3383
  case 7:
3351
- e_1_1 = _d.sent();
3384
+ e_1_1 = _b.sent();
3352
3385
  e_1 = { error: e_1_1 };
3353
3386
  return [3 /*break*/, 9];
3354
3387
  case 8:
3355
3388
  try {
3356
- if (_b && !_b.done && (_c = _a.return)) _c.call(_a);
3389
+ if (scrapers_1_1 && !scrapers_1_1.done && (_a = scrapers_1.return)) _a.call(scrapers_1);
3357
3390
  }
3358
3391
  finally { if (e_1) throw e_1.error; }
3359
3392
  return [7 /*endfinally*/];
3360
3393
  case 9:
3361
3394
  if (partialPieces === null) {
3362
- throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge from source:\n \n > ".concat(block(knowledgeSource.sourceContent
3395
+ throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge\n \n The source:\n > ".concat(block(knowledgeSource.sourceContent
3363
3396
  .split('\n')
3364
3397
  .map(function (line) { return "> ".concat(line); })
3365
- .join('\n')), "\n\n No scraper found for the mime type \"").concat(sourceHandler.mimeType, "\"\n\n ").concat(block($registeredScrapersMessage()), "\n\n\n "); }));
3398
+ .join('\n')), "\n\n No scraper found for the mime type \"").concat(sourceHandler.mimeType, "\"\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
3366
3399
  }
3367
3400
  pieces = partialPieces.map(function (partialPiece) { return (__assign(__assign({}, partialPiece), { sources: [
3368
3401
  {