@promptbook/website-crawler 0.75.2 → 0.75.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/index.es.js +107 -67
- package/esm/index.es.js.map +1 -1
- package/esm/typings/src/_packages/core.index.d.ts +2 -0
- package/esm/typings/src/config.d.ts +9 -1
- package/esm/typings/src/scrapers/_common/register/$registeredScrapersMessage.d.ts +2 -1
- package/package.json +2 -2
- package/umd/index.umd.js +107 -67
- package/umd/index.umd.js.map +1 -1
package/esm/index.es.js
CHANGED
|
@@ -24,7 +24,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
|
|
|
24
24
|
*
|
|
25
25
|
* @see https://github.com/webgptorg/promptbook
|
|
26
26
|
*/
|
|
27
|
-
var PROMPTBOOK_ENGINE_VERSION = '0.75.
|
|
27
|
+
var PROMPTBOOK_ENGINE_VERSION = '0.75.3';
|
|
28
28
|
/**
|
|
29
29
|
* TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
|
|
30
30
|
* Note: [💞] Ignore a discrepancy between file name and entity name
|
|
@@ -2972,50 +2972,63 @@ var $scrapersRegister = new $Register('scraper_constructors');
|
|
|
2972
2972
|
*
|
|
2973
2973
|
* @private internal function of `createScrapersFromConfiguration` and `createScrapersFromEnv`
|
|
2974
2974
|
*/
|
|
2975
|
-
function $registeredScrapersMessage() {
|
|
2976
|
-
var e_1, _a, e_2, _b;
|
|
2975
|
+
function $registeredScrapersMessage(availableScrapers) {
|
|
2976
|
+
var e_1, _a, e_2, _b, e_3, _c;
|
|
2977
2977
|
/**
|
|
2978
2978
|
* Mixes registered scrapers from $scrapersMetadataRegister and $scrapersRegister
|
|
2979
2979
|
*/
|
|
2980
2980
|
var all = [];
|
|
2981
|
-
var _loop_1 = function (packageName, className) {
|
|
2981
|
+
var _loop_1 = function (packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser) {
|
|
2982
2982
|
if (all.some(function (item) { return item.packageName === packageName && item.className === className; })) {
|
|
2983
2983
|
return "continue";
|
|
2984
2984
|
}
|
|
2985
|
-
all.push({ packageName: packageName, className: className });
|
|
2985
|
+
all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
|
|
2986
2986
|
};
|
|
2987
2987
|
try {
|
|
2988
|
-
for (var
|
|
2989
|
-
var
|
|
2990
|
-
_loop_1(packageName, className);
|
|
2988
|
+
for (var _d = __values($scrapersMetadataRegister.list()), _e = _d.next(); !_e.done; _e = _d.next()) {
|
|
2989
|
+
var _f = _e.value, packageName = _f.packageName, className = _f.className, mimeTypes = _f.mimeTypes, documentationUrl = _f.documentationUrl, isAvilableInBrowser = _f.isAvilableInBrowser;
|
|
2990
|
+
_loop_1(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
|
|
2991
2991
|
}
|
|
2992
2992
|
}
|
|
2993
2993
|
catch (e_1_1) { e_1 = { error: e_1_1 }; }
|
|
2994
2994
|
finally {
|
|
2995
2995
|
try {
|
|
2996
|
-
if (
|
|
2996
|
+
if (_e && !_e.done && (_a = _d.return)) _a.call(_d);
|
|
2997
2997
|
}
|
|
2998
2998
|
finally { if (e_1) throw e_1.error; }
|
|
2999
2999
|
}
|
|
3000
|
-
var _loop_2 = function (packageName, className) {
|
|
3000
|
+
var _loop_2 = function (packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser) {
|
|
3001
3001
|
if (all.some(function (item) { return item.packageName === packageName && item.className === className; })) {
|
|
3002
3002
|
return "continue";
|
|
3003
3003
|
}
|
|
3004
|
-
all.push({ packageName: packageName, className: className });
|
|
3004
|
+
all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
|
|
3005
3005
|
};
|
|
3006
3006
|
try {
|
|
3007
|
-
for (var
|
|
3008
|
-
var
|
|
3009
|
-
_loop_2(packageName, className);
|
|
3007
|
+
for (var _g = __values($scrapersRegister.list()), _h = _g.next(); !_h.done; _h = _g.next()) {
|
|
3008
|
+
var _j = _h.value, packageName = _j.packageName, className = _j.className, mimeTypes = _j.mimeTypes, documentationUrl = _j.documentationUrl, isAvilableInBrowser = _j.isAvilableInBrowser;
|
|
3009
|
+
_loop_2(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
|
|
3010
3010
|
}
|
|
3011
3011
|
}
|
|
3012
3012
|
catch (e_2_1) { e_2 = { error: e_2_1 }; }
|
|
3013
3013
|
finally {
|
|
3014
3014
|
try {
|
|
3015
|
-
if (
|
|
3015
|
+
if (_h && !_h.done && (_b = _g.return)) _b.call(_g);
|
|
3016
3016
|
}
|
|
3017
3017
|
finally { if (e_2) throw e_2.error; }
|
|
3018
3018
|
}
|
|
3019
|
+
try {
|
|
3020
|
+
for (var availableScrapers_1 = __values(availableScrapers), availableScrapers_1_1 = availableScrapers_1.next(); !availableScrapers_1_1.done; availableScrapers_1_1 = availableScrapers_1.next()) {
|
|
3021
|
+
var metadata_1 = availableScrapers_1_1.value.metadata;
|
|
3022
|
+
all.push(metadata_1);
|
|
3023
|
+
}
|
|
3024
|
+
}
|
|
3025
|
+
catch (e_3_1) { e_3 = { error: e_3_1 }; }
|
|
3026
|
+
finally {
|
|
3027
|
+
try {
|
|
3028
|
+
if (availableScrapers_1_1 && !availableScrapers_1_1.done && (_c = availableScrapers_1.return)) _c.call(availableScrapers_1);
|
|
3029
|
+
}
|
|
3030
|
+
finally { if (e_3) throw e_3.error; }
|
|
3031
|
+
}
|
|
3019
3032
|
var metadata = all.map(function (metadata) {
|
|
3020
3033
|
var isMetadataAviailable = $scrapersMetadataRegister
|
|
3021
3034
|
.list()
|
|
@@ -3029,38 +3042,44 @@ function $registeredScrapersMessage() {
|
|
|
3029
3042
|
var packageName = _a.packageName, className = _a.className;
|
|
3030
3043
|
return metadata.packageName === packageName && metadata.className === className;
|
|
3031
3044
|
});
|
|
3032
|
-
|
|
3045
|
+
var isAvilableInTools = availableScrapers.some(function (_a) {
|
|
3046
|
+
var _b = _a.metadata, packageName = _b.packageName, className = _b.className;
|
|
3047
|
+
return metadata.packageName === packageName && metadata.className === className;
|
|
3048
|
+
});
|
|
3049
|
+
return __assign(__assign({}, metadata), { isMetadataAviailable: isMetadataAviailable, isInstalled: isInstalled, isAvilableInTools: isAvilableInTools });
|
|
3033
3050
|
});
|
|
3034
3051
|
if (metadata.length === 0) {
|
|
3035
|
-
return "No scrapers are available";
|
|
3052
|
+
return spaceTrim$1("\n **No scrapers are available**\n\n This is a unexpected behavior, you are probably using some broken version of Promptbook\n At least there should be available the metadata of the scrapers\n ");
|
|
3036
3053
|
}
|
|
3037
3054
|
return spaceTrim$1(function (block) { return "\n Available scrapers are:\n ".concat(block(metadata
|
|
3038
3055
|
.map(function (_a, i) {
|
|
3039
|
-
var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled;
|
|
3040
|
-
var more;
|
|
3041
|
-
|
|
3042
|
-
|
|
3043
|
-
|
|
3044
|
-
|
|
3045
|
-
|
|
3046
|
-
more
|
|
3047
|
-
}
|
|
3048
|
-
|
|
3049
|
-
|
|
3050
|
-
|
|
3051
|
-
|
|
3052
|
-
|
|
3053
|
-
|
|
3054
|
-
|
|
3055
|
-
|
|
3056
|
-
|
|
3057
|
-
|
|
3058
|
-
|
|
3059
|
-
more = "(unknown state, looks like a unexpected behavior)";
|
|
3056
|
+
var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled, mimeTypes = _a.mimeTypes, isAvilableInBrowser = _a.isAvilableInBrowser, isAvilableInTools = _a.isAvilableInTools;
|
|
3057
|
+
var more = [];
|
|
3058
|
+
// TODO: [🧠] Maybe use `documentationUrl`
|
|
3059
|
+
if (isMetadataAviailable) {
|
|
3060
|
+
more.push("\u2B1C Metadata registered");
|
|
3061
|
+
} // not else
|
|
3062
|
+
if (isInstalled) {
|
|
3063
|
+
more.push("\uD83D\uDFE9 Installed");
|
|
3064
|
+
} // not else
|
|
3065
|
+
if (isAvilableInTools) {
|
|
3066
|
+
more.push("\uD83D\uDFE6 Available in tools");
|
|
3067
|
+
} // not else
|
|
3068
|
+
if (!isMetadataAviailable && isInstalled) {
|
|
3069
|
+
more.push("When no metadata registered but scraper is installed, it is an unexpected behavior");
|
|
3070
|
+
} // not else
|
|
3071
|
+
if (!isInstalled && isAvilableInTools) {
|
|
3072
|
+
more.push("When the scraper is not installed but available in tools, it is an unexpected compatibility behavior");
|
|
3073
|
+
} // not else
|
|
3074
|
+
if (!isAvilableInBrowser) {
|
|
3075
|
+
more.push("Not usable in browser");
|
|
3060
3076
|
}
|
|
3061
|
-
|
|
3077
|
+
var moreText = more.length === 0 ? '' : " *(".concat(more.join('; '), ")*");
|
|
3078
|
+
return "".concat(i + 1, ") `").concat(className, "` from `").concat(packageName, "` compatible to scrape ").concat(mimeTypes
|
|
3079
|
+
.map(function (mimeType) { return "\"".concat(mimeType, "\""); })
|
|
3080
|
+
.join(', ')).concat(moreText);
|
|
3062
3081
|
})
|
|
3063
|
-
.join('\n')), "\n "); });
|
|
3082
|
+
.join('\n')), "\n\n Legend:\n - \u2B1C **Metadata registered** means that Promptbook knows about the scraper, it is similar to registration in some registry\n - \uD83D\uDFE9 **Installed** means that you have imported package with particular scraper\n - \uD83D\uDFE6 **Available in tools** means that you have passed scraper as dependency into prepare or execution process\n\n "); });
|
|
3064
3083
|
}
|
|
3065
3084
|
/**
|
|
3066
3085
|
* TODO: [®] DRY Register logic
|
|
@@ -3308,54 +3327,75 @@ function prepareKnowledgePieces(knowledgeSources, tools, options) {
|
|
|
3308
3327
|
_a = options.maxParallelCount, maxParallelCount = _a === void 0 ? DEFAULT_MAX_PARALLEL_COUNT : _a, rootDirname = options.rootDirname, _b = options.isVerbose, isVerbose = _b === void 0 ? DEFAULT_IS_VERBOSE : _b;
|
|
3309
3328
|
knowledgePreparedUnflatten = new Array(knowledgeSources.length);
|
|
3310
3329
|
return [4 /*yield*/, forEachAsync(knowledgeSources, { maxParallelCount: maxParallelCount }, function (knowledgeSource, index) { return __awaiter(_this, void 0, void 0, function () {
|
|
3311
|
-
var partialPieces, sourceHandler,
|
|
3312
|
-
var e_1,
|
|
3313
|
-
return __generator(this, function (
|
|
3314
|
-
switch (
|
|
3330
|
+
var partialPieces, sourceHandler, scrapers, _loop_1, scrapers_1, scrapers_1_1, scraper, state_1, e_1_1, pieces;
|
|
3331
|
+
var e_1, _a;
|
|
3332
|
+
return __generator(this, function (_b) {
|
|
3333
|
+
switch (_b.label) {
|
|
3315
3334
|
case 0:
|
|
3316
3335
|
partialPieces = null;
|
|
3317
3336
|
return [4 /*yield*/, makeKnowledgeSourceHandler(knowledgeSource, tools, { rootDirname: rootDirname, isVerbose: isVerbose })];
|
|
3318
3337
|
case 1:
|
|
3319
|
-
sourceHandler =
|
|
3320
|
-
|
|
3338
|
+
sourceHandler = _b.sent();
|
|
3339
|
+
scrapers = arrayableToArray(tools.scrapers);
|
|
3340
|
+
_loop_1 = function (scraper) {
|
|
3341
|
+
var partialPiecesUnchecked;
|
|
3342
|
+
return __generator(this, function (_c) {
|
|
3343
|
+
switch (_c.label) {
|
|
3344
|
+
case 0:
|
|
3345
|
+
if (!scraper.metadata.mimeTypes.includes(sourceHandler.mimeType)
|
|
3346
|
+
// <- TODO: [🦔] Implement mime-type wildcards
|
|
3347
|
+
) {
|
|
3348
|
+
return [2 /*return*/, "continue"];
|
|
3349
|
+
}
|
|
3350
|
+
return [4 /*yield*/, scraper.scrape(sourceHandler)];
|
|
3351
|
+
case 1:
|
|
3352
|
+
partialPiecesUnchecked = _c.sent();
|
|
3353
|
+
if (partialPiecesUnchecked !== null) {
|
|
3354
|
+
partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
|
|
3355
|
+
return [2 /*return*/, "break"];
|
|
3356
|
+
}
|
|
3357
|
+
console.warn(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge from source despite the scraper `".concat(scraper.metadata.className, "` supports the mime type \"").concat(sourceHandler.mimeType, "\".\n \n The source:\n > ").concat(block(knowledgeSource.sourceContent
|
|
3358
|
+
.split('\n')
|
|
3359
|
+
.map(function (line) { return "> ".concat(line); })
|
|
3360
|
+
.join('\n')), "\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
|
|
3361
|
+
return [2 /*return*/];
|
|
3362
|
+
}
|
|
3363
|
+
});
|
|
3364
|
+
};
|
|
3365
|
+
_b.label = 2;
|
|
3321
3366
|
case 2:
|
|
3322
|
-
|
|
3323
|
-
|
|
3324
|
-
|
|
3367
|
+
_b.trys.push([2, 7, 8, 9]);
|
|
3368
|
+
scrapers_1 = __values(scrapers), scrapers_1_1 = scrapers_1.next();
|
|
3369
|
+
_b.label = 3;
|
|
3325
3370
|
case 3:
|
|
3326
|
-
if (!!
|
|
3327
|
-
scraper =
|
|
3328
|
-
|
|
3329
|
-
// <- TODO: [🦔] Implement mime-type wildcards
|
|
3330
|
-
) {
|
|
3331
|
-
return [3 /*break*/, 5];
|
|
3332
|
-
}
|
|
3333
|
-
return [4 /*yield*/, scraper.scrape(sourceHandler)];
|
|
3371
|
+
if (!!scrapers_1_1.done) return [3 /*break*/, 6];
|
|
3372
|
+
scraper = scrapers_1_1.value;
|
|
3373
|
+
return [5 /*yield**/, _loop_1(scraper)];
|
|
3334
3374
|
case 4:
|
|
3335
|
-
|
|
3336
|
-
if (
|
|
3337
|
-
partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
|
|
3338
|
-
// <- TODO: [🪓] Here should be no need for spreading new array, just `partialPieces = partialPiecesUnchecked`
|
|
3375
|
+
state_1 = _b.sent();
|
|
3376
|
+
if (state_1 === "break")
|
|
3339
3377
|
return [3 /*break*/, 6];
|
|
3340
|
-
|
|
3341
|
-
_d.label = 5;
|
|
3378
|
+
_b.label = 5;
|
|
3342
3379
|
case 5:
|
|
3343
|
-
|
|
3380
|
+
scrapers_1_1 = scrapers_1.next();
|
|
3344
3381
|
return [3 /*break*/, 3];
|
|
3345
3382
|
case 6: return [3 /*break*/, 9];
|
|
3346
3383
|
case 7:
|
|
3347
|
-
e_1_1 =
|
|
3384
|
+
e_1_1 = _b.sent();
|
|
3348
3385
|
e_1 = { error: e_1_1 };
|
|
3349
3386
|
return [3 /*break*/, 9];
|
|
3350
3387
|
case 8:
|
|
3351
3388
|
try {
|
|
3352
|
-
if (
|
|
3389
|
+
if (scrapers_1_1 && !scrapers_1_1.done && (_a = scrapers_1.return)) _a.call(scrapers_1);
|
|
3353
3390
|
}
|
|
3354
3391
|
finally { if (e_1) throw e_1.error; }
|
|
3355
3392
|
return [7 /*endfinally*/];
|
|
3356
3393
|
case 9:
|
|
3357
3394
|
if (partialPieces === null) {
|
|
3358
|
-
throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge
|
|
3395
|
+
throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge\n \n The source:\n > ".concat(block(knowledgeSource.sourceContent
|
|
3396
|
+
.split('\n')
|
|
3397
|
+
.map(function (line) { return "> ".concat(line); })
|
|
3398
|
+
.join('\n')), "\n\n No scraper found for the mime type \"").concat(sourceHandler.mimeType, "\"\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
|
|
3359
3399
|
}
|
|
3360
3400
|
pieces = partialPieces.map(function (partialPiece) { return (__assign(__assign({}, partialPiece), { sources: [
|
|
3361
3401
|
{
|