@promptbook/website-crawler 0.75.3 → 0.75.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/index.es.js +98 -65
- package/esm/index.es.js.map +1 -1
- package/esm/typings/src/_packages/core.index.d.ts +2 -0
- package/esm/typings/src/config.d.ts +9 -1
- package/esm/typings/src/scrapers/_common/register/$registeredScrapersMessage.d.ts +2 -1
- package/package.json +2 -2
- package/umd/index.umd.js +98 -65
- package/umd/index.umd.js.map +1 -1
package/esm/index.es.js
CHANGED
|
@@ -24,7 +24,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
|
|
|
24
24
|
*
|
|
25
25
|
* @see https://github.com/webgptorg/promptbook
|
|
26
26
|
*/
|
|
27
|
-
var PROMPTBOOK_ENGINE_VERSION = '0.75.
|
|
27
|
+
var PROMPTBOOK_ENGINE_VERSION = '0.75.3';
|
|
28
28
|
/**
|
|
29
29
|
* TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
|
|
30
30
|
* Note: [💞] Ignore a discrepancy between file name and entity name
|
|
@@ -2972,8 +2972,8 @@ var $scrapersRegister = new $Register('scraper_constructors');
|
|
|
2972
2972
|
*
|
|
2973
2973
|
* @private internal function of `createScrapersFromConfiguration` and `createScrapersFromEnv`
|
|
2974
2974
|
*/
|
|
2975
|
-
function $registeredScrapersMessage() {
|
|
2976
|
-
var e_1, _a, e_2, _b;
|
|
2975
|
+
function $registeredScrapersMessage(availableScrapers) {
|
|
2976
|
+
var e_1, _a, e_2, _b, e_3, _c;
|
|
2977
2977
|
/**
|
|
2978
2978
|
* Mixes registered scrapers from $scrapersMetadataRegister and $scrapersRegister
|
|
2979
2979
|
*/
|
|
@@ -2985,15 +2985,15 @@ function $registeredScrapersMessage() {
|
|
|
2985
2985
|
all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
|
|
2986
2986
|
};
|
|
2987
2987
|
try {
|
|
2988
|
-
for (var
|
|
2989
|
-
var
|
|
2988
|
+
for (var _d = __values($scrapersMetadataRegister.list()), _e = _d.next(); !_e.done; _e = _d.next()) {
|
|
2989
|
+
var _f = _e.value, packageName = _f.packageName, className = _f.className, mimeTypes = _f.mimeTypes, documentationUrl = _f.documentationUrl, isAvilableInBrowser = _f.isAvilableInBrowser;
|
|
2990
2990
|
_loop_1(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
|
|
2991
2991
|
}
|
|
2992
2992
|
}
|
|
2993
2993
|
catch (e_1_1) { e_1 = { error: e_1_1 }; }
|
|
2994
2994
|
finally {
|
|
2995
2995
|
try {
|
|
2996
|
-
if (
|
|
2996
|
+
if (_e && !_e.done && (_a = _d.return)) _a.call(_d);
|
|
2997
2997
|
}
|
|
2998
2998
|
finally { if (e_1) throw e_1.error; }
|
|
2999
2999
|
}
|
|
@@ -3004,18 +3004,31 @@ function $registeredScrapersMessage() {
|
|
|
3004
3004
|
all.push({ packageName: packageName, className: className, mimeTypes: mimeTypes, documentationUrl: documentationUrl, isAvilableInBrowser: isAvilableInBrowser });
|
|
3005
3005
|
};
|
|
3006
3006
|
try {
|
|
3007
|
-
for (var
|
|
3008
|
-
var
|
|
3007
|
+
for (var _g = __values($scrapersRegister.list()), _h = _g.next(); !_h.done; _h = _g.next()) {
|
|
3008
|
+
var _j = _h.value, packageName = _j.packageName, className = _j.className, mimeTypes = _j.mimeTypes, documentationUrl = _j.documentationUrl, isAvilableInBrowser = _j.isAvilableInBrowser;
|
|
3009
3009
|
_loop_2(packageName, className, mimeTypes, documentationUrl, isAvilableInBrowser);
|
|
3010
3010
|
}
|
|
3011
3011
|
}
|
|
3012
3012
|
catch (e_2_1) { e_2 = { error: e_2_1 }; }
|
|
3013
3013
|
finally {
|
|
3014
3014
|
try {
|
|
3015
|
-
if (
|
|
3015
|
+
if (_h && !_h.done && (_b = _g.return)) _b.call(_g);
|
|
3016
3016
|
}
|
|
3017
3017
|
finally { if (e_2) throw e_2.error; }
|
|
3018
3018
|
}
|
|
3019
|
+
try {
|
|
3020
|
+
for (var availableScrapers_1 = __values(availableScrapers), availableScrapers_1_1 = availableScrapers_1.next(); !availableScrapers_1_1.done; availableScrapers_1_1 = availableScrapers_1.next()) {
|
|
3021
|
+
var metadata_1 = availableScrapers_1_1.value.metadata;
|
|
3022
|
+
all.push(metadata_1);
|
|
3023
|
+
}
|
|
3024
|
+
}
|
|
3025
|
+
catch (e_3_1) { e_3 = { error: e_3_1 }; }
|
|
3026
|
+
finally {
|
|
3027
|
+
try {
|
|
3028
|
+
if (availableScrapers_1_1 && !availableScrapers_1_1.done && (_c = availableScrapers_1.return)) _c.call(availableScrapers_1);
|
|
3029
|
+
}
|
|
3030
|
+
finally { if (e_3) throw e_3.error; }
|
|
3031
|
+
}
|
|
3019
3032
|
var metadata = all.map(function (metadata) {
|
|
3020
3033
|
var isMetadataAviailable = $scrapersMetadataRegister
|
|
3021
3034
|
.list()
|
|
@@ -3029,42 +3042,44 @@ function $registeredScrapersMessage() {
|
|
|
3029
3042
|
var packageName = _a.packageName, className = _a.className;
|
|
3030
3043
|
return metadata.packageName === packageName && metadata.className === className;
|
|
3031
3044
|
});
|
|
3032
|
-
|
|
3045
|
+
var isAvilableInTools = availableScrapers.some(function (_a) {
|
|
3046
|
+
var _b = _a.metadata, packageName = _b.packageName, className = _b.className;
|
|
3047
|
+
return metadata.packageName === packageName && metadata.className === className;
|
|
3048
|
+
});
|
|
3049
|
+
return __assign(__assign({}, metadata), { isMetadataAviailable: isMetadataAviailable, isInstalled: isInstalled, isAvilableInTools: isAvilableInTools });
|
|
3033
3050
|
});
|
|
3034
3051
|
if (metadata.length === 0) {
|
|
3035
|
-
return "No scrapers are available";
|
|
3052
|
+
return spaceTrim$1("\n **No scrapers are available**\n\n This is a unexpected behavior, you are probably using some broken version of Promptbook\n At least there should be available the metadata of the scrapers\n ");
|
|
3036
3053
|
}
|
|
3037
3054
|
return spaceTrim$1(function (block) { return "\n Available scrapers are:\n ".concat(block(metadata
|
|
3038
3055
|
.map(function (_a, i) {
|
|
3039
|
-
var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled, mimeTypes = _a.mimeTypes, isAvilableInBrowser = _a.isAvilableInBrowser;
|
|
3040
|
-
var more;
|
|
3041
|
-
// TODO:
|
|
3042
|
-
if (
|
|
3043
|
-
more
|
|
3044
|
-
}
|
|
3045
|
-
|
|
3046
|
-
|
|
3047
|
-
|
|
3048
|
-
|
|
3049
|
-
|
|
3050
|
-
|
|
3051
|
-
|
|
3052
|
-
|
|
3053
|
-
|
|
3054
|
-
|
|
3055
|
-
|
|
3056
|
-
|
|
3057
|
-
more = "(installed)";
|
|
3058
|
-
}
|
|
3059
|
-
else {
|
|
3060
|
-
more = "*(unknown state, looks like a unexpected behavior)*";
|
|
3061
|
-
}
|
|
3056
|
+
var packageName = _a.packageName, className = _a.className, isMetadataAviailable = _a.isMetadataAviailable, isInstalled = _a.isInstalled, mimeTypes = _a.mimeTypes, isAvilableInBrowser = _a.isAvilableInBrowser, isAvilableInTools = _a.isAvilableInTools;
|
|
3057
|
+
var more = [];
|
|
3058
|
+
// TODO: [🧠] Maybe use `documentationUrl`
|
|
3059
|
+
if (isMetadataAviailable) {
|
|
3060
|
+
more.push("\u2B1C Metadata registered");
|
|
3061
|
+
} // not else
|
|
3062
|
+
if (isInstalled) {
|
|
3063
|
+
more.push("\uD83D\uDFE9 Installed");
|
|
3064
|
+
} // not else
|
|
3065
|
+
if (isAvilableInTools) {
|
|
3066
|
+
more.push("\uD83D\uDFE6 Available in tools");
|
|
3067
|
+
} // not else
|
|
3068
|
+
if (!isMetadataAviailable && isInstalled) {
|
|
3069
|
+
more.push("When no metadata registered but scraper is installed, it is an unexpected behavior");
|
|
3070
|
+
} // not else
|
|
3071
|
+
if (!isInstalled && isAvilableInTools) {
|
|
3072
|
+
more.push("When the scraper is not installed but available in tools, it is an unexpected compatibility behavior");
|
|
3073
|
+
} // not else
|
|
3062
3074
|
if (!isAvilableInBrowser) {
|
|
3063
|
-
more
|
|
3075
|
+
more.push("Not usable in browser");
|
|
3064
3076
|
}
|
|
3065
|
-
|
|
3077
|
+
var moreText = more.length === 0 ? '' : " *(".concat(more.join('; '), ")*");
|
|
3078
|
+
return "".concat(i + 1, ") `").concat(className, "` from `").concat(packageName, "` compatible to scrape ").concat(mimeTypes
|
|
3079
|
+
.map(function (mimeType) { return "\"".concat(mimeType, "\""); })
|
|
3080
|
+
.join(', ')).concat(moreText);
|
|
3066
3081
|
})
|
|
3067
|
-
.join('\n')), "\n "); });
|
|
3082
|
+
.join('\n')), "\n\n Legend:\n - \u2B1C **Metadata registered** means that Promptbook knows about the scraper, it is similar to registration in some registry\n - \uD83D\uDFE9 **Installed** means that you have imported package with particular scraper\n - \uD83D\uDFE6 **Available in tools** means that you have passed scraper as dependency into prepare or execution process\n\n "); });
|
|
3068
3083
|
}
|
|
3069
3084
|
/**
|
|
3070
3085
|
* TODO: [®] DRY Register logic
|
|
@@ -3312,57 +3327,75 @@ function prepareKnowledgePieces(knowledgeSources, tools, options) {
|
|
|
3312
3327
|
_a = options.maxParallelCount, maxParallelCount = _a === void 0 ? DEFAULT_MAX_PARALLEL_COUNT : _a, rootDirname = options.rootDirname, _b = options.isVerbose, isVerbose = _b === void 0 ? DEFAULT_IS_VERBOSE : _b;
|
|
3313
3328
|
knowledgePreparedUnflatten = new Array(knowledgeSources.length);
|
|
3314
3329
|
return [4 /*yield*/, forEachAsync(knowledgeSources, { maxParallelCount: maxParallelCount }, function (knowledgeSource, index) { return __awaiter(_this, void 0, void 0, function () {
|
|
3315
|
-
var partialPieces, sourceHandler,
|
|
3316
|
-
var e_1,
|
|
3317
|
-
return __generator(this, function (
|
|
3318
|
-
switch (
|
|
3330
|
+
var partialPieces, sourceHandler, scrapers, _loop_1, scrapers_1, scrapers_1_1, scraper, state_1, e_1_1, pieces;
|
|
3331
|
+
var e_1, _a;
|
|
3332
|
+
return __generator(this, function (_b) {
|
|
3333
|
+
switch (_b.label) {
|
|
3319
3334
|
case 0:
|
|
3320
3335
|
partialPieces = null;
|
|
3321
3336
|
return [4 /*yield*/, makeKnowledgeSourceHandler(knowledgeSource, tools, { rootDirname: rootDirname, isVerbose: isVerbose })];
|
|
3322
3337
|
case 1:
|
|
3323
|
-
sourceHandler =
|
|
3324
|
-
|
|
3338
|
+
sourceHandler = _b.sent();
|
|
3339
|
+
scrapers = arrayableToArray(tools.scrapers);
|
|
3340
|
+
_loop_1 = function (scraper) {
|
|
3341
|
+
var partialPiecesUnchecked;
|
|
3342
|
+
return __generator(this, function (_c) {
|
|
3343
|
+
switch (_c.label) {
|
|
3344
|
+
case 0:
|
|
3345
|
+
if (!scraper.metadata.mimeTypes.includes(sourceHandler.mimeType)
|
|
3346
|
+
// <- TODO: [🦔] Implement mime-type wildcards
|
|
3347
|
+
) {
|
|
3348
|
+
return [2 /*return*/, "continue"];
|
|
3349
|
+
}
|
|
3350
|
+
return [4 /*yield*/, scraper.scrape(sourceHandler)];
|
|
3351
|
+
case 1:
|
|
3352
|
+
partialPiecesUnchecked = _c.sent();
|
|
3353
|
+
if (partialPiecesUnchecked !== null) {
|
|
3354
|
+
partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
|
|
3355
|
+
return [2 /*return*/, "break"];
|
|
3356
|
+
}
|
|
3357
|
+
console.warn(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge from source despite the scraper `".concat(scraper.metadata.className, "` supports the mime type \"").concat(sourceHandler.mimeType, "\".\n \n The source:\n > ").concat(block(knowledgeSource.sourceContent
|
|
3358
|
+
.split('\n')
|
|
3359
|
+
.map(function (line) { return "> ".concat(line); })
|
|
3360
|
+
.join('\n')), "\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
|
|
3361
|
+
return [2 /*return*/];
|
|
3362
|
+
}
|
|
3363
|
+
});
|
|
3364
|
+
};
|
|
3365
|
+
_b.label = 2;
|
|
3325
3366
|
case 2:
|
|
3326
|
-
|
|
3327
|
-
|
|
3328
|
-
|
|
3367
|
+
_b.trys.push([2, 7, 8, 9]);
|
|
3368
|
+
scrapers_1 = __values(scrapers), scrapers_1_1 = scrapers_1.next();
|
|
3369
|
+
_b.label = 3;
|
|
3329
3370
|
case 3:
|
|
3330
|
-
if (!!
|
|
3331
|
-
scraper =
|
|
3332
|
-
|
|
3333
|
-
// <- TODO: [🦔] Implement mime-type wildcards
|
|
3334
|
-
) {
|
|
3335
|
-
return [3 /*break*/, 5];
|
|
3336
|
-
}
|
|
3337
|
-
return [4 /*yield*/, scraper.scrape(sourceHandler)];
|
|
3371
|
+
if (!!scrapers_1_1.done) return [3 /*break*/, 6];
|
|
3372
|
+
scraper = scrapers_1_1.value;
|
|
3373
|
+
return [5 /*yield**/, _loop_1(scraper)];
|
|
3338
3374
|
case 4:
|
|
3339
|
-
|
|
3340
|
-
if (
|
|
3341
|
-
partialPieces = __spreadArray([], __read(partialPiecesUnchecked), false);
|
|
3342
|
-
// <- TODO: [🪓] Here should be no need for spreading new array, just `partialPieces = partialPiecesUnchecked`
|
|
3375
|
+
state_1 = _b.sent();
|
|
3376
|
+
if (state_1 === "break")
|
|
3343
3377
|
return [3 /*break*/, 6];
|
|
3344
|
-
|
|
3345
|
-
_d.label = 5;
|
|
3378
|
+
_b.label = 5;
|
|
3346
3379
|
case 5:
|
|
3347
|
-
|
|
3380
|
+
scrapers_1_1 = scrapers_1.next();
|
|
3348
3381
|
return [3 /*break*/, 3];
|
|
3349
3382
|
case 6: return [3 /*break*/, 9];
|
|
3350
3383
|
case 7:
|
|
3351
|
-
e_1_1 =
|
|
3384
|
+
e_1_1 = _b.sent();
|
|
3352
3385
|
e_1 = { error: e_1_1 };
|
|
3353
3386
|
return [3 /*break*/, 9];
|
|
3354
3387
|
case 8:
|
|
3355
3388
|
try {
|
|
3356
|
-
if (
|
|
3389
|
+
if (scrapers_1_1 && !scrapers_1_1.done && (_a = scrapers_1.return)) _a.call(scrapers_1);
|
|
3357
3390
|
}
|
|
3358
3391
|
finally { if (e_1) throw e_1.error; }
|
|
3359
3392
|
return [7 /*endfinally*/];
|
|
3360
3393
|
case 9:
|
|
3361
3394
|
if (partialPieces === null) {
|
|
3362
|
-
throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge
|
|
3395
|
+
throw new KnowledgeScrapeError(spaceTrim$1(function (block) { return "\n Cannot scrape knowledge\n \n The source:\n > ".concat(block(knowledgeSource.sourceContent
|
|
3363
3396
|
.split('\n')
|
|
3364
3397
|
.map(function (line) { return "> ".concat(line); })
|
|
3365
|
-
.join('\n')), "\n\n No scraper found for the mime type \"").concat(sourceHandler.mimeType, "\"\n\n ").concat(block($registeredScrapersMessage()), "\n\n\n "); }));
|
|
3398
|
+
.join('\n')), "\n\n No scraper found for the mime type \"").concat(sourceHandler.mimeType, "\"\n\n ").concat(block($registeredScrapersMessage(scrapers)), "\n\n\n "); }));
|
|
3366
3399
|
}
|
|
3367
3400
|
pieces = partialPieces.map(function (partialPiece) { return (__assign(__assign({}, partialPiece), { sources: [
|
|
3368
3401
|
{
|