@promptbook/website-crawler 0.84.0-11 → 0.84.0-13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/index.es.js +85 -48
- package/esm/index.es.js.map +1 -1
- package/esm/typings/src/_packages/core.index.d.ts +4 -2
- package/esm/typings/src/config.d.ts +9 -1
- package/esm/typings/src/execution/FilesystemTools.d.ts +1 -1
- package/esm/typings/src/wizzard/wizzard.d.ts +7 -1
- package/package.json +2 -2
- package/umd/index.umd.js +89 -52
- package/umd/index.umd.js.map +1 -1
package/esm/index.es.js
CHANGED
|
@@ -8,6 +8,7 @@ import { basename, join, dirname } from 'path';
|
|
|
8
8
|
import { format } from 'prettier';
|
|
9
9
|
import parserHtml from 'prettier/parser-html';
|
|
10
10
|
import { forTime } from 'waitasecond';
|
|
11
|
+
import sha256 from 'crypto-js/sha256';
|
|
11
12
|
import { lookup } from 'mime-types';
|
|
12
13
|
import { unparse, parse } from 'papaparse';
|
|
13
14
|
import { Converter } from 'showdown';
|
|
@@ -26,7 +27,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
|
|
|
26
27
|
* @generated
|
|
27
28
|
* @see https://github.com/webgptorg/promptbook
|
|
28
29
|
*/
|
|
29
|
-
var PROMPTBOOK_ENGINE_VERSION = '0.84.0-
|
|
30
|
+
var PROMPTBOOK_ENGINE_VERSION = '0.84.0-12';
|
|
30
31
|
/**
|
|
31
32
|
* TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
|
|
32
33
|
* Note: [๐] Ignore a discrepancy between file name and entity name
|
|
@@ -268,6 +269,12 @@ var SMALL_NUMBER = 0.001;
|
|
|
268
269
|
* @private within the repository - too low-level in comparison with other `MAX_...`
|
|
269
270
|
*/
|
|
270
271
|
var IMMEDIATE_TIME = 10;
|
|
272
|
+
/**
|
|
273
|
+
* The maximum length of the (generated) filename
|
|
274
|
+
*
|
|
275
|
+
* @public exported from `@promptbook/core`
|
|
276
|
+
*/
|
|
277
|
+
var MAX_FILENAME_LENGTH = 30;
|
|
271
278
|
/**
|
|
272
279
|
* Strategy for caching the intermediate results for knowledge sources
|
|
273
280
|
*
|
|
@@ -287,6 +294,15 @@ var DEFAULT_MAX_PARALLEL_COUNT = 5; // <- TODO: [๐คนโโ๏ธ]
|
|
|
287
294
|
* @public exported from `@promptbook/core`
|
|
288
295
|
*/
|
|
289
296
|
var DEFAULT_MAX_EXECUTION_ATTEMPTS = 3; // <- TODO: [๐คนโโ๏ธ]
|
|
297
|
+
// <- TODO: [๐] Make also `BOOKS_DIRNAME_ALTERNATIVES`
|
|
298
|
+
/**
|
|
299
|
+
* Where to store the temporary downloads
|
|
300
|
+
*
|
|
301
|
+
* Note: When the folder does not exist, it is created recursively
|
|
302
|
+
*
|
|
303
|
+
* @public exported from `@promptbook/core`
|
|
304
|
+
*/
|
|
305
|
+
var DEFAULT_DOWNLOAD_CACHE_DIRNAME = './.promptbook/download-cache';
|
|
290
306
|
/**
|
|
291
307
|
* Where to store the scrape cache
|
|
292
308
|
*
|
|
@@ -3609,10 +3625,11 @@ var scraperFetch = function (url, init) { return __awaiter(void 0, void 0, void
|
|
|
3609
3625
|
function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
3610
3626
|
var _a;
|
|
3611
3627
|
return __awaiter(this, void 0, void 0, function () {
|
|
3612
|
-
var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response_1, mimeType, filename_1, fileExtension, mimeType;
|
|
3613
|
-
return __generator(this, function (
|
|
3614
|
-
switch (
|
|
3628
|
+
var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response_1, mimeType, basename, hash, rootDirname_1, filepath, _f, _g, _h, _j, _k, filename_1, fileExtension, mimeType;
|
|
3629
|
+
return __generator(this, function (_l) {
|
|
3630
|
+
switch (_l.label) {
|
|
3615
3631
|
case 0:
|
|
3632
|
+
console.log('!!! makeKnowledgeSourceHandler', knowledgeSource);
|
|
3616
3633
|
_b = tools.fetch, fetch = _b === void 0 ? scraperFetch : _b;
|
|
3617
3634
|
knowledgeSourceContent = knowledgeSource.knowledgeSourceContent;
|
|
3618
3635
|
name = knowledgeSource.name;
|
|
@@ -3620,54 +3637,74 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
|
3620
3637
|
if (!name) {
|
|
3621
3638
|
name = knowledgeSourceContentToName(knowledgeSourceContent);
|
|
3622
3639
|
}
|
|
3623
|
-
if (!isValidUrl(knowledgeSourceContent)) return [3 /*break*/,
|
|
3640
|
+
if (!isValidUrl(knowledgeSourceContent)) return [3 /*break*/, 5];
|
|
3624
3641
|
url = knowledgeSourceContent;
|
|
3625
3642
|
return [4 /*yield*/, fetch(url)];
|
|
3626
3643
|
case 1:
|
|
3627
|
-
response_1 =
|
|
3644
|
+
response_1 = _l.sent();
|
|
3628
3645
|
mimeType = ((_a = response_1.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
|
|
3629
|
-
|
|
3630
|
-
|
|
3631
|
-
|
|
3632
|
-
|
|
3633
|
-
|
|
3634
|
-
|
|
3635
|
-
|
|
3636
|
-
|
|
3637
|
-
|
|
3638
|
-
|
|
3639
|
-
|
|
3640
|
-
|
|
3641
|
-
|
|
3642
|
-
|
|
3643
|
-
|
|
3644
|
-
|
|
3645
|
-
|
|
3646
|
-
|
|
3647
|
-
|
|
3648
|
-
|
|
3649
|
-
|
|
3650
|
-
|
|
3651
|
-
|
|
3646
|
+
if (tools.fs === undefined || !url.endsWith('.pdf')) {
|
|
3647
|
+
return [2 /*return*/, {
|
|
3648
|
+
source: name,
|
|
3649
|
+
filename: null,
|
|
3650
|
+
url: url,
|
|
3651
|
+
mimeType: mimeType,
|
|
3652
|
+
/*
|
|
3653
|
+
TODO: [🥽]
|
|
3654
|
+
> async asBlob() {
|
|
3655
|
+
> // TODO: [👨🏻‍🤝‍👨🏻] This can be called multiple times BUT when called second time, response in already consumed
|
|
3656
|
+
> const content = await response.blob();
|
|
3657
|
+
> return content;
|
|
3658
|
+
> },
|
|
3659
|
+
*/
|
|
3660
|
+
asJson: function () {
|
|
3661
|
+
return __awaiter(this, void 0, void 0, function () {
|
|
3662
|
+
var content;
|
|
3663
|
+
return __generator(this, function (_a) {
|
|
3664
|
+
switch (_a.label) {
|
|
3665
|
+
case 0: return [4 /*yield*/, response_1.json()];
|
|
3666
|
+
case 1:
|
|
3667
|
+
content = _a.sent();
|
|
3668
|
+
return [2 /*return*/, content];
|
|
3669
|
+
}
|
|
3670
|
+
});
|
|
3652
3671
|
});
|
|
3653
|
-
}
|
|
3654
|
-
|
|
3655
|
-
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
|
|
3659
|
-
|
|
3660
|
-
|
|
3661
|
-
|
|
3662
|
-
|
|
3663
|
-
|
|
3664
|
-
}
|
|
3672
|
+
},
|
|
3673
|
+
asText: function () {
|
|
3674
|
+
return __awaiter(this, void 0, void 0, function () {
|
|
3675
|
+
var content;
|
|
3676
|
+
return __generator(this, function (_a) {
|
|
3677
|
+
switch (_a.label) {
|
|
3678
|
+
case 0: return [4 /*yield*/, response_1.text()];
|
|
3679
|
+
case 1:
|
|
3680
|
+
content = _a.sent();
|
|
3681
|
+
return [2 /*return*/, content];
|
|
3682
|
+
}
|
|
3683
|
+
});
|
|
3665
3684
|
});
|
|
3666
|
-
}
|
|
3667
|
-
}
|
|
3668
|
-
|
|
3685
|
+
},
|
|
3686
|
+
}];
|
|
3687
|
+
}
|
|
3688
|
+
basename = url.split('/').pop() || titleToName(url);
|
|
3689
|
+
hash = sha256(hexEncoder.parse(url)).toString( /* hex */);
|
|
3690
|
+
rootDirname_1 = join(process.cwd(), DEFAULT_DOWNLOAD_CACHE_DIRNAME);
|
|
3691
|
+
filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [๐] Maybe add some SHA256 prefix */)), false), ["".concat(basename.substring(0, MAX_FILENAME_LENGTH), ".pdf")], false));
|
|
3692
|
+
return [4 /*yield*/, tools.fs.mkdir(dirname(join(rootDirname_1, filepath)), { recursive: true })];
|
|
3669
3693
|
case 2:
|
|
3670
|
-
|
|
3694
|
+
_l.sent();
|
|
3695
|
+
_g = (_f = tools.fs).writeFile;
|
|
3696
|
+
_h = [join(rootDirname_1, filepath)];
|
|
3697
|
+
_k = (_j = Buffer).from;
|
|
3698
|
+
return [4 /*yield*/, response_1.arrayBuffer()];
|
|
3699
|
+
case 3: return [4 /*yield*/, _g.apply(_f, _h.concat([_k.apply(_j, [_l.sent()])]))];
|
|
3700
|
+
case 4:
|
|
3701
|
+
_l.sent();
|
|
3702
|
+
// TODO: !!!!!!!! Check the file security
|
|
3703
|
+
// TODO: !!!!!!!! Check the file size (if it is not too big)
|
|
3704
|
+
// TODO: !!!!!!!! Delete the file
|
|
3705
|
+
return [2 /*return*/, makeKnowledgeSourceHandler({ name: name, knowledgeSourceContent: filepath }, tools, __assign(__assign({}, options), { rootDirname: rootDirname_1 }))];
|
|
3706
|
+
case 5:
|
|
3707
|
+
if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 7];
|
|
3671
3708
|
if (tools.fs === undefined) {
|
|
3672
3709
|
throw new EnvironmentMismatchError('Can not import file knowledge without filesystem tools');
|
|
3673
3710
|
// <- TODO: [🧠] What is the best error type here`
|
|
@@ -3680,8 +3717,8 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
|
3680
3717
|
fileExtension = getFileExtension(filename_1);
|
|
3681
3718
|
mimeType = extensionToMimeType(fileExtension || '');
|
|
3682
3719
|
return [4 /*yield*/, isFileExisting(filename_1, tools.fs)];
|
|
3683
|
-
case
|
|
3684
|
-
if (!(
|
|
3720
|
+
case 6:
|
|
3721
|
+
if (!(_l.sent())) {
|
|
3685
3722
|
throw new NotFoundError(spaceTrim$1(function (block) { return "\n Can not make source handler for file which does not exist:\n\n File:\n ".concat(block(knowledgeSourceContent), "\n\n Full file path:\n ").concat(block(filename_1), "\n "); }));
|
|
3686
3723
|
}
|
|
3687
3724
|
// TODO: [๐ง ][๐ฟ] Test security file - file is scoped to the project (BUT maybe do this in `filesystemTools`)
|
|
@@ -3727,7 +3764,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
|
3727
3764
|
});
|
|
3728
3765
|
},
|
|
3729
3766
|
}];
|
|
3730
|
-
case
|
|
3767
|
+
case 7: return [2 /*return*/, {
|
|
3731
3768
|
source: name,
|
|
3732
3769
|
filename: null,
|
|
3733
3770
|
url: null,
|