@promptbook/website-crawler 0.84.0-12 → 0.84.0-14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/index.es.js +63 -10
- package/esm/index.es.js.map +1 -1
- package/esm/typings/src/_packages/core.index.d.ts +4 -0
- package/esm/typings/src/config.d.ts +12 -0
- package/esm/typings/src/utils/editable/edit-pipeline-string/deflatePipeline.test.d.ts +1 -0
- package/esm/typings/src/utils/editable/utils/isFlatPipeline.test.d.ts +1 -0
- package/esm/typings/src/utils/files/mimeTypeToExtension.d.ts +10 -0
- package/esm/typings/src/utils/files/mimeTypeToExtension.test.d.ts +1 -0
- package/package.json +2 -2
- package/umd/index.umd.js +62 -9
- package/umd/index.umd.js.map +1 -1
package/esm/index.es.js
CHANGED
|
@@ -9,7 +9,7 @@ import { format } from 'prettier';
|
|
|
9
9
|
import parserHtml from 'prettier/parser-html';
|
|
10
10
|
import { forTime } from 'waitasecond';
|
|
11
11
|
import sha256 from 'crypto-js/sha256';
|
|
12
|
-
import { lookup } from 'mime-types';
|
|
12
|
+
import { lookup, extension } from 'mime-types';
|
|
13
13
|
import { unparse, parse } from 'papaparse';
|
|
14
14
|
import { Converter } from 'showdown';
|
|
15
15
|
|
|
@@ -27,7 +27,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
|
|
|
27
27
|
* @generated
|
|
28
28
|
* @see https://github.com/webgptorg/promptbook
|
|
29
29
|
*/
|
|
30
|
-
var PROMPTBOOK_ENGINE_VERSION = '0.84.0-
|
|
30
|
+
var PROMPTBOOK_ENGINE_VERSION = '0.84.0-13';
|
|
31
31
|
/**
|
|
32
32
|
* TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
|
|
33
33
|
* Note: [💞] Ignore a discrepancy between file name and entity name
|
|
@@ -3590,6 +3590,17 @@ function isFileExisting(filename, fs) {
|
|
|
3590
3590
|
* TODO: [🖇] What about symlinks?
|
|
3591
3591
|
*/
|
|
3592
3592
|
|
|
3593
|
+
/**
|
|
3594
|
+
* Convert mime type to file extension
|
|
3595
|
+
*
|
|
3596
|
+
* Note: If the mime type is invalid, `null` is returned
|
|
3597
|
+
*
|
|
3598
|
+
* @private within the repository
|
|
3599
|
+
*/
|
|
3600
|
+
function mimeTypeToExtension(value) {
|
|
3601
|
+
return extension(value) || null;
|
|
3602
|
+
}
|
|
3603
|
+
|
|
3593
3604
|
/**
|
|
3594
3605
|
* The built-in `fetch` function with a lightweight error handling wrapper as default fetch function used in Promptbook scrapers
|
|
3595
3606
|
*
|
|
@@ -3625,7 +3636,7 @@ var scraperFetch = function (url, init) { return __awaiter(void 0, void 0, void
|
|
|
3625
3636
|
function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
3626
3637
|
var _a;
|
|
3627
3638
|
return __awaiter(this, void 0, void 0, function () {
|
|
3628
|
-
var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url,
|
|
3639
|
+
var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response_1, mimeType, basename, hash, rootDirname_1, filepath, _f, _g, _h, _j, _k, filename_1, fileExtension, mimeType;
|
|
3629
3640
|
return __generator(this, function (_l) {
|
|
3630
3641
|
switch (_l.label) {
|
|
3631
3642
|
case 0:
|
|
@@ -3641,25 +3652,67 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
|
3641
3652
|
url = knowledgeSourceContent;
|
|
3642
3653
|
return [4 /*yield*/, fetch(url)];
|
|
3643
3654
|
case 1:
|
|
3644
|
-
|
|
3645
|
-
mimeType = ((_a =
|
|
3646
|
-
|
|
3655
|
+
response_1 = _l.sent();
|
|
3656
|
+
mimeType = ((_a = response_1.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
|
|
3657
|
+
if (tools.fs === undefined || !url.endsWith('.pdf' /* <- TODO: [💵] */)) {
|
|
3658
|
+
return [2 /*return*/, {
|
|
3659
|
+
source: name,
|
|
3660
|
+
filename: null,
|
|
3661
|
+
url: url,
|
|
3662
|
+
mimeType: mimeType,
|
|
3663
|
+
/*
|
|
3664
|
+
TODO: [🥽]
|
|
3665
|
+
> async asBlob() {
|
|
3666
|
+
> // TODO: [👨🏻🤝👨🏻] This can be called multiple times BUT when called a second time, the response is already consumed
|
|
3667
|
+
> const content = await response.blob();
|
|
3668
|
+
> return content;
|
|
3669
|
+
> },
|
|
3670
|
+
*/
|
|
3671
|
+
asJson: function () {
|
|
3672
|
+
return __awaiter(this, void 0, void 0, function () {
|
|
3673
|
+
var content;
|
|
3674
|
+
return __generator(this, function (_a) {
|
|
3675
|
+
switch (_a.label) {
|
|
3676
|
+
case 0: return [4 /*yield*/, response_1.json()];
|
|
3677
|
+
case 1:
|
|
3678
|
+
content = _a.sent();
|
|
3679
|
+
return [2 /*return*/, content];
|
|
3680
|
+
}
|
|
3681
|
+
});
|
|
3682
|
+
});
|
|
3683
|
+
},
|
|
3684
|
+
asText: function () {
|
|
3685
|
+
return __awaiter(this, void 0, void 0, function () {
|
|
3686
|
+
var content;
|
|
3687
|
+
return __generator(this, function (_a) {
|
|
3688
|
+
switch (_a.label) {
|
|
3689
|
+
case 0: return [4 /*yield*/, response_1.text()];
|
|
3690
|
+
case 1:
|
|
3691
|
+
content = _a.sent();
|
|
3692
|
+
return [2 /*return*/, content];
|
|
3693
|
+
}
|
|
3694
|
+
});
|
|
3695
|
+
});
|
|
3696
|
+
},
|
|
3697
|
+
}];
|
|
3698
|
+
}
|
|
3699
|
+
basename = url.split('/').pop() || titleToName(url);
|
|
3647
3700
|
hash = sha256(hexEncoder.parse(url)).toString( /* hex */);
|
|
3648
3701
|
rootDirname_1 = join(process.cwd(), DEFAULT_DOWNLOAD_CACHE_DIRNAME);
|
|
3649
|
-
filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(
|
|
3702
|
+
filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(basename.substring(0, MAX_FILENAME_LENGTH), ".").concat(mimeTypeToExtension(mimeType))], false));
|
|
3650
3703
|
return [4 /*yield*/, tools.fs.mkdir(dirname(join(rootDirname_1, filepath)), { recursive: true })];
|
|
3651
3704
|
case 2:
|
|
3652
3705
|
_l.sent();
|
|
3653
3706
|
_g = (_f = tools.fs).writeFile;
|
|
3654
3707
|
_h = [join(rootDirname_1, filepath)];
|
|
3655
3708
|
_k = (_j = Buffer).from;
|
|
3656
|
-
return [4 /*yield*/,
|
|
3709
|
+
return [4 /*yield*/, response_1.arrayBuffer()];
|
|
3657
3710
|
case 3: return [4 /*yield*/, _g.apply(_f, _h.concat([_k.apply(_j, [_l.sent()])]))];
|
|
3658
3711
|
case 4:
|
|
3659
3712
|
_l.sent();
|
|
3660
|
-
// TODO:
|
|
3713
|
+
// TODO: [💵] Check the file security
|
|
3661
3714
|
// TODO: !!!!!!!! Check the file size (if it is not too big)
|
|
3662
|
-
// TODO: !!!!!!!! Delete the file
|
|
3715
|
+
// TODO: !!!!!!!! Delete the file after the scraping is done
|
|
3663
3716
|
return [2 /*return*/, makeKnowledgeSourceHandler({ name: name, knowledgeSourceContent: filepath }, tools, __assign(__assign({}, options), { rootDirname: rootDirname_1 }))];
|
|
3664
3717
|
case 5:
|
|
3665
3718
|
if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 7];
|