@promptbook/website-crawler 0.84.0-12 → 0.84.0-14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/esm/index.es.js CHANGED
@@ -9,7 +9,7 @@ import { format } from 'prettier';
9
9
  import parserHtml from 'prettier/parser-html';
10
10
  import { forTime } from 'waitasecond';
11
11
  import sha256 from 'crypto-js/sha256';
12
- import { lookup } from 'mime-types';
12
+ import { lookup, extension } from 'mime-types';
13
13
  import { unparse, parse } from 'papaparse';
14
14
  import { Converter } from 'showdown';
15
15
 
@@ -27,7 +27,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
27
27
  * @generated
28
28
  * @see https://github.com/webgptorg/promptbook
29
29
  */
30
- var PROMPTBOOK_ENGINE_VERSION = '0.84.0-11';
30
+ var PROMPTBOOK_ENGINE_VERSION = '0.84.0-13';
31
31
  /**
32
32
  * TODO: string_promptbook_version should be constrained to all versions of the Promptbook engine
33
33
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -3590,6 +3590,17 @@ function isFileExisting(filename, fs) {
3590
3590
  * TODO: [🖇] What about symlinks?
3591
3591
  */
3592
3592
 
3593
+ /**
3594
+ * Converts a mime type to a file extension by delegating to `extension` from `mime-types`
3595
+ *
3596
+ * Note: If the mime type is invalid (`extension` yields a falsy result), `null` is returned instead of that falsy value
3597
+ *
3598
+ * @private within the repository
3599
+ */
3600
+ function mimeTypeToExtension(value) {
3601
+ return extension(value) || null;
3602
+ }
3603
+
3593
3604
  /**
3594
3605
  * The built-in `fetch` function with a lightweight error-handling wrapper, used as the default fetch function in Promptbook scrapers
3595
3606
  *
@@ -3625,7 +3636,7 @@ var scraperFetch = function (url, init) { return __awaiter(void 0, void 0, void
3625
3636
  function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3626
3637
  var _a;
3627
3638
  return __awaiter(this, void 0, void 0, function () {
3628
- var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response, mimeType, filename, hash, rootDirname_1, filepath, _f, _g, _h, _j, _k, filename_1, fileExtension, mimeType;
3639
+ var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response_1, mimeType, basename, hash, rootDirname_1, filepath, _f, _g, _h, _j, _k, filename_1, fileExtension, mimeType;
3629
3640
  return __generator(this, function (_l) {
3630
3641
  switch (_l.label) {
3631
3642
  case 0:
@@ -3641,25 +3652,67 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3641
3652
  url = knowledgeSourceContent;
3642
3653
  return [4 /*yield*/, fetch(url)];
3643
3654
  case 1:
3644
- response = _l.sent();
3645
- mimeType = ((_a = response.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3646
- filename = url.split('/').pop() || titleToName(url);
3655
+ response_1 = _l.sent();
3656
+ mimeType = ((_a = response_1.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3657
+ if (tools.fs === undefined || !url.endsWith('.pdf' /* <- TODO: [💵] */)) {
3658
+ return [2 /*return*/, {
3659
+ source: name,
3660
+ filename: null,
3661
+ url: url,
3662
+ mimeType: mimeType,
3663
+ /*
3664
+ TODO: [🥽]
3665
+ > async asBlob() {
3666
+ > // TODO: [👨🏻‍🤝‍👨🏻] This can be called multiple times BUT when called a second time, the response is already consumed
3667
+ > const content = await response.blob();
3668
+ > return content;
3669
+ > },
3670
+ */
3671
+ asJson: function () {
3672
+ return __awaiter(this, void 0, void 0, function () {
3673
+ var content;
3674
+ return __generator(this, function (_a) {
3675
+ switch (_a.label) {
3676
+ case 0: return [4 /*yield*/, response_1.json()];
3677
+ case 1:
3678
+ content = _a.sent();
3679
+ return [2 /*return*/, content];
3680
+ }
3681
+ });
3682
+ });
3683
+ },
3684
+ asText: function () {
3685
+ return __awaiter(this, void 0, void 0, function () {
3686
+ var content;
3687
+ return __generator(this, function (_a) {
3688
+ switch (_a.label) {
3689
+ case 0: return [4 /*yield*/, response_1.text()];
3690
+ case 1:
3691
+ content = _a.sent();
3692
+ return [2 /*return*/, content];
3693
+ }
3694
+ });
3695
+ });
3696
+ },
3697
+ }];
3698
+ }
3699
+ basename = url.split('/').pop() || titleToName(url);
3647
3700
  hash = sha256(hexEncoder.parse(url)).toString( /* hex */);
3648
3701
  rootDirname_1 = join(process.cwd(), DEFAULT_DOWNLOAD_CACHE_DIRNAME);
3649
- filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(filename.substring(0, MAX_FILENAME_LENGTH), ".pdf")], false));
3702
+ filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(basename.substring(0, MAX_FILENAME_LENGTH), ".").concat(mimeTypeToExtension(mimeType))], false));
3650
3703
  return [4 /*yield*/, tools.fs.mkdir(dirname(join(rootDirname_1, filepath)), { recursive: true })];
3651
3704
  case 2:
3652
3705
  _l.sent();
3653
3706
  _g = (_f = tools.fs).writeFile;
3654
3707
  _h = [join(rootDirname_1, filepath)];
3655
3708
  _k = (_j = Buffer).from;
3656
- return [4 /*yield*/, response.arrayBuffer()];
3709
+ return [4 /*yield*/, response_1.arrayBuffer()];
3657
3710
  case 3: return [4 /*yield*/, _g.apply(_f, _h.concat([_k.apply(_j, [_l.sent()])]))];
3658
3711
  case 4:
3659
3712
  _l.sent();
3660
- // TODO: !!!!!!!! Check the file security
3713
+ // TODO: [💵] Check the file security
3661
3714
  // TODO: !!!!!!!! Check the file size (if it is not too big)
3662
- // TODO: !!!!!!!! Delete the file
3715
+ // TODO: !!!!!!!! Delete the file after the scraping is done
3663
3716
  return [2 /*return*/, makeKnowledgeSourceHandler({ name: name, knowledgeSourceContent: filepath }, tools, __assign(__assign({}, options), { rootDirname: rootDirname_1 }))];
3664
3717
  case 5:
3665
3718
  if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 7];