@promptbook/website-crawler 0.84.0-11 → 0.84.0-12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/esm/index.es.js CHANGED
@@ -8,6 +8,7 @@ import { basename, join, dirname } from 'path';
8
8
  import { format } from 'prettier';
9
9
  import parserHtml from 'prettier/parser-html';
10
10
  import { forTime } from 'waitasecond';
11
+ import sha256 from 'crypto-js/sha256';
11
12
  import { lookup } from 'mime-types';
12
13
  import { unparse, parse } from 'papaparse';
13
14
  import { Converter } from 'showdown';
@@ -26,7 +27,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
26
27
  * @generated
27
28
  * @see https://github.com/webgptorg/promptbook
28
29
  */
29
- var PROMPTBOOK_ENGINE_VERSION = '0.84.0-10';
30
+ var PROMPTBOOK_ENGINE_VERSION = '0.84.0-11';
30
31
  /**
31
32
  * TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
32
33
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -268,6 +269,12 @@ var SMALL_NUMBER = 0.001;
268
269
  * @private within the repository - too low-level in comparison with other `MAX_...`
269
270
  */
270
271
  var IMMEDIATE_TIME = 10;
272
+ /**
273
+ * The maximum length of the (generated) filename
274
+ *
275
+ * @public exported from `@promptbook/core`
276
+ */
277
+ var MAX_FILENAME_LENGTH = 30;
271
278
  /**
272
279
  * Strategy for caching the intermediate results for knowledge sources
273
280
  *
@@ -287,6 +294,15 @@ var DEFAULT_MAX_PARALLEL_COUNT = 5; // <- TODO: [🤹‍♂️]
287
294
  * @public exported from `@promptbook/core`
288
295
  */
289
296
  var DEFAULT_MAX_EXECUTION_ATTEMPTS = 3; // <- TODO: [๐Ÿคนโ€โ™‚๏ธ]
297
+ // <- TODO: [๐Ÿ•] Make also `BOOKS_DIRNAME_ALTERNATIVES`
298
+ /**
299
+ * Where to store the temporary downloads
300
+ *
301
+ * Note: When the folder does not exist, it is created recursively
302
+ *
303
+ * @public exported from `@promptbook/core`
304
+ */
305
+ var DEFAULT_DOWNLOAD_CACHE_DIRNAME = './.promptbook/download-cache';
290
306
  /**
291
307
  * Where to store the scrape cache
292
308
  *
@@ -3609,10 +3625,11 @@ var scraperFetch = function (url, init) { return __awaiter(void 0, void 0, void
3609
3625
  function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3610
3626
  var _a;
3611
3627
  return __awaiter(this, void 0, void 0, function () {
3612
- var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response_1, mimeType, filename_1, fileExtension, mimeType;
3613
- return __generator(this, function (_f) {
3614
- switch (_f.label) {
3628
+ var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response, mimeType, filename, hash, rootDirname_1, filepath, _f, _g, _h, _j, _k, filename_1, fileExtension, mimeType;
3629
+ return __generator(this, function (_l) {
3630
+ switch (_l.label) {
3615
3631
  case 0:
3632
+ console.log('!!! makeKnowledgeSourceHandler', knowledgeSource);
3616
3633
  _b = tools.fetch, fetch = _b === void 0 ? scraperFetch : _b;
3617
3634
  knowledgeSourceContent = knowledgeSource.knowledgeSourceContent;
3618
3635
  name = knowledgeSource.name;
@@ -3620,54 +3637,32 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3620
3637
  if (!name) {
3621
3638
  name = knowledgeSourceContentToName(knowledgeSourceContent);
3622
3639
  }
3623
- if (!isValidUrl(knowledgeSourceContent)) return [3 /*break*/, 2];
3640
+ if (!isValidUrl(knowledgeSourceContent)) return [3 /*break*/, 5];
3624
3641
  url = knowledgeSourceContent;
3625
3642
  return [4 /*yield*/, fetch(url)];
3626
3643
  case 1:
3627
- response_1 = _f.sent();
3628
- mimeType = ((_a = response_1.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3629
- return [2 /*return*/, {
3630
- source: name,
3631
- filename: null,
3632
- url: url,
3633
- mimeType: mimeType,
3634
- /*
3635
- TODO: [🥽]
3636
- > async asBlob() {
3637
- > // TODO: [👨🏻‍🤝‍👨🏻] This can be called multiple times BUT when called second time, response in already consumed
3638
- > const content = await response.blob();
3639
- > return content;
3640
- > },
3641
- */
3642
- asJson: function () {
3643
- return __awaiter(this, void 0, void 0, function () {
3644
- var content;
3645
- return __generator(this, function (_a) {
3646
- switch (_a.label) {
3647
- case 0: return [4 /*yield*/, response_1.json()];
3648
- case 1:
3649
- content = _a.sent();
3650
- return [2 /*return*/, content];
3651
- }
3652
- });
3653
- });
3654
- },
3655
- asText: function () {
3656
- return __awaiter(this, void 0, void 0, function () {
3657
- var content;
3658
- return __generator(this, function (_a) {
3659
- switch (_a.label) {
3660
- case 0: return [4 /*yield*/, response_1.text()];
3661
- case 1:
3662
- content = _a.sent();
3663
- return [2 /*return*/, content];
3664
- }
3665
- });
3666
- });
3667
- },
3668
- }];
3644
+ response = _l.sent();
3645
+ mimeType = ((_a = response.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3646
+ filename = url.split('/').pop() || titleToName(url);
3647
+ hash = sha256(hexEncoder.parse(url)).toString( /* hex */);
3648
+ rootDirname_1 = join(process.cwd(), DEFAULT_DOWNLOAD_CACHE_DIRNAME);
3649
+ filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [๐ŸŽŽ] Maybe add some SHA256 prefix */)), false), ["".concat(filename.substring(0, MAX_FILENAME_LENGTH), ".pdf")], false));
3650
+ return [4 /*yield*/, tools.fs.mkdir(dirname(join(rootDirname_1, filepath)), { recursive: true })];
3669
3651
  case 2:
3670
- if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 4];
3652
+ _l.sent();
3653
+ _g = (_f = tools.fs).writeFile;
3654
+ _h = [join(rootDirname_1, filepath)];
3655
+ _k = (_j = Buffer).from;
3656
+ return [4 /*yield*/, response.arrayBuffer()];
3657
+ case 3: return [4 /*yield*/, _g.apply(_f, _h.concat([_k.apply(_j, [_l.sent()])]))];
3658
+ case 4:
3659
+ _l.sent();
3660
+ // TODO: !!!!!!!! Check the file security
3661
+ // TODO: !!!!!!!! Check the file size (if it is not too big)
3662
+ // TODO: !!!!!!!! Delete the file
3663
+ return [2 /*return*/, makeKnowledgeSourceHandler({ name: name, knowledgeSourceContent: filepath }, tools, __assign(__assign({}, options), { rootDirname: rootDirname_1 }))];
3664
+ case 5:
3665
+ if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 7];
3671
3666
  if (tools.fs === undefined) {
3672
3667
  throw new EnvironmentMismatchError('Can not import file knowledge without filesystem tools');
3673
3668
  // <- TODO: [🧠] What is the best error type here`
@@ -3680,8 +3675,8 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3680
3675
  fileExtension = getFileExtension(filename_1);
3681
3676
  mimeType = extensionToMimeType(fileExtension || '');
3682
3677
  return [4 /*yield*/, isFileExisting(filename_1, tools.fs)];
3683
- case 3:
3684
- if (!(_f.sent())) {
3678
+ case 6:
3679
+ if (!(_l.sent())) {
3685
3680
  throw new NotFoundError(spaceTrim$1(function (block) { return "\n Can not make source handler for file which does not exist:\n\n File:\n ".concat(block(knowledgeSourceContent), "\n\n Full file path:\n ").concat(block(filename_1), "\n "); }));
3686
3681
  }
3687
3682
  // TODO: [🧠][😿] Test security file - file is scoped to the project (BUT maybe do this in `filesystemTools`)
@@ -3727,7 +3722,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3727
3722
  });
3728
3723
  },
3729
3724
  }];
3730
- case 4: return [2 /*return*/, {
3725
+ case 7: return [2 /*return*/, {
3731
3726
  source: name,
3732
3727
  filename: null,
3733
3728
  url: null,