@promptbook/markdown-utils 0.84.0-11 → 0.84.0-12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/esm/index.es.js CHANGED
@@ -2,9 +2,10 @@ import spaceTrim, { spaceTrim as spaceTrim$1 } from 'spacetrim';
2
2
  import { format } from 'prettier';
3
3
  import parserHtml from 'prettier/parser-html';
4
4
  import { forTime } from 'waitasecond';
5
- import { join, basename } from 'path';
6
- import { SHA256 } from 'crypto-js';
7
5
  import hexEncoder from 'crypto-js/enc-hex';
6
+ import sha256 from 'crypto-js/sha256';
7
+ import { basename, join, dirname } from 'path';
8
+ import { SHA256 } from 'crypto-js';
8
9
  import { lookup } from 'mime-types';
9
10
  import { unparse, parse } from 'papaparse';
10
11
 
@@ -22,7 +23,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
22
23
  * @generated
23
24
  * @see https://github.com/webgptorg/promptbook
24
25
  */
25
- var PROMPTBOOK_ENGINE_VERSION = '0.84.0-10';
26
+ var PROMPTBOOK_ENGINE_VERSION = '0.84.0-11';
26
27
  /**
27
28
  * TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
28
29
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -808,6 +809,12 @@ var SMALL_NUMBER = 0.001;
808
809
  * @private within the repository - too low-level in comparison with other `MAX_...`
809
810
  */
810
811
  var IMMEDIATE_TIME = 10;
812
+ /**
813
+ * The maximum length of the (generated) filename
814
+ *
815
+ * @public exported from `@promptbook/core`
816
+ */
817
+ var MAX_FILENAME_LENGTH = 30;
811
818
  /**
812
819
  * Strategy for caching the intermediate results for knowledge sources
813
820
  *
@@ -827,6 +834,15 @@ var DEFAULT_MAX_PARALLEL_COUNT = 5; // <- TODO: [🤹‍♂️]
827
834
  * @public exported from `@promptbook/core`
828
835
  */
829
836
  var DEFAULT_MAX_EXECUTION_ATTEMPTS = 3; // <- TODO: [🤹‍♂️]
837
+ // <- TODO: [๐Ÿ•] Make also `BOOKS_DIRNAME_ALTERNATIVES`
838
+ /**
839
+ * Where to store the temporary downloads
840
+ *
841
+ * Note: When the folder does not exist, it is created recursively
842
+ *
843
+ * @public exported from `@promptbook/core`
844
+ */
845
+ var DEFAULT_DOWNLOAD_CACHE_DIRNAME = './.promptbook/download-cache';
830
846
  /**
831
847
  * Where to store the scrape cache
832
848
  *
@@ -3190,6 +3206,22 @@ function $registeredScrapersMessage(availableScrapers) {
3190
3206
  * TODO: [®] DRY Register logic
3191
3207
  */
3192
3208
 
3209
+ /**
3210
+ * Removes emojis from a string and fix whitespaces
3211
+ *
3212
+ * @param text with emojis
3213
+ * @returns text without emojis
3214
+ * @public exported from `@promptbook/utils`
3215
+ */
3216
+ function removeEmojis(text) {
3217
+ // Replace emojis (and also ZWJ sequence) with hyphens
3218
+ text = text.replace(/(\p{Extended_Pictographic})\p{Modifier_Symbol}/gu, '$1');
3219
+ text = text.replace(/(\p{Extended_Pictographic})[\u{FE00}-\u{FE0F}]/gu, '$1');
3220
+ text = text.replace(/(\p{Extended_Pictographic})(\u{200D}\p{Extended_Pictographic})*/gu, '$1');
3221
+ text = text.replace(/\p{Extended_Pictographic}/gu, '');
3222
+ return text;
3223
+ }
3224
+
3193
3225
  var defaultDiacriticsRemovalMap = [
3194
3226
  {
3195
3227
  base: 'A',
@@ -3513,6 +3545,30 @@ function normalizeToKebabCase(text) {
3513
3545
  * Note: [💞] Ignore a discrepancy between file name and entity name
3514
3546
  */
3515
3547
 
3548
+ /**
3549
+ * @@@
3550
+ *
3551
+ * @param value @@@
3552
+ * @returns @@@
3553
+ * @example @@@
3554
+ * @public exported from `@promptbook/utils`
3555
+ */
3556
+ function titleToName(value) {
3557
+ if (isValidUrl(value)) {
3558
+ value = value.replace(/^https?:\/\//, '');
3559
+ value = value.replace(/\.html$/, '');
3560
+ }
3561
+ else if (isValidFilePath(value)) {
3562
+ value = basename(value);
3563
+ // Note: Keeping extension in the name
3564
+ }
3565
+ value = value.split('/').join('-');
3566
+ value = removeEmojis(value);
3567
+ value = normalizeToKebabCase(value);
3568
+ // TODO: [🧠] Maybe warn or add some padding to short name which are not good identifiers
3569
+ return value;
3570
+ }
3571
+
3516
3572
  /**
3517
3573
  * Creates unique name for the source
3518
3574
  *
@@ -3534,6 +3590,15 @@ function knowledgeSourceContentToName(knowledgeSourceContent) {
3534
3590
  * TODO: [🐱‍🐉][🧠] Make some smart crop NOT source-i-m-pavol-a-develop-... BUT source-i-m-pavol-a-developer-...
3535
3591
  */
3536
3592
 
3593
+ /**
3594
+ * @@@
3595
+ *
3596
+ * @private for `FileCacheStorage`
3597
+ */
3598
+ function nameToSubfolderPath(name) {
3599
+ return [name.substr(0, 1).toLowerCase(), name.substr(1, 1).toLowerCase()];
3600
+ }
3601
+
3537
3602
  /**
3538
3603
  * Convert file extension to mime type
3539
3604
  *
@@ -3624,10 +3689,11 @@ var scraperFetch = function (url, init) { return __awaiter(void 0, void 0, void
3624
3689
  function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3625
3690
  var _a;
3626
3691
  return __awaiter(this, void 0, void 0, function () {
3627
- var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response_1, mimeType, filename_1, fileExtension, mimeType;
3628
- return __generator(this, function (_f) {
3629
- switch (_f.label) {
3692
+ var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response, mimeType, filename, hash, rootDirname_1, filepath, _f, _g, _h, _j, _k, filename_1, fileExtension, mimeType;
3693
+ return __generator(this, function (_l) {
3694
+ switch (_l.label) {
3630
3695
  case 0:
3696
+ console.log('!!! makeKnowledgeSourceHandler', knowledgeSource);
3631
3697
  _b = tools.fetch, fetch = _b === void 0 ? scraperFetch : _b;
3632
3698
  knowledgeSourceContent = knowledgeSource.knowledgeSourceContent;
3633
3699
  name = knowledgeSource.name;
@@ -3635,54 +3701,32 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3635
3701
  if (!name) {
3636
3702
  name = knowledgeSourceContentToName(knowledgeSourceContent);
3637
3703
  }
3638
- if (!isValidUrl(knowledgeSourceContent)) return [3 /*break*/, 2];
3704
+ if (!isValidUrl(knowledgeSourceContent)) return [3 /*break*/, 5];
3639
3705
  url = knowledgeSourceContent;
3640
3706
  return [4 /*yield*/, fetch(url)];
3641
3707
  case 1:
3642
- response_1 = _f.sent();
3643
- mimeType = ((_a = response_1.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3644
- return [2 /*return*/, {
3645
- source: name,
3646
- filename: null,
3647
- url: url,
3648
- mimeType: mimeType,
3649
- /*
3650
- TODO: [🥽]
3651
- > async asBlob() {
3652
- > // TODO: [👨🏻‍🤝‍👨🏻] This can be called multiple times BUT when called second time, response in already consumed
3653
- > const content = await response.blob();
3654
- > return content;
3655
- > },
3656
- */
3657
- asJson: function () {
3658
- return __awaiter(this, void 0, void 0, function () {
3659
- var content;
3660
- return __generator(this, function (_a) {
3661
- switch (_a.label) {
3662
- case 0: return [4 /*yield*/, response_1.json()];
3663
- case 1:
3664
- content = _a.sent();
3665
- return [2 /*return*/, content];
3666
- }
3667
- });
3668
- });
3669
- },
3670
- asText: function () {
3671
- return __awaiter(this, void 0, void 0, function () {
3672
- var content;
3673
- return __generator(this, function (_a) {
3674
- switch (_a.label) {
3675
- case 0: return [4 /*yield*/, response_1.text()];
3676
- case 1:
3677
- content = _a.sent();
3678
- return [2 /*return*/, content];
3679
- }
3680
- });
3681
- });
3682
- },
3683
- }];
3708
+ response = _l.sent();
3709
+ mimeType = ((_a = response.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3710
+ filename = url.split('/').pop() || titleToName(url);
3711
+ hash = sha256(hexEncoder.parse(url)).toString( /* hex */);
3712
+ rootDirname_1 = join(process.cwd(), DEFAULT_DOWNLOAD_CACHE_DIRNAME);
3713
+ filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(filename.substring(0, MAX_FILENAME_LENGTH), ".pdf")], false));
3714
+ return [4 /*yield*/, tools.fs.mkdir(dirname(join(rootDirname_1, filepath)), { recursive: true })];
3684
3715
  case 2:
3685
- if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 4];
3716
+ _l.sent();
3717
+ _g = (_f = tools.fs).writeFile;
3718
+ _h = [join(rootDirname_1, filepath)];
3719
+ _k = (_j = Buffer).from;
3720
+ return [4 /*yield*/, response.arrayBuffer()];
3721
+ case 3: return [4 /*yield*/, _g.apply(_f, _h.concat([_k.apply(_j, [_l.sent()])]))];
3722
+ case 4:
3723
+ _l.sent();
3724
+ // TODO: !!!!!!!! Check the file security
3725
+ // TODO: !!!!!!!! Check the file size (if it is not too big)
3726
+ // TODO: !!!!!!!! Delete the file
3727
+ return [2 /*return*/, makeKnowledgeSourceHandler({ name: name, knowledgeSourceContent: filepath }, tools, __assign(__assign({}, options), { rootDirname: rootDirname_1 }))];
3728
+ case 5:
3729
+ if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 7];
3686
3730
  if (tools.fs === undefined) {
3687
3731
  throw new EnvironmentMismatchError('Can not import file knowledge without filesystem tools');
3688
3732
  // <- TODO: [🧠] What is the best error type here`
@@ -3695,8 +3739,8 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3695
3739
  fileExtension = getFileExtension(filename_1);
3696
3740
  mimeType = extensionToMimeType(fileExtension || '');
3697
3741
  return [4 /*yield*/, isFileExisting(filename_1, tools.fs)];
3698
- case 3:
3699
- if (!(_f.sent())) {
3742
+ case 6:
3743
+ if (!(_l.sent())) {
3700
3744
  throw new NotFoundError(spaceTrim(function (block) { return "\n Can not make source handler for file which does not exist:\n\n File:\n ".concat(block(knowledgeSourceContent), "\n\n Full file path:\n ").concat(block(filename_1), "\n "); }));
3701
3745
  }
3702
3746
  // TODO: [🧠][😿] Test security file - file is scoped to the project (BUT maybe do this in `filesystemTools`)
@@ -3742,7 +3786,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3742
3786
  });
3743
3787
  },
3744
3788
  }];
3745
- case 4: return [2 /*return*/, {
3789
+ case 7: return [2 /*return*/, {
3746
3790
  source: name,
3747
3791
  filename: null,
3748
3792
  url: null,
@@ -5972,46 +6016,6 @@ function createPipelineExecutor(options) {
5972
6016
  * TODO: [๐Ÿš] Change onProgress to object that represents the running execution, can be subscribed via RxJS to and also awaited
5973
6017
  */
5974
6018
 
5975
- /**
5976
- * Removes emojis from a string and fix whitespaces
5977
- *
5978
- * @param text with emojis
5979
- * @returns text without emojis
5980
- * @public exported from `@promptbook/utils`
5981
- */
5982
- function removeEmojis(text) {
5983
- // Replace emojis (and also ZWJ sequence) with hyphens
5984
- text = text.replace(/(\p{Extended_Pictographic})\p{Modifier_Symbol}/gu, '$1');
5985
- text = text.replace(/(\p{Extended_Pictographic})[\u{FE00}-\u{FE0F}]/gu, '$1');
5986
- text = text.replace(/(\p{Extended_Pictographic})(\u{200D}\p{Extended_Pictographic})*/gu, '$1');
5987
- text = text.replace(/\p{Extended_Pictographic}/gu, '');
5988
- return text;
5989
- }
5990
-
5991
- /**
5992
- * @@@
5993
- *
5994
- * @param value @@@
5995
- * @returns @@@
5996
- * @example @@@
5997
- * @public exported from `@promptbook/utils`
5998
- */
5999
- function titleToName(value) {
6000
- if (isValidUrl(value)) {
6001
- value = value.replace(/^https?:\/\//, '');
6002
- value = value.replace(/\.html$/, '');
6003
- }
6004
- else if (isValidFilePath(value)) {
6005
- value = basename(value);
6006
- // Note: Keeping extension in the name
6007
- }
6008
- value = value.split('/').join('-');
6009
- value = removeEmojis(value);
6010
- value = normalizeToKebabCase(value);
6011
- // TODO: [🧠] Maybe warn or add some padding to short name which are not good identifiers
6012
- return value;
6013
- }
6014
-
6015
6019
  /**
6016
6020
  * Metadata of the scraper
6017
6021
  *