@promptbook/markdown-utils 0.84.0-12 → 0.84.0-14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/esm/index.es.js CHANGED
@@ -6,7 +6,7 @@ import hexEncoder from 'crypto-js/enc-hex';
6
6
  import sha256 from 'crypto-js/sha256';
7
7
  import { basename, join, dirname } from 'path';
8
8
  import { SHA256 } from 'crypto-js';
9
- import { lookup } from 'mime-types';
9
+ import { lookup, extension } from 'mime-types';
10
10
  import { unparse, parse } from 'papaparse';
11
11
 
12
12
  // ⚠️ WARNING: This code has been generated so that any manual changes will be overwritten
@@ -23,7 +23,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
23
23
  * @generated
24
24
  * @see https://github.com/webgptorg/promptbook
25
25
  */
26
- var PROMPTBOOK_ENGINE_VERSION = '0.84.0-11';
26
+ var PROMPTBOOK_ENGINE_VERSION = '0.84.0-13';
27
27
  /**
28
28
  * TODO: string_promptbook_version should be constrained to all released versions of the Promptbook engine
29
29
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -3206,22 +3206,6 @@ function $registeredScrapersMessage(availableScrapers) {
3206
3206
  * TODO: [®] DRY Register logic
3207
3207
  */
3208
3208
 
3209
- /**
3210
- * Removes emojis from a string and fix whitespaces
3211
- *
3212
- * @param text with emojis
3213
- * @returns text without emojis
3214
- * @public exported from `@promptbook/utils`
3215
- */
3216
- function removeEmojis(text) {
3217
- // Remove emojis (collapsing modifier, variation-selector and ZWJ sequences first, then deleting the remaining pictographs)
3218
- text = text.replace(/(\p{Extended_Pictographic})\p{Modifier_Symbol}/gu, '$1');
3219
- text = text.replace(/(\p{Extended_Pictographic})[\u{FE00}-\u{FE0F}]/gu, '$1');
3220
- text = text.replace(/(\p{Extended_Pictographic})(\u{200D}\p{Extended_Pictographic})*/gu, '$1');
3221
- text = text.replace(/\p{Extended_Pictographic}/gu, '');
3222
- return text;
3223
- }
3224
-
3225
3209
  var defaultDiacriticsRemovalMap = [
3226
3210
  {
3227
3211
  base: 'A',
@@ -3545,30 +3529,6 @@ function normalizeToKebabCase(text) {
3545
3529
  * Note: [💞] Ignore a discrepancy between file name and entity name
3546
3530
  */
3547
3531
 
3548
- /**
3549
- * @@@
3550
- *
3551
- * @param value @@@
3552
- * @returns @@@
3553
- * @example @@@
3554
- * @public exported from `@promptbook/utils`
3555
- */
3556
- function titleToName(value) {
3557
- if (isValidUrl(value)) {
3558
- value = value.replace(/^https?:\/\//, '');
3559
- value = value.replace(/\.html$/, '');
3560
- }
3561
- else if (isValidFilePath(value)) {
3562
- value = basename(value);
3563
- // Note: Keeping extension in the name
3564
- }
3565
- value = value.split('/').join('-');
3566
- value = removeEmojis(value);
3567
- value = normalizeToKebabCase(value);
3568
- // TODO: [🧠] Maybe warn or add some padding to short name which are not good identifiers
3569
- return value;
3570
- }
3571
-
3572
3532
  /**
3573
3533
  * Creates unique name for the source
3574
3534
  *
@@ -3654,6 +3614,57 @@ function isFileExisting(filename, fs) {
3654
3614
  * TODO: [🖇] What about symlinks?
3655
3615
  */
3656
3616
 
3617
+ /**
3618
+ * Convert mime type to file extension
3619
+ *
3620
+ * Note: If the mime type is invalid, `null` is returned
3621
+ *
3622
+ * @private within the repository
3623
+ */
3624
+ function mimeTypeToExtension(value) {
3625
+ return extension(value) || null;
3626
+ }
3627
+
3628
+ /**
3629
+ * Removes emojis from a string and fix whitespaces
3630
+ *
3631
+ * @param text with emojis
3632
+ * @returns text without emojis
3633
+ * @public exported from `@promptbook/utils`
3634
+ */
3635
+ function removeEmojis(text) {
3636
+ // Remove emojis (collapsing modifier, variation-selector and ZWJ sequences first, then deleting the remaining pictographs)
3637
+ text = text.replace(/(\p{Extended_Pictographic})\p{Modifier_Symbol}/gu, '$1');
3638
+ text = text.replace(/(\p{Extended_Pictographic})[\u{FE00}-\u{FE0F}]/gu, '$1');
3639
+ text = text.replace(/(\p{Extended_Pictographic})(\u{200D}\p{Extended_Pictographic})*/gu, '$1');
3640
+ text = text.replace(/\p{Extended_Pictographic}/gu, '');
3641
+ return text;
3642
+ }
3643
+
3644
+ /**
3645
+ * @@@
3646
+ *
3647
+ * @param value @@@
3648
+ * @returns @@@
3649
+ * @example @@@
3650
+ * @public exported from `@promptbook/utils`
3651
+ */
3652
+ function titleToName(value) {
3653
+ if (isValidUrl(value)) {
3654
+ value = value.replace(/^https?:\/\//, '');
3655
+ value = value.replace(/\.html$/, '');
3656
+ }
3657
+ else if (isValidFilePath(value)) {
3658
+ value = basename(value);
3659
+ // Note: Keeping extension in the name
3660
+ }
3661
+ value = value.split('/').join('-');
3662
+ value = removeEmojis(value);
3663
+ value = normalizeToKebabCase(value);
3664
+ // TODO: [🧠] Maybe warn or add some padding to short name which are not good identifiers
3665
+ return value;
3666
+ }
3667
+
3657
3668
  /**
3658
3669
  * The built-in `fetch` function with a lightweight error-handling wrapper, used as the default fetch function in Promptbook scrapers
3659
3670
  *
@@ -3689,7 +3700,7 @@ var scraperFetch = function (url, init) { return __awaiter(void 0, void 0, void
3689
3700
  function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3690
3701
  var _a;
3691
3702
  return __awaiter(this, void 0, void 0, function () {
3692
- var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response, mimeType, filename, hash, rootDirname_1, filepath, _f, _g, _h, _j, _k, filename_1, fileExtension, mimeType;
3703
+ var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response_1, mimeType, basename, hash, rootDirname_1, filepath, _f, _g, _h, _j, _k, filename_1, fileExtension, mimeType;
3693
3704
  return __generator(this, function (_l) {
3694
3705
  switch (_l.label) {
3695
3706
  case 0:
@@ -3705,25 +3716,67 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3705
3716
  url = knowledgeSourceContent;
3706
3717
  return [4 /*yield*/, fetch(url)];
3707
3718
  case 1:
3708
- response = _l.sent();
3709
- mimeType = ((_a = response.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3710
- filename = url.split('/').pop() || titleToName(url);
3719
+ response_1 = _l.sent();
3720
+ mimeType = ((_a = response_1.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3721
+ if (tools.fs === undefined || !url.endsWith('.pdf' /* <- TODO: [💵] */)) {
3722
+ return [2 /*return*/, {
3723
+ source: name,
3724
+ filename: null,
3725
+ url: url,
3726
+ mimeType: mimeType,
3727
+ /*
3728
+ TODO: [🥽]
3729
+ > async asBlob() {
3730
+ > // TODO: [👨🏻‍🤝‍👨🏻] This can be called multiple times BUT when called a second time, the response is already consumed
3731
+ > const content = await response.blob();
3732
+ > return content;
3733
+ > },
3734
+ */
3735
+ asJson: function () {
3736
+ return __awaiter(this, void 0, void 0, function () {
3737
+ var content;
3738
+ return __generator(this, function (_a) {
3739
+ switch (_a.label) {
3740
+ case 0: return [4 /*yield*/, response_1.json()];
3741
+ case 1:
3742
+ content = _a.sent();
3743
+ return [2 /*return*/, content];
3744
+ }
3745
+ });
3746
+ });
3747
+ },
3748
+ asText: function () {
3749
+ return __awaiter(this, void 0, void 0, function () {
3750
+ var content;
3751
+ return __generator(this, function (_a) {
3752
+ switch (_a.label) {
3753
+ case 0: return [4 /*yield*/, response_1.text()];
3754
+ case 1:
3755
+ content = _a.sent();
3756
+ return [2 /*return*/, content];
3757
+ }
3758
+ });
3759
+ });
3760
+ },
3761
+ }];
3762
+ }
3763
+ basename = url.split('/').pop() || titleToName(url);
3711
3764
  hash = sha256(hexEncoder.parse(url)).toString( /* hex */);
3712
3765
  rootDirname_1 = join(process.cwd(), DEFAULT_DOWNLOAD_CACHE_DIRNAME);
3713
- filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(filename.substring(0, MAX_FILENAME_LENGTH), ".pdf")], false));
3766
+ filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(basename.substring(0, MAX_FILENAME_LENGTH), ".").concat(mimeTypeToExtension(mimeType))], false));
3714
3767
  return [4 /*yield*/, tools.fs.mkdir(dirname(join(rootDirname_1, filepath)), { recursive: true })];
3715
3768
  case 2:
3716
3769
  _l.sent();
3717
3770
  _g = (_f = tools.fs).writeFile;
3718
3771
  _h = [join(rootDirname_1, filepath)];
3719
3772
  _k = (_j = Buffer).from;
3720
- return [4 /*yield*/, response.arrayBuffer()];
3773
+ return [4 /*yield*/, response_1.arrayBuffer()];
3721
3774
  case 3: return [4 /*yield*/, _g.apply(_f, _h.concat([_k.apply(_j, [_l.sent()])]))];
3722
3775
  case 4:
3723
3776
  _l.sent();
3724
- // TODO: !!!!!!!! Check the file security
3777
+ // TODO: [💵] Check the file security
3725
3778
  // TODO: !!!!!!!! Check the file size (if it is not too big)
3726
- // TODO: !!!!!!!! Delete the file
3779
+ // TODO: !!!!!!!! Delete the file after the scraping is done
3727
3780
  return [2 /*return*/, makeKnowledgeSourceHandler({ name: name, knowledgeSourceContent: filepath }, tools, __assign(__assign({}, options), { rootDirname: rootDirname_1 }))];
3728
3781
  case 5:
3729
3782
  if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 7];