@promptbook/markitdown 0.84.0-13 → 0.84.0-15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE.md ADDED
@@ -0,0 +1 @@
1
+ [Functional Source License, Version 1.1, ALv2 Future License](https://github.com/getsentry/fsl.software/blob/main/FSL-1.1-ALv2.template.md)
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  <!-- ⚠️ WARNING: This code has been generated so that any manual changes will be overwritten -->
2
2
 
3
- # Promptbook
3
+ # Promptbook
4
4
 
5
5
 
6
6
 
@@ -16,10 +16,10 @@
16
16
 
17
17
  ## ❄ New Features
18
18
 
19
+ - 🐋 **Support of [DeepSeek models](https://www.deepseek.com/)**
19
20
  - 💙 Working [the **Book** language v1.0.0](https://github.com/webgptorg/book)
20
21
  - 🖤 Run books from CLI - `npx ptbk run path/to/your/book`
21
- - 📚 Support of `.docx`, `.doc` and `.pdf` documents
22
- - ✨ **Support of [OpenAI o1 model](https://openai.com/o1/)**
22
+ - 📚 Support of `.docx`, `.doc` and `.pdf` documents as knowledge
23
23
 
24
24
 
25
25
 
package/esm/index.es.js CHANGED
@@ -7,7 +7,7 @@ import { format } from 'prettier';
7
7
  import parserHtml from 'prettier/parser-html';
8
8
  import { forTime } from 'waitasecond';
9
9
  import sha256 from 'crypto-js/sha256';
10
- import { lookup } from 'mime-types';
10
+ import { lookup, extension } from 'mime-types';
11
11
  import { unparse, parse } from 'papaparse';
12
12
 
13
13
  // ⚠️ WARNING: This code has been generated so that any manual changes will be overwritten
@@ -24,7 +24,7 @@ var BOOK_LANGUAGE_VERSION = '1.0.0';
24
24
  * @generated
25
25
  * @see https://github.com/webgptorg/promptbook
26
26
  */
27
- var PROMPTBOOK_ENGINE_VERSION = '0.84.0-12';
27
+ var PROMPTBOOK_ENGINE_VERSION = '0.84.0-14';
28
28
  /**
29
29
  * TODO: string_promptbook_version should be constrained to all versions of the Promptbook engine
30
30
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -3567,6 +3567,17 @@ function getFileExtension(value) {
3567
3567
  return match ? match[1].toLowerCase() : null;
3568
3568
  }
3569
3569
 
3570
+ /**
3571
+ * Convert mime type to file extension
3572
+ *
3573
+ * Note: If the mime type is invalid, `null` is returned
3574
+ *
3575
+ * @private within the repository
3576
+ */
3577
+ function mimeTypeToExtension(value) {
3578
+ return extension(value) || null;
3579
+ }
3580
+
3570
3581
  /**
3571
3582
  * The built-in `fetch` function with a lightweight error handling wrapper as default fetch function used in Promptbook scrapers
3572
3583
  *
@@ -3620,7 +3631,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3620
3631
  case 1:
3621
3632
  response_1 = _l.sent();
3622
3633
  mimeType = ((_a = response_1.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3623
- if (tools.fs === undefined || !url.endsWith('.pdf')) {
3634
+ if (tools.fs === undefined || !url.endsWith('.pdf' /* <- TODO: [💵] */)) {
3624
3635
  return [2 /*return*/, {
3625
3636
  source: name,
3626
3637
  filename: null,
@@ -3665,7 +3676,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3665
3676
  basename = url.split('/').pop() || titleToName(url);
3666
3677
  hash = sha256(hexEncoder.parse(url)).toString( /* hex */);
3667
3678
  rootDirname_1 = join(process.cwd(), DEFAULT_DOWNLOAD_CACHE_DIRNAME);
3668
- filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(basename.substring(0, MAX_FILENAME_LENGTH), ".pdf")], false));
3679
+ filepath = join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(basename.substring(0, MAX_FILENAME_LENGTH), ".").concat(mimeTypeToExtension(mimeType))], false));
3669
3680
  return [4 /*yield*/, tools.fs.mkdir(dirname(join(rootDirname_1, filepath)), { recursive: true })];
3670
3681
  case 2:
3671
3682
  _l.sent();
@@ -3676,9 +3687,9 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3676
3687
  case 3: return [4 /*yield*/, _g.apply(_f, _h.concat([_k.apply(_j, [_l.sent()])]))];
3677
3688
  case 4:
3678
3689
  _l.sent();
3679
- // TODO: !!!!!!!! Check the file security
3690
+ // TODO: [💵] Check the file security
3680
3691
  // TODO: !!!!!!!! Check the file size (if it is not too big)
3681
- // TODO: !!!!!!!! Delete the file
3692
+ // TODO: !!!!!!!! Delete the file after the scraping is done
3682
3693
  return [2 /*return*/, makeKnowledgeSourceHandler({ name: name, knowledgeSourceContent: filepath }, tools, __assign(__assign({}, options), { rootDirname: rootDirname_1 }))];
3683
3694
  case 5:
3684
3695
  if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 7];
@@ -6297,7 +6308,7 @@ var markitdownScraperMetadata = $deepFreeze({
6297
6308
  className: 'MarkitdownScraper',
6298
6309
  mimeTypes: [
6299
6310
  'application/pdf',
6300
- // TODO: Make priority for scrapers and than allow all mime types here:
6311
+ // TODO: [💵] Make priority for scrapers and then analyze which mime-types Markitdown can scrape and allow all mime types here:
6301
6312
  // 'text/html',
6302
6313
  // 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
6303
6314
  ],
@@ -6333,10 +6344,10 @@ var MarkitdownScraper = /** @class */ (function () {
6333
6344
  this.tools = tools;
6334
6345
  this.options = options;
6335
6346
  this.markdownScraper = new MarkdownScraper(tools, options);
6347
+ // Note: Module `markitdown-ts` has no types available, so it is imported using `require`
6336
6348
  // eslint-disable-next-line @typescript-eslint/no-var-requires
6337
6349
  var MarkItDown = require('markitdown-ts').MarkItDown;
6338
- // <- TODO: !!! Use Markitdown directly not through this package
6339
- // <- Note: !!!!!!!
6350
+ // <- TODO: Use Markitdown directly, not through this package
6340
6351
  this.markitdown = new MarkItDown();
6341
6352
  }
6342
6353
  Object.defineProperty(MarkitdownScraper.prototype, "metadata", {
@@ -6381,12 +6392,12 @@ var MarkitdownScraper = /** @class */ (function () {
6381
6392
  case 2:
6382
6393
  if (!!(_f.sent())) return [3 /*break*/, 5];
6383
6394
  src = source.filename || source.url || null;
6384
- console.log('!!!', { src: src, source: source, cacheFilehandler: cacheFilehandler });
6395
+ // console.log('!!!', { src, source, cacheFilehandler });
6385
6396
  if (src === null) {
6386
6397
  throw new UnexpectedError('Source has no filename or url');
6387
6398
  }
6388
6399
  return [4 /*yield*/, this.markitdown.convert(src, {
6389
- // TODO: !!!!!! Pass when sacraping Youtube
6400
+ // TODO: Pass when scraping YouTube
6390
6401
  // enableYoutubeTranscript: true,
6391
6402
  // youtubeTranscriptLanguage: 'en',
6392
6403
  })];
@@ -6396,9 +6407,10 @@ var MarkitdownScraper = /** @class */ (function () {
6396
6407
  throw new Error("Markitdown could not convert the \"".concat(source.source, "\""));
6397
6408
  // <- TODO: !!! Make MarkitdownError
6398
6409
  }
6399
- console.log('!!!', { result: result, cacheFilehandler: cacheFilehandler });
6410
+ // console.log('!!!', { result, cacheFilehandler });
6400
6411
  return [4 /*yield*/, this.tools.fs.writeFile(cacheFilehandler.filename, result.text_content)];
6401
6412
  case 4:
6413
+ // console.log('!!!', { result, cacheFilehandler });
6402
6414
  _f.sent();
6403
6415
  _f.label = 5;
6404
6416
  case 5: return [2 /*return*/, cacheFilehandler];