@promptbook/markitdown 0.100.0 → 0.100.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/esm/index.es.js CHANGED
@@ -2,7 +2,7 @@ import { mkdir, rm, readFile } from 'fs/promises';
2
2
  import spaceTrim, { spaceTrim as spaceTrim$1 } from 'spacetrim';
3
3
  import { SHA256 } from 'crypto-js';
4
4
  import hexEncoder from 'crypto-js/enc-hex';
5
- import { basename, join, dirname } from 'path';
5
+ import { basename, join, dirname, isAbsolute } from 'path';
6
6
  import parserHtml from 'prettier/parser-html';
7
7
  import parserMarkdown from 'prettier/parser-markdown';
8
8
  import { format } from 'prettier/standalone';
@@ -27,7 +27,7 @@ const BOOK_LANGUAGE_VERSION = '1.0.0';
27
27
  * @generated
28
28
  * @see https://github.com/webgptorg/promptbook
29
29
  */
30
- const PROMPTBOOK_ENGINE_VERSION = '0.100.0';
30
+ const PROMPTBOOK_ENGINE_VERSION = '0.100.2';
31
31
  /**
32
32
  * TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
33
33
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -722,7 +722,7 @@ function removeEmojis(text) {
722
722
  }
723
723
 
724
724
  /**
725
- * Tests if given string is valid URL.
725
+ * Tests if given string is valid file path.
726
726
  *
727
727
  * Note: This does not check if the file exists only if the path is valid
728
728
  * @public exported from `@promptbook/utils`
@@ -734,18 +734,25 @@ function isValidFilePath(filename) {
734
734
  if (filename.split('\n').length > 1) {
735
735
  return false;
736
736
  }
737
- if (filename.split(' ').length >
738
- 5 /* <- TODO: [๐Ÿง ][๐Ÿˆท] Make some better non-arbitrary way how to distinct filenames from informational texts */) {
737
+ // Normalize slashes early so heuristics can detect path-like inputs
738
+ const filenameSlashes = filename.replace(/\\/g, '/');
739
+ // Reject strings that look like sentences (informational text)
740
+ // Heuristic: contains multiple spaces and ends with a period, or contains typical sentence punctuation
741
+ // But skip this heuristic if the string looks like a path (contains '/' or starts with a drive letter)
742
+ if (filename.trim().length > 60 && // long enough to be a sentence
743
+ /[.!?]/.test(filename) && // contains sentence punctuation
744
+ filename.split(' ').length > 8 && // has many words
745
+ !/\/|^[A-Z]:/i.test(filenameSlashes) // do NOT treat as sentence if looks like a path
746
+ ) {
739
747
  return false;
740
748
  }
741
- const filenameSlashes = filename.split('\\').join('/');
742
749
  // Absolute Unix path: /hello.txt
743
750
  if (/^(\/)/i.test(filenameSlashes)) {
744
751
  // console.log(filename, 'Absolute Unix path: /hello.txt');
745
752
  return true;
746
753
  }
747
- // Absolute Windows path: /hello.txt
748
- if (/^([A-Z]{1,2}:\/?)\//i.test(filenameSlashes)) {
754
+ // Absolute Windows path: C:/ or C:\ (allow spaces and multiple dots in filename)
755
+ if (/^[A-Z]:\/.+$/i.test(filenameSlashes)) {
749
756
  // console.log(filename, 'Absolute Windows path: /hello.txt');
750
757
  return true;
751
758
  }
@@ -2990,7 +2997,7 @@ const LLM_PROVIDER_PROFILES = {
2990
2997
  };
2991
2998
  /**
2992
2999
  * TODO: Refactor this - each profile must be alongside the provider definition
2993
- * TODO: Unite `AvatarProfileProps` and `ChatParticipant`
3000
+ * TODO: [🕛] Unite `AvatarProfileProps`, `ChatParticipant`, `LlmExecutionTools` + `LlmToolsMetadata`
2994
3001
  * Note: [💞] Ignore a discrepancy between file name and entity name
2995
3002
  */
2996
3003
 
@@ -3627,9 +3634,15 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3627
3634
  }
3628
3635
  if (isValidUrl(knowledgeSourceContent)) {
3629
3636
  const url = knowledgeSourceContent;
3637
+ if (isVerbose) {
3638
+ console.info(`๐Ÿ“„ [1] "${name}" is available at "${url}"`);
3639
+ }
3630
3640
  const response = await fetch(url); // <- TODO: [๐Ÿง ] Scraping and fetch proxy
3631
3641
  const mimeType = ((_a = response.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3632
3642
  if (tools.fs === undefined || !url.endsWith('.pdf' /* <- TODO: [๐Ÿ’ต] */)) {
3643
+ if (isVerbose) {
3644
+ console.info(`๐Ÿ“„ [2] "${name}" tools.fs is not available or URL is not a PDF.`);
3645
+ }
3633
3646
  return {
3634
3647
  source: name,
3635
3648
  filename: null,
@@ -3665,13 +3678,17 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3665
3678
  await tools.fs.mkdir(dirname(join(rootDirname, filepath)), { recursive: true });
3666
3679
  }
3667
3680
  catch (error) {
3681
+ if (isVerbose) {
3682
+ console.info(`๐Ÿ“„ [3] "${name}" error creating cache directory`);
3683
+ }
3668
3684
  // Note: If we can't create cache directory, we'll handle it when trying to write the file
3669
3685
  // This handles read-only filesystems, permission issues, and missing parent directories
3670
- if (error instanceof Error && (error.message.includes('EROFS') ||
3671
- error.message.includes('read-only') ||
3672
- error.message.includes('EACCES') ||
3673
- error.message.includes('EPERM') ||
3674
- error.message.includes('ENOENT'))) ;
3686
+ if (error instanceof Error &&
3687
+ (error.message.includes('EROFS') ||
3688
+ error.message.includes('read-only') ||
3689
+ error.message.includes('EACCES') ||
3690
+ error.message.includes('EPERM') ||
3691
+ error.message.includes('ENOENT'))) ;
3675
3692
  else {
3676
3693
  // Re-throw other unexpected errors
3677
3694
  throw error;
@@ -3686,13 +3703,17 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3686
3703
  await tools.fs.writeFile(join(rootDirname, filepath), fileContent);
3687
3704
  }
3688
3705
  catch (error) {
3706
+ if (isVerbose) {
3707
+ console.info(`๐Ÿ“„ [4] "${name}" error writing cache file`);
3708
+ }
3689
3709
  // Note: If we can't write to cache, we'll process the file directly from memory
3690
3710
  // This handles read-only filesystems like Vercel
3691
- if (error instanceof Error && (error.message.includes('EROFS') ||
3692
- error.message.includes('read-only') ||
3693
- error.message.includes('EACCES') ||
3694
- error.message.includes('EPERM') ||
3695
- error.message.includes('ENOENT'))) {
3711
+ if (error instanceof Error &&
3712
+ (error.message.includes('EROFS') ||
3713
+ error.message.includes('read-only') ||
3714
+ error.message.includes('EACCES') ||
3715
+ error.message.includes('EPERM') ||
3716
+ error.message.includes('ENOENT'))) {
3696
3717
  // Return a handler that works directly with the downloaded content
3697
3718
  return {
3698
3719
  source: name,
@@ -3714,6 +3735,9 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3714
3735
  }
3715
3736
  // TODO: [💵] Check the file security
3716
3737
  // TODO: [🧹][🧠] Delete the file after the scraping is done
3738
+ if (isVerbose) {
3739
+ console.info(`๐Ÿ“„ [5] "${name}" cached at "${join(rootDirname, filepath)}"`);
3740
+ }
3717
3741
  return makeKnowledgeSourceHandler({ name, knowledgeSourceContent: filepath }, tools, {
3718
3742
  ...options,
3719
3743
  rootDirname,
@@ -3728,7 +3752,12 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3728
3752
  throw new EnvironmentMismatchError('Can not import file knowledge in non-file pipeline');
3729
3753
  // <- TODO: [๐Ÿง ] What is the best error type here`
3730
3754
  }
3731
- const filename = join(rootDirname, knowledgeSourceContent).split('\\').join('/');
3755
+ const filename = isAbsolute(knowledgeSourceContent)
3756
+ ? knowledgeSourceContent
3757
+ : join(rootDirname, knowledgeSourceContent).split('\\').join('/');
3758
+ if (isVerbose) {
3759
+ console.info(`๐Ÿ“„ [6] "${name}" is a valid file "${filename}"`);
3760
+ }
3732
3761
  const fileExtension = getFileExtension(filename);
3733
3762
  const mimeType = extensionToMimeType(fileExtension || '');
3734
3763
  if (!(await isFileExisting(filename, tools.fs))) {
@@ -3770,6 +3799,12 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3770
3799
  };
3771
3800
  }
3772
3801
  else {
3802
+ if (isVerbose) {
3803
+ console.info(`๐Ÿ“„ [7] "${name}" is just a explicit string text with a knowledge source`);
3804
+ console.info('---');
3805
+ console.info(knowledgeSourceContent);
3806
+ console.info('---');
3807
+ }
3773
3808
  return {
3774
3809
  source: name,
3775
3810
  filename: null,