@promptbook/pdf 0.100.1 → 0.100.3-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -29,6 +29,10 @@ Write AI applications using plain human language across multiple models and plat
29
29
 
30
30
 
31
31
 
32
+ <blockquote style="color: #ff8811">
33
+ <b>⚠ Warning:</b> This is a pre-release version of the library. It is not yet ready for production use. Please look at <a href="https://www.npmjs.com/package/@promptbook/core?activeTab=versions">latest stable release</a>.
34
+ </blockquote>
35
+
32
36
  ## 📦 Package `@promptbook/pdf`
33
37
 
34
38
  - Promptbooks are [divided into several](#-packages) packages, all are published from [single monorepo](https://github.com/webgptorg/promptbook).
package/esm/index.es.js CHANGED
@@ -2,7 +2,7 @@ import { mkdir, rm, readFile } from 'fs/promises';
2
2
  import spaceTrim, { spaceTrim as spaceTrim$1 } from 'spacetrim';
3
3
  import { SHA256 } from 'crypto-js';
4
4
  import hexEncoder from 'crypto-js/enc-hex';
5
- import { basename, join, dirname } from 'path';
5
+ import { basename, join, dirname, isAbsolute } from 'path';
6
6
  import parserHtml from 'prettier/parser-html';
7
7
  import parserMarkdown from 'prettier/parser-markdown';
8
8
  import { format } from 'prettier/standalone';
@@ -27,7 +27,7 @@ const BOOK_LANGUAGE_VERSION = '1.0.0';
27
27
  * @generated
28
28
  * @see https://github.com/webgptorg/promptbook
29
29
  */
30
- const PROMPTBOOK_ENGINE_VERSION = '0.100.1';
30
+ const PROMPTBOOK_ENGINE_VERSION = '0.100.3-0';
31
31
  /**
32
32
  * TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
33
33
  * Note: [๐Ÿ’ž] Ignore a discrepancy between file name and entity name
@@ -722,7 +722,7 @@ function removeEmojis(text) {
722
722
  }
723
723
 
724
724
  /**
725
- * Tests if given string is valid URL.
725
+ * Tests if given string is valid file path.
726
726
  *
727
727
  * Note: This does not check if the file exists only if the path is valid
728
728
  * @public exported from `@promptbook/utils`
@@ -734,18 +734,25 @@ function isValidFilePath(filename) {
734
734
  if (filename.split('\n').length > 1) {
735
735
  return false;
736
736
  }
737
- if (filename.split(' ').length >
738
- 5 /* <- TODO: [๐Ÿง ][๐Ÿˆท] Make some better non-arbitrary way how to distinct filenames from informational texts */) {
737
+ // Normalize slashes early so heuristics can detect path-like inputs
738
+ const filenameSlashes = filename.replace(/\\/g, '/');
739
+ // Reject strings that look like sentences (informational text)
740
+ // Heuristic: contains multiple spaces and ends with a period, or contains typical sentence punctuation
741
+ // But skip this heuristic if the string looks like a path (contains '/' or starts with a drive letter)
742
+ if (filename.trim().length > 60 && // long enough to be a sentence
743
+ /[.!?]/.test(filename) && // contains sentence punctuation
744
+ filename.split(' ').length > 8 && // has many words
745
+ !/\/|^[A-Z]:/i.test(filenameSlashes) // do NOT treat as sentence if looks like a path
746
+ ) {
739
747
  return false;
740
748
  }
741
- const filenameSlashes = filename.split('\\').join('/');
742
749
  // Absolute Unix path: /hello.txt
743
750
  if (/^(\/)/i.test(filenameSlashes)) {
744
751
  // console.log(filename, 'Absolute Unix path: /hello.txt');
745
752
  return true;
746
753
  }
747
- // Absolute Windows path: /hello.txt
748
- if (/^([A-Z]{1,2}:\/?)\//i.test(filenameSlashes)) {
754
+ // Absolute Windows path: C:/ or C:\ (allow spaces and multiple dots in filename)
755
+ if (/^[A-Z]:\/.+$/i.test(filenameSlashes)) {
749
756
  // console.log(filename, 'Absolute Windows path: /hello.txt');
750
757
  return true;
751
758
  }
@@ -3640,9 +3647,15 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3640
3647
  }
3641
3648
  if (isValidUrl(knowledgeSourceContent)) {
3642
3649
  const url = knowledgeSourceContent;
3650
+ if (isVerbose) {
3651
+ console.info(`๐Ÿ“„ [1] "${name}" is available at "${url}"`);
3652
+ }
3643
3653
  const response = await fetch(url); // <- TODO: [๐Ÿง ] Scraping and fetch proxy
3644
3654
  const mimeType = ((_a = response.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3645
3655
  if (tools.fs === undefined || !url.endsWith('.pdf' /* <- TODO: [๐Ÿ’ต] */)) {
3656
+ if (isVerbose) {
3657
+ console.info(`๐Ÿ“„ [2] "${name}" tools.fs is not available or URL is not a PDF.`);
3658
+ }
3646
3659
  return {
3647
3660
  source: name,
3648
3661
  filename: null,
@@ -3678,13 +3691,17 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3678
3691
  await tools.fs.mkdir(dirname(join(rootDirname, filepath)), { recursive: true });
3679
3692
  }
3680
3693
  catch (error) {
3694
+ if (isVerbose) {
3695
+ console.info(`๐Ÿ“„ [3] "${name}" error creating cache directory`);
3696
+ }
3681
3697
  // Note: If we can't create cache directory, we'll handle it when trying to write the file
3682
3698
  // This handles read-only filesystems, permission issues, and missing parent directories
3683
- if (error instanceof Error && (error.message.includes('EROFS') ||
3684
- error.message.includes('read-only') ||
3685
- error.message.includes('EACCES') ||
3686
- error.message.includes('EPERM') ||
3687
- error.message.includes('ENOENT'))) ;
3699
+ if (error instanceof Error &&
3700
+ (error.message.includes('EROFS') ||
3701
+ error.message.includes('read-only') ||
3702
+ error.message.includes('EACCES') ||
3703
+ error.message.includes('EPERM') ||
3704
+ error.message.includes('ENOENT'))) ;
3688
3705
  else {
3689
3706
  // Re-throw other unexpected errors
3690
3707
  throw error;
@@ -3699,13 +3716,17 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3699
3716
  await tools.fs.writeFile(join(rootDirname, filepath), fileContent);
3700
3717
  }
3701
3718
  catch (error) {
3719
+ if (isVerbose) {
3720
+ console.info(`๐Ÿ“„ [4] "${name}" error writing cache file`);
3721
+ }
3702
3722
  // Note: If we can't write to cache, we'll process the file directly from memory
3703
3723
  // This handles read-only filesystems like Vercel
3704
- if (error instanceof Error && (error.message.includes('EROFS') ||
3705
- error.message.includes('read-only') ||
3706
- error.message.includes('EACCES') ||
3707
- error.message.includes('EPERM') ||
3708
- error.message.includes('ENOENT'))) {
3724
+ if (error instanceof Error &&
3725
+ (error.message.includes('EROFS') ||
3726
+ error.message.includes('read-only') ||
3727
+ error.message.includes('EACCES') ||
3728
+ error.message.includes('EPERM') ||
3729
+ error.message.includes('ENOENT'))) {
3709
3730
  // Return a handler that works directly with the downloaded content
3710
3731
  return {
3711
3732
  source: name,
@@ -3727,6 +3748,9 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3727
3748
  }
3728
3749
  // TODO: [๐Ÿ’ต] Check the file security
3729
3750
  // TODO: [๐Ÿงน][๐Ÿง ] Delete the file after the scraping is done
3751
+ if (isVerbose) {
3752
+ console.info(`๐Ÿ“„ [5] "${name}" cached at "${join(rootDirname, filepath)}"`);
3753
+ }
3730
3754
  return makeKnowledgeSourceHandler({ name, knowledgeSourceContent: filepath }, tools, {
3731
3755
  ...options,
3732
3756
  rootDirname,
@@ -3741,7 +3765,12 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3741
3765
  throw new EnvironmentMismatchError('Can not import file knowledge in non-file pipeline');
3742
3766
  // <- TODO: [๐Ÿง ] What is the best error type here`
3743
3767
  }
3744
- const filename = join(rootDirname, knowledgeSourceContent).split('\\').join('/');
3768
+ const filename = isAbsolute(knowledgeSourceContent)
3769
+ ? knowledgeSourceContent
3770
+ : join(rootDirname, knowledgeSourceContent).split('\\').join('/');
3771
+ if (isVerbose) {
3772
+ console.info(`๐Ÿ“„ [6] "${name}" is a valid file "${filename}"`);
3773
+ }
3745
3774
  const fileExtension = getFileExtension(filename);
3746
3775
  const mimeType = extensionToMimeType(fileExtension || '');
3747
3776
  if (!(await isFileExisting(filename, tools.fs))) {
@@ -3783,6 +3812,12 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3783
3812
  };
3784
3813
  }
3785
3814
  else {
3815
+ if (isVerbose) {
3816
+ console.info(`๐Ÿ“„ [7] "${name}" is just a explicit string text with a knowledge source`);
3817
+ console.info('---');
3818
+ console.info(knowledgeSourceContent);
3819
+ console.info('---');
3820
+ }
3786
3821
  return {
3787
3822
  source: name,
3788
3823
  filename: null,