@promptbook/website-crawler 0.100.0 → 0.100.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/esm/index.es.js CHANGED
@@ -4,7 +4,7 @@ import { JSDOM } from 'jsdom';
4
4
  import { SHA256 } from 'crypto-js';
5
5
  import hexEncoder from 'crypto-js/enc-hex';
6
6
  import { mkdir, rm } from 'fs/promises';
7
- import { basename, join, dirname } from 'path';
7
+ import { basename, join, dirname, isAbsolute } from 'path';
8
8
  import parserHtml from 'prettier/parser-html';
9
9
  import parserMarkdown from 'prettier/parser-markdown';
10
10
  import { format } from 'prettier/standalone';
@@ -30,7 +30,7 @@ const BOOK_LANGUAGE_VERSION = '1.0.0';
30
30
  * @generated
31
31
  * @see https://github.com/webgptorg/promptbook
32
32
  */
33
- const PROMPTBOOK_ENGINE_VERSION = '0.100.0';
33
+ const PROMPTBOOK_ENGINE_VERSION = '0.100.2';
34
34
  /**
35
35
  * TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
36
36
  * Note: [💞] Ignore a discrepancy between file name and entity name
@@ -901,7 +901,7 @@ function removeEmojis(text) {
901
901
  }
902
902
 
903
903
  /**
904
- * Tests if given string is valid URL.
904
+ * Tests if given string is valid file path.
905
905
  *
906
906
  * Note: This does not check if the file exists only if the path is valid
907
907
  * @public exported from `@promptbook/utils`
@@ -913,18 +913,25 @@ function isValidFilePath(filename) {
913
913
  if (filename.split('\n').length > 1) {
914
914
  return false;
915
915
  }
916
- if (filename.split(' ').length >
917
- 5 /* <- TODO: [๐Ÿง ][๐Ÿˆท] Make some better non-arbitrary way how to distinct filenames from informational texts */) {
916
+ // Normalize slashes early so heuristics can detect path-like inputs
917
+ const filenameSlashes = filename.replace(/\\/g, '/');
918
+ // Reject strings that look like sentences (informational text)
919
+ // Heuristic: contains multiple spaces and ends with a period, or contains typical sentence punctuation
920
+ // But skip this heuristic if the string looks like a path (contains '/' or starts with a drive letter)
921
+ if (filename.trim().length > 60 && // long enough to be a sentence
922
+ /[.!?]/.test(filename) && // contains sentence punctuation
923
+ filename.split(' ').length > 8 && // has many words
924
+ !/\/|^[A-Z]:/i.test(filenameSlashes) // do NOT treat as sentence if looks like a path
925
+ ) {
918
926
  return false;
919
927
  }
920
- const filenameSlashes = filename.split('\\').join('/');
921
928
  // Absolute Unix path: /hello.txt
922
929
  if (/^(\/)/i.test(filenameSlashes)) {
923
930
  // console.log(filename, 'Absolute Unix path: /hello.txt');
924
931
  return true;
925
932
  }
926
- // Absolute Windows path: /hello.txt
927
- if (/^([A-Z]{1,2}:\/?)\//i.test(filenameSlashes)) {
933
+ // Absolute Windows path: C:/ or C:\ (allow spaces and multiple dots in filename)
934
+ if (/^[A-Z]:\/.+$/i.test(filenameSlashes)) {
928
935
  // console.log(filename, 'Absolute Windows path: /hello.txt');
929
936
  return true;
930
937
  }
@@ -3119,7 +3126,7 @@ const LLM_PROVIDER_PROFILES = {
3119
3126
  };
3120
3127
  /**
3121
3128
  * TODO: Refactor this - each profile must be alongside the provider definition
3122
- * TODO: Unite `AvatarProfileProps` and `ChatParticipant`
3129
+ * TODO: [🕛] Unite `AvatarProfileProps`, `ChatParticipant`, `LlmExecutionTools` + `LlmToolsMetadata`
3123
3130
  * Note: [💞] Ignore a discrepancy between file name and entity name
3124
3131
  */
3125
3132
 
@@ -3641,9 +3648,15 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3641
3648
  }
3642
3649
  if (isValidUrl(knowledgeSourceContent)) {
3643
3650
  const url = knowledgeSourceContent;
3651
+ if (isVerbose) {
3652
+ console.info(`๐Ÿ“„ [1] "${name}" is available at "${url}"`);
3653
+ }
3644
3654
  const response = await fetch(url); // <- TODO: [๐Ÿง ] Scraping and fetch proxy
3645
3655
  const mimeType = ((_a = response.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
3646
3656
  if (tools.fs === undefined || !url.endsWith('.pdf' /* <- TODO: [๐Ÿ’ต] */)) {
3657
+ if (isVerbose) {
3658
+ console.info(`๐Ÿ“„ [2] "${name}" tools.fs is not available or URL is not a PDF.`);
3659
+ }
3647
3660
  return {
3648
3661
  source: name,
3649
3662
  filename: null,
@@ -3679,13 +3692,17 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3679
3692
  await tools.fs.mkdir(dirname(join(rootDirname, filepath)), { recursive: true });
3680
3693
  }
3681
3694
  catch (error) {
3695
+ if (isVerbose) {
3696
+ console.info(`๐Ÿ“„ [3] "${name}" error creating cache directory`);
3697
+ }
3682
3698
  // Note: If we can't create cache directory, we'll handle it when trying to write the file
3683
3699
  // This handles read-only filesystems, permission issues, and missing parent directories
3684
- if (error instanceof Error && (error.message.includes('EROFS') ||
3685
- error.message.includes('read-only') ||
3686
- error.message.includes('EACCES') ||
3687
- error.message.includes('EPERM') ||
3688
- error.message.includes('ENOENT'))) ;
3700
+ if (error instanceof Error &&
3701
+ (error.message.includes('EROFS') ||
3702
+ error.message.includes('read-only') ||
3703
+ error.message.includes('EACCES') ||
3704
+ error.message.includes('EPERM') ||
3705
+ error.message.includes('ENOENT'))) ;
3689
3706
  else {
3690
3707
  // Re-throw other unexpected errors
3691
3708
  throw error;
@@ -3700,13 +3717,17 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3700
3717
  await tools.fs.writeFile(join(rootDirname, filepath), fileContent);
3701
3718
  }
3702
3719
  catch (error) {
3720
+ if (isVerbose) {
3721
+ console.info(`๐Ÿ“„ [4] "${name}" error writing cache file`);
3722
+ }
3703
3723
  // Note: If we can't write to cache, we'll process the file directly from memory
3704
3724
  // This handles read-only filesystems like Vercel
3705
- if (error instanceof Error && (error.message.includes('EROFS') ||
3706
- error.message.includes('read-only') ||
3707
- error.message.includes('EACCES') ||
3708
- error.message.includes('EPERM') ||
3709
- error.message.includes('ENOENT'))) {
3725
+ if (error instanceof Error &&
3726
+ (error.message.includes('EROFS') ||
3727
+ error.message.includes('read-only') ||
3728
+ error.message.includes('EACCES') ||
3729
+ error.message.includes('EPERM') ||
3730
+ error.message.includes('ENOENT'))) {
3710
3731
  // Return a handler that works directly with the downloaded content
3711
3732
  return {
3712
3733
  source: name,
@@ -3728,6 +3749,9 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3728
3749
  }
3729
3750
  // TODO: [๐Ÿ’ต] Check the file security
3730
3751
  // TODO: [๐Ÿงน][๐Ÿง ] Delete the file after the scraping is done
3752
+ if (isVerbose) {
3753
+ console.info(`๐Ÿ“„ [5] "${name}" cached at "${join(rootDirname, filepath)}"`);
3754
+ }
3731
3755
  return makeKnowledgeSourceHandler({ name, knowledgeSourceContent: filepath }, tools, {
3732
3756
  ...options,
3733
3757
  rootDirname,
@@ -3742,7 +3766,12 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3742
3766
  throw new EnvironmentMismatchError('Can not import file knowledge in non-file pipeline');
3743
3767
  // <- TODO: [๐Ÿง ] What is the best error type here`
3744
3768
  }
3745
- const filename = join(rootDirname, knowledgeSourceContent).split('\\').join('/');
3769
+ const filename = isAbsolute(knowledgeSourceContent)
3770
+ ? knowledgeSourceContent
3771
+ : join(rootDirname, knowledgeSourceContent).split('\\').join('/');
3772
+ if (isVerbose) {
3773
+ console.info(`๐Ÿ“„ [6] "${name}" is a valid file "${filename}"`);
3774
+ }
3746
3775
  const fileExtension = getFileExtension(filename);
3747
3776
  const mimeType = extensionToMimeType(fileExtension || '');
3748
3777
  if (!(await isFileExisting(filename, tools.fs))) {
@@ -3784,6 +3813,12 @@ async function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3784
3813
  };
3785
3814
  }
3786
3815
  else {
3816
+ if (isVerbose) {
3817
+ console.info(`๐Ÿ“„ [7] "${name}" is just a explicit string text with a knowledge source`);
3818
+ console.info('---');
3819
+ console.info(knowledgeSourceContent);
3820
+ console.info('---');
3821
+ }
3787
3822
  return {
3788
3823
  source: name,
3789
3824
  filename: null,