@promptbook/website-crawler 0.71.0-16 → 0.71.0-18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/README.md +5 -1
  2. package/esm/index.es.js +31 -41
  3. package/esm/index.es.js.map +1 -1
  4. package/esm/typings/src/_packages/core.index.d.ts +2 -0
  5. package/esm/typings/src/_packages/node.index.d.ts +2 -0
  6. package/esm/typings/src/_packages/types.index.d.ts +8 -8
  7. package/esm/typings/src/_packages/utils.index.d.ts +2 -2
  8. package/esm/typings/src/execution/Executables.d.ts +18 -0
  9. package/esm/typings/src/execution/ExecutionTools.d.ts +9 -3
  10. package/esm/typings/src/execution/translation/automatic-translate/automatic-translators/LindatAutomaticTranslator.d.ts +11 -3
  11. package/esm/typings/src/llm-providers/multiple/MultipleLlmExecutionTools.d.ts +1 -2
  12. package/esm/typings/src/llm-providers/remote/interfaces/PromptbookServer_ListModels_Response.d.ts +4 -4
  13. package/esm/typings/src/llm-providers/remote/interfaces/PromptbookServer_Prompt_Response.d.ts +3 -3
  14. package/esm/typings/src/prepare/PrepareAndScrapeOptions.d.ts +0 -19
  15. package/esm/typings/src/scrapers/_common/Scraper.d.ts +1 -1
  16. package/esm/typings/src/scrapers/_common/register/$provideExecutablesForNode.d.ts +12 -0
  17. package/esm/typings/src/scrapers/_common/register/$provideScrapersForBrowser.d.ts +1 -1
  18. package/esm/typings/src/scrapers/_common/register/$provideScrapersForNode.d.ts +2 -2
  19. package/esm/typings/src/scrapers/document/DocumentScraper.d.ts +1 -1
  20. package/esm/typings/src/scrapers/document-legacy/LegacyDocumentScraper.d.ts +2 -2
  21. package/esm/typings/src/scrapers/pdf/PdfScraper.d.ts +1 -1
  22. package/esm/typings/src/scrapers/website/WebsiteScraper.d.ts +5 -2
  23. package/esm/typings/src/scrapers/website/utils/createShowdownConverter.d.ts +7 -0
  24. package/esm/typings/src/types/PipelineJson/TemplateJsonCommon.d.ts +2 -2
  25. package/esm/typings/src/utils/execCommand/$execCommand.d.ts +2 -2
  26. package/esm/typings/src/utils/execCommand/{IExecCommandOptions.d.ts → ExecCommandOptions.d.ts} +2 -6
  27. package/esm/typings/src/utils/execCommand/execCommandNormalizeOptions.d.ts +3 -3
  28. package/esm/typings/src/utils/normalization/IKeywords.d.ts +2 -2
  29. package/esm/typings/src/utils/normalization/parseKeywords.d.ts +2 -2
  30. package/esm/typings/src/utils/normalization/parseKeywordsFromString.d.ts +2 -2
  31. package/esm/typings/src/utils/normalization/searchKeywords.d.ts +2 -2
  32. package/esm/typings/src/utils/unwrapResult.d.ts +4 -4
  33. package/package.json +4 -3
  34. package/umd/index.umd.js +33 -43
  35. package/umd/index.umd.js.map +1 -1
  36. package/esm/typings/src/scrapers/website/utils/markdownConverter.d.ts +0 -12
  37. package/esm/typings/src/scrapers/website/utils/{markdownConverter.test.d.ts → createShowdownConverter.test.d.ts} +0 -0
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  # ![Promptbook logo - cube with letters P and B](./other/design/logo-h1.png) Promptbook
4
4
 
5
- Supercharge your use of large language models
5
+ Build responsible, controlled and transparent applications on top of LLM models!
6
6
 
7
7
 
8
8
 
@@ -26,6 +26,10 @@ Supercharge your use of large language models
26
26
 
27
27
 
28
28
 
29
+ <blockquote style="color: #ff8811">
30
+ <b>⚠ Warning:</b> This is a pre-release version of the library. It is not yet ready for production use. Please look at <a href="https://www.npmjs.com/package/@promptbook/core?activeTab=versions">latest stable release</a>.
31
+ </blockquote>
32
+
29
33
  ## 📦 Package `@promptbook/website-crawler`
30
34
 
31
35
  - Promptbooks are [divided into several](#-packages) packages, all are published from [single monorepo](https://github.com/webgptorg/promptbook).
package/esm/index.es.js CHANGED
@@ -1,13 +1,13 @@
1
1
  import spaceTrim$1, { spaceTrim } from 'spacetrim';
2
2
  import { Readability } from '@mozilla/readability';
3
- import { mkdir, rm, readFile, writeFile } from 'fs/promises';
3
+ import { mkdir, rm, writeFile } from 'fs/promises';
4
4
  import { JSDOM } from 'jsdom';
5
- import { forTime } from 'waitasecond';
6
5
  import { SHA256 } from 'crypto-js';
7
6
  import hexEncoder from 'crypto-js/enc-hex';
8
7
  import { basename, join, dirname } from 'path';
9
8
  import { format } from 'prettier';
10
9
  import parserHtml from 'prettier/parser-html';
10
+ import { forTime } from 'waitasecond';
11
11
  import { lookup } from 'mime-types';
12
12
  import { unparse, parse } from 'papaparse';
13
13
  import { Converter } from 'showdown';
@@ -16,7 +16,7 @@ import { Converter } from 'showdown';
16
16
  /**
17
17
  * The version of the Promptbook library
18
18
  */
19
- var PROMPTBOOK_VERSION = '0.71.0-15';
19
+ var PROMPTBOOK_VERSION = '0.71.0-17';
20
20
  // TODO: [main] !!!! List here all the versions and annotate + put into script
21
21
 
22
22
  /*! *****************************************************************************
@@ -2527,8 +2527,7 @@ function countTotalUsage(llmTools) {
2527
2527
  * Multiple LLM Execution Tools is a proxy server that uses multiple execution tools internally and exposes the executor interface externally.
2528
2528
  *
2529
2529
  * Note: Internal utility of `joinLlmExecutionTools` but exposed type
2530
- * @public exported from `@promptbook/types`
2531
- * TODO: !!!!!! Export as runtime class not just type
2530
+ * @public exported from `@promptbook/core`
2532
2531
  */
2533
2532
  var MultipleLlmExecutionTools = /** @class */ (function () {
2534
2533
  /**
@@ -2915,7 +2914,7 @@ var $scrapersRegister = new $Register('scraper_constructors');
2915
2914
  * TODO: [®] DRY Register logic
2916
2915
  */
2917
2916
 
2918
- // TODO: !!!!!! Maybe delete this function
2917
+ // TODO: !!!!!!last - Maybe delete this function
2919
2918
  /**
2920
2919
  * Creates a message with all registered scrapers
2921
2920
  *
@@ -3023,7 +3022,6 @@ function $registeredScrapersMessage() {
3023
3022
  * @private within the repository
3024
3023
  */
3025
3024
  function sourceContentToName(sourceContent) {
3026
- // TODO: !!!!!! Better name for source than gibberish hash
3027
3025
  var hash = SHA256(hexEncoder.parse(JSON.stringify(sourceContent)))
3028
3026
  // <- TODO: [🥬] Encapsulate sha256 to some private utility function
3029
3027
  .toString( /* hex */)
@@ -3200,7 +3198,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3200
3198
  content = _a.sent();
3201
3199
  return [2 /*return*/, new Blob([
3202
3200
  content,
3203
- // <- TODO: !!!!!! Maybe not working
3201
+ // <- TODO: !!!!!! Test that this is working
3204
3202
  ], { type: mimeType_1 })];
3205
3203
  }
3206
3204
  });
@@ -3213,7 +3211,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3213
3211
  switch (_c.label) {
3214
3212
  case 0:
3215
3213
  _b = (_a = JSON).parse;
3216
- return [4 /*yield*/, readFile(filename_1, 'utf-8')];
3214
+ return [4 /*yield*/, tools.fs.readFile(filename_1, 'utf-8')];
3217
3215
  case 1: return [2 /*return*/, _b.apply(_a, [_c.sent()])];
3218
3216
  }
3219
3217
  });
@@ -3223,7 +3221,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
3223
3221
  return __awaiter(this, void 0, void 0, function () {
3224
3222
  return __generator(this, function (_a) {
3225
3223
  switch (_a.label) {
3226
- case 0: return [4 /*yield*/, readFile(filename_1, 'utf-8')];
3224
+ case 0: return [4 /*yield*/, tools.fs.readFile(filename_1, 'utf-8')];
3227
3225
  case 1: return [2 /*return*/, _a.sent()];
3228
3226
  }
3229
3227
  });
@@ -5778,32 +5776,29 @@ var MarkdownScraper = /** @class */ (function () {
5778
5776
  */
5779
5777
 
5780
5778
  /**
5781
- * A converter instance that uses showdown and highlight extensions
5779
+ * Create a new showdown converter instance
5782
5780
  *
5783
- * @type {Converter}
5784
- * @private for markdown and html knowledge scrapers
5785
- */
5786
- var markdownConverter = new Converter({
5787
- flavor: 'github', // <- TODO: !!!!!! Explicitly specify the flavor of promptbook markdown
5788
- /*
5789
- > import showdownHighlight from 'showdown-highlight';
5790
- > extensions: [
5791
- > showdownHighlight({
5792
- > // Whether to add the classes to the <pre> tag, default is false
5793
- > pre: true,
5794
- > // Whether to use hljs' auto language detection, default is true
5795
- > auto_detection: true,
5796
- > }),
5797
- > ],
5798
- */
5799
- });
5800
- /**
5801
- * TODO: !!!!!! Figure out better name not to confuse with `Converter`
5802
- * TODO: !!!!!! Lazy-make converter
5781
+ * @private utility of `WebsiteScraper`
5803
5782
  */
5783
+ function createShowdownConverter() {
5784
+ return new Converter({
5785
+ flavor: 'github',
5786
+ /*
5787
+ > import showdownHighlight from 'showdown-highlight';
5788
+ > extensions: [
5789
+ > showdownHighlight({
5790
+ > // Whether to add the classes to the <pre> tag, default is false
5791
+ > pre: true,
5792
+ > // Whether to use hljs' auto language detection, default is true
5793
+ > auto_detection: true,
5794
+ > }),
5795
+ > ],
5796
+ */
5797
+ });
5798
+ }
5804
5799
 
5805
5800
  /**
5806
- * Scraper for .docx files
5801
+ * Scraper for websites
5807
5802
  *
5808
5803
  * @see `documentationUrl` for more details
5809
5804
  * @public exported from `@promptbook/website-crawler`
@@ -5813,6 +5808,7 @@ var WebsiteScraper = /** @class */ (function () {
5813
5808
  this.tools = tools;
5814
5809
  this.options = options;
5815
5810
  this.markdownScraper = new MarkdownScraper(tools, options);
5811
+ this.showdownConverter = createShowdownConverter();
5816
5812
  }
5817
5813
  Object.defineProperty(WebsiteScraper.prototype, "metadata", {
5818
5814
  /**
@@ -5833,7 +5829,6 @@ var WebsiteScraper = /** @class */ (function () {
5833
5829
  return __awaiter(this, void 0, void 0, function () {
5834
5830
  var _a, _b,
5835
5831
  // TODO: [🧠] Maybe in node use headless browser not just JSDOM
5836
- // externalProgramsPaths = {},
5837
5832
  rootDirname, _c, cacheDirname, _d, isCacheCleaned, _e, isVerbose, jsdom, _f, reader, article, html, i, cacheFilehandler, markdown;
5838
5833
  return __generator(this, function (_g) {
5839
5834
  switch (_g.label) {
@@ -5851,10 +5846,6 @@ var WebsiteScraper = /** @class */ (function () {
5851
5846
  }]))();
5852
5847
  reader = new Readability(jsdom.window.document);
5853
5848
  article = reader.parse();
5854
- console.log(article);
5855
- return [4 /*yield*/, forTime(10000)];
5856
- case 2:
5857
- _g.sent();
5858
5849
  html = (article === null || article === void 0 ? void 0 : article.content) || (article === null || article === void 0 ? void 0 : article.textContent) || jsdom.window.document.body.innerHTML;
5859
5850
  // Note: Unwrap html such as it is convertable by `markdownConverter`
5860
5851
  for (i = 0; i < 2; i++) {
@@ -5870,12 +5861,12 @@ var WebsiteScraper = /** @class */ (function () {
5870
5861
  extension: 'html',
5871
5862
  isVerbose: isVerbose,
5872
5863
  })];
5873
- case 3:
5864
+ case 2:
5874
5865
  cacheFilehandler = _g.sent();
5875
5866
  return [4 /*yield*/, writeFile(cacheFilehandler.filename, html, 'utf-8')];
5876
- case 4:
5867
+ case 3:
5877
5868
  _g.sent();
5878
- markdown = markdownConverter.makeMarkdown(html, jsdom.window.document);
5869
+ markdown = this.showdownConverter.makeMarkdown(html, jsdom.window.document);
5879
5870
  return [2 /*return*/, __assign(__assign({}, cacheFilehandler), { markdown: markdown })];
5880
5871
  }
5881
5872
  });
@@ -5919,7 +5910,6 @@ var WebsiteScraper = /** @class */ (function () {
5919
5910
  return WebsiteScraper;
5920
5911
  }());
5921
5912
  /**
5922
- * TODO: !!!!!! Put into separate package
5923
5913
  * TODO: [👣] Scraped website in .md can act as cache item - there is no need to run conversion each time
5924
5914
  * TODO: [🪂] Do it in parallel 11:11
5925
5915
  * Note: No need to aggregate usage here, it is done by intercepting the llmTools