@promptbook/website-crawler 0.71.0-16 → 0.71.0-18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/esm/index.es.js +31 -41
- package/esm/index.es.js.map +1 -1
- package/esm/typings/src/_packages/core.index.d.ts +2 -0
- package/esm/typings/src/_packages/node.index.d.ts +2 -0
- package/esm/typings/src/_packages/types.index.d.ts +8 -8
- package/esm/typings/src/_packages/utils.index.d.ts +2 -2
- package/esm/typings/src/execution/Executables.d.ts +18 -0
- package/esm/typings/src/execution/ExecutionTools.d.ts +9 -3
- package/esm/typings/src/execution/translation/automatic-translate/automatic-translators/LindatAutomaticTranslator.d.ts +11 -3
- package/esm/typings/src/llm-providers/multiple/MultipleLlmExecutionTools.d.ts +1 -2
- package/esm/typings/src/llm-providers/remote/interfaces/PromptbookServer_ListModels_Response.d.ts +4 -4
- package/esm/typings/src/llm-providers/remote/interfaces/PromptbookServer_Prompt_Response.d.ts +3 -3
- package/esm/typings/src/prepare/PrepareAndScrapeOptions.d.ts +0 -19
- package/esm/typings/src/scrapers/_common/Scraper.d.ts +1 -1
- package/esm/typings/src/scrapers/_common/register/$provideExecutablesForNode.d.ts +12 -0
- package/esm/typings/src/scrapers/_common/register/$provideScrapersForBrowser.d.ts +1 -1
- package/esm/typings/src/scrapers/_common/register/$provideScrapersForNode.d.ts +2 -2
- package/esm/typings/src/scrapers/document/DocumentScraper.d.ts +1 -1
- package/esm/typings/src/scrapers/document-legacy/LegacyDocumentScraper.d.ts +2 -2
- package/esm/typings/src/scrapers/pdf/PdfScraper.d.ts +1 -1
- package/esm/typings/src/scrapers/website/WebsiteScraper.d.ts +5 -2
- package/esm/typings/src/scrapers/website/utils/createShowdownConverter.d.ts +7 -0
- package/esm/typings/src/types/PipelineJson/TemplateJsonCommon.d.ts +2 -2
- package/esm/typings/src/utils/execCommand/$execCommand.d.ts +2 -2
- package/esm/typings/src/utils/execCommand/{IExecCommandOptions.d.ts → ExecCommandOptions.d.ts} +2 -6
- package/esm/typings/src/utils/execCommand/execCommandNormalizeOptions.d.ts +3 -3
- package/esm/typings/src/utils/normalization/IKeywords.d.ts +2 -2
- package/esm/typings/src/utils/normalization/parseKeywords.d.ts +2 -2
- package/esm/typings/src/utils/normalization/parseKeywordsFromString.d.ts +2 -2
- package/esm/typings/src/utils/normalization/searchKeywords.d.ts +2 -2
- package/esm/typings/src/utils/unwrapResult.d.ts +4 -4
- package/package.json +4 -3
- package/umd/index.umd.js +33 -43
- package/umd/index.umd.js.map +1 -1
- package/esm/typings/src/scrapers/website/utils/markdownConverter.d.ts +0 -12
- package/esm/typings/src/scrapers/website/utils/{markdownConverter.test.d.ts → createShowdownConverter.test.d.ts} +0 -0
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
#  Promptbook
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Build responsible, controlled and transparent applications on top of LLM models!
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
|
|
@@ -26,6 +26,10 @@ Supercharge your use of large language models
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
|
|
29
|
+
<blockquote style="color: #ff8811">
|
|
30
|
+
<b>⚠ Warning:</b> This is a pre-release version of the library. It is not yet ready for production use. Please look at <a href="https://www.npmjs.com/package/@promptbook/core?activeTab=versions">latest stable release</a>.
|
|
31
|
+
</blockquote>
|
|
32
|
+
|
|
29
33
|
## 📦 Package `@promptbook/website-crawler`
|
|
30
34
|
|
|
31
35
|
- Promptbooks are [divided into several](#-packages) packages, all are published from [single monorepo](https://github.com/webgptorg/promptbook).
|
package/esm/index.es.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import spaceTrim$1, { spaceTrim } from 'spacetrim';
|
|
2
2
|
import { Readability } from '@mozilla/readability';
|
|
3
|
-
import { mkdir, rm,
|
|
3
|
+
import { mkdir, rm, writeFile } from 'fs/promises';
|
|
4
4
|
import { JSDOM } from 'jsdom';
|
|
5
|
-
import { forTime } from 'waitasecond';
|
|
6
5
|
import { SHA256 } from 'crypto-js';
|
|
7
6
|
import hexEncoder from 'crypto-js/enc-hex';
|
|
8
7
|
import { basename, join, dirname } from 'path';
|
|
9
8
|
import { format } from 'prettier';
|
|
10
9
|
import parserHtml from 'prettier/parser-html';
|
|
10
|
+
import { forTime } from 'waitasecond';
|
|
11
11
|
import { lookup } from 'mime-types';
|
|
12
12
|
import { unparse, parse } from 'papaparse';
|
|
13
13
|
import { Converter } from 'showdown';
|
|
@@ -16,7 +16,7 @@ import { Converter } from 'showdown';
|
|
|
16
16
|
/**
|
|
17
17
|
* The version of the Promptbook library
|
|
18
18
|
*/
|
|
19
|
-
var PROMPTBOOK_VERSION = '0.71.0-
|
|
19
|
+
var PROMPTBOOK_VERSION = '0.71.0-17';
|
|
20
20
|
// TODO: [main] !!!! List here all the versions and annotate + put into script
|
|
21
21
|
|
|
22
22
|
/*! *****************************************************************************
|
|
@@ -2527,8 +2527,7 @@ function countTotalUsage(llmTools) {
|
|
|
2527
2527
|
* Multiple LLM Execution Tools is a proxy server that uses multiple execution tools internally and exposes the executor interface externally.
|
|
2528
2528
|
*
|
|
2529
2529
|
* Note: Internal utility of `joinLlmExecutionTools` but exposed type
|
|
2530
|
-
* @public exported from `@promptbook/
|
|
2531
|
-
* TODO: !!!!!! Export as runtime class not just type
|
|
2530
|
+
* @public exported from `@promptbook/core`
|
|
2532
2531
|
*/
|
|
2533
2532
|
var MultipleLlmExecutionTools = /** @class */ (function () {
|
|
2534
2533
|
/**
|
|
@@ -2915,7 +2914,7 @@ var $scrapersRegister = new $Register('scraper_constructors');
|
|
|
2915
2914
|
* TODO: [®] DRY Register logic
|
|
2916
2915
|
*/
|
|
2917
2916
|
|
|
2918
|
-
// TODO: !!!!!! Maybe delete this function
|
|
2917
|
+
// TODO: !!!!!!last - Maybe delete this function
|
|
2919
2918
|
/**
|
|
2920
2919
|
* Creates a message with all registered scrapers
|
|
2921
2920
|
*
|
|
@@ -3023,7 +3022,6 @@ function $registeredScrapersMessage() {
|
|
|
3023
3022
|
* @private within the repository
|
|
3024
3023
|
*/
|
|
3025
3024
|
function sourceContentToName(sourceContent) {
|
|
3026
|
-
// TODO: !!!!!! Better name for source than gibberish hash
|
|
3027
3025
|
var hash = SHA256(hexEncoder.parse(JSON.stringify(sourceContent)))
|
|
3028
3026
|
// <- TODO: [🥬] Encapsulate sha256 to some private utility function
|
|
3029
3027
|
.toString( /* hex */)
|
|
@@ -3200,7 +3198,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
|
3200
3198
|
content = _a.sent();
|
|
3201
3199
|
return [2 /*return*/, new Blob([
|
|
3202
3200
|
content,
|
|
3203
|
-
// <- TODO: !!!!!!
|
|
3201
|
+
// <- TODO: !!!!!! Test that this is working
|
|
3204
3202
|
], { type: mimeType_1 })];
|
|
3205
3203
|
}
|
|
3206
3204
|
});
|
|
@@ -3213,7 +3211,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
|
3213
3211
|
switch (_c.label) {
|
|
3214
3212
|
case 0:
|
|
3215
3213
|
_b = (_a = JSON).parse;
|
|
3216
|
-
return [4 /*yield*/, readFile(filename_1, 'utf-8')];
|
|
3214
|
+
return [4 /*yield*/, tools.fs.readFile(filename_1, 'utf-8')];
|
|
3217
3215
|
case 1: return [2 /*return*/, _b.apply(_a, [_c.sent()])];
|
|
3218
3216
|
}
|
|
3219
3217
|
});
|
|
@@ -3223,7 +3221,7 @@ function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
|
3223
3221
|
return __awaiter(this, void 0, void 0, function () {
|
|
3224
3222
|
return __generator(this, function (_a) {
|
|
3225
3223
|
switch (_a.label) {
|
|
3226
|
-
case 0: return [4 /*yield*/, readFile(filename_1, 'utf-8')];
|
|
3224
|
+
case 0: return [4 /*yield*/, tools.fs.readFile(filename_1, 'utf-8')];
|
|
3227
3225
|
case 1: return [2 /*return*/, _a.sent()];
|
|
3228
3226
|
}
|
|
3229
3227
|
});
|
|
@@ -5778,32 +5776,29 @@ var MarkdownScraper = /** @class */ (function () {
|
|
|
5778
5776
|
*/
|
|
5779
5777
|
|
|
5780
5778
|
/**
|
|
5781
|
-
*
|
|
5779
|
+
* Create a new showdown converter instance
|
|
5782
5780
|
*
|
|
5783
|
-
* @
|
|
5784
|
-
* @private for markdown and html knowledge scrapers
|
|
5785
|
-
*/
|
|
5786
|
-
var markdownConverter = new Converter({
|
|
5787
|
-
flavor: 'github', // <- TODO: !!!!!! Explicitly specify the flavor of promptbook markdown
|
|
5788
|
-
/*
|
|
5789
|
-
> import showdownHighlight from 'showdown-highlight';
|
|
5790
|
-
> extensions: [
|
|
5791
|
-
> showdownHighlight({
|
|
5792
|
-
> // Whether to add the classes to the <pre> tag, default is false
|
|
5793
|
-
> pre: true,
|
|
5794
|
-
> // Whether to use hljs' auto language detection, default is true
|
|
5795
|
-
> auto_detection: true,
|
|
5796
|
-
> }),
|
|
5797
|
-
> ],
|
|
5798
|
-
*/
|
|
5799
|
-
});
|
|
5800
|
-
/**
|
|
5801
|
-
* TODO: !!!!!! Figure out better name not to confuse with `Converter`
|
|
5802
|
-
* TODO: !!!!!! Lazy-make converter
|
|
5781
|
+
* @private utility of `WebsiteScraper`
|
|
5803
5782
|
*/
|
|
5783
|
+
function createShowdownConverter() {
|
|
5784
|
+
return new Converter({
|
|
5785
|
+
flavor: 'github',
|
|
5786
|
+
/*
|
|
5787
|
+
> import showdownHighlight from 'showdown-highlight';
|
|
5788
|
+
> extensions: [
|
|
5789
|
+
> showdownHighlight({
|
|
5790
|
+
> // Whether to add the classes to the <pre> tag, default is false
|
|
5791
|
+
> pre: true,
|
|
5792
|
+
> // Whether to use hljs' auto language detection, default is true
|
|
5793
|
+
> auto_detection: true,
|
|
5794
|
+
> }),
|
|
5795
|
+
> ],
|
|
5796
|
+
*/
|
|
5797
|
+
});
|
|
5798
|
+
}
|
|
5804
5799
|
|
|
5805
5800
|
/**
|
|
5806
|
-
* Scraper for
|
|
5801
|
+
* Scraper for websites
|
|
5807
5802
|
*
|
|
5808
5803
|
* @see `documentationUrl` for more details
|
|
5809
5804
|
* @public exported from `@promptbook/website-crawler`
|
|
@@ -5813,6 +5808,7 @@ var WebsiteScraper = /** @class */ (function () {
|
|
|
5813
5808
|
this.tools = tools;
|
|
5814
5809
|
this.options = options;
|
|
5815
5810
|
this.markdownScraper = new MarkdownScraper(tools, options);
|
|
5811
|
+
this.showdownConverter = createShowdownConverter();
|
|
5816
5812
|
}
|
|
5817
5813
|
Object.defineProperty(WebsiteScraper.prototype, "metadata", {
|
|
5818
5814
|
/**
|
|
@@ -5833,7 +5829,6 @@ var WebsiteScraper = /** @class */ (function () {
|
|
|
5833
5829
|
return __awaiter(this, void 0, void 0, function () {
|
|
5834
5830
|
var _a, _b,
|
|
5835
5831
|
// TODO: [🧠] Maybe in node use headless browser not just JSDOM
|
|
5836
|
-
// externalProgramsPaths = {},
|
|
5837
5832
|
rootDirname, _c, cacheDirname, _d, isCacheCleaned, _e, isVerbose, jsdom, _f, reader, article, html, i, cacheFilehandler, markdown;
|
|
5838
5833
|
return __generator(this, function (_g) {
|
|
5839
5834
|
switch (_g.label) {
|
|
@@ -5851,10 +5846,6 @@ var WebsiteScraper = /** @class */ (function () {
|
|
|
5851
5846
|
}]))();
|
|
5852
5847
|
reader = new Readability(jsdom.window.document);
|
|
5853
5848
|
article = reader.parse();
|
|
5854
|
-
console.log(article);
|
|
5855
|
-
return [4 /*yield*/, forTime(10000)];
|
|
5856
|
-
case 2:
|
|
5857
|
-
_g.sent();
|
|
5858
5849
|
html = (article === null || article === void 0 ? void 0 : article.content) || (article === null || article === void 0 ? void 0 : article.textContent) || jsdom.window.document.body.innerHTML;
|
|
5859
5850
|
// Note: Unwrap html such as it is convertable by `markdownConverter`
|
|
5860
5851
|
for (i = 0; i < 2; i++) {
|
|
@@ -5870,12 +5861,12 @@ var WebsiteScraper = /** @class */ (function () {
|
|
|
5870
5861
|
extension: 'html',
|
|
5871
5862
|
isVerbose: isVerbose,
|
|
5872
5863
|
})];
|
|
5873
|
-
case
|
|
5864
|
+
case 2:
|
|
5874
5865
|
cacheFilehandler = _g.sent();
|
|
5875
5866
|
return [4 /*yield*/, writeFile(cacheFilehandler.filename, html, 'utf-8')];
|
|
5876
|
-
case
|
|
5867
|
+
case 3:
|
|
5877
5868
|
_g.sent();
|
|
5878
|
-
markdown =
|
|
5869
|
+
markdown = this.showdownConverter.makeMarkdown(html, jsdom.window.document);
|
|
5879
5870
|
return [2 /*return*/, __assign(__assign({}, cacheFilehandler), { markdown: markdown })];
|
|
5880
5871
|
}
|
|
5881
5872
|
});
|
|
@@ -5919,7 +5910,6 @@ var WebsiteScraper = /** @class */ (function () {
|
|
|
5919
5910
|
return WebsiteScraper;
|
|
5920
5911
|
}());
|
|
5921
5912
|
/**
|
|
5922
|
-
* TODO: !!!!!! Put into separate package
|
|
5923
5913
|
* TODO: [👣] Scraped website in .md can act as cache item - there is no need to run conversion each time
|
|
5924
5914
|
* TODO: [🪂] Do it in parallel 11:11
|
|
5925
5915
|
* Note: No need to aggregate usage here, it is done by intercepting the llmTools
|