@promptbook/website-crawler 0.84.0-11 → 0.84.0-13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/esm/index.es.js +85 -48
- package/esm/index.es.js.map +1 -1
- package/esm/typings/src/_packages/core.index.d.ts +4 -2
- package/esm/typings/src/config.d.ts +9 -1
- package/esm/typings/src/execution/FilesystemTools.d.ts +1 -1
- package/esm/typings/src/wizzard/wizzard.d.ts +7 -1
- package/package.json +2 -2
- package/umd/index.umd.js +89 -52
- package/umd/index.umd.js.map +1 -1
|
@@ -19,7 +19,8 @@ import { DEFAULT_MAX_EXECUTION_ATTEMPTS } from '../config';
|
|
|
19
19
|
import { DEFAULT_MAX_KNOWLEDGE_SOURCES_SCRAPING_DEPTH } from '../config';
|
|
20
20
|
import { DEFAULT_MAX_KNOWLEDGE_SOURCES_SCRAPING_TOTAL } from '../config';
|
|
21
21
|
import { DEFAULT_BOOKS_DIRNAME } from '../config';
|
|
22
|
-
import {
|
|
22
|
+
import { DEFAULT_DOWNLOAD_CACHE_DIRNAME } from '../config';
|
|
23
|
+
import { DEFAULT_EXECUTION_CACHE_DIRNAME } from '../config';
|
|
23
24
|
import { DEFAULT_SCRAPE_CACHE_DIRNAME } from '../config';
|
|
24
25
|
import { DEFAULT_PIPELINE_COLLECTION_BASE_FILENAME } from '../config';
|
|
25
26
|
import { DEFAULT_REMOTE_URL } from '../config';
|
|
@@ -145,7 +146,8 @@ export { DEFAULT_MAX_EXECUTION_ATTEMPTS };
|
|
|
145
146
|
export { DEFAULT_MAX_KNOWLEDGE_SOURCES_SCRAPING_DEPTH };
|
|
146
147
|
export { DEFAULT_MAX_KNOWLEDGE_SOURCES_SCRAPING_TOTAL };
|
|
147
148
|
export { DEFAULT_BOOKS_DIRNAME };
|
|
148
|
-
export {
|
|
149
|
+
export { DEFAULT_DOWNLOAD_CACHE_DIRNAME };
|
|
150
|
+
export { DEFAULT_EXECUTION_CACHE_DIRNAME };
|
|
149
151
|
export { DEFAULT_SCRAPE_CACHE_DIRNAME };
|
|
150
152
|
export { DEFAULT_PIPELINE_COLLECTION_BASE_FILENAME };
|
|
151
153
|
export { DEFAULT_REMOTE_URL };
|
|
@@ -166,6 +166,14 @@ export declare const DEFAULT_MAX_KNOWLEDGE_SOURCES_SCRAPING_TOTAL = 200;
|
|
|
166
166
|
* @public exported from `@promptbook/core`
|
|
167
167
|
*/
|
|
168
168
|
export declare const DEFAULT_BOOKS_DIRNAME = "./books";
|
|
169
|
+
/**
|
|
170
|
+
* Where to store the temporary downloads
|
|
171
|
+
*
|
|
172
|
+
* Note: When the folder does not exist, it is created recursively
|
|
173
|
+
*
|
|
174
|
+
* @public exported from `@promptbook/core`
|
|
175
|
+
*/
|
|
176
|
+
export declare const DEFAULT_DOWNLOAD_CACHE_DIRNAME = "./.promptbook/download-cache";
|
|
169
177
|
/**
|
|
170
178
|
* Where to store the cache of executions for promptbook CLI
|
|
171
179
|
*
|
|
@@ -173,7 +181,7 @@ export declare const DEFAULT_BOOKS_DIRNAME = "./books";
|
|
|
173
181
|
*
|
|
174
182
|
* @public exported from `@promptbook/core`
|
|
175
183
|
*/
|
|
176
|
-
export declare const
|
|
184
|
+
export declare const DEFAULT_EXECUTION_CACHE_DIRNAME = "./.promptbook/execution-cache";
|
|
177
185
|
/**
|
|
178
186
|
* Where to store the scrape cache
|
|
179
187
|
*
|
|
@@ -3,7 +3,7 @@ import type fs from 'fs/promises';
|
|
|
3
3
|
/**
|
|
4
4
|
* Container for all the tools needed to manipulate the filesystem
|
|
5
5
|
*/
|
|
6
|
-
export type FilesystemTools = Pick<typeof fs, 'access' | 'constants' | 'readFile' | 'writeFile' | 'stat' | 'readdir'>;
|
|
6
|
+
export type FilesystemTools = Pick<typeof fs, 'access' | 'constants' | 'readFile' | 'writeFile' | 'stat' | 'readdir' | 'mkdir'>;
|
|
7
7
|
/**
|
|
8
8
|
* TODO: Implement destroyable pattern to free resources
|
|
9
9
|
*/
|
|
@@ -6,6 +6,7 @@ import type { PipelineString } from '../pipeline/PipelineString';
|
|
|
6
6
|
import type { TaskProgress } from '../types/TaskProgress';
|
|
7
7
|
import type { InputParameters } from '../types/typeAliases';
|
|
8
8
|
import type { string_filename } from '../types/typeAliases';
|
|
9
|
+
import type { string_parameter_value } from '../types/typeAliases';
|
|
9
10
|
import type { string_pipeline_url } from '../types/typeAliases';
|
|
10
11
|
/**
|
|
11
12
|
* Wizzard for simple usage of the Promptbook
|
|
@@ -27,7 +28,12 @@ declare class Wizzard {
|
|
|
27
28
|
*
|
|
28
29
|
* Note: This works similarly to the `ptbk run` command
|
|
29
30
|
*/
|
|
30
|
-
execute(book: string_pipeline_url | string_filename | PipelineString, inputParameters: InputParameters, onProgress?: (taskProgress: TaskProgress) => Promisable<void>): Promise<
|
|
31
|
+
execute(book: string_pipeline_url | string_filename | PipelineString, inputParameters: InputParameters, onProgress?: (taskProgress: TaskProgress) => Promisable<void>): Promise<{
|
|
32
|
+
/**
|
|
33
|
+
* Simple result of the execution
|
|
34
|
+
*/
|
|
35
|
+
result: string_parameter_value;
|
|
36
|
+
} & PipelineExecutorResult>;
|
|
31
37
|
private executionTools;
|
|
32
38
|
/**
|
|
33
39
|
* Provides the tools automatically for the Node.js environment
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@promptbook/website-crawler",
|
|
3
|
-
"version": "0.84.0-
|
|
3
|
+
"version": "0.84.0-13",
|
|
4
4
|
"description": "It's time for a paradigm shift. The future of software in plain English, French or Latin",
|
|
5
5
|
"--note-0": " <- [🐊]",
|
|
6
6
|
"private": false,
|
|
@@ -54,7 +54,7 @@
|
|
|
54
54
|
"module": "./esm/index.es.js",
|
|
55
55
|
"typings": "./esm/typings/src/_packages/website-crawler.index.d.ts",
|
|
56
56
|
"peerDependencies": {
|
|
57
|
-
"@promptbook/core": "0.84.0-
|
|
57
|
+
"@promptbook/core": "0.84.0-13"
|
|
58
58
|
},
|
|
59
59
|
"dependencies": {
|
|
60
60
|
"@mozilla/readability": "0.5.0",
|
package/umd/index.umd.js
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
(function (global, factory) {
|
|
2
|
-
typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports, require('spacetrim'), require('@mozilla/readability'), require('jsdom'), require('crypto-js'), require('crypto-js/enc-hex'), require('fs/promises'), require('path'), require('prettier'), require('prettier/parser-html'), require('waitasecond'), require('mime-types'), require('papaparse'), require('showdown')) :
|
|
3
|
-
typeof define === 'function' && define.amd ? define(['exports', 'spacetrim', '@mozilla/readability', 'jsdom', 'crypto-js', 'crypto-js/enc-hex', 'fs/promises', 'path', 'prettier', 'prettier/parser-html', 'waitasecond', 'mime-types', 'papaparse', 'showdown'], factory) :
|
|
4
|
-
(global = typeof globalThis !== 'undefined' ? globalThis : global || self, factory(global["promptbook-website-crawler"] = {}, global.spaceTrim, global.readability, global.jsdom, global.cryptoJs, global.hexEncoder, global.promises, global.path, global.prettier, global.parserHtml, global.waitasecond, global.mimeTypes, global.papaparse, global.showdown));
|
|
5
|
-
})(this, (function (exports, spaceTrim, readability, jsdom, cryptoJs, hexEncoder, promises, path, prettier, parserHtml, waitasecond, mimeTypes, papaparse, showdown) { 'use strict';
|
|
2
|
+
typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports, require('spacetrim'), require('@mozilla/readability'), require('jsdom'), require('crypto-js'), require('crypto-js/enc-hex'), require('fs/promises'), require('path'), require('prettier'), require('prettier/parser-html'), require('waitasecond'), require('crypto-js/sha256'), require('mime-types'), require('papaparse'), require('showdown')) :
|
|
3
|
+
typeof define === 'function' && define.amd ? define(['exports', 'spacetrim', '@mozilla/readability', 'jsdom', 'crypto-js', 'crypto-js/enc-hex', 'fs/promises', 'path', 'prettier', 'prettier/parser-html', 'waitasecond', 'crypto-js/sha256', 'mime-types', 'papaparse', 'showdown'], factory) :
|
|
4
|
+
(global = typeof globalThis !== 'undefined' ? globalThis : global || self, factory(global["promptbook-website-crawler"] = {}, global.spaceTrim, global.readability, global.jsdom, global.cryptoJs, global.hexEncoder, global.promises, global.path, global.prettier, global.parserHtml, global.waitasecond, global.sha256, global.mimeTypes, global.papaparse, global.showdown));
|
|
5
|
+
})(this, (function (exports, spaceTrim, readability, jsdom, cryptoJs, hexEncoder, promises, path, prettier, parserHtml, waitasecond, sha256, mimeTypes, papaparse, showdown) { 'use strict';
|
|
6
6
|
|
|
7
7
|
function _interopDefaultLegacy (e) { return e && typeof e === 'object' && 'default' in e ? e : { 'default': e }; }
|
|
8
8
|
|
|
9
9
|
var spaceTrim__default = /*#__PURE__*/_interopDefaultLegacy(spaceTrim);
|
|
10
10
|
var hexEncoder__default = /*#__PURE__*/_interopDefaultLegacy(hexEncoder);
|
|
11
11
|
var parserHtml__default = /*#__PURE__*/_interopDefaultLegacy(parserHtml);
|
|
12
|
+
var sha256__default = /*#__PURE__*/_interopDefaultLegacy(sha256);
|
|
12
13
|
|
|
13
14
|
// ⚠️ WARNING: This code has been generated so that any manual changes will be overwritten
|
|
14
15
|
/**
|
|
@@ -24,7 +25,7 @@
|
|
|
24
25
|
* @generated
|
|
25
26
|
* @see https://github.com/webgptorg/promptbook
|
|
26
27
|
*/
|
|
27
|
-
var PROMPTBOOK_ENGINE_VERSION = '0.84.0-
|
|
28
|
+
var PROMPTBOOK_ENGINE_VERSION = '0.84.0-12';
|
|
28
29
|
/**
|
|
29
30
|
* TODO: string_promptbook_version should be constrained to the all versions of Promptbook engine
|
|
30
31
|
* Note: [💞] Ignore a discrepancy between file name and entity name
|
|
@@ -266,6 +267,12 @@
|
|
|
266
267
|
* @private within the repository - too low-level in comparison with other `MAX_...`
|
|
267
268
|
*/
|
|
268
269
|
var IMMEDIATE_TIME = 10;
|
|
270
|
+
/**
|
|
271
|
+
* The maximum length of the (generated) filename
|
|
272
|
+
*
|
|
273
|
+
* @public exported from `@promptbook/core`
|
|
274
|
+
*/
|
|
275
|
+
var MAX_FILENAME_LENGTH = 30;
|
|
269
276
|
/**
|
|
270
277
|
* Strategy for caching the intermediate results for knowledge sources
|
|
271
278
|
*
|
|
@@ -285,6 +292,15 @@
|
|
|
285
292
|
* @public exported from `@promptbook/core`
|
|
286
293
|
*/
|
|
287
294
|
var DEFAULT_MAX_EXECUTION_ATTEMPTS = 3; // <- TODO: [🤹♂️]
|
|
295
|
+
// <- TODO: [🕝] Make also `BOOKS_DIRNAME_ALTERNATIVES`
|
|
296
|
+
/**
|
|
297
|
+
* Where to store the temporary downloads
|
|
298
|
+
*
|
|
299
|
+
* Note: When the folder does not exist, it is created recursively
|
|
300
|
+
*
|
|
301
|
+
* @public exported from `@promptbook/core`
|
|
302
|
+
*/
|
|
303
|
+
var DEFAULT_DOWNLOAD_CACHE_DIRNAME = './.promptbook/download-cache';
|
|
288
304
|
/**
|
|
289
305
|
* Where to store the scrape cache
|
|
290
306
|
*
|
|
@@ -3607,10 +3623,11 @@
|
|
|
3607
3623
|
function makeKnowledgeSourceHandler(knowledgeSource, tools, options) {
|
|
3608
3624
|
var _a;
|
|
3609
3625
|
return __awaiter(this, void 0, void 0, function () {
|
|
3610
|
-
var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response_1, mimeType, filename_1, fileExtension, mimeType;
|
|
3611
|
-
return __generator(this, function (
|
|
3612
|
-
switch (
|
|
3626
|
+
var _b, fetch, knowledgeSourceContent, name, _c, _d, rootDirname, url, response_1, mimeType, basename, hash, rootDirname_1, filepath, _f, _g, _h, _j, _k, filename_1, fileExtension, mimeType;
|
|
3627
|
+
return __generator(this, function (_l) {
|
|
3628
|
+
switch (_l.label) {
|
|
3613
3629
|
case 0:
|
|
3630
|
+
console.log('!!! makeKnowledgeSourceHandler', knowledgeSource);
|
|
3614
3631
|
_b = tools.fetch, fetch = _b === void 0 ? scraperFetch : _b;
|
|
3615
3632
|
knowledgeSourceContent = knowledgeSource.knowledgeSourceContent;
|
|
3616
3633
|
name = knowledgeSource.name;
|
|
@@ -3618,54 +3635,74 @@
|
|
|
3618
3635
|
if (!name) {
|
|
3619
3636
|
name = knowledgeSourceContentToName(knowledgeSourceContent);
|
|
3620
3637
|
}
|
|
3621
|
-
if (!isValidUrl(knowledgeSourceContent)) return [3 /*break*/,
|
|
3638
|
+
if (!isValidUrl(knowledgeSourceContent)) return [3 /*break*/, 5];
|
|
3622
3639
|
url = knowledgeSourceContent;
|
|
3623
3640
|
return [4 /*yield*/, fetch(url)];
|
|
3624
3641
|
case 1:
|
|
3625
|
-
response_1 =
|
|
3642
|
+
response_1 = _l.sent();
|
|
3626
3643
|
mimeType = ((_a = response_1.headers.get('content-type')) === null || _a === void 0 ? void 0 : _a.split(';')[0]) || 'text/html';
|
|
3627
|
-
|
|
3628
|
-
|
|
3629
|
-
|
|
3630
|
-
|
|
3631
|
-
|
|
3632
|
-
|
|
3633
|
-
|
|
3634
|
-
|
|
3635
|
-
|
|
3636
|
-
|
|
3637
|
-
|
|
3638
|
-
|
|
3639
|
-
|
|
3640
|
-
|
|
3641
|
-
|
|
3642
|
-
|
|
3643
|
-
|
|
3644
|
-
|
|
3645
|
-
|
|
3646
|
-
|
|
3647
|
-
|
|
3648
|
-
|
|
3649
|
-
|
|
3644
|
+
if (tools.fs === undefined || !url.endsWith('.pdf')) {
|
|
3645
|
+
return [2 /*return*/, {
|
|
3646
|
+
source: name,
|
|
3647
|
+
filename: null,
|
|
3648
|
+
url: url,
|
|
3649
|
+
mimeType: mimeType,
|
|
3650
|
+
/*
|
|
3651
|
+
TODO: [🥽]
|
|
3652
|
+
> async asBlob() {
|
|
3653
|
+
> // TODO: [👨🏻🤝👨🏻] This can be called multiple times BUT when called a second time, the response is already consumed
|
|
3654
|
+
> const content = await response.blob();
|
|
3655
|
+
> return content;
|
|
3656
|
+
> },
|
|
3657
|
+
*/
|
|
3658
|
+
asJson: function () {
|
|
3659
|
+
return __awaiter(this, void 0, void 0, function () {
|
|
3660
|
+
var content;
|
|
3661
|
+
return __generator(this, function (_a) {
|
|
3662
|
+
switch (_a.label) {
|
|
3663
|
+
case 0: return [4 /*yield*/, response_1.json()];
|
|
3664
|
+
case 1:
|
|
3665
|
+
content = _a.sent();
|
|
3666
|
+
return [2 /*return*/, content];
|
|
3667
|
+
}
|
|
3668
|
+
});
|
|
3650
3669
|
});
|
|
3651
|
-
}
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
3655
|
-
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
|
|
3659
|
-
|
|
3660
|
-
|
|
3661
|
-
|
|
3662
|
-
}
|
|
3670
|
+
},
|
|
3671
|
+
asText: function () {
|
|
3672
|
+
return __awaiter(this, void 0, void 0, function () {
|
|
3673
|
+
var content;
|
|
3674
|
+
return __generator(this, function (_a) {
|
|
3675
|
+
switch (_a.label) {
|
|
3676
|
+
case 0: return [4 /*yield*/, response_1.text()];
|
|
3677
|
+
case 1:
|
|
3678
|
+
content = _a.sent();
|
|
3679
|
+
return [2 /*return*/, content];
|
|
3680
|
+
}
|
|
3681
|
+
});
|
|
3663
3682
|
});
|
|
3664
|
-
}
|
|
3665
|
-
}
|
|
3666
|
-
|
|
3683
|
+
},
|
|
3684
|
+
}];
|
|
3685
|
+
}
|
|
3686
|
+
basename = url.split('/').pop() || titleToName(url);
|
|
3687
|
+
hash = sha256__default["default"](hexEncoder__default["default"].parse(url)).toString( /* hex */);
|
|
3688
|
+
rootDirname_1 = path.join(process.cwd(), DEFAULT_DOWNLOAD_CACHE_DIRNAME);
|
|
3689
|
+
filepath = path.join.apply(void 0, __spreadArray(__spreadArray([], __read(nameToSubfolderPath(hash /* <- TODO: [🎎] Maybe add some SHA256 prefix */)), false), ["".concat(basename.substring(0, MAX_FILENAME_LENGTH), ".pdf")], false));
|
|
3690
|
+
return [4 /*yield*/, tools.fs.mkdir(path.dirname(path.join(rootDirname_1, filepath)), { recursive: true })];
|
|
3667
3691
|
case 2:
|
|
3668
|
-
|
|
3692
|
+
_l.sent();
|
|
3693
|
+
_g = (_f = tools.fs).writeFile;
|
|
3694
|
+
_h = [path.join(rootDirname_1, filepath)];
|
|
3695
|
+
_k = (_j = Buffer).from;
|
|
3696
|
+
return [4 /*yield*/, response_1.arrayBuffer()];
|
|
3697
|
+
case 3: return [4 /*yield*/, _g.apply(_f, _h.concat([_k.apply(_j, [_l.sent()])]))];
|
|
3698
|
+
case 4:
|
|
3699
|
+
_l.sent();
|
|
3700
|
+
// TODO: !!!!!!!! Check the file security
|
|
3701
|
+
// TODO: !!!!!!!! Check the file size (if it is not too big)
|
|
3702
|
+
// TODO: !!!!!!!! Delete the file
|
|
3703
|
+
return [2 /*return*/, makeKnowledgeSourceHandler({ name: name, knowledgeSourceContent: filepath }, tools, __assign(__assign({}, options), { rootDirname: rootDirname_1 }))];
|
|
3704
|
+
case 5:
|
|
3705
|
+
if (!isValidFilePath(knowledgeSourceContent)) return [3 /*break*/, 7];
|
|
3669
3706
|
if (tools.fs === undefined) {
|
|
3670
3707
|
throw new EnvironmentMismatchError('Can not import file knowledge without filesystem tools');
|
|
3671
3708
|
// <- TODO: [🧠] What is the best error type here?
|
|
@@ -3678,8 +3715,8 @@
|
|
|
3678
3715
|
fileExtension = getFileExtension(filename_1);
|
|
3679
3716
|
mimeType = extensionToMimeType(fileExtension || '');
|
|
3680
3717
|
return [4 /*yield*/, isFileExisting(filename_1, tools.fs)];
|
|
3681
|
-
case
|
|
3682
|
-
if (!(
|
|
3718
|
+
case 6:
|
|
3719
|
+
if (!(_l.sent())) {
|
|
3683
3720
|
throw new NotFoundError(spaceTrim__default["default"](function (block) { return "\n Can not make source handler for file which does not exist:\n\n File:\n ".concat(block(knowledgeSourceContent), "\n\n Full file path:\n ").concat(block(filename_1), "\n "); }));
|
|
3684
3721
|
}
|
|
3685
3722
|
// TODO: [🧠][😿] Test security file - file is scoped to the project (BUT maybe do this in `filesystemTools`)
|
|
@@ -3725,7 +3762,7 @@
|
|
|
3725
3762
|
});
|
|
3726
3763
|
},
|
|
3727
3764
|
}];
|
|
3728
|
-
case
|
|
3765
|
+
case 7: return [2 /*return*/, {
|
|
3729
3766
|
source: name,
|
|
3730
3767
|
filename: null,
|
|
3731
3768
|
url: null,
|