website-scrap-engine 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/life-cycle/default-life-cycle.js +2 -0
- package/lib/life-cycle/default-life-cycle.js.map +1 -1
- package/lib/life-cycle/index.d.ts +1 -0
- package/lib/life-cycle/index.js +3 -1
- package/lib/life-cycle/index.js.map +1 -1
- package/lib/life-cycle/process-html-meta.d.ts +4 -0
- package/lib/life-cycle/process-html-meta.js +68 -0
- package/lib/life-cycle/process-html-meta.js.map +1 -0
- package/lib/sources.js +1 -0
- package/lib/sources.js.map +1 -1
- package/package.json +10 -10
- package/src/life-cycle/default-life-cycle.ts +2 -0
- package/src/life-cycle/index.ts +1 -0
- package/src/life-cycle/process-html-meta.ts +87 -0
- package/src/sources.ts +1 -0
- package/CHANGELOG.md +0 -175
|
@@ -6,6 +6,7 @@ const detect_resource_type_1 = require("./detect-resource-type");
|
|
|
6
6
|
const resource_1 = require("../resource");
|
|
7
7
|
const download_resource_1 = require("./download-resource");
|
|
8
8
|
const process_html_1 = require("./process-html");
|
|
9
|
+
const process_html_meta_1 = require("./process-html-meta");
|
|
9
10
|
const process_css_1 = require("./process-css");
|
|
10
11
|
const process_site_map_1 = require("./process-site-map");
|
|
11
12
|
const process_svg_1 = require("./process-svg");
|
|
@@ -31,6 +32,7 @@ const defaultLifeCycle = () => ({
|
|
|
31
32
|
processAfterDownload: [
|
|
32
33
|
adapters_1.processRedirectedUrl,
|
|
33
34
|
process_html_1.processHtml,
|
|
35
|
+
process_html_meta_1.processHtmlMetaRefresh,
|
|
34
36
|
process_svg_1.processSvg,
|
|
35
37
|
process_css_1.processCss,
|
|
36
38
|
process_site_map_1.processSiteMap
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"default-life-cycle.js","sourceRoot":"","sources":["../../src/life-cycle/default-life-cycle.ts"],"names":[],"mappings":";;;AACA,6CAAuC;AACvC,iEAA0D;AAC1D,0CAA2C;AAC3C,2DAAqD;AACrD,iDAA2C;AAC3C,+CAAyC;AACzC,yDAAkD;AAClD,+CAAyC;AACzC,2DAAmD;AACnD,mEAA2D;AAC3D,yCAAgD;AAChD,+EAAwE;AACxE,+EAAsE;AAEtE;;GAEG;AACI,MAAM,gBAAgB,GAAG,GAAwB,EAAE,CAAC,CAAC;IAC1D,IAAI,EAAE,EAAE;IACR,YAAY,EAAE,CAAC,sBAAS,CAAC;IACzB,kBAAkB,EAAE,CAAC,yCAAkB,CAAC;IACxC,cAAc,EAAd,yBAAc;IACd,qBAAqB,EAAE,EAAE;IACzB,QAAQ,EAAE;QACR,oCAAgB;QAChB,uDAAyB;QACzB,qDAAuB;KACxB;IACD,oBAAoB,EAAE;QACpB,+BAAoB;QACpB,0BAAW;QACX,wBAAU;QACV,wBAAU;QACV,iCAAc;KACf;IACD,UAAU,EAAE,CAAC,kCAAc,EAAE,0CAAkB,CAAC;IAChD,OAAO,EAAE,EAAE;CACZ,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"default-life-cycle.js","sourceRoot":"","sources":["../../src/life-cycle/default-life-cycle.ts"],"names":[],"mappings":";;;AACA,6CAAuC;AACvC,iEAA0D;AAC1D,0CAA2C;AAC3C,2DAAqD;AACrD,iDAA2C;AAC3C,2DAA2D;AAC3D,+CAAyC;AACzC,yDAAkD;AAClD,+CAAyC;AACzC,2DAAmD;AACnD,mEAA2D;AAC3D,yCAAgD;AAChD,+EAAwE;AACxE,+EAAsE;AAEtE;;GAEG;AACI,MAAM,gBAAgB,GAAG,GAAwB,EAAE,CAAC,CAAC;IAC1D,IAAI,EAAE,EAAE;IACR,YAAY,EAAE,CAAC,sBAAS,CAAC;IACzB,kBAAkB,EAAE,CAAC,yCAAkB,CAAC;IACxC,cAAc,EAAd,yBAAc;IACd,qBAAqB,EAAE,EAAE;IACzB,QAAQ,EAAE;QACR,oCAAgB;QAChB,uDAAyB;QACzB,qDAAuB;KACxB;IACD,oBAAoB,EAAE;QACpB,+BAAoB;QACpB,0BAAW;QACX,0CAAsB;QACtB,wBAAU;QACV,wBAAU;QACV,iCAAc;KACf;IACD,UAAU,EAAE,CAAC,kCAAc,EAAE,0CAAkB,CAAC;IAChD,OAAO,EAAE,EAAE;CACZ,CAAC,CAAC;AArBU,QAAA,gBAAgB,oBAqB1B"}
|
|
@@ -6,6 +6,7 @@ export { streamingDownloadToFile, downloadStreamingResource, downloadStreamingRe
|
|
|
6
6
|
export { PipelineExecutor } from './pipeline-executor';
|
|
7
7
|
export { processCssText, processCss } from './process-css';
|
|
8
8
|
export { processHtml } from './process-html';
|
|
9
|
+
export { processHtmlMetaRefresh } from './process-html-meta';
|
|
9
10
|
export { processSiteMap } from './process-site-map';
|
|
10
11
|
export { processSvg } from './process-svg';
|
|
11
12
|
export { getResourceBodyFromHtml, saveHtmlToDisk } from './save-html-to-disk';
|
package/lib/life-cycle/index.js
CHANGED
|
@@ -23,7 +23,7 @@ var __importStar = (this && this.__importStar) || function (mod) {
|
|
|
23
23
|
return result;
|
|
24
24
|
};
|
|
25
25
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
-
exports.types = exports.skipLinks = exports.saveResourceToDisk = exports.saveHtmlToDisk = exports.getResourceBodyFromHtml = exports.processSvg = exports.processSiteMap = exports.processHtml = exports.processCss = exports.processCssText = exports.downloadStreamingResourceWithHook = exports.downloadStreamingResource = exports.streamingDownloadToFile = exports.downloadResource = exports.requestForResource = exports.getRetry = exports.beforeRetryHook = exports.detectResourceType = exports.defaultLifeCycle = exports.adapter = void 0;
|
|
26
|
+
exports.types = exports.skipLinks = exports.saveResourceToDisk = exports.saveHtmlToDisk = exports.getResourceBodyFromHtml = exports.processSvg = exports.processSiteMap = exports.processHtmlMetaRefresh = exports.processHtml = exports.processCss = exports.processCssText = exports.downloadStreamingResourceWithHook = exports.downloadStreamingResource = exports.streamingDownloadToFile = exports.downloadResource = exports.requestForResource = exports.getRetry = exports.beforeRetryHook = exports.detectResourceType = exports.defaultLifeCycle = exports.adapter = void 0;
|
|
27
27
|
exports.adapter = __importStar(require("./adapters"));
|
|
28
28
|
var default_life_cycle_1 = require("./default-life-cycle");
|
|
29
29
|
Object.defineProperty(exports, "defaultLifeCycle", { enumerable: true, get: function () { return default_life_cycle_1.defaultLifeCycle; } });
|
|
@@ -43,6 +43,8 @@ Object.defineProperty(exports, "processCssText", { enumerable: true, get: functi
|
|
|
43
43
|
Object.defineProperty(exports, "processCss", { enumerable: true, get: function () { return process_css_1.processCss; } });
|
|
44
44
|
var process_html_1 = require("./process-html");
|
|
45
45
|
Object.defineProperty(exports, "processHtml", { enumerable: true, get: function () { return process_html_1.processHtml; } });
|
|
46
|
+
var process_html_meta_1 = require("./process-html-meta");
|
|
47
|
+
Object.defineProperty(exports, "processHtmlMetaRefresh", { enumerable: true, get: function () { return process_html_meta_1.processHtmlMetaRefresh; } });
|
|
46
48
|
var process_site_map_1 = require("./process-site-map");
|
|
47
49
|
Object.defineProperty(exports, "processSiteMap", { enumerable: true, get: function () { return process_site_map_1.processSiteMap; } });
|
|
48
50
|
var process_svg_1 = require("./process-svg");
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/life-cycle/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,sDAAsC;AACtC,2DAAsD;AAA9C,sHAAA,gBAAgB,OAAA;AACxB,+DAA0D;AAAlD,0HAAA,kBAAkB,OAAA;AAC1B,yDAE6B;AAD3B,oHAAA,eAAe,OAAA;AAAE,6GAAA,QAAQ,OAAA;AAAE,uHAAA,kBAAkB,OAAA;AAAE,qHAAA,gBAAgB,OAAA;AAEjE,6EAIuC;AAHrC,sIAAA,uBAAuB,OAAA;AACvB,wIAAA,yBAAyB,OAAA;AACzB,gJAAA,iCAAiC,OAAA;AAGnC,6CAAyD;AAAjD,6GAAA,cAAc,OAAA;AAAE,yGAAA,UAAU,OAAA;AAClC,+CAA2C;AAAnC,2GAAA,WAAW,OAAA;AACnB,uDAAkD;AAA1C,kHAAA,cAAc,OAAA;AACtB,6CAAyC;AAAjC,yGAAA,UAAU,OAAA;AAClB,yDAA4E;AAApE,4HAAA,uBAAuB,OAAA;AAAE,mHAAA,cAAc,OAAA;AAC/C,iEAA2D;AAAnD,2HAAA,kBAAkB,OAAA;AAC1B,2CAAuC;AAA/B,uGAAA,SAAS,OAAA;AACjB,iDAAiC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/life-cycle/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,sDAAsC;AACtC,2DAAsD;AAA9C,sHAAA,gBAAgB,OAAA;AACxB,+DAA0D;AAAlD,0HAAA,kBAAkB,OAAA;AAC1B,yDAE6B;AAD3B,oHAAA,eAAe,OAAA;AAAE,6GAAA,QAAQ,OAAA;AAAE,uHAAA,kBAAkB,OAAA;AAAE,qHAAA,gBAAgB,OAAA;AAEjE,6EAIuC;AAHrC,sIAAA,uBAAuB,OAAA;AACvB,wIAAA,yBAAyB,OAAA;AACzB,gJAAA,iCAAiC,OAAA;AAGnC,6CAAyD;AAAjD,6GAAA,cAAc,OAAA;AAAE,yGAAA,UAAU,OAAA;AAClC,+CAA2C;AAAnC,2GAAA,WAAW,OAAA;AACnB,yDAA2D;AAAnD,2HAAA,sBAAsB,OAAA;AAC9B,uDAAkD;AAA1C,kHAAA,cAAc,OAAA;AACtB,6CAAyC;AAAjC,yGAAA,UAAU,OAAA;AAClB,yDAA4E;AAApE,4HAAA,uBAAuB,OAAA;AAAE,mHAAA,cAAc,OAAA;AAC/C,iEAA2D;AAAnD,2HAAA,kBAAkB,OAAA;AAC1B,2CAAuC;AAA/B,uGAAA,SAAS,OAAA;AACjB,iDAAiC"}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { DownloadResource, SubmitResourceFunc } from './types';
|
|
2
|
+
import type { StaticDownloadOptions } from '../options';
|
|
3
|
+
import type { PipelineExecutor } from './pipeline-executor';
|
|
4
|
+
export declare function processHtmlMetaRefresh(res: DownloadResource, submit: SubmitResourceFunc, options: StaticDownloadOptions, pipeline: PipelineExecutor): Promise<DownloadResource>;
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.processHtmlMetaRefresh = void 0;
|
|
4
|
+
const resource_1 = require("../resource");
|
|
5
|
+
const adapters_1 = require("./adapters");
|
|
6
|
+
const logger_1 = require("../logger/logger");
|
|
7
|
+
/**
|
|
8
|
+
* Originally created by https://github.com/stevenvachon at
|
|
9
|
+
* https://github.com/stevenvachon/http-equiv-refresh
|
|
10
|
+
* MIT license
|
|
11
|
+
*/
|
|
12
|
+
const META_REFRESH_PATTERN = /^\s*(\d+)(?:\s*;(?:\s*url\s*=)?\s*(?:["']\s*(.*?)\s*['"]|(.*?)))?\s*$/i;
|
|
13
|
+
async function processHtmlMetaRefresh(res, submit, options, pipeline) {
|
|
14
|
+
if (res.type !== resource_1.ResourceType.Html) {
|
|
15
|
+
return res;
|
|
16
|
+
}
|
|
17
|
+
if (!res.meta.doc) {
|
|
18
|
+
res.meta.doc = (0, adapters_1.parseHtml)(res, options);
|
|
19
|
+
}
|
|
20
|
+
const $ = res.meta.doc;
|
|
21
|
+
const metaLinks = $('meta[http-equiv="refresh"][content]');
|
|
22
|
+
if (metaLinks.length) {
|
|
23
|
+
const refUrl = res.redirectedUrl || res.url;
|
|
24
|
+
const savePath = refUrl === res.url ? res.savePath : undefined;
|
|
25
|
+
const depth = res.depth + 1;
|
|
26
|
+
for (let index = 0; index < metaLinks.length; index++) {
|
|
27
|
+
const elem = metaLinks.eq(index);
|
|
28
|
+
const attrValue = elem.attr('content');
|
|
29
|
+
if (!attrValue) {
|
|
30
|
+
continue;
|
|
31
|
+
}
|
|
32
|
+
const match = META_REFRESH_PATTERN.exec(attrValue);
|
|
33
|
+
if (!match) {
|
|
34
|
+
continue;
|
|
35
|
+
}
|
|
36
|
+
const originalLink = match[2] || match[3];
|
|
37
|
+
if (!originalLink) {
|
|
38
|
+
continue;
|
|
39
|
+
}
|
|
40
|
+
const link = await pipeline.linkRedirect(originalLink, elem, res);
|
|
41
|
+
if (!link) {
|
|
42
|
+
continue;
|
|
43
|
+
}
|
|
44
|
+
const linkType = await pipeline.detectResourceType(link, resource_1.ResourceType.Html, elem, res);
|
|
45
|
+
if (!linkType) {
|
|
46
|
+
if (logger_1.skip.isTraceEnabled()) {
|
|
47
|
+
logger_1.skip.trace('skip detectResourceType', originalLink, link, refUrl);
|
|
48
|
+
}
|
|
49
|
+
continue;
|
|
50
|
+
}
|
|
51
|
+
let resource = await pipeline.createResource(linkType, depth, link, refUrl, res.localRoot, options.encoding[linkType], savePath, res.type);
|
|
52
|
+
resource = await pipeline.processBeforeDownload(resource, elem, res, options);
|
|
53
|
+
if (!resource) {
|
|
54
|
+
if (logger_1.skip.isTraceEnabled()) {
|
|
55
|
+
logger_1.skip.trace('skip processBeforeDownload', originalLink, link, linkType, refUrl);
|
|
56
|
+
}
|
|
57
|
+
continue;
|
|
58
|
+
}
|
|
59
|
+
if (!resource.shouldBeDiscardedFromDownload) {
|
|
60
|
+
submit(resource);
|
|
61
|
+
}
|
|
62
|
+
elem.attr('content', attrValue.replace(originalLink, resource.replacePath));
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
return res;
|
|
66
|
+
}
|
|
67
|
+
exports.processHtmlMetaRefresh = processHtmlMetaRefresh;
|
|
68
|
+
//# sourceMappingURL=process-html-meta.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"process-html-meta.js","sourceRoot":"","sources":["../../src/life-cycle/process-html-meta.ts"],"names":[],"mappings":";;;AAGA,0CAAmD;AACnD,yCAAqC;AACrC,6CAAsC;AAEtC;;;;GAIG;AACH,MAAM,oBAAoB,GACxB,wEAAwE,CAAC;AAEpE,KAAK,UAAU,sBAAsB,CAC1C,GAAqB,EACrB,MAA0B,EAC1B,OAA8B,EAC9B,QAA0B;IAG1B,IAAI,GAAG,CAAC,IAAI,KAAK,uBAAY,CAAC,IAAI,EAAE;QAClC,OAAO,GAAG,CAAC;KACZ;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,EAAE;QACjB,GAAG,CAAC,IAAI,CAAC,GAAG,GAAG,IAAA,oBAAS,EAAC,GAAG,EAAE,OAAO,CAAC,CAAC;KACxC;IACD,MAAM,CAAC,GAAG,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC;IAEvB,MAAM,SAAS,GAAG,CAAC,CAAC,qCAAqC,CAAC,CAAC;IAC3D,IAAI,SAAS,CAAC,MAAM,EAAE;QACpB,MAAM,MAAM,GAAW,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,GAAG,CAAC;QACpD,MAAM,QAAQ,GAAG,MAAM,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;QAE/D,MAAM,KAAK,GAAW,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC;QAEpC,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,SAAS,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE;YACrD,MAAM,IAAI,GAAG,SAAS,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC;YACjC,MAAM,SAAS,GAAkB,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtD,IAAI,CAAC,SAAS,EAAE;gBACd,SAAS;aACV;YACD,MAAM,KAAK,GAAG,oBAAoB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACnD,IAAI,CAAC,KAAK,EAAE;gBACV,SAAS;aACV;YACD,MAAM,YAAY,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC;YAC1C,IAAI,CAAC,YAAY,EAAE;gBACjB,SAAS;aACV;YACD,MAAM,IAAI,GACR,MAAM,QAAQ,CAAC,YAAY,CAAC,YAAY,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;YACvD,IAAI,CAAC,IAAI,EAAE;gBACT,SAAS;aACV;YAED,MAAM,QAAQ,GACZ,MAAM,QAAQ,CAAC,kBAAkB,CAAC,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;YACxE,IAAI,CAAC,QAAQ,EAAE;gBACb,IAAI,aAAI,CAAC,cAAc,EAAE,EAAE;oBACzB,aAAI,CAAC,KAAK,CAAC,yBAAyB,EAClC,YAAY,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;iBAC/B;gBACD,SAAS;aACV;YACD,IAAI,QAAQ,GAAoB,MAAM,QAAQ,CAAC,cAAc,CAC3D,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAC7B,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,EACzC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;YACtB,QAAQ,GAAG,MAAM,QAAQ,CAAC,qBAAqB,CAAC,QAAQ,EAAE,IAAI,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;YAC9E,IAAI,CAAC,QAAQ,EAAE;gBACb,IAAI,aAAI,CAAC,cAAc,E
AAE,EAAE;oBACzB,aAAI,CAAC,KAAK,CAAC,4BAA4B,EACrC,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;iBACzC;gBACD,SAAS;aACV;YACD,IAAI,CAAC,QAAQ,CAAC,6BAA6B,EAAE;gBAC3C,MAAM,CAAC,QAAQ,CAAC,CAAC;aAClB;YACD,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,SAAS,CAAC,OAAO,CAAC,YAAY,EAAE,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC;SAC7E;KACF;IAED,OAAO,GAAG,CAAC;AACb,CAAC;AAvED,wDAuEC"}
|
package/lib/sources.js
CHANGED
|
@@ -32,6 +32,7 @@ exports.sources = [
|
|
|
32
32
|
{ selector: 'meta[property="og\\:video\\:url"]', attr: 'content' },
|
|
33
33
|
{ selector: 'meta[property="og\\:video\\:secure_url"]', attr: 'content' },
|
|
34
34
|
{ selector: 'video', attr: 'src' },
|
|
35
|
+
{ selector: 'video', attr: 'poster' },
|
|
35
36
|
{ selector: 'video source', attr: 'src' },
|
|
36
37
|
{ selector: 'video track', attr: 'src' },
|
|
37
38
|
{ selector: 'audio', attr: 'src' },
|
package/lib/sources.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sources.js","sourceRoot":"","sources":["../src/sources.ts"],"names":[],"mappings":";;;AAAA,yCAAwC;AAQxC,0DAA0D;AAC1D,wEAAwE;AAC3D,QAAA,OAAO,GAAuB;IACzC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IACjD,EAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IAClE,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAC;IAC9B,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAC;IACjC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAC;IAClC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,OAAO,EAAC;IAChD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAC;IACjC,EAAC,QAAQ,EAAE,wBAAwB,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,GAAG,EAAC;IAC1E,EAAC,QAAQ,EAAE,mBAAmB,EAAE,IAAI,EAAE,MAAM,EAAC;IAC7C,EAAC,QAAQ,EAAE,sBAAsB,EAAE,IAAI,EAAE,MAAM,EAAC;IAChD,yCAAyC;IACzC,qDAAqD;IACrD,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,YAAY,EAAC;IACrD,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAC;IACvC,EAAC,QAAQ,EAAE,gBAAgB,EAAE,IAAI,EAAE,QAAQ,EAAC;IAC5C,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACzD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IAC1D,EAAC
,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACtD,mEAAmE;IACnE,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,YAAY,EAAC;CAC/C,CAAC,GAAG,CAAC,CAAC,GAA8B,EAAE,EAAE;IACvC,IAAI,GAAG,CAAC,QAAQ,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,EAAE;QAC/D,GAAG,CAAC,QAAQ,IAAI,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC;KACjC;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE;QACb,GAAG,CAAC,IAAI,GAAG,uBAAY,CAAC,MAAM,CAAC;KAChC;IACD,OAAO,GAAuB,CAAC;AACjC,CAAC,CAAC,CAAC"}
|
|
1
|
+
{"version":3,"file":"sources.js","sourceRoot":"","sources":["../src/sources.ts"],"names":[],"mappings":";;;AAAA,yCAAwC;AAQxC,0DAA0D;AAC1D,wEAAwE;AAC3D,QAAA,OAAO,GAAuB;IACzC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IACjD,EAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IAClE,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAC;IAC9B,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAC;IACjC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAC;IAClC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,OAAO,EAAC;IAChD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAC;IACjC,EAAC,QAAQ,EAAE,wBAAwB,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,GAAG,EAAC;IAC1E,EAAC,QAAQ,EAAE,mBAAmB,EAAE,IAAI,EAAE,MAAM,EAAC;IAC7C,EAAC,QAAQ,EAAE,sBAAsB,EAAE,IAAI,EAAE,MAAM,EAAC;IAChD,yCAAyC;IACzC,qDAAqD;IACrD,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,YAAY,EAAC;IACrD,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAC;IACvC,EAAC,QAAQ,EAAE,gBAAgB,EAAE,IAAI,EAAE,QAAQ,EAAC;IAC5C,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAC;IACnC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACzD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,
KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IAC1D,EAAC,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACtD,mEAAmE;IACnE,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,YAAY,EAAC;CAC/C,CAAC,GAAG,CAAC,CAAC,GAA8B,EAAE,EAAE;IACvC,IAAI,GAAG,CAAC,QAAQ,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,EAAE;QAC/D,GAAG,CAAC,QAAQ,IAAI,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC;KACjC;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE;QACb,GAAG,CAAC,IAAI,GAAG,uBAAY,CAAC,MAAM,CAAC;KAChC;IACD,OAAO,GAAuB,CAAC;AACjC,CAAC,CAAC,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "website-scrap-engine",
|
|
3
|
-
"version": "0.7.0",
|
|
3
|
+
"version": "0.7.1",
|
|
4
4
|
"description": "Configurable website scraper in typescript",
|
|
5
5
|
"main": "lib",
|
|
6
6
|
"types": "lib",
|
|
@@ -25,20 +25,20 @@
|
|
|
25
25
|
"css-url-parser": "^1.1.3",
|
|
26
26
|
"got": "^11.8.6",
|
|
27
27
|
"log4js": "^6.9.1",
|
|
28
|
-
"mkdirp": "^3.0.0",
|
|
28
|
+
"mkdirp": "^3.0.1",
|
|
29
29
|
"p-queue": "^6.6.2",
|
|
30
30
|
"srcset": "^4.0.0",
|
|
31
31
|
"urijs": "^1.19.11"
|
|
32
32
|
},
|
|
33
33
|
"devDependencies": {
|
|
34
|
-
"@types/jest": "^
|
|
35
|
-
"@types/node": "^
|
|
36
|
-
"@types/urijs": "^1.19.
|
|
37
|
-
"@typescript-eslint/eslint-plugin": "^5.
|
|
38
|
-
"@typescript-eslint/parser": "^5.
|
|
39
|
-
"eslint": "^8.
|
|
40
|
-
"jest": "^
|
|
41
|
-
"ts-jest": "^
|
|
34
|
+
"@types/jest": "^28.1.1",
|
|
35
|
+
"@types/node": "^20.8.7",
|
|
36
|
+
"@types/urijs": "^1.19.20",
|
|
37
|
+
"@typescript-eslint/eslint-plugin": "^5.62.0",
|
|
38
|
+
"@typescript-eslint/parser": "^5.62.0",
|
|
39
|
+
"eslint": "^8.52.0",
|
|
40
|
+
"jest": "^28.1.3",
|
|
41
|
+
"ts-jest": "^28.0.8",
|
|
42
42
|
"typescript": "^5.0.4"
|
|
43
43
|
},
|
|
44
44
|
"files": [
|
|
@@ -4,6 +4,7 @@ import {detectResourceType} from './detect-resource-type';
|
|
|
4
4
|
import {createResource} from '../resource';
|
|
5
5
|
import {downloadResource} from './download-resource';
|
|
6
6
|
import {processHtml} from './process-html';
|
|
7
|
+
import {processHtmlMetaRefresh} from './process-html-meta';
|
|
7
8
|
import {processCss} from './process-css';
|
|
8
9
|
import {processSiteMap} from './process-site-map';
|
|
9
10
|
import {processSvg} from './process-svg';
|
|
@@ -30,6 +31,7 @@ export const defaultLifeCycle = (): ProcessingLifeCycle => ({
|
|
|
30
31
|
processAfterDownload: [
|
|
31
32
|
processRedirectedUrl,
|
|
32
33
|
processHtml,
|
|
34
|
+
processHtmlMetaRefresh,
|
|
33
35
|
processSvg,
|
|
34
36
|
processCss,
|
|
35
37
|
processSiteMap
|
package/src/life-cycle/index.ts
CHANGED
|
@@ -12,6 +12,7 @@ export {
|
|
|
12
12
|
export {PipelineExecutor} from './pipeline-executor';
|
|
13
13
|
export {processCssText, processCss} from './process-css';
|
|
14
14
|
export {processHtml} from './process-html';
|
|
15
|
+
export {processHtmlMetaRefresh} from './process-html-meta';
|
|
15
16
|
export {processSiteMap} from './process-site-map';
|
|
16
17
|
export {processSvg} from './process-svg';
|
|
17
18
|
export {getResourceBodyFromHtml, saveHtmlToDisk} from './save-html-to-disk';
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import type {DownloadResource, SubmitResourceFunc} from './types';
|
|
2
|
+
import type {StaticDownloadOptions} from '../options';
|
|
3
|
+
import type {PipelineExecutor} from './pipeline-executor';
|
|
4
|
+
import {Resource, ResourceType} from '../resource';
|
|
5
|
+
import {parseHtml} from './adapters';
|
|
6
|
+
import {skip} from '../logger/logger';
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Originally created by https://github.com/stevenvachon at
|
|
10
|
+
* https://github.com/stevenvachon/http-equiv-refresh
|
|
11
|
+
* MIT license
|
|
12
|
+
*/
|
|
13
|
+
const META_REFRESH_PATTERN =
|
|
14
|
+
/^\s*(\d+)(?:\s*;(?:\s*url\s*=)?\s*(?:["']\s*(.*?)\s*['"]|(.*?)))?\s*$/i;
|
|
15
|
+
|
|
16
|
+
export async function processHtmlMetaRefresh(
|
|
17
|
+
res: DownloadResource,
|
|
18
|
+
submit: SubmitResourceFunc,
|
|
19
|
+
options: StaticDownloadOptions,
|
|
20
|
+
pipeline: PipelineExecutor
|
|
21
|
+
): Promise<DownloadResource> {
|
|
22
|
+
|
|
23
|
+
if (res.type !== ResourceType.Html) {
|
|
24
|
+
return res;
|
|
25
|
+
}
|
|
26
|
+
if (!res.meta.doc) {
|
|
27
|
+
res.meta.doc = parseHtml(res, options);
|
|
28
|
+
}
|
|
29
|
+
const $ = res.meta.doc;
|
|
30
|
+
|
|
31
|
+
const metaLinks = $('meta[http-equiv="refresh"][content]');
|
|
32
|
+
if (metaLinks.length) {
|
|
33
|
+
const refUrl: string = res.redirectedUrl || res.url;
|
|
34
|
+
const savePath = refUrl === res.url ? res.savePath : undefined;
|
|
35
|
+
|
|
36
|
+
const depth: number = res.depth + 1;
|
|
37
|
+
|
|
38
|
+
for (let index = 0; index < metaLinks.length; index++) {
|
|
39
|
+
const elem = metaLinks.eq(index);
|
|
40
|
+
const attrValue: string | void = elem.attr('content');
|
|
41
|
+
if (!attrValue) {
|
|
42
|
+
continue;
|
|
43
|
+
}
|
|
44
|
+
const match = META_REFRESH_PATTERN.exec(attrValue);
|
|
45
|
+
if (!match) {
|
|
46
|
+
continue;
|
|
47
|
+
}
|
|
48
|
+
const originalLink = match[2] || match[3];
|
|
49
|
+
if (!originalLink) {
|
|
50
|
+
continue;
|
|
51
|
+
}
|
|
52
|
+
const link: string | void =
|
|
53
|
+
await pipeline.linkRedirect(originalLink, elem, res);
|
|
54
|
+
if (!link) {
|
|
55
|
+
continue;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
const linkType: ResourceType | void =
|
|
59
|
+
await pipeline.detectResourceType(link, ResourceType.Html, elem, res);
|
|
60
|
+
if (!linkType) {
|
|
61
|
+
if (skip.isTraceEnabled()) {
|
|
62
|
+
skip.trace('skip detectResourceType',
|
|
63
|
+
originalLink, link, refUrl);
|
|
64
|
+
}
|
|
65
|
+
continue;
|
|
66
|
+
}
|
|
67
|
+
let resource: Resource | void = await pipeline.createResource(
|
|
68
|
+
linkType, depth, link, refUrl,
|
|
69
|
+
res.localRoot, options.encoding[linkType],
|
|
70
|
+
savePath, res.type);
|
|
71
|
+
resource = await pipeline.processBeforeDownload(resource, elem, res, options);
|
|
72
|
+
if (!resource) {
|
|
73
|
+
if (skip.isTraceEnabled()) {
|
|
74
|
+
skip.trace('skip processBeforeDownload',
|
|
75
|
+
originalLink, link, linkType, refUrl);
|
|
76
|
+
}
|
|
77
|
+
continue;
|
|
78
|
+
}
|
|
79
|
+
if (!resource.shouldBeDiscardedFromDownload) {
|
|
80
|
+
submit(resource);
|
|
81
|
+
}
|
|
82
|
+
elem.attr('content', attrValue.replace(originalLink, resource.replacePath));
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
return res;
|
|
87
|
+
}
|
package/src/sources.ts
CHANGED
|
@@ -36,6 +36,7 @@ export const sources: SourceDefinition[] = [
|
|
|
36
36
|
{selector: 'meta[property="og\\:video\\:url"]', attr: 'content'},
|
|
37
37
|
{selector: 'meta[property="og\\:video\\:secure_url"]', attr: 'content'},
|
|
38
38
|
{selector: 'video', attr: 'src'},
|
|
39
|
+
{selector: 'video', attr: 'poster'},
|
|
39
40
|
{selector: 'video source', attr: 'src'},
|
|
40
41
|
{selector: 'video track', attr: 'src'},
|
|
41
42
|
{selector: 'audio', attr: 'src'},
|
package/CHANGELOG.md
DELETED
|
@@ -1,175 +0,0 @@
|
|
|
1
|
-
0.7.0
|
|
2
|
-
============
|
|
3
|
-
|
|
4
|
-
BREAKING
|
|
5
|
-
------------
|
|
6
|
-
* build(deps): bump mkdirp from 2.1.6 to 3.0.0
|
|
7
|
-
* build(deps-dev): bump typescript from 4.9.5 to 5.0.4
|
|
8
|
-
|
|
9
|
-
0.6.0
|
|
10
|
-
============
|
|
11
|
-
|
|
12
|
-
BREAKING
|
|
13
|
-
------------
|
|
14
|
-
* resource: custom callback for rewriting savePath
|
|
15
|
-
* life-cycle: custom callback for rewriting savePath (<https://github.com/website-local/website-scrap-engine/issues/383>)
|
|
16
|
-
|
|
17
|
-
Fix
|
|
18
|
-
------------
|
|
19
|
-
* cheerio: replace deprecated api
|
|
20
|
-
|
|
21
|
-
Test
|
|
22
|
-
------------
|
|
23
|
-
* test: migrating to eslint v8 and typescript-eslint v5
|
|
24
|
-
* cheerio: fix a test
|
|
25
|
-
* resource: add a test
|
|
26
|
-
* ci: run tests on node.js 18.x (<https://github.com/website-local/website-scrap-engine/issues/610>)
|
|
27
|
-
|
|
28
|
-
Misc
|
|
29
|
-
------------
|
|
30
|
-
* package-lock-resolved: process registry.npmmirror.com
|
|
31
|
-
* logger: fix type conflict
|
|
32
|
-
* util: fix compatibility with typescript 4.8
|
|
33
|
-
* npm: drop @types/mkdirp
|
|
34
|
-
* update deps
|
|
35
|
-
|
|
36
|
-
0.5.0
|
|
37
|
-
============
|
|
38
|
-
|
|
39
|
-
BREAKING
|
|
40
|
-
------------
|
|
41
|
-
* typescript 4.4 support
|
|
42
|
-
* WorkerMessage: `error` can be `unknown`
|
|
43
|
-
* StreamingDownloadErrorHook: `e` can be `unknown`
|
|
44
|
-
* pipeline-executor-impl fix keepSearch param
|
|
45
|
-
* resource: redirectedSavePath not set after redirect
|
|
46
|
-
|
|
47
|
-
Test
|
|
48
|
-
------------
|
|
49
|
-
* test: adapt for jest 27 and ts-jest 27
|
|
50
|
-
|
|
51
|
-
0.4.0
|
|
52
|
-
============
|
|
53
|
-
|
|
54
|
-
BREAKING
|
|
55
|
-
------------
|
|
56
|
-
* worker-pool: load based worker pool (#11)
|
|
57
|
-
* cheerio: adapt for version 1.0.0-rc.10 (#271)
|
|
58
|
-
* test: adapt for URI.js v1.19.7 (#301)
|
|
59
|
-
|
|
60
|
-
Fix
|
|
61
|
-
------------
|
|
62
|
-
* downloader: correctly transfer resource body
|
|
63
|
-
* correctly convert ArrayBufferView to Buffer
|
|
64
|
-
* worker-pool: fix ready
|
|
65
|
-
|
|
66
|
-
Enhancement
|
|
67
|
-
------------
|
|
68
|
-
* npm: update
|
|
69
|
-
* life-cycle: add init and dispose life cycle
|
|
70
|
-
* resource: optionally redirected savePath
|
|
71
|
-
* resource: take a log on replacing long search string
|
|
72
|
-
* save-to-disk: optionally use remote date
|
|
73
|
-
* worker-pool: log worker errors
|
|
74
|
-
* worker-pool: custom initializer of worker
|
|
75
|
-
|
|
76
|
-
Test
|
|
77
|
-
------------
|
|
78
|
-
* worker-pool: basic unit test
|
|
79
|
-
* save-html-to-disk: initial unit tests with mocked fs
|
|
80
|
-
* save-to-disk: refactor tests
|
|
81
|
-
|
|
82
|
-
0.3.2
|
|
83
|
-
============
|
|
84
|
-
|
|
85
|
-
* resource: fix redirected path processing (#157)
|
|
86
|
-
* downloader: optional wait for this.init in method onIdle (#152)
|
|
87
|
-
* typescript: prefer type only import
|
|
88
|
-
|
|
89
|
-
0.3.1
|
|
90
|
-
============
|
|
91
|
-
|
|
92
|
-
* resource: use correct file scheme for windows (#145)
|
|
93
|
-
|
|
94
|
-
0.3.0
|
|
95
|
-
============
|
|
96
|
-
|
|
97
|
-
New Feature
|
|
98
|
-
------------
|
|
99
|
-
* life-cycle: extract and process source maps (#123)
|
|
100
|
-
* adapters: async processHtml
|
|
101
|
-
* life-cycle: add read-or-copy-local-resource
|
|
102
|
-
* resource: support file protocol (#126)
|
|
103
|
-
|
|
104
|
-
Misc
|
|
105
|
-
------------
|
|
106
|
-
* types: export type CheerioElement
|
|
107
|
-
* resource: optional skip replacePath processing in case of parser error (#107)
|
|
108
|
-
* resource: fix new type of Buffer.from (#116)
|
|
109
|
-
* build(deps): bump cheerio from 1.0.0-rc.3 to 1.0.0-rc.5
|
|
110
|
-
* io: mkdirRetry returns no string
|
|
111
|
-
* life-cycle: add download-streaming-resource to default
|
|
112
|
-
* skip-links: skip unix scheme
|
|
113
|
-
* skip-links: allow file protocol
|
|
114
|
-
* download: skip non-http url
|
|
115
|
-
* (BREAKING) resource: refactor createResource (#139)
|
|
116
|
-
|
|
117
|
-
0.2.0
|
|
118
|
-
============
|
|
119
|
-
* life-cycle: streaming download and save binary resource to disk
|
|
120
|
-
* build(deps-dev): bump @types/cheerio from 0.22.21 to 0.22.22
|
|
121
|
-
|
|
122
|
-
0.1.7
|
|
123
|
-
============
|
|
124
|
-
* resource: parse and process standalone svg images
|
|
125
|
-
* save-html-to-disk: keep location hash in redirect placeholder
|
|
126
|
-
* detect-resource-type: export lowerCaseExtension
|
|
127
|
-
* downloader: log downloadLink instead of rawUrl
|
|
128
|
-
* typescript: update to v4.0
|
|
129
|
-
|
|
130
|
-
0.1.6
|
|
131
|
-
============
|
|
132
|
-
* save-resource-to-disk: compare redirectedUrl with url
|
|
133
|
-
* process-html: submit resources from inline css
|
|
134
|
-
* downloader: correctly use adjustTimer on start
|
|
135
|
-
* downloader: deduplicate on redirectedUrl
|
|
136
|
-
* downloader: do not wait for complete on add
|
|
137
|
-
|
|
138
|
-
0.1.5
|
|
139
|
-
============
|
|
140
|
-
* downloader: do not wait for complete on add
|
|
141
|
-
* process-html: fix detecting type
|
|
142
|
-
* npm: update p-queue to 6.6.0
|
|
143
|
-
* npm: move copy script to build
|
|
144
|
-
|
|
145
|
-
0.1.4
|
|
146
|
-
============
|
|
147
|
-
* save-html-to-disk: fix redirect check
|
|
148
|
-
* logger: add logger for skipExternal
|
|
149
|
-
|
|
150
|
-
0.1.3
|
|
151
|
-
============
|
|
152
|
-
* save-html-to-disk: fix redirect placeholder path
|
|
153
|
-
|
|
154
|
-
0.1.2
|
|
155
|
-
============
|
|
156
|
-
* adapters: make processRedirectedUrl named function
|
|
157
|
-
* options: move initialUrl and logSubDir to StaticDownloadOptions
|
|
158
|
-
* options: retry on error codes
|
|
159
|
-
* download-resource: manually retry on got internal errors
|
|
160
|
-
* io: refactor mkdirRetry
|
|
161
|
-
* process-html: skip invalid srcset
|
|
162
|
-
|
|
163
|
-
0.1.1
|
|
164
|
-
============
|
|
165
|
-
* io: remove mkdirRetrySync and update writeFile
|
|
166
|
-
* util: arrayToMap could freeze the object returned if required
|
|
167
|
-
* detect-resource-type: fix url with search and hash
|
|
168
|
-
* options: allow merging got options from StaticDownloadOptions
|
|
169
|
-
* options: add comments
|
|
170
|
-
* life-cycle: convert default life cycle fn to named function
|
|
171
|
-
|
|
172
|
-
0.1.0
|
|
173
|
-
============
|
|
174
|
-
Initial release.
|
|
175
|
-
|