website-scrap-engine 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,7 @@ const detect_resource_type_1 = require("./detect-resource-type");
6
6
  const resource_1 = require("../resource");
7
7
  const download_resource_1 = require("./download-resource");
8
8
  const process_html_1 = require("./process-html");
9
+ const process_html_meta_1 = require("./process-html-meta");
9
10
  const process_css_1 = require("./process-css");
10
11
  const process_site_map_1 = require("./process-site-map");
11
12
  const process_svg_1 = require("./process-svg");
@@ -31,6 +32,7 @@ const defaultLifeCycle = () => ({
31
32
  processAfterDownload: [
32
33
  adapters_1.processRedirectedUrl,
33
34
  process_html_1.processHtml,
35
+ process_html_meta_1.processHtmlMetaRefresh,
34
36
  process_svg_1.processSvg,
35
37
  process_css_1.processCss,
36
38
  process_site_map_1.processSiteMap
@@ -1 +1 @@
1
- {"version":3,"file":"default-life-cycle.js","sourceRoot":"","sources":["../../src/life-cycle/default-life-cycle.ts"],"names":[],"mappings":";;;AACA,6CAAuC;AACvC,iEAA0D;AAC1D,0CAA2C;AAC3C,2DAAqD;AACrD,iDAA2C;AAC3C,+CAAyC;AACzC,yDAAkD;AAClD,+CAAyC;AACzC,2DAAmD;AACnD,mEAA2D;AAC3D,yCAAgD;AAChD,+EAAwE;AACxE,+EAAsE;AAEtE;;GAEG;AACI,MAAM,gBAAgB,GAAG,GAAwB,EAAE,CAAC,CAAC;IAC1D,IAAI,EAAE,EAAE;IACR,YAAY,EAAE,CAAC,sBAAS,CAAC;IACzB,kBAAkB,EAAE,CAAC,yCAAkB,CAAC;IACxC,cAAc,EAAd,yBAAc;IACd,qBAAqB,EAAE,EAAE;IACzB,QAAQ,EAAE;QACR,oCAAgB;QAChB,uDAAyB;QACzB,qDAAuB;KACxB;IACD,oBAAoB,EAAE;QACpB,+BAAoB;QACpB,0BAAW;QACX,wBAAU;QACV,wBAAU;QACV,iCAAc;KACf;IACD,UAAU,EAAE,CAAC,kCAAc,EAAE,0CAAkB,CAAC;IAChD,OAAO,EAAE,EAAE;CACZ,CAAC,CAAC;AApBU,QAAA,gBAAgB,oBAoB1B"}
1
+ {"version":3,"file":"default-life-cycle.js","sourceRoot":"","sources":["../../src/life-cycle/default-life-cycle.ts"],"names":[],"mappings":";;;AACA,6CAAuC;AACvC,iEAA0D;AAC1D,0CAA2C;AAC3C,2DAAqD;AACrD,iDAA2C;AAC3C,2DAA2D;AAC3D,+CAAyC;AACzC,yDAAkD;AAClD,+CAAyC;AACzC,2DAAmD;AACnD,mEAA2D;AAC3D,yCAAgD;AAChD,+EAAwE;AACxE,+EAAsE;AAEtE;;GAEG;AACI,MAAM,gBAAgB,GAAG,GAAwB,EAAE,CAAC,CAAC;IAC1D,IAAI,EAAE,EAAE;IACR,YAAY,EAAE,CAAC,sBAAS,CAAC;IACzB,kBAAkB,EAAE,CAAC,yCAAkB,CAAC;IACxC,cAAc,EAAd,yBAAc;IACd,qBAAqB,EAAE,EAAE;IACzB,QAAQ,EAAE;QACR,oCAAgB;QAChB,uDAAyB;QACzB,qDAAuB;KACxB;IACD,oBAAoB,EAAE;QACpB,+BAAoB;QACpB,0BAAW;QACX,0CAAsB;QACtB,wBAAU;QACV,wBAAU;QACV,iCAAc;KACf;IACD,UAAU,EAAE,CAAC,kCAAc,EAAE,0CAAkB,CAAC;IAChD,OAAO,EAAE,EAAE;CACZ,CAAC,CAAC;AArBU,QAAA,gBAAgB,oBAqB1B"}
@@ -6,6 +6,7 @@ export { streamingDownloadToFile, downloadStreamingResource, downloadStreamingRe
6
6
  export { PipelineExecutor } from './pipeline-executor';
7
7
  export { processCssText, processCss } from './process-css';
8
8
  export { processHtml } from './process-html';
9
+ export { processHtmlMetaRefresh } from './process-html-meta';
9
10
  export { processSiteMap } from './process-site-map';
10
11
  export { processSvg } from './process-svg';
11
12
  export { getResourceBodyFromHtml, saveHtmlToDisk } from './save-html-to-disk';
@@ -23,7 +23,7 @@ var __importStar = (this && this.__importStar) || function (mod) {
23
23
  return result;
24
24
  };
25
25
  Object.defineProperty(exports, "__esModule", { value: true });
26
- exports.types = exports.skipLinks = exports.saveResourceToDisk = exports.saveHtmlToDisk = exports.getResourceBodyFromHtml = exports.processSvg = exports.processSiteMap = exports.processHtml = exports.processCss = exports.processCssText = exports.downloadStreamingResourceWithHook = exports.downloadStreamingResource = exports.streamingDownloadToFile = exports.downloadResource = exports.requestForResource = exports.getRetry = exports.beforeRetryHook = exports.detectResourceType = exports.defaultLifeCycle = exports.adapter = void 0;
26
+ exports.types = exports.skipLinks = exports.saveResourceToDisk = exports.saveHtmlToDisk = exports.getResourceBodyFromHtml = exports.processSvg = exports.processSiteMap = exports.processHtmlMetaRefresh = exports.processHtml = exports.processCss = exports.processCssText = exports.downloadStreamingResourceWithHook = exports.downloadStreamingResource = exports.streamingDownloadToFile = exports.downloadResource = exports.requestForResource = exports.getRetry = exports.beforeRetryHook = exports.detectResourceType = exports.defaultLifeCycle = exports.adapter = void 0;
27
27
  exports.adapter = __importStar(require("./adapters"));
28
28
  var default_life_cycle_1 = require("./default-life-cycle");
29
29
  Object.defineProperty(exports, "defaultLifeCycle", { enumerable: true, get: function () { return default_life_cycle_1.defaultLifeCycle; } });
@@ -43,6 +43,8 @@ Object.defineProperty(exports, "processCssText", { enumerable: true, get: functi
43
43
  Object.defineProperty(exports, "processCss", { enumerable: true, get: function () { return process_css_1.processCss; } });
44
44
  var process_html_1 = require("./process-html");
45
45
  Object.defineProperty(exports, "processHtml", { enumerable: true, get: function () { return process_html_1.processHtml; } });
46
+ var process_html_meta_1 = require("./process-html-meta");
47
+ Object.defineProperty(exports, "processHtmlMetaRefresh", { enumerable: true, get: function () { return process_html_meta_1.processHtmlMetaRefresh; } });
46
48
  var process_site_map_1 = require("./process-site-map");
47
49
  Object.defineProperty(exports, "processSiteMap", { enumerable: true, get: function () { return process_site_map_1.processSiteMap; } });
48
50
  var process_svg_1 = require("./process-svg");
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/life-cycle/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,sDAAsC;AACtC,2DAAsD;AAA9C,sHAAA,gBAAgB,OAAA;AACxB,+DAA0D;AAAlD,0HAAA,kBAAkB,OAAA;AAC1B,yDAE6B;AAD3B,oHAAA,eAAe,OAAA;AAAE,6GAAA,QAAQ,OAAA;AAAE,uHAAA,kBAAkB,OAAA;AAAE,qHAAA,gBAAgB,OAAA;AAEjE,6EAIuC;AAHrC,sIAAA,uBAAuB,OAAA;AACvB,wIAAA,yBAAyB,OAAA;AACzB,gJAAA,iCAAiC,OAAA;AAGnC,6CAAyD;AAAjD,6GAAA,cAAc,OAAA;AAAE,yGAAA,UAAU,OAAA;AAClC,+CAA2C;AAAnC,2GAAA,WAAW,OAAA;AACnB,uDAAkD;AAA1C,kHAAA,cAAc,OAAA;AACtB,6CAAyC;AAAjC,yGAAA,UAAU,OAAA;AAClB,yDAA4E;AAApE,4HAAA,uBAAuB,OAAA;AAAE,mHAAA,cAAc,OAAA;AAC/C,iEAA2D;AAAnD,2HAAA,kBAAkB,OAAA;AAC1B,2CAAuC;AAA/B,uGAAA,SAAS,OAAA;AACjB,iDAAiC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/life-cycle/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,sDAAsC;AACtC,2DAAsD;AAA9C,sHAAA,gBAAgB,OAAA;AACxB,+DAA0D;AAAlD,0HAAA,kBAAkB,OAAA;AAC1B,yDAE6B;AAD3B,oHAAA,eAAe,OAAA;AAAE,6GAAA,QAAQ,OAAA;AAAE,uHAAA,kBAAkB,OAAA;AAAE,qHAAA,gBAAgB,OAAA;AAEjE,6EAIuC;AAHrC,sIAAA,uBAAuB,OAAA;AACvB,wIAAA,yBAAyB,OAAA;AACzB,gJAAA,iCAAiC,OAAA;AAGnC,6CAAyD;AAAjD,6GAAA,cAAc,OAAA;AAAE,yGAAA,UAAU,OAAA;AAClC,+CAA2C;AAAnC,2GAAA,WAAW,OAAA;AACnB,yDAA2D;AAAnD,2HAAA,sBAAsB,OAAA;AAC9B,uDAAkD;AAA1C,kHAAA,cAAc,OAAA;AACtB,6CAAyC;AAAjC,yGAAA,UAAU,OAAA;AAClB,yDAA4E;AAApE,4HAAA,uBAAuB,OAAA;AAAE,mHAAA,cAAc,OAAA;AAC/C,iEAA2D;AAAnD,2HAAA,kBAAkB,OAAA;AAC1B,2CAAuC;AAA/B,uGAAA,SAAS,OAAA;AACjB,iDAAiC"}
@@ -0,0 +1,4 @@
1
+ import type { DownloadResource, SubmitResourceFunc } from './types';
2
+ import type { StaticDownloadOptions } from '../options';
3
+ import type { PipelineExecutor } from './pipeline-executor';
4
/**
 * Life-cycle step for downloaded html resources that handles
 * `<meta http-equiv="refresh" content="...">` elements.
 *
 * For resources whose type is not html the resource is returned untouched.
 * Otherwise each refresh target found in a `content` attribute is run through
 * the pipeline (link redirect, type detection, resource creation,
 * before-download processing), submitted for download unless it is flagged
 * as discarded, and the attribute is rewritten to the resource's replace path.
 *
 * @param res the downloaded resource to inspect
 * @param submit callback receiving each resource created for a refresh target
 * @param options download options; `options.encoding` is read per resource type
 * @param pipeline executor providing the per-link life-cycle calls
 * @returns a promise resolving to the same `res` object that was passed in
 */
+ export declare function processHtmlMetaRefresh(res: DownloadResource, submit: SubmitResourceFunc, options: StaticDownloadOptions, pipeline: PipelineExecutor): Promise<DownloadResource>;
@@ -0,0 +1,68 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.processHtmlMetaRefresh = void 0;
4
+ const resource_1 = require("../resource");
5
+ const adapters_1 = require("./adapters");
6
+ const logger_1 = require("../logger/logger");
7
/**
 * Pattern for the `content` attribute of `<meta http-equiv="refresh">`.
 *
 * Capture groups: 1 = delay (digits), 2 = quoted url, 3 = unquoted url.
 *
 * Originally created by https://github.com/stevenvachon at
 * https://github.com/stevenvachon/http-equiv-refresh
 * MIT license
 */
const META_REFRESH_PATTERN = /^\s*(\d+)(?:\s*;(?:\s*url\s*=)?\s*(?:["']\s*(.*?)\s*['"]|(.*?)))?\s*$/i;

/**
 * Replace the url part of a meta refresh `content` value.
 *
 * A plain `attrValue.replace(url, replacement)` is not safe here: it rewrites
 * the first textual occurrence of the url (for `"5; url=5"` that is the
 * delay), and it interprets `$&`, `$1`, `$$`... inside the replacement.
 * Because META_REFRESH_PATTERN is anchored at both ends, the url is always
 * the tail of the value, followed only by whitespace and - for the quoted
 * form - the closing quote, so its position can be computed exactly.
 *
 * @param {string} attrValue the original `content` attribute value
 * @param {RegExpExecArray} match result of META_REFRESH_PATTERN.exec(attrValue)
 *   with a non-empty url in group 2 or 3
 * @param {string} replacement text to put in place of the url, used literally
 * @returns {string} attrValue with only the url replaced
 */
function replaceMetaRefreshUrl(attrValue, match, replacement) {
    const originalLink = match[2] || match[3];
    // end of the url for the unquoted form: last non-whitespace character
    let end = attrValue.trimEnd().length;
    if (match[2]) {
        // quoted form: skip the closing quote and any whitespace before it
        end = attrValue.slice(0, end - 1).trimEnd().length;
    }
    const start = end - originalLink.length;
    return attrValue.slice(0, start) + replacement + attrValue.slice(end);
}

/**
 * Life-cycle step handling `<meta http-equiv="refresh" content="...">`.
 *
 * Non-html resources are returned untouched. For html resources every
 * refresh target is passed through the pipeline, submitted for download
 * unless flagged as discarded, and the `content` attribute is rewritten so
 * that the url points to the resource's `replacePath`.
 *
 * @param res the downloaded resource
 * @param submit callback receiving each resource to download
 * @param options download options, `options.encoding` is read per type
 * @param pipeline executor providing linkRedirect, detectResourceType,
 *   createResource and processBeforeDownload
 * @returns the same `res` object
 */
async function processHtmlMetaRefresh(res, submit, options, pipeline) {
    if (res.type !== resource_1.ResourceType.Html) {
        return res;
    }
    // parse lazily, reusing a document parsed by an earlier step
    if (!res.meta.doc) {
        res.meta.doc = (0, adapters_1.parseHtml)(res, options);
    }
    const $ = res.meta.doc;
    const metaLinks = $('meta[http-equiv="refresh"][content]');
    if (!metaLinks.length) {
        return res;
    }
    // links are resolved against the final url of the page
    const refUrl = res.redirectedUrl || res.url;
    // the save path of the page is only passed on when the page was not redirected
    const savePath = refUrl === res.url ? res.savePath : undefined;
    const depth = res.depth + 1;
    for (let index = 0; index < metaLinks.length; index++) {
        const elem = metaLinks.eq(index);
        const attrValue = elem.attr('content');
        if (!attrValue) {
            continue;
        }
        const match = META_REFRESH_PATTERN.exec(attrValue);
        if (!match) {
            continue;
        }
        const originalLink = match[2] || match[3];
        if (!originalLink) {
            // delay only, nothing to download or rewrite
            continue;
        }
        const link = await pipeline.linkRedirect(originalLink, elem, res);
        if (!link) {
            continue;
        }
        const linkType = await pipeline.detectResourceType(link, resource_1.ResourceType.Html, elem, res);
        if (!linkType) {
            if (logger_1.skip.isTraceEnabled()) {
                logger_1.skip.trace('skip detectResourceType', originalLink, link, refUrl);
            }
            continue;
        }
        let resource = await pipeline.createResource(linkType, depth, link, refUrl, res.localRoot, options.encoding[linkType], savePath, res.type);
        resource = await pipeline.processBeforeDownload(resource, elem, res, options);
        if (!resource) {
            if (logger_1.skip.isTraceEnabled()) {
                logger_1.skip.trace('skip processBeforeDownload', originalLink, link, linkType, refUrl);
            }
            continue;
        }
        if (!resource.shouldBeDiscardedFromDownload) {
            submit(resource);
        }
        // the attribute is rewritten even when the resource is not submitted
        elem.attr('content', replaceMetaRefreshUrl(attrValue, match, resource.replacePath));
    }
    return res;
}
67
+ exports.processHtmlMetaRefresh = processHtmlMetaRefresh;
68
+ //# sourceMappingURL=process-html-meta.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"process-html-meta.js","sourceRoot":"","sources":["../../src/life-cycle/process-html-meta.ts"],"names":[],"mappings":";;;AAGA,0CAAmD;AACnD,yCAAqC;AACrC,6CAAsC;AAEtC;;;;GAIG;AACH,MAAM,oBAAoB,GACxB,wEAAwE,CAAC;AAEpE,KAAK,UAAU,sBAAsB,CAC1C,GAAqB,EACrB,MAA0B,EAC1B,OAA8B,EAC9B,QAA0B;IAG1B,IAAI,GAAG,CAAC,IAAI,KAAK,uBAAY,CAAC,IAAI,EAAE;QAClC,OAAO,GAAG,CAAC;KACZ;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,EAAE;QACjB,GAAG,CAAC,IAAI,CAAC,GAAG,GAAG,IAAA,oBAAS,EAAC,GAAG,EAAE,OAAO,CAAC,CAAC;KACxC;IACD,MAAM,CAAC,GAAG,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC;IAEvB,MAAM,SAAS,GAAG,CAAC,CAAC,qCAAqC,CAAC,CAAC;IAC3D,IAAI,SAAS,CAAC,MAAM,EAAE;QACpB,MAAM,MAAM,GAAW,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,GAAG,CAAC;QACpD,MAAM,QAAQ,GAAG,MAAM,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;QAE/D,MAAM,KAAK,GAAW,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC;QAEpC,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,SAAS,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE;YACrD,MAAM,IAAI,GAAG,SAAS,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC;YACjC,MAAM,SAAS,GAAkB,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtD,IAAI,CAAC,SAAS,EAAE;gBACd,SAAS;aACV;YACD,MAAM,KAAK,GAAG,oBAAoB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACnD,IAAI,CAAC,KAAK,EAAE;gBACV,SAAS;aACV;YACD,MAAM,YAAY,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC;YAC1C,IAAI,CAAC,YAAY,EAAE;gBACjB,SAAS;aACV;YACD,MAAM,IAAI,GACR,MAAM,QAAQ,CAAC,YAAY,CAAC,YAAY,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;YACvD,IAAI,CAAC,IAAI,EAAE;gBACT,SAAS;aACV;YAED,MAAM,QAAQ,GACZ,MAAM,QAAQ,CAAC,kBAAkB,CAAC,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;YACxE,IAAI,CAAC,QAAQ,EAAE;gBACb,IAAI,aAAI,CAAC,cAAc,EAAE,EAAE;oBACzB,aAAI,CAAC,KAAK,CAAC,yBAAyB,EAClC,YAAY,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;iBAC/B;gBACD,SAAS;aACV;YACD,IAAI,QAAQ,GAAoB,MAAM,QAAQ,CAAC,cAAc,CAC3D,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAC7B,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,EACzC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;YACtB,QAAQ,GAAG,MAAM,QAAQ,CAAC,qBAAqB,CAAC,QAAQ,EAAE,IAAI,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;YAC9E,IAAI,CAAC,QAAQ,EAAE;gBACb,IAAI,aAAI,CAAC,cAAc
,EAAE,EAAE;oBACzB,aAAI,CAAC,KAAK,CAAC,4BAA4B,EACrC,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;iBACzC;gBACD,SAAS;aACV;YACD,IAAI,CAAC,QAAQ,CAAC,6BAA6B,EAAE;gBAC3C,MAAM,CAAC,QAAQ,CAAC,CAAC;aAClB;YACD,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,SAAS,CAAC,OAAO,CAAC,YAAY,EAAE,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC;SAC7E;KACF;IAED,OAAO,GAAG,CAAC;AACb,CAAC;AAvED,wDAuEC"}
package/lib/sources.js CHANGED
@@ -32,6 +32,7 @@ exports.sources = [
32
32
  { selector: 'meta[property="og\\:video\\:url"]', attr: 'content' },
33
33
  { selector: 'meta[property="og\\:video\\:secure_url"]', attr: 'content' },
34
34
  { selector: 'video', attr: 'src' },
35
+ { selector: 'video', attr: 'poster' },
35
36
  { selector: 'video source', attr: 'src' },
36
37
  { selector: 'video track', attr: 'src' },
37
38
  { selector: 'audio', attr: 'src' },
@@ -1 +1 @@
1
- {"version":3,"file":"sources.js","sourceRoot":"","sources":["../src/sources.ts"],"names":[],"mappings":";;;AAAA,yCAAwC;AAQxC,0DAA0D;AAC1D,wEAAwE;AAC3D,QAAA,OAAO,GAAuB;IACzC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IACjD,EAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IAClE,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAC;IAC9B,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAC;IACjC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAC;IAClC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,OAAO,EAAC;IAChD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAC;IACjC,EAAC,QAAQ,EAAE,wBAAwB,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,GAAG,EAAC;IAC1E,EAAC,QAAQ,EAAE,mBAAmB,EAAE,IAAI,EAAE,MAAM,EAAC;IAC7C,EAAC,QAAQ,EAAE,sBAAsB,EAAE,IAAI,EAAE,MAAM,EAAC;IAChD,yCAAyC;IACzC,qDAAqD;IACrD,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,YAAY,EAAC;IACrD,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAC;IACvC,EAAC,QAAQ,EAAE,gBAAgB,EAAE,IAAI,EAAE,QAAQ,EAAC;IAC5C,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACzD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IAC1D,EA
AC,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACtD,mEAAmE;IACnE,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,YAAY,EAAC;CAC/C,CAAC,GAAG,CAAC,CAAC,GAA8B,EAAE,EAAE;IACvC,IAAI,GAAG,CAAC,QAAQ,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,EAAE;QAC/D,GAAG,CAAC,QAAQ,IAAI,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC;KACjC;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE;QACb,GAAG,CAAC,IAAI,GAAG,uBAAY,CAAC,MAAM,CAAC;KAChC;IACD,OAAO,GAAuB,CAAC;AACjC,CAAC,CAAC,CAAC"}
1
+ {"version":3,"file":"sources.js","sourceRoot":"","sources":["../src/sources.ts"],"names":[],"mappings":";;;AAAA,yCAAwC;AAQxC,0DAA0D;AAC1D,wEAAwE;AAC3D,QAAA,OAAO,GAAuB;IACzC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IACjD,EAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IAClE,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAC;IAC9B,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAC;IACjC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAC;IAClC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,OAAO,EAAC;IAChD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAC;IACjC,EAAC,QAAQ,EAAE,wBAAwB,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,GAAG,EAAC;IAC1E,EAAC,QAAQ,EAAE,mBAAmB,EAAE,IAAI,EAAE,MAAM,EAAC;IAC7C,EAAC,QAAQ,EAAE,sBAAsB,EAAE,IAAI,EAAE,MAAM,EAAC;IAChD,yCAAyC;IACzC,qDAAqD;IACrD,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,YAAY,EAAC;IACrD,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAC;IACvC,EAAC,QAAQ,EAAE,gBAAgB,EAAE,IAAI,EAAE,QAAQ,EAAC;IAC5C,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAC;IACnC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACzD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAA
E,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IAC1D,EAAC,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACtD,mEAAmE;IACnE,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,YAAY,EAAC;CAC/C,CAAC,GAAG,CAAC,CAAC,GAA8B,EAAE,EAAE;IACvC,IAAI,GAAG,CAAC,QAAQ,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,EAAE;QAC/D,GAAG,CAAC,QAAQ,IAAI,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC;KACjC;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE;QACb,GAAG,CAAC,IAAI,GAAG,uBAAY,CAAC,MAAM,CAAC;KAChC;IACD,OAAO,GAAuB,CAAC;AACjC,CAAC,CAAC,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "website-scrap-engine",
3
- "version": "0.7.0",
3
+ "version": "0.7.1",
4
4
  "description": "Configurable website scraper in typescript",
5
5
  "main": "lib",
6
6
  "types": "lib",
@@ -25,20 +25,20 @@
25
25
  "css-url-parser": "^1.1.3",
26
26
  "got": "^11.8.6",
27
27
  "log4js": "^6.9.1",
28
- "mkdirp": "^3.0.0",
28
+ "mkdirp": "^3.0.1",
29
29
  "p-queue": "^6.6.2",
30
30
  "srcset": "^4.0.0",
31
31
  "urijs": "^1.19.11"
32
32
  },
33
33
  "devDependencies": {
34
- "@types/jest": "^27.5.2",
35
- "@types/node": "^18.15.13",
36
- "@types/urijs": "^1.19.19",
37
- "@typescript-eslint/eslint-plugin": "^5.59.0",
38
- "@typescript-eslint/parser": "^5.59.0",
39
- "eslint": "^8.38.0",
40
- "jest": "^27.5.1",
41
- "ts-jest": "^27.1.5",
34
+ "@types/jest": "^28.1.1",
35
+ "@types/node": "^20.8.7",
36
+ "@types/urijs": "^1.19.20",
37
+ "@typescript-eslint/eslint-plugin": "^5.62.0",
38
+ "@typescript-eslint/parser": "^5.62.0",
39
+ "eslint": "^8.52.0",
40
+ "jest": "^28.1.3",
41
+ "ts-jest": "^28.0.8",
42
42
  "typescript": "^5.0.4"
43
43
  },
44
44
  "files": [
@@ -4,6 +4,7 @@ import {detectResourceType} from './detect-resource-type';
4
4
  import {createResource} from '../resource';
5
5
  import {downloadResource} from './download-resource';
6
6
  import {processHtml} from './process-html';
7
+ import {processHtmlMetaRefresh} from './process-html-meta';
7
8
  import {processCss} from './process-css';
8
9
  import {processSiteMap} from './process-site-map';
9
10
  import {processSvg} from './process-svg';
@@ -30,6 +31,7 @@ export const defaultLifeCycle = (): ProcessingLifeCycle => ({
30
31
  processAfterDownload: [
31
32
  processRedirectedUrl,
32
33
  processHtml,
34
+ processHtmlMetaRefresh,
33
35
  processSvg,
34
36
  processCss,
35
37
  processSiteMap
@@ -12,6 +12,7 @@ export {
12
12
  export {PipelineExecutor} from './pipeline-executor';
13
13
  export {processCssText, processCss} from './process-css';
14
14
  export {processHtml} from './process-html';
15
+ export {processHtmlMetaRefresh} from './process-html-meta';
15
16
  export {processSiteMap} from './process-site-map';
16
17
  export {processSvg} from './process-svg';
17
18
  export {getResourceBodyFromHtml, saveHtmlToDisk} from './save-html-to-disk';
@@ -0,0 +1,87 @@
1
+ import type {DownloadResource, SubmitResourceFunc} from './types';
2
+ import type {StaticDownloadOptions} from '../options';
3
+ import type {PipelineExecutor} from './pipeline-executor';
4
+ import {Resource, ResourceType} from '../resource';
5
+ import {parseHtml} from './adapters';
6
+ import {skip} from '../logger/logger';
7
+
8
/**
 * Pattern for the `content` attribute of `<meta http-equiv="refresh">`.
 *
 * Capture groups: 1 = delay (digits), 2 = quoted url, 3 = unquoted url.
 *
 * Originally created by https://github.com/stevenvachon at
 * https://github.com/stevenvachon/http-equiv-refresh
 * MIT license
 */
const META_REFRESH_PATTERN =
  /^\s*(\d+)(?:\s*;(?:\s*url\s*=)?\s*(?:["']\s*(.*?)\s*['"]|(.*?)))?\s*$/i;

/**
 * Replace the url part of a meta refresh `content` value.
 *
 * A plain `attrValue.replace(url, replacement)` is not safe here: it rewrites
 * the first textual occurrence of the url (for `"5; url=5"` that is the
 * delay), and it interprets `$&`, `$1`, `$$`... inside the replacement.
 * Because META_REFRESH_PATTERN is anchored at both ends, the url is always
 * the tail of the value, followed only by whitespace and - for the quoted
 * form - the closing quote, so its position can be computed exactly.
 *
 * @param attrValue the original `content` attribute value
 * @param match result of META_REFRESH_PATTERN.exec(attrValue)
 *   with a non-empty url in group 2 or 3
 * @param replacement text to put in place of the url, used literally
 * @return attrValue with only the url replaced
 */
function replaceMetaRefreshUrl(
  attrValue: string,
  match: RegExpExecArray,
  replacement: string
): string {
  const originalLink: string = match[2] || match[3];
  // end of the url for the unquoted form: last non-whitespace character
  let end: number = attrValue.trimEnd().length;
  if (match[2]) {
    // quoted form: skip the closing quote and any whitespace before it
    end = attrValue.slice(0, end - 1).trimEnd().length;
  }
  const start: number = end - originalLink.length;
  return attrValue.slice(0, start) + replacement + attrValue.slice(end);
}

/**
 * Life-cycle step handling `<meta http-equiv="refresh" content="...">`.
 *
 * Non-html resources are returned untouched. For html resources every
 * refresh target is passed through the pipeline, submitted for download
 * unless flagged as discarded, and the `content` attribute is rewritten so
 * that the url points to the resource's `replacePath`.
 *
 * @param res the downloaded resource
 * @param submit callback receiving each resource to download
 * @param options download options, `options.encoding` is read per type
 * @param pipeline executor providing linkRedirect, detectResourceType,
 *   createResource and processBeforeDownload
 * @return the same `res` object
 */
export async function processHtmlMetaRefresh(
  res: DownloadResource,
  submit: SubmitResourceFunc,
  options: StaticDownloadOptions,
  pipeline: PipelineExecutor
): Promise<DownloadResource> {

  if (res.type !== ResourceType.Html) {
    return res;
  }
  // parse lazily, reusing a document parsed by an earlier step
  if (!res.meta.doc) {
    res.meta.doc = parseHtml(res, options);
  }
  const $ = res.meta.doc;

  const metaLinks = $('meta[http-equiv="refresh"][content]');
  if (!metaLinks.length) {
    return res;
  }
  // links are resolved against the final url of the page
  const refUrl: string = res.redirectedUrl || res.url;
  // the save path of the page is only passed on when the page was not redirected
  const savePath = refUrl === res.url ? res.savePath : undefined;

  const depth: number = res.depth + 1;

  for (let index = 0; index < metaLinks.length; index++) {
    const elem = metaLinks.eq(index);
    const attrValue: string | void = elem.attr('content');
    if (!attrValue) {
      continue;
    }
    const match = META_REFRESH_PATTERN.exec(attrValue);
    if (!match) {
      continue;
    }
    const originalLink = match[2] || match[3];
    if (!originalLink) {
      // delay only, nothing to download or rewrite
      continue;
    }
    const link: string | void =
      await pipeline.linkRedirect(originalLink, elem, res);
    if (!link) {
      continue;
    }

    const linkType: ResourceType | void =
      await pipeline.detectResourceType(link, ResourceType.Html, elem, res);
    if (!linkType) {
      if (skip.isTraceEnabled()) {
        skip.trace('skip detectResourceType',
          originalLink, link, refUrl);
      }
      continue;
    }
    let resource: Resource | void = await pipeline.createResource(
      linkType, depth, link, refUrl,
      res.localRoot, options.encoding[linkType],
      savePath, res.type);
    resource = await pipeline.processBeforeDownload(resource, elem, res, options);
    if (!resource) {
      if (skip.isTraceEnabled()) {
        skip.trace('skip processBeforeDownload',
          originalLink, link, linkType, refUrl);
      }
      continue;
    }
    if (!resource.shouldBeDiscardedFromDownload) {
      submit(resource);
    }
    // the attribute is rewritten even when the resource is not submitted
    elem.attr('content',
      replaceMetaRefreshUrl(attrValue, match, resource.replacePath));
  }

  return res;
}
package/src/sources.ts CHANGED
@@ -36,6 +36,7 @@ export const sources: SourceDefinition[] = [
36
36
  {selector: 'meta[property="og\\:video\\:url"]', attr: 'content'},
37
37
  {selector: 'meta[property="og\\:video\\:secure_url"]', attr: 'content'},
38
38
  {selector: 'video', attr: 'src'},
39
+ {selector: 'video', attr: 'poster'},
39
40
  {selector: 'video source', attr: 'src'},
40
41
  {selector: 'video track', attr: 'src'},
41
42
  {selector: 'audio', attr: 'src'},
package/CHANGELOG.md DELETED
@@ -1,175 +0,0 @@
1
- 0.7.0
2
- ============
3
-
4
- BREAKING
5
- ------------
6
- * build(deps): bump mkdirp from 2.1.6 to 3.0.0
7
- * build(deps-dev): bump typescript from 4.9.5 to 5.0.4
8
-
9
- 0.6.0
10
- ============
11
-
12
- BREAKING
13
- ------------
14
- * resource: custom callback for rewriting savePath
15
- * life-cycle: custom callback for rewriting savePath (<https://github.com/website-local/website-scrap-engine/issues/383>)
16
-
17
- Fix
18
- ------------
19
- * cheerio: replace deprecated api
20
-
21
- Test
22
- ------------
23
- * test: migrating to eslint v8 and typescript-eslint v5
24
- * cheerio: fix a test
25
- * resource: add a test
26
- * ci: run tests on node.js 18.x (<https://github.com/website-local/website-scrap-engine/issues/610>)
27
-
28
- Misc
29
- ------------
30
- * package-lock-resolved: process registry.npmmirror.com
31
- * logger: fix type conflict
32
- * util: fix compatibility with typescript 4.8
33
- * npm: drop @types/mkdirp
34
- * update deps
35
-
36
- 0.5.0
37
- ============
38
-
39
- BREAKING
40
- ------------
41
- * typescript 4.4 support
42
- * WorkerMessage: `error` can be `unknown`
43
- * StreamingDownloadErrorHook: `e` can be `unknown`
44
- * pipeline-executor-impl fix keepSearch param
45
- * resource: redirectedSavePath not set after redirect
46
-
47
- Test
48
- ------------
49
- * test: adapt for jest 27 and ts-jest 27
50
-
51
- 0.4.0
52
- ============
53
-
54
- BREAKING
55
- ------------
56
- * worker-pool: load based worker pool (#11)
57
- * cheerio: adapt for version 1.0.0-rc.10 (#271)
58
- * test: adapt for URI.js v1.19.7 (#301)
59
-
60
- Fix
61
- ------------
62
- * downloader: correctly transfer resource body
63
- * correctly convert ArrayBufferView to Buffer
64
- * worker-pool: fix ready
65
-
66
- Enhancement
67
- ------------
68
- * npm: update
69
- * life-cycle: add init and dispose life cycle
70
- * resource: optionally redirected savePath
71
- * resource: take a log on replacing long search string
72
- * save-to-disk: optionally use remote date
73
- * worker-pool: log worker errors
74
- * worker-pool: custom initializer of worker
75
-
76
- Test
77
- ------------
78
- * worker-pool: basic unit test
79
- * save-html-to-disk: initial unit tests with mocked fs
80
- * save-to-disk: refactor tests
81
-
82
- 0.3.2
83
- ============
84
-
85
- * resource: fix redirected path processing (#157)
86
- * downloader: optional wait for this.init in method onIdle (#152)
87
- * typescript: prefer type only import
88
-
89
- 0.3.1
90
- ============
91
-
92
- * resource: use correct file scheme for windows (#145)
93
-
94
- 0.3.0
95
- ============
96
-
97
- New Feature
98
- ------------
99
- * life-cycle: extract and process source maps (#123)
100
- * adapters: async processHtml
101
- * life-cycle: add read-or-copy-local-resource
102
- * resource: support file protocol (#126)
103
-
104
- Misc
105
- ------------
106
- * types: export type CheerioElement
107
- * resource: optional skip replacePath processing in case of parser error (#107)
108
- * resource: fix new type of Buffer.from (#116)
109
- * build(deps): bump cheerio from 1.0.0-rc.3 to 1.0.0-rc.5
110
- * io: mkdirRetry returns no string
111
- * life-cycle: add download-streaming-resource to default
112
- * skip-links: skip unix scheme
113
- * skip-links: allow file protocol
114
- * download: skip non-http url
115
- * (BREAKING) resource: refactor createResource (#139)
116
-
117
- 0.2.0
118
- ============
119
- * life-cycle: streaming download and save binary resource to disk
120
- * build(deps-dev): bump @types/cheerio from 0.22.21 to 0.22.22
121
-
122
- 0.1.7
123
- ============
124
- * resource: parse and process standalone svg images
125
- * save-html-to-disk: keep location hash in redirect placeholder
126
- * detect-resource-type: export lowerCaseExtension
127
- * downloader: log downloadLink instead of rawUrl
128
- * typescript: update to v4.0
129
-
130
- 0.1.6
131
- ============
132
- * save-resource-to-disk: compare redirectedUrl with url
133
- * process-html: submit resources from inline css
134
- * downloader: correctly use adjustTimer on start
135
- * downloader: deduplicate on redirectedUrl
136
- * downloader: do not wait for complete on add
137
-
138
- 0.1.5
139
- ============
140
- * downloader: do not wait for complete on add
141
- * process-html: fix detecting type
142
- * npm: update p-queue to 6.6.0
143
- * npm: move copy script to build
144
-
145
- 0.1.4
146
- ============
147
- * save-html-to-disk: fix redirect check
148
- * logger: add logger for skipExternal
149
-
150
- 0.1.3
151
- ============
152
- * save-html-to-disk: fix redirect placeholder path
153
-
154
- 0.1.2
155
- ============
156
- * adapters: make processRedirectedUrl named function
157
- * options: move initialUrl and logSubDir to StaticDownloadOptions
158
- * options: retry on error codes
159
- * download-resource: manually retry on got internal errors
160
- * io: refactor mkdirRetry
161
- * process-html: skip invalid srcset
162
-
163
- 0.1.1
164
- ============
165
- * io: remove mkdirRetrySync and update writeFile
166
- * util: arrayToMap could freeze the object returned if required
167
- * detect-resource-type: fix url with search and hash
168
- * options: allow merging got options from StaticDownloadOptions
169
- * options: add comments
170
- * life-cycle: convert default life cycle fn to named function
171
-
172
- 0.1.0
173
- ============
174
- Initial release.
175
-