website-scrap-engine 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/io.js CHANGED
@@ -6,14 +6,14 @@ Object.defineProperty(exports, "__esModule", { value: true });
6
6
  exports.writeFile = exports.mkdirRetry = void 0;
7
7
  const fs_1 = __importDefault(require("fs"));
8
8
  const path_1 = require("path");
9
- const mkdirp_1 = __importDefault(require("mkdirp"));
9
+ const mkdirp_1 = require("mkdirp");
10
10
  const logger_1 = require("./logger/logger");
11
11
  const mkdirRetry = async (dir, retry = 3) => {
12
12
  let error;
13
13
  for (let i = 0; i < retry; i++) {
14
14
  error = undefined;
15
15
  try {
16
- await (0, mkdirp_1.default)(dir);
16
+ await (0, mkdirp_1.mkdirp)(dir);
17
17
  }
18
18
  catch (e) {
19
19
  error = e;
package/lib/io.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"io.js","sourceRoot":"","sources":["../src/io.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AAEpB,+BAA6B;AAC7B,oDAA4B;AAE5B,4CAA2E;AAEpE,MAAM,UAAU,GAAG,KAAK,EAAE,GAAW,EAAE,KAAK,GAAG,CAAC,EAAiB,EAAE;IACxE,IAAI,KAAqB,CAAC;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE;QAC9B,KAAK,GAAG,SAAS,CAAC;QAClB,IAAI;YACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;SACnB;QAAC,OAAO,CAAC,EAAE;YACV,KAAK,GAAG,CAAC,CAAC;YACV,IAAI,CAAC,GAAG,CAAC,EAAE;gBACT,cAAW,CAAC,KAAK,CAAC,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC;aACxD;iBAAM;gBACL,cAAW,CAAC,KAAK,CAAC,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC;aACxD;YACD,SAAS;SACV;QACD,KAAK,GAAG,SAAS,CAAC;QAClB,OAAO;KACR;IACD,IAAI,KAAK,EAAE;QACT,MAAM,KAAK,CAAC;KACb;AACH,CAAC,CAAC;AArBW,QAAA,UAAU,cAqBrB;AAEK,MAAM,SAAS,GAAG,KAAK,EAC5B,QAAgB,EAChB,IAAkB,EAClB,QAA0B,EAC1B,KAAqB,EACrB,KAAqB,EACN,EAAE;IACjB,MAAM,GAAG,GAAW,IAAA,cAAO,EAAC,QAAQ,CAAC,CAAC;IACtC,IAAI,CAAC,YAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;QACvB,MAAM,IAAA,kBAAU,EAAC,GAAG,CAAC,CAAC;KACvB;IACD,IAAI,QAA6B,CAAC;IAClC,IAAI,OAAqC,CAAC;IAC1C,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE;QAC5B,QAAQ,GAAG,IAAI,CAAC;QAChB,OAAO,GAAG,EAAC,QAAQ,EAAC,CAAC;KACtB;SAAM,IAAI,IAAI,YAAY,WAAW,EAAE;QACtC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;KAC9B;SAAM,IAAI,IAAI,YAAY,UAAU,IAAI,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE;QAC9D,QAAQ,GAAG,IAAI,CAAC;KACjB;SAAM,IAAI,WAAW,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE;QACnC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;KACvE;SAAM;QACL,oBAAoB;QACpB,MAAM,IAAI,SAAS,CAAC,6BAA6B,CAAC,CAAC;KACpD;IACD,IAAI,OAAO,EAAE;QACX,MAAM,YAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;KAC1D;SAAM;QACL,MAAM,YAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;KACjD;IACD,qBAAqB;IACrB,IAAI,KAAK,EAAE;QACT,IAAI,CAAC,KAAK,EAAE;YACV,KAAK,GAAG,KAAK,CAAC;SACf;QACD,IAAI;YACF,MAAM,YAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;SAClD;QAAC,OAAO,CAAC,EAAE;YACV,cAAW,CAAC,IAAI
,CAAC,kBAAkB,GAAG,QAAQ,EAAE,CAAC,CAAC,CAAC;SACpD;KACF;AACH,CAAC,CAAC;AA1CW,QAAA,SAAS,aA0CpB"}
1
+ {"version":3,"file":"io.js","sourceRoot":"","sources":["../src/io.ts"],"names":[],"mappings":";;;;;;AACA,4CAAoB;AACpB,+BAA6B;AAC7B,mCAA8B;AAE9B,4CAA2E;AAEpE,MAAM,UAAU,GAAG,KAAK,EAAE,GAAW,EAAE,KAAK,GAAG,CAAC,EAAiB,EAAE;IACxE,IAAI,KAAqB,CAAC;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE;QAC9B,KAAK,GAAG,SAAS,CAAC;QAClB,IAAI;YACF,MAAM,IAAA,eAAM,EAAC,GAAG,CAAC,CAAC;SACnB;QAAC,OAAO,CAAC,EAAE;YACV,KAAK,GAAG,CAAC,CAAC;YACV,IAAI,CAAC,GAAG,CAAC,EAAE;gBACT,cAAW,CAAC,KAAK,CAAC,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC;aACxD;iBAAM;gBACL,cAAW,CAAC,KAAK,CAAC,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC;aACxD;YACD,SAAS;SACV;QACD,KAAK,GAAG,SAAS,CAAC;QAClB,OAAO;KACR;IACD,IAAI,KAAK,EAAE;QACT,MAAM,KAAK,CAAC;KACb;AACH,CAAC,CAAC;AArBW,QAAA,UAAU,cAqBrB;AAEK,MAAM,SAAS,GAAG,KAAK,EAC5B,QAAgB,EAChB,IAAkB,EAClB,QAA0B,EAC1B,KAAqB,EACrB,KAAqB,EACN,EAAE;IACjB,MAAM,GAAG,GAAW,IAAA,cAAO,EAAC,QAAQ,CAAC,CAAC;IACtC,IAAI,CAAC,YAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;QACvB,MAAM,IAAA,kBAAU,EAAC,GAAG,CAAC,CAAC;KACvB;IACD,IAAI,QAA6B,CAAC;IAClC,IAAI,OAAqC,CAAC;IAC1C,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE;QAC5B,QAAQ,GAAG,IAAI,CAAC;QAChB,OAAO,GAAG,EAAC,QAAQ,EAAC,CAAC;KACtB;SAAM,IAAI,IAAI,YAAY,WAAW,EAAE;QACtC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;KAC9B;SAAM,IAAI,IAAI,YAAY,UAAU,IAAI,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE;QAC9D,QAAQ,GAAG,IAAI,CAAC;KACjB;SAAM,IAAI,WAAW,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE;QACnC,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;KACvE;SAAM;QACL,oBAAoB;QACpB,MAAM,IAAI,SAAS,CAAC,6BAA6B,CAAC,CAAC;KACpD;IACD,IAAI,OAAO,EAAE;QACX,MAAM,YAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,CAAC,CAAC;KAC1D;SAAM;QACL,MAAM,YAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;KACjD;IACD,qBAAqB;IACrB,IAAI,KAAK,EAAE;QACT,IAAI,CAAC,KAAK,EAAE;YACV,KAAK,GAAG,KAAK,CAAC;SACf;QACD,IAAI;YACF,MAAM,YAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,QAAQ,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;SAClD;QAAC,OAAO,CAAC,EAAE;YACV,cAAW,CAAC,IAAI,
CAAC,kBAAkB,GAAG,QAAQ,EAAE,CAAC,CAAC,CAAC;SACpD;KACF;AACH,CAAC,CAAC;AA1CW,QAAA,SAAS,aA0CpB"}
@@ -6,6 +6,7 @@ const detect_resource_type_1 = require("./detect-resource-type");
6
6
  const resource_1 = require("../resource");
7
7
  const download_resource_1 = require("./download-resource");
8
8
  const process_html_1 = require("./process-html");
9
+ const process_html_meta_1 = require("./process-html-meta");
9
10
  const process_css_1 = require("./process-css");
10
11
  const process_site_map_1 = require("./process-site-map");
11
12
  const process_svg_1 = require("./process-svg");
@@ -31,6 +32,7 @@ const defaultLifeCycle = () => ({
31
32
  processAfterDownload: [
32
33
  adapters_1.processRedirectedUrl,
33
34
  process_html_1.processHtml,
35
+ process_html_meta_1.processHtmlMetaRefresh,
34
36
  process_svg_1.processSvg,
35
37
  process_css_1.processCss,
36
38
  process_site_map_1.processSiteMap
@@ -1 +1 @@
1
- {"version":3,"file":"default-life-cycle.js","sourceRoot":"","sources":["../../src/life-cycle/default-life-cycle.ts"],"names":[],"mappings":";;;AACA,6CAAuC;AACvC,iEAA0D;AAC1D,0CAA2C;AAC3C,2DAAqD;AACrD,iDAA2C;AAC3C,+CAAyC;AACzC,yDAAkD;AAClD,+CAAyC;AACzC,2DAAmD;AACnD,mEAA2D;AAC3D,yCAAgD;AAChD,+EAAwE;AACxE,+EAAsE;AAEtE;;GAEG;AACI,MAAM,gBAAgB,GAAG,GAAwB,EAAE,CAAC,CAAC;IAC1D,IAAI,EAAE,EAAE;IACR,YAAY,EAAE,CAAC,sBAAS,CAAC;IACzB,kBAAkB,EAAE,CAAC,yCAAkB,CAAC;IACxC,cAAc,EAAd,yBAAc;IACd,qBAAqB,EAAE,EAAE;IACzB,QAAQ,EAAE;QACR,oCAAgB;QAChB,uDAAyB;QACzB,qDAAuB;KACxB;IACD,oBAAoB,EAAE;QACpB,+BAAoB;QACpB,0BAAW;QACX,wBAAU;QACV,wBAAU;QACV,iCAAc;KACf;IACD,UAAU,EAAE,CAAC,kCAAc,EAAE,0CAAkB,CAAC;IAChD,OAAO,EAAE,EAAE;CACZ,CAAC,CAAC;AApBU,QAAA,gBAAgB,oBAoB1B"}
1
+ {"version":3,"file":"default-life-cycle.js","sourceRoot":"","sources":["../../src/life-cycle/default-life-cycle.ts"],"names":[],"mappings":";;;AACA,6CAAuC;AACvC,iEAA0D;AAC1D,0CAA2C;AAC3C,2DAAqD;AACrD,iDAA2C;AAC3C,2DAA2D;AAC3D,+CAAyC;AACzC,yDAAkD;AAClD,+CAAyC;AACzC,2DAAmD;AACnD,mEAA2D;AAC3D,yCAAgD;AAChD,+EAAwE;AACxE,+EAAsE;AAEtE;;GAEG;AACI,MAAM,gBAAgB,GAAG,GAAwB,EAAE,CAAC,CAAC;IAC1D,IAAI,EAAE,EAAE;IACR,YAAY,EAAE,CAAC,sBAAS,CAAC;IACzB,kBAAkB,EAAE,CAAC,yCAAkB,CAAC;IACxC,cAAc,EAAd,yBAAc;IACd,qBAAqB,EAAE,EAAE;IACzB,QAAQ,EAAE;QACR,oCAAgB;QAChB,uDAAyB;QACzB,qDAAuB;KACxB;IACD,oBAAoB,EAAE;QACpB,+BAAoB;QACpB,0BAAW;QACX,0CAAsB;QACtB,wBAAU;QACV,wBAAU;QACV,iCAAc;KACf;IACD,UAAU,EAAE,CAAC,kCAAc,EAAE,0CAAkB,CAAC;IAChD,OAAO,EAAE,EAAE;CACZ,CAAC,CAAC;AArBU,QAAA,gBAAgB,oBAqB1B"}
@@ -6,6 +6,7 @@ export { streamingDownloadToFile, downloadStreamingResource, downloadStreamingRe
6
6
  export { PipelineExecutor } from './pipeline-executor';
7
7
  export { processCssText, processCss } from './process-css';
8
8
  export { processHtml } from './process-html';
9
+ export { processHtmlMetaRefresh } from './process-html-meta';
9
10
  export { processSiteMap } from './process-site-map';
10
11
  export { processSvg } from './process-svg';
11
12
  export { getResourceBodyFromHtml, saveHtmlToDisk } from './save-html-to-disk';
@@ -23,7 +23,7 @@ var __importStar = (this && this.__importStar) || function (mod) {
23
23
  return result;
24
24
  };
25
25
  Object.defineProperty(exports, "__esModule", { value: true });
26
- exports.types = exports.skipLinks = exports.saveResourceToDisk = exports.saveHtmlToDisk = exports.getResourceBodyFromHtml = exports.processSvg = exports.processSiteMap = exports.processHtml = exports.processCss = exports.processCssText = exports.downloadStreamingResourceWithHook = exports.downloadStreamingResource = exports.streamingDownloadToFile = exports.downloadResource = exports.requestForResource = exports.getRetry = exports.beforeRetryHook = exports.detectResourceType = exports.defaultLifeCycle = exports.adapter = void 0;
26
+ exports.types = exports.skipLinks = exports.saveResourceToDisk = exports.saveHtmlToDisk = exports.getResourceBodyFromHtml = exports.processSvg = exports.processSiteMap = exports.processHtmlMetaRefresh = exports.processHtml = exports.processCss = exports.processCssText = exports.downloadStreamingResourceWithHook = exports.downloadStreamingResource = exports.streamingDownloadToFile = exports.downloadResource = exports.requestForResource = exports.getRetry = exports.beforeRetryHook = exports.detectResourceType = exports.defaultLifeCycle = exports.adapter = void 0;
27
27
  exports.adapter = __importStar(require("./adapters"));
28
28
  var default_life_cycle_1 = require("./default-life-cycle");
29
29
  Object.defineProperty(exports, "defaultLifeCycle", { enumerable: true, get: function () { return default_life_cycle_1.defaultLifeCycle; } });
@@ -43,6 +43,8 @@ Object.defineProperty(exports, "processCssText", { enumerable: true, get: functi
43
43
  Object.defineProperty(exports, "processCss", { enumerable: true, get: function () { return process_css_1.processCss; } });
44
44
  var process_html_1 = require("./process-html");
45
45
  Object.defineProperty(exports, "processHtml", { enumerable: true, get: function () { return process_html_1.processHtml; } });
46
+ var process_html_meta_1 = require("./process-html-meta");
47
+ Object.defineProperty(exports, "processHtmlMetaRefresh", { enumerable: true, get: function () { return process_html_meta_1.processHtmlMetaRefresh; } });
46
48
  var process_site_map_1 = require("./process-site-map");
47
49
  Object.defineProperty(exports, "processSiteMap", { enumerable: true, get: function () { return process_site_map_1.processSiteMap; } });
48
50
  var process_svg_1 = require("./process-svg");
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/life-cycle/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,sDAAsC;AACtC,2DAAsD;AAA9C,sHAAA,gBAAgB,OAAA;AACxB,+DAA0D;AAAlD,0HAAA,kBAAkB,OAAA;AAC1B,yDAE6B;AAD3B,oHAAA,eAAe,OAAA;AAAE,6GAAA,QAAQ,OAAA;AAAE,uHAAA,kBAAkB,OAAA;AAAE,qHAAA,gBAAgB,OAAA;AAEjE,6EAIuC;AAHrC,sIAAA,uBAAuB,OAAA;AACvB,wIAAA,yBAAyB,OAAA;AACzB,gJAAA,iCAAiC,OAAA;AAGnC,6CAAyD;AAAjD,6GAAA,cAAc,OAAA;AAAE,yGAAA,UAAU,OAAA;AAClC,+CAA2C;AAAnC,2GAAA,WAAW,OAAA;AACnB,uDAAkD;AAA1C,kHAAA,cAAc,OAAA;AACtB,6CAAyC;AAAjC,yGAAA,UAAU,OAAA;AAClB,yDAA4E;AAApE,4HAAA,uBAAuB,OAAA;AAAE,mHAAA,cAAc,OAAA;AAC/C,iEAA2D;AAAnD,2HAAA,kBAAkB,OAAA;AAC1B,2CAAuC;AAA/B,uGAAA,SAAS,OAAA;AACjB,iDAAiC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/life-cycle/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,sDAAsC;AACtC,2DAAsD;AAA9C,sHAAA,gBAAgB,OAAA;AACxB,+DAA0D;AAAlD,0HAAA,kBAAkB,OAAA;AAC1B,yDAE6B;AAD3B,oHAAA,eAAe,OAAA;AAAE,6GAAA,QAAQ,OAAA;AAAE,uHAAA,kBAAkB,OAAA;AAAE,qHAAA,gBAAgB,OAAA;AAEjE,6EAIuC;AAHrC,sIAAA,uBAAuB,OAAA;AACvB,wIAAA,yBAAyB,OAAA;AACzB,gJAAA,iCAAiC,OAAA;AAGnC,6CAAyD;AAAjD,6GAAA,cAAc,OAAA;AAAE,yGAAA,UAAU,OAAA;AAClC,+CAA2C;AAAnC,2GAAA,WAAW,OAAA;AACnB,yDAA2D;AAAnD,2HAAA,sBAAsB,OAAA;AAC9B,uDAAkD;AAA1C,kHAAA,cAAc,OAAA;AACtB,6CAAyC;AAAjC,yGAAA,UAAU,OAAA;AAClB,yDAA4E;AAApE,4HAAA,uBAAuB,OAAA;AAAE,mHAAA,cAAc,OAAA;AAC/C,iEAA2D;AAAnD,2HAAA,kBAAkB,OAAA;AAC1B,2CAAuC;AAA/B,uGAAA,SAAS,OAAA;AACjB,iDAAiC"}
@@ -0,0 +1,4 @@
1
+ import type { DownloadResource, SubmitResourceFunc } from './types';
2
+ import type { StaticDownloadOptions } from '../options';
3
+ import type { PipelineExecutor } from './pipeline-executor';
4
+ export declare function processHtmlMetaRefresh(res: DownloadResource, submit: SubmitResourceFunc, options: StaticDownloadOptions, pipeline: PipelineExecutor): Promise<DownloadResource>;
@@ -0,0 +1,68 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.processHtmlMetaRefresh = void 0;
4
+ const resource_1 = require("../resource");
5
+ const adapters_1 = require("./adapters");
6
+ const logger_1 = require("../logger/logger");
7
+ /**
8
+ * Originally created by https://github.com/stevenvachon at
9
+ * https://github.com/stevenvachon/http-equiv-refresh
10
+ * MIT license
11
+ */
12
+ const META_REFRESH_PATTERN = /^\s*(\d+)(?:\s*;(?:\s*url\s*=)?\s*(?:["']\s*(.*?)\s*['"]|(.*?)))?\s*$/i;
13
+ async function processHtmlMetaRefresh(res, submit, options, pipeline) {
14
+ if (res.type !== resource_1.ResourceType.Html) {
15
+ return res;
16
+ }
17
+ if (!res.meta.doc) {
18
+ res.meta.doc = (0, adapters_1.parseHtml)(res, options);
19
+ }
20
+ const $ = res.meta.doc;
21
+ const metaLinks = $('meta[http-equiv="refresh"][content]');
22
+ if (metaLinks.length) {
23
+ const refUrl = res.redirectedUrl || res.url;
24
+ const savePath = refUrl === res.url ? res.savePath : undefined;
25
+ const depth = res.depth + 1;
26
+ for (let index = 0; index < metaLinks.length; index++) {
27
+ const elem = metaLinks.eq(index);
28
+ const attrValue = elem.attr('content');
29
+ if (!attrValue) {
30
+ continue;
31
+ }
32
+ const match = META_REFRESH_PATTERN.exec(attrValue);
33
+ if (!match) {
34
+ continue;
35
+ }
36
+ const originalLink = match[2] || match[3];
37
+ if (!originalLink) {
38
+ continue;
39
+ }
40
+ const link = await pipeline.linkRedirect(originalLink, elem, res);
41
+ if (!link) {
42
+ continue;
43
+ }
44
+ const linkType = await pipeline.detectResourceType(link, resource_1.ResourceType.Html, elem, res);
45
+ if (!linkType) {
46
+ if (logger_1.skip.isTraceEnabled()) {
47
+ logger_1.skip.trace('skip detectResourceType', originalLink, link, refUrl);
48
+ }
49
+ continue;
50
+ }
51
+ let resource = await pipeline.createResource(linkType, depth, link, refUrl, res.localRoot, options.encoding[linkType], savePath, res.type);
52
+ resource = await pipeline.processBeforeDownload(resource, elem, res, options);
53
+ if (!resource) {
54
+ if (logger_1.skip.isTraceEnabled()) {
55
+ logger_1.skip.trace('skip processBeforeDownload', originalLink, link, linkType, refUrl);
56
+ }
57
+ continue;
58
+ }
59
+ if (!resource.shouldBeDiscardedFromDownload) {
60
+ submit(resource);
61
+ }
62
+ elem.attr('content', attrValue.replace(originalLink, resource.replacePath));
63
+ }
64
+ }
65
+ return res;
66
+ }
67
+ exports.processHtmlMetaRefresh = processHtmlMetaRefresh;
68
+ //# sourceMappingURL=process-html-meta.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"process-html-meta.js","sourceRoot":"","sources":["../../src/life-cycle/process-html-meta.ts"],"names":[],"mappings":";;;AAGA,0CAAmD;AACnD,yCAAqC;AACrC,6CAAsC;AAEtC;;;;GAIG;AACH,MAAM,oBAAoB,GACxB,wEAAwE,CAAC;AAEpE,KAAK,UAAU,sBAAsB,CAC1C,GAAqB,EACrB,MAA0B,EAC1B,OAA8B,EAC9B,QAA0B;IAG1B,IAAI,GAAG,CAAC,IAAI,KAAK,uBAAY,CAAC,IAAI,EAAE;QAClC,OAAO,GAAG,CAAC;KACZ;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,EAAE;QACjB,GAAG,CAAC,IAAI,CAAC,GAAG,GAAG,IAAA,oBAAS,EAAC,GAAG,EAAE,OAAO,CAAC,CAAC;KACxC;IACD,MAAM,CAAC,GAAG,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC;IAEvB,MAAM,SAAS,GAAG,CAAC,CAAC,qCAAqC,CAAC,CAAC;IAC3D,IAAI,SAAS,CAAC,MAAM,EAAE;QACpB,MAAM,MAAM,GAAW,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,GAAG,CAAC;QACpD,MAAM,QAAQ,GAAG,MAAM,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC;QAE/D,MAAM,KAAK,GAAW,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC;QAEpC,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,SAAS,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE;YACrD,MAAM,IAAI,GAAG,SAAS,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC;YACjC,MAAM,SAAS,GAAkB,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtD,IAAI,CAAC,SAAS,EAAE;gBACd,SAAS;aACV;YACD,MAAM,KAAK,GAAG,oBAAoB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACnD,IAAI,CAAC,KAAK,EAAE;gBACV,SAAS;aACV;YACD,MAAM,YAAY,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC;YAC1C,IAAI,CAAC,YAAY,EAAE;gBACjB,SAAS;aACV;YACD,MAAM,IAAI,GACR,MAAM,QAAQ,CAAC,YAAY,CAAC,YAAY,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;YACvD,IAAI,CAAC,IAAI,EAAE;gBACT,SAAS;aACV;YAED,MAAM,QAAQ,GACZ,MAAM,QAAQ,CAAC,kBAAkB,CAAC,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAE,IAAI,EAAE,GAAG,CAAC,CAAC;YACxE,IAAI,CAAC,QAAQ,EAAE;gBACb,IAAI,aAAI,CAAC,cAAc,EAAE,EAAE;oBACzB,aAAI,CAAC,KAAK,CAAC,yBAAyB,EAClC,YAAY,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;iBAC/B;gBACD,SAAS;aACV;YACD,IAAI,QAAQ,GAAoB,MAAM,QAAQ,CAAC,cAAc,CAC3D,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAC7B,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAC,EACzC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;YACtB,QAAQ,GAAG,MAAM,QAAQ,CAAC,qBAAqB,CAAC,QAAQ,EAAE,IAAI,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;YAC9E,IAAI,CAAC,QAAQ,EAAE;gBACb,IAAI,aAAI,CAAC,cAAc
,EAAE,EAAE;oBACzB,aAAI,CAAC,KAAK,CAAC,4BAA4B,EACrC,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;iBACzC;gBACD,SAAS;aACV;YACD,IAAI,CAAC,QAAQ,CAAC,6BAA6B,EAAE;gBAC3C,MAAM,CAAC,QAAQ,CAAC,CAAC;aAClB;YACD,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,SAAS,CAAC,OAAO,CAAC,YAAY,EAAE,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC;SAC7E;KACF;IAED,OAAO,GAAG,CAAC;AACb,CAAC;AAvED,wDAuEC"}
package/lib/sources.js CHANGED
@@ -32,6 +32,7 @@ exports.sources = [
32
32
  { selector: 'meta[property="og\\:video\\:url"]', attr: 'content' },
33
33
  { selector: 'meta[property="og\\:video\\:secure_url"]', attr: 'content' },
34
34
  { selector: 'video', attr: 'src' },
35
+ { selector: 'video', attr: 'poster' },
35
36
  { selector: 'video source', attr: 'src' },
36
37
  { selector: 'video track', attr: 'src' },
37
38
  { selector: 'audio', attr: 'src' },
@@ -1 +1 @@
1
- {"version":3,"file":"sources.js","sourceRoot":"","sources":["../src/sources.ts"],"names":[],"mappings":";;;AAAA,yCAAwC;AAQxC,0DAA0D;AAC1D,wEAAwE;AAC3D,QAAA,OAAO,GAAuB;IACzC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IACjD,EAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IAClE,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAC;IAC9B,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAC;IACjC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAC;IAClC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,OAAO,EAAC;IAChD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAC;IACjC,EAAC,QAAQ,EAAE,wBAAwB,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,GAAG,EAAC;IAC1E,EAAC,QAAQ,EAAE,mBAAmB,EAAE,IAAI,EAAE,MAAM,EAAC;IAC7C,EAAC,QAAQ,EAAE,sBAAsB,EAAE,IAAI,EAAE,MAAM,EAAC;IAChD,yCAAyC;IACzC,qDAAqD;IACrD,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,YAAY,EAAC;IACrD,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAC;IACvC,EAAC,QAAQ,EAAE,gBAAgB,EAAE,IAAI,EAAE,QAAQ,EAAC;IAC5C,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACzD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IAC1D,EA
AC,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACtD,mEAAmE;IACnE,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,YAAY,EAAC;CAC/C,CAAC,GAAG,CAAC,CAAC,GAA8B,EAAE,EAAE;IACvC,IAAI,GAAG,CAAC,QAAQ,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,EAAE;QAC/D,GAAG,CAAC,QAAQ,IAAI,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC;KACjC;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE;QACb,GAAG,CAAC,IAAI,GAAG,uBAAY,CAAC,MAAM,CAAC;KAChC;IACD,OAAO,GAAuB,CAAC;AACjC,CAAC,CAAC,CAAC"}
1
+ {"version":3,"file":"sources.js","sourceRoot":"","sources":["../src/sources.ts"],"names":[],"mappings":";;;AAAA,yCAAwC;AAQxC,0DAA0D;AAC1D,wEAAwE;AAC3D,QAAA,OAAO,GAAuB;IACzC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IACjD,EAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,uBAAY,CAAC,SAAS,EAAC;IAClE,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAC;IAC9B,EAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,EAAE,QAAQ,EAAC;IACjC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAC;IAClC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,OAAO,EAAC;IAChD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAC;IACjC,EAAC,QAAQ,EAAE,wBAAwB,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,GAAG,EAAC;IAC1E,EAAC,QAAQ,EAAE,mBAAmB,EAAE,IAAI,EAAE,MAAM,EAAC;IAC7C,EAAC,QAAQ,EAAE,sBAAsB,EAAE,IAAI,EAAE,MAAM,EAAC;IAChD,yCAAyC;IACzC,qDAAqD;IACrD,EAAC,QAAQ,EAAE,qBAAqB,EAAE,IAAI,EAAE,YAAY,EAAC;IACrD,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,MAAM,EAAC;IACvC,EAAC,QAAQ,EAAE,gBAAgB,EAAE,IAAI,EAAE,QAAQ,EAAC;IAC5C,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,6BAA6B,EAAE,IAAI,EAAE,SAAS,EAAC;IAC1D,EAAC,QAAQ,EAAE,mCAAmC,EAAE,IAAI,EAAE,SAAS,EAAC;IAChE,EAAC,QAAQ,EAAE,0CAA0C,EAAE,IAAI,EAAE,SAAS,EAAC;IACvE,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAC;IACnC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAC;IAChC,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,KAAK,EAAC;IACvC,EAAC,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAC;IACtC,EAAC,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACzD,EAAC,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAA
E,KAAK,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IAC1D,EAAC,QAAQ,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,uBAAY,CAAC,IAAI,EAAC;IACtD,mEAAmE;IACnE,EAAC,QAAQ,EAAE,cAAc,EAAE,IAAI,EAAE,YAAY,EAAC;CAC/C,CAAC,GAAG,CAAC,CAAC,GAA8B,EAAE,EAAE;IACvC,IAAI,GAAG,CAAC,QAAQ,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,IAAI,EAAE;QAC/D,GAAG,CAAC,QAAQ,IAAI,IAAI,GAAG,CAAC,IAAI,GAAG,CAAC;KACjC;IACD,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE;QACb,GAAG,CAAC,IAAI,GAAG,uBAAY,CAAC,MAAM,CAAC;KAChC;IACD,OAAO,GAAuB,CAAC;AACjC,CAAC,CAAC,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "website-scrap-engine",
3
- "version": "0.6.0",
3
+ "version": "0.7.1",
4
4
  "description": "Configurable website scraper in typescript",
5
5
  "main": "lib",
6
6
  "types": "lib",
@@ -24,22 +24,22 @@
24
24
  "cheerio": "^1.0.0-rc.12",
25
25
  "css-url-parser": "^1.1.3",
26
26
  "got": "^11.8.6",
27
- "log4js": "^6.8.0",
28
- "mkdirp": "^2.1.3",
27
+ "log4js": "^6.9.1",
28
+ "mkdirp": "^3.0.1",
29
29
  "p-queue": "^6.6.2",
30
30
  "srcset": "^4.0.0",
31
31
  "urijs": "^1.19.11"
32
32
  },
33
33
  "devDependencies": {
34
- "@types/jest": "^27.5.2",
35
- "@types/node": "^18.14.1",
36
- "@types/urijs": "^1.19.19",
37
- "@typescript-eslint/eslint-plugin": "^5.53.0",
38
- "@typescript-eslint/parser": "^5.53.0",
39
- "eslint": "^8.34.0",
40
- "jest": "^27.5.1",
41
- "ts-jest": "^27.1.5",
42
- "typescript": "^4.9.5"
34
+ "@types/jest": "^28.1.1",
35
+ "@types/node": "^20.8.7",
36
+ "@types/urijs": "^1.19.20",
37
+ "@typescript-eslint/eslint-plugin": "^5.62.0",
38
+ "@typescript-eslint/parser": "^5.62.0",
39
+ "eslint": "^8.52.0",
40
+ "jest": "^28.1.3",
41
+ "ts-jest": "^28.0.8",
42
+ "typescript": "^5.0.4"
43
43
  },
44
44
  "files": [
45
45
  ".editorconfig",
package/src/io.ts CHANGED
@@ -1,16 +1,16 @@
1
- import fs from 'fs';
2
1
  import type {ObjectEncodingOptions} from 'fs';
2
+ import fs from 'fs';
3
3
  import {dirname} from 'path';
4
- import mkdirP from 'mkdirp';
4
+ import {mkdirp} from 'mkdirp';
5
5
  import type {ResourceBody, ResourceEncoding} from './resource';
6
- import {mkdir as mkdirLogger, error as errorLogger} from './logger/logger';
6
+ import {error as errorLogger, mkdir as mkdirLogger} from './logger/logger';
7
7
 
8
8
  export const mkdirRetry = async (dir: string, retry = 3): Promise<void> => {
9
9
  let error: unknown | void;
10
10
  for (let i = 0; i < retry; i++) {
11
11
  error = undefined;
12
12
  try {
13
- await mkdirP(dir);
13
+ await mkdirp(dir);
14
14
  } catch (e) {
15
15
  error = e;
16
16
  if (i > 0) {
@@ -4,6 +4,7 @@ import {detectResourceType} from './detect-resource-type';
4
4
  import {createResource} from '../resource';
5
5
  import {downloadResource} from './download-resource';
6
6
  import {processHtml} from './process-html';
7
+ import {processHtmlMetaRefresh} from './process-html-meta';
7
8
  import {processCss} from './process-css';
8
9
  import {processSiteMap} from './process-site-map';
9
10
  import {processSvg} from './process-svg';
@@ -30,6 +31,7 @@ export const defaultLifeCycle = (): ProcessingLifeCycle => ({
30
31
  processAfterDownload: [
31
32
  processRedirectedUrl,
32
33
  processHtml,
34
+ processHtmlMetaRefresh,
33
35
  processSvg,
34
36
  processCss,
35
37
  processSiteMap
@@ -12,6 +12,7 @@ export {
12
12
  export {PipelineExecutor} from './pipeline-executor';
13
13
  export {processCssText, processCss} from './process-css';
14
14
  export {processHtml} from './process-html';
15
+ export {processHtmlMetaRefresh} from './process-html-meta';
15
16
  export {processSiteMap} from './process-site-map';
16
17
  export {processSvg} from './process-svg';
17
18
  export {getResourceBodyFromHtml, saveHtmlToDisk} from './save-html-to-disk';
@@ -0,0 +1,87 @@
1
+ import type {DownloadResource, SubmitResourceFunc} from './types';
2
+ import type {StaticDownloadOptions} from '../options';
3
+ import type {PipelineExecutor} from './pipeline-executor';
4
+ import {Resource, ResourceType} from '../resource';
5
+ import {parseHtml} from './adapters';
6
+ import {skip} from '../logger/logger';
7
+
8
+ /**
9
+ * Originally created by https://github.com/stevenvachon at
10
+ * https://github.com/stevenvachon/http-equiv-refresh
11
+ * MIT license
12
+ */
13
+ const META_REFRESH_PATTERN =
14
+ /^\s*(\d+)(?:\s*;(?:\s*url\s*=)?\s*(?:["']\s*(.*?)\s*['"]|(.*?)))?\s*$/i;
15
+
16
+ export async function processHtmlMetaRefresh(
17
+ res: DownloadResource,
18
+ submit: SubmitResourceFunc,
19
+ options: StaticDownloadOptions,
20
+ pipeline: PipelineExecutor
21
+ ): Promise<DownloadResource> {
22
+
23
+ if (res.type !== ResourceType.Html) {
24
+ return res;
25
+ }
26
+ if (!res.meta.doc) {
27
+ res.meta.doc = parseHtml(res, options);
28
+ }
29
+ const $ = res.meta.doc;
30
+
31
+ const metaLinks = $('meta[http-equiv="refresh"][content]');
32
+ if (metaLinks.length) {
33
+ const refUrl: string = res.redirectedUrl || res.url;
34
+ const savePath = refUrl === res.url ? res.savePath : undefined;
35
+
36
+ const depth: number = res.depth + 1;
37
+
38
+ for (let index = 0; index < metaLinks.length; index++) {
39
+ const elem = metaLinks.eq(index);
40
+ const attrValue: string | void = elem.attr('content');
41
+ if (!attrValue) {
42
+ continue;
43
+ }
44
+ const match = META_REFRESH_PATTERN.exec(attrValue);
45
+ if (!match) {
46
+ continue;
47
+ }
48
+ const originalLink = match[2] || match[3];
49
+ if (!originalLink) {
50
+ continue;
51
+ }
52
+ const link: string | void =
53
+ await pipeline.linkRedirect(originalLink, elem, res);
54
+ if (!link) {
55
+ continue;
56
+ }
57
+
58
+ const linkType: ResourceType | void =
59
+ await pipeline.detectResourceType(link, ResourceType.Html, elem, res);
60
+ if (!linkType) {
61
+ if (skip.isTraceEnabled()) {
62
+ skip.trace('skip detectResourceType',
63
+ originalLink, link, refUrl);
64
+ }
65
+ continue;
66
+ }
67
+ let resource: Resource | void = await pipeline.createResource(
68
+ linkType, depth, link, refUrl,
69
+ res.localRoot, options.encoding[linkType],
70
+ savePath, res.type);
71
+ resource = await pipeline.processBeforeDownload(resource, elem, res, options);
72
+ if (!resource) {
73
+ if (skip.isTraceEnabled()) {
74
+ skip.trace('skip processBeforeDownload',
75
+ originalLink, link, linkType, refUrl);
76
+ }
77
+ continue;
78
+ }
79
+ if (!resource.shouldBeDiscardedFromDownload) {
80
+ submit(resource);
81
+ }
82
+ elem.attr('content', attrValue.replace(originalLink, resource.replacePath));
83
+ }
84
+ }
85
+
86
+ return res;
87
+ }
package/src/sources.ts CHANGED
@@ -36,6 +36,7 @@ export const sources: SourceDefinition[] = [
36
36
  {selector: 'meta[property="og\\:video\\:url"]', attr: 'content'},
37
37
  {selector: 'meta[property="og\\:video\\:secure_url"]', attr: 'content'},
38
38
  {selector: 'video', attr: 'src'},
39
+ {selector: 'video', attr: 'poster'},
39
40
  {selector: 'video source', attr: 'src'},
40
41
  {selector: 'video track', attr: 'src'},
41
42
  {selector: 'audio', attr: 'src'},
package/tsconfig.json CHANGED
@@ -3,7 +3,6 @@
3
3
  "module": "commonjs",
4
4
  "target": "es2018",
5
5
  "sourceMap": true,
6
- "charset": "utf8",
7
6
  "newLine": "lf",
8
7
  "outDir": "lib",
9
8
  "declaration": true,
package/CHANGELOG.md DELETED
@@ -1,167 +0,0 @@
1
- 0.6.0
2
- ============
3
-
4
- BREAKING
5
- ------------
6
- * resource: custom callback for rewriting savePath
7
- * life-cycle: custom callback for rewriting savePath (<https://github.com/website-local/website-scrap-engine/issues/383>)
8
-
9
- Fix
10
- ------------
11
- * cheerio: replace deprecated api
12
-
13
- Test
14
- ------------
15
- * test: migrating to eslint v8 and typescript-eslint v5
16
- * cheerio: fix a test
17
- * resource: add a test
18
- * ci: run tests on node.js 18.x (<https://github.com/website-local/website-scrap-engine/issues/610>)
19
-
20
- Misc
21
- ------------
22
- * package-lock-resolved: process registry.npmmirror.com
23
- * logger: fix type conflict
24
- * util: fix compatibility with typescript 4.8
25
- * npm: drop @types/mkdirp
26
- * update deps
27
-
28
- 0.5.0
29
- ============
30
-
31
- BREAKING
32
- ------------
33
- * typescript 4.4 support
34
- * WorkerMessage: `error` can be `unknown`
35
- * StreamingDownloadErrorHook: `e` can be `unknown`
36
- * pipeline-executor-impl fix keepSearch param
37
- * resource: redirectedSavePath not set after redirect
38
-
39
- Test
40
- ------------
41
- * test: adapt for jest 27 and ts-jest 27
42
-
43
- 0.4.0
44
- ============
45
-
46
- BREAKING
47
- ------------
48
- * worker-pool: load based worker pool (#11)
49
- * cheerio: adapt for version 1.0.0-rc.10 (#271)
50
- * test: adapt for URI.js v1.19.7 (#301)
51
-
52
- Fix
53
- ------------
54
- * downloader: correctly transfer resource body
55
- * correctly convert ArrayBufferView to Buffer
56
- * worker-pool: fix ready
57
-
58
- Enhancement
59
- ------------
60
- * npm: update
61
- * life-cycle: add init and dispose life cycle
62
- * resource: optionally redirected savePath
63
- * resource: take a log on replacing long search string
64
- * save-to-disk: optionally use remote date
65
- * worker-pool: log worker errors
66
- * worker-pool: custom initializer of worker
67
-
68
- Test
69
- ------------
70
- * worker-pool: basic unit test
71
- * save-html-to-disk: initial unit tests with mocked fs
72
- * save-to-disk: refactor tests
73
-
74
- 0.3.2
75
- ============
76
-
77
- * resource: fix redirected path processing (#157)
78
- * downloader: optional wait for this.init in method onIdle (#152)
79
- * typescript: prefer type only import
80
-
81
- 0.3.1
82
- ============
83
-
84
- * resource: use correct file scheme for windows (#145)
85
-
86
- 0.3.0
87
- ============
88
-
89
- New Feature
90
- ------------
91
- * life-cycle: extract and process source maps (#123)
92
- * adapters: async processHtml
93
- * life-cycle: add read-or-copy-local-resource
94
- * resource: support file protocol (#126)
95
-
96
- Misc
97
- ------------
98
- * types: export type CheerioElement
99
- * resource: optional skip replacePath processing in case of parser error (#107)
100
- * resource: fix new type of Buffer.from (#116)
101
- * build(deps): bump cheerio from 1.0.0-rc.3 to 1.0.0-rc.5
102
- * io: mkdirRetry returns no string
103
- * life-cycle: add download-streaming-resource to default
104
- * skip-links: skip unix scheme
105
- * skip-links: allow file protocol
106
- * download: skip non-http url
107
- * (BREAKING) resource: refactor createResource (#139)
108
-
109
- 0.2.0
110
- ============
111
- * life-cycle: streaming download and save binary resource to disk
112
- * build(deps-dev): bump @types/cheerio from 0.22.21 to 0.22.22
113
-
114
- 0.1.7
115
- ============
116
- * resource: parse and process standalone svg images
117
- * save-html-to-disk: keep location hash in redirect placeholder
118
- * detect-resource-type: export lowerCaseExtension
119
- * downloader: log downloadLink instead of rawUrl
120
- * typescript: update to v4.0
121
-
122
- 0.1.6
123
- ============
124
- * save-resource-to-disk: compare redirectedUrl with url
125
- * process-html: submit resources from inline css
126
- * downloader: correctly use adjustTimer on start
127
- * downloader: deduplicate on redirectedUrl
128
- * downloader: do not wait for complete on add
129
-
130
- 0.1.5
131
- ============
132
- * downloader: do not wait for complete on add
133
- * process-html: fix detecting type
134
- * npm: update p-queue to 6.6.0
135
- * npm: move copy script to build
136
-
137
- 0.1.4
138
- ============
139
- * save-html-to-disk: fix redirect check
140
- * logger: add logger for skipExternal
141
-
142
- 0.1.3
143
- ============
144
- * save-html-to-disk: fix redirect placeholder path
145
-
146
- 0.1.2
147
- ============
148
- * adapters: make processRedirectedUrl named function
149
- * options: move initialUrl and logSubDir to StaticDownloadOptions
150
- * options: retry on error codes
151
- * download-resource: manually retry on got internal errors
152
- * io: refactor mkdirRetry
153
- * process-html: skip invalid srcset
154
-
155
- 0.1.1
156
- ============
157
- * io: remove mkdirRetrySync and update writeFile
158
- * util: arrayToMap could freeze the object returned if required
159
- * detect-resource-type: fix url with search and hash
160
- * options: allow merging got options from StaticDownloadOptions
161
- * options: add comments
162
- * life-cycle: convert default life cycle fn to named function
163
-
164
- 0.1.0
165
- ============
166
- Initial release.
167
-