@jocmp/mercury-parser 2.2.10 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/mercury.js CHANGED
@@ -362,7 +362,7 @@ var KEEP_CLASS = 'mercury-parser-keep';
362
362
  var KEEP_SELECTORS = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]']; // A list of tags to strip from the output if we encounter them.
363
363
 
364
364
  var STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes
365
- var WHITELIST_ATTRS = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
365
+ var WHITELIST_ATTRS = ['src', 'srcset', 'start', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
366
366
  var WHITELIST_ATTRS_RE = new RegExp("^(".concat(WHITELIST_ATTRS.join('|'), ")$"), 'i'); // removeEmpty
367
367
 
368
368
  var CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(','); // cleanHeaders
@@ -1197,7 +1197,7 @@ function cleanTags$$1($article, $) {
1197
1197
  if (weight < 0) {
1198
1198
  $node.remove();
1199
1199
  } else {
1200
- // deteremine if node seems like content
1200
+ // determine if node seems like content
1201
1201
  removeUnlessContent($node, $, weight);
1202
1202
  }
1203
1203
  });
@@ -1207,11 +1207,16 @@ function cleanTags$$1($article, $) {
1207
1207
  function cleanHeaders($article, $) {
1208
1208
  var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
1209
1209
  $(HEADER_TAG_LIST, $article).each(function (index, header) {
1210
- var $header = $(header); // Remove any headers that appear before all other p tags in the
1210
+ var $header = $(header);
1211
+
1212
+ if ($(header).hasClass(KEEP_CLASS)) {
1213
+ return $header;
1214
+ } // Remove any headers that appear before all other p tags in the
1211
1215
  // document. This probably means that it was part of the title, a
1212
1216
  // subtitle or something else extraneous like a datestamp or byline,
1213
1217
  // all of which should be handled by other metadata handling.
1214
1218
 
1219
+
1215
1220
  if ($($header, $article).prevAll('p').length === 0) {
1216
1221
  return $header.remove();
1217
1222
  } // Remove any headers that match the title exactly.
@@ -6171,8 +6176,18 @@ var WwwVersantsComExtractor = {
6171
6176
  selectors: [['meta[name="og:image"]', 'value']]
6172
6177
  },
6173
6178
  content: {
6174
- selectors: ['.entry-content'],
6175
- clean: ['.adv-link', '.versa-target']
6179
+ transforms: {
6180
+ '.featured-image': function featuredImage($node) {
6181
+ $node.addClass('mercury-parser-keep');
6182
+ var figcaption = $node.find('span');
6183
+ $node.find('figure').append(figcaption);
6184
+ }
6185
+ },
6186
+ selectors: ['.article-content'],
6187
+ clean: ['.adv-link', '.versa-target', 'header', // Clean title
6188
+ '.author', // Clean author
6189
+ '.thumbnail-slider' // Remove, the main images will be within the .main-slider div.
6190
+ ]
6176
6191
  }
6177
6192
  };
6178
6193
 
@@ -6218,20 +6233,24 @@ var WwwAndroidauthorityComExtractor = {
6218
6233
  lead_image_url: {
6219
6234
  selectors: [['meta[name="og:image"]', 'value']]
6220
6235
  },
6236
+ // Some pages have nested header elements that are significant, and that the parser will
6237
+ // remove if not following a paragraph. The h2/h3 transforms below mark them with the
6238
+ // 'mercury-parser-keep' class, which cleanHeaders checks before removing a header.
6221
6239
  content: {
6222
- selectors: ['.d_Dd'],
6240
+ selectors: ['.e_Bc', '.d_Dd'],
6223
6241
  transforms: {
6224
6242
  ol: function ol(node) {
6225
6243
  node.attr('class', 'mercury-parser-keep');
6226
6244
  },
6227
6245
  h2: function h2($node) {
6228
- // Some pages have an element h2 that is significant, and that the parser will
6229
- // remove if not following a paragraph. Adding this empty paragraph fixes it, and
6230
- // the empty paragraph will be removed anyway.
6231
- $node.before('<p></p>');
6246
+ return $node.attr('class', 'mercury-parser-keep');
6247
+ },
6248
+ h3: function h3($node) {
6249
+ return $node.attr('class', 'mercury-parser-keep');
6232
6250
  }
6233
6251
  },
6234
- clean: ['.d_f .d_nr' // Lead image
6252
+ clean: ['.e_Oh', // Polls
6253
+ 'picture + div' // Lead image text
6235
6254
  ]
6236
6255
  }
6237
6256
  };
@@ -6329,6 +6348,54 @@ var MobilesyrupComExtractor = {
6329
6348
  }
6330
6349
  };
6331
6350
 
6351
+ var WwwChannelnewsasiaComExtractor = { // Custom extraction rules for pages served from www.channelnewsasia.com.
6352
+ domain: 'www.channelnewsasia.com',
6353
+ title: {
6354
+ selectors: [['meta[name="og:title"]', 'value']] // presumably a [selector, attribute] pair — confirm against the selector resolver
6355
+ },
6356
+ author: {
6357
+ selectors: ['.link--author-profile', ['meta[name="cXenseParse:author"]', 'value']] // author link element first, then a meta tag; presumably tried in listed order — TODO confirm
6358
+ },
6359
+ date_published: {
6360
+ selectors: ['.article-publish:not(span)'],
6361
+ format: 'DD MMM YYYY HH:mma', // NOTE(review): assuming moment-style tokens, 'HH' (24-hour) is paired with 'a' (am/pm) — confirm against the site's timestamps
6362
+ timezone: 'Asia/Singapore'
6363
+ },
6364
+ dek: {
6365
+ selectors: ['.content-detail__description']
6366
+ },
6367
+ lead_image_url: {
6368
+ selectors: [['meta[name="og:image"]', 'value']]
6369
+ },
6370
+ content: {
6371
+ selectors: ['section[data-title="Content"]'],
6372
+ transforms: {}, // no node transforms are defined for this site
6373
+ clean: [] // no extra selectors are listed for cleaning
6374
+ }
6375
+ };
6376
+
6377
+ var WccftechComExtractor = { // Custom extraction rules for pages served from wccftech.com.
6378
+ domain: 'wccftech.com',
6379
+ title: {
6380
+ selectors: [['meta[name="og:title"]', 'value']] // presumably a [selector, attribute] pair — confirm against the selector resolver
6381
+ },
6382
+ author: {
6383
+ selectors: ['div.meta a:first-of-type'] // first anchor of its type inside div.meta
6384
+ },
6385
+ date_published: {
6386
+ selectors: [['meta[name="pub_date"]', 'value'], ['meta[name="article:published_time"]', 'value']] // two meta-tag candidates; presumably tried in listed order — TODO confirm
6387
+ },
6388
+ lead_image_url: {
6389
+ selectors: [['meta[name="og:image"]', 'value']]
6390
+ },
6391
+ content: {
6392
+ selectors: ['.content'],
6393
+ transforms: {}, // no node transforms are defined for this site
6394
+ clean: ['.democracy' // JavaScript polls
6395
+ ]
6396
+ }
6397
+ };
6398
+
6332
6399
 
6333
6400
 
6334
6401
  var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -6481,7 +6548,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
6481
6548
  TechcrunchComExtractor: TechcrunchComExtractor,
6482
6549
  WwwHardwarezoneComSgExtractor: WwwHardwarezoneComSgExtractor,
6483
6550
  WwwSpiegelDeExtractor: WwwSpiegelDeExtractor,
6484
- MobilesyrupComExtractor: MobilesyrupComExtractor
6551
+ MobilesyrupComExtractor: MobilesyrupComExtractor,
6552
+ WwwChannelnewsasiaComExtractor: WwwChannelnewsasiaComExtractor,
6553
+ WccftechComExtractor: WccftechComExtractor
6485
6554
  });
6486
6555
 
6487
6556
  var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {