npm - @jocmp/mercury-parser - Versions diffs - 2.2.10 → 2.3.1 - Mend

@jocmp/mercury-parser 2.2.10 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md +7 -9
package/cli.js +3 -3
package/dist/generate-custom-parser.js +79 -12
package/dist/generate-custom-parser.js.map +1 -1
package/dist/mercury.js +81 -12
package/dist/mercury.js.map +1 -1
package/dist/mercury.web.js +1 -1
package/dist/mercury.web.js.map +1 -1
package/package.json +1 -1

package/dist/mercury.js CHANGED

@@ -362,7 +362,7 @@ var KEEP_CLASS = 'mercury-parser-keep';
 var KEEP_SELECTORS = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]']; // A list of tags to strip from the output if we encounter them.
 var STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes
-var WHITELIST_ATTRS = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
+var WHITELIST_ATTRS = ['src', 'srcset', 'start', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
 var WHITELIST_ATTRS_RE = new RegExp("^(".concat(WHITELIST_ATTRS.join('|'), ")$"), 'i'); // removeEmpty
 var CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(','); // cleanHeaders
@@ -1197,7 +1197,7 @@ function cleanTags$$1($article, $) {
     if (weight < 0) {
       $node.remove();
     } else {
-      // deteremine if node seems like content
+      // determine if node seems like content
       removeUnlessContent($node, $, weight);
     }
   });
@@ -1207,11 +1207,16 @@ function cleanTags$$1($article, $) {
 function cleanHeaders($article, $) {
   var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
   $(HEADER_TAG_LIST, $article).each(function (index, header) {
-    var $header = $(header); // Remove any headers that appear before all other p tags in the
+    var $header = $(header);
+    if ($(header).hasClass(KEEP_CLASS)) {
+      return $header;
+    } // Remove any headers that appear before all other p tags in the
     // document. This probably means that it was part of the title, a
     // subtitle or something else extraneous like a datestamp or byline,
     // all of which should be handled by other metadata handling.
     if ($($header, $article).prevAll('p').length === 0) {
       return $header.remove();
     } // Remove any headers that match the title exactly.
@@ -6171,8 +6176,18 @@ var WwwVersantsComExtractor = {
     selectors: [['meta[name="og:image"]', 'value']]
   },
   content: {
-    selectors: ['.entry-content'],
-    clean: ['.adv-link', '.versa-target']
+    transforms: {
+      '.featured-image': function featuredImage($node) {
+        $node.addClass('mercury-parser-keep');
+        var figcaption = $node.find('span');
+        $node.find('figure').append(figcaption);
+      }
+    },
+    selectors: ['.article-content'],
+    clean: ['.adv-link', '.versa-target', 'header', // Clean title
+    '.author', // Clean author
+    '.thumbnail-slider' // Remove, the main images will be within the .main-slider div.
+    ]
   }
 };
@@ -6218,20 +6233,24 @@ var WwwAndroidauthorityComExtractor = {
   lead_image_url: {
     selectors: [['meta[name="og:image"]', 'value']]
   },
+  // Some pages have a nested header elements that are significant, and that the parser will
+  // remove if not following a paragraph. Adding this empty paragraph fixes it, and
+  // the empty paragraph will be removed anyway.
   content: {
-    selectors: ['.d_Dd'],
+    selectors: ['.e_Bc', '.d_Dd'],
     transforms: {
       ol: function ol(node) {
         node.attr('class', 'mercury-parser-keep');
       },
       h2: function h2($node) {
-        // Some pages have an element h2 that is significant, and that the parser will
-        // remove if not following a paragraph. Adding this empty paragraph fixes it, and
-        // the empty paragraph will be removed anyway.
-        $node.before('<p></p>');
+        return $node.attr('class', 'mercury-parser-keep');
+      },
+      h3: function h3($node) {
+        return $node.attr('class', 'mercury-parser-keep');
       }
     },
-    clean: ['.d_f .d_nr' // Lead image
+    clean: ['.e_Oh', // Polls
+    'picture + div' // Lead image text
     ]
   }
 };
@@ -6329,6 +6348,54 @@ var MobilesyrupComExtractor = {
   }
 };
+var WwwChannelnewsasiaComExtractor = {
+  domain: 'www.channelnewsasia.com',
+  title: {
+    selectors: [['meta[name="og:title"]', 'value']]
+  },
+  author: {
+    selectors: ['.link--author-profile', ['meta[name="cXenseParse:author"]', 'value']]
+  },
+  date_published: {
+    selectors: ['.article-publish:not(span)'],
+    format: 'DD MMM YYYY HH:mma',
+    timezone: 'Asia/Singapore'
+  },
+  dek: {
+    selectors: ['.content-detail__description']
+  },
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+  content: {
+    selectors: ['section[data-title="Content"]'],
+    transforms: {},
+    clean: []
+  }
+};
+var WccftechComExtractor = {
+  domain: 'wccftech.com',
+  title: {
+    selectors: [['meta[name="og:title"]', 'value']]
+  },
+  author: {
+    selectors: ['div.meta a:first-of-type']
+  },
+  date_published: {
+    selectors: [['meta[name="pub_date"]', 'value'], ['meta[name="article:published_time"]', 'value']]
+  },
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+  content: {
+    selectors: ['.content'],
+    transforms: {},
+    clean: ['.democracy' // JavaScript polls
+    ]
+  }
+};
 var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -6481,7 +6548,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
   TechcrunchComExtractor: TechcrunchComExtractor,
   WwwHardwarezoneComSgExtractor: WwwHardwarezoneComSgExtractor,
   WwwSpiegelDeExtractor: WwwSpiegelDeExtractor,
-  MobilesyrupComExtractor: MobilesyrupComExtractor
+  MobilesyrupComExtractor: MobilesyrupComExtractor,
+  WwwChannelnewsasiaComExtractor: WwwChannelnewsasiaComExtractor,
+  WccftechComExtractor: WccftechComExtractor
 });
 var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {