@jocmp/mercury-parser 2.3.5 → 2.3.7

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/mercury.js CHANGED
@@ -5965,46 +5965,21 @@ var WwwEngadgetComExtractor = {
5965
5965
 
5966
5966
  var ArstechnicaComExtractor = {
5967
5967
  domain: 'arstechnica.com',
5968
- // Articles from this site are often paginated, but I was unable to write a CSS
5969
- // selector to find the next page. On the last page, there will be a link with a CSS
5970
- // selector indicating that the previous page is next. But the parser appears to find
5971
- // the next page without this extractor finding it, as long as the fallback option is
5972
- // left at its default value of true.
5973
5968
  title: {
5974
- selectors: ['title']
5975
- },
5976
- author: {
5977
- selectors: ['*[rel="author"] *[itemprop="name"]']
5978
- },
5979
- date_published: {
5980
- selectors: [['.byline time', 'datetime']]
5981
- },
5982
- dek: {
5983
- selectors: ['h2[itemprop="description"]']
5969
+ selectors: ['title', 'h1']
5984
5970
  },
5985
5971
  lead_image_url: {
5986
5972
  selectors: [['meta[name="og:image"]', 'value']]
5987
5973
  },
5988
5974
  content: {
5989
- selectors: ['div[itemprop="articleBody"]'],
5990
- // Is there anything in the content you selected that needs transformed
5991
- // before it's consumable content? E.g., unusual lazy loaded images
5975
+ selectors: ['.post-content', 'main'],
5992
5976
  transforms: {
5993
- h2: function h2($node) {
5994
- // Some pages have an element h2 that is significant, and that the parser will
5995
- // remove if not following a paragraph. Adding this empty paragraph fixes it, and
5996
- // the empty paragraph will be removed anyway.
5997
- $node.before('<p></p>');
5977
+ img: function img($node) {
5978
+ $node.removeAttr('width');
5979
+ $node.removeAttr('sizes');
5998
5980
  }
5999
5981
  },
6000
- // Is there anything that is in the result that shouldn't be?
6001
- // The clean selectors will remove anything that matches from
6002
- // the result.
6003
- clean: [// Remove enlarge links and separators inside image captions.
6004
- 'figcaption .enlarge-link', 'figcaption .sep', // I could not transform the video into usable elements, so I
6005
- // removed them.
6006
- 'figure.video', // Image galleries that do not work.
6007
- '.gallery', 'aside', '.sidebar']
5982
+ clean: ['header', '.upper-deck__text', '.text-settings-dropdown-story']
6008
5983
  }
6009
5984
  };
6010
5985
 
@@ -6520,32 +6495,6 @@ var WwwNtvDeExtractor = {
6520
6495
  }
6521
6496
  };
6522
6497
 
6523
- var SportSePlExtractor = {
6524
- domain: 'sport.se.pl',
6525
- title: {
6526
- selectors: [['meta[name="og:title"]', 'value']]
6527
- },
6528
- author: {
6529
- selectors: ['.article_author']
6530
- },
6531
- date_published: {
6532
- selectors: ['#timezone'],
6533
- timezone: 'Europe/Warsaw'
6534
- },
6535
- lead_image_url: {
6536
- selectors: [['meta[name="og:image"]', 'value']]
6537
- },
6538
- content: {
6539
- selectors: ['article'],
6540
- transforms: {
6541
- h2: function h2(node) {
6542
- return node.attr('class', 'mercury-parser-keep');
6543
- }
6544
- },
6545
- clean: ['#timezone', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6546
- }
6547
- };
6548
-
6549
6498
  var WwwSePlExtractor = {
6550
6499
  domain: 'www.se.pl',
6551
6500
  title: {
@@ -6568,12 +6517,20 @@ var WwwSePlExtractor = {
6568
6517
  return node.attr('class', 'mercury-parser-keep');
6569
6518
  }
6570
6519
  },
6571
- clean: ['#timezone', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6520
+ clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6572
6521
  }
6573
6522
  };
6574
6523
 
6575
- var PolitykaSePlExtractor = {
6576
- domain: 'polityka.se.pl',
6524
+ var SportSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6525
+ domain: 'sport.se.pl'
6526
+ });
6527
+
6528
+ var PolitykaSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6529
+ domain: 'polityka.se.pl'
6530
+ });
6531
+
6532
+ var SuperserialeSePlExtractor = {
6533
+ domain: 'superseriale.se.pl',
6577
6534
  title: {
6578
6535
  selectors: [['meta[name="og:title"]', 'value']]
6579
6536
  },
@@ -6594,38 +6551,61 @@ var PolitykaSePlExtractor = {
6594
6551
  return node.attr('class', 'mercury-parser-keep');
6595
6552
  }
6596
6553
  },
6597
- clean: ['.article__author__croppimg', // author photo
6554
+ clean: ['#timezone', '.author', '.article__author__croppimg', // author photo
6598
6555
  '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6599
6556
  }
6600
6557
  };
6601
6558
 
6602
- var SuperserialeSePlExtractor = {
6603
- domain: 'superseriale.se.pl',
6559
+ var SzczecinSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6560
+ domain: 'szczecin.se.pl'
6561
+ });
6562
+
6563
+ var SuperbizSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6564
+ domain: 'superbiz.se.pl'
6565
+ });
6566
+
6567
+ var PortalobronnySePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6568
+ domain: 'portalobronny.se.pl'
6569
+ });
6570
+
6571
+ var PolskisamorzadSePlExtractor = {
6572
+ domain: 'polskisamorzad.se.pl',
6604
6573
  title: {
6605
6574
  selectors: [['meta[name="og:title"]', 'value']]
6606
6575
  },
6607
6576
  author: {
6608
- selectors: ['.article_author:first-of-type']
6609
- },
6610
- date_published: {
6611
- selectors: ['#timezone'],
6612
- timezone: 'Europe/Warsaw'
6577
+ selectors: ['.article_author:first-of-type', '.article-author', ['meta[name="og:article:author"]', 'value']]
6613
6578
  },
6614
6579
  lead_image_url: {
6615
6580
  selectors: [['meta[name="og:image"]', 'value']]
6616
6581
  },
6617
6582
  content: {
6618
- selectors: ['article'],
6583
+ selectors: ['.article-single'],
6619
6584
  transforms: {
6620
6585
  h2: function h2(node) {
6621
6586
  return node.attr('class', 'mercury-parser-keep');
6622
6587
  }
6623
6588
  },
6624
- clean: ['#timezone', '.article__author__croppimg', // author photo
6625
- '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6589
+ clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6626
6590
  }
6627
6591
  };
6628
6592
 
6593
+ var LodzSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6594
+ domain: 'lodz.se.pl'
6595
+ });
6596
+
6597
+ var WroclawSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6598
+ domain: 'wroclaw.se.pl'
6599
+ });
6600
+
6601
+ var LublinSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6602
+ domain: 'lublin.se.pl'
6603
+ });
6604
+
6605
+ var BialystokSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6606
+ domain: 'bialystok.se.pl'
6607
+ });
6608
+
6629
6609
 
6630
6610
 
6631
6611
  var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -6788,7 +6768,15 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
6788
6768
  SportSePlExtractor: SportSePlExtractor,
6789
6769
  WwwSePlExtractor: WwwSePlExtractor,
6790
6770
  PolitykaSePlExtractor: PolitykaSePlExtractor,
6791
- SuperserialeSePlExtractor: SuperserialeSePlExtractor
6771
+ SuperserialeSePlExtractor: SuperserialeSePlExtractor,
6772
+ SzczecinSePlExtractor: SzczecinSePlExtractor,
6773
+ SuperbizSePlExtractor: SuperbizSePlExtractor,
6774
+ PortalobronnySePlExtractor: PortalobronnySePlExtractor,
6775
+ PolskisamorzadSePlExtractor: PolskisamorzadSePlExtractor,
6776
+ LodzSePlExtractor: LodzSePlExtractor,
6777
+ WroclawSePlExtractor: WroclawSePlExtractor,
6778
+ LublinSePlExtractor: LublinSePlExtractor,
6779
+ BialystokSePlExtractor: BialystokSePlExtractor
6792
6780
  });
6793
6781
 
6794
6782
  var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {