@jocmp/mercury-parser 2.3.5 → 2.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/mercury.js CHANGED
@@ -5965,46 +5965,16 @@ var WwwEngadgetComExtractor = {
5965
5965
 
5966
5966
  var ArstechnicaComExtractor = {
5967
5967
  domain: 'arstechnica.com',
5968
- // Articles from this site are often paginated, but I was unable to write a CSS
5969
- // selector to find the next page. On the last page, there will be a link with a CSS
5970
- // selector indicating that the previous page is next. But the parser appears to find
5971
- // the next page without this extractor finding it, as long as the fallback option is
5972
- // left at its default value of true.
5973
5968
  title: {
5974
- selectors: ['title']
5975
- },
5976
- author: {
5977
- selectors: ['*[rel="author"] *[itemprop="name"]']
5978
- },
5979
- date_published: {
5980
- selectors: [['.byline time', 'datetime']]
5981
- },
5982
- dek: {
5983
- selectors: ['h2[itemprop="description"]']
5969
+ selectors: ['title', 'h1']
5984
5970
  },
5985
5971
  lead_image_url: {
5986
5972
  selectors: [['meta[name="og:image"]', 'value']]
5987
5973
  },
5988
5974
  content: {
5989
- selectors: ['div[itemprop="articleBody"]'],
5990
- // Is there anything in the content you selected that needs transformed
5991
- // before it's consumable content? E.g., unusual lazy loaded images
5992
- transforms: {
5993
- h2: function h2($node) {
5994
- // Some pages have an element h2 that is significant, and that the parser will
5995
- // remove if not following a paragraph. Adding this empty paragraph fixes it, and
5996
- // the empty paragraph will be removed anyway.
5997
- $node.before('<p></p>');
5998
- }
5999
- },
6000
- // Is there anything that is in the result that shouldn't be?
6001
- // The clean selectors will remove anything that matches from
6002
- // the result.
6003
- clean: [// Remove enlarge links and separators inside image captions.
6004
- 'figcaption .enlarge-link', 'figcaption .sep', // I could not transform the video into usable elements, so I
6005
- // removed them.
6006
- 'figure.video', // Image galleries that do not work.
6007
- '.gallery', 'aside', '.sidebar']
5975
+ selectors: ['main'],
5976
+ transforms: {},
5977
+ clean: ['.upper-deck__text', '.text-settings-dropdown-story']
6008
5978
  }
6009
5979
  };
6010
5980
 
@@ -6520,32 +6490,6 @@ var WwwNtvDeExtractor = {
6520
6490
  }
6521
6491
  };
6522
6492
 
6523
- var SportSePlExtractor = {
6524
- domain: 'sport.se.pl',
6525
- title: {
6526
- selectors: [['meta[name="og:title"]', 'value']]
6527
- },
6528
- author: {
6529
- selectors: ['.article_author']
6530
- },
6531
- date_published: {
6532
- selectors: ['#timezone'],
6533
- timezone: 'Europe/Warsaw'
6534
- },
6535
- lead_image_url: {
6536
- selectors: [['meta[name="og:image"]', 'value']]
6537
- },
6538
- content: {
6539
- selectors: ['article'],
6540
- transforms: {
6541
- h2: function h2(node) {
6542
- return node.attr('class', 'mercury-parser-keep');
6543
- }
6544
- },
6545
- clean: ['#timezone', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6546
- }
6547
- };
6548
-
6549
6493
  var WwwSePlExtractor = {
6550
6494
  domain: 'www.se.pl',
6551
6495
  title: {
@@ -6568,12 +6512,20 @@ var WwwSePlExtractor = {
6568
6512
  return node.attr('class', 'mercury-parser-keep');
6569
6513
  }
6570
6514
  },
6571
- clean: ['#timezone', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6515
+ clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6572
6516
  }
6573
6517
  };
6574
6518
 
6575
- var PolitykaSePlExtractor = {
6576
- domain: 'polityka.se.pl',
6519
+ var SportSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6520
+ domain: 'sport.se.pl'
6521
+ });
6522
+
6523
+ var PolitykaSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6524
+ domain: 'polityka.se.pl'
6525
+ });
6526
+
6527
+ var SuperserialeSePlExtractor = {
6528
+ domain: 'superseriale.se.pl',
6577
6529
  title: {
6578
6530
  selectors: [['meta[name="og:title"]', 'value']]
6579
6531
  },
@@ -6594,35 +6546,42 @@ var PolitykaSePlExtractor = {
6594
6546
  return node.attr('class', 'mercury-parser-keep');
6595
6547
  }
6596
6548
  },
6597
- clean: ['.article__author__croppimg', // author photo
6549
+ clean: ['#timezone', '.author', '.article__author__croppimg', // author photo
6598
6550
  '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6599
6551
  }
6600
6552
  };
6601
6553
 
6602
- var SuperserialeSePlExtractor = {
6603
- domain: 'superseriale.se.pl',
6554
+ var SzczecinSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6555
+ domain: 'szczecin.se.pl'
6556
+ });
6557
+
6558
+ var SuperbizSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6559
+ domain: 'superbiz.se.pl'
6560
+ });
6561
+
6562
+ var PortalobronnySePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6563
+ domain: 'portalobronny.se.pl'
6564
+ });
6565
+
6566
+ var PolskisamorzadSePlExtractor = {
6567
+ domain: 'polskisamorzad.se.pl',
6604
6568
  title: {
6605
6569
  selectors: [['meta[name="og:title"]', 'value']]
6606
6570
  },
6607
6571
  author: {
6608
- selectors: ['.article_author:first-of-type']
6609
- },
6610
- date_published: {
6611
- selectors: ['#timezone'],
6612
- timezone: 'Europe/Warsaw'
6572
+ selectors: ['.article_author:first-of-type', '.article-author', ['meta[name="og:article:author"]', 'value']]
6613
6573
  },
6614
6574
  lead_image_url: {
6615
6575
  selectors: [['meta[name="og:image"]', 'value']]
6616
6576
  },
6617
6577
  content: {
6618
- selectors: ['article'],
6578
+ selectors: ['.article-single'],
6619
6579
  transforms: {
6620
6580
  h2: function h2(node) {
6621
6581
  return node.attr('class', 'mercury-parser-keep');
6622
6582
  }
6623
6583
  },
6624
- clean: ['#timezone', '.article__author__croppimg', // author photo
6625
- '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6584
+ clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6626
6585
  }
6627
6586
  };
6628
6587
 
@@ -6788,7 +6747,11 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
6788
6747
  SportSePlExtractor: SportSePlExtractor,
6789
6748
  WwwSePlExtractor: WwwSePlExtractor,
6790
6749
  PolitykaSePlExtractor: PolitykaSePlExtractor,
6791
- SuperserialeSePlExtractor: SuperserialeSePlExtractor
6750
+ SuperserialeSePlExtractor: SuperserialeSePlExtractor,
6751
+ SzczecinSePlExtractor: SzczecinSePlExtractor,
6752
+ SuperbizSePlExtractor: SuperbizSePlExtractor,
6753
+ PortalobronnySePlExtractor: PortalobronnySePlExtractor,
6754
+ PolskisamorzadSePlExtractor: PolskisamorzadSePlExtractor
6792
6755
  });
6793
6756
 
6794
6757
  var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {