@jocmp/mercury-parser 2.3.4 → 2.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/mercury.js CHANGED
@@ -5965,46 +5965,16 @@ var WwwEngadgetComExtractor = {
5965
5965
 
5966
5966
  var ArstechnicaComExtractor = {
5967
5967
  domain: 'arstechnica.com',
5968
- // Articles from this site are often paginated, but I was unable to write a CSS
5969
- // selector to find the next page. On the last page, there will be a link with a CSS
5970
- // selector indicating that the previous page is next. But the parser appears to find
5971
- // the next page without this extractor finding it, as long as the fallback option is
5972
- // left at its default value of true.
5973
5968
  title: {
5974
- selectors: ['title']
5975
- },
5976
- author: {
5977
- selectors: ['*[rel="author"] *[itemprop="name"]']
5978
- },
5979
- date_published: {
5980
- selectors: [['.byline time', 'datetime']]
5981
- },
5982
- dek: {
5983
- selectors: ['h2[itemprop="description"]']
5969
+ selectors: ['title', 'h1']
5984
5970
  },
5985
5971
  lead_image_url: {
5986
5972
  selectors: [['meta[name="og:image"]', 'value']]
5987
5973
  },
5988
5974
  content: {
5989
- selectors: ['div[itemprop="articleBody"]'],
5990
- // Is there anything in the content you selected that needs transformed
5991
- // before it's consumable content? E.g., unusual lazy loaded images
5992
- transforms: {
5993
- h2: function h2($node) {
5994
- // Some pages have an element h2 that is significant, and that the parser will
5995
- // remove if not following a paragraph. Adding this empty paragraph fixes it, and
5996
- // the empty paragraph will be removed anyway.
5997
- $node.before('<p></p>');
5998
- }
5999
- },
6000
- // Is there anything that is in the result that shouldn't be?
6001
- // The clean selectors will remove anything that matches from
6002
- // the result.
6003
- clean: [// Remove enlarge links and separators inside image captions.
6004
- 'figcaption .enlarge-link', 'figcaption .sep', // I could not transform the video into usable elements, so I
6005
- // removed them.
6006
- 'figure.video', // Image galleries that do not work.
6007
- '.gallery', 'aside', '.sidebar']
5975
+ selectors: ['main'],
5976
+ transforms: {},
5977
+ clean: ['.upper-deck__text', '.text-settings-dropdown-story']
6008
5978
  }
6009
5979
  };
6010
5980
 
@@ -6453,20 +6423,20 @@ var WwwHeiseDeExtractor = {
6453
6423
  return $node.attr('class', 'mercury-parser-keep');
6454
6424
  }
6455
6425
  },
6456
- clean: []
6426
+ clean: ['.ad-mobile-group-1', '.branding', '[data-component="RecommendationBox"]']
6457
6427
  }
6458
6428
  };
6459
6429
 
6460
6430
  var TldrTechExtractor = {
6461
6431
  domain: 'tldr.tech',
6462
6432
  title: {
6463
- selectors: [['meta[name="og:title"]', 'value'], 'title']
6433
+ selectors: ['h1']
6464
6434
  },
6465
6435
  lead_image_url: {
6466
6436
  selectors: [['meta[name="twitter:image"]', 'value']]
6467
6437
  },
6468
6438
  content: {
6469
- selectors: ['body'],
6439
+ selectors: ['.content-center', 'body'],
6470
6440
  transforms: {
6471
6441
  h2: function h2($node) {
6472
6442
  return $node.attr('class', 'mercury-parser-keep');
@@ -6479,6 +6449,142 @@ var TldrTechExtractor = {
6479
6449
  }
6480
6450
  };
6481
6451
 
6452
+ var BskyAppExtractor = {
6453
+ domain: 'bsky.app',
6454
+ title: {
6455
+ selectors: [['meta[name="og:title"]', 'value']]
6456
+ },
6457
+ author: null,
6458
+ date_published: null,
6459
+ lead_image_url: {
6460
+ selectors: [['meta[property="og:image"]', 'content'], ['meta[name="og:image"]', 'value']]
6461
+ },
6462
+ content: {
6463
+ selectors: ['noscript'],
6464
+ transforms: {
6465
+ noscript: function noscript($node, $) {
6466
+ var innerHtml = $.browser ? $node.text() : $node.html();
6467
+ var summary = $(innerHtml).find('#bsky_post_text');
6468
+ $node.replaceWith(summary.html());
6469
+ }
6470
+ },
6471
+ clean: []
6472
+ }
6473
+ };
6474
+
6475
+ var WwwNtvDeExtractor = {
6476
+ domain: 'www.n-tv.de',
6477
+ title: {
6478
+ selectors: [['meta[name="og:title"]', 'value']]
6479
+ },
6480
+ date_published: {
6481
+ selectors: [['meta[name="date"]', 'value']]
6482
+ },
6483
+ lead_image_url: {
6484
+ selectors: [['meta[name="og:image"]', 'value']]
6485
+ },
6486
+ content: {
6487
+ selectors: ['.article__text', 'article'],
6488
+ transforms: {},
6489
+ clean: ['.article__share-main']
6490
+ }
6491
+ };
6492
+
6493
+ var WwwSePlExtractor = {
6494
+ domain: 'www.se.pl',
6495
+ title: {
6496
+ selectors: [['meta[name="og:title"]', 'value']]
6497
+ },
6498
+ author: {
6499
+ selectors: ['.article_author:first-of-type']
6500
+ },
6501
+ date_published: {
6502
+ selectors: ['#timezone'],
6503
+ timezone: 'Europe/Warsaw'
6504
+ },
6505
+ lead_image_url: {
6506
+ selectors: [['meta[name="og:image"]', 'value']]
6507
+ },
6508
+ content: {
6509
+ selectors: ['article'],
6510
+ transforms: {
6511
+ h2: function h2(node) {
6512
+ return node.attr('class', 'mercury-parser-keep');
6513
+ }
6514
+ },
6515
+ clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6516
+ }
6517
+ };
6518
+
6519
+ var SportSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6520
+ domain: 'sport.se.pl'
6521
+ });
6522
+
6523
+ var PolitykaSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6524
+ domain: 'polityka.se.pl'
6525
+ });
6526
+
6527
+ var SuperserialeSePlExtractor = {
6528
+ domain: 'superseriale.se.pl',
6529
+ title: {
6530
+ selectors: [['meta[name="og:title"]', 'value']]
6531
+ },
6532
+ author: {
6533
+ selectors: ['.article_author:first-of-type']
6534
+ },
6535
+ date_published: {
6536
+ selectors: ['#timezone'],
6537
+ timezone: 'Europe/Warsaw'
6538
+ },
6539
+ lead_image_url: {
6540
+ selectors: [['meta[name="og:image"]', 'value']]
6541
+ },
6542
+ content: {
6543
+ selectors: ['article'],
6544
+ transforms: {
6545
+ h2: function h2(node) {
6546
+ return node.attr('class', 'mercury-parser-keep');
6547
+ }
6548
+ },
6549
+ clean: ['#timezone', '.author', '.article__author__croppimg', // author photo
6550
+ '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6551
+ }
6552
+ };
6553
+
6554
+ var SzczecinSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6555
+ domain: 'szczecin.se.pl'
6556
+ });
6557
+
6558
+ var SuperbizSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6559
+ domain: 'superbiz.se.pl'
6560
+ });
6561
+
6562
+ var PortalobronnySePlExtractor = _objectSpread({}, WwwSePlExtractor, {
6563
+ domain: 'portalobronny.se.pl'
6564
+ });
6565
+
6566
+ var PolskisamorzadSePlExtractor = {
6567
+ domain: 'polskisamorzad.se.pl',
6568
+ title: {
6569
+ selectors: [['meta[name="og:title"]', 'value']]
6570
+ },
6571
+ author: {
6572
+ selectors: ['.article_author:first-of-type', '.article-author', ['meta[name="og:article:author"]', 'value']]
6573
+ },
6574
+ lead_image_url: {
6575
+ selectors: [['meta[name="og:image"]', 'value']]
6576
+ },
6577
+ content: {
6578
+ selectors: ['.article-single'],
6579
+ transforms: {
6580
+ h2: function h2(node) {
6581
+ return node.attr('class', 'mercury-parser-keep');
6582
+ }
6583
+ },
6584
+ clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
6585
+ }
6586
+ };
6587
+
6482
6588
 
6483
6589
 
6484
6590
  var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -6635,7 +6741,17 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
6635
6741
  WwwChannelnewsasiaComExtractor: WwwChannelnewsasiaComExtractor,
6636
6742
  WccftechComExtractor: WccftechComExtractor,
6637
6743
  WwwHeiseDeExtractor: WwwHeiseDeExtractor,
6638
- TldrTechExtractor: TldrTechExtractor
6744
+ TldrTechExtractor: TldrTechExtractor,
6745
+ BskyAppExtractor: BskyAppExtractor,
6746
+ WwwNtvDeExtractor: WwwNtvDeExtractor,
6747
+ SportSePlExtractor: SportSePlExtractor,
6748
+ WwwSePlExtractor: WwwSePlExtractor,
6749
+ PolitykaSePlExtractor: PolitykaSePlExtractor,
6750
+ SuperserialeSePlExtractor: SuperserialeSePlExtractor,
6751
+ SzczecinSePlExtractor: SzczecinSePlExtractor,
6752
+ SuperbizSePlExtractor: SuperbizSePlExtractor,
6753
+ PortalobronnySePlExtractor: PortalobronnySePlExtractor,
6754
+ PolskisamorzadSePlExtractor: PolskisamorzadSePlExtractor
6639
6755
  });
6640
6756
 
6641
6757
  var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {