@jocmp/mercury-parser 2.3.4 → 2.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/generate-custom-parser.js +151 -38
- package/dist/generate-custom-parser.js.map +1 -1
- package/dist/mercury.js +154 -38
- package/dist/mercury.js.map +1 -1
- package/dist/mercury.web.js +1 -1
- package/dist/mercury.web.js.map +1 -1
- package/package.json +1 -2
package/dist/mercury.js
CHANGED
|
@@ -5965,46 +5965,16 @@ var WwwEngadgetComExtractor = {
|
|
|
5965
5965
|
|
|
5966
5966
|
var ArstechnicaComExtractor = {
|
|
5967
5967
|
domain: 'arstechnica.com',
|
|
5968
|
-
// Articles from this site are often paginated, but I was unable to write a CSS
|
|
5969
|
-
// selector to find the next page. On the last page, there will be a link with a CSS
|
|
5970
|
-
// selector indicating that the previous page is next. But the parser appears to find
|
|
5971
|
-
// the next page without this extractor finding it, as long as the fallback option is
|
|
5972
|
-
// left at its default value of true.
|
|
5973
5968
|
title: {
|
|
5974
|
-
selectors: ['title']
|
|
5975
|
-
},
|
|
5976
|
-
author: {
|
|
5977
|
-
selectors: ['*[rel="author"] *[itemprop="name"]']
|
|
5978
|
-
},
|
|
5979
|
-
date_published: {
|
|
5980
|
-
selectors: [['.byline time', 'datetime']]
|
|
5981
|
-
},
|
|
5982
|
-
dek: {
|
|
5983
|
-
selectors: ['h2[itemprop="description"]']
|
|
5969
|
+
selectors: ['title', 'h1']
|
|
5984
5970
|
},
|
|
5985
5971
|
lead_image_url: {
|
|
5986
5972
|
selectors: [['meta[name="og:image"]', 'value']]
|
|
5987
5973
|
},
|
|
5988
5974
|
content: {
|
|
5989
|
-
selectors: ['
|
|
5990
|
-
|
|
5991
|
-
|
|
5992
|
-
transforms: {
|
|
5993
|
-
h2: function h2($node) {
|
|
5994
|
-
// Some pages have an element h2 that is significant, and that the parser will
|
|
5995
|
-
// remove if not following a paragraph. Adding this empty paragraph fixes it, and
|
|
5996
|
-
// the empty paragraph will be removed anyway.
|
|
5997
|
-
$node.before('<p></p>');
|
|
5998
|
-
}
|
|
5999
|
-
},
|
|
6000
|
-
// Is there anything that is in the result that shouldn't be?
|
|
6001
|
-
// The clean selectors will remove anything that matches from
|
|
6002
|
-
// the result.
|
|
6003
|
-
clean: [// Remove enlarge links and separators inside image captions.
|
|
6004
|
-
'figcaption .enlarge-link', 'figcaption .sep', // I could not transform the video into usable elements, so I
|
|
6005
|
-
// removed them.
|
|
6006
|
-
'figure.video', // Image galleries that do not work.
|
|
6007
|
-
'.gallery', 'aside', '.sidebar']
|
|
5975
|
+
selectors: ['main'],
|
|
5976
|
+
transforms: {},
|
|
5977
|
+
clean: ['.upper-deck__text', '.text-settings-dropdown-story']
|
|
6008
5978
|
}
|
|
6009
5979
|
};
|
|
6010
5980
|
|
|
@@ -6453,20 +6423,20 @@ var WwwHeiseDeExtractor = {
|
|
|
6453
6423
|
return $node.attr('class', 'mercury-parser-keep');
|
|
6454
6424
|
}
|
|
6455
6425
|
},
|
|
6456
|
-
clean: []
|
|
6426
|
+
clean: ['.ad-mobile-group-1', '.branding', '[data-component="RecommendationBox"]']
|
|
6457
6427
|
}
|
|
6458
6428
|
};
|
|
6459
6429
|
|
|
6460
6430
|
var TldrTechExtractor = {
|
|
6461
6431
|
domain: 'tldr.tech',
|
|
6462
6432
|
title: {
|
|
6463
|
-
selectors: [
|
|
6433
|
+
selectors: ['h1']
|
|
6464
6434
|
},
|
|
6465
6435
|
lead_image_url: {
|
|
6466
6436
|
selectors: [['meta[name="twitter:image"]', 'value']]
|
|
6467
6437
|
},
|
|
6468
6438
|
content: {
|
|
6469
|
-
selectors: ['body'],
|
|
6439
|
+
selectors: ['.content-center', 'body'],
|
|
6470
6440
|
transforms: {
|
|
6471
6441
|
h2: function h2($node) {
|
|
6472
6442
|
return $node.attr('class', 'mercury-parser-keep');
|
|
@@ -6479,6 +6449,142 @@ var TldrTechExtractor = {
|
|
|
6479
6449
|
}
|
|
6480
6450
|
};
|
|
6481
6451
|
|
|
6452
|
+
var BskyAppExtractor = {
|
|
6453
|
+
domain: 'bsky.app',
|
|
6454
|
+
title: {
|
|
6455
|
+
selectors: [['meta[name="og:title"]', 'value']]
|
|
6456
|
+
},
|
|
6457
|
+
author: null,
|
|
6458
|
+
date_published: null,
|
|
6459
|
+
lead_image_url: {
|
|
6460
|
+
selectors: [['meta[property="og:image"]', 'content'], ['meta[name="og:image"]', 'value']]
|
|
6461
|
+
},
|
|
6462
|
+
content: {
|
|
6463
|
+
selectors: ['noscript'],
|
|
6464
|
+
transforms: {
|
|
6465
|
+
noscript: function noscript($node, $) {
|
|
6466
|
+
var innerHtml = $.browser ? $node.text() : $node.html();
|
|
6467
|
+
var summary = $(innerHtml).find('#bsky_post_text');
|
|
6468
|
+
$node.replaceWith(summary.html());
|
|
6469
|
+
}
|
|
6470
|
+
},
|
|
6471
|
+
clean: []
|
|
6472
|
+
}
|
|
6473
|
+
};
|
|
6474
|
+
|
|
6475
|
+
var WwwNtvDeExtractor = {
|
|
6476
|
+
domain: 'www.n-tv.de',
|
|
6477
|
+
title: {
|
|
6478
|
+
selectors: [['meta[name="og:title"]', 'value']]
|
|
6479
|
+
},
|
|
6480
|
+
date_published: {
|
|
6481
|
+
selectors: [['meta[name="date"]', 'value']]
|
|
6482
|
+
},
|
|
6483
|
+
lead_image_url: {
|
|
6484
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6485
|
+
},
|
|
6486
|
+
content: {
|
|
6487
|
+
selectors: ['.article__text', 'article'],
|
|
6488
|
+
transforms: {},
|
|
6489
|
+
clean: ['.article__share-main']
|
|
6490
|
+
}
|
|
6491
|
+
};
|
|
6492
|
+
|
|
6493
|
+
var WwwSePlExtractor = {
|
|
6494
|
+
domain: 'www.se.pl',
|
|
6495
|
+
title: {
|
|
6496
|
+
selectors: [['meta[name="og:title"]', 'value']]
|
|
6497
|
+
},
|
|
6498
|
+
author: {
|
|
6499
|
+
selectors: ['.article_author:first-of-type']
|
|
6500
|
+
},
|
|
6501
|
+
date_published: {
|
|
6502
|
+
selectors: ['#timezone'],
|
|
6503
|
+
timezone: 'Europe/Warsaw'
|
|
6504
|
+
},
|
|
6505
|
+
lead_image_url: {
|
|
6506
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6507
|
+
},
|
|
6508
|
+
content: {
|
|
6509
|
+
selectors: ['article'],
|
|
6510
|
+
transforms: {
|
|
6511
|
+
h2: function h2(node) {
|
|
6512
|
+
return node.attr('class', 'mercury-parser-keep');
|
|
6513
|
+
}
|
|
6514
|
+
},
|
|
6515
|
+
clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6516
|
+
}
|
|
6517
|
+
};
|
|
6518
|
+
|
|
6519
|
+
var SportSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6520
|
+
domain: 'sport.se.pl'
|
|
6521
|
+
});
|
|
6522
|
+
|
|
6523
|
+
var PolitykaSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6524
|
+
domain: 'polityka.se.pl'
|
|
6525
|
+
});
|
|
6526
|
+
|
|
6527
|
+
var SuperserialeSePlExtractor = {
|
|
6528
|
+
domain: 'superseriale.se.pl',
|
|
6529
|
+
title: {
|
|
6530
|
+
selectors: [['meta[name="og:title"]', 'value']]
|
|
6531
|
+
},
|
|
6532
|
+
author: {
|
|
6533
|
+
selectors: ['.article_author:first-of-type']
|
|
6534
|
+
},
|
|
6535
|
+
date_published: {
|
|
6536
|
+
selectors: ['#timezone'],
|
|
6537
|
+
timezone: 'Europe/Warsaw'
|
|
6538
|
+
},
|
|
6539
|
+
lead_image_url: {
|
|
6540
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6541
|
+
},
|
|
6542
|
+
content: {
|
|
6543
|
+
selectors: ['article'],
|
|
6544
|
+
transforms: {
|
|
6545
|
+
h2: function h2(node) {
|
|
6546
|
+
return node.attr('class', 'mercury-parser-keep');
|
|
6547
|
+
}
|
|
6548
|
+
},
|
|
6549
|
+
clean: ['#timezone', '.author', '.article__author__croppimg', // author photo
|
|
6550
|
+
'.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6551
|
+
}
|
|
6552
|
+
};
|
|
6553
|
+
|
|
6554
|
+
var SzczecinSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6555
|
+
domain: 'szczecin.se.pl'
|
|
6556
|
+
});
|
|
6557
|
+
|
|
6558
|
+
var SuperbizSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6559
|
+
domain: 'superbiz.se.pl'
|
|
6560
|
+
});
|
|
6561
|
+
|
|
6562
|
+
var PortalobronnySePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6563
|
+
domain: 'portalobronny.se.pl'
|
|
6564
|
+
});
|
|
6565
|
+
|
|
6566
|
+
var PolskisamorzadSePlExtractor = {
|
|
6567
|
+
domain: 'polskisamorzad.se.pl',
|
|
6568
|
+
title: {
|
|
6569
|
+
selectors: [['meta[name="og:title"]', 'value']]
|
|
6570
|
+
},
|
|
6571
|
+
author: {
|
|
6572
|
+
selectors: ['.article_author:first-of-type', '.article-author', ['meta[name="og:article:author"]', 'value']]
|
|
6573
|
+
},
|
|
6574
|
+
lead_image_url: {
|
|
6575
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6576
|
+
},
|
|
6577
|
+
content: {
|
|
6578
|
+
selectors: ['.article-single'],
|
|
6579
|
+
transforms: {
|
|
6580
|
+
h2: function h2(node) {
|
|
6581
|
+
return node.attr('class', 'mercury-parser-keep');
|
|
6582
|
+
}
|
|
6583
|
+
},
|
|
6584
|
+
clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6585
|
+
}
|
|
6586
|
+
};
|
|
6587
|
+
|
|
6482
6588
|
|
|
6483
6589
|
|
|
6484
6590
|
var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
@@ -6635,7 +6741,17 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
|
6635
6741
|
WwwChannelnewsasiaComExtractor: WwwChannelnewsasiaComExtractor,
|
|
6636
6742
|
WccftechComExtractor: WccftechComExtractor,
|
|
6637
6743
|
WwwHeiseDeExtractor: WwwHeiseDeExtractor,
|
|
6638
|
-
TldrTechExtractor: TldrTechExtractor
|
|
6744
|
+
TldrTechExtractor: TldrTechExtractor,
|
|
6745
|
+
BskyAppExtractor: BskyAppExtractor,
|
|
6746
|
+
WwwNtvDeExtractor: WwwNtvDeExtractor,
|
|
6747
|
+
SportSePlExtractor: SportSePlExtractor,
|
|
6748
|
+
WwwSePlExtractor: WwwSePlExtractor,
|
|
6749
|
+
PolitykaSePlExtractor: PolitykaSePlExtractor,
|
|
6750
|
+
SuperserialeSePlExtractor: SuperserialeSePlExtractor,
|
|
6751
|
+
SzczecinSePlExtractor: SzczecinSePlExtractor,
|
|
6752
|
+
SuperbizSePlExtractor: SuperbizSePlExtractor,
|
|
6753
|
+
PortalobronnySePlExtractor: PortalobronnySePlExtractor,
|
|
6754
|
+
PolskisamorzadSePlExtractor: PolskisamorzadSePlExtractor
|
|
6639
6755
|
});
|
|
6640
6756
|
|
|
6641
6757
|
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
|