@jocmp/mercury-parser 2.3.5 → 2.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/generate-custom-parser.js +40 -74
- package/dist/generate-custom-parser.js.map +1 -1
- package/dist/mercury.js +38 -75
- package/dist/mercury.js.map +1 -1
- package/dist/mercury.web.js +1 -1
- package/dist/mercury.web.js.map +1 -1
- package/package.json +1 -1
package/dist/mercury.js
CHANGED
|
@@ -5965,46 +5965,16 @@ var WwwEngadgetComExtractor = {
|
|
|
5965
5965
|
|
|
5966
5966
|
var ArstechnicaComExtractor = {
|
|
5967
5967
|
domain: 'arstechnica.com',
|
|
5968
|
-
// Articles from this site are often paginated, but I was unable to write a CSS
|
|
5969
|
-
// selector to find the next page. On the last page, there will be a link with a CSS
|
|
5970
|
-
// selector indicating that the previous page is next. But the parser appears to find
|
|
5971
|
-
// the next page without this extractor finding it, as long as the fallback option is
|
|
5972
|
-
// left at its default value of true.
|
|
5973
5968
|
title: {
|
|
5974
|
-
selectors: ['title']
|
|
5975
|
-
},
|
|
5976
|
-
author: {
|
|
5977
|
-
selectors: ['*[rel="author"] *[itemprop="name"]']
|
|
5978
|
-
},
|
|
5979
|
-
date_published: {
|
|
5980
|
-
selectors: [['.byline time', 'datetime']]
|
|
5981
|
-
},
|
|
5982
|
-
dek: {
|
|
5983
|
-
selectors: ['h2[itemprop="description"]']
|
|
5969
|
+
selectors: ['title', 'h1']
|
|
5984
5970
|
},
|
|
5985
5971
|
lead_image_url: {
|
|
5986
5972
|
selectors: [['meta[name="og:image"]', 'value']]
|
|
5987
5973
|
},
|
|
5988
5974
|
content: {
|
|
5989
|
-
selectors: ['
|
|
5990
|
-
|
|
5991
|
-
|
|
5992
|
-
transforms: {
|
|
5993
|
-
h2: function h2($node) {
|
|
5994
|
-
// Some pages have an element h2 that is significant, and that the parser will
|
|
5995
|
-
// remove if not following a paragraph. Adding this empty paragraph fixes it, and
|
|
5996
|
-
// the empty paragraph will be removed anyway.
|
|
5997
|
-
$node.before('<p></p>');
|
|
5998
|
-
}
|
|
5999
|
-
},
|
|
6000
|
-
// Is there anything that is in the result that shouldn't be?
|
|
6001
|
-
// The clean selectors will remove anything that matches from
|
|
6002
|
-
// the result.
|
|
6003
|
-
clean: [// Remove enlarge links and separators inside image captions.
|
|
6004
|
-
'figcaption .enlarge-link', 'figcaption .sep', // I could not transform the video into usable elements, so I
|
|
6005
|
-
// removed them.
|
|
6006
|
-
'figure.video', // Image galleries that do not work.
|
|
6007
|
-
'.gallery', 'aside', '.sidebar']
|
|
5975
|
+
selectors: ['main'],
|
|
5976
|
+
transforms: {},
|
|
5977
|
+
clean: ['.upper-deck__text', '.text-settings-dropdown-story']
|
|
6008
5978
|
}
|
|
6009
5979
|
};
|
|
6010
5980
|
|
|
@@ -6520,32 +6490,6 @@ var WwwNtvDeExtractor = {
|
|
|
6520
6490
|
}
|
|
6521
6491
|
};
|
|
6522
6492
|
|
|
6523
|
-
var SportSePlExtractor = {
|
|
6524
|
-
domain: 'sport.se.pl',
|
|
6525
|
-
title: {
|
|
6526
|
-
selectors: [['meta[name="og:title"]', 'value']]
|
|
6527
|
-
},
|
|
6528
|
-
author: {
|
|
6529
|
-
selectors: ['.article_author']
|
|
6530
|
-
},
|
|
6531
|
-
date_published: {
|
|
6532
|
-
selectors: ['#timezone'],
|
|
6533
|
-
timezone: 'Europe/Warsaw'
|
|
6534
|
-
},
|
|
6535
|
-
lead_image_url: {
|
|
6536
|
-
selectors: [['meta[name="og:image"]', 'value']]
|
|
6537
|
-
},
|
|
6538
|
-
content: {
|
|
6539
|
-
selectors: ['article'],
|
|
6540
|
-
transforms: {
|
|
6541
|
-
h2: function h2(node) {
|
|
6542
|
-
return node.attr('class', 'mercury-parser-keep');
|
|
6543
|
-
}
|
|
6544
|
-
},
|
|
6545
|
-
clean: ['#timezone', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6546
|
-
}
|
|
6547
|
-
};
|
|
6548
|
-
|
|
6549
6493
|
var WwwSePlExtractor = {
|
|
6550
6494
|
domain: 'www.se.pl',
|
|
6551
6495
|
title: {
|
|
@@ -6568,12 +6512,20 @@ var WwwSePlExtractor = {
|
|
|
6568
6512
|
return node.attr('class', 'mercury-parser-keep');
|
|
6569
6513
|
}
|
|
6570
6514
|
},
|
|
6571
|
-
clean: ['#timezone', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6515
|
+
clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6572
6516
|
}
|
|
6573
6517
|
};
|
|
6574
6518
|
|
|
6575
|
-
var
|
|
6576
|
-
domain: '
|
|
6519
|
+
var SportSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6520
|
+
domain: 'sport.se.pl'
|
|
6521
|
+
});
|
|
6522
|
+
|
|
6523
|
+
var PolitykaSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6524
|
+
domain: 'polityka.se.pl'
|
|
6525
|
+
});
|
|
6526
|
+
|
|
6527
|
+
var SuperserialeSePlExtractor = {
|
|
6528
|
+
domain: 'superseriale.se.pl',
|
|
6577
6529
|
title: {
|
|
6578
6530
|
selectors: [['meta[name="og:title"]', 'value']]
|
|
6579
6531
|
},
|
|
@@ -6594,35 +6546,42 @@ var PolitykaSePlExtractor = {
|
|
|
6594
6546
|
return node.attr('class', 'mercury-parser-keep');
|
|
6595
6547
|
}
|
|
6596
6548
|
},
|
|
6597
|
-
clean: ['.article__author__croppimg', // author photo
|
|
6549
|
+
clean: ['#timezone', '.author', '.article__author__croppimg', // author photo
|
|
6598
6550
|
'.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6599
6551
|
}
|
|
6600
6552
|
};
|
|
6601
6553
|
|
|
6602
|
-
var
|
|
6603
|
-
domain: '
|
|
6554
|
+
var SzczecinSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6555
|
+
domain: 'szczecin.se.pl'
|
|
6556
|
+
});
|
|
6557
|
+
|
|
6558
|
+
var SuperbizSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6559
|
+
domain: 'superbiz.se.pl'
|
|
6560
|
+
});
|
|
6561
|
+
|
|
6562
|
+
var PortalobronnySePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6563
|
+
domain: 'portalobronny.se.pl'
|
|
6564
|
+
});
|
|
6565
|
+
|
|
6566
|
+
var PolskisamorzadSePlExtractor = {
|
|
6567
|
+
domain: 'polskisamorzad.se.pl',
|
|
6604
6568
|
title: {
|
|
6605
6569
|
selectors: [['meta[name="og:title"]', 'value']]
|
|
6606
6570
|
},
|
|
6607
6571
|
author: {
|
|
6608
|
-
selectors: ['.article_author:first-of-type']
|
|
6609
|
-
},
|
|
6610
|
-
date_published: {
|
|
6611
|
-
selectors: ['#timezone'],
|
|
6612
|
-
timezone: 'Europe/Warsaw'
|
|
6572
|
+
selectors: ['.article_author:first-of-type', '.article-author', ['meta[name="og:article:author"]', 'value']]
|
|
6613
6573
|
},
|
|
6614
6574
|
lead_image_url: {
|
|
6615
6575
|
selectors: [['meta[name="og:image"]', 'value']]
|
|
6616
6576
|
},
|
|
6617
6577
|
content: {
|
|
6618
|
-
selectors: ['article'],
|
|
6578
|
+
selectors: ['.article-single'],
|
|
6619
6579
|
transforms: {
|
|
6620
6580
|
h2: function h2(node) {
|
|
6621
6581
|
return node.attr('class', 'mercury-parser-keep');
|
|
6622
6582
|
}
|
|
6623
6583
|
},
|
|
6624
|
-
clean: ['#timezone', '.article__author__croppimg',
|
|
6625
|
-
'.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6584
|
+
clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6626
6585
|
}
|
|
6627
6586
|
};
|
|
6628
6587
|
|
|
@@ -6788,7 +6747,11 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
|
6788
6747
|
SportSePlExtractor: SportSePlExtractor,
|
|
6789
6748
|
WwwSePlExtractor: WwwSePlExtractor,
|
|
6790
6749
|
PolitykaSePlExtractor: PolitykaSePlExtractor,
|
|
6791
|
-
SuperserialeSePlExtractor: SuperserialeSePlExtractor
|
|
6750
|
+
SuperserialeSePlExtractor: SuperserialeSePlExtractor,
|
|
6751
|
+
SzczecinSePlExtractor: SzczecinSePlExtractor,
|
|
6752
|
+
SuperbizSePlExtractor: SuperbizSePlExtractor,
|
|
6753
|
+
PortalobronnySePlExtractor: PortalobronnySePlExtractor,
|
|
6754
|
+
PolskisamorzadSePlExtractor: PolskisamorzadSePlExtractor
|
|
6792
6755
|
});
|
|
6793
6756
|
|
|
6794
6757
|
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
|