@jocmp/mercury-parser 2.3.5 → 2.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/generate-custom-parser.js +62 -71
- package/dist/generate-custom-parser.js.map +1 -1
- package/dist/mercury.js +60 -72
- package/dist/mercury.js.map +1 -1
- package/dist/mercury.web.js +1 -1
- package/dist/mercury.web.js.map +1 -1
- package/package.json +1 -1
package/dist/mercury.js
CHANGED
|
@@ -5965,46 +5965,21 @@ var WwwEngadgetComExtractor = {
|
|
|
5965
5965
|
|
|
5966
5966
|
var ArstechnicaComExtractor = {
|
|
5967
5967
|
domain: 'arstechnica.com',
|
|
5968
|
-
// Articles from this site are often paginated, but I was unable to write a CSS
|
|
5969
|
-
// selector to find the next page. On the last page, there will be a link with a CSS
|
|
5970
|
-
// selector indicating that the previous page is next. But the parser appears to find
|
|
5971
|
-
// the next page without this extractor finding it, as long as the fallback option is
|
|
5972
|
-
// left at its default value of true.
|
|
5973
5968
|
title: {
|
|
5974
|
-
selectors: ['title']
|
|
5975
|
-
},
|
|
5976
|
-
author: {
|
|
5977
|
-
selectors: ['*[rel="author"] *[itemprop="name"]']
|
|
5978
|
-
},
|
|
5979
|
-
date_published: {
|
|
5980
|
-
selectors: [['.byline time', 'datetime']]
|
|
5981
|
-
},
|
|
5982
|
-
dek: {
|
|
5983
|
-
selectors: ['h2[itemprop="description"]']
|
|
5969
|
+
selectors: ['title', 'h1']
|
|
5984
5970
|
},
|
|
5985
5971
|
lead_image_url: {
|
|
5986
5972
|
selectors: [['meta[name="og:image"]', 'value']]
|
|
5987
5973
|
},
|
|
5988
5974
|
content: {
|
|
5989
|
-
selectors: ['
|
|
5990
|
-
// Is there anything in the content you selected that needs transformed
|
|
5991
|
-
// before it's consumable content? E.g., unusual lazy loaded images
|
|
5975
|
+
selectors: ['.post-content', 'main'],
|
|
5992
5976
|
transforms: {
|
|
5993
|
-
|
|
5994
|
-
|
|
5995
|
-
|
|
5996
|
-
// the empty paragraph will be removed anyway.
|
|
5997
|
-
$node.before('<p></p>');
|
|
5977
|
+
img: function img($node) {
|
|
5978
|
+
$node.removeAttr('width');
|
|
5979
|
+
$node.removeAttr('sizes');
|
|
5998
5980
|
}
|
|
5999
5981
|
},
|
|
6000
|
-
|
|
6001
|
-
// The clean selectors will remove anything that matches from
|
|
6002
|
-
// the result.
|
|
6003
|
-
clean: [// Remove enlarge links and separators inside image captions.
|
|
6004
|
-
'figcaption .enlarge-link', 'figcaption .sep', // I could not transform the video into usable elements, so I
|
|
6005
|
-
// removed them.
|
|
6006
|
-
'figure.video', // Image galleries that do not work.
|
|
6007
|
-
'.gallery', 'aside', '.sidebar']
|
|
5982
|
+
clean: ['header', '.upper-deck__text', '.text-settings-dropdown-story']
|
|
6008
5983
|
}
|
|
6009
5984
|
};
|
|
6010
5985
|
|
|
@@ -6520,32 +6495,6 @@ var WwwNtvDeExtractor = {
|
|
|
6520
6495
|
}
|
|
6521
6496
|
};
|
|
6522
6497
|
|
|
6523
|
-
var SportSePlExtractor = {
|
|
6524
|
-
domain: 'sport.se.pl',
|
|
6525
|
-
title: {
|
|
6526
|
-
selectors: [['meta[name="og:title"]', 'value']]
|
|
6527
|
-
},
|
|
6528
|
-
author: {
|
|
6529
|
-
selectors: ['.article_author']
|
|
6530
|
-
},
|
|
6531
|
-
date_published: {
|
|
6532
|
-
selectors: ['#timezone'],
|
|
6533
|
-
timezone: 'Europe/Warsaw'
|
|
6534
|
-
},
|
|
6535
|
-
lead_image_url: {
|
|
6536
|
-
selectors: [['meta[name="og:image"]', 'value']]
|
|
6537
|
-
},
|
|
6538
|
-
content: {
|
|
6539
|
-
selectors: ['article'],
|
|
6540
|
-
transforms: {
|
|
6541
|
-
h2: function h2(node) {
|
|
6542
|
-
return node.attr('class', 'mercury-parser-keep');
|
|
6543
|
-
}
|
|
6544
|
-
},
|
|
6545
|
-
clean: ['#timezone', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6546
|
-
}
|
|
6547
|
-
};
|
|
6548
|
-
|
|
6549
6498
|
var WwwSePlExtractor = {
|
|
6550
6499
|
domain: 'www.se.pl',
|
|
6551
6500
|
title: {
|
|
@@ -6568,12 +6517,20 @@ var WwwSePlExtractor = {
|
|
|
6568
6517
|
return node.attr('class', 'mercury-parser-keep');
|
|
6569
6518
|
}
|
|
6570
6519
|
},
|
|
6571
|
-
clean: ['#timezone', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6520
|
+
clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6572
6521
|
}
|
|
6573
6522
|
};
|
|
6574
6523
|
|
|
6575
|
-
var PolitykaSePlExtractor = {
|
|
6576
|
-
domain: 'polityka.se.pl',
|
|
6524
|
+
var SportSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6525
|
+
domain: 'sport.se.pl'
|
|
6526
|
+
});
|
|
6527
|
+
|
|
6528
|
+
var PolitykaSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6529
|
+
domain: 'polityka.se.pl'
|
|
6530
|
+
});
|
|
6531
|
+
|
|
6532
|
+
var SuperserialeSePlExtractor = {
|
|
6533
|
+
domain: 'superseriale.se.pl',
|
|
6577
6534
|
title: {
|
|
6578
6535
|
selectors: [['meta[name="og:title"]', 'value']]
|
|
6579
6536
|
},
|
|
@@ -6594,38 +6551,61 @@ var PolitykaSePlExtractor = {
|
|
|
6594
6551
|
return node.attr('class', 'mercury-parser-keep');
|
|
6595
6552
|
}
|
|
6596
6553
|
},
|
|
6597
|
-
clean: ['.article__author__croppimg', // author photo
|
|
6554
|
+
clean: ['#timezone', '.author', '.article__author__croppimg', // author photo
|
|
6598
6555
|
'.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6599
6556
|
}
|
|
6600
6557
|
};
|
|
6601
6558
|
|
|
6602
|
-
var SuperserialeSePlExtractor = {
|
|
6603
|
-
domain: 'superseriale.se.pl',
|
|
6559
|
+
var SzczecinSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6560
|
+
domain: 'szczecin.se.pl'
|
|
6561
|
+
});
|
|
6562
|
+
|
|
6563
|
+
var SuperbizSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6564
|
+
domain: 'superbiz.se.pl'
|
|
6565
|
+
});
|
|
6566
|
+
|
|
6567
|
+
var PortalobronnySePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6568
|
+
domain: 'portalobronny.se.pl'
|
|
6569
|
+
});
|
|
6570
|
+
|
|
6571
|
+
var PolskisamorzadSePlExtractor = {
|
|
6572
|
+
domain: 'polskisamorzad.se.pl',
|
|
6604
6573
|
title: {
|
|
6605
6574
|
selectors: [['meta[name="og:title"]', 'value']]
|
|
6606
6575
|
},
|
|
6607
6576
|
author: {
|
|
6608
|
-
selectors: ['.article_author:first-of-type']
|
|
6609
|
-
},
|
|
6610
|
-
date_published: {
|
|
6611
|
-
selectors: ['#timezone'],
|
|
6612
|
-
timezone: 'Europe/Warsaw'
|
|
6577
|
+
selectors: ['.article_author:first-of-type', '.article-author', ['meta[name="og:article:author"]', 'value']]
|
|
6613
6578
|
},
|
|
6614
6579
|
lead_image_url: {
|
|
6615
6580
|
selectors: [['meta[name="og:image"]', 'value']]
|
|
6616
6581
|
},
|
|
6617
6582
|
content: {
|
|
6618
|
-
selectors: ['article'],
|
|
6583
|
+
selectors: ['.article-single'],
|
|
6619
6584
|
transforms: {
|
|
6620
6585
|
h2: function h2(node) {
|
|
6621
6586
|
return node.attr('class', 'mercury-parser-keep');
|
|
6622
6587
|
}
|
|
6623
6588
|
},
|
|
6624
|
-
clean: ['#timezone', '.article__author__croppimg',
|
|
6625
|
-
'.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6589
|
+
clean: ['#timezone', '.author', '.article__author__croppimg', '.article_authors_with_thumbnail', '.related_articles__elements', '.gl_plugin.socials', '.gl_plugin.player', '.gl_plugin.video_player', '.gl_plugin + video']
|
|
6626
6590
|
}
|
|
6627
6591
|
};
|
|
6628
6592
|
|
|
6593
|
+
var LodzSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6594
|
+
domain: 'lodz.se.pl'
|
|
6595
|
+
});
|
|
6596
|
+
|
|
6597
|
+
var WroclawSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6598
|
+
domain: 'wroclaw.se.pl'
|
|
6599
|
+
});
|
|
6600
|
+
|
|
6601
|
+
var LublinSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6602
|
+
domain: 'lublin.se.pl'
|
|
6603
|
+
});
|
|
6604
|
+
|
|
6605
|
+
var BialystokSePlExtractor = _objectSpread({}, WwwSePlExtractor, {
|
|
6606
|
+
domain: 'bialystok.se.pl'
|
|
6607
|
+
});
|
|
6608
|
+
|
|
6629
6609
|
|
|
6630
6610
|
|
|
6631
6611
|
var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
@@ -6788,7 +6768,15 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
|
6788
6768
|
SportSePlExtractor: SportSePlExtractor,
|
|
6789
6769
|
WwwSePlExtractor: WwwSePlExtractor,
|
|
6790
6770
|
PolitykaSePlExtractor: PolitykaSePlExtractor,
|
|
6791
|
-
SuperserialeSePlExtractor: SuperserialeSePlExtractor
|
|
6771
|
+
SuperserialeSePlExtractor: SuperserialeSePlExtractor,
|
|
6772
|
+
SzczecinSePlExtractor: SzczecinSePlExtractor,
|
|
6773
|
+
SuperbizSePlExtractor: SuperbizSePlExtractor,
|
|
6774
|
+
PortalobronnySePlExtractor: PortalobronnySePlExtractor,
|
|
6775
|
+
PolskisamorzadSePlExtractor: PolskisamorzadSePlExtractor,
|
|
6776
|
+
LodzSePlExtractor: LodzSePlExtractor,
|
|
6777
|
+
WroclawSePlExtractor: WroclawSePlExtractor,
|
|
6778
|
+
LublinSePlExtractor: LublinSePlExtractor,
|
|
6779
|
+
BialystokSePlExtractor: BialystokSePlExtractor
|
|
6792
6780
|
});
|
|
6793
6781
|
|
|
6794
6782
|
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
|