@jocmp/mercury-parser 2.4.0 → 2.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/generate-custom-parser.js +162 -54
- package/dist/generate-custom-parser.js.map +1 -1
- package/dist/mercury.js +154 -41
- package/dist/mercury.js.map +1 -1
- package/dist/mercury.web.js +1 -1
- package/dist/mercury.web.js.map +1 -1
- package/package.json +1 -1
package/dist/mercury.js
CHANGED
|
@@ -5924,45 +5924,6 @@ var WwwGrueneDeExtractor = {
|
|
|
5924
5924
|
}
|
|
5925
5925
|
};
|
|
5926
5926
|
|
|
5927
|
-
var WwwEngadgetComExtractor = {
|
|
5928
|
-
domain: 'www.engadget.com',
|
|
5929
|
-
title: {
|
|
5930
|
-
selectors: [['meta[name="og:title"]', 'value']]
|
|
5931
|
-
},
|
|
5932
|
-
author: {
|
|
5933
|
-
selectors: ['a.th-meta[data-ylk*="subsec:author"]']
|
|
5934
|
-
},
|
|
5935
|
-
// Engadget stories have publish dates, but the only representation of them on the page
|
|
5936
|
-
// is in a format like "2h ago". There are also these tags with blank values:
|
|
5937
|
-
// <meta class="swiftype" name="published_at" data-type="date" value="">
|
|
5938
|
-
date_published: {
|
|
5939
|
-
selectors: [// enter selectors
|
|
5940
|
-
]
|
|
5941
|
-
},
|
|
5942
|
-
dek: {
|
|
5943
|
-
selectors: ['div[class*="o-title_mark"] div']
|
|
5944
|
-
},
|
|
5945
|
-
// Engadget stories do have lead images specified by an og:image meta tag, but selecting
|
|
5946
|
-
// the value attribute of that tag fails. I believe the "&#x2111;" sequence of characters
|
|
5947
|
-
// is triggering this inability to select the attribute value.
|
|
5948
|
-
lead_image_url: {
|
|
5949
|
-
selectors: [// enter selectors
|
|
5950
|
-
]
|
|
5951
|
-
},
|
|
5952
|
-
content: {
|
|
5953
|
-
selectors: [[// Some figures will be inside div.article-text, but some header figures/images
|
|
5954
|
-
// will not.
|
|
5955
|
-
'#page_body figure:not(div.article-text figure)', 'div.article-text']],
|
|
5956
|
-
// Is there anything in the content you selected that needs transformed
|
|
5957
|
-
// before it's consumable content? E.g., unusual lazy loaded images
|
|
5958
|
-
transforms: {},
|
|
5959
|
-
// Is there anything that is in the result that shouldn't be?
|
|
5960
|
-
// The clean selectors will remove anything that matches from
|
|
5961
|
-
// the result
|
|
5962
|
-
clean: []
|
|
5963
|
-
}
|
|
5964
|
-
};
|
|
5965
|
-
|
|
5966
5927
|
var ArstechnicaComExtractor = {
|
|
5967
5928
|
domain: 'arstechnica.com',
|
|
5968
5929
|
title: {
|
|
@@ -6662,6 +6623,153 @@ var WwwQbitaiComExtractor = {
|
|
|
6662
6623
|
}
|
|
6663
6624
|
};
|
|
6664
6625
|
|
|
6626
|
+
var EconomictimesIndiatimesComExtractor = {
|
|
6627
|
+
domain: 'economictimes.indiatimes.com',
|
|
6628
|
+
title: {
|
|
6629
|
+
selectors: ['title', ['meta[name="og:title"]', 'value']]
|
|
6630
|
+
},
|
|
6631
|
+
author: {
|
|
6632
|
+
selectors: ['a[rel="author"]']
|
|
6633
|
+
},
|
|
6634
|
+
lead_image_url: {
|
|
6635
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6636
|
+
},
|
|
6637
|
+
content: {
|
|
6638
|
+
selectors: ['article'],
|
|
6639
|
+
transforms: {},
|
|
6640
|
+
clean: ['span.imgAgency']
|
|
6641
|
+
}
|
|
6642
|
+
};
|
|
6643
|
+
|
|
6644
|
+
var FactorioComExtractor = {
|
|
6645
|
+
domain: 'factorio.com',
|
|
6646
|
+
title: {
|
|
6647
|
+
selectors: ['title']
|
|
6648
|
+
},
|
|
6649
|
+
lead_image_url: {
|
|
6650
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6651
|
+
},
|
|
6652
|
+
content: {
|
|
6653
|
+
selectors: [['.blog-post', 'div:nth-child(2)']],
|
|
6654
|
+
transforms: {
|
|
6655
|
+
h3: function h3(node) {
|
|
6656
|
+
var author = node.find('author');
|
|
6657
|
+
|
|
6658
|
+
if (author.text()) {
|
|
6659
|
+
node.after("<p>".concat(author.text(), "</p>"));
|
|
6660
|
+
author.remove();
|
|
6661
|
+
}
|
|
6662
|
+
}
|
|
6663
|
+
},
|
|
6664
|
+
clean: ['.logo-expansion-space-age']
|
|
6665
|
+
}
|
|
6666
|
+
};
|
|
6667
|
+
|
|
6668
|
+
var WwwTagesschauDeExtractor = {
|
|
6669
|
+
domain: 'www.tagesschau.de',
|
|
6670
|
+
title: {
|
|
6671
|
+
selectors: ['.seitenkopf__headline--text', 'title']
|
|
6672
|
+
},
|
|
6673
|
+
author: {
|
|
6674
|
+
selectors: ['.authorline__author authorline__link:first-child']
|
|
6675
|
+
},
|
|
6676
|
+
date_published: {
|
|
6677
|
+
selectors: [['meta[name="date"]', 'value'], '.metatextline'],
|
|
6678
|
+
timezone: 'UTC'
|
|
6679
|
+
},
|
|
6680
|
+
lead_image_url: {
|
|
6681
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6682
|
+
},
|
|
6683
|
+
content: {
|
|
6684
|
+
selectors: ['article'],
|
|
6685
|
+
clean: ['[data-config]', '.seitenkopf__headline', '.authorline__author', '.metatextline']
|
|
6686
|
+
}
|
|
6687
|
+
};
|
|
6688
|
+
|
|
6689
|
+
var Nineto5googleComExtractor = {
|
|
6690
|
+
domain: '9to5google.com',
|
|
6691
|
+
title: {
|
|
6692
|
+
selectors: ['title', 'h1']
|
|
6693
|
+
},
|
|
6694
|
+
author: {
|
|
6695
|
+
selectors: [['meta[name="author"]', 'value']]
|
|
6696
|
+
},
|
|
6697
|
+
date_published: {
|
|
6698
|
+
selectors: [['meta[name="article:published_time"]', 'value']]
|
|
6699
|
+
},
|
|
6700
|
+
lead_image_url: {
|
|
6701
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6702
|
+
},
|
|
6703
|
+
content: {
|
|
6704
|
+
selectors: ['main'],
|
|
6705
|
+
transforms: {
|
|
6706
|
+
img: function img(node) {
|
|
6707
|
+
node.removeAttr('sizes');
|
|
6708
|
+
}
|
|
6709
|
+
},
|
|
6710
|
+
clean: ['.post-meta']
|
|
6711
|
+
}
|
|
6712
|
+
};
|
|
6713
|
+
|
|
6714
|
+
var WwwEngadgetComExtractor = {
|
|
6715
|
+
domain: 'www.engadget.com',
|
|
6716
|
+
title: {
|
|
6717
|
+
selectors: ['title', 'h1']
|
|
6718
|
+
},
|
|
6719
|
+
author: {
|
|
6720
|
+
selectors: ['.caas-attr-item-author']
|
|
6721
|
+
},
|
|
6722
|
+
date_published: {
|
|
6723
|
+
selectors: [['time', 'datetime']]
|
|
6724
|
+
},
|
|
6725
|
+
lead_image_url: {
|
|
6726
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6727
|
+
},
|
|
6728
|
+
content: {
|
|
6729
|
+
selectors: ['.caas-body'],
|
|
6730
|
+
transforms: {
|
|
6731
|
+
h2: function h2(node) {
|
|
6732
|
+
return node.attr('class', 'mercury-parser-keep');
|
|
6733
|
+
},
|
|
6734
|
+
'blockquote noscript': function blockquoteNoscript(node) {
|
|
6735
|
+
var iframe = node.find('iframe');
|
|
6736
|
+
|
|
6737
|
+
if (iframe != null) {
|
|
6738
|
+
return 'div';
|
|
6739
|
+
}
|
|
6740
|
+
|
|
6741
|
+
return null;
|
|
6742
|
+
}
|
|
6743
|
+
},
|
|
6744
|
+
clean: []
|
|
6745
|
+
}
|
|
6746
|
+
};
|
|
6747
|
+
|
|
6748
|
+
var TarnkappeInfoExtractor = {
|
|
6749
|
+
domain: 'tarnkappe.info',
|
|
6750
|
+
title: {
|
|
6751
|
+
selectors: ['title', 'h1']
|
|
6752
|
+
},
|
|
6753
|
+
author: {
|
|
6754
|
+
selectors: [['meta[name="author"]', 'value']]
|
|
6755
|
+
},
|
|
6756
|
+
date_published: {
|
|
6757
|
+
selectors: [['meta[name="article:published_time"]', 'value']]
|
|
6758
|
+
},
|
|
6759
|
+
lead_image_url: {
|
|
6760
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6761
|
+
},
|
|
6762
|
+
content: {
|
|
6763
|
+
selectors: ['main'],
|
|
6764
|
+
transforms: {
|
|
6765
|
+
h2: function h2(node) {
|
|
6766
|
+
return node.attr('class', 'mercury-parser-keep');
|
|
6767
|
+
}
|
|
6768
|
+
},
|
|
6769
|
+
clean: ['section#author']
|
|
6770
|
+
}
|
|
6771
|
+
};
|
|
6772
|
+
|
|
6665
6773
|
|
|
6666
6774
|
|
|
6667
6775
|
var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
@@ -6801,7 +6909,6 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
|
6801
6909
|
PastebinComExtractor: PastebinComExtractor,
|
|
6802
6910
|
WwwAbendblattDeExtractor: WwwAbendblattDeExtractor,
|
|
6803
6911
|
WwwGrueneDeExtractor: WwwGrueneDeExtractor,
|
|
6804
|
-
WwwEngadgetComExtractor: WwwEngadgetComExtractor,
|
|
6805
6912
|
ArstechnicaComExtractor: ArstechnicaComExtractor,
|
|
6806
6913
|
WwwNdtvComExtractor: WwwNdtvComExtractor,
|
|
6807
6914
|
SpektrumExtractor: SpektrumExtractor,
|
|
@@ -6834,7 +6941,13 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
|
6834
6941
|
LublinSePlExtractor: LublinSePlExtractor,
|
|
6835
6942
|
BialystokSePlExtractor: BialystokSePlExtractor,
|
|
6836
6943
|
WwwLebensmittelwarnungDeExtractor: WwwLebensmittelwarnungDeExtractor,
|
|
6837
|
-
WwwQbitaiComExtractor: WwwQbitaiComExtractor
|
|
6944
|
+
WwwQbitaiComExtractor: WwwQbitaiComExtractor,
|
|
6945
|
+
EconomictimesIndiatimesComExtractor: EconomictimesIndiatimesComExtractor,
|
|
6946
|
+
FactorioComExtractor: FactorioComExtractor,
|
|
6947
|
+
WwwTagesschauDeExtractor: WwwTagesschauDeExtractor,
|
|
6948
|
+
Nineto5googleComExtractor: Nineto5googleComExtractor,
|
|
6949
|
+
WwwEngadgetComExtractor: WwwEngadgetComExtractor,
|
|
6950
|
+
TarnkappeInfoExtractor: TarnkappeInfoExtractor
|
|
6838
6951
|
});
|
|
6839
6952
|
|
|
6840
6953
|
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
|