@jocmp/mercury-parser 2.4.0 → 2.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/mercury.js CHANGED
@@ -5924,45 +5924,6 @@ var WwwGrueneDeExtractor = {
5924
5924
  }
5925
5925
  };
5926
5926
 
5927
- var WwwEngadgetComExtractor = {
5928
- domain: 'www.engadget.com',
5929
- title: {
5930
- selectors: [['meta[name="og:title"]', 'value']]
5931
- },
5932
- author: {
5933
- selectors: ['a.th-meta[data-ylk*="subsec:author"]']
5934
- },
5935
- // Engadget stories have publish dates, but the only representation of them on the page
5936
- // is in a format like "2h ago". There are also these tags with blank values:
5937
- // <meta class="swiftype" name="published_at" data-type="date" value="">
5938
- date_published: {
5939
- selectors: [// enter selectors
5940
- ]
5941
- },
5942
- dek: {
5943
- selectors: ['div[class*="o-title_mark"] div']
5944
- },
5945
- // Engadget stories do have lead images specified by an og:image meta tag, but selecting
5946
- // the value attribute of that tag fails. I believe the "&#x2111;" sequence of characters
5947
- // is triggering this inability to select the attribute value.
5948
- lead_image_url: {
5949
- selectors: [// enter selectors
5950
- ]
5951
- },
5952
- content: {
5953
- selectors: [[// Some figures will be inside div.article-text, but some header figures/images
5954
- // will not.
5955
- '#page_body figure:not(div.article-text figure)', 'div.article-text']],
5956
- // Is there anything in the content you selected that needs transformed
5957
- // before it's consumable content? E.g., unusual lazy loaded images
5958
- transforms: {},
5959
- // Is there anything that is in the result that shouldn't be?
5960
- // The clean selectors will remove anything that matches from
5961
- // the result
5962
- clean: []
5963
- }
5964
- };
5965
-
5966
5927
  var ArstechnicaComExtractor = {
5967
5928
  domain: 'arstechnica.com',
5968
5929
  title: {
@@ -6662,6 +6623,153 @@ var WwwQbitaiComExtractor = {
6662
6623
  }
6663
6624
  };
6664
6625
 
6626
+ var EconomictimesIndiatimesComExtractor = {
6627
+ domain: 'economictimes.indiatimes.com',
6628
+ title: {
6629
+ selectors: ['title', ['meta[name="og:title"]', 'value']]
6630
+ },
6631
+ author: {
6632
+ selectors: ['a[rel="author"]']
6633
+ },
6634
+ lead_image_url: {
6635
+ selectors: [['meta[name="og:image"]', 'value']]
6636
+ },
6637
+ content: {
6638
+ selectors: ['article'],
6639
+ transforms: {},
6640
+ clean: ['span.imgAgency']
6641
+ }
6642
+ };
6643
+
6644
+ var FactorioComExtractor = {
6645
+ domain: 'factorio.com',
6646
+ title: {
6647
+ selectors: ['title']
6648
+ },
6649
+ lead_image_url: {
6650
+ selectors: [['meta[name="og:image"]', 'value']]
6651
+ },
6652
+ content: {
6653
+ selectors: [['.blog-post', 'div:nth-child(2)']],
6654
+ transforms: {
6655
+ h3: function h3(node) {
6656
+ var author = node.find('author');
6657
+
6658
+ if (author.text()) {
6659
+ node.after("<p>".concat(author.text(), "</p>"));
6660
+ author.remove();
6661
+ }
6662
+ }
6663
+ },
6664
+ clean: ['.logo-expansion-space-age']
6665
+ }
6666
+ };
6667
+
6668
+ var WwwTagesschauDeExtractor = {
6669
+ domain: 'www.tagesschau.de',
6670
+ title: {
6671
+ selectors: ['.seitenkopf__headline--text', 'title']
6672
+ },
6673
+ author: {
6674
+ selectors: ['.authorline__author authorline__link:first-child']
6675
+ },
6676
+ date_published: {
6677
+ selectors: [['meta[name="date"]', 'value'], '.metatextline'],
6678
+ timezone: 'UTC'
6679
+ },
6680
+ lead_image_url: {
6681
+ selectors: [['meta[name="og:image"]', 'value']]
6682
+ },
6683
+ content: {
6684
+ selectors: ['article'],
6685
+ clean: ['[data-config]', '.seitenkopf__headline', '.authorline__author', '.metatextline']
6686
+ }
6687
+ };
6688
+
6689
+ var Nineto5googleComExtractor = {
6690
+ domain: '9to5google.com',
6691
+ title: {
6692
+ selectors: ['title', 'h1']
6693
+ },
6694
+ author: {
6695
+ selectors: [['meta[name="author"]', 'value']]
6696
+ },
6697
+ date_published: {
6698
+ selectors: [['meta[name="article:published_time"]', 'value']]
6699
+ },
6700
+ lead_image_url: {
6701
+ selectors: [['meta[name="og:image"]', 'value']]
6702
+ },
6703
+ content: {
6704
+ selectors: ['main'],
6705
+ transforms: {
6706
+ img: function img(node) {
6707
+ node.removeAttr('sizes');
6708
+ }
6709
+ },
6710
+ clean: ['.post-meta']
6711
+ }
6712
+ };
6713
+
6714
+ var WwwEngadgetComExtractor = {
6715
+ domain: 'www.engadget.com',
6716
+ title: {
6717
+ selectors: ['title', 'h1']
6718
+ },
6719
+ author: {
6720
+ selectors: ['.caas-attr-item-author']
6721
+ },
6722
+ date_published: {
6723
+ selectors: [['time', 'datetime']]
6724
+ },
6725
+ lead_image_url: {
6726
+ selectors: [['meta[name="og:image"]', 'value']]
6727
+ },
6728
+ content: {
6729
+ selectors: ['.caas-body'],
6730
+ transforms: {
6731
+ h2: function h2(node) {
6732
+ return node.attr('class', 'mercury-parser-keep');
6733
+ },
6734
+ 'blockquote noscript': function blockquoteNoscript(node) {
6735
+ var iframe = node.find('iframe');
6736
+
6737
+ if (iframe != null) {
6738
+ return 'div';
6739
+ }
6740
+
6741
+ return null;
6742
+ }
6743
+ },
6744
+ clean: []
6745
+ }
6746
+ };
6747
+
6748
+ var TarnkappeInfoExtractor = {
6749
+ domain: 'tarnkappe.info',
6750
+ title: {
6751
+ selectors: ['title', 'h1']
6752
+ },
6753
+ author: {
6754
+ selectors: [['meta[name="author"]', 'value']]
6755
+ },
6756
+ date_published: {
6757
+ selectors: [['meta[name="article:published_time"]', 'value']]
6758
+ },
6759
+ lead_image_url: {
6760
+ selectors: [['meta[name="og:image"]', 'value']]
6761
+ },
6762
+ content: {
6763
+ selectors: ['main'],
6764
+ transforms: {
6765
+ h2: function h2(node) {
6766
+ return node.attr('class', 'mercury-parser-keep');
6767
+ }
6768
+ },
6769
+ clean: ['section#author']
6770
+ }
6771
+ };
6772
+
6665
6773
 
6666
6774
 
6667
6775
  var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -6801,7 +6909,6 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
6801
6909
  PastebinComExtractor: PastebinComExtractor,
6802
6910
  WwwAbendblattDeExtractor: WwwAbendblattDeExtractor,
6803
6911
  WwwGrueneDeExtractor: WwwGrueneDeExtractor,
6804
- WwwEngadgetComExtractor: WwwEngadgetComExtractor,
6805
6912
  ArstechnicaComExtractor: ArstechnicaComExtractor,
6806
6913
  WwwNdtvComExtractor: WwwNdtvComExtractor,
6807
6914
  SpektrumExtractor: SpektrumExtractor,
@@ -6834,7 +6941,13 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
6834
6941
  LublinSePlExtractor: LublinSePlExtractor,
6835
6942
  BialystokSePlExtractor: BialystokSePlExtractor,
6836
6943
  WwwLebensmittelwarnungDeExtractor: WwwLebensmittelwarnungDeExtractor,
6837
- WwwQbitaiComExtractor: WwwQbitaiComExtractor
6944
+ WwwQbitaiComExtractor: WwwQbitaiComExtractor,
6945
+ EconomictimesIndiatimesComExtractor: EconomictimesIndiatimesComExtractor,
6946
+ FactorioComExtractor: FactorioComExtractor,
6947
+ WwwTagesschauDeExtractor: WwwTagesschauDeExtractor,
6948
+ Nineto5googleComExtractor: Nineto5googleComExtractor,
6949
+ WwwEngadgetComExtractor: WwwEngadgetComExtractor,
6950
+ TarnkappeInfoExtractor: TarnkappeInfoExtractor
6838
6951
  });
6839
6952
 
6840
6953
  var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {