@jocmp/mercury-parser 2.2.10 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -9
- package/cli.js +3 -3
- package/dist/generate-custom-parser.js +79 -12
- package/dist/generate-custom-parser.js.map +1 -1
- package/dist/mercury.js +81 -12
- package/dist/mercury.js.map +1 -1
- package/dist/mercury.web.js +1 -1
- package/dist/mercury.web.js.map +1 -1
- package/package.json +1 -1
package/dist/mercury.js
CHANGED
|
@@ -362,7 +362,7 @@ var KEEP_CLASS = 'mercury-parser-keep';
|
|
|
362
362
|
var KEEP_SELECTORS = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]']; // A list of tags to strip from the output if we encounter them.
|
|
363
363
|
|
|
364
364
|
var STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes
|
|
365
|
-
var WHITELIST_ATTRS = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
|
|
365
|
+
var WHITELIST_ATTRS = ['src', 'srcset', 'start', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
|
|
366
366
|
var WHITELIST_ATTRS_RE = new RegExp("^(".concat(WHITELIST_ATTRS.join('|'), ")$"), 'i'); // removeEmpty
|
|
367
367
|
|
|
368
368
|
var CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(','); // cleanHeaders
|
|
@@ -1197,7 +1197,7 @@ function cleanTags$$1($article, $) {
|
|
|
1197
1197
|
if (weight < 0) {
|
|
1198
1198
|
$node.remove();
|
|
1199
1199
|
} else {
|
|
1200
|
-
//
|
|
1200
|
+
// determine if node seems like content
|
|
1201
1201
|
removeUnlessContent($node, $, weight);
|
|
1202
1202
|
}
|
|
1203
1203
|
});
|
|
@@ -1207,11 +1207,16 @@ function cleanTags$$1($article, $) {
|
|
|
1207
1207
|
function cleanHeaders($article, $) {
|
|
1208
1208
|
var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
|
|
1209
1209
|
$(HEADER_TAG_LIST, $article).each(function (index, header) {
|
|
1210
|
-
var $header = $(header);
|
|
1210
|
+
var $header = $(header);
|
|
1211
|
+
|
|
1212
|
+
if ($(header).hasClass(KEEP_CLASS)) {
|
|
1213
|
+
return $header;
|
|
1214
|
+
} // Remove any headers that appear before all other p tags in the
|
|
1211
1215
|
// document. This probably means that it was part of the title, a
|
|
1212
1216
|
// subtitle or something else extraneous like a datestamp or byline,
|
|
1213
1217
|
// all of which should be handled by other metadata handling.
|
|
1214
1218
|
|
|
1219
|
+
|
|
1215
1220
|
if ($($header, $article).prevAll('p').length === 0) {
|
|
1216
1221
|
return $header.remove();
|
|
1217
1222
|
} // Remove any headers that match the title exactly.
|
|
@@ -6171,8 +6176,18 @@ var WwwVersantsComExtractor = {
|
|
|
6171
6176
|
selectors: [['meta[name="og:image"]', 'value']]
|
|
6172
6177
|
},
|
|
6173
6178
|
content: {
|
|
6174
|
-
|
|
6175
|
-
|
|
6179
|
+
transforms: {
|
|
6180
|
+
'.featured-image': function featuredImage($node) {
|
|
6181
|
+
$node.addClass('mercury-parser-keep');
|
|
6182
|
+
var figcaption = $node.find('span');
|
|
6183
|
+
$node.find('figure').append(figcaption);
|
|
6184
|
+
}
|
|
6185
|
+
},
|
|
6186
|
+
selectors: ['.article-content'],
|
|
6187
|
+
clean: ['.adv-link', '.versa-target', 'header', // Clean title
|
|
6188
|
+
'.author', // Clean author
|
|
6189
|
+
'.thumbnail-slider' // Remove, the main images will be within the .main-slider div.
|
|
6190
|
+
]
|
|
6176
6191
|
}
|
|
6177
6192
|
};
|
|
6178
6193
|
|
|
@@ -6218,20 +6233,24 @@ var WwwAndroidauthorityComExtractor = {
|
|
|
6218
6233
|
lead_image_url: {
|
|
6219
6234
|
selectors: [['meta[name="og:image"]', 'value']]
|
|
6220
6235
|
},
|
|
6236
|
+
// Some pages have nested header elements that are significant, and that the parser will
|
|
6237
|
+
// remove if not following a paragraph. Adding this empty paragraph fixes it, and
|
|
6238
|
+
// the empty paragraph will be removed anyway.
|
|
6221
6239
|
content: {
|
|
6222
|
-
selectors: ['.d_Dd'],
|
|
6240
|
+
selectors: ['.e_Bc', '.d_Dd'],
|
|
6223
6241
|
transforms: {
|
|
6224
6242
|
ol: function ol(node) {
|
|
6225
6243
|
node.attr('class', 'mercury-parser-keep');
|
|
6226
6244
|
},
|
|
6227
6245
|
h2: function h2($node) {
|
|
6228
|
-
|
|
6229
|
-
|
|
6230
|
-
|
|
6231
|
-
$node.
|
|
6246
|
+
return $node.attr('class', 'mercury-parser-keep');
|
|
6247
|
+
},
|
|
6248
|
+
h3: function h3($node) {
|
|
6249
|
+
return $node.attr('class', 'mercury-parser-keep');
|
|
6232
6250
|
}
|
|
6233
6251
|
},
|
|
6234
|
-
clean: ['.
|
|
6252
|
+
clean: ['.e_Oh', // Polls
|
|
6253
|
+
'picture + div' // Lead image text
|
|
6235
6254
|
]
|
|
6236
6255
|
}
|
|
6237
6256
|
};
|
|
@@ -6329,6 +6348,54 @@ var MobilesyrupComExtractor = {
|
|
|
6329
6348
|
}
|
|
6330
6349
|
};
|
|
6331
6350
|
|
|
6351
|
+
var WwwChannelnewsasiaComExtractor = {
|
|
6352
|
+
domain: 'www.channelnewsasia.com',
|
|
6353
|
+
title: {
|
|
6354
|
+
selectors: [['meta[name="og:title"]', 'value']]
|
|
6355
|
+
},
|
|
6356
|
+
author: {
|
|
6357
|
+
selectors: ['.link--author-profile', ['meta[name="cXenseParse:author"]', 'value']]
|
|
6358
|
+
},
|
|
6359
|
+
date_published: {
|
|
6360
|
+
selectors: ['.article-publish:not(span)'],
|
|
6361
|
+
format: 'DD MMM YYYY HH:mma',
|
|
6362
|
+
timezone: 'Asia/Singapore'
|
|
6363
|
+
},
|
|
6364
|
+
dek: {
|
|
6365
|
+
selectors: ['.content-detail__description']
|
|
6366
|
+
},
|
|
6367
|
+
lead_image_url: {
|
|
6368
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6369
|
+
},
|
|
6370
|
+
content: {
|
|
6371
|
+
selectors: ['section[data-title="Content"]'],
|
|
6372
|
+
transforms: {},
|
|
6373
|
+
clean: []
|
|
6374
|
+
}
|
|
6375
|
+
};
|
|
6376
|
+
|
|
6377
|
+
var WccftechComExtractor = {
|
|
6378
|
+
domain: 'wccftech.com',
|
|
6379
|
+
title: {
|
|
6380
|
+
selectors: [['meta[name="og:title"]', 'value']]
|
|
6381
|
+
},
|
|
6382
|
+
author: {
|
|
6383
|
+
selectors: ['div.meta a:first-of-type']
|
|
6384
|
+
},
|
|
6385
|
+
date_published: {
|
|
6386
|
+
selectors: [['meta[name="pub_date"]', 'value'], ['meta[name="article:published_time"]', 'value']]
|
|
6387
|
+
},
|
|
6388
|
+
lead_image_url: {
|
|
6389
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6390
|
+
},
|
|
6391
|
+
content: {
|
|
6392
|
+
selectors: ['.content'],
|
|
6393
|
+
transforms: {},
|
|
6394
|
+
clean: ['.democracy' // JavaScript polls
|
|
6395
|
+
]
|
|
6396
|
+
}
|
|
6397
|
+
};
|
|
6398
|
+
|
|
6332
6399
|
|
|
6333
6400
|
|
|
6334
6401
|
var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
@@ -6481,7 +6548,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
|
6481
6548
|
TechcrunchComExtractor: TechcrunchComExtractor,
|
|
6482
6549
|
WwwHardwarezoneComSgExtractor: WwwHardwarezoneComSgExtractor,
|
|
6483
6550
|
WwwSpiegelDeExtractor: WwwSpiegelDeExtractor,
|
|
6484
|
-
MobilesyrupComExtractor: MobilesyrupComExtractor
|
|
6551
|
+
MobilesyrupComExtractor: MobilesyrupComExtractor,
|
|
6552
|
+
WwwChannelnewsasiaComExtractor: WwwChannelnewsasiaComExtractor,
|
|
6553
|
+
WccftechComExtractor: WccftechComExtractor
|
|
6485
6554
|
});
|
|
6486
6555
|
|
|
6487
6556
|
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
|