@jocmp/mercury-parser 2.2.4 → 2.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/dist/generate-custom-parser.js +157 -63
- package/dist/generate-custom-parser.js.map +1 -1
- package/dist/mercury.js +37 -3
- package/dist/mercury.js.map +1 -1
- package/dist/mercury.web.js +1 -1
- package/dist/mercury.web.js.map +1 -1
- package/package.json +5 -5
package/README.md
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
# Mercury Parser - Extracting content from chaos
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
<a href="https://www.npmjs.com/package/@jocmp/mercury-parser">
|
|
4
|
+
<img src="https://img.shields.io/npm/v/@jocmp/mercury-parser.svg" alt="npm version">
|
|
5
|
+
</a>
|
|
6
|
+
<a href="https://github.com/jocmp/mercury-parser/actions/workflows/ci.yml">
|
|
7
|
+
<img src="https://github.com/jocmp/mercury-parser/actions/workflows/ci.yml/badge.svg" alt="CI">
|
|
8
|
+
</a>
|
|
4
9
|
|
|
5
10
|
Mercury Parser extracts the bits that humans care about from any URL you give it. That includes article content, titles, authors, published dates, excerpts, lead images, and more.
|
|
6
11
|
|
|
@@ -5,7 +5,7 @@ function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'defau
|
|
|
5
5
|
var _slicedToArray = _interopDefault(require('@babel/runtime-corejs2/helpers/slicedToArray'));
|
|
6
6
|
var _toConsumableArray = _interopDefault(require('@babel/runtime-corejs2/helpers/toConsumableArray'));
|
|
7
7
|
var fs = _interopDefault(require('fs'));
|
|
8
|
-
var URL = _interopDefault(require('url'));
|
|
8
|
+
var URL$1 = _interopDefault(require('url'));
|
|
9
9
|
var inquirer = _interopDefault(require('inquirer'));
|
|
10
10
|
var ora = _interopDefault(require('ora'));
|
|
11
11
|
var child_process = require('child_process');
|
|
@@ -29,7 +29,6 @@ var postmanRequest = _interopDefault(require('postman-request'));
|
|
|
29
29
|
var assign = _interopDefault(require('@babel/runtime-corejs2/core-js/object/assign'));
|
|
30
30
|
var keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys'));
|
|
31
31
|
var stringDirection = _interopDefault(require('string-direction'));
|
|
32
|
-
var validUrl = _interopDefault(require('valid-url'));
|
|
33
32
|
var momentTimezone = _interopDefault(require('moment-timezone'));
|
|
34
33
|
var momentParseformat = _interopDefault(require('moment-parseformat'));
|
|
35
34
|
var wuzzy = _interopDefault(require('wuzzy'));
|
|
@@ -77,7 +76,7 @@ function absolutize($, rootUrl, attr) {
|
|
|
77
76
|
var attrs = getAttrs(node);
|
|
78
77
|
var url = attrs[attr];
|
|
79
78
|
if (!url) return;
|
|
80
|
-
var absoluteUrl = URL.resolve(baseUrl || rootUrl, url);
|
|
79
|
+
var absoluteUrl = URL$1.resolve(baseUrl || rootUrl, url);
|
|
81
80
|
setAttr(node, attr, absoluteUrl);
|
|
82
81
|
});
|
|
83
82
|
}
|
|
@@ -97,7 +96,7 @@ function absolutizeSet($, rootUrl, $content) {
|
|
|
97
96
|
// a candidate URL cannot start or end with a comma
|
|
98
97
|
// descriptors are separated from the URLs by unescaped whitespace
|
|
99
98
|
var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
|
|
100
|
-
parts[0] = URL.resolve(rootUrl, parts[0]);
|
|
99
|
+
parts[0] = URL$1.resolve(rootUrl, parts[0]);
|
|
101
100
|
return parts.join(' ');
|
|
102
101
|
});
|
|
103
102
|
|
|
@@ -162,7 +161,7 @@ var _objectWithoutProperties = _interopDefault$1(objectWithoutProperties);
|
|
|
162
161
|
|
|
163
162
|
var _asyncToGenerator = _interopDefault$1(asyncToGenerator);
|
|
164
163
|
|
|
165
|
-
var URL$1 = _interopDefault$1(URL);
|
|
164
|
+
var URL$1$1 = _interopDefault$1(URL$1);
|
|
166
165
|
|
|
167
166
|
var cheerio$1 = _interopDefault$1(cheerio);
|
|
168
167
|
|
|
@@ -198,8 +197,6 @@ var _Object$keys = _interopDefault$1(keys);
|
|
|
198
197
|
|
|
199
198
|
var stringDirection$1 = _interopDefault$1(stringDirection);
|
|
200
199
|
|
|
201
|
-
var validUrl$1 = _interopDefault$1(validUrl);
|
|
202
|
-
|
|
203
200
|
var moment = _interopDefault$1(momentTimezone);
|
|
204
201
|
|
|
205
202
|
var parseFormat = _interopDefault$1(momentParseformat);
|
|
@@ -302,7 +299,7 @@ function isGoodSegment$1(segment, index, firstSegmentHasLetters) {
|
|
|
302
299
|
|
|
303
300
|
|
|
304
301
|
function articleBaseUrl$1(url, parsed) {
|
|
305
|
-
var parsedUrl = parsed || URL$1.parse(url);
|
|
302
|
+
var parsedUrl = parsed || URL$1$1.parse(url);
|
|
306
303
|
var protocol = parsedUrl.protocol,
|
|
307
304
|
host = parsedUrl.host,
|
|
308
305
|
path = parsedUrl.path;
|
|
@@ -455,7 +452,7 @@ function _fetchResource() {
|
|
|
455
452
|
_regeneratorRuntime.mark(function _callee(url, parsedUrl) {
|
|
456
453
|
var headers,
|
|
457
454
|
options,
|
|
458
|
-
|
|
455
|
+
_yield$get,
|
|
459
456
|
response,
|
|
460
457
|
body,
|
|
461
458
|
_args = arguments;
|
|
@@ -465,7 +462,7 @@ function _fetchResource() {
|
|
|
465
462
|
switch (_context.prev = _context.next) {
|
|
466
463
|
case 0:
|
|
467
464
|
headers = _args.length > 2 && _args[2] !== undefined ? _args[2] : {};
|
|
468
|
-
parsedUrl = parsedUrl || URL$1.parse(encodeURI(url));
|
|
465
|
+
parsedUrl = parsedUrl || URL$1$1.parse(encodeURI(url));
|
|
469
466
|
options = _objectSpread({
|
|
470
467
|
url: parsedUrl.href,
|
|
471
468
|
headers: _objectSpread({}, REQUEST_HEADERS, headers),
|
|
@@ -487,9 +484,9 @@ function _fetchResource() {
|
|
|
487
484
|
return get(options);
|
|
488
485
|
|
|
489
486
|
case 5:
|
|
490
|
-
|
|
491
|
-
response =
|
|
492
|
-
body =
|
|
487
|
+
_yield$get = _context.sent;
|
|
488
|
+
response = _yield$get.response;
|
|
489
|
+
body = _yield$get.body;
|
|
493
490
|
_context.prev = 8;
|
|
494
491
|
validateResponse(response);
|
|
495
492
|
return _context.abrupt("return", {
|
|
@@ -818,7 +815,7 @@ function markToKeep$1(article, $, url) {
|
|
|
818
815
|
}
|
|
819
816
|
|
|
820
817
|
if (url) {
|
|
821
|
-
var _URL$parse = URL$1.parse(url),
|
|
818
|
+
var _URL$parse = URL$1$1.parse(url),
|
|
822
819
|
protocol = _URL$parse.protocol,
|
|
823
820
|
hostname = _URL$parse.hostname;
|
|
824
821
|
|
|
@@ -1424,7 +1421,7 @@ function absolutize$1($, rootUrl, attr) {
|
|
|
1424
1421
|
var attrs = getAttrs$1(node);
|
|
1425
1422
|
var url = attrs[attr];
|
|
1426
1423
|
if (!url) return;
|
|
1427
|
-
var absoluteUrl = URL$1.resolve(baseUrl || rootUrl, url);
|
|
1424
|
+
var absoluteUrl = URL$1$1.resolve(baseUrl || rootUrl, url);
|
|
1428
1425
|
setAttr$1(node, attr, absoluteUrl);
|
|
1429
1426
|
});
|
|
1430
1427
|
}
|
|
@@ -1444,7 +1441,7 @@ function absolutizeSet$1($, rootUrl, $content) {
|
|
|
1444
1441
|
// a candidate URL cannot start or end with a comma
|
|
1445
1442
|
// descriptors are separated from the URLs by unescaped whitespace
|
|
1446
1443
|
var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
|
|
1447
|
-
parts[0] = URL$1.resolve(rootUrl, parts[0]);
|
|
1444
|
+
parts[0] = URL$1$1.resolve(rootUrl, parts[0]);
|
|
1448
1445
|
return parts.join(' ');
|
|
1449
1446
|
});
|
|
1450
1447
|
|
|
@@ -2189,13 +2186,16 @@ var NewYorkerExtractor = {
|
|
|
2189
2186
|
var WiredExtractor = {
|
|
2190
2187
|
domain: 'www.wired.com',
|
|
2191
2188
|
title: {
|
|
2192
|
-
selectors: ['h1[data-testId="ContentHeaderHed"]'
|
|
2189
|
+
selectors: ['h1[data-testId="ContentHeaderHed"]' // enter title selectors
|
|
2190
|
+
]
|
|
2193
2191
|
},
|
|
2194
2192
|
author: {
|
|
2195
|
-
selectors: [['meta[name="article:author"]', 'value'], 'a[rel="author"]'
|
|
2193
|
+
selectors: [['meta[name="article:author"]', 'value'], 'a[rel="author"]' // enter author selectors
|
|
2194
|
+
]
|
|
2196
2195
|
},
|
|
2197
2196
|
content: {
|
|
2198
|
-
selectors: ['article.article.main-content', 'article.content'
|
|
2197
|
+
selectors: ['article.article.main-content', 'article.content' // enter content selectors
|
|
2198
|
+
],
|
|
2199
2199
|
// Is there anything in the content you selected that needs transformed
|
|
2200
2200
|
// before it's consumable content? E.g., unusual lazy loaded images
|
|
2201
2201
|
transforms: [],
|
|
@@ -2222,13 +2222,16 @@ var WiredExtractor = {
|
|
|
2222
2222
|
var MSNExtractor = {
|
|
2223
2223
|
domain: 'www.msn.com',
|
|
2224
2224
|
title: {
|
|
2225
|
-
selectors: ['h1'
|
|
2225
|
+
selectors: ['h1' // enter title selectors
|
|
2226
|
+
]
|
|
2226
2227
|
},
|
|
2227
2228
|
author: {
|
|
2228
|
-
selectors: ['span.authorname-txt'
|
|
2229
|
+
selectors: ['span.authorname-txt' // enter author selectors
|
|
2230
|
+
]
|
|
2229
2231
|
},
|
|
2230
2232
|
content: {
|
|
2231
|
-
selectors: ['div.richtext'
|
|
2233
|
+
selectors: ['div.richtext' // enter content selectors
|
|
2234
|
+
],
|
|
2232
2235
|
// Is there anything in the content you selected that needs transformed
|
|
2233
2236
|
// before it's consumable content? E.g., unusual lazy loaded images
|
|
2234
2237
|
transforms: [],
|
|
@@ -2255,10 +2258,12 @@ var MSNExtractor = {
|
|
|
2255
2258
|
var YahooExtractor = {
|
|
2256
2259
|
domain: 'www.yahoo.com',
|
|
2257
2260
|
title: {
|
|
2258
|
-
selectors: ['header.canvas-header'
|
|
2261
|
+
selectors: ['header.canvas-header' // enter title selectors
|
|
2262
|
+
]
|
|
2259
2263
|
},
|
|
2260
2264
|
author: {
|
|
2261
|
-
selectors: ['span.provider-name'
|
|
2265
|
+
selectors: ['span.provider-name' // enter author selectors
|
|
2266
|
+
]
|
|
2262
2267
|
},
|
|
2263
2268
|
content: {
|
|
2264
2269
|
selectors: [// enter content selectors
|
|
@@ -2291,10 +2296,12 @@ var BuzzfeedExtractor = {
|
|
|
2291
2296
|
domain: 'www.buzzfeed.com',
|
|
2292
2297
|
supportedDomains: ['www.buzzfeednews.com'],
|
|
2293
2298
|
title: {
|
|
2294
|
-
selectors: ['h1.embed-headline-title'
|
|
2299
|
+
selectors: ['h1.embed-headline-title' // enter title selectors
|
|
2300
|
+
]
|
|
2295
2301
|
},
|
|
2296
2302
|
author: {
|
|
2297
|
-
selectors: ['a[data-action="user/username"]', 'byline__author', ['meta[name="author"]', 'value']
|
|
2303
|
+
selectors: ['a[data-action="user/username"]', 'byline__author', ['meta[name="author"]', 'value'] // enter author selectors
|
|
2304
|
+
]
|
|
2298
2305
|
},
|
|
2299
2306
|
content: {
|
|
2300
2307
|
selectors: [['div[class^="featureimage_featureImageWrapper"]', '.js-subbuzz-wrapper'], ['.js-subbuzz-wrapper']],
|
|
@@ -2335,13 +2342,16 @@ var BuzzfeedExtractor = {
|
|
|
2335
2342
|
var WikiaExtractor = {
|
|
2336
2343
|
domain: 'fandom.wikia.com',
|
|
2337
2344
|
title: {
|
|
2338
|
-
selectors: ['h1.entry-title'
|
|
2345
|
+
selectors: ['h1.entry-title' // enter title selectors
|
|
2346
|
+
]
|
|
2339
2347
|
},
|
|
2340
2348
|
author: {
|
|
2341
|
-
selectors: ['.author vcard', '.fn'
|
|
2349
|
+
selectors: ['.author vcard', '.fn' // enter author selectors
|
|
2350
|
+
]
|
|
2342
2351
|
},
|
|
2343
2352
|
content: {
|
|
2344
|
-
selectors: ['.grid-content', '.entry-content'
|
|
2353
|
+
selectors: ['.grid-content', '.entry-content' // enter content selectors
|
|
2354
|
+
],
|
|
2345
2355
|
// Is there anything in the content you selected that needs transformed
|
|
2346
2356
|
// before it's consumable content? E.g., unusual lazy loaded images
|
|
2347
2357
|
transforms: [],
|
|
@@ -2368,10 +2378,12 @@ var WikiaExtractor = {
|
|
|
2368
2378
|
var LittleThingsExtractor = {
|
|
2369
2379
|
domain: 'www.littlethings.com',
|
|
2370
2380
|
title: {
|
|
2371
|
-
selectors: ['h1[class*="PostHeader"]', 'h1.post-title'
|
|
2381
|
+
selectors: ['h1[class*="PostHeader"]', 'h1.post-title' // enter title selectors
|
|
2382
|
+
]
|
|
2372
2383
|
},
|
|
2373
2384
|
author: {
|
|
2374
|
-
selectors: ['div[class^="PostHeader__ScAuthorNameSection"]', ['meta[name="author"]', 'value']
|
|
2385
|
+
selectors: ['div[class^="PostHeader__ScAuthorNameSection"]', ['meta[name="author"]', 'value'] // enter author selectors
|
|
2386
|
+
]
|
|
2375
2387
|
},
|
|
2376
2388
|
content: {
|
|
2377
2389
|
selectors: [// enter content selectors
|
|
@@ -2814,7 +2826,8 @@ var WwwThevergeComExtractor = {
|
|
|
2814
2826
|
// Is there anything that is in the result that shouldn't be?
|
|
2815
2827
|
// The clean selectors will remove anything that matches from
|
|
2816
2828
|
// the result
|
|
2817
|
-
clean: ['.aside', 'img.c-dynamic-image'
|
|
2829
|
+
clean: ['.aside', 'img.c-dynamic-image' // images come from noscript transform
|
|
2830
|
+
]
|
|
2818
2831
|
}
|
|
2819
2832
|
};
|
|
2820
2833
|
var WwwCnnComExtractor = {
|
|
@@ -3535,7 +3548,8 @@ var WwwThepoliticalinsiderComExtractor = {
|
|
|
3535
3548
|
]
|
|
3536
3549
|
},
|
|
3537
3550
|
lead_image_url: {
|
|
3538
|
-
selectors: [['meta[name="og:image"]', 'value']
|
|
3551
|
+
selectors: [['meta[name="og:image"]', 'value'] // enter selectors
|
|
3552
|
+
]
|
|
3539
3553
|
},
|
|
3540
3554
|
content: {
|
|
3541
3555
|
selectors: ['div#article-body'],
|
|
@@ -4866,7 +4880,8 @@ var WwwRedditComExtractor = {
|
|
|
4866
4880
|
content: {
|
|
4867
4881
|
selectors: [['div[data-test-id="post-content"] p'], // text post
|
|
4868
4882
|
['div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])', // external link
|
|
4869
|
-
'div[data-test-id="post-content"] div[data-click-id="media"]'
|
|
4883
|
+
'div[data-test-id="post-content"] div[data-click-id="media"]' // embedded media
|
|
4884
|
+
], // external link with media preview (YouTube, imgur album, etc...)
|
|
4870
4885
|
['div[data-test-id="post-content"] div[data-click-id="media"]'], // Embedded media (Reddit video)
|
|
4871
4886
|
['div[data-test-id="post-content"] a'], // external link
|
|
4872
4887
|
'div[data-test-id="post-content"]'],
|
|
@@ -5563,7 +5578,7 @@ var WiredJpExtractor = {
|
|
|
5563
5578
|
'img[data-original]': function imgDataOriginal($node) {
|
|
5564
5579
|
var dataOriginal = $node.attr('data-original');
|
|
5565
5580
|
var src = $node.attr('src');
|
|
5566
|
-
var url = URL$1.resolve(src, dataOriginal);
|
|
5581
|
+
var url = URL$1$1.resolve(src, dataOriginal);
|
|
5567
5582
|
$node.attr('src', url);
|
|
5568
5583
|
}
|
|
5569
5584
|
},
|
|
@@ -5651,8 +5666,9 @@ var WwwPhoronixComExtractor = {
|
|
|
5651
5666
|
format: 'D MMMM YYYY at hh:mm',
|
|
5652
5667
|
timezone: 'America/New_York'
|
|
5653
5668
|
},
|
|
5654
|
-
|
|
5655
|
-
|
|
5669
|
+
lead_image_url: {
|
|
5670
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
5671
|
+
},
|
|
5656
5672
|
content: {
|
|
5657
5673
|
selectors: ['.content'],
|
|
5658
5674
|
// Is there anything in the content you selected that needs transformed
|
|
@@ -6168,6 +6184,83 @@ var WwwCbcCaExtractor = {
|
|
|
6168
6184
|
clean: []
|
|
6169
6185
|
}
|
|
6170
6186
|
};
|
|
6187
|
+
var WwwVersantsComExtractor = {
|
|
6188
|
+
domain: 'www.versants.com',
|
|
6189
|
+
title: {
|
|
6190
|
+
selectors: [['meta[name="og:title"]', 'value']]
|
|
6191
|
+
},
|
|
6192
|
+
author: {
|
|
6193
|
+
selectors: [['meta[name="author"]', 'value']]
|
|
6194
|
+
},
|
|
6195
|
+
date_published: {
|
|
6196
|
+
selectors: [['meta[name="article:published_time"]', 'value']]
|
|
6197
|
+
},
|
|
6198
|
+
lead_image_url: {
|
|
6199
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6200
|
+
},
|
|
6201
|
+
content: {
|
|
6202
|
+
selectors: ['.entry-content'],
|
|
6203
|
+
clean: ['.adv-link', '.versa-target']
|
|
6204
|
+
}
|
|
6205
|
+
};
|
|
6206
|
+
var Www1pezeshkComExtractor = {
|
|
6207
|
+
domain: 'www.1pezeshk.com',
|
|
6208
|
+
title: {
|
|
6209
|
+
selectors: [['meta[name="og:title"]', 'value'], 'h1.post-title']
|
|
6210
|
+
},
|
|
6211
|
+
author: {
|
|
6212
|
+
selectors: [['meta[name="author"]', 'value']]
|
|
6213
|
+
},
|
|
6214
|
+
date_published: {
|
|
6215
|
+
selectors: [['meta[name="article:published_time"]', 'value']]
|
|
6216
|
+
},
|
|
6217
|
+
lead_image_url: {
|
|
6218
|
+
selectors: [['.featured-area img', 'src']]
|
|
6219
|
+
},
|
|
6220
|
+
content: {
|
|
6221
|
+
selectors: ['article > .entry-content'],
|
|
6222
|
+
transforms: {
|
|
6223
|
+
img: function img($node) {
|
|
6224
|
+
$node.src = decodeURIComponent($node.src);
|
|
6225
|
+
}
|
|
6226
|
+
},
|
|
6227
|
+
// Is there anything that is in the result that shouldn't be?
|
|
6228
|
+
// The clean selectors will remove anything that matches from
|
|
6229
|
+
// the result
|
|
6230
|
+
clean: []
|
|
6231
|
+
}
|
|
6232
|
+
};
|
|
6233
|
+
var WwwAndroidauthorityComExtractor = {
|
|
6234
|
+
domain: 'www.androidauthority.com',
|
|
6235
|
+
title: {
|
|
6236
|
+
selectors: [['meta[name="og:title"]', 'value'], 'h1']
|
|
6237
|
+
},
|
|
6238
|
+
author: {
|
|
6239
|
+
selectors: ['button.d_ic']
|
|
6240
|
+
},
|
|
6241
|
+
date_published: {
|
|
6242
|
+
selectors: [['meta[name="article:published_time"]', 'value']]
|
|
6243
|
+
},
|
|
6244
|
+
lead_image_url: {
|
|
6245
|
+
selectors: [['meta[name="og:image"]', 'value']]
|
|
6246
|
+
},
|
|
6247
|
+
content: {
|
|
6248
|
+
selectors: ['.d_Dd'],
|
|
6249
|
+
transforms: {
|
|
6250
|
+
ol: function ol(node) {
|
|
6251
|
+
node.attr('class', 'mercury-parser-keep');
|
|
6252
|
+
},
|
|
6253
|
+
h2: function h2($node) {
|
|
6254
|
+
// Some pages have an element h2 that is significant, and that the parser will
|
|
6255
|
+
// remove if not following a paragraph. Adding this empty paragraph fixes it, and
|
|
6256
|
+
// the empty paragraph will be removed anyway.
|
|
6257
|
+
$node.before('<p></p>');
|
|
6258
|
+
}
|
|
6259
|
+
},
|
|
6260
|
+
clean: ['.d_f .d_nr' // Lead image
|
|
6261
|
+
]
|
|
6262
|
+
}
|
|
6263
|
+
};
|
|
6171
6264
|
|
|
6172
6265
|
var CustomExtractors =
|
|
6173
6266
|
/*#__PURE__*/
|
|
@@ -6314,7 +6407,10 @@ _Object$freeze({
|
|
|
6314
6407
|
SpektrumExtractor: SpektrumExtractor,
|
|
6315
6408
|
PostlightComExtractor: PostlightComExtractor,
|
|
6316
6409
|
WwwInvestmentexecutiveComExtractor: WwwInvestmentexecutiveComExtractor,
|
|
6317
|
-
WwwCbcCaExtractor: WwwCbcCaExtractor
|
|
6410
|
+
WwwCbcCaExtractor: WwwCbcCaExtractor,
|
|
6411
|
+
WwwVersantsComExtractor: WwwVersantsComExtractor,
|
|
6412
|
+
Www1pezeshkComExtractor: Www1pezeshkComExtractor,
|
|
6413
|
+
WwwAndroidauthorityComExtractor: WwwAndroidauthorityComExtractor
|
|
6318
6414
|
});
|
|
6319
6415
|
|
|
6320
6416
|
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
|
|
@@ -6356,13 +6452,11 @@ function cleanAuthor(author) {
|
|
|
6356
6452
|
}
|
|
6357
6453
|
|
|
6358
6454
|
function clean$1(leadImageUrl) {
|
|
6359
|
-
|
|
6360
|
-
|
|
6361
|
-
|
|
6362
|
-
return
|
|
6455
|
+
try {
|
|
6456
|
+
return new URL(leadImageUrl.trim()).toString();
|
|
6457
|
+
} catch (_unused) {
|
|
6458
|
+
return null;
|
|
6363
6459
|
}
|
|
6364
|
-
|
|
6365
|
-
return null;
|
|
6366
6460
|
} // Return None if the dek wasn't good enough.
|
|
6367
6461
|
|
|
6368
6462
|
|
|
@@ -6549,7 +6643,7 @@ function cleanDomainFromTitle(splitTitle, url) {
|
|
|
6549
6643
|
//
|
|
6550
6644
|
// Strip out the big TLDs - it just makes the matching a bit more
|
|
6551
6645
|
// accurate. Not the end of the world if it doesn't strip right.
|
|
6552
|
-
var _URL$parse = URL$1.parse(url),
|
|
6646
|
+
var _URL$parse = URL$1$1.parse(url),
|
|
6553
6647
|
host = _URL$parse.host;
|
|
6554
6648
|
|
|
6555
6649
|
var nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
|
|
@@ -7296,7 +7390,7 @@ function shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrl
|
|
|
7296
7390
|
|
|
7297
7391
|
var hostname = parsedUrl.hostname;
|
|
7298
7392
|
|
|
7299
|
-
var _URL$parse = URL$1.parse(href),
|
|
7393
|
+
var _URL$parse = URL$1$1.parse(href),
|
|
7300
7394
|
linkHost = _URL$parse.hostname; // Domain mismatch.
|
|
7301
7395
|
|
|
7302
7396
|
|
|
@@ -7378,7 +7472,7 @@ function scoreLinks(_ref) {
|
|
|
7378
7472
|
$ = _ref.$,
|
|
7379
7473
|
_ref$previousUrls = _ref.previousUrls,
|
|
7380
7474
|
previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
|
|
7381
|
-
parsedUrl = parsedUrl || URL$1.parse(articleUrl);
|
|
7475
|
+
parsedUrl = parsedUrl || URL$1$1.parse(articleUrl);
|
|
7382
7476
|
var baseRegex = makeBaseRegex(baseUrl);
|
|
7383
7477
|
var isWp = isWordpress$1($); // Loop through all links, looking for hints that they may be next-page
|
|
7384
7478
|
// links. Things like having "page" in their textContent, className or
|
|
@@ -7440,7 +7534,7 @@ var GenericNextPageUrlExtractor = {
|
|
|
7440
7534
|
parsedUrl = _ref.parsedUrl,
|
|
7441
7535
|
_ref$previousUrls = _ref.previousUrls,
|
|
7442
7536
|
previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
|
|
7443
|
-
parsedUrl = parsedUrl || URL$1.parse(url);
|
|
7537
|
+
parsedUrl = parsedUrl || URL$1$1.parse(url);
|
|
7444
7538
|
var articleUrl = removeAnchor$1(url);
|
|
7445
7539
|
var baseUrl = articleBaseUrl$1(url, parsedUrl);
|
|
7446
7540
|
var links = $('a[href]').toArray();
|
|
@@ -7475,7 +7569,7 @@ var GenericNextPageUrlExtractor = {
|
|
|
7475
7569
|
var CANONICAL_META_SELECTORS = ['og:url'];
|
|
7476
7570
|
|
|
7477
7571
|
function parseDomain(url) {
|
|
7478
|
-
var parsedUrl = URL$1.parse(url);
|
|
7572
|
+
var parsedUrl = URL$1$1.parse(url);
|
|
7479
7573
|
var hostname = parsedUrl.hostname;
|
|
7480
7574
|
return hostname;
|
|
7481
7575
|
}
|
|
@@ -7644,7 +7738,7 @@ function detectByHtml($) {
|
|
|
7644
7738
|
}
|
|
7645
7739
|
|
|
7646
7740
|
function getExtractor(url, parsedUrl, $) {
|
|
7647
|
-
parsedUrl = parsedUrl || URL$1.parse(url);
|
|
7741
|
+
parsedUrl = parsedUrl || URL$1$1.parse(url);
|
|
7648
7742
|
var _parsedUrl = parsedUrl,
|
|
7649
7743
|
hostname = _parsedUrl.hostname;
|
|
7650
7744
|
var baseDomain = hostname.split('.').slice(-2).join('.');
|
|
@@ -7868,6 +7962,12 @@ var RootExtractor = {
|
|
|
7868
7962
|
};
|
|
7869
7963
|
}
|
|
7870
7964
|
|
|
7965
|
+
var extendedResults = {};
|
|
7966
|
+
|
|
7967
|
+
if (extractor.extend) {
|
|
7968
|
+
extendedResults = selectExtendedTypes(extractor.extend, opts);
|
|
7969
|
+
}
|
|
7970
|
+
|
|
7871
7971
|
var title = extractResult(_objectSpread({}, opts, {
|
|
7872
7972
|
type: 'title'
|
|
7873
7973
|
}));
|
|
@@ -7916,12 +8016,6 @@ var RootExtractor = {
|
|
|
7916
8016
|
url = _ref3.url,
|
|
7917
8017
|
domain = _ref3.domain;
|
|
7918
8018
|
|
|
7919
|
-
var extendedResults = {};
|
|
7920
|
-
|
|
7921
|
-
if (extractor.extend) {
|
|
7922
|
-
extendedResults = selectExtendedTypes(extractor.extend, opts);
|
|
7923
|
-
}
|
|
7924
|
-
|
|
7925
8019
|
return _objectSpread({
|
|
7926
8020
|
title: title,
|
|
7927
8021
|
content: content,
|
|
@@ -8056,7 +8150,7 @@ var Parser = {
|
|
|
8056
8150
|
html = html || cheerio$1.html();
|
|
8057
8151
|
}
|
|
8058
8152
|
|
|
8059
|
-
parsedUrl = URL$1.parse(url);
|
|
8153
|
+
parsedUrl = URL$1$1.parse(url);
|
|
8060
8154
|
|
|
8061
8155
|
if (validateUrl(parsedUrl)) {
|
|
8062
8156
|
_context.next = 6;
|
|
@@ -8291,7 +8385,7 @@ var questions = [{
|
|
|
8291
8385
|
name: 'website',
|
|
8292
8386
|
message: "Paste a url to an article you'd like to create or extend a parser for:",
|
|
8293
8387
|
validate: function validate(value) {
|
|
8294
|
-
var _URL$parse = URL.parse(value),
|
|
8388
|
+
var _URL$parse = URL$1.parse(value),
|
|
8295
8389
|
hostname = _URL$parse.hostname;
|
|
8296
8390
|
|
|
8297
8391
|
if (hostname) return true;
|
|
@@ -8325,7 +8419,7 @@ function confirmCreateDir(dir, msg) {
|
|
|
8325
8419
|
}
|
|
8326
8420
|
|
|
8327
8421
|
function getDir(url) {
|
|
8328
|
-
var _URL$parse2 = URL.parse(url),
|
|
8422
|
+
var _URL$parse2 = URL$1.parse(url),
|
|
8329
8423
|
hostname = _URL$parse2.hostname;
|
|
8330
8424
|
|
|
8331
8425
|
return "./src/extractors/custom/".concat(hostname);
|
|
@@ -8334,7 +8428,7 @@ function getDir(url) {
|
|
|
8334
8428
|
function scaffoldCustomParser(url) {
|
|
8335
8429
|
var dir = getDir(url);
|
|
8336
8430
|
|
|
8337
|
-
var _URL$parse3 = URL.parse(url),
|
|
8431
|
+
var _URL$parse3 = URL$1.parse(url),
|
|
8338
8432
|
hostname = _URL$parse3.hostname;
|
|
8339
8433
|
|
|
8340
8434
|
var newParser = false;
|
|
@@ -8360,7 +8454,7 @@ if (urlArg) {
|
|
|
8360
8454
|
}
|
|
8361
8455
|
|
|
8362
8456
|
function generateScaffold(url, file, result) {
|
|
8363
|
-
var _URL$parse4 = URL.parse(url),
|
|
8457
|
+
var _URL$parse4 = URL$1.parse(url),
|
|
8364
8458
|
hostname = _URL$parse4.hostname;
|
|
8365
8459
|
|
|
8366
8460
|
var extractor = extractorTemplate(hostname, extractorName(hostname));
|
|
@@ -8375,7 +8469,7 @@ function savePage($, _ref, newParser) {
|
|
|
8375
8469
|
var _ref2 = _slicedToArray(_ref, 1),
|
|
8376
8470
|
url = _ref2[0];
|
|
8377
8471
|
|
|
8378
|
-
var _URL$parse5 = URL.parse(url),
|
|
8472
|
+
var _URL$parse5 = URL$1.parse(url),
|
|
8379
8473
|
hostname = _URL$parse5.hostname;
|
|
8380
8474
|
|
|
8381
8475
|
spinner.succeed();
|
|
@@ -8406,7 +8500,7 @@ function savePage($, _ref, newParser) {
|
|
|
8406
8500
|
}
|
|
8407
8501
|
|
|
8408
8502
|
function exportString(url) {
|
|
8409
|
-
var _URL$parse6 = URL.parse(url),
|
|
8503
|
+
var _URL$parse6 = URL$1.parse(url),
|
|
8410
8504
|
hostname = _URL$parse6.hostname;
|
|
8411
8505
|
|
|
8412
8506
|
return "export * from './".concat(hostname, "';");
|