@jocmp/mercury-parser 3.0.7 → 3.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/generate-custom-parser.js +804 -621
- package/dist/generate-custom-parser.js.map +1 -1
- package/dist/mercury.js +616 -371
- package/dist/mercury.js.map +1 -1
- package/dist/mercury.web.js +2 -2
- package/dist/mercury.web.js.map +1 -1
- package/package.json +12 -16
package/dist/mercury.js
CHANGED
|
@@ -35,12 +35,8 @@ var timezonePlugin = require('dayjs/plugin/timezone');
|
|
|
35
35
|
var customParseFormat = require('dayjs/plugin/customParseFormat');
|
|
36
36
|
var wuzzy = require('wuzzy');
|
|
37
37
|
var difflib = require('difflib');
|
|
38
|
-
var ellipsize = require('ellipsize');
|
|
39
38
|
|
|
40
|
-
function
|
|
41
|
-
|
|
42
|
-
function _interopNamespace(e) {
|
|
43
|
-
if (e && e.__esModule) return e;
|
|
39
|
+
function _interopNamespaceDefault(e) {
|
|
44
40
|
var n = Object.create(null);
|
|
45
41
|
if (e) {
|
|
46
42
|
Object.keys(e).forEach(function (k) {
|
|
@@ -53,46 +49,11 @@ function _interopNamespace(e) {
|
|
|
53
49
|
}
|
|
54
50
|
});
|
|
55
51
|
}
|
|
56
|
-
n
|
|
52
|
+
n.default = e;
|
|
57
53
|
return Object.freeze(n);
|
|
58
54
|
}
|
|
59
55
|
|
|
60
|
-
var
|
|
61
|
-
var _Object$getOwnPropertySymbols__default = /*#__PURE__*/_interopDefaultLegacy(_Object$getOwnPropertySymbols);
|
|
62
|
-
var _Object$getOwnPropertyDescriptor__default = /*#__PURE__*/_interopDefaultLegacy(_Object$getOwnPropertyDescriptor);
|
|
63
|
-
var _Object$getOwnPropertyDescriptors__default = /*#__PURE__*/_interopDefaultLegacy(_Object$getOwnPropertyDescriptors);
|
|
64
|
-
var _Object$defineProperties__default = /*#__PURE__*/_interopDefaultLegacy(_Object$defineProperties);
|
|
65
|
-
var _Object$defineProperty__default = /*#__PURE__*/_interopDefaultLegacy(_Object$defineProperty);
|
|
66
|
-
var _defineProperty__default = /*#__PURE__*/_interopDefaultLegacy(_defineProperty);
|
|
67
|
-
var _objectWithoutProperties__default = /*#__PURE__*/_interopDefaultLegacy(_objectWithoutProperties);
|
|
68
|
-
var _asyncToGenerator__default = /*#__PURE__*/_interopDefaultLegacy(_asyncToGenerator);
|
|
69
|
-
var _regeneratorRuntime__default = /*#__PURE__*/_interopDefaultLegacy(_regeneratorRuntime);
|
|
70
|
-
var URL__default = /*#__PURE__*/_interopDefaultLegacy(URL$1);
|
|
71
|
-
var TurndownService__default = /*#__PURE__*/_interopDefaultLegacy(TurndownService);
|
|
72
|
-
var cheerio__namespace = /*#__PURE__*/_interopNamespace(cheerio);
|
|
73
|
-
var iconv__default = /*#__PURE__*/_interopDefaultLegacy(iconv);
|
|
74
|
-
var _parseInt__default = /*#__PURE__*/_interopDefaultLegacy(_parseInt);
|
|
75
|
-
var _slicedToArray__default = /*#__PURE__*/_interopDefaultLegacy(_slicedToArray);
|
|
76
|
-
var _Promise__default = /*#__PURE__*/_interopDefaultLegacy(_Promise);
|
|
77
|
-
var request__default = /*#__PURE__*/_interopDefaultLegacy(request);
|
|
78
|
-
var _Reflect$ownKeys__default = /*#__PURE__*/_interopDefaultLegacy(_Reflect$ownKeys);
|
|
79
|
-
var _toConsumableArray__default = /*#__PURE__*/_interopDefaultLegacy(_toConsumableArray);
|
|
80
|
-
var _parseFloat__default = /*#__PURE__*/_interopDefaultLegacy(_parseFloat);
|
|
81
|
-
var _Set__default = /*#__PURE__*/_interopDefaultLegacy(_Set);
|
|
82
|
-
var _Array$from__default = /*#__PURE__*/_interopDefaultLegacy(_Array$from);
|
|
83
|
-
var _Symbol__default = /*#__PURE__*/_interopDefaultLegacy(_Symbol);
|
|
84
|
-
var _Symbol$iterator__default = /*#__PURE__*/_interopDefaultLegacy(_Symbol$iterator);
|
|
85
|
-
var _Array$isArray__default = /*#__PURE__*/_interopDefaultLegacy(_Array$isArray);
|
|
86
|
-
var _Object$assign__default = /*#__PURE__*/_interopDefaultLegacy(_Object$assign);
|
|
87
|
-
var stringDirection__default = /*#__PURE__*/_interopDefaultLegacy(stringDirection);
|
|
88
|
-
var _Number$isNaN__default = /*#__PURE__*/_interopDefaultLegacy(_Number$isNaN);
|
|
89
|
-
var dayjs__default = /*#__PURE__*/_interopDefaultLegacy(dayjs);
|
|
90
|
-
var utc__default = /*#__PURE__*/_interopDefaultLegacy(utc);
|
|
91
|
-
var timezonePlugin__default = /*#__PURE__*/_interopDefaultLegacy(timezonePlugin);
|
|
92
|
-
var customParseFormat__default = /*#__PURE__*/_interopDefaultLegacy(customParseFormat);
|
|
93
|
-
var wuzzy__default = /*#__PURE__*/_interopDefaultLegacy(wuzzy);
|
|
94
|
-
var difflib__default = /*#__PURE__*/_interopDefaultLegacy(difflib);
|
|
95
|
-
var ellipsize__default = /*#__PURE__*/_interopDefaultLegacy(ellipsize);
|
|
56
|
+
var cheerio__namespace = /*#__PURE__*/_interopNamespaceDefault(cheerio);
|
|
96
57
|
|
|
97
58
|
var NORMALIZE_RE = /\s{2,}(?![^<>]*<\/(pre|code|textarea)>)/g;
|
|
98
59
|
function normalizeSpaces(text) {
|
|
@@ -140,7 +101,7 @@ var DEFAULT_ENCODING = 'utf-8';
|
|
|
140
101
|
function pageNumFromUrl(url) {
|
|
141
102
|
var matches = url.match(PAGE_IN_HREF_RE);
|
|
142
103
|
if (!matches) return null;
|
|
143
|
-
var pageNum =
|
|
104
|
+
var pageNum = _parseInt(matches[6], 10);
|
|
144
105
|
|
|
145
106
|
// Return pageNum < 100, otherwise
|
|
146
107
|
// return null
|
|
@@ -178,7 +139,7 @@ function isGoodSegment(segment, index, firstSegmentHasLetters) {
|
|
|
178
139
|
// pagination data exists in it. Useful for comparing to other links
|
|
179
140
|
// that might have pagination data within them.
|
|
180
141
|
function articleBaseUrl(url, parsed) {
|
|
181
|
-
var parsedUrl = parsed ||
|
|
142
|
+
var parsedUrl = parsed || URL$1.parse(url);
|
|
182
143
|
var protocol = parsedUrl.protocol,
|
|
183
144
|
host = parsedUrl.host,
|
|
184
145
|
path = parsedUrl.path;
|
|
@@ -189,7 +150,7 @@ function articleBaseUrl(url, parsed) {
|
|
|
189
150
|
// Split off and save anything that looks like a file type.
|
|
190
151
|
if (segment.includes('.')) {
|
|
191
152
|
var _segment$split = segment.split('.'),
|
|
192
|
-
_segment$split2 =
|
|
153
|
+
_segment$split2 = _slicedToArray(_segment$split, 2),
|
|
193
154
|
possibleSegment = _segment$split2[0],
|
|
194
155
|
fileExt = _segment$split2[1];
|
|
195
156
|
if (IS_ALPHA_RE.test(fileExt)) {
|
|
@@ -239,10 +200,10 @@ function getEncoding(str) {
|
|
|
239
200
|
var encoding = DEFAULT_ENCODING;
|
|
240
201
|
var matches = ENCODING_RE.exec(str);
|
|
241
202
|
if (matches !== null) {
|
|
242
|
-
var _matches =
|
|
203
|
+
var _matches = _slicedToArray(matches, 2);
|
|
243
204
|
str = _matches[1];
|
|
244
205
|
}
|
|
245
|
-
if (
|
|
206
|
+
if (iconv.encodingExists(str)) {
|
|
246
207
|
encoding = str;
|
|
247
208
|
}
|
|
248
209
|
return encoding;
|
|
@@ -268,11 +229,11 @@ var BAD_CONTENT_TYPES_RE = new RegExp("^(".concat(BAD_CONTENT_TYPES.join('|'), "
|
|
|
268
229
|
// for us to attempt parsing. Defaults to 5 MB.
|
|
269
230
|
var MAX_CONTENT_LENGTH = 5242880;
|
|
270
231
|
|
|
271
|
-
function ownKeys$h(e, r) { var t = _Object$
|
|
272
|
-
function _objectSpread$h(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$h(Object(t),
|
|
232
|
+
function ownKeys$h(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
233
|
+
function _objectSpread$h(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$h(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$h(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
273
234
|
function get(options) {
|
|
274
|
-
return new
|
|
275
|
-
|
|
235
|
+
return new _Promise(function (resolve, reject) {
|
|
236
|
+
request(options, function (err, response, body) {
|
|
276
237
|
if (err) {
|
|
277
238
|
reject(err);
|
|
278
239
|
} else {
|
|
@@ -331,7 +292,7 @@ function fetchResource(_x, _x2) {
|
|
|
331
292
|
return _fetchResource.apply(this, arguments);
|
|
332
293
|
}
|
|
333
294
|
function _fetchResource() {
|
|
334
|
-
_fetchResource =
|
|
295
|
+
_fetchResource = _asyncToGenerator(/*#__PURE__*/_regeneratorRuntime.mark(function _callee(url, parsedUrl) {
|
|
335
296
|
var headers,
|
|
336
297
|
options,
|
|
337
298
|
_yield$get,
|
|
@@ -339,11 +300,11 @@ function _fetchResource() {
|
|
|
339
300
|
body,
|
|
340
301
|
_args = arguments,
|
|
341
302
|
_t;
|
|
342
|
-
return
|
|
303
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
343
304
|
while (1) switch (_context.prev = _context.next) {
|
|
344
305
|
case 0:
|
|
345
306
|
headers = _args.length > 2 && _args[2] !== undefined ? _args[2] : {};
|
|
346
|
-
parsedUrl = parsedUrl ||
|
|
307
|
+
parsedUrl = parsedUrl || URL$1.parse(encodeURI(url));
|
|
347
308
|
options = _objectSpread$h({
|
|
348
309
|
url: parsedUrl.href,
|
|
349
310
|
headers: _objectSpread$h(_objectSpread$h({}, REQUEST_HEADERS), headers),
|
|
@@ -605,7 +566,7 @@ function getAttrs(node) {
|
|
|
605
566
|
var attribs = node.attribs,
|
|
606
567
|
attributes = node.attributes;
|
|
607
568
|
if (!attribs && attributes) {
|
|
608
|
-
var attrs = _Reflect$
|
|
569
|
+
var attrs = _Reflect$ownKeys(attributes).reduce(function (acc, index) {
|
|
609
570
|
var attr = attributes[index];
|
|
610
571
|
|
|
611
572
|
// In browser, Reflect.ownKeys includes non-numeric keys like 'length', 'item', etc.
|
|
@@ -625,7 +586,7 @@ function convertNodeTo($node, $) {
|
|
|
625
586
|
return $;
|
|
626
587
|
}
|
|
627
588
|
var attrs = getAttrs(node) || {};
|
|
628
|
-
var attribString = _Reflect$
|
|
589
|
+
var attribString = _Reflect$ownKeys(attrs).map(function (key) {
|
|
629
590
|
return "".concat(key, "=").concat(attrs[key]);
|
|
630
591
|
}).join(' ');
|
|
631
592
|
var html;
|
|
@@ -684,8 +645,8 @@ function convertToParagraphs($) {
|
|
|
684
645
|
}
|
|
685
646
|
|
|
686
647
|
function cleanForHeight($img, $) {
|
|
687
|
-
var height =
|
|
688
|
-
var width =
|
|
648
|
+
var height = _parseInt($img.attr('height'), 10);
|
|
649
|
+
var width = _parseInt($img.attr('width'), 10) || 20;
|
|
689
650
|
|
|
690
651
|
// Remove images that explicitly have very small heights or
|
|
691
652
|
// widths, because they are most likely shims or icons,
|
|
@@ -724,10 +685,10 @@ function markToKeep(article, $, url) {
|
|
|
724
685
|
tags = KEEP_SELECTORS;
|
|
725
686
|
}
|
|
726
687
|
if (url) {
|
|
727
|
-
var _URL$parse =
|
|
688
|
+
var _URL$parse = URL$1.parse(url),
|
|
728
689
|
protocol = _URL$parse.protocol,
|
|
729
690
|
hostname = _URL$parse.hostname;
|
|
730
|
-
tags = [].concat(
|
|
691
|
+
tags = [].concat(_toConsumableArray(tags), ["iframe[src^=\"".concat(protocol, "//").concat(hostname, "\"]")]);
|
|
731
692
|
}
|
|
732
693
|
$(tags.join(','), article).addClass(KEEP_CLASS);
|
|
733
694
|
return $;
|
|
@@ -769,21 +730,21 @@ function setAttrs(node, attrs) {
|
|
|
769
730
|
while (node.attributes.length > 0) {
|
|
770
731
|
node.removeAttribute(node.attributes[0].name);
|
|
771
732
|
}
|
|
772
|
-
_Reflect$
|
|
733
|
+
_Reflect$ownKeys(attrs).forEach(function (key) {
|
|
773
734
|
node.setAttribute(key, attrs[key]);
|
|
774
735
|
});
|
|
775
736
|
}
|
|
776
737
|
return node;
|
|
777
738
|
}
|
|
778
739
|
|
|
779
|
-
function ownKeys$g(e, r) { var t = _Object$
|
|
780
|
-
function _objectSpread$g(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$g(Object(t),
|
|
740
|
+
function ownKeys$g(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
741
|
+
function _objectSpread$g(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$g(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$g(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
781
742
|
function removeAllButWhitelist($article, $) {
|
|
782
743
|
$article.find('*').each(function (index, node) {
|
|
783
744
|
var attrs = getAttrs(node);
|
|
784
|
-
setAttrs(node, _Reflect$
|
|
745
|
+
setAttrs(node, _Reflect$ownKeys(attrs).reduce(function (acc, attr) {
|
|
785
746
|
if (WHITELIST_ATTRS_RE.test(attr)) {
|
|
786
|
-
return _objectSpread$g(_objectSpread$g({}, acc), {},
|
|
747
|
+
return _objectSpread$g(_objectSpread$g({}, acc), {}, _defineProperty({}, attr, attrs[attr]));
|
|
787
748
|
}
|
|
788
749
|
return acc;
|
|
789
750
|
}, {}));
|
|
@@ -814,7 +775,7 @@ function removeEmpty($article, $) {
|
|
|
814
775
|
// the node's score attribute
|
|
815
776
|
// returns null if no score set
|
|
816
777
|
function getScore($node) {
|
|
817
|
-
return
|
|
778
|
+
return _parseFloat($node.attr('score')) || null;
|
|
818
779
|
}
|
|
819
780
|
|
|
820
781
|
function setScore($node, $, score) {
|
|
@@ -880,6 +841,7 @@ function scoreParagraph(node) {
|
|
|
880
841
|
|
|
881
842
|
// // CONTENT FETCHING CONSTANTS ////
|
|
882
843
|
|
|
844
|
+
|
|
883
845
|
// A list of tags that should be ignored when trying to find the top candidate
|
|
884
846
|
// for a document.
|
|
885
847
|
var NON_TOP_CANDIDATE_TAGS = ['br', 'b', 'i', 'label', 'hr', 'area', 'base', 'basefont', 'input', 'img', 'link', 'meta'];
|
|
@@ -998,7 +960,7 @@ function getWeight(node) {
|
|
|
998
960
|
return score;
|
|
999
961
|
}
|
|
1000
962
|
|
|
1001
|
-
// eslint-disable-next-line import/no-cycle
|
|
963
|
+
// eslint-disable-next-line import-x/no-cycle
|
|
1002
964
|
function addScore($node, $, amount) {
|
|
1003
965
|
try {
|
|
1004
966
|
var score = getOrInitScore($node, $) + amount;
|
|
@@ -1009,7 +971,7 @@ function addScore($node, $, amount) {
|
|
|
1009
971
|
return $node;
|
|
1010
972
|
}
|
|
1011
973
|
|
|
1012
|
-
// eslint-disable-next-line import/no-cycle
|
|
974
|
+
// eslint-disable-next-line import-x/no-cycle
|
|
1013
975
|
|
|
1014
976
|
// Adds 1/4 of a child's score to its parent
|
|
1015
977
|
function addToParent(node, $, score) {
|
|
@@ -1206,7 +1168,7 @@ function absolutize($, rootUrl, attr) {
|
|
|
1206
1168
|
var attrs = getAttrs(node);
|
|
1207
1169
|
var url = attrs[attr];
|
|
1208
1170
|
if (!url) return;
|
|
1209
|
-
var absoluteUrl =
|
|
1171
|
+
var absoluteUrl = URL$1.resolve(baseUrl || rootUrl, url);
|
|
1210
1172
|
setAttr(node, attr, absoluteUrl);
|
|
1211
1173
|
});
|
|
1212
1174
|
}
|
|
@@ -1224,10 +1186,10 @@ function absolutizeSet($, rootUrl, $content) {
|
|
|
1224
1186
|
// a candidate URL cannot start or end with a comma
|
|
1225
1187
|
// descriptors are separated from the URLs by unescaped whitespace
|
|
1226
1188
|
var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
|
|
1227
|
-
parts[0] =
|
|
1189
|
+
parts[0] = URL$1.resolve(rootUrl, parts[0]);
|
|
1228
1190
|
return parts.join(' ');
|
|
1229
1191
|
});
|
|
1230
|
-
var absoluteUrlSet =
|
|
1192
|
+
var absoluteUrlSet = _toConsumableArray(new _Set(absoluteCandidates)).join(', ');
|
|
1231
1193
|
setAttr(node, 'srcset', absoluteUrlSet);
|
|
1232
1194
|
}
|
|
1233
1195
|
});
|
|
@@ -1248,8 +1210,8 @@ function stripTags(text, $) {
|
|
|
1248
1210
|
return cleanText === '' ? text : cleanText;
|
|
1249
1211
|
}
|
|
1250
1212
|
|
|
1251
|
-
function _createForOfIteratorHelper$4(r, e) { var t = "undefined" != typeof
|
|
1252
|
-
function _unsupportedIterableToArray$4(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$4(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
1213
|
+
function _createForOfIteratorHelper$4(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray$4(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
1214
|
+
function _unsupportedIterableToArray$4(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$4(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$from(r) : "Arguments" === t || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t) ? _arrayLikeToArray$4(r, a) : void 0; } }
|
|
1253
1215
|
function _arrayLikeToArray$4(r, a) { (null == a || a > r.length) && (a = r.length); for (var e = 0, n = Array(a); e < a; e++) n[e] = r[e]; return n; }
|
|
1254
1216
|
|
|
1255
1217
|
// Given a node type to search for, and a list of meta tag names to
|
|
@@ -1259,8 +1221,6 @@ function extractFromMeta($, metaNames, cachedNames) {
|
|
|
1259
1221
|
var foundNames = metaNames.filter(function (name) {
|
|
1260
1222
|
return cachedNames.indexOf(name) !== -1;
|
|
1261
1223
|
});
|
|
1262
|
-
|
|
1263
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
1264
1224
|
var _iterator = _createForOfIteratorHelper$4(foundNames),
|
|
1265
1225
|
_step;
|
|
1266
1226
|
try {
|
|
@@ -1290,7 +1250,7 @@ function extractFromMeta($, metaNames, cachedNames) {
|
|
|
1290
1250
|
if (cleanTags) {
|
|
1291
1251
|
metaValue = stripTags(values[0], $);
|
|
1292
1252
|
} else {
|
|
1293
|
-
var _values =
|
|
1253
|
+
var _values = _slicedToArray(values, 1);
|
|
1294
1254
|
metaValue = _values[0];
|
|
1295
1255
|
}
|
|
1296
1256
|
return {
|
|
@@ -1325,8 +1285,8 @@ function withinComment($node) {
|
|
|
1325
1285
|
return commentParent !== undefined;
|
|
1326
1286
|
}
|
|
1327
1287
|
|
|
1328
|
-
function _createForOfIteratorHelper$3(r, e) { var t = "undefined" != typeof
|
|
1329
|
-
function _unsupportedIterableToArray$3(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$3(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
1288
|
+
function _createForOfIteratorHelper$3(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray$3(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
1289
|
+
function _unsupportedIterableToArray$3(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$3(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$from(r) : "Arguments" === t || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t) ? _arrayLikeToArray$3(r, a) : void 0; } }
|
|
1330
1290
|
function _arrayLikeToArray$3(r, a) { (null == a || a > r.length) && (a = r.length); for (var e = 0, n = Array(a); e < a; e++) n[e] = r[e]; return n; }
|
|
1331
1291
|
function isGoodNode($node, maxChildren) {
|
|
1332
1292
|
// If it has a number of children, it's more likely a container
|
|
@@ -1347,7 +1307,6 @@ function isGoodNode($node, maxChildren) {
|
|
|
1347
1307
|
function extractFromSelectors($, selectors) {
|
|
1348
1308
|
var maxChildren = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
|
|
1349
1309
|
var textOnly = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
|
|
1350
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
1351
1310
|
var _iterator = _createForOfIteratorHelper$3(selectors),
|
|
1352
1311
|
_step;
|
|
1353
1312
|
try {
|
|
@@ -1416,7 +1375,7 @@ function convertLazyLoadedImages($) {
|
|
|
1416
1375
|
};
|
|
1417
1376
|
$('img').each(function (_, img) {
|
|
1418
1377
|
var attrs = getAttrs(img);
|
|
1419
|
-
_Reflect$
|
|
1378
|
+
_Reflect$ownKeys(attrs).forEach(function (attr) {
|
|
1420
1379
|
var value = attrs[attr];
|
|
1421
1380
|
if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
|
|
1422
1381
|
$(img).attr('srcset', value);
|
|
@@ -1458,9 +1417,9 @@ var Resource = {
|
|
|
1458
1417
|
create: function create(url, preparedResponse, parsedUrl) {
|
|
1459
1418
|
var _arguments = arguments,
|
|
1460
1419
|
_this = this;
|
|
1461
|
-
return
|
|
1420
|
+
return _asyncToGenerator(/*#__PURE__*/_regeneratorRuntime.mark(function _callee() {
|
|
1462
1421
|
var headers, result, validResponse;
|
|
1463
|
-
return
|
|
1422
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
1464
1423
|
while (1) switch (_context.prev = _context.next) {
|
|
1465
1424
|
case 0:
|
|
1466
1425
|
headers = _arguments.length > 3 && _arguments[3] !== undefined ? _arguments[3] : {};
|
|
@@ -1540,7 +1499,7 @@ var Resource = {
|
|
|
1540
1499
|
}
|
|
1541
1500
|
var encoding = getEncoding(contentType);
|
|
1542
1501
|
// UTF-8 is handled natively by Node.js, skip iconv-lite
|
|
1543
|
-
var decodedContent = encoding === 'utf-8' ? content.toString('utf-8') :
|
|
1502
|
+
var decodedContent = encoding === 'utf-8' ? content.toString('utf-8') : iconv.decode(content, encoding);
|
|
1544
1503
|
var $ = cheerio__namespace.load(decodedContent);
|
|
1545
1504
|
// after first cheerio.load, check to see if encoding matches
|
|
1546
1505
|
var contentTypeSelector = isBrowser ? 'meta[http-equiv=content-type]' : 'meta[http-equiv=content-type i]';
|
|
@@ -1549,7 +1508,7 @@ var Resource = {
|
|
|
1549
1508
|
|
|
1550
1509
|
// if encodings in the header/body dont match, use the one in the body
|
|
1551
1510
|
if (metaContentType && properEncoding !== encoding) {
|
|
1552
|
-
decodedContent = properEncoding === 'utf-8' ? content.toString('utf-8') :
|
|
1511
|
+
decodedContent = properEncoding === 'utf-8' ? content.toString('utf-8') : iconv.decode(content, properEncoding);
|
|
1553
1512
|
$ = cheerio__namespace.load(decodedContent);
|
|
1554
1513
|
}
|
|
1555
1514
|
return $;
|
|
@@ -1559,8 +1518,8 @@ var Resource = {
|
|
|
1559
1518
|
function range() {
|
|
1560
1519
|
var start = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
|
|
1561
1520
|
var end = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1;
|
|
1562
|
-
return /*#__PURE__*/
|
|
1563
|
-
return
|
|
1521
|
+
return /*#__PURE__*/_regeneratorRuntime.mark(function _callee() {
|
|
1522
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
1564
1523
|
while (1) switch (_context.prev = _context.next) {
|
|
1565
1524
|
case 0:
|
|
1566
1525
|
if (!(start <= end)) {
|
|
@@ -1594,7 +1553,7 @@ var merge = function merge(extractor, domains) {
|
|
|
1594
1553
|
}, {});
|
|
1595
1554
|
};
|
|
1596
1555
|
function mergeSupportedDomains(extractor) {
|
|
1597
|
-
return extractor.supportedDomains ? merge(extractor, [extractor.domain].concat(
|
|
1556
|
+
return extractor.supportedDomains ? merge(extractor, [extractor.domain].concat(_toConsumableArray(extractor.supportedDomains))) : merge(extractor, [extractor.domain]);
|
|
1598
1557
|
}
|
|
1599
1558
|
|
|
1600
1559
|
var apiExtractors = {};
|
|
@@ -1605,7 +1564,7 @@ function addExtractor(extractor) {
|
|
|
1605
1564
|
message: 'Unable to add custom extractor. Invalid parameters.'
|
|
1606
1565
|
};
|
|
1607
1566
|
}
|
|
1608
|
-
_Object$
|
|
1567
|
+
_Object$assign(apiExtractors, mergeSupportedDomains(extractor));
|
|
1609
1568
|
return apiExtractors;
|
|
1610
1569
|
}
|
|
1611
1570
|
|
|
@@ -2315,7 +2274,7 @@ var MediumExtractor = {
|
|
|
2315
2274
|
var $parent = $node.parents('figure');
|
|
2316
2275
|
if (ytRe.test(thumb)) {
|
|
2317
2276
|
var _thumb$match = thumb.match(ytRe),
|
|
2318
|
-
_thumb$match2 =
|
|
2277
|
+
_thumb$match2 = _slicedToArray(_thumb$match, 2);
|
|
2319
2278
|
_thumb$match2[0];
|
|
2320
2279
|
var youtubeId = _thumb$match2[1]; // eslint-disable-line
|
|
2321
2280
|
$node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
|
|
@@ -2338,7 +2297,7 @@ var MediumExtractor = {
|
|
|
2338
2297
|
// Remove any smaller images that did not get caught by the generic image
|
|
2339
2298
|
// cleaner (author photo 48px, leading sentence images 79px, etc.).
|
|
2340
2299
|
img: function img($node) {
|
|
2341
|
-
var width =
|
|
2300
|
+
var width = _parseInt($node.attr('width'), 10);
|
|
2342
2301
|
if (width < 100) $node.remove();
|
|
2343
2302
|
}
|
|
2344
2303
|
},
|
|
@@ -3233,7 +3192,7 @@ var WwwMsnbcComExtractor = {
|
|
|
3233
3192
|
// before it's consumable content? E.g., unusual lazy loaded images
|
|
3234
3193
|
transforms: {
|
|
3235
3194
|
'.pane-node-body': function paneNodeBody($node, $) {
|
|
3236
|
-
var _WwwMsnbcComExtractor =
|
|
3195
|
+
var _WwwMsnbcComExtractor = _slicedToArray(WwwMsnbcComExtractor.lead_image_url.selectors[0], 2),
|
|
3237
3196
|
selector = _WwwMsnbcComExtractor[0],
|
|
3238
3197
|
attr = _WwwMsnbcComExtractor[1];
|
|
3239
3198
|
var src = $(selector).attr(attr);
|
|
@@ -5382,7 +5341,7 @@ var WiredJpExtractor = {
|
|
|
5382
5341
|
'img[data-original]': function imgDataOriginal($node) {
|
|
5383
5342
|
var dataOriginal = $node.attr('data-original');
|
|
5384
5343
|
var src = $node.attr('src');
|
|
5385
|
-
var url =
|
|
5344
|
+
var url = URL$1.resolve(src, dataOriginal);
|
|
5386
5345
|
$node.attr('src', url);
|
|
5387
5346
|
}
|
|
5388
5347
|
},
|
|
@@ -5686,8 +5645,6 @@ var PastebinComExtractor = {
|
|
|
5686
5645
|
}
|
|
5687
5646
|
};
|
|
5688
5647
|
|
|
5689
|
-
/* eslint-disable no-nested-ternary */
|
|
5690
|
-
/* eslint-disable no-unused-expressions */
|
|
5691
5648
|
var WwwAbendblattDeExtractor = {
|
|
5692
5649
|
domain: 'www.abendblatt.de',
|
|
5693
5650
|
title: {
|
|
@@ -6306,14 +6263,14 @@ var WwwSePlExtractor = {
|
|
|
6306
6263
|
}
|
|
6307
6264
|
};
|
|
6308
6265
|
|
|
6309
|
-
function ownKeys$f(e, r) { var t = _Object$
|
|
6310
|
-
function _objectSpread$f(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$f(Object(t),
|
|
6266
|
+
function ownKeys$f(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
6267
|
+
function _objectSpread$f(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$f(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$f(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
6311
6268
|
var SportSePlExtractor = _objectSpread$f(_objectSpread$f({}, WwwSePlExtractor), {}, {
|
|
6312
6269
|
domain: 'sport.se.pl'
|
|
6313
6270
|
});
|
|
6314
6271
|
|
|
6315
|
-
function ownKeys$e(e, r) { var t = _Object$
|
|
6316
|
-
function _objectSpread$e(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$e(Object(t),
|
|
6272
|
+
function ownKeys$e(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
6273
|
+
function _objectSpread$e(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$e(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$e(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
6317
6274
|
var PolitykaSePlExtractor = _objectSpread$e(_objectSpread$e({}, WwwSePlExtractor), {}, {
|
|
6318
6275
|
domain: 'polityka.se.pl'
|
|
6319
6276
|
});
|
|
@@ -6346,20 +6303,20 @@ var SuperserialeSePlExtractor = {
|
|
|
6346
6303
|
}
|
|
6347
6304
|
};
|
|
6348
6305
|
|
|
6349
|
-
function ownKeys$d(e, r) { var t = _Object$
|
|
6350
|
-
function _objectSpread$d(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$d(Object(t),
|
|
6306
|
+
function ownKeys$d(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
6307
|
+
function _objectSpread$d(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$d(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$d(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
6351
6308
|
var SzczecinSePlExtractor = _objectSpread$d(_objectSpread$d({}, WwwSePlExtractor), {}, {
|
|
6352
6309
|
domain: 'szczecin.se.pl'
|
|
6353
6310
|
});
|
|
6354
6311
|
|
|
6355
|
-
function ownKeys$c(e, r) { var t = _Object$
|
|
6356
|
-
function _objectSpread$c(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$c(Object(t),
|
|
6312
|
+
function ownKeys$c(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
6313
|
+
function _objectSpread$c(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$c(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$c(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
6357
6314
|
var SuperbizSePlExtractor = _objectSpread$c(_objectSpread$c({}, WwwSePlExtractor), {}, {
|
|
6358
6315
|
domain: 'superbiz.se.pl'
|
|
6359
6316
|
});
|
|
6360
6317
|
|
|
6361
|
-
function ownKeys$b(e, r) { var t = _Object$
|
|
6362
|
-
function _objectSpread$b(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$b(Object(t),
|
|
6318
|
+
/**
 * Lists the property keys of `e` for the object-spread helper.
 *
 * Starts from `_Object$keys(e)` and, when `_Object$getOwnPropertySymbols` is
 * available, appends the symbols it reports for `e`.
 *
 * @param {Object} e - Object whose keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose descriptor is not
 *   enumerable are left out.
 * @returns {Array} The collected keys, string keys first.
 */
function ownKeys$b(e, r) {
  var collected = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return collected;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (symbol) {
      return _Object$getOwnPropertyDescriptor(e, symbol).enumerable;
    });
  }
  collected.push.apply(collected, symbols);
  return collected;
}
|
|
6319
|
+
/**
 * Object-spread helper: copies properties from every extra argument onto `e`.
 *
 * Arguments at odd positions are copied key by key through `_defineProperty`;
 * arguments at even positions are copied by property descriptor, using
 * `_Object$getOwnPropertyDescriptors` when it exists and a per-key fallback
 * otherwise. `null`/`undefined` sources are treated as empty objects.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$b(e) {
  var total = arguments.length;
  for (var index = 1; index < total; index += 1) {
    var source = arguments[index] == null ? {} : arguments[index];
    if (index % 2 === 1) {
      ownKeys$b(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$b(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
6363
6320
|
// Copies every rule from WwwSePlExtractor and overrides only `domain`.
var PortalobronnySePlExtractor = _objectSpread$b(_objectSpread$b({}, WwwSePlExtractor), {}, {
  domain: 'portalobronny.se.pl'
});
|
|
@@ -6386,26 +6343,26 @@ var PolskisamorzadSePlExtractor = {
|
|
|
6386
6343
|
}
|
|
6387
6344
|
};
|
|
6388
6345
|
|
|
6389
|
-
function ownKeys$a(e, r) { var t = _Object$
|
|
6390
|
-
function _objectSpread$a(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$a(Object(t),
|
|
6346
|
+
/**
 * Lists the property keys of `e` for the object-spread helper.
 *
 * Starts from `_Object$keys(e)` and, when `_Object$getOwnPropertySymbols` is
 * available, appends the symbols it reports for `e`.
 *
 * @param {Object} e - Object whose keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose descriptor is not
 *   enumerable are left out.
 * @returns {Array} The collected keys, string keys first.
 */
function ownKeys$a(e, r) {
  var collected = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return collected;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (symbol) {
      return _Object$getOwnPropertyDescriptor(e, symbol).enumerable;
    });
  }
  collected.push.apply(collected, symbols);
  return collected;
}
|
|
6347
|
+
/**
 * Object-spread helper: copies properties from every extra argument onto `e`.
 *
 * Arguments at odd positions are copied key by key through `_defineProperty`;
 * arguments at even positions are copied by property descriptor, using
 * `_Object$getOwnPropertyDescriptors` when it exists and a per-key fallback
 * otherwise. `null`/`undefined` sources are treated as empty objects.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$a(e) {
  var total = arguments.length;
  for (var index = 1; index < total; index += 1) {
    var source = arguments[index] == null ? {} : arguments[index];
    if (index % 2 === 1) {
      ownKeys$a(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$a(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
6391
6348
|
// Copies every rule from WwwSePlExtractor and overrides only `domain`.
var LodzSePlExtractor = _objectSpread$a(_objectSpread$a({}, WwwSePlExtractor), {}, {
  domain: 'lodz.se.pl'
});
|
|
6394
6351
|
|
|
6395
|
-
function ownKeys$9(e, r) { var t = _Object$
|
|
6396
|
-
function _objectSpread$9(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$9(Object(t),
|
|
6352
|
+
/**
 * Lists the property keys of `e` for the object-spread helper.
 *
 * Starts from `_Object$keys(e)` and, when `_Object$getOwnPropertySymbols` is
 * available, appends the symbols it reports for `e`.
 *
 * @param {Object} e - Object whose keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose descriptor is not
 *   enumerable are left out.
 * @returns {Array} The collected keys, string keys first.
 */
function ownKeys$9(e, r) {
  var collected = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return collected;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (symbol) {
      return _Object$getOwnPropertyDescriptor(e, symbol).enumerable;
    });
  }
  collected.push.apply(collected, symbols);
  return collected;
}
|
|
6353
|
+
/**
 * Object-spread helper: copies properties from every extra argument onto `e`.
 *
 * Arguments at odd positions are copied key by key through `_defineProperty`;
 * arguments at even positions are copied by property descriptor, using
 * `_Object$getOwnPropertyDescriptors` when it exists and a per-key fallback
 * otherwise. `null`/`undefined` sources are treated as empty objects.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$9(e) {
  var total = arguments.length;
  for (var index = 1; index < total; index += 1) {
    var source = arguments[index] == null ? {} : arguments[index];
    if (index % 2 === 1) {
      ownKeys$9(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$9(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
6397
6354
|
// Copies every rule from WwwSePlExtractor and overrides only `domain`.
var WroclawSePlExtractor = _objectSpread$9(_objectSpread$9({}, WwwSePlExtractor), {}, {
  domain: 'wroclaw.se.pl'
});
|
|
6400
6357
|
|
|
6401
|
-
function ownKeys$8(e, r) { var t = _Object$
|
|
6402
|
-
function _objectSpread$8(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$8(Object(t),
|
|
6358
|
+
/**
 * Lists the property keys of `e` for the object-spread helper.
 *
 * Starts from `_Object$keys(e)` and, when `_Object$getOwnPropertySymbols` is
 * available, appends the symbols it reports for `e`.
 *
 * @param {Object} e - Object whose keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose descriptor is not
 *   enumerable are left out.
 * @returns {Array} The collected keys, string keys first.
 */
function ownKeys$8(e, r) {
  var collected = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return collected;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (symbol) {
      return _Object$getOwnPropertyDescriptor(e, symbol).enumerable;
    });
  }
  collected.push.apply(collected, symbols);
  return collected;
}
|
|
6359
|
+
/**
 * Object-spread helper: copies properties from every extra argument onto `e`.
 *
 * Arguments at odd positions are copied key by key through `_defineProperty`;
 * arguments at even positions are copied by property descriptor, using
 * `_Object$getOwnPropertyDescriptors` when it exists and a per-key fallback
 * otherwise. `null`/`undefined` sources are treated as empty objects.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$8(e) {
  var total = arguments.length;
  for (var index = 1; index < total; index += 1) {
    var source = arguments[index] == null ? {} : arguments[index];
    if (index % 2 === 1) {
      ownKeys$8(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$8(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
6403
6360
|
// Copies every rule from WwwSePlExtractor and overrides only `domain`.
var LublinSePlExtractor = _objectSpread$8(_objectSpread$8({}, WwwSePlExtractor), {}, {
  domain: 'lublin.se.pl'
});
|
|
6406
6363
|
|
|
6407
|
-
function ownKeys$7(e, r) { var t = _Object$
|
|
6408
|
-
function _objectSpread$7(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$7(Object(t),
|
|
6364
|
+
/**
 * Lists the property keys of `e` for the object-spread helper.
 *
 * Starts from `_Object$keys(e)` and, when `_Object$getOwnPropertySymbols` is
 * available, appends the symbols it reports for `e`.
 *
 * @param {Object} e - Object whose keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose descriptor is not
 *   enumerable are left out.
 * @returns {Array} The collected keys, string keys first.
 */
function ownKeys$7(e, r) {
  var collected = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return collected;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (symbol) {
      return _Object$getOwnPropertyDescriptor(e, symbol).enumerable;
    });
  }
  collected.push.apply(collected, symbols);
  return collected;
}
|
|
6365
|
+
/**
 * Object-spread helper: copies properties from every extra argument onto `e`.
 *
 * Arguments at odd positions are copied key by key through `_defineProperty`;
 * arguments at even positions are copied by property descriptor, using
 * `_Object$getOwnPropertyDescriptors` when it exists and a per-key fallback
 * otherwise. `null`/`undefined` sources are treated as empty objects.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$7(e) {
  var total = arguments.length;
  for (var index = 1; index < total; index += 1) {
    var source = arguments[index] == null ? {} : arguments[index];
    if (index % 2 === 1) {
      ownKeys$7(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$7(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
6409
6366
|
// Copies every rule from WwwSePlExtractor and overrides only `domain`.
var BialystokSePlExtractor = _objectSpread$7(_objectSpread$7({}, WwwSePlExtractor), {}, {
  domain: 'bialystok.se.pl'
});
|
|
@@ -6661,7 +6618,7 @@ var WwwPolygonComExtractor = {
|
|
|
6661
6618
|
img: function img($node) {
|
|
6662
6619
|
var srcset = $node.attr('srcset');
|
|
6663
6620
|
var _split = (srcset || '').split(','),
|
|
6664
|
-
_split2 =
|
|
6621
|
+
_split2 = _slicedToArray(_split, 1),
|
|
6665
6622
|
src = _split2[0];
|
|
6666
6623
|
if (src) {
|
|
6667
6624
|
$node.parent().replaceWith("<figure><img srcset=\"".concat(srcset, "\" src=\"").concat(src, "\"/></figure>"));
|
|
@@ -6701,7 +6658,7 @@ var WwwThevergeComExtractor = {
|
|
|
6701
6658
|
img: function img($node) {
|
|
6702
6659
|
var srcset = $node.attr('srcset');
|
|
6703
6660
|
var _split = (srcset || '').split(','),
|
|
6704
|
-
_split2 =
|
|
6661
|
+
_split2 = _slicedToArray(_split, 1),
|
|
6705
6662
|
src = _split2[0];
|
|
6706
6663
|
if (src) {
|
|
6707
6664
|
$node.parent().replaceWith("<figure><img srcset=\"".concat(srcset, "\" src=\"").concat(src, "\"/></figure>"));
|
|
@@ -7235,216 +7192,457 @@ var WwwEuronewsComExtractor = {
|
|
|
7235
7192
|
}
|
|
7236
7193
|
};
|
|
7237
7194
|
|
|
7238
|
-
function ownKeys$6(e, r) { var t = _Object$
|
|
7239
|
-
function _objectSpread$6(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$6(Object(t),
|
|
7195
|
+
/**
 * Lists the property keys of `e` for the object-spread helper.
 *
 * Starts from `_Object$keys(e)` and, when `_Object$getOwnPropertySymbols` is
 * available, appends the symbols it reports for `e`.
 *
 * @param {Object} e - Object whose keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose descriptor is not
 *   enumerable are left out.
 * @returns {Array} The collected keys, string keys first.
 */
function ownKeys$6(e, r) {
  var collected = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return collected;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (symbol) {
      return _Object$getOwnPropertyDescriptor(e, symbol).enumerable;
    });
  }
  collected.push.apply(collected, symbols);
  return collected;
}
|
|
7196
|
+
/**
 * Object-spread helper: copies properties from every extra argument onto `e`.
 *
 * Arguments at odd positions are copied key by key through `_defineProperty`;
 * arguments at even positions are copied by property descriptor, using
 * `_Object$getOwnPropertyDescriptors` when it exists and a per-key fallback
 * otherwise. `null`/`undefined` sources are treated as empty objects.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$6(e) {
  var total = arguments.length;
  for (var index = 1; index < total; index += 1) {
    var source = arguments[index] == null ? {} : arguments[index];
    if (index % 2 === 1) {
      ownKeys$6(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$6(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
7240
7197
|
// Copies every rule from WwwEuronewsComExtractor and overrides only `domain`.
var GrEuronewsComExtractor = _objectSpread$6(_objectSpread$6({}, WwwEuronewsComExtractor), {}, {
  domain: 'gr.euronews.com'
});
|
|
7243
7200
|
|
|
7201
|
+
// Extraction rules for www.ilfattoquotidiano.it.
// Each field lists selectors; a [selector, attribute] pair reads that
// attribute from the matched element instead of its text.
var WwwIlfattoquotidianoItExtractor = {
  domain: 'www.ilfattoquotidiano.it',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['.ifq-post__author .ifq-news-meta__author-name'],
    // Drop <span> children of the matched author element.
    clean: ['span']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    // '.ifq-post__content' is listed first; 'article' is the second candidate.
    selectors: ['.ifq-post__content', 'article'],
    transforms: {},
    clean: []
  }
};
|
|
7225
|
+
|
|
7226
|
+
// Extraction rules for actualidad.rt.com.
// Each field lists selectors; a [selector, attribute] pair reads that
// attribute from the matched element instead of its text.
var ActualidadRtComExtractor = {
  domain: 'actualidad.rt.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="article:author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="mediator_published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.ArticleView-text'],
    transforms: {},
    // RT wraps each <img> in a <picture> whose <source> elements carry a
    // base64 placeholder srcset; browsers honor that over the real <img src>,
    // so drop the sources and let the <img> (real URL) render.
    clean: ['.ReadMore-root', 'source']
  }
};
|
|
7252
|
+
|
|
7253
|
+
// Extraction rules for www.tweaktown.com.
// Each field lists selectors; a [selector, attribute] pair reads that
// attribute from the matched element instead of its text.
var WwwTweaktownComExtractor = {
  domain: 'www.tweaktown.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['.info-bar-div2 a[rel="author"]']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['#article-body'],
    transforms: {},
    clean: []
  }
};
|
|
7276
|
+
|
|
7277
|
+
// Extraction rules for www.frandroid.com.
// Each field lists selectors; a [selector, attribute] pair reads that
// attribute from the matched element instead of its text.
var WwwFrandroidComExtractor = {
  domain: 'www.frandroid.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="parsely-author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['section.article-content'],
    transforms: {
      // Both heading transforms overwrite the element's class attribute with
      // 'mercury-parser-keep' and return the result of node.attr().
      h2: function h2(node) {
        return node.attr('class', 'mercury-parser-keep');
      },
      h3: function h3(node) {
        return node.attr('class', 'mercury-parser-keep');
      }
    },
    clean: ['.index-menu-wrapper', '.is-gastric-kingfisher', '.newsletter-form', '.share', '.article-footer', '.js-feed-posts', '.optidigital-adslot', '[id^="optidigital-adslot"]']
  }
};
|
|
7307
|
+
|
|
7308
|
+
// Extraction rules for www.motorsport.com.
// Each field lists selectors; a [selector, attribute] pair reads that
// attribute from the matched element instead of its text.
var WwwMotorsportComExtractor = {
  domain: 'www.motorsport.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['.msnt-author-toolbar a[href*="/info/about-us/"]']
  },
  date_published: {
    selectors: [['meta[name="datePublished"]', 'value']]
  },
  dek: {
    selectors: ['h2.text-article-description']
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.ms-article-content'],
    transforms: {
      // Overwrites the heading's class attribute with 'mercury-parser-keep'.
      h2: function h2(node) {
        return node.attr('class', 'mercury-parser-keep');
      }
    },
    // NOTE(review): 'msnt-survey-promo' has no leading dot, so it matches an
    // element named <msnt-survey-promo>, not a class. Every other entry here is
    // a class selector - confirm against the live markup whether
    // '.msnt-survey-promo' was intended.
    clean: ['msnt-survey-promo', '.article-fullwidth-gallery_item ~ .article-fullwidth-gallery_item', '.ms-inarticle-widgets', '.relatedContent', '.ms-apb', '.ms-ap-native', '.outstream_partner']
  }
};
|
|
7335
|
+
|
|
7336
|
+
// Extraction rules for substack.com.
// Each field lists selectors; a [selector, attribute] pair reads that
// attribute from the matched element instead of its text.
var SubstackComExtractor = {
  domain: 'substack.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: [['meta[name="author"]', 'value']]
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.available-content'],
    transforms: {
      // String value: the matched container is mapped to a <figure> tag name.
      'div.captioned-image-container': 'figure',
      // Replace the wrapper with whatever <img> elements it contains.
      'div.image-link': function divImageLink($node) {
        $node.replaceWith($node.find('img'));
      }
    },
    clean: ['.subscribe-widget', '.subscription-widget-wrap', '.subscription-widget-wrap-editor', '.button-wrapper', '.poll-embed', '.share-dialog']
  }
};
|
|
7364
|
+
|
|
7365
|
+
// Extraction rules for www.dw.com.
// Each field lists selectors; a [selector, attribute] pair reads that
// attribute from the matched element instead of its text.
var WwwDwComExtractor = {
  domain: 'www.dw.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['.author-name .author-link']
  },
  date_published: {
    selectors: [['meta[name="date"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['[data-tracking-name="rich-text"]'],
    transforms: {
      // DW inline images are responsive: the real template lives in data-url
      // with a literal ${formatId} size token that JS would replace, leaving a
      // broken src in the raw HTML. Resolve it to a standard content size.
      img: function img(node) {
        var template = node.attr('data-url') || node.attr('src') || '';
        if (template.includes('${formatId}')) {
          // split/join swaps every token; String.replace with a string
          // pattern would only swap the first one.
          node.attr('src', template.split('${formatId}').join('6'));
        }
      }
    },
    // Embedded tweets are non-functional fallback markup without JS.
    clean: ['blockquote.tweet.embed']
  }
};
|
|
7399
|
+
|
|
7400
|
+
// Extraction rules for www.animenewsnetwork.com.
// Each field lists selectors; a [selector, attribute] pair reads that
// attribute from the matched element instead of its text.
var WwwAnimenewsnetworkComExtractor = {
  domain: 'www.animenewsnetwork.com',
  title: {
    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: null,
  date_published: {
    selectors: [['small time', 'datetime']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
    selectors: ['.KonaBody'],
    transforms: {
      // Images are lazy-loaded: real URL in data-src, a spacer.gif in src.
      // Promote data-src so the images survive cleaning and render.
      img: function img(node) {
        var dataSrc = node.attr('data-src');
        if (dataSrc) {
          var src = dataSrc;
          if (dataSrc.startsWith('//')) {
            // Protocol-relative URL: it already names a host, so only add the
            // scheme. Prefixing the site origin would yield a broken URL.
            src = "https:".concat(dataSrc);
          } else if (dataSrc.startsWith('/')) {
            // Site-root-relative path: resolve against the site origin.
            src = "https://www.animenewsnetwork.com".concat(dataSrc);
          }
          node.attr('src', src);
          node.removeAttr('data-src');
        }
      }
    },
    // .intro duplicates the dek; instaread-player is an audio widget.
    clean: ['.intro', 'instaread-player']
  }
};
|
|
7433
|
+
|
|
7244
7434
|
var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
7245
7435
|
__proto__: null,
|
|
7436
|
+
AbcnewsGoComExtractor: AbcnewsGoComExtractor,
|
|
7437
|
+
ActualidadRtComExtractor: ActualidadRtComExtractor,
|
|
7438
|
+
ApartmentTherapyExtractor: ApartmentTherapyExtractor,
|
|
7439
|
+
ArstechnicaComExtractor: ArstechnicaComExtractor,
|
|
7246
7440
|
BalloonJuiceComExtractor: BalloonJuiceComExtractor,
|
|
7441
|
+
BialystokSePlExtractor: BialystokSePlExtractor,
|
|
7442
|
+
BiorxivOrgExtractor: BiorxivOrgExtractor,
|
|
7443
|
+
BlisterreviewComExtractor: BlisterreviewComExtractor,
|
|
7247
7444
|
BloggerExtractor: BloggerExtractor,
|
|
7248
|
-
|
|
7249
|
-
|
|
7250
|
-
|
|
7251
|
-
|
|
7252
|
-
TheAtlanticExtractor: TheAtlanticExtractor,
|
|
7253
|
-
NewYorkerExtractor: NewYorkerExtractor,
|
|
7254
|
-
WiredExtractor: WiredExtractor,
|
|
7255
|
-
MSNExtractor: MSNExtractor,
|
|
7256
|
-
YahooExtractor: YahooExtractor,
|
|
7445
|
+
BookwalkerJpExtractor: BookwalkerJpExtractor,
|
|
7446
|
+
BroadwayWorldExtractor: BroadwayWorldExtractor,
|
|
7447
|
+
BskyAppExtractor: BskyAppExtractor,
|
|
7448
|
+
BuzzapJpExtractor: BuzzapJpExtractor,
|
|
7257
7449
|
BuzzfeedExtractor: BuzzfeedExtractor,
|
|
7258
|
-
|
|
7259
|
-
|
|
7260
|
-
|
|
7450
|
+
ChicagoyimbyComExtractor: ChicagoyimbyComExtractor,
|
|
7451
|
+
ClinicaltrialsGovExtractor: ClinicaltrialsGovExtractor,
|
|
7452
|
+
DeadlineComExtractor: DeadlineComExtractor,
|
|
7261
7453
|
DeadspinExtractor: DeadspinExtractor,
|
|
7262
|
-
|
|
7263
|
-
|
|
7454
|
+
EconomictimesIndiatimesComExtractor: EconomictimesIndiatimesComExtractor,
|
|
7455
|
+
EpaperZeitDeExtractor: EpaperZeitDeExtractor,
|
|
7456
|
+
FactorioComExtractor: FactorioComExtractor,
|
|
7457
|
+
FortuneComExtractor: FortuneComExtractor,
|
|
7458
|
+
ForwardComExtractor: ForwardComExtractor,
|
|
7459
|
+
GeniusComExtractor: GeniusComExtractor,
|
|
7460
|
+
GetnewsJpExtractor: GetnewsJpExtractor,
|
|
7461
|
+
GithubComExtractor: GithubComExtractor,
|
|
7462
|
+
GonintendoComExtractor: GonintendoComExtractor,
|
|
7463
|
+
GothamistComExtractor: GothamistComExtractor,
|
|
7464
|
+
GrEuronewsComExtractor: GrEuronewsComExtractor,
|
|
7465
|
+
HellogigglesComExtractor: HellogigglesComExtractor,
|
|
7466
|
+
IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
|
|
7467
|
+
JapanCnetComExtractor: JapanCnetComExtractor,
|
|
7468
|
+
JapanZdnetComExtractor: JapanZdnetComExtractor,
|
|
7469
|
+
JvndbJvnJpExtractor: JvndbJvnJpExtractor,
|
|
7470
|
+
LittleThingsExtractor: LittleThingsExtractor,
|
|
7471
|
+
LodzSePlExtractor: LodzSePlExtractor,
|
|
7472
|
+
LublinSePlExtractor: LublinSePlExtractor,
|
|
7473
|
+
MSNExtractor: MSNExtractor,
|
|
7474
|
+
MaTtiasBeExtractor: MaTtiasBeExtractor,
|
|
7475
|
+
MashableComExtractor: MashableComExtractor,
|
|
7264
7476
|
MediumExtractor: MediumExtractor,
|
|
7265
|
-
|
|
7266
|
-
WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor,
|
|
7267
|
-
WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor,
|
|
7268
|
-
NewrepublicComExtractor: NewrepublicComExtractor,
|
|
7477
|
+
MobilesyrupComExtractor: MobilesyrupComExtractor,
|
|
7269
7478
|
MoneyCnnComExtractor: MoneyCnnComExtractor,
|
|
7270
|
-
|
|
7271
|
-
|
|
7272
|
-
|
|
7273
|
-
|
|
7274
|
-
|
|
7275
|
-
WwwBloombergComExtractor: WwwBloombergComExtractor,
|
|
7276
|
-
WwwBustleComExtractor: WwwBustleComExtractor,
|
|
7277
|
-
WwwNprOrgExtractor: WwwNprOrgExtractor,
|
|
7278
|
-
WwwRecodeNetExtractor: WwwRecodeNetExtractor,
|
|
7279
|
-
QzComExtractor: QzComExtractor,
|
|
7280
|
-
WwwDmagazineComExtractor: WwwDmagazineComExtractor,
|
|
7281
|
-
WwwReutersComExtractor: WwwReutersComExtractor,
|
|
7282
|
-
MashableComExtractor: MashableComExtractor,
|
|
7283
|
-
WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor,
|
|
7284
|
-
WwwVoxComExtractor: WwwVoxComExtractor,
|
|
7479
|
+
NYMagExtractor: NYMagExtractor,
|
|
7480
|
+
NYTimesExtractor: NYTimesExtractor,
|
|
7481
|
+
NewYorkerExtractor: NewYorkerExtractor,
|
|
7482
|
+
NewrepublicComExtractor: NewrepublicComExtractor,
|
|
7483
|
+
NewsMynaviJpExtractor: NewsMynaviJpExtractor,
|
|
7285
7484
|
NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor,
|
|
7286
|
-
|
|
7287
|
-
|
|
7288
|
-
|
|
7289
|
-
|
|
7290
|
-
|
|
7291
|
-
WwwMsnbcComExtractor: WwwMsnbcComExtractor,
|
|
7292
|
-
WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor,
|
|
7293
|
-
WwwMentalflossComExtractor: WwwMentalflossComExtractor,
|
|
7294
|
-
AbcnewsGoComExtractor: AbcnewsGoComExtractor,
|
|
7295
|
-
WwwNydailynewsComExtractor: WwwNydailynewsComExtractor,
|
|
7296
|
-
WwwCnbcComExtractor: WwwCnbcComExtractor,
|
|
7297
|
-
WwwPopsugarComExtractor: WwwPopsugarComExtractor,
|
|
7485
|
+
NewsPtsOrgTwExtractor: NewsPtsOrgTwExtractor,
|
|
7486
|
+
Nineto5googleComExtractor: Nineto5googleComExtractor,
|
|
7487
|
+
Nineto5linuxComExtractor: Nineto5linuxComExtractor,
|
|
7488
|
+
Nineto5macComExtractor: Nineto5macComExtractor,
|
|
7489
|
+
ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
|
|
7298
7490
|
ObserverComExtractor: ObserverComExtractor,
|
|
7491
|
+
OrfAtExtractor: OrfAtExtractor,
|
|
7492
|
+
OtrsComExtractor: OtrsComExtractor,
|
|
7493
|
+
PagesixComExtractor: PagesixComExtractor,
|
|
7494
|
+
PastebinComExtractor: PastebinComExtractor,
|
|
7299
7495
|
PeopleComExtractor: PeopleComExtractor,
|
|
7300
|
-
|
|
7301
|
-
|
|
7302
|
-
|
|
7496
|
+
PhpspotOrgExtractor: PhpspotOrgExtractor,
|
|
7497
|
+
PitchforkComExtractor: PitchforkComExtractor,
|
|
7498
|
+
PoliticoExtractor: PoliticoExtractor,
|
|
7499
|
+
PolitykaSePlExtractor: PolitykaSePlExtractor,
|
|
7500
|
+
PolskisamorzadSePlExtractor: PolskisamorzadSePlExtractor,
|
|
7501
|
+
PortalobronnySePlExtractor: PortalobronnySePlExtractor,
|
|
7502
|
+
QzComExtractor: QzComExtractor,
|
|
7503
|
+
ScanNetsecurityNeJpExtractor: ScanNetsecurityNeJpExtractor,
|
|
7504
|
+
ScienceflyComExtractor: ScienceflyComExtractor,
|
|
7505
|
+
SectIijAdJpExtractor: SectIijAdJpExtractor,
|
|
7506
|
+
SgNewsYahooComExtractor: SgNewsYahooComExtractor,
|
|
7507
|
+
SpektrumExtractor: SpektrumExtractor,
|
|
7508
|
+
SportSePlExtractor: SportSePlExtractor,
|
|
7509
|
+
SubstackComExtractor: SubstackComExtractor,
|
|
7510
|
+
SuperbizSePlExtractor: SuperbizSePlExtractor,
|
|
7511
|
+
SuperserialeSePlExtractor: SuperserialeSePlExtractor,
|
|
7512
|
+
SzczecinSePlExtractor: SzczecinSePlExtractor,
|
|
7513
|
+
TakagihiromitsuJpExtractor: TakagihiromitsuJpExtractor,
|
|
7514
|
+
TarnkappeInfoExtractor: TarnkappeInfoExtractor,
|
|
7515
|
+
TechcrunchComExtractor: TechcrunchComExtractor,
|
|
7516
|
+
TechlogIijAdJpExtractor: TechlogIijAdJpExtractor,
|
|
7517
|
+
TerminaltroveComExtractor: TerminaltroveComExtractor,
|
|
7518
|
+
TheAtlanticExtractor: TheAtlanticExtractor,
|
|
7519
|
+
ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor,
|
|
7520
|
+
ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
|
|
7521
|
+
TimesofindiaIndiatimesComExtractor: TimesofindiaIndiatimesComExtractor,
|
|
7522
|
+
TldrTechExtractor: TldrTechExtractor,
|
|
7523
|
+
TwitterExtractor: TwitterExtractor,
|
|
7303
7524
|
UproxxComExtractor: UproxxComExtractor,
|
|
7304
|
-
|
|
7305
|
-
|
|
7306
|
-
|
|
7307
|
-
|
|
7308
|
-
|
|
7309
|
-
|
|
7310
|
-
|
|
7311
|
-
|
|
7312
|
-
|
|
7525
|
+
WccftechComExtractor: WccftechComExtractor,
|
|
7526
|
+
WeeklyAsciiJpExtractor: WeeklyAsciiJpExtractor,
|
|
7527
|
+
WikiaExtractor: WikiaExtractor,
|
|
7528
|
+
WikipediaExtractor: WikipediaExtractor,
|
|
7529
|
+
WiredExtractor: WiredExtractor,
|
|
7530
|
+
WiredJpExtractor: WiredJpExtractor,
|
|
7531
|
+
WroclawSePlExtractor: WroclawSePlExtractor,
|
|
7532
|
+
Www1pezeshkComExtractor: Www1pezeshkComExtractor,
|
|
7533
|
+
WwwAbendblattDeExtractor: WwwAbendblattDeExtractor,
|
|
7313
7534
|
WwwAlComExtractor: WwwAlComExtractor,
|
|
7314
|
-
WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
|
|
7315
|
-
WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
|
|
7316
7535
|
WwwAmericanowComExtractor: WwwAmericanowComExtractor,
|
|
7317
|
-
|
|
7318
|
-
|
|
7319
|
-
|
|
7320
|
-
|
|
7321
|
-
WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
|
|
7322
|
-
FortuneComExtractor: FortuneComExtractor,
|
|
7323
|
-
WwwLinkedinComExtractor: WwwLinkedinComExtractor,
|
|
7324
|
-
ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
|
|
7325
|
-
WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
|
|
7326
|
-
WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor,
|
|
7327
|
-
ForwardComExtractor: ForwardComExtractor,
|
|
7328
|
-
WwwQdailyComExtractor: WwwQdailyComExtractor,
|
|
7329
|
-
GothamistComExtractor: GothamistComExtractor,
|
|
7330
|
-
WwwFoolComExtractor: WwwFoolComExtractor,
|
|
7331
|
-
WwwSlateComExtractor: WwwSlateComExtractor,
|
|
7332
|
-
IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
|
|
7333
|
-
WwwFortinetComExtractor: WwwFortinetComExtractor,
|
|
7334
|
-
WwwFastcompanyComExtractor: WwwFastcompanyComExtractor,
|
|
7335
|
-
BlisterreviewComExtractor: BlisterreviewComExtractor,
|
|
7336
|
-
NewsMynaviJpExtractor: NewsMynaviJpExtractor,
|
|
7337
|
-
ClinicaltrialsGovExtractor: ClinicaltrialsGovExtractor,
|
|
7338
|
-
GithubComExtractor: GithubComExtractor,
|
|
7339
|
-
WwwRedditComExtractor: WwwRedditComExtractor,
|
|
7340
|
-
OtrsComExtractor: OtrsComExtractor,
|
|
7341
|
-
WwwOssnewsJpExtractor: WwwOssnewsJpExtractor,
|
|
7342
|
-
BuzzapJpExtractor: BuzzapJpExtractor,
|
|
7536
|
+
WwwAndroidauthorityComExtractor: WwwAndroidauthorityComExtractor,
|
|
7537
|
+
WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor,
|
|
7538
|
+
WwwAnimenewsnetworkComExtractor: WwwAnimenewsnetworkComExtractor,
|
|
7539
|
+
WwwAolComExtractor: WwwAolComExtractor,
|
|
7343
7540
|
WwwAsahiComExtractor: WwwAsahiComExtractor,
|
|
7344
|
-
|
|
7541
|
+
WwwBlickDeExtractor: WwwBlickDeExtractor,
|
|
7542
|
+
WwwBloombergComExtractor: WwwBloombergComExtractor,
|
|
7543
|
+
WwwBustleComExtractor: WwwBustleComExtractor,
|
|
7544
|
+
WwwCbcCaExtractor: WwwCbcCaExtractor,
|
|
7545
|
+
WwwCbssportsComExtractor: WwwCbssportsComExtractor,
|
|
7546
|
+
WwwChannelnewsasiaComExtractor: WwwChannelnewsasiaComExtractor,
|
|
7547
|
+
WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor,
|
|
7548
|
+
WwwCnbcComExtractor: WwwCnbcComExtractor,
|
|
7549
|
+
WwwCnetComExtractor: WwwCnetComExtractor,
|
|
7550
|
+
WwwCnnComExtractor: WwwCnnComExtractor,
|
|
7551
|
+
WwwDmagazineComExtractor: WwwDmagazineComExtractor,
|
|
7552
|
+
WwwDwComExtractor: WwwDwComExtractor,
|
|
7345
7553
|
WwwElecomCoJpExtractor: WwwElecomCoJpExtractor,
|
|
7346
|
-
|
|
7347
|
-
|
|
7348
|
-
|
|
7349
|
-
|
|
7350
|
-
|
|
7351
|
-
|
|
7352
|
-
|
|
7353
|
-
|
|
7354
|
-
|
|
7355
|
-
TakagihiromitsuJpExtractor: TakagihiromitsuJpExtractor,
|
|
7356
|
-
BookwalkerJpExtractor: BookwalkerJpExtractor,
|
|
7357
|
-
WwwYomiuriCoJpExtractor: WwwYomiuriCoJpExtractor,
|
|
7358
|
-
JapanCnetComExtractor: JapanCnetComExtractor,
|
|
7359
|
-
DeadlineComExtractor: DeadlineComExtractor,
|
|
7554
|
+
WwwEngadgetComExtractor: WwwEngadgetComExtractor,
|
|
7555
|
+
WwwEonlineComExtractor: WwwEonlineComExtractor,
|
|
7556
|
+
WwwEuronewsComExtractor: WwwEuronewsComExtractor,
|
|
7557
|
+
WwwFastcompanyComExtractor: WwwFastcompanyComExtractor,
|
|
7558
|
+
WwwFlatpanelshdComExtractor: WwwFlatpanelshdComExtractor,
|
|
7559
|
+
WwwFoolComExtractor: WwwFoolComExtractor,
|
|
7560
|
+
WwwFortinetComExtractor: WwwFortinetComExtractor,
|
|
7561
|
+
WwwFrandroidComExtractor: WwwFrandroidComExtractor,
|
|
7562
|
+
WwwFuturaSciencesComExtractor: WwwFuturaSciencesComExtractor,
|
|
7360
7563
|
WwwGizmodoJpExtractor: WwwGizmodoJpExtractor,
|
|
7361
|
-
GetnewsJpExtractor: GetnewsJpExtractor,
|
|
7362
|
-
WwwLifehackerJpExtractor: WwwLifehackerJpExtractor,
|
|
7363
|
-
SectIijAdJpExtractor: SectIijAdJpExtractor,
|
|
7364
|
-
WwwOreillyCoJpExtractor: WwwOreillyCoJpExtractor,
|
|
7365
|
-
WwwIpaGoJpExtractor: WwwIpaGoJpExtractor,
|
|
7366
|
-
WeeklyAsciiJpExtractor: WeeklyAsciiJpExtractor,
|
|
7367
|
-
TechlogIijAdJpExtractor: TechlogIijAdJpExtractor,
|
|
7368
|
-
WiredJpExtractor: WiredJpExtractor,
|
|
7369
|
-
JapanZdnetComExtractor: JapanZdnetComExtractor,
|
|
7370
|
-
WwwRbbtodayComExtractor: WwwRbbtodayComExtractor,
|
|
7371
|
-
WwwLemondeFrExtractor: WwwLemondeFrExtractor,
|
|
7372
|
-
WwwPhoronixComExtractor: WwwPhoronixComExtractor,
|
|
7373
|
-
PitchforkComExtractor: PitchforkComExtractor,
|
|
7374
|
-
BiorxivOrgExtractor: BiorxivOrgExtractor,
|
|
7375
|
-
EpaperZeitDeExtractor: EpaperZeitDeExtractor,
|
|
7376
|
-
WwwLadbibleComExtractor: WwwLadbibleComExtractor,
|
|
7377
|
-
TimesofindiaIndiatimesComExtractor: TimesofindiaIndiatimesComExtractor,
|
|
7378
|
-
MaTtiasBeExtractor: MaTtiasBeExtractor,
|
|
7379
|
-
PastebinComExtractor: PastebinComExtractor,
|
|
7380
|
-
WwwAbendblattDeExtractor: WwwAbendblattDeExtractor,
|
|
7381
7564
|
WwwGrueneDeExtractor: WwwGrueneDeExtractor,
|
|
7382
|
-
ArstechnicaComExtractor: ArstechnicaComExtractor,
|
|
7383
|
-
WwwNdtvComExtractor: WwwNdtvComExtractor,
|
|
7384
|
-
SpektrumExtractor: SpektrumExtractor,
|
|
7385
|
-
WwwInvestmentexecutiveComExtractor: WwwInvestmentexecutiveComExtractor,
|
|
7386
|
-
WwwCbcCaExtractor: WwwCbcCaExtractor,
|
|
7387
|
-
WwwVersantsComExtractor: WwwVersantsComExtractor,
|
|
7388
|
-
Www1pezeshkComExtractor: Www1pezeshkComExtractor,
|
|
7389
|
-
WwwAndroidauthorityComExtractor: WwwAndroidauthorityComExtractor,
|
|
7390
|
-
TechcrunchComExtractor: TechcrunchComExtractor,
|
|
7391
7565
|
WwwHardwarezoneComSgExtractor: WwwHardwarezoneComSgExtractor,
|
|
7392
|
-
WwwSpiegelDeExtractor: WwwSpiegelDeExtractor,
|
|
7393
|
-
MobilesyrupComExtractor: MobilesyrupComExtractor,
|
|
7394
|
-
WwwChannelnewsasiaComExtractor: WwwChannelnewsasiaComExtractor,
|
|
7395
|
-
WccftechComExtractor: WccftechComExtractor,
|
|
7396
7566
|
WwwHeiseDeExtractor: WwwHeiseDeExtractor,
|
|
7397
|
-
|
|
7398
|
-
|
|
7399
|
-
|
|
7400
|
-
|
|
7401
|
-
|
|
7402
|
-
|
|
7403
|
-
|
|
7404
|
-
|
|
7405
|
-
|
|
7406
|
-
|
|
7407
|
-
|
|
7408
|
-
LodzSePlExtractor: LodzSePlExtractor,
|
|
7409
|
-
WroclawSePlExtractor: WroclawSePlExtractor,
|
|
7410
|
-
LublinSePlExtractor: LublinSePlExtractor,
|
|
7411
|
-
BialystokSePlExtractor: BialystokSePlExtractor,
|
|
7567
|
+
WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor,
|
|
7568
|
+
WwwIlfattoquotidianoItExtractor: WwwIlfattoquotidianoItExtractor,
|
|
7569
|
+
WwwInfoqComExtractor: WwwInfoqComExtractor,
|
|
7570
|
+
WwwInquisitrComExtractor: WwwInquisitrComExtractor,
|
|
7571
|
+
WwwInvestmentexecutiveComExtractor: WwwInvestmentexecutiveComExtractor,
|
|
7572
|
+
WwwIpaGoJpExtractor: WwwIpaGoJpExtractor,
|
|
7573
|
+
WwwItmediaCoJpExtractor: WwwItmediaCoJpExtractor,
|
|
7574
|
+
WwwJalopnikComExtractor: WwwJalopnikComExtractor,
|
|
7575
|
+
WwwJnsaOrgExtractor: WwwJnsaOrgExtractor,
|
|
7576
|
+
WwwLadbibleComExtractor: WwwLadbibleComExtractor,
|
|
7577
|
+
WwwLatimesComExtractor: WwwLatimesComExtractor,
|
|
7412
7578
|
WwwLebensmittelwarnungDeExtractor: WwwLebensmittelwarnungDeExtractor,
|
|
7579
|
+
WwwLemondeFrExtractor: WwwLemondeFrExtractor,
|
|
7580
|
+
WwwLifehackerJpExtractor: WwwLifehackerJpExtractor,
|
|
7581
|
+
WwwLinkedinComExtractor: WwwLinkedinComExtractor,
|
|
7582
|
+
WwwMacrumorsComExtractor: WwwMacrumorsComExtractor,
|
|
7583
|
+
WwwMentalflossComExtractor: WwwMentalflossComExtractor,
|
|
7584
|
+
WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor,
|
|
7585
|
+
WwwMoongiftJpExtractor: WwwMoongiftJpExtractor,
|
|
7586
|
+
WwwMotorsportComExtractor: WwwMotorsportComExtractor,
|
|
7587
|
+
WwwMsnbcComExtractor: WwwMsnbcComExtractor,
|
|
7588
|
+
WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor,
|
|
7589
|
+
WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
|
|
7590
|
+
WwwNdtvComExtractor: WwwNdtvComExtractor,
|
|
7591
|
+
WwwNotebookcheckNetExtractor: WwwNotebookcheckNetExtractor,
|
|
7592
|
+
WwwNprOrgExtractor: WwwNprOrgExtractor,
|
|
7593
|
+
WwwNtvDeExtractor: WwwNtvDeExtractor,
|
|
7594
|
+
WwwNumeramaComExtractor: WwwNumeramaComExtractor,
|
|
7595
|
+
WwwNydailynewsComExtractor: WwwNydailynewsComExtractor,
|
|
7596
|
+
WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
|
|
7597
|
+
WwwOreillyCoJpExtractor: WwwOreillyCoJpExtractor,
|
|
7598
|
+
WwwOssnewsJpExtractor: WwwOssnewsJpExtractor,
|
|
7599
|
+
WwwPhoronixComExtractor: WwwPhoronixComExtractor,
|
|
7600
|
+
WwwPolygonComExtractor: WwwPolygonComExtractor,
|
|
7601
|
+
WwwPopsugarComExtractor: WwwPopsugarComExtractor,
|
|
7602
|
+
WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor,
|
|
7603
|
+
WwwPublickey1JpExtractor: WwwPublickey1JpExtractor,
|
|
7413
7604
|
WwwQbitaiComExtractor: WwwQbitaiComExtractor,
|
|
7414
|
-
|
|
7415
|
-
|
|
7605
|
+
WwwQdailyComExtractor: WwwQdailyComExtractor,
|
|
7606
|
+
WwwRawstoryComExtractor: WwwRawstoryComExtractor,
|
|
7607
|
+
WwwRbbtodayComExtractor: WwwRbbtodayComExtractor,
|
|
7608
|
+
WwwRecodeNetExtractor: WwwRecodeNetExtractor,
|
|
7609
|
+
WwwRedditComExtractor: WwwRedditComExtractor,
|
|
7610
|
+
WwwRefinery29ComExtractor: WwwRefinery29ComExtractor,
|
|
7611
|
+
WwwReutersComExtractor: WwwReutersComExtractor,
|
|
7612
|
+
WwwRollingstoneComExtractor: WwwRollingstoneComExtractor,
|
|
7613
|
+
WwwSanwaCoJpExtractor: WwwSanwaCoJpExtractor,
|
|
7614
|
+
WwwSbnationComExtractor: WwwSbnationComExtractor,
|
|
7615
|
+
WwwSePlExtractor: WwwSePlExtractor,
|
|
7616
|
+
WwwSiComExtractor: WwwSiComExtractor,
|
|
7617
|
+
WwwSlateComExtractor: WwwSlateComExtractor,
|
|
7618
|
+
WwwSpiegelDeExtractor: WwwSpiegelDeExtractor,
|
|
7416
7619
|
WwwTagesschauDeExtractor: WwwTagesschauDeExtractor,
|
|
7417
|
-
Nineto5googleComExtractor: Nineto5googleComExtractor,
|
|
7418
|
-
WwwEngadgetComExtractor: WwwEngadgetComExtractor,
|
|
7419
|
-
TarnkappeInfoExtractor: TarnkappeInfoExtractor,
|
|
7420
|
-
WwwVortezNetExtractor: WwwVortezNetExtractor,
|
|
7421
|
-
WwwPolygonComExtractor: WwwPolygonComExtractor,
|
|
7422
|
-
WwwThevergeComExtractor: WwwThevergeComExtractor,
|
|
7423
7620
|
WwwTechpowerupComExtractor: WwwTechpowerupComExtractor,
|
|
7424
|
-
WwwFlatpanelshdComExtractor: WwwFlatpanelshdComExtractor,
|
|
7425
|
-
Nineto5macComExtractor: Nineto5macComExtractor,
|
|
7426
|
-
WwwNotebookcheckNetExtractor: WwwNotebookcheckNetExtractor,
|
|
7427
|
-
WwwFuturaSciencesComExtractor: WwwFuturaSciencesComExtractor,
|
|
7428
|
-
SgNewsYahooComExtractor: SgNewsYahooComExtractor,
|
|
7429
|
-
GonintendoComExtractor: GonintendoComExtractor,
|
|
7430
|
-
OrfAtExtractor: OrfAtExtractor,
|
|
7431
|
-
WwwVideogameschronicleComExtractor: WwwVideogameschronicleComExtractor,
|
|
7432
|
-
WwwNumeramaComExtractor: WwwNumeramaComExtractor,
|
|
7433
|
-
TerminaltroveComExtractor: TerminaltroveComExtractor,
|
|
7434
|
-
NewsPtsOrgTwExtractor: NewsPtsOrgTwExtractor,
|
|
7435
7621
|
WwwThedriveComExtractor: WwwThedriveComExtractor,
|
|
7436
|
-
|
|
7437
|
-
|
|
7438
|
-
|
|
7622
|
+
WwwTheguardianComExtractor: WwwTheguardianComExtractor,
|
|
7623
|
+
WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
|
|
7624
|
+
WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor,
|
|
7625
|
+
WwwThevergeComExtractor: WwwThevergeComExtractor,
|
|
7626
|
+
WwwTmzComExtractor: WwwTmzComExtractor,
|
|
7627
|
+
WwwTodayComExtractor: WwwTodayComExtractor,
|
|
7439
7628
|
WwwTransfermarktDeExtractor: WwwTransfermarktDeExtractor,
|
|
7440
|
-
|
|
7441
|
-
|
|
7442
|
-
|
|
7629
|
+
WwwTweaktownComExtractor: WwwTweaktownComExtractor,
|
|
7630
|
+
WwwUsmagazineComExtractor: WwwUsmagazineComExtractor,
|
|
7631
|
+
WwwVersantsComExtractor: WwwVersantsComExtractor,
|
|
7632
|
+
WwwVideogameschronicleComExtractor: WwwVideogameschronicleComExtractor,
|
|
7633
|
+
WwwVortezNetExtractor: WwwVortezNetExtractor,
|
|
7634
|
+
WwwVoxComExtractor: WwwVoxComExtractor,
|
|
7635
|
+
WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor,
|
|
7636
|
+
WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
|
|
7637
|
+
WwwYomiuriCoJpExtractor: WwwYomiuriCoJpExtractor,
|
|
7638
|
+
WwwYoutubeComExtractor: WwwYoutubeComExtractor,
|
|
7639
|
+
YahooExtractor: YahooExtractor,
|
|
7640
|
+
twofortysevensportsComExtractor: twofortysevensportsComExtractor
|
|
7443
7641
|
});
|
|
7444
7642
|
|
|
7445
|
-
function ownKeys$5(e, r) { var t = _Object$
|
|
7446
|
-
function _objectSpread$5(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$5(Object(t),
|
|
7447
|
-
var Extractors = _Object$
|
|
7643
|
+
function ownKeys$5(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
7644
|
+
function _objectSpread$5(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$5(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$5(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
7645
|
+
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
|
|
7448
7646
|
var extractor = CustomExtractors[key];
|
|
7449
7647
|
return _objectSpread$5(_objectSpread$5({}, acc), mergeSupportedDomains(extractor));
|
|
7450
7648
|
}, {});
|
|
@@ -7514,9 +7712,9 @@ function cleanDek(dek, _ref) {
|
|
|
7514
7712
|
return normalizeSpaces(dekText.trim());
|
|
7515
7713
|
}
|
|
7516
7714
|
|
|
7517
|
-
|
|
7518
|
-
|
|
7519
|
-
|
|
7715
|
+
dayjs.extend(utc);
|
|
7716
|
+
dayjs.extend(timezonePlugin);
|
|
7717
|
+
dayjs.extend(customParseFormat);
|
|
7520
7718
|
var TIMEZONE_ABBR_RE = /\b(EST|EDT|CST|CDT|MST|MDT|PST|PDT|ET|CT|MT|PT|GMT|UTC)\b/gi;
|
|
7521
7719
|
// Check if string contains timezone offset info (e.g., +0000, GMT+0000, Z)
|
|
7522
7720
|
var HAS_TIMEZONE_RE = /([+-]\d{2}:?\d{2}|Z|\bGMT[+-]\d+|\bUTC\b)/i;
|
|
@@ -7536,53 +7734,53 @@ function cleanDateString(dateString) {
|
|
|
7536
7734
|
}
|
|
7537
7735
|
function createDate(dateString, timezone, format) {
|
|
7538
7736
|
if (TIME_WITH_OFFSET_RE.test(dateString)) {
|
|
7539
|
-
return
|
|
7737
|
+
return dayjs(new Date(dateString));
|
|
7540
7738
|
}
|
|
7541
7739
|
if (TIME_AGO_STRING.test(dateString)) {
|
|
7542
7740
|
var fragments = TIME_AGO_STRING.exec(dateString);
|
|
7543
|
-
return
|
|
7741
|
+
return dayjs().subtract(fragments[1], fragments[2]);
|
|
7544
7742
|
}
|
|
7545
7743
|
if (TIME_NOW_STRING.test(dateString)) {
|
|
7546
|
-
return
|
|
7744
|
+
return dayjs();
|
|
7547
7745
|
}
|
|
7548
7746
|
var stringHasTimezone = hasTimezoneInfo(dateString);
|
|
7549
7747
|
var cleanedDateString = stripTimezoneAbbr(dateString);
|
|
7550
7748
|
if (stringHasTimezone) {
|
|
7551
7749
|
var _nativeDate = new Date(dateString);
|
|
7552
|
-
if (!_Number$
|
|
7553
|
-
return
|
|
7750
|
+
if (!_Number$isNaN(_nativeDate.getTime())) {
|
|
7751
|
+
return dayjs(_nativeDate);
|
|
7554
7752
|
}
|
|
7555
7753
|
}
|
|
7556
7754
|
if (timezone && !stringHasTimezone) {
|
|
7557
7755
|
if (format) {
|
|
7558
7756
|
var cleanedFormat = stripTimezoneFromFormat(format);
|
|
7559
7757
|
try {
|
|
7560
|
-
var _parsed =
|
|
7758
|
+
var _parsed = dayjs.tz(cleanedDateString, cleanedFormat, timezone);
|
|
7561
7759
|
if (_parsed.isValid()) return _parsed;
|
|
7562
7760
|
} catch (_unused) {
|
|
7563
7761
|
// Fall through
|
|
7564
7762
|
}
|
|
7565
7763
|
}
|
|
7566
7764
|
var _nativeDate2 = new Date(cleanedDateString);
|
|
7567
|
-
if (!_Number$
|
|
7568
|
-
return
|
|
7765
|
+
if (!_Number$isNaN(_nativeDate2.getTime())) {
|
|
7766
|
+
return dayjs(_nativeDate2).tz(timezone, true);
|
|
7569
7767
|
}
|
|
7570
|
-
var parsed =
|
|
7768
|
+
var parsed = dayjs(cleanedDateString);
|
|
7571
7769
|
if (parsed.isValid()) {
|
|
7572
7770
|
return parsed.tz(timezone, true);
|
|
7573
7771
|
}
|
|
7574
|
-
return
|
|
7772
|
+
return dayjs(null);
|
|
7575
7773
|
}
|
|
7576
7774
|
if (format) {
|
|
7577
7775
|
var _cleanedFormat = stripTimezoneFromFormat(format);
|
|
7578
|
-
var _parsed2 =
|
|
7776
|
+
var _parsed2 = dayjs(cleanedDateString, _cleanedFormat);
|
|
7579
7777
|
if (_parsed2.isValid()) return _parsed2;
|
|
7580
7778
|
}
|
|
7581
7779
|
var nativeDate = new Date(cleanedDateString);
|
|
7582
|
-
if (!_Number$
|
|
7583
|
-
return
|
|
7780
|
+
if (!_Number$isNaN(nativeDate.getTime())) {
|
|
7781
|
+
return dayjs(nativeDate);
|
|
7584
7782
|
}
|
|
7585
|
-
return
|
|
7783
|
+
return dayjs(cleanedDateString);
|
|
7586
7784
|
}
|
|
7587
7785
|
|
|
7588
7786
|
// Take a date published string, and hopefully return a date out of
|
|
@@ -7593,10 +7791,10 @@ function cleanDatePublished(dateString) {
|
|
|
7593
7791
|
format = _ref.format;
|
|
7594
7792
|
// If string is in milliseconds or seconds, convert to int and return
|
|
7595
7793
|
if (MS_DATE_STRING.test(dateString)) {
|
|
7596
|
-
return new Date(
|
|
7794
|
+
return new Date(_parseInt(dateString, 10)).toISOString();
|
|
7597
7795
|
}
|
|
7598
7796
|
if (SEC_DATE_STRING.test(dateString)) {
|
|
7599
|
-
return new Date(
|
|
7797
|
+
return new Date(_parseInt(dateString, 10) * 1000).toISOString();
|
|
7600
7798
|
}
|
|
7601
7799
|
var date = createDate(dateString, timezone, format);
|
|
7602
7800
|
if (!date.isValid()) {
|
|
@@ -7671,13 +7869,13 @@ function extractBreadcrumbTitle(splitTitle, text) {
|
|
|
7671
7869
|
acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
|
|
7672
7870
|
return acc;
|
|
7673
7871
|
}, {});
|
|
7674
|
-
var _Reflect$ownKeys$redu = _Reflect$
|
|
7872
|
+
var _Reflect$ownKeys$redu = _Reflect$ownKeys(termCounts).reduce(function (acc, key) {
|
|
7675
7873
|
if (acc[1] < termCounts[key]) {
|
|
7676
7874
|
return [key, termCounts[key]];
|
|
7677
7875
|
}
|
|
7678
7876
|
return acc;
|
|
7679
7877
|
}, [0, 0]),
|
|
7680
|
-
_Reflect$ownKeys$redu2 =
|
|
7878
|
+
_Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
|
|
7681
7879
|
maxTerm = _Reflect$ownKeys$redu2[0],
|
|
7682
7880
|
termCount = _Reflect$ownKeys$redu2[1];
|
|
7683
7881
|
|
|
@@ -7706,16 +7904,16 @@ function cleanDomainFromTitle(splitTitle, url) {
|
|
|
7706
7904
|
//
|
|
7707
7905
|
// Strip out the big TLDs - it just makes the matching a bit more
|
|
7708
7906
|
// accurate. Not the end of the world if it doesn't strip right.
|
|
7709
|
-
var _URL$parse =
|
|
7907
|
+
var _URL$parse = URL$1.parse(url),
|
|
7710
7908
|
host = _URL$parse.host;
|
|
7711
7909
|
var nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
|
|
7712
7910
|
var startSlug = splitTitle[0].toLowerCase().replace(' ', '');
|
|
7713
|
-
var startSlugRatio =
|
|
7911
|
+
var startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);
|
|
7714
7912
|
if (startSlugRatio > 0.4 && startSlug.length > 5) {
|
|
7715
7913
|
return splitTitle.slice(2).join('');
|
|
7716
7914
|
}
|
|
7717
7915
|
var endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
|
|
7718
|
-
var endSlugRatio =
|
|
7916
|
+
var endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);
|
|
7719
7917
|
if (endSlugRatio > 0.4 && endSlug.length >= 5) {
|
|
7720
7918
|
return splitTitle.slice(0, -2).join('');
|
|
7721
7919
|
}
|
|
@@ -7815,7 +8013,7 @@ function scoreContent($) {
|
|
|
7815
8013
|
// First, look for special hNews based selectors and give them a big
|
|
7816
8014
|
// boost, if they exist
|
|
7817
8015
|
HNEWS_CONTENT_SELECTORS.forEach(function (_ref) {
|
|
7818
|
-
var _ref2 =
|
|
8016
|
+
var _ref2 = _slicedToArray(_ref, 2),
|
|
7819
8017
|
parentSelector = _ref2[0],
|
|
7820
8018
|
childSelector = _ref2[1];
|
|
7821
8019
|
$("".concat(parentSelector, " ").concat(childSelector)).each(function (index, node) {
|
|
@@ -7947,11 +8145,11 @@ function extractBestNode($, opts) {
|
|
|
7947
8145
|
return $topCandidate;
|
|
7948
8146
|
}
|
|
7949
8147
|
|
|
7950
|
-
function _createForOfIteratorHelper$2(r, e) { var t = "undefined" != typeof
|
|
7951
|
-
function _unsupportedIterableToArray$2(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$2(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
8148
|
+
function _createForOfIteratorHelper$2(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray$2(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
8149
|
+
function _unsupportedIterableToArray$2(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$2(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$from(r) : "Arguments" === t || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t) ? _arrayLikeToArray$2(r, a) : void 0; } }
|
|
7952
8150
|
function _arrayLikeToArray$2(r, a) { (null == a || a > r.length) && (a = r.length); for (var e = 0, n = Array(a); e < a; e++) n[e] = r[e]; return n; }
|
|
7953
|
-
function ownKeys$4(e, r) { var t = _Object$
|
|
7954
|
-
function _objectSpread$4(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$4(Object(t),
|
|
8151
|
+
function ownKeys$4(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
8152
|
+
function _objectSpread$4(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$4(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$4(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
7955
8153
|
var GenericContentExtractor = {
|
|
7956
8154
|
defaultOpts: {
|
|
7957
8155
|
stripUnlikelyCandidates: true,
|
|
@@ -7994,8 +8192,7 @@ var GenericContentExtractor = {
|
|
|
7994
8192
|
|
|
7995
8193
|
// We didn't succeed on first pass, one by one disable our
|
|
7996
8194
|
// extraction opts and try again.
|
|
7997
|
-
|
|
7998
|
-
var _iterator = _createForOfIteratorHelper$2(_Reflect$ownKeys__default["default"](opts).filter(function (k) {
|
|
8195
|
+
var _iterator = _createForOfIteratorHelper$2(_Reflect$ownKeys(opts).filter(function (k) {
|
|
7999
8196
|
return opts[k] === true;
|
|
8000
8197
|
})),
|
|
8001
8198
|
_step;
|
|
@@ -8123,8 +8320,8 @@ var AUTHOR_SELECTORS = ['.entry .entry-author', '.author.vcard .fn', '.author .v
|
|
|
8123
8320
|
var bylineRe = /^[\n\s]*By/i;
|
|
8124
8321
|
var BYLINE_SELECTORS_RE = [['#byline', bylineRe], ['.byline', bylineRe]];
|
|
8125
8322
|
|
|
8126
|
-
function _createForOfIteratorHelper$1(r, e) { var t = "undefined" != typeof
|
|
8127
|
-
function _unsupportedIterableToArray$1(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$1(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
8323
|
+
function _createForOfIteratorHelper$1(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray$1(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
8324
|
+
function _unsupportedIterableToArray$1(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$1(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$from(r) : "Arguments" === t || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t) ? _arrayLikeToArray$1(r, a) : void 0; } }
|
|
8128
8325
|
function _arrayLikeToArray$1(r, a) { (null == a || a > r.length) && (a = r.length); for (var e = 0, n = Array(a); e < a; e++) n[e] = r[e]; return n; }
|
|
8129
8326
|
var GenericAuthorExtractor = {
|
|
8130
8327
|
extract: function extract(_ref) {
|
|
@@ -8147,12 +8344,11 @@ var GenericAuthorExtractor = {
|
|
|
8147
8344
|
|
|
8148
8345
|
// Last, use our looser regular-expression based selectors for
|
|
8149
8346
|
// potential authors.
|
|
8150
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
8151
8347
|
var _iterator = _createForOfIteratorHelper$1(BYLINE_SELECTORS_RE),
|
|
8152
8348
|
_step;
|
|
8153
8349
|
try {
|
|
8154
8350
|
for (_iterator.s(); !(_step = _iterator.n()).done;) {
|
|
8155
|
-
var _step$value =
|
|
8351
|
+
var _step$value = _slicedToArray(_step.value, 2),
|
|
8156
8352
|
selector = _step$value[0],
|
|
8157
8353
|
regex = _step$value[1];
|
|
8158
8354
|
var node = $(selector);
|
|
@@ -8309,8 +8505,8 @@ function scoreBySibling($img) {
|
|
|
8309
8505
|
}
|
|
8310
8506
|
function scoreByDimensions($img) {
|
|
8311
8507
|
var score = 0;
|
|
8312
|
-
var width =
|
|
8313
|
-
var height =
|
|
8508
|
+
var width = _parseFloat($img.attr('width'));
|
|
8509
|
+
var height = _parseFloat($img.attr('height'));
|
|
8314
8510
|
var src = $img.attr('src');
|
|
8315
8511
|
|
|
8316
8512
|
// Penalty for skinny images
|
|
@@ -8337,8 +8533,8 @@ function scoreByPosition($imgs, index) {
|
|
|
8337
8533
|
return $imgs.length / 2 - index;
|
|
8338
8534
|
}
|
|
8339
8535
|
|
|
8340
|
-
function _createForOfIteratorHelper(r, e) { var t = "undefined" != typeof
|
|
8341
|
-
function _unsupportedIterableToArray(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
8536
|
+
function _createForOfIteratorHelper(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
8537
|
+
function _unsupportedIterableToArray(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$from(r) : "Arguments" === t || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t) ? _arrayLikeToArray(r, a) : void 0; } }
|
|
8342
8538
|
function _arrayLikeToArray(r, a) { (null == a || a > r.length) && (a = r.length); for (var e = 0, n = Array(a); e < a; e++) n[e] = r[e]; return n; }
|
|
8343
8539
|
|
|
8344
8540
|
// Given a resource, try to find the lead image URL from within
|
|
@@ -8388,10 +8584,10 @@ var GenericLeadImageUrlExtractor = {
|
|
|
8388
8584
|
score += scoreByPosition(imgs, index);
|
|
8389
8585
|
imgScores[src] = score;
|
|
8390
8586
|
});
|
|
8391
|
-
var _Reflect$ownKeys$redu = _Reflect$
|
|
8587
|
+
var _Reflect$ownKeys$redu = _Reflect$ownKeys(imgScores).reduce(function (acc, key) {
|
|
8392
8588
|
return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc;
|
|
8393
8589
|
}, [null, 0]),
|
|
8394
|
-
_Reflect$ownKeys$redu2 =
|
|
8590
|
+
_Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
|
|
8395
8591
|
topUrl = _Reflect$ownKeys$redu2[0],
|
|
8396
8592
|
topScore = _Reflect$ownKeys$redu2[1];
|
|
8397
8593
|
if (topScore > 0) {
|
|
@@ -8401,7 +8597,6 @@ var GenericLeadImageUrlExtractor = {
|
|
|
8401
8597
|
|
|
8402
8598
|
// If nothing else worked, check to see if there are any really
|
|
8403
8599
|
// probable nodes in the doc, like <link rel="image_src" />.
|
|
8404
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
8405
8600
|
var _iterator = _createForOfIteratorHelper(LEAD_IMAGE_URL_SELECTORS),
|
|
8406
8601
|
_step;
|
|
8407
8602
|
try {
|
|
@@ -8440,7 +8635,7 @@ function scoreSimilarity(score, articleUrl, href) {
|
|
|
8440
8635
|
// sliding scale, subtract points from this link based on
|
|
8441
8636
|
// similarity.
|
|
8442
8637
|
if (score > 0) {
|
|
8443
|
-
var similarity = new
|
|
8638
|
+
var similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio();
|
|
8444
8639
|
// Subtract .1 from diff_percent when calculating modifier,
|
|
8445
8640
|
// which means that if it's less than 10% different, we give a
|
|
8446
8641
|
// bonus instead. Ex:
|
|
@@ -8461,7 +8656,7 @@ function scoreLinkText(linkText, pageNum) {
|
|
|
8461
8656
|
// get scored, and sorted properly by score.
|
|
8462
8657
|
var score = 0;
|
|
8463
8658
|
if (IS_DIGIT_RE.test(linkText.trim())) {
|
|
8464
|
-
var linkTextAsNum =
|
|
8659
|
+
var linkTextAsNum = _parseInt(linkText, 10);
|
|
8465
8660
|
// If it's the first page, we already got it on the first call.
|
|
8466
8661
|
// Give it a negative score. Otherwise, up to page 10, give a
|
|
8467
8662
|
// small bonus.
|
|
@@ -8530,7 +8725,7 @@ function scoreByParents($link) {
|
|
|
8530
8725
|
var positiveMatch = false;
|
|
8531
8726
|
var negativeMatch = false;
|
|
8532
8727
|
var score = 0;
|
|
8533
|
-
_Array$
|
|
8728
|
+
_Array$from(range(0, 4)).forEach(function () {
|
|
8534
8729
|
if ($parent.length === 0) {
|
|
8535
8730
|
return;
|
|
8536
8731
|
}
|
|
@@ -8580,7 +8775,7 @@ function shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrl
|
|
|
8580
8775
|
return false;
|
|
8581
8776
|
}
|
|
8582
8777
|
var hostname = parsedUrl.hostname;
|
|
8583
|
-
var _URL$parse =
|
|
8778
|
+
var _URL$parse = URL$1.parse(href),
|
|
8584
8779
|
linkHost = _URL$parse.hostname;
|
|
8585
8780
|
|
|
8586
8781
|
// Domain mismatch.
|
|
@@ -8655,7 +8850,7 @@ function scoreLinks(_ref) {
|
|
|
8655
8850
|
$ = _ref.$,
|
|
8656
8851
|
_ref$previousUrls = _ref.previousUrls,
|
|
8657
8852
|
previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
|
|
8658
|
-
parsedUrl = parsedUrl ||
|
|
8853
|
+
parsedUrl = parsedUrl || URL$1.parse(articleUrl);
|
|
8659
8854
|
var baseRegex = makeBaseRegex(baseUrl);
|
|
8660
8855
|
var isWp = isWordpress($);
|
|
8661
8856
|
|
|
@@ -8706,7 +8901,7 @@ function scoreLinks(_ref) {
|
|
|
8706
8901
|
possiblePage.score = score;
|
|
8707
8902
|
return possiblePages;
|
|
8708
8903
|
}, {});
|
|
8709
|
-
return _Reflect$
|
|
8904
|
+
return _Reflect$ownKeys(scoredPages).length === 0 ? null : scoredPages;
|
|
8710
8905
|
}
|
|
8711
8906
|
|
|
8712
8907
|
// Looks for and returns next page url
|
|
@@ -8718,7 +8913,7 @@ var GenericNextPageUrlExtractor = {
|
|
|
8718
8913
|
parsedUrl = _ref.parsedUrl,
|
|
8719
8914
|
_ref$previousUrls = _ref.previousUrls,
|
|
8720
8915
|
previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
|
|
8721
|
-
parsedUrl = parsedUrl ||
|
|
8916
|
+
parsedUrl = parsedUrl || URL$1.parse(url);
|
|
8722
8917
|
var articleUrl = removeAnchor(url);
|
|
8723
8918
|
var baseUrl = articleBaseUrl(url, parsedUrl);
|
|
8724
8919
|
var links = $('a[href]').toArray();
|
|
@@ -8736,7 +8931,7 @@ var GenericNextPageUrlExtractor = {
|
|
|
8736
8931
|
|
|
8737
8932
|
// now that we've scored all possible pages,
|
|
8738
8933
|
// find the biggest one.
|
|
8739
|
-
var topPage = _Reflect$
|
|
8934
|
+
var topPage = _Reflect$ownKeys(scoredLinks).reduce(function (acc, link) {
|
|
8740
8935
|
var scoredLink = scoredLinks[link];
|
|
8741
8936
|
return scoredLink.score > acc.score ? scoredLink : acc;
|
|
8742
8937
|
}, {
|
|
@@ -8755,7 +8950,7 @@ var GenericNextPageUrlExtractor = {
|
|
|
8755
8950
|
var CANONICAL_META_SELECTORS = ['og:url'];
|
|
8756
8951
|
|
|
8757
8952
|
function parseDomain(url) {
|
|
8758
|
-
var parsedUrl =
|
|
8953
|
+
var parsedUrl = URL$1.parse(url);
|
|
8759
8954
|
var hostname = parsedUrl.hostname;
|
|
8760
8955
|
return hostname;
|
|
8761
8956
|
}
|
|
@@ -8785,12 +8980,63 @@ var GenericUrlExtractor = {
|
|
|
8785
8980
|
}
|
|
8786
8981
|
};
|
|
8787
8982
|
|
|
8983
|
+
var defaults = {
|
|
8984
|
+
ellipse: '…',
|
|
8985
|
+
chars: [' ', '-'],
|
|
8986
|
+
max: 140,
|
|
8987
|
+
truncate: true
|
|
8988
|
+
};
|
|
8989
|
+
function ellipsizeMiddle(str, max, ellipse, chars) {
|
|
8990
|
+
if (str.length <= max) return str;
|
|
8991
|
+
if (max < 2) return str.slice(0, max - ellipse.length) + ellipse;
|
|
8992
|
+
var maxLen = max - ellipse.length;
|
|
8993
|
+
var middle = Math.floor(maxLen / 2);
|
|
8994
|
+
var left = middle;
|
|
8995
|
+
var right = str.length - middle;
|
|
8996
|
+
for (var i = 0; i < middle; i += 1) {
|
|
8997
|
+
var charLeft = str.charAt(i);
|
|
8998
|
+
var posRight = str.length - i;
|
|
8999
|
+
var charRight = str.charAt(posRight);
|
|
9000
|
+
if (chars.indexOf(charLeft) !== -1) left = i;
|
|
9001
|
+
if (chars.indexOf(charRight) !== -1) right = posRight;
|
|
9002
|
+
}
|
|
9003
|
+
return str.slice(0, left) + ellipse + str.slice(right);
|
|
9004
|
+
}
|
|
9005
|
+
function ellipsize(str, max, ellipse, chars, truncate) {
|
|
9006
|
+
if (str.length <= max) return str;
|
|
9007
|
+
var maxLen = max - ellipse.length;
|
|
9008
|
+
var end = maxLen;
|
|
9009
|
+
var breakpointFound = false;
|
|
9010
|
+
for (var i = 0; i <= maxLen; i += 1) {
|
|
9011
|
+
var _char = str.charAt(i);
|
|
9012
|
+
if (chars.indexOf(_char) !== -1) {
|
|
9013
|
+
end = i;
|
|
9014
|
+
breakpointFound = true;
|
|
9015
|
+
}
|
|
9016
|
+
}
|
|
9017
|
+
if (!truncate && !breakpointFound) return '';
|
|
9018
|
+
return str.slice(0, end) + ellipse;
|
|
9019
|
+
}
|
|
9020
|
+
var ellipsize$1 = (function (str, max, opts) {
|
|
9021
|
+
if (typeof str !== 'string' || str.length === 0) return '';
|
|
9022
|
+
if (max === 0) return '';
|
|
9023
|
+
opts = opts || {};
|
|
9024
|
+
_Object$keys(defaults).forEach(function (key) {
|
|
9025
|
+
if (opts[key] === null || typeof opts[key] === 'undefined') {
|
|
9026
|
+
opts[key] = defaults[key];
|
|
9027
|
+
}
|
|
9028
|
+
});
|
|
9029
|
+
opts.max = max || opts.max;
|
|
9030
|
+
if (opts.truncate === 'middle') return ellipsizeMiddle(str, opts.max, opts.ellipse, opts.chars);
|
|
9031
|
+
return ellipsize(str, opts.max, opts.ellipse, opts.chars, opts.truncate);
|
|
9032
|
+
});
|
|
9033
|
+
|
|
8788
9034
|
var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];
|
|
8789
9035
|
|
|
8790
9036
|
function clean(content, $) {
|
|
8791
9037
|
var maxLength = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;
|
|
8792
9038
|
content = content.replace(/[\s\n]+/g, ' ').trim();
|
|
8793
|
-
return
|
|
9039
|
+
return ellipsize$1(content, maxLength, {
|
|
8794
9040
|
ellipse: '…'
|
|
8795
9041
|
});
|
|
8796
9042
|
}
|
|
@@ -8831,8 +9077,8 @@ var GenericWordCountExtractor = {
|
|
|
8831
9077
|
}
|
|
8832
9078
|
};
|
|
8833
9079
|
|
|
8834
|
-
function ownKeys$3(e, r) { var t = _Object$
|
|
8835
|
-
function _objectSpread$3(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$3(Object(t),
|
|
9080
|
+
function ownKeys$3(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
9081
|
+
function _objectSpread$3(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$3(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$3(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
8836
9082
|
var GenericExtractor = {
|
|
8837
9083
|
// This extractor is the default for all domains
|
|
8838
9084
|
domain: '*',
|
|
@@ -8848,7 +9094,7 @@ var GenericExtractor = {
|
|
|
8848
9094
|
word_count: GenericWordCountExtractor.extract,
|
|
8849
9095
|
direction: function direction(_ref) {
|
|
8850
9096
|
var title = _ref.title;
|
|
8851
|
-
return
|
|
9097
|
+
return stringDirection.getDirection(title);
|
|
8852
9098
|
},
|
|
8853
9099
|
extract: function extract(options) {
|
|
8854
9100
|
var html = options.html,
|
|
@@ -8904,22 +9150,22 @@ var Detectors = {
|
|
|
8904
9150
|
'meta[name="generator"][value="blogger"]': BloggerExtractor
|
|
8905
9151
|
};
|
|
8906
9152
|
function detectByHtml($) {
|
|
8907
|
-
var selector = _Reflect$
|
|
9153
|
+
var selector = _Reflect$ownKeys(Detectors).find(function (s) {
|
|
8908
9154
|
return $(s).length > 0;
|
|
8909
9155
|
});
|
|
8910
9156
|
return Detectors[selector];
|
|
8911
9157
|
}
|
|
8912
9158
|
|
|
8913
9159
|
function getExtractor(url, parsedUrl, $) {
|
|
8914
|
-
parsedUrl = parsedUrl ||
|
|
9160
|
+
parsedUrl = parsedUrl || URL$1.parse(url);
|
|
8915
9161
|
var _parsedUrl = parsedUrl,
|
|
8916
9162
|
hostname = _parsedUrl.hostname;
|
|
8917
9163
|
var baseDomain = hostname.split('.').slice(-2).join('.');
|
|
8918
9164
|
return apiExtractors[hostname] || apiExtractors[baseDomain] || Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
|
|
8919
9165
|
}
|
|
8920
9166
|
|
|
8921
|
-
function ownKeys$2(e, r) { var t = _Object$
|
|
8922
|
-
function _objectSpread$2(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$2(Object(t),
|
|
9167
|
+
// Builds the key list for `e`: the result of _Object$keys(e) with the symbols
// from _Object$getOwnPropertySymbols(e) appended when that helper exists.
// A truthy `r` restricts the appended symbols to those whose descriptor is
// enumerable.
function ownKeys$2(e, r) {
  var names = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return names;
  }
  var syms = _Object$getOwnPropertySymbols(e);
  if (r) {
    syms = syms.filter(function (candidate) {
      return _Object$getOwnPropertyDescriptor(e, candidate).enumerable;
    });
  }
  names.push.apply(names, syms);
  return names;
}
|
|
9168
|
+
// Merges every argument after the first into `e` and returns `e`.
// Null/undefined arguments count as empty objects. Arguments at odd positions
// are copied as plain values through _defineProperty (enumerable keys only);
// arguments at even positions are copied with their property descriptors,
// in bulk when _Object$getOwnPropertyDescriptors exists, otherwise key by key.
function _objectSpread$2(e) {
  var args = arguments;
  var current;
  var position;

  function assignValue(name) {
    _defineProperty(e, name, current[name]);
  }

  function assignDescriptor(name) {
    _Object$defineProperty(e, name, _Object$getOwnPropertyDescriptor(current, name));
  }

  for (position = 1; position < args.length; position++) {
    current = args[position] != null ? args[position] : {};
    if (position % 2) {
      ownKeys$2(Object(current), true).forEach(assignValue);
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(current));
    } else {
      ownKeys$2(Object(current)).forEach(assignDescriptor);
    }
  }
  return e;
}
|
|
8923
9169
|
|
|
8924
9170
|
// Remove elements by an array of selectors
|
|
8925
9171
|
function cleanBySelectors($content, $, _ref) {
|
|
@@ -8933,7 +9179,7 @@ function cleanBySelectors($content, $, _ref) {
|
|
|
8933
9179
|
function transformElements($content, $, _ref2) {
|
|
8934
9180
|
var transforms = _ref2.transforms;
|
|
8935
9181
|
if (!transforms) return $content;
|
|
8936
|
-
_Reflect$
|
|
9182
|
+
_Reflect$ownKeys(transforms).forEach(function (key) {
|
|
8937
9183
|
var $matches = $(key, $content);
|
|
8938
9184
|
var value = transforms[key];
|
|
8939
9185
|
|
|
@@ -8957,13 +9203,13 @@ function transformElements($content, $, _ref2) {
|
|
|
8957
9203
|
}
|
|
8958
9204
|
function findMatchingSelector($, selectors, extractHtml, allowMultiple) {
|
|
8959
9205
|
return selectors.find(function (selector) {
|
|
8960
|
-
if (_Array$
|
|
9206
|
+
if (_Array$isArray(selector)) {
|
|
8961
9207
|
if (extractHtml) {
|
|
8962
9208
|
return selector.reduce(function (acc, s) {
|
|
8963
9209
|
return acc && $(s).length > 0;
|
|
8964
9210
|
}, true);
|
|
8965
9211
|
}
|
|
8966
|
-
var _selector =
|
|
9212
|
+
var _selector = _slicedToArray(selector, 2),
|
|
8967
9213
|
s = _selector[0],
|
|
8968
9214
|
attr = _selector[1];
|
|
8969
9215
|
return (allowMultiple || !allowMultiple && $(s).length === 1) && $(s).attr(attr) && $(s).attr(attr).trim() !== '';
|
|
@@ -9005,7 +9251,7 @@ function select(opts) {
|
|
|
9005
9251
|
// multi-match selection, which allows the parser to choose several
|
|
9006
9252
|
// selectors to include in the result. Note that all selectors in the
|
|
9007
9253
|
// array must match in order for this selector to trigger
|
|
9008
|
-
if (_Array$
|
|
9254
|
+
if (_Array$isArray(matchingSelector)) {
|
|
9009
9255
|
$content = $(matchingSelector.join(','));
|
|
9010
9256
|
var $wrapper = $('<div></div>');
|
|
9011
9257
|
$content.each(function (_, element) {
|
|
@@ -9039,8 +9285,8 @@ function select(opts) {
|
|
|
9039
9285
|
var result;
|
|
9040
9286
|
// if selector is an array (e.g., ['img', 'src']),
|
|
9041
9287
|
// extract the attr
|
|
9042
|
-
if (_Array$
|
|
9043
|
-
var _matchingSelector =
|
|
9288
|
+
if (_Array$isArray(matchingSelector)) {
|
|
9289
|
+
var _matchingSelector = _slicedToArray(matchingSelector, 3),
|
|
9044
9290
|
selector = _matchingSelector[0],
|
|
9045
9291
|
attr = _matchingSelector[1],
|
|
9046
9292
|
transform = _matchingSelector[2];
|
|
@@ -9057,7 +9303,7 @@ function select(opts) {
|
|
|
9057
9303
|
return $(el).text().trim();
|
|
9058
9304
|
});
|
|
9059
9305
|
}
|
|
9060
|
-
result = _Array$
|
|
9306
|
+
result = _Array$isArray(result.toArray()) && allowMultiple ? result.toArray() : result[0];
|
|
9061
9307
|
// Allow custom extractor to skip default cleaner
|
|
9062
9308
|
// for this type; defaults to true
|
|
9063
9309
|
if (defaultCleaner && Cleaners[type]) {
|
|
@@ -9067,7 +9313,7 @@ function select(opts) {
|
|
|
9067
9313
|
}
|
|
9068
9314
|
function selectExtendedTypes(extend, opts) {
|
|
9069
9315
|
var results = {};
|
|
9070
|
-
_Reflect$
|
|
9316
|
+
_Reflect$ownKeys(extend).forEach(function (t) {
|
|
9071
9317
|
if (!results[t]) {
|
|
9072
9318
|
results[t] = select(_objectSpread$2(_objectSpread$2({}, opts), {}, {
|
|
9073
9319
|
type: t,
|
|
@@ -9185,15 +9431,15 @@ var RootExtractor = {
|
|
|
9185
9431
|
}
|
|
9186
9432
|
};
|
|
9187
9433
|
|
|
9188
|
-
function ownKeys$1(e, r) { var t = _Object$
|
|
9189
|
-
function _objectSpread$1(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$1(Object(t),
|
|
9434
|
+
// Lists the keys of `e` as given by _Object$keys, then appends its symbols
// from _Object$getOwnPropertySymbols when that helper is defined. With a
// truthy `r`, only symbols whose descriptor is enumerable are appended.
function ownKeys$1(e, r) {
  var collected = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return collected;
  }
  var symbolKeys = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbolKeys = symbolKeys.filter(function (symbolKey) {
      return _Object$getOwnPropertyDescriptor(e, symbolKey).enumerable;
    });
  }
  collected.push.apply(collected, symbolKeys);
  return collected;
}
|
|
9435
|
+
// Spreads each argument after the first into `e`, then returns `e`.
// A null/undefined argument is replaced by an empty object. Odd argument
// positions copy enumerable keys as values via _defineProperty; even positions
// copy property descriptors, using _Object$getOwnPropertyDescriptors when it
// is available and a per-key _Object$defineProperty fallback when it is not.
function _objectSpread$1(e) {
  var inputs = arguments;
  var input;
  var i;

  function setValue(prop) {
    _defineProperty(e, prop, input[prop]);
  }

  function setDescriptor(prop) {
    _Object$defineProperty(e, prop, _Object$getOwnPropertyDescriptor(input, prop));
  }

  for (i = 1; i < inputs.length; i++) {
    input = inputs[i] != null ? inputs[i] : {};
    if (i % 2) {
      ownKeys$1(Object(input), true).forEach(setValue);
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(input));
    } else {
      ownKeys$1(Object(input)).forEach(setDescriptor);
    }
  }
  return e;
}
|
|
9190
9436
|
function collectAllPages(_x) {
|
|
9191
9437
|
return _collectAllPages.apply(this, arguments);
|
|
9192
9438
|
}
|
|
9193
9439
|
function _collectAllPages() {
|
|
9194
|
-
_collectAllPages =
|
|
9440
|
+
_collectAllPages = _asyncToGenerator(/*#__PURE__*/_regeneratorRuntime.mark(function _callee(_ref) {
|
|
9195
9441
|
var next_page_url, html, $, metaCache, result, Extractor, title, url, pages, previousUrls, extractorOpts, nextPageResult, word_count;
|
|
9196
|
-
return
|
|
9442
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
9197
9443
|
while (1) switch (_context.prev = _context.next) {
|
|
9198
9444
|
case 0:
|
|
9199
9445
|
next_page_url = _ref.next_page_url, html = _ref.html, $ = _ref.$, metaCache = _ref.metaCache, result = _ref.result, Extractor = _ref.Extractor, title = _ref.title, url = _ref.url;
|
|
@@ -9207,7 +9453,6 @@ function _collectAllPages() {
|
|
|
9207
9453
|
break;
|
|
9208
9454
|
}
|
|
9209
9455
|
pages += 1;
|
|
9210
|
-
// eslint-disable-next-line no-await-in-loop
|
|
9211
9456
|
_context.next = 2;
|
|
9212
9457
|
return Resource.create(next_page_url);
|
|
9213
9458
|
case 2:
|
|
@@ -9248,17 +9493,17 @@ function _collectAllPages() {
|
|
|
9248
9493
|
}
|
|
9249
9494
|
|
|
9250
9495
|
var _excluded = ["html"];
|
|
9251
|
-
function ownKeys(e, r) { var t = _Object$
|
|
9252
|
-
function _objectSpread(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys(Object(t),
|
|
9496
|
+
// Gathers the keys of `e` from _Object$keys and, when
// _Object$getOwnPropertySymbols is defined, appends the symbols it reports.
// If `r` is truthy, symbols with a non-enumerable descriptor are left out.
function ownKeys(e, r) {
  var result = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return result;
  }
  var extra = _Object$getOwnPropertySymbols(e);
  if (r) {
    extra = extra.filter(function (item) {
      return _Object$getOwnPropertyDescriptor(e, item).enumerable;
    });
  }
  result.push.apply(result, extra);
  return result;
}
|
|
9497
|
+
// Copies the properties of every argument after the first onto `e` and
// returns `e`. Null/undefined arguments behave like empty objects.
// Odd-positioned arguments are copied by value through _defineProperty
// (enumerable keys only); even-positioned arguments are copied by descriptor,
// all at once if _Object$getOwnPropertyDescriptors exists, else one key at a time.
function _objectSpread(e) {
  var given = arguments;
  var from;
  var slot;

  function putValue(k) {
    _defineProperty(e, k, from[k]);
  }

  function putDescriptor(k) {
    _Object$defineProperty(e, k, _Object$getOwnPropertyDescriptor(from, k));
  }

  for (slot = 1; slot < given.length; slot++) {
    from = given[slot] != null ? given[slot] : {};
    if (slot % 2) {
      ownKeys(Object(from), true).forEach(putValue);
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(from));
    } else {
      ownKeys(Object(from)).forEach(putDescriptor);
    }
  }
  return e;
}
|
|
9253
9498
|
var Parser = {
|
|
9254
9499
|
parse: function parse(url) {
|
|
9255
9500
|
var _arguments = arguments;
|
|
9256
|
-
return
|
|
9501
|
+
return _asyncToGenerator(/*#__PURE__*/_regeneratorRuntime.mark(function _callee() {
|
|
9257
9502
|
var _ref, html, opts, _opts$fetchAllPages, fetchAllPages, _opts$fallback, fallback, _opts$contentType, contentType, _opts$headers, headers, extend, customExtractor, parsedUrl, $, Extractor, metaCache, extendedTypes, result, _result, title, next_page_url, turndownService;
|
|
9258
|
-
return
|
|
9503
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
9259
9504
|
while (1) switch (_context.prev = _context.next) {
|
|
9260
9505
|
case 0:
|
|
9261
|
-
_ref = _arguments.length > 1 && _arguments[1] !== undefined ? _arguments[1] : {}, html = _ref.html, opts =
|
|
9506
|
+
_ref = _arguments.length > 1 && _arguments[1] !== undefined ? _arguments[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, _excluded);
|
|
9262
9507
|
_opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend, customExtractor = opts.customExtractor; // if no url was passed and this is the browser version,
|
|
9263
9508
|
// set url to window.location.href and load the html
|
|
9264
9509
|
// from the current page
|
|
@@ -9266,7 +9511,7 @@ var Parser = {
|
|
|
9266
9511
|
url = window.location.href; // eslint-disable-line no-undef
|
|
9267
9512
|
html = html || document.documentElement.outerHTML; // eslint-disable-line no-undef
|
|
9268
9513
|
}
|
|
9269
|
-
parsedUrl =
|
|
9514
|
+
parsedUrl = URL$1.parse(url);
|
|
9270
9515
|
if (validateUrl(parsedUrl)) {
|
|
9271
9516
|
_context.next = 1;
|
|
9272
9517
|
break;
|
|
@@ -9346,7 +9591,7 @@ var Parser = {
|
|
|
9346
9591
|
});
|
|
9347
9592
|
case 6:
|
|
9348
9593
|
if (contentType === 'markdown') {
|
|
9349
|
-
turndownService = new
|
|
9594
|
+
turndownService = new TurndownService();
|
|
9350
9595
|
result.content = turndownService.turndown(result.content);
|
|
9351
9596
|
} else if (contentType === 'text') {
|
|
9352
9597
|
result.content = $.text($(result.content));
|