@jocmp/mercury-parser 3.0.8 → 3.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/generate-custom-parser.js +730 -618
- package/dist/generate-custom-parser.js.map +1 -1
- package/dist/mercury.js +540 -370
- package/dist/mercury.js.map +1 -1
- package/dist/mercury.web.js +2 -2
- package/dist/mercury.web.js.map +1 -1
- package/package.json +7 -12
package/dist/mercury.js
CHANGED
|
@@ -36,10 +36,7 @@ var customParseFormat = require('dayjs/plugin/customParseFormat');
|
|
|
36
36
|
var wuzzy = require('wuzzy');
|
|
37
37
|
var difflib = require('difflib');
|
|
38
38
|
|
|
39
|
-
function
|
|
40
|
-
|
|
41
|
-
function _interopNamespace(e) {
|
|
42
|
-
if (e && e.__esModule) return e;
|
|
39
|
+
function _interopNamespaceDefault(e) {
|
|
43
40
|
var n = Object.create(null);
|
|
44
41
|
if (e) {
|
|
45
42
|
Object.keys(e).forEach(function (k) {
|
|
@@ -52,45 +49,11 @@ function _interopNamespace(e) {
|
|
|
52
49
|
}
|
|
53
50
|
});
|
|
54
51
|
}
|
|
55
|
-
n
|
|
52
|
+
n.default = e;
|
|
56
53
|
return Object.freeze(n);
|
|
57
54
|
}
|
|
58
55
|
|
|
59
|
-
var
|
|
60
|
-
var _Object$getOwnPropertySymbols__default = /*#__PURE__*/_interopDefaultLegacy(_Object$getOwnPropertySymbols);
|
|
61
|
-
var _Object$getOwnPropertyDescriptor__default = /*#__PURE__*/_interopDefaultLegacy(_Object$getOwnPropertyDescriptor);
|
|
62
|
-
var _Object$getOwnPropertyDescriptors__default = /*#__PURE__*/_interopDefaultLegacy(_Object$getOwnPropertyDescriptors);
|
|
63
|
-
var _Object$defineProperties__default = /*#__PURE__*/_interopDefaultLegacy(_Object$defineProperties);
|
|
64
|
-
var _Object$defineProperty__default = /*#__PURE__*/_interopDefaultLegacy(_Object$defineProperty);
|
|
65
|
-
var _defineProperty__default = /*#__PURE__*/_interopDefaultLegacy(_defineProperty);
|
|
66
|
-
var _objectWithoutProperties__default = /*#__PURE__*/_interopDefaultLegacy(_objectWithoutProperties);
|
|
67
|
-
var _asyncToGenerator__default = /*#__PURE__*/_interopDefaultLegacy(_asyncToGenerator);
|
|
68
|
-
var _regeneratorRuntime__default = /*#__PURE__*/_interopDefaultLegacy(_regeneratorRuntime);
|
|
69
|
-
var URL__default = /*#__PURE__*/_interopDefaultLegacy(URL$1);
|
|
70
|
-
var TurndownService__default = /*#__PURE__*/_interopDefaultLegacy(TurndownService);
|
|
71
|
-
var cheerio__namespace = /*#__PURE__*/_interopNamespace(cheerio);
|
|
72
|
-
var iconv__default = /*#__PURE__*/_interopDefaultLegacy(iconv);
|
|
73
|
-
var _parseInt__default = /*#__PURE__*/_interopDefaultLegacy(_parseInt);
|
|
74
|
-
var _slicedToArray__default = /*#__PURE__*/_interopDefaultLegacy(_slicedToArray);
|
|
75
|
-
var _Promise__default = /*#__PURE__*/_interopDefaultLegacy(_Promise);
|
|
76
|
-
var request__default = /*#__PURE__*/_interopDefaultLegacy(request);
|
|
77
|
-
var _Reflect$ownKeys__default = /*#__PURE__*/_interopDefaultLegacy(_Reflect$ownKeys);
|
|
78
|
-
var _toConsumableArray__default = /*#__PURE__*/_interopDefaultLegacy(_toConsumableArray);
|
|
79
|
-
var _parseFloat__default = /*#__PURE__*/_interopDefaultLegacy(_parseFloat);
|
|
80
|
-
var _Set__default = /*#__PURE__*/_interopDefaultLegacy(_Set);
|
|
81
|
-
var _Array$from__default = /*#__PURE__*/_interopDefaultLegacy(_Array$from);
|
|
82
|
-
var _Symbol__default = /*#__PURE__*/_interopDefaultLegacy(_Symbol);
|
|
83
|
-
var _Symbol$iterator__default = /*#__PURE__*/_interopDefaultLegacy(_Symbol$iterator);
|
|
84
|
-
var _Array$isArray__default = /*#__PURE__*/_interopDefaultLegacy(_Array$isArray);
|
|
85
|
-
var _Object$assign__default = /*#__PURE__*/_interopDefaultLegacy(_Object$assign);
|
|
86
|
-
var stringDirection__default = /*#__PURE__*/_interopDefaultLegacy(stringDirection);
|
|
87
|
-
var _Number$isNaN__default = /*#__PURE__*/_interopDefaultLegacy(_Number$isNaN);
|
|
88
|
-
var dayjs__default = /*#__PURE__*/_interopDefaultLegacy(dayjs);
|
|
89
|
-
var utc__default = /*#__PURE__*/_interopDefaultLegacy(utc);
|
|
90
|
-
var timezonePlugin__default = /*#__PURE__*/_interopDefaultLegacy(timezonePlugin);
|
|
91
|
-
var customParseFormat__default = /*#__PURE__*/_interopDefaultLegacy(customParseFormat);
|
|
92
|
-
var wuzzy__default = /*#__PURE__*/_interopDefaultLegacy(wuzzy);
|
|
93
|
-
var difflib__default = /*#__PURE__*/_interopDefaultLegacy(difflib);
|
|
56
|
+
var cheerio__namespace = /*#__PURE__*/_interopNamespaceDefault(cheerio);
|
|
94
57
|
|
|
95
58
|
var NORMALIZE_RE = /\s{2,}(?![^<>]*<\/(pre|code|textarea)>)/g;
|
|
96
59
|
function normalizeSpaces(text) {
|
|
@@ -138,7 +101,7 @@ var DEFAULT_ENCODING = 'utf-8';
|
|
|
138
101
|
function pageNumFromUrl(url) {
|
|
139
102
|
var matches = url.match(PAGE_IN_HREF_RE);
|
|
140
103
|
if (!matches) return null;
|
|
141
|
-
var pageNum =
|
|
104
|
+
var pageNum = _parseInt(matches[6], 10);
|
|
142
105
|
|
|
143
106
|
// Return pageNum < 100, otherwise
|
|
144
107
|
// return null
|
|
@@ -176,7 +139,7 @@ function isGoodSegment(segment, index, firstSegmentHasLetters) {
|
|
|
176
139
|
// pagination data exists in it. Useful for comparing to other links
|
|
177
140
|
// that might have pagination data within them.
|
|
178
141
|
function articleBaseUrl(url, parsed) {
|
|
179
|
-
var parsedUrl = parsed ||
|
|
142
|
+
var parsedUrl = parsed || URL$1.parse(url);
|
|
180
143
|
var protocol = parsedUrl.protocol,
|
|
181
144
|
host = parsedUrl.host,
|
|
182
145
|
path = parsedUrl.path;
|
|
@@ -187,7 +150,7 @@ function articleBaseUrl(url, parsed) {
|
|
|
187
150
|
// Split off and save anything that looks like a file type.
|
|
188
151
|
if (segment.includes('.')) {
|
|
189
152
|
var _segment$split = segment.split('.'),
|
|
190
|
-
_segment$split2 =
|
|
153
|
+
_segment$split2 = _slicedToArray(_segment$split, 2),
|
|
191
154
|
possibleSegment = _segment$split2[0],
|
|
192
155
|
fileExt = _segment$split2[1];
|
|
193
156
|
if (IS_ALPHA_RE.test(fileExt)) {
|
|
@@ -237,10 +200,10 @@ function getEncoding(str) {
|
|
|
237
200
|
var encoding = DEFAULT_ENCODING;
|
|
238
201
|
var matches = ENCODING_RE.exec(str);
|
|
239
202
|
if (matches !== null) {
|
|
240
|
-
var _matches =
|
|
203
|
+
var _matches = _slicedToArray(matches, 2);
|
|
241
204
|
str = _matches[1];
|
|
242
205
|
}
|
|
243
|
-
if (
|
|
206
|
+
if (iconv.encodingExists(str)) {
|
|
244
207
|
encoding = str;
|
|
245
208
|
}
|
|
246
209
|
return encoding;
|
|
@@ -266,11 +229,11 @@ var BAD_CONTENT_TYPES_RE = new RegExp("^(".concat(BAD_CONTENT_TYPES.join('|'), "
|
|
|
266
229
|
// for us to attempt parsing. Defaults to 5 MB.
|
|
267
230
|
var MAX_CONTENT_LENGTH = 5242880;
|
|
268
231
|
|
|
269
|
-
function ownKeys$h(e, r) { var t = _Object$
|
|
270
|
-
function _objectSpread$h(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$h(Object(t),
|
|
232
|
+
function ownKeys$h(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
233
|
+
function _objectSpread$h(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$h(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$h(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
271
234
|
function get(options) {
|
|
272
|
-
return new
|
|
273
|
-
|
|
235
|
+
return new _Promise(function (resolve, reject) {
|
|
236
|
+
request(options, function (err, response, body) {
|
|
274
237
|
if (err) {
|
|
275
238
|
reject(err);
|
|
276
239
|
} else {
|
|
@@ -329,7 +292,7 @@ function fetchResource(_x, _x2) {
|
|
|
329
292
|
return _fetchResource.apply(this, arguments);
|
|
330
293
|
}
|
|
331
294
|
function _fetchResource() {
|
|
332
|
-
_fetchResource =
|
|
295
|
+
_fetchResource = _asyncToGenerator(/*#__PURE__*/_regeneratorRuntime.mark(function _callee(url, parsedUrl) {
|
|
333
296
|
var headers,
|
|
334
297
|
options,
|
|
335
298
|
_yield$get,
|
|
@@ -337,11 +300,11 @@ function _fetchResource() {
|
|
|
337
300
|
body,
|
|
338
301
|
_args = arguments,
|
|
339
302
|
_t;
|
|
340
|
-
return
|
|
303
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
341
304
|
while (1) switch (_context.prev = _context.next) {
|
|
342
305
|
case 0:
|
|
343
306
|
headers = _args.length > 2 && _args[2] !== undefined ? _args[2] : {};
|
|
344
|
-
parsedUrl = parsedUrl ||
|
|
307
|
+
parsedUrl = parsedUrl || URL$1.parse(encodeURI(url));
|
|
345
308
|
options = _objectSpread$h({
|
|
346
309
|
url: parsedUrl.href,
|
|
347
310
|
headers: _objectSpread$h(_objectSpread$h({}, REQUEST_HEADERS), headers),
|
|
@@ -603,7 +566,7 @@ function getAttrs(node) {
|
|
|
603
566
|
var attribs = node.attribs,
|
|
604
567
|
attributes = node.attributes;
|
|
605
568
|
if (!attribs && attributes) {
|
|
606
|
-
var attrs = _Reflect$
|
|
569
|
+
var attrs = _Reflect$ownKeys(attributes).reduce(function (acc, index) {
|
|
607
570
|
var attr = attributes[index];
|
|
608
571
|
|
|
609
572
|
// In browser, Reflect.ownKeys includes non-numeric keys like 'length', 'item', etc.
|
|
@@ -623,7 +586,7 @@ function convertNodeTo($node, $) {
|
|
|
623
586
|
return $;
|
|
624
587
|
}
|
|
625
588
|
var attrs = getAttrs(node) || {};
|
|
626
|
-
var attribString = _Reflect$
|
|
589
|
+
var attribString = _Reflect$ownKeys(attrs).map(function (key) {
|
|
627
590
|
return "".concat(key, "=").concat(attrs[key]);
|
|
628
591
|
}).join(' ');
|
|
629
592
|
var html;
|
|
@@ -682,8 +645,8 @@ function convertToParagraphs($) {
|
|
|
682
645
|
}
|
|
683
646
|
|
|
684
647
|
function cleanForHeight($img, $) {
|
|
685
|
-
var height =
|
|
686
|
-
var width =
|
|
648
|
+
var height = _parseInt($img.attr('height'), 10);
|
|
649
|
+
var width = _parseInt($img.attr('width'), 10) || 20;
|
|
687
650
|
|
|
688
651
|
// Remove images that explicitly have very small heights or
|
|
689
652
|
// widths, because they are most likely shims or icons,
|
|
@@ -722,10 +685,10 @@ function markToKeep(article, $, url) {
|
|
|
722
685
|
tags = KEEP_SELECTORS;
|
|
723
686
|
}
|
|
724
687
|
if (url) {
|
|
725
|
-
var _URL$parse =
|
|
688
|
+
var _URL$parse = URL$1.parse(url),
|
|
726
689
|
protocol = _URL$parse.protocol,
|
|
727
690
|
hostname = _URL$parse.hostname;
|
|
728
|
-
tags = [].concat(
|
|
691
|
+
tags = [].concat(_toConsumableArray(tags), ["iframe[src^=\"".concat(protocol, "//").concat(hostname, "\"]")]);
|
|
729
692
|
}
|
|
730
693
|
$(tags.join(','), article).addClass(KEEP_CLASS);
|
|
731
694
|
return $;
|
|
@@ -767,21 +730,21 @@ function setAttrs(node, attrs) {
|
|
|
767
730
|
while (node.attributes.length > 0) {
|
|
768
731
|
node.removeAttribute(node.attributes[0].name);
|
|
769
732
|
}
|
|
770
|
-
_Reflect$
|
|
733
|
+
_Reflect$ownKeys(attrs).forEach(function (key) {
|
|
771
734
|
node.setAttribute(key, attrs[key]);
|
|
772
735
|
});
|
|
773
736
|
}
|
|
774
737
|
return node;
|
|
775
738
|
}
|
|
776
739
|
|
|
777
|
-
function ownKeys$g(e, r) { var t = _Object$
|
|
778
|
-
function _objectSpread$g(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$g(Object(t),
|
|
740
|
+
function ownKeys$g(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
741
|
+
function _objectSpread$g(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$g(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$g(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
779
742
|
function removeAllButWhitelist($article, $) {
|
|
780
743
|
$article.find('*').each(function (index, node) {
|
|
781
744
|
var attrs = getAttrs(node);
|
|
782
|
-
setAttrs(node, _Reflect$
|
|
745
|
+
setAttrs(node, _Reflect$ownKeys(attrs).reduce(function (acc, attr) {
|
|
783
746
|
if (WHITELIST_ATTRS_RE.test(attr)) {
|
|
784
|
-
return _objectSpread$g(_objectSpread$g({}, acc), {},
|
|
747
|
+
return _objectSpread$g(_objectSpread$g({}, acc), {}, _defineProperty({}, attr, attrs[attr]));
|
|
785
748
|
}
|
|
786
749
|
return acc;
|
|
787
750
|
}, {}));
|
|
@@ -812,7 +775,7 @@ function removeEmpty($article, $) {
|
|
|
812
775
|
// the node's score attribute
|
|
813
776
|
// returns null if no score set
|
|
814
777
|
function getScore($node) {
|
|
815
|
-
return
|
|
778
|
+
return _parseFloat($node.attr('score')) || null;
|
|
816
779
|
}
|
|
817
780
|
|
|
818
781
|
function setScore($node, $, score) {
|
|
@@ -878,6 +841,7 @@ function scoreParagraph(node) {
|
|
|
878
841
|
|
|
879
842
|
// // CONTENT FETCHING CONSTANTS ////
|
|
880
843
|
|
|
844
|
+
|
|
881
845
|
// A list of tags that should be ignored when trying to find the top candidate
|
|
882
846
|
// for a document.
|
|
883
847
|
var NON_TOP_CANDIDATE_TAGS = ['br', 'b', 'i', 'label', 'hr', 'area', 'base', 'basefont', 'input', 'img', 'link', 'meta'];
|
|
@@ -996,7 +960,7 @@ function getWeight(node) {
|
|
|
996
960
|
return score;
|
|
997
961
|
}
|
|
998
962
|
|
|
999
|
-
// eslint-disable-next-line import/no-cycle
|
|
963
|
+
// eslint-disable-next-line import-x/no-cycle
|
|
1000
964
|
function addScore($node, $, amount) {
|
|
1001
965
|
try {
|
|
1002
966
|
var score = getOrInitScore($node, $) + amount;
|
|
@@ -1007,7 +971,7 @@ function addScore($node, $, amount) {
|
|
|
1007
971
|
return $node;
|
|
1008
972
|
}
|
|
1009
973
|
|
|
1010
|
-
// eslint-disable-next-line import/no-cycle
|
|
974
|
+
// eslint-disable-next-line import-x/no-cycle
|
|
1011
975
|
|
|
1012
976
|
// Adds 1/4 of a child's score to its parent
|
|
1013
977
|
function addToParent(node, $, score) {
|
|
@@ -1204,7 +1168,7 @@ function absolutize($, rootUrl, attr) {
|
|
|
1204
1168
|
var attrs = getAttrs(node);
|
|
1205
1169
|
var url = attrs[attr];
|
|
1206
1170
|
if (!url) return;
|
|
1207
|
-
var absoluteUrl =
|
|
1171
|
+
var absoluteUrl = URL$1.resolve(baseUrl || rootUrl, url);
|
|
1208
1172
|
setAttr(node, attr, absoluteUrl);
|
|
1209
1173
|
});
|
|
1210
1174
|
}
|
|
@@ -1222,10 +1186,10 @@ function absolutizeSet($, rootUrl, $content) {
|
|
|
1222
1186
|
// a candidate URL cannot start or end with a comma
|
|
1223
1187
|
// descriptors are separated from the URLs by unescaped whitespace
|
|
1224
1188
|
var parts = candidate.trim().replace(/,$/, '').split(/\s+/);
|
|
1225
|
-
parts[0] =
|
|
1189
|
+
parts[0] = URL$1.resolve(rootUrl, parts[0]);
|
|
1226
1190
|
return parts.join(' ');
|
|
1227
1191
|
});
|
|
1228
|
-
var absoluteUrlSet =
|
|
1192
|
+
var absoluteUrlSet = _toConsumableArray(new _Set(absoluteCandidates)).join(', ');
|
|
1229
1193
|
setAttr(node, 'srcset', absoluteUrlSet);
|
|
1230
1194
|
}
|
|
1231
1195
|
});
|
|
@@ -1246,8 +1210,8 @@ function stripTags(text, $) {
|
|
|
1246
1210
|
return cleanText === '' ? text : cleanText;
|
|
1247
1211
|
}
|
|
1248
1212
|
|
|
1249
|
-
function _createForOfIteratorHelper$4(r, e) { var t = "undefined" != typeof
|
|
1250
|
-
function _unsupportedIterableToArray$4(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$4(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
1213
|
+
function _createForOfIteratorHelper$4(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray$4(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
1214
|
+
function _unsupportedIterableToArray$4(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$4(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$from(r) : "Arguments" === t || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t) ? _arrayLikeToArray$4(r, a) : void 0; } }
|
|
1251
1215
|
function _arrayLikeToArray$4(r, a) { (null == a || a > r.length) && (a = r.length); for (var e = 0, n = Array(a); e < a; e++) n[e] = r[e]; return n; }
|
|
1252
1216
|
|
|
1253
1217
|
// Given a node type to search for, and a list of meta tag names to
|
|
@@ -1257,8 +1221,6 @@ function extractFromMeta($, metaNames, cachedNames) {
|
|
|
1257
1221
|
var foundNames = metaNames.filter(function (name) {
|
|
1258
1222
|
return cachedNames.indexOf(name) !== -1;
|
|
1259
1223
|
});
|
|
1260
|
-
|
|
1261
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
1262
1224
|
var _iterator = _createForOfIteratorHelper$4(foundNames),
|
|
1263
1225
|
_step;
|
|
1264
1226
|
try {
|
|
@@ -1288,7 +1250,7 @@ function extractFromMeta($, metaNames, cachedNames) {
|
|
|
1288
1250
|
if (cleanTags) {
|
|
1289
1251
|
metaValue = stripTags(values[0], $);
|
|
1290
1252
|
} else {
|
|
1291
|
-
var _values =
|
|
1253
|
+
var _values = _slicedToArray(values, 1);
|
|
1292
1254
|
metaValue = _values[0];
|
|
1293
1255
|
}
|
|
1294
1256
|
return {
|
|
@@ -1323,8 +1285,8 @@ function withinComment($node) {
|
|
|
1323
1285
|
return commentParent !== undefined;
|
|
1324
1286
|
}
|
|
1325
1287
|
|
|
1326
|
-
function _createForOfIteratorHelper$3(r, e) { var t = "undefined" != typeof
|
|
1327
|
-
function _unsupportedIterableToArray$3(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$3(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
1288
|
+
function _createForOfIteratorHelper$3(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray$3(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
1289
|
+
function _unsupportedIterableToArray$3(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$3(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$from(r) : "Arguments" === t || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(t) ? _arrayLikeToArray$3(r, a) : void 0; } }
|
|
1328
1290
|
function _arrayLikeToArray$3(r, a) { (null == a || a > r.length) && (a = r.length); for (var e = 0, n = Array(a); e < a; e++) n[e] = r[e]; return n; }
|
|
1329
1291
|
function isGoodNode($node, maxChildren) {
|
|
1330
1292
|
// If it has a number of children, it's more likely a container
|
|
@@ -1345,7 +1307,6 @@ function isGoodNode($node, maxChildren) {
|
|
|
1345
1307
|
function extractFromSelectors($, selectors) {
|
|
1346
1308
|
var maxChildren = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
|
|
1347
1309
|
var textOnly = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
|
|
1348
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
1349
1310
|
var _iterator = _createForOfIteratorHelper$3(selectors),
|
|
1350
1311
|
_step;
|
|
1351
1312
|
try {
|
|
@@ -1414,7 +1375,7 @@ function convertLazyLoadedImages($) {
|
|
|
1414
1375
|
};
|
|
1415
1376
|
$('img').each(function (_, img) {
|
|
1416
1377
|
var attrs = getAttrs(img);
|
|
1417
|
-
_Reflect$
|
|
1378
|
+
_Reflect$ownKeys(attrs).forEach(function (attr) {
|
|
1418
1379
|
var value = attrs[attr];
|
|
1419
1380
|
if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
|
|
1420
1381
|
$(img).attr('srcset', value);
|
|
@@ -1456,9 +1417,9 @@ var Resource = {
|
|
|
1456
1417
|
create: function create(url, preparedResponse, parsedUrl) {
|
|
1457
1418
|
var _arguments = arguments,
|
|
1458
1419
|
_this = this;
|
|
1459
|
-
return
|
|
1420
|
+
return _asyncToGenerator(/*#__PURE__*/_regeneratorRuntime.mark(function _callee() {
|
|
1460
1421
|
var headers, result, validResponse;
|
|
1461
|
-
return
|
|
1422
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
1462
1423
|
while (1) switch (_context.prev = _context.next) {
|
|
1463
1424
|
case 0:
|
|
1464
1425
|
headers = _arguments.length > 3 && _arguments[3] !== undefined ? _arguments[3] : {};
|
|
@@ -1538,7 +1499,7 @@ var Resource = {
|
|
|
1538
1499
|
}
|
|
1539
1500
|
var encoding = getEncoding(contentType);
|
|
1540
1501
|
// UTF-8 is handled natively by Node.js, skip iconv-lite
|
|
1541
|
-
var decodedContent = encoding === 'utf-8' ? content.toString('utf-8') :
|
|
1502
|
+
var decodedContent = encoding === 'utf-8' ? content.toString('utf-8') : iconv.decode(content, encoding);
|
|
1542
1503
|
var $ = cheerio__namespace.load(decodedContent);
|
|
1543
1504
|
// after first cheerio.load, check to see if encoding matches
|
|
1544
1505
|
var contentTypeSelector = isBrowser ? 'meta[http-equiv=content-type]' : 'meta[http-equiv=content-type i]';
|
|
@@ -1547,7 +1508,7 @@ var Resource = {
|
|
|
1547
1508
|
|
|
1548
1509
|
// if encodings in the header/body dont match, use the one in the body
|
|
1549
1510
|
if (metaContentType && properEncoding !== encoding) {
|
|
1550
|
-
decodedContent = properEncoding === 'utf-8' ? content.toString('utf-8') :
|
|
1511
|
+
decodedContent = properEncoding === 'utf-8' ? content.toString('utf-8') : iconv.decode(content, properEncoding);
|
|
1551
1512
|
$ = cheerio__namespace.load(decodedContent);
|
|
1552
1513
|
}
|
|
1553
1514
|
return $;
|
|
@@ -1557,8 +1518,8 @@ var Resource = {
|
|
|
1557
1518
|
function range() {
|
|
1558
1519
|
var start = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
|
|
1559
1520
|
var end = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1;
|
|
1560
|
-
return /*#__PURE__*/
|
|
1561
|
-
return
|
|
1521
|
+
return /*#__PURE__*/_regeneratorRuntime.mark(function _callee() {
|
|
1522
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
1562
1523
|
while (1) switch (_context.prev = _context.next) {
|
|
1563
1524
|
case 0:
|
|
1564
1525
|
if (!(start <= end)) {
|
|
@@ -1592,7 +1553,7 @@ var merge = function merge(extractor, domains) {
|
|
|
1592
1553
|
}, {});
|
|
1593
1554
|
};
|
|
1594
1555
|
function mergeSupportedDomains(extractor) {
|
|
1595
|
-
return extractor.supportedDomains ? merge(extractor, [extractor.domain].concat(
|
|
1556
|
+
return extractor.supportedDomains ? merge(extractor, [extractor.domain].concat(_toConsumableArray(extractor.supportedDomains))) : merge(extractor, [extractor.domain]);
|
|
1596
1557
|
}
|
|
1597
1558
|
|
|
1598
1559
|
var apiExtractors = {};
|
|
@@ -1603,7 +1564,7 @@ function addExtractor(extractor) {
|
|
|
1603
1564
|
message: 'Unable to add custom extractor. Invalid parameters.'
|
|
1604
1565
|
};
|
|
1605
1566
|
}
|
|
1606
|
-
_Object$
|
|
1567
|
+
_Object$assign(apiExtractors, mergeSupportedDomains(extractor));
|
|
1607
1568
|
return apiExtractors;
|
|
1608
1569
|
}
|
|
1609
1570
|
|
|
@@ -2313,7 +2274,7 @@ var MediumExtractor = {
|
|
|
2313
2274
|
var $parent = $node.parents('figure');
|
|
2314
2275
|
if (ytRe.test(thumb)) {
|
|
2315
2276
|
var _thumb$match = thumb.match(ytRe),
|
|
2316
|
-
_thumb$match2 =
|
|
2277
|
+
_thumb$match2 = _slicedToArray(_thumb$match, 2);
|
|
2317
2278
|
_thumb$match2[0];
|
|
2318
2279
|
var youtubeId = _thumb$match2[1]; // eslint-disable-line
|
|
2319
2280
|
$node.attr('src', "https://www.youtube.com/embed/".concat(youtubeId));
|
|
@@ -2336,7 +2297,7 @@ var MediumExtractor = {
|
|
|
2336
2297
|
// Remove any smaller images that did not get caught by the generic image
|
|
2337
2298
|
// cleaner (author photo 48px, leading sentence images 79px, etc.).
|
|
2338
2299
|
img: function img($node) {
|
|
2339
|
-
var width =
|
|
2300
|
+
var width = _parseInt($node.attr('width'), 10);
|
|
2340
2301
|
if (width < 100) $node.remove();
|
|
2341
2302
|
}
|
|
2342
2303
|
},
|
|
@@ -3231,7 +3192,7 @@ var WwwMsnbcComExtractor = {
|
|
|
3231
3192
|
// before it's consumable content? E.g., unusual lazy loaded images
|
|
3232
3193
|
transforms: {
|
|
3233
3194
|
'.pane-node-body': function paneNodeBody($node, $) {
|
|
3234
|
-
var _WwwMsnbcComExtractor =
|
|
3195
|
+
var _WwwMsnbcComExtractor = _slicedToArray(WwwMsnbcComExtractor.lead_image_url.selectors[0], 2),
|
|
3235
3196
|
selector = _WwwMsnbcComExtractor[0],
|
|
3236
3197
|
attr = _WwwMsnbcComExtractor[1];
|
|
3237
3198
|
var src = $(selector).attr(attr);
|
|
@@ -5380,7 +5341,7 @@ var WiredJpExtractor = {
|
|
|
5380
5341
|
'img[data-original]': function imgDataOriginal($node) {
|
|
5381
5342
|
var dataOriginal = $node.attr('data-original');
|
|
5382
5343
|
var src = $node.attr('src');
|
|
5383
|
-
var url =
|
|
5344
|
+
var url = URL$1.resolve(src, dataOriginal);
|
|
5384
5345
|
$node.attr('src', url);
|
|
5385
5346
|
}
|
|
5386
5347
|
},
|
|
@@ -5684,8 +5645,6 @@ var PastebinComExtractor = {
|
|
|
5684
5645
|
}
|
|
5685
5646
|
};
|
|
5686
5647
|
|
|
5687
|
-
/* eslint-disable no-nested-ternary */
|
|
5688
|
-
/* eslint-disable no-unused-expressions */
|
|
5689
5648
|
var WwwAbendblattDeExtractor = {
|
|
5690
5649
|
domain: 'www.abendblatt.de',
|
|
5691
5650
|
title: {
|
|
@@ -6304,14 +6263,14 @@ var WwwSePlExtractor = {
|
|
|
6304
6263
|
}
|
|
6305
6264
|
};
|
|
6306
6265
|
|
|
6307
|
-
function ownKeys$f(e, r) { var t = _Object$
|
|
6308
|
-
function _objectSpread$f(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$f(Object(t),
|
|
6266
|
+
function ownKeys$f(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
6267
|
+
function _objectSpread$f(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$f(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$f(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
6309
6268
|
var SportSePlExtractor = _objectSpread$f(_objectSpread$f({}, WwwSePlExtractor), {}, {
|
|
6310
6269
|
domain: 'sport.se.pl'
|
|
6311
6270
|
});
|
|
6312
6271
|
|
|
6313
|
-
function ownKeys$e(e, r) { var t = _Object$
|
|
6314
|
-
function _objectSpread$e(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$e(Object(t),
|
|
6272
|
+
function ownKeys$e(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
6273
|
+
function _objectSpread$e(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$e(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$e(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
6315
6274
|
var PolitykaSePlExtractor = _objectSpread$e(_objectSpread$e({}, WwwSePlExtractor), {}, {
|
|
6316
6275
|
domain: 'polityka.se.pl'
|
|
6317
6276
|
});
|
|
@@ -6344,20 +6303,20 @@ var SuperserialeSePlExtractor = {
|
|
|
6344
6303
|
}
|
|
6345
6304
|
};
|
|
6346
6305
|
|
|
6347
|
-
function ownKeys$d(e, r) { var t = _Object$
|
|
6348
|
-
function _objectSpread$d(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$d(Object(t),
|
|
6306
|
+
function ownKeys$d(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
6307
|
+
function _objectSpread$d(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$d(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$d(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
6349
6308
|
var SzczecinSePlExtractor = _objectSpread$d(_objectSpread$d({}, WwwSePlExtractor), {}, {
|
|
6350
6309
|
domain: 'szczecin.se.pl'
|
|
6351
6310
|
});
|
|
6352
6311
|
|
|
6353
|
-
function ownKeys$c(e, r) { var t = _Object$
|
|
6354
|
-
function _objectSpread$c(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$c(Object(t),
|
|
6312
|
+
function ownKeys$c(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
6313
|
+
function _objectSpread$c(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$c(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$c(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
6355
6314
|
var SuperbizSePlExtractor = _objectSpread$c(_objectSpread$c({}, WwwSePlExtractor), {}, {
|
|
6356
6315
|
domain: 'superbiz.se.pl'
|
|
6357
6316
|
});
|
|
6358
6317
|
|
|
6359
|
-
function ownKeys$b(e, r) { var t = _Object$
|
|
6360
|
-
function _objectSpread$b(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$b(Object(t),
|
|
6318
|
+
/**
 * Lists the own keys of `e`: the keys reported by `_Object$keys`, followed by
 * its own symbols when symbol lookup is available.
 *
 * @param {Object} e - Object whose own keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose property descriptor is
 *   not enumerable are left out.
 * @returns {Array} The collected keys, symbols last.
 */
function ownKeys$b(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
6319
|
+
/**
 * Copies the own properties of every argument after the first onto `e`.
 *
 * Sources at odd argument positions are copied by value through
 * `_defineProperty`; sources at even positions are copied as full property
 * descriptors. A `null`/`undefined` source is treated as `{}`.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$b(e) {
  // Odd positions: copy each own key (enumerable symbols only) by value.
  function assignValues(src) {
    ownKeys$b(Object(src), true).forEach(function (key) {
      _defineProperty(e, key, src[key]);
    });
  }

  // Even positions: copy descriptors, in bulk when the bulk API is available.
  function assignDescriptors(src) {
    if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(src));
      return;
    }
    ownKeys$b(Object(src)).forEach(function (key) {
      _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(src, key));
    });
  }

  for (var i = 1; i < arguments.length; i += 1) {
    var src = arguments[i] == null ? {} : arguments[i];
    if (i % 2) {
      assignValues(src);
    } else {
      assignDescriptors(src);
    }
  }
  return e;
}
|
|
6361
6320
|
// Same extraction rules as WwwSePlExtractor; only the domain differs.
var PortalobronnySePlExtractor = _objectSpread$b({}, WwwSePlExtractor, {
  domain: 'portalobronny.se.pl'
});
|
|
@@ -6384,26 +6343,26 @@ var PolskisamorzadSePlExtractor = {
|
|
|
6384
6343
|
}
|
|
6385
6344
|
};
|
|
6386
6345
|
|
|
6387
|
-
function ownKeys$a(e, r) { var t = _Object$
|
|
6388
|
-
function _objectSpread$a(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$a(Object(t),
|
|
6346
|
+
/**
 * Lists the own keys of `e`: the keys reported by `_Object$keys`, followed by
 * its own symbols when symbol lookup is available.
 *
 * @param {Object} e - Object whose own keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose property descriptor is
 *   not enumerable are left out.
 * @returns {Array} The collected keys, symbols last.
 */
function ownKeys$a(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
6347
|
+
/**
 * Copies the own properties of every argument after the first onto `e`.
 *
 * Sources at odd argument positions are copied by value through
 * `_defineProperty`; sources at even positions are copied as full property
 * descriptors. A `null`/`undefined` source is treated as `{}`.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$a(e) {
  // Odd positions: copy each own key (enumerable symbols only) by value.
  function assignValues(src) {
    ownKeys$a(Object(src), true).forEach(function (key) {
      _defineProperty(e, key, src[key]);
    });
  }

  // Even positions: copy descriptors, in bulk when the bulk API is available.
  function assignDescriptors(src) {
    if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(src));
      return;
    }
    ownKeys$a(Object(src)).forEach(function (key) {
      _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(src, key));
    });
  }

  for (var i = 1; i < arguments.length; i += 1) {
    var src = arguments[i] == null ? {} : arguments[i];
    if (i % 2) {
      assignValues(src);
    } else {
      assignDescriptors(src);
    }
  }
  return e;
}
|
|
6389
6348
|
// Same extraction rules as WwwSePlExtractor; only the domain differs.
var LodzSePlExtractor = _objectSpread$a({}, WwwSePlExtractor, {
  domain: 'lodz.se.pl'
});
|
|
6392
6351
|
|
|
6393
|
-
function ownKeys$9(e, r) { var t = _Object$
|
|
6394
|
-
function _objectSpread$9(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$9(Object(t),
|
|
6352
|
+
/**
 * Lists the own keys of `e`: the keys reported by `_Object$keys`, followed by
 * its own symbols when symbol lookup is available.
 *
 * @param {Object} e - Object whose own keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose property descriptor is
 *   not enumerable are left out.
 * @returns {Array} The collected keys, symbols last.
 */
function ownKeys$9(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
6353
|
+
/**
 * Copies the own properties of every argument after the first onto `e`.
 *
 * Sources at odd argument positions are copied by value through
 * `_defineProperty`; sources at even positions are copied as full property
 * descriptors. A `null`/`undefined` source is treated as `{}`.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$9(e) {
  // Odd positions: copy each own key (enumerable symbols only) by value.
  function assignValues(src) {
    ownKeys$9(Object(src), true).forEach(function (key) {
      _defineProperty(e, key, src[key]);
    });
  }

  // Even positions: copy descriptors, in bulk when the bulk API is available.
  function assignDescriptors(src) {
    if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(src));
      return;
    }
    ownKeys$9(Object(src)).forEach(function (key) {
      _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(src, key));
    });
  }

  for (var i = 1; i < arguments.length; i += 1) {
    var src = arguments[i] == null ? {} : arguments[i];
    if (i % 2) {
      assignValues(src);
    } else {
      assignDescriptors(src);
    }
  }
  return e;
}
|
|
6395
6354
|
// Same extraction rules as WwwSePlExtractor; only the domain differs.
var WroclawSePlExtractor = _objectSpread$9({}, WwwSePlExtractor, {
  domain: 'wroclaw.se.pl'
});
|
|
6398
6357
|
|
|
6399
|
-
function ownKeys$8(e, r) { var t = _Object$
|
|
6400
|
-
function _objectSpread$8(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$8(Object(t),
|
|
6358
|
+
/**
 * Lists the own keys of `e`: the keys reported by `_Object$keys`, followed by
 * its own symbols when symbol lookup is available.
 *
 * @param {Object} e - Object whose own keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose property descriptor is
 *   not enumerable are left out.
 * @returns {Array} The collected keys, symbols last.
 */
function ownKeys$8(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
6359
|
+
/**
 * Copies the own properties of every argument after the first onto `e`.
 *
 * Sources at odd argument positions are copied by value through
 * `_defineProperty`; sources at even positions are copied as full property
 * descriptors. A `null`/`undefined` source is treated as `{}`.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$8(e) {
  // Odd positions: copy each own key (enumerable symbols only) by value.
  function assignValues(src) {
    ownKeys$8(Object(src), true).forEach(function (key) {
      _defineProperty(e, key, src[key]);
    });
  }

  // Even positions: copy descriptors, in bulk when the bulk API is available.
  function assignDescriptors(src) {
    if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(src));
      return;
    }
    ownKeys$8(Object(src)).forEach(function (key) {
      _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(src, key));
    });
  }

  for (var i = 1; i < arguments.length; i += 1) {
    var src = arguments[i] == null ? {} : arguments[i];
    if (i % 2) {
      assignValues(src);
    } else {
      assignDescriptors(src);
    }
  }
  return e;
}
|
|
6401
6360
|
// Same extraction rules as WwwSePlExtractor; only the domain differs.
var LublinSePlExtractor = _objectSpread$8({}, WwwSePlExtractor, {
  domain: 'lublin.se.pl'
});
|
|
6404
6363
|
|
|
6405
|
-
function ownKeys$7(e, r) { var t = _Object$
|
|
6406
|
-
function _objectSpread$7(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$7(Object(t),
|
|
6364
|
+
/**
 * Lists the own keys of `e`: the keys reported by `_Object$keys`, followed by
 * its own symbols when symbol lookup is available.
 *
 * @param {Object} e - Object whose own keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose property descriptor is
 *   not enumerable are left out.
 * @returns {Array} The collected keys, symbols last.
 */
function ownKeys$7(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
6365
|
+
/**
 * Copies the own properties of every argument after the first onto `e`.
 *
 * Sources at odd argument positions are copied by value through
 * `_defineProperty`; sources at even positions are copied as full property
 * descriptors. A `null`/`undefined` source is treated as `{}`.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$7(e) {
  // Odd positions: copy each own key (enumerable symbols only) by value.
  function assignValues(src) {
    ownKeys$7(Object(src), true).forEach(function (key) {
      _defineProperty(e, key, src[key]);
    });
  }

  // Even positions: copy descriptors, in bulk when the bulk API is available.
  function assignDescriptors(src) {
    if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(src));
      return;
    }
    ownKeys$7(Object(src)).forEach(function (key) {
      _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(src, key));
    });
  }

  for (var i = 1; i < arguments.length; i += 1) {
    var src = arguments[i] == null ? {} : arguments[i];
    if (i % 2) {
      assignValues(src);
    } else {
      assignDescriptors(src);
    }
  }
  return e;
}
|
|
6407
6366
|
// Same extraction rules as WwwSePlExtractor; only the domain differs.
var BialystokSePlExtractor = _objectSpread$7({}, WwwSePlExtractor, {
  domain: 'bialystok.se.pl'
});
|
|
@@ -6659,7 +6618,7 @@ var WwwPolygonComExtractor = {
|
|
|
6659
6618
|
img: function img($node) {
|
|
6660
6619
|
var srcset = $node.attr('srcset');
|
|
6661
6620
|
var _split = (srcset || '').split(','),
|
|
6662
|
-
_split2 =
|
|
6621
|
+
_split2 = _slicedToArray(_split, 1),
|
|
6663
6622
|
src = _split2[0];
|
|
6664
6623
|
if (src) {
|
|
6665
6624
|
$node.parent().replaceWith("<figure><img srcset=\"".concat(srcset, "\" src=\"").concat(src, "\"/></figure>"));
|
|
@@ -6699,7 +6658,7 @@ var WwwThevergeComExtractor = {
|
|
|
6699
6658
|
img: function img($node) {
|
|
6700
6659
|
var srcset = $node.attr('srcset');
|
|
6701
6660
|
var _split = (srcset || '').split(','),
|
|
6702
|
-
_split2 =
|
|
6661
|
+
_split2 = _slicedToArray(_split, 1),
|
|
6703
6662
|
src = _split2[0];
|
|
6704
6663
|
if (src) {
|
|
6705
6664
|
$node.parent().replaceWith("<figure><img srcset=\"".concat(srcset, "\" src=\"").concat(src, "\"/></figure>"));
|
|
@@ -7233,8 +7192,8 @@ var WwwEuronewsComExtractor = {
|
|
|
7233
7192
|
}
|
|
7234
7193
|
};
|
|
7235
7194
|
|
|
7236
|
-
function ownKeys$6(e, r) { var t = _Object$
|
|
7237
|
-
function _objectSpread$6(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$6(Object(t),
|
|
7195
|
+
/**
 * Lists the own keys of `e`: the keys reported by `_Object$keys`, followed by
 * its own symbols when symbol lookup is available.
 *
 * @param {Object} e - Object whose own keys are collected.
 * @param {boolean} [r] - When truthy, symbols whose property descriptor is
 *   not enumerable are left out.
 * @returns {Array} The collected keys, symbols last.
 */
function ownKeys$6(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
7196
|
+
/**
 * Copies the own properties of every argument after the first onto `e`.
 *
 * Sources at odd argument positions are copied by value through
 * `_defineProperty`; sources at even positions are copied as full property
 * descriptors. A `null`/`undefined` source is treated as `{}`.
 *
 * @param {Object} e - Target object; it is mutated and returned.
 * @returns {Object} The same `e` instance.
 */
function _objectSpread$6(e) {
  // Odd positions: copy each own key (enumerable symbols only) by value.
  function assignValues(src) {
    ownKeys$6(Object(src), true).forEach(function (key) {
      _defineProperty(e, key, src[key]);
    });
  }

  // Even positions: copy descriptors, in bulk when the bulk API is available.
  function assignDescriptors(src) {
    if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(src));
      return;
    }
    ownKeys$6(Object(src)).forEach(function (key) {
      _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(src, key));
    });
  }

  for (var i = 1; i < arguments.length; i += 1) {
    var src = arguments[i] == null ? {} : arguments[i];
    if (i % 2) {
      assignValues(src);
    } else {
      assignDescriptors(src);
    }
  }
  return e;
}
|
|
7238
7197
|
// Same extraction rules as WwwEuronewsComExtractor; only the domain differs.
var GrEuronewsComExtractor = _objectSpread$6({}, WwwEuronewsComExtractor, {
  domain: 'gr.euronews.com'
});
|
|
@@ -7264,211 +7223,426 @@ var WwwIlfattoquotidianoItExtractor = {
|
|
|
7264
7223
|
}
|
|
7265
7224
|
};
|
|
7266
7225
|
|
|
7226
|
+
// Extraction rules for actualidad.rt.com.
var ActualidadRtComExtractor = {
  domain: 'actualidad.rt.com',

  title: {
    selectors: [
      ['meta[name="og:title"]', 'value']
    ]
  },

  author: {
    selectors: [
      ['meta[name="article:author"]', 'value']
    ]
  },

  date_published: {
    selectors: [
      ['meta[name="mediator_published_time"]', 'value']
    ]
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value']
    ]
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: ['.ArticleView-text'],
    transforms: {},
    // Each <img> sits inside a <picture> whose <source> elements carry a
    // base64 placeholder srcset. Browsers prefer that over the real <img src>,
    // so the <source> elements are removed and the <img> URL is what renders.
    clean: ['.ReadMore-root', 'source']
  }
};
|
|
7252
|
+
|
|
7253
|
+
// Extraction rules for www.tweaktown.com.
var WwwTweaktownComExtractor = {
  domain: 'www.tweaktown.com',

  title: {
    selectors: [
      ['meta[name="og:title"]', 'value']
    ]
  },

  // The author is read from the byline link rather than from a meta tag.
  author: {
    selectors: ['.info-bar-div2 a[rel="author"]']
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value']
    ]
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value']
    ]
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  // No transforms and nothing to clean for this site.
  content: {
    selectors: ['#article-body'],
    transforms: {},
    clean: []
  }
};
|
|
7276
|
+
|
|
7277
|
+
// Extraction rules for www.frandroid.com.
var WwwFrandroidComExtractor = {
  domain: 'www.frandroid.com',

  title: {
    selectors: [
      ['meta[name="og:title"]', 'value']
    ]
  },

  author: {
    selectors: [
      ['meta[name="parsely-author"]', 'value']
    ]
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value']
    ]
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value']
    ]
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: ['section.article-content'],

    transforms: {
      // Section headings get their class replaced with the
      // 'mercury-parser-keep' marker.
      h2: function h2($heading) {
        var marked = $heading.attr('class', 'mercury-parser-keep');
        return marked;
      },
      h3: function h3($heading) {
        var marked = $heading.attr('class', 'mercury-parser-keep');
        return marked;
      }
    },

    clean: [
      '.index-menu-wrapper',
      '.is-gastric-kingfisher',
      '.newsletter-form',
      '.share',
      '.article-footer',
      '.js-feed-posts',
      '.optidigital-adslot',
      '[id^="optidigital-adslot"]'
    ]
  }
};
|
|
7307
|
+
|
|
7308
|
+
// Extraction rules for www.motorsport.com.
var WwwMotorsportComExtractor = {
  domain: 'www.motorsport.com',

  title: {
    selectors: [
      ['meta[name="og:title"]', 'value']
    ]
  },

  author: {
    selectors: ['.msnt-author-toolbar a[href*="/info/about-us/"]']
  },

  date_published: {
    selectors: [
      ['meta[name="datePublished"]', 'value']
    ]
  },

  dek: {
    selectors: ['h2.text-article-description']
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: ['.ms-article-content'],

    transforms: {
      // Headings get their class replaced with the 'mercury-parser-keep' marker.
      h2: function h2($heading) {
        var marked = $heading.attr('class', 'mercury-parser-keep');
        return marked;
      }
    },

    // NOTE(review): 'msnt-survey-promo' has no leading dot, so it selects an
    // element with that tag name, not a class. Confirm that is intended.
    clean: [
      'msnt-survey-promo',
      '.article-fullwidth-gallery_item ~ .article-fullwidth-gallery_item',
      '.ms-inarticle-widgets',
      '.relatedContent',
      '.ms-apb',
      '.ms-ap-native',
      '.outstream_partner'
    ]
  }
};
|
|
7335
|
+
|
|
7336
|
+
// Extraction rules for substack.com.
var SubstackComExtractor = {
  domain: 'substack.com',

  title: {
    selectors: [
      ['meta[name="og:title"]', 'value']
    ]
  },

  author: {
    selectors: [
      ['meta[name="author"]', 'value']
    ]
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value']
    ]
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value']
    ]
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: ['.available-content'],

    transforms: {
      'div.captioned-image-container': 'figure',
      // Swap the link wrapper for the image(s) found inside it.
      'div.image-link': function divImageLink($node) {
        var $images = $node.find('img');
        $node.replaceWith($images);
      }
    },

    clean: [
      '.subscribe-widget',
      '.subscription-widget-wrap',
      '.subscription-widget-wrap-editor',
      '.button-wrapper',
      '.poll-embed',
      '.share-dialog'
    ]
  }
};
|
|
7364
|
+
|
|
7365
|
+
// Extraction rules for www.dw.com.
var WwwDwComExtractor = {
  domain: 'www.dw.com',

  title: {
    selectors: [
      ['meta[name="og:title"]', 'value']
    ]
  },

  author: {
    selectors: ['.author-name .author-link']
  },

  date_published: {
    selectors: [
      ['meta[name="date"]', 'value']
    ]
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value']
    ]
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: ['[data-tracking-name="rich-text"]'],

    transforms: {
      // Inline images are responsive: the URL template is in data-url and
      // contains a literal ${formatId} size token that page JS would fill in,
      // so the raw HTML has a broken src. Fill the token with a fixed size.
      img: function img(node) {
        var token = '${formatId}';
        var template = node.attr('data-url') || node.attr('src') || '';
        if (template.indexOf(token) === -1) {
          return;
        }
        node.attr('src', template.replace(token, '6'));
      }
    },

    // Without JS, embedded tweets are only non-functional fallback markup.
    clean: ['blockquote.tweet.embed']
  }
};
|
|
7399
|
+
|
|
7400
|
+
// Extraction rules for www.animenewsnetwork.com.
var WwwAnimenewsnetworkComExtractor = {
  domain: 'www.animenewsnetwork.com',

  title: {
    selectors: [
      ['meta[name="og:title"]', 'value']
    ]
  },

  // No author selector is configured for this site.
  author: null,

  date_published: {
    selectors: [
      ['small time', 'datetime']
    ]
  },

  dek: {
    selectors: [
      ['meta[name="description"]', 'value']
    ]
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: ['.KonaBody'],

    transforms: {
      // Images are lazy-loaded: the real URL is in data-src and src holds a
      // spacer.gif. Promote data-src so the images survive cleaning and render.
      img: function img(node) {
        var dataSrc = node.attr('data-src');
        if (!dataSrc) {
          return;
        }

        var src = dataSrc;
        if (dataSrc.startsWith('//')) {
          // Protocol-relative URL: it already names a host, so only add the
          // scheme. Prefixing the site origin would yield a broken
          // "https://www.animenewsnetwork.com//host/..." address.
          src = 'https:'.concat(dataSrc);
        } else if (dataSrc.startsWith('/')) {
          // Root-relative path: resolve it against the site origin.
          src = 'https://www.animenewsnetwork.com'.concat(dataSrc);
        }

        node.attr('src', src);
        node.removeAttr('data-src');
      }
    },

    // .intro duplicates the dek; instaread-player is an audio widget.
    clean: ['.intro', 'instaread-player']
  }
};
|
|
7433
|
+
|
|
7267
7434
|
var CustomExtractors = /*#__PURE__*/Object.freeze({
|
|
7268
7435
|
__proto__: null,
|
|
7436
|
+
AbcnewsGoComExtractor: AbcnewsGoComExtractor,
|
|
7437
|
+
ActualidadRtComExtractor: ActualidadRtComExtractor,
|
|
7438
|
+
ApartmentTherapyExtractor: ApartmentTherapyExtractor,
|
|
7439
|
+
ArstechnicaComExtractor: ArstechnicaComExtractor,
|
|
7269
7440
|
BalloonJuiceComExtractor: BalloonJuiceComExtractor,
|
|
7441
|
+
BialystokSePlExtractor: BialystokSePlExtractor,
|
|
7442
|
+
BiorxivOrgExtractor: BiorxivOrgExtractor,
|
|
7443
|
+
BlisterreviewComExtractor: BlisterreviewComExtractor,
|
|
7270
7444
|
BloggerExtractor: BloggerExtractor,
|
|
7271
|
-
|
|
7272
|
-
|
|
7273
|
-
|
|
7274
|
-
|
|
7275
|
-
TheAtlanticExtractor: TheAtlanticExtractor,
|
|
7276
|
-
NewYorkerExtractor: NewYorkerExtractor,
|
|
7277
|
-
WiredExtractor: WiredExtractor,
|
|
7278
|
-
MSNExtractor: MSNExtractor,
|
|
7279
|
-
YahooExtractor: YahooExtractor,
|
|
7445
|
+
BookwalkerJpExtractor: BookwalkerJpExtractor,
|
|
7446
|
+
BroadwayWorldExtractor: BroadwayWorldExtractor,
|
|
7447
|
+
BskyAppExtractor: BskyAppExtractor,
|
|
7448
|
+
BuzzapJpExtractor: BuzzapJpExtractor,
|
|
7280
7449
|
BuzzfeedExtractor: BuzzfeedExtractor,
|
|
7281
|
-
|
|
7282
|
-
|
|
7283
|
-
|
|
7450
|
+
ChicagoyimbyComExtractor: ChicagoyimbyComExtractor,
|
|
7451
|
+
ClinicaltrialsGovExtractor: ClinicaltrialsGovExtractor,
|
|
7452
|
+
DeadlineComExtractor: DeadlineComExtractor,
|
|
7284
7453
|
DeadspinExtractor: DeadspinExtractor,
|
|
7285
|
-
|
|
7286
|
-
|
|
7287
|
-
|
|
7288
|
-
WwwTmzComExtractor: WwwTmzComExtractor,
|
|
7289
|
-
WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor,
|
|
7290
|
-
WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor,
|
|
7291
|
-
NewrepublicComExtractor: NewrepublicComExtractor,
|
|
7292
|
-
MoneyCnnComExtractor: MoneyCnnComExtractor,
|
|
7293
|
-
WwwCnnComExtractor: WwwCnnComExtractor,
|
|
7294
|
-
WwwAolComExtractor: WwwAolComExtractor,
|
|
7295
|
-
WwwYoutubeComExtractor: WwwYoutubeComExtractor,
|
|
7296
|
-
WwwTheguardianComExtractor: WwwTheguardianComExtractor,
|
|
7297
|
-
WwwSbnationComExtractor: WwwSbnationComExtractor,
|
|
7298
|
-
WwwBloombergComExtractor: WwwBloombergComExtractor,
|
|
7299
|
-
WwwBustleComExtractor: WwwBustleComExtractor,
|
|
7300
|
-
WwwNprOrgExtractor: WwwNprOrgExtractor,
|
|
7301
|
-
WwwRecodeNetExtractor: WwwRecodeNetExtractor,
|
|
7302
|
-
QzComExtractor: QzComExtractor,
|
|
7303
|
-
WwwDmagazineComExtractor: WwwDmagazineComExtractor,
|
|
7304
|
-
WwwReutersComExtractor: WwwReutersComExtractor,
|
|
7305
|
-
MashableComExtractor: MashableComExtractor,
|
|
7306
|
-
WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor,
|
|
7307
|
-
WwwVoxComExtractor: WwwVoxComExtractor,
|
|
7308
|
-
NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor,
|
|
7309
|
-
WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor,
|
|
7310
|
-
WwwLatimesComExtractor: WwwLatimesComExtractor,
|
|
7311
|
-
PagesixComExtractor: PagesixComExtractor,
|
|
7312
|
-
ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor,
|
|
7313
|
-
WwwCbssportsComExtractor: WwwCbssportsComExtractor,
|
|
7314
|
-
WwwMsnbcComExtractor: WwwMsnbcComExtractor,
|
|
7315
|
-
WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor,
|
|
7316
|
-
WwwMentalflossComExtractor: WwwMentalflossComExtractor,
|
|
7317
|
-
AbcnewsGoComExtractor: AbcnewsGoComExtractor,
|
|
7318
|
-
WwwNydailynewsComExtractor: WwwNydailynewsComExtractor,
|
|
7319
|
-
WwwCnbcComExtractor: WwwCnbcComExtractor,
|
|
7320
|
-
WwwPopsugarComExtractor: WwwPopsugarComExtractor,
|
|
7321
|
-
ObserverComExtractor: ObserverComExtractor,
|
|
7322
|
-
PeopleComExtractor: PeopleComExtractor,
|
|
7323
|
-
WwwUsmagazineComExtractor: WwwUsmagazineComExtractor,
|
|
7324
|
-
WwwRollingstoneComExtractor: WwwRollingstoneComExtractor,
|
|
7325
|
-
twofortysevensportsComExtractor: twofortysevensportsComExtractor,
|
|
7326
|
-
UproxxComExtractor: UproxxComExtractor,
|
|
7327
|
-
WwwEonlineComExtractor: WwwEonlineComExtractor,
|
|
7328
|
-
WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor,
|
|
7329
|
-
WwwRefinery29ComExtractor: WwwRefinery29ComExtractor,
|
|
7330
|
-
WwwMacrumorsComExtractor: WwwMacrumorsComExtractor,
|
|
7331
|
-
WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor,
|
|
7332
|
-
WwwSiComExtractor: WwwSiComExtractor,
|
|
7333
|
-
WwwRawstoryComExtractor: WwwRawstoryComExtractor,
|
|
7334
|
-
WwwCnetComExtractor: WwwCnetComExtractor,
|
|
7335
|
-
WwwTodayComExtractor: WwwTodayComExtractor,
|
|
7336
|
-
WwwAlComExtractor: WwwAlComExtractor,
|
|
7337
|
-
WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
|
|
7338
|
-
WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
|
|
7339
|
-
WwwAmericanowComExtractor: WwwAmericanowComExtractor,
|
|
7340
|
-
ScienceflyComExtractor: ScienceflyComExtractor,
|
|
7341
|
-
HellogigglesComExtractor: HellogigglesComExtractor,
|
|
7342
|
-
ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
|
|
7343
|
-
WwwInquisitrComExtractor: WwwInquisitrComExtractor,
|
|
7344
|
-
WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
|
|
7454
|
+
EconomictimesIndiatimesComExtractor: EconomictimesIndiatimesComExtractor,
|
|
7455
|
+
EpaperZeitDeExtractor: EpaperZeitDeExtractor,
|
|
7456
|
+
FactorioComExtractor: FactorioComExtractor,
|
|
7345
7457
|
FortuneComExtractor: FortuneComExtractor,
|
|
7346
|
-
WwwLinkedinComExtractor: WwwLinkedinComExtractor,
|
|
7347
|
-
ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
|
|
7348
|
-
WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
|
|
7349
|
-
WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor,
|
|
7350
7458
|
ForwardComExtractor: ForwardComExtractor,
|
|
7351
|
-
|
|
7459
|
+
GeniusComExtractor: GeniusComExtractor,
|
|
7460
|
+
GetnewsJpExtractor: GetnewsJpExtractor,
|
|
7461
|
+
GithubComExtractor: GithubComExtractor,
|
|
7462
|
+
GonintendoComExtractor: GonintendoComExtractor,
|
|
7352
7463
|
GothamistComExtractor: GothamistComExtractor,
|
|
7353
|
-
|
|
7354
|
-
|
|
7464
|
+
GrEuronewsComExtractor: GrEuronewsComExtractor,
|
|
7465
|
+
HellogigglesComExtractor: HellogigglesComExtractor,
|
|
7355
7466
|
IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor,
|
|
7356
|
-
|
|
7357
|
-
|
|
7358
|
-
|
|
7467
|
+
JapanCnetComExtractor: JapanCnetComExtractor,
|
|
7468
|
+
JapanZdnetComExtractor: JapanZdnetComExtractor,
|
|
7469
|
+
JvndbJvnJpExtractor: JvndbJvnJpExtractor,
|
|
7470
|
+
LittleThingsExtractor: LittleThingsExtractor,
|
|
7471
|
+
LodzSePlExtractor: LodzSePlExtractor,
|
|
7472
|
+
LublinSePlExtractor: LublinSePlExtractor,
|
|
7473
|
+
MSNExtractor: MSNExtractor,
|
|
7474
|
+
MaTtiasBeExtractor: MaTtiasBeExtractor,
|
|
7475
|
+
MashableComExtractor: MashableComExtractor,
|
|
7476
|
+
MediumExtractor: MediumExtractor,
|
|
7477
|
+
MobilesyrupComExtractor: MobilesyrupComExtractor,
|
|
7478
|
+
MoneyCnnComExtractor: MoneyCnnComExtractor,
|
|
7479
|
+
NYMagExtractor: NYMagExtractor,
|
|
7480
|
+
NYTimesExtractor: NYTimesExtractor,
|
|
7481
|
+
NewYorkerExtractor: NewYorkerExtractor,
|
|
7482
|
+
NewrepublicComExtractor: NewrepublicComExtractor,
|
|
7359
7483
|
NewsMynaviJpExtractor: NewsMynaviJpExtractor,
|
|
7360
|
-
|
|
7361
|
-
|
|
7362
|
-
|
|
7484
|
+
NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor,
|
|
7485
|
+
NewsPtsOrgTwExtractor: NewsPtsOrgTwExtractor,
|
|
7486
|
+
Nineto5googleComExtractor: Nineto5googleComExtractor,
|
|
7487
|
+
Nineto5linuxComExtractor: Nineto5linuxComExtractor,
|
|
7488
|
+
Nineto5macComExtractor: Nineto5macComExtractor,
|
|
7489
|
+
ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
|
|
7490
|
+
ObserverComExtractor: ObserverComExtractor,
|
|
7491
|
+
OrfAtExtractor: OrfAtExtractor,
|
|
7363
7492
|
OtrsComExtractor: OtrsComExtractor,
|
|
7364
|
-
|
|
7365
|
-
|
|
7366
|
-
|
|
7367
|
-
WwwSanwaCoJpExtractor: WwwSanwaCoJpExtractor,
|
|
7368
|
-
WwwElecomCoJpExtractor: WwwElecomCoJpExtractor,
|
|
7369
|
-
ScanNetsecurityNeJpExtractor: ScanNetsecurityNeJpExtractor,
|
|
7370
|
-
JvndbJvnJpExtractor: JvndbJvnJpExtractor,
|
|
7371
|
-
GeniusComExtractor: GeniusComExtractor,
|
|
7372
|
-
WwwJnsaOrgExtractor: WwwJnsaOrgExtractor,
|
|
7493
|
+
PagesixComExtractor: PagesixComExtractor,
|
|
7494
|
+
PastebinComExtractor: PastebinComExtractor,
|
|
7495
|
+
PeopleComExtractor: PeopleComExtractor,
|
|
7373
7496
|
PhpspotOrgExtractor: PhpspotOrgExtractor,
|
|
7374
|
-
|
|
7375
|
-
|
|
7376
|
-
|
|
7377
|
-
|
|
7378
|
-
|
|
7379
|
-
|
|
7380
|
-
|
|
7381
|
-
|
|
7382
|
-
DeadlineComExtractor: DeadlineComExtractor,
|
|
7383
|
-
WwwGizmodoJpExtractor: WwwGizmodoJpExtractor,
|
|
7384
|
-
GetnewsJpExtractor: GetnewsJpExtractor,
|
|
7385
|
-
WwwLifehackerJpExtractor: WwwLifehackerJpExtractor,
|
|
7497
|
+
PitchforkComExtractor: PitchforkComExtractor,
|
|
7498
|
+
PoliticoExtractor: PoliticoExtractor,
|
|
7499
|
+
PolitykaSePlExtractor: PolitykaSePlExtractor,
|
|
7500
|
+
PolskisamorzadSePlExtractor: PolskisamorzadSePlExtractor,
|
|
7501
|
+
PortalobronnySePlExtractor: PortalobronnySePlExtractor,
|
|
7502
|
+
QzComExtractor: QzComExtractor,
|
|
7503
|
+
ScanNetsecurityNeJpExtractor: ScanNetsecurityNeJpExtractor,
|
|
7504
|
+
ScienceflyComExtractor: ScienceflyComExtractor,
|
|
7386
7505
|
SectIijAdJpExtractor: SectIijAdJpExtractor,
|
|
7387
|
-
|
|
7388
|
-
|
|
7389
|
-
|
|
7506
|
+
SgNewsYahooComExtractor: SgNewsYahooComExtractor,
|
|
7507
|
+
SpektrumExtractor: SpektrumExtractor,
|
|
7508
|
+
SportSePlExtractor: SportSePlExtractor,
|
|
7509
|
+
SubstackComExtractor: SubstackComExtractor,
|
|
7510
|
+
SuperbizSePlExtractor: SuperbizSePlExtractor,
|
|
7511
|
+
SuperserialeSePlExtractor: SuperserialeSePlExtractor,
|
|
7512
|
+
SzczecinSePlExtractor: SzczecinSePlExtractor,
|
|
7513
|
+
TakagihiromitsuJpExtractor: TakagihiromitsuJpExtractor,
|
|
7514
|
+
TarnkappeInfoExtractor: TarnkappeInfoExtractor,
|
|
7515
|
+
TechcrunchComExtractor: TechcrunchComExtractor,
|
|
7390
7516
|
TechlogIijAdJpExtractor: TechlogIijAdJpExtractor,
|
|
7391
|
-
|
|
7392
|
-
|
|
7393
|
-
|
|
7394
|
-
|
|
7395
|
-
WwwPhoronixComExtractor: WwwPhoronixComExtractor,
|
|
7396
|
-
PitchforkComExtractor: PitchforkComExtractor,
|
|
7397
|
-
BiorxivOrgExtractor: BiorxivOrgExtractor,
|
|
7398
|
-
EpaperZeitDeExtractor: EpaperZeitDeExtractor,
|
|
7399
|
-
WwwLadbibleComExtractor: WwwLadbibleComExtractor,
|
|
7517
|
+
TerminaltroveComExtractor: TerminaltroveComExtractor,
|
|
7518
|
+
TheAtlanticExtractor: TheAtlanticExtractor,
|
|
7519
|
+
ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor,
|
|
7520
|
+
ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
|
|
7400
7521
|
TimesofindiaIndiatimesComExtractor: TimesofindiaIndiatimesComExtractor,
|
|
7401
|
-
|
|
7402
|
-
|
|
7403
|
-
|
|
7404
|
-
|
|
7405
|
-
|
|
7406
|
-
|
|
7407
|
-
|
|
7408
|
-
|
|
7409
|
-
|
|
7410
|
-
|
|
7522
|
+
TldrTechExtractor: TldrTechExtractor,
|
|
7523
|
+
TwitterExtractor: TwitterExtractor,
|
|
7524
|
+
UproxxComExtractor: UproxxComExtractor,
|
|
7525
|
+
WccftechComExtractor: WccftechComExtractor,
|
|
7526
|
+
WeeklyAsciiJpExtractor: WeeklyAsciiJpExtractor,
|
|
7527
|
+
WikiaExtractor: WikiaExtractor,
|
|
7528
|
+
WikipediaExtractor: WikipediaExtractor,
|
|
7529
|
+
WiredExtractor: WiredExtractor,
|
|
7530
|
+
WiredJpExtractor: WiredJpExtractor,
|
|
7531
|
+
WroclawSePlExtractor: WroclawSePlExtractor,
|
|
7411
7532
|
Www1pezeshkComExtractor: Www1pezeshkComExtractor,
|
|
7533
|
+
WwwAbendblattDeExtractor: WwwAbendblattDeExtractor,
|
|
7534
|
+
WwwAlComExtractor: WwwAlComExtractor,
|
|
7535
|
+
WwwAmericanowComExtractor: WwwAmericanowComExtractor,
|
|
7412
7536
|
WwwAndroidauthorityComExtractor: WwwAndroidauthorityComExtractor,
|
|
7413
|
-
|
|
7414
|
-
|
|
7415
|
-
|
|
7416
|
-
|
|
7537
|
+
WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor,
|
|
7538
|
+
WwwAnimenewsnetworkComExtractor: WwwAnimenewsnetworkComExtractor,
|
|
7539
|
+
WwwAolComExtractor: WwwAolComExtractor,
|
|
7540
|
+
WwwAsahiComExtractor: WwwAsahiComExtractor,
|
|
7541
|
+
WwwBlickDeExtractor: WwwBlickDeExtractor,
|
|
7542
|
+
WwwBloombergComExtractor: WwwBloombergComExtractor,
|
|
7543
|
+
WwwBustleComExtractor: WwwBustleComExtractor,
|
|
7544
|
+
WwwCbcCaExtractor: WwwCbcCaExtractor,
|
|
7545
|
+
WwwCbssportsComExtractor: WwwCbssportsComExtractor,
|
|
7417
7546
|
WwwChannelnewsasiaComExtractor: WwwChannelnewsasiaComExtractor,
|
|
7418
|
-
|
|
7547
|
+
WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor,
|
|
7548
|
+
WwwCnbcComExtractor: WwwCnbcComExtractor,
|
|
7549
|
+
WwwCnetComExtractor: WwwCnetComExtractor,
|
|
7550
|
+
WwwCnnComExtractor: WwwCnnComExtractor,
|
|
7551
|
+
WwwDmagazineComExtractor: WwwDmagazineComExtractor,
|
|
7552
|
+
WwwDwComExtractor: WwwDwComExtractor,
|
|
7553
|
+
WwwElecomCoJpExtractor: WwwElecomCoJpExtractor,
|
|
7554
|
+
WwwEngadgetComExtractor: WwwEngadgetComExtractor,
|
|
7555
|
+
WwwEonlineComExtractor: WwwEonlineComExtractor,
|
|
7556
|
+
WwwEuronewsComExtractor: WwwEuronewsComExtractor,
|
|
7557
|
+
WwwFastcompanyComExtractor: WwwFastcompanyComExtractor,
|
|
7558
|
+
WwwFlatpanelshdComExtractor: WwwFlatpanelshdComExtractor,
|
|
7559
|
+
WwwFoolComExtractor: WwwFoolComExtractor,
|
|
7560
|
+
WwwFortinetComExtractor: WwwFortinetComExtractor,
|
|
7561
|
+
WwwFrandroidComExtractor: WwwFrandroidComExtractor,
|
|
7562
|
+
WwwFuturaSciencesComExtractor: WwwFuturaSciencesComExtractor,
|
|
7563
|
+
WwwGizmodoJpExtractor: WwwGizmodoJpExtractor,
|
|
7564
|
+
WwwGrueneDeExtractor: WwwGrueneDeExtractor,
|
|
7565
|
+
WwwHardwarezoneComSgExtractor: WwwHardwarezoneComSgExtractor,
|
|
7419
7566
|
WwwHeiseDeExtractor: WwwHeiseDeExtractor,
|
|
7420
|
-
|
|
7421
|
-
|
|
7422
|
-
|
|
7423
|
-
|
|
7424
|
-
|
|
7425
|
-
|
|
7426
|
-
|
|
7427
|
-
|
|
7428
|
-
|
|
7429
|
-
|
|
7430
|
-
|
|
7431
|
-
LodzSePlExtractor: LodzSePlExtractor,
|
|
7432
|
-
WroclawSePlExtractor: WroclawSePlExtractor,
|
|
7433
|
-
LublinSePlExtractor: LublinSePlExtractor,
|
|
7434
|
-
BialystokSePlExtractor: BialystokSePlExtractor,
|
|
7567
|
+
WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor,
|
|
7568
|
+
WwwIlfattoquotidianoItExtractor: WwwIlfattoquotidianoItExtractor,
|
|
7569
|
+
WwwInfoqComExtractor: WwwInfoqComExtractor,
|
|
7570
|
+
WwwInquisitrComExtractor: WwwInquisitrComExtractor,
|
|
7571
|
+
WwwInvestmentexecutiveComExtractor: WwwInvestmentexecutiveComExtractor,
|
|
7572
|
+
WwwIpaGoJpExtractor: WwwIpaGoJpExtractor,
|
|
7573
|
+
WwwItmediaCoJpExtractor: WwwItmediaCoJpExtractor,
|
|
7574
|
+
WwwJalopnikComExtractor: WwwJalopnikComExtractor,
|
|
7575
|
+
WwwJnsaOrgExtractor: WwwJnsaOrgExtractor,
|
|
7576
|
+
WwwLadbibleComExtractor: WwwLadbibleComExtractor,
|
|
7577
|
+
WwwLatimesComExtractor: WwwLatimesComExtractor,
|
|
7435
7578
|
WwwLebensmittelwarnungDeExtractor: WwwLebensmittelwarnungDeExtractor,
|
|
7579
|
+
WwwLemondeFrExtractor: WwwLemondeFrExtractor,
|
|
7580
|
+
WwwLifehackerJpExtractor: WwwLifehackerJpExtractor,
|
|
7581
|
+
WwwLinkedinComExtractor: WwwLinkedinComExtractor,
|
|
7582
|
+
WwwMacrumorsComExtractor: WwwMacrumorsComExtractor,
|
|
7583
|
+
WwwMentalflossComExtractor: WwwMentalflossComExtractor,
|
|
7584
|
+
WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor,
|
|
7585
|
+
WwwMoongiftJpExtractor: WwwMoongiftJpExtractor,
|
|
7586
|
+
WwwMotorsportComExtractor: WwwMotorsportComExtractor,
|
|
7587
|
+
WwwMsnbcComExtractor: WwwMsnbcComExtractor,
|
|
7588
|
+
WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor,
|
|
7589
|
+
WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
|
|
7590
|
+
WwwNdtvComExtractor: WwwNdtvComExtractor,
|
|
7591
|
+
WwwNotebookcheckNetExtractor: WwwNotebookcheckNetExtractor,
|
|
7592
|
+
WwwNprOrgExtractor: WwwNprOrgExtractor,
|
|
7593
|
+
WwwNtvDeExtractor: WwwNtvDeExtractor,
|
|
7594
|
+
WwwNumeramaComExtractor: WwwNumeramaComExtractor,
|
|
7595
|
+
WwwNydailynewsComExtractor: WwwNydailynewsComExtractor,
|
|
7596
|
+
WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
|
|
7597
|
+
WwwOreillyCoJpExtractor: WwwOreillyCoJpExtractor,
|
|
7598
|
+
WwwOssnewsJpExtractor: WwwOssnewsJpExtractor,
|
|
7599
|
+
WwwPhoronixComExtractor: WwwPhoronixComExtractor,
|
|
7600
|
+
WwwPolygonComExtractor: WwwPolygonComExtractor,
|
|
7601
|
+
WwwPopsugarComExtractor: WwwPopsugarComExtractor,
|
|
7602
|
+
WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor,
|
|
7603
|
+
WwwPublickey1JpExtractor: WwwPublickey1JpExtractor,
|
|
7436
7604
|
WwwQbitaiComExtractor: WwwQbitaiComExtractor,
|
|
7437
|
-
|
|
7438
|
-
|
|
7605
|
+
WwwQdailyComExtractor: WwwQdailyComExtractor,
|
|
7606
|
+
WwwRawstoryComExtractor: WwwRawstoryComExtractor,
|
|
7607
|
+
WwwRbbtodayComExtractor: WwwRbbtodayComExtractor,
|
|
7608
|
+
WwwRecodeNetExtractor: WwwRecodeNetExtractor,
|
|
7609
|
+
WwwRedditComExtractor: WwwRedditComExtractor,
|
|
7610
|
+
WwwRefinery29ComExtractor: WwwRefinery29ComExtractor,
|
|
7611
|
+
WwwReutersComExtractor: WwwReutersComExtractor,
|
|
7612
|
+
WwwRollingstoneComExtractor: WwwRollingstoneComExtractor,
|
|
7613
|
+
WwwSanwaCoJpExtractor: WwwSanwaCoJpExtractor,
|
|
7614
|
+
WwwSbnationComExtractor: WwwSbnationComExtractor,
|
|
7615
|
+
WwwSePlExtractor: WwwSePlExtractor,
|
|
7616
|
+
WwwSiComExtractor: WwwSiComExtractor,
|
|
7617
|
+
WwwSlateComExtractor: WwwSlateComExtractor,
|
|
7618
|
+
WwwSpiegelDeExtractor: WwwSpiegelDeExtractor,
|
|
7439
7619
|
WwwTagesschauDeExtractor: WwwTagesschauDeExtractor,
|
|
7440
|
-
Nineto5googleComExtractor: Nineto5googleComExtractor,
|
|
7441
|
-
WwwEngadgetComExtractor: WwwEngadgetComExtractor,
|
|
7442
|
-
TarnkappeInfoExtractor: TarnkappeInfoExtractor,
|
|
7443
|
-
WwwVortezNetExtractor: WwwVortezNetExtractor,
|
|
7444
|
-
WwwPolygonComExtractor: WwwPolygonComExtractor,
|
|
7445
|
-
WwwThevergeComExtractor: WwwThevergeComExtractor,
|
|
7446
7620
|
WwwTechpowerupComExtractor: WwwTechpowerupComExtractor,
|
|
7447
|
-
WwwFlatpanelshdComExtractor: WwwFlatpanelshdComExtractor,
|
|
7448
|
-
Nineto5macComExtractor: Nineto5macComExtractor,
|
|
7449
|
-
WwwNotebookcheckNetExtractor: WwwNotebookcheckNetExtractor,
|
|
7450
|
-
WwwFuturaSciencesComExtractor: WwwFuturaSciencesComExtractor,
|
|
7451
|
-
SgNewsYahooComExtractor: SgNewsYahooComExtractor,
|
|
7452
|
-
GonintendoComExtractor: GonintendoComExtractor,
|
|
7453
|
-
OrfAtExtractor: OrfAtExtractor,
|
|
7454
|
-
WwwVideogameschronicleComExtractor: WwwVideogameschronicleComExtractor,
|
|
7455
|
-
WwwNumeramaComExtractor: WwwNumeramaComExtractor,
|
|
7456
|
-
TerminaltroveComExtractor: TerminaltroveComExtractor,
|
|
7457
|
-
NewsPtsOrgTwExtractor: NewsPtsOrgTwExtractor,
|
|
7458
7621
|
WwwThedriveComExtractor: WwwThedriveComExtractor,
|
|
7459
|
-
|
|
7460
|
-
|
|
7461
|
-
|
|
7622
|
+
WwwTheguardianComExtractor: WwwTheguardianComExtractor,
|
|
7623
|
+
WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
|
|
7624
|
+
WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor,
|
|
7625
|
+
WwwThevergeComExtractor: WwwThevergeComExtractor,
|
|
7626
|
+
WwwTmzComExtractor: WwwTmzComExtractor,
|
|
7627
|
+
WwwTodayComExtractor: WwwTodayComExtractor,
|
|
7462
7628
|
WwwTransfermarktDeExtractor: WwwTransfermarktDeExtractor,
|
|
7463
|
-
|
|
7464
|
-
|
|
7465
|
-
|
|
7466
|
-
|
|
7629
|
+
WwwTweaktownComExtractor: WwwTweaktownComExtractor,
|
|
7630
|
+
WwwUsmagazineComExtractor: WwwUsmagazineComExtractor,
|
|
7631
|
+
WwwVersantsComExtractor: WwwVersantsComExtractor,
|
|
7632
|
+
WwwVideogameschronicleComExtractor: WwwVideogameschronicleComExtractor,
|
|
7633
|
+
WwwVortezNetExtractor: WwwVortezNetExtractor,
|
|
7634
|
+
WwwVoxComExtractor: WwwVoxComExtractor,
|
|
7635
|
+
WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor,
|
|
7636
|
+
WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
|
|
7637
|
+
WwwYomiuriCoJpExtractor: WwwYomiuriCoJpExtractor,
|
|
7638
|
+
WwwYoutubeComExtractor: WwwYoutubeComExtractor,
|
|
7639
|
+
YahooExtractor: YahooExtractor,
|
|
7640
|
+
twofortysevensportsComExtractor: twofortysevensportsComExtractor
|
|
7467
7641
|
});
|
|
7468
7642
|
|
|
7469
|
-
function ownKeys$5(e, r) { var t = _Object$
|
|
7470
|
-
function _objectSpread$5(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$5(Object(t),
|
|
7471
|
-
var Extractors = _Object$
|
|
7643
|
+
// Lists the keys of `e` (via _Object$keys) followed by its own symbols when
// _Object$getOwnPropertySymbols is available. With a truthy `r`, only symbols
// whose property descriptor is enumerable are appended.
function ownKeys$5(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
7644
|
+
// Copies properties from every argument after the first onto `e` and returns `e`.
// Arguments at odd positions are copied key-by-key with _defineProperty;
// arguments at even positions are copied through their property descriptors.
// A null/undefined argument is treated as an empty object.
function _objectSpread$5(e) {
  for (var idx = 1; idx < arguments.length; idx++) {
    var source = null != arguments[idx] ? arguments[idx] : {};
    if (idx % 2) {
      ownKeys$5(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$5(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
7645
|
+
var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
|
|
7472
7646
|
var extractor = CustomExtractors[key];
|
|
7473
7647
|
return _objectSpread$5(_objectSpread$5({}, acc), mergeSupportedDomains(extractor));
|
|
7474
7648
|
}, {});
|
|
@@ -7538,9 +7712,9 @@ function cleanDek(dek, _ref) {
|
|
|
7538
7712
|
return normalizeSpaces(dekText.trim());
|
|
7539
7713
|
}
|
|
7540
7714
|
|
|
7541
|
-
|
|
7542
|
-
|
|
7543
|
-
|
|
7715
|
+
dayjs.extend(utc);
|
|
7716
|
+
dayjs.extend(timezonePlugin);
|
|
7717
|
+
dayjs.extend(customParseFormat);
|
|
7544
7718
|
var TIMEZONE_ABBR_RE = /\b(EST|EDT|CST|CDT|MST|MDT|PST|PDT|ET|CT|MT|PT|GMT|UTC)\b/gi;
|
|
7545
7719
|
// Check if string contains timezone offset info (e.g., +0000, GMT+0000, Z)
|
|
7546
7720
|
var HAS_TIMEZONE_RE = /([+-]\d{2}:?\d{2}|Z|\bGMT[+-]\d+|\bUTC\b)/i;
|
|
@@ -7560,53 +7734,53 @@ function cleanDateString(dateString) {
|
|
|
7560
7734
|
}
|
|
7561
7735
|
function createDate(dateString, timezone, format) {
|
|
7562
7736
|
if (TIME_WITH_OFFSET_RE.test(dateString)) {
|
|
7563
|
-
return
|
|
7737
|
+
return dayjs(new Date(dateString));
|
|
7564
7738
|
}
|
|
7565
7739
|
if (TIME_AGO_STRING.test(dateString)) {
|
|
7566
7740
|
var fragments = TIME_AGO_STRING.exec(dateString);
|
|
7567
|
-
return
|
|
7741
|
+
return dayjs().subtract(fragments[1], fragments[2]);
|
|
7568
7742
|
}
|
|
7569
7743
|
if (TIME_NOW_STRING.test(dateString)) {
|
|
7570
|
-
return
|
|
7744
|
+
return dayjs();
|
|
7571
7745
|
}
|
|
7572
7746
|
var stringHasTimezone = hasTimezoneInfo(dateString);
|
|
7573
7747
|
var cleanedDateString = stripTimezoneAbbr(dateString);
|
|
7574
7748
|
if (stringHasTimezone) {
|
|
7575
7749
|
var _nativeDate = new Date(dateString);
|
|
7576
|
-
if (!_Number$
|
|
7577
|
-
return
|
|
7750
|
+
if (!_Number$isNaN(_nativeDate.getTime())) {
|
|
7751
|
+
return dayjs(_nativeDate);
|
|
7578
7752
|
}
|
|
7579
7753
|
}
|
|
7580
7754
|
if (timezone && !stringHasTimezone) {
|
|
7581
7755
|
if (format) {
|
|
7582
7756
|
var cleanedFormat = stripTimezoneFromFormat(format);
|
|
7583
7757
|
try {
|
|
7584
|
-
var _parsed =
|
|
7758
|
+
var _parsed = dayjs.tz(cleanedDateString, cleanedFormat, timezone);
|
|
7585
7759
|
if (_parsed.isValid()) return _parsed;
|
|
7586
7760
|
} catch (_unused) {
|
|
7587
7761
|
// Fall through
|
|
7588
7762
|
}
|
|
7589
7763
|
}
|
|
7590
7764
|
var _nativeDate2 = new Date(cleanedDateString);
|
|
7591
|
-
if (!_Number$
|
|
7592
|
-
return
|
|
7765
|
+
if (!_Number$isNaN(_nativeDate2.getTime())) {
|
|
7766
|
+
return dayjs(_nativeDate2).tz(timezone, true);
|
|
7593
7767
|
}
|
|
7594
|
-
var parsed =
|
|
7768
|
+
var parsed = dayjs(cleanedDateString);
|
|
7595
7769
|
if (parsed.isValid()) {
|
|
7596
7770
|
return parsed.tz(timezone, true);
|
|
7597
7771
|
}
|
|
7598
|
-
return
|
|
7772
|
+
return dayjs(null);
|
|
7599
7773
|
}
|
|
7600
7774
|
if (format) {
|
|
7601
7775
|
var _cleanedFormat = stripTimezoneFromFormat(format);
|
|
7602
|
-
var _parsed2 =
|
|
7776
|
+
var _parsed2 = dayjs(cleanedDateString, _cleanedFormat);
|
|
7603
7777
|
if (_parsed2.isValid()) return _parsed2;
|
|
7604
7778
|
}
|
|
7605
7779
|
var nativeDate = new Date(cleanedDateString);
|
|
7606
|
-
if (!_Number$
|
|
7607
|
-
return
|
|
7780
|
+
if (!_Number$isNaN(nativeDate.getTime())) {
|
|
7781
|
+
return dayjs(nativeDate);
|
|
7608
7782
|
}
|
|
7609
|
-
return
|
|
7783
|
+
return dayjs(cleanedDateString);
|
|
7610
7784
|
}
|
|
7611
7785
|
|
|
7612
7786
|
// Take a date published string, and hopefully return a date out of
|
|
@@ -7617,10 +7791,10 @@ function cleanDatePublished(dateString) {
|
|
|
7617
7791
|
format = _ref.format;
|
|
7618
7792
|
// If string is in milliseconds or seconds, convert to int and return
|
|
7619
7793
|
if (MS_DATE_STRING.test(dateString)) {
|
|
7620
|
-
return new Date(
|
|
7794
|
+
return new Date(_parseInt(dateString, 10)).toISOString();
|
|
7621
7795
|
}
|
|
7622
7796
|
if (SEC_DATE_STRING.test(dateString)) {
|
|
7623
|
-
return new Date(
|
|
7797
|
+
return new Date(_parseInt(dateString, 10) * 1000).toISOString();
|
|
7624
7798
|
}
|
|
7625
7799
|
var date = createDate(dateString, timezone, format);
|
|
7626
7800
|
if (!date.isValid()) {
|
|
@@ -7695,13 +7869,13 @@ function extractBreadcrumbTitle(splitTitle, text) {
|
|
|
7695
7869
|
acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1;
|
|
7696
7870
|
return acc;
|
|
7697
7871
|
}, {});
|
|
7698
|
-
var _Reflect$ownKeys$redu = _Reflect$
|
|
7872
|
+
var _Reflect$ownKeys$redu = _Reflect$ownKeys(termCounts).reduce(function (acc, key) {
|
|
7699
7873
|
if (acc[1] < termCounts[key]) {
|
|
7700
7874
|
return [key, termCounts[key]];
|
|
7701
7875
|
}
|
|
7702
7876
|
return acc;
|
|
7703
7877
|
}, [0, 0]),
|
|
7704
|
-
_Reflect$ownKeys$redu2 =
|
|
7878
|
+
_Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
|
|
7705
7879
|
maxTerm = _Reflect$ownKeys$redu2[0],
|
|
7706
7880
|
termCount = _Reflect$ownKeys$redu2[1];
|
|
7707
7881
|
|
|
@@ -7730,16 +7904,16 @@ function cleanDomainFromTitle(splitTitle, url) {
|
|
|
7730
7904
|
//
|
|
7731
7905
|
// Strip out the big TLDs - it just makes the matching a bit more
|
|
7732
7906
|
// accurate. Not the end of the world if it doesn't strip right.
|
|
7733
|
-
var _URL$parse =
|
|
7907
|
+
var _URL$parse = URL$1.parse(url),
|
|
7734
7908
|
host = _URL$parse.host;
|
|
7735
7909
|
var nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');
|
|
7736
7910
|
var startSlug = splitTitle[0].toLowerCase().replace(' ', '');
|
|
7737
|
-
var startSlugRatio =
|
|
7911
|
+
var startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);
|
|
7738
7912
|
if (startSlugRatio > 0.4 && startSlug.length > 5) {
|
|
7739
7913
|
return splitTitle.slice(2).join('');
|
|
7740
7914
|
}
|
|
7741
7915
|
var endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '');
|
|
7742
|
-
var endSlugRatio =
|
|
7916
|
+
var endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);
|
|
7743
7917
|
if (endSlugRatio > 0.4 && endSlug.length >= 5) {
|
|
7744
7918
|
return splitTitle.slice(0, -2).join('');
|
|
7745
7919
|
}
|
|
@@ -7839,7 +8013,7 @@ function scoreContent($) {
|
|
|
7839
8013
|
// First, look for special hNews based selectors and give them a big
|
|
7840
8014
|
// boost, if they exist
|
|
7841
8015
|
HNEWS_CONTENT_SELECTORS.forEach(function (_ref) {
|
|
7842
|
-
var _ref2 =
|
|
8016
|
+
var _ref2 = _slicedToArray(_ref, 2),
|
|
7843
8017
|
parentSelector = _ref2[0],
|
|
7844
8018
|
childSelector = _ref2[1];
|
|
7845
8019
|
$("".concat(parentSelector, " ").concat(childSelector)).each(function (index, node) {
|
|
@@ -7971,11 +8145,11 @@ function extractBestNode($, opts) {
|
|
|
7971
8145
|
return $topCandidate;
|
|
7972
8146
|
}
|
|
7973
8147
|
|
|
7974
|
-
function _createForOfIteratorHelper$2(r, e) { var t = "undefined" != typeof
|
|
7975
|
-
function _unsupportedIterableToArray$2(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$2(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
8148
|
+
// Builds a stepper object { s, n, e, f } used to drive a transpiled for...of loop over `r`.
// - Looks up an iterator method on `r` via r[_Symbol$iterator] (only when _Symbol is defined)
//   or r["@@iterator"].
// - With no iterator method: if `r` is an array, can be converted by
//   _unsupportedIterableToArray$2, or `e` is truthy, returns an index-based stepper over
//   `r` (or over the converted array); `s` and `f` are no-ops and `e` rethrows its argument.
//   Otherwise a TypeError is thrown.
// - With an iterator method: `s` obtains the iterator, `n` calls next() and records `done`,
//   `e` records an error, and `f` calls the iterator's "return" (if present) when the last
//   step was not done, then rethrows any recorded error.
function _createForOfIteratorHelper$2(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray$2(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
8149
|
+
// Converts selected non-array values to an array:
// - strings, Arguments objects and Int/Uint typed arrays go through _arrayLikeToArray$2(r, a);
// - Map and Set go through _Array$from(r).
// Any other value, including a falsy `r`, yields undefined.
function _unsupportedIterableToArray$2(r, a) {
  if (!r) {
    return;
  }
  if ("string" == typeof r) {
    return _arrayLikeToArray$2(r, a);
  }
  var tag = {}.toString.call(r).slice(8, -1);
  // For plain objects, prefer the constructor's name as the type tag.
  if ("Object" === tag && r.constructor) {
    tag = r.constructor.name;
  }
  if ("Map" === tag || "Set" === tag) {
    return _Array$from(r);
  }
  if ("Arguments" === tag || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(tag)) {
    return _arrayLikeToArray$2(r, a);
  }
  return void 0;
}
|
|
7976
8150
|
// Copies the first `a` indexed entries of the array-like `r` into a new array.
// When `a` is null/undefined or larger than r.length, r.length entries are copied.
function _arrayLikeToArray$2(r, a) {
  var size = a;
  if (null == size || size > r.length) {
    size = r.length;
  }
  var idx = 0;
  var copy = Array(size);
  while (idx < size) {
    copy[idx] = r[idx];
    idx++;
  }
  return copy;
}
|
|
7977
|
-
function ownKeys$4(e, r) { var t = _Object$
|
|
7978
|
-
function _objectSpread$4(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$4(Object(t),
|
|
8151
|
+
// Lists the keys of `e` (via _Object$keys) followed by its own symbols when
// _Object$getOwnPropertySymbols is available. With a truthy `r`, only symbols
// whose property descriptor is enumerable are appended.
function ownKeys$4(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
8152
|
+
// Copies properties from every argument after the first onto `e` and returns `e`.
// Arguments at odd positions are copied key-by-key with _defineProperty;
// arguments at even positions are copied through their property descriptors.
// A null/undefined argument is treated as an empty object.
function _objectSpread$4(e) {
  for (var idx = 1; idx < arguments.length; idx++) {
    var source = null != arguments[idx] ? arguments[idx] : {};
    if (idx % 2) {
      ownKeys$4(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$4(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
7979
8153
|
var GenericContentExtractor = {
|
|
7980
8154
|
defaultOpts: {
|
|
7981
8155
|
stripUnlikelyCandidates: true,
|
|
@@ -8018,8 +8192,7 @@ var GenericContentExtractor = {
|
|
|
8018
8192
|
|
|
8019
8193
|
// We didn't succeed on first pass, one by one disable our
|
|
8020
8194
|
// extraction opts and try again.
|
|
8021
|
-
|
|
8022
|
-
var _iterator = _createForOfIteratorHelper$2(_Reflect$ownKeys__default["default"](opts).filter(function (k) {
|
|
8195
|
+
var _iterator = _createForOfIteratorHelper$2(_Reflect$ownKeys(opts).filter(function (k) {
|
|
8023
8196
|
return opts[k] === true;
|
|
8024
8197
|
})),
|
|
8025
8198
|
_step;
|
|
@@ -8147,8 +8320,8 @@ var AUTHOR_SELECTORS = ['.entry .entry-author', '.author.vcard .fn', '.author .v
|
|
|
8147
8320
|
var bylineRe = /^[\n\s]*By/i;
|
|
8148
8321
|
var BYLINE_SELECTORS_RE = [['#byline', bylineRe], ['.byline', bylineRe]];
|
|
8149
8322
|
|
|
8150
|
-
function _createForOfIteratorHelper$1(r, e) { var t = "undefined" != typeof
|
|
8151
|
-
function _unsupportedIterableToArray$1(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray$1(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
8323
|
+
// Builds a stepper object { s, n, e, f } used to drive a transpiled for...of loop over `r`.
// - Looks up an iterator method on `r` via r[_Symbol$iterator] (only when _Symbol is defined)
//   or r["@@iterator"].
// - With no iterator method: if `r` is an array, can be converted by
//   _unsupportedIterableToArray$1, or `e` is truthy, returns an index-based stepper over
//   `r` (or over the converted array); `s` and `f` are no-ops and `e` rethrows its argument.
//   Otherwise a TypeError is thrown.
// - With an iterator method: `s` obtains the iterator, `n` calls next() and records `done`,
//   `e` records an error, and `f` calls the iterator's "return" (if present) when the last
//   step was not done, then rethrows any recorded error.
function _createForOfIteratorHelper$1(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray$1(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
8324
|
+
// Converts selected non-array values to an array:
// - strings, Arguments objects and Int/Uint typed arrays go through _arrayLikeToArray$1(r, a);
// - Map and Set go through _Array$from(r).
// Any other value, including a falsy `r`, yields undefined.
function _unsupportedIterableToArray$1(r, a) {
  if (!r) {
    return;
  }
  if ("string" == typeof r) {
    return _arrayLikeToArray$1(r, a);
  }
  var tag = {}.toString.call(r).slice(8, -1);
  // For plain objects, prefer the constructor's name as the type tag.
  if ("Object" === tag && r.constructor) {
    tag = r.constructor.name;
  }
  if ("Map" === tag || "Set" === tag) {
    return _Array$from(r);
  }
  if ("Arguments" === tag || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(tag)) {
    return _arrayLikeToArray$1(r, a);
  }
  return void 0;
}
|
|
8152
8325
|
// Copies the first `a` indexed entries of the array-like `r` into a new array.
// When `a` is null/undefined or larger than r.length, r.length entries are copied.
function _arrayLikeToArray$1(r, a) {
  var size = a;
  if (null == size || size > r.length) {
    size = r.length;
  }
  var idx = 0;
  var copy = Array(size);
  while (idx < size) {
    copy[idx] = r[idx];
    idx++;
  }
  return copy;
}
|
|
8153
8326
|
var GenericAuthorExtractor = {
|
|
8154
8327
|
extract: function extract(_ref) {
|
|
@@ -8171,12 +8344,11 @@ var GenericAuthorExtractor = {
|
|
|
8171
8344
|
|
|
8172
8345
|
// Last, use our looser regular-expression based selectors for
|
|
8173
8346
|
// potential authors.
|
|
8174
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
8175
8347
|
var _iterator = _createForOfIteratorHelper$1(BYLINE_SELECTORS_RE),
|
|
8176
8348
|
_step;
|
|
8177
8349
|
try {
|
|
8178
8350
|
for (_iterator.s(); !(_step = _iterator.n()).done;) {
|
|
8179
|
-
var _step$value =
|
|
8351
|
+
var _step$value = _slicedToArray(_step.value, 2),
|
|
8180
8352
|
selector = _step$value[0],
|
|
8181
8353
|
regex = _step$value[1];
|
|
8182
8354
|
var node = $(selector);
|
|
@@ -8333,8 +8505,8 @@ function scoreBySibling($img) {
|
|
|
8333
8505
|
}
|
|
8334
8506
|
function scoreByDimensions($img) {
|
|
8335
8507
|
var score = 0;
|
|
8336
|
-
var width =
|
|
8337
|
-
var height =
|
|
8508
|
+
var width = _parseFloat($img.attr('width'));
|
|
8509
|
+
var height = _parseFloat($img.attr('height'));
|
|
8338
8510
|
var src = $img.attr('src');
|
|
8339
8511
|
|
|
8340
8512
|
// Penalty for skinny images
|
|
@@ -8361,8 +8533,8 @@ function scoreByPosition($imgs, index) {
|
|
|
8361
8533
|
return $imgs.length / 2 - index;
|
|
8362
8534
|
}
|
|
8363
8535
|
|
|
8364
|
-
function _createForOfIteratorHelper(r, e) { var t = "undefined" != typeof
|
|
8365
|
-
function _unsupportedIterableToArray(r, a) { if (r) { if ("string" == typeof r) return _arrayLikeToArray(r, a); var t = {}.toString.call(r).slice(8, -1); return "Object" === t && r.constructor && (t = r.constructor.name), "Map" === t || "Set" === t ? _Array$
|
|
8536
|
+
// Builds a stepper object { s, n, e, f } used to drive a transpiled for...of loop over `r`.
// - Looks up an iterator method on `r` via r[_Symbol$iterator] (only when _Symbol is defined)
//   or r["@@iterator"].
// - With no iterator method: if `r` is an array, can be converted by
//   _unsupportedIterableToArray, or `e` is truthy, returns an index-based stepper over
//   `r` (or over the converted array); `s` and `f` are no-ops and `e` rethrows its argument.
//   Otherwise a TypeError is thrown.
// - With an iterator method: `s` obtains the iterator, `n` calls next() and records `done`,
//   `e` records an error, and `f` calls the iterator's "return" (if present) when the last
//   step was not done, then rethrows any recorded error.
function _createForOfIteratorHelper(r, e) { var t = "undefined" != typeof _Symbol && r[_Symbol$iterator] || r["@@iterator"]; if (!t) { if (_Array$isArray(r) || (t = _unsupportedIterableToArray(r)) || e) { t && (r = t); var _n = 0, F = function F() {}; return { s: F, n: function n() { return _n >= r.length ? { done: true } : { done: false, value: r[_n++] }; }, e: function e(r) { throw r; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var o, a = true, u = false; return { s: function s() { t = t.call(r); }, n: function n() { var r = t.next(); return a = r.done, r; }, e: function e(r) { u = true, o = r; }, f: function f() { try { a || null == t["return"] || t["return"](); } finally { if (u) throw o; } } }; }
|
|
8537
|
+
// Converts selected non-array values to an array:
// - strings, Arguments objects and Int/Uint typed arrays go through _arrayLikeToArray(r, a);
// - Map and Set go through _Array$from(r).
// Any other value, including a falsy `r`, yields undefined.
function _unsupportedIterableToArray(r, a) {
  if (!r) {
    return;
  }
  if ("string" == typeof r) {
    return _arrayLikeToArray(r, a);
  }
  var tag = {}.toString.call(r).slice(8, -1);
  // For plain objects, prefer the constructor's name as the type tag.
  if ("Object" === tag && r.constructor) {
    tag = r.constructor.name;
  }
  if ("Map" === tag || "Set" === tag) {
    return _Array$from(r);
  }
  if ("Arguments" === tag || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(tag)) {
    return _arrayLikeToArray(r, a);
  }
  return void 0;
}
|
|
8366
8538
|
// Copies the first `a` indexed entries of the array-like `r` into a new array.
// When `a` is null/undefined or larger than r.length, r.length entries are copied.
function _arrayLikeToArray(r, a) {
  var size = a;
  if (null == size || size > r.length) {
    size = r.length;
  }
  var idx = 0;
  var copy = Array(size);
  while (idx < size) {
    copy[idx] = r[idx];
    idx++;
  }
  return copy;
}
|
|
8367
8539
|
|
|
8368
8540
|
// Given a resource, try to find the lead image URL from within
|
|
@@ -8412,10 +8584,10 @@ var GenericLeadImageUrlExtractor = {
|
|
|
8412
8584
|
score += scoreByPosition(imgs, index);
|
|
8413
8585
|
imgScores[src] = score;
|
|
8414
8586
|
});
|
|
8415
|
-
var _Reflect$ownKeys$redu = _Reflect$
|
|
8587
|
+
var _Reflect$ownKeys$redu = _Reflect$ownKeys(imgScores).reduce(function (acc, key) {
|
|
8416
8588
|
return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc;
|
|
8417
8589
|
}, [null, 0]),
|
|
8418
|
-
_Reflect$ownKeys$redu2 =
|
|
8590
|
+
_Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2),
|
|
8419
8591
|
topUrl = _Reflect$ownKeys$redu2[0],
|
|
8420
8592
|
topScore = _Reflect$ownKeys$redu2[1];
|
|
8421
8593
|
if (topScore > 0) {
|
|
@@ -8425,7 +8597,6 @@ var GenericLeadImageUrlExtractor = {
|
|
|
8425
8597
|
|
|
8426
8598
|
// If nothing else worked, check to see if there are any really
|
|
8427
8599
|
// probable nodes in the doc, like <link rel="image_src" />.
|
|
8428
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
8429
8600
|
var _iterator = _createForOfIteratorHelper(LEAD_IMAGE_URL_SELECTORS),
|
|
8430
8601
|
_step;
|
|
8431
8602
|
try {
|
|
@@ -8464,7 +8635,7 @@ function scoreSimilarity(score, articleUrl, href) {
|
|
|
8464
8635
|
// sliding scale, subtract points from this link based on
|
|
8465
8636
|
// similarity.
|
|
8466
8637
|
if (score > 0) {
|
|
8467
|
-
var similarity = new
|
|
8638
|
+
var similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio();
|
|
8468
8639
|
// Subtract .1 from diff_percent when calculating modifier,
|
|
8469
8640
|
// which means that if it's less than 10% different, we give a
|
|
8470
8641
|
// bonus instead. Ex:
|
|
@@ -8485,7 +8656,7 @@ function scoreLinkText(linkText, pageNum) {
|
|
|
8485
8656
|
// get scored, and sorted properly by score.
|
|
8486
8657
|
var score = 0;
|
|
8487
8658
|
if (IS_DIGIT_RE.test(linkText.trim())) {
|
|
8488
|
-
var linkTextAsNum =
|
|
8659
|
+
var linkTextAsNum = _parseInt(linkText, 10);
|
|
8489
8660
|
// If it's the first page, we already got it on the first call.
|
|
8490
8661
|
// Give it a negative score. Otherwise, up to page 10, give a
|
|
8491
8662
|
// small bonus.
|
|
@@ -8554,7 +8725,7 @@ function scoreByParents($link) {
|
|
|
8554
8725
|
var positiveMatch = false;
|
|
8555
8726
|
var negativeMatch = false;
|
|
8556
8727
|
var score = 0;
|
|
8557
|
-
_Array$
|
|
8728
|
+
_Array$from(range(0, 4)).forEach(function () {
|
|
8558
8729
|
if ($parent.length === 0) {
|
|
8559
8730
|
return;
|
|
8560
8731
|
}
|
|
@@ -8604,7 +8775,7 @@ function shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrl
|
|
|
8604
8775
|
return false;
|
|
8605
8776
|
}
|
|
8606
8777
|
var hostname = parsedUrl.hostname;
|
|
8607
|
-
var _URL$parse =
|
|
8778
|
+
var _URL$parse = URL$1.parse(href),
|
|
8608
8779
|
linkHost = _URL$parse.hostname;
|
|
8609
8780
|
|
|
8610
8781
|
// Domain mismatch.
|
|
@@ -8679,7 +8850,7 @@ function scoreLinks(_ref) {
|
|
|
8679
8850
|
$ = _ref.$,
|
|
8680
8851
|
_ref$previousUrls = _ref.previousUrls,
|
|
8681
8852
|
previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
|
|
8682
|
-
parsedUrl = parsedUrl ||
|
|
8853
|
+
parsedUrl = parsedUrl || URL$1.parse(articleUrl);
|
|
8683
8854
|
var baseRegex = makeBaseRegex(baseUrl);
|
|
8684
8855
|
var isWp = isWordpress($);
|
|
8685
8856
|
|
|
@@ -8730,7 +8901,7 @@ function scoreLinks(_ref) {
|
|
|
8730
8901
|
possiblePage.score = score;
|
|
8731
8902
|
return possiblePages;
|
|
8732
8903
|
}, {});
|
|
8733
|
-
return _Reflect$
|
|
8904
|
+
return _Reflect$ownKeys(scoredPages).length === 0 ? null : scoredPages;
|
|
8734
8905
|
}
|
|
8735
8906
|
|
|
8736
8907
|
// Looks for and returns next page url
|
|
@@ -8742,7 +8913,7 @@ var GenericNextPageUrlExtractor = {
|
|
|
8742
8913
|
parsedUrl = _ref.parsedUrl,
|
|
8743
8914
|
_ref$previousUrls = _ref.previousUrls,
|
|
8744
8915
|
previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls;
|
|
8745
|
-
parsedUrl = parsedUrl ||
|
|
8916
|
+
parsedUrl = parsedUrl || URL$1.parse(url);
|
|
8746
8917
|
var articleUrl = removeAnchor(url);
|
|
8747
8918
|
var baseUrl = articleBaseUrl(url, parsedUrl);
|
|
8748
8919
|
var links = $('a[href]').toArray();
|
|
@@ -8760,7 +8931,7 @@ var GenericNextPageUrlExtractor = {
|
|
|
8760
8931
|
|
|
8761
8932
|
// now that we've scored all possible pages,
|
|
8762
8933
|
// find the biggest one.
|
|
8763
|
-
var topPage = _Reflect$
|
|
8934
|
+
var topPage = _Reflect$ownKeys(scoredLinks).reduce(function (acc, link) {
|
|
8764
8935
|
var scoredLink = scoredLinks[link];
|
|
8765
8936
|
return scoredLink.score > acc.score ? scoredLink : acc;
|
|
8766
8937
|
}, {
|
|
@@ -8779,7 +8950,7 @@ var GenericNextPageUrlExtractor = {
|
|
|
8779
8950
|
var CANONICAL_META_SELECTORS = ['og:url'];
|
|
8780
8951
|
|
|
8781
8952
|
function parseDomain(url) {
|
|
8782
|
-
var parsedUrl =
|
|
8953
|
+
var parsedUrl = URL$1.parse(url);
|
|
8783
8954
|
var hostname = parsedUrl.hostname;
|
|
8784
8955
|
return hostname;
|
|
8785
8956
|
}
|
|
@@ -8850,7 +9021,7 @@ var ellipsize$1 = (function (str, max, opts) {
|
|
|
8850
9021
|
if (typeof str !== 'string' || str.length === 0) return '';
|
|
8851
9022
|
if (max === 0) return '';
|
|
8852
9023
|
opts = opts || {};
|
|
8853
|
-
_Object$
|
|
9024
|
+
_Object$keys(defaults).forEach(function (key) {
|
|
8854
9025
|
if (opts[key] === null || typeof opts[key] === 'undefined') {
|
|
8855
9026
|
opts[key] = defaults[key];
|
|
8856
9027
|
}
|
|
@@ -8906,8 +9077,8 @@ var GenericWordCountExtractor = {
|
|
|
8906
9077
|
}
|
|
8907
9078
|
};
|
|
8908
9079
|
|
|
8909
|
-
function ownKeys$3(e, r) { var t = _Object$
|
|
8910
|
-
function _objectSpread$3(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$3(Object(t),
|
|
9080
|
+
// Lists the keys of `e` (via _Object$keys) followed by its own symbols when
// _Object$getOwnPropertySymbols is available. With a truthy `r`, only symbols
// whose property descriptor is enumerable are appended.
function ownKeys$3(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
9081
|
+
// Copies properties from every argument after the first onto `e` and returns `e`.
// Arguments at odd positions are copied key-by-key with _defineProperty;
// arguments at even positions are copied through their property descriptors.
// A null/undefined argument is treated as an empty object.
function _objectSpread$3(e) {
  for (var idx = 1; idx < arguments.length; idx++) {
    var source = null != arguments[idx] ? arguments[idx] : {};
    if (idx % 2) {
      ownKeys$3(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$3(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
8911
9082
|
var GenericExtractor = {
|
|
8912
9083
|
// This extractor is the default for all domains
|
|
8913
9084
|
domain: '*',
|
|
@@ -8923,7 +9094,7 @@ var GenericExtractor = {
|
|
|
8923
9094
|
word_count: GenericWordCountExtractor.extract,
|
|
8924
9095
|
direction: function direction(_ref) {
|
|
8925
9096
|
var title = _ref.title;
|
|
8926
|
-
return
|
|
9097
|
+
return stringDirection.getDirection(title);
|
|
8927
9098
|
},
|
|
8928
9099
|
extract: function extract(options) {
|
|
8929
9100
|
var html = options.html,
|
|
@@ -8979,22 +9150,22 @@ var Detectors = {
|
|
|
8979
9150
|
'meta[name="generator"][value="blogger"]': BloggerExtractor
|
|
8980
9151
|
};
|
|
8981
9152
|
function detectByHtml($) {
|
|
8982
|
-
var selector = _Reflect$
|
|
9153
|
+
var selector = _Reflect$ownKeys(Detectors).find(function (s) {
|
|
8983
9154
|
return $(s).length > 0;
|
|
8984
9155
|
});
|
|
8985
9156
|
return Detectors[selector];
|
|
8986
9157
|
}
|
|
8987
9158
|
|
|
8988
9159
|
function getExtractor(url, parsedUrl, $) {
|
|
8989
|
-
parsedUrl = parsedUrl ||
|
|
9160
|
+
parsedUrl = parsedUrl || URL$1.parse(url);
|
|
8990
9161
|
var _parsedUrl = parsedUrl,
|
|
8991
9162
|
hostname = _parsedUrl.hostname;
|
|
8992
9163
|
var baseDomain = hostname.split('.').slice(-2).join('.');
|
|
8993
9164
|
return apiExtractors[hostname] || apiExtractors[baseDomain] || Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor;
|
|
8994
9165
|
}
|
|
8995
9166
|
|
|
8996
|
-
function ownKeys$2(e, r) { var t = _Object$
|
|
8997
|
-
function _objectSpread$2(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$2(Object(t),
|
|
9167
|
+
// Lists the keys of `e` (via _Object$keys) followed by its own symbols when
// _Object$getOwnPropertySymbols is available. With a truthy `r`, only symbols
// whose property descriptor is enumerable are appended.
function ownKeys$2(e, r) {
  var keys = _Object$keys(e);
  if (!_Object$getOwnPropertySymbols) {
    return keys;
  }
  var symbols = _Object$getOwnPropertySymbols(e);
  if (r) {
    symbols = symbols.filter(function (sym) {
      return _Object$getOwnPropertyDescriptor(e, sym).enumerable;
    });
  }
  keys.push.apply(keys, symbols);
  return keys;
}
|
|
9168
|
+
// Copies properties from every argument after the first onto `e` and returns `e`.
// Arguments at odd positions are copied key-by-key with _defineProperty;
// arguments at even positions are copied through their property descriptors.
// A null/undefined argument is treated as an empty object.
function _objectSpread$2(e) {
  for (var idx = 1; idx < arguments.length; idx++) {
    var source = null != arguments[idx] ? arguments[idx] : {};
    if (idx % 2) {
      ownKeys$2(Object(source), true).forEach(function (key) {
        _defineProperty(e, key, source[key]);
      });
    } else if (_Object$getOwnPropertyDescriptors) {
      _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(source));
    } else {
      ownKeys$2(Object(source)).forEach(function (key) {
        _Object$defineProperty(e, key, _Object$getOwnPropertyDescriptor(source, key));
      });
    }
  }
  return e;
}
|
|
8998
9169
|
|
|
8999
9170
|
// Remove elements by an array of selectors
|
|
9000
9171
|
function cleanBySelectors($content, $, _ref) {
|
|
@@ -9008,7 +9179,7 @@ function cleanBySelectors($content, $, _ref) {
|
|
|
9008
9179
|
function transformElements($content, $, _ref2) {
|
|
9009
9180
|
var transforms = _ref2.transforms;
|
|
9010
9181
|
if (!transforms) return $content;
|
|
9011
|
-
_Reflect$
|
|
9182
|
+
_Reflect$ownKeys(transforms).forEach(function (key) {
|
|
9012
9183
|
var $matches = $(key, $content);
|
|
9013
9184
|
var value = transforms[key];
|
|
9014
9185
|
|
|
@@ -9032,13 +9203,13 @@ function transformElements($content, $, _ref2) {
|
|
|
9032
9203
|
}
|
|
9033
9204
|
function findMatchingSelector($, selectors, extractHtml, allowMultiple) {
|
|
9034
9205
|
return selectors.find(function (selector) {
|
|
9035
|
-
if (_Array$
|
|
9206
|
+
if (_Array$isArray(selector)) {
|
|
9036
9207
|
if (extractHtml) {
|
|
9037
9208
|
return selector.reduce(function (acc, s) {
|
|
9038
9209
|
return acc && $(s).length > 0;
|
|
9039
9210
|
}, true);
|
|
9040
9211
|
}
|
|
9041
|
-
var _selector =
|
|
9212
|
+
var _selector = _slicedToArray(selector, 2),
|
|
9042
9213
|
s = _selector[0],
|
|
9043
9214
|
attr = _selector[1];
|
|
9044
9215
|
return (allowMultiple || !allowMultiple && $(s).length === 1) && $(s).attr(attr) && $(s).attr(attr).trim() !== '';
|
|
@@ -9080,7 +9251,7 @@ function select(opts) {
|
|
|
9080
9251
|
// multi-match selection, which allows the parser to choose several
|
|
9081
9252
|
// selectors to include in the result. Note that all selectors in the
|
|
9082
9253
|
// array must match in order for this selector to trigger
|
|
9083
|
-
if (_Array$
|
|
9254
|
+
if (_Array$isArray(matchingSelector)) {
|
|
9084
9255
|
$content = $(matchingSelector.join(','));
|
|
9085
9256
|
var $wrapper = $('<div></div>');
|
|
9086
9257
|
$content.each(function (_, element) {
|
|
@@ -9114,8 +9285,8 @@ function select(opts) {
|
|
|
9114
9285
|
var result;
|
|
9115
9286
|
// if selector is an array (e.g., ['img', 'src']),
|
|
9116
9287
|
// extract the attr
|
|
9117
|
-
if (_Array$
|
|
9118
|
-
var _matchingSelector =
|
|
9288
|
+
if (_Array$isArray(matchingSelector)) {
|
|
9289
|
+
var _matchingSelector = _slicedToArray(matchingSelector, 3),
|
|
9119
9290
|
selector = _matchingSelector[0],
|
|
9120
9291
|
attr = _matchingSelector[1],
|
|
9121
9292
|
transform = _matchingSelector[2];
|
|
@@ -9132,7 +9303,7 @@ function select(opts) {
|
|
|
9132
9303
|
return $(el).text().trim();
|
|
9133
9304
|
});
|
|
9134
9305
|
}
|
|
9135
|
-
result = _Array$
|
|
9306
|
+
result = _Array$isArray(result.toArray()) && allowMultiple ? result.toArray() : result[0];
|
|
9136
9307
|
// Allow custom extractor to skip default cleaner
|
|
9137
9308
|
// for this type; defaults to true
|
|
9138
9309
|
if (defaultCleaner && Cleaners[type]) {
|
|
@@ -9142,7 +9313,7 @@ function select(opts) {
|
|
|
9142
9313
|
}
|
|
9143
9314
|
function selectExtendedTypes(extend, opts) {
|
|
9144
9315
|
var results = {};
|
|
9145
|
-
_Reflect$
|
|
9316
|
+
_Reflect$ownKeys(extend).forEach(function (t) {
|
|
9146
9317
|
if (!results[t]) {
|
|
9147
9318
|
results[t] = select(_objectSpread$2(_objectSpread$2({}, opts), {}, {
|
|
9148
9319
|
type: t,
|
|
@@ -9260,15 +9431,15 @@ var RootExtractor = {
|
|
|
9260
9431
|
}
|
|
9261
9432
|
};
|
|
9262
9433
|
|
|
9263
|
-
function ownKeys$1(e, r) { var t = _Object$
|
|
9264
|
-
function _objectSpread$1(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$1(Object(t),
|
|
9434
|
+
function ownKeys$1(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
9435
|
+
function _objectSpread$1(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys$1(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys$1(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
9265
9436
|
function collectAllPages(_x) {
|
|
9266
9437
|
return _collectAllPages.apply(this, arguments);
|
|
9267
9438
|
}
|
|
9268
9439
|
function _collectAllPages() {
|
|
9269
|
-
_collectAllPages =
|
|
9440
|
+
_collectAllPages = _asyncToGenerator(/*#__PURE__*/_regeneratorRuntime.mark(function _callee(_ref) {
|
|
9270
9441
|
var next_page_url, html, $, metaCache, result, Extractor, title, url, pages, previousUrls, extractorOpts, nextPageResult, word_count;
|
|
9271
|
-
return
|
|
9442
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
9272
9443
|
while (1) switch (_context.prev = _context.next) {
|
|
9273
9444
|
case 0:
|
|
9274
9445
|
next_page_url = _ref.next_page_url, html = _ref.html, $ = _ref.$, metaCache = _ref.metaCache, result = _ref.result, Extractor = _ref.Extractor, title = _ref.title, url = _ref.url;
|
|
@@ -9282,7 +9453,6 @@ function _collectAllPages() {
|
|
|
9282
9453
|
break;
|
|
9283
9454
|
}
|
|
9284
9455
|
pages += 1;
|
|
9285
|
-
// eslint-disable-next-line no-await-in-loop
|
|
9286
9456
|
_context.next = 2;
|
|
9287
9457
|
return Resource.create(next_page_url);
|
|
9288
9458
|
case 2:
|
|
@@ -9323,17 +9493,17 @@ function _collectAllPages() {
|
|
|
9323
9493
|
}
|
|
9324
9494
|
|
|
9325
9495
|
var _excluded = ["html"];
|
|
9326
|
-
function ownKeys(e, r) { var t = _Object$
|
|
9327
|
-
function _objectSpread(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys(Object(t),
|
|
9496
|
+
function ownKeys(e, r) { var t = _Object$keys(e); if (_Object$getOwnPropertySymbols) { var o = _Object$getOwnPropertySymbols(e); r && (o = o.filter(function (r) { return _Object$getOwnPropertyDescriptor(e, r).enumerable; })), t.push.apply(t, o); } return t; }
|
|
9497
|
+
function _objectSpread(e) { for (var r = 1; r < arguments.length; r++) { var t = null != arguments[r] ? arguments[r] : {}; r % 2 ? ownKeys(Object(t), true).forEach(function (r) { _defineProperty(e, r, t[r]); }) : _Object$getOwnPropertyDescriptors ? _Object$defineProperties(e, _Object$getOwnPropertyDescriptors(t)) : ownKeys(Object(t)).forEach(function (r) { _Object$defineProperty(e, r, _Object$getOwnPropertyDescriptor(t, r)); }); } return e; }
|
|
9328
9498
|
var Parser = {
|
|
9329
9499
|
parse: function parse(url) {
|
|
9330
9500
|
var _arguments = arguments;
|
|
9331
|
-
return
|
|
9501
|
+
return _asyncToGenerator(/*#__PURE__*/_regeneratorRuntime.mark(function _callee() {
|
|
9332
9502
|
var _ref, html, opts, _opts$fetchAllPages, fetchAllPages, _opts$fallback, fallback, _opts$contentType, contentType, _opts$headers, headers, extend, customExtractor, parsedUrl, $, Extractor, metaCache, extendedTypes, result, _result, title, next_page_url, turndownService;
|
|
9333
|
-
return
|
|
9503
|
+
return _regeneratorRuntime.wrap(function (_context) {
|
|
9334
9504
|
while (1) switch (_context.prev = _context.next) {
|
|
9335
9505
|
case 0:
|
|
9336
|
-
_ref = _arguments.length > 1 && _arguments[1] !== undefined ? _arguments[1] : {}, html = _ref.html, opts =
|
|
9506
|
+
_ref = _arguments.length > 1 && _arguments[1] !== undefined ? _arguments[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, _excluded);
|
|
9337
9507
|
_opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend, customExtractor = opts.customExtractor; // if no url was passed and this is the browser version,
|
|
9338
9508
|
// set url to window.location.href and load the html
|
|
9339
9509
|
// from the current page
|
|
@@ -9341,7 +9511,7 @@ var Parser = {
|
|
|
9341
9511
|
url = window.location.href; // eslint-disable-line no-undef
|
|
9342
9512
|
html = html || document.documentElement.outerHTML; // eslint-disable-line no-undef
|
|
9343
9513
|
}
|
|
9344
|
-
parsedUrl =
|
|
9514
|
+
parsedUrl = URL$1.parse(url);
|
|
9345
9515
|
if (validateUrl(parsedUrl)) {
|
|
9346
9516
|
_context.next = 1;
|
|
9347
9517
|
break;
|
|
@@ -9421,7 +9591,7 @@ var Parser = {
|
|
|
9421
9591
|
});
|
|
9422
9592
|
case 6:
|
|
9423
9593
|
if (contentType === 'markdown') {
|
|
9424
|
-
turndownService = new
|
|
9594
|
+
turndownService = new TurndownService();
|
|
9425
9595
|
result.content = turndownService.turndown(result.content);
|
|
9426
9596
|
} else if (contentType === 'text') {
|
|
9427
9597
|
result.content = $.text($(result.content));
|