@jocmp/mercury-parser 2.2.8 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -106,8 +106,6 @@ Note that the URL argument is still supplied, in order to identify the web site
106
106
 
107
107
  Mercury Parser also ships with a CLI, meaning you can use it from your command line like so:
108
108
 
109
- ![Mercury Parser CLI Basic Usage](./assets/parser-basic-usage.gif)
110
-
111
109
  ```bash
112
110
  # Install Mercury Parser globally
113
111
  yarn global add @jocmp/mercury-parser
@@ -543,7 +543,7 @@ var KEEP_SELECTORS$1 = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="
543
543
 
544
544
  var STRIP_OUTPUT_TAGS$1 = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes
545
545
 
546
- var WHITELIST_ATTRS$1 = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
546
+ var WHITELIST_ATTRS$1 = ['src', 'srcset', 'start', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
547
547
  var WHITELIST_ATTRS_RE$1 = new RegExp("^(".concat(WHITELIST_ATTRS$1.join('|'), ")$"), 'i'); // removeEmpty
548
548
 
549
549
  var CLEAN_CONDITIONALLY_TAGS$1 = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(','); // cleanHeaders
@@ -1370,7 +1370,7 @@ function cleanTags$$1($article, $) {
1370
1370
  if (weight < 0) {
1371
1371
  $node.remove();
1372
1372
  } else {
1373
- // deteremine if node seems like content
1373
+ // determine if node seems like content
1374
1374
  removeUnlessContent$1($node, $, weight);
1375
1375
  }
1376
1376
  });
@@ -1380,11 +1380,16 @@ function cleanTags$$1($article, $) {
1380
1380
  function cleanHeaders$1($article, $) {
1381
1381
  var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
1382
1382
  $(HEADER_TAG_LIST$1, $article).each(function (index, header) {
1383
- var $header = $(header); // Remove any headers that appear before all other p tags in the
1383
+ var $header = $(header);
1384
+
1385
+ if ($(header).hasClass(KEEP_CLASS$1)) {
1386
+ return $header;
1387
+ } // Remove any headers that appear before all other p tags in the
1384
1388
  // document. This probably means that it was part of the title, a
1385
1389
  // subtitle or something else extraneous like a datestamp or byline,
1386
1390
  // all of which should be handled by other metadata handling.
1387
1391
 
1392
+
1388
1393
  if ($($header, $article).prevAll('p').length === 0) {
1389
1394
  return $header.remove();
1390
1395
  } // Remove any headers that match the title exactly.
@@ -6199,8 +6204,18 @@ var WwwVersantsComExtractor = {
6199
6204
  selectors: [['meta[name="og:image"]', 'value']]
6200
6205
  },
6201
6206
  content: {
6202
- selectors: ['.entry-content'],
6203
- clean: ['.adv-link', '.versa-target']
6207
+ transforms: {
6208
+ '.featured-image': function featuredImage($node) {
6209
+ $node.addClass('mercury-parser-keep');
6210
+ var figcaption = $node.find('span');
6211
+ $node.find('figure').append(figcaption);
6212
+ }
6213
+ },
6214
+ selectors: ['.article-content'],
6215
+ clean: ['.adv-link', '.versa-target', 'header', // Clean title
6216
+ '.author', // Clean author
6217
+ '.thumbnail-slider' // Remove, the main images will be within the .main-slider div.
6218
+ ]
6204
6219
  }
6205
6220
  };
6206
6221
  var Www1pezeshkComExtractor = {
@@ -6244,20 +6259,24 @@ var WwwAndroidauthorityComExtractor = {
6244
6259
  lead_image_url: {
6245
6260
  selectors: [['meta[name="og:image"]', 'value']]
6246
6261
  },
6262
+ // Some pages have nested header elements that are significant, and that the parser will
6263
+ // remove if not following a paragraph. Marking them with the mercury-parser-keep class
6264
+ // (see the h2/h3 transforms) makes the header cleanup skip them.
6247
6265
  content: {
6248
- selectors: ['.d_Dd'],
6266
+ selectors: ['.e_Bc', '.d_Dd'],
6249
6267
  transforms: {
6250
6268
  ol: function ol(node) {
6251
6269
  node.attr('class', 'mercury-parser-keep');
6252
6270
  },
6253
6271
  h2: function h2($node) {
6254
- // Some pages have an element h2 that is significant, and that the parser will
6255
- // remove if not following a paragraph. Adding this empty paragraph fixes it, and
6256
- // the empty paragraph will be removed anyway.
6257
- $node.before('<p></p>');
6272
+ return $node.attr('class', 'mercury-parser-keep');
6273
+ },
6274
+ h3: function h3($node) {
6275
+ return $node.attr('class', 'mercury-parser-keep');
6258
6276
  }
6259
6277
  },
6260
- clean: ['.d_f .d_nr' // Lead image
6278
+ clean: ['.e_Oh', // Polls
6279
+ 'picture + div' // Lead image text
6261
6280
  ]
6262
6281
  }
6263
6282
  };
@@ -6302,6 +6321,54 @@ var WwwHardwarezoneComSgExtractor = {
6302
6321
  clean: []
6303
6322
  }
6304
6323
  };
6324
+ var WwwSpiegelDeExtractor = {
6325
+ domain: 'www.spiegel.de',
6326
+ title: {
6327
+ selectors: [['meta[name="og:title"]', 'value']]
6328
+ },
6329
+ author: {
6330
+ selectors: [['meta[name="author"]', 'value']]
6331
+ },
6332
+ date_published: {
6333
+ selectors: [['meta[name="date"]', 'value']]
6334
+ },
6335
+ lead_image_url: {
6336
+ selectors: [['meta[name="og:image"]', 'value']]
6337
+ },
6338
+ content: {
6339
+ selectors: ['div[data-area="body"]', 'article'],
6340
+ transforms: {},
6341
+ clean: []
6342
+ }
6343
+ };
6344
+ var MobilesyrupComExtractor = {
6345
+ domain: 'mobilesyrup.com',
6346
+ title: {
6347
+ selectors: [['meta[name="og:title"]', 'value']]
6348
+ },
6349
+ author: {
6350
+ selectors: [['meta[name="author"]', 'value']]
6351
+ },
6352
+ date_published: {
6353
+ selectors: [['meta[name="article:published_time"]', 'value']]
6354
+ },
6355
+ dek: {
6356
+ selectors: [// enter selectors
6357
+ ]
6358
+ },
6359
+ lead_image_url: {
6360
+ selectors: [['meta[name="og:image"]', 'value']]
6361
+ },
6362
+ content: {
6363
+ selectors: ['.article-content'],
6364
+ transforms: {
6365
+ '.article-content > ul': function articleContentUl(node) {
6366
+ node.attr('class', 'mercury-parser-keep');
6367
+ }
6368
+ },
6369
+ clean: []
6370
+ }
6371
+ };
6305
6372
 
6306
6373
  var CustomExtractors =
6307
6374
  /*#__PURE__*/
@@ -6453,7 +6520,9 @@ _Object$freeze({
6453
6520
  Www1pezeshkComExtractor: Www1pezeshkComExtractor,
6454
6521
  WwwAndroidauthorityComExtractor: WwwAndroidauthorityComExtractor,
6455
6522
  TechcrunchComExtractor: TechcrunchComExtractor,
6456
- WwwHardwarezoneComSgExtractor: WwwHardwarezoneComSgExtractor
6523
+ WwwHardwarezoneComSgExtractor: WwwHardwarezoneComSgExtractor,
6524
+ WwwSpiegelDeExtractor: WwwSpiegelDeExtractor,
6525
+ MobilesyrupComExtractor: MobilesyrupComExtractor
6457
6526
  });
6458
6527
 
6459
6528
  var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
@@ -1 +1 @@
1
- {"version":3,"file":"generate-custom-parser.js","sources":["../src/utils/dom/constants.js","../src/utils/dom/strip-junk-tags.js","../src/extractors/generic/content/scoring/constants.js","../src/extractors/generic/content/scoring/score-commas.js","../src/utils/text/extract-from-url.js","../src/utils/text/constants.js","../src/utils/text/has-sentence-end.js","../src/extractors/generic/content/scoring/index.js","../src/utils/dom/make-links-absolute.js","../src/utils/dom/strip-tags.js","../src/utils/dom/node-is-sufficient.js","../src/utils/dom/get-attrs.js","../src/utils/dom/set-attr.js","../src/utils/dom/index.js","../scripts/templates/insert-values.js","../scripts/templates/index.js","../scripts/templates/custom-extractor.js","../scripts/templates/custom-extractor-test.js","../scripts/generate-custom-parser.js"],"sourcesContent":["// Spacer images to be removed\nexport const SPACER_RE = new RegExp('transparent|spacer|blank', 'i');\n\n// The class we will use to mark elements we want to keep\n// but would normally remove\nexport const KEEP_CLASS = 'mercury-parser-keep';\n\nexport const KEEP_SELECTORS = [\n 'iframe[src^=\"https://www.youtube.com\"]',\n 'iframe[src^=\"https://www.youtube-nocookie.com\"]',\n 'iframe[src^=\"http://www.youtube.com\"]',\n 'iframe[src^=\"https://player.vimeo\"]',\n 'iframe[src^=\"http://player.vimeo\"]',\n 'iframe[src^=\"https://www.redditmedia.com\"]',\n];\n\n// A list of tags to strip from the output if we encounter them.\nexport const STRIP_OUTPUT_TAGS = [\n 'title',\n 'script',\n 'noscript',\n 'link',\n 'style',\n 'hr',\n 'embed',\n 'iframe',\n 'object',\n];\n\n// cleanAttributes\nexport const REMOVE_ATTRS = ['style', 'align'];\nexport const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(\n selector => `[${selector}]`\n);\nexport const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');\nexport const WHITELIST_ATTRS = [\n 'src',\n 'srcset',\n 'sizes',\n 'type',\n 'href',\n 'class',\n 'id',\n 'alt',\n 'xlink:href',\n 'width',\n 'height',\n];\n\nexport 
const WHITELIST_ATTRS_RE = new RegExp(\n `^(${WHITELIST_ATTRS.join('|')})$`,\n 'i'\n);\n\n// removeEmpty\nexport const REMOVE_EMPTY_TAGS = ['p'];\nexport const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(\n tag => `${tag}:empty`\n).join(',');\n\n// cleanTags\nexport const CLEAN_CONDITIONALLY_TAGS = [\n 'ul',\n 'ol',\n 'table',\n 'div',\n 'button',\n 'form',\n].join(',');\n\n// cleanHeaders\nconst HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];\nexport const HEADER_TAG_LIST = HEADER_TAGS.join(',');\n\n// // CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n 'ad-break',\n 'adbox',\n 'advert',\n 'addthis',\n 'agegate',\n 'aux',\n 'blogger-labels',\n 'combx',\n 'comment',\n 'conversation',\n 'disqus',\n 'entry-unrelated',\n 'extra',\n 'foot',\n // 'form', // This is too generic, has too many false positives\n 'header',\n 'hidden',\n 'loader',\n 'login', // Note: This can hit 'blogindex'.\n 'menu',\n 'meta',\n 'nav',\n 'outbrain',\n 'pager',\n 'pagination',\n 'predicta', // readwriteweb inline ad box\n 'presence_control_external', // lifehacker.com container full of false positives\n 'popup',\n 'printfriendly',\n 'related',\n 'remove',\n 'remark',\n 'rss',\n 'share',\n 'shoutbox',\n 'sidebar',\n 'sociable',\n 'sponsor',\n 'taboola',\n 'tools',\n];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". 
It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nexport const UNLIKELY_CANDIDATES_WHITELIST = [\n 'and',\n 'article',\n 'body',\n 'blogindex',\n 'column',\n 'content',\n 'entry-content-asset',\n 'format', // misuse of form\n 'hfeed',\n 'hentry',\n 'hatom',\n 'main',\n 'page',\n 'posts',\n 'shadow',\n];\n\n// A list of tags which, if found inside, should cause a <div /> to NOT\n// be turned into a paragraph tag. Shallow div tags without these elements\n// should be turned into <p /> tags.\nexport const DIV_TO_P_BLOCK_TAGS = [\n 'a',\n 'blockquote',\n 'dl',\n 'div',\n 'img',\n 'p',\n 'pre',\n 'table',\n].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nexport const NON_TOP_CANDIDATE_TAGS = [\n 'br',\n 'b',\n 'i',\n 'label',\n 'hr',\n 'area',\n 'base',\n 'basefont',\n 'input',\n 'img',\n 'link',\n 'meta',\n];\n\nexport const NON_TOP_CANDIDATE_TAGS_RE = new RegExp(\n `^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`,\n 'i'\n);\n\n// A list of selectors that specify, very clearly, either hNews or other\n// very content-specific style content, like Blogger templates.\n// More examples here: http://microformats.org/wiki/blog-post-formats\nexport const HNEWS_CONTENT_SELECTORS = [\n ['.hentry', '.entry-content'],\n ['entry', '.entry-content'],\n ['.entry', '.entry_content'],\n ['.post', '.postbody'],\n ['.post', '.post_body'],\n ['.post', '.post-body'],\n];\n\nexport const PHOTO_HINTS = ['figure', 'photo', 'image', 'caption'];\nexport const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. 
Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const POSITIVE_SCORE_HINTS = [\n 'article',\n 'articlecontent',\n 'instapaper_body',\n 'blog',\n 'body',\n 'content',\n 'entry-content-asset',\n 'entry',\n 'hentry',\n 'main',\n 'Normal',\n 'page',\n 'pagination',\n 'permalink',\n 'post',\n 'story',\n 'text',\n '[-_]copy', // usatoday\n '\\\\Bcopy',\n];\n\n// The above list, joined into a matching regular expression\nexport const POSITIVE_SCORE_RE = new RegExp(\n POSITIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// Readability publisher-specific guidelines\nexport const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const NEGATIVE_SCORE_HINTS = [\n 'adbox',\n 'advert',\n 'author',\n 'bio',\n 'bookmark',\n 'bottom',\n 'byline',\n 'clear',\n 'com-',\n 'combx',\n 'comment',\n 'comment\\\\B',\n 'contact',\n 'copy',\n 'credit',\n 'crumb',\n 'date',\n 'deck',\n 'excerpt',\n 'featured', // tnr.com has a featured_content which throws us off\n 'foot',\n 'footer',\n 'footnote',\n 'graf',\n 'head',\n 'info',\n 'infotext', // newscientist.com copyright\n 'instapaper_ignore',\n 'jump',\n 'linebreak',\n 'link',\n 'masthead',\n 'media',\n 'meta',\n 'modal',\n 'outbrain', // slate.com junk\n 'promo',\n 'pr_', // autoblog - press release\n 'related',\n 'respond',\n 'roundcontent', // lifehacker restricted content warning\n 'scroll',\n 'secondary',\n 'share',\n 'shopping',\n 'shoutbox',\n 'side',\n 'sidebar',\n 'sponsor',\n 'stamp',\n 'sub',\n 'summary',\n 'tags',\n 'tools',\n 'widget',\n];\n// The above list, joined into a matching regular expression\nexport const NEGATIVE_SCORE_RE = new RegExp(\n NEGATIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// XPath to try to determine if a page is 
wordpress. Not always successful.\nexport const IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]';\n\n// Match a digit. Pretty clear.\nexport const DIGIT_RE = new RegExp('[0-9]');\n\n// A list of words that, if found in link text or URLs, likely mean that\n// this link is not a next page link.\nexport const EXTRANEOUS_LINK_HINTS = [\n 'print',\n 'archive',\n 'comment',\n 'discuss',\n 'e-mail',\n 'email',\n 'share',\n 'reply',\n 'all',\n 'login',\n 'sign',\n 'single',\n 'adx',\n 'entry-unrelated',\n];\nexport const EXTRANEOUS_LINK_HINTS_RE = new RegExp(\n EXTRANEOUS_LINK_HINTS.join('|'),\n 'i'\n);\n\n// Match any phrase that looks like it could be page, or paging, or pagination\nexport const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');\n\n// Match any link text/classname/id that looks like it could mean the next\n// page. Things like: next, continue, >, >>, » but not >|, »| as those can\n// mean last page.\n// export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))', 'i');\nexport const NEXT_LINK_TEXT_RE = /(next|weiter|continue|>([^|]|$)|»([^|]|$))/i;\n\n// Match any link text/classname/id that looks like it is an end link: things\n// like \"first\", \"last\", \"end\", etc.\nexport const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i');\n\n// Match any link text/classname/id that looks like it means the previous\n// page.\nexport const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i');\n\n// Match 2 or more consecutive <br> tags\nexport const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \\n\\r\\t]*){2,}', 'i');\n\n// Match 1 BR tag.\nexport const BR_TAG_RE = new RegExp('<br[^>]*>', 'i');\n\n// A list of all of the block level tags known in HTML5 and below. 
Taken from\n// http://bit.ly/qneNIT\nexport const BLOCK_LEVEL_TAGS = [\n 'article',\n 'aside',\n 'blockquote',\n 'body',\n 'br',\n 'button',\n 'canvas',\n 'caption',\n 'col',\n 'colgroup',\n 'dd',\n 'div',\n 'dl',\n 'dt',\n 'embed',\n 'fieldset',\n 'figcaption',\n 'figure',\n 'footer',\n 'form',\n 'h1',\n 'h2',\n 'h3',\n 'h4',\n 'h5',\n 'h6',\n 'header',\n 'hgroup',\n 'hr',\n 'li',\n 'map',\n 'object',\n 'ol',\n 'output',\n 'p',\n 'pre',\n 'progress',\n 'section',\n 'table',\n 'tbody',\n 'textarea',\n 'tfoot',\n 'th',\n 'thead',\n 'tr',\n 'ul',\n 'video',\n];\nexport const BLOCK_LEVEL_TAGS_RE = new RegExp(\n `^(${BLOCK_LEVEL_TAGS.join('|')})$`,\n 'i'\n);\n\n// The removal is implemented as a blacklist and whitelist, this test finds\n// blacklisted elements that aren't whitelisted. We do this all in one\n// expression-both because it's only one pass, and because this skips the\n// serialization for whitelisted nodes.\nconst candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');\nexport const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');\n\nconst candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');\nexport const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');\n\nexport const UNLIKELY_RE = new RegExp(\n `!(${candidatesWhitelist})|(${candidatesBlacklist})`,\n 'i'\n);\n\nexport const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');\nexport const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');\nexport const BAD_TAGS = new RegExp('^(address|form)$', 'i');\n\nexport const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');\n","import { STRIP_OUTPUT_TAGS, KEEP_CLASS } from './constants';\n\nexport default function stripJunkTags(article, $, tags = []) {\n if (tags.length === 0) {\n tags = STRIP_OUTPUT_TAGS;\n }\n\n // Remove matching elements, but ignore\n // any element with a class of mercury-parser-keep\n $(tags.join(','), article)\n .not(`.${KEEP_CLASS}`)\n .remove();\n\n return $;\n}\n","// // 
CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n 'ad-break',\n 'adbox',\n 'advert',\n 'addthis',\n 'agegate',\n 'aux',\n 'blogger-labels',\n 'combx',\n 'comment',\n 'conversation',\n 'disqus',\n 'entry-unrelated',\n 'extra',\n 'foot',\n 'form',\n 'header',\n 'hidden',\n 'loader',\n 'login', // Note: This can hit 'blogindex'.\n 'menu',\n 'meta',\n 'nav',\n 'pager',\n 'pagination',\n 'predicta', // readwriteweb inline ad box\n 'presence_control_external', // lifehacker.com container full of false positives\n 'popup',\n 'printfriendly',\n 'related',\n 'remove',\n 'remark',\n 'rss',\n 'share',\n 'shoutbox',\n 'sidebar',\n 'sociable',\n 'sponsor',\n 'tools',\n];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nexport const UNLIKELY_CANDIDATES_WHITELIST = [\n 'and',\n 'article',\n 'body',\n 'blogindex',\n 'column',\n 'content',\n 'entry-content-asset',\n 'format', // misuse of form\n 'hfeed',\n 'hentry',\n 'hatom',\n 'main',\n 'page',\n 'posts',\n 'shadow',\n];\n\n// A list of tags which, if found inside, should cause a <div /> to NOT\n// be turned into a paragraph tag. 
Shallow div tags without these elements\n// should be turned into <p /> tags.\nexport const DIV_TO_P_BLOCK_TAGS = [\n 'a',\n 'blockquote',\n 'dl',\n 'div',\n 'img',\n 'p',\n 'pre',\n 'table',\n].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nexport const NON_TOP_CANDIDATE_TAGS = [\n 'br',\n 'b',\n 'i',\n 'label',\n 'hr',\n 'area',\n 'base',\n 'basefont',\n 'input',\n 'img',\n 'link',\n 'meta',\n];\n\nexport const NON_TOP_CANDIDATE_TAGS_RE = new RegExp(\n `^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`,\n 'i'\n);\n\n// A list of selectors that specify, very clearly, either hNews or other\n// very content-specific style content, like Blogger templates.\n// More examples here: http://microformats.org/wiki/blog-post-formats\nexport const HNEWS_CONTENT_SELECTORS = [\n ['.hentry', '.entry-content'],\n ['entry', '.entry-content'],\n ['.entry', '.entry_content'],\n ['.post', '.postbody'],\n ['.post', '.post_body'],\n ['.post', '.post-body'],\n];\n\nexport const PHOTO_HINTS = ['figure', 'photo', 'image', 'caption'];\nexport const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. 
Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const POSITIVE_SCORE_HINTS = [\n 'article',\n 'articlecontent',\n 'instapaper_body',\n 'blog',\n 'body',\n 'content',\n 'entry-content-asset',\n 'entry',\n 'hentry',\n 'main',\n 'Normal',\n 'page',\n 'pagination',\n 'permalink',\n 'post',\n 'story',\n 'text',\n '[-_]copy', // usatoday\n '\\\\Bcopy',\n];\n\n// The above list, joined into a matching regular expression\nexport const POSITIVE_SCORE_RE = new RegExp(\n POSITIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// Readability publisher-specific guidelines\nexport const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const NEGATIVE_SCORE_HINTS = [\n 'adbox',\n 'advert',\n 'author',\n 'bio',\n 'bookmark',\n 'bottom',\n 'byline',\n 'clear',\n 'com-',\n 'combx',\n 'comment',\n 'comment\\\\B',\n 'contact',\n 'copy',\n 'credit',\n 'crumb',\n 'date',\n 'deck',\n 'excerpt',\n 'featured', // tnr.com has a featured_content which throws us off\n 'foot',\n 'footer',\n 'footnote',\n 'graf',\n 'head',\n 'info',\n 'infotext', // newscientist.com copyright\n 'instapaper_ignore',\n 'jump',\n 'linebreak',\n 'link',\n 'masthead',\n 'media',\n 'meta',\n 'modal',\n 'outbrain', // slate.com junk\n 'promo',\n 'pr_', // autoblog - press release\n 'related',\n 'respond',\n 'roundcontent', // lifehacker restricted content warning\n 'scroll',\n 'secondary',\n 'share',\n 'shopping',\n 'shoutbox',\n 'side',\n 'sidebar',\n 'sponsor',\n 'stamp',\n 'sub',\n 'summary',\n 'tags',\n 'tools',\n 'widget',\n];\n// The above list, joined into a matching regular expression\nexport const NEGATIVE_SCORE_RE = new RegExp(\n NEGATIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// Match a digit. 
Pretty clear.\nexport const DIGIT_RE = new RegExp('[0-9]');\n\n// Match 2 or more consecutive <br> tags\nexport const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \\n\\r\\t]*){2,}', 'i');\n\n// Match 1 BR tag.\nexport const BR_TAG_RE = new RegExp('<br[^>]*>', 'i');\n\n// A list of all of the block level tags known in HTML5 and below. Taken from\n// http://bit.ly/qneNIT\nexport const BLOCK_LEVEL_TAGS = [\n 'article',\n 'aside',\n 'blockquote',\n 'body',\n 'br',\n 'button',\n 'canvas',\n 'caption',\n 'col',\n 'colgroup',\n 'dd',\n 'div',\n 'dl',\n 'dt',\n 'embed',\n 'fieldset',\n 'figcaption',\n 'figure',\n 'footer',\n 'form',\n 'h1',\n 'h2',\n 'h3',\n 'h4',\n 'h5',\n 'h6',\n 'header',\n 'hgroup',\n 'hr',\n 'li',\n 'map',\n 'object',\n 'ol',\n 'output',\n 'p',\n 'pre',\n 'progress',\n 'section',\n 'table',\n 'tbody',\n 'textarea',\n 'tfoot',\n 'th',\n 'thead',\n 'tr',\n 'ul',\n 'video',\n];\nexport const BLOCK_LEVEL_TAGS_RE = new RegExp(\n `^(${BLOCK_LEVEL_TAGS.join('|')})$`,\n 'i'\n);\n\n// The removal is implemented as a blacklist and whitelist, this test finds\n// blacklisted elements that aren't whitelisted. 
We do this all in one\n// expression-both because it's only one pass, and because this skips the\n// serialization for whitelisted nodes.\nconst candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');\nexport const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');\n\nconst candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');\nexport const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');\n\nexport const UNLIKELY_RE = new RegExp(\n `!(${candidatesWhitelist})|(${candidatesBlacklist})`,\n 'i'\n);\n\nexport const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');\nexport const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');\nexport const BAD_TAGS = new RegExp('^(address|form)$', 'i');\n\nexport const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');\n","// return 1 for every comma in text\nexport default function scoreCommas(text) {\n return (text.match(/,/g) || []).length;\n}\n","// Given a node type to search for, and a list of regular expressions,\n// look to see if this extraction can be found in the URL. 
Expects\n// that each expression in r_list will return group(1) as the proper\n// string to be cleaned.\n// Only used for date_published currently.\nexport default function extractFromUrl(url, regexList) {\n const matchRe = regexList.find(re => re.test(url));\n if (matchRe) {\n return matchRe.exec(url)[1];\n }\n\n return null;\n}\n","// An expression that looks to try to find the page digit within a URL, if\n// it exists.\n// Matches:\n// page=1\n// pg=1\n// p=1\n// paging=12\n// pag=7\n// pagination/1\n// paging/88\n// pa/83\n// p/11\n//\n// Does not match:\n// pg=102\n// page:2\nexport const PAGE_IN_HREF_RE = new RegExp(\n '(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})',\n 'i'\n);\n\nexport const HAS_ALPHA_RE = /[a-z]/i;\n\nexport const IS_ALPHA_RE = /^[a-z]+$/i;\nexport const IS_DIGIT_RE = /^[0-9]+$/i;\n\nexport const ENCODING_RE = /charset=([\\w-]+)\\b/;\nexport const DEFAULT_ENCODING = 'utf-8';\n","// Given a string, return True if it appears to have an ending sentence\n// within it, false otherwise.\nconst SENTENCE_END_RE = new RegExp('.( |$)');\nexport default function hasSentenceEnd(text) {\n return SENTENCE_END_RE.test(text);\n}\n","// Scoring\nexport { default as getWeight } from './get-weight';\nexport { default as getScore } from './get-score';\nexport { default as scoreCommas } from './score-commas';\nexport { default as scoreLength } from './score-length';\nexport { default as scoreParagraph } from './score-paragraph';\nexport { default as setScore } from './set-score';\nexport { default as addScore } from './add-score';\nexport { default as addToParent } from './add-to-parent';\nexport { default as getOrInitScore } from './get-or-init-score';\nexport { default as scoreNode } from './score-node';\nexport { default as scoreContent } from './score-content';\nexport { default as findTopCandidate } from './find-top-candidate';\n","import URL from 'url';\n\nimport { getAttrs, setAttr } from 'utils/dom';\n\nfunction absolutize($, 
rootUrl, attr) {\n const baseUrl = $('base').attr('href');\n\n $(`[${attr}]`).each((_, node) => {\n const attrs = getAttrs(node);\n const url = attrs[attr];\n if (!url) return;\n const absoluteUrl = URL.resolve(baseUrl || rootUrl, url);\n\n setAttr(node, attr, absoluteUrl);\n });\n}\n\nfunction absolutizeSet($, rootUrl, $content) {\n $('[srcset]', $content).each((_, node) => {\n const attrs = getAttrs(node);\n const urlSet = attrs.srcset;\n\n if (urlSet) {\n // a comma should be considered part of the candidate URL unless preceded by a descriptor\n // descriptors can only contain positive numbers followed immediately by either 'w' or 'x'\n // space characters inside the URL should be encoded (%20 or +)\n const candidates = urlSet.match(\n /(?:\\s*)(\\S+(?:\\s*[\\d.]+[wx])?)(?:\\s*,\\s*)?/g\n );\n if (!candidates) return;\n const absoluteCandidates = candidates.map(candidate => {\n // a candidate URL cannot start or end with a comma\n // descriptors are separated from the URLs by unescaped whitespace\n const parts = candidate\n .trim()\n .replace(/,$/, '')\n .split(/\\s+/);\n parts[0] = URL.resolve(rootUrl, parts[0]);\n return parts.join(' ');\n });\n const absoluteUrlSet = [...new Set(absoluteCandidates)].join(', ');\n setAttr(node, 'srcset', absoluteUrlSet);\n }\n });\n}\n\nexport default function makeLinksAbsolute($content, $, url) {\n ['href', 'src'].forEach(attr => absolutize($, url, attr));\n absolutizeSet($, url, $content);\n\n return $content;\n}\n","// strips all tags from a string of text\nexport default function stripTags(text, $) {\n // Wrapping text in html element prevents errors when text\n // has no html\n const cleanText = $(`<span>${text}</span>`).text();\n return cleanText === '' ? 
text : cleanText;\n}\n","// Given a node, determine if it's article-like enough to return\n// param: node (a cheerio node)\n// return: boolean\n\nexport default function nodeIsSufficient($node) {\n return $node.text().trim().length >= 100;\n}\n","export default function getAttrs(node) {\n const { attribs, attributes } = node;\n\n if (!attribs && attributes) {\n const attrs = Reflect.ownKeys(attributes).reduce((acc, index) => {\n const attr = attributes[index];\n\n if (!attr.name || !attr.value) return acc;\n\n acc[attr.name] = attr.value;\n return acc;\n }, {});\n return attrs;\n }\n\n return attribs;\n}\n","export default function setAttr(node, attr, val) {\n if (node.attribs) {\n node.attribs[attr] = val;\n } else if (node.attributes) {\n node.setAttribute(attr, val);\n }\n\n return node;\n}\n","// DOM manipulation\nexport {\n default as stripUnlikelyCandidates,\n} from './strip-unlikely-candidates';\nexport { default as brsToPs } from './brs-to-ps';\nexport { default as paragraphize } from './paragraphize';\nexport { default as convertToParagraphs } from './convert-to-paragraphs';\nexport { default as convertNodeTo } from './convert-node-to';\nexport { default as cleanImages } from './clean-images';\nexport { default as markToKeep } from './mark-to-keep';\nexport { default as stripJunkTags } from './strip-junk-tags';\nexport { default as cleanHOnes } from './clean-h-ones';\nexport { default as cleanAttributes } from './clean-attributes';\nexport { default as removeEmpty } from './remove-empty';\nexport { default as cleanTags } from './clean-tags';\nexport { default as cleanHeaders } from './clean-headers';\nexport { default as rewriteTopLevel } from './rewrite-top-level';\nexport { default as makeLinksAbsolute } from './make-links-absolute';\nexport { textLength, linkDensity } from './link-density';\nexport { default as extractFromMeta } from './extract-from-meta';\nexport { default as extractFromSelectors } from './extract-from-selectors';\nexport { default as 
stripTags } from './strip-tags';\nexport { default as withinComment } from './within-comment';\nexport { default as nodeIsSufficient } from './node-is-sufficient';\nexport { default as isWordpress } from './is-wordpress';\nexport { default as getAttrs } from './get-attrs';\nexport { default as setAttr } from './set-attr';\nexport { default as setAttrs } from './set-attrs';\n","export default function insertValues(strings, ...values) {\n if (values.length) {\n return strings.reduce((result, part, idx) => {\n let value = values[idx];\n\n if (value && typeof value.toString === 'function') {\n value = value.toString();\n } else {\n value = '';\n }\n\n return result + part + value;\n }, '');\n }\n\n return strings.join('');\n}\n","import insertValues from './insert-values';\n\nconst bodyPattern = /^\\n([\\s\\S]+)\\s{2}$/gm;\nconst trailingWhitespace = /\\s+$/;\n\nexport default function template(strings, ...values) {\n const compiled = insertValues(strings, ...values);\n let [body] = compiled.match(bodyPattern) || [];\n let indentLevel = /^\\s{0,4}(.+)$/g;\n\n if (!body) {\n body = compiled;\n indentLevel = /^\\s{0,2}(.+)$/g;\n }\n\n return body\n .split('\\n')\n .slice(1)\n .map(line => {\n line = line.replace(indentLevel, '$1');\n\n if (trailingWhitespace.test(line)) {\n line = line.replace(trailingWhitespace, '');\n }\n\n return line;\n })\n .join('\\n');\n}\n","import template from './index';\n\nexport default function(hostname, name) {\n return template`\n export const ${name} = {\n domain: '${hostname}',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n 
// before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n `;\n}\n","import template from './index';\n\nconst IGNORE = [\n 'url',\n 'domain',\n 'content',\n 'word_count',\n 'next_page_url',\n 'excerpt',\n 'direction',\n 'total_pages',\n 'rendered_pages',\n];\n\nfunction testFor(key, value, dir) {\n if (IGNORE.find(k => k === key)) return '';\n\n return template`\n it('returns the ${key}', async () => {\n // To pass this test, fill out the ${key} selector\n // in ${dir}/index.js.\n const { ${key} } = await result\n\n // Update these values with the expected values from\n // the article.\n assert.equal(${key}, ${value ? `\\`${value}\\`` : \"''\"})\n });\n `;\n}\n\nexport default function(file, url, dir, result, name) {\n return template`\n import assert from 'assert';\n import URL from 'url';\n import cheerio from 'cheerio';\n\n import Parser from 'mercury';\n import getExtractor from 'extractors/get-extractor';\n import { excerptContent } from 'utils/text';\n\n const fs = require('fs');\n\n describe('${name}', () => {\n describe('initial test case', () => {\n let result;\n let url;\n beforeAll(() => {\n url =\n '${url}';\n const html =\n fs.readFileSync('${file}');\n result =\n Parser.parse(url, { html, fallback: false });\n });\n\n it('is selected properly', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ${Reflect.ownKeys(result)\n .map(k => testFor(k, result[k], dir))\n .join('\\n\\n')}\n\n it('returns the content', async () => {\n // To pass this test, fill out the content selector\n // in ${dir}/index.js.\n // You may also want to make use of the clean and 
transform\n // options.\n const { content } = await result;\n\n const $ = cheerio.load(content || '');\n\n const first13 = excerptContent($('*').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, 'Add the first 13 words of the article here');\n });\n });\n });\n `;\n}\n","/* eslint-disable import/no-extraneous-dependencies */\n/* eslint-disable no-use-before-define */\n/* eslint-disable no-console */\nimport fs from 'fs';\nimport URL from 'url';\nimport inquirer from 'inquirer';\nimport ora from 'ora';\nimport { exec } from 'child_process';\n\nimport { stripJunkTags, makeLinksAbsolute } from 'utils/dom';\nimport Parser from '../dist/mercury';\nimport extractorTemplate from './templates/custom-extractor';\nimport extractorTestTemplate from './templates/custom-extractor-test';\n\nconst questions = [\n {\n type: 'input',\n name: 'website',\n message:\n \"Paste a url to an article you'd like to create or extend a parser for:\",\n validate(value) {\n const { hostname } = URL.parse(value);\n if (hostname) return true;\n\n return false;\n },\n },\n];\nlet spinner;\n\nfunction confirm(fn, args, msg, newParser) {\n spinner = ora({ text: msg });\n spinner.start();\n const result = fn(...args);\n\n if (result && result.then) {\n result.then(r => savePage(r, args, newParser));\n } else {\n spinner.succeed();\n }\n\n return result;\n}\n\nfunction confirmCreateDir(dir, msg) {\n if (!fs.existsSync(dir)) {\n confirm(fs.mkdirSync, [dir], msg);\n }\n}\n\nfunction getDir(url) {\n const { hostname } = URL.parse(url);\n return `./src/extractors/custom/${hostname}`;\n}\n\nfunction scaffoldCustomParser(url) {\n const dir = getDir(url);\n const { hostname } = URL.parse(url);\n let newParser = false;\n\n if (!fs.existsSync(dir)) {\n newParser = true;\n confirmCreateDir(dir, `Creating ${hostname} directory`);\n confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');\n }\n\n confirm(Parser.fetchResource, 
[url], 'Fetching fixture', newParser);\n}\n\n// if has arg, just assume that arg is a url and skip prmopt\nconst urlArg = process.argv[2];\nif (urlArg) {\n scaffoldCustomParser(urlArg);\n} else {\n inquirer.prompt(questions).then(answers => {\n scaffoldCustomParser(answers.website);\n });\n}\n\nfunction generateScaffold(url, file, result) {\n const { hostname } = URL.parse(url);\n const extractor = extractorTemplate(hostname, extractorName(hostname));\n const extractorTest = extractorTestTemplate(\n file,\n url,\n getDir(url),\n result,\n extractorName(hostname)\n );\n\n fs.writeFileSync(`${getDir(url)}/index.js`, extractor);\n fs.writeFileSync(`${getDir(url)}/index.test.js`, extractorTest);\n fs.appendFileSync('./src/extractors/custom/index.js', exportString(url));\n exec(`npm run lint-fix-quiet -- ${getDir(url)}/*.js`);\n}\n\nfunction savePage($, [url], newParser) {\n const { hostname } = URL.parse(url);\n\n spinner.succeed();\n\n const filename = new Date().getTime();\n const file = `./fixtures/${hostname}/${filename}.html`;\n // fix http(s) relative links:\n makeLinksAbsolute($('*').first(), $, url);\n $('[src], [href]').each((index, node) => {\n const $node = $(node);\n const link = $node.attr('src');\n if (link && link.slice(0, 2) === '//') {\n $node.attr('src', `http:${link}`);\n }\n });\n const html = stripJunkTags($('*').first(), $, ['script']).html();\n\n fs.writeFileSync(file, html);\n\n Parser.parse(url, { html }).then(result => {\n if (newParser) {\n confirm(\n generateScaffold,\n [url, file, result],\n 'Generating parser and tests'\n );\n console.log(`Your custom site extractor has been set up. To get started building it, run\n yarn watch:test -- ${hostname}\n -- OR --\n npm run watch:test -- ${hostname}`);\n } else {\n console.log(`\n It looks like you already have a custom parser for this url.\n The page you linked to has been added to ${file}. 
Copy and paste\n the following code to use that page in your tests:\n const html = fs.readFileSync('${file}');`);\n }\n });\n}\n\nfunction exportString(url) {\n const { hostname } = URL.parse(url);\n return `export * from './${hostname}';`;\n}\n\nfunction extractorName(hostname) {\n const name = hostname\n .split('.')\n .map(w => `${w.charAt(0).toUpperCase()}${w.slice(1)}`)\n .join('');\n return `${name}Extractor`;\n}\n"],"names":["KEEP_CLASS","STRIP_OUTPUT_TAGS","stripJunkTags","article","$","tags","length","join","not","remove","absolutize","rootUrl","attr","baseUrl","each","_","node","attrs","getAttrs","url","absoluteUrl","URL","resolve","setAttr","absolutizeSet","$content","urlSet","srcset","candidates","match","absoluteCandidates","map","candidate","parts","trim","replace","split","absoluteUrlSet","makeLinksAbsolute","forEach","attribs","attributes","reduce","acc","index","name","value","val","setAttribute","insertValues","strings","values","result","part","idx","toString","bodyPattern","trailingWhitespace","template","compiled","body","indentLevel","slice","line","test","hostname","IGNORE","testFor","key","dir","find","k","file","questions","type","message","validate","parse","spinner","confirm","fn","args","msg","newParser","ora","text","start","then","r","savePage","succeed","confirmCreateDir","fs","existsSync","mkdirSync","getDir","scaffoldCustomParser","Parser","fetchResource","urlArg","process","argv","inquirer","prompt","answers","website","generateScaffold","extractor","extractorTemplate","extractorName","extractorTest","extractorTestTemplate","writeFileSync","appendFileSync","exportString","exec","filename","Date","getTime","first","$node","link","html","console","log","w","charAt","toUpperCase"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AACA,AAGA;;AACA,AAAO,IAAMA,UAAU,GAAG,qBAAnB;AAEP;AAUA,AAAO,IAAMC,iBAAiB,GAAG,CAC/B,OAD+B,EAE/B,QAF+B,EAG/B,UAH+B,EAI/B,MAJ+B,EAK/B,OAL+B,EAM/B,IAN+B,EAO/B,OAP+B,EAQ/B,QAR+B,EAS/B,QAT+B,CAA1B;;ACfQ,SAASC,
aAAT,CAAuBC,OAAvB,EAAgCC,CAAhC,EAA8C;MAAXC,IAAW,uEAAJ,EAAI;;MACvDA,IAAI,CAACC,MAAL,KAAgB,CAApB,EAAuB;IACrBD,IAAI,GAAGJ,iBAAP;GAFyD;;;;EAO3DG,CAAC,CAACC,IAAI,CAACE,IAAL,CAAU,GAAV,CAAD,EAAiBJ,OAAjB,CAAD,CACGK,GADH,YACWR,UADX,GAEGS,MAFH;SAIOL,CAAP;;;ACbF;;ACAA;;ACAA;;ACAA;;ACAA;;ACAA;;ACIA,SAASM,UAAT,CAAoBN,CAApB,EAAuBO,OAAvB,EAAgCC,IAAhC,EAAsC;MAC9BC,OAAO,GAAGT,CAAC,CAAC,MAAD,CAAD,CAAUQ,IAAV,CAAe,MAAf,CAAhB;EAEAR,CAAC,YAAKQ,IAAL,OAAD,CAAeE,IAAf,CAAoB,UAACC,CAAD,EAAIC,IAAJ,EAAa;QACzBC,KAAK,GAAGC,QAAQ,CAACF,IAAD,CAAtB;QACMG,GAAG,GAAGF,KAAK,CAACL,IAAD,CAAjB;QACI,CAACO,GAAL,EAAU;QACJC,WAAW,GAAGC,KAAG,CAACC,OAAJ,CAAYT,OAAO,IAAIF,OAAvB,EAAgCQ,GAAhC,CAApB;IAEAI,OAAO,CAACP,IAAD,EAAOJ,IAAP,EAAaQ,WAAb,CAAP;GANF;;;AAUF,SAASI,aAAT,CAAuBpB,CAAvB,EAA0BO,OAA1B,EAAmCc,QAAnC,EAA6C;EAC3CrB,CAAC,CAAC,UAAD,EAAaqB,QAAb,CAAD,CAAwBX,IAAxB,CAA6B,UAACC,CAAD,EAAIC,IAAJ,EAAa;QAClCC,KAAK,GAAGC,QAAQ,CAACF,IAAD,CAAtB;QACMU,MAAM,GAAGT,KAAK,CAACU,MAArB;;QAEID,MAAJ,EAAY;;;;UAIJE,UAAU,GAAGF,MAAM,CAACG,KAAP,CACjB,6CADiB,CAAnB;UAGI,CAACD,UAAL,EAAiB;UACXE,kBAAkB,GAAGF,UAAU,CAACG,GAAX,CAAe,UAAAC,SAAS,EAAI;;;YAG/CC,KAAK,GAAGD,SAAS,CACpBE,IADW,GAEXC,OAFW,CAEH,IAFG,EAEG,EAFH,EAGXC,KAHW,CAGL,KAHK,CAAd;QAIAH,KAAK,CAAC,CAAD,CAAL,GAAWZ,KAAG,CAACC,OAAJ,CAAYX,OAAZ,EAAqBsB,KAAK,CAAC,CAAD,CAA1B,CAAX;eACOA,KAAK,CAAC1B,IAAN,CAAW,GAAX,CAAP;OARyB,CAA3B;;UAUM8B,cAAc,GAAG,mBAAI,QAAQP,kBAAR,CAAJ,EAAiCvB,IAAjC,CAAsC,IAAtC,CAAvB;;MACAgB,OAAO,CAACP,IAAD,EAAO,QAAP,EAAiBqB,cAAjB,CAAP;;GAvBJ;;;AA4BF,AAAe,SAASC,oBAAT,CAA2Bb,QAA3B,EAAqCrB,CAArC,EAAwCe,GAAxC,EAA6C;GACzD,MAAD,EAAS,KAAT,EAAgBoB,OAAhB,CAAwB,UAAA3B,IAAI;WAAIF,UAAU,CAACN,CAAD,EAAIe,GAAJ,EAASP,IAAT,CAAd;GAA5B;EACAY,aAAa,CAACpB,CAAD,EAAIe,GAAJ,EAASM,QAAT,CAAb;SAEOA,QAAP;;;AClDF;;ACAA;;ACAe,SAASP,QAAT,CAAkBF,IAAlB,EAAwB;MAC7BwB,OAD6B,GACLxB,IADK,CAC7BwB,OAD6B;MACpBC,UADoB,GACLzB,IADK,CACpByB,UADoB;;MAGjC,CAACD,OAAD,IAAYC,UAAhB,EAA4B;QACpBxB,KAAK,GAAG,iBAAgBwB,UAAhB,EAA4BC,MAA5B,CAAmC,UAACC,GAAD,EAAMC,KAAN,EAAgB;UACzDhC,IAAI,GAAG6B,UAAU,CAACG,KAAD,CAAvB;UAEI,CAAChC,IAAI,CAA
CiC,IAAN,IAAc,CAACjC,IAAI,CAACkC,KAAxB,EAA+B,OAAOH,GAAP;MAE/BA,GAAG,CAAC/B,IAAI,CAACiC,IAAN,CAAH,GAAiBjC,IAAI,CAACkC,KAAtB;aACOH,GAAP;KANY,EAOX,EAPW,CAAd;;WAQO1B,KAAP;;;SAGKuB,OAAP;;;ACfa,SAASjB,OAAT,CAAiBP,IAAjB,EAAuBJ,IAAvB,EAA6BmC,GAA7B,EAAkC;MAC3C/B,IAAI,CAACwB,OAAT,EAAkB;IAChBxB,IAAI,CAACwB,OAAL,CAAa5B,IAAb,IAAqBmC,GAArB;GADF,MAEO,IAAI/B,IAAI,CAACyB,UAAT,EAAqB;IAC1BzB,IAAI,CAACgC,YAAL,CAAkBpC,IAAlB,EAAwBmC,GAAxB;;;SAGK/B,IAAP;;;ACPF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ACAe,SAASiC,YAAT,CAAsBC,OAAtB,EAA0C;oCAARC,MAAQ;IAARA,MAAQ;;;MACnDA,MAAM,CAAC7C,MAAX,EAAmB;WACV4C,OAAO,CAACR,MAAR,CAAe,UAACU,MAAD,EAASC,IAAT,EAAeC,GAAf,EAAuB;UACvCR,KAAK,GAAGK,MAAM,CAACG,GAAD,CAAlB;;UAEIR,KAAK,IAAI,OAAOA,KAAK,CAACS,QAAb,KAA0B,UAAvC,EAAmD;QACjDT,KAAK,GAAGA,KAAK,CAACS,QAAN,EAAR;OADF,MAEO;QACLT,KAAK,GAAG,EAAR;;;aAGKM,MAAM,GAAGC,IAAT,GAAgBP,KAAvB;KATK,EAUJ,EAVI,CAAP;;;SAaKI,OAAO,CAAC3C,IAAR,CAAa,EAAb,CAAP;;;ACbF,IAAMiD,WAAW,GAAG,sBAApB;AACA,IAAMC,kBAAkB,GAAG,MAA3B;AAEA,AAAe,SAASC,QAAT,CAAkBR,OAAlB,EAAsC;oCAARC,MAAQ;IAARA,MAAQ;;;MAC7CQ,QAAQ,GAAGV,YAAY,MAAZ,UAAaC,OAAb,SAAyBC,MAAzB,EAAjB;;aACaQ,QAAQ,CAAC9B,KAAT,CAAe2B,WAAf,KAA+B,EAFO;;MAE9CI,IAF8C;;MAG/CC,WAAW,GAAG,gBAAlB;;MAEI,CAACD,IAAL,EAAW;IACTA,IAAI,GAAGD,QAAP;IACAE,WAAW,GAAG,gBAAd;;;SAGKD,IAAI,CACRxB,KADI,CACE,IADF,EAEJ0B,KAFI,CAEE,CAFF,EAGJ/B,GAHI,CAGA,UAAAgC,IAAI,EAAI;IACXA,IAAI,GAAGA,IAAI,CAAC5B,OAAL,CAAa0B,WAAb,EAA0B,IAA1B,CAAP;;QAEIJ,kBAAkB,CAACO,IAAnB,CAAwBD,IAAxB,CAAJ,EAAmC;MACjCA,IAAI,GAAGA,IAAI,CAAC5B,OAAL,CAAasB,kBAAb,EAAiC,EAAjC,CAAP;;;WAGKM,IAAP;GAVG,EAYJxD,IAZI,CAYC,IAZD,CAAP;;;;;;;;;;;;ACba,4BAAS0D,QAAT,EAAmBpB,IAAnB,EAAyB;SAC/Ba,QAAP,oBACiBb,IADjB,EAEeoB,QAFf;;;;;;;;;;;;;;;;;;;;;;ACDF,IAAMC,MAAM,GAAG,CACb,KADa,EAEb,QAFa,EAGb,SAHa,EAIb,YAJa,EAKb,eALa,EAMb,SANa,EAOb,WAPa,EAQb,aARa,EASb,gBATa,CAAf;;AAYA,SAASC,OAAT,CAAiBC,GAAjB,EAAsBtB,KAAtB,EAA6BuB,GAA7B,EAAkC;MAC5BH,MAAM,CAACI,IAAP,
CAAY,UAAAC,CAAC;WAAIA,CAAC,KAAKH,GAAV;GAAb,CAAJ,EAAiC,OAAO,EAAP;SAE1BV,QAAP,sBACkBU,GADlB,EAE+CA,GAF/C,EAGkBC,GAHlB,EAIoBD,GAJpB,EAQyBA,GARzB,EAQiCtB,KAAK,cAAQA,KAAR,SAAoB,IAR1D;;;AAaF,AAAe,gCAAS0B,IAAT,EAAerD,GAAf,EAAoBkD,GAApB,EAAyBjB,MAAzB,EAAiCP,IAAjC,EAAuC;SAC7Ca,QAAP,qBAWcb,IAXd,EAiBa1B,GAjBb,EAmB6BqD,IAnB7B,EAgCU,iBAAgBpB,MAAhB,EACCrB,GADD,CACK,UAAAwC,CAAC;WAAIJ,OAAO,CAACI,CAAD,EAAInB,MAAM,CAACmB,CAAD,CAAV,EAAeF,GAAf,CAAX;GADN,EAEC9D,IAFD,CAEM,MAFN,CAhCV,EAsCgB8D,GAtChB;;;ACjBF,IAAMI,SAAS,GAAG,CAChB;EACEC,IAAI,EAAE,OADR;EAEE7B,IAAI,EAAE,SAFR;EAGE8B,OAAO,EACL,wEAJJ;EAKEC,QALF,oBAKW9B,KALX,EAKkB;qBACOzB,KAAG,CAACwD,KAAJ,CAAU/B,KAAV,CADP;QACNmB,QADM,cACNA,QADM;;QAEVA,QAAJ,EAAc,OAAO,IAAP;WAEP,KAAP;;CAVY,CAAlB;AAcA,IAAIa,OAAJ;;AAEA,SAASC,OAAT,CAAiBC,EAAjB,EAAqBC,IAArB,EAA2BC,GAA3B,EAAgCC,SAAhC,EAA2C;EACzCL,OAAO,GAAGM,GAAG,CAAC;IAAEC,IAAI,EAAEH;GAAT,CAAb;EACAJ,OAAO,CAACQ,KAAR;MACMlC,MAAM,GAAG4B,EAAE,MAAF,4BAAMC,IAAN,EAAf;;MAEI7B,MAAM,IAAIA,MAAM,CAACmC,IAArB,EAA2B;IACzBnC,MAAM,CAACmC,IAAP,CAAY,UAAAC,CAAC;aAAIC,QAAQ,CAACD,CAAD,EAAIP,IAAJ,EAAUE,SAAV,CAAZ;KAAb;GADF,MAEO;IACLL,OAAO,CAACY,OAAR;;;SAGKtC,MAAP;;;AAGF,SAASuC,gBAAT,CAA0BtB,GAA1B,EAA+Ba,GAA/B,EAAoC;MAC9B,CAACU,EAAE,CAACC,UAAH,CAAcxB,GAAd,CAAL,EAAyB;IACvBU,OAAO,CAACa,EAAE,CAACE,SAAJ,EAAe,CAACzB,GAAD,CAAf,EAAsBa,GAAtB,CAAP;;;;AAIJ,SAASa,MAAT,CAAgB5E,GAAhB,EAAqB;oBACEE,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CADF;MACX8C,QADW,eACXA,QADW;;2CAEeA,QAAlC;;;AAGF,SAAS+B,oBAAT,CAA8B7E,GAA9B,EAAmC;MAC3BkD,GAAG,GAAG0B,MAAM,CAAC5E,GAAD,CAAlB;;oBACqBE,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CAFY;MAEzB8C,QAFyB,eAEzBA,QAFyB;;MAG7BkB,SAAS,GAAG,KAAhB;;MAEI,CAACS,EAAE,CAACC,UAAH,CAAcxB,GAAd,CAAL,EAAyB;IACvBc,SAAS,GAAG,IAAZ;IACAQ,gBAAgB,CAACtB,GAAD,qBAAkBJ,QAAlB,gBAAhB;IACA0B,gBAAgB,sBAAe1B,QAAf,GAA2B,6BAA3B,CAAhB;;;EAGFc,OAAO,CAACkB,OAAM,CAACC,aAAR,EAAuB,CAAC/E,GAAD,CAAvB,EAA8B,kBAA9B,EAAkDgE,SAAlD,CAAP;;;;AAIF,IAAMgB,MAAM,GAAGC,OAAO,CAACC,IAAR,CAAa,CAAb,CAAf;;AACA,IAAIF,MAAJ,EAAY;EACVH,oBAAoB,CAACG,MAAD,CAApB;CADF,MAEO;EACLG,QAAQ,CAACC,MAAT,CAAgB9B,S
AAhB,EAA2Bc,IAA3B,CAAgC,UAAAiB,OAAO,EAAI;IACzCR,oBAAoB,CAACQ,OAAO,CAACC,OAAT,CAApB;GADF;;;AAKF,SAASC,gBAAT,CAA0BvF,GAA1B,EAA+BqD,IAA/B,EAAqCpB,MAArC,EAA6C;oBACtB/B,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CADsB;MACnC8C,QADmC,eACnCA,QADmC;;MAErC0C,SAAS,GAAGC,iBAAiB,CAAC3C,QAAD,EAAW4C,aAAa,CAAC5C,QAAD,CAAxB,CAAnC;MACM6C,aAAa,GAAGC,qBAAqB,CACzCvC,IADyC,EAEzCrD,GAFyC,EAGzC4E,MAAM,CAAC5E,GAAD,CAHmC,EAIzCiC,MAJyC,EAKzCyD,aAAa,CAAC5C,QAAD,CAL4B,CAA3C;EAQA2B,EAAE,CAACoB,aAAH,WAAoBjB,MAAM,CAAC5E,GAAD,CAA1B,gBAA4CwF,SAA5C;EACAf,EAAE,CAACoB,aAAH,WAAoBjB,MAAM,CAAC5E,GAAD,CAA1B,qBAAiD2F,aAAjD;EACAlB,EAAE,CAACqB,cAAH,CAAkB,kCAAlB,EAAsDC,YAAY,CAAC/F,GAAD,CAAlE;EACAgG,kBAAI,qCAA8BpB,MAAM,CAAC5E,GAAD,CAApC,WAAJ;;;AAGF,SAASsE,QAAT,CAAkBrF,CAAlB,QAA4B+E,SAA5B,EAAuC;;MAAjBhE,GAAiB;;oBAChBE,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CADgB;MAC7B8C,QAD6B,eAC7BA,QAD6B;;EAGrCa,OAAO,CAACY,OAAR;MAEM0B,QAAQ,GAAG,IAAIC,IAAJ,GAAWC,OAAX,EAAjB;MACM9C,IAAI,wBAAiBP,QAAjB,cAA6BmD,QAA7B,UAAV,CANqC;;EAQrC9E,oBAAiB,CAAClC,CAAC,CAAC,GAAD,CAAD,CAAOmH,KAAP,EAAD,EAAiBnH,CAAjB,EAAoBe,GAApB,CAAjB;EACAf,CAAC,CAAC,eAAD,CAAD,CAAmBU,IAAnB,CAAwB,UAAC8B,KAAD,EAAQ5B,IAAR,EAAiB;QACjCwG,KAAK,GAAGpH,CAAC,CAACY,IAAD,CAAf;QACMyG,IAAI,GAAGD,KAAK,CAAC5G,IAAN,CAAW,KAAX,CAAb;;QACI6G,IAAI,IAAIA,IAAI,CAAC3D,KAAL,CAAW,CAAX,EAAc,CAAd,MAAqB,IAAjC,EAAuC;MACrC0D,KAAK,CAAC5G,IAAN,CAAW,KAAX,iBAA0B6G,IAA1B;;GAJJ;MAOMC,IAAI,GAAGxH,aAAa,CAACE,CAAC,CAAC,GAAD,CAAD,CAAOmH,KAAP,EAAD,EAAiBnH,CAAjB,EAAoB,CAAC,QAAD,CAApB,CAAb,CAA6CsH,IAA7C,EAAb;EAEA9B,EAAE,CAACoB,aAAH,CAAiBxC,IAAjB,EAAuBkD,IAAvB;EAEAzB,OAAM,CAACpB,KAAP,CAAa1D,GAAb,EAAkB;IAAEuG,IAAI,EAAJA;GAApB,EAA4BnC,IAA5B,CAAiC,UAAAnC,MAAM,EAAI;QACrC+B,SAAJ,EAAe;MACbJ,OAAO,CACL2B,gBADK,EAEL,CAACvF,GAAD,EAAMqD,IAAN,EAAYpB,MAAZ,CAFK,EAGL,6BAHK,CAAP;MAKAuE,OAAO,CAACC,GAAR,iHACqB3D,QADrB,6DAGwBA,QAHxB;KANF,MAUO;MACL0D,OAAO,CAACC,GAAR,wHAEuCpD,IAFvC,qHAI4BA,IAJ5B;;GAZJ;;;AAqBF,SAAS0C,YAAT,CAAsB/F,GAAtB,EAA2B;oBACJE,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CADI;MACjB8C,QADiB,eACjBA,QADiB;;oCAEEA,QAA3B;;;AAGF,SAAS4C,aAAT,CAAuB5C,QAAvB,
EAAiC;MACzBpB,IAAI,GAAGoB,QAAQ,CAClB7B,KADU,CACJ,GADI,EAEVL,GAFU,CAEN,UAAA8F,CAAC;qBAAOA,CAAC,CAACC,MAAF,CAAS,CAAT,EAAYC,WAAZ,EAAP,SAAmCF,CAAC,CAAC/D,KAAF,CAAQ,CAAR,CAAnC;GAFK,EAGVvD,IAHU,CAGL,EAHK,CAAb;mBAIUsC,IAAV;"}
1
+ {"version":3,"file":"generate-custom-parser.js","sources":["../src/utils/dom/constants.js","../src/utils/dom/strip-junk-tags.js","../src/extractors/generic/content/scoring/constants.js","../src/extractors/generic/content/scoring/score-commas.js","../src/utils/text/extract-from-url.js","../src/utils/text/constants.js","../src/utils/text/has-sentence-end.js","../src/extractors/generic/content/scoring/index.js","../src/utils/dom/make-links-absolute.js","../src/utils/dom/strip-tags.js","../src/utils/dom/node-is-sufficient.js","../src/utils/dom/get-attrs.js","../src/utils/dom/set-attr.js","../src/utils/dom/index.js","../scripts/templates/insert-values.js","../scripts/templates/index.js","../scripts/templates/custom-extractor.js","../scripts/templates/custom-extractor-test.js","../scripts/generate-custom-parser.js"],"sourcesContent":["// Spacer images to be removed\nexport const SPACER_RE = new RegExp('transparent|spacer|blank', 'i');\n\n// The class we will use to mark elements we want to keep\n// but would normally remove\nexport const KEEP_CLASS = 'mercury-parser-keep';\n\nexport const KEEP_SELECTORS = [\n 'iframe[src^=\"https://www.youtube.com\"]',\n 'iframe[src^=\"https://www.youtube-nocookie.com\"]',\n 'iframe[src^=\"http://www.youtube.com\"]',\n 'iframe[src^=\"https://player.vimeo\"]',\n 'iframe[src^=\"http://player.vimeo\"]',\n 'iframe[src^=\"https://www.redditmedia.com\"]',\n];\n\n// A list of tags to strip from the output if we encounter them.\nexport const STRIP_OUTPUT_TAGS = [\n 'title',\n 'script',\n 'noscript',\n 'link',\n 'style',\n 'hr',\n 'embed',\n 'iframe',\n 'object',\n];\n\n// cleanAttributes\nexport const REMOVE_ATTRS = ['style', 'align'];\nexport const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(\n selector => `[${selector}]`\n);\nexport const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');\nexport const WHITELIST_ATTRS = [\n 'src',\n 'srcset',\n 'start',\n 'sizes',\n 'type',\n 'href',\n 'class',\n 'id',\n 'alt',\n 'xlink:href',\n 'width',\n 
'height',\n];\n\nexport const WHITELIST_ATTRS_RE = new RegExp(\n `^(${WHITELIST_ATTRS.join('|')})$`,\n 'i'\n);\n\n// removeEmpty\nexport const REMOVE_EMPTY_TAGS = ['p'];\nexport const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(\n tag => `${tag}:empty`\n).join(',');\n\n// cleanTags\nexport const CLEAN_CONDITIONALLY_TAGS = [\n 'ul',\n 'ol',\n 'table',\n 'div',\n 'button',\n 'form',\n].join(',');\n\n// cleanHeaders\nconst HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];\nexport const HEADER_TAG_LIST = HEADER_TAGS.join(',');\n\n// // CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n 'ad-break',\n 'adbox',\n 'advert',\n 'addthis',\n 'agegate',\n 'aux',\n 'blogger-labels',\n 'combx',\n 'comment',\n 'conversation',\n 'disqus',\n 'entry-unrelated',\n 'extra',\n 'foot',\n // 'form', // This is too generic, has too many false positives\n 'header',\n 'hidden',\n 'loader',\n 'login', // Note: This can hit 'blogindex'.\n 'menu',\n 'meta',\n 'nav',\n 'outbrain',\n 'pager',\n 'pagination',\n 'predicta', // readwriteweb inline ad box\n 'presence_control_external', // lifehacker.com container full of false positives\n 'popup',\n 'printfriendly',\n 'related',\n 'remove',\n 'remark',\n 'rss',\n 'share',\n 'shoutbox',\n 'sidebar',\n 'sociable',\n 'sponsor',\n 'taboola',\n 'tools',\n];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". 
It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nexport const UNLIKELY_CANDIDATES_WHITELIST = [\n 'and',\n 'article',\n 'body',\n 'blogindex',\n 'column',\n 'content',\n 'entry-content-asset',\n 'format', // misuse of form\n 'hfeed',\n 'hentry',\n 'hatom',\n 'main',\n 'page',\n 'posts',\n 'shadow',\n];\n\n// A list of tags which, if found inside, should cause a <div /> to NOT\n// be turned into a paragraph tag. Shallow div tags without these elements\n// should be turned into <p /> tags.\nexport const DIV_TO_P_BLOCK_TAGS = [\n 'a',\n 'blockquote',\n 'dl',\n 'div',\n 'img',\n 'p',\n 'pre',\n 'table',\n].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nexport const NON_TOP_CANDIDATE_TAGS = [\n 'br',\n 'b',\n 'i',\n 'label',\n 'hr',\n 'area',\n 'base',\n 'basefont',\n 'input',\n 'img',\n 'link',\n 'meta',\n];\n\nexport const NON_TOP_CANDIDATE_TAGS_RE = new RegExp(\n `^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`,\n 'i'\n);\n\n// A list of selectors that specify, very clearly, either hNews or other\n// very content-specific style content, like Blogger templates.\n// More examples here: http://microformats.org/wiki/blog-post-formats\nexport const HNEWS_CONTENT_SELECTORS = [\n ['.hentry', '.entry-content'],\n ['entry', '.entry-content'],\n ['.entry', '.entry_content'],\n ['.post', '.postbody'],\n ['.post', '.post_body'],\n ['.post', '.post-body'],\n];\n\nexport const PHOTO_HINTS = ['figure', 'photo', 'image', 'caption'];\nexport const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. 
Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const POSITIVE_SCORE_HINTS = [\n 'article',\n 'articlecontent',\n 'instapaper_body',\n 'blog',\n 'body',\n 'content',\n 'entry-content-asset',\n 'entry',\n 'hentry',\n 'main',\n 'Normal',\n 'page',\n 'pagination',\n 'permalink',\n 'post',\n 'story',\n 'text',\n '[-_]copy', // usatoday\n '\\\\Bcopy',\n];\n\n// The above list, joined into a matching regular expression\nexport const POSITIVE_SCORE_RE = new RegExp(\n POSITIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// Readability publisher-specific guidelines\nexport const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const NEGATIVE_SCORE_HINTS = [\n 'adbox',\n 'advert',\n 'author',\n 'bio',\n 'bookmark',\n 'bottom',\n 'byline',\n 'clear',\n 'com-',\n 'combx',\n 'comment',\n 'comment\\\\B',\n 'contact',\n 'copy',\n 'credit',\n 'crumb',\n 'date',\n 'deck',\n 'excerpt',\n 'featured', // tnr.com has a featured_content which throws us off\n 'foot',\n 'footer',\n 'footnote',\n 'graf',\n 'head',\n 'info',\n 'infotext', // newscientist.com copyright\n 'instapaper_ignore',\n 'jump',\n 'linebreak',\n 'link',\n 'masthead',\n 'media',\n 'meta',\n 'modal',\n 'outbrain', // slate.com junk\n 'promo',\n 'pr_', // autoblog - press release\n 'related',\n 'respond',\n 'roundcontent', // lifehacker restricted content warning\n 'scroll',\n 'secondary',\n 'share',\n 'shopping',\n 'shoutbox',\n 'side',\n 'sidebar',\n 'sponsor',\n 'stamp',\n 'sub',\n 'summary',\n 'tags',\n 'tools',\n 'widget',\n];\n// The above list, joined into a matching regular expression\nexport const NEGATIVE_SCORE_RE = new RegExp(\n NEGATIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// XPath to try to determine if a page is 
wordpress. Not always successful.\nexport const IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]';\n\n// Match a digit. Pretty clear.\nexport const DIGIT_RE = new RegExp('[0-9]');\n\n// A list of words that, if found in link text or URLs, likely mean that\n// this link is not a next page link.\nexport const EXTRANEOUS_LINK_HINTS = [\n 'print',\n 'archive',\n 'comment',\n 'discuss',\n 'e-mail',\n 'email',\n 'share',\n 'reply',\n 'all',\n 'login',\n 'sign',\n 'single',\n 'adx',\n 'entry-unrelated',\n];\nexport const EXTRANEOUS_LINK_HINTS_RE = new RegExp(\n EXTRANEOUS_LINK_HINTS.join('|'),\n 'i'\n);\n\n// Match any phrase that looks like it could be page, or paging, or pagination\nexport const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');\n\n// Match any link text/classname/id that looks like it could mean the next\n// page. Things like: next, continue, >, >>, » but not >|, »| as those can\n// mean last page.\n// export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))', 'i');\nexport const NEXT_LINK_TEXT_RE = /(next|weiter|continue|>([^|]|$)|»([^|]|$))/i;\n\n// Match any link text/classname/id that looks like it is an end link: things\n// like \"first\", \"last\", \"end\", etc.\nexport const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i');\n\n// Match any link text/classname/id that looks like it means the previous\n// page.\nexport const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i');\n\n// Match 2 or more consecutive <br> tags\nexport const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \\n\\r\\t]*){2,}', 'i');\n\n// Match 1 BR tag.\nexport const BR_TAG_RE = new RegExp('<br[^>]*>', 'i');\n\n// A list of all of the block level tags known in HTML5 and below. 
Taken from\n// http://bit.ly/qneNIT\nexport const BLOCK_LEVEL_TAGS = [\n 'article',\n 'aside',\n 'blockquote',\n 'body',\n 'br',\n 'button',\n 'canvas',\n 'caption',\n 'col',\n 'colgroup',\n 'dd',\n 'div',\n 'dl',\n 'dt',\n 'embed',\n 'fieldset',\n 'figcaption',\n 'figure',\n 'footer',\n 'form',\n 'h1',\n 'h2',\n 'h3',\n 'h4',\n 'h5',\n 'h6',\n 'header',\n 'hgroup',\n 'hr',\n 'li',\n 'map',\n 'object',\n 'ol',\n 'output',\n 'p',\n 'pre',\n 'progress',\n 'section',\n 'table',\n 'tbody',\n 'textarea',\n 'tfoot',\n 'th',\n 'thead',\n 'tr',\n 'ul',\n 'video',\n];\nexport const BLOCK_LEVEL_TAGS_RE = new RegExp(\n `^(${BLOCK_LEVEL_TAGS.join('|')})$`,\n 'i'\n);\n\n// The removal is implemented as a blacklist and whitelist, this test finds\n// blacklisted elements that aren't whitelisted. We do this all in one\n// expression-both because it's only one pass, and because this skips the\n// serialization for whitelisted nodes.\nconst candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');\nexport const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');\n\nconst candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');\nexport const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');\n\nexport const UNLIKELY_RE = new RegExp(\n `!(${candidatesWhitelist})|(${candidatesBlacklist})`,\n 'i'\n);\n\nexport const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');\nexport const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');\nexport const BAD_TAGS = new RegExp('^(address|form)$', 'i');\n\nexport const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');\n","import { STRIP_OUTPUT_TAGS, KEEP_CLASS } from './constants';\n\nexport default function stripJunkTags(article, $, tags = []) {\n if (tags.length === 0) {\n tags = STRIP_OUTPUT_TAGS;\n }\n\n // Remove matching elements, but ignore\n // any element with a class of mercury-parser-keep\n $(tags.join(','), article)\n .not(`.${KEEP_CLASS}`)\n .remove();\n\n return $;\n}\n","// // 
CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n 'ad-break',\n 'adbox',\n 'advert',\n 'addthis',\n 'agegate',\n 'aux',\n 'blogger-labels',\n 'combx',\n 'comment',\n 'conversation',\n 'disqus',\n 'entry-unrelated',\n 'extra',\n 'foot',\n 'form',\n 'header',\n 'hidden',\n 'loader',\n 'login', // Note: This can hit 'blogindex'.\n 'menu',\n 'meta',\n 'nav',\n 'pager',\n 'pagination',\n 'predicta', // readwriteweb inline ad box\n 'presence_control_external', // lifehacker.com container full of false positives\n 'popup',\n 'printfriendly',\n 'related',\n 'remove',\n 'remark',\n 'rss',\n 'share',\n 'shoutbox',\n 'sidebar',\n 'sociable',\n 'sponsor',\n 'tools',\n];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nexport const UNLIKELY_CANDIDATES_WHITELIST = [\n 'and',\n 'article',\n 'body',\n 'blogindex',\n 'column',\n 'content',\n 'entry-content-asset',\n 'format', // misuse of form\n 'hfeed',\n 'hentry',\n 'hatom',\n 'main',\n 'page',\n 'posts',\n 'shadow',\n];\n\n// A list of tags which, if found inside, should cause a <div /> to NOT\n// be turned into a paragraph tag. 
Shallow div tags without these elements\n// should be turned into <p /> tags.\nexport const DIV_TO_P_BLOCK_TAGS = [\n 'a',\n 'blockquote',\n 'dl',\n 'div',\n 'img',\n 'p',\n 'pre',\n 'table',\n].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nexport const NON_TOP_CANDIDATE_TAGS = [\n 'br',\n 'b',\n 'i',\n 'label',\n 'hr',\n 'area',\n 'base',\n 'basefont',\n 'input',\n 'img',\n 'link',\n 'meta',\n];\n\nexport const NON_TOP_CANDIDATE_TAGS_RE = new RegExp(\n `^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`,\n 'i'\n);\n\n// A list of selectors that specify, very clearly, either hNews or other\n// very content-specific style content, like Blogger templates.\n// More examples here: http://microformats.org/wiki/blog-post-formats\nexport const HNEWS_CONTENT_SELECTORS = [\n ['.hentry', '.entry-content'],\n ['entry', '.entry-content'],\n ['.entry', '.entry_content'],\n ['.post', '.postbody'],\n ['.post', '.post_body'],\n ['.post', '.post-body'],\n];\n\nexport const PHOTO_HINTS = ['figure', 'photo', 'image', 'caption'];\nexport const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. 
Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const POSITIVE_SCORE_HINTS = [\n 'article',\n 'articlecontent',\n 'instapaper_body',\n 'blog',\n 'body',\n 'content',\n 'entry-content-asset',\n 'entry',\n 'hentry',\n 'main',\n 'Normal',\n 'page',\n 'pagination',\n 'permalink',\n 'post',\n 'story',\n 'text',\n '[-_]copy', // usatoday\n '\\\\Bcopy',\n];\n\n// The above list, joined into a matching regular expression\nexport const POSITIVE_SCORE_RE = new RegExp(\n POSITIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// Readability publisher-specific guidelines\nexport const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const NEGATIVE_SCORE_HINTS = [\n 'adbox',\n 'advert',\n 'author',\n 'bio',\n 'bookmark',\n 'bottom',\n 'byline',\n 'clear',\n 'com-',\n 'combx',\n 'comment',\n 'comment\\\\B',\n 'contact',\n 'copy',\n 'credit',\n 'crumb',\n 'date',\n 'deck',\n 'excerpt',\n 'featured', // tnr.com has a featured_content which throws us off\n 'foot',\n 'footer',\n 'footnote',\n 'graf',\n 'head',\n 'info',\n 'infotext', // newscientist.com copyright\n 'instapaper_ignore',\n 'jump',\n 'linebreak',\n 'link',\n 'masthead',\n 'media',\n 'meta',\n 'modal',\n 'outbrain', // slate.com junk\n 'promo',\n 'pr_', // autoblog - press release\n 'related',\n 'respond',\n 'roundcontent', // lifehacker restricted content warning\n 'scroll',\n 'secondary',\n 'share',\n 'shopping',\n 'shoutbox',\n 'side',\n 'sidebar',\n 'sponsor',\n 'stamp',\n 'sub',\n 'summary',\n 'tags',\n 'tools',\n 'widget',\n];\n// The above list, joined into a matching regular expression\nexport const NEGATIVE_SCORE_RE = new RegExp(\n NEGATIVE_SCORE_HINTS.join('|'),\n 'i'\n);\n\n// Match a digit. 
Pretty clear.\nexport const DIGIT_RE = new RegExp('[0-9]');\n\n// Match 2 or more consecutive <br> tags\nexport const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \\n\\r\\t]*){2,}', 'i');\n\n// Match 1 BR tag.\nexport const BR_TAG_RE = new RegExp('<br[^>]*>', 'i');\n\n// A list of all of the block level tags known in HTML5 and below. Taken from\n// http://bit.ly/qneNIT\nexport const BLOCK_LEVEL_TAGS = [\n 'article',\n 'aside',\n 'blockquote',\n 'body',\n 'br',\n 'button',\n 'canvas',\n 'caption',\n 'col',\n 'colgroup',\n 'dd',\n 'div',\n 'dl',\n 'dt',\n 'embed',\n 'fieldset',\n 'figcaption',\n 'figure',\n 'footer',\n 'form',\n 'h1',\n 'h2',\n 'h3',\n 'h4',\n 'h5',\n 'h6',\n 'header',\n 'hgroup',\n 'hr',\n 'li',\n 'map',\n 'object',\n 'ol',\n 'output',\n 'p',\n 'pre',\n 'progress',\n 'section',\n 'table',\n 'tbody',\n 'textarea',\n 'tfoot',\n 'th',\n 'thead',\n 'tr',\n 'ul',\n 'video',\n];\nexport const BLOCK_LEVEL_TAGS_RE = new RegExp(\n `^(${BLOCK_LEVEL_TAGS.join('|')})$`,\n 'i'\n);\n\n// The removal is implemented as a blacklist and whitelist, this test finds\n// blacklisted elements that aren't whitelisted. 
We do this all in one\n// expression-both because it's only one pass, and because this skips the\n// serialization for whitelisted nodes.\nconst candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');\nexport const CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');\n\nconst candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');\nexport const CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');\n\nexport const UNLIKELY_RE = new RegExp(\n `!(${candidatesWhitelist})|(${candidatesBlacklist})`,\n 'i'\n);\n\nexport const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i');\nexport const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i');\nexport const BAD_TAGS = new RegExp('^(address|form)$', 'i');\n\nexport const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i');\n","// return 1 for every comma in text\nexport default function scoreCommas(text) {\n return (text.match(/,/g) || []).length;\n}\n","// Given a node type to search for, and a list of regular expressions,\n// look to see if this extraction can be found in the URL. 
Expects\n// that each expression in r_list will return group(1) as the proper\n// string to be cleaned.\n// Only used for date_published currently.\nexport default function extractFromUrl(url, regexList) {\n const matchRe = regexList.find(re => re.test(url));\n if (matchRe) {\n return matchRe.exec(url)[1];\n }\n\n return null;\n}\n","// An expression that looks to try to find the page digit within a URL, if\n// it exists.\n// Matches:\n// page=1\n// pg=1\n// p=1\n// paging=12\n// pag=7\n// pagination/1\n// paging/88\n// pa/83\n// p/11\n//\n// Does not match:\n// pg=102\n// page:2\nexport const PAGE_IN_HREF_RE = new RegExp(\n '(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})',\n 'i'\n);\n\nexport const HAS_ALPHA_RE = /[a-z]/i;\n\nexport const IS_ALPHA_RE = /^[a-z]+$/i;\nexport const IS_DIGIT_RE = /^[0-9]+$/i;\n\nexport const ENCODING_RE = /charset=([\\w-]+)\\b/;\nexport const DEFAULT_ENCODING = 'utf-8';\n","// Given a string, return True if it appears to have an ending sentence\n// within it, false otherwise.\nconst SENTENCE_END_RE = new RegExp('.( |$)');\nexport default function hasSentenceEnd(text) {\n return SENTENCE_END_RE.test(text);\n}\n","// Scoring\nexport { default as getWeight } from './get-weight';\nexport { default as getScore } from './get-score';\nexport { default as scoreCommas } from './score-commas';\nexport { default as scoreLength } from './score-length';\nexport { default as scoreParagraph } from './score-paragraph';\nexport { default as setScore } from './set-score';\nexport { default as addScore } from './add-score';\nexport { default as addToParent } from './add-to-parent';\nexport { default as getOrInitScore } from './get-or-init-score';\nexport { default as scoreNode } from './score-node';\nexport { default as scoreContent } from './score-content';\nexport { default as findTopCandidate } from './find-top-candidate';\n","import URL from 'url';\n\nimport { getAttrs, setAttr } from 'utils/dom';\n\nfunction absolutize($, 
rootUrl, attr) {\n const baseUrl = $('base').attr('href');\n\n $(`[${attr}]`).each((_, node) => {\n const attrs = getAttrs(node);\n const url = attrs[attr];\n if (!url) return;\n const absoluteUrl = URL.resolve(baseUrl || rootUrl, url);\n\n setAttr(node, attr, absoluteUrl);\n });\n}\n\nfunction absolutizeSet($, rootUrl, $content) {\n $('[srcset]', $content).each((_, node) => {\n const attrs = getAttrs(node);\n const urlSet = attrs.srcset;\n\n if (urlSet) {\n // a comma should be considered part of the candidate URL unless preceded by a descriptor\n // descriptors can only contain positive numbers followed immediately by either 'w' or 'x'\n // space characters inside the URL should be encoded (%20 or +)\n const candidates = urlSet.match(\n /(?:\\s*)(\\S+(?:\\s*[\\d.]+[wx])?)(?:\\s*,\\s*)?/g\n );\n if (!candidates) return;\n const absoluteCandidates = candidates.map(candidate => {\n // a candidate URL cannot start or end with a comma\n // descriptors are separated from the URLs by unescaped whitespace\n const parts = candidate\n .trim()\n .replace(/,$/, '')\n .split(/\\s+/);\n parts[0] = URL.resolve(rootUrl, parts[0]);\n return parts.join(' ');\n });\n const absoluteUrlSet = [...new Set(absoluteCandidates)].join(', ');\n setAttr(node, 'srcset', absoluteUrlSet);\n }\n });\n}\n\nexport default function makeLinksAbsolute($content, $, url) {\n ['href', 'src'].forEach(attr => absolutize($, url, attr));\n absolutizeSet($, url, $content);\n\n return $content;\n}\n","// strips all tags from a string of text\nexport default function stripTags(text, $) {\n // Wrapping text in html element prevents errors when text\n // has no html\n const cleanText = $(`<span>${text}</span>`).text();\n return cleanText === '' ? 
text : cleanText;\n}\n","// Given a node, determine if it's article-like enough to return\n// param: node (a cheerio node)\n// return: boolean\n\nexport default function nodeIsSufficient($node) {\n return $node.text().trim().length >= 100;\n}\n","export default function getAttrs(node) {\n const { attribs, attributes } = node;\n\n if (!attribs && attributes) {\n const attrs = Reflect.ownKeys(attributes).reduce((acc, index) => {\n const attr = attributes[index];\n\n if (!attr.name || !attr.value) return acc;\n\n acc[attr.name] = attr.value;\n return acc;\n }, {});\n return attrs;\n }\n\n return attribs;\n}\n","export default function setAttr(node, attr, val) {\n if (node.attribs) {\n node.attribs[attr] = val;\n } else if (node.attributes) {\n node.setAttribute(attr, val);\n }\n\n return node;\n}\n","// DOM manipulation\nexport {\n default as stripUnlikelyCandidates,\n} from './strip-unlikely-candidates';\nexport { default as brsToPs } from './brs-to-ps';\nexport { default as paragraphize } from './paragraphize';\nexport { default as convertToParagraphs } from './convert-to-paragraphs';\nexport { default as convertNodeTo } from './convert-node-to';\nexport { default as cleanImages } from './clean-images';\nexport { default as markToKeep } from './mark-to-keep';\nexport { default as stripJunkTags } from './strip-junk-tags';\nexport { default as cleanHOnes } from './clean-h-ones';\nexport { default as cleanAttributes } from './clean-attributes';\nexport { default as removeEmpty } from './remove-empty';\nexport { default as cleanTags } from './clean-tags';\nexport { default as cleanHeaders } from './clean-headers';\nexport { default as rewriteTopLevel } from './rewrite-top-level';\nexport { default as makeLinksAbsolute } from './make-links-absolute';\nexport { textLength, linkDensity } from './link-density';\nexport { default as extractFromMeta } from './extract-from-meta';\nexport { default as extractFromSelectors } from './extract-from-selectors';\nexport { default as 
stripTags } from './strip-tags';\nexport { default as withinComment } from './within-comment';\nexport { default as nodeIsSufficient } from './node-is-sufficient';\nexport { default as isWordpress } from './is-wordpress';\nexport { default as getAttrs } from './get-attrs';\nexport { default as setAttr } from './set-attr';\nexport { default as setAttrs } from './set-attrs';\n","export default function insertValues(strings, ...values) {\n if (values.length) {\n return strings.reduce((result, part, idx) => {\n let value = values[idx];\n\n if (value && typeof value.toString === 'function') {\n value = value.toString();\n } else {\n value = '';\n }\n\n return result + part + value;\n }, '');\n }\n\n return strings.join('');\n}\n","import insertValues from './insert-values';\n\nconst bodyPattern = /^\\n([\\s\\S]+)\\s{2}$/gm;\nconst trailingWhitespace = /\\s+$/;\n\nexport default function template(strings, ...values) {\n const compiled = insertValues(strings, ...values);\n let [body] = compiled.match(bodyPattern) || [];\n let indentLevel = /^\\s{0,4}(.+)$/g;\n\n if (!body) {\n body = compiled;\n indentLevel = /^\\s{0,2}(.+)$/g;\n }\n\n return body\n .split('\\n')\n .slice(1)\n .map(line => {\n line = line.replace(indentLevel, '$1');\n\n if (trailingWhitespace.test(line)) {\n line = line.replace(trailingWhitespace, '');\n }\n\n return line;\n })\n .join('\\n');\n}\n","import template from './index';\n\nexport default function(hostname, name) {\n return template`\n export const ${name} = {\n domain: '${hostname}',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n 
// before it's consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n `;\n}\n","import template from './index';\n\nconst IGNORE = [\n 'url',\n 'domain',\n 'content',\n 'word_count',\n 'next_page_url',\n 'excerpt',\n 'direction',\n 'total_pages',\n 'rendered_pages',\n];\n\nfunction testFor(key, value, dir) {\n if (IGNORE.find(k => k === key)) return '';\n\n return template`\n it('returns the ${key}', async () => {\n // To pass this test, fill out the ${key} selector\n // in ${dir}/index.js.\n const { ${key} } = await result\n\n // Update these values with the expected values from\n // the article.\n assert.equal(${key}, ${value ? `\\`${value}\\`` : \"''\"})\n });\n `;\n}\n\nexport default function(file, url, dir, result, name) {\n return template`\n import assert from 'assert';\n import URL from 'url';\n import cheerio from 'cheerio';\n\n import Parser from 'mercury';\n import getExtractor from 'extractors/get-extractor';\n import { excerptContent } from 'utils/text';\n\n const fs = require('fs');\n\n describe('${name}', () => {\n describe('initial test case', () => {\n let result;\n let url;\n beforeAll(() => {\n url =\n '${url}';\n const html =\n fs.readFileSync('${file}');\n result =\n Parser.parse(url, { html, fallback: false });\n });\n\n it('is selected properly', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ${Reflect.ownKeys(result)\n .map(k => testFor(k, result[k], dir))\n .join('\\n\\n')}\n\n it('returns the content', async () => {\n // To pass this test, fill out the content selector\n // in ${dir}/index.js.\n // You may also want to make use of the clean and 
transform\n // options.\n const { content } = await result;\n\n const $ = cheerio.load(content || '');\n\n const first13 = excerptContent($('*').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, 'Add the first 13 words of the article here');\n });\n });\n });\n `;\n}\n","/* eslint-disable import/no-extraneous-dependencies */\n/* eslint-disable no-use-before-define */\n/* eslint-disable no-console */\nimport fs from 'fs';\nimport URL from 'url';\nimport inquirer from 'inquirer';\nimport ora from 'ora';\nimport { exec } from 'child_process';\n\nimport { stripJunkTags, makeLinksAbsolute } from 'utils/dom';\nimport Parser from '../dist/mercury';\nimport extractorTemplate from './templates/custom-extractor';\nimport extractorTestTemplate from './templates/custom-extractor-test';\n\nconst questions = [\n {\n type: 'input',\n name: 'website',\n message:\n \"Paste a url to an article you'd like to create or extend a parser for:\",\n validate(value) {\n const { hostname } = URL.parse(value);\n if (hostname) return true;\n\n return false;\n },\n },\n];\nlet spinner;\n\nfunction confirm(fn, args, msg, newParser) {\n spinner = ora({ text: msg });\n spinner.start();\n const result = fn(...args);\n\n if (result && result.then) {\n result.then(r => savePage(r, args, newParser));\n } else {\n spinner.succeed();\n }\n\n return result;\n}\n\nfunction confirmCreateDir(dir, msg) {\n if (!fs.existsSync(dir)) {\n confirm(fs.mkdirSync, [dir], msg);\n }\n}\n\nfunction getDir(url) {\n const { hostname } = URL.parse(url);\n return `./src/extractors/custom/${hostname}`;\n}\n\nfunction scaffoldCustomParser(url) {\n const dir = getDir(url);\n const { hostname } = URL.parse(url);\n let newParser = false;\n\n if (!fs.existsSync(dir)) {\n newParser = true;\n confirmCreateDir(dir, `Creating ${hostname} directory`);\n confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');\n }\n\n confirm(Parser.fetchResource, 
[url], 'Fetching fixture', newParser);\n}\n\n// if has arg, just assume that arg is a url and skip prmopt\nconst urlArg = process.argv[2];\nif (urlArg) {\n scaffoldCustomParser(urlArg);\n} else {\n inquirer.prompt(questions).then(answers => {\n scaffoldCustomParser(answers.website);\n });\n}\n\nfunction generateScaffold(url, file, result) {\n const { hostname } = URL.parse(url);\n const extractor = extractorTemplate(hostname, extractorName(hostname));\n const extractorTest = extractorTestTemplate(\n file,\n url,\n getDir(url),\n result,\n extractorName(hostname)\n );\n\n fs.writeFileSync(`${getDir(url)}/index.js`, extractor);\n fs.writeFileSync(`${getDir(url)}/index.test.js`, extractorTest);\n fs.appendFileSync('./src/extractors/custom/index.js', exportString(url));\n exec(`npm run lint-fix-quiet -- ${getDir(url)}/*.js`);\n}\n\nfunction savePage($, [url], newParser) {\n const { hostname } = URL.parse(url);\n\n spinner.succeed();\n\n const filename = new Date().getTime();\n const file = `./fixtures/${hostname}/${filename}.html`;\n // fix http(s) relative links:\n makeLinksAbsolute($('*').first(), $, url);\n $('[src], [href]').each((index, node) => {\n const $node = $(node);\n const link = $node.attr('src');\n if (link && link.slice(0, 2) === '//') {\n $node.attr('src', `http:${link}`);\n }\n });\n const html = stripJunkTags($('*').first(), $, ['script']).html();\n\n fs.writeFileSync(file, html);\n\n Parser.parse(url, { html }).then(result => {\n if (newParser) {\n confirm(\n generateScaffold,\n [url, file, result],\n 'Generating parser and tests'\n );\n console.log(`Your custom site extractor has been set up. To get started building it, run\n yarn watch:test -- ${hostname}\n -- OR --\n npm run watch:test -- ${hostname}`);\n } else {\n console.log(`\n It looks like you already have a custom parser for this url.\n The page you linked to has been added to ${file}. 
Copy and paste\n the following code to use that page in your tests:\n const html = fs.readFileSync('${file}');`);\n }\n });\n}\n\nfunction exportString(url) {\n const { hostname } = URL.parse(url);\n return `export * from './${hostname}';`;\n}\n\nfunction extractorName(hostname) {\n const name = hostname\n .split('.')\n .map(w => `${w.charAt(0).toUpperCase()}${w.slice(1)}`)\n .join('');\n return `${name}Extractor`;\n}\n"],"names":["KEEP_CLASS","STRIP_OUTPUT_TAGS","stripJunkTags","article","$","tags","length","join","not","remove","absolutize","rootUrl","attr","baseUrl","each","_","node","attrs","getAttrs","url","absoluteUrl","URL","resolve","setAttr","absolutizeSet","$content","urlSet","srcset","candidates","match","absoluteCandidates","map","candidate","parts","trim","replace","split","absoluteUrlSet","makeLinksAbsolute","forEach","attribs","attributes","reduce","acc","index","name","value","val","setAttribute","insertValues","strings","values","result","part","idx","toString","bodyPattern","trailingWhitespace","template","compiled","body","indentLevel","slice","line","test","hostname","IGNORE","testFor","key","dir","find","k","file","questions","type","message","validate","parse","spinner","confirm","fn","args","msg","newParser","ora","text","start","then","r","savePage","succeed","confirmCreateDir","fs","existsSync","mkdirSync","getDir","scaffoldCustomParser","Parser","fetchResource","urlArg","process","argv","inquirer","prompt","answers","website","generateScaffold","extractor","extractorTemplate","extractorName","extractorTest","extractorTestTemplate","writeFileSync","appendFileSync","exportString","exec","filename","Date","getTime","first","$node","link","html","console","log","w","charAt","toUpperCase"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AACA,AAGA;;AACA,AAAO,IAAMA,UAAU,GAAG,qBAAnB;AAEP;AAUA,AAAO,IAAMC,iBAAiB,GAAG,CAC/B,OAD+B,EAE/B,QAF+B,EAG/B,UAH+B,EAI/B,MAJ+B,EAK/B,OAL+B,EAM/B,IAN+B,EAO/B,OAP+B,EAQ/B,QAR+B,EAS/B,QAT+B,CAA1B;;ACfQ,SAASC,
aAAT,CAAuBC,OAAvB,EAAgCC,CAAhC,EAA8C;MAAXC,IAAW,uEAAJ,EAAI;;MACvDA,IAAI,CAACC,MAAL,KAAgB,CAApB,EAAuB;IACrBD,IAAI,GAAGJ,iBAAP;GAFyD;;;;EAO3DG,CAAC,CAACC,IAAI,CAACE,IAAL,CAAU,GAAV,CAAD,EAAiBJ,OAAjB,CAAD,CACGK,GADH,YACWR,UADX,GAEGS,MAFH;SAIOL,CAAP;;;ACbF;;ACAA;;ACAA;;ACAA;;ACAA;;ACAA;;ACIA,SAASM,UAAT,CAAoBN,CAApB,EAAuBO,OAAvB,EAAgCC,IAAhC,EAAsC;MAC9BC,OAAO,GAAGT,CAAC,CAAC,MAAD,CAAD,CAAUQ,IAAV,CAAe,MAAf,CAAhB;EAEAR,CAAC,YAAKQ,IAAL,OAAD,CAAeE,IAAf,CAAoB,UAACC,CAAD,EAAIC,IAAJ,EAAa;QACzBC,KAAK,GAAGC,QAAQ,CAACF,IAAD,CAAtB;QACMG,GAAG,GAAGF,KAAK,CAACL,IAAD,CAAjB;QACI,CAACO,GAAL,EAAU;QACJC,WAAW,GAAGC,KAAG,CAACC,OAAJ,CAAYT,OAAO,IAAIF,OAAvB,EAAgCQ,GAAhC,CAApB;IAEAI,OAAO,CAACP,IAAD,EAAOJ,IAAP,EAAaQ,WAAb,CAAP;GANF;;;AAUF,SAASI,aAAT,CAAuBpB,CAAvB,EAA0BO,OAA1B,EAAmCc,QAAnC,EAA6C;EAC3CrB,CAAC,CAAC,UAAD,EAAaqB,QAAb,CAAD,CAAwBX,IAAxB,CAA6B,UAACC,CAAD,EAAIC,IAAJ,EAAa;QAClCC,KAAK,GAAGC,QAAQ,CAACF,IAAD,CAAtB;QACMU,MAAM,GAAGT,KAAK,CAACU,MAArB;;QAEID,MAAJ,EAAY;;;;UAIJE,UAAU,GAAGF,MAAM,CAACG,KAAP,CACjB,6CADiB,CAAnB;UAGI,CAACD,UAAL,EAAiB;UACXE,kBAAkB,GAAGF,UAAU,CAACG,GAAX,CAAe,UAAAC,SAAS,EAAI;;;YAG/CC,KAAK,GAAGD,SAAS,CACpBE,IADW,GAEXC,OAFW,CAEH,IAFG,EAEG,EAFH,EAGXC,KAHW,CAGL,KAHK,CAAd;QAIAH,KAAK,CAAC,CAAD,CAAL,GAAWZ,KAAG,CAACC,OAAJ,CAAYX,OAAZ,EAAqBsB,KAAK,CAAC,CAAD,CAA1B,CAAX;eACOA,KAAK,CAAC1B,IAAN,CAAW,GAAX,CAAP;OARyB,CAA3B;;UAUM8B,cAAc,GAAG,mBAAI,QAAQP,kBAAR,CAAJ,EAAiCvB,IAAjC,CAAsC,IAAtC,CAAvB;;MACAgB,OAAO,CAACP,IAAD,EAAO,QAAP,EAAiBqB,cAAjB,CAAP;;GAvBJ;;;AA4BF,AAAe,SAASC,oBAAT,CAA2Bb,QAA3B,EAAqCrB,CAArC,EAAwCe,GAAxC,EAA6C;GACzD,MAAD,EAAS,KAAT,EAAgBoB,OAAhB,CAAwB,UAAA3B,IAAI;WAAIF,UAAU,CAACN,CAAD,EAAIe,GAAJ,EAASP,IAAT,CAAd;GAA5B;EACAY,aAAa,CAACpB,CAAD,EAAIe,GAAJ,EAASM,QAAT,CAAb;SAEOA,QAAP;;;AClDF;;ACAA;;ACAe,SAASP,QAAT,CAAkBF,IAAlB,EAAwB;MAC7BwB,OAD6B,GACLxB,IADK,CAC7BwB,OAD6B;MACpBC,UADoB,GACLzB,IADK,CACpByB,UADoB;;MAGjC,CAACD,OAAD,IAAYC,UAAhB,EAA4B;QACpBxB,KAAK,GAAG,iBAAgBwB,UAAhB,EAA4BC,MAA5B,CAAmC,UAACC,GAAD,EAAMC,KAAN,EAAgB;UACzDhC,IAAI,GAAG6B,UAAU,CAACG,KAAD,CAAvB;UAEI,CAAChC,IAAI,CAA
CiC,IAAN,IAAc,CAACjC,IAAI,CAACkC,KAAxB,EAA+B,OAAOH,GAAP;MAE/BA,GAAG,CAAC/B,IAAI,CAACiC,IAAN,CAAH,GAAiBjC,IAAI,CAACkC,KAAtB;aACOH,GAAP;KANY,EAOX,EAPW,CAAd;;WAQO1B,KAAP;;;SAGKuB,OAAP;;;ACfa,SAASjB,OAAT,CAAiBP,IAAjB,EAAuBJ,IAAvB,EAA6BmC,GAA7B,EAAkC;MAC3C/B,IAAI,CAACwB,OAAT,EAAkB;IAChBxB,IAAI,CAACwB,OAAL,CAAa5B,IAAb,IAAqBmC,GAArB;GADF,MAEO,IAAI/B,IAAI,CAACyB,UAAT,EAAqB;IAC1BzB,IAAI,CAACgC,YAAL,CAAkBpC,IAAlB,EAAwBmC,GAAxB;;;SAGK/B,IAAP;;;ACPF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;ACAe,SAASiC,YAAT,CAAsBC,OAAtB,EAA0C;oCAARC,MAAQ;IAARA,MAAQ;;;MACnDA,MAAM,CAAC7C,MAAX,EAAmB;WACV4C,OAAO,CAACR,MAAR,CAAe,UAACU,MAAD,EAASC,IAAT,EAAeC,GAAf,EAAuB;UACvCR,KAAK,GAAGK,MAAM,CAACG,GAAD,CAAlB;;UAEIR,KAAK,IAAI,OAAOA,KAAK,CAACS,QAAb,KAA0B,UAAvC,EAAmD;QACjDT,KAAK,GAAGA,KAAK,CAACS,QAAN,EAAR;OADF,MAEO;QACLT,KAAK,GAAG,EAAR;;;aAGKM,MAAM,GAAGC,IAAT,GAAgBP,KAAvB;KATK,EAUJ,EAVI,CAAP;;;SAaKI,OAAO,CAAC3C,IAAR,CAAa,EAAb,CAAP;;;ACbF,IAAMiD,WAAW,GAAG,sBAApB;AACA,IAAMC,kBAAkB,GAAG,MAA3B;AAEA,AAAe,SAASC,QAAT,CAAkBR,OAAlB,EAAsC;oCAARC,MAAQ;IAARA,MAAQ;;;MAC7CQ,QAAQ,GAAGV,YAAY,MAAZ,UAAaC,OAAb,SAAyBC,MAAzB,EAAjB;;aACaQ,QAAQ,CAAC9B,KAAT,CAAe2B,WAAf,KAA+B,EAFO;;MAE9CI,IAF8C;;MAG/CC,WAAW,GAAG,gBAAlB;;MAEI,CAACD,IAAL,EAAW;IACTA,IAAI,GAAGD,QAAP;IACAE,WAAW,GAAG,gBAAd;;;SAGKD,IAAI,CACRxB,KADI,CACE,IADF,EAEJ0B,KAFI,CAEE,CAFF,EAGJ/B,GAHI,CAGA,UAAAgC,IAAI,EAAI;IACXA,IAAI,GAAGA,IAAI,CAAC5B,OAAL,CAAa0B,WAAb,EAA0B,IAA1B,CAAP;;QAEIJ,kBAAkB,CAACO,IAAnB,CAAwBD,IAAxB,CAAJ,EAAmC;MACjCA,IAAI,GAAGA,IAAI,CAAC5B,OAAL,CAAasB,kBAAb,EAAiC,EAAjC,CAAP;;;WAGKM,IAAP;GAVG,EAYJxD,IAZI,CAYC,IAZD,CAAP;;;;;;;;;;;;ACba,4BAAS0D,QAAT,EAAmBpB,IAAnB,EAAyB;SAC/Ba,QAAP,oBACiBb,IADjB,EAEeoB,QAFf;;;;;;;;;;;;;;;;;;;;;;ACDF,IAAMC,MAAM,GAAG,CACb,KADa,EAEb,QAFa,EAGb,SAHa,EAIb,YAJa,EAKb,eALa,EAMb,SANa,EAOb,WAPa,EAQb,aARa,EASb,gBATa,CAAf;;AAYA,SAASC,OAAT,C
AAiBC,GAAjB,EAAsBtB,KAAtB,EAA6BuB,GAA7B,EAAkC;MAC5BH,MAAM,CAACI,IAAP,CAAY,UAAAC,CAAC;WAAIA,CAAC,KAAKH,GAAV;GAAb,CAAJ,EAAiC,OAAO,EAAP;SAE1BV,QAAP,sBACkBU,GADlB,EAE+CA,GAF/C,EAGkBC,GAHlB,EAIoBD,GAJpB,EAQyBA,GARzB,EAQiCtB,KAAK,cAAQA,KAAR,SAAoB,IAR1D;;;AAaF,AAAe,gCAAS0B,IAAT,EAAerD,GAAf,EAAoBkD,GAApB,EAAyBjB,MAAzB,EAAiCP,IAAjC,EAAuC;SAC7Ca,QAAP,qBAWcb,IAXd,EAiBa1B,GAjBb,EAmB6BqD,IAnB7B,EAgCU,iBAAgBpB,MAAhB,EACCrB,GADD,CACK,UAAAwC,CAAC;WAAIJ,OAAO,CAACI,CAAD,EAAInB,MAAM,CAACmB,CAAD,CAAV,EAAeF,GAAf,CAAX;GADN,EAEC9D,IAFD,CAEM,MAFN,CAhCV,EAsCgB8D,GAtChB;;;ACjBF,IAAMI,SAAS,GAAG,CAChB;EACEC,IAAI,EAAE,OADR;EAEE7B,IAAI,EAAE,SAFR;EAGE8B,OAAO,EACL,wEAJJ;EAKEC,QALF,oBAKW9B,KALX,EAKkB;qBACOzB,KAAG,CAACwD,KAAJ,CAAU/B,KAAV,CADP;QACNmB,QADM,cACNA,QADM;;QAEVA,QAAJ,EAAc,OAAO,IAAP;WAEP,KAAP;;CAVY,CAAlB;AAcA,IAAIa,OAAJ;;AAEA,SAASC,OAAT,CAAiBC,EAAjB,EAAqBC,IAArB,EAA2BC,GAA3B,EAAgCC,SAAhC,EAA2C;EACzCL,OAAO,GAAGM,GAAG,CAAC;IAAEC,IAAI,EAAEH;GAAT,CAAb;EACAJ,OAAO,CAACQ,KAAR;MACMlC,MAAM,GAAG4B,EAAE,MAAF,4BAAMC,IAAN,EAAf;;MAEI7B,MAAM,IAAIA,MAAM,CAACmC,IAArB,EAA2B;IACzBnC,MAAM,CAACmC,IAAP,CAAY,UAAAC,CAAC;aAAIC,QAAQ,CAACD,CAAD,EAAIP,IAAJ,EAAUE,SAAV,CAAZ;KAAb;GADF,MAEO;IACLL,OAAO,CAACY,OAAR;;;SAGKtC,MAAP;;;AAGF,SAASuC,gBAAT,CAA0BtB,GAA1B,EAA+Ba,GAA/B,EAAoC;MAC9B,CAACU,EAAE,CAACC,UAAH,CAAcxB,GAAd,CAAL,EAAyB;IACvBU,OAAO,CAACa,EAAE,CAACE,SAAJ,EAAe,CAACzB,GAAD,CAAf,EAAsBa,GAAtB,CAAP;;;;AAIJ,SAASa,MAAT,CAAgB5E,GAAhB,EAAqB;oBACEE,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CADF;MACX8C,QADW,eACXA,QADW;;2CAEeA,QAAlC;;;AAGF,SAAS+B,oBAAT,CAA8B7E,GAA9B,EAAmC;MAC3BkD,GAAG,GAAG0B,MAAM,CAAC5E,GAAD,CAAlB;;oBACqBE,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CAFY;MAEzB8C,QAFyB,eAEzBA,QAFyB;;MAG7BkB,SAAS,GAAG,KAAhB;;MAEI,CAACS,EAAE,CAACC,UAAH,CAAcxB,GAAd,CAAL,EAAyB;IACvBc,SAAS,GAAG,IAAZ;IACAQ,gBAAgB,CAACtB,GAAD,qBAAkBJ,QAAlB,gBAAhB;IACA0B,gBAAgB,sBAAe1B,QAAf,GAA2B,6BAA3B,CAAhB;;;EAGFc,OAAO,CAACkB,OAAM,CAACC,aAAR,EAAuB,CAAC/E,GAAD,CAAvB,EAA8B,kBAA9B,EAAkDgE,SAAlD,CAAP;;;;AAIF,IAAMgB,MAAM,GAAGC,OAAO,CAACC,IAAR,CAAa,CAAb,CAAf;;AACA,IAAIF,MAAJ,EAAY;EA
CVH,oBAAoB,CAACG,MAAD,CAApB;CADF,MAEO;EACLG,QAAQ,CAACC,MAAT,CAAgB9B,SAAhB,EAA2Bc,IAA3B,CAAgC,UAAAiB,OAAO,EAAI;IACzCR,oBAAoB,CAACQ,OAAO,CAACC,OAAT,CAApB;GADF;;;AAKF,SAASC,gBAAT,CAA0BvF,GAA1B,EAA+BqD,IAA/B,EAAqCpB,MAArC,EAA6C;oBACtB/B,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CADsB;MACnC8C,QADmC,eACnCA,QADmC;;MAErC0C,SAAS,GAAGC,iBAAiB,CAAC3C,QAAD,EAAW4C,aAAa,CAAC5C,QAAD,CAAxB,CAAnC;MACM6C,aAAa,GAAGC,qBAAqB,CACzCvC,IADyC,EAEzCrD,GAFyC,EAGzC4E,MAAM,CAAC5E,GAAD,CAHmC,EAIzCiC,MAJyC,EAKzCyD,aAAa,CAAC5C,QAAD,CAL4B,CAA3C;EAQA2B,EAAE,CAACoB,aAAH,WAAoBjB,MAAM,CAAC5E,GAAD,CAA1B,gBAA4CwF,SAA5C;EACAf,EAAE,CAACoB,aAAH,WAAoBjB,MAAM,CAAC5E,GAAD,CAA1B,qBAAiD2F,aAAjD;EACAlB,EAAE,CAACqB,cAAH,CAAkB,kCAAlB,EAAsDC,YAAY,CAAC/F,GAAD,CAAlE;EACAgG,kBAAI,qCAA8BpB,MAAM,CAAC5E,GAAD,CAApC,WAAJ;;;AAGF,SAASsE,QAAT,CAAkBrF,CAAlB,QAA4B+E,SAA5B,EAAuC;;MAAjBhE,GAAiB;;oBAChBE,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CADgB;MAC7B8C,QAD6B,eAC7BA,QAD6B;;EAGrCa,OAAO,CAACY,OAAR;MAEM0B,QAAQ,GAAG,IAAIC,IAAJ,GAAWC,OAAX,EAAjB;MACM9C,IAAI,wBAAiBP,QAAjB,cAA6BmD,QAA7B,UAAV,CANqC;;EAQrC9E,oBAAiB,CAAClC,CAAC,CAAC,GAAD,CAAD,CAAOmH,KAAP,EAAD,EAAiBnH,CAAjB,EAAoBe,GAApB,CAAjB;EACAf,CAAC,CAAC,eAAD,CAAD,CAAmBU,IAAnB,CAAwB,UAAC8B,KAAD,EAAQ5B,IAAR,EAAiB;QACjCwG,KAAK,GAAGpH,CAAC,CAACY,IAAD,CAAf;QACMyG,IAAI,GAAGD,KAAK,CAAC5G,IAAN,CAAW,KAAX,CAAb;;QACI6G,IAAI,IAAIA,IAAI,CAAC3D,KAAL,CAAW,CAAX,EAAc,CAAd,MAAqB,IAAjC,EAAuC;MACrC0D,KAAK,CAAC5G,IAAN,CAAW,KAAX,iBAA0B6G,IAA1B;;GAJJ;MAOMC,IAAI,GAAGxH,aAAa,CAACE,CAAC,CAAC,GAAD,CAAD,CAAOmH,KAAP,EAAD,EAAiBnH,CAAjB,EAAoB,CAAC,QAAD,CAApB,CAAb,CAA6CsH,IAA7C,EAAb;EAEA9B,EAAE,CAACoB,aAAH,CAAiBxC,IAAjB,EAAuBkD,IAAvB;EAEAzB,OAAM,CAACpB,KAAP,CAAa1D,GAAb,EAAkB;IAAEuG,IAAI,EAAJA;GAApB,EAA4BnC,IAA5B,CAAiC,UAAAnC,MAAM,EAAI;QACrC+B,SAAJ,EAAe;MACbJ,OAAO,CACL2B,gBADK,EAEL,CAACvF,GAAD,EAAMqD,IAAN,EAAYpB,MAAZ,CAFK,EAGL,6BAHK,CAAP;MAKAuE,OAAO,CAACC,GAAR,iHACqB3D,QADrB,6DAGwBA,QAHxB;KANF,MAUO;MACL0D,OAAO,CAACC,GAAR,wHAEuCpD,IAFvC,qHAI4BA,IAJ5B;;GAZJ;;;AAqBF,SAAS0C,YAAT,CAAsB/F,GAAtB,EAA2B;oBACJE,KAAG,CAACwD,KAAJ,CAAU1D,GAAV,CADI;MACjB
8C,QADiB,eACjBA,QADiB;;oCAEEA,QAA3B;;;AAGF,SAAS4C,aAAT,CAAuB5C,QAAvB,EAAiC;MACzBpB,IAAI,GAAGoB,QAAQ,CAClB7B,KADU,CACJ,GADI,EAEVL,GAFU,CAEN,UAAA8F,CAAC;qBAAOA,CAAC,CAACC,MAAF,CAAS,CAAT,EAAYC,WAAZ,EAAP,SAAmCF,CAAC,CAAC/D,KAAF,CAAQ,CAAR,CAAnC;GAFK,EAGVvD,IAHU,CAGL,EAHK,CAAb;mBAIUsC,IAAV;"}
package/dist/mercury.js CHANGED
@@ -362,7 +362,7 @@ var KEEP_CLASS = 'mercury-parser-keep';
362
362
  var KEEP_SELECTORS = ['iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]']; // A list of tags to strip from the output if we encounter them.
363
363
 
364
364
  var STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object']; // cleanAttributes
365
- var WHITELIST_ATTRS = ['src', 'srcset', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
365
+ var WHITELIST_ATTRS = ['src', 'srcset', 'start', 'sizes', 'type', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height'];
366
366
  var WHITELIST_ATTRS_RE = new RegExp("^(".concat(WHITELIST_ATTRS.join('|'), ")$"), 'i'); // removeEmpty
367
367
 
368
368
  var CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(','); // cleanHeaders
@@ -1197,7 +1197,7 @@ function cleanTags$$1($article, $) {
1197
1197
  if (weight < 0) {
1198
1198
  $node.remove();
1199
1199
  } else {
1200
- // deteremine if node seems like content
1200
+ // determine if node seems like content
1201
1201
  removeUnlessContent($node, $, weight);
1202
1202
  }
1203
1203
  });
@@ -1207,11 +1207,16 @@ function cleanTags$$1($article, $) {
1207
1207
  function cleanHeaders($article, $) {
1208
1208
  var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
1209
1209
  $(HEADER_TAG_LIST, $article).each(function (index, header) {
1210
- var $header = $(header); // Remove any headers that appear before all other p tags in the
1210
+ var $header = $(header);
1211
+
1212
+ if ($(header).hasClass(KEEP_CLASS)) {
1213
+ return $header;
1214
+ } // Remove any headers that appear before all other p tags in the
1211
1215
  // document. This probably means that it was part of the title, a
1212
1216
  // subtitle or something else extraneous like a datestamp or byline,
1213
1217
  // all of which should be handled by other metadata handling.
1214
1218
 
1219
+
1215
1220
  if ($($header, $article).prevAll('p').length === 0) {
1216
1221
  return $header.remove();
1217
1222
  } // Remove any headers that match the title exactly.
@@ -6171,8 +6176,18 @@ var WwwVersantsComExtractor = {
6171
6176
  selectors: [['meta[name="og:image"]', 'value']]
6172
6177
  },
6173
6178
  content: {
6174
- selectors: ['.entry-content'],
6175
- clean: ['.adv-link', '.versa-target']
6179
+ transforms: {
6180
+ '.featured-image': function featuredImage($node) {
6181
+ $node.addClass('mercury-parser-keep');
6182
+ var figcaption = $node.find('span');
6183
+ $node.find('figure').append(figcaption);
6184
+ }
6185
+ },
6186
+ selectors: ['.article-content'],
6187
+ clean: ['.adv-link', '.versa-target', 'header', // Clean title
6188
+ '.author', // Clean author
6189
+ '.thumbnail-slider' // Remove, the main images will be within the .main-slider div.
6190
+ ]
6176
6191
  }
6177
6192
  };
6178
6193
 
@@ -6218,20 +6233,24 @@ var WwwAndroidauthorityComExtractor = {
6218
6233
  lead_image_url: {
6219
6234
  selectors: [['meta[name="og:image"]', 'value']]
6220
6235
  },
6236
+ // Some pages have a nested header elements that are significant, and that the parser will
6237
+ // remove if not following a paragraph. Adding this empty paragraph fixes it, and
6238
+ // the empty paragraph will be removed anyway.
6221
6239
  content: {
6222
- selectors: ['.d_Dd'],
6240
+ selectors: ['.e_Bc', '.d_Dd'],
6223
6241
  transforms: {
6224
6242
  ol: function ol(node) {
6225
6243
  node.attr('class', 'mercury-parser-keep');
6226
6244
  },
6227
6245
  h2: function h2($node) {
6228
- // Some pages have an element h2 that is significant, and that the parser will
6229
- // remove if not following a paragraph. Adding this empty paragraph fixes it, and
6230
- // the empty paragraph will be removed anyway.
6231
- $node.before('<p></p>');
6246
+ return $node.attr('class', 'mercury-parser-keep');
6247
+ },
6248
+ h3: function h3($node) {
6249
+ return $node.attr('class', 'mercury-parser-keep');
6232
6250
  }
6233
6251
  },
6234
- clean: ['.d_f .d_nr' // Lead image
6252
+ clean: ['.e_Oh', // Polls
6253
+ 'picture + div' // Lead image text
6235
6254
  ]
6236
6255
  }
6237
6256
  };
@@ -6279,6 +6298,56 @@ var WwwHardwarezoneComSgExtractor = {
6279
6298
  }
6280
6299
  };
6281
6300
 
6301
+ var WwwSpiegelDeExtractor = {
6302
+ domain: 'www.spiegel.de',
6303
+ title: {
6304
+ selectors: [['meta[name="og:title"]', 'value']]
6305
+ },
6306
+ author: {
6307
+ selectors: [['meta[name="author"]', 'value']]
6308
+ },
6309
+ date_published: {
6310
+ selectors: [['meta[name="date"]', 'value']]
6311
+ },
6312
+ lead_image_url: {
6313
+ selectors: [['meta[name="og:image"]', 'value']]
6314
+ },
6315
+ content: {
6316
+ selectors: ['div[data-area="body"]', 'article'],
6317
+ transforms: {},
6318
+ clean: []
6319
+ }
6320
+ };
6321
+
6322
+ var MobilesyrupComExtractor = {
6323
+ domain: 'mobilesyrup.com',
6324
+ title: {
6325
+ selectors: [['meta[name="og:title"]', 'value']]
6326
+ },
6327
+ author: {
6328
+ selectors: [['meta[name="author"]', 'value']]
6329
+ },
6330
+ date_published: {
6331
+ selectors: [['meta[name="article:published_time"]', 'value']]
6332
+ },
6333
+ dek: {
6334
+ selectors: [// enter selectors
6335
+ ]
6336
+ },
6337
+ lead_image_url: {
6338
+ selectors: [['meta[name="og:image"]', 'value']]
6339
+ },
6340
+ content: {
6341
+ selectors: ['.article-content'],
6342
+ transforms: {
6343
+ '.article-content > ul': function articleContentUl(node) {
6344
+ node.attr('class', 'mercury-parser-keep');
6345
+ }
6346
+ },
6347
+ clean: []
6348
+ }
6349
+ };
6350
+
6282
6351
 
6283
6352
 
6284
6353
  var CustomExtractors = /*#__PURE__*/Object.freeze({
@@ -6429,7 +6498,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
6429
6498
  Www1pezeshkComExtractor: Www1pezeshkComExtractor,
6430
6499
  WwwAndroidauthorityComExtractor: WwwAndroidauthorityComExtractor,
6431
6500
  TechcrunchComExtractor: TechcrunchComExtractor,
6432
- WwwHardwarezoneComSgExtractor: WwwHardwarezoneComSgExtractor
6501
+ WwwHardwarezoneComSgExtractor: WwwHardwarezoneComSgExtractor,
6502
+ WwwSpiegelDeExtractor: WwwSpiegelDeExtractor,
6503
+ MobilesyrupComExtractor: MobilesyrupComExtractor
6433
6504
  });
6434
6505
 
6435
6506
  var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {