defuddle 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@ export declare class Defuddle {
25
25
  private removeEmptyElements;
26
26
  private findSmallImages;
27
27
  private removeSmallImages;
28
- private getImageIdentifier;
28
+ private getElementIdentifier;
29
29
  private findMainContent;
30
30
  private findContentByScoring;
31
31
  private getElementSelector;
package/dist/index.js CHANGED
@@ -1 +1 @@
1
- !function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t():"function"==typeof define&&define.amd?define([],t):"object"==typeof exports?exports.Defuddle=t():e.Defuddle=t()}("undefined"!=typeof self?self:this,(()=>(()=>{"use strict";var e={608:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.MetadataExtractor=void 0,t.MetadataExtractor=class{static extract(e,t){var r;let o="",n="";try{n=(null===(r=e.location)||void 0===r?void 0:r.href)||"",n&&(o=new URL(n).hostname.replace(/^www\./,""))}catch(t){const r=e.querySelector("base[href]");if(r)try{n=r.getAttribute("href")||"",o=new URL(n).hostname.replace(/^www\./,"")}catch(e){console.warn("Failed to parse base URL:",e)}}return{title:this.getTitle(e,t),description:this.getDescription(e,t),domain:o,favicon:this.getFavicon(e,n),image:this.getImage(e,t),published:this.getPublished(e,t),author:this.getAuthor(e,t),site:this.getSite(e,t),schemaOrgData:t}}static getAuthor(e,t){return this.getMetaContent(e,"name","sailthru.author")||this.getSchemaProperty(t,"author.name")||this.getMetaContent(e,"property","author")||this.getMetaContent(e,"name","byl")||this.getMetaContent(e,"name","author")||this.getMetaContent(e,"name","authorList")||this.getMetaContent(e,"name","copyright")||this.getSchemaProperty(t,"copyrightHolder.name")||this.getMetaContent(e,"property","og:site_name")||this.getSchemaProperty(t,"publisher.name")||this.getSchemaProperty(t,"sourceOrganization.name")||this.getSchemaProperty(t,"isPartOf.name")||this.getMetaContent(e,"name","twitter:creator")||this.getMetaContent(e,"name","application-name")||""}static getSite(e,t){return this.getSchemaProperty(t,"publisher.name")||this.getMetaContent(e,"property","og:site_name")||this.getSchemaProperty(t,"sourceOrganization.name")||this.getMetaContent(e,"name","copyright")||this.getSchemaProperty(t,"copyrightHolder.name")||this.getSchemaProperty(t,"isPartOf.name")||this.getMetaContent(e,"name","application-name")||""}static getTitle(e,t){var r,o;return this.getMetaContent(e,"property","og:title")||this.getMetaContent(e,"name","twitter:title")||this.getSchemaProperty(t,"headline")||this.getMetaContent(e,"name","title")||this.getMetaContent(e,"name","sailthru.title")||(null===(o=null===(r=e.querySelector("title"))||void 0===r?void 0:r.textContent)||void 0===o?void 0:o.trim())||""}static getDescription(e,t){return this.getMetaContent(e,"name","description")||this.getMetaContent(e,"property","description")||this.getMetaContent(e,"property","og:description")||this.getSchemaProperty(t,"description")||this.getMetaContent(e,"name","twitter:description")||this.getMetaContent(e,"name","sailthru.description")||""}static getImage(e,t){return this.getMetaContent(e,"property","og:image")||this.getMetaContent(e,"name","twitter:image")||this.getSchemaProperty(t,"image.url")||this.getMetaContent(e,"name","sailthru.image.full")||""}static getFavicon(e,t){var r,o;const n=this.getMetaContent(e,"property","og:image:favicon");if(n)return n;const a=null===(r=e.querySelector("link[rel='icon']"))||void 0===r?void 0:r.getAttribute("href");if(a)return a;const i=null===(o=e.querySelector("link[rel='shortcut icon']"))||void 0===o?void 0:o.getAttribute("href");if(i)return i;if(t)try{return new URL("/favicon.ico",t).href}catch(e){console.warn("Failed to construct favicon URL:",e)}return""}static getPublished(e,t){return this.getSchemaProperty(t,"datePublished")||this.getMetaContent(e,"name","publishDate")||this.getMetaContent(e,"property","article:published_time")||this.getTimeElement(e)||this.getMetaContent(e,"name","sailthru.date")||""}static getMetaContent(e,t,r){var o,n;const a=`meta[${t}]`,i=Array.from(e.querySelectorAll(a)).find((e=>{var o;return(null===(o=e.getAttribute(t))||void 0===o?void 0:o.toLowerCase())===r.toLowerCase()})),s=i&&null!==(n=null===(o=i.getAttribute("content"))||void 0===o?void 0:o.trim())&&void 0!==n?n:"";return this.decodeHTMLEntities(s)}static getTimeElement(e){var t,r,o,n;const a=Array.from(e.querySelectorAll("time"))[0],i=a&&null!==(n=null!==(r=null===(t=a.getAttribute("datetime"))||void 0===t?void 0:t.trim())&&void 0!==r?r:null===(o=a.textContent)||void 0===o?void 0:o.trim())&&void 0!==n?n:"";return this.decodeHTMLEntities(i)}static decodeHTMLEntities(e){const t=document.createElement("textarea");return t.innerHTML=e,t.value}static getSchemaProperty(e,t,r=""){if(!e)return r;const o=(e,t,r,n=!0)=>{if("string"==typeof e)return 0===t.length?[e]:[];if(!e||"object"!=typeof e)return[];if(Array.isArray(e)){const a=t[0];if(/^\[\d+\]$/.test(a)){const i=parseInt(a.slice(1,-1));return e[i]?o(e[i],t.slice(1),r,n):[]}return 0===t.length&&e.every((e=>"string"==typeof e||"number"==typeof e))?e.map(String):e.flatMap((e=>o(e,t,r,n)))}const[a,...i]=t;if(!a)return"string"==typeof e?[e]:"object"==typeof e&&e.name?[e.name]:[];if(e.hasOwnProperty(a))return o(e[a],i,r?`${r}.${a}`:a,!0);if(!n){const n=[];for(const a in e)if("object"==typeof e[a]){const i=o(e[a],t,r?`${r}.${a}`:a,!1);n.push(...i)}if(n.length>0)return n}return[]};try{let n=o(e,t.split("."),"",!0);0===n.length&&(n=o(e,t.split("."),"",!1));const a=n.length>0?n.filter(Boolean).join(", "):r;return this.decodeHTMLEntities(a)}catch(e){return console.error(`Error in getSchemaProperty for ${t}:`,e),r}}static extractSchemaOrgData(e){const t=e.querySelectorAll('script[type="application/ld+json"]'),r=[];return t.forEach((e=>{let t=e.textContent||"";try{t=t.replace(/\/\*[\s\S]*?\*\/|^\s*\/\/.*$/gm,"").replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/,"$1").replace(/^\s*(\*\/|\/\*)\s*|\s*(\*\/|\/\*)\s*$/g,"").trim();const e=JSON.parse(t);e["@graph"]&&Array.isArray(e["@graph"])?r.push(...e["@graph"]):r.push(e)}catch(e){console.error("Error parsing schema.org data:",e),console.error("Problematic JSON content:",t)}})),r}}},628:(e,t,r)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.Defuddle=void 0;const o=r(608),n=/article|content|main|post|body|text|blog|story/i,a=/comment|meta|footer|footnote|foot|nav|sidebar|banner|ad|popup|menu/i,i=["div","section","article","main"],s=["[hidden]",'[style*="display: none"]','[style*="display:none"]','[style*="visibility: hidden"]','[style*="visibility:hidden"]',".hidden",".invisible"].join(","),l=new Set(["href","src","srcset","data-src","data-srcset","alt","title","id","class","width","height","colspan","rowspan","headers","aria-label","role","lang"]),c=["#toc",".toc","#comments","#siteSub",".ad","aside","button","canvas","dialog","fieldset","footer","form","header","input","iframe","label","link","nav","noscript","option","script","select","sidebar",".sidebar","#sidebar","style","textarea",'[data-link-name*="skip"]','[src*="author"]','[href="#site-content"]','[class^="ad-"]','[class$="-ad"]','[id^="ad-"]','[id$="-ad"]','[role="banner"]','[role="dialog"]','[role="complementary"]','[role="navigation"]'],d=["avatar","-ad-","_ad_","article-end ","article-title","author","banner","bottom-of-article","brand-bar","breadcrumb","button","btn-","-btn","byline","catlinks","collections","comments","comment-content","complementary","-cta","cta-","discussion","eyebrow","expand-reduce","facebook","feedback","fixed","footer","for-you","global","google","goog-","interlude","link-box","loading","logo-","menu-","meta-","metadata","more-","mw-editsection","mw-jump-link","nav-","navbar","next-","newsletter-signup","overlay","popular","popup","post-date","post-title","post_date","post_title","preview","prevnext","profile","promo","qr-code","qr_code","read-next","reading-list","recommend","recirc","register","related","screen-reader-text","share","site-index","skip-","social","sponsor","subscribe","-toc","table-of-contents","tabs-","toolbar","top-wrapper","tree-item","trending","twitter"];t.Defuddle=class{constructor(e,t={}){this.doc=e,this.options=t,this.debug=t.debug||!1}parse(){try{const e=this._evaluateMediaQueries(this.doc),t=this.findSmallImages(this.doc),r=this.doc.cloneNode(!0),n=o.MetadataExtractor.extractSchemaOrgData(this.doc);this.applyMobileStyles(r,e);const a=this.findMainContent(r);if(!a)return Object.assign({content:this.doc.body.innerHTML},o.MetadataExtractor.extract(this.doc,n));this.removeSmallImages(r,t),this.removeHiddenElements(r),this.removeClutter(r),this.cleanContent(a);const i=o.MetadataExtractor.extract(this.doc,n);return Object.assign({content:a?a.outerHTML:this.doc.body.innerHTML},i)}catch(e){console.error("Defuddle","Error processing document:",e);const t=o.MetadataExtractor.extractSchemaOrgData(this.doc);return Object.assign({content:this.doc.body.innerHTML},o.MetadataExtractor.extract(this.doc,t))}}_log(...e){this.debug&&console.log("Defuddle:",...e)}_evaluateMediaQueries(e){const t=[];try{Array.from(e.styleSheets).filter((e=>{try{return e.cssRules,!0}catch(e){return!1}})).forEach((e=>{try{Array.from(e.cssRules).forEach((e=>{var r;e instanceof CSSMediaRule&&e.conditionText.includes("max-width")&&600<=parseInt((null===(r=e.conditionText.match(/\d+/))||void 0===r?void 0:r[0])||"0")&&Array.from(e.cssRules).forEach((e=>{if(e instanceof CSSStyleRule)try{t.push({selector:e.selectorText,styles:e.style.cssText})}catch(t){console.error("Defuddle","Error collecting styles for selector:",e.selectorText,t)}}))}))}catch(e){console.error("Defuddle","Error processing stylesheet:",e)}}))}catch(e){console.error("Defuddle","Error evaluating media queries:",e)}return t}applyMobileStyles(e,t){t.forEach((({selector:t,styles:r})=>{try{e.querySelectorAll(t).forEach((e=>{e.setAttribute("style",(e.getAttribute("style")||"")+r)}))}catch(e){console.error("Defuddle","Error applying styles for selector:",t,e)}}))}removeHiddenElements(e){let t=0;e.querySelectorAll(s).forEach((e=>{e.remove(),t++}));const r=e.getElementsByTagName("*");Array.from(r).forEach((e=>{const r=window.getComputedStyle(e);"none"!==r.display&&"hidden"!==r.visibility&&"0"!==r.opacity||(e.remove(),t++)})),this._log("Removed hidden elements:",t)}removeClutter(e){let t=0,r=0;const o=c.map((e=>e.includes("[")?e.split(/(\[.*?\])/).map((e=>{if(e.startsWith("[")&&e.includes("=")){const[t,r]=e.slice(1,-1).split("=");if(r.startsWith('"')||r.startsWith("'"))return`[${t.toLowerCase()}=${r}]`}return e.toLowerCase()})).join(""):e.toLowerCase())).join(",");e.querySelectorAll(o).forEach((e=>{(null==e?void 0:e.parentNode)&&(e.remove(),t++)}));const n=d.map((e=>new RegExp(e,"i"))),a=new Set;e.querySelectorAll("[class], [id], [data-testid], [data-qa]").forEach((e=>{var t,o;if(!e||!e.parentNode)return;const i=`${e.className&&"string"==typeof e.className?e.className.toLowerCase():""} ${e.id?e.id.toLowerCase():""} ${(null===(t=e.getAttribute("data-testid"))||void 0===t?void 0:t.toLowerCase())||""} ${(null===(o=e.getAttribute("data-qa"))||void 0===o?void 0:o.toLowerCase())||""}`;n.some((e=>e.test(i)))&&(a.add(e),r++)})),a.forEach((e=>e.remove())),this._log("Found clutter elements:",{basicSelectors:t,patternMatches:r,total:t+r})}cleanContent(e){this.removeHtmlComments(e),this.handleHeadings(e),this.stripUnwantedAttributes(e),this.removeEmptyElements(e)}handleHeadings(e){const t=e.getElementsByTagName("h1");let r=!0;Array.from(t).forEach((e=>{var t;if(r)e.remove(),r=!1;else{const r=document.createElement("h2");r.innerHTML=e.innerHTML,Array.from(e.attributes).forEach((e=>{l.has(e.name)&&r.setAttribute(e.name,e.value)})),null===(t=e.parentNode)||void 0===t||t.replaceChild(r,e)}}))}removeHtmlComments(e){const t=[],r=document.createTreeWalker(e,NodeFilter.SHOW_COMMENT,null);let o;for(;o=r.nextNode();)t.push(o);t.forEach((e=>{e.remove()})),this._log("Removed HTML comments:",t.length)}stripUnwantedAttributes(e){let t=0;const r=e=>{Array.from(e.attributes).forEach((r=>{const o=r.name.toLowerCase();l.has(o)||o.startsWith("data-")||(e.removeAttribute(r.name),t++)}))};r(e),e.querySelectorAll("*").forEach(r),this._log("Stripped attributes:",t)}removeEmptyElements(e){let t=0,r=0,o=!0;const n=new Set(["area","audio","base","br","col","embed","figure","hr","iframe","img","input","link","meta","object","param","picture","source","svg","td","th","track","video","wbr"]);for(;o;){r++,o=!1;const a=Array.from(e.getElementsByTagName("*")).filter((e=>{var t;if(n.has(e.tagName.toLowerCase()))return!1;const r=0===(null===(t=e.textContent)||void 0===t?void 0:t.trim().length),o=!e.hasChildNodes()||Array.from(e.childNodes).every((e=>{var t;return e.nodeType===Node.TEXT_NODE&&0===(null===(t=e.textContent)||void 0===t?void 0:t.trim().length)}));return r&&o}));a.length>0&&(a.forEach((e=>{e.remove(),t++})),o=!0)}this._log("Removed empty elements:",{count:t,iterations:r})}findSmallImages(e){let t=0;const r=new Set,o=e.getElementsByTagName("img");return Array.from(o).forEach((e=>{var o;try{const n=window.getComputedStyle(e),a=e.naturalWidth||0,i=e.naturalHeight||0,s=parseInt(e.getAttribute("width")||"0"),l=parseInt(e.getAttribute("height")||"0"),c=parseInt(n.width)||0,d=parseInt(n.height)||0,m=e.getBoundingClientRect(),h=m.width,u=m.height,g=n.transform,p=g?parseFloat((null===(o=g.match(/scale\(([\d.]+)\)/))||void 0===o?void 0:o[1])||"1"):1,f=h*p,y=u*p,v=Math.min(...[a,s,c,f].filter((e=>e>0))),b=Math.min(...[i,l,d,y].filter((e=>e>0)));if(v>0&&b>0&&(v<24||b<24)){const o=this.getImageIdentifier(e);o&&(r.add(o),t++)}}catch(e){console.error("Error processing image:",e)}})),this._log("Found small images:",t),r}removeSmallImages(e,t){let r=0;const o=e.getElementsByTagName("img");Array.from(o).forEach((e=>{const o=this.getImageIdentifier(e);o&&t.has(o)&&(e.remove(),r++)})),this._log("Removed small images:",r)}getImageIdentifier(e){const t=e.src||e.getAttribute("data-src")||"",r=e.srcset||e.getAttribute("data-srcset")||"",o=e.alt||"",n=e.className||"",a=e.id||"";return t?`src:${t}`:r?`srcset:${r}`:a?`id:${a}`:n&&o?`class:${n};alt:${o}`:null}findMainContent(e){const t=["article",'[role="article"]','[itemprop="articleBody"]',".post-content",".article-content","#article-content",".content-article","main",'[role="main"]',"body"],r=[];return t.forEach(((o,n)=>{e.querySelectorAll(o).forEach((e=>{let o=10*(t.length-n);o+=this.scoreElement(e),r.push({element:e,score:o})}))})),0===r.length?this.findContentByScoring(e):(r.sort(((e,t)=>t.score-e.score)),this.debug&&this._log("Content candidates:",r.map((e=>({element:e.element.tagName,selector:this.getElementSelector(e.element),score:e.score})))),r[0].element)}findContentByScoring(e){const t=this.scoreElements(e);return t.length>0?t[0].element:null}getElementSelector(e){const t=[];let r=e;for(;r&&r!==document.documentElement;){let e=r.tagName.toLowerCase();r.id?e+="#"+r.id:r.className&&"string"==typeof r.className&&(e+="."+r.className.trim().split(/\s+/).join(".")),t.unshift(e),r=r.parentElement}return t.join(" > ")}scoreElements(e){const t=[];return i.forEach((r=>{Array.from(e.getElementsByTagName(r)).forEach((e=>{const r=this.scoreElement(e);r>0&&t.push({score:r,element:e})}))})),t.sort(((e,t)=>t.score-e.score))}scoreElement(e){let t=0;const r=e.className&&"string"==typeof e.className?e.className.toLowerCase():"",o=e.id?e.id.toLowerCase():"";(n.test(r)||n.test(o))&&(t+=25),(a.test(r)||a.test(o))&&(t-=25);const i=e.textContent||"",s=i.split(/\s+/).length;t+=Math.min(Math.floor(s/100),3);const l=e.getElementsByTagName("a"),c=Array.from(l).reduce(((e,t)=>{var r;return e+((null===(r=t.textContent)||void 0===r?void 0:r.length)||0)}),0);(i.length?c/i.length:0)>.5&&(t-=10),t+=e.getElementsByTagName("p").length;const d=e.getElementsByTagName("img").length;return t+=Math.min(3*d,9),t}}}},t={};function r(o){var n=t[o];if(void 0!==n)return n.exports;var a=t[o]={exports:{}};return e[o](a,a.exports,r),a.exports}var o={};return(()=>{var e=o;Object.defineProperty(e,"__esModule",{value:!0}),e.Defuddle=void 0;var t=r(628);Object.defineProperty(e,"Defuddle",{enumerable:!0,get:function(){return t.Defuddle}})})(),o})()));
1
+ !function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t():"function"==typeof define&&define.amd?define([],t):"object"==typeof exports?exports.Defuddle=t():e.Defuddle=t()}("undefined"!=typeof self?self:this,(()=>(()=>{"use strict";var e={608:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.MetadataExtractor=void 0,t.MetadataExtractor=class{static extract(e,t){var r;let o="",n="";try{n=(null===(r=e.location)||void 0===r?void 0:r.href)||"",n&&(o=new URL(n).hostname.replace(/^www\./,""))}catch(t){const r=e.querySelector("base[href]");if(r)try{n=r.getAttribute("href")||"",o=new URL(n).hostname.replace(/^www\./,"")}catch(e){console.warn("Failed to parse base URL:",e)}}return{title:this.getTitle(e,t),description:this.getDescription(e,t),domain:o,favicon:this.getFavicon(e,n),image:this.getImage(e,t),published:this.getPublished(e,t),author:this.getAuthor(e,t),site:this.getSite(e,t),schemaOrgData:t}}static getAuthor(e,t){return this.getMetaContent(e,"name","sailthru.author")||this.getSchemaProperty(t,"author.name")||this.getMetaContent(e,"property","author")||this.getMetaContent(e,"name","byl")||this.getMetaContent(e,"name","author")||this.getMetaContent(e,"name","authorList")||this.getMetaContent(e,"name","copyright")||this.getSchemaProperty(t,"copyrightHolder.name")||this.getMetaContent(e,"property","og:site_name")||this.getSchemaProperty(t,"publisher.name")||this.getSchemaProperty(t,"sourceOrganization.name")||this.getSchemaProperty(t,"isPartOf.name")||this.getMetaContent(e,"name","twitter:creator")||this.getMetaContent(e,"name","application-name")||""}static getSite(e,t){return this.getSchemaProperty(t,"publisher.name")||this.getMetaContent(e,"property","og:site_name")||this.getSchemaProperty(t,"sourceOrganization.name")||this.getMetaContent(e,"name","copyright")||this.getSchemaProperty(t,"copyrightHolder.name")||this.getSchemaProperty(t,"isPartOf.name")||this.getMetaContent(e,"name","application-name")||""}static getTitle(e,t){var r,o;return this.getMetaContent(e,"property","og:title")||this.getMetaContent(e,"name","twitter:title")||this.getSchemaProperty(t,"headline")||this.getMetaContent(e,"name","title")||this.getMetaContent(e,"name","sailthru.title")||(null===(o=null===(r=e.querySelector("title"))||void 0===r?void 0:r.textContent)||void 0===o?void 0:o.trim())||""}static getDescription(e,t){return this.getMetaContent(e,"name","description")||this.getMetaContent(e,"property","description")||this.getMetaContent(e,"property","og:description")||this.getSchemaProperty(t,"description")||this.getMetaContent(e,"name","twitter:description")||this.getMetaContent(e,"name","sailthru.description")||""}static getImage(e,t){return this.getMetaContent(e,"property","og:image")||this.getMetaContent(e,"name","twitter:image")||this.getSchemaProperty(t,"image.url")||this.getMetaContent(e,"name","sailthru.image.full")||""}static getFavicon(e,t){var r,o;const n=this.getMetaContent(e,"property","og:image:favicon");if(n)return n;const a=null===(r=e.querySelector("link[rel='icon']"))||void 0===r?void 0:r.getAttribute("href");if(a)return a;const i=null===(o=e.querySelector("link[rel='shortcut icon']"))||void 0===o?void 0:o.getAttribute("href");if(i)return i;if(t)try{return new URL("/favicon.ico",t).href}catch(e){console.warn("Failed to construct favicon URL:",e)}return""}static getPublished(e,t){return this.getSchemaProperty(t,"datePublished")||this.getMetaContent(e,"name","publishDate")||this.getMetaContent(e,"property","article:published_time")||this.getTimeElement(e)||this.getMetaContent(e,"name","sailthru.date")||""}static getMetaContent(e,t,r){var o,n;const a=`meta[${t}]`,i=Array.from(e.querySelectorAll(a)).find((e=>{var o;return(null===(o=e.getAttribute(t))||void 0===o?void 0:o.toLowerCase())===r.toLowerCase()})),s=i&&null!==(n=null===(o=i.getAttribute("content"))||void 0===o?void 0:o.trim())&&void 0!==n?n:"";return this.decodeHTMLEntities(s)}static getTimeElement(e){var t,r,o,n;const a=Array.from(e.querySelectorAll("time"))[0],i=a&&null!==(n=null!==(r=null===(t=a.getAttribute("datetime"))||void 0===t?void 0:t.trim())&&void 0!==r?r:null===(o=a.textContent)||void 0===o?void 0:o.trim())&&void 0!==n?n:"";return this.decodeHTMLEntities(i)}static decodeHTMLEntities(e){const t=document.createElement("textarea");return t.innerHTML=e,t.value}static getSchemaProperty(e,t,r=""){if(!e)return r;const o=(e,t,r,n=!0)=>{if("string"==typeof e)return 0===t.length?[e]:[];if(!e||"object"!=typeof e)return[];if(Array.isArray(e)){const a=t[0];if(/^\[\d+\]$/.test(a)){const i=parseInt(a.slice(1,-1));return e[i]?o(e[i],t.slice(1),r,n):[]}return 0===t.length&&e.every((e=>"string"==typeof e||"number"==typeof e))?e.map(String):e.flatMap((e=>o(e,t,r,n)))}const[a,...i]=t;if(!a)return"string"==typeof e?[e]:"object"==typeof e&&e.name?[e.name]:[];if(e.hasOwnProperty(a))return o(e[a],i,r?`${r}.${a}`:a,!0);if(!n){const n=[];for(const a in e)if("object"==typeof e[a]){const i=o(e[a],t,r?`${r}.${a}`:a,!1);n.push(...i)}if(n.length>0)return n}return[]};try{let n=o(e,t.split("."),"",!0);0===n.length&&(n=o(e,t.split("."),"",!1));const a=n.length>0?n.filter(Boolean).join(", "):r;return this.decodeHTMLEntities(a)}catch(e){return console.error(`Error in getSchemaProperty for ${t}:`,e),r}}static extractSchemaOrgData(e){const t=e.querySelectorAll('script[type="application/ld+json"]'),r=[];return t.forEach((e=>{let t=e.textContent||"";try{t=t.replace(/\/\*[\s\S]*?\*\/|^\s*\/\/.*$/gm,"").replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/,"$1").replace(/^\s*(\*\/|\/\*)\s*|\s*(\*\/|\/\*)\s*$/g,"").trim();const e=JSON.parse(t);e["@graph"]&&Array.isArray(e["@graph"])?r.push(...e["@graph"]):r.push(e)}catch(e){console.error("Error parsing schema.org data:",e),console.error("Problematic JSON content:",t)}})),r}}},628:(e,t,r)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.Defuddle=void 0;const o=r(608),n=/article|content|main|post|body|text|blog|story/i,a=/comment|meta|footer|footnote|foot|nav|sidebar|banner|ad|popup|menu/i,i=["div","section","article","main"],s=["[hidden]",'[aria-hidden="true"]','[style*="visibility: hidden"]','[style*="visibility:hidden"]',".hidden",".invisible"].join(","),l=new Set(["alt","aria-label","class","colspan","data-src","data-srcset","dir","headers","height","href","id","lang","role","rowspan","src","srcset","title","width"]),c=[".ad","aside","button","canvas","#comments","dialog","fieldset","footer","form","header","input","iframe","label","link","nav","noscript",".noprint","option","script","select","sidebar",".sidebar","#sidebar","#siteSub","style","#toc",".toc","textarea",".clickable-icon",'a[href^="#"][class*="anchor"]','[data-link-name*="skip"]','[src*="author"]','[href="#site-content"]','[class^="ad-"]','[class$="-ad"]','[id^="ad-"]','[id$="-ad"]','[role="banner"]','[role="button"]','[role="dialog"]','[role="complementary"]','[role="navigation"]'],d=["avatar","-ad-","_ad_","article-end ","article-title","author","banner","bottom-of-article","brand-bar","breadcrumb","button-wrapper","btn-","-btn","byline","catlinks","collections","comments","comment-content","complementary","-cta","cta-","discussion","eyebrow","expand-reduce","facebook","feedback","fixed","footer","for-you","frontmatter","global","google","goog-","interlude","link-box","loading","menu-","meta-","metadata","more-","mw-editsection","mw-jump-link","nav-","navbar","next-","newsletter-signup","not-found","overlay","popular","popup","post-date","post-title","post_date","post_title","prevnext","profile","promo","qr-code","qr_code","read-next","reading-list","recommend","recirc","register","related","screen-reader-text","share","site-index","site-header","site-logo","site-name","skip-","social","sponsor","subscribe","-toc","table-of-contents","tabs-","toolbar","top-wrapper","tree-item","trending","twitter"];t.Defuddle=class{constructor(e,t={}){this.doc=e,this.options=t,this.debug=t.debug||!1}parse(){try{const e=this._evaluateMediaQueries(this.doc),t=this.findSmallImages(this.doc),r=this.doc.cloneNode(!0),n=o.MetadataExtractor.extractSchemaOrgData(this.doc);this.applyMobileStyles(r,e);const a=this.findMainContent(r);if(!a)return Object.assign({content:this.doc.body.innerHTML},o.MetadataExtractor.extract(this.doc,n));this.removeSmallImages(r,t),this.removeHiddenElements(r),this.removeClutter(r),this.cleanContent(a);const i=o.MetadataExtractor.extract(this.doc,n);return Object.assign({content:a?a.outerHTML:this.doc.body.innerHTML},i)}catch(e){console.error("Defuddle","Error processing document:",e);const t=o.MetadataExtractor.extractSchemaOrgData(this.doc);return Object.assign({content:this.doc.body.innerHTML},o.MetadataExtractor.extract(this.doc,t))}}_log(...e){this.debug&&console.log("Defuddle:",...e)}_evaluateMediaQueries(e){const t=[];try{Array.from(e.styleSheets).filter((e=>{try{return e.cssRules,!0}catch(e){return!1}})).forEach((e=>{try{Array.from(e.cssRules).forEach((e=>{var r;e instanceof CSSMediaRule&&e.conditionText.includes("max-width")&&600<=parseInt((null===(r=e.conditionText.match(/\d+/))||void 0===r?void 0:r[0])||"0")&&Array.from(e.cssRules).forEach((e=>{if(e instanceof CSSStyleRule)try{t.push({selector:e.selectorText,styles:e.style.cssText})}catch(t){console.error("Defuddle","Error collecting styles for selector:",e.selectorText,t)}}))}))}catch(e){console.error("Defuddle","Error processing stylesheet:",e)}}))}catch(e){console.error("Defuddle","Error evaluating media queries:",e)}return t}applyMobileStyles(e,t){t.forEach((({selector:t,styles:r})=>{try{e.querySelectorAll(t).forEach((e=>{e.setAttribute("style",(e.getAttribute("style")||"")+r)}))}catch(e){console.error("Defuddle","Error applying styles for selector:",t,e)}}))}removeHiddenElements(e){let t=0;e.querySelectorAll(s).forEach((e=>{e.remove(),t++}));const r=e.getElementsByTagName("*");Array.from(r).forEach((e=>{const r=window.getComputedStyle(e);"none"!==r.display&&"hidden"!==r.visibility&&"0"!==r.opacity||(e.remove(),t++)})),this._log("Removed hidden elements:",t)}removeClutter(e){let t=0,r=0;const o=c.map((e=>e.includes("[")?e.split(/(\[.*?\])/).map((e=>{if(e.startsWith("[")&&e.includes("=")){const[t,r]=e.slice(1,-1).split("=");if(r.startsWith('"')||r.startsWith("'"))return`[${t.toLowerCase()}=${r}]`}return e.toLowerCase()})).join(""):e.toLowerCase())).join(",");e.querySelectorAll(o).forEach((e=>{(null==e?void 0:e.parentNode)&&(e.remove(),t++)}));const n=d.map((e=>new RegExp(e,"i"))),a=new Set;e.querySelectorAll("[class], [id], [data-testid], [data-qa]").forEach((e=>{var t,o;if(!e||!e.parentNode)return;const i=`${e.className&&"string"==typeof e.className?e.className.toLowerCase():""} ${e.id?e.id.toLowerCase():""} ${(null===(t=e.getAttribute("data-testid"))||void 0===t?void 0:t.toLowerCase())||""} ${(null===(o=e.getAttribute("data-qa"))||void 0===o?void 0:o.toLowerCase())||""}`;n.some((e=>e.test(i)))&&(a.add(e),r++)})),a.forEach((e=>e.remove())),this._log("Found clutter elements:",{basicSelectors:t,patternMatches:r,total:t+r})}cleanContent(e){this.removeHtmlComments(e),this.handleHeadings(e),this.stripUnwantedAttributes(e),this.removeEmptyElements(e)}handleHeadings(e){const t=e.getElementsByTagName("h1");let r=!0;Array.from(t).forEach((e=>{var t;if(r)e.remove(),r=!1;else{const r=document.createElement("h2");r.innerHTML=e.innerHTML,Array.from(e.attributes).forEach((e=>{l.has(e.name)&&r.setAttribute(e.name,e.value)})),null===(t=e.parentNode)||void 0===t||t.replaceChild(r,e)}}))}removeHtmlComments(e){const t=[],r=document.createTreeWalker(e,NodeFilter.SHOW_COMMENT,null);let o;for(;o=r.nextNode();)t.push(o);t.forEach((e=>{e.remove()})),this._log("Removed HTML comments:",t.length)}stripUnwantedAttributes(e){let t=0;const r=e=>{e instanceof SVGElement||Array.from(e.attributes).forEach((r=>{const o=r.name.toLowerCase();l.has(o)||o.startsWith("data-")||(e.removeAttribute(r.name),t++)}))};r(e),e.querySelectorAll("*").forEach(r),this._log("Stripped attributes:",t)}removeEmptyElements(e){let t=0,r=0,o=!0;const n=new Set(["area","audio","base","br","circle","col","defs","ellipse","embed","figure","g","hr","iframe","img","input","line","link","mask","meta","object","param","path","pattern","picture","polygon","polyline","rect","source","stop","svg","td","th","track","use","video","wbr"]);for(;o;){r++,o=!1;const a=Array.from(e.getElementsByTagName("*")).filter((e=>{var t;if(n.has(e.tagName.toLowerCase()))return!1;const r=0===(null===(t=e.textContent)||void 0===t?void 0:t.trim().length),o=!e.hasChildNodes()||Array.from(e.childNodes).every((e=>{var t;return e.nodeType===Node.TEXT_NODE&&0===(null===(t=e.textContent)||void 0===t?void 0:t.trim().length)}));return r&&o}));a.length>0&&(a.forEach((e=>{e.remove(),t++})),o=!0)}this._log("Removed empty elements:",{count:t,iterations:r})}findSmallImages(e){let t=0;const r=new Set,o=(e,o)=>{Array.from(e).forEach((e=>{var n;try{const a=window.getComputedStyle(e);if("img"===o){const o=e,i=o.naturalWidth||0,s=o.naturalHeight||0,l=parseInt(o.getAttribute("width")||"0"),c=parseInt(o.getAttribute("height")||"0"),d=parseInt(a.width)||0,m=parseInt(a.height)||0,h=o.getBoundingClientRect(),u=h.width,g=h.height,p=a.transform,f=p?parseFloat((null===(n=p.match(/scale\(([\d.]+)\)/))||void 0===n?void 0:n[1])||"1"):1,y=u*f,v=g*f,b=Math.min(...[i,l,d,y].filter((e=>e>0))),E=Math.min(...[s,c,m,v].filter((e=>e>0)));if(b>0&&E>0&&(b<33||E<33)){const e=this.getElementIdentifier(o);e&&(r.add(e),t++)}}else{const o=e,n=o.getBoundingClientRect(),i=parseInt(a.width)||0,s=parseInt(a.height)||0,l=parseInt(o.getAttribute("width")||"0"),c=parseInt(o.getAttribute("height")||"0"),d=Math.min(...[n.width,i,l].filter((e=>e>0))),m=Math.min(...[n.height,s,c].filter((e=>e>0)));if(d>0&&m>0&&(d<33||m<33)){const e=this.getElementIdentifier(o);e&&(r.add(e),t++)}}}catch(e){console.error("Error processing element:",e)}}))};return o(e.getElementsByTagName("img"),"img"),o(e.getElementsByTagName("svg"),"svg"),this._log("Found small elements:",t),r}removeSmallImages(e,t){let r=0;["img","svg"].forEach((o=>{const n=e.getElementsByTagName(o);Array.from(n).forEach((e=>{const o=this.getElementIdentifier(e);o&&t.has(o)&&(e.remove(),r++)}))})),this._log("Removed small elements:",r)}getElementIdentifier(e){if(e instanceof HTMLImageElement){const t=e.src||e.getAttribute("data-src")||"",r=e.srcset||e.getAttribute("data-srcset")||"";if(t)return`src:${t}`;if(r)return`srcset:${r}`}const t=e.id||"",r=e.className||"",o=e instanceof SVGElement&&e.getAttribute("viewBox")||"";return t?`id:${t}`:o?`viewBox:${o}`:r?`class:${r}`:null}findMainContent(e){const t=["article",'[role="article"]','[itemprop="articleBody"]',".post-content",".article-content","#article-content",".content-article","main",'[role="main"]',"body"],r=[];return t.forEach(((o,n)=>{e.querySelectorAll(o).forEach((e=>{let o=10*(t.length-n);o+=this.scoreElement(e),r.push({element:e,score:o})}))})),0===r.length?this.findContentByScoring(e):(r.sort(((e,t)=>t.score-e.score)),this.debug&&this._log("Content candidates:",r.map((e=>({element:e.element.tagName,selector:this.getElementSelector(e.element),score:e.score})))),r[0].element)}findContentByScoring(e){const t=this.scoreElements(e);return t.length>0?t[0].element:null}getElementSelector(e){const t=[];let r=e;for(;r&&r!==document.documentElement;){let e=r.tagName.toLowerCase();r.id?e+="#"+r.id:r.className&&"string"==typeof r.className&&(e+="."+r.className.trim().split(/\s+/).join(".")),t.unshift(e),r=r.parentElement}return t.join(" > ")}scoreElements(e){const t=[];return i.forEach((r=>{Array.from(e.getElementsByTagName(r)).forEach((e=>{const r=this.scoreElement(e);r>0&&t.push({score:r,element:e})}))})),t.sort(((e,t)=>t.score-e.score))}scoreElement(e){let t=0;const r=e.className&&"string"==typeof e.className?e.className.toLowerCase():"",o=e.id?e.id.toLowerCase():"";(n.test(r)||n.test(o))&&(t+=25),(a.test(r)||a.test(o))&&(t-=25);const i=e.textContent||"",s=i.split(/\s+/).length;t+=Math.min(Math.floor(s/100),3);const l=e.getElementsByTagName("a"),c=Array.from(l).reduce(((e,t)=>{var r;return e+((null===(r=t.textContent)||void 0===r?void 0:r.length)||0)}),0);(i.length?c/i.length:0)>.5&&(t-=10),t+=e.getElementsByTagName("p").length;const d=e.getElementsByTagName("img").length;return t+=Math.min(3*d,9),t}}}},t={};function r(o){var n=t[o];if(void 0!==n)return n.exports;var a=t[o]={exports:{}};return e[o](a,a.exports,r),a.exports}var o={};return(()=>{var e=o;Object.defineProperty(e,"__esModule",{value:!0}),e.Defuddle=void 0;var t=r(628);Object.defineProperty(e,"Defuddle",{enumerable:!0,get:function(){return t.Defuddle}})})(),o})()));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "defuddle",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "Extract article content and metadata from web pages",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.js",
@@ -10,7 +10,10 @@
10
10
  "build:types": "tsc --project tsconfig.declarations.json",
11
11
  "build:js": "webpack",
12
12
  "build": "npm run clean && npm run build:types && npm run build:js",
13
- "prepublishOnly": "npm run build"
13
+ "prepublishOnly": "npm run build",
14
+ "dev:types": "tsc --project tsconfig.declarations.json --watch",
15
+ "dev:js": "webpack --watch --mode development",
16
+ "dev": "npm run clean && npm run build:types && concurrently \"npm run dev:types\" \"npm run dev:js\""
14
17
  },
15
18
  "keywords": [
16
19
  "readability",
@@ -26,7 +29,7 @@
26
29
  "license": "MIT",
27
30
  "repository": {
28
31
  "type": "git",
29
- "url": "https://github.com/kepano/defuddle"
32
+ "url": "git+https://github.com/kepano/defuddle.git"
30
33
  },
31
34
  "bugs": {
32
35
  "url": "https://github.com/kepano/defuddle/issues"
@@ -34,6 +37,7 @@
34
37
  "homepage": "https://github.com/kepano/defuddle#readme",
35
38
  "devDependencies": {
36
39
  "@types/node": "^20.0.0",
40
+ "concurrently": "^8.2.2",
37
41
  "ts-loader": "^9.5.1",
38
42
  "typescript": "^5.3.3",
39
43
  "undici-types": "^5.0.0",