defuddle 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Steph Ango (@kepano)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,66 @@
1
+ **Beware! Defuddle is very much a work in progress!**
2
+
3
+ Defuddle extracts the main content from web pages. It cleans up web pages by removing clutter like comments, sidebars, headers, footers, and other non-essential elements, leaving only the primary content.
4
+
5
+ ## Key features
6
+
7
+ Defuddle aims to be a replacement for Mozilla Readability, with a few differences:
8
+
9
+ - More forgiving, removes fewer uncertain elements
10
+ - Uses a page's mobile styles to guess at unnecessary elements
11
+ - Extracts more metadata from the page, including schema.org data
12
+ - Simpler scoring algorithm focused on content density and semantic markup
13
+ - Better handling of modern web components and dynamic content
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ npm install defuddle
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ ```typescript
24
+ import { Defuddle } from 'defuddle';
25
+
26
+ const article = new Defuddle(document).parse();
27
+
28
+ // Use the extracted content and metadata
29
+ console.log(article.content); // HTML string of the main content
30
+ console.log(article.title); // Title of the article
31
+ ```
32
+
33
+ ## Response
34
+
35
+ The `parse()` method returns an object with the following properties:
36
+
37
+ | Property | Type | Description |
38
+ |----------|------|-------------|
39
+ | `content` | string | HTML string of the extracted main content |
40
+ | `title` | string | Title of the article |
41
+ | `description` | string | Description or summary of the article |
42
+ | `domain` | string | Domain name of the website |
43
+ | `favicon` | string | URL of the website's favicon |
44
+ | `image` | string | URL of the article's main image |
45
+ | `published` | string | Publication date of the article |
46
+ | `author` | string | Author of the article |
47
+ | `site` | string | Name of the website |
48
+ | `schemaOrgData` | object | Raw schema.org data extracted from the page |
49
+
50
+ ## Development
51
+
52
+ ### Build
53
+
54
+ To build the package, you'll need Node.js and npm installed. Then run:
55
+
56
+ ```bash
57
+ # Install dependencies
58
+ npm install
59
+
60
+ # Clean and build
61
+ npm run build
62
+ ```
63
+
64
+ This will generate:
65
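+ - `dist/index.js` - UMD build for both Node.js and browsers (under Node.js, a DOM implementation must provide the `document` and `window` globals that the bundle uses)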
66
+ - `dist/index.d.ts` - TypeScript declaration file
package/dist/defuddle.d.ts ADDED
@@ -0,0 +1,34 @@
1
+ import { DefuddleOptions, DefuddleResponse } from './types';
2
+ export declare class Defuddle { // Extracts the main content element and page metadata from a Document
3
+ private doc; // document passed to the constructor; parse() reads it and works on a deep clone of it
4
+ private options; // options object as passed to the constructor (defaults to {})
5
+ private debug; // copy of options.debug (false when unset); gates _log output
6
+ /**
7
+ * Create a new Defuddle instance bound to a single document
8
+ * @param doc - The document to parse; it is stored on the instance and read when parse() is called
9
+ * @param options - Optional parsing options (see DefuddleOptions); an empty object is used when omitted
10
+ */
11
+ constructor(doc: Document, options?: DefuddleOptions);
12
+ /**
13
+ * Parse the document and extract its main content. Returns the content HTML together with the page metadata; when no main content is found or processing throws, the content falls back to the document body's innerHTML
14
+ */
15
+ parse(): DefuddleResponse;
16
+ private _log; // console.log with a "Defuddle:" prefix, only when debug is on
17
+ private _evaluateMediaQueries; // collects selector/style pairs from readable stylesheets' max-width @media rules whose first number is >= 600
18
+ private applyMobileStyles; // appends each collected style text to the inline style attribute of the matching elements
19
+ private removeHiddenElements; // removes elements matching the hidden selectors, then elements whose computed display is none, visibility is hidden or opacity is 0
20
+ private removeClutter; // removes elements matching the exact clutter selectors, then elements whose class, id, data-testid or data-qa matches a clutter pattern
21
+ private cleanContent; // runs removeHtmlComments, handleHeadings, stripUnwantedAttributes and removeEmptyElements, in that order
22
+ private handleHeadings; // removes the first h1 and replaces every later h1 with an h2 that keeps allow-listed attributes
23
+ private removeHtmlComments; // removes every comment node under the given element
24
+ private stripUnwantedAttributes; // removes attributes that are neither on the allow-list nor data-* attributes
25
+ private removeEmptyElements; // repeatedly removes elements without text or non-whitespace children, except a fixed set of tags (img, br, td, ...)
26
+ private findSmallImages; // returns the identifiers of images whose smallest known width or height is below 24
27
+ private removeSmallImages; // removes images whose identifier is in the given set
28
+ private getImageIdentifier; // "src:...", "srcset:...", "id:..." or a class+alt key; null when none of these is available
29
+ private findMainContent; // scores elements matching a prioritized selector list (article ... body) and returns the best; falls back to findContentByScoring
30
+ private findContentByScoring; // element of the top scoreElements result, or null when there is none
31
+ private getElementSelector; // builds a "tag#id > tag.class" style path, used in the debug log of content candidates
32
+ private scoreElements; // scores every div, section, article and main element, keeps positive scores, sorted descending
33
+ private scoreElement; // heuristic score from class/id patterns, word count, link-text ratio and the number of p and img elements
34
+ }
package/dist/index.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ export { Defuddle } from './defuddle';
2
+ export type { DefuddleOptions, DefuddleResponse, DefuddleMetadata } from './types';
package/dist/index.js ADDED
@@ -0,0 +1 @@
1
+ !function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t():"function"==typeof define&&define.amd?define([],t):"object"==typeof exports?exports.Defuddle=t():e.Defuddle=t()}("undefined"!=typeof self?self:this,(()=>(()=>{"use strict";var e={608:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.MetadataExtractor=void 0,t.MetadataExtractor=class{static extract(e,t){var r;let o="",n="";try{n=(null===(r=e.location)||void 0===r?void 0:r.href)||"",n&&(o=new URL(n).hostname.replace(/^www\./,""))}catch(t){const r=e.querySelector("base[href]");if(r)try{n=r.getAttribute("href")||"",o=new URL(n).hostname.replace(/^www\./,"")}catch(e){console.warn("Failed to parse base URL:",e)}}return{title:this.getTitle(e,t),description:this.getDescription(e,t),domain:o,favicon:this.getFavicon(e,n),image:this.getImage(e,t),published:this.getPublished(e,t),author:this.getAuthor(e,t),site:this.getSite(e,t),schemaOrgData:t}}static getAuthor(e,t){return this.getMetaContent(e,"name","sailthru.author")||this.getSchemaProperty(t,"author.name")||this.getMetaContent(e,"property","author")||this.getMetaContent(e,"name","byl")||this.getMetaContent(e,"name","author")||this.getMetaContent(e,"name","authorList")||this.getMetaContent(e,"name","copyright")||this.getSchemaProperty(t,"copyrightHolder.name")||this.getMetaContent(e,"property","og:site_name")||this.getSchemaProperty(t,"publisher.name")||this.getSchemaProperty(t,"sourceOrganization.name")||this.getSchemaProperty(t,"isPartOf.name")||this.getMetaContent(e,"name","twitter:creator")||this.getMetaContent(e,"name","application-name")||""}static getSite(e,t){return this.getSchemaProperty(t,"publisher.name")||this.getMetaContent(e,"property","og:site_name")||this.getSchemaProperty(t,"sourceOrganization.name")||this.getMetaContent(e,"name","copyright")||this.getSchemaProperty(t,"copyrightHolder.name")||this.getSchemaProperty(t,"isPartOf.name")||this.getMetaContent(e,"name","application-name")||""}static getTitle(e,t){var 
r,o;return this.getMetaContent(e,"property","og:title")||this.getMetaContent(e,"name","twitter:title")||this.getSchemaProperty(t,"headline")||this.getMetaContent(e,"name","title")||this.getMetaContent(e,"name","sailthru.title")||(null===(o=null===(r=e.querySelector("title"))||void 0===r?void 0:r.textContent)||void 0===o?void 0:o.trim())||""}static getDescription(e,t){return this.getMetaContent(e,"name","description")||this.getMetaContent(e,"property","description")||this.getMetaContent(e,"property","og:description")||this.getSchemaProperty(t,"description")||this.getMetaContent(e,"name","twitter:description")||this.getMetaContent(e,"name","sailthru.description")||""}static getImage(e,t){return this.getMetaContent(e,"property","og:image")||this.getMetaContent(e,"name","twitter:image")||this.getSchemaProperty(t,"image.url")||this.getMetaContent(e,"name","sailthru.image.full")||""}static getFavicon(e,t){var r,o;const n=this.getMetaContent(e,"property","og:image:favicon");if(n)return n;const a=null===(r=e.querySelector("link[rel='icon']"))||void 0===r?void 0:r.getAttribute("href");if(a)return a;const i=null===(o=e.querySelector("link[rel='shortcut icon']"))||void 0===o?void 0:o.getAttribute("href");if(i)return i;if(t)try{return new URL("/favicon.ico",t).href}catch(e){console.warn("Failed to construct favicon URL:",e)}return""}static getPublished(e,t){return this.getSchemaProperty(t,"datePublished")||this.getMetaContent(e,"name","publishDate")||this.getMetaContent(e,"property","article:published_time")||this.getTimeElement(e)||this.getMetaContent(e,"name","sailthru.date")||""}static getMetaContent(e,t,r){var o,n;const a=`meta[${t}]`,i=Array.from(e.querySelectorAll(a)).find((e=>{var o;return(null===(o=e.getAttribute(t))||void 0===o?void 0:o.toLowerCase())===r.toLowerCase()})),s=i&&null!==(n=null===(o=i.getAttribute("content"))||void 0===o?void 0:o.trim())&&void 0!==n?n:"";return this.decodeHTMLEntities(s)}static getTimeElement(e){var t,r,o,n;const 
a=Array.from(e.querySelectorAll("time"))[0],i=a&&null!==(n=null!==(r=null===(t=a.getAttribute("datetime"))||void 0===t?void 0:t.trim())&&void 0!==r?r:null===(o=a.textContent)||void 0===o?void 0:o.trim())&&void 0!==n?n:"";return this.decodeHTMLEntities(i)}static decodeHTMLEntities(e){const t=document.createElement("textarea");return t.innerHTML=e,t.value}static getSchemaProperty(e,t,r=""){if(!e)return r;const o=(e,t,r,n=!0)=>{if("string"==typeof e)return 0===t.length?[e]:[];if(!e||"object"!=typeof e)return[];if(Array.isArray(e)){const a=t[0];if(/^\[\d+\]$/.test(a)){const i=parseInt(a.slice(1,-1));return e[i]?o(e[i],t.slice(1),r,n):[]}return 0===t.length&&e.every((e=>"string"==typeof e||"number"==typeof e))?e.map(String):e.flatMap((e=>o(e,t,r,n)))}const[a,...i]=t;if(!a)return"string"==typeof e?[e]:"object"==typeof e&&e.name?[e.name]:[];if(e.hasOwnProperty(a))return o(e[a],i,r?`${r}.${a}`:a,!0);if(!n){const n=[];for(const a in e)if("object"==typeof e[a]){const i=o(e[a],t,r?`${r}.${a}`:a,!1);n.push(...i)}if(n.length>0)return n}return[]};try{let n=o(e,t.split("."),"",!0);0===n.length&&(n=o(e,t.split("."),"",!1));const a=n.length>0?n.filter(Boolean).join(", "):r;return this.decodeHTMLEntities(a)}catch(e){return console.error(`Error in getSchemaProperty for ${t}:`,e),r}}static extractSchemaOrgData(e){const t=e.querySelectorAll('script[type="application/ld+json"]'),r=[];return t.forEach((e=>{let t=e.textContent||"";try{t=t.replace(/\/\*[\s\S]*?\*\/|^\s*\/\/.*$/gm,"").replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/,"$1").replace(/^\s*(\*\/|\/\*)\s*|\s*(\*\/|\/\*)\s*$/g,"").trim();const e=JSON.parse(t);e["@graph"]&&Array.isArray(e["@graph"])?r.push(...e["@graph"]):r.push(e)}catch(e){console.error("Error parsing schema.org data:",e),console.error("Problematic JSON content:",t)}})),r}}},628:(e,t,r)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.Defuddle=void 0;const 
o=r(608),n=/article|content|main|post|body|text|blog|story/i,a=/comment|meta|footer|footnote|foot|nav|sidebar|banner|ad|popup|menu/i,i=["div","section","article","main"],s=["[hidden]",'[style*="display: none"]','[style*="display:none"]','[style*="visibility: hidden"]','[style*="visibility:hidden"]',".hidden",".invisible"].join(","),l=new Set(["href","src","srcset","data-src","data-srcset","alt","title","id","class","width","height","colspan","rowspan","headers","aria-label","role","lang"]),c=["#toc",".toc","#comments","#siteSub",".ad","aside","button","canvas","dialog","fieldset","footer","form","header","input","iframe","label","link","nav","noscript","option","script","select","sidebar",".sidebar","#sidebar","style","textarea",'[data-link-name*="skip"]','[src*="author"]','[href="#site-content"]','[class^="ad-"]','[class$="-ad"]','[id^="ad-"]','[id$="-ad"]','[role="banner"]','[role="dialog"]','[role="complementary"]','[role="navigation"]'],d=["avatar","-ad-","_ad_","article-end ","article-title","author","banner","bottom-of-article","brand-bar","breadcrumb","button","btn-","-btn","byline","catlinks","collections","comments","comment-content","complementary","-cta","cta-","discussion","eyebrow","expand-reduce","facebook","feedback","fixed","footer","for-you","global","google","goog-","interlude","link-box","loading","logo-","menu-","meta-","metadata","more-","mw-editsection","mw-jump-link","nav-","navbar","next-","newsletter-signup","overlay","popular","popup","post-date","post-title","post_date","post_title","preview","prevnext","profile","promo","qr-code","qr_code","read-next","reading-list","recommend","recirc","register","related","screen-reader-text","share","site-index","skip-","social","sponsor","subscribe","-toc","table-of-contents","tabs-","toolbar","top-wrapper","tree-item","trending","twitter"];t.Defuddle=class{constructor(e,t={}){this.doc=e,this.options=t,this.debug=t.debug||!1}parse(){try{const 
e=this._evaluateMediaQueries(this.doc),t=this.findSmallImages(this.doc),r=this.doc.cloneNode(!0),n=o.MetadataExtractor.extractSchemaOrgData(this.doc);this.applyMobileStyles(r,e);const a=this.findMainContent(r);if(!a)return Object.assign({content:this.doc.body.innerHTML},o.MetadataExtractor.extract(this.doc,n));this.removeSmallImages(r,t),this.removeHiddenElements(r),this.removeClutter(r),this.cleanContent(a);const i=o.MetadataExtractor.extract(this.doc,n);return Object.assign({content:a?a.outerHTML:this.doc.body.innerHTML},i)}catch(e){console.error("Defuddle","Error processing document:",e);const t=o.MetadataExtractor.extractSchemaOrgData(this.doc);return Object.assign({content:this.doc.body.innerHTML},o.MetadataExtractor.extract(this.doc,t))}}_log(...e){this.debug&&console.log("Defuddle:",...e)}_evaluateMediaQueries(e){const t=[];try{Array.from(e.styleSheets).filter((e=>{try{return e.cssRules,!0}catch(e){return!1}})).forEach((e=>{try{Array.from(e.cssRules).forEach((e=>{var r;e instanceof CSSMediaRule&&e.conditionText.includes("max-width")&&600<=parseInt((null===(r=e.conditionText.match(/\d+/))||void 0===r?void 0:r[0])||"0")&&Array.from(e.cssRules).forEach((e=>{if(e instanceof CSSStyleRule)try{t.push({selector:e.selectorText,styles:e.style.cssText})}catch(t){console.error("Defuddle","Error collecting styles for selector:",e.selectorText,t)}}))}))}catch(e){console.error("Defuddle","Error processing stylesheet:",e)}}))}catch(e){console.error("Defuddle","Error evaluating media queries:",e)}return t}applyMobileStyles(e,t){t.forEach((({selector:t,styles:r})=>{try{e.querySelectorAll(t).forEach((e=>{e.setAttribute("style",(e.getAttribute("style")||"")+r)}))}catch(e){console.error("Defuddle","Error applying styles for selector:",t,e)}}))}removeHiddenElements(e){let t=0;e.querySelectorAll(s).forEach((e=>{e.remove(),t++}));const r=e.getElementsByTagName("*");Array.from(r).forEach((e=>{const 
r=window.getComputedStyle(e);"none"!==r.display&&"hidden"!==r.visibility&&"0"!==r.opacity||(e.remove(),t++)})),this._log("Removed hidden elements:",t)}removeClutter(e){let t=0,r=0;const o=c.map((e=>e.includes("[")?e.split(/(\[.*?\])/).map((e=>{if(e.startsWith("[")&&e.includes("=")){const[t,r]=e.slice(1,-1).split("=");if(r.startsWith('"')||r.startsWith("'"))return`[${t.toLowerCase()}=${r}]`}return e.toLowerCase()})).join(""):e.toLowerCase())).join(",");e.querySelectorAll(o).forEach((e=>{(null==e?void 0:e.parentNode)&&(e.remove(),t++)}));const n=d.map((e=>new RegExp(e,"i"))),a=new Set;e.querySelectorAll("[class], [id], [data-testid], [data-qa]").forEach((e=>{var t,o;if(!e||!e.parentNode)return;const i=`${e.className&&"string"==typeof e.className?e.className.toLowerCase():""} ${e.id?e.id.toLowerCase():""} ${(null===(t=e.getAttribute("data-testid"))||void 0===t?void 0:t.toLowerCase())||""} ${(null===(o=e.getAttribute("data-qa"))||void 0===o?void 0:o.toLowerCase())||""}`;n.some((e=>e.test(i)))&&(a.add(e),r++)})),a.forEach((e=>e.remove())),this._log("Found clutter elements:",{basicSelectors:t,patternMatches:r,total:t+r})}cleanContent(e){this.removeHtmlComments(e),this.handleHeadings(e),this.stripUnwantedAttributes(e),this.removeEmptyElements(e)}handleHeadings(e){const t=e.getElementsByTagName("h1");let r=!0;Array.from(t).forEach((e=>{var t;if(r)e.remove(),r=!1;else{const r=document.createElement("h2");r.innerHTML=e.innerHTML,Array.from(e.attributes).forEach((e=>{l.has(e.name)&&r.setAttribute(e.name,e.value)})),null===(t=e.parentNode)||void 0===t||t.replaceChild(r,e)}}))}removeHtmlComments(e){const t=[],r=document.createTreeWalker(e,NodeFilter.SHOW_COMMENT,null);let o;for(;o=r.nextNode();)t.push(o);t.forEach((e=>{e.remove()})),this._log("Removed HTML comments:",t.length)}stripUnwantedAttributes(e){let t=0;const r=e=>{Array.from(e.attributes).forEach((r=>{const 
o=r.name.toLowerCase();l.has(o)||o.startsWith("data-")||(e.removeAttribute(r.name),t++)}))};r(e),e.querySelectorAll("*").forEach(r),this._log("Stripped attributes:",t)}removeEmptyElements(e){let t=0,r=0,o=!0;const n=new Set(["area","audio","base","br","col","embed","figure","hr","iframe","img","input","link","meta","object","param","picture","source","svg","td","th","track","video","wbr"]);for(;o;){r++,o=!1;const a=Array.from(e.getElementsByTagName("*")).filter((e=>{var t;if(n.has(e.tagName.toLowerCase()))return!1;const r=0===(null===(t=e.textContent)||void 0===t?void 0:t.trim().length),o=!e.hasChildNodes()||Array.from(e.childNodes).every((e=>{var t;return e.nodeType===Node.TEXT_NODE&&0===(null===(t=e.textContent)||void 0===t?void 0:t.trim().length)}));return r&&o}));a.length>0&&(a.forEach((e=>{e.remove(),t++})),o=!0)}this._log("Removed empty elements:",{count:t,iterations:r})}findSmallImages(e){let t=0;const r=new Set,o=e.getElementsByTagName("img");return Array.from(o).forEach((e=>{var o;try{const n=window.getComputedStyle(e),a=e.naturalWidth||0,i=e.naturalHeight||0,s=parseInt(e.getAttribute("width")||"0"),l=parseInt(e.getAttribute("height")||"0"),c=parseInt(n.width)||0,d=parseInt(n.height)||0,m=e.getBoundingClientRect(),h=m.width,u=m.height,g=n.transform,p=g?parseFloat((null===(o=g.match(/scale\(([\d.]+)\)/))||void 0===o?void 0:o[1])||"1"):1,f=h*p,y=u*p,v=Math.min(...[a,s,c,f].filter((e=>e>0))),b=Math.min(...[i,l,d,y].filter((e=>e>0)));if(v>0&&b>0&&(v<24||b<24)){const o=this.getImageIdentifier(e);o&&(r.add(o),t++)}}catch(e){console.error("Error processing image:",e)}})),this._log("Found small images:",t),r}removeSmallImages(e,t){let r=0;const o=e.getElementsByTagName("img");Array.from(o).forEach((e=>{const o=this.getImageIdentifier(e);o&&t.has(o)&&(e.remove(),r++)})),this._log("Removed small images:",r)}getImageIdentifier(e){const t=e.src||e.getAttribute("data-src")||"",r=e.srcset||e.getAttribute("data-srcset")||"",o=e.alt||"",n=e.className||"",a=e.id||"";return 
t?`src:${t}`:r?`srcset:${r}`:a?`id:${a}`:n&&o?`class:${n};alt:${o}`:null}findMainContent(e){const t=["article",'[role="article"]','[itemprop="articleBody"]',".post-content",".article-content","#article-content",".content-article","main",'[role="main"]',"body"],r=[];return t.forEach(((o,n)=>{e.querySelectorAll(o).forEach((e=>{let o=10*(t.length-n);o+=this.scoreElement(e),r.push({element:e,score:o})}))})),0===r.length?this.findContentByScoring(e):(r.sort(((e,t)=>t.score-e.score)),this.debug&&this._log("Content candidates:",r.map((e=>({element:e.element.tagName,selector:this.getElementSelector(e.element),score:e.score})))),r[0].element)}findContentByScoring(e){const t=this.scoreElements(e);return t.length>0?t[0].element:null}getElementSelector(e){const t=[];let r=e;for(;r&&r!==document.documentElement;){let e=r.tagName.toLowerCase();r.id?e+="#"+r.id:r.className&&"string"==typeof r.className&&(e+="."+r.className.trim().split(/\s+/).join(".")),t.unshift(e),r=r.parentElement}return t.join(" > ")}scoreElements(e){const t=[];return i.forEach((r=>{Array.from(e.getElementsByTagName(r)).forEach((e=>{const r=this.scoreElement(e);r>0&&t.push({score:r,element:e})}))})),t.sort(((e,t)=>t.score-e.score))}scoreElement(e){let t=0;const r=e.className&&"string"==typeof e.className?e.className.toLowerCase():"",o=e.id?e.id.toLowerCase():"";(n.test(r)||n.test(o))&&(t+=25),(a.test(r)||a.test(o))&&(t-=25);const i=e.textContent||"",s=i.split(/\s+/).length;t+=Math.min(Math.floor(s/100),3);const l=e.getElementsByTagName("a"),c=Array.from(l).reduce(((e,t)=>{var r;return e+((null===(r=t.textContent)||void 0===r?void 0:r.length)||0)}),0);(i.length?c/i.length:0)>.5&&(t-=10),t+=e.getElementsByTagName("p").length;const d=e.getElementsByTagName("img").length;return t+=Math.min(3*d,9),t}}}},t={};function r(o){var n=t[o];if(void 0!==n)return n.exports;var a=t[o]={exports:{}};return e[o](a,a.exports,r),a.exports}var o={};return(()=>{var 
e=o;Object.defineProperty(e,"__esModule",{value:!0}),e.Defuddle=void 0;var t=r(628);Object.defineProperty(e,"Defuddle",{enumerable:!0,get:function(){return t.Defuddle}})})(),o})()));
@@ -0,0 +1,16 @@
1
+ import { DefuddleMetadata } from './types';
2
+ export declare class MetadataExtractor { // Static helpers that read page metadata from meta tags, link/time elements and schema.org JSON-LD data
3
+ static extract(doc: Document, schemaOrgData: any): DefuddleMetadata; // builds the metadata object; domain is the document URL's hostname without a leading "www." (falls back to base[href] when URL parsing throws)
4
+ private static getAuthor; // first non-empty value from a fixed list of author, copyright and publisher meta tags and schema properties; "" otherwise
5
+ private static getSite; // first non-empty of the schema publisher name, og:site_name and similar fields; "" otherwise
6
+ private static getTitle; // og:title, twitter:title, schema headline, meta title, sailthru.title, then the trimmed title element text; "" otherwise
7
+ private static getDescription; // first non-empty description from meta tags or the schema description; "" otherwise
8
+ private static getImage; // og:image, twitter:image, schema image.url, then sailthru.image.full; "" otherwise
9
+ private static getFavicon; // og:image:favicon meta, icon link href, shortcut icon link href, then /favicon.ico resolved against the base URL; "" otherwise
10
+ private static getPublished; // schema datePublished, publishDate meta, article:published_time, the first time element, then sailthru.date; "" otherwise
11
+ private static getMetaContent; // trimmed, entity-decoded content of the first meta tag whose given attribute equals the given value (case-insensitive); "" when absent
12
+ private static getTimeElement; // datetime attribute (or else text) of the first time element, trimmed and entity-decoded; "" when absent
13
+ private static decodeHTMLEntities; // decodes entities via a textarea created on the global document (not on the doc argument passed to extract)
14
+ private static getSchemaProperty; // looks up a dotted path in the schema data; multiple matches are joined with ", "; returns the default value ("" unless given) when nothing matches or the lookup throws
15
+ static extractSchemaOrgData(doc: Document): any; // parses every application/ld+json script and returns an array of the parsed objects, with @graph arrays flattened; unparsable scripts are logged and skipped
16
+ }
package/dist/types.d.ts ADDED
@@ -0,0 +1,18 @@
1
+ export interface DefuddleMetadata { // Page metadata as returned by MetadataExtractor.extract; string fields are "" when nothing was found
2
+ title: string; // from og:title, twitter:title, schema headline, title meta tags or the title element
3
+ description: string; // from description meta tags or the schema description
4
+ domain: string; // hostname of the document URL with a leading "www." removed
5
+ favicon: string; // favicon meta/link href, or /favicon.ico resolved against the document URL
6
+ image: string; // from og:image, twitter:image, schema image.url or sailthru.image.full
7
+ published: string; // date string as found in schema datePublished, meta tags or the first time element
8
+ author: string; // from author, copyright and publisher meta tags or schema properties
9
+ site: string; // site or publisher name from schema properties or meta tags
10
+ schemaOrgData: any; // the schema data handed to extract; parse() passes the array returned by extractSchemaOrgData
11
+ }
12
+ export interface DefuddleResponse extends DefuddleMetadata { // Result of Defuddle.parse(): the metadata plus the extracted content
13
+ content: string; // outerHTML of the chosen main-content element, or the document body's innerHTML as fallback
14
+ }
15
+ export interface DefuddleOptions { // Options accepted by the Defuddle constructor
16
+ debug?: boolean; // when true, progress messages are written with console.log under a "Defuddle:" prefix
17
+ keepClasses?: boolean; // NOTE(review): not read anywhere in the bundled dist/index.js of this version - confirm intended behavior
18
+ }
package/package.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "name": "defuddle",
3
+ "version": "0.1.0",
4
+ "description": "Extract article content and metadata from web pages",
5
+ "main": "dist/index.js",
6
+ "module": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
+ "scripts": {
9
+ "clean": "rm -rf dist",
10
+ "build:types": "tsc --project tsconfig.declarations.json",
11
+ "build:js": "webpack",
12
+ "build": "npm run clean && npm run build:types && npm run build:js",
13
+ "prepublishOnly": "npm run build"
14
+ },
15
+ "keywords": [
16
+ "readability",
17
+ "content-extraction",
18
+ "article-extraction",
19
+ "web-scraping",
20
+ "html-cleanup",
21
+ "content-parser",
22
+ "article-parser",
23
+ "dom"
24
+ ],
25
+ "author": "kepano",
26
+ "license": "MIT",
27
+ "repository": {
28
+ "type": "git",
29
+ "url": "https://github.com/kepano/defuddle"
30
+ },
31
+ "bugs": {
32
+ "url": "https://github.com/kepano/defuddle/issues"
33
+ },
34
+ "homepage": "https://github.com/kepano/defuddle#readme",
35
+ "devDependencies": {
36
+ "@types/node": "^20.0.0",
37
+ "ts-loader": "^9.5.1",
38
+ "typescript": "^5.3.3",
39
+ "undici-types": "^5.0.0",
40
+ "webpack": "^5.90.3",
41
+ "webpack-cli": "^5.1.4"
42
+ },
43
+ "files": [
44
+ "dist",
45
+ "README.md",
46
+ "LICENSE"
47
+ ]
48
+ }