npm - defuddle-js - Versions diffs - 0.1.0 - Mend

defuddle-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/LICENSE +21 -0
package/README.md +102 -0
package/dist/defuddle.cjs.js +1950 -0
package/dist/defuddle.umd.js +1968 -0
package/package.json +41 -0
package/src/constants.js +297 -0
package/src/content-finder.js +116 -0
package/src/content-scorer.js +194 -0
package/src/defuddle.js +252 -0
package/src/index.js +1 -0
package/src/metadata.js +371 -0
package/src/removals/content-patterns.js +174 -0
package/src/removals/hidden.js +51 -0
package/src/removals/selector-remover.js +137 -0
package/src/removals/small-images.js +45 -0
package/src/schema-org.js +102 -0
package/src/standardizer.js +116 -0
package/src/url-resolver.js +101 -0
package/src/utils.js +95 -0

package/src/defuddle.js ADDED Viewed

@@ -0,0 +1,252 @@
+/**
+ * Defuddle - extract main content and metadata from HTML pages.
+ * Works in browser (via <script> tag) and Node.js.
+ *
+ * Browser usage:
+ *   const result = new Defuddle(document).parse();
+ *   // or with an HTML string:
+ *   const doc = new DOMParser().parseFromString(html, 'text/html');
+ *   const result = new Defuddle(doc, { url: 'https://example.com' }).parse();
+ *
+ * Node.js usage (with linkedom or jsdom):
+ *   import { parseHTML } from 'linkedom';
+ *   const { document } = parseHTML(html);
+ *   const result = new Defuddle(document, { url }).parse();
+ */
+import { extractSchemaOrg } from './schema-org.js';
+import { collectMetaTags, extractMetadata } from './metadata.js';
+import { findMainContent } from './content-finder.js';
+import { scoreAndRemove } from './content-scorer.js';
+import { removeHiddenElements } from './removals/hidden.js';
+import { removeExact, removePartial } from './removals/selector-remover.js';
+import { removeSmallImages } from './removals/small-images.js';
+import { removeByContentPattern } from './removals/content-patterns.js';
+import { resolveUrls } from './url-resolver.js';
+import { standardize } from './standardizer.js';
+import { countHtmlWords } from './utils.js';
+export class Defuddle {
+  /**
+   * @param {Document} doc  A parsed DOM Document
+   * @param {object} [options]
+   * @param {string} [options.url]  Page URL for relative URL resolution and domain extraction
+   * @param {boolean} [options.debug]
+   */
+  constructor(doc, options = {}) {
+    this.doc = doc;
+    this.url = options.url || null;
+    this.options = options;
+    this._schemaOrgData = undefined;
+    this._metaTags = undefined;
+    this._metadata = undefined;
+  }
+  /**
+   * Parse the document and return a DefuddleResult.
+   * @param {object} [opts]  Override default options for this parse call
+   * @returns {DefuddleResult}
+   */
+  parse(opts = {}) {
+    const options = { ...this.options, ...opts };
+    // First pass
+    let result = this._parseInternal(options);
+    // Retry 1: too little content → disable partial selectors
+    if (result.wordCount < 200) {
+      const retry = this._parseInternal({ ...options, removePartialSelectors: false });
+      if (retry.wordCount > result.wordCount * 2) result = retry;
+    }
+    // Retry 2: still too little → disable hidden removal
+    if (result.wordCount < 50) {
+      const retry = this._parseInternal({ ...options, removeHidden: false });
+      if (retry.wordCount > result.wordCount * 2) result = retry;
+    }
+    // Retry 3: index/listing page → disable scoring and patterns
+    if (result.wordCount < 50) {
+      const retry = this._parseInternal({
+        ...options,
+        scoreContent: false,
+        removePartialSelectors: false,
+        removeContentPatterns: false,
+      });
+      if (retry.wordCount > result.wordCount) result = retry;
+    }
+    return result;
+  }
+  /**
+   * Internal parse — clones the document for each attempt.
+   * @private
+   */
+  _parseInternal(opts = {}) {
+    const startTime = Date.now();
+    const {
+      removeExactSelectors = true,
+      removePartialSelectors = true,
+      removeHidden = true,
+      removeSmallImages: doSmallImages = true,
+      scoreContent = true,
+      removeContentPatterns = true,
+      standardizeContent = true,
+      debug = false,
+    } = opts;
+    const doc = this.doc;
+    if (!doc || !doc.documentElement) {
+      return this._emptyResult(startTime);
+    }
+    // Cache schema, meta tags, metadata (shared across retries)
+    if (this._schemaOrgData === undefined) {
+      this._schemaOrgData = extractSchemaOrg(doc);
+    }
+    if (this._metaTags === undefined) {
+      this._metaTags = collectMetaTags(doc);
+    }
+    if (this._metadata === undefined) {
+      this._metadata = extractMetadata(doc, this.url, this._schemaOrgData, this._metaTags);
+    }
+    // Clone document for destructive processing
+    const clone = doc.cloneNode(true);
+    // Find main content
+    let mainContent = null;
+    if (opts.contentSelector) {
+      try {
+        mainContent = clone.querySelector(opts.contentSelector);
+      } catch (e) {}
+    }
+    if (!mainContent) {
+      mainContent = findMainContent(clone);
+    }
+    if (!mainContent) {
+      return this._buildResult(clone.body ? clone.body.innerHTML : '', startTime);
+    }
+    // Removal pipeline
+    if (doSmallImages) removeSmallImages(clone, debug);
+    if (removeHidden) removeHiddenElements(clone, debug);
+    if (removeExactSelectors) removeExact(clone, mainContent, debug);
+    if (removePartialSelectors) removePartial(clone, mainContent, debug);
+    if (scoreContent) scoreAndRemove(clone, mainContent, debug);
+    if (removeContentPatterns) removeByContentPattern(mainContent, debug, this.url || '');
+    // Standardize
+    if (standardizeContent) standardize(mainContent);
+    // Resolve URLs
+    if (this.url) resolveUrls(mainContent, clone, this.url);
+    return this._buildResult(mainContent.outerHTML, startTime);
+  }
+  _buildResult(content, startTime) {
+    const meta = this._metadata || {};
+    return new DefuddleResult({
+      content,
+      ...meta,
+      schemaOrgData: this._schemaOrgData || null,
+      metaTags: this._metaTags || [],
+      wordCount: countHtmlWords(content),
+      parseTime: Date.now() - startTime,
+    });
+  }
+  _emptyResult(startTime) {
+    let domain = '';
+    if (this.url) {
+      try {
+        let host = new URL(this.url).hostname;
+        if (host.startsWith('www.')) host = host.slice(4);
+        domain = host;
+      } catch (e) {}
+    }
+    return new DefuddleResult({
+      content: '', title: '', description: '', author: '', published: '',
+      site: '', domain, favicon: '', image: '', language: '',
+      schemaOrgData: null, metaTags: [],
+      wordCount: 0, parseTime: Date.now() - startTime,
+    });
+  }
+  /**
+   * Convenience static method: parse an HTML string.
+   * In browser: uses DOMParser automatically.
+   * In Node.js: requires passing a `parseHtml` function that returns a Document.
+   *
+   * @param {string|Document} input  HTML string or existing Document
+   * @param {object} [options]
+   * @param {string} [options.url]
+   * @param {Function} [options.parseHtml]  Custom HTML parser: (html) => Document
+   * @returns {DefuddleResult}
+   */
+  static parse(input, options = {}) {
+    let doc;
+    if (typeof input === 'string') {
+      if (options.parseHtml) {
+        doc = options.parseHtml(input);
+      } else if (typeof DOMParser !== 'undefined') {
+        // Browser environment
+        doc = new DOMParser().parseFromString(input, 'text/html');
+      } else {
+        throw new Error(
+          'Defuddle.parse() requires a DOM environment. ' +
+          'In Node.js, pass a parseHtml function: ' +
+          'Defuddle.parse(html, { parseHtml: html => require("linkedom").parseHTML(html).document })'
+        );
+      }
+    } else {
+      doc = input;
+    }
+    return new Defuddle(doc, options).parse();
+  }
+}
+/**
+ * Result object returned by Defuddle.parse().
+ */
+export class DefuddleResult {
+  constructor(data) {
+    this.content = data.content || '';
+    this.title = data.title || '';
+    this.description = data.description || '';
+    this.author = data.author || '';
+    this.published = data.published || '';
+    this.site = data.site || '';
+    this.domain = data.domain || '';
+    this.favicon = data.favicon || '';
+    this.image = data.image || '';
+    this.language = data.language || '';
+    this.wordCount = data.wordCount || 0;
+    this.parseTime = data.parseTime || 0;
+    this.schemaOrgData = data.schemaOrgData || null;
+    this.metaTags = data.metaTags || [];
+  }
+  toJSON() {
+    return {
+      content: this.content,
+      title: this.title,
+      description: this.description,
+      author: this.author,
+      published: this.published,
+      site: this.site,
+      domain: this.domain,
+      favicon: this.favicon,
+      image: this.image,
+      language: this.language,
+      wordCount: this.wordCount,
+      parseTime: this.parseTime,
+      schemaOrgData: this.schemaOrgData,
+      metaTags: this.metaTags,
+    };
+  }
+}

package/src/index.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ export { Defuddle, DefuddleResult } from './defuddle.js';

package/src/metadata.js ADDED Viewed

@@ -0,0 +1,371 @@
+/**
+ * Metadata extraction for defuddle-js.
+ * Cascading: OG → Twitter Card → Schema.org → DOM heuristics
+ */
+import { extractSchemaOrg, getSchemaProperty } from './schema-org.js';
+import { resolveUrl } from './url-resolver.js';
+// Type names passed as the second argument to getSchemaProperty for every Schema.org lookup in this file.
+const ARTICLE_TYPES = ['Article', 'NewsArticle', 'BlogPosting', 'WebPage'];
+// Case-insensitive, word-bounded English month name or abbreviation (Jan … Dec, incl. "Sept").
+// extractPublished uses it to pre-filter meta content before attempting a date parse.
+const DATE_RE = /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/i;
+/**
+ * Collect all <meta> tags from a document.
+ * @param {Document} doc
+ * @returns {Array<{name:string,property:string,content:string}>}
+ */
+export function collectMetaTags(doc) {
+  const tags = [];
+  for (const meta of doc.querySelectorAll('meta')) {
+    const content = meta.getAttribute('content');
+    if (!content) continue;
+    tags.push({
+      name: meta.getAttribute('name') || null,
+      property: meta.getAttribute('property') || null,
+      content: decodeHtmlEntities(content),
+    });
+  }
+  return tags;
+}
+/**
+ * Extract all metadata from a document.
+ * @param {Document} doc
+ * @param {string|null} url
+ * @param {Array|null} schemaOrgData
+ * @param {Array} metaTags
+ * @returns {object}
+ */
+export function extractMetadata(doc, url, schemaOrgData, metaTags) {
+  const getMeta = (value, attr) => {
+    for (const tag of metaTags) {
+      if (tag[attr] && tag[attr].toLowerCase() === value.toLowerCase()) {
+        return tag.content || null;
+      }
+    }
+    return null;
+  };
+  const getSchema = (property) =>
+    getSchemaProperty(schemaOrgData, ARTICLE_TYPES, property);
+  const site = extractSiteName(doc, metaTags, schemaOrgData, getMeta);
+  const title = extractTitle(doc, metaTags, schemaOrgData, getMeta, getSchema, site);
+  return {
+    title,
+    description: extractDescription(metaTags, getSchema, getMeta),
+    author: extractAuthor(doc, metaTags, schemaOrgData, getMeta),
+    published: extractPublished(doc, metaTags, schemaOrgData, getMeta, getSchema),
+    site,
+    domain: extractDomain(doc, url, getMeta),
+    favicon: extractFavicon(doc, url),
+    image: extractImage(doc, metaTags, getSchema, getMeta),
+    language: extractLanguage(doc, metaTags, getMeta),
+  };
+}
+// ── Title ──────────────────────────────────────────────────────────────────
+function extractTitle(doc, metaTags, schemaOrgData, getMeta, getSchema, siteName) {
+  const raw = getMeta('og:title', 'property')
+    || getMeta('twitter:title', 'name')
+    || getSchema('headline')
+    || getMeta('title', 'name')
+    || (doc.querySelector('title') ? doc.querySelector('title').textContent.trim() : '')
+    || '';
+  return cleanTitle(raw, siteName);
+}
+function cleanTitle(title, siteName) {
+  if (!title || !siteName) return title;
+  const separators = ['|', ' / ', ' · ', ' – ', ' — ', ' - ', ': '];
+  for (const sep of separators) {
+    if (!title.includes(sep)) continue;
+    const idx = title.indexOf(sep);
+    const left = title.slice(0, idx).trim();
+    const right = title.slice(idx + sep.length).trim();
+    if (fuzzyMatch(right, siteName)) return left;
+    if (fuzzyMatch(left, siteName)) return right;
+  }
+  return title;
+}
+function fuzzyMatch(a, b) {
+  const norm = s => s.toLowerCase().replace(/[^a-z0-9]/g, '');
+  const na = norm(a), nb = norm(b);
+  if (na === nb) return true;
+  if (nb.length > 2 && (na.includes(nb) || nb.includes(na))) return true;
+  return false;
+}
+// ── Description ────────────────────────────────────────────────────────────
+function extractDescription(metaTags, getSchema, getMeta) {
+  return getMeta('og:description', 'property')
+    || getMeta('twitter:description', 'name')
+    || getMeta('description', 'name')
+    || getSchema('description')
+    || '';
+}
+// ── Author ─────────────────────────────────────────────────────────────────
+function extractAuthor(doc, metaTags, schemaOrgData, getMeta) {
+  for (const name of ['author', 'sailthru.author', 'citation_author', 'dc.creator', 'byl']) {
+    const val = getMeta(name, 'name');
+    if (val) return val;
+  }
+  // Schema.org author
+  const schemaAuthor = getSchemaAuthor(schemaOrgData);
+  if (schemaAuthor) return schemaAuthor;
+  // DOM selectors
+  const selectors = [
+    '[class*="author"]:not([class*="author-bio"])',
+    '[rel="author"]',
+    '[class*="byline"]',
+    '[itemprop="author"]',
+  ];
+  for (const sel of selectors) {
+    const el = doc.querySelector(sel);
+    if (el) {
+      const text = (el.textContent || '').trim();
+      const words = text.split(/\s+/).filter(Boolean).length;
+      if (text && words <= 6) return stripByPrefix(text);
+    }
+  }
+  // "By X" near h1
+  return extractBylineNearH1(doc);
+}
+function getSchemaAuthor(schemaOrgData) {
+  if (!schemaOrgData) return '';
+  for (const item of schemaOrgData) {
+    if (!item || typeof item !== 'object') continue;
+    const author = item.author;
+    if (!author) continue;
+    if (typeof author === 'string') return author;
+    if (typeof author === 'object') {
+      if (author.name) return String(author.name);
+      if (Array.isArray(author) && author[0]?.name) return String(author[0].name);
+    }
+  }
+  return '';
+}
+function extractBylineNearH1(doc) {
+  const h1 = doc.querySelector('h1');
+  if (!h1 || !h1.parentElement) return '';
+  let checked = 0;
+  for (const sibling of h1.parentElement.children) {
+    if (sibling === h1) continue;
+    const text = (sibling.textContent || '').trim();
+    const match = text.match(/^by\s+(.+)/i);
+    if (match) {
+      const words = text.split(/\s+/).filter(Boolean).length;
+      if (words <= 8) return match[1].trim();
+    }
+    if (++checked >= 5) break;
+  }
+  return '';
+}
+function stripByPrefix(text) {
+  const m = text.match(/^by\s+(.+)/i);
+  return m ? m[1].trim() : text;
+}
+// ── Published ──────────────────────────────────────────────────────────────
+function extractPublished(doc, metaTags, schemaOrgData, getMeta, getSchema) {
+  const raw = getMeta('article:published_time', 'property')
+    || getMeta('article:published', 'property')
+    || getMeta('date', 'name')
+    || getMeta('citation_date', 'name')
+    || getMeta('DC.date', 'name')
+    || getMeta('pubdate', 'name')
+    || null;
+  if (raw) return normalizeDate(raw);
+  const schemaDate = getSchema('datePublished');
+  if (schemaDate) return normalizeDate(String(schemaDate));
+  // <time datetime>
+  const timeEl = doc.querySelector('time[datetime]');
+  if (timeEl) {
+    const dt = timeEl.getAttribute('datetime');
+    if (dt) return normalizeDate(dt);
+  }
+  // Natural language dates in meta content
+  for (const tag of metaTags) {
+    if (tag.content && DATE_RE.test(tag.content)) {
+      const parsed = parseNaturalDate(tag.content);
+      if (parsed) return parsed;
+    }
+  }
+  return '';
+}
+function normalizeDate(raw) {
+  raw = raw.trim();
+  if (!raw) return '';
+  // Already ISO-ish
+  if (/^\d{4}-\d{2}-\d{2}/.test(raw)) {
+    try {
+      return new Date(raw).toISOString();
+    } catch (e) {
+      return raw;
+    }
+  }
+  return parseNaturalDate(raw) || raw;
+}
+function parseNaturalDate(input) {
+  input = input.trim();
+  const d = new Date(input);
+  if (!isNaN(d.getTime()) && d.getFullYear() > 1900) {
+    return d.toISOString();
+  }
+  return null;
+}
+// ── Image ──────────────────────────────────────────────────────────────────
+function extractImage(doc, metaTags, getSchema, getMeta) {
+  const url = getMeta('og:image', 'property')
+    || getMeta('og:image:url', 'property')
+    || getMeta('twitter:image', 'name')
+    || getMeta('twitter:image:src', 'name')
+    || null;
+  if (url) return url;
+  const schemaImage = getSchema('image');
+  if (schemaImage) {
+    if (typeof schemaImage === 'string') return schemaImage;
+    if (schemaImage.url) return String(schemaImage.url);
+  }
+  // First large image in body
+  const imgs = doc.querySelectorAll('body img[src]');
+  for (const img of imgs) {
+    const w = parseInt(img.getAttribute('width') || '0');
+    const h = parseInt(img.getAttribute('height') || '0');
+    if ((w >= 200 && h >= 100) || (!w && !h)) {
+      return img.getAttribute('src') || '';
+    }
+  }
+  return '';
+}
+// ── Site name ──────────────────────────────────────────────────────────────
+function extractSiteName(doc, metaTags, schemaOrgData, getMeta) {
+  const og = getMeta('og:site_name', 'property');
+  if (og) return og;
+  const publisher = getSchemaProperty(schemaOrgData, ARTICLE_TYPES, 'publisher');
+  if (publisher) {
+    if (typeof publisher === 'string') return publisher;
+    if (publisher.name) return String(publisher.name);
+  }
+  // Fallback from <title> using separators
+  const titleEl = doc.querySelector('title');
+  if (titleEl) {
+    const title = titleEl.textContent || '';
+    for (const sep of ['|', ' / ', ' · ', ' – ', ' — ']) {
+      if (title.includes(sep)) {
+        const parts = title.split(sep);
+        const last = parts[parts.length - 1].trim();
+        if (last && last.split(/\s+/).length <= 4) return last;
+      }
+    }
+  }
+  return '';
+}
+// ── Domain ─────────────────────────────────────────────────────────────────
+function extractDomain(doc, url, getMeta) {
+  let pageUrl = url;
+  if (!pageUrl) {
+    pageUrl = getMeta('og:url', 'property');
+  }
+  if (!pageUrl) {
+    const canonical = doc.querySelector('link[rel="canonical"][href]');
+    if (canonical) pageUrl = canonical.getAttribute('href');
+  }
+  if (!pageUrl) return '';
+  try {
+    let host = new URL(pageUrl).hostname;
+    if (host.startsWith('www.')) host = host.slice(4);
+    return host;
+  } catch (e) {
+    return '';
+  }
+}
+// ── Favicon ────────────────────────────────────────────────────────────────
+function extractFavicon(doc, url) {
+  for (const rel of ['shortcut icon', 'icon']) {
+    const el = doc.querySelector(`link[rel="${rel}"][href]`);
+    if (el) {
+      const href = el.getAttribute('href');
+      if (href) return url ? resolveUrl(href, url) : href;
+    }
+  }
+  if (url) {
+    try {
+      const u = new URL(url);
+      return u.origin + '/favicon.ico';
+    } catch (e) {}
+  }
+  return '';
+}
+// ── Language ───────────────────────────────────────────────────────────────
+function extractLanguage(doc, metaTags, getMeta) {
+  const htmlEl = doc.documentElement;
+  if (htmlEl && htmlEl.getAttribute('lang')) {
+    return htmlEl.getAttribute('lang');
+  }
+  const locale = getMeta('og:locale', 'property');
+  if (locale) return locale.replace('_', '-');
+  const contentLang = doc.querySelector('meta[http-equiv="Content-Language"]');
+  if (contentLang) return contentLang.getAttribute('content') || '';
+  return '';
+}
+// ── Helpers ────────────────────────────────────────────────────────────────
+function decodeHtmlEntities(text) {
+  return text
+    .replace(/&amp;/g, '&')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&apos;/g, "'");
+}