@ijonis/geo-lint 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.2.1] - 2026-03-19
11
+
12
+ ### Added
13
+ - `contentSource` field on `ContentItem` — allows rules to distinguish between file-based MDX and URL-extracted content
14
+ - Plain-text structure detection (`src/utils/plaintext-structure.ts`) — heuristic heading, table, list, and FAQ detection for content without markdown formatting
15
+ - GEO rules now fall back to plain-text structure detection when `contentSource` is `'url'`, fixing false 100/100 GEO scores on Readability-extracted content
16
+ - Integration tests for URL scanner compatibility
17
+
18
+ ### Fixed
19
+ - **GEO rules never firing on URL-scanned content** — headings, tables, lists, and FAQ sections are now detected in plain text (no markdown required)
20
+ - **Readability score 0 for German content** — `countSentences` now handles newline-separated sentences and periods without trailing spaces
21
+ - **`slug-invalid-characters` false positive on URL paths** — slashes are now allowed when `contentSource` is `'url'`
22
+ - **`content-repetition` flagging footnotes** — reference boilerplate (Wikipedia citations, DOI, ISBN, etc.) is stripped before n-gram analysis
23
+ - **`missing-h1` false positive on URL content** — rule is skipped when `contentSource` is `'url'` (Readability strips `<h1>`, title is in metadata)
24
+
10
25
  ## [0.2.0] - 2026-03-10
11
26
 
12
27
  ### Added
package/dist/cli.cjs CHANGED
@@ -883,6 +883,68 @@ var duplicateRules = [
883
883
  duplicateDescription
884
884
  ];
885
885
 
886
// src/utils/plaintext-structure.ts
// Heuristic structure detection for plain-text content (e.g. text extracted
// from a URL by Readability) that carries no markdown formatting.
var MAX_HEADING_LENGTH = 80;
var MIN_TABLE_ROWS = 2;
/**
 * Detect heading-like lines in plain text.
 * A candidate heading is a non-empty line of at most MAX_HEADING_LENGTH
 * characters that is followed by a blank line (or the end of the text) and
 * does not end in sentence punctuation. It qualifies when it starts with an
 * uppercase letter and has <= 12 words, is ALL CAPS, or ends with "?".
 * @param {string} text - Plain-text body to scan.
 * @returns {{level: number, text: string, line: number}[]} Headings with a
 *   1-based line number; level 2 for ALL-CAPS or <= 4-word lines, else 3.
 */
function detectPlaintextHeadings(text) {
  const lines = text.split("\n");
  const headings = [];
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    if (!line || line.length > MAX_HEADING_LENGTH) continue;
    const nextLine = lines[i + 1]?.trim() ?? "";
    const isFollowedByBlank = i + 1 >= lines.length || nextLine === "";
    if (!isFollowedByBlank) continue;
    // Sentence-style trailing punctuation disqualifies a heading candidate.
    if (/[.,;:]$/.test(line)) continue;
    const isTitleCase = /^[A-ZÄÖÜ]/.test(line) && line.split(/\s+/).length <= 12;
    const isAllCaps = line === line.toUpperCase() && /[A-ZÄÖÜ]/.test(line) && line.length > 2;
    const isQuestion = line.endsWith("?");
    if (isTitleCase || isAllCaps || isQuestion) {
      const level = isAllCaps || line.split(/\s+/).length <= 4 ? 2 : 3;
      headings.push({ level, text: line, line: i + 1 });
    }
  }
  return headings;
}
/**
 * Detect a table in plain text using two signals:
 * (1) at least MIN_TABLE_ROWS tab-separated lines with a consistent column
 *     count of >= 2, or
 * (2) more than MIN_TABLE_ROWS lines whose tokens are aligned by runs of
 *     three or more spaces.
 * @param {string} text - Plain-text body to scan.
 * @returns {boolean} True when a tabular structure is detected.
 */
function detectPlaintextTable(text) {
  const lines = text.split("\n").filter((l) => l.trim().length > 0);
  // NOTE: the separator must be a literal tab ("\t"), not a single space —
  // a space would match nearly every prose line (and any two short lines
  // with equal word counts would falsely register as a table), while real
  // tab-separated rows without spaces would be missed.
  const tabLines = lines.filter((l) => l.includes("\t"));
  if (tabLines.length >= MIN_TABLE_ROWS) {
    const colCounts = tabLines.map((l) => l.split("\t").length);
    const consistent = colCounts.every(
      (c) => c === colCounts[0] && c >= 2
    );
    if (consistent) return true;
  }
  // Space-aligned columns: non-space tokens separated by 3+ spaces.
  const spaceSeparated = lines.filter((l) => /\S {3,}\S/.test(l));
  if (spaceSeparated.length >= MIN_TABLE_ROWS + 1) {
    return true;
  }
  return false;
}
/**
 * Detect a plain-text list: at least two lines starting with a bullet
 * character (•, ·, en/em dash) or an enumerator like "a)" or "12)".
 * Hyphen/asterisk bullets are intentionally excluded — those are markdown
 * and handled by the markdown list check.
 * @param {string} text - Plain-text body to scan.
 * @returns {boolean} True when two or more list-like lines are found.
 */
function detectPlaintextList(text) {
  const listPattern = /^[\s]*[•·–—]\s+|^[\s]*\w\)\s+|^[\s]*\d+\)\s+/m;
  const lines = text.split("\n").filter((l) => listPattern.test(l));
  return lines.length >= 2;
}
/**
 * Detect an FAQ-like structure: short question lines ("...?") that are each
 * followed by a longer non-empty line (the presumed answer).
 * @param {string} text - Plain-text body to scan.
 * @returns {{hasFaq: boolean, questionCount: number}} hasFaq is true when
 *   at least two answered questions are found.
 */
function detectPlaintextFaq(text) {
  const lines = text.split("\n");
  let questionCount = 0;
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    if (!line.endsWith("?")) continue;
    if (line.length > MAX_HEADING_LENGTH) continue;
    // The next non-empty line must be longer than the question to count as
    // an answer; bare question lists without answers are not an FAQ.
    const nextContent = lines.slice(i + 1).find((l) => l.trim().length > 0);
    if (nextContent && nextContent.trim().length > line.length) {
      questionCount++;
    }
  }
  return {
    hasFaq: questionCount >= 2,
    questionCount
  };
}
886
948
  // src/utils/heading-extractor.ts
887
949
  function isInCodeBlock(lines, lineIndex) {
888
950
  let inCodeBlock = false;
@@ -894,7 +956,7 @@ function isInCodeBlock(lines, lineIndex) {
894
956
  }
895
957
  return inCodeBlock;
896
958
  }
897
- function extractHeadings(mdxBody) {
959
+ function extractHeadings(mdxBody, contentSource) {
898
960
  const headings = [];
899
961
  const lines = mdxBody.split("\n");
900
962
  const headingRegex = /^(#{1,6})\s+(.+)$/;
@@ -911,6 +973,9 @@ function extractHeadings(mdxBody) {
911
973
  });
912
974
  }
913
975
  }
976
+ if (headings.length === 0 && contentSource === "url") {
977
+ return detectPlaintextHeadings(mdxBody);
978
+ }
914
979
  return headings;
915
980
  }
916
981
  function countH1s(headings) {
@@ -942,6 +1007,9 @@ var missingH1 = {
942
1007
  category: "seo",
943
1008
  fixStrategy: "Add an H1 heading (# Heading) at the start of the content",
944
1009
  run: (item) => {
1010
+ if (item.contentSource === "url") {
1011
+ return [];
1012
+ }
945
1013
  if (item.contentType === "blog") {
946
1014
  return [];
947
1015
  }
@@ -1228,8 +1296,16 @@ function countWords(text) {
1228
1296
  }
1229
1297
/**
 * Count sentences in markdown text.
 * Primary signal: runs of terminal punctuation (. ! ?) followed by
 * whitespace, end of text, or an immediately-following capital letter
 * (handles periods without a trailing space). Falls back to counting
 * newline-separated chunks of substantial length, then to "at least one
 * sentence" when any real word is present.
 * @param {string} text - Markdown body.
 * @returns {number} Estimated sentence count (0 for empty/word-free text).
 */
function countSentences(text) {
  const plain = stripMarkdown(text);
  const terminators = plain.match(/[.!?]+(?:\s|$|(?=[A-ZÄÖÜ]))/g);
  if (terminators !== null && terminators.length > 0) {
    return terminators.length;
  }
  // No punctuation found: treat each sufficiently long newline-separated
  // segment as a sentence (common in extracted/German content).
  const substantialSegments = plain
    .split(/\n+/)
    .filter((segment) => segment.trim().length > 20);
  if (substantialSegments.length > 1) {
    return substantialSegments.length;
  }
  // Last resort: any word of 2+ characters means at least one sentence.
  return /\w{2,}/.test(plain) ? 1 : 0;
}
1234
1310
 
1235
1311
  // src/utils/readability.ts
@@ -1511,6 +1587,7 @@ var robotsRules = [
1511
1587
  // src/rules/slug-rules.ts
1512
1588
  var SLUG_DEFAULTS = { maxLength: 75 };
1513
1589
  var SLUG_PATTERN = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
1590
+ var URL_PATH_PATTERN = /^[a-z0-9]+(?:[-/][a-z0-9]+)*$/;
1514
1591
  var slugInvalidCharacters = {
1515
1592
  name: "slug-invalid-characters",
1516
1593
  severity: "error",
@@ -1518,8 +1595,10 @@ var slugInvalidCharacters = {
1518
1595
  fixStrategy: 'Use lowercase alphanumeric characters with hyphens only (e.g., "my-blog-post")',
1519
1596
  run: (item) => {
1520
1597
  if (!item.slug) return [];
1598
+ const isUrl = item.contentSource === "url";
1599
+ const pattern = isUrl ? URL_PATH_PATTERN : SLUG_PATTERN;
1521
1600
  const hasUppercase = /[A-Z]/.test(item.slug);
1522
- const matchesPattern = SLUG_PATTERN.test(item.slug);
1601
+ const matchesPattern = pattern.test(item.slug);
1523
1602
  if (hasUppercase || !matchesPattern) {
1524
1603
  return [{
1525
1604
  file: getDisplayPath(item),
@@ -1527,7 +1606,7 @@ var slugInvalidCharacters = {
1527
1606
  rule: "slug-invalid-characters",
1528
1607
  severity: "error",
1529
1608
  message: `Slug "${item.slug}" contains invalid characters`,
1530
- suggestion: 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1609
+ suggestion: isUrl ? "URL paths must be lowercase alphanumeric with hyphens and slashes only" : 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1531
1610
  }];
1532
1611
  }
1533
1612
  return [];
@@ -1792,8 +1871,8 @@ var WEAK_LEAD_STARTS = [
1792
1871
  "schauen wir uns"
1793
1872
  ];
1794
1873
  var TABLE_SEPARATOR_PATTERN = /\|\s*:?-{2,}/;
1795
- function countQuestionHeadings(body) {
1796
- const headings = extractHeadings(body);
1874
+ function countQuestionHeadings(body, contentSource) {
1875
+ const headings = extractHeadings(body, contentSource);
1797
1876
  let count = 0;
1798
1877
  for (const heading of headings) {
1799
1878
  const text = heading.text.trim();
@@ -1855,12 +1934,20 @@ function countStatistics(body) {
1855
1934
  }
1856
1935
  return matches.size;
1857
1936
  }
1858
/**
 * Check whether the body contains an FAQ section.
 * Looks for an H2/H3 markdown heading titled FAQ (English or German
 * variants); for URL-extracted content, falls back to plain-text FAQ
 * heuristics since Readability output carries no markdown headings.
 * @param {string} body - Content body.
 * @param {string|undefined} contentSource - "url" for URL-extracted content.
 * @returns {boolean} True when an FAQ section is detected.
 */
function hasFAQSection(body, contentSource) {
  const faqHeading = /#{2,3}\s*(FAQ|Häufige Fragen|Frequently Asked|Fragen und Antworten)/i;
  if (faqHeading.test(body)) {
    return true;
  }
  return contentSource === "url" ? detectPlaintextFaq(body).hasFaq : false;
}
1862
/**
 * Check whether the body contains a table.
 * A markdown pipe-table separator row (e.g. "| --- |") is the primary
 * signal; for URL-extracted content, falls back to plain-text table
 * heuristics because Readability output loses markdown table syntax.
 * @param {string} body - Content body.
 * @param {string|undefined} contentSource - "url" for URL-extracted content.
 * @returns {boolean} True when a table is detected.
 */
function hasMarkdownTable(body, contentSource) {
  if (TABLE_SEPARATOR_PATTERN.test(body)) {
    return true;
  }
  return contentSource === "url" ? detectPlaintextTable(body) : false;
}
1865
1952
  function countEntityMentions(body, entity) {
1866
1953
  const escapedEntity = entity.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
@@ -2022,7 +2109,7 @@ function getParagraphs(body) {
2022
2109
  }
2023
2110
  return paragraphs;
2024
2111
  }
2025
- function hasMarkdownList(body) {
2112
+ function hasMarkdownList(body, contentSource) {
2026
2113
  const lines = body.split("\n");
2027
2114
  let inCodeBlock = false;
2028
2115
  for (const line of lines) {
@@ -2035,6 +2122,9 @@ function hasMarkdownList(body) {
2035
2122
  if (/^[-*]\s+/.test(trimmed)) return true;
2036
2123
  if (/^\d+\.\s+/.test(trimmed)) return true;
2037
2124
  }
2125
+ if (contentSource === "url") {
2126
+ return detectPlaintextList(body);
2127
+ }
2038
2128
  return false;
2039
2129
  }
2040
2130
  function countInternalLinks(body) {
@@ -13766,8 +13856,27 @@ function jaccardSimilarity(a, b) {
13766
13856
  const union = a.size + b.size - intersection;
13767
13857
  return union > 0 ? intersection / union : 0;
13768
13858
  }
13859
// Citation/reference boilerplate that should not feed the n-gram repetition
// analysis (Wikipedia footnotes, DOI/ISBN/PMID identifiers, cite templates).
var REFERENCE_PATTERNS = [
  /archived from the original on/gi,
  /retrieved (?:on )?\d/gi,
  /accessed (?:on )?\d/gi,
  /cite (?:web|book|journal|news)/gi,
  /\^\s*\[?\d+\]?/g,
  /isbn \d/gi,
  /doi:\s*\d/gi,
  /pmid:\s*\d/gi
];
/**
 * Remove reference boilerplate from text before repetition analysis.
 * Strips each REFERENCE_PATTERNS match, then truncates everything from a
 * trailing references/sources/bibliography section heading onward.
 * @param {string} text - Raw content body.
 * @returns {string} Text with citation boilerplate removed.
 */
function stripReferenceBoilerplate(text) {
  const withoutCitations = REFERENCE_PATTERNS.reduce(
    (acc, pattern) => acc.replace(pattern, ""),
    text
  );
  // Drop a terminal references section (English and German headings).
  return withoutCitations.replace(
    /\n(?:references|sources|bibliography|einzelnachweise|weblinks)\n[\s\S]*$/i,
    ""
  );
}
13769
13877
  function analyzeRepetition(body) {
13770
- const plain = stripMarkdown(body).toLowerCase();
13878
+ const cleaned = stripReferenceBoilerplate(body);
13879
+ const plain = stripMarkdown(cleaned).toLowerCase();
13771
13880
  const words = plain.replace(/[^\p{L}\p{N}\s]/gu, " ").split(/\s+/).filter((w) => w.length > 0);
13772
13881
  const fiveGrams = extractNgrams(words, 5);
13773
13882
  const phraseCounts = /* @__PURE__ */ new Map();
@@ -13776,7 +13885,7 @@ function analyzeRepetition(body) {
13776
13885
  }
13777
13886
  const repeatedPhrases = [...phraseCounts.entries()].filter(([, count]) => count >= 3).sort((a, b) => b[1] - a[1]);
13778
13887
  const topRepeatedPhrases = repeatedPhrases.slice(0, 5).map(([phrase, count]) => ({ phrase, count }));
13779
- const paragraphs = body.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13888
+ const paragraphs = cleaned.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13780
13889
  let totalSimilarity = 0;
13781
13890
  let pairCount = 0;
13782
13891
  for (let i = 0; i < paragraphs.length; i++) {
@@ -14082,10 +14191,10 @@ var geoNoQuestionHeadings = {
14082
14191
  if (!geoTypes.includes(item.contentType)) return [];
14083
14192
  const wordCount = countWords(item.body);
14084
14193
  if (wordCount < GEO_MIN_WORDS) return [];
14085
- const headings = extractHeadings(item.body);
14194
+ const headings = extractHeadings(item.body, item.contentSource);
14086
14195
  const subHeadings = headings.filter((h) => h.level === 2 || h.level === 3);
14087
14196
  if (subHeadings.length === 0) return [];
14088
- const questionCount = countQuestionHeadings(item.body);
14197
+ const questionCount = countQuestionHeadings(item.body, item.contentSource);
14089
14198
  const ratio = questionCount / subHeadings.length;
14090
14199
  if (ratio < QUESTION_HEADING_THRESHOLD) {
14091
14200
  return [{
@@ -14175,7 +14284,7 @@ var geoMissingFaqSection = {
14175
14284
  if (!geoTypes.includes(item.contentType)) return [];
14176
14285
  const wordCount = countWords(item.body);
14177
14286
  if (wordCount < FAQ_MIN_WORDS) return [];
14178
- if (!hasFAQSection(item.body)) {
14287
+ if (!hasFAQSection(item.body, item.contentSource)) {
14179
14288
  return [{
14180
14289
  file: getDisplayPath(item),
14181
14290
  field: "body",
@@ -14220,7 +14329,7 @@ var geoMissingTable = {
14220
14329
  if (!geoTypes.includes(item.contentType)) return [];
14221
14330
  const wordCount = countWords(item.body);
14222
14331
  if (wordCount < TABLE_MIN_WORDS) return [];
14223
- if (!hasMarkdownTable(item.body)) {
14332
+ if (!hasMarkdownTable(item.body, item.contentSource)) {
14224
14333
  return [{
14225
14334
  file: getDisplayPath(item),
14226
14335
  field: "body",
@@ -14868,7 +14977,7 @@ var geoMissingLists = {
14868
14977
  if (!geoTypes.includes(item.contentType)) return [];
14869
14978
  const wordCount = countWords(item.body);
14870
14979
  if (wordCount < STRUCTURE_MIN_WORDS) return [];
14871
- if (!hasMarkdownList(item.body)) {
14980
+ if (!hasMarkdownList(item.body, item.contentSource)) {
14872
14981
  return [{
14873
14982
  file: getDisplayPath(item),
14874
14983
  field: "body",