@ijonis/geo-lint 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.2.1] - 2026-03-19
11
+
12
+ ### Added
13
+ - `contentSource` field on `ContentItem` — allows rules to distinguish between file-based MDX and URL-extracted content
14
+ - Plain-text structure detection (`src/utils/plaintext-structure.ts`) — heuristic heading, table, list, and FAQ detection for content without markdown formatting
15
+ - GEO rules now fall back to plain-text structure detection when `contentSource` is `'url'`, fixing false 100/100 GEO scores on Readability-extracted content
16
+ - Integration tests for URL scanner compatibility
17
+
18
+ ### Fixed
19
+ - **GEO rules never firing on URL-scanned content** — headings, tables, lists, and FAQ sections are now detected in plain text (no markdown required)
20
+ - **Readability score 0 for German content** — `countSentences` now handles newline-separated sentences and periods without trailing spaces
21
+ - **`slug-invalid-characters` false positive on URL paths** — slashes are now allowed when `contentSource` is `'url'`
22
+ - **`content-repetition` flagging footnotes** — reference boilerplate (Wikipedia citations, DOI, ISBN, etc.) is stripped before n-gram analysis
23
+ - **`missing-h1` false positive on URL content** — rule is skipped when `contentSource` is `'url'` (Readability strips `<h1>`, title is in metadata)
24
+
10
25
  ## [0.2.0] - 2026-03-10
11
26
 
12
27
  ### Added
package/dist/cli.cjs CHANGED
@@ -883,6 +883,68 @@ var duplicateRules = [
883
883
  duplicateDescription
884
884
  ];
885
885
 
886
// src/utils/plaintext-structure.ts
// Heuristic structure detection for plain-text content (e.g. text extracted
// from a URL by Readability) that carries no markdown formatting.
var MAX_HEADING_LENGTH = 80;
var MIN_TABLE_ROWS = 2;
/**
 * Detect heading-like lines in plain text.
 * A candidate heading is a non-empty line of at most MAX_HEADING_LENGTH
 * characters that is followed by a blank line (or the end of the text) and
 * does not end in sentence punctuation. It qualifies when it starts with an
 * uppercase letter and has <= 12 words, is ALL CAPS, or ends with "?".
 * @param {string} text - Plain-text body to scan.
 * @returns {{level: number, text: string, line: number}[]} Headings with a
 *   1-based line number; level 2 for ALL-CAPS or <= 4-word lines, else 3.
 */
function detectPlaintextHeadings(text) {
  const lines = text.split("\n");
  const headings = [];
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    if (!line || line.length > MAX_HEADING_LENGTH) continue;
    const nextLine = lines[i + 1]?.trim() ?? "";
    const isFollowedByBlank = i + 1 >= lines.length || nextLine === "";
    if (!isFollowedByBlank) continue;
    // Sentence-style trailing punctuation disqualifies a heading candidate.
    if (/[.,;:]$/.test(line)) continue;
    const isTitleCase = /^[A-ZÄÖÜ]/.test(line) && line.split(/\s+/).length <= 12;
    const isAllCaps = line === line.toUpperCase() && /[A-ZÄÖÜ]/.test(line) && line.length > 2;
    const isQuestion = line.endsWith("?");
    if (isTitleCase || isAllCaps || isQuestion) {
      const level = isAllCaps || line.split(/\s+/).length <= 4 ? 2 : 3;
      headings.push({ level, text: line, line: i + 1 });
    }
  }
  return headings;
}
/**
 * Detect a table in plain text using two signals:
 * (1) at least MIN_TABLE_ROWS tab-separated lines with a consistent column
 *     count of >= 2, or
 * (2) more than MIN_TABLE_ROWS lines whose tokens are aligned by runs of
 *     three or more spaces.
 * @param {string} text - Plain-text body to scan.
 * @returns {boolean} True when a tabular structure is detected.
 */
function detectPlaintextTable(text) {
  const lines = text.split("\n").filter((l) => l.trim().length > 0);
  // NOTE: the separator must be a literal tab ("\t"), not a single space —
  // a space would match nearly every prose line (and any two short lines
  // with equal word counts would falsely register as a table), while real
  // tab-separated rows without spaces would be missed.
  const tabLines = lines.filter((l) => l.includes("\t"));
  if (tabLines.length >= MIN_TABLE_ROWS) {
    const colCounts = tabLines.map((l) => l.split("\t").length);
    const consistent = colCounts.every(
      (c) => c === colCounts[0] && c >= 2
    );
    if (consistent) return true;
  }
  // Space-aligned columns: non-space tokens separated by 3+ spaces.
  const spaceSeparated = lines.filter((l) => /\S {3,}\S/.test(l));
  if (spaceSeparated.length >= MIN_TABLE_ROWS + 1) {
    return true;
  }
  return false;
}
/**
 * Detect a plain-text list: at least two lines starting with a bullet
 * character (•, ·, en/em dash) or an enumerator like "a)" or "12)".
 * Hyphen/asterisk bullets are intentionally excluded — those are markdown
 * and handled by the markdown list check.
 * @param {string} text - Plain-text body to scan.
 * @returns {boolean} True when two or more list-like lines are found.
 */
function detectPlaintextList(text) {
  const listPattern = /^[\s]*[•·–—]\s+|^[\s]*\w\)\s+|^[\s]*\d+\)\s+/m;
  const lines = text.split("\n").filter((l) => listPattern.test(l));
  return lines.length >= 2;
}
/**
 * Detect an FAQ-like structure: short question lines ("...?") that are each
 * followed by a longer non-empty line (the presumed answer).
 * @param {string} text - Plain-text body to scan.
 * @returns {{hasFaq: boolean, questionCount: number}} hasFaq is true when
 *   at least two answered questions are found.
 */
function detectPlaintextFaq(text) {
  const lines = text.split("\n");
  let questionCount = 0;
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    if (!line.endsWith("?")) continue;
    if (line.length > MAX_HEADING_LENGTH) continue;
    // The next non-empty line must be longer than the question to count as
    // an answer; bare question lists without answers are not an FAQ.
    const nextContent = lines.slice(i + 1).find((l) => l.trim().length > 0);
    if (nextContent && nextContent.trim().length > line.length) {
      questionCount++;
    }
  }
  return {
    hasFaq: questionCount >= 2,
    questionCount
  };
}
886
948
  // src/utils/heading-extractor.ts
887
949
  function isInCodeBlock(lines, lineIndex) {
888
950
  let inCodeBlock = false;
@@ -894,7 +956,7 @@ function isInCodeBlock(lines, lineIndex) {
894
956
  }
895
957
  return inCodeBlock;
896
958
  }
897
- function extractHeadings(mdxBody) {
959
+ function extractHeadings(mdxBody, contentSource) {
898
960
  const headings = [];
899
961
  const lines = mdxBody.split("\n");
900
962
  const headingRegex = /^(#{1,6})\s+(.+)$/;
@@ -911,6 +973,9 @@ function extractHeadings(mdxBody) {
911
973
  });
912
974
  }
913
975
  }
976
+ if (headings.length === 0 && contentSource === "url") {
977
+ return detectPlaintextHeadings(mdxBody);
978
+ }
914
979
  return headings;
915
980
  }
916
981
  function countH1s(headings) {
@@ -942,6 +1007,9 @@ var missingH1 = {
942
1007
  category: "seo",
943
1008
  fixStrategy: "Add an H1 heading (# Heading) at the start of the content",
944
1009
  run: (item) => {
1010
+ if (item.contentSource === "url") {
1011
+ return [];
1012
+ }
945
1013
  if (item.contentType === "blog") {
946
1014
  return [];
947
1015
  }
@@ -1228,8 +1296,16 @@ function countWords(text) {
1228
1296
  }
1229
1297
/**
 * Count sentences in markdown text.
 * Primary signal: runs of terminal punctuation (. ! ?) followed by
 * whitespace, end of text, or an immediately-following capital letter
 * (handles periods without a trailing space). Falls back to counting
 * newline-separated chunks of substantial length, then to "at least one
 * sentence" when any real word is present.
 * @param {string} text - Markdown body.
 * @returns {number} Estimated sentence count (0 for empty/word-free text).
 */
function countSentences(text) {
  const plain = stripMarkdown(text);
  const terminators = plain.match(/[.!?]+(?:\s|$|(?=[A-ZÄÖÜ]))/g);
  if (terminators !== null && terminators.length > 0) {
    return terminators.length;
  }
  // No punctuation found: treat each sufficiently long newline-separated
  // segment as a sentence (common in extracted/German content).
  const substantialSegments = plain
    .split(/\n+/)
    .filter((segment) => segment.trim().length > 20);
  if (substantialSegments.length > 1) {
    return substantialSegments.length;
  }
  // Last resort: any word of 2+ characters means at least one sentence.
  return /\w{2,}/.test(plain) ? 1 : 0;
}
1234
1310
 
1235
1311
  // src/utils/readability.ts
@@ -1511,6 +1587,7 @@ var robotsRules = [
1511
1587
  // src/rules/slug-rules.ts
1512
1588
  var SLUG_DEFAULTS = { maxLength: 75 };
1513
1589
  var SLUG_PATTERN = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
1590
+ var URL_PATH_PATTERN = /^[a-z0-9]+(?:[-/][a-z0-9]+)*$/;
1514
1591
  var slugInvalidCharacters = {
1515
1592
  name: "slug-invalid-characters",
1516
1593
  severity: "error",
@@ -1518,8 +1595,10 @@ var slugInvalidCharacters = {
1518
1595
  fixStrategy: 'Use lowercase alphanumeric characters with hyphens only (e.g., "my-blog-post")',
1519
1596
  run: (item) => {
1520
1597
  if (!item.slug) return [];
1598
+ const isUrl = item.contentSource === "url";
1599
+ const pattern = isUrl ? URL_PATH_PATTERN : SLUG_PATTERN;
1521
1600
  const hasUppercase = /[A-Z]/.test(item.slug);
1522
- const matchesPattern = SLUG_PATTERN.test(item.slug);
1601
+ const matchesPattern = pattern.test(item.slug);
1523
1602
  if (hasUppercase || !matchesPattern) {
1524
1603
  return [{
1525
1604
  file: getDisplayPath(item),
@@ -1527,7 +1606,7 @@ var slugInvalidCharacters = {
1527
1606
  rule: "slug-invalid-characters",
1528
1607
  severity: "error",
1529
1608
  message: `Slug "${item.slug}" contains invalid characters`,
1530
- suggestion: 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1609
+ suggestion: isUrl ? "URL paths must be lowercase alphanumeric with hyphens and slashes only" : 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1531
1610
  }];
1532
1611
  }
1533
1612
  return [];
@@ -1792,8 +1871,8 @@ var WEAK_LEAD_STARTS = [
1792
1871
  "schauen wir uns"
1793
1872
  ];
1794
1873
  var TABLE_SEPARATOR_PATTERN = /\|\s*:?-{2,}/;
1795
- function countQuestionHeadings(body) {
1796
- const headings = extractHeadings(body);
1874
+ function countQuestionHeadings(body, contentSource) {
1875
+ const headings = extractHeadings(body, contentSource);
1797
1876
  let count = 0;
1798
1877
  for (const heading of headings) {
1799
1878
  const text = heading.text.trim();
@@ -1855,12 +1934,20 @@ function countStatistics(body) {
1855
1934
  }
1856
1935
  return matches.size;
1857
1936
  }
1858
/**
 * Check whether the body contains an FAQ section.
 * Looks for an H2/H3 markdown heading titled FAQ (English or German
 * variants); for URL-extracted content, falls back to plain-text FAQ
 * heuristics since Readability output carries no markdown headings.
 * @param {string} body - Content body.
 * @param {string|undefined} contentSource - "url" for URL-extracted content.
 * @returns {boolean} True when an FAQ section is detected.
 */
function hasFAQSection(body, contentSource) {
  const faqHeading = /#{2,3}\s*(FAQ|Häufige Fragen|Frequently Asked|Fragen und Antworten)/i;
  if (faqHeading.test(body)) {
    return true;
  }
  return contentSource === "url" ? detectPlaintextFaq(body).hasFaq : false;
}
1862
/**
 * Check whether the body contains a table.
 * A markdown pipe-table separator row (e.g. "| --- |") is the primary
 * signal; for URL-extracted content, falls back to plain-text table
 * heuristics because Readability output loses markdown table syntax.
 * @param {string} body - Content body.
 * @param {string|undefined} contentSource - "url" for URL-extracted content.
 * @returns {boolean} True when a table is detected.
 */
function hasMarkdownTable(body, contentSource) {
  if (TABLE_SEPARATOR_PATTERN.test(body)) {
    return true;
  }
  return contentSource === "url" ? detectPlaintextTable(body) : false;
}
1865
1952
  function countEntityMentions(body, entity) {
1866
1953
  const escapedEntity = entity.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
@@ -2022,7 +2109,7 @@ function getParagraphs(body) {
2022
2109
  }
2023
2110
  return paragraphs;
2024
2111
  }
2025
- function hasMarkdownList(body) {
2112
+ function hasMarkdownList(body, contentSource) {
2026
2113
  const lines = body.split("\n");
2027
2114
  let inCodeBlock = false;
2028
2115
  for (const line of lines) {
@@ -2035,6 +2122,9 @@ function hasMarkdownList(body) {
2035
2122
  if (/^[-*]\s+/.test(trimmed)) return true;
2036
2123
  if (/^\d+\.\s+/.test(trimmed)) return true;
2037
2124
  }
2125
+ if (contentSource === "url") {
2126
+ return detectPlaintextList(body);
2127
+ }
2038
2128
  return false;
2039
2129
  }
2040
2130
  function countInternalLinks(body) {
@@ -13766,8 +13856,27 @@ function jaccardSimilarity(a, b) {
13766
13856
  const union = a.size + b.size - intersection;
13767
13857
  return union > 0 ? intersection / union : 0;
13768
13858
  }
13859
// Citation/reference boilerplate that should not feed the n-gram repetition
// analysis (Wikipedia footnotes, DOI/ISBN/PMID identifiers, cite templates).
var REFERENCE_PATTERNS = [
  /archived from the original on/gi,
  /retrieved (?:on )?\d/gi,
  /accessed (?:on )?\d/gi,
  /cite (?:web|book|journal|news)/gi,
  /\^\s*\[?\d+\]?/g,
  /isbn \d/gi,
  /doi:\s*\d/gi,
  /pmid:\s*\d/gi
];
/**
 * Remove reference boilerplate from text before repetition analysis.
 * Strips each REFERENCE_PATTERNS match, then truncates everything from a
 * trailing references/sources/bibliography section heading onward.
 * @param {string} text - Raw content body.
 * @returns {string} Text with citation boilerplate removed.
 */
function stripReferenceBoilerplate(text) {
  const withoutCitations = REFERENCE_PATTERNS.reduce(
    (acc, pattern) => acc.replace(pattern, ""),
    text
  );
  // Drop a terminal references section (English and German headings).
  return withoutCitations.replace(
    /\n(?:references|sources|bibliography|einzelnachweise|weblinks)\n[\s\S]*$/i,
    ""
  );
}
13769
13877
  function analyzeRepetition(body) {
13770
- const plain = stripMarkdown(body).toLowerCase();
13878
+ const cleaned = stripReferenceBoilerplate(body);
13879
+ const plain = stripMarkdown(cleaned).toLowerCase();
13771
13880
  const words = plain.replace(/[^\p{L}\p{N}\s]/gu, " ").split(/\s+/).filter((w) => w.length > 0);
13772
13881
  const fiveGrams = extractNgrams(words, 5);
13773
13882
  const phraseCounts = /* @__PURE__ */ new Map();
@@ -13776,7 +13885,7 @@ function analyzeRepetition(body) {
13776
13885
  }
13777
13886
  const repeatedPhrases = [...phraseCounts.entries()].filter(([, count]) => count >= 3).sort((a, b) => b[1] - a[1]);
13778
13887
  const topRepeatedPhrases = repeatedPhrases.slice(0, 5).map(([phrase, count]) => ({ phrase, count }));
13779
- const paragraphs = body.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13888
+ const paragraphs = cleaned.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13780
13889
  let totalSimilarity = 0;
13781
13890
  let pairCount = 0;
13782
13891
  for (let i = 0; i < paragraphs.length; i++) {
@@ -14082,10 +14191,10 @@ var geoNoQuestionHeadings = {
14082
14191
  if (!geoTypes.includes(item.contentType)) return [];
14083
14192
  const wordCount = countWords(item.body);
14084
14193
  if (wordCount < GEO_MIN_WORDS) return [];
14085
- const headings = extractHeadings(item.body);
14194
+ const headings = extractHeadings(item.body, item.contentSource);
14086
14195
  const subHeadings = headings.filter((h) => h.level === 2 || h.level === 3);
14087
14196
  if (subHeadings.length === 0) return [];
14088
- const questionCount = countQuestionHeadings(item.body);
14197
+ const questionCount = countQuestionHeadings(item.body, item.contentSource);
14089
14198
  const ratio = questionCount / subHeadings.length;
14090
14199
  if (ratio < QUESTION_HEADING_THRESHOLD) {
14091
14200
  return [{
@@ -14175,7 +14284,7 @@ var geoMissingFaqSection = {
14175
14284
  if (!geoTypes.includes(item.contentType)) return [];
14176
14285
  const wordCount = countWords(item.body);
14177
14286
  if (wordCount < FAQ_MIN_WORDS) return [];
14178
- if (!hasFAQSection(item.body)) {
14287
+ if (!hasFAQSection(item.body, item.contentSource)) {
14179
14288
  return [{
14180
14289
  file: getDisplayPath(item),
14181
14290
  field: "body",
@@ -14220,7 +14329,7 @@ var geoMissingTable = {
14220
14329
  if (!geoTypes.includes(item.contentType)) return [];
14221
14330
  const wordCount = countWords(item.body);
14222
14331
  if (wordCount < TABLE_MIN_WORDS) return [];
14223
- if (!hasMarkdownTable(item.body)) {
14332
+ if (!hasMarkdownTable(item.body, item.contentSource)) {
14224
14333
  return [{
14225
14334
  file: getDisplayPath(item),
14226
14335
  field: "body",
@@ -14868,7 +14977,7 @@ var geoMissingLists = {
14868
14977
  if (!geoTypes.includes(item.contentType)) return [];
14869
14978
  const wordCount = countWords(item.body);
14870
14979
  if (wordCount < STRUCTURE_MIN_WORDS) return [];
14871
- if (!hasMarkdownList(item.body)) {
14980
+ if (!hasMarkdownList(item.body, item.contentSource)) {
14872
14981
  return [{
14873
14982
  file: getDisplayPath(item),
14874
14983
  field: "body",