@ijonis/geo-lint 0.2.0 → 0.2.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects those versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -51,6 +51,8 @@ interface ContentItem {
51
51
  rawContent: string;
52
52
  /** Body content without frontmatter */
53
53
  body: string;
54
+ /** How content was acquired: 'file' (MDX on disk) or 'url' (extracted via Readability) */
55
+ contentSource?: 'file' | 'url';
54
56
  }
55
57
  /**
56
58
  * Context passed to rules for cross-content validation
package/dist/index.d.ts CHANGED
@@ -51,6 +51,8 @@ interface ContentItem {
51
51
  rawContent: string;
52
52
  /** Body content without frontmatter */
53
53
  body: string;
54
+ /** How content was acquired: 'file' (MDX on disk) or 'url' (extracted via Readability) */
55
+ contentSource?: 'file' | 'url';
54
56
  }
55
57
  /**
56
58
  * Context passed to rules for cross-content validation
package/dist/index.js CHANGED
@@ -854,6 +854,68 @@ var duplicateRules = [
854
854
  duplicateDescription
855
855
  ];
856
856
 
857
+ // src/utils/plaintext-structure.ts
858
/** Longest trimmed line, in characters, that may still be treated as a heading or FAQ question. */
var MAX_HEADING_LENGTH = 80;
/** Row threshold for plain-text table detection: tab-separated rows need at least this many, space-aligned rows one more. */
var MIN_TABLE_ROWS = 2;
/**
 * Heuristically finds heading-like lines in plain text that carries no markdown markup.
 *
 * A trimmed line qualifies when it is non-empty, at most MAX_HEADING_LENGTH
 * characters long, does not end in `.`, `,`, `;` or `:`, is followed by a blank
 * line (or is the last line), and is at least one of:
 *   - capitalised (A-Z, Ä, Ö, Ü) with at most 12 words,
 *   - written entirely in upper case (more than 2 characters, at least one letter),
 *   - terminated by a question mark.
 *
 * @param text Plain text to scan.
 * @returns Detected headings in document order as `{ level, text, line }`;
 *   `line` is 1-based, `level` is 2 for all-caps or short (<= 4 words) lines and 3 otherwise.
 */
function detectPlaintextHeadings(text) {
  const rows = text.split("\n");
  const found = [];
  rows.forEach((raw, index) => {
    const candidate = raw.trim();
    if (candidate.length === 0 || candidate.length > MAX_HEADING_LENGTH) return;
    // A missing next line (end of text) is treated like a blank one.
    const following = index + 1 < rows.length ? rows[index + 1].trim() : "";
    if (following !== "") return;
    // Trailing sentence punctuation marks body text, not a heading.
    if (/[.,;:]$/.test(candidate)) return;
    const wordCount = candidate.split(/\s+/).length;
    const startsUpper = /^[A-ZÄÖÜ]/.test(candidate) && wordCount <= 12;
    const shouting = candidate === candidate.toUpperCase() && /[A-ZÄÖÜ]/.test(candidate) && candidate.length > 2;
    const asksQuestion = candidate.endsWith("?");
    if (!(startsUpper || shouting || asksQuestion)) return;
    found.push({
      level: shouting || wordCount <= 4 ? 2 : 3,
      text: candidate,
      line: index + 1
    });
  });
  return found;
}
880
+ function detectPlaintextTable(text) {
881
+ const lines = text.split("\n").filter((l) => l.trim().length > 0);
882
+ const tabLines = lines.filter((l) => l.includes(" "));
883
+ if (tabLines.length >= MIN_TABLE_ROWS) {
884
+ const colCounts = tabLines.map((l) => l.split(" ").length);
885
+ const consistent = colCounts.every(
886
+ (c) => c === colCounts[0] && c >= 2
887
+ );
888
+ if (consistent) return true;
889
+ }
890
+ const spaceSeparated = lines.filter((l) => /\S {3,}\S/.test(l));
891
+ if (spaceSeparated.length >= MIN_TABLE_ROWS + 1) {
892
+ return true;
893
+ }
894
+ return false;
895
+ }
896
/**
 * Heuristically decides whether plain text contains a list.
 *
 * A line counts as a list item when, after optional leading whitespace, it
 * starts with a bullet or dash glyph (`•`, `·`, `–`, `—`), a single word
 * character followed by `)`, or a number followed by `)`, and that marker is
 * followed by whitespace.
 *
 * @param text Plain text to scan.
 * @returns `true` when at least two lines look like list items.
 */
function detectPlaintextList(text) {
  const itemMarker = /^[\s]*[•·–—]\s+|^[\s]*\w\)\s+|^[\s]*\d+\)\s+/m;
  let itemCount = 0;
  for (const row of text.split("\n")) {
    if (itemMarker.test(row)) {
      itemCount += 1;
    }
  }
  return itemCount >= 2;
}
901
+ function detectPlaintextFaq(text) {
902
+ const lines = text.split("\n");
903
+ let questionCount = 0;
904
+ for (let i = 0; i < lines.length; i++) {
905
+ const line = lines[i].trim();
906
+ if (!line.endsWith("?")) continue;
907
+ if (line.length > MAX_HEADING_LENGTH) continue;
908
+ const nextContent = lines.slice(i + 1).find((l) => l.trim().length > 0);
909
+ if (nextContent && nextContent.trim().length > line.length) {
910
+ questionCount++;
911
+ }
912
+ }
913
+ return {
914
+ hasFaq: questionCount >= 2,
915
+ questionCount
916
+ };
917
+ }
918
+
857
919
  // src/utils/heading-extractor.ts
858
920
  function isInCodeBlock(lines, lineIndex) {
859
921
  let inCodeBlock = false;
@@ -865,7 +927,7 @@ function isInCodeBlock(lines, lineIndex) {
865
927
  }
866
928
  return inCodeBlock;
867
929
  }
868
- function extractHeadings(mdxBody) {
930
+ function extractHeadings(mdxBody, contentSource) {
869
931
  const headings = [];
870
932
  const lines = mdxBody.split("\n");
871
933
  const headingRegex = /^(#{1,6})\s+(.+)$/;
@@ -882,6 +944,9 @@ function extractHeadings(mdxBody) {
882
944
  });
883
945
  }
884
946
  }
947
+ if (headings.length === 0 && contentSource === "url") {
948
+ return detectPlaintextHeadings(mdxBody);
949
+ }
885
950
  return headings;
886
951
  }
887
952
  function countH1s(headings) {
@@ -913,6 +978,9 @@ var missingH1 = {
913
978
  category: "seo",
914
979
  fixStrategy: "Add an H1 heading (# Heading) at the start of the content",
915
980
  run: (item) => {
981
+ if (item.contentSource === "url") {
982
+ return [];
983
+ }
916
984
  if (item.contentType === "blog") {
917
985
  return [];
918
986
  }
@@ -1199,8 +1267,16 @@ function countWords(text) {
1199
1267
  }
1200
1268
  function countSentences(text) {
1201
1269
  const stripped = stripMarkdown(text);
1202
- const sentences = stripped.match(/[.!?]+(?:\s|$)/g);
1203
- return sentences ? sentences.length : 0;
1270
+ const sentenceEndings = stripped.match(/[.!?]+(?:\s|$|(?=[A-ZÄÖÜ]))/g);
1271
+ if (sentenceEndings && sentenceEndings.length > 0) {
1272
+ return sentenceEndings.length;
1273
+ }
1274
+ const lines = stripped.split(/\n+/).filter((l) => l.trim().length > 20);
1275
+ if (lines.length > 1) {
1276
+ return lines.length;
1277
+ }
1278
+ const hasWords = /\w{2,}/.test(stripped);
1279
+ return hasWords ? 1 : 0;
1204
1280
  }
1205
1281
 
1206
1282
  // src/utils/readability.ts
@@ -1482,6 +1558,7 @@ var robotsRules = [
1482
1558
  // src/rules/slug-rules.ts
1483
1559
  var SLUG_DEFAULTS = { maxLength: 75 };
1484
1560
  var SLUG_PATTERN = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
1561
+ var URL_PATH_PATTERN = /^[a-z0-9]+(?:[-/][a-z0-9]+)*$/;
1485
1562
  var slugInvalidCharacters = {
1486
1563
  name: "slug-invalid-characters",
1487
1564
  severity: "error",
@@ -1489,8 +1566,10 @@ var slugInvalidCharacters = {
1489
1566
  fixStrategy: 'Use lowercase alphanumeric characters with hyphens only (e.g., "my-blog-post")',
1490
1567
  run: (item) => {
1491
1568
  if (!item.slug) return [];
1569
+ const isUrl = item.contentSource === "url";
1570
+ const pattern = isUrl ? URL_PATH_PATTERN : SLUG_PATTERN;
1492
1571
  const hasUppercase = /[A-Z]/.test(item.slug);
1493
- const matchesPattern = SLUG_PATTERN.test(item.slug);
1572
+ const matchesPattern = pattern.test(item.slug);
1494
1573
  if (hasUppercase || !matchesPattern) {
1495
1574
  return [{
1496
1575
  file: getDisplayPath(item),
@@ -1498,7 +1577,7 @@ var slugInvalidCharacters = {
1498
1577
  rule: "slug-invalid-characters",
1499
1578
  severity: "error",
1500
1579
  message: `Slug "${item.slug}" contains invalid characters`,
1501
- suggestion: 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1580
+ suggestion: isUrl ? "URL paths must be lowercase alphanumeric with hyphens and slashes only" : 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1502
1581
  }];
1503
1582
  }
1504
1583
  return [];
@@ -1763,8 +1842,8 @@ var WEAK_LEAD_STARTS = [
1763
1842
  "schauen wir uns"
1764
1843
  ];
1765
1844
  var TABLE_SEPARATOR_PATTERN = /\|\s*:?-{2,}/;
1766
- function countQuestionHeadings(body) {
1767
- const headings = extractHeadings(body);
1845
+ function countQuestionHeadings(body, contentSource) {
1846
+ const headings = extractHeadings(body, contentSource);
1768
1847
  let count = 0;
1769
1848
  for (const heading of headings) {
1770
1849
  const text = heading.text.trim();
@@ -1826,12 +1905,20 @@ function countStatistics(body) {
1826
1905
  }
1827
1906
  return matches.size;
1828
1907
  }
1829
- function hasFAQSection(body) {
1908
+ function hasFAQSection(body, contentSource) {
1830
1909
  const faqPattern = /#{2,3}\s*(FAQ|Häufige Fragen|Frequently Asked|Fragen und Antworten)/i;
1831
- return faqPattern.test(body);
1910
+ if (faqPattern.test(body)) return true;
1911
+ if (contentSource === "url") {
1912
+ return detectPlaintextFaq(body).hasFaq;
1913
+ }
1914
+ return false;
1832
1915
  }
1833
- function hasMarkdownTable(body) {
1834
- return TABLE_SEPARATOR_PATTERN.test(body);
1916
+ function hasMarkdownTable(body, contentSource) {
1917
+ if (TABLE_SEPARATOR_PATTERN.test(body)) return true;
1918
+ if (contentSource === "url") {
1919
+ return detectPlaintextTable(body);
1920
+ }
1921
+ return false;
1835
1922
  }
1836
1923
  function countEntityMentions(body, entity) {
1837
1924
  const escapedEntity = entity.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
@@ -1993,7 +2080,7 @@ function getParagraphs(body) {
1993
2080
  }
1994
2081
  return paragraphs;
1995
2082
  }
1996
- function hasMarkdownList(body) {
2083
+ function hasMarkdownList(body, contentSource) {
1997
2084
  const lines = body.split("\n");
1998
2085
  let inCodeBlock = false;
1999
2086
  for (const line of lines) {
@@ -2006,6 +2093,9 @@ function hasMarkdownList(body) {
2006
2093
  if (/^[-*]\s+/.test(trimmed)) return true;
2007
2094
  if (/^\d+\.\s+/.test(trimmed)) return true;
2008
2095
  }
2096
+ if (contentSource === "url") {
2097
+ return detectPlaintextList(body);
2098
+ }
2009
2099
  return false;
2010
2100
  }
2011
2101
  function countInternalLinks(body) {
@@ -13737,8 +13827,27 @@ function jaccardSimilarity(a, b) {
13737
13827
  const union = a.size + b.size - intersection;
13738
13828
  return union > 0 ? intersection / union : 0;
13739
13829
  }
13830
/**
 * Citation phrases that stripReferenceBoilerplate blanks out: archive and
 * retrieval notes, "cite ..." template names, caret footnote markers, and
 * ISBN / DOI / PMID prefixes. Each pattern is global so every occurrence is removed.
 */
var REFERENCE_PATTERNS = [
  /archived from the original on/gi,
  /retrieved (?:on )?\d/gi,
  /accessed (?:on )?\d/gi,
  /cite (?:web|book|journal|news)/gi,
  /\^\s*\[?\d+\]?/g,
  /isbn \d/gi,
  /doi:\s*\d/gi,
  /pmid:\s*\d/gi
];
/**
 * Removes citation boilerplate from text.
 *
 * First deletes every match of REFERENCE_PATTERNS, then cuts the text at the
 * first line that consists solely of a reference-section title ("references",
 * "sources", "bibliography", "einzelnachweise" or "weblinks", case-insensitive),
 * dropping that line and everything after it.
 *
 * @param text Text to clean.
 * @returns The text with citation phrases and any trailing reference section removed.
 */
function stripReferenceBoilerplate(text) {
  const withoutCitations = REFERENCE_PATTERNS.reduce(
    (acc, pattern) => acc.replace(pattern, ""),
    text
  );
  return withoutCitations.replace(
    /\n(?:references|sources|bibliography|einzelnachweise|weblinks)\n[\s\S]*$/i,
    ""
  );
}
13740
13848
  function analyzeRepetition(body) {
13741
- const plain = stripMarkdown(body).toLowerCase();
13849
+ const cleaned = stripReferenceBoilerplate(body);
13850
+ const plain = stripMarkdown(cleaned).toLowerCase();
13742
13851
  const words = plain.replace(/[^\p{L}\p{N}\s]/gu, " ").split(/\s+/).filter((w) => w.length > 0);
13743
13852
  const fiveGrams = extractNgrams(words, 5);
13744
13853
  const phraseCounts = /* @__PURE__ */ new Map();
@@ -13747,7 +13856,7 @@ function analyzeRepetition(body) {
13747
13856
  }
13748
13857
  const repeatedPhrases = [...phraseCounts.entries()].filter(([, count]) => count >= 3).sort((a, b) => b[1] - a[1]);
13749
13858
  const topRepeatedPhrases = repeatedPhrases.slice(0, 5).map(([phrase, count]) => ({ phrase, count }));
13750
- const paragraphs = body.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13859
+ const paragraphs = cleaned.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13751
13860
  let totalSimilarity = 0;
13752
13861
  let pairCount = 0;
13753
13862
  for (let i = 0; i < paragraphs.length; i++) {
@@ -14053,10 +14162,10 @@ var geoNoQuestionHeadings = {
14053
14162
  if (!geoTypes.includes(item.contentType)) return [];
14054
14163
  const wordCount = countWords(item.body);
14055
14164
  if (wordCount < GEO_MIN_WORDS) return [];
14056
- const headings = extractHeadings(item.body);
14165
+ const headings = extractHeadings(item.body, item.contentSource);
14057
14166
  const subHeadings = headings.filter((h) => h.level === 2 || h.level === 3);
14058
14167
  if (subHeadings.length === 0) return [];
14059
- const questionCount = countQuestionHeadings(item.body);
14168
+ const questionCount = countQuestionHeadings(item.body, item.contentSource);
14060
14169
  const ratio = questionCount / subHeadings.length;
14061
14170
  if (ratio < QUESTION_HEADING_THRESHOLD) {
14062
14171
  return [{
@@ -14146,7 +14255,7 @@ var geoMissingFaqSection = {
14146
14255
  if (!geoTypes.includes(item.contentType)) return [];
14147
14256
  const wordCount = countWords(item.body);
14148
14257
  if (wordCount < FAQ_MIN_WORDS) return [];
14149
- if (!hasFAQSection(item.body)) {
14258
+ if (!hasFAQSection(item.body, item.contentSource)) {
14150
14259
  return [{
14151
14260
  file: getDisplayPath(item),
14152
14261
  field: "body",
@@ -14191,7 +14300,7 @@ var geoMissingTable = {
14191
14300
  if (!geoTypes.includes(item.contentType)) return [];
14192
14301
  const wordCount = countWords(item.body);
14193
14302
  if (wordCount < TABLE_MIN_WORDS) return [];
14194
- if (!hasMarkdownTable(item.body)) {
14303
+ if (!hasMarkdownTable(item.body, item.contentSource)) {
14195
14304
  return [{
14196
14305
  file: getDisplayPath(item),
14197
14306
  field: "body",
@@ -14839,7 +14948,7 @@ var geoMissingLists = {
14839
14948
  if (!geoTypes.includes(item.contentType)) return [];
14840
14949
  const wordCount = countWords(item.body);
14841
14950
  if (wordCount < STRUCTURE_MIN_WORDS) return [];
14842
- if (!hasMarkdownList(item.body)) {
14951
+ if (!hasMarkdownList(item.body, item.contentSource)) {
14843
14952
  return [{
14844
14953
  file: getDisplayPath(item),
14845
14954
  field: "body",