@ijonis/geo-lint 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -898,6 +898,68 @@ var duplicateRules = [
898
898
  duplicateDescription
899
899
  ];
900
900
 
901
+ // src/utils/plaintext-structure.ts
902
// Longest trimmed line (in characters) still considered a heading or FAQ question.
var MAX_HEADING_LENGTH = 80;
// Minimum number of tab-delimited rows before plain text is treated as a table.
var MIN_TABLE_ROWS = 2;

/**
 * Heuristically finds heading-like lines in plain text that has no markdown
 * heading markers.
 *
 * A line qualifies when it is non-empty, at most MAX_HEADING_LENGTH characters
 * long, followed by a blank line (or is the last line), does not end in
 * `.`, `,`, `;` or `:`, and is either capitalised with at most 12 words,
 * written entirely in upper case, or ends with a question mark.
 *
 * @param {string} text - Plain-text body to scan.
 * @returns {{ level: number, text: string, line: number }[]} Detected headings
 *   with a guessed level (2 or 3) and a 1-based line number.
 */
function detectPlaintextHeadings(text) {
  const rows = text.split("\n");
  const found = [];
  rows.forEach((raw, idx) => {
    const candidate = raw.trim();
    if (candidate === "" || candidate.length > MAX_HEADING_LENGTH) return;

    // A heading must stand alone: the next line is blank or does not exist.
    const following = rows[idx + 1];
    const standsAlone = following === undefined || following.trim() === "";
    if (!standsAlone) return;

    // Trailing sentence punctuation marks running text, not a heading.
    if (/[.,;:]$/.test(candidate)) return;

    const wordCount = candidate.split(/\s+/).length;
    const startsCapitalised = /^[A-ZÄÖÜ]/.test(candidate) && wordCount <= 12;
    const shouts =
      candidate === candidate.toUpperCase() &&
      /[A-ZÄÖÜ]/.test(candidate) &&
      candidate.length > 2;
    const asksQuestion = candidate.endsWith("?");
    if (!(startsCapitalised || shouts || asksQuestion)) return;

    // All-caps or very short lines are ranked as level 2, everything else as level 3.
    const level = shouts || wordCount <= 4 ? 2 : 3;
    found.push({ level, text: candidate, line: idx + 1 });
  });
  return found;
}
924
/**
 * Heuristically decides whether plain text contains tabular data.
 *
 * Two signals are checked, in order:
 *   1. At least MIN_TABLE_ROWS non-blank lines contain a tab character and all
 *      of them split into the same number of tab-delimited columns.
 *   2. At least MIN_TABLE_ROWS + 1 non-blank lines contain a run of three or
 *      more spaces between two non-space characters (column-aligned text).
 *
 * The delimiter is written as an explicit `\t` escape so that it cannot be
 * confused with (or silently normalised into) an ordinary space, which would
 * make every multi-word line look like a table row.
 *
 * @param {string} text - Plain-text body to scan.
 * @returns {boolean} True when either table signal is present.
 */
function detectPlaintextTable(text) {
  const TAB = "\t";
  const contentLines = text.split("\n").filter((l) => l.trim().length > 0);

  const tabLines = contentLines.filter((l) => l.includes(TAB));
  if (tabLines.length >= MIN_TABLE_ROWS) {
    const colCounts = tabLines.map((l) => l.split(TAB).length);
    // Rows of a real table share one column count; ragged tabs are ignored.
    const consistent = colCounts.every((c) => c === colCounts[0] && c >= 2);
    if (consistent) return true;
  }

  // Fallback: columns aligned with runs of spaces instead of tabs.
  const spaceSeparated = contentLines.filter((l) => /\S {3,}\S/.test(l));
  return spaceSeparated.length >= MIN_TABLE_ROWS + 1;
}
940
/**
 * Heuristically decides whether plain text contains a list.
 *
 * A line counts as a list item when, after optional leading whitespace, it
 * starts with a bullet-like glyph (•, ·, – or —), a single word character
 * followed by `)`, or a number followed by `)`, each followed by whitespace.
 *
 * @param {string} text - Plain-text body to scan.
 * @returns {boolean} True when at least two lines look like list items.
 */
function detectPlaintextList(text) {
  const listPattern = /^[\s]*[•·–—]\s+|^[\s]*\w\)\s+|^[\s]*\d+\)\s+/m;
  let markerLines = 0;
  for (const row of text.split("\n")) {
    if (listPattern.test(row)) {
      markerLines += 1;
    }
  }
  return markerLines >= 2;
}
945
/**
 * Heuristically detects a question/answer (FAQ) structure in plain text.
 *
 * A line counts as an answered question when its trimmed form ends with `?`,
 * is at most MAX_HEADING_LENGTH characters long, and the next non-blank line
 * (trimmed) is longer than the question itself.
 *
 * The text is walked from the last line to the first while remembering the
 * closest non-blank line below the current one, so each line is visited once
 * and no per-question copy of the remaining lines is made.
 *
 * @param {string} text - Plain-text body to scan.
 * @returns {{ hasFaq: boolean, questionCount: number }} `hasFaq` is true when
 *   at least two answered questions were found.
 */
function detectPlaintextFaq(text) {
  const lines = text.split("\n");
  let questionCount = 0;
  // Trimmed content of the nearest non-blank line after the current index.
  let following = null;
  for (let i = lines.length - 1; i >= 0; i--) {
    const line = lines[i].trim();
    const isQuestion = line.endsWith("?") && line.length <= MAX_HEADING_LENGTH;
    if (isQuestion && following !== null && following.length > line.length) {
      questionCount++;
    }
    // Every non-blank line, question or not, becomes the "next content" for
    // the lines above it.
    if (line.length > 0) {
      following = line;
    }
  }
  return {
    hasFaq: questionCount >= 2,
    questionCount
  };
}
962
+
901
963
  // src/utils/heading-extractor.ts
902
964
  function isInCodeBlock(lines, lineIndex) {
903
965
  let inCodeBlock = false;
@@ -909,7 +971,7 @@ function isInCodeBlock(lines, lineIndex) {
909
971
  }
910
972
  return inCodeBlock;
911
973
  }
912
- function extractHeadings(mdxBody) {
974
+ function extractHeadings(mdxBody, contentSource) {
913
975
  const headings = [];
914
976
  const lines = mdxBody.split("\n");
915
977
  const headingRegex = /^(#{1,6})\s+(.+)$/;
@@ -926,6 +988,9 @@ function extractHeadings(mdxBody) {
926
988
  });
927
989
  }
928
990
  }
991
+ if (headings.length === 0 && contentSource === "url") {
992
+ return detectPlaintextHeadings(mdxBody);
993
+ }
929
994
  return headings;
930
995
  }
931
996
  function countH1s(headings) {
@@ -957,6 +1022,9 @@ var missingH1 = {
957
1022
  category: "seo",
958
1023
  fixStrategy: "Add an H1 heading (# Heading) at the start of the content",
959
1024
  run: (item) => {
1025
+ if (item.contentSource === "url") {
1026
+ return [];
1027
+ }
960
1028
  if (item.contentType === "blog") {
961
1029
  return [];
962
1030
  }
@@ -1243,8 +1311,16 @@ function countWords(text) {
1243
1311
  }
1244
1312
  function countSentences(text) {
1245
1313
  const stripped = stripMarkdown(text);
1246
- const sentences = stripped.match(/[.!?]+(?:\s|$)/g);
1247
- return sentences ? sentences.length : 0;
1314
+ const sentenceEndings = stripped.match(/[.!?]+(?:\s|$|(?=[A-ZÄÖÜ]))/g);
1315
+ if (sentenceEndings && sentenceEndings.length > 0) {
1316
+ return sentenceEndings.length;
1317
+ }
1318
+ const lines = stripped.split(/\n+/).filter((l) => l.trim().length > 20);
1319
+ if (lines.length > 1) {
1320
+ return lines.length;
1321
+ }
1322
+ const hasWords = /\w{2,}/.test(stripped);
1323
+ return hasWords ? 1 : 0;
1248
1324
  }
1249
1325
 
1250
1326
  // src/utils/readability.ts
@@ -1526,6 +1602,7 @@ var robotsRules = [
1526
1602
  // src/rules/slug-rules.ts
1527
1603
  var SLUG_DEFAULTS = { maxLength: 75 };
1528
1604
  var SLUG_PATTERN = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
1605
+ var URL_PATH_PATTERN = /^[a-z0-9]+(?:[-/][a-z0-9]+)*$/;
1529
1606
  var slugInvalidCharacters = {
1530
1607
  name: "slug-invalid-characters",
1531
1608
  severity: "error",
@@ -1533,8 +1610,10 @@ var slugInvalidCharacters = {
1533
1610
  fixStrategy: 'Use lowercase alphanumeric characters with hyphens only (e.g., "my-blog-post")',
1534
1611
  run: (item) => {
1535
1612
  if (!item.slug) return [];
1613
+ const isUrl = item.contentSource === "url";
1614
+ const pattern = isUrl ? URL_PATH_PATTERN : SLUG_PATTERN;
1536
1615
  const hasUppercase = /[A-Z]/.test(item.slug);
1537
- const matchesPattern = SLUG_PATTERN.test(item.slug);
1616
+ const matchesPattern = pattern.test(item.slug);
1538
1617
  if (hasUppercase || !matchesPattern) {
1539
1618
  return [{
1540
1619
  file: getDisplayPath(item),
@@ -1542,7 +1621,7 @@ var slugInvalidCharacters = {
1542
1621
  rule: "slug-invalid-characters",
1543
1622
  severity: "error",
1544
1623
  message: `Slug "${item.slug}" contains invalid characters`,
1545
- suggestion: 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1624
+ suggestion: isUrl ? "URL paths must be lowercase alphanumeric with hyphens and slashes only" : 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1546
1625
  }];
1547
1626
  }
1548
1627
  return [];
@@ -1807,8 +1886,8 @@ var WEAK_LEAD_STARTS = [
1807
1886
  "schauen wir uns"
1808
1887
  ];
1809
1888
  var TABLE_SEPARATOR_PATTERN = /\|\s*:?-{2,}/;
1810
- function countQuestionHeadings(body) {
1811
- const headings = extractHeadings(body);
1889
+ function countQuestionHeadings(body, contentSource) {
1890
+ const headings = extractHeadings(body, contentSource);
1812
1891
  let count = 0;
1813
1892
  for (const heading of headings) {
1814
1893
  const text = heading.text.trim();
@@ -1870,12 +1949,20 @@ function countStatistics(body) {
1870
1949
  }
1871
1950
  return matches.size;
1872
1951
  }
1873
/**
 * Reports whether the body contains an FAQ section.
 *
 * A markdown level-2/3 heading whose text starts with one of the known FAQ
 * titles always counts. For content whose source is "url", the plain-text
 * question/answer heuristic is consulted as a fallback.
 *
 * @param {string} body - Content body to inspect.
 * @param {string} [contentSource] - Origin of the content; only "url" enables
 *   the plain-text fallback.
 * @returns {boolean} True when an FAQ section is detected.
 */
function hasFAQSection(body, contentSource) {
  const faqHeading = /#{2,3}\s*(FAQ|Häufige Fragen|Frequently Asked|Fragen und Antworten)/i;
  const hasFaqHeading = faqHeading.test(body);
  return hasFaqHeading || (contentSource === "url" && detectPlaintextFaq(body).hasFaq);
}
1877
/**
 * Reports whether the body contains a table.
 *
 * A markdown table separator row (matched by TABLE_SEPARATOR_PATTERN) always
 * counts. For content whose source is "url", the plain-text table heuristic is
 * consulted as a fallback.
 *
 * @param {string} body - Content body to inspect.
 * @param {string} [contentSource] - Origin of the content; only "url" enables
 *   the plain-text fallback.
 * @returns {boolean} True when a table is detected.
 */
function hasMarkdownTable(body, contentSource) {
  const hasPipeTable = TABLE_SEPARATOR_PATTERN.test(body);
  if (!hasPipeTable && contentSource === "url") {
    return detectPlaintextTable(body);
  }
  return hasPipeTable;
}
1880
1967
  function countEntityMentions(body, entity) {
1881
1968
  const escapedEntity = entity.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
@@ -2037,7 +2124,7 @@ function getParagraphs(body) {
2037
2124
  }
2038
2125
  return paragraphs;
2039
2126
  }
2040
- function hasMarkdownList(body) {
2127
+ function hasMarkdownList(body, contentSource) {
2041
2128
  const lines = body.split("\n");
2042
2129
  let inCodeBlock = false;
2043
2130
  for (const line of lines) {
@@ -2050,6 +2137,9 @@ function hasMarkdownList(body) {
2050
2137
  if (/^[-*]\s+/.test(trimmed)) return true;
2051
2138
  if (/^\d+\.\s+/.test(trimmed)) return true;
2052
2139
  }
2140
+ if (contentSource === "url") {
2141
+ return detectPlaintextList(body);
2142
+ }
2053
2143
  return false;
2054
2144
  }
2055
2145
  function countInternalLinks(body) {
@@ -13781,8 +13871,27 @@ function jaccardSimilarity(a, b) {
13781
13871
  const union = a.size + b.size - intersection;
13782
13872
  return union > 0 ? intersection / union : 0;
13783
13873
  }
13874
// Citation/reference phrases that are removed before repetition analysis.
// Each pattern is applied globally; only the matched fragment is deleted.
var REFERENCE_PATTERNS = [
  /archived from the original on/gi,
  /retrieved (?:on )?\d/gi,
  /accessed (?:on )?\d/gi,
  /cite (?:web|book|journal|news)/gi,
  /\^\s*\[?\d+\]?/g,
  /isbn \d/gi,
  /doi:\s*\d/gi,
  /pmid:\s*\d/gi
];

/**
 * Removes citation boilerplate from a text.
 *
 * First every REFERENCE_PATTERNS match is deleted, in list order. Then, if a
 * line consisting solely of "references", "sources", "bibliography",
 * "einzelnachweise" or "weblinks" (case-insensitive) exists, that line and
 * everything after it is dropped.
 *
 * @param {string} text - Text to clean.
 * @returns {string} The text without the matched reference fragments.
 */
function stripReferenceBoilerplate(text) {
  const withoutCitations = REFERENCE_PATTERNS.reduce(
    (acc, pattern) => acc.replace(pattern, ""),
    text
  );
  return withoutCitations.replace(
    /\n(?:references|sources|bibliography|einzelnachweise|weblinks)\n[\s\S]*$/i,
    ""
  );
}
13784
13892
  function analyzeRepetition(body) {
13785
- const plain = stripMarkdown(body).toLowerCase();
13893
+ const cleaned = stripReferenceBoilerplate(body);
13894
+ const plain = stripMarkdown(cleaned).toLowerCase();
13786
13895
  const words = plain.replace(/[^\p{L}\p{N}\s]/gu, " ").split(/\s+/).filter((w) => w.length > 0);
13787
13896
  const fiveGrams = extractNgrams(words, 5);
13788
13897
  const phraseCounts = /* @__PURE__ */ new Map();
@@ -13791,7 +13900,7 @@ function analyzeRepetition(body) {
13791
13900
  }
13792
13901
  const repeatedPhrases = [...phraseCounts.entries()].filter(([, count]) => count >= 3).sort((a, b) => b[1] - a[1]);
13793
13902
  const topRepeatedPhrases = repeatedPhrases.slice(0, 5).map(([phrase, count]) => ({ phrase, count }));
13794
- const paragraphs = body.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13903
+ const paragraphs = cleaned.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13795
13904
  let totalSimilarity = 0;
13796
13905
  let pairCount = 0;
13797
13906
  for (let i = 0; i < paragraphs.length; i++) {
@@ -14097,10 +14206,10 @@ var geoNoQuestionHeadings = {
14097
14206
  if (!geoTypes.includes(item.contentType)) return [];
14098
14207
  const wordCount = countWords(item.body);
14099
14208
  if (wordCount < GEO_MIN_WORDS) return [];
14100
- const headings = extractHeadings(item.body);
14209
+ const headings = extractHeadings(item.body, item.contentSource);
14101
14210
  const subHeadings = headings.filter((h) => h.level === 2 || h.level === 3);
14102
14211
  if (subHeadings.length === 0) return [];
14103
- const questionCount = countQuestionHeadings(item.body);
14212
+ const questionCount = countQuestionHeadings(item.body, item.contentSource);
14104
14213
  const ratio = questionCount / subHeadings.length;
14105
14214
  if (ratio < QUESTION_HEADING_THRESHOLD) {
14106
14215
  return [{
@@ -14190,7 +14299,7 @@ var geoMissingFaqSection = {
14190
14299
  if (!geoTypes.includes(item.contentType)) return [];
14191
14300
  const wordCount = countWords(item.body);
14192
14301
  if (wordCount < FAQ_MIN_WORDS) return [];
14193
- if (!hasFAQSection(item.body)) {
14302
+ if (!hasFAQSection(item.body, item.contentSource)) {
14194
14303
  return [{
14195
14304
  file: getDisplayPath(item),
14196
14305
  field: "body",
@@ -14235,7 +14344,7 @@ var geoMissingTable = {
14235
14344
  if (!geoTypes.includes(item.contentType)) return [];
14236
14345
  const wordCount = countWords(item.body);
14237
14346
  if (wordCount < TABLE_MIN_WORDS) return [];
14238
- if (!hasMarkdownTable(item.body)) {
14347
+ if (!hasMarkdownTable(item.body, item.contentSource)) {
14239
14348
  return [{
14240
14349
  file: getDisplayPath(item),
14241
14350
  field: "body",
@@ -14883,7 +14992,7 @@ var geoMissingLists = {
14883
14992
  if (!geoTypes.includes(item.contentType)) return [];
14884
14993
  const wordCount = countWords(item.body);
14885
14994
  if (wordCount < STRUCTURE_MIN_WORDS) return [];
14886
- if (!hasMarkdownList(item.body)) {
14995
+ if (!hasMarkdownList(item.body, item.contentSource)) {
14887
14996
  return [{
14888
14997
  file: getDisplayPath(item),
14889
14998
  field: "body",