@ijonis/geo-lint 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -856,6 +856,68 @@ var duplicateRules = [
856
856
  duplicateDescription
857
857
  ];
858
858
 
859
+ // src/utils/plaintext-structure.ts
860
+ var MAX_HEADING_LENGTH = 80;
861
+ var MIN_TABLE_ROWS = 2;
862
/**
 * Heuristically finds heading-like lines in plain (non-Markdown) text.
 *
 * A line is a candidate when it is non-empty, at most MAX_HEADING_LENGTH
 * characters after trimming, followed by a blank line (or the end of the
 * text), and does not end in `.`, `,`, `;` or `:`. A candidate becomes a
 * heading when it starts with an uppercase letter and has at most 12 words,
 * is written entirely in capitals, or ends with a question mark.
 *
 * @param {string} text - Plain text to scan.
 * @returns {{level: number, text: string, line: number}[]} Detected headings
 *   with a 1-based line number; level is 2 for all-caps or short (<= 4 words)
 *   lines and 3 otherwise.
 */
function detectPlaintextHeadings(text) {
  const rows = text.split("\n");
  const found = [];
  rows.forEach((raw, idx) => {
    const candidate = raw.trim();
    if (candidate === "" || candidate.length > MAX_HEADING_LENGTH) return;

    // A heading must be visually separated from what follows.
    const following = idx + 1 < rows.length ? rows[idx + 1].trim() : "";
    if (following !== "") return;

    // Trailing sentence punctuation suggests running prose, not a heading.
    if (/[.,;:]$/.test(candidate)) return;

    const wordCount = candidate.split(/\s+/).length;
    const startsUppercase = /^[A-ZÄÖÜ]/.test(candidate) && wordCount <= 12;
    const allCaps =
      candidate === candidate.toUpperCase() &&
      /[A-ZÄÖÜ]/.test(candidate) &&
      candidate.length > 2;
    const endsWithQuestion = candidate.endsWith("?");
    if (!startsUppercase && !allCaps && !endsWithQuestion) return;

    let level = 3;
    if (allCaps || wordCount <= 4) {
      level = 2;
    }
    found.push({ level, text: candidate, line: idx + 1 });
  });
  return found;
}
882
/**
 * Heuristically decides whether plain text contains tabular data.
 *
 * Two layouts are recognized:
 *  1. Tab-separated rows: at least MIN_TABLE_ROWS non-empty lines contain a
 *     tab and every such line splits into the same number of columns.
 *  2. Space-aligned rows: at least MIN_TABLE_ROWS + 1 non-empty lines contain
 *     a run of three or more spaces between non-space characters.
 *
 * @param {string} text - Plain text to scan.
 * @returns {boolean} True when either layout is detected.
 */
function detectPlaintextTable(text) {
  const rows = text.split("\n").filter((row) => row.trim().length > 0);

  // The separator is spelled as an explicit "\t" escape. Matching on a plain
  // space would classify any two prose lines with equal word counts as a table.
  const tabRows = rows.filter((row) => row.includes("\t"));
  if (tabRows.length >= MIN_TABLE_ROWS) {
    const columnCounts = tabRows.map((row) => row.split("\t").length);
    const expected = columnCounts[0];
    // Every tab row has >= 2 columns by construction, so only consistency
    // across rows needs checking.
    if (columnCounts.every((count) => count === expected)) return true;
  }

  // Fallback: columns aligned with runs of spaces.
  const alignedRows = rows.filter((row) => /\S {3,}\S/.test(row));
  return alignedRows.length >= MIN_TABLE_ROWS + 1;
}
898
/**
 * Heuristically decides whether plain text contains a list.
 *
 * A line counts as a list item when, after optional leading whitespace, it
 * starts with a bullet glyph (•, ·, – or —), a single word character followed
 * by `)`, or a number followed by `)`, each followed by whitespace.
 *
 * @param {string} text - Plain text to scan.
 * @returns {boolean} True when at least two lines look like list items.
 */
function detectPlaintextList(text) {
  const listPattern = /^[\s]*[•·–—]\s+|^[\s]*\w\)\s+|^[\s]*\d+\)\s+/m;
  let itemCount = 0;
  for (const row of text.split("\n")) {
    if (listPattern.test(row)) {
      itemCount += 1;
    }
  }
  return itemCount >= 2;
}
903
/**
 * Heuristically detects FAQ-style question/answer pairs in plain text.
 *
 * A line counts as a question when, after trimming, it ends with `?`, is at
 * most MAX_HEADING_LENGTH characters long, and the next non-blank line is
 * longer than the question itself (taken as its answer).
 *
 * @param {string} text - Plain text to scan.
 * @returns {{hasFaq: boolean, questionCount: number}} The number of matched
 *   questions; hasFaq is true when there are at least two.
 */
function detectPlaintextFaq(text) {
  const rows = text.split("\n");
  const questionCount = rows.reduce((total, raw, idx) => {
    const question = raw.trim();
    const looksLikeQuestion =
      question.endsWith("?") && question.length <= MAX_HEADING_LENGTH;
    if (!looksLikeQuestion) return total;

    // The "answer" is the first non-blank line after the question.
    const answer = rows.slice(idx + 1).find((row) => row.trim().length > 0);
    if (!answer) return total;
    return answer.trim().length > question.length ? total + 1 : total;
  }, 0);
  return { hasFaq: questionCount >= 2, questionCount };
}
920
+
859
921
  // src/utils/heading-extractor.ts
860
922
  function isInCodeBlock(lines, lineIndex) {
861
923
  let inCodeBlock = false;
@@ -867,7 +929,7 @@ function isInCodeBlock(lines, lineIndex) {
867
929
  }
868
930
  return inCodeBlock;
869
931
  }
870
- function extractHeadings(mdxBody) {
932
+ function extractHeadings(mdxBody, contentSource) {
871
933
  const headings = [];
872
934
  const lines = mdxBody.split("\n");
873
935
  const headingRegex = /^(#{1,6})\s+(.+)$/;
@@ -884,6 +946,9 @@ function extractHeadings(mdxBody) {
884
946
  });
885
947
  }
886
948
  }
949
+ if (headings.length === 0 && contentSource === "url") {
950
+ return detectPlaintextHeadings(mdxBody);
951
+ }
887
952
  return headings;
888
953
  }
889
954
  function countH1s(headings) {
@@ -915,6 +980,9 @@ var missingH1 = {
915
980
  category: "seo",
916
981
  fixStrategy: "Add an H1 heading (# Heading) at the start of the content",
917
982
  run: (item) => {
983
+ if (item.contentSource === "url") {
984
+ return [];
985
+ }
918
986
  if (item.contentType === "blog") {
919
987
  return [];
920
988
  }
@@ -1201,8 +1269,16 @@ function countWords(text) {
1201
1269
  }
1202
1270
  function countSentences(text) {
1203
1271
  const stripped = stripMarkdown(text);
1204
- const sentences = stripped.match(/[.!?]+(?:\s|$)/g);
1205
- return sentences ? sentences.length : 0;
1272
+ const sentenceEndings = stripped.match(/[.!?]+(?:\s|$|(?=[A-ZÄÖÜ]))/g);
1273
+ if (sentenceEndings && sentenceEndings.length > 0) {
1274
+ return sentenceEndings.length;
1275
+ }
1276
+ const lines = stripped.split(/\n+/).filter((l) => l.trim().length > 20);
1277
+ if (lines.length > 1) {
1278
+ return lines.length;
1279
+ }
1280
+ const hasWords = /\w{2,}/.test(stripped);
1281
+ return hasWords ? 1 : 0;
1206
1282
  }
1207
1283
 
1208
1284
  // src/utils/readability.ts
@@ -1484,6 +1560,7 @@ var robotsRules = [
1484
1560
  // src/rules/slug-rules.ts
1485
1561
  var SLUG_DEFAULTS = { maxLength: 75 };
1486
1562
  var SLUG_PATTERN = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
1563
+ var URL_PATH_PATTERN = /^[a-z0-9]+(?:[-/][a-z0-9]+)*$/;
1487
1564
  var slugInvalidCharacters = {
1488
1565
  name: "slug-invalid-characters",
1489
1566
  severity: "error",
@@ -1491,8 +1568,10 @@ var slugInvalidCharacters = {
1491
1568
  fixStrategy: 'Use lowercase alphanumeric characters with hyphens only (e.g., "my-blog-post")',
1492
1569
  run: (item) => {
1493
1570
  if (!item.slug) return [];
1571
+ const isUrl = item.contentSource === "url";
1572
+ const pattern = isUrl ? URL_PATH_PATTERN : SLUG_PATTERN;
1494
1573
  const hasUppercase = /[A-Z]/.test(item.slug);
1495
- const matchesPattern = SLUG_PATTERN.test(item.slug);
1574
+ const matchesPattern = pattern.test(item.slug);
1496
1575
  if (hasUppercase || !matchesPattern) {
1497
1576
  return [{
1498
1577
  file: getDisplayPath(item),
@@ -1500,7 +1579,7 @@ var slugInvalidCharacters = {
1500
1579
  rule: "slug-invalid-characters",
1501
1580
  severity: "error",
1502
1581
  message: `Slug "${item.slug}" contains invalid characters`,
1503
- suggestion: 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1582
+ suggestion: isUrl ? "URL paths must be lowercase alphanumeric with hyphens and slashes only" : 'Slugs must be lowercase alphanumeric with hyphens only (e.g., "my-blog-post")'
1504
1583
  }];
1505
1584
  }
1506
1585
  return [];
@@ -1765,8 +1844,8 @@ var WEAK_LEAD_STARTS = [
1765
1844
  "schauen wir uns"
1766
1845
  ];
1767
1846
  var TABLE_SEPARATOR_PATTERN = /\|\s*:?-{2,}/;
1768
- function countQuestionHeadings(body) {
1769
- const headings = extractHeadings(body);
1847
+ function countQuestionHeadings(body, contentSource) {
1848
+ const headings = extractHeadings(body, contentSource);
1770
1849
  let count = 0;
1771
1850
  for (const heading of headings) {
1772
1851
  const text = heading.text.trim();
@@ -1828,12 +1907,20 @@ function countStatistics(body) {
1828
1907
  }
1829
1908
  return matches.size;
1830
1909
  }
1831
/**
 * Reports whether the body contains an FAQ section.
 *
 * A level 2-3 Markdown heading naming an FAQ (English or German) always
 * counts. For content whose source is "url", the plaintext FAQ detector is
 * consulted as a fallback.
 *
 * @param {string} body - Content body to inspect.
 * @param {string} [contentSource] - Origin of the content; only "url"
 *   enables the plaintext fallback.
 * @returns {boolean} True when an FAQ section is detected.
 */
function hasFAQSection(body, contentSource) {
  const faqPattern = /#{2,3}\s*(FAQ|Häufige Fragen|Frequently Asked|Fragen und Antworten)/i;
  return (
    faqPattern.test(body) ||
    (contentSource === "url" && detectPlaintextFaq(body).hasFaq)
  );
}
1835
/**
 * Reports whether the body contains a table.
 *
 * A Markdown table separator row (matched by TABLE_SEPARATOR_PATTERN) always
 * counts. For content whose source is "url", the plaintext table detector is
 * consulted as a fallback.
 *
 * @param {string} body - Content body to inspect.
 * @param {string} [contentSource] - Origin of the content; only "url"
 *   enables the plaintext fallback.
 * @returns {boolean} True when a table is detected.
 */
function hasMarkdownTable(body, contentSource) {
  const hasPipeTable = TABLE_SEPARATOR_PATTERN.test(body);
  return hasPipeTable || (contentSource === "url" && detectPlaintextTable(body));
}
1838
1925
  function countEntityMentions(body, entity) {
1839
1926
  const escapedEntity = entity.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
@@ -1995,7 +2082,7 @@ function getParagraphs(body) {
1995
2082
  }
1996
2083
  return paragraphs;
1997
2084
  }
1998
- function hasMarkdownList(body) {
2085
+ function hasMarkdownList(body, contentSource) {
1999
2086
  const lines = body.split("\n");
2000
2087
  let inCodeBlock = false;
2001
2088
  for (const line of lines) {
@@ -2008,6 +2095,9 @@ function hasMarkdownList(body) {
2008
2095
  if (/^[-*]\s+/.test(trimmed)) return true;
2009
2096
  if (/^\d+\.\s+/.test(trimmed)) return true;
2010
2097
  }
2098
+ if (contentSource === "url") {
2099
+ return detectPlaintextList(body);
2100
+ }
2011
2101
  return false;
2012
2102
  }
2013
2103
  function countInternalLinks(body) {
@@ -13739,8 +13829,27 @@ function jaccardSimilarity(a, b) {
13739
13829
  const union = a.size + b.size - intersection;
13740
13830
  return union > 0 ? intersection / union : 0;
13741
13831
  }
13832
// Citation phrases that recur in reference lists. Each match is deleted so
// the fragments do not register as repeated phrases. Only the matched
// fragment is removed, not the whole citation line.
var REFERENCE_PATTERNS = [
  /archived from the original on/gi,
  /retrieved (?:on )?\d/gi,
  /accessed (?:on )?\d/gi,
  /cite (?:web|book|journal|news)/gi,
  /\^\s*\[?\d+\]?/g,
  /isbn \d/gi,
  /doi:\s*\d/gi,
  /pmid:\s*\d/gi
];

/**
 * Removes citation boilerplate from a text.
 *
 * First deletes every match of REFERENCE_PATTERNS, then cuts everything from
 * the first line consisting solely of a reference-section title (references,
 * sources, bibliography, einzelnachweise, weblinks; case-insensitive) to the
 * end of the text.
 *
 * @param {string} text - Raw content body.
 * @returns {string} The text without citation fragments or trailing
 *   reference section.
 */
function stripReferenceBoilerplate(text) {
  let result = text;
  for (const pattern of REFERENCE_PATTERNS) {
    result = result.replace(pattern, "");
  }
  // Tolerate CRLF line endings and stray horizontal whitespace around the
  // section title; a bare "\n" match would miss such headings and leave the
  // whole reference list in place.
  const trailingSection =
    /\r?\n[ \t]*(?:references|sources|bibliography|einzelnachweise|weblinks)[ \t]*\r?\n[\s\S]*$/i;
  return result.replace(trailingSection, "");
}
13742
13850
  function analyzeRepetition(body) {
13743
- const plain = stripMarkdown(body).toLowerCase();
13851
+ const cleaned = stripReferenceBoilerplate(body);
13852
+ const plain = stripMarkdown(cleaned).toLowerCase();
13744
13853
  const words = plain.replace(/[^\p{L}\p{N}\s]/gu, " ").split(/\s+/).filter((w) => w.length > 0);
13745
13854
  const fiveGrams = extractNgrams(words, 5);
13746
13855
  const phraseCounts = /* @__PURE__ */ new Map();
@@ -13749,7 +13858,7 @@ function analyzeRepetition(body) {
13749
13858
  }
13750
13859
  const repeatedPhrases = [...phraseCounts.entries()].filter(([, count]) => count >= 3).sort((a, b) => b[1] - a[1]);
13751
13860
  const topRepeatedPhrases = repeatedPhrases.slice(0, 5).map(([phrase, count]) => ({ phrase, count }));
13752
- const paragraphs = body.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13861
+ const paragraphs = cleaned.split(/\n\s*\n/).map((p) => p.trim()).filter((p) => p.length > 0 && !p.startsWith("#") && !p.startsWith("|"));
13753
13862
  let totalSimilarity = 0;
13754
13863
  let pairCount = 0;
13755
13864
  for (let i = 0; i < paragraphs.length; i++) {
@@ -14055,10 +14164,10 @@ var geoNoQuestionHeadings = {
14055
14164
  if (!geoTypes.includes(item.contentType)) return [];
14056
14165
  const wordCount = countWords(item.body);
14057
14166
  if (wordCount < GEO_MIN_WORDS) return [];
14058
- const headings = extractHeadings(item.body);
14167
+ const headings = extractHeadings(item.body, item.contentSource);
14059
14168
  const subHeadings = headings.filter((h) => h.level === 2 || h.level === 3);
14060
14169
  if (subHeadings.length === 0) return [];
14061
- const questionCount = countQuestionHeadings(item.body);
14170
+ const questionCount = countQuestionHeadings(item.body, item.contentSource);
14062
14171
  const ratio = questionCount / subHeadings.length;
14063
14172
  if (ratio < QUESTION_HEADING_THRESHOLD) {
14064
14173
  return [{
@@ -14148,7 +14257,7 @@ var geoMissingFaqSection = {
14148
14257
  if (!geoTypes.includes(item.contentType)) return [];
14149
14258
  const wordCount = countWords(item.body);
14150
14259
  if (wordCount < FAQ_MIN_WORDS) return [];
14151
- if (!hasFAQSection(item.body)) {
14260
+ if (!hasFAQSection(item.body, item.contentSource)) {
14152
14261
  return [{
14153
14262
  file: getDisplayPath(item),
14154
14263
  field: "body",
@@ -14193,7 +14302,7 @@ var geoMissingTable = {
14193
14302
  if (!geoTypes.includes(item.contentType)) return [];
14194
14303
  const wordCount = countWords(item.body);
14195
14304
  if (wordCount < TABLE_MIN_WORDS) return [];
14196
- if (!hasMarkdownTable(item.body)) {
14305
+ if (!hasMarkdownTable(item.body, item.contentSource)) {
14197
14306
  return [{
14198
14307
  file: getDisplayPath(item),
14199
14308
  field: "body",
@@ -14841,7 +14950,7 @@ var geoMissingLists = {
14841
14950
  if (!geoTypes.includes(item.contentType)) return [];
14842
14951
  const wordCount = countWords(item.body);
14843
14952
  if (wordCount < STRUCTURE_MIN_WORDS) return [];
14844
- if (!hasMarkdownList(item.body)) {
14953
+ if (!hasMarkdownList(item.body, item.contentSource)) {
14845
14954
  return [{
14846
14955
  file: getDisplayPath(item),
14847
14956
  field: "body",