aiseo-audit 1.2.7 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
  [![License: MIT](https://img.shields.io/badge/License-MIT-7EB6D7.svg)](https://opensource.org/licenses/MIT)
6
6
  [![Node.js](https://img.shields.io/badge/node-%3E%3D20-7EB6D7.svg)](https://nodejs.org)
7
7
  [![TypeScript](https://img.shields.io/badge/TypeScript-5.9-7EB6D7?logo=typescript&logoColor=white)](https://www.typescriptlang.org/)
8
- [![Tests](https://img.shields.io/badge/tests-194%20passed-8FBC8F)](https://github.com/agencyenterprise/aiseo-audit)
8
+ [![Tests](https://img.shields.io/badge/tests-238%20passed-8FBC8F)](https://github.com/agencyenterprise/aiseo-audit)
9
9
  [![Coverage](https://img.shields.io/codecov/c/github/agencyenterprise/aiseo-audit?color=8FBC8F&label=coverage)](https://codecov.io/gh/agencyenterprise/aiseo-audit)
10
10
 
11
11
  <div align="center">
@@ -15,6 +15,9 @@
15
15
 
16
16
  Deterministic CLI that audits web pages for **AI search readiness**. Think Lighthouse, but for how well AI engines can fetch, extract, understand, and cite your content.
17
17
 
18
+ > [!TIP]
19
+ > Run `aiseo-audit https://www.aiseo-audit.com` to see a 100/100 [A+ Score](https://www.aiseo-audit.com/).
20
+
18
21
  **AI SEO measures how reusable your content is for generative engines, not traditional search rankings.**
19
22
 
20
23
  - [Quick Start](#quick-start)
package/dist/cli.js CHANGED
@@ -27,7 +27,7 @@ var import_commander = require("commander");
27
27
 
28
28
  // src/modules/analyzer/constants.ts
29
29
  var DOMAIN_SIGNAL_TIMEOUT_CAP = 5e3;
30
- var VERSION = true ? "1.2.7" : "0.0.0";
30
+ var VERSION = true ? "1.3.0" : "0.0.0";
31
31
 
32
32
  // src/modules/fetcher/constants.ts
33
33
  var MAX_RESPONSE_SIZE = 10 * 1024 * 1024;
@@ -366,7 +366,16 @@ function extractEntities(text) {
366
366
  0,
367
367
  15
368
368
  );
369
- return { people, organizations, places, topics };
369
+ const imperativeVerbCount = doc.verbs().isImperative().length;
370
+ const numberCount = doc.numbers().length;
371
+ return {
372
+ people,
373
+ organizations,
374
+ places,
375
+ topics,
376
+ imperativeVerbCount,
377
+ numberCount
378
+ };
370
379
  }
371
380
  function computeFleschReadingEase(text) {
372
381
  const words = text.split(/\s+/).filter((w) => w.length > 0);
@@ -435,6 +444,7 @@ function auditAnswerability(page) {
435
444
  const text = page.cleanText;
436
445
  const $ = page.$;
437
446
  const factors = [];
447
+ const { imperativeVerbCount = 0 } = extractEntities(text);
438
448
  const defCount = countPatternMatches(text, DEFINITION_PATTERNS);
439
449
  const defScore = thresholdScore(defCount, [
440
450
  [6, 10],
@@ -479,7 +489,7 @@ function auditAnswerability(page) {
479
489
  );
480
490
  const stepCount = countPatternMatches(text, STEP_PATTERNS);
481
491
  const hasOl = $("ol").length > 0;
482
- const stepTotal = stepCount + (hasOl ? 2 : 0);
492
+ const stepTotal = stepCount + imperativeVerbCount + (hasOl ? 2 : 0);
483
493
  const stepScore = thresholdScore(stepTotal, [
484
494
  [5, 10],
485
495
  [2, 7],
@@ -491,7 +501,7 @@ function auditAnswerability(page) {
491
501
  "Step-by-Step Content",
492
502
  stepScore,
493
503
  10,
494
- `${stepCount} step indicators${hasOl ? ", ordered lists found" : ""}`
504
+ `${stepCount} step indicators, ${imperativeVerbCount} instruction verbs${hasOl ? ", ordered lists found" : ""}`
495
505
  )
496
506
  );
497
507
  const questionMatches = text.match(/[^.!?]*\?/g) || [];
@@ -790,51 +800,99 @@ function auditAuthorityContext(page) {
790
800
  }
791
801
 
792
802
  // src/modules/audits/support/robots.ts
803
/**
 * Parse robots.txt text into user-agent groups.
 *
 * A group is one or more consecutive `User-agent` lines followed by the
 * `Allow` / `Disallow` rules that apply to those agents. Grouping follows
 * RFC 9309:
 *   - blank lines and `#` comments are ignored (they do not end a group);
 *   - a `User-agent` line that appears after the open group already has
 *     rules starts a NEW group instead of joining the previous one;
 *   - rules that appear before any `User-agent` line are dropped;
 *   - other fields (Sitemap, Crawl-delay, ...) are ignored.
 *
 * @param {string} robotsTxt Raw robots.txt content.
 * @returns {{agents: string[], rules: {type: "allow"|"disallow", path: string}[]}[]}
 *   Groups in file order. Agent names are lower-cased; rule paths are kept
 *   verbatim (trimmed), so an empty `Disallow:` yields `path: ""`.
 */
function parseRobotGroups(robotsTxt) {
  const groups = [];
  let current = null;
  for (const raw of robotsTxt.split("\n")) {
    // Strip trailing comments; trim() also removes the "\r" of CRLF files.
    const line = raw.split("#")[0].trim();
    if (!line) continue;
    const colonAt = line.indexOf(":");
    if (colonAt === -1) continue;
    const field = line.slice(0, colonAt).trim().toLowerCase();
    const value = line.slice(colonAt + 1).trim();
    if (field === "user-agent") {
      // Once a group has rules, the next User-agent line opens a new group.
      // Without this check, "UA a / Disallow / UA b / Allow" would merge
      // both agents into one group and mix their rules.
      if (!current || current.rules.length > 0) {
        current = { agents: [], rules: [] };
        groups.push(current);
      }
      current.agents.push(value.toLowerCase());
    } else if (field === "disallow" || field === "allow") {
      if (current) {
        current.rules.push({ type: field, path: value });
      }
    }
  }
  return groups;
}
830
/**
 * Collect the rules that apply to one crawler, split by how the group
 * matched it.
 *
 * @param {{agents: string[], rules: object[]}[]} groups Parsed robots.txt groups.
 * @param {string} crawlerLower Lower-cased crawler name to look up.
 * @returns {{specific: object[], wildcard: object[]}} `specific` holds rules
 *   from groups naming the crawler exactly; `wildcard` holds rules from
 *   groups naming `*` (only when the group does not also name the crawler).
 */
function matchingRulesForCrawler(groups, crawlerLower) {
  const buckets = { specific: [], wildcard: [] };
  groups.forEach(({ agents, rules }) => {
    // An exact name match takes precedence over a "*" entry in the same group.
    if (agents.includes(crawlerLower)) {
      buckets.specific.push(...rules);
      return;
    }
    if (agents.includes("*")) {
      buckets.wildcard.push(...rules);
    }
  });
  return buckets;
}
839
/**
 * Decide whether `path` is blocked by a rule set using longest-prefix
 * matching.
 *
 * Rules with an empty path, or whose path is not a literal prefix of `path`,
 * are ignored. Among matching rules the longest prefix wins; when an allow
 * and a disallow rule tie on length, allow wins.
 *
 * @param {{type: string, path: string}[]} rules Allow/disallow rules.
 * @param {string} path URL path to test.
 * @returns {boolean} True only when the winning rule is a disallow.
 */
function resolvesPathAsBlocked(rules, path) {
  let winner = null;
  for (const { type, path: prefix } of rules) {
    const applies = Boolean(prefix) && path.startsWith(prefix);
    if (!applies) continue;
    if (winner === null || prefix.length > winner.length) {
      winner = { length: prefix.length, blocks: type === "disallow" };
      continue;
    }
    // Equal-length tie: an allow rule overrides a disallow rule.
    if (prefix.length === winner.length && type === "allow") {
      winner.blocks = false;
    }
  }
  return winner !== null && winner.blocks;
}
854
/**
 * List the paths of disallow rules that block only part of a site.
 *
 * @param {{type: string, path: string}[]} rules Allow/disallow rules.
 * @returns {string[]} Paths of disallow rules, in order, excluding empty
 *   paths and the site-wide "/" path.
 */
function findPartialBlocks(rules) {
  const partial = [];
  for (const { type, path } of rules) {
    if (type !== "disallow") continue;
    if (!path || path === "/") continue;
    partial.push(path);
  }
  return partial;
}
793
857
  function checkCrawlerAccess(robotsTxt) {
794
858
  if (!robotsTxt)
795
859
  return { allowed: [], blocked: [], unknown: [...AI_CRAWLERS] };
796
- const lines = robotsTxt.split("\n").map((l) => l.trim());
860
+ const groups = parseRobotGroups(robotsTxt);
797
861
  const allowed = [];
798
862
  const blocked = [];
799
863
  const unknown = [];
864
+ const partiallyBlocked = [];
800
865
  for (const crawler of AI_CRAWLERS) {
801
866
  const crawlerLower = crawler.toLowerCase();
802
- let currentAgent = "";
803
- let isBlocked = false;
804
- let found = false;
805
- for (const line of lines) {
806
- const lower = line.toLowerCase();
807
- if (lower.startsWith("user-agent:")) {
808
- currentAgent = lower.split(":")[1]?.trim() || "";
809
- } else if (currentAgent === crawlerLower || currentAgent === "*") {
810
- if (lower.startsWith("disallow:")) {
811
- const path = lower.split(":")[1]?.trim();
812
- if (path === "/") {
813
- if (currentAgent === crawlerLower) {
814
- isBlocked = true;
815
- found = true;
816
- } else if (currentAgent === "*" && !found) {
817
- isBlocked = true;
818
- }
819
- }
820
- } else if (lower.startsWith("allow:")) {
821
- if (currentAgent === crawlerLower) {
822
- found = true;
823
- isBlocked = false;
824
- }
825
- }
826
- }
867
+ const { specific, wildcard } = matchingRulesForCrawler(
868
+ groups,
869
+ crawlerLower
870
+ );
871
+ const applicableRules = specific.length > 0 ? specific : wildcard;
872
+ if (applicableRules.length === 0) {
873
+ unknown.push(crawler);
874
+ continue;
827
875
  }
828
- if (found) {
829
- if (isBlocked) blocked.push(crawler);
830
- else allowed.push(crawler);
831
- } else if (isBlocked) {
876
+ const isSiteBlocked = resolvesPathAsBlocked(applicableRules, "/");
877
+ if (isSiteBlocked) {
832
878
  blocked.push(crawler);
833
879
  } else {
834
- unknown.push(crawler);
880
+ allowed.push(crawler);
881
+ const pathBlocks = findPartialBlocks(applicableRules);
882
+ for (const path of pathBlocks) {
883
+ const entry = `${crawler}: ${path}`;
884
+ if (!partiallyBlocked.includes(entry)) {
885
+ partiallyBlocked.push(entry);
886
+ }
887
+ }
835
888
  }
836
889
  }
837
- return { allowed, blocked, unknown };
890
+ return {
891
+ allowed,
892
+ blocked,
893
+ unknown,
894
+ ...partiallyBlocked.length > 0 && { partiallyBlocked }
895
+ };
838
896
  }
839
897
 
840
898
  // src/modules/audits/categories/content-extractability.ts
@@ -851,7 +909,7 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
851
909
  )
852
910
  );
853
911
  const extractRatio = page.stats.rawByteLength > 0 ? page.stats.cleanTextLength / page.stats.rawByteLength : 0;
854
- const extractScore = extractRatio >= 0.05 && extractRatio <= 0.15 ? 12 : extractRatio >= 0.01 ? 8 : extractRatio > 0.15 ? 10 : 2;
912
+ const extractScore = extractRatio >= 0.05 && extractRatio <= 0.15 ? 12 : extractRatio > 0.15 ? 10 : extractRatio >= 0.01 ? 8 : 2;
855
913
  factors.push(
856
914
  makeFactor(
857
915
  "Text Extraction Quality",
@@ -876,7 +934,7 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
876
934
  )
877
935
  );
878
936
  const wc = page.stats.wordCount;
879
- const wcScore = wc >= 300 && wc <= 3e3 ? 12 : wc >= 100 ? 8 : wc > 3e3 ? 10 : 2;
937
+ const wcScore = wc >= 300 && wc <= 3e3 ? 12 : wc > 3e3 ? 10 : wc >= 100 ? 8 : 2;
880
938
  factors.push(makeFactor("Word Count Adequacy", wcScore, 12, `${wc} words`));
881
939
  if (domainSignals) {
882
940
  const access2 = checkCrawlerAccess(domainSignals.robotsTxt);
@@ -891,6 +949,10 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
891
949
  )
892
950
  );
893
951
  rawData.crawlerAccess = access2;
952
+ rawData.llmsTxt = {
953
+ llmsTxtExists: domainSignals.llmsTxtExists,
954
+ llmsFullTxtExists: domainSignals.llmsFullTxtExists
955
+ };
894
956
  const hasLlms = domainSignals.llmsTxtExists;
895
957
  const hasLlmsFull = domainSignals.llmsFullTxtExists;
896
958
  const llmsScore = hasLlms && hasLlmsFull ? 6 : hasLlms || hasLlmsFull ? 4 : 0;
@@ -1110,6 +1172,7 @@ function auditGroundingSignals(page) {
1110
1172
  const $ = page.$;
1111
1173
  const text = page.cleanText;
1112
1174
  const factors = [];
1175
+ const { numberCount = 0 } = extractEntities(text);
1113
1176
  const externalLinks = page.externalLinks;
1114
1177
  const extScore = thresholdScore(externalLinks.length, [
1115
1178
  [6, 13],
@@ -1143,7 +1206,8 @@ function auditGroundingSignals(page) {
1143
1206
  )
1144
1207
  );
1145
1208
  const numericCount = countPatternMatches(text, NUMERIC_CLAIM_PATTERNS);
1146
- const numScore = thresholdScore(numericCount, [
1209
+ const totalNumericSignals = numericCount + numberCount;
1210
+ const numScore = thresholdScore(totalNumericSignals, [
1147
1211
  [9, 13],
1148
1212
  [4, 9],
1149
1213
  [1, 5],
@@ -1154,7 +1218,7 @@ function auditGroundingSignals(page) {
1154
1218
  "Numeric Claims",
1155
1219
  numScore,
1156
1220
  13,
1157
- `${numericCount} statistical references`
1221
+ `${numericCount} statistical references, ${numberCount} numeric values`
1158
1222
  )
1159
1223
  );
1160
1224
  const attrCount = countPatternMatches(text, ATTRIBUTION_PATTERNS);
@@ -1359,7 +1423,11 @@ function removeBoilerplate($) {
1359
1423
  function normalizeWhitespace(text) {
1360
1424
  return text.replace(/\s+/g, " ").trim();
1361
1425
  }
1426
// Selector for elements that receive a trailing space before text extraction.
var BLOCK_ELEMENTS = "p,div,td,th,li,h1,h2,h3,h4,h5,h6,dt,dd,br,blockquote,section,article";
/**
 * Extract the whitespace-normalized text of <body>.
 *
 * A single space is appended inside every element matched by BLOCK_ELEMENTS
 * first, so text from adjacent blocks does not run together. Note that this
 * mutates the document behind `$`.
 *
 * @param {Function} $ Cheerio-style selector function for the loaded page.
 * @returns {string} Normalized body text.
 */
function extractCleanText($) {
  const blockNodes = $(BLOCK_ELEMENTS);
  blockNodes.each((_index, node) => {
    $(node).append(" ");
  });
  const bodyText = $("body").text();
  return normalizeWhitespace(bodyText);
}
1365
1433
 
@@ -1368,7 +1436,9 @@ function extractPage(html, url) {
1368
1436
  const $ = cheerio.load(html);
1369
1437
  const title = $("title").text().trim() || $('meta[property="og:title"]').attr("content")?.trim() || "";
1370
1438
  const metaDescription = $('meta[name="description"]').attr("content")?.trim() || $('meta[property="og:description"]').attr("content")?.trim() || "";
1371
- const rawText = $("body").text().replace(/\s+/g, " ").trim();
1439
+ const $raw = cheerio.load(html);
1440
+ $raw("script, style, noscript").remove();
1441
+ const rawText = $raw("body").text().replace(/\s+/g, " ").trim();
1372
1442
  const rawByteLength = Buffer.byteLength(html, "utf-8");
1373
1443
  const h1Count = $("h1").length;
1374
1444
  const h2Count = $("h2").length;
@@ -1380,9 +1450,22 @@ function extractPage(html, url) {
1380
1450
  const listItemCount = $("li").length;
1381
1451
  const tableCount = $("table").length;
1382
1452
  const paragraphCount = $("p").length;
1453
+ const GENERIC_ALT_VALUES = /* @__PURE__ */ new Set([
1454
+ "image",
1455
+ "photo",
1456
+ "logo",
1457
+ "icon",
1458
+ "picture",
1459
+ "img",
1460
+ "graphic",
1461
+ "thumbnail"
1462
+ ]);
1383
1463
  let imagesWithAlt = 0;
1384
1464
  $("img").each((_, el) => {
1385
- if ($(el).attr("alt")) imagesWithAlt++;
1465
+ const alt = $(el).attr("alt")?.trim() ?? "";
1466
+ const words = alt.split(/\s+/).filter((w) => w.length > 0);
1467
+ const isMeaningful = words.length > 1 && alt.length < 200 && !GENERIC_ALT_VALUES.has(alt.toLowerCase());
1468
+ if (isMeaningful) imagesWithAlt++;
1386
1469
  });
1387
1470
  const pageDomain = getDomain(url);
1388
1471
  const externalLinks = [];
@@ -1516,9 +1599,19 @@ var RECOMMENDATION_BUILDERS = {
1516
1599
  }
1517
1600
  return `Your robots.txt is blocking ${blocked}. Blocking these crawlers means your content cannot be discovered or cited by AI engines.`;
1518
1601
  },
1519
- "LLMs.txt Presence": static_(
1520
- "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site, helping them understand and reference your content more effectively."
1521
- ),
1602
+ "LLMs.txt Presence": (rawData) => {
1603
+ const llms = rawData.llmsTxt;
1604
+ if (!llms) {
1605
+ return "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site.";
1606
+ }
1607
+ if (llms.llmsTxtExists && !llms.llmsFullTxtExists) {
1608
+ return "You have llms.txt but are missing llms-full.txt. Adding llms-full.txt provides AI systems with a comprehensive version of your site documentation for deeper ingestion.";
1609
+ }
1610
+ if (!llms.llmsTxtExists && llms.llmsFullTxtExists) {
1611
+ return "You have llms-full.txt but are missing llms.txt. Adding llms.txt provides AI systems with a concise structured overview of your site's purpose and key pages.";
1612
+ }
1613
+ return "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site.";
1614
+ },
1522
1615
  "Image Accessibility": (rawData) => {
1523
1616
  const images = rawData.imageAccessibility;
1524
1617
  if (!images || images.imageCount === 0) {
package/dist/cli.mjs CHANGED
@@ -3,7 +3,7 @@ import { Command } from "commander";
3
3
 
4
4
  // src/modules/analyzer/constants.ts
5
5
  var DOMAIN_SIGNAL_TIMEOUT_CAP = 5e3;
6
- var VERSION = true ? "1.2.7" : "0.0.0";
6
+ var VERSION = true ? "1.3.0" : "0.0.0";
7
7
 
8
8
  // src/modules/fetcher/constants.ts
9
9
  var MAX_RESPONSE_SIZE = 10 * 1024 * 1024;
@@ -342,7 +342,16 @@ function extractEntities(text) {
342
342
  0,
343
343
  15
344
344
  );
345
- return { people, organizations, places, topics };
345
+ const imperativeVerbCount = doc.verbs().isImperative().length;
346
+ const numberCount = doc.numbers().length;
347
+ return {
348
+ people,
349
+ organizations,
350
+ places,
351
+ topics,
352
+ imperativeVerbCount,
353
+ numberCount
354
+ };
346
355
  }
347
356
  function computeFleschReadingEase(text) {
348
357
  const words = text.split(/\s+/).filter((w) => w.length > 0);
@@ -411,6 +420,7 @@ function auditAnswerability(page) {
411
420
  const text = page.cleanText;
412
421
  const $ = page.$;
413
422
  const factors = [];
423
+ const { imperativeVerbCount = 0 } = extractEntities(text);
414
424
  const defCount = countPatternMatches(text, DEFINITION_PATTERNS);
415
425
  const defScore = thresholdScore(defCount, [
416
426
  [6, 10],
@@ -455,7 +465,7 @@ function auditAnswerability(page) {
455
465
  );
456
466
  const stepCount = countPatternMatches(text, STEP_PATTERNS);
457
467
  const hasOl = $("ol").length > 0;
458
- const stepTotal = stepCount + (hasOl ? 2 : 0);
468
+ const stepTotal = stepCount + imperativeVerbCount + (hasOl ? 2 : 0);
459
469
  const stepScore = thresholdScore(stepTotal, [
460
470
  [5, 10],
461
471
  [2, 7],
@@ -467,7 +477,7 @@ function auditAnswerability(page) {
467
477
  "Step-by-Step Content",
468
478
  stepScore,
469
479
  10,
470
- `${stepCount} step indicators${hasOl ? ", ordered lists found" : ""}`
480
+ `${stepCount} step indicators, ${imperativeVerbCount} instruction verbs${hasOl ? ", ordered lists found" : ""}`
471
481
  )
472
482
  );
473
483
  const questionMatches = text.match(/[^.!?]*\?/g) || [];
@@ -766,51 +776,99 @@ function auditAuthorityContext(page) {
766
776
  }
767
777
 
768
778
  // src/modules/audits/support/robots.ts
779
/**
 * Parse robots.txt text into user-agent groups.
 *
 * A group is one or more consecutive `User-agent` lines followed by the
 * `Allow` / `Disallow` rules that apply to those agents. Grouping follows
 * RFC 9309:
 *   - blank lines and `#` comments are ignored (they do not end a group);
 *   - a `User-agent` line that appears after the open group already has
 *     rules starts a NEW group instead of joining the previous one;
 *   - rules that appear before any `User-agent` line are dropped;
 *   - other fields (Sitemap, Crawl-delay, ...) are ignored.
 *
 * @param {string} robotsTxt Raw robots.txt content.
 * @returns {{agents: string[], rules: {type: "allow"|"disallow", path: string}[]}[]}
 *   Groups in file order. Agent names are lower-cased; rule paths are kept
 *   verbatim (trimmed), so an empty `Disallow:` yields `path: ""`.
 */
function parseRobotGroups(robotsTxt) {
  const groups = [];
  let current = null;
  for (const raw of robotsTxt.split("\n")) {
    // Strip trailing comments; trim() also removes the "\r" of CRLF files.
    const line = raw.split("#")[0].trim();
    if (!line) continue;
    const colonAt = line.indexOf(":");
    if (colonAt === -1) continue;
    const field = line.slice(0, colonAt).trim().toLowerCase();
    const value = line.slice(colonAt + 1).trim();
    if (field === "user-agent") {
      // Once a group has rules, the next User-agent line opens a new group.
      // Without this check, "UA a / Disallow / UA b / Allow" would merge
      // both agents into one group and mix their rules.
      if (!current || current.rules.length > 0) {
        current = { agents: [], rules: [] };
        groups.push(current);
      }
      current.agents.push(value.toLowerCase());
    } else if (field === "disallow" || field === "allow") {
      if (current) {
        current.rules.push({ type: field, path: value });
      }
    }
  }
  return groups;
}
806
/**
 * Collect the rules that apply to one crawler, split by how the group
 * matched it.
 *
 * @param {{agents: string[], rules: object[]}[]} groups Parsed robots.txt groups.
 * @param {string} crawlerLower Lower-cased crawler name to look up.
 * @returns {{specific: object[], wildcard: object[]}} `specific` holds rules
 *   from groups naming the crawler exactly; `wildcard` holds rules from
 *   groups naming `*` (only when the group does not also name the crawler).
 */
function matchingRulesForCrawler(groups, crawlerLower) {
  const buckets = { specific: [], wildcard: [] };
  groups.forEach(({ agents, rules }) => {
    // An exact name match takes precedence over a "*" entry in the same group.
    if (agents.includes(crawlerLower)) {
      buckets.specific.push(...rules);
      return;
    }
    if (agents.includes("*")) {
      buckets.wildcard.push(...rules);
    }
  });
  return buckets;
}
815
/**
 * Decide whether `path` is blocked by a rule set using longest-prefix
 * matching.
 *
 * Rules with an empty path, or whose path is not a literal prefix of `path`,
 * are ignored. Among matching rules the longest prefix wins; when an allow
 * and a disallow rule tie on length, allow wins.
 *
 * @param {{type: string, path: string}[]} rules Allow/disallow rules.
 * @param {string} path URL path to test.
 * @returns {boolean} True only when the winning rule is a disallow.
 */
function resolvesPathAsBlocked(rules, path) {
  let winner = null;
  for (const { type, path: prefix } of rules) {
    const applies = Boolean(prefix) && path.startsWith(prefix);
    if (!applies) continue;
    if (winner === null || prefix.length > winner.length) {
      winner = { length: prefix.length, blocks: type === "disallow" };
      continue;
    }
    // Equal-length tie: an allow rule overrides a disallow rule.
    if (prefix.length === winner.length && type === "allow") {
      winner.blocks = false;
    }
  }
  return winner !== null && winner.blocks;
}
830
/**
 * List the paths of disallow rules that block only part of a site.
 *
 * @param {{type: string, path: string}[]} rules Allow/disallow rules.
 * @returns {string[]} Paths of disallow rules, in order, excluding empty
 *   paths and the site-wide "/" path.
 */
function findPartialBlocks(rules) {
  const partial = [];
  for (const { type, path } of rules) {
    if (type !== "disallow") continue;
    if (!path || path === "/") continue;
    partial.push(path);
  }
  return partial;
}
769
833
  function checkCrawlerAccess(robotsTxt) {
770
834
  if (!robotsTxt)
771
835
  return { allowed: [], blocked: [], unknown: [...AI_CRAWLERS] };
772
- const lines = robotsTxt.split("\n").map((l) => l.trim());
836
+ const groups = parseRobotGroups(robotsTxt);
773
837
  const allowed = [];
774
838
  const blocked = [];
775
839
  const unknown = [];
840
+ const partiallyBlocked = [];
776
841
  for (const crawler of AI_CRAWLERS) {
777
842
  const crawlerLower = crawler.toLowerCase();
778
- let currentAgent = "";
779
- let isBlocked = false;
780
- let found = false;
781
- for (const line of lines) {
782
- const lower = line.toLowerCase();
783
- if (lower.startsWith("user-agent:")) {
784
- currentAgent = lower.split(":")[1]?.trim() || "";
785
- } else if (currentAgent === crawlerLower || currentAgent === "*") {
786
- if (lower.startsWith("disallow:")) {
787
- const path = lower.split(":")[1]?.trim();
788
- if (path === "/") {
789
- if (currentAgent === crawlerLower) {
790
- isBlocked = true;
791
- found = true;
792
- } else if (currentAgent === "*" && !found) {
793
- isBlocked = true;
794
- }
795
- }
796
- } else if (lower.startsWith("allow:")) {
797
- if (currentAgent === crawlerLower) {
798
- found = true;
799
- isBlocked = false;
800
- }
801
- }
802
- }
843
+ const { specific, wildcard } = matchingRulesForCrawler(
844
+ groups,
845
+ crawlerLower
846
+ );
847
+ const applicableRules = specific.length > 0 ? specific : wildcard;
848
+ if (applicableRules.length === 0) {
849
+ unknown.push(crawler);
850
+ continue;
803
851
  }
804
- if (found) {
805
- if (isBlocked) blocked.push(crawler);
806
- else allowed.push(crawler);
807
- } else if (isBlocked) {
852
+ const isSiteBlocked = resolvesPathAsBlocked(applicableRules, "/");
853
+ if (isSiteBlocked) {
808
854
  blocked.push(crawler);
809
855
  } else {
810
- unknown.push(crawler);
856
+ allowed.push(crawler);
857
+ const pathBlocks = findPartialBlocks(applicableRules);
858
+ for (const path of pathBlocks) {
859
+ const entry = `${crawler}: ${path}`;
860
+ if (!partiallyBlocked.includes(entry)) {
861
+ partiallyBlocked.push(entry);
862
+ }
863
+ }
811
864
  }
812
865
  }
813
- return { allowed, blocked, unknown };
866
+ return {
867
+ allowed,
868
+ blocked,
869
+ unknown,
870
+ ...partiallyBlocked.length > 0 && { partiallyBlocked }
871
+ };
814
872
  }
815
873
 
816
874
  // src/modules/audits/categories/content-extractability.ts
@@ -827,7 +885,7 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
827
885
  )
828
886
  );
829
887
  const extractRatio = page.stats.rawByteLength > 0 ? page.stats.cleanTextLength / page.stats.rawByteLength : 0;
830
- const extractScore = extractRatio >= 0.05 && extractRatio <= 0.15 ? 12 : extractRatio >= 0.01 ? 8 : extractRatio > 0.15 ? 10 : 2;
888
+ const extractScore = extractRatio >= 0.05 && extractRatio <= 0.15 ? 12 : extractRatio > 0.15 ? 10 : extractRatio >= 0.01 ? 8 : 2;
831
889
  factors.push(
832
890
  makeFactor(
833
891
  "Text Extraction Quality",
@@ -852,7 +910,7 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
852
910
  )
853
911
  );
854
912
  const wc = page.stats.wordCount;
855
- const wcScore = wc >= 300 && wc <= 3e3 ? 12 : wc >= 100 ? 8 : wc > 3e3 ? 10 : 2;
913
+ const wcScore = wc >= 300 && wc <= 3e3 ? 12 : wc > 3e3 ? 10 : wc >= 100 ? 8 : 2;
856
914
  factors.push(makeFactor("Word Count Adequacy", wcScore, 12, `${wc} words`));
857
915
  if (domainSignals) {
858
916
  const access2 = checkCrawlerAccess(domainSignals.robotsTxt);
@@ -867,6 +925,10 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
867
925
  )
868
926
  );
869
927
  rawData.crawlerAccess = access2;
928
+ rawData.llmsTxt = {
929
+ llmsTxtExists: domainSignals.llmsTxtExists,
930
+ llmsFullTxtExists: domainSignals.llmsFullTxtExists
931
+ };
870
932
  const hasLlms = domainSignals.llmsTxtExists;
871
933
  const hasLlmsFull = domainSignals.llmsFullTxtExists;
872
934
  const llmsScore = hasLlms && hasLlmsFull ? 6 : hasLlms || hasLlmsFull ? 4 : 0;
@@ -1086,6 +1148,7 @@ function auditGroundingSignals(page) {
1086
1148
  const $ = page.$;
1087
1149
  const text = page.cleanText;
1088
1150
  const factors = [];
1151
+ const { numberCount = 0 } = extractEntities(text);
1089
1152
  const externalLinks = page.externalLinks;
1090
1153
  const extScore = thresholdScore(externalLinks.length, [
1091
1154
  [6, 13],
@@ -1119,7 +1182,8 @@ function auditGroundingSignals(page) {
1119
1182
  )
1120
1183
  );
1121
1184
  const numericCount = countPatternMatches(text, NUMERIC_CLAIM_PATTERNS);
1122
- const numScore = thresholdScore(numericCount, [
1185
+ const totalNumericSignals = numericCount + numberCount;
1186
+ const numScore = thresholdScore(totalNumericSignals, [
1123
1187
  [9, 13],
1124
1188
  [4, 9],
1125
1189
  [1, 5],
@@ -1130,7 +1194,7 @@ function auditGroundingSignals(page) {
1130
1194
  "Numeric Claims",
1131
1195
  numScore,
1132
1196
  13,
1133
- `${numericCount} statistical references`
1197
+ `${numericCount} statistical references, ${numberCount} numeric values`
1134
1198
  )
1135
1199
  );
1136
1200
  const attrCount = countPatternMatches(text, ATTRIBUTION_PATTERNS);
@@ -1335,7 +1399,11 @@ function removeBoilerplate($) {
1335
1399
  function normalizeWhitespace(text) {
1336
1400
  return text.replace(/\s+/g, " ").trim();
1337
1401
  }
1402
// Selector for elements that receive a trailing space before text extraction.
var BLOCK_ELEMENTS = "p,div,td,th,li,h1,h2,h3,h4,h5,h6,dt,dd,br,blockquote,section,article";
/**
 * Extract the whitespace-normalized text of <body>.
 *
 * A single space is appended inside every element matched by BLOCK_ELEMENTS
 * first, so text from adjacent blocks does not run together. Note that this
 * mutates the document behind `$`.
 *
 * @param {Function} $ Cheerio-style selector function for the loaded page.
 * @returns {string} Normalized body text.
 */
function extractCleanText($) {
  const blockNodes = $(BLOCK_ELEMENTS);
  blockNodes.each((_index, node) => {
    $(node).append(" ");
  });
  const bodyText = $("body").text();
  return normalizeWhitespace(bodyText);
}
1341
1409
 
@@ -1344,7 +1412,9 @@ function extractPage(html, url) {
1344
1412
  const $ = cheerio.load(html);
1345
1413
  const title = $("title").text().trim() || $('meta[property="og:title"]').attr("content")?.trim() || "";
1346
1414
  const metaDescription = $('meta[name="description"]').attr("content")?.trim() || $('meta[property="og:description"]').attr("content")?.trim() || "";
1347
- const rawText = $("body").text().replace(/\s+/g, " ").trim();
1415
+ const $raw = cheerio.load(html);
1416
+ $raw("script, style, noscript").remove();
1417
+ const rawText = $raw("body").text().replace(/\s+/g, " ").trim();
1348
1418
  const rawByteLength = Buffer.byteLength(html, "utf-8");
1349
1419
  const h1Count = $("h1").length;
1350
1420
  const h2Count = $("h2").length;
@@ -1356,9 +1426,22 @@ function extractPage(html, url) {
1356
1426
  const listItemCount = $("li").length;
1357
1427
  const tableCount = $("table").length;
1358
1428
  const paragraphCount = $("p").length;
1429
+ const GENERIC_ALT_VALUES = /* @__PURE__ */ new Set([
1430
+ "image",
1431
+ "photo",
1432
+ "logo",
1433
+ "icon",
1434
+ "picture",
1435
+ "img",
1436
+ "graphic",
1437
+ "thumbnail"
1438
+ ]);
1359
1439
  let imagesWithAlt = 0;
1360
1440
  $("img").each((_, el) => {
1361
- if ($(el).attr("alt")) imagesWithAlt++;
1441
+ const alt = $(el).attr("alt")?.trim() ?? "";
1442
+ const words = alt.split(/\s+/).filter((w) => w.length > 0);
1443
+ const isMeaningful = words.length > 1 && alt.length < 200 && !GENERIC_ALT_VALUES.has(alt.toLowerCase());
1444
+ if (isMeaningful) imagesWithAlt++;
1362
1445
  });
1363
1446
  const pageDomain = getDomain(url);
1364
1447
  const externalLinks = [];
@@ -1492,9 +1575,19 @@ var RECOMMENDATION_BUILDERS = {
1492
1575
  }
1493
1576
  return `Your robots.txt is blocking ${blocked}. Blocking these crawlers means your content cannot be discovered or cited by AI engines.`;
1494
1577
  },
1495
- "LLMs.txt Presence": static_(
1496
- "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site, helping them understand and reference your content more effectively."
1497
- ),
1578
+ "LLMs.txt Presence": (rawData) => {
1579
+ const llms = rawData.llmsTxt;
1580
+ if (!llms) {
1581
+ return "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site.";
1582
+ }
1583
+ if (llms.llmsTxtExists && !llms.llmsFullTxtExists) {
1584
+ return "You have llms.txt but are missing llms-full.txt. Adding llms-full.txt provides AI systems with a comprehensive version of your site documentation for deeper ingestion.";
1585
+ }
1586
+ if (!llms.llmsTxtExists && llms.llmsFullTxtExists) {
1587
+ return "You have llms-full.txt but are missing llms.txt. Adding llms.txt provides AI systems with a concise structured overview of your site's purpose and key pages.";
1588
+ }
1589
+ return "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site.";
1590
+ },
1498
1591
  "Image Accessibility": (rawData) => {
1499
1592
  const images = rawData.imageAccessibility;
1500
1593
  if (!images || images.imageCount === 0) {
package/dist/index.d.mts CHANGED
@@ -90,6 +90,11 @@ declare const AnalyzerResultSchema: z.ZodObject<{
90
90
  allowed: z.ZodArray<z.ZodString>;
91
91
  blocked: z.ZodArray<z.ZodString>;
92
92
  unknown: z.ZodArray<z.ZodString>;
93
+ partiallyBlocked: z.ZodOptional<z.ZodArray<z.ZodString>>;
94
+ }, z.core.$strip>>;
95
+ llmsTxt: z.ZodOptional<z.ZodObject<{
96
+ llmsTxtExists: z.ZodBoolean;
97
+ llmsFullTxtExists: z.ZodBoolean;
93
98
  }, z.core.$strip>>;
94
99
  sectionLengths: z.ZodOptional<z.ZodObject<{
95
100
  sectionCount: z.ZodNumber;
@@ -105,6 +110,8 @@ declare const AnalyzerResultSchema: z.ZodObject<{
105
110
  organizations: z.ZodArray<z.ZodString>;
106
111
  places: z.ZodArray<z.ZodString>;
107
112
  topics: z.ZodArray<z.ZodString>;
113
+ imperativeVerbCount: z.ZodOptional<z.ZodNumber>;
114
+ numberCount: z.ZodOptional<z.ZodNumber>;
108
115
  }, z.core.$strip>>;
109
116
  externalLinks: z.ZodOptional<z.ZodArray<z.ZodObject<{
110
117
  url: z.ZodString;
@@ -266,6 +273,11 @@ declare const AuditResultSchema: z.ZodObject<{
266
273
  allowed: z.ZodArray<z.ZodString>;
267
274
  blocked: z.ZodArray<z.ZodString>;
268
275
  unknown: z.ZodArray<z.ZodString>;
276
+ partiallyBlocked: z.ZodOptional<z.ZodArray<z.ZodString>>;
277
+ }, z.core.$strip>>;
278
+ llmsTxt: z.ZodOptional<z.ZodObject<{
279
+ llmsTxtExists: z.ZodBoolean;
280
+ llmsFullTxtExists: z.ZodBoolean;
269
281
  }, z.core.$strip>>;
270
282
  sectionLengths: z.ZodOptional<z.ZodObject<{
271
283
  sectionCount: z.ZodNumber;
@@ -281,6 +293,8 @@ declare const AuditResultSchema: z.ZodObject<{
281
293
  organizations: z.ZodArray<z.ZodString>;
282
294
  places: z.ZodArray<z.ZodString>;
283
295
  topics: z.ZodArray<z.ZodString>;
296
+ imperativeVerbCount: z.ZodOptional<z.ZodNumber>;
297
+ numberCount: z.ZodOptional<z.ZodNumber>;
284
298
  }, z.core.$strip>>;
285
299
  externalLinks: z.ZodOptional<z.ZodArray<z.ZodObject<{
286
300
  url: z.ZodString;
package/dist/index.d.ts CHANGED
@@ -90,6 +90,11 @@ declare const AnalyzerResultSchema: z.ZodObject<{
90
90
  allowed: z.ZodArray<z.ZodString>;
91
91
  blocked: z.ZodArray<z.ZodString>;
92
92
  unknown: z.ZodArray<z.ZodString>;
93
+ partiallyBlocked: z.ZodOptional<z.ZodArray<z.ZodString>>;
94
+ }, z.core.$strip>>;
95
+ llmsTxt: z.ZodOptional<z.ZodObject<{
96
+ llmsTxtExists: z.ZodBoolean;
97
+ llmsFullTxtExists: z.ZodBoolean;
93
98
  }, z.core.$strip>>;
94
99
  sectionLengths: z.ZodOptional<z.ZodObject<{
95
100
  sectionCount: z.ZodNumber;
@@ -105,6 +110,8 @@ declare const AnalyzerResultSchema: z.ZodObject<{
105
110
  organizations: z.ZodArray<z.ZodString>;
106
111
  places: z.ZodArray<z.ZodString>;
107
112
  topics: z.ZodArray<z.ZodString>;
113
+ imperativeVerbCount: z.ZodOptional<z.ZodNumber>;
114
+ numberCount: z.ZodOptional<z.ZodNumber>;
108
115
  }, z.core.$strip>>;
109
116
  externalLinks: z.ZodOptional<z.ZodArray<z.ZodObject<{
110
117
  url: z.ZodString;
@@ -266,6 +273,11 @@ declare const AuditResultSchema: z.ZodObject<{
266
273
  allowed: z.ZodArray<z.ZodString>;
267
274
  blocked: z.ZodArray<z.ZodString>;
268
275
  unknown: z.ZodArray<z.ZodString>;
276
+ partiallyBlocked: z.ZodOptional<z.ZodArray<z.ZodString>>;
277
+ }, z.core.$strip>>;
278
+ llmsTxt: z.ZodOptional<z.ZodObject<{
279
+ llmsTxtExists: z.ZodBoolean;
280
+ llmsFullTxtExists: z.ZodBoolean;
269
281
  }, z.core.$strip>>;
270
282
  sectionLengths: z.ZodOptional<z.ZodObject<{
271
283
  sectionCount: z.ZodNumber;
@@ -281,6 +293,8 @@ declare const AuditResultSchema: z.ZodObject<{
281
293
  organizations: z.ZodArray<z.ZodString>;
282
294
  places: z.ZodArray<z.ZodString>;
283
295
  topics: z.ZodArray<z.ZodString>;
296
+ imperativeVerbCount: z.ZodOptional<z.ZodNumber>;
297
+ numberCount: z.ZodOptional<z.ZodNumber>;
284
298
  }, z.core.$strip>>;
285
299
  externalLinks: z.ZodOptional<z.ZodArray<z.ZodObject<{
286
300
  url: z.ZodString;
package/dist/index.js CHANGED
@@ -364,7 +364,16 @@ function extractEntities(text) {
364
364
  0,
365
365
  15
366
366
  );
367
- return { people, organizations, places, topics };
367
+ const imperativeVerbCount = doc.verbs().isImperative().length;
368
+ const numberCount = doc.numbers().length;
369
+ return {
370
+ people,
371
+ organizations,
372
+ places,
373
+ topics,
374
+ imperativeVerbCount,
375
+ numberCount
376
+ };
368
377
  }
369
378
  function computeFleschReadingEase(text) {
370
379
  const words = text.split(/\s+/).filter((w) => w.length > 0);
@@ -433,6 +442,7 @@ function auditAnswerability(page) {
433
442
  const text = page.cleanText;
434
443
  const $ = page.$;
435
444
  const factors = [];
445
+ const { imperativeVerbCount = 0 } = extractEntities(text);
436
446
  const defCount = countPatternMatches(text, DEFINITION_PATTERNS);
437
447
  const defScore = thresholdScore(defCount, [
438
448
  [6, 10],
@@ -477,7 +487,7 @@ function auditAnswerability(page) {
477
487
  );
478
488
  const stepCount = countPatternMatches(text, STEP_PATTERNS);
479
489
  const hasOl = $("ol").length > 0;
480
- const stepTotal = stepCount + (hasOl ? 2 : 0);
490
+ const stepTotal = stepCount + imperativeVerbCount + (hasOl ? 2 : 0);
481
491
  const stepScore = thresholdScore(stepTotal, [
482
492
  [5, 10],
483
493
  [2, 7],
@@ -489,7 +499,7 @@ function auditAnswerability(page) {
489
499
  "Step-by-Step Content",
490
500
  stepScore,
491
501
  10,
492
- `${stepCount} step indicators${hasOl ? ", ordered lists found" : ""}`
502
+ `${stepCount} step indicators, ${imperativeVerbCount} instruction verbs${hasOl ? ", ordered lists found" : ""}`
493
503
  )
494
504
  );
495
505
  const questionMatches = text.match(/[^.!?]*\?/g) || [];
@@ -788,51 +798,99 @@ function auditAuthorityContext(page) {
788
798
  }
789
799
 
790
800
  // src/modules/audits/support/robots.ts
801
+ function parseRobotGroups(robotsTxt) {
802
+ const groups = [];
803
+ let current = null;
804
+ for (const raw of robotsTxt.split("\n")) {
805
+ const line = raw.split("#")[0].trim();
806
+ if (!line) {
807
+ current = null;
808
+ continue;
809
+ }
810
+ const colonAt = line.indexOf(":");
811
+ if (colonAt === -1) continue;
812
+ const field = line.slice(0, colonAt).trim().toLowerCase();
813
+ const value = line.slice(colonAt + 1).trim();
814
+ if (field === "user-agent") {
815
+ if (!current) {
816
+ current = { agents: [], rules: [] };
817
+ groups.push(current);
818
+ }
819
+ current.agents.push(value.toLowerCase());
820
+ } else if (field === "disallow" || field === "allow") {
821
+ if (current) {
822
+ current.rules.push({ type: field, path: value });
823
+ }
824
+ }
825
+ }
826
+ return groups;
827
+ }
828
+ function matchingRulesForCrawler(groups, crawlerLower) {
829
+ const specific = [];
830
+ const wildcard = [];
831
+ for (const group of groups) {
832
+ if (group.agents.includes(crawlerLower)) specific.push(...group.rules);
833
+ else if (group.agents.includes("*")) wildcard.push(...group.rules);
834
+ }
835
+ return { specific, wildcard };
836
+ }
837
+ function resolvesPathAsBlocked(rules, path) {
838
+ let bestMatchLength = -1;
839
+ let bestMatchIsDisallow = false;
840
+ for (const rule of rules) {
841
+ const rulePath = rule.path;
842
+ if (!rulePath || !path.startsWith(rulePath)) continue;
843
+ if (rulePath.length > bestMatchLength) {
844
+ bestMatchLength = rulePath.length;
845
+ bestMatchIsDisallow = rule.type === "disallow";
846
+ } else if (rulePath.length === bestMatchLength && rule.type === "allow") {
847
+ bestMatchIsDisallow = false;
848
+ }
849
+ }
850
+ return bestMatchLength >= 0 && bestMatchIsDisallow;
851
+ }
852
+ function findPartialBlocks(rules) {
853
+ return rules.filter((r) => r.type === "disallow" && r.path && r.path !== "/").map((r) => r.path);
854
+ }
791
855
  function checkCrawlerAccess(robotsTxt) {
792
856
  if (!robotsTxt)
793
857
  return { allowed: [], blocked: [], unknown: [...AI_CRAWLERS] };
794
- const lines = robotsTxt.split("\n").map((l) => l.trim());
858
+ const groups = parseRobotGroups(robotsTxt);
795
859
  const allowed = [];
796
860
  const blocked = [];
797
861
  const unknown = [];
862
+ const partiallyBlocked = [];
798
863
  for (const crawler of AI_CRAWLERS) {
799
864
  const crawlerLower = crawler.toLowerCase();
800
- let currentAgent = "";
801
- let isBlocked = false;
802
- let found = false;
803
- for (const line of lines) {
804
- const lower = line.toLowerCase();
805
- if (lower.startsWith("user-agent:")) {
806
- currentAgent = lower.split(":")[1]?.trim() || "";
807
- } else if (currentAgent === crawlerLower || currentAgent === "*") {
808
- if (lower.startsWith("disallow:")) {
809
- const path = lower.split(":")[1]?.trim();
810
- if (path === "/") {
811
- if (currentAgent === crawlerLower) {
812
- isBlocked = true;
813
- found = true;
814
- } else if (currentAgent === "*" && !found) {
815
- isBlocked = true;
816
- }
817
- }
818
- } else if (lower.startsWith("allow:")) {
819
- if (currentAgent === crawlerLower) {
820
- found = true;
821
- isBlocked = false;
822
- }
823
- }
824
- }
865
+ const { specific, wildcard } = matchingRulesForCrawler(
866
+ groups,
867
+ crawlerLower
868
+ );
869
+ const applicableRules = specific.length > 0 ? specific : wildcard;
870
+ if (applicableRules.length === 0) {
871
+ unknown.push(crawler);
872
+ continue;
825
873
  }
826
- if (found) {
827
- if (isBlocked) blocked.push(crawler);
828
- else allowed.push(crawler);
829
- } else if (isBlocked) {
874
+ const isSiteBlocked = resolvesPathAsBlocked(applicableRules, "/");
875
+ if (isSiteBlocked) {
830
876
  blocked.push(crawler);
831
877
  } else {
832
- unknown.push(crawler);
878
+ allowed.push(crawler);
879
+ const pathBlocks = findPartialBlocks(applicableRules);
880
+ for (const path of pathBlocks) {
881
+ const entry = `${crawler}: ${path}`;
882
+ if (!partiallyBlocked.includes(entry)) {
883
+ partiallyBlocked.push(entry);
884
+ }
885
+ }
833
886
  }
834
887
  }
835
- return { allowed, blocked, unknown };
888
+ return {
889
+ allowed,
890
+ blocked,
891
+ unknown,
892
+ ...partiallyBlocked.length > 0 && { partiallyBlocked }
893
+ };
836
894
  }
837
895
 
838
896
  // src/modules/audits/categories/content-extractability.ts
@@ -849,7 +907,7 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
849
907
  )
850
908
  );
851
909
  const extractRatio = page.stats.rawByteLength > 0 ? page.stats.cleanTextLength / page.stats.rawByteLength : 0;
852
- const extractScore = extractRatio >= 0.05 && extractRatio <= 0.15 ? 12 : extractRatio >= 0.01 ? 8 : extractRatio > 0.15 ? 10 : 2;
910
+ const extractScore = extractRatio >= 0.05 && extractRatio <= 0.15 ? 12 : extractRatio > 0.15 ? 10 : extractRatio >= 0.01 ? 8 : 2;
853
911
  factors.push(
854
912
  makeFactor(
855
913
  "Text Extraction Quality",
@@ -874,7 +932,7 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
874
932
  )
875
933
  );
876
934
  const wc = page.stats.wordCount;
877
- const wcScore = wc >= 300 && wc <= 3e3 ? 12 : wc >= 100 ? 8 : wc > 3e3 ? 10 : 2;
935
+ const wcScore = wc >= 300 && wc <= 3e3 ? 12 : wc > 3e3 ? 10 : wc >= 100 ? 8 : 2;
878
936
  factors.push(makeFactor("Word Count Adequacy", wcScore, 12, `${wc} words`));
879
937
  if (domainSignals) {
880
938
  const access2 = checkCrawlerAccess(domainSignals.robotsTxt);
@@ -889,6 +947,10 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
889
947
  )
890
948
  );
891
949
  rawData.crawlerAccess = access2;
950
+ rawData.llmsTxt = {
951
+ llmsTxtExists: domainSignals.llmsTxtExists,
952
+ llmsFullTxtExists: domainSignals.llmsFullTxtExists
953
+ };
892
954
  const hasLlms = domainSignals.llmsTxtExists;
893
955
  const hasLlmsFull = domainSignals.llmsFullTxtExists;
894
956
  const llmsScore = hasLlms && hasLlmsFull ? 6 : hasLlms || hasLlmsFull ? 4 : 0;
@@ -1108,6 +1170,7 @@ function auditGroundingSignals(page) {
1108
1170
  const $ = page.$;
1109
1171
  const text = page.cleanText;
1110
1172
  const factors = [];
1173
+ const { numberCount = 0 } = extractEntities(text);
1111
1174
  const externalLinks = page.externalLinks;
1112
1175
  const extScore = thresholdScore(externalLinks.length, [
1113
1176
  [6, 13],
@@ -1141,7 +1204,8 @@ function auditGroundingSignals(page) {
1141
1204
  )
1142
1205
  );
1143
1206
  const numericCount = countPatternMatches(text, NUMERIC_CLAIM_PATTERNS);
1144
- const numScore = thresholdScore(numericCount, [
1207
+ const totalNumericSignals = numericCount + numberCount;
1208
+ const numScore = thresholdScore(totalNumericSignals, [
1145
1209
  [9, 13],
1146
1210
  [4, 9],
1147
1211
  [1, 5],
@@ -1152,7 +1216,7 @@ function auditGroundingSignals(page) {
1152
1216
  "Numeric Claims",
1153
1217
  numScore,
1154
1218
  13,
1155
- `${numericCount} statistical references`
1219
+ `${numericCount} statistical references, ${numberCount} numeric values`
1156
1220
  )
1157
1221
  );
1158
1222
  const attrCount = countPatternMatches(text, ATTRIBUTION_PATTERNS);
@@ -1357,7 +1421,11 @@ function removeBoilerplate($) {
1357
1421
  function normalizeWhitespace(text) {
1358
1422
  return text.replace(/\s+/g, " ").trim();
1359
1423
  }
1424
+ var BLOCK_ELEMENTS = "p,div,td,th,li,h1,h2,h3,h4,h5,h6,dt,dd,br,blockquote,section,article";
1360
1425
  function extractCleanText($) {
1426
+ $(BLOCK_ELEMENTS).each((_, el) => {
1427
+ $(el).append(" ");
1428
+ });
1361
1429
  return normalizeWhitespace($("body").text());
1362
1430
  }
1363
1431
 
@@ -1366,7 +1434,9 @@ function extractPage(html, url) {
1366
1434
  const $ = cheerio.load(html);
1367
1435
  const title = $("title").text().trim() || $('meta[property="og:title"]').attr("content")?.trim() || "";
1368
1436
  const metaDescription = $('meta[name="description"]').attr("content")?.trim() || $('meta[property="og:description"]').attr("content")?.trim() || "";
1369
- const rawText = $("body").text().replace(/\s+/g, " ").trim();
1437
+ const $raw = cheerio.load(html);
1438
+ $raw("script, style, noscript").remove();
1439
+ const rawText = $raw("body").text().replace(/\s+/g, " ").trim();
1370
1440
  const rawByteLength = Buffer.byteLength(html, "utf-8");
1371
1441
  const h1Count = $("h1").length;
1372
1442
  const h2Count = $("h2").length;
@@ -1378,9 +1448,22 @@ function extractPage(html, url) {
1378
1448
  const listItemCount = $("li").length;
1379
1449
  const tableCount = $("table").length;
1380
1450
  const paragraphCount = $("p").length;
1451
+ const GENERIC_ALT_VALUES = /* @__PURE__ */ new Set([
1452
+ "image",
1453
+ "photo",
1454
+ "logo",
1455
+ "icon",
1456
+ "picture",
1457
+ "img",
1458
+ "graphic",
1459
+ "thumbnail"
1460
+ ]);
1381
1461
  let imagesWithAlt = 0;
1382
1462
  $("img").each((_, el) => {
1383
- if ($(el).attr("alt")) imagesWithAlt++;
1463
+ const alt = $(el).attr("alt")?.trim() ?? "";
1464
+ const words = alt.split(/\s+/).filter((w) => w.length > 0);
1465
+ const isMeaningful = words.length > 1 && alt.length < 200 && !GENERIC_ALT_VALUES.has(alt.toLowerCase());
1466
+ if (isMeaningful) imagesWithAlt++;
1384
1467
  });
1385
1468
  const pageDomain = getDomain(url);
1386
1469
  const externalLinks = [];
@@ -1440,7 +1523,7 @@ var import_zod = require("zod");
1440
1523
 
1441
1524
  // src/modules/analyzer/constants.ts
1442
1525
  var DOMAIN_SIGNAL_TIMEOUT_CAP = 5e3;
1443
- var VERSION = true ? "1.2.7" : "0.0.0";
1526
+ var VERSION = true ? "1.3.0" : "0.0.0";
1444
1527
 
1445
1528
  // src/modules/fetcher/schema.ts
1446
1529
  var FetchOptionsSchema = import_zod.z.object({
@@ -1520,9 +1603,19 @@ var RECOMMENDATION_BUILDERS = {
1520
1603
  }
1521
1604
  return `Your robots.txt is blocking ${blocked}. Blocking these crawlers means your content cannot be discovered or cited by AI engines.`;
1522
1605
  },
1523
- "LLMs.txt Presence": static_(
1524
- "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site, helping them understand and reference your content more effectively."
1525
- ),
1606
+ "LLMs.txt Presence": (rawData) => {
1607
+ const llms = rawData.llmsTxt;
1608
+ if (!llms) {
1609
+ return "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site.";
1610
+ }
1611
+ if (llms.llmsTxtExists && !llms.llmsFullTxtExists) {
1612
+ return "You have llms.txt but are missing llms-full.txt. Adding llms-full.txt provides AI systems with a comprehensive version of your site documentation for deeper ingestion.";
1613
+ }
1614
+ if (!llms.llmsTxtExists && llms.llmsFullTxtExists) {
1615
+ return "You have llms-full.txt but are missing llms.txt. Adding llms.txt provides AI systems with a concise structured overview of your site's purpose and key pages.";
1616
+ }
1617
+ return "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site.";
1618
+ },
1526
1619
  "Image Accessibility": (rawData) => {
1527
1620
  const images = rawData.imageAccessibility;
1528
1621
  if (!images || images.imageCount === 0) {
package/dist/index.mjs CHANGED
@@ -326,7 +326,16 @@ function extractEntities(text) {
326
326
  0,
327
327
  15
328
328
  );
329
- return { people, organizations, places, topics };
329
+ const imperativeVerbCount = doc.verbs().isImperative().length;
330
+ const numberCount = doc.numbers().length;
331
+ return {
332
+ people,
333
+ organizations,
334
+ places,
335
+ topics,
336
+ imperativeVerbCount,
337
+ numberCount
338
+ };
330
339
  }
331
340
  function computeFleschReadingEase(text) {
332
341
  const words = text.split(/\s+/).filter((w) => w.length > 0);
@@ -395,6 +404,7 @@ function auditAnswerability(page) {
395
404
  const text = page.cleanText;
396
405
  const $ = page.$;
397
406
  const factors = [];
407
+ const { imperativeVerbCount = 0 } = extractEntities(text);
398
408
  const defCount = countPatternMatches(text, DEFINITION_PATTERNS);
399
409
  const defScore = thresholdScore(defCount, [
400
410
  [6, 10],
@@ -439,7 +449,7 @@ function auditAnswerability(page) {
439
449
  );
440
450
  const stepCount = countPatternMatches(text, STEP_PATTERNS);
441
451
  const hasOl = $("ol").length > 0;
442
- const stepTotal = stepCount + (hasOl ? 2 : 0);
452
+ const stepTotal = stepCount + imperativeVerbCount + (hasOl ? 2 : 0);
443
453
  const stepScore = thresholdScore(stepTotal, [
444
454
  [5, 10],
445
455
  [2, 7],
@@ -451,7 +461,7 @@ function auditAnswerability(page) {
451
461
  "Step-by-Step Content",
452
462
  stepScore,
453
463
  10,
454
- `${stepCount} step indicators${hasOl ? ", ordered lists found" : ""}`
464
+ `${stepCount} step indicators, ${imperativeVerbCount} instruction verbs${hasOl ? ", ordered lists found" : ""}`
455
465
  )
456
466
  );
457
467
  const questionMatches = text.match(/[^.!?]*\?/g) || [];
@@ -750,51 +760,99 @@ function auditAuthorityContext(page) {
750
760
  }
751
761
 
752
762
  // src/modules/audits/support/robots.ts
763
+ function parseRobotGroups(robotsTxt) {
764
+ const groups = [];
765
+ let current = null;
766
+ for (const raw of robotsTxt.split("\n")) {
767
+ const line = raw.split("#")[0].trim();
768
+ if (!line) {
769
+ current = null;
770
+ continue;
771
+ }
772
+ const colonAt = line.indexOf(":");
773
+ if (colonAt === -1) continue;
774
+ const field = line.slice(0, colonAt).trim().toLowerCase();
775
+ const value = line.slice(colonAt + 1).trim();
776
+ if (field === "user-agent") {
777
+ if (!current) {
778
+ current = { agents: [], rules: [] };
779
+ groups.push(current);
780
+ }
781
+ current.agents.push(value.toLowerCase());
782
+ } else if (field === "disallow" || field === "allow") {
783
+ if (current) {
784
+ current.rules.push({ type: field, path: value });
785
+ }
786
+ }
787
+ }
788
+ return groups;
789
+ }
790
+ function matchingRulesForCrawler(groups, crawlerLower) {
791
+ const specific = [];
792
+ const wildcard = [];
793
+ for (const group of groups) {
794
+ if (group.agents.includes(crawlerLower)) specific.push(...group.rules);
795
+ else if (group.agents.includes("*")) wildcard.push(...group.rules);
796
+ }
797
+ return { specific, wildcard };
798
+ }
799
+ function resolvesPathAsBlocked(rules, path) {
800
+ let bestMatchLength = -1;
801
+ let bestMatchIsDisallow = false;
802
+ for (const rule of rules) {
803
+ const rulePath = rule.path;
804
+ if (!rulePath || !path.startsWith(rulePath)) continue;
805
+ if (rulePath.length > bestMatchLength) {
806
+ bestMatchLength = rulePath.length;
807
+ bestMatchIsDisallow = rule.type === "disallow";
808
+ } else if (rulePath.length === bestMatchLength && rule.type === "allow") {
809
+ bestMatchIsDisallow = false;
810
+ }
811
+ }
812
+ return bestMatchLength >= 0 && bestMatchIsDisallow;
813
+ }
814
+ function findPartialBlocks(rules) {
815
+ return rules.filter((r) => r.type === "disallow" && r.path && r.path !== "/").map((r) => r.path);
816
+ }
753
817
  function checkCrawlerAccess(robotsTxt) {
754
818
  if (!robotsTxt)
755
819
  return { allowed: [], blocked: [], unknown: [...AI_CRAWLERS] };
756
- const lines = robotsTxt.split("\n").map((l) => l.trim());
820
+ const groups = parseRobotGroups(robotsTxt);
757
821
  const allowed = [];
758
822
  const blocked = [];
759
823
  const unknown = [];
824
+ const partiallyBlocked = [];
760
825
  for (const crawler of AI_CRAWLERS) {
761
826
  const crawlerLower = crawler.toLowerCase();
762
- let currentAgent = "";
763
- let isBlocked = false;
764
- let found = false;
765
- for (const line of lines) {
766
- const lower = line.toLowerCase();
767
- if (lower.startsWith("user-agent:")) {
768
- currentAgent = lower.split(":")[1]?.trim() || "";
769
- } else if (currentAgent === crawlerLower || currentAgent === "*") {
770
- if (lower.startsWith("disallow:")) {
771
- const path = lower.split(":")[1]?.trim();
772
- if (path === "/") {
773
- if (currentAgent === crawlerLower) {
774
- isBlocked = true;
775
- found = true;
776
- } else if (currentAgent === "*" && !found) {
777
- isBlocked = true;
778
- }
779
- }
780
- } else if (lower.startsWith("allow:")) {
781
- if (currentAgent === crawlerLower) {
782
- found = true;
783
- isBlocked = false;
784
- }
785
- }
786
- }
827
+ const { specific, wildcard } = matchingRulesForCrawler(
828
+ groups,
829
+ crawlerLower
830
+ );
831
+ const applicableRules = specific.length > 0 ? specific : wildcard;
832
+ if (applicableRules.length === 0) {
833
+ unknown.push(crawler);
834
+ continue;
787
835
  }
788
- if (found) {
789
- if (isBlocked) blocked.push(crawler);
790
- else allowed.push(crawler);
791
- } else if (isBlocked) {
836
+ const isSiteBlocked = resolvesPathAsBlocked(applicableRules, "/");
837
+ if (isSiteBlocked) {
792
838
  blocked.push(crawler);
793
839
  } else {
794
- unknown.push(crawler);
840
+ allowed.push(crawler);
841
+ const pathBlocks = findPartialBlocks(applicableRules);
842
+ for (const path of pathBlocks) {
843
+ const entry = `${crawler}: ${path}`;
844
+ if (!partiallyBlocked.includes(entry)) {
845
+ partiallyBlocked.push(entry);
846
+ }
847
+ }
795
848
  }
796
849
  }
797
- return { allowed, blocked, unknown };
850
+ return {
851
+ allowed,
852
+ blocked,
853
+ unknown,
854
+ ...partiallyBlocked.length > 0 && { partiallyBlocked }
855
+ };
798
856
  }
799
857
 
800
858
  // src/modules/audits/categories/content-extractability.ts
@@ -811,7 +869,7 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
811
869
  )
812
870
  );
813
871
  const extractRatio = page.stats.rawByteLength > 0 ? page.stats.cleanTextLength / page.stats.rawByteLength : 0;
814
- const extractScore = extractRatio >= 0.05 && extractRatio <= 0.15 ? 12 : extractRatio >= 0.01 ? 8 : extractRatio > 0.15 ? 10 : 2;
872
+ const extractScore = extractRatio >= 0.05 && extractRatio <= 0.15 ? 12 : extractRatio > 0.15 ? 10 : extractRatio >= 0.01 ? 8 : 2;
815
873
  factors.push(
816
874
  makeFactor(
817
875
  "Text Extraction Quality",
@@ -836,7 +894,7 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
836
894
  )
837
895
  );
838
896
  const wc = page.stats.wordCount;
839
- const wcScore = wc >= 300 && wc <= 3e3 ? 12 : wc >= 100 ? 8 : wc > 3e3 ? 10 : 2;
897
+ const wcScore = wc >= 300 && wc <= 3e3 ? 12 : wc > 3e3 ? 10 : wc >= 100 ? 8 : 2;
840
898
  factors.push(makeFactor("Word Count Adequacy", wcScore, 12, `${wc} words`));
841
899
  if (domainSignals) {
842
900
  const access2 = checkCrawlerAccess(domainSignals.robotsTxt);
@@ -851,6 +909,10 @@ function auditContentExtractability(page, fetchResult, domainSignals) {
851
909
  )
852
910
  );
853
911
  rawData.crawlerAccess = access2;
912
+ rawData.llmsTxt = {
913
+ llmsTxtExists: domainSignals.llmsTxtExists,
914
+ llmsFullTxtExists: domainSignals.llmsFullTxtExists
915
+ };
854
916
  const hasLlms = domainSignals.llmsTxtExists;
855
917
  const hasLlmsFull = domainSignals.llmsFullTxtExists;
856
918
  const llmsScore = hasLlms && hasLlmsFull ? 6 : hasLlms || hasLlmsFull ? 4 : 0;
@@ -1070,6 +1132,7 @@ function auditGroundingSignals(page) {
1070
1132
  const $ = page.$;
1071
1133
  const text = page.cleanText;
1072
1134
  const factors = [];
1135
+ const { numberCount = 0 } = extractEntities(text);
1073
1136
  const externalLinks = page.externalLinks;
1074
1137
  const extScore = thresholdScore(externalLinks.length, [
1075
1138
  [6, 13],
@@ -1103,7 +1166,8 @@ function auditGroundingSignals(page) {
1103
1166
  )
1104
1167
  );
1105
1168
  const numericCount = countPatternMatches(text, NUMERIC_CLAIM_PATTERNS);
1106
- const numScore = thresholdScore(numericCount, [
1169
+ const totalNumericSignals = numericCount + numberCount;
1170
+ const numScore = thresholdScore(totalNumericSignals, [
1107
1171
  [9, 13],
1108
1172
  [4, 9],
1109
1173
  [1, 5],
@@ -1114,7 +1178,7 @@ function auditGroundingSignals(page) {
1114
1178
  "Numeric Claims",
1115
1179
  numScore,
1116
1180
  13,
1117
- `${numericCount} statistical references`
1181
+ `${numericCount} statistical references, ${numberCount} numeric values`
1118
1182
  )
1119
1183
  );
1120
1184
  const attrCount = countPatternMatches(text, ATTRIBUTION_PATTERNS);
@@ -1319,7 +1383,11 @@ function removeBoilerplate($) {
1319
1383
  function normalizeWhitespace(text) {
1320
1384
  return text.replace(/\s+/g, " ").trim();
1321
1385
  }
1386
+ var BLOCK_ELEMENTS = "p,div,td,th,li,h1,h2,h3,h4,h5,h6,dt,dd,br,blockquote,section,article";
1322
1387
  function extractCleanText($) {
1388
+ $(BLOCK_ELEMENTS).each((_, el) => {
1389
+ $(el).append(" ");
1390
+ });
1323
1391
  return normalizeWhitespace($("body").text());
1324
1392
  }
1325
1393
 
@@ -1328,7 +1396,9 @@ function extractPage(html, url) {
1328
1396
  const $ = cheerio.load(html);
1329
1397
  const title = $("title").text().trim() || $('meta[property="og:title"]').attr("content")?.trim() || "";
1330
1398
  const metaDescription = $('meta[name="description"]').attr("content")?.trim() || $('meta[property="og:description"]').attr("content")?.trim() || "";
1331
- const rawText = $("body").text().replace(/\s+/g, " ").trim();
1399
+ const $raw = cheerio.load(html);
1400
+ $raw("script, style, noscript").remove();
1401
+ const rawText = $raw("body").text().replace(/\s+/g, " ").trim();
1332
1402
  const rawByteLength = Buffer.byteLength(html, "utf-8");
1333
1403
  const h1Count = $("h1").length;
1334
1404
  const h2Count = $("h2").length;
@@ -1340,9 +1410,22 @@ function extractPage(html, url) {
1340
1410
  const listItemCount = $("li").length;
1341
1411
  const tableCount = $("table").length;
1342
1412
  const paragraphCount = $("p").length;
1413
+ const GENERIC_ALT_VALUES = /* @__PURE__ */ new Set([
1414
+ "image",
1415
+ "photo",
1416
+ "logo",
1417
+ "icon",
1418
+ "picture",
1419
+ "img",
1420
+ "graphic",
1421
+ "thumbnail"
1422
+ ]);
1343
1423
  let imagesWithAlt = 0;
1344
1424
  $("img").each((_, el) => {
1345
- if ($(el).attr("alt")) imagesWithAlt++;
1425
+ const alt = $(el).attr("alt")?.trim() ?? "";
1426
+ const words = alt.split(/\s+/).filter((w) => w.length > 0);
1427
+ const isMeaningful = words.length > 1 && alt.length < 200 && !GENERIC_ALT_VALUES.has(alt.toLowerCase());
1428
+ if (isMeaningful) imagesWithAlt++;
1346
1429
  });
1347
1430
  const pageDomain = getDomain(url);
1348
1431
  const externalLinks = [];
@@ -1402,7 +1485,7 @@ import { z } from "zod";
1402
1485
 
1403
1486
  // src/modules/analyzer/constants.ts
1404
1487
  var DOMAIN_SIGNAL_TIMEOUT_CAP = 5e3;
1405
- var VERSION = true ? "1.2.7" : "0.0.0";
1488
+ var VERSION = true ? "1.3.0" : "0.0.0";
1406
1489
 
1407
1490
  // src/modules/fetcher/schema.ts
1408
1491
  var FetchOptionsSchema = z.object({
@@ -1482,9 +1565,19 @@ var RECOMMENDATION_BUILDERS = {
1482
1565
  }
1483
1566
  return `Your robots.txt is blocking ${blocked}. Blocking these crawlers means your content cannot be discovered or cited by AI engines.`;
1484
1567
  },
1485
- "LLMs.txt Presence": static_(
1486
- "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site, helping them understand and reference your content more effectively."
1487
- ),
1568
+ "LLMs.txt Presence": (rawData) => {
1569
+ const llms = rawData.llmsTxt;
1570
+ if (!llms) {
1571
+ return "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site.";
1572
+ }
1573
+ if (llms.llmsTxtExists && !llms.llmsFullTxtExists) {
1574
+ return "You have llms.txt but are missing llms-full.txt. Adding llms-full.txt provides AI systems with a comprehensive version of your site documentation for deeper ingestion.";
1575
+ }
1576
+ if (!llms.llmsTxtExists && llms.llmsFullTxtExists) {
1577
+ return "You have llms-full.txt but are missing llms.txt. Adding llms.txt provides AI systems with a concise structured overview of your site's purpose and key pages.";
1578
+ }
1579
+ return "Consider adding llms.txt and llms-full.txt files at your domain root. This emerging standard provides AI systems with a structured overview of your site.";
1580
+ },
1488
1581
  "Image Accessibility": (rawData) => {
1489
1582
  const images = rawData.imageAccessibility;
1490
1583
  if (!images || images.imageCount === 0) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "aiseo-audit",
3
- "version": "1.2.7",
3
+ "version": "1.3.0",
4
4
  "description": "Lighthouse for AI SEO. Audit any webpage for AI search readiness. 7 categories, 30+ factors, research-backed scoring. Deterministic, engine-agnostic, zero API keys.",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",