aiseo-audit 1.4.2 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -21
- package/dist/cli.js +472 -28
- package/dist/cli.mjs +472 -28
- package/dist/index.d.mts +8 -1
- package/dist/index.d.ts +8 -1
- package/dist/index.js +474 -28
- package/dist/index.mjs +473 -28
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
[](https://opensource.org/licenses/MIT)
|
|
6
6
|
[](https://nodejs.org)
|
|
7
7
|
[](https://www.typescriptlang.org/)
|
|
8
|
-
[](https://github.com/agencyenterprise/aiseo-audit)
|
|
9
9
|
[](https://codecov.io/gh/agencyenterprise/aiseo-audit)
|
|
10
10
|
|
|
11
11
|
<div align="center">
|
|
@@ -15,12 +15,10 @@
|
|
|
15
15
|
|
|
16
16
|
Deterministic CLI that audits web pages for **AI search readiness**. Think Lighthouse, but for how well AI engines can fetch, extract, understand, and cite your content.
|
|
17
17
|
|
|
18
|
-
> [!TIP]
|
|
19
|
-
> Run `aiseo-audit https://www.aiseo-audit.com` to see a 100/100 [A Score](https://www.aiseo-audit.com/).
|
|
20
|
-
|
|
21
18
|
**AI SEO measures how reusable your content is for generative engines, not traditional search rankings.**
|
|
22
19
|
|
|
23
20
|
- [Quick Start](#quick-start)
|
|
21
|
+
- [CI/CD](#cicd)
|
|
24
22
|
- [CLI Options](#cli-options)
|
|
25
23
|
- [Site-Wide Auditing](#site-wide-auditing)
|
|
26
24
|
- [Local Development](#local-development)
|
|
@@ -51,6 +49,11 @@ aiseo-audit goes deeper:
|
|
|
51
49
|
|
|
52
50
|
## Quick Start
|
|
53
51
|
|
|
52
|
+
```bash
|
|
53
|
+
# Try it instantly — no install required
|
|
54
|
+
npx aiseo-audit https://yoursite.com
|
|
55
|
+
```
|
|
56
|
+
|
|
54
57
|
```bash
|
|
55
58
|
# As a project dependency
|
|
56
59
|
npm install aiseo-audit
|
|
@@ -95,6 +98,25 @@ aiseo-audit https://example.com --user-agent "MyBot/1.0"
|
|
|
95
98
|
aiseo-audit https://example.com --config aiseo.config.json
|
|
96
99
|
```
|
|
97
100
|
|
|
101
|
+
## CI/CD
|
|
102
|
+
|
|
103
|
+
Drop this into any GitHub Actions workflow to gate PRs on AI search readiness:
|
|
104
|
+
|
|
105
|
+
```yaml
|
|
106
|
+
# .github/workflows/aiseo-audit.yml
|
|
107
|
+
name: AI SEO Audit
|
|
108
|
+
on:
|
|
109
|
+
pull_request:
|
|
110
|
+
push:
|
|
111
|
+
branches: [main]
|
|
112
|
+
|
|
113
|
+
jobs:
|
|
114
|
+
audit:
|
|
115
|
+
runs-on: ubuntu-latest
|
|
116
|
+
steps:
|
|
117
|
+
- run: npx aiseo-audit https://yoursite.com --fail-under 70
|
|
118
|
+
```
|
|
119
|
+
|
|
98
120
|
## CLI Options
|
|
99
121
|
|
|
100
122
|
| Option | Description | Default |
|
|
@@ -155,23 +177,6 @@ aiseo-audit --sitemap https://example.com/projects/sitemap.xml --signals-base ht
|
|
|
155
177
|
|
|
156
178
|
Every report format explicitly shows which URL domain signals were fetched from, so there is no guesswork about where `robots.txt`, `llms.txt`, and `llms-full.txt` were checked.
|
|
157
179
|
|
|
158
|
-
## CI/CD
|
|
159
|
-
|
|
160
|
-
```yaml
|
|
161
|
-
# .github/workflows/aiseo-audit.yml
|
|
162
|
-
name: AI SEO Audit
|
|
163
|
-
on:
|
|
164
|
-
pull_request:
|
|
165
|
-
push:
|
|
166
|
-
branches: [main]
|
|
167
|
-
|
|
168
|
-
jobs:
|
|
169
|
-
audit:
|
|
170
|
-
runs-on: ubuntu-latest
|
|
171
|
-
steps:
|
|
172
|
-
- run: npx aiseo-audit https://yoursite.com --fail-under 70
|
|
173
|
-
```
|
|
174
|
-
|
|
175
180
|
## User Agent
|
|
176
181
|
|
|
177
182
|
By default, all HTTP requests (page fetch, `robots.txt`, `llms.txt`) are sent with the header `User-Agent: AISEOAudit/<version>`. This is intentional. If a site blocks unknown bots, that is a meaningful negative signal for AI search readiness, and the audit should surface it as a failing "Fetch Success" score.
|
package/dist/cli.js
CHANGED
|
@@ -28,7 +28,7 @@ var import_zod3 = require("zod");
|
|
|
28
28
|
|
|
29
29
|
// src/modules/analyzer/constants.ts
|
|
30
30
|
var DOMAIN_SIGNAL_TIMEOUT_CAP = 5e3;
|
|
31
|
-
var VERSION = true ? "1.4.
|
|
31
|
+
var VERSION = true ? "1.4.4" : "0.0.0";
|
|
32
32
|
|
|
33
33
|
// src/modules/fetcher/constants.ts
|
|
34
34
|
var MAX_RESPONSE_SIZE = 10 * 1024 * 1024;
|
|
@@ -40,6 +40,56 @@ var DEFAULT_HEADERS = {
|
|
|
40
40
|
};
|
|
41
41
|
|
|
42
42
|
// src/utils/http.ts
|
|
43
|
+
/**
 * Error raised by the HTTP helpers when a request cannot be completed.
 *
 * `code` is a short machine-readable category. The values produced in this
 * file are "TIMEOUT", "DNS_FAILURE", "CONNECTION_REFUSED", "TLS_ERROR",
 * "NETWORK_ERROR" and "TOO_LARGE". `url` is the URL that was being requested.
 */
var FetchError = class extends Error {
  code;
  url;
  /**
   * @param {string} code - Machine-readable failure category.
   * @param {string} url - URL of the failed request.
   * @param {string} message - Human-readable explanation.
   * @param {{ cause?: unknown }} [options] - Optional standard Error options;
   *   pass `cause` to keep the underlying error reachable.
   */
  constructor(code, url, message, options) {
    super(message, options);
    this.name = "FetchError";
    this.code = code;
    this.url = url;
  }
};

/**
 * Returns the hostname of `url` for use in an error message.
 * Falls back to the raw value when `url` cannot be parsed, so that building
 * an error message can never throw and hide the failure being reported.
 */
function hostnameForFetchError(url) {
  try {
    return new URL(url).hostname;
  } catch {
    return String(url);
  }
}

/**
 * Converts anything thrown while fetching `url` into a FetchError.
 *
 * Classification is based on the error name, the error message, the message
 * of `err.cause`, and any string `code` found on either of them. The thrown
 * value is preserved as the `cause` of the returned error.
 *
 * @param {unknown} err - The value caught from the failed request.
 * @param {string} url - URL of the failed request.
 * @returns {FetchError} `err` itself when it already is a FetchError,
 *   otherwise a new FetchError describing the failure.
 */
function classifyFetchError(err, url) {
  if (err instanceof FetchError) return err;

  const isError = err instanceof Error;
  const msg = isError ? err.message : String(err);
  const cause = isError && err.cause instanceof Error ? err.cause : void 0;

  // System error codes (e.g. "ECONNREFUSED") can be the only signal when the
  // cause carries an empty message, so they are searched alongside the text.
  const codes = [isError ? err.code : void 0, cause?.code].filter(
    (value) => typeof value === "string"
  );
  const combined = [msg, cause ? cause.message : "", ...codes].join(" ").toLowerCase();

  // Only abort/timeout signals count as a timeout; other DOMExceptions fall
  // through to the remaining categories instead of being mislabeled.
  const names = [isError ? err.name : void 0, cause?.name];
  const timedOut =
    names.some((name) => name === "AbortError" || name === "TimeoutError") ||
    combined.includes("abort");

  const hostname = hostnameForFetchError(url);
  const wrap = (code, message) => new FetchError(code, url, message, { cause: err });

  if (timedOut) {
    return wrap(
      "TIMEOUT",
      `Request timed out. The server at "${hostname}" did not respond in time.`
    );
  }
  if (combined.includes("getaddrinfo") || combined.includes("enotfound")) {
    return wrap(
      "DNS_FAILURE",
      `DNS lookup failed for "${hostname}". Check that the domain exists and is spelled correctly.`
    );
  }
  if (combined.includes("econnrefused")) {
    return wrap(
      "CONNECTION_REFUSED",
      `Connection refused by "${hostname}". The server may be down or not accepting connections.`
    );
  }
  if (
    combined.includes("cert") ||
    combined.includes("ssl") ||
    combined.includes("tls") ||
    combined.includes("unable to verify")
  ) {
    return wrap(
      "TLS_ERROR",
      `TLS/SSL error connecting to "${hostname}". The site may have an invalid or expired certificate.`
    );
  }
  return wrap("NETWORK_ERROR", `Network error fetching "${url}": ${msg}`);
}
|
|
43
93
|
async function httpGet(options) {
|
|
44
94
|
const controller = new AbortController();
|
|
45
95
|
const timer = setTimeout(() => controller.abort(), options.timeout);
|
|
@@ -55,14 +105,18 @@ async function httpGet(options) {
|
|
|
55
105
|
});
|
|
56
106
|
const contentLength = response.headers.get("content-length");
|
|
57
107
|
if (contentLength && parseInt(contentLength, 10) > MAX_RESPONSE_SIZE) {
|
|
58
|
-
throw new
|
|
59
|
-
|
|
108
|
+
throw new FetchError(
|
|
109
|
+
"TOO_LARGE",
|
|
110
|
+
options.url,
|
|
111
|
+
`Response from "${new URL(options.url).hostname}" exceeds the ${Math.round(MAX_RESPONSE_SIZE / 1024 / 1024)}MB size limit.`
|
|
60
112
|
);
|
|
61
113
|
}
|
|
62
114
|
const data = await response.text();
|
|
63
115
|
if (data.length > MAX_RESPONSE_SIZE) {
|
|
64
|
-
throw new
|
|
65
|
-
|
|
116
|
+
throw new FetchError(
|
|
117
|
+
"TOO_LARGE",
|
|
118
|
+
options.url,
|
|
119
|
+
`Response from "${new URL(options.url).hostname}" exceeds the ${Math.round(MAX_RESPONSE_SIZE / 1024 / 1024)}MB size limit.`
|
|
66
120
|
);
|
|
67
121
|
}
|
|
68
122
|
const headers = {};
|
|
@@ -75,6 +129,8 @@ async function httpGet(options) {
|
|
|
75
129
|
headers,
|
|
76
130
|
finalUrl: response.url
|
|
77
131
|
};
|
|
132
|
+
} catch (err) {
|
|
133
|
+
throw classifyFetchError(err, options.url);
|
|
78
134
|
} finally {
|
|
79
135
|
clearTimeout(timer);
|
|
80
136
|
}
|
|
@@ -102,6 +158,8 @@ async function httpHead(options) {
|
|
|
102
158
|
headers,
|
|
103
159
|
finalUrl: response.url
|
|
104
160
|
};
|
|
161
|
+
} catch (err) {
|
|
162
|
+
throw classifyFetchError(err, options.url);
|
|
105
163
|
} finally {
|
|
106
164
|
clearTimeout(timer);
|
|
107
165
|
}
|
|
@@ -350,23 +408,407 @@ function countSyllables(word) {
|
|
|
350
408
|
}
|
|
351
409
|
|
|
352
410
|
// src/modules/audits/support/nlp.ts
|
|
411
|
+
/**
 * Lower-case words that are ignored when extracting topics from page text.
 * Entries are grouped by rough category for readability only; membership is
 * what matters. (The original list named "also" twice; it appears once here.)
 */
var STOPWORDS = /* @__PURE__ */ new Set([
  // Articles, conjunctions, and basic prepositions
  "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of",
  "with", "by", "from", "as",
  // Forms of be/have/do and modal verbs
  "is", "was", "are", "were", "been", "be", "have", "has", "had", "do", "does",
  "did", "will", "would", "could", "should", "may", "might", "shall", "can",
  "need", "must",
  // Relatives, demonstratives, and pronouns
  "that", "which", "who", "whom", "this", "these", "those", "it", "its", "he",
  "she", "they", "we", "you", "i", "me", "him", "her", "us", "them", "my",
  "your", "his", "our", "their",
  // Question words and quantifiers
  "what", "when", "where", "how", "why", "all", "each", "every", "both", "few",
  "more", "most", "other", "some", "such",
  // Negation and degree
  "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "just",
  // Adverbs and further prepositions
  "about", "above", "after", "again", "also", "any", "because", "before",
  "between", "during", "here", "if", "into", "like", "new", "now", "over",
  "then", "there", "through", "under", "up", "out", "off", "down", "much",
  "well", "back", "even", "still",
  // Very common verbs, adjectives, and connectors
  "get", "got", "one", "two", "make", "many", "say", "said", "see", "go",
  "come", "take", "know", "think", "good", "great", "first", "last", "long",
  "way", "find", "use", "used", "using", "while", "being", "made", "however",
  "since", "per", "via", "based", "within", "without", "across", "along",
  "around", "among", "until", "another",
  // URL fragments
  "www", "http", "https", "com"
]);
|
|
577
|
+
/**
 * Upper-case tokens that should not be reported as acronym entities.
 * Lookups are case-sensitive. Grouping below is for readability only.
 */
var ACRONYM_STOPLIST = /* @__PURE__ */ new Set([
  // Everyday abbreviations and region codes
  "I", "A", "OK", "AM", "PM", "US", "UK", "EU", "VS", "EG", "IE", "ET", "AL",
  // Web and developer formats
  "HTML", "CSS", "JS", "TS", "URL", "HTTP", "HTTPS", "API", "SDK", "CLI",
  "GUI", "PDF", "CSV", "JSON", "XML", "SQL", "RSS",
  // Networking
  "FTP", "SSH", "SSL", "TLS", "DNS", "TCP", "UDP", "IP",
  // Hardware
  "RAM", "ROM", "CPU", "GPU", "SSD", "HDD", "USB", "HDMI",
  // Informal shorthand
  "FAQ", "DIY", "ASAP", "FYI", "TBD", "TBA", "ETA",
  // Business roles and functions
  "ROI", "KPI", "CEO", "CTO", "CFO", "COO", "CIO", "VP", "SVP", "EVP", "HR",
  "PR", "QA", "IT", "RD", "RND",
  // Company suffixes
  "LLC", "INC", "LTD", "CORP", "PLC",
  // Currencies
  "USD", "EUR", "GBP", "JPY", "CAD",
  // Correspondence and forms
  "ID", "NO", "RE", "CC", "BCC",
  // Marketing and business software
  "GEO", "SEO", "SEM", "PPC", "CMS", "CRM", "ERP", "SaaS",
  // AI terminology
  "AI", "ML", "NLP", "LLM", "GPT", "NER",
  // Miscellaneous
  "TLDR", "AKA", "RSVP", "PS"
]);
|
|
681
|
+
/**
 * Matches, case-insensitively, any whole word commonly found in organization
 * names (e.g. "Inc", "Foundation", "Labs").
 */
var ORG_SUFFIXES = new RegExp(
  `\\b(?:${[
    "Inc", "Corp", "Corporation", "LLC", "Ltd", "Limited", "Co", "Company",
    "Group", "Foundation", "Institute", "University", "Association", "Society",
    "Agency", "Authority", "Bureau", "Commission", "Council", "Department",
    "Board", "Trust", "Fund", "Partners", "Ventures", "Labs", "Technologies",
    "Solutions", "Systems", "Services", "Consulting", "Media", "Network",
    "Studios", "Entertainment", "Healthcare", "Pharmaceuticals", "Dynamics",
    "Holdings", "Capital", "Enterprises", "International"
  ].join("|")})\\b`,
  "i"
);

/**
 * Matches a personal title followed by a literal period and any amount of
 * whitespace (e.g. "Dr. "). Case-sensitive on its own.
 * NOTE(review): the period is mandatory, so spelled-out titles such as
 * "Professor" only match when written with a trailing "." — confirm intended.
 */
var PERSON_HONORIFICS = new RegExp(
  `\\b(?:${[
    "Mr", "Mrs", "Ms", "Miss", "Dr", "Prof", "Professor", "Rev", "Reverend",
    "Sen", "Senator", "Rep", "Representative", "Gov", "Governor", "Pres",
    "President", "Gen", "General", "Col", "Colonel", "Sgt", "Sergeant", "Cpl",
    "Corporal", "Pvt", "Private", "Adm", "Admiral", "Capt", "Captain", "Lt",
    "Lieutenant", "Maj", "Major", "Sir", "Dame", "Lord", "Lady", "Hon",
    "Honorable", "Judge", "Justice", "Chancellor", "Dean", "Provost"
  ].join("|")})\\.\\s*`
);
|
|
683
|
+
/**
 * Collects the distinct all-caps tokens of 2–6 letters found in `text`,
 * skipping anything listed in ACRONYM_STOPLIST.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Unique acronyms in order of first appearance.
 */
function extractAcronymEntities(text) {
  const candidates = text.match(/\b[A-Z]{2,6}\b/g);
  if (candidates === null) return [];

  // A Set keeps insertion order, so it handles both de-duplication and ordering.
  const unique = /* @__PURE__ */ new Set();
  for (const candidate of candidates) {
    if (ACRONYM_STOPLIST.has(candidate)) continue;
    unique.add(candidate);
  }
  return [...unique];
}
|
|
696
|
+
/**
 * Finds runs of capitalized words in `text` that look like named entities.
 *
 * A run is two to five capitalized words, optionally with one lower-case
 * connector ("of", "de", "van", ...) after the first word. A run whose first
 * word also opens some sentence in the text, and which never appears after
 * ". " or ", ", is kept only when it occurs at least twice.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Unique entities in order of first appearance.
 */
function extractTitleCaseEntities(text) {
  const titleCaseRun = /\b([A-Z][a-z]+(?:\s+(?:of|the|and|for|de|van|von|al|el|la|le|del|der|den|das|di|du))?\s+(?:[A-Z][a-z]+)(?:\s+[A-Z][a-z]+){0,3})\b/g;

  // First word of every sentence; these are capitalized for grammatical reasons.
  const sentenceStarts = new Set(
    text
      .split(/[.!?]\s+/)
      .map((sentence) => sentence.trim().split(/\s+/)[0])
      .filter(Boolean)
  );

  // Number of non-overlapping literal occurrences of `phrase` in the text.
  const countOccurrences = (phrase) => text.split(phrase).length - 1;

  const found = /* @__PURE__ */ new Set();
  for (const [, entity] of text.matchAll(titleCaseRun)) {
    const words = entity.split(/\s+/);
    const mayBeSentenceOpener =
      sentenceStarts.has(words[0]) &&
      !text.includes(`. ${entity}`) &&
      !text.includes(`, ${entity}`);
    if (mayBeSentenceOpener && countOccurrences(entity) < 2) continue;
    if (words.length >= 2) found.add(entity);
  }
  return [...found];
}
|
|
723
|
+
/**
 * Reports whether `entity` contains a word from the ORG_SUFFIXES pattern.
 *
 * @param {string} entity - Candidate entity name.
 * @returns {boolean} True when the pattern matches anywhere in the entity.
 */
function isOrganizationByPattern(entity) {
  const hasOrgWord = ORG_SUFFIXES.test(entity);
  return hasOrgWord;
}
|
|
726
|
+
/**
 * Reports whether `entity` appears in `text` directly after a personal title
 * from PERSON_HONORIFICS (for example "Dr. Jane Doe" for entity "Jane Doe").
 * The combined pattern is matched case-insensitively.
 *
 * @param {string} text - Full text to search.
 * @param {string} entity - Candidate entity name, treated as a literal.
 * @returns {boolean} True when a title immediately precedes the entity.
 */
function isPersonByHonorific(text, entity) {
  // Neutralize regex metacharacters so the entity is matched literally.
  const literalEntity = entity.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const titledEntity = new RegExp(
    `(?:${PERSON_HONORIFICS.source})\\s*${literalEntity}`,
    "i"
  );
  const hasTitle = titledEntity.test(text);
  return hasTitle;
}
|
|
734
|
+
/**
 * Picks the most frequent terms in `text` as its topics.
 *
 * The text is lower-cased and split into tokens of ASCII letters, digits,
 * apostrophes and hyphens. Leading and trailing apostrophes/hyphens are
 * stripped so that quoted words count with their plain form and
 * punctuation-only tokens such as "---" are never reported as topics.
 * Tokens of one or two characters and STOPWORDS entries are dropped.
 *
 * Single words and adjacent word pairs are counted; only terms seen at least
 * twice qualify. Pairs are weighted 1.5x so a repeated phrase outranks its
 * individual words. Despite the name, no inverse-document-frequency term is
 * involved: ranking uses in-document frequency only.
 *
 * @param {string} text - Text to analyze.
 * @param {number} limit - Maximum number of topics to return.
 * @returns {string[]} Topics ordered from highest to lowest score; ties keep
 *   first-seen order, with single words ahead of pairs.
 */
function extractTopicsByTfIdf(text, limit) {
  const words = text
    .toLowerCase()
    .replace(/[^a-z0-9\s'-]/g, " ")
    .split(/\s+/)
    // Strip wrapping quotes/dashes before filtering so "'cache'" counts as "cache".
    .map((token) => token.replace(/^['-]+|['-]+$/g, ""))
    .filter((token) => token.length > 2 && !STOPWORDS.has(token));
  if (words.length === 0) return [];

  const freq = /* @__PURE__ */ new Map();
  const bump = (term) => {
    freq.set(term, (freq.get(term) ?? 0) + 1);
  };
  for (const word of words) {
    bump(word);
  }
  // Pairs are formed over the filtered word list, so they skip dropped tokens.
  for (let i = 0; i < words.length - 1; i++) {
    bump(`${words[i]} ${words[i + 1]}`);
  }

  const candidates = [];
  for (const [term, count] of freq) {
    if (count < 2) continue;
    const score = term.includes(" ") ? count * 1.5 : count;
    candidates.push([term, score]);
  }
  candidates.sort((a, b) => b[1] - a[1]);
  return candidates.slice(0, limit).map(([term]) => term);
}
|
|
757
|
+
/**
 * Reports whether `needle` occurs in `haystack` as a whole word or phrase,
 * i.e. with no letter or digit directly before or after the occurrence.
 * An empty needle is considered contained in every haystack.
 */
function containsWholePhrase(haystack, needle) {
  if (needle === "") return true;
  const isWordChar = (ch) => ch !== void 0 && /[\p{L}\p{N}]/u.test(ch);
  let at = haystack.indexOf(needle);
  while (at !== -1) {
    const before = haystack[at - 1];
    const after = haystack[at + needle.length];
    if (!isWordChar(before) && !isWordChar(after)) return true;
    at = haystack.indexOf(needle, at + 1);
  }
  return false;
}

/**
 * Removes redundant entity names, ignoring case.
 *
 * Longer names are considered first. A name is dropped when it equals an
 * already kept name, or when it appears inside a kept name as a whole word or
 * phrase ("New York" inside "New York City"). Mere letter overlap does not
 * count: "Art" is not treated as part of "Smart Tech".
 *
 * @param {string[]} entities - Candidate names; the array is not modified.
 * @returns {string[]} Kept names, longest first, in their original casing.
 */
function smartDedup(entities) {
  if (entities.length === 0) return [];
  const longestFirst = [...entities].sort((a, b) => b.length - a.length);
  const kept = [];
  const keptLower = /* @__PURE__ */ new Set();
  for (const entity of longestFirst) {
    const lower = entity.toLowerCase();
    if (keptLower.has(lower)) continue;
    let coveredByLonger = false;
    for (const accepted of keptLower) {
      if (containsWholePhrase(accepted, lower)) {
        coveredByLonger = true;
        break;
      }
    }
    if (coveredByLonger) continue;
    kept.push(entity);
    keptLower.add(lower);
  }
  return kept;
}
|
|
778
|
+
/**
 * Combines two entity lists, removes redundant names via smartDedup, and
 * caps the result at `limit` entries.
 *
 * @param {string[]} compromiseList - Entities placed first in the combined list.
 * @param {string[]} supplementalList - Entities appended after them.
 * @param {number} limit - Maximum number of entities to return.
 * @returns {string[]} At most `limit` de-duplicated entities.
 */
function mergeEntityLists(compromiseList, supplementalList, limit) {
  const deduped = smartDedup([...compromiseList, ...supplementalList]);
  return deduped.slice(0, limit);
}
|
|
353
782
|
function extractEntities(text) {
|
|
354
783
|
const doc = (0, import_compromise.default)(text);
|
|
355
|
-
const
|
|
356
|
-
|
|
357
|
-
10
|
|
358
|
-
);
|
|
359
|
-
const organizations = [
|
|
784
|
+
const compromisePeople = [...new Set(doc.people().out("array"))];
|
|
785
|
+
const compromiseOrgs = [
|
|
360
786
|
...new Set(doc.organizations().out("array"))
|
|
361
|
-
]
|
|
362
|
-
const
|
|
363
|
-
|
|
787
|
+
];
|
|
788
|
+
const compromisePlaces = [...new Set(doc.places().out("array"))];
|
|
789
|
+
const acronyms = extractAcronymEntities(text);
|
|
790
|
+
const titleCaseEntities = extractTitleCaseEntities(text);
|
|
791
|
+
const supplementalPeople = [];
|
|
792
|
+
const supplementalOrgs = [];
|
|
793
|
+
const unclassified = [];
|
|
794
|
+
for (const entity of titleCaseEntities) {
|
|
795
|
+
if (isPersonByHonorific(text, entity)) {
|
|
796
|
+
supplementalPeople.push(entity);
|
|
797
|
+
} else if (isOrganizationByPattern(entity)) {
|
|
798
|
+
supplementalOrgs.push(entity);
|
|
799
|
+
} else {
|
|
800
|
+
unclassified.push(entity);
|
|
801
|
+
}
|
|
802
|
+
}
|
|
803
|
+
unclassified.push(...acronyms);
|
|
804
|
+
const people = mergeEntityLists(compromisePeople, supplementalPeople, 10);
|
|
805
|
+
const organizations = mergeEntityLists(
|
|
806
|
+
compromiseOrgs,
|
|
807
|
+
[...supplementalOrgs, ...unclassified],
|
|
364
808
|
10
|
|
365
809
|
);
|
|
366
|
-
const
|
|
367
|
-
|
|
368
|
-
15
|
|
369
|
-
);
|
|
810
|
+
const places = smartDedup([...new Set(compromisePlaces)]).slice(0, 10);
|
|
811
|
+
const topics = extractTopicsByTfIdf(text, 15);
|
|
370
812
|
const imperativeVerbCount = doc.verbs().isImperative().length;
|
|
371
813
|
const numberCount = doc.numbers().length;
|
|
372
814
|
return {
|
|
@@ -441,11 +883,11 @@ function maxFactors(factors) {
|
|
|
441
883
|
}
|
|
442
884
|
|
|
443
885
|
// src/modules/audits/categories/answerability.ts
|
|
444
|
-
function auditAnswerability(page) {
|
|
886
|
+
function auditAnswerability(page, preExtracted) {
|
|
445
887
|
const text = page.cleanText;
|
|
446
888
|
const $ = page.$;
|
|
447
889
|
const factors = [];
|
|
448
|
-
const { imperativeVerbCount = 0 } = extractEntities(text);
|
|
890
|
+
const { imperativeVerbCount = 0 } = preExtracted ?? extractEntities(text);
|
|
449
891
|
const defCount = countPatternMatches(text, DEFINITION_PATTERNS);
|
|
450
892
|
const defScore = thresholdScore(defCount, [
|
|
451
893
|
[6, 10],
|
|
@@ -1104,10 +1546,10 @@ function auditContentStructure(page) {
|
|
|
1104
1546
|
}
|
|
1105
1547
|
|
|
1106
1548
|
// src/modules/audits/categories/entity-clarity.ts
|
|
1107
|
-
function auditEntityClarity(page) {
|
|
1549
|
+
function auditEntityClarity(page, preExtracted) {
|
|
1108
1550
|
const text = page.cleanText;
|
|
1109
1551
|
const factors = [];
|
|
1110
|
-
const entities = extractEntities(text);
|
|
1552
|
+
const entities = preExtracted ?? extractEntities(text);
|
|
1111
1553
|
const totalEntities = entities.people.length + entities.organizations.length + entities.places.length + entities.topics.length;
|
|
1112
1554
|
const richScore = thresholdScore(totalEntities, [
|
|
1113
1555
|
[9, 20],
|
|
@@ -1136,13 +1578,14 @@ function auditEntityClarity(page) {
|
|
|
1136
1578
|
}
|
|
1137
1579
|
}
|
|
1138
1580
|
const consistencyRatio = keyWords.length > 0 ? topicOverlap / keyWords.length : 0;
|
|
1139
|
-
const consistencyScore = consistencyRatio >= 0.5 ? 25 : consistencyRatio > 0 ? 15 :
|
|
1581
|
+
const consistencyScore = keyWords.length === 0 ? 0 : consistencyRatio >= 0.5 ? 25 : consistencyRatio > 0 ? 15 : 0;
|
|
1140
1582
|
factors.push(
|
|
1141
1583
|
makeFactor(
|
|
1142
1584
|
"Topic Consistency",
|
|
1143
1585
|
consistencyScore,
|
|
1144
1586
|
25,
|
|
1145
|
-
`${topicOverlap}/${keyWords.length} title keywords align with content topics
|
|
1587
|
+
`${topicOverlap}/${keyWords.length} title keywords align with content topics`,
|
|
1588
|
+
keyWords.length === 0 ? "neutral" : void 0
|
|
1146
1589
|
)
|
|
1147
1590
|
);
|
|
1148
1591
|
const wordCount = countWords(text);
|
|
@@ -1171,11 +1614,11 @@ function auditEntityClarity(page) {
|
|
|
1171
1614
|
}
|
|
1172
1615
|
|
|
1173
1616
|
// src/modules/audits/categories/grounding-signals.ts
|
|
1174
|
-
function auditGroundingSignals(page) {
|
|
1617
|
+
function auditGroundingSignals(page, preExtracted) {
|
|
1175
1618
|
const $ = page.$;
|
|
1176
1619
|
const text = page.cleanText;
|
|
1177
1620
|
const factors = [];
|
|
1178
|
-
const { numberCount = 0 } = extractEntities(text);
|
|
1621
|
+
const { numberCount = 0 } = preExtracted ?? extractEntities(text);
|
|
1179
1622
|
const externalLinks = page.externalLinks;
|
|
1180
1623
|
const extScore = thresholdScore(externalLinks.length, [
|
|
1181
1624
|
[6, 13],
|
|
@@ -1345,15 +1788,16 @@ function auditReadabilityForCompression(page) {
|
|
|
1345
1788
|
|
|
1346
1789
|
// src/modules/audits/service.ts
|
|
1347
1790
|
function runAudits(page, fetchResult, domainSignals) {
|
|
1791
|
+
const entities = extractEntities(page.cleanText);
|
|
1348
1792
|
const extractability = auditContentExtractability(
|
|
1349
1793
|
page,
|
|
1350
1794
|
fetchResult,
|
|
1351
1795
|
domainSignals
|
|
1352
1796
|
);
|
|
1353
1797
|
const structure = auditContentStructure(page);
|
|
1354
|
-
const answerability = auditAnswerability(page);
|
|
1355
|
-
const entityClarity = auditEntityClarity(page);
|
|
1356
|
-
const groundingSignals = auditGroundingSignals(page);
|
|
1798
|
+
const answerability = auditAnswerability(page, entities);
|
|
1799
|
+
const entityClarity = auditEntityClarity(page, entities);
|
|
1800
|
+
const groundingSignals = auditGroundingSignals(page, entities);
|
|
1357
1801
|
const authorityContext = auditAuthorityContext(page);
|
|
1358
1802
|
const readability = auditReadabilityForCompression(page);
|
|
1359
1803
|
return {
|