@pseolint/core 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +81 -13
- package/dist/ai/triage.d.ts +6 -0
- package/dist/ai/triage.d.ts.map +1 -1
- package/dist/ai/triage.js +26 -13
- package/dist/ai/triage.js.map +1 -1
- package/dist/ai/types.d.ts +2 -1
- package/dist/ai/types.d.ts.map +1 -1
- package/dist/auditor.d.ts.map +1 -1
- package/dist/auditor.js +114 -24
- package/dist/auditor.js.map +1 -1
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +38 -2
- package/dist/cache.js.map +1 -1
- package/dist/rules/tech/robots-sitemap-presence.d.ts +16 -0
- package/dist/rules/tech/robots-sitemap-presence.d.ts.map +1 -1
- package/dist/rules/tech/robots-sitemap-presence.js +26 -2
- package/dist/rules/tech/robots-sitemap-presence.js.map +1 -1
- package/dist/telemetry/aggregator.d.ts +19 -0
- package/dist/telemetry/aggregator.d.ts.map +1 -1
- package/dist/telemetry/aggregator.js +50 -6
- package/dist/telemetry/aggregator.js.map +1 -1
- package/dist/telemetry/types.d.ts +6 -0
- package/dist/telemetry/types.d.ts.map +1 -1
- package/dist/telemetry/types.js +5 -0
- package/dist/telemetry/types.js.map +1 -1
- package/dist/types.d.ts +16 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +91 -66
package/README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# @pseolint/core
|
|
2
2
|
|
|
3
|
-
> Programmatic SEO audit engine
|
|
3
|
+
> Programmatic SEO audit engine for SpamBrain-risk detection across large template-generated sites.
|
|
4
4
|
|
|
5
|
-
The core engine behind [pseolint](https://www.npmjs.com/package/pseolint). Use this package to
|
|
5
|
+
The core engine behind [pseolint](https://www.npmjs.com/package/pseolint). Use this package to embed pSEO auditing into your own tools, CI pipelines, or SaaS products.
|
|
6
6
|
|
|
7
7
|
## Install
|
|
8
8
|
|
|
@@ -12,7 +12,7 @@ npm install @pseolint/core
|
|
|
12
12
|
|
|
13
13
|
## Usage
|
|
14
14
|
|
|
15
|
-
```
|
|
15
|
+
```ts
|
|
16
16
|
import { auditSource } from "@pseolint/core";
|
|
17
17
|
|
|
18
18
|
const summary = await auditSource("./out");
|
|
@@ -20,34 +20,102 @@ console.log(`Score: ${summary.score}/100`);
|
|
|
20
20
|
console.log(`Findings: ${summary.findings.length}`);
|
|
21
21
|
```
|
|
22
22
|
|
|
23
|
+
`auditSource` accepts a local directory, a single HTML file, a page URL, or a sitemap URL.
|
|
24
|
+
|
|
23
25
|
## What It Checks
|
|
24
26
|
|
|
25
|
-
|
|
27
|
+
37+ rules across 7 categories:
|
|
26
28
|
|
|
27
|
-
- **SpamBrain
|
|
28
|
-
- **Content
|
|
29
|
-
- **Internal
|
|
30
|
-
- **Technical SEO** — canonical consistency, sitemap completeness,
|
|
31
|
-
- **Structured
|
|
29
|
+
- **Spam / SpamBrain risk** — near-duplicate (SimHash), entity-swap doorways, thin content, boilerplate ratio, template diversity, template coverage, publication velocity, doorway pattern
|
|
30
|
+
- **Content** — unique value, heading / meta uniqueness, author attribution, E-E-A-T signals
|
|
31
|
+
- **Internal linking** — orphan pages, dead ends, cluster connectivity, hub pages, link depth, unreachable-from-root
|
|
32
|
+
- **Technical SEO** — canonical consistency, canonical/noindex and robots/noindex conflicts, sitemap completeness, robots compliance, redirect chains, soft 404s, Open Graph, hreflang
|
|
33
|
+
- **Structured data** — JSON-LD validity, required fields, cross-page schema consistency
|
|
32
34
|
- **Cannibalization** — title overlap, keyword collision, URL pattern conflicts
|
|
35
|
+
- **Data binding** — verify rendered pages expose values from a source dataset (catches missing or identical-across-pages bindings)
|
|
33
36
|
|
|
34
37
|
## API
|
|
35
38
|
|
|
36
39
|
### `auditSource(source, options?)`
|
|
37
40
|
|
|
38
|
-
|
|
41
|
+
Returns an `AuditSummary` with composite score, category scores, enriched findings, and optional cache / state / AI-triage metadata.
|
|
42
|
+
|
|
43
|
+
Selected options (see `AuditOptions` in `types.ts` for the full surface):
|
|
44
|
+
|
|
45
|
+
```ts
|
|
46
|
+
await auditSource("https://example.com/sitemap.xml", {
|
|
47
|
+
concurrency: 5,
|
|
48
|
+
timeout: 30_000,
|
|
49
|
+
sampleSize: 200,
|
|
50
|
+
samplingStrategy: "stratified", // or "random"
|
|
51
|
+
ignore: ["**/api/**"],
|
|
52
|
+
maxFetchBytes: 52_428_800, // ~50 MB fetch budget per run (checked before each page fetch)
|
|
53
|
+
cache: { dir: ".pseolint/cache", ttlMs: 7 * 24 * 60 * 60 * 1000 },
|
|
54
|
+
state: { path: ".pseolint/state.json", since: true, exitOnRegression: true },
|
|
55
|
+
pageGroups: {
|
|
56
|
+
blog: { match: "**/blog/**", rules: ["content/*", "spam/*"] },
|
|
57
|
+
products: { match: "**/p/**", overrides: { "spam/thin-content": { thinContentMinWords: 200 } } },
|
|
58
|
+
},
|
|
59
|
+
dataSource: { records: [{ url: "/p/*", data: { price: "$19", stock: 12 } }] },
|
|
60
|
+
entityPatterns: [{ placeholder: "[CITY]", pattern: "\\b(NYC|LA|SF)\\b", flags: "gi" }],
|
|
61
|
+
ai: { enabled: true, provider: "anthropic", model: "claude-haiku-4-5-20251001", maxCostUsd: 0.1 },
|
|
62
|
+
telemetry: { enabled: true, path: ".pseolint/telemetry.jsonl" },
|
|
63
|
+
rules: {
|
|
64
|
+
nearDuplicateThreshold: 0.85,
|
|
65
|
+
thinContentMinWords: 300,
|
|
66
|
+
titleOverlapThreshold: 0.8,
|
|
67
|
+
// ...
|
|
68
|
+
},
|
|
69
|
+
});
|
|
70
|
+
```
|
|
39
71
|
|
|
40
72
|
### Formatters
|
|
41
73
|
|
|
42
|
-
```
|
|
74
|
+
```ts
|
|
43
75
|
import { formatConsole, formatJson, formatMarkdown, formatHtml } from "@pseolint/core";
|
|
44
76
|
|
|
45
|
-
const
|
|
77
|
+
const out = formatConsole(summary);
|
|
46
78
|
const json = formatJson(summary);
|
|
47
|
-
const md
|
|
79
|
+
const md = formatMarkdown(summary);
|
|
48
80
|
const html = formatHtml(summary);
|
|
49
81
|
```
|
|
50
82
|
|
|
83
|
+
### AI triage
|
|
84
|
+
|
|
85
|
+
When `ai.enabled` is set, findings are clustered into root causes by an LLM. Providers are loaded lazily from optional peer deps — install only the one you need:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
npm install @ai-sdk/anthropic # or @ai-sdk/openai, @ai-sdk/google, @ai-sdk/mistral,
|
|
89
|
+
# @ai-sdk/groq, @ai-sdk/xai, @ai-sdk/cohere,
|
|
90
|
+
# ollama-ai-provider-v2
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
```ts
|
|
94
|
+
import { triageFindings, createLanguageModel, estimateCostUsd } from "@pseolint/core";
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Cost and daily-budget caps are enforced pre-flight; results are cached on disk by default.
|
|
98
|
+
|
|
99
|
+
### Delta runs & regression gating
|
|
100
|
+
|
|
101
|
+
Pass `state.since: true` to audit only URLs whose content hash changed since the last run, and `state.exitOnRegression: true` to flag a run where a new rule ID fires on any previously clean URL (`summary.hasRegression`).
|
|
102
|
+
|
|
103
|
+
### Caching
|
|
104
|
+
|
|
105
|
+
Setting `cache` enables an ETag/Last-Modified-aware disk cache for HTTP fetches. `summary.cacheStats` reports `{ hits, total, bytesSavedEstimate }`.
|
|
106
|
+
|
|
107
|
+
### Page groups
|
|
108
|
+
|
|
109
|
+
Classify pages by glob and apply different rule subsets or threshold overrides per group. Results are surfaced in `summary.groupScores` / `summary.groupPageCounts`.
|
|
110
|
+
|
|
111
|
+
### Rendering
|
|
112
|
+
|
|
113
|
+
For client-rendered pages, install `playwright-core` and pass `render: { browserWsEndpoint }` to connect to an existing browser endpoint.
|
|
114
|
+
|
|
115
|
+
## Peer dependencies
|
|
116
|
+
|
|
117
|
+
All AI providers and `playwright-core` are optional peers — you only install the ones you actually use.
|
|
118
|
+
|
|
51
119
|
## License
|
|
52
120
|
|
|
53
121
|
MIT
|
package/dist/ai/triage.d.ts
CHANGED
|
@@ -8,6 +8,12 @@ export interface TriageOptions {
|
|
|
8
8
|
modelId: string;
|
|
9
9
|
maxInputTokens?: number;
|
|
10
10
|
maxOutputTokens?: number;
|
|
11
|
+
/** Hard cap on estimated USD for this call. Refuse pre-flight if exceeded. */
|
|
12
|
+
maxCostUsd?: number;
|
|
13
|
+
/** USD already spent on successful triages today (used against dailyBudgetUsd). */
|
|
14
|
+
spentTodayUsd?: number;
|
|
15
|
+
/** Daily budget ceiling. Refuse pre-flight if spentTodayUsd + this call's estimate > budget. */
|
|
16
|
+
dailyBudgetUsd?: number;
|
|
11
17
|
cache?: {
|
|
12
18
|
dir: string;
|
|
13
19
|
ttlMs: number;
|
package/dist/ai/triage.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"triage.d.ts","sourceRoot":"","sources":["../../src/ai/triage.ts"],"names":[],"mappings":"AACA,OAAO,EAAkB,KAAK,aAAa,EAAE,MAAM,IAAI,CAAC;AAExD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AA2B/C,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,OAAO,CAAC;IACjB,KAAK,EAAE,aAAa,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,KAAK,CAAC,EAAE;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,GAAG,KAAK,CAAC;IAC/C,MAAM,CAAC,EAAE,WAAW,CAAC;CACtB;AAED,MAAM,WAAW,aAAa;IAC5B,MAAM,CAAC,EAAE,YAAY,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAeD,wBAAsB,cAAc,CAClC,QAAQ,EAAE,UAAU,EAAE,EACtB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,aAAa,GACrB,OAAO,CAAC,aAAa,CAAC,
|
|
1
|
+
{"version":3,"file":"triage.d.ts","sourceRoot":"","sources":["../../src/ai/triage.ts"],"names":[],"mappings":"AACA,OAAO,EAAkB,KAAK,aAAa,EAAE,MAAM,IAAI,CAAC;AAExD,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AA2B/C,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,OAAO,CAAC;IACjB,KAAK,EAAE,aAAa,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,8EAA8E;IAC9E,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,mFAAmF;IACnF,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gGAAgG;IAChG,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,KAAK,CAAC,EAAE;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,GAAG,KAAK,CAAC;IAC/C,MAAM,CAAC,EAAE,WAAW,CAAC;CACtB;AAED,MAAM,WAAW,aAAa;IAC5B,MAAM,CAAC,EAAE,YAAY,CAAC;IACtB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAeD,wBAAsB,cAAc,CAClC,QAAQ,EAAE,UAAU,EAAE,EACtB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,aAAa,GACrB,OAAO,CAAC,aAAa,CAAC,CAuHxB"}
|
package/dist/ai/triage.js
CHANGED
|
@@ -16,7 +16,7 @@ const rootCauseSchema = z.object({
|
|
|
16
16
|
});
|
|
17
17
|
const triagePayloadSchema = z.object({
|
|
18
18
|
rootCauses: z.array(rootCauseSchema).min(1).max(5).describe("1 to 5 root causes, ranked by SEO impact (highest first)."),
|
|
19
|
-
narrative: z.string().
|
|
19
|
+
narrative: z.string().optional().describe("1-2 sentence overall summary. Optional — if output is tight, prioritize rootCauses and omit this."),
|
|
20
20
|
});
|
|
21
21
|
const DEFAULT_MAX_INPUT_TOKENS = 60_000;
|
|
22
22
|
const DEFAULT_MAX_OUTPUT_TOKENS = 4_000;
|
|
@@ -39,6 +39,26 @@ export async function triageFindings(findings, pageCount, options) {
|
|
|
39
39
|
if (estimate > maxInputTokens) {
|
|
40
40
|
return { skipReason: `pre-flight token estimate ${estimate} exceeds cap ${maxInputTokens}` };
|
|
41
41
|
}
|
|
42
|
+
// Pre-flight cost gate: pessimistic upper bound using (estimated input + max output).
|
|
43
|
+
const maxOutputTokens = options.maxOutputTokens ?? DEFAULT_MAX_OUTPUT_TOKENS;
|
|
44
|
+
const preflightCostUsd = estimateCostUsd(options.providerId, options.modelId, {
|
|
45
|
+
input: estimate,
|
|
46
|
+
output: maxOutputTokens,
|
|
47
|
+
});
|
|
48
|
+
if (options.maxCostUsd !== undefined &&
|
|
49
|
+
preflightCostUsd !== undefined &&
|
|
50
|
+
preflightCostUsd > options.maxCostUsd) {
|
|
51
|
+
return {
|
|
52
|
+
skipReason: `pre-flight cost $${preflightCostUsd.toFixed(3)} exceeds cap $${options.maxCostUsd.toFixed(3)}`,
|
|
53
|
+
};
|
|
54
|
+
}
|
|
55
|
+
if (options.dailyBudgetUsd !== undefined &&
|
|
56
|
+
preflightCostUsd !== undefined &&
|
|
57
|
+
(options.spentTodayUsd ?? 0) + preflightCostUsd > options.dailyBudgetUsd) {
|
|
58
|
+
return {
|
|
59
|
+
skipReason: `daily budget exhausted: $${(options.spentTodayUsd ?? 0).toFixed(3)} + $${preflightCostUsd.toFixed(3)} > $${options.dailyBudgetUsd.toFixed(3)}`,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
42
62
|
const validIds = new Set();
|
|
43
63
|
for (const f of findings)
|
|
44
64
|
validIds.add(assignFindingId(f));
|
|
@@ -58,6 +78,9 @@ export async function triageFindings(findings, pageCount, options) {
|
|
|
58
78
|
// cache read errors are non-fatal — fall through to fresh call
|
|
59
79
|
}
|
|
60
80
|
}
|
|
81
|
+
// One-line pre-call estimate so users see what's about to be spent.
|
|
82
|
+
const costLabel = preflightCostUsd !== undefined ? `~$${preflightCostUsd.toFixed(3)}` : "cost unknown";
|
|
83
|
+
console.error(`[ai-triage] calling ${options.providerId}:${options.modelId} — ~${estimate.toLocaleString()} input / ≤${maxOutputTokens.toLocaleString()} output tokens, ${costLabel}`);
|
|
61
84
|
let generated;
|
|
62
85
|
try {
|
|
63
86
|
generated = await generateObject({
|
|
@@ -65,7 +88,7 @@ export async function triageFindings(findings, pageCount, options) {
|
|
|
65
88
|
system: req.system,
|
|
66
89
|
prompt: req.user,
|
|
67
90
|
schema: triagePayloadSchema,
|
|
68
|
-
maxOutputTokens
|
|
91
|
+
maxOutputTokens,
|
|
69
92
|
abortSignal: options.signal,
|
|
70
93
|
});
|
|
71
94
|
}
|
|
@@ -74,17 +97,7 @@ export async function triageFindings(findings, pageCount, options) {
|
|
|
74
97
|
if (err?.name === "AbortError" || (err?.message && /abort/i.test(err.message))) {
|
|
75
98
|
return { skipReason: "aborted during LLM call" };
|
|
76
99
|
}
|
|
77
|
-
|
|
78
|
-
if (err && typeof err === "object" && "text" in err && typeof err.text === "string") {
|
|
79
|
-
detail += ` | raw=${err.text.slice(0, 400)}`;
|
|
80
|
-
}
|
|
81
|
-
if (err?.cause) {
|
|
82
|
-
try {
|
|
83
|
-
detail += ` | cause=${JSON.stringify(err.cause).slice(0, 400)}`;
|
|
84
|
-
}
|
|
85
|
-
catch { /* ignore */ }
|
|
86
|
-
}
|
|
87
|
-
return { skipReason: `LLM call failed: ${detail}` };
|
|
100
|
+
return { skipReason: `LLM call failed: ${err?.message ?? String(e)}` };
|
|
88
101
|
}
|
|
89
102
|
// Validate relatedFindingIds reference known findings (semantic check
|
|
90
103
|
// beyond the structural schema enforcement performed by generateObject).
|
package/dist/ai/triage.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"triage.js","sourceRoot":"","sources":["../../src/ai/triage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,cAAc,EAAsB,MAAM,IAAI,CAAC;AACxD,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,OAAO,EACL,cAAc,EACd,sBAAsB,EACtB,eAAe,EACf,kBAAkB,GACnB,MAAM,aAAa,CAAC;AACrB,OAAO,EAAE,eAAe,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAC/E,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAE5C,MAAM,UAAU,GAAG,CAAC,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,UAAU,CAAU,CAAC;AAErE,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,CAAC,uCAAuC,CAAC;IAClF,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,wDAAwD,CAAC;IAChH,eAAe,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,2EAA2E,CAAC;IAC1H,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,QAAQ,CAAC,yEAAyE,CAAC;IAChH,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IACnG,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,oDAAoD,CAAC;IAC3F,iBAAiB,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,wHAAwH,CAAC;CAC1K,CAAC,CAAC;AAEH,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IACnC,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,2DAA2D,CAAC;IACxH,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,
|
|
1
|
+
{"version":3,"file":"triage.js","sourceRoot":"","sources":["../../src/ai/triage.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,cAAc,EAAsB,MAAM,IAAI,CAAC;AACxD,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,OAAO,EACL,cAAc,EACd,sBAAsB,EACtB,eAAe,EACf,kBAAkB,GACnB,MAAM,aAAa,CAAC;AACrB,OAAO,EAAE,eAAe,EAAE,gBAAgB,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAC/E,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAE5C,MAAM,UAAU,GAAG,CAAC,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,UAAU,CAAU,CAAC;AAErE,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,CAAC,uCAAuC,CAAC;IAClF,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,wDAAwD,CAAC;IAChH,eAAe,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,2EAA2E,CAAC;IAC1H,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,QAAQ,CAAC,yEAAyE,CAAC;IAChH,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,uDAAuD,CAAC;IACnG,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,oDAAoD,CAAC;IAC3F,iBAAiB,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,wHAAwH,CAAC;CAC1K,CAAC,CAAC;AAEH,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IACnC,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,2DAA2D,CAAC;IACxH,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,mGAAmG,CAAC;CAC/I,CAAC,CAAC;AAwBH,MAAM,wBAAwB,GAAG,MAAM,CAAC;AACxC,MAAM,yBAAyB,GAAG,KAAK,CAAC;AAExC,SAAS,YAAY,CAAC,QAAsB;IAC1C,MAAM,GAAG,GAAG,QAAQ,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC,IAAI,EAAE,CAAC;IACjD,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAClE,CAAC;AAED,kFAAkF;AAClF,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACpC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAsB,EACtB,SAAiB,EACjB,OAAsB;IAEtB,IAAI,OAAO,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC;QAC5B,OAAO,EAAE,UAAU,EAAE,+BAA+B,EAAE,CA
AC;IACzD,CAAC;IAED,MAAM,GAAG,GAAG,kBAAkB,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IACpD,MAAM,cAAc,GAAG,QAAQ,CAAC,MAAM,GAAG,sBAAsB,CAAC;IAEhE,MAAM,cAAc,GAAG,OAAO,CAAC,cAAc,IAAI,wBAAwB,CAAC;IAC1E,MAAM,QAAQ,GAAG,cAAc,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,IAAI,CAAC,CAAC;IACvD,IAAI,QAAQ,GAAG,cAAc,EAAE,CAAC;QAC9B,OAAO,EAAE,UAAU,EAAE,6BAA6B,QAAQ,gBAAgB,cAAc,EAAE,EAAE,CAAC;IAC/F,CAAC;IAED,sFAAsF;IACtF,MAAM,eAAe,GAAG,OAAO,CAAC,eAAe,IAAI,yBAAyB,CAAC;IAC7E,MAAM,gBAAgB,GAAG,eAAe,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,OAAO,EAAE;QAC5E,KAAK,EAAE,QAAQ;QACf,MAAM,EAAE,eAAe;KACxB,CAAC,CAAC;IACH,IACE,OAAO,CAAC,UAAU,KAAK,SAAS;QAChC,gBAAgB,KAAK,SAAS;QAC9B,gBAAgB,GAAG,OAAO,CAAC,UAAU,EACrC,CAAC;QACD,OAAO;YACL,UAAU,EAAE,oBAAoB,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,iBAAiB,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;SAC5G,CAAC;IACJ,CAAC;IACD,IACE,OAAO,CAAC,cAAc,KAAK,SAAS;QACpC,gBAAgB,KAAK,SAAS;QAC9B,CAAC,OAAO,CAAC,aAAa,IAAI,CAAC,CAAC,GAAG,gBAAgB,GAAG,OAAO,CAAC,cAAc,EACxE,CAAC;QACD,OAAO;YACL,UAAU,EAAE,4BAA4B,CAAC,OAAO,CAAC,aAAa,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,OAAO,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;SAC5J,CAAC;IACJ,CAAC;IAED,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IACnC,KAAK,MAAM,CAAC,IAAI,QAAQ;QAAE,QAAQ,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC;IAE3D,MAAM,QAAQ,GAAG,cAAc,CAAC;QAC9B,YAAY,EAAE,YAAY,CAAC,QAAQ,CAAC;QACpC,KAAK,EAAE,OAAO,CAAC,OAAO;QACtB,aAAa,EAAE,cAAc;KAC9B,CAAC,CAAC;IAEH,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QAClB,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,EAAE,QAAQ,EAAE,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YACvF,IAAI,MAAM,EAAE,CAAC;gBACX,OAAO,EAAE,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,QAAQ,EAAE,IAAI,EAAE,EAAE,CAAC;YACnD,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,+DAA+D;QACjE,CAAC;IACH,CAAC;IAED,oEAAoE;IACpE,MAAM,SAAS,GAAG,gBAAgB,KAAK,SAAS,CAAC,CAAC,CAAC,KAAK,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,cAAc,CAAC;IACvG,OAAO,CAAC,KAAK,CACX,uBAAuB,OAAO,CAAC,UAAU,IAAI,OAAO,CAAC,OAAO,OAAO,QAAQ,CAAC,cAAc,EAAE,aAAa,eAAe,CAAC,cAAc,EAAE,mBAAmB,SAAS,EAAE
,CACxK,CAAC;IAEF,IAAI,SAAS,CAAC;IACd,IAAI,CAAC;QACH,SAAS,GAAG,MAAM,cAAc,CAAC;YAC/B,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,GAAG,CAAC,MAAM;YAClB,MAAM,EAAE,GAAG,CAAC,IAAI;YAChB,MAAM,EAAE,mBAAmB;YAC3B,eAAe;YACf,WAAW,EAAE,OAAO,CAAC,MAAM;SAC5B,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,GAAG,GAAG,CAAwC,CAAC;QACrD,IAAI,GAAG,EAAE,IAAI,KAAK,YAAY,IAAI,CAAC,GAAG,EAAE,OAAO,IAAI,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,EAAE,CAAC;YAC/E,OAAO,EAAE,UAAU,EAAE,yBAAyB,EAAE,CAAC;QACnD,CAAC;QACD,OAAO,EAAE,UAAU,EAAE,oBAAoB,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;IACzE,CAAC;IAED,sEAAsE;IACtE,yEAAyE;IACzE,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,SAAS,CAAC,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC;QAC3D,KAAK,MAAM,EAAE,IAAI,CAAC,CAAC,iBAAiB,EAAE,CAAC;YACrC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC;gBACtB,OAAO,EAAE,UAAU,EAAE,iDAAiD,CAAC,MAAM,EAAE,EAAE,EAAE,CAAC;YACtF,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,KAAK,GAAG;QACZ,KAAK,EAAE,SAAS,CAAC,KAAK,CAAC,WAAW,IAAI,CAAC;QACvC,MAAM,EAAE,SAAS,CAAC,KAAK,CAAC,YAAY,IAAI,CAAC;KAC1C,CAAC;IAEF,MAAM,MAAM,GAAiB;QAC3B,UAAU,EAAE,SAAS,CAAC,MAAM,CAAC,UAAU;QACvC,SAAS,EAAE,SAAS,CAAC,MAAM,CAAC,SAAS;QACrC,SAAS,EAAE,OAAO,CAAC,OAAO;QAC1B,UAAU,EAAE,OAAO,CAAC,UAAU;QAC9B,UAAU,EAAE,KAAK;QACjB,gBAAgB,EAAE,eAAe,CAAC,OAAO,CAAC,UAAU,EAAE,OAAO,CAAC,OAAO,EAAE,KAAK,CAAC;QAC7E,QAAQ,EAAE,KAAK;QACf,aAAa,EAAE,cAAc;QAC7B,cAAc;KACf,CAAC;IAEF,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QAClB,IAAI,CAAC;YACH,MAAM,gBAAgB,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;QAC9D,CAAC;QAAC,MAAM,CAAC;YACP,mCAAmC;QACrC,CAAC;IACH,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,CAAC;AACpB,CAAC"}
|
package/dist/ai/types.d.ts
CHANGED
|
@@ -14,7 +14,8 @@ export interface RootCause {
|
|
|
14
14
|
}
|
|
15
15
|
export interface TriageResult {
|
|
16
16
|
rootCauses: RootCause[];
|
|
17
|
-
|
|
17
|
+
/** Optional — may be absent when the model ran tight on output tokens. */
|
|
18
|
+
narrative?: string;
|
|
18
19
|
modelUsed: string;
|
|
19
20
|
providerId: string;
|
|
20
21
|
tokenUsage: TokenUsage;
|
package/dist/ai/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/ai/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAE5C,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,SAAS;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,aAAa,EAAE,MAAM,CAAC;IACtB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,iBAAiB,EAAE,MAAM,EAAE,CAAC;CAC7B;AAED,MAAM,WAAW,YAAY;IAC3B,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,SAAS,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/ai/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAE5C,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,SAAS;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,aAAa,EAAE,MAAM,CAAC;IACtB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,QAAQ,EAAE,QAAQ,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,iBAAiB,EAAE,MAAM,EAAE,CAAC;CAC7B;AAED,MAAM,WAAW,YAAY;IAC3B,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,0EAA0E;IAC1E,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,UAAU,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,QAAQ,EAAE,OAAO,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;IACtB,cAAc,EAAE,OAAO,CAAC;CACzB"}
|
package/dist/auditor.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"auditor.d.ts","sourceRoot":"","sources":["../src/auditor.ts"],"names":[],"mappings":"AAoDA,OAAO,KAAK,EAAE,YAAY,EAAE,YAAY,EAAwG,MAAM,YAAY,CAAC;AA8wBnK,wBAAsB,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAod/F"}
|
package/dist/auditor.js
CHANGED
|
@@ -26,7 +26,7 @@ import { hreflangConsistencyRule } from "./rules/tech/hreflang-consistency.js";
|
|
|
26
26
|
import { ogCompletenessRule } from "./rules/tech/og-completeness.js";
|
|
27
27
|
import { robotsNoindexConflictRule } from "./rules/tech/robots-noindex-conflict.js";
|
|
28
28
|
import { sitemapCompletenessRule } from "./rules/tech/sitemap-completeness.js";
|
|
29
|
-
import { robotsComplianceRule } from "./rules/tech/robots-sitemap-presence.js";
|
|
29
|
+
import { robotsComplianceRule, parseDisallowPatterns, isBlockedByPattern, parseCrawlDelaySeconds } from "./rules/tech/robots-sitemap-presence.js";
|
|
30
30
|
import { redirectChainRule } from "./rules/tech/redirect-chain.js";
|
|
31
31
|
import { soft404Rule } from "./rules/tech/soft-404.js";
|
|
32
32
|
import { jsonLdValidRule } from "./rules/schema/json-ld-valid.js";
|
|
@@ -43,7 +43,7 @@ import { enrichFindings } from "./enrich-findings.js";
|
|
|
43
43
|
import { triageFindings } from "./ai/triage.js";
|
|
44
44
|
import { createLanguageModel } from "./ai/adapters/index.js";
|
|
45
45
|
import { promptTriageFeedback } from "./ai/feedback-prompt.js";
|
|
46
|
-
import { generateRunId, appendTelemetryRecord, } from "./telemetry/index.js";
|
|
46
|
+
import { generateRunId, appendTelemetryRecord, todayTriageSpendUsd, } from "./telemetry/index.js";
|
|
47
47
|
import { cachedFetch } from "./cache.js";
|
|
48
48
|
import { stratifiedSample } from "./stratified-sample.js";
|
|
49
49
|
import { readState, writeState, computeContentHash, STATE_SCHEMA_VERSION, } from "./state.js";
|
|
@@ -430,7 +430,35 @@ async function collectUrlsFromSitemap(sitemapText, sitemapUrl, visited, timeoutM
|
|
|
430
430
|
}
|
|
431
431
|
return allUrls;
|
|
432
432
|
}
|
|
433
|
-
async function
|
|
433
|
+
async function fetchRobotsMeta(origin, timeoutMs, cache, stats) {
|
|
434
|
+
if (!origin)
|
|
435
|
+
return { disallow: [], crawlDelaySec: 0 };
|
|
436
|
+
try {
|
|
437
|
+
const robotsUrl = `${origin}/robots.txt`;
|
|
438
|
+
const fetched = await fetchTextStrict(robotsUrl, timeoutMs, cache, stats);
|
|
439
|
+
return {
|
|
440
|
+
disallow: parseDisallowPatterns(fetched.text),
|
|
441
|
+
crawlDelaySec: parseCrawlDelaySeconds(fetched.text),
|
|
442
|
+
};
|
|
443
|
+
}
|
|
444
|
+
catch {
|
|
445
|
+
return { disallow: [], crawlDelaySec: 0 };
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
function sleep(ms) {
|
|
449
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
450
|
+
}
|
|
451
|
+
function isDisallowedByRobots(urlPath, patterns) {
|
|
452
|
+
for (const pat of patterns) {
|
|
453
|
+
if (isBlockedByPattern(urlPath, pat))
|
|
454
|
+
return true;
|
|
455
|
+
}
|
|
456
|
+
return false;
|
|
457
|
+
}
|
|
458
|
+
function budgetExceeded(b) {
|
|
459
|
+
return b.cap > 0 && b.used >= b.cap;
|
|
460
|
+
}
|
|
461
|
+
async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cache, stats, fillBudgetViaLinkDiscovery = false, byteBudget = { used: 0, cap: 0 }) {
|
|
434
462
|
if (/^https?:\/\//i.test(source)) {
|
|
435
463
|
let text;
|
|
436
464
|
let contentType;
|
|
@@ -467,23 +495,38 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
467
495
|
? fisherYatesSample(allSitemapUrls, discoveryBudget)
|
|
468
496
|
: allSitemapUrls;
|
|
469
497
|
const pages = [];
|
|
470
|
-
|
|
498
|
+
// Fetch robots.txt once for the origin — reused for Crawl-Delay pacing and Disallow checks.
|
|
499
|
+
const sourceOrigin = (() => { try {
|
|
500
|
+
return new URL(source).origin;
|
|
501
|
+
}
|
|
502
|
+
catch {
|
|
503
|
+
return "";
|
|
504
|
+
} })();
|
|
505
|
+
const robots = await fetchRobotsMeta(sourceOrigin, timeoutMs, cache, stats);
|
|
506
|
+
const effectiveConcurrency = robots.crawlDelaySec > 0 ? 1 : concurrency;
|
|
507
|
+
const delayMs = robots.crawlDelaySec * 1000;
|
|
508
|
+
await runWithConcurrency(urlsToFetch, effectiveConcurrency, async (url) => {
|
|
509
|
+
if (budgetExceeded(byteBudget))
|
|
510
|
+
return;
|
|
471
511
|
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
472
512
|
if (result) {
|
|
513
|
+
byteBudget.used += result.html.length;
|
|
473
514
|
pages.push(result);
|
|
474
515
|
}
|
|
516
|
+
if (delayMs > 0)
|
|
517
|
+
await sleep(delayMs);
|
|
475
518
|
});
|
|
476
|
-
//
|
|
477
|
-
|
|
519
|
+
// Link discovery fills the sample.
|
|
520
|
+
// Legacy behavior: no budget set + crawlDiscovery true → fill from links (unchanged).
|
|
521
|
+
// New behavior: budget set + crawlDiscovery true + opt-in flag → top up to budget.
|
|
522
|
+
const budgetUnderfilled = discoveryBudget > 0 && pages.length < discoveryBudget;
|
|
523
|
+
const legacyBudgetless = discoveryBudget === 0;
|
|
524
|
+
const shouldFill = crawlDiscovery && (legacyBudgetless || (budgetUnderfilled && fillBudgetViaLinkDiscovery));
|
|
525
|
+
if (shouldFill) {
|
|
478
526
|
const sitemapUrlSet = new Set(allSitemapUrls);
|
|
479
527
|
const discoveredUrls = new Set();
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
sourceOrigin = new URL(source).origin;
|
|
483
|
-
}
|
|
484
|
-
catch {
|
|
485
|
-
sourceOrigin = "";
|
|
486
|
-
}
|
|
528
|
+
// robots already fetched above; reuse its Disallow patterns here.
|
|
529
|
+
const disallowPatterns = robots.disallow;
|
|
487
530
|
for (const page of pages) {
|
|
488
531
|
const linkMatches = Array.from(page.html.matchAll(/href=["']([^"']+)["']/gi));
|
|
489
532
|
for (const match of linkMatches) {
|
|
@@ -500,9 +543,11 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
500
543
|
resolvedUrl.search = "";
|
|
501
544
|
resolvedUrl.hash = "";
|
|
502
545
|
const normalized = resolvedUrl.href;
|
|
503
|
-
if (
|
|
504
|
-
|
|
505
|
-
|
|
546
|
+
if (sitemapUrlSet.has(normalized) || discoveredUrls.has(normalized))
|
|
547
|
+
continue;
|
|
548
|
+
if (isDisallowedByRobots(resolvedUrl.pathname, disallowPatterns))
|
|
549
|
+
continue;
|
|
550
|
+
discoveredUrls.add(normalized);
|
|
506
551
|
}
|
|
507
552
|
catch {
|
|
508
553
|
continue;
|
|
@@ -510,11 +555,21 @@ async function loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscover
|
|
|
510
555
|
}
|
|
511
556
|
}
|
|
512
557
|
if (discoveredUrls.size > 0) {
|
|
513
|
-
|
|
558
|
+
const candidates = Array.from(discoveredUrls);
|
|
559
|
+
// Fisher-Yates shuffle so we don't bias toward the first-discovered links (nav/footer).
|
|
560
|
+
const shuffled = fisherYatesSample(candidates, candidates.length);
|
|
561
|
+
const remaining = discoveryBudget === 0 ? Infinity : discoveryBudget - pages.length;
|
|
562
|
+
const toFetch = remaining === Infinity ? shuffled : shuffled.slice(0, remaining);
|
|
563
|
+
await runWithConcurrency(toFetch, effectiveConcurrency, async (url) => {
|
|
564
|
+
if (budgetExceeded(byteBudget))
|
|
565
|
+
return;
|
|
514
566
|
const result = await fetchPageWithMeta(url, timeoutMs, cache, stats);
|
|
515
567
|
if (result && result.httpMeta && result.httpMeta.statusCode >= 200 && result.httpMeta.statusCode < 300) {
|
|
568
|
+
byteBudget.used += result.html.length;
|
|
516
569
|
pages.push(result);
|
|
517
570
|
}
|
|
571
|
+
if (delayMs > 0)
|
|
572
|
+
await sleep(delayMs);
|
|
518
573
|
});
|
|
519
574
|
}
|
|
520
575
|
}
|
|
@@ -668,7 +723,10 @@ export async function auditSource(source, options) {
|
|
|
668
723
|
ttlMs: options.cache.ttlMs ?? 7 * 24 * 60 * 60 * 1000,
|
|
669
724
|
}
|
|
670
725
|
: null;
|
|
671
|
-
const
|
|
726
|
+
const fillBudgetViaLinkDiscovery = options?.fillBudgetViaLinkDiscovery ?? false;
|
|
727
|
+
const maxFetchBytes = options?.maxFetchBytes ?? 52_428_800;
|
|
728
|
+
const fetchByteBudget = { used: 0, cap: maxFetchBytes };
|
|
729
|
+
const { pages: loadedPagesRaw, sitemapUrls: sitemapUrlSet, discoveredUrlCount } = await loadPagesFromSource(source, concurrency, timeoutMs, crawlDiscovery, discoveryBudget, cacheConfig, cacheStats, fillBudgetViaLinkDiscovery, fetchByteBudget);
|
|
672
730
|
const loadedPages = [...loadedPagesRaw];
|
|
673
731
|
if (discoveredUrlCount && discoveredUrlCount > loadedPages.length) {
|
|
674
732
|
console.error(`Discovered ${discoveredUrlCount} pages, fetched ${loadedPages.length} for audit. Use --sample-size 0 for full crawl.`);
|
|
@@ -916,7 +974,13 @@ export async function auditSource(source, options) {
|
|
|
916
974
|
};
|
|
917
975
|
await writeState(statePath, newState);
|
|
918
976
|
}
|
|
977
|
+
// Captured for telemetry even when triage is skipped, so users can diagnose
|
|
978
|
+
// model/provider reliability from their local telemetry file (default .pseolint/telemetry.jsonl).
|
|
979
|
+
let triageAttempt;
|
|
919
980
|
if (options?.ai?.enabled) {
|
|
981
|
+
if (options.ai.apiKey) {
|
|
982
|
+
console.error("[ai-triage] warning: ai.apiKey is set in options. Prefer env vars (ANTHROPIC_API_KEY, OPENAI_API_KEY, etc.) — never commit an apiKey to a config file.");
|
|
983
|
+
}
|
|
920
984
|
try {
|
|
921
985
|
const resolved = await createLanguageModel({
|
|
922
986
|
provider: options.ai.provider,
|
|
@@ -930,6 +994,17 @@ export async function auditSource(source, options) {
|
|
|
930
994
|
dir: options.ai.cache?.dir ?? ".pseolint/ai-cache",
|
|
931
995
|
ttlMs: options.ai.cache?.ttlMs ?? 30 * 24 * 60 * 60 * 1000,
|
|
932
996
|
};
|
|
997
|
+
// Daily-budget pre-flight read (best-effort — missing file is fine).
|
|
998
|
+
let spentTodayUsd;
|
|
999
|
+
if (options.ai.dailyBudgetUsd !== undefined) {
|
|
1000
|
+
const telemetryPath = options.telemetry?.path ?? ".pseolint/telemetry.jsonl";
|
|
1001
|
+
try {
|
|
1002
|
+
spentTodayUsd = await todayTriageSpendUsd(telemetryPath);
|
|
1003
|
+
}
|
|
1004
|
+
catch {
|
|
1005
|
+
spentTodayUsd = 0;
|
|
1006
|
+
}
|
|
1007
|
+
}
|
|
933
1008
|
const outcome = await triageFindings(summary.findings, summary.pageCount, {
|
|
934
1009
|
enabled: true,
|
|
935
1010
|
model: resolved.model,
|
|
@@ -937,22 +1012,24 @@ export async function auditSource(source, options) {
|
|
|
937
1012
|
modelId: resolved.modelId,
|
|
938
1013
|
maxInputTokens: options.ai.maxInputTokens,
|
|
939
1014
|
maxOutputTokens: options.ai.maxOutputTokens,
|
|
1015
|
+
maxCostUsd: options.ai.maxCostUsd,
|
|
1016
|
+
dailyBudgetUsd: options.ai.dailyBudgetUsd,
|
|
1017
|
+
spentTodayUsd,
|
|
940
1018
|
cache: cacheConfig,
|
|
941
1019
|
});
|
|
942
1020
|
if (outcome.skipReason) {
|
|
943
1021
|
console.error(`[ai-triage] skipped: ${outcome.skipReason}`);
|
|
1022
|
+
triageAttempt = { providerId: resolved.providerId, modelId: resolved.modelId, skipReason: outcome.skipReason };
|
|
944
1023
|
}
|
|
945
1024
|
else {
|
|
946
1025
|
summary.triage = outcome.result;
|
|
947
1026
|
}
|
|
948
1027
|
}
|
|
949
1028
|
catch (e) {
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
console.error(`[ai-triage] skipped: unknown error`);
|
|
955
|
-
}
|
|
1029
|
+
const reason = e instanceof Error ? e.message : "unknown error";
|
|
1030
|
+
console.error(`[ai-triage] skipped: ${reason}`);
|
|
1031
|
+
// No resolved model in scope here — fall back to the configured provider/model (blank when unset).
|
|
1032
|
+
triageAttempt = { providerId: options.ai.provider ?? "", modelId: options.ai.model ?? "", skipReason: reason };
|
|
956
1033
|
}
|
|
957
1034
|
}
|
|
958
1035
|
if (options?.telemetry?.enabled) {
|
|
@@ -971,6 +1048,7 @@ export async function auditSource(source, options) {
|
|
|
971
1048
|
...(summary.cacheStats && { cacheStats: summary.cacheStats }),
|
|
972
1049
|
...(summary.triage && {
|
|
973
1050
|
triage: {
|
|
1051
|
+
success: true,
|
|
974
1052
|
rootCauseCount: summary.triage.rootCauses.length,
|
|
975
1053
|
providerId: summary.triage.providerId,
|
|
976
1054
|
modelId: summary.triage.modelUsed,
|
|
@@ -982,6 +1060,18 @@ export async function auditSource(source, options) {
|
|
|
982
1060
|
truncatedInput: summary.triage.truncatedInput,
|
|
983
1061
|
},
|
|
984
1062
|
}),
|
|
1063
|
+
...(!summary.triage && triageAttempt && {
|
|
1064
|
+
triage: {
|
|
1065
|
+
success: false,
|
|
1066
|
+
skipReason: triageAttempt.skipReason,
|
|
1067
|
+
rootCauseCount: 0,
|
|
1068
|
+
providerId: triageAttempt.providerId,
|
|
1069
|
+
modelId: triageAttempt.modelId,
|
|
1070
|
+
cacheHit: false,
|
|
1071
|
+
tokenUsage: { input: 0, output: 0 },
|
|
1072
|
+
truncatedInput: false,
|
|
1073
|
+
},
|
|
1074
|
+
}),
|
|
985
1075
|
};
|
|
986
1076
|
await appendTelemetryRecord(telemetryPath, auditRecord);
|
|
987
1077
|
// Feedback: only if triage ran
|