gravito-eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +137 -0
  3. package/dist/cli/index.d.ts +14 -0
  4. package/dist/cli/index.d.ts.map +1 -0
  5. package/dist/cli/index.js +276 -0
  6. package/dist/cli/index.js.map +1 -0
  7. package/dist/src/adjudication/index.d.ts +36 -0
  8. package/dist/src/adjudication/index.d.ts.map +1 -0
  9. package/dist/src/adjudication/index.js +149 -0
  10. package/dist/src/adjudication/index.js.map +1 -0
  11. package/dist/src/calibration/index.d.ts +38 -0
  12. package/dist/src/calibration/index.d.ts.map +1 -0
  13. package/dist/src/calibration/index.js +104 -0
  14. package/dist/src/calibration/index.js.map +1 -0
  15. package/dist/src/confidence/index.d.ts +27 -0
  16. package/dist/src/confidence/index.d.ts.map +1 -0
  17. package/dist/src/confidence/index.js +168 -0
  18. package/dist/src/confidence/index.js.map +1 -0
  19. package/dist/src/index.d.ts +26 -0
  20. package/dist/src/index.d.ts.map +1 -0
  21. package/dist/src/index.js +47 -0
  22. package/dist/src/index.js.map +1 -0
  23. package/dist/src/matching/index.d.ts +37 -0
  24. package/dist/src/matching/index.d.ts.map +1 -0
  25. package/dist/src/matching/index.js +292 -0
  26. package/dist/src/matching/index.js.map +1 -0
  27. package/dist/src/metrics/index.d.ts +15 -0
  28. package/dist/src/metrics/index.d.ts.map +1 -0
  29. package/dist/src/metrics/index.js +177 -0
  30. package/dist/src/metrics/index.js.map +1 -0
  31. package/dist/src/telemetry/index.d.ts +10 -0
  32. package/dist/src/telemetry/index.d.ts.map +1 -0
  33. package/dist/src/telemetry/index.js +106 -0
  34. package/dist/src/telemetry/index.js.map +1 -0
  35. package/dist/src/types.d.ts +131 -0
  36. package/dist/src/types.d.ts.map +1 -0
  37. package/dist/src/types.js +28 -0
  38. package/dist/src/types.js.map +1 -0
  39. package/examples/basic/input.json +76 -0
  40. package/examples/basic/run.ts +33 -0
  41. package/package.json +50 -0
@@ -0,0 +1,131 @@
1
+ /**
2
+ * Gravito Eval — Core Types
3
+ *
4
+ * Shared type definitions for the evaluation framework.
5
+ * All modules import from here — no circular dependencies.
6
+ */
7
+ export declare const ISSUE_CATEGORIES: readonly ["conversion", "navigation", "visual_hierarchy", "trust", "content", "compliance", "performance"];
8
+ export type IssueCategory = (typeof ISSUE_CATEGORIES)[number];
9
+ export declare const SEVERITY_LEVELS: readonly ["low", "medium", "high", "critical"];
10
+ export type SeverityLevel = (typeof SEVERITY_LEVELS)[number];
11
+ /**
12
+ * A normalized finding from either an AI system or a human auditor.
13
+ * This is the universal input format for gravito-eval.
14
+ */
15
+ export interface Finding {
16
+ /** Unique identifier */
17
+ id: string;
18
+ /** Human-readable description of the issue */
19
+ description: string;
20
+ /** Issue category */
21
+ category: IssueCategory;
22
+ /** Severity level */
23
+ severity: SeverityLevel;
24
+ /** Where the issue was found (page, component, URL, etc.) */
25
+ location?: string;
26
+ /** Optional keywords for matching */
27
+ keywords?: string[];
28
+ }
29
+ export interface MatchPair {
30
+ aiIssue: Finding;
31
+ humanIssue: Finding;
32
+ similarity: number;
33
+ matchType: "strict" | "cross_category" | "conceptual";
34
+ }
35
+ export interface MatchResult {
36
+ matched: MatchPair[];
37
+ aiOnly: Finding[];
38
+ humanOnly: Finding[];
39
+ }
40
+ export interface MultiPassMatchResult {
41
+ strictMatches: MatchPair[];
42
+ crossCategoryMatches: MatchPair[];
43
+ conceptualMatches: MatchPair[];
44
+ aiOnly: Finding[];
45
+ humanOnly: Finding[];
46
+ summary: {
47
+ total_ai: number;
48
+ total_human: number;
49
+ strict_matched: number;
50
+ cross_category_matched: number;
51
+ conceptual_matched: number;
52
+ total_matched: number;
53
+ ai_only: number;
54
+ human_only: number;
55
+ };
56
+ }
57
+ export declare const ADJUDICATION_LABELS: readonly ["VALID", "INVALID", "DUPLICATE", "LOW_VALUE"];
58
+ export type AdjudicationLabel = (typeof ADJUDICATION_LABELS)[number];
59
+ export interface Adjudication {
60
+ /** ID of the AI-only finding being adjudicated */
61
+ findingId: string;
62
+ /** Verdict */
63
+ label: AdjudicationLabel;
64
+ /** Reasoning */
65
+ reasoning?: string;
66
+ }
67
+ export interface DetectionMetrics {
68
+ recall: number;
69
+ precision: number;
70
+ f1: number;
71
+ matchedCount: number;
72
+ totalAI: number;
73
+ totalHuman: number;
74
+ }
75
+ export interface RankingMetrics {
76
+ top3Overlap: number;
77
+ top5Overlap: number;
78
+ spearmanCorrelation: number;
79
+ }
80
+ export interface SeverityMetrics {
81
+ weightedKappa: number;
82
+ meanAbsoluteError: number;
83
+ confusionMatrix: Record<string, Record<string, number>>;
84
+ }
85
+ export interface NovelSignalMetrics {
86
+ totalAiOnly: number;
87
+ validCount: number;
88
+ invalidCount: number;
89
+ duplicateCount: number;
90
+ lowValueCount: number;
91
+ validatedNovelRate: number;
92
+ systemStrength: "WEAK" | "MODERATE" | "STRONG" | "DIFFERENTIATED";
93
+ }
94
+ export interface ConfidenceInterval {
95
+ mean: number;
96
+ lowerBound: number;
97
+ upperBound: number;
98
+ }
99
+ export interface EvalResult {
100
+ detection: DetectionMetrics;
101
+ ranking: RankingMetrics;
102
+ severity: SeverityMetrics;
103
+ novelSignal?: NovelSignalMetrics;
104
+ matchBreakdown: {
105
+ strict: number;
106
+ crossCategory: number;
107
+ conceptual: number;
108
+ };
109
+ /** Individual match pairs for detailed inspection */
110
+ matches: MatchPair[];
111
+ /** Findings only in AI output (novel/unmatched) */
112
+ aiOnly: Finding[];
113
+ /** Findings only in human output (missed by AI) */
114
+ humanOnly: Finding[];
115
+ adjustedPrecision?: number;
116
+ verdict: "PASS" | "PARTIAL" | "FAIL" | "INSUFFICIENT_DATA";
117
+ }
118
+ export interface ConfidenceFactors {
119
+ signal_strength: number;
120
+ cross_signal_support: number;
121
+ pattern_repetition: number;
122
+ rule_determinism: number;
123
+ clarity_of_evidence: number;
124
+ }
125
+ export interface ScoredFinding extends Finding {
126
+ confidence: number;
127
+ factors: ConfidenceFactors;
128
+ isSubjective: boolean;
129
+ signalCount: number;
130
+ }
131
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,eAAO,MAAM,gBAAgB,4GAQnB,CAAC;AAEX,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,gBAAgB,CAAC,CAAC,MAAM,CAAC,CAAC;AAE9D,eAAO,MAAM,eAAe,gDAAiD,CAAC;AAE9E,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,eAAe,CAAC,CAAC,MAAM,CAAC,CAAC;AAE7D;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,wBAAwB;IACxB,EAAE,EAAE,MAAM,CAAC;IACX,8CAA8C;IAC9C,WAAW,EAAE,MAAM,CAAC;IACpB,qBAAqB;IACrB,QAAQ,EAAE,aAAa,CAAC;IACxB,qBAAqB;IACrB,QAAQ,EAAE,aAAa,CAAC;IACxB,6DAA6D;IAC7D,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,qCAAqC;IACrC,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CACrB;AAID,MAAM,WAAW,SAAS;IACxB,OAAO,EAAE,OAAO,CAAC;IACjB,UAAU,EAAE,OAAO,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,QAAQ,GAAG,gBAAgB,GAAG,YAAY,CAAC;CACvD;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,SAAS,EAAE,CAAC;IACrB,MAAM,EAAE,OAAO,EAAE,CAAC;IAClB,SAAS,EAAE,OAAO,EAAE,CAAC;CACtB;AAED,MAAM,WAAW,oBAAoB;IACnC,aAAa,EAAE,SAAS,EAAE,CAAC;IAC3B,oBAAoB,EAAE,SAAS,EAAE,CAAC;IAClC,iBAAiB,EAAE,SAAS,EAAE,CAAC;IAC/B,MAAM,EAAE,OAAO,EAAE,CAAC;IAClB,SAAS,EAAE,OAAO,EAAE,CAAC;IACrB,OAAO,EAAE;QACP,QAAQ,EAAE,MAAM,CAAC;QACjB,WAAW,EAAE,MAAM,CAAC;QACpB,cAAc,EAAE,MAAM,CAAC;QACvB,sBAAsB,EAAE,MAAM,CAAC;QAC/B,kBAAkB,EAAE,MAAM,CAAC;QAC3B,aAAa,EAAE,MAAM,CAAC;QACtB,OAAO,EAAE,MAAM,CAAC;QAChB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAID,eAAO,MAAM,mBAAmB,yDAKtB,CAAC;AAEX,MAAM,MAAM,iBAAiB,GAAG,CAAC,OAAO,mBAAmB,CAAC,CAAC,MAAM,CAAC,CAAC;AAErE,MAAM,WAAW,YAAY;IAC3B,kDAAkD;IAClD,SAAS,EAAE,MAAM,CAAC;IAClB,cAAc;IACd,KAAK,EAAE,iBAAiB,CAAC;IACzB,gBAAgB;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAID,MAAM,WAAW,gBAAgB;IAC/B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,EAAE,EAAE,MAAM,CAAC;IACX,YAAY,EAAE,MAAM,CAAC;IACrB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,cAAc;IAC7B,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,WAAW,eAAe;IAC9B,aAAa,EAAE,MAAM,CAAC;IACtB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;CACzD;AA
ED,MAAM,WAAW,kBAAkB;IACjC,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,cAAc,EAAE,MAAM,GAAG,UAAU,GAAG,QAAQ,GAAG,gBAAgB,CAAC;CACnE;AAED,MAAM,WAAW,kBAAkB;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,UAAU;IACzB,SAAS,EAAE,gBAAgB,CAAC;IAC5B,OAAO,EAAE,cAAc,CAAC;IACxB,QAAQ,EAAE,eAAe,CAAC;IAC1B,WAAW,CAAC,EAAE,kBAAkB,CAAC;IACjC,cAAc,EAAE;QACd,MAAM,EAAE,MAAM,CAAC;QACf,aAAa,EAAE,MAAM,CAAC;QACtB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,qDAAqD;IACrD,OAAO,EAAE,SAAS,EAAE,CAAC;IACrB,mDAAmD;IACnD,MAAM,EAAE,OAAO,EAAE,CAAC;IAClB,mDAAmD;IACnD,SAAS,EAAE,OAAO,EAAE,CAAC;IACrB,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,OAAO,EAAE,MAAM,GAAG,SAAS,GAAG,MAAM,GAAG,mBAAmB,CAAC;CAC5D;AAID,MAAM,WAAW,iBAAiB;IAChC,eAAe,EAAE,MAAM,CAAC;IACxB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,kBAAkB,EAAE,MAAM,CAAC;IAC3B,gBAAgB,EAAE,MAAM,CAAC;IACzB,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,WAAW,aAAc,SAAQ,OAAO;IAC5C,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,iBAAiB,CAAC;IAC3B,YAAY,EAAE,OAAO,CAAC;IACtB,WAAW,EAAE,MAAM,CAAC;CACrB"}
@@ -0,0 +1,28 @@
1
+ "use strict";
2
+ /**
3
+ * Gravito Eval — Core Types
4
+ *
5
+ * Shared type definitions for the evaluation framework.
6
+ * All modules import from here — no circular dependencies.
7
+ */
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.ADJUDICATION_LABELS = exports.SEVERITY_LEVELS = exports.ISSUE_CATEGORIES = void 0;
10
+ // ─── Finding Types ────────────────────────────────────────────────────────
11
+ exports.ISSUE_CATEGORIES = [
12
+ "conversion",
13
+ "navigation",
14
+ "visual_hierarchy",
15
+ "trust",
16
+ "content",
17
+ "compliance",
18
+ "performance",
19
+ ];
20
+ exports.SEVERITY_LEVELS = ["low", "medium", "high", "critical"];
21
+ // ─── Adjudication Types ───────────────────────────────────────────────────
22
+ exports.ADJUDICATION_LABELS = [
23
+ "VALID",
24
+ "INVALID",
25
+ "DUPLICATE",
26
+ "LOW_VALUE",
27
+ ];
28
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;AAEH,6EAA6E;AAEhE,QAAA,gBAAgB,GAAG;IAC9B,YAAY;IACZ,YAAY;IACZ,kBAAkB;IAClB,OAAO;IACP,SAAS;IACT,YAAY;IACZ,aAAa;CACL,CAAC;AAIE,QAAA,eAAe,GAAG,CAAC,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,UAAU,CAAU,CAAC;AAwD9E,6EAA6E;AAEhE,QAAA,mBAAmB,GAAG;IACjC,OAAO;IACP,SAAS;IACT,WAAW;IACX,WAAW;CACH,CAAC"}
@@ -0,0 +1,76 @@
1
+ {
2
+ "aiFindings": [
3
+ {
4
+ "id": "ai-1",
5
+ "description": "Missing primary CTA button on homepage — no clear action for visitors to take",
6
+ "category": "conversion",
7
+ "severity": "high",
8
+ "keywords": ["CTA", "button", "action", "homepage", "conversion"]
9
+ },
10
+ {
11
+ "id": "ai-2",
12
+ "description": "Weak value proposition headline — does not explain what the product does or its core benefit",
13
+ "category": "content",
14
+ "severity": "high",
15
+ "keywords": ["headline", "value proposition", "product", "benefit", "explain"]
16
+ },
17
+ {
18
+ "id": "ai-3",
19
+ "description": "Confusing visual hierarchy — competing elements with similar weight make page hard to scan",
20
+ "category": "visual_hierarchy",
21
+ "severity": "medium",
22
+ "keywords": ["hierarchy", "layout", "confusing", "importance", "scanning"]
23
+ },
24
+ {
25
+ "id": "ai-4",
26
+ "description": "Inconsistent tone of voice across product and marketing pages",
27
+ "category": "content",
28
+ "severity": "low",
29
+ "keywords": ["tone", "voice", "consistency", "messaging"]
30
+ },
31
+ {
32
+ "id": "ai-5",
33
+ "description": "No trust indicators visible above the fold — missing testimonials or customer logos",
34
+ "category": "trust",
35
+ "severity": "medium",
36
+ "keywords": ["trust", "testimonials", "logos", "social proof"]
37
+ },
38
+ {
39
+ "id": "ai-6",
40
+ "description": "Navigation labels overlap in meaning — users cannot distinguish between sections",
41
+ "category": "navigation",
42
+ "severity": "medium",
43
+ "keywords": ["navigation", "labels", "overlap", "confusing", "sections"]
44
+ }
45
+ ],
46
+ "humanFindings": [
47
+ {
48
+ "id": "human-1",
49
+ "description": "No clear action button on the homepage — visitors don't know what to do next",
50
+ "category": "conversion",
51
+ "severity": "high",
52
+ "keywords": ["action", "button", "homepage", "CTA", "conversion"]
53
+ },
54
+ {
55
+ "id": "human-2",
56
+ "description": "Confusing layout — hard to tell what's most important on the page, hierarchy unclear",
57
+ "category": "visual_hierarchy",
58
+ "severity": "medium",
59
+ "keywords": ["layout", "confusing", "hierarchy", "importance"]
60
+ },
61
+ {
62
+ "id": "human-3",
63
+ "description": "Headline doesn't explain what the product does — weak value proposition",
64
+ "category": "content",
65
+ "severity": "high",
66
+ "keywords": ["headline", "product", "explain", "value proposition"]
67
+ },
68
+ {
69
+ "id": "human-4",
70
+ "description": "Footer links are broken — several lead to 404 pages",
71
+ "category": "navigation",
72
+ "severity": "medium",
73
+ "keywords": ["footer", "links", "broken", "404"]
74
+ }
75
+ ]
76
+ }
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Basic Example — Simplest possible evaluation
3
+ *
4
+ * Run: npx ts-node examples/basic/run.ts
5
+ * Or: gravito-eval run examples/basic
6
+ */
7
+
8
+ import { evaluate } from "../../src";
9
+ import data from "./input.json";
10
+
11
+ const result = evaluate(data.aiFindings as any, data.humanFindings as any);
12
+
13
+ console.log("Gravito Eval Results\n");
14
+ console.log(`Recall: ${Math.round(result.detection.recall * 100)}%`);
15
+ console.log(`Precision: ${Math.round(result.detection.precision * 100)}%`);
16
+ console.log(`F1: ${Math.round(result.detection.f1 * 100)}%`);
17
+ console.log();
18
+ console.log(`Top-3 Agreement: ${Math.round(result.ranking.top3Overlap * 100)}%`);
19
+ if (result.novelSignal) {
20
+ console.log(`Novel Signal: ${Math.round(result.novelSignal.validatedNovelRate * 100)}% (validated)`);
21
+ }
22
+ console.log();
23
+ console.log("Interpretation:");
24
+ if (result.detection.recall >= 0.7) {
25
+ console.log("- Strong alignment with human judgment");
26
+ } else if (result.detection.recall >= 0.5) {
27
+ console.log("- Moderate alignment — some human findings missed");
28
+ } else {
29
+ console.log("- Low alignment — many human findings missed");
30
+ }
31
+ if (result.novelSignal && result.novelSignal.validatedNovelRate >= 0.25) {
32
+ console.log("- Additional issues detected beyond baseline");
33
+ }
package/package.json ADDED
@@ -0,0 +1,50 @@
1
+ {
2
+ "name": "gravito-eval",
3
+ "version": "0.1.0",
4
+ "description": "Measure how closely AI decisions match human judgment — and where they add new signal.",
5
+ "main": "dist/src/index.js",
6
+ "types": "dist/src/index.d.ts",
7
+ "bin": {
8
+ "gravito-eval": "dist/cli/index.js"
9
+ },
10
+ "files": [
11
+ "dist",
12
+ "examples",
13
+ "README.md",
14
+ "LICENSE"
15
+ ],
16
+ "scripts": {
17
+ "build": "tsc",
18
+ "test": "vitest run",
19
+ "test:watch": "vitest",
20
+ "clean": "rm -rf dist",
21
+ "prepublishOnly": "npm run clean && npm run build"
22
+ },
23
+ "keywords": [
24
+ "ai-evaluation",
25
+ "alignment",
26
+ "calibration",
27
+ "precision",
28
+ "recall",
29
+ "f1-score",
30
+ "semantic-matching",
31
+ "novel-signal",
32
+ "human-ai-comparison",
33
+ "evaluation-framework"
34
+ ],
35
+ "author": "Gravito <hello@gravitoai.com>",
36
+ "license": "MIT",
37
+ "repository": {
38
+ "type": "git",
39
+ "url": "https://github.com/samuelrkestenbaum-dot/gravito-eval.git"
40
+ },
41
+ "homepage": "https://gravito.ai",
42
+ "engines": {
43
+ "node": ">=18.0.0"
44
+ },
45
+ "devDependencies": {
46
+ "@types/node": "^25.5.0",
47
+ "typescript": "^5.4.0",
48
+ "vitest": "^1.6.0"
49
+ }
50
+ }