snipara-evals 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +166 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +84 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +2 -0
- package/dist/io.d.ts +2 -0
- package/dist/io.js +23 -0
- package/dist/report.d.ts +3 -0
- package/dist/report.js +48 -0
- package/dist/scoring.d.ts +4 -0
- package/dist/scoring.js +270 -0
- package/dist/types.d.ts +88 -0
- package/dist/types.js +7 -0
- package/examples/project-intelligence-case.json +55 -0
- package/package.json +57 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
6
|
+
|
|
7
|
+
1. Definitions.
|
|
8
|
+
|
|
9
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
|
10
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
|
11
|
+
|
|
12
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
|
13
|
+
the copyright owner that is granting the License.
|
|
14
|
+
|
|
15
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
|
16
|
+
other entities that control, are controlled by, or are under common
|
|
17
|
+
control with that entity. For the purposes of this definition,
|
|
18
|
+
"control" means (i) the power, direct or indirect, to cause the
|
|
19
|
+
direction or management of such entity, whether by contract or
|
|
20
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
22
|
+
|
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
|
24
|
+
exercising permissions granted by this License.
|
|
25
|
+
|
|
26
|
+
"Source" form shall mean the preferred form for making modifications,
|
|
27
|
+
including but not limited to software source code, documentation
|
|
28
|
+
source, and configuration files.
|
|
29
|
+
|
|
30
|
+
"Object" form shall mean any form resulting from mechanical
|
|
31
|
+
transformation or translation of a Source form, including but
|
|
32
|
+
not limited to compiled object code, generated documentation,
|
|
33
|
+
and conversions to other media types.
|
|
34
|
+
|
|
35
|
+
"Work" shall mean the work of authorship, whether in Source or
|
|
36
|
+
Object form, made available under the License, as indicated by a
|
|
37
|
+
copyright notice that is included in or attached to the work
|
|
38
|
+
(an example is provided in the Appendix below).
|
|
39
|
+
|
|
40
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
|
41
|
+
form, that is based on (or derived from) the Work and for which the
|
|
42
|
+
editorial revisions, annotations, elaborations, or other modifications
|
|
43
|
+
represent, as a whole, an original work of authorship. For the purposes
|
|
44
|
+
of this License, Derivative Works shall not include works that remain
|
|
45
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
|
46
|
+
the Work and Derivative Works thereof.
|
|
47
|
+
|
|
48
|
+
"Contribution" shall mean any work of authorship, including
|
|
49
|
+
the original version of the Work and any modifications or additions
|
|
50
|
+
to that Work or Derivative Works thereof, that is intentionally
|
|
51
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
52
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
|
53
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
|
54
|
+
means any form of electronic, verbal, or written communication sent
|
|
55
|
+
to the Licensor or its representatives, including but not limited to
|
|
56
|
+
communication on electronic mailing lists, source code control systems,
|
|
57
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
|
58
|
+
Licensor for the purpose of discussing and improving the Work, but
|
|
59
|
+
excluding communication that is conspicuously marked or otherwise
|
|
60
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
|
61
|
+
|
|
62
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
63
|
+
on behalf of whom a Contribution has been received by Licensor and
|
|
64
|
+
subsequently incorporated within the Work.
|
|
65
|
+
|
|
66
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
67
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
68
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
69
|
+
copyright license to reproduce, prepare Derivative Works of,
|
|
70
|
+
publicly display, publicly perform, sublicense, and distribute the
|
|
71
|
+
Work and such Derivative Works in Source or Object form.
|
|
72
|
+
|
|
73
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
|
74
|
+
this License, each Contributor hereby grants to You a perpetual,
|
|
75
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
76
|
+
(except as stated in this section) patent license to make, have made,
|
|
77
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
78
|
+
where such license applies only to those patent claims licensable
|
|
79
|
+
by such Contributor that are necessarily infringed by their
|
|
80
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
|
81
|
+
with the Work to which such Contribution(s) was submitted. If You
|
|
82
|
+
institute patent litigation against any entity (including a
|
|
83
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
84
|
+
or a Contribution incorporated within the Work constitutes direct
|
|
85
|
+
or contributory patent infringement, then any patent licenses
|
|
86
|
+
granted to You under this License for that Work shall terminate
|
|
87
|
+
as of the date such litigation is filed.
|
|
88
|
+
|
|
89
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
|
90
|
+
Work or Derivative Works thereof in any medium, with or without
|
|
91
|
+
modifications, and in Source or Object form, provided that You
|
|
92
|
+
meet the following conditions:
|
|
93
|
+
|
|
94
|
+
(a) You must give any other recipients of the Work or
|
|
95
|
+
Derivative Works a copy of this License; and
|
|
96
|
+
|
|
97
|
+
(b) You must cause any modified files to carry prominent notices
|
|
98
|
+
stating that You changed the files; and
|
|
99
|
+
|
|
100
|
+
(c) You must retain, in the Source form of any Derivative Works
|
|
101
|
+
that You distribute, all copyright, patent, trademark, and
|
|
102
|
+
attribution notices from the Source form of the Work,
|
|
103
|
+
excluding those notices that do not pertain to any part of
|
|
104
|
+
the Derivative Works; and
|
|
105
|
+
|
|
106
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
|
107
|
+
distribution, then any Derivative Works that You distribute must
|
|
108
|
+
include a readable copy of the attribution notices contained
|
|
109
|
+
within such NOTICE file, excluding those notices that do not
|
|
110
|
+
pertain to any part of the Derivative Works, in at least one
|
|
111
|
+
of the following places: within a NOTICE text file distributed
|
|
112
|
+
as part of the Derivative Works; within the Source form or
|
|
113
|
+
documentation, if provided along with the Derivative Works; or,
|
|
114
|
+
within a display generated by the Derivative Works, if and
|
|
115
|
+
wherever such third-party notices normally appear. The contents
|
|
116
|
+
of the NOTICE file are for informational purposes only and
|
|
117
|
+
do not modify the License. You may add Your own attribution
|
|
118
|
+
notices within Derivative Works that You distribute, alongside
|
|
119
|
+
or as an addendum to the NOTICE text from the Work, provided
|
|
120
|
+
that such additional attribution notices cannot be construed
|
|
121
|
+
as modifying the License.
|
|
122
|
+
|
|
123
|
+
You may add Your own copyright statement to Your modifications and
|
|
124
|
+
may provide additional or different license terms and conditions
|
|
125
|
+
for use, reproduction, or distribution of Your modifications, or
|
|
126
|
+
for any such Derivative Works as a whole, provided Your use,
|
|
127
|
+
reproduction, and distribution of the Work otherwise complies with
|
|
128
|
+
the conditions stated in this License.
|
|
129
|
+
|
|
130
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
131
|
+
any Contribution intentionally submitted for inclusion in the Work
|
|
132
|
+
by You to the Licensor shall be under the terms and conditions of
|
|
133
|
+
this License, without any additional terms or conditions.
|
|
134
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
|
135
|
+
the terms of any separate license agreement you may have executed
|
|
136
|
+
with Licensor regarding such Contributions.
|
|
137
|
+
|
|
138
|
+
6. Trademarks. This License does not grant permission to use the trade
|
|
139
|
+
names, trademarks, service marks, or product names of the Licensor,
|
|
140
|
+
except as required for reasonable and customary use in describing the
|
|
141
|
+
origin of the Work and reproducing the content of the NOTICE file.
|
|
142
|
+
|
|
143
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
144
|
+
agreed to in writing, Licensor provides the Work (and each
|
|
145
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
146
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
147
|
+
implied, including, without limitation, any warranties or conditions
|
|
148
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
149
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
150
|
+
appropriateness of using or redistributing the Work and assume any
|
|
151
|
+
risks associated with Your exercise of permissions under this License.
|
|
152
|
+
|
|
153
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
|
154
|
+
whether in tort (including negligence), contract, or otherwise,
|
|
155
|
+
unless required by applicable law (such as deliberate and grossly
|
|
156
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
|
157
|
+
liable to You for damages, including any direct, indirect, special,
|
|
158
|
+
incidental, or consequential damages of any character arising as a
|
|
159
|
+
result of this License or out of the use or inability to use the
|
|
160
|
+
Work (including but not limited to damages for loss of goodwill,
|
|
161
|
+
work stoppage, computer failure or malfunction, or any and all
|
|
162
|
+
other commercial damages or losses), even if such Contributor
|
|
163
|
+
has been advised of the possibility of such damages.
|
|
164
|
+
|
|
165
|
+
9. Accepting Warranty or Additional Liability. While redistributing
|
|
166
|
+
the Work or Derivative Works thereof, You may choose to offer,
|
|
167
|
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
|
168
|
+
or other liability obligations and/or rights consistent with this
|
|
169
|
+
License. However, in accepting such obligations, You may act only
|
|
170
|
+
on Your own behalf and on Your sole responsibility, not on behalf
|
|
171
|
+
of any other Contributor, and only if You agree to indemnify,
|
|
172
|
+
defend, and hold each Contributor harmless for any liability
|
|
173
|
+
incurred by, or claims asserted against, such Contributor by reason
|
|
174
|
+
of your accepting any such warranty or additional liability.
|
|
175
|
+
|
|
176
|
+
END OF TERMS AND CONDITIONS
|
|
177
|
+
|
|
178
|
+
APPENDIX: How to apply the Apache License to your work.
|
|
179
|
+
|
|
180
|
+
To apply the Apache License to your work, attach the following
|
|
181
|
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
|
182
|
+
replaced with your own identifying information. (Don't include
|
|
183
|
+
the brackets!) The text should be enclosed in the appropriate
|
|
184
|
+
comment syntax for the file format. We also recommend that a
|
|
185
|
+
file or class name and description of purpose be included on the
|
|
186
|
+
same "printed page" as the copyright notice for easier
|
|
187
|
+
identification within third-party archives.
|
|
188
|
+
|
|
189
|
+
Copyright [yyyy] [name of copyright owner]
|
|
190
|
+
|
|
191
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
192
|
+
you may not use this file except in compliance with the License.
|
|
193
|
+
You may obtain a copy of the License at
|
|
194
|
+
|
|
195
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
196
|
+
|
|
197
|
+
Unless required by applicable law or agreed to in writing, software
|
|
198
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
199
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
200
|
+
See the License for the specific language governing permissions and
|
|
201
|
+
limitations under the License.
|
package/README.md
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# snipara-evals
|
|
2
|
+
|
|
3
|
+
**Project Intelligence evals for AI coding agents.**
|
|
4
|
+
|
|
5
|
+
`snipara-evals` is a small open-source scoring harness for questions most agent
|
|
6
|
+
evals still miss:
|
|
7
|
+
|
|
8
|
+
- did the agent preserve the project context it was given?
|
|
9
|
+
- did it respect decisions and avoid rejected directions?
|
|
10
|
+
- did it understand the impact surface of the change?
|
|
11
|
+
- did it run or report the expected verification?
|
|
12
|
+
- did it preserve enough continuity for another agent to resume the work?
|
|
13
|
+
|
|
14
|
+
It is deterministic by default and does not require an LLM judge. You describe a
|
|
15
|
+
case in JSON, pass in the observed agent output, and get a scorecard that can run
|
|
16
|
+
locally or in CI.
|
|
17
|
+
|
|
18
|
+
## Why This Exists
|
|
19
|
+
|
|
20
|
+
Most coding-agent evals ask whether the final answer is correct. That is useful,
|
|
21
|
+
but not enough for real projects. Agents also need to carry project decisions,
|
|
22
|
+
blast radius, verification plans, and handoff context across a task.
|
|
23
|
+
|
|
24
|
+
This repo is part of the open "Mini Snipara" stack:
|
|
25
|
+
|
|
26
|
+
| Repo | Role |
|
|
27
|
+
| --- | --- |
|
|
28
|
+
| [`snipara-companion`](https://github.com/Snipara/snipara-companion) | Workflow continuity for AI coding agents |
|
|
29
|
+
| [`snipara-memory`](https://github.com/Snipara/snipara-memory) | Local durable project memory |
|
|
30
|
+
| [`snipara-evals`](https://github.com/Snipara/snipara-evals) | Project Intelligence scoring |
|
|
31
|
+
|
|
32
|
+
Hosted Snipara remains the managed layer for source authority, reviewed memory,
|
|
33
|
+
team sync, code graph impact, dashboards, and operations.
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
npx snipara-evals run examples/project-intelligence-case.json
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
JSON output:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
npx snipara-evals run examples/project-intelligence-case.json --json
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Fail CI when thresholds are missed:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
npx snipara-evals run examples/project-intelligence-case.json --fail-on-threshold
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Case Format
|
|
54
|
+
|
|
55
|
+
```json
|
|
56
|
+
{
|
|
57
|
+
"id": "agent-handoff-example",
|
|
58
|
+
"expected": {
|
|
59
|
+
"context": [
|
|
60
|
+
{
|
|
61
|
+
"id": "license",
|
|
62
|
+
"text": "The public repository uses Apache-2.0.",
|
|
63
|
+
"keywords": ["Apache-2.0", "public repository"]
|
|
64
|
+
}
|
|
65
|
+
],
|
|
66
|
+
"decisions": [
|
|
67
|
+
{
|
|
68
|
+
"id": "hosted-code-graph",
|
|
69
|
+
"statement": "Code graph remains hosted.",
|
|
70
|
+
"keywords": ["code graph", "hosted"],
|
|
71
|
+
"rejectedKeywords": ["open source code graph"]
|
|
72
|
+
}
|
|
73
|
+
],
|
|
74
|
+
"impact": [
|
|
75
|
+
{
|
|
76
|
+
"id": "cli-surface",
|
|
77
|
+
"target": "CLI command surface",
|
|
78
|
+
"keywords": ["CLI", "command"],
|
|
79
|
+
"files": ["src/cli.ts"]
|
|
80
|
+
}
|
|
81
|
+
],
|
|
82
|
+
"verification": [
|
|
83
|
+
{
|
|
84
|
+
"id": "test-suite",
|
|
85
|
+
"check": "Run the package tests.",
|
|
86
|
+
"command": "pnpm test",
|
|
87
|
+
"keywords": ["tests pass"]
|
|
88
|
+
}
|
|
89
|
+
],
|
|
90
|
+
"continuity": [
|
|
91
|
+
{
|
|
92
|
+
"id": "handoff",
|
|
93
|
+
"handoff": "Leave a concise next-step handoff.",
|
|
94
|
+
"keywords": ["handoff", "next step"]
|
|
95
|
+
}
|
|
96
|
+
]
|
|
97
|
+
},
|
|
98
|
+
"observed": {
|
|
99
|
+
"answer": "Implemented the CLI in src/cli.ts, kept code graph hosted, and ran pnpm test. Handoff: next step is release packaging.",
|
|
100
|
+
"filesChanged": ["src/cli.ts"],
|
|
101
|
+
"commandsRun": ["pnpm test"]
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Metrics
|
|
107
|
+
|
|
108
|
+
`snipara-evals` reports five metric scores from 0 to 100.
|
|
109
|
+
|
|
110
|
+
| Metric | What it checks |
|
|
111
|
+
| --- | --- |
|
|
112
|
+
| Context Preservation | Required facts, constraints, and source details were carried into the result |
|
|
113
|
+
| Decision Consistency | Canonical decisions were followed and rejected directions were avoided |
|
|
114
|
+
| Impact Awareness | Files, commands, dependencies, risks, or affected surfaces were acknowledged |
|
|
115
|
+
| Verification Coverage | Expected checks were run, mentioned, or marked with status |
|
|
116
|
+
| Continuity | Handoff, resume point, blockers, and next action are visible |
|
|
117
|
+
|
|
118
|
+
The scoring rules are intentionally simple and public. This repo is not a dump
|
|
119
|
+
of Snipara's internal benchmark fixtures, tuning notes, or private rubrics.
|
|
120
|
+
|
|
121
|
+
## Library Usage
|
|
122
|
+
|
|
123
|
+
```ts
|
|
124
|
+
import { evaluateCase } from "snipara-evals";
|
|
125
|
+
|
|
126
|
+
const result = evaluateCase({
|
|
127
|
+
id: "demo",
|
|
128
|
+
expected: {
|
|
129
|
+
context: [{ id: "fact", text: "The package is Apache-2.0 licensed." }],
|
|
130
|
+
},
|
|
131
|
+
observed: {
|
|
132
|
+
answer: "The package is Apache-2.0 licensed.",
|
|
133
|
+
},
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
console.log(result.overall.score);
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Development
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
pnpm install
|
|
143
|
+
pnpm build
|
|
144
|
+
pnpm type-check
|
|
145
|
+
pnpm lint
|
|
146
|
+
pnpm test
|
|
147
|
+
pnpm pack:smoke
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Open Source Boundary
|
|
151
|
+
|
|
152
|
+
Good public contributions:
|
|
153
|
+
|
|
154
|
+
- deterministic scoring helpers
|
|
155
|
+
- portable JSON case formats
|
|
156
|
+
- CLI/reporting improvements
|
|
157
|
+
- sanitized examples
|
|
158
|
+
- adapters for public agent transcripts
|
|
159
|
+
|
|
160
|
+
Keep private:
|
|
161
|
+
|
|
162
|
+
- customer data
|
|
163
|
+
- private runbooks or deployment details
|
|
164
|
+
- internal benchmark datasets and raw reports
|
|
165
|
+
- proprietary ranking heuristics or hosted code graph internals
|
|
166
|
+
- secrets, `.env` files, screenshots, logs, and generated config dumps
|
package/dist/cli.d.ts
ADDED
package/dist/cli.js
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { loadCases } from "./io.js";
|
|
3
|
+
import { formatSuite } from "./report.js";
|
|
4
|
+
import { evaluateSuite } from "./scoring.js";
|
|
5
|
+
/**
 * Runs the command-line interface.
 *
 * Reads the command from `argv[2]`, parses the remaining arguments, loads
 * every case file, evaluates the combined suite and prints the report.
 *
 * @param {string[]} argv - Full argument vector (command at index 2).
 * @returns {Promise<number>} Exit code: 0 on success or help, 1 on a usage
 *   error or when `--fail-on-threshold` is set and the suite did not pass.
 */
async function main(argv) {
  const [, , command, ...rest] = argv;

  if (!command || ["--help", "-h", "help"].includes(command)) {
    printHelp();
    return 0;
  }

  if (!["run", "score"].includes(command)) {
    console.error(`Unknown command: ${command}`);
    printHelp();
    return 1;
  }

  const { files, json, failOnThreshold } = parseOptions(rest);
  if (files.length === 0) {
    console.error("Missing eval case file.");
    printHelp();
    return 1;
  }

  // Files are read in parallel; each one may contribute one or many cases.
  const loaded = await Promise.all(files.map((file) => loadCases(file)));
  const result = evaluateSuite(loaded.flat());

  console.log(json ? JSON.stringify(result, null, 2) : formatSuite(result));

  if (failOnThreshold && result.status !== "pass") {
    return 1;
  }
  return 0;
}
|
|
32
|
+
/**
 * Splits the arguments that follow the command into flags and file paths.
 *
 * `--help` / `-h` prints the help text and exits the process immediately.
 *
 * @param {string[]} args - Arguments after the command name.
 * @returns {{ json: boolean, failOnThreshold: boolean, files: string[] }}
 *   Parsed flags plus the positional file paths in the order given.
 * @throws {Error} When an argument starts with "-" but is not a known flag.
 */
function parseOptions(args) {
  let json = false;
  let failOnThreshold = false;
  const files = [];

  for (const arg of args) {
    switch (arg) {
      case "--json":
        json = true;
        break;
      case "--fail-on-threshold":
      case "--strict":
        failOnThreshold = true;
        break;
      case "--help":
      case "-h":
        printHelp();
        process.exit(0);
        break;
      default:
        if (arg.startsWith("-")) {
          throw new Error(`Unknown option: ${arg}`);
        }
        // Anything that is not a flag is treated as a case file path.
        files.push(arg);
    }
  }

  return { json, failOnThreshold, files };
}
|
|
58
|
+
/**
 * Prints the CLI usage text to standard output with a single
 * `console.log` call.
 */
function printHelp() {
  const helpLines = [
    "snipara-evals",
    "",
    "Project Intelligence evals for AI coding agents.",
    "",
    "Usage:",
    "snipara-evals run <case.json...> [--json] [--fail-on-threshold]",
    "snipara-evals score <case.json...> [--json] [--strict]",
    "",
    "Commands:",
    "run, score Evaluate one or more JSON case files",
    "",
    "Options:",
    "--json Print machine-readable JSON",
    "--fail-on-threshold Exit 1 when the suite fails thresholds",
    "--strict Alias for --fail-on-threshold",
    "-h, --help Show help",
  ];
  // The text ends with its own newline, so the output is followed by a blank line.
  console.log(`${helpLines.join("\n")}\n`);
}
|
|
77
|
+
// Script entry: run the CLI and publish its result through process.exitCode
// (rather than process.exit) so pending output can flush before shutdown.
try {
  process.exitCode = await main(process.argv);
} catch (error) {
  console.error(error instanceof Error ? error.message : String(error));
  process.exitCode = 1;
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
export { evaluateCase, evaluateSuite, metricLabels } from "./scoring.js";
|
|
2
|
+
export { formatCase, formatSuite } from "./report.js";
|
|
3
|
+
export type { EvalCase, EvalResult, EvalSuiteResult, ExpectedItem, ExpectedSignals, ItemScore, MetricKey, MetricScore, ObservedRun, } from "./types.js";
|
package/dist/index.js
ADDED
package/dist/io.d.ts
ADDED
package/dist/io.js
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
/**
 * Reads a JSON file and returns the eval cases it contains.
 *
 * Two layouts are accepted: a single case object (anything with a string
 * `id`), or a wrapper object whose `cases` property is an array of such
 * objects.
 *
 * @param {string} path - Path of the JSON file to read (decoded as UTF-8).
 * @returns {Promise<object[]>} The cases found in the file.
 * @throws {SyntaxError} When the file is not valid JSON. The message names
 *   the file and the original parser error is attached as `cause`.
 * @throws {Error} When the JSON is valid but matches neither layout, or when
 *   the file cannot be read.
 */
export async function loadCases(path) {
  const raw = await readFile(path, "utf8");

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    // The bare parser message has no file name, which makes a malformed
    // file impossible to locate when several files are loaded at once.
    const reason = error instanceof Error ? error.message : String(error);
    throw new SyntaxError(`File ${path} is not valid JSON: ${reason}`, { cause: error });
  }

  if (isCaseArrayWrapper(parsed)) {
    return parsed.cases;
  }
  if (isEvalCase(parsed)) {
    return [parsed];
  }
  throw new Error(`File ${path} must contain an eval case or an object with a cases array.`);
}

/**
 * Checks whether a value is an object whose `cases` property is an array in
 * which every element passes `isEvalCase`.
 *
 * @param {unknown} value - Parsed JSON value.
 * @returns {boolean} True for a valid `{ cases: [...] }` wrapper.
 */
function isCaseArrayWrapper(value) {
  return (
    typeof value === "object" &&
    value !== null &&
    Array.isArray(value.cases) &&
    value.cases.every(isEvalCase)
  );
}

/**
 * Checks the minimal shape of an eval case: a non-null object with a string
 * `id`. No other fields are inspected here.
 *
 * @param {unknown} value - Parsed JSON value.
 * @returns {boolean} True when the value has a string `id`.
 */
function isEvalCase(value) {
  return typeof value === "object" && value !== null && typeof value.id === "string";
}
|
package/dist/report.d.ts
ADDED
package/dist/report.js
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
 * Renders a suite result as plain text: a two-line summary followed by one
 * section per case, with a blank line between sections.
 *
 * @param {object} result - Suite result with `score`, `status`, `passed`,
 *   `failed` and `cases`.
 * @returns {string} The report, with trailing whitespace removed.
 */
export function formatSuite(result) {
  const header = [
    `snipara-evals suite: ${result.score.toFixed(1)} ${statusLabel(result.status)}`,
    `Cases: ${result.passed} passed, ${result.failed} failed`,
    "",
  ];
  // Each case section is followed by an empty entry, which becomes a blank line.
  const sections = result.cases.flatMap((item) => [formatCase(item), ""]);
  return [...header, ...sections].join("\n").trimEnd();
}
|
|
12
|
+
/**
 * Renders one case result as plain text.
 *
 * Metrics whose status is "not_applicable" are left out. Each remaining
 * metric gets a heading line, its summary, and one line per scored item
 * listing any missing or rejected evidence values.
 *
 * @param {object} result - Case result with `id`, optional `name`,
 *   `overall` and `metrics`.
 * @returns {string} The multi-line case report.
 */
export function formatCase(result) {
  const { overall } = result;
  const output = [
    `Case: ${result.name ?? result.id}`,
    `Overall: ${overall.score.toFixed(1)} ${statusLabel(overall.status)} (threshold ${overall.threshold})`,
    "",
  ];

  const scoredMetrics = result.metrics.filter((metric) => metric.status !== "not_applicable");
  for (const metric of scoredMetrics) {
    output.push(`${metric.label}: ${metric.score.toFixed(1)} ${statusLabel(metric.status)} (threshold ${metric.threshold})`);
    output.push(` ${metric.summary}`);

    for (const item of metric.items) {
      const notes = [];
      const missing = item.missing.map((evidence) => evidence.value).join(", ");
      if (missing) {
        notes.push(`missing: ${missing}`);
      }
      const rejected = item.rejected.map((evidence) => evidence.value).join(", ");
      if (rejected) {
        notes.push(`rejected: ${rejected}`);
      }
      // The parenthesised detail is only appended when there is something to report.
      const detail = notes.length > 0 ? ` (${notes.join("; ")})` : "";
      output.push(` - ${item.id}: ${item.score.toFixed(1)}${detail}`);
    }
  }

  return output.join("\n");
}
|
|
37
|
+
// Display text for each known status value.
const statusLabels = new Map([
  ["pass", "PASS"],
  ["warn", "WARN"],
  ["fail", "FAIL"],
  ["not_applicable", "N/A"],
]);

/**
 * Converts a status value into its upper-case display label.
 *
 * @param {string} status - One of "pass", "warn", "fail", "not_applicable".
 * @returns {string | undefined} The label, or undefined for any other value.
 */
function statusLabel(status) {
  return statusLabels.get(status);
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import { type EvalCase, type EvalResult, type EvalSuiteResult, type MetricKey } from "./types.js";
|
|
2
|
+
export declare const metricLabels: Record<MetricKey, string>;
|
|
3
|
+
export declare function evaluateSuite(cases: EvalCase[]): EvalSuiteResult;
|
|
4
|
+
export declare function evaluateCase(input: EvalCase): EvalResult;
|
package/dist/scoring.js
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
import { metricKeys, } from "./types.js";
|
|
2
|
+
// Human-readable display label for each metric key.
export const metricLabels = {
    contextPreservation: "Context Preservation",
    decisionConsistency: "Decision Consistency",
    impactAwareness: "Impact Awareness",
    verificationCoverage: "Verification Coverage",
    continuity: "Continuity",
};
// Weight of each metric in the overall case score, used when the case does
// not supply its own value in `weights`. The five defaults sum to 1.
const defaultWeights = {
    contextPreservation: 0.25,
    decisionConsistency: 0.25,
    impactAwareness: 0.2,
    verificationCoverage: 0.2,
    continuity: 0.1,
};
// Minimum score for each metric, plus `overall` for the weighted case score,
// used when the case does not supply its own value in `thresholds`.
const defaultThresholds = {
    contextPreservation: 70,
    decisionConsistency: 70,
    impactAwareness: 70,
    verificationCoverage: 70,
    continuity: 70,
    overall: 75,
};
// Maps each metric key to the property of a case's `expected` object that
// holds the expectation items scored for that metric.
const metricToExpectedKey = {
    contextPreservation: "context",
    decisionConsistency: "decisions",
    impactAwareness: "impact",
    verificationCoverage: "verification",
    continuity: "continuity",
};
|
|
31
|
+
/**
 * Evaluates every case and aggregates the results into one suite result.
 *
 * The suite score is the rounded mean of the case scores (0 for an empty
 * suite). The suite passes only when it has at least one case and none of
 * them has status "fail".
 *
 * @param {object[]} cases - Eval cases to score.
 * @returns {{ score: number, status: string, passed: number, failed: number, cases: object[] }}
 *   Aggregate result plus the individual case results.
 */
export function evaluateSuite(cases) {
  const results = cases.map((item) => evaluateCase(item));

  let total = 0;
  let failed = 0;
  for (const { overall } of results) {
    total += overall.score;
    if (overall.status === "fail") {
      failed += 1;
    }
  }

  const mean = results.length === 0 ? 0 : total / results.length;
  const everyCasePassed = failed === 0 && results.length > 0;

  return {
    score: round(mean),
    status: everyCasePassed ? "pass" : "fail",
    passed: results.length - failed,
    failed,
    cases: results,
  };
}
|
|
46
|
+
/**
 * Scores a single eval case across all metrics.
 *
 * Metrics without expectations are reported as "not_applicable" and excluded
 * from the overall score. The overall score is the weighted average of the
 * remaining metrics, using `input.weights` where given and the default
 * weights otherwise, normalised over the metrics that are active. The case
 * passes only when the overall score reaches the overall threshold and every
 * active metric has status "pass". A case with no active metrics fails with
 * score 0.
 *
 * @param {object} input - Eval case with `id`, optional `name`, `expected`,
 *   `observed`, `weights` and `thresholds`.
 * @returns {{ id: string, name: (string|undefined), overall: { score: number, status: string, threshold: number }, metrics: object[] }}
 *   The case result, including every metric (applicable or not).
 */
export function evaluateCase(input) {
  assertCase(input);
  const corpus = buildCorpus(input.observed ?? {});
  const metrics = metricKeys.map((key) => scoreMetric(input, key, corpus));
  const activeMetrics = metrics.filter((metric) => metric.status !== "not_applicable");
  const threshold = input.thresholds?.overall ?? defaultThresholds.overall;
  const weightOf = (metric) => input.weights?.[metric.key] ?? defaultWeights[metric.key];

  let score = 0;
  let status = "fail";

  if (activeMetrics.length > 0) {
    const totalWeight = activeMetrics.reduce((sum, metric) => sum + weightOf(metric), 0);
    // Custom weights can sum to zero (or to a non-finite value); dividing by
    // that would turn the score into NaN, so fall back to equal weighting.
    const canNormalise = Number.isFinite(totalWeight) && totalWeight > 0;
    const equalShare = 1 / activeMetrics.length;

    score = round(activeMetrics.reduce((sum, metric) => {
      const share = canNormalise ? weightOf(metric) / totalWeight : equalShare;
      return sum + metric.score * share;
    }, 0));

    const everyMetricPassed = activeMetrics.every((metric) => metric.status === "pass");
    status = score >= threshold && everyMetricPassed ? "pass" : "fail";
  }

  return {
    id: input.id,
    name: input.name,
    overall: {
      score,
      status,
      threshold,
    },
    metrics,
  };
}
|
|
81
|
+
/**
 * Scores one metric of a case.
 *
 * The expectation items are read from the matching `input.expected` list.
 * With no items the metric is reported as "not_applicable" with score 0.
 * Otherwise each item is scored and the metric score is the weighted average
 * of the item scores.
 *
 * @param {object} input - The eval case being scored.
 * @param {string} key - Metric key (a key of `metricLabels`).
 * @param {object} corpus - Observed-run corpus the items are matched against.
 * @returns {{ key: string, label: string, score: number, status: string, threshold: number, items: object[], summary: string }}
 *   The metric result.
 */
function scoreMetric(input, key, corpus) {
  const items = input.expected?.[metricToExpectedKey[key]] ?? [];
  const threshold = input.thresholds?.[key] ?? defaultThresholds[key];
  const label = metricLabels[key];

  if (items.length === 0) {
    return {
      key,
      label,
      score: 0,
      status: "not_applicable",
      threshold,
      items: [],
      summary: "No expectations provided for this metric.",
    };
  }

  const itemScores = items.map((item) => scoreItem(item, key, corpus));
  const totalWeight = itemScores.reduce((sum, item) => sum + item.weight, 0);
  // Item weights can sum to zero (or to a non-finite value); dividing by that
  // would turn the score into NaN, so fall back to equal weighting.
  const canNormalise = Number.isFinite(totalWeight) && totalWeight > 0;
  const equalShare = 1 / itemScores.length;

  const score = round(itemScores.reduce((sum, item) => {
    const share = canNormalise ? item.weight / totalWeight : equalShare;
    return sum + item.score * share;
  }, 0));

  // An item counts as covered when its own score reaches the metric threshold.
  const failedItems = itemScores.filter((item) => item.score < threshold).length;

  return {
    key,
    label,
    score,
    status: statusFor(score, threshold),
    threshold,
    items: itemScores,
    summary: `${items.length - failedItems}/${items.length} expected signals covered.`,
  };
}
|
|
113
|
+
/**
 * Scores one expectation item against the observed corpus.
 *
 * Positive checks are the item's phrases, each expected file and the expected
 * command; the base score is the percentage of those that were found (100
 * when there is nothing to check). Each rejected keyword present in the
 * corpus text subtracts 30 points, up to 70. For the decision-consistency
 * metric, any rejected keyword also caps the score at 40.
 *
 * @param {object} item - Expectation item (`id`, optional `keywords`, `files`,
 *   `command`, `rejectedKeywords`, `weight`).
 * @param {string} key - Metric key the item belongs to.
 * @param {object} corpus - Observed corpus with `text`, `files` and `commands`.
 * @returns {{ id: string, score: number, weight: number, matched: object[], missing: object[], rejected: object[] }}
 *   The item score with the supporting evidence lists.
 */
function scoreItem(item, key, corpus) {
  const matched = [];
  const missing = [];
  const record = (found, evidence) => {
    (found ? matched : missing).push(evidence);
  };

  for (const phrase of phrasesFor(item, key)) {
    const evidence = matchPhrase(phrase, corpus);
    if (evidence) {
      matched.push(evidence);
    } else {
      missing.push({ value: phrase, source: "text" });
    }
  }

  if (item.files) {
    for (const file of item.files) {
      // A file counts when it was changed or is mentioned in the text.
      const changed = corpus.files.some((candidate) => samePathOrSuffix(candidate, file));
      record(changed || includes(corpus.text, file), { value: file, source: "file" });
    }
  }

  if (item.command) {
    const expectedCommand = item.command;
    // A command counts when it was run or is mentioned in the text.
    const ran = corpus.commands.some((command) => commandMatches(command, expectedCommand));
    record(ran || includes(corpus.text, expectedCommand), { value: expectedCommand, source: "command" });
  }

  const rejected = [];
  for (const phrase of item.rejectedKeywords ?? []) {
    if (includes(corpus.text, phrase)) {
      rejected.push({ value: phrase, source: "negative" });
    }
  }

  const checks = matched.length + missing.length;
  const coverage = checks === 0 ? 100 : (matched.length / checks) * 100;
  const penalty = Math.min(70, rejected.length * 30);
  const ceiling = key === "decisionConsistency" && rejected.length > 0 ? 40 : 100;

  return {
    id: item.id,
    score: round(Math.max(0, Math.min(ceiling, coverage - penalty))),
    weight: item.weight ?? 1,
    matched,
    missing,
    rejected,
  };
}
|
|
164
|
+
/**
 * Chooses the phrases to search for when scoring an item.
 *
 * Explicit keywords win whenever at least one is provided. Otherwise the
 * phrases are the significant terms of the free-text field that belongs to
 * the metric: `statement`, `target`, `check`, `handoff`, or `text` for any
 * other key.
 *
 * @param {object} item - Expected item.
 * @param {string} key - Metric key the item is being scored under.
 * @returns {string[]} Phrases to look for; empty when the item has neither keywords nor the relevant text.
 */
function phrasesFor(item, key) {
    const keywords = item.keywords;
    if (keywords && keywords.length > 0) {
        return unique(keywords);
    }

    let source;
    switch (key) {
        case "decisionConsistency":
            source = item.statement;
            break;
        case "impactAwareness":
            source = item.target;
            break;
        case "verificationCoverage":
            source = item.check;
            break;
        case "continuity":
            source = item.handoff;
            break;
        default:
            source = item.text;
    }

    if (!source) {
        return [];
    }
    return significantTerms(source);
}
|
|
179
|
+
/**
 * Looks for a phrase in the observed corpus and reports where it was found.
 *
 * Sources are probed in a fixed order (text, commands, checks, files) and the
 * first hit decides the reported source; later probes are not evaluated.
 *
 * @param {string} phrase - Phrase to look for.
 * @param {{text: string, files: string[], commands: string[], checks: string[]}} corpus - Observed corpus.
 * @returns {{value: string, source: string} | null} Evidence entry, or null when no source contains the phrase.
 */
function matchPhrase(phrase, corpus) {
    // Each probe is a thunk so that `find` stops evaluating at the first match.
    const probes = [
        ["text", () => includes(corpus.text, phrase)],
        ["command", () => corpus.commands.some((command) => commandMatches(command, phrase))],
        ["check", () => corpus.checks.some((check) => includes(check, phrase))],
        ["file", () => corpus.files.some((file) => samePathOrSuffix(file, phrase))],
    ];
    const hit = probes.find(([, probe]) => probe());
    return hit ? { value: phrase, source: hit[0] } : null;
}
|
|
194
|
+
/**
 * Flattens an observed run into the structure the scorers search.
 *
 * @param {object} observed - Observed run (answer, transcript, filesChanged, commandsRun, checks, artifacts).
 * @returns {{text: string, files: string[], commands: string[], checks: string[]}}
 *   `text` is the normalized concatenation of every textual part of the run;
 *   `files` and `commands` are the raw lists; `checks` holds one
 *   "name status command" string per check.
 */
function buildCorpus(observed) {
    // Joins only the parts that are present, so absent fields leave no stray separators.
    const joinPresent = (parts, separator) => parts.filter(Boolean).join(separator);

    const files = observed.filesChanged ?? [];
    const commands = observed.commandsRun ?? [];
    const checks = (observed.checks ?? []).map((check) => joinPresent([check.name, check.status, check.command], " "));
    const artifactLines = (observed.artifacts ?? []).map((artifact) => joinPresent([artifact.path, artifact.content], " "));

    const combined = joinPresent([
        observed.answer,
        observed.transcript,
        files.join("\n"),
        commands.join("\n"),
        checks.join("\n"),
        artifactLines.join("\n"),
    ], "\n");

    return {
        text: normalize(combined),
        files,
        commands,
        checks,
    };
}
|
|
218
|
+
/**
 * Validates the minimal shape of an eval case.
 *
 * @param {unknown} input - Candidate eval case.
 * @returns {void}
 * @throws {Error} When `input` is not an object, or when its `id` is not a non-empty string.
 */
function assertCase(input) {
    const isObject = Boolean(input) && typeof input === "object";
    if (!isObject) {
        throw new Error("Eval case must be an object.");
    }
    const hasStringId = typeof input.id === "string" && input.id.length > 0;
    if (!hasStringId) {
        throw new Error("Eval case requires a string id.");
    }
}
|
|
226
|
+
/**
 * Derives search terms from free text.
 *
 * The text is normalized and split on spaces; the first twelve terms of at
 * least four characters are kept and then de-duplicated, so the result can
 * hold fewer than twelve entries.
 *
 * @param {string} value - Free text describing an expectation.
 * @returns {string[]} Distinct terms in order of first appearance.
 */
function significantTerms(value) {
    const kept = [];
    for (const term of normalize(value).split(" ")) {
        if (term.length < 4) {
            continue;
        }
        kept.push(term);
        if (kept.length === 12) {
            break;
        }
    }
    return unique(kept);
}
|
|
232
|
+
/**
 * Maps a score to a status relative to a threshold.
 *
 * @param {number} score - Score to classify.
 * @param {number} threshold - Minimum score for "pass".
 * @returns {"pass" | "warn" | "fail"} "pass" at or above the threshold, "warn" within
 *   15 points below it (the warn floor never drops under 0), otherwise "fail".
 */
function statusFor(score, threshold) {
    const warnFloor = Math.max(0, threshold - 15);
    if (score >= threshold) {
        return "pass";
    }
    return score >= warnFloor ? "warn" : "fail";
}
|
|
241
|
+
/**
 * Reports whether `needle` occurs in `haystack` once both are normalized.
 *
 * A needle that normalizes to the empty string (for example one made only of
 * quotes, punctuation or other stripped characters) never matches.
 * `String.prototype.includes("")` is true for every string, so without this
 * guard a meaningless phrase would count as evidence against any corpus.
 *
 * @param {string} haystack - Text to search.
 * @param {string} needle - Phrase to look for.
 * @returns {boolean} True when the normalized needle is non-empty and is a
 *   substring of the normalized haystack.
 */
function includes(haystack, needle) {
    const wanted = normalize(needle);
    if (wanted === "") {
        return false;
    }
    return normalize(haystack).includes(wanted);
}
|
|
244
|
+
/**
 * Reports whether an observed command satisfies an expected command.
 *
 * Both sides are normalized first; the observed command matches when it
 * equals or contains the expected one. An expectation that normalizes to the
 * empty string never matches, because an empty substring is contained in
 * every command and would make any run look verified.
 *
 * @param {string} command - Command that was actually run.
 * @param {string} expected - Command (or fragment) that was expected.
 * @returns {boolean} True when the normalized expectation is non-empty and
 *   occurs in the normalized command.
 */
function commandMatches(command, expected) {
    const wanted = normalize(expected);
    if (wanted === "") {
        return false;
    }
    // Substring containment already covers exact equality.
    return normalize(command).includes(wanted);
}
|
|
249
|
+
/**
 * Reports whether an observed path is the expected path or ends with it.
 *
 * Both paths go through `normalizePath` before comparison. A suffix match
 * must begin at a `/` boundary, so a partial file name does not match.
 *
 * @param {string} actual - Observed path.
 * @param {string} expected - Expected path, possibly a trailing portion of the observed one.
 * @returns {boolean} True on an exact match or a match of the trailing path segments.
 */
function samePathOrSuffix(actual, expected) {
    const observedPath = normalizePath(actual);
    const wantedPath = normalizePath(expected);
    if (observedPath === wantedPath) {
        return true;
    }
    return observedPath.endsWith(`/${wantedPath}`);
}
|
|
254
|
+
/**
 * Canonicalizes a path for comparison.
 *
 * Surrounding whitespace is removed first, so that a leading `./` or `/` is
 * still recognized on padded input. Backslashes then become forward slashes,
 * one leading `./` or `/` is dropped, and the result is lower-cased.
 *
 * @param {string} value - Path as written in the case or reported by the run.
 * @returns {string} Lower-cased, forward-slash path without a leading `./` or `/`.
 */
function normalizePath(value) {
    return value
        .trim()
        .replaceAll("\\", "/")
        .replace(/^\.?\//, "")
        .toLowerCase();
}
|
|
257
|
+
/**
 * Reduces text to a canonical form for substring comparison.
 *
 * The text is lower-cased; backticks and single/double quotes are removed;
 * every run of characters outside `a-z 0-9 . / : _ + -` becomes one space;
 * whitespace is collapsed and the ends are trimmed.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} Normalized text.
 */
function normalize(value) {
    const lowered = value.toLowerCase();
    const unquoted = lowered.replace(/[`"']/g, "");
    const spaced = unquoted.replace(/[^a-z0-9./:_+-]+/g, " ");
    const collapsed = spaced.replace(/\s+/g, " ");
    return collapsed.trim();
}
|
|
265
|
+
/**
 * Trims every string, drops the empty ones and removes duplicates.
 *
 * @param {string[]} values - Strings to clean up.
 * @returns {string[]} Distinct non-empty trimmed strings in order of first appearance.
 */
function unique(values) {
    const seen = new Set();
    values.forEach((value) => {
        const trimmed = value.trim();
        if (trimmed) {
            seen.add(trimmed);
        }
    });
    return Array.from(seen);
}
|
|
268
|
+
/**
 * Rounds a number to one decimal place.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` rounded to the nearest tenth.
 */
function round(value) {
    const tenths = Math.round(value * 10);
    return tenths / 10;
}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/** Identifiers of the five metrics, declared as a readonly tuple of string literals. */
export declare const metricKeys: readonly ["contextPreservation", "decisionConsistency", "impactAwareness", "verificationCoverage", "continuity"];
/** Union of the string literals listed in `metricKeys`. */
export type MetricKey = (typeof metricKeys)[number];
/** Outcome label carried by metric scores, case results and suite results. */
export type MetricStatus = "pass" | "warn" | "fail" | "not_applicable";
|
|
4
|
+
/** One expected signal. Only `id` is required; every other field is optional. */
export type ExpectedItem = {
    /** Identifier of the item. */
    id: string;
    /** Free-text description of the expectation. */
    text?: string;
    /** Free-text statement of a decision. */
    statement?: string;
    /** Free-text description of an impact target. */
    target?: string;
    /** Free-text description of a verification check. */
    check?: string;
    /** Command associated with the expectation. */
    command?: string;
    /** Free-text description of a handoff. */
    handoff?: string;
    /** Explicit phrases associated with the expectation. */
    keywords?: string[];
    /** Phrases that are rejected for this expectation. */
    rejectedKeywords?: string[];
    /** File paths associated with the expectation. */
    files?: string[];
    /** Numeric weight of the item. */
    weight?: number;
};
|
|
17
|
+
/** Expected items grouped by category; every group is optional. */
export type ExpectedSignals = {
    /** Items describing expected context. */
    context?: ExpectedItem[];
    /** Items describing expected decisions. */
    decisions?: ExpectedItem[];
    /** Items describing expected impact. */
    impact?: ExpectedItem[];
    /** Items describing expected verification. */
    verification?: ExpectedItem[];
    /** Items describing expected continuity. */
    continuity?: ExpectedItem[];
};
|
|
24
|
+
/** A check reported by an observed run. */
export type ObservedCheck = {
    /** Name of the check. */
    name: string;
    /** Reported outcome of the check, when known. */
    status?: "pass" | "fail" | "skipped" | "unknown";
    /** Command associated with the check, when known. */
    command?: string;
};
|
|
29
|
+
/** An artifact attached to an observed run; both fields are optional. */
export type ObservedArtifact = {
    /** Path of the artifact. */
    path?: string;
    /** Textual content of the artifact. */
    content?: string;
};
|
|
33
|
+
/** What was observed from a run; every field is optional. */
export type ObservedRun = {
    /** Answer text of the run. */
    answer?: string;
    /** Transcript text of the run. */
    transcript?: string;
    /** Paths of the files that were changed. */
    filesChanged?: string[];
    /** Commands that were run. */
    commandsRun?: string[];
    /** Checks reported by the run. */
    checks?: ObservedCheck[];
    /** Artifacts attached to the run. */
    artifacts?: ObservedArtifact[];
};
|
|
41
|
+
/** An evaluation case: the expected signals, the observed run and optional tuning. */
export type EvalCase = {
    /** Identifier of the case. */
    id: string;
    /** Display name of the case. */
    name?: string;
    /** Description of what the case evaluates. */
    description?: string;
    /** Free-form tags. */
    tags?: string[];
    /** Signals the run is expected to show. */
    expected?: ExpectedSignals;
    /** What the run actually produced. */
    observed?: ObservedRun;
    /** Optional numeric weight per metric key. */
    weights?: Partial<Record<MetricKey, number>>;
    /** Optional numeric threshold per metric key, plus one for "overall". */
    thresholds?: Partial<Record<MetricKey | "overall", number>>;
};
|
|
51
|
+
/** One piece of evidence recorded while scoring an item. */
export type EvidenceMatch = {
    /** The phrase, path or command the evidence refers to. */
    value: string;
    /** Kind of source the evidence is attributed to. */
    source: "text" | "file" | "command" | "check" | "negative";
};
|
|
55
|
+
/** Scoring result for a single expected item. */
export type ItemScore = {
    /** Identifier of the scored item. */
    id: string;
    /** Numeric score of the item. */
    score: number;
    /** Numeric weight of the item. */
    weight: number;
    /** Evidence entries that were found. */
    matched: EvidenceMatch[];
    /** Evidence entries that were expected but not found. */
    missing: EvidenceMatch[];
    /** Evidence entries for rejected phrases. */
    rejected: EvidenceMatch[];
};
|
|
63
|
+
/** Scoring result for one metric. */
export type MetricScore = {
    /** Metric this score belongs to. */
    key: MetricKey;
    /** Human-readable label of the metric. */
    label: string;
    /** Numeric score of the metric. */
    score: number;
    /** Status of the metric. */
    status: MetricStatus;
    /** Threshold associated with the metric. */
    threshold: number;
    /** Per-item scores belonging to the metric. */
    items: ItemScore[];
    /** Human-readable summary of the metric result. */
    summary: string;
};
|
|
72
|
+
/** Result of evaluating one case. */
export type EvalResult = {
    /** Identifier of the evaluated case. */
    id: string;
    /** Display name of the evaluated case, when it has one. */
    name?: string;
    /** Aggregate outcome for the case. */
    overall: {
        /** Numeric overall score. */
        score: number;
        /** Overall status. */
        status: MetricStatus;
        /** Threshold associated with the overall score. */
        threshold: number;
    };
    /** Per-metric results. */
    metrics: MetricScore[];
};
|
|
82
|
+
/** Result of evaluating a suite of cases. */
export type EvalSuiteResult = {
    /** Numeric score of the suite. */
    score: number;
    /** Status of the suite. */
    status: MetricStatus;
    /** Presumably the number of cases that passed; confirm against the producer. */
    passed: number;
    /** Presumably the number of cases that failed; confirm against the producer. */
    failed: number;
    /** Individual case results. */
    cases: EvalResult[];
};
|
package/dist/types.js
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "mini-snipara-bootstrap",
|
|
3
|
+
"name": "Mini Snipara repository bootstrap",
|
|
4
|
+
"description": "Scores whether an agent preserved the OSS boundary while preparing a public repository.",
|
|
5
|
+
"expected": {
|
|
6
|
+
"context": [
|
|
7
|
+
{
|
|
8
|
+
"id": "public-stack",
|
|
9
|
+
"text": "Mini Snipara is the local OSS stack: companion, memory, evals.",
|
|
10
|
+
"keywords": ["Mini Snipara", "companion", "memory", "evals"]
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"id": "license",
|
|
14
|
+
"text": "snipara-evals uses Apache-2.0.",
|
|
15
|
+
"keywords": ["Apache-2.0", "snipara-evals"]
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"decisions": [
|
|
19
|
+
{
|
|
20
|
+
"id": "hosted-boundary",
|
|
21
|
+
"statement": "Code graph and source authority remain hosted Snipara features.",
|
|
22
|
+
"keywords": ["code graph", "hosted", "source authority"],
|
|
23
|
+
"rejectedKeywords": ["open source code graph"]
|
|
24
|
+
}
|
|
25
|
+
],
|
|
26
|
+
"impact": [
|
|
27
|
+
{
|
|
28
|
+
"id": "cli-package",
|
|
29
|
+
"target": "The package exposes a CLI and library API.",
|
|
30
|
+
"keywords": ["CLI", "library API"],
|
|
31
|
+
"files": ["src/cli.ts", "src/index.ts"]
|
|
32
|
+
}
|
|
33
|
+
],
|
|
34
|
+
"verification": [
|
|
35
|
+
{
|
|
36
|
+
"id": "package-checks",
|
|
37
|
+
"check": "Run build, type-check, lint, tests, and pack smoke.",
|
|
38
|
+
"command": "pnpm test",
|
|
39
|
+
"keywords": ["pnpm build", "pnpm type-check", "pnpm lint", "pnpm test", "pack smoke"]
|
|
40
|
+
}
|
|
41
|
+
],
|
|
42
|
+
"continuity": [
|
|
43
|
+
{
|
|
44
|
+
"id": "handoff",
|
|
45
|
+
"handoff": "Leave a concise summary and next step for another agent.",
|
|
46
|
+
"keywords": ["summary", "next step", "handoff"]
|
|
47
|
+
}
|
|
48
|
+
]
|
|
49
|
+
},
|
|
50
|
+
"observed": {
|
|
51
|
+
"answer": "Prepared snipara-evals as part of Mini Snipara with companion, memory, and evals. The repo keeps code graph and source authority hosted, uses Apache-2.0, and exposes both a CLI and library API. Verification: pnpm build, pnpm type-check, pnpm lint, pnpm test, and pack smoke. Handoff summary: next step is publishing the package when npm auth is available.",
|
|
52
|
+
"filesChanged": ["src/cli.ts", "src/index.ts", "README.md", "package.json"],
|
|
53
|
+
"commandsRun": ["pnpm test"]
|
|
54
|
+
}
|
|
55
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "snipara-evals",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Project Intelligence evals for AI coding agents.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"snipara-evals": "dist/cli.js"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"dist",
|
|
11
|
+
"examples",
|
|
12
|
+
"README.md",
|
|
13
|
+
"LICENSE"
|
|
14
|
+
],
|
|
15
|
+
"scripts": {
|
|
16
|
+
"build": "tsc -p tsconfig.json",
|
|
17
|
+
"type-check": "tsc --noEmit -p tsconfig.json",
|
|
18
|
+
"lint": "eslint . --max-warnings=0",
|
|
19
|
+
"test": "pnpm build && node --test test/*.test.js",
|
|
20
|
+
"prepack": "pnpm build",
|
|
21
|
+
"prepublishOnly": "pnpm type-check && pnpm lint && pnpm test",
|
|
22
|
+
"pack:smoke": "npm pack --dry-run --json && node dist/cli.js --help"
|
|
23
|
+
},
|
|
24
|
+
"keywords": [
|
|
25
|
+
"ai-evals",
|
|
26
|
+
"agent-evals",
|
|
27
|
+
"ai-coding-agents",
|
|
28
|
+
"project-intelligence",
|
|
29
|
+
"context-evaluation",
|
|
30
|
+
"verification",
|
|
31
|
+
"snipara"
|
|
32
|
+
],
|
|
33
|
+
"author": "Snipara",
|
|
34
|
+
"license": "Apache-2.0",
|
|
35
|
+
"packageManager": "pnpm@10.28.0",
|
|
36
|
+
"publishConfig": {
|
|
37
|
+
"access": "public"
|
|
38
|
+
},
|
|
39
|
+
"repository": {
|
|
40
|
+
"type": "git",
|
|
41
|
+
"url": "git+https://github.com/Snipara/snipara-evals.git"
|
|
42
|
+
},
|
|
43
|
+
"bugs": {
|
|
44
|
+
"url": "https://github.com/Snipara/snipara-evals/issues"
|
|
45
|
+
},
|
|
46
|
+
"homepage": "https://github.com/Snipara/snipara-evals#readme",
|
|
47
|
+
"engines": {
|
|
48
|
+
"node": ">=20"
|
|
49
|
+
},
|
|
50
|
+
"devDependencies": {
|
|
51
|
+
"@eslint/js": "^9.39.1",
|
|
52
|
+
"@types/node": "^24.10.1",
|
|
53
|
+
"eslint": "^9.39.1",
|
|
54
|
+
"typescript": "^5.9.3",
|
|
55
|
+
"typescript-eslint": "^8.46.4"
|
|
56
|
+
}
|
|
57
|
+
}
|