@markbrutx/promptbook-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +53 -0
  3. package/dist/annotations.d.ts +56 -0
  4. package/dist/annotations.d.ts.map +1 -0
  5. package/dist/annotations.js +50 -0
  6. package/dist/annotations.js.map +1 -0
  7. package/dist/bundle.d.ts +44 -0
  8. package/dist/bundle.d.ts.map +1 -0
  9. package/dist/bundle.js +135 -0
  10. package/dist/bundle.js.map +1 -0
  11. package/dist/edge/index.js +192 -0
  12. package/dist/edge.d.ts +12 -0
  13. package/dist/edge.d.ts.map +1 -0
  14. package/dist/edge.js +11 -0
  15. package/dist/edge.js.map +1 -0
  16. package/dist/eval/assertions.d.ts +15 -0
  17. package/dist/eval/assertions.d.ts.map +1 -0
  18. package/dist/eval/assertions.js +131 -0
  19. package/dist/eval/assertions.js.map +1 -0
  20. package/dist/eval/evaluate.d.ts +15 -0
  21. package/dist/eval/evaluate.d.ts.map +1 -0
  22. package/dist/eval/evaluate.js +65 -0
  23. package/dist/eval/evaluate.js.map +1 -0
  24. package/dist/eval/load-fixtures.d.ts +12 -0
  25. package/dist/eval/load-fixtures.d.ts.map +1 -0
  26. package/dist/eval/load-fixtures.js +87 -0
  27. package/dist/eval/load-fixtures.js.map +1 -0
  28. package/dist/eval/types.d.ts +123 -0
  29. package/dist/eval/types.d.ts.map +1 -0
  30. package/dist/eval/types.js +2 -0
  31. package/dist/eval/types.js.map +1 -0
  32. package/dist/frontmatter.d.ts +12 -0
  33. package/dist/frontmatter.d.ts.map +1 -0
  34. package/dist/frontmatter.js +22 -0
  35. package/dist/frontmatter.js.map +1 -0
  36. package/dist/fs.d.ts +11 -0
  37. package/dist/fs.d.ts.map +1 -0
  38. package/dist/fs.js +20 -0
  39. package/dist/fs.js.map +1 -0
  40. package/dist/guards.d.ts +6 -0
  41. package/dist/guards.d.ts.map +1 -0
  42. package/dist/guards.js +9 -0
  43. package/dist/guards.js.map +1 -0
  44. package/dist/index.d.ts +22 -0
  45. package/dist/index.d.ts.map +1 -0
  46. package/dist/index.js +15 -0
  47. package/dist/index.js.map +1 -0
  48. package/dist/interpolate.d.ts +11 -0
  49. package/dist/interpolate.d.ts.map +1 -0
  50. package/dist/interpolate.js +25 -0
  51. package/dist/interpolate.js.map +1 -0
  52. package/dist/lint/lint.d.ts +11 -0
  53. package/dist/lint/lint.d.ts.map +1 -0
  54. package/dist/lint/lint.js +30 -0
  55. package/dist/lint/lint.js.map +1 -0
  56. package/dist/lint/references.d.ts +18 -0
  57. package/dist/lint/references.d.ts.map +1 -0
  58. package/dist/lint/references.js +39 -0
  59. package/dist/lint/references.js.map +1 -0
  60. package/dist/lint/rules/banned-tokens.d.ts +13 -0
  61. package/dist/lint/rules/banned-tokens.d.ts.map +1 -0
  62. package/dist/lint/rules/banned-tokens.js +38 -0
  63. package/dist/lint/rules/banned-tokens.js.map +1 -0
  64. package/dist/lint/rules/dangling-reference.d.ts +11 -0
  65. package/dist/lint/rules/dangling-reference.d.ts.map +1 -0
  66. package/dist/lint/rules/dangling-reference.js +37 -0
  67. package/dist/lint/rules/dangling-reference.js.map +1 -0
  68. package/dist/lint/rules/dead-rule.d.ts +21 -0
  69. package/dist/lint/rules/dead-rule.d.ts.map +1 -0
  70. package/dist/lint/rules/dead-rule.js +135 -0
  71. package/dist/lint/rules/dead-rule.js.map +1 -0
  72. package/dist/lint/rules/example-balance.d.ts +19 -0
  73. package/dist/lint/rules/example-balance.d.ts.map +1 -0
  74. package/dist/lint/rules/example-balance.js +57 -0
  75. package/dist/lint/rules/example-balance.js.map +1 -0
  76. package/dist/lint/rules/index.d.ts +28 -0
  77. package/dist/lint/rules/index.d.ts.map +1 -0
  78. package/dist/lint/rules/index.js +30 -0
  79. package/dist/lint/rules/index.js.map +1 -0
  80. package/dist/lint/rules/language-directive-position.d.ts +16 -0
  81. package/dist/lint/rules/language-directive-position.d.ts.map +1 -0
  82. package/dist/lint/rules/language-directive-position.js +42 -0
  83. package/dist/lint/rules/language-directive-position.js.map +1 -0
  84. package/dist/lint/rules/token-budget.d.ts +18 -0
  85. package/dist/lint/rules/token-budget.d.ts.map +1 -0
  86. package/dist/lint/rules/token-budget.js +39 -0
  87. package/dist/lint/rules/token-budget.js.map +1 -0
  88. package/dist/lint/rules/unused-fragment.d.ts +11 -0
  89. package/dist/lint/rules/unused-fragment.d.ts.map +1 -0
  90. package/dist/lint/rules/unused-fragment.js +33 -0
  91. package/dist/lint/rules/unused-fragment.js.map +1 -0
  92. package/dist/lint/types.d.ts +50 -0
  93. package/dist/lint/types.d.ts.map +1 -0
  94. package/dist/lint/types.js +2 -0
  95. package/dist/lint/types.js.map +1 -0
  96. package/dist/load.d.ts +12 -0
  97. package/dist/load.d.ts.map +1 -0
  98. package/dist/load.js +238 -0
  99. package/dist/load.js.map +1 -0
  100. package/dist/paths.d.ts +12 -0
  101. package/dist/paths.d.ts.map +1 -0
  102. package/dist/paths.js +25 -0
  103. package/dist/paths.js.map +1 -0
  104. package/dist/resolve-book.d.ts +15 -0
  105. package/dist/resolve-book.d.ts.map +1 -0
  106. package/dist/resolve-book.js +195 -0
  107. package/dist/resolve-book.js.map +1 -0
  108. package/dist/resolve.d.ts +13 -0
  109. package/dist/resolve.d.ts.map +1 -0
  110. package/dist/resolve.js +17 -0
  111. package/dist/resolve.js.map +1 -0
  112. package/dist/types.d.ts +173 -0
  113. package/dist/types.d.ts.map +1 -0
  114. package/dist/types.js +9 -0
  115. package/dist/types.js.map +1 -0
  116. package/package.json +48 -0
  117. package/src/annotations.ts +100 -0
  118. package/src/bundle.ts +163 -0
  119. package/src/edge.ts +11 -0
  120. package/src/eval/assertions.ts +174 -0
  121. package/src/eval/evaluate.ts +84 -0
  122. package/src/eval/load-fixtures.ts +91 -0
  123. package/src/eval/types.ts +134 -0
  124. package/src/frontmatter.ts +28 -0
  125. package/src/fs.ts +21 -0
  126. package/src/guards.ts +11 -0
  127. package/src/index.ts +84 -0
  128. package/src/interpolate.ts +27 -0
  129. package/src/lint/lint.ts +32 -0
  130. package/src/lint/references.ts +50 -0
  131. package/src/lint/rules/banned-tokens.ts +46 -0
  132. package/src/lint/rules/dangling-reference.ts +43 -0
  133. package/src/lint/rules/dead-rule.ts +147 -0
  134. package/src/lint/rules/example-balance.ts +68 -0
  135. package/src/lint/rules/index.ts +47 -0
  136. package/src/lint/rules/language-directive-position.ts +51 -0
  137. package/src/lint/rules/token-budget.ts +50 -0
  138. package/src/lint/rules/unused-fragment.ts +38 -0
  139. package/src/lint/types.ts +55 -0
  140. package/src/load.ts +282 -0
  141. package/src/paths.ts +27 -0
  142. package/src/resolve-book.ts +237 -0
  143. package/src/resolve.ts +18 -0
  144. package/src/types.ts +191 -0
package/src/bundle.ts ADDED
@@ -0,0 +1,163 @@
1
+ import type { CodePrompt, CodePromptSample, Composition, Fragment, PromptBook, Rule } from "./types.js";
2
+
3
/** Settings accepted by {@link serializeBook}. */
export interface SerializeBookOptions {
  /**
   * Module specifier written into the generated `import type { PromptBook }`
   * line. Defaults to `@markbrutx/promptbook-core`.
   */
  importSpecifier?: string;
  /**
   * Whether to emit the `import type { PromptBook }` line together with the
   * `: PromptBook` annotation. Defaults to `true`. Pass `false` for a plain
   * module whose type is inferred — useful for runtimes that cannot resolve
   * the type-only import (e.g. Deno consuming the raw build); the exported
   * value itself is the same either way.
   */
  typed?: boolean;
}
15
+
16
/**
 * Three-way string comparison using the built-in `<` / `>` operators rather
 * than a locale-aware collator, so serialized output is byte-stable across
 * machines.
 */
function compareKeys(a: string, b: string): number {
  if (a < b) {
    return -1;
  }
  if (a > b) {
    return 1;
  }
  return 0;
}
20
+
21
/** Copy a map's `[key, value]` pairs into an array ordered by key via {@link compareKeys}. */
function sortedEntries<T>(map: Map<string, T>): [string, T][] {
  const pairs = Array.from(map);
  pairs.sort(([leftKey], [rightKey]) => compareKeys(leftKey, rightKey));
  return pairs;
}
25
+
26
/**
 * Return a shallow copy of `obj` without the keys whose value is `undefined`.
 * Remaining keys keep their original insertion order; `null` and other falsy
 * values are kept.
 */
function compact(obj: Record<string, unknown>): Record<string, unknown> {
  const kept: Record<string, unknown> = {};
  for (const key of Object.keys(obj)) {
    const value = obj[key];
    if (value === undefined) {
      continue;
    }
    kept[key] = value;
  }
  return kept;
}
36
+
37
+ // The canonical builders below list every key explicitly via a
38
+ // `Record<keyof Required<…>>` literal: adding a field to the data model is then
39
+ // a compile error here until it is serialized, so the bundler never silently
40
+ // drops data. Key order follows the type declaration; absent optionals are
41
+ // compacted away. `JSON.stringify` of the result is a deterministic literal.
42
+
43
/** Canonical plain-object form of a fragment: fixed key order, absent optionals dropped. */
function canonicalFragment(fragment: Fragment): Record<string, unknown> {
  const { id, kind, tags, body, sourceFile } = fragment;
  // Typed against every key of Fragment so a newly added field fails to compile here.
  const fields: Record<keyof Required<Fragment>, unknown> = { id, kind, tags, body, sourceFile };
  return compact(fields);
}
53
+
54
/** Canonical plain-object form of a rule: fixed key order, absent optionals dropped. */
function canonicalRule(rule: Rule): Record<string, unknown> {
  const { index, when, action, add, after, replace, forbid, order } = rule;
  // Typed against every key of Rule so a newly added field fails to compile here.
  const fields: Record<keyof Required<Rule>, unknown> = {
    index,
    when,
    action,
    add,
    after,
    replace,
    forbid,
    order,
  };
  return compact(fields);
}
67
+
68
/**
 * Canonical plain-object form of a composition: fixed key order, absent
 * optionals dropped, and each rule passed through {@link canonicalRule}.
 */
function canonicalComposition(composition: Composition): Record<string, unknown> {
  const { name, base, order, sourceFile } = composition;
  const rules = composition.rules.map((rule) => canonicalRule(rule));
  // Typed against every key of Composition so a newly added field fails to compile here.
  const fields: Record<keyof Required<Composition>, unknown> = { name, base, order, rules, sourceFile };
  return compact(fields);
}
78
+
79
/** Canonical plain-object form of a code-prompt sample: fixed key order, absent optionals dropped. */
function canonicalCodePromptSample(sample: CodePromptSample): Record<string, unknown> {
  const { label, context, output } = sample;
  // Typed against every key of CodePromptSample so a newly added field fails to compile here.
  const fields: Record<keyof Required<CodePromptSample>, unknown> = { label, context, output };
  return compact(fields);
}
87
+
88
/**
 * Canonical plain-object form of a code prompt: fixed key order, absent
 * optionals dropped, and each sample passed through
 * {@link canonicalCodePromptSample}.
 */
function canonicalCodePrompt(codePrompt: CodePrompt): Record<string, unknown> {
  const { name, description, sourceFile } = codePrompt;
  const samples = codePrompt.samples.map((sample) => canonicalCodePromptSample(sample));
  // Typed against every key of CodePrompt so a newly added field fails to compile here.
  const fields: Record<keyof Required<CodePrompt>, unknown> = { name, description, samples, sourceFile };
  return compact(fields);
}
97
+
98
/**
 * Render already-sorted entries as a `new Map([...])` expression, one
 * `[key, value]` row per line. Keys and canonicalized values are written with
 * `JSON.stringify`; an empty list renders as `new Map([])`.
 */
function serializeMap<T>(entries: [string, T][], canonical: (value: T) => unknown): string {
  if (entries.length === 0) {
    return "new Map([])";
  }
  const rows: string[] = [];
  for (const [key, value] of entries) {
    const keyLiteral = JSON.stringify(key);
    const valueLiteral = JSON.stringify(canonical(value));
    rows.push(` [${keyLiteral}, ${valueLiteral}],`);
  }
  return `new Map([\n${rows.join("\n")}\n ])`;
}
108
+
109
/**
 * Render a {@link PromptBook} as a deterministic, evaluable expression of the
 * form `{ fragments: new Map([...]), compositions: new Map([...]),
 * codePrompts: new Map([...]), warnings: [...] }`.
 *
 * The text contains only `new Map`, arrays, and object/scalar literals, so it
 * can be embedded in a module as-is. Map entries are key-sorted and absent
 * optional fields are omitted, so the same book always yields byte-identical
 * output.
 */
export function serializeBookExpression(book: PromptBook): string {
  const members = [
    ` fragments: ${serializeMap(sortedEntries(book.fragments), canonicalFragment)},`,
    ` compositions: ${serializeMap(sortedEntries(book.compositions), canonicalComposition)},`,
    ` codePrompts: ${serializeMap(sortedEntries(book.codePrompts), canonicalCodePrompt)},`,
    ` warnings: ${JSON.stringify(book.warnings)},`,
  ];
  return ["{", ...members, "}"].join("\n");
}
126
+
127
/**
 * Serialize a {@link PromptBook} to an importable TypeScript module exporting
 * `book` (also as the default export). Folding a prompts folder into a single
 * module lets a runtime (e.g. a Deno edge function) import the book instead of
 * reading the disk. Deterministic: the same book and options yield the same
 * module text.
 *
 * The `: PromptBook` annotation is required so literal rule actions narrow to
 * `RuleAction` via contextual typing; pass `typed: false` (or use
 * {@link serializeBookExpression}) for the annotation-free value.
 *
 * @param book - The book to serialize.
 * @param options - See {@link SerializeBookOptions}.
 * @returns The module source text, ending with a newline.
 */
export function serializeBook(book: PromptBook, options: SerializeBookOptions = {}): string {
  const typed = options.typed !== false;
  const importSpecifier = options.importSpecifier ?? "@markbrutx/promptbook-core";

  const lines = ["// Code generated by `promptbook bundle`; do not edit by hand."];
  if (typed) {
    // JSON.stringify yields a valid double-quoted string literal, so a
    // specifier containing `"` or `\` cannot break (or inject into) the
    // generated module. Ordinary specifiers are emitted exactly as before.
    lines.push(`import type { PromptBook } from ${JSON.stringify(importSpecifier)};`);
  }
  lines.push(
    "",
    `export const book${typed ? ": PromptBook" : ""} = ${serializeBookExpression(book)};`,
    "",
    "export default book;",
    "",
  );
  return lines.join("\n");
}
150
+
151
/**
 * Serialize a {@link PromptBook} to a deterministic JSON dump: `fragments`,
 * `compositions` and `codePrompts` as key-sorted arrays of their canonical
 * objects, followed by `warnings`. Output is indented by two spaces and ends
 * with a newline.
 *
 * Uses the same canonical builders and key sort as {@link serializeBook}, so
 * the JSON and module outputs never drift.
 */
export function serializeBookJson(book: PromptBook): string {
  const dump = {
    fragments: sortedEntries(book.fragments).map((entry) => canonicalFragment(entry[1])),
    compositions: sortedEntries(book.compositions).map((entry) => canonicalComposition(entry[1])),
    codePrompts: sortedEntries(book.codePrompts).map((entry) => canonicalCodePrompt(entry[1])),
    warnings: book.warnings,
  };
  const json = JSON.stringify(dump, null, 2);
  return `${json}\n`;
}
package/src/edge.ts ADDED
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Self-contained entrypoint for edge / Deno runtimes that only need to resolve
3
+ * an already-bundled book at request time (no folder loading). Its module graph
4
+ * is just {@link resolveBook} + {@link interpolate} — zero filesystem, YAML, or
5
+ * Node builtins — so `scripts/build-edge.mjs` bundles it to one portable ESM
6
+ * file with no external imports. Consumers (e.g. a Supabase edge function) vendor
7
+ * that file and pair it with a `promptbook bundle` book.
8
+ */
9
+ export { interpolate } from "./interpolate.js";
10
+ export { resolveBook } from "./resolve-book.js";
11
+ export type { Context, PromptBook, ResolveResult, Trace } from "./types.js";
@@ -0,0 +1,174 @@
1
+ /**
2
+ * Built-in, pluggable assertions for the eval engine.
3
+ *
4
+ * Each checker is pure: it inspects a model output against an {@link Assertion}
5
+ * spec and returns an {@link AssertionResult}. Callers can pass their own
6
+ * registry to {@link evaluate} to add or replace checkers, exactly like lint
7
+ * rules. The `language` checker is a script heuristic, not an LLM judge.
8
+ */
9
+ import type { Assertion, AssertionFn, AssertionRegistry, AssertionResult } from "./types.js";
10
+
11
/** Maximum excerpt length, in UTF-16 code units, before the ellipsis is appended. */
const EXCERPT_LIMIT = 160;

/**
 * A short, single-line excerpt of the output for failure messages.
 *
 * Whitespace runs are collapsed to one space and the ends trimmed. Text longer
 * than {@link EXCERPT_LIMIT} is cut and suffixed with `…`; the cut never
 * splits a surrogate pair, so the excerpt contains no lone surrogate.
 */
function excerpt(text: string): string {
  const collapsed = text.replace(/\s+/g, " ").trim();
  if (collapsed.length <= EXCERPT_LIMIT) {
    return collapsed;
  }
  let end = EXCERPT_LIMIT;
  const lastUnit = collapsed.charCodeAt(end - 1);
  // A high surrogate at the cut point means its low half lies past the limit:
  // drop the high half too rather than emit half a character.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return `${collapsed.slice(0, end)}…`;
}
18
+
19
/** Build an {@link AssertionResult}, attaching an {@link excerpt} of the checked output. */
function result(type: string, pass: boolean, message: string, output: string): AssertionResult {
  const summary = excerpt(output);
  return { type, pass, message, excerpt: summary };
}
22
+
23
/**
 * Return a required spec field, or throw a descriptive error when it is
 * `undefined`. Other falsy values (`0`, `""`, `null`) are returned as-is.
 */
function need<T>(value: T | undefined, type: string, field: string): T {
  if (value !== undefined) {
    return value;
  }
  throw new Error(`assertion "${type}" requires a "${field}" field.`);
}
30
+
31
/**
 * Compile the assertion's `pattern` (with its optional `flags`) into a RegExp.
 * Throws when `pattern` is missing, or when the pattern/flags are rejected by
 * the `RegExp` constructor; both errors name the assertion type.
 */
function buildRegex(assertion: Assertion): RegExp {
  const { type, flags } = assertion;
  const source = need(assertion.pattern, type, "pattern");
  try {
    return new RegExp(source, flags);
  } catch (cause) {
    const reason = (cause as Error).message;
    throw new Error(`assertion "${type}" has an invalid pattern: ${reason}`);
  }
}
39
+
40
/** `contains`: passes when the output includes the spec's `value` substring. */
const contains: AssertionFn = (output, a) => {
  const needle = need(a.value, "contains", "value");
  if (output.includes(needle)) {
    return result("contains", true, `contains "${needle}"`, output);
  }
  return result("contains", false, `missing "${needle}"`, output);
};
45
+
46
/** `not-contains`: passes when the output does not include the spec's `value` substring. */
const notContains: AssertionFn = (output, a) => {
  const needle = need(a.value, "not-contains", "value");
  const found = output.includes(needle);
  const message = found ? `unexpectedly contains "${needle}"` : `does not contain "${needle}"`;
  return result("not-contains", !found, message, output);
};
56
+
57
/** `matches`: passes when the spec's regex (`pattern` + `flags`) matches the output. */
const matches: AssertionFn = (output, a) => {
  const re = buildRegex({ ...a, type: "matches" });
  if (re.test(output)) {
    return result("matches", true, `matches /${re.source}/`, output);
  }
  return result("matches", false, `does not match /${re.source}/`, output);
};
62
+
63
/** `not-matches`: passes when the spec's regex (`pattern` + `flags`) does not match the output. */
const notMatches: AssertionFn = (output, a) => {
  const re = buildRegex({ ...a, type: "not-matches" });
  const found = re.test(output);
  const message = found ? `unexpectedly matches /${re.source}/` : `does not match /${re.source}/`;
  return result("not-matches", !found, message, output);
};
73
+
74
/** `equals`: passes when the output and the spec's `value` are identical after trimming both. */
const equals: AssertionFn = (output, a) => {
  const expected = need(a.value, "equals", "value");
  const same = output.trim() === expected.trim();
  const message = same ? "equals expected text" : "does not equal expected text";
  return result("equals", same, message, output);
};
79
+
80
/** `json-valid`: passes when `JSON.parse` accepts the whole output; the parse error is reported otherwise. */
const jsonValid: AssertionFn = (output) => {
  let failure: unknown;
  let parsed = true;
  try {
    JSON.parse(output);
  } catch (error) {
    parsed = false;
    failure = error;
  }
  if (parsed) {
    return result("json-valid", true, "output is valid JSON", output);
  }
  return result("json-valid", false, `output is not valid JSON: ${(failure as Error).message}`, output);
};
88
+
89
/** `max-length`: passes when `output.length` is at most the spec's `max` (the limit itself passes). */
const maxLength: AssertionFn = (output, a) => {
  const limit = need(a.max, "max-length", "max");
  const size = output.length;
  if (size <= limit) {
    return result("max-length", true, `length ${size} within ${limit}`, output);
  }
  return result("max-length", false, `length ${size} exceeds ${limit}`, output);
};
99
+
100
/** Map common primary language subtags to the script the `language` heuristic checks. */
const SCRIPT_BY_LANGUAGE: Record<string, "cyrillic" | "latin"> = {
  ru: "cyrillic",
  uk: "cyrillic",
  be: "cyrillic",
  bg: "cyrillic",
  sr: "cyrillic",
  mk: "cyrillic",
  en: "latin",
  fr: "latin",
  es: "latin",
  de: "latin",
  it: "latin",
  pt: "latin",
  nl: "latin",
  pl: "latin",
  cs: "latin",
  ro: "latin",
  tr: "latin",
  sv: "latin",
  da: "latin",
  no: "latin",
  fi: "latin",
};

const CYRILLIC = /\p{Script=Cyrillic}/gu;
const LATIN = /\p{Script=Latin}/gu;

/** Number of matches of a global regex in `text` (0 when there are none). */
function count(text: string, re: RegExp): number {
  return text.match(re)?.length ?? 0;
}

/**
 * Resolve a lower-cased `lang` spec to a supported script, or `undefined`.
 *
 * Accepts a script name ("cyrillic"/"latin"), a bare language tag ("ru"), or a
 * tag with further subtags separated by `-` or `_` ("en-US", "pt_BR",
 * "sr-Latn-RS"). An explicit `latn`/`cyrl` script subtag wins over the
 * language's default script.
 */
function scriptForLanguage(lang: string): "cyrillic" | "latin" | undefined {
  if (lang === "cyrillic" || lang === "latin") {
    return lang;
  }
  const [primary = "", ...subtags] = lang.split(/[-_]/);
  if (subtags.includes("cyrl")) {
    return "cyrillic";
  }
  if (subtags.includes("latn")) {
    return "latin";
  }
  // Own-key lookup so tags such as "constructor" never resolve to an
  // inherited Object.prototype member.
  return Object.prototype.hasOwnProperty.call(SCRIPT_BY_LANGUAGE, primary)
    ? SCRIPT_BY_LANGUAGE[primary]
    : undefined;
}

/**
 * `language`: script-based heuristic. Resolves the spec `lang`
 * (case-insensitively) to a script and passes when that script has at least
 * one letter in the output and is at least as frequent as the other script.
 * An unresolvable `lang` fails with an "unsupported language" message. Not an
 * LLM judge — it detects script, not fluency.
 */
const language: AssertionFn = (output, a) => {
  const lang = need(a.lang, "language", "lang").toLowerCase();
  const script = scriptForLanguage(lang);
  if (script === undefined) {
    return result("language", false, `unsupported language "${a.lang}" (no script heuristic)`, output);
  }
  const cyrillic = count(output, CYRILLIC);
  const latin = count(output, LATIN);
  const expected = script === "cyrillic" ? cyrillic : latin;
  const other = script === "cyrillic" ? latin : cyrillic;
  const pass = expected > 0 && expected >= other;
  return result(
    "language",
    pass,
    pass
      ? `output is predominantly ${script}`
      : `expected predominantly ${script} (cyrillic=${cyrillic}, latin=${latin})`,
    output,
  );
};
158
+
159
/**
 * Build a fresh registry holding the eight built-in checkers. To customize,
 * pass a merged/replaced object to {@link evaluate}:
 * `{ ...defaultAssertions(), myType: fn }`.
 */
export function defaultAssertions(): AssertionRegistry {
  const registry: AssertionRegistry = {};
  registry["contains"] = contains;
  registry["not-contains"] = notContains;
  registry["matches"] = matches;
  registry["not-matches"] = notMatches;
  registry["equals"] = equals;
  registry["json-valid"] = jsonValid;
  registry["max-length"] = maxLength;
  registry["language"] = language;
  return registry;
}
@@ -0,0 +1,84 @@
1
+ import { resolveBook } from "../resolve-book.js";
2
+ import { defaultAssertions } from "./assertions.js";
3
+ import type {
4
+ Assertion,
5
+ AssertionRegistry,
6
+ AssertionResult,
7
+ EvalInput,
8
+ EvalReport,
9
+ Fixture,
10
+ FixtureResult,
11
+ } from "./types.js";
12
+
13
/**
 * Run every assertion of a fixture against one model output.
 *
 * Checkers are looked up as the registry's own properties only. An assertion
 * whose `type` has no own, callable entry — including names such as
 * `toString`, `constructor` or `__proto__` that exist on `Object.prototype` —
 * yields a failing "unknown assertion type" result instead of invoking an
 * inherited member.
 *
 * @returns One result per assertion, in the order given.
 */
function runAssertions(output: string, asserts: Assertion[], registry: AssertionRegistry): AssertionResult[] {
  return asserts.map((assertion) => {
    const registered = Object.prototype.hasOwnProperty.call(registry, assertion.type);
    const fn = registered ? registry[assertion.type] : undefined;
    if (typeof fn !== "function") {
      return { type: assertion.type, pass: false, message: `unknown assertion type "${assertion.type}"` };
    }
    return fn(output, assertion);
  });
}
23
+
24
/**
 * Run fixtures through a model adapter and report the pass-rate.
 *
 * Fixtures are processed one at a time, in order, each via `runFixture`. A
 * fixture meets the gate when its `passRate >= passThreshold` (default 1); the
 * report's `passRate` is the fraction of fixtures that met the gate, and is 1
 * when there are no fixtures.
 *
 * Defaults: the built-in registry from {@link defaultAssertions} and one
 * sample per fixture. The only stochasticity/IO is the injected
 * `adapter.complete`; this function makes no network calls itself.
 */
export async function evaluate(input: EvalInput): Promise<EvalReport> {
  const registry = input.assertions ?? defaultAssertions();
  const sampleCount = input.samples ?? 1;
  const gate = input.passThreshold ?? 1;

  const results: FixtureResult[] = [];
  for (const fixture of input.fixtures) {
    const outcome = await runFixture(fixture, input, registry, sampleCount);
    results.push(outcome);
  }

  let passed = 0;
  for (const outcome of results) {
    if (outcome.passRate >= gate) {
      passed += 1;
    }
  }
  const total = results.length;
  return {
    results,
    passed,
    failed: total - passed,
    passRate: total === 0 ? 1 : passed / total,
  };
}
54
+
55
/**
 * Evaluate one fixture: assemble its system prompt once with
 * {@link resolveBook}, then take `fixture.samples` (or `defaultSamples`)
 * completions sequentially and run the fixture's assertions on each.
 *
 * A sample counts as a pass only when none of its assertions fail; failing
 * assertion results from all samples are accumulated in `failures`.
 * `passRate` is `passes / samples`, or 0 when `samples` is 0.
 */
async function runFixture(
  fixture: Fixture,
  input: EvalInput,
  registry: AssertionRegistry,
  defaultSamples: number,
): Promise<FixtureResult> {
  const context = fixture.context ?? {};
  const system = resolveBook(input.book, fixture.prompt, context).text;
  const samples = fixture.samples ?? defaultSamples;

  const failures: AssertionResult[] = [];
  let passes = 0;
  let taken = 0;
  while (taken < samples) {
    const { text } = await input.adapter.complete({ system, input: fixture.input });
    const failed = runAssertions(text, fixture.assert, registry).filter((outcome) => !outcome.pass);
    if (failed.length > 0) {
      failures.push(...failed);
    } else {
      passes += 1;
    }
    taken += 1;
  }

  const passRate = samples === 0 ? 0 : passes / samples;
  return { name: fixture.name, samples, passes, passRate, failures };
}
@@ -0,0 +1,91 @@
1
+ import { nodeFs } from "../fs.js";
2
+ import { isContextValue, isMapping } from "../guards.js";
3
+ import { joinPath, listFiles, stripExt } from "../paths.js";
4
+ import type { Context, FsAdapter } from "../types.js";
5
+ import type { Assertion, Fixture } from "./types.js";
6
+
7
/**
 * Validate a fixture's optional `context`. Returns `undefined` when it is
 * absent; throws (naming the fixture file) when it is not an object or when
 * any value fails `isContextValue`.
 */
function parseContext(raw: unknown, file: string): Context | undefined {
  if (raw === undefined) {
    return undefined;
  }
  if (!isMapping(raw)) {
    throw new Error(`Fixture "${file}" has a "context" that is not an object.`);
  }
  const parsed: Context = {};
  for (const [key, value] of Object.entries(raw)) {
    if (isContextValue(value)) {
      parsed[key] = value;
      continue;
    }
    throw new Error(`Fixture "${file}" context key "${key}" must be a string, number or boolean.`);
  }
  return parsed;
}
23
+
24
/**
 * Validate a fixture's `assert` array and return it as {@link Assertion}s.
 *
 * Each entry must be an object with a non-empty string `type`. The optional
 * fields declared on {@link Assertion} are type-checked here (`value`,
 * `pattern`, `flags`, `lang` must be strings; `max` must be a number), so a
 * malformed spec fails at load time with the fixture path rather than inside
 * a checker. Fields not declared on `Assertion` are passed through untouched
 * for custom checkers.
 *
 * @throws Error naming the fixture file (and the assertion index) on any
 *   violation, or when `raw` is not a non-empty array.
 */
function parseAssertions(raw: unknown, file: string): Assertion[] {
  if (!Array.isArray(raw) || raw.length === 0) {
    throw new Error(`Fixture "${file}" must declare a non-empty "assert" array.`);
  }
  const stringFields = ["value", "pattern", "flags", "lang"];
  return raw.map((entry: unknown, index) => {
    if (!isMapping(entry) || typeof entry.type !== "string" || entry.type === "") {
      throw new Error(`Fixture "${file}" assertion #${index} must be an object with a string "type".`);
    }
    for (const field of stringFields) {
      const value = entry[field];
      if (value !== undefined && typeof value !== "string") {
        throw new Error(`Fixture "${file}" assertion #${index} field "${field}" must be a string.`);
      }
    }
    if (entry.max !== undefined && typeof entry.max !== "number") {
      throw new Error(`Fixture "${file}" assertion #${index} field "max" must be a number.`);
    }
    // The declared fields are validated above; the original object is returned
    // (not a copy) so extra params for custom checkers survive.
    return entry as unknown as Assertion;
  });
}
35
+
36
/**
 * Validate one parsed fixture document and build a {@link Fixture} from it.
 *
 * Requires a non-empty string `prompt`, a string `input` and a valid `assert`
 * array. `name` falls back to `fallbackName` when missing or empty; `context`
 * and `samples` (a positive integer) are set only when present. Every error
 * names the fixture file.
 */
function parseFixture(raw: unknown, file: string, fallbackName: string): Fixture {
  if (!isMapping(raw)) {
    throw new Error(`Fixture "${file}" is not a JSON object.`);
  }
  const { prompt, input, samples } = raw;
  if (typeof prompt !== "string" || prompt === "") {
    throw new Error(`Fixture "${file}" must declare a string "prompt".`);
  }
  if (typeof input !== "string") {
    throw new Error(`Fixture "${file}" must declare a string "input".`);
  }

  const declaredName = raw.name;
  const fixture: Fixture = {
    name: typeof declaredName === "string" && declaredName !== "" ? declaredName : fallbackName,
    prompt,
    input,
    assert: parseAssertions(raw.assert, file),
    sourceFile: file,
  };

  const context = parseContext(raw.context, file);
  if (context !== undefined) {
    fixture.context = context;
  }

  if (samples === undefined) {
    return fixture;
  }
  if (typeof samples !== "number" || !Number.isInteger(samples) || samples <= 0) {
    throw new Error(`Fixture "${file}" "samples" must be a positive integer.`);
  }
  fixture.samples = samples;
  return fixture;
}
66
+
67
/**
 * Load eval fixtures from `<dir>/fixtures/*.json` into {@link Fixture}s.
 *
 * Mirrors {@link loadPrompts}: the same `dir` that holds `fragments/` and
 * `rules/` also holds `fixtures/`. Files are read one by one in the order
 * `listFiles` returns them; each fixture's default name is its file name
 * without the extension. A file that is not valid JSON, or that fails fixture
 * validation, throws an error naming its full path.
 */
export async function loadFixtures(dir: string, fs: FsAdapter = nodeFs()): Promise<Fixture[]> {
  const fixturesDir = joinPath(dir, "fixtures");
  const names = await listFiles(fs, fixturesDir, [".json"]);

  const loaded: Fixture[] = [];
  for (const name of names) {
    const path = joinPath(fixturesDir, name);
    const text = await fs.readFile(path);

    let doc: unknown;
    try {
      doc = JSON.parse(text);
    } catch (cause) {
      const reason = (cause as Error).message;
      throw new Error(`Fixture "${path}" is not valid JSON: ${reason}`);
    }

    loaded.push(parseFixture(doc, path, stripExt(name)));
  }
  return loaded;
}
@@ -0,0 +1,134 @@
1
+ /**
2
+ * Public types for the eval engine.
3
+ *
4
+ * Eval is the one place stochasticity enters the system, and it is locked
5
+ * behind an injectable {@link ModelAdapter}. The engine's control flow is
6
+ * deterministic: given the same adapter outputs it returns the same report.
7
+ * The core ships no adapter and makes no network calls — a concrete adapter
8
+ * (e.g. `@markbrutx/promptbook-openrouter`) lives in its own package.
9
+ */
10
+ import type { Context, PromptBook } from "../types.js";
11
+
12
+ /** Token accounting, when an adapter reports it. Provider-agnostic shape. */
13
+ export interface ModelUsage {
14
+ promptTokens?: number;
15
+ completionTokens?: number;
16
+ totalTokens?: number;
17
+ }
18
+
19
+ /** One model call: the assembled system prompt plus the user input. */
20
+ export interface ModelRequest {
21
+ /** The assembled system prompt from {@link resolveBook}. */
22
+ system: string;
23
+ /** The user input that accompanies the system prompt. */
24
+ input: string;
25
+ /** Optional per-request model override; adapters define the default. The eval engine's own requests carry only `system` and `input`. */
26
+ model?: string;
27
+ }
28
+
29
+ /** A model's reply. `raw` carries the provider payload for debugging. */
30
+ export interface ModelResponse {
31
+ text: string;
32
+ usage?: ModelUsage;
33
+ raw?: unknown;
34
+ }
35
+
36
+ /**
37
+ * The seam stochasticity is locked behind. Implementations perform IO/network;
38
+ * the eval engine only ever calls `complete`, so it stays pure given a fake.
39
+ */
40
+ export interface ModelAdapter {
41
+ complete(request: ModelRequest): Promise<ModelResponse>;
42
+ }
43
+
44
+ /**
45
+ * A single assertion spec as authored in a fixture JSON. `type` selects the
46
+ * checker from the assertion registry; the remaining fields are its params.
47
+ */
48
+ export interface Assertion {
49
+ /** Registry key, e.g. "contains" or "language". */
50
+ type: string;
51
+ /** Substring (contains/not-contains) or expected text (equals; both sides are trimmed before comparing). */
52
+ value?: string;
53
+ /** Regex source for matches/not-matches, compiled with `new RegExp`. */
54
+ pattern?: string;
55
+ /** Regex flags for matches/not-matches, passed to `new RegExp` alongside `pattern`. */
56
+ flags?: string;
57
+ /** Ceiling on the output's `length` (UTF-16 code units) for max-length; an output exactly at the ceiling passes. */
58
+ max?: number;
59
+ /** Expected language tag or script for `language` (e.g. "ru", "latin"); compared case-insensitively. */
60
+ lang?: string;
61
+ }
62
+
63
+ /** The outcome of running one assertion against one model output. */
64
+ export interface AssertionResult {
65
+ /** The assertion `type` that produced this result. */
66
+ type: string;
67
+ pass: boolean;
68
+ /** Human-readable, domain-agnostic explanation. */
69
+ message: string;
70
+ /** A short excerpt of the output relevant to the assertion. */
71
+ excerpt?: string;
72
+ }
73
+
74
+ /** A checker for one assertion type. Pure: same output + spec, same result. */
75
+ export type AssertionFn = (output: string, assertion: Assertion) => AssertionResult;
76
+
77
+ /** Map of assertion `type` to its checker. Callers may supply their own. */
78
+ export type AssertionRegistry = Record<string, AssertionFn>;
79
+
80
+ /** A single eval test case loaded from a `fixtures/*.json` file. */
81
+ export interface Fixture {
82
+ /** Lookup name; defaults to the file stem when omitted in JSON. */
83
+ name: string;
84
+ /** Composition name to assemble for this case. */
85
+ prompt: string;
86
+ /** Facts to resolve the composition under. */
87
+ context?: Context;
88
+ /** User input sent alongside the assembled system prompt. */
89
+ input: string;
90
+ /** Assertions every sample's output must satisfy. */
91
+ assert: Assertion[];
92
+ /** Per-fixture sample count; overrides the run-level default. */
93
+ samples?: number;
94
+ /** File the fixture was loaded from. */
95
+ sourceFile?: string;
96
+ }
97
+
98
+ /** Per-fixture aggregate over its samples. */
99
+ export interface FixtureResult {
100
+ name: string;
101
+ /** Number of samples taken. */
102
+ samples: number;
103
+ /** Samples where every assertion passed. */
104
+ passes: number;
105
+ /** `passes / samples` (0 when `samples` is 0). */
106
+ passRate: number;
107
+ /** Every failing assertion across all samples, with messages/excerpts. */
108
+ failures: AssertionResult[];
109
+ }
110
+
111
+ /** The aggregated outcome of an eval run. */
112
+ export interface EvalReport {
+ /** One result per fixture, in the order the fixtures were given. */
113
+ results: FixtureResult[];
114
+ /** Fraction of fixtures that met the threshold gate (`passed / total`); 1 when there are no fixtures. */
115
+ passRate: number;
116
+ /** Fixtures whose `passRate >= passThreshold`. */
117
+ passed: number;
118
+ /** Fixtures that did not meet the gate, computed as `total - passed`. */
119
+ failed: number;
120
+ }
121
+
122
+ /** Input to {@link evaluate}. */
123
+ export interface EvalInput {
124
+ book: PromptBook;
125
+ fixtures: Fixture[];
126
+ /** The injected model seam; all stochasticity/IO lives here. */
127
+ adapter: ModelAdapter;
128
+ /** Assertion checkers; defaults to {@link defaultAssertions}. */
129
+ assertions?: AssertionRegistry;
130
+ /** Default sample count when a fixture sets none. Defaults to 1. */
131
+ samples?: number;
132
+ /** A fixture passes when `passRate >= passThreshold`. Defaults to 1. */
133
+ passThreshold?: number;
134
+ }
@@ -0,0 +1,28 @@
1
+ import { parse as parseYaml } from "yaml";
2
+ import { isMapping } from "./guards.js";
3
+
4
+ export interface ParsedFrontmatter {
5
+ data: Record<string, unknown>;
6
+ body: string;
7
+ }
8
+
9
/**
 * Matches a leading YAML frontmatter block: an opening `---` line, optional
 * content, and a closing `---` that stands on its own line (trailing spaces or
 * tabs allowed) or ends the input.
 *
 * The content group is lazily optional so an empty block (`---\n---\n`) is
 * recognised; group 1 is then `undefined`. Because the closing fence must be
 * followed by a line break or end of input, a line such as `---text` is not
 * treated as the closing fence.
 */
const FRONTMATTER_RE = /^---\r?\n(?:([\s\S]*?)\r?\n)??---[ \t]*(?:\r?\n|$)/;

/**
 * Split a Markdown file into YAML frontmatter (parsed) and the body text.
 *
 * Intentionally dependency-light: a single YAML parser, no extra frontmatter
 * library, so the core stays small and easy to run anywhere.
 *
 * @param raw - Full file contents.
 * @returns `data` holding the parsed frontmatter when it is a mapping (else
 *   `{}`), and `body` holding everything after the closing fence. When there
 *   is no frontmatter block, `data` is `{}` and `body` is `raw` unchanged.
 */
export function parseFrontmatter(raw: string): ParsedFrontmatter {
  const match = FRONTMATTER_RE.exec(raw);
  if (match === null) {
    return { data: {}, body: raw };
  }
  // Group 1 is undefined for an empty frontmatter block.
  const yamlText = match[1] ?? "";
  const parsed = parseYaml(yamlText) as unknown;
  const data = isMapping(parsed) ? parsed : {};
  const body = raw.slice(match[0].length);
  return { data, body };
}