novel-writer-cli 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +103 -0
  3. package/agents/chapter-writer.md +142 -0
  4. package/agents/character-weaver.md +117 -0
  5. package/agents/consistency-auditor.md +85 -0
  6. package/agents/plot-architect.md +128 -0
  7. package/agents/quality-judge.md +232 -0
  8. package/agents/style-analyzer.md +109 -0
  9. package/agents/style-refiner.md +97 -0
  10. package/agents/summarizer.md +128 -0
  11. package/agents/world-builder.md +161 -0
  12. package/dist/__tests__/character-voice.test.js +445 -0
  13. package/dist/__tests__/commit-prototype-pollution.test.js +45 -0
  14. package/dist/__tests__/engagement.test.js +382 -0
  15. package/dist/__tests__/foreshadow-visibility.test.js +131 -0
  16. package/dist/__tests__/hook-ledger.test.js +1028 -0
  17. package/dist/__tests__/naming-lint.test.js +132 -0
  18. package/dist/__tests__/narrative-health-injection.test.js +359 -0
  19. package/dist/__tests__/next-step-prejudge-guardrails.test.js +325 -0
  20. package/dist/__tests__/next-step-title-fix.test.js +153 -0
  21. package/dist/__tests__/platform-profile.test.js +274 -0
  22. package/dist/__tests__/promise-ledger.test.js +189 -0
  23. package/dist/__tests__/readability-lint.test.js +209 -0
  24. package/dist/__tests__/text-utils.test.js +39 -0
  25. package/dist/__tests__/title-policy.test.js +147 -0
  26. package/dist/advance.js +75 -0
  27. package/dist/character-voice.js +805 -0
  28. package/dist/checkpoint.js +126 -0
  29. package/dist/cli.js +563 -0
  30. package/dist/cliche-lint.js +515 -0
  31. package/dist/commit.js +1460 -0
  32. package/dist/consistency-auditor.js +684 -0
  33. package/dist/engagement.js +687 -0
  34. package/dist/errors.js +7 -0
  35. package/dist/fingerprint.js +16 -0
  36. package/dist/foreshadow-visibility.js +214 -0
  37. package/dist/fs-utils.js +68 -0
  38. package/dist/hook-ledger.js +721 -0
  39. package/dist/hook-policy.js +107 -0
  40. package/dist/instruction-gates.js +51 -0
  41. package/dist/instructions.js +406 -0
  42. package/dist/latest-summary-loader.js +29 -0
  43. package/dist/lock.js +121 -0
  44. package/dist/naming-lint.js +531 -0
  45. package/dist/ner.js +73 -0
  46. package/dist/next-step.js +408 -0
  47. package/dist/novel-ask.js +270 -0
  48. package/dist/output.js +9 -0
  49. package/dist/platform-constraints.js +518 -0
  50. package/dist/platform-profile.js +325 -0
  51. package/dist/prejudge-guardrails.js +370 -0
  52. package/dist/project.js +40 -0
  53. package/dist/promise-ledger.js +723 -0
  54. package/dist/readability-lint.js +555 -0
  55. package/dist/safe-parse.js +36 -0
  56. package/dist/safe-path.js +29 -0
  57. package/dist/scoring-weights.js +290 -0
  58. package/dist/steps.js +60 -0
  59. package/dist/text-utils.js +18 -0
  60. package/dist/title-policy.js +251 -0
  61. package/dist/type-guards.js +6 -0
  62. package/dist/validate.js +131 -0
  63. package/docs/user/README.md +17 -0
  64. package/docs/user/guardrails.md +179 -0
  65. package/docs/user/interactive-gates.md +124 -0
  66. package/docs/user/novel-cli.md +289 -0
  67. package/docs/user/ops.md +123 -0
  68. package/docs/user/quick-start.md +97 -0
  69. package/docs/user/spec-system.md +166 -0
  70. package/docs/user/storylines.md +144 -0
  71. package/package.json +48 -0
  72. package/schemas/README.md +18 -0
  73. package/schemas/character-voice-drift.schema.json +135 -0
  74. package/schemas/character-voice-profiles.schema.json +141 -0
  75. package/schemas/engagement-metrics.schema.json +38 -0
  76. package/schemas/hook-ledger.schema.json +108 -0
  77. package/schemas/platform-profile.schema.json +235 -0
  78. package/schemas/promise-ledger.schema.json +97 -0
  79. package/scripts/calibrate-quality-judge.sh +91 -0
  80. package/scripts/compare-regression-runs.sh +86 -0
  81. package/scripts/lib/_common.py +131 -0
  82. package/scripts/lib/calibrate_quality_judge.py +312 -0
  83. package/scripts/lib/compare_regression_runs.py +142 -0
  84. package/scripts/lib/run_regression.py +621 -0
  85. package/scripts/lint-blacklist.sh +201 -0
  86. package/scripts/lint-cliche.sh +370 -0
  87. package/scripts/lint-readability.sh +404 -0
  88. package/scripts/query-foreshadow.sh +252 -0
  89. package/scripts/run-ner.sh +669 -0
  90. package/scripts/run-regression.sh +122 -0
  91. package/skills/cli-step/SKILL.md +158 -0
  92. package/skills/continue/SKILL.md +348 -0
  93. package/skills/continue/references/context-contracts.md +169 -0
  94. package/skills/continue/references/continuity-checks.md +187 -0
  95. package/skills/continue/references/file-protocols.md +64 -0
  96. package/skills/continue/references/foreshadowing.md +130 -0
  97. package/skills/continue/references/gate-decision.md +53 -0
  98. package/skills/continue/references/periodic-maintenance.md +46 -0
  99. package/skills/novel-writing/SKILL.md +77 -0
  100. package/skills/novel-writing/references/quality-rubric.md +140 -0
  101. package/skills/novel-writing/references/style-guide.md +145 -0
  102. package/skills/start/SKILL.md +458 -0
  103. package/skills/start/references/quality-review.md +86 -0
  104. package/skills/start/references/setting-update.md +44 -0
  105. package/skills/start/references/vol-planning.md +61 -0
  106. package/skills/start/references/vol-review.md +58 -0
  107. package/skills/status/SKILL.md +116 -0
  108. package/skills/status/references/sample-output.md +60 -0
  109. package/templates/ai-blacklist.json +79 -0
  110. package/templates/brief-template.md +46 -0
  111. package/templates/genre-weight-profiles.json +90 -0
  112. package/templates/novel-ask/example.answer.json +12 -0
  113. package/templates/novel-ask/example.question.json +51 -0
  114. package/templates/platform-profile.json +148 -0
  115. package/templates/style-profile-template.json +58 -0
  116. package/templates/web-novel-cliche-lint.json +41 -0
@@ -0,0 +1,312 @@
1
+ """QualityJudge calibration against human-labeled dataset (M3).
2
+
3
+ Extracted from scripts/calibrate-quality-judge.sh heredoc.
4
+ Shared helpers imported from _common.py (same directory).
5
+ """
6
+
7
+ import json
8
+ import math
9
+ import os
10
+ import sys
11
+ from datetime import datetime, timezone
12
+ from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
13
+
14
+ import _common
15
+
16
+ _SCRIPT = "calibrate-quality-judge.sh"
17
+
18
+
19
def _die(msg: str, exit_code: int = 1) -> None:
    """Abort the run via ``_common.die``, prefixing the message with this script's name."""
    _common.die(_SCRIPT + ": " + msg, exit_code)
21
+
22
+
23
def _load_json(path: str) -> Any:
    """Parse the JSON file at *path*; any failure aborts the script via ``_die``."""
    try:
        document = _common.load_json(path)
    except Exception as err:
        _die(f"invalid JSON at {path}: {err}", 1)
    else:
        return document
28
+
29
+
30
def _iter_jsonl(path: str) -> Iterable[Tuple[int, Dict[str, Any]]]:
    """Yield ``(line_no, record)`` for each JSON object in a JSONL file.

    Line numbers are 1-based. Blank lines and lines starting with ``#`` are
    skipped. A parse failure, a non-object record, a missing file, or any
    other I/O problem aborts the script via ``_die``.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            for line_no, raw_line in enumerate(handle, start=1):
                text = raw_line.strip()
                # Skip blanks and '#'-comment lines without consuming a record.
                if not text or text.startswith("#"):
                    continue
                try:
                    record = json.loads(text)
                except Exception as parse_err:
                    _die(f"invalid JSONL at {path}:{line_no}: {parse_err}", 1)
                if not isinstance(record, dict):
                    _die(f"JSONL record must be an object at {path}:{line_no}", 1)
                yield line_no, record
    except FileNotFoundError:
        _die(f"labels file not found: {path}", 1)
    except SystemExit:
        # _die exits via SystemExit; let it propagate instead of re-wrapping.
        raise
    except Exception as io_err:
        _die(f"failed to read labels file {path}: {io_err}", 1)
52
+
53
+
54
+ def _pearson(x: Sequence[float], y: Sequence[float]) -> Optional[float]:
55
+ if len(x) != len(y) or len(x) < 2:
56
+ return None
57
+ mean_x = sum(x) / len(x)
58
+ mean_y = sum(y) / len(y)
59
+ num = 0.0
60
+ den_x = 0.0
61
+ den_y = 0.0
62
+ for a, b in zip(x, y):
63
+ dx = a - mean_x
64
+ dy = b - mean_y
65
+ num += dx * dy
66
+ den_x += dx * dx
67
+ den_y += dy * dy
68
+ if den_x <= 0.0 or den_y <= 0.0:
69
+ return None
70
+ return num / math.sqrt(den_x * den_y)
71
+
72
+
73
+ def _linear_fit(x: Sequence[float], y: Sequence[float]) -> Optional[Dict[str, float]]:
74
+ if len(x) != len(y) or len(x) < 2:
75
+ return None
76
+ mean_x = sum(x) / len(x)
77
+ mean_y = sum(y) / len(y)
78
+ sxx = 0.0
79
+ sxy = 0.0
80
+ for a, b in zip(x, y):
81
+ dx = a - mean_x
82
+ sxx += dx * dx
83
+ sxy += dx * (b - mean_y)
84
+ if sxx <= 0.0:
85
+ return None
86
+ slope = sxy / sxx
87
+ intercept = mean_y - slope * mean_x
88
+ return {"slope": slope, "intercept": intercept}
89
+
90
+
91
+ def _clamp(v: float, lo: float = 1.0, hi: float = 5.0) -> float:
92
+ return max(lo, min(hi, v))
93
+
94
+
95
+ def _safe_round(v: Optional[float], ndigits: int = 4) -> Optional[float]:
96
+ if v is None:
97
+ return None
98
+ return round(float(v), ndigits)
99
+
100
+
101
def main() -> None:
    """Calibrate QualityJudge scores against a human-labeled dataset.

    CLI contract (argv): ``<project_dir> <labels.jsonl> [out_path]``.
    Joins per-chapter eval JSON files under ``<project_dir>/evaluations``
    with human label records by chapter number, computes agreement stats
    (Pearson r, MAE, RMSE, bias) and heuristic threshold suggestions, then
    prints a JSON report to stdout and optionally writes it to *out_path*.
    """
    # NOTE(review): argv is indexed without a length check; missing args
    # surface as IndexError, converted to exit code 2 by the __main__ guard.
    project_dir = sys.argv[1]
    labels_path = sys.argv[2]
    out_path = sys.argv[3].strip() if len(sys.argv) > 3 else ""

    # chapter -> label record, plus the JSONL line each chapter came from
    # (kept only so duplicate-chapter errors can cite both lines).
    label_records: Dict[int, Dict[str, Any]] = {}
    label_line_by_chapter: Dict[int, int] = {}

    # Validate every label record up front: positive int chapter,
    # schema_version == 1, numeric human_scores.overall, no duplicates.
    for line_no, obj in _iter_jsonl(labels_path):
        chapter = obj.get("chapter")
        if not isinstance(chapter, int) or chapter < 1:
            _die(f"labels record missing valid chapter at {labels_path}:{line_no}", 1)
        schema_version = obj.get("schema_version")
        if schema_version != 1:
            _die(
                f"unsupported schema_version at {labels_path}:{line_no} (expected 1, got {schema_version})",
                1,
            )
        human_scores = obj.get("human_scores")
        if not isinstance(human_scores, dict) or _common.as_float(human_scores.get("overall")) is None:
            _die(f"labels record missing human_scores.overall at {labels_path}:{line_no}", 1)
        if chapter in label_records:
            _die(
                f"duplicate chapter {chapter} in labels (lines {label_line_by_chapter[chapter]} and {line_no})",
                1,
            )
        label_records[chapter] = obj
        label_line_by_chapter[chapter] = line_no

    if not label_records:
        _die("labels file has no records", 1)

    eval_dir = os.path.join(project_dir, "evaluations")
    if not os.path.isdir(eval_dir):
        _die(f"evaluations/ not found under project dir: {project_dir}", 1)

    # chapter number -> eval file path, as discovered by _common.
    eval_file_list = _common.find_eval_files(eval_dir)
    eval_files: Dict[int, str] = {ch: path for ch, path in eval_file_list}

    matched_chapters: List[int] = []
    missing_eval_chapters: List[int] = []

    # Parallel lists of overall scores, aligned by matched chapter order.
    human_overall: List[float] = []
    judge_overall: List[float] = []
    judge_overall_source: Dict[int, str] = {}

    # dimension key -> (human scores, judge scores), appended pairwise per chapter.
    dim_pairs: Dict[str, Tuple[List[float], List[float]]] = {}

    for chapter in sorted(label_records.keys()):
        eval_path = eval_files.get(chapter)
        if not eval_path:
            missing_eval_chapters.append(chapter)
            continue

        eval_obj = _load_json(eval_path)
        if not isinstance(eval_obj, dict):
            _die(f"eval JSON must be an object at {eval_path}", 1)

        human = _common.as_float(label_records[chapter]["human_scores"]["overall"])
        judge = _common.extract_overall(eval_obj)
        if human is None:
            _die(f"labels human_scores.overall not a number for chapter {chapter}", 1)
        if judge is None:
            # Eval file exists but yields no usable overall score; counted in
            # the same bucket as a missing eval file.
            missing_eval_chapters.append(chapter)
            continue

        # Track which field provided judge overall.
        # NOTE(review): this probe order presumably mirrors the precedence
        # inside _common.extract_overall — confirm against _common.py.
        src = "unknown"
        if _common.as_float(eval_obj.get("overall_final")) is not None:
            src = "overall_final"
        elif _common.as_float(_common.extract_eval_used(eval_obj).get("overall_final")) is not None:
            src = "eval_used.overall_final"
        elif _common.as_float(_common.extract_eval_used(eval_obj).get("overall")) is not None:
            src = "eval_used.overall"
        elif _common.as_float(eval_obj.get("overall")) is not None:
            src = "overall"
        else:
            meta = eval_obj.get("metadata")
            if isinstance(meta, dict) and isinstance(meta.get("judges"), dict) and _common.as_float(meta["judges"].get("overall_final")) is not None:
                src = "metadata.judges.overall_final"
        judge_overall_source[chapter] = src

        matched_chapters.append(chapter)
        human_overall.append(float(human))
        judge_overall.append(float(judge))

        # Collect per-dimension human scores (everything except "overall").
        human_scores = label_records[chapter].get("human_scores")
        human_dims: Dict[str, float] = {}
        if isinstance(human_scores, dict):
            for k, v in human_scores.items():
                if k == "overall":
                    continue
                n = _common.as_float(v)
                if n is not None:
                    human_dims[str(k)] = float(n)

        # Pair each human dimension with the judge's score for the same key;
        # dimensions the judge did not score are silently skipped.
        judge_dims = _common.extract_dimension_scores(eval_obj)
        for dim_key, human_dim_score in human_dims.items():
            judge_dim_score = judge_dims.get(dim_key)
            if judge_dim_score is None:
                continue
            xs, ys = dim_pairs.setdefault(dim_key, ([], []))
            xs.append(float(human_dim_score))
            ys.append(float(judge_dim_score))

    if len(matched_chapters) < 2:
        _die(
            f"need at least 2 matched chapters to compute Pearson (matched={len(matched_chapters)})",
            1,
        )

    r_overall = _pearson(human_overall, judge_overall)
    fit = _linear_fit(judge_overall, human_overall)  # human ~ slope * judge + intercept

    # Error stats are judge-minus-human throughout (positive bias = judge high).
    errors = [j - h for h, j in zip(human_overall, judge_overall)]
    mae = sum(abs(e) for e in errors) / len(errors)
    rmse = math.sqrt(sum(e * e for e in errors) / len(errors))
    bias = sum(errors) / len(errors)

    # pause_for_user_force_rewrite is implicit (<2.0), no threshold to calibrate
    default_thresholds = {"pass": 4.0, "polish": 3.5, "revise": 3.0, "pause_for_user": 2.0}

    suggestions: Dict[str, Any] = {"defaults": default_thresholds, "methods": {}}

    # Method 1: shift thresholds by mean judge-human bias.
    shifted = {k: _safe_round(_clamp(v + bias), 3) for k, v in default_thresholds.items()}
    suggestions["methods"]["shift_by_bias"] = {
        "bias_judge_minus_human": _safe_round(bias, 4),
        "suggested_thresholds": shifted,
        "note": "若 judge 整体偏高(bias>0),建议上调阈值;偏低(bias<0)则下调。仅作启发式建议。",
    }

    # Method 2: linear fit inversion: find judge thresholds that map to target human thresholds.
    # Guard on |slope| > 1e-6 so a near-flat fit never divides by ~zero.
    if fit is not None and abs(float(fit["slope"])) > 1e-6:
        slope = float(fit["slope"])
        intercept = float(fit["intercept"])
        inv = {}
        for k, human_t in default_thresholds.items():
            inv[k] = _safe_round(_clamp((human_t - intercept) / slope), 3)
        suggestions["methods"]["linear_fit_inverse"] = {
            "fit_human_equals_slope_times_judge_plus_intercept": {
                "slope": _safe_round(slope, 6),
                "intercept": _safe_round(intercept, 6),
            },
            "suggested_thresholds": inv,
            "note": "基于最小二乘线性拟合的反解(用 judge 预测 human)。样本少/分布窄时可能不稳定,仅作建议。",
        }
    else:
        suggestions["methods"]["linear_fit_inverse"] = {
            "fit_human_equals_slope_times_judge_plus_intercept": None,
            "suggested_thresholds": None,
            "note": "judge 分布过窄或样本不足,无法稳定拟合。",
        }

    # Per-dimension correlations (None where fewer than 2 pairs or no variance).
    dims_report: Dict[str, Any] = {}
    for dim_key in sorted(dim_pairs.keys()):
        xs, ys = dim_pairs[dim_key]
        r = _pearson(xs, ys)
        dims_report[dim_key] = {"n": len(xs), "pearson_r": _safe_round(r, 4)}

    # UTC timestamp with a trailing "Z" instead of "+00:00".
    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    out: Dict[str, Any] = {
        "schema_version": 1,
        "generated_at": now,
        "project": {
            "path": os.path.abspath(project_dir),
            "evaluations_dir": os.path.join(os.path.abspath(project_dir), "evaluations"),
        },
        "labels": {
            "path": os.path.abspath(labels_path),
            "records": len(label_records),
        },
        "alignment": {
            "matched_chapters": matched_chapters,
            "missing_eval_chapters": missing_eval_chapters,
            "judge_overall_source_by_chapter": {str(k): v for k, v in sorted(judge_overall_source.items())},
        },
        "overall": {
            "n": len(matched_chapters),
            "pearson_r": _safe_round(r_overall, 4),
            "human_mean": _safe_round(sum(human_overall) / len(human_overall), 4),
            "judge_mean": _safe_round(sum(judge_overall) / len(judge_overall), 4),
            "mae": _safe_round(mae, 4),
            "rmse": _safe_round(rmse, 4),
            "bias_judge_minus_human": _safe_round(bias, 4),
        },
        "dimensions": dims_report,
        "threshold_suggestions": suggestions,
    }

    # Report always goes to stdout; the file copy below is optional.
    out_json = json.dumps(out, ensure_ascii=False, sort_keys=True) + "\n"
    sys.stdout.write(out_json)

    if out_path:
        out_dir = os.path.dirname(os.path.abspath(out_path))
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)
        try:
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(out_json)
        except Exception as e:
            _die(f"failed to write report to {out_path}: {e}", 1)
303
+
304
+
305
+ if __name__ == "__main__":
306
+ try:
307
+ main()
308
+ except SystemExit:
309
+ raise
310
+ except Exception as e:
311
+ sys.stderr.write(f"calibrate-quality-judge.sh: unexpected error: {e}\n")
312
+ raise SystemExit(2)
@@ -0,0 +1,142 @@
1
+ """Compare two archived regression-run summaries (M3).
2
+
3
+ Extracted from the heredoc in scripts/compare-regression-runs.sh.
4
+ Reuses helpers from _common to avoid duplication.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import sys
10
+ from typing import Any, Dict, Optional
11
+
12
+ import _common
13
+
14
+
15
def _die(msg: str, exit_code: int = 1) -> None:
    """Abort via ``_common.die`` with this script's name prefixed to *msg*."""
    _common.die("compare-regression-runs.sh: " + msg, exit_code)
17
+
18
+
19
def _load_json(path: str) -> Any:
    """Load JSON from *path*; any failure aborts the script via ``_die``."""
    try:
        document = _common.load_json(path)
    except Exception as err:
        _die(f"invalid JSON at {path}: {err}", 1)
    else:
        return document
24
+
25
+
26
def _delta_number(a: Any, b: Any) -> Optional[float]:
    """Return ``b - a`` as floats, or None when either side is not numeric."""
    first = _common.as_float(a)
    second = _common.as_float(b)
    if first is None or second is None:
        return None
    return second - first
32
+
33
+
34
def _delta_dim_means(a: Any, b: Any) -> Dict[str, Optional[float]]:
    """Per-key delta of 'mean' between two score_dimensions maps of {n, mean}.

    Keys absent or non-numeric on both sides are dropped; keys numeric on only
    one side map to None; otherwise the value is ``round(mean_b - mean_a, 6)``.
    """
    dims_a = a if isinstance(a, dict) else {}
    dims_b = b if isinstance(b, dict) else {}
    deltas: Dict[str, Optional[float]] = {}
    for key in sorted(set(dims_a) | set(dims_b)):
        entry_a = dims_a.get(key)
        entry_b = dims_b.get(key)
        mean_a = _common.as_float(entry_a.get("mean")) if isinstance(entry_a, dict) else None
        mean_b = _common.as_float(entry_b.get("mean")) if isinstance(entry_b, dict) else None
        if mean_a is None and mean_b is None:
            continue
        if mean_a is None or mean_b is None:
            deltas[key] = None
        else:
            deltas[key] = round(float(mean_b - mean_a), 6)
    return deltas
54
+
55
+
56
def _delta_map(a: Any, b: Any) -> Dict[str, Optional[float]]:
    """Key-wise numeric delta (``b - a``) between two flat maps.

    Keys are stringified before lookup; keys non-numeric on both sides are
    dropped, and keys numeric on only one side map to None.
    """
    map_a = a if isinstance(a, dict) else {}
    map_b = b if isinstance(b, dict) else {}
    all_keys = {str(k) for k in map_a} | {str(k) for k in map_b}
    deltas: Dict[str, Optional[float]] = {}
    for key in sorted(all_keys):
        val_a = _common.as_float(map_a.get(key))
        val_b = _common.as_float(map_b.get(key))
        if val_a is None and val_b is None:
            continue
        deltas[key] = None if val_a is None or val_b is None else float(val_b - val_a)
    return deltas
73
+
74
+
75
def main() -> None:
    """Compare two archived regression-run summary JSON files.

    CLI contract (argv): ``<summary_a.json> <summary_b.json> [out_path]``.
    Emits a delta report (run B minus run A) as one JSON line on stdout and
    optionally writes the same JSON to *out_path*.
    """
    # NOTE(review): argv is indexed without a length check; missing args
    # surface as IndexError, converted to exit code 2 by the __main__ guard.
    path_a = sys.argv[1]
    path_b = sys.argv[2]
    out_path = sys.argv[3].strip() if len(sys.argv) > 3 else ""

    a = _load_json(path_a)
    b = _load_json(path_b)
    if not isinstance(a, dict) or not isinstance(b, dict):
        _die("both summaries must be JSON objects", 1)

    # Sub-objects default to {} when absent or malformed, so every .get()
    # below degrades to a None delta instead of raising.
    comp_a = a.get("compliance") if isinstance(a.get("compliance"), dict) else {}
    comp_b = b.get("compliance") if isinstance(b.get("compliance"), dict) else {}

    score_a = a.get("score_overall") if isinstance(a.get("score_overall"), dict) else {}
    score_b = b.get("score_overall") if isinstance(b.get("score_overall"), dict) else {}

    # All deltas are B minus A (positive = run B higher).
    out = {
        "schema_version": 1,
        "generated_at": _common.iso_utc_now(),
        "run_a": {"dir": os.path.dirname(os.path.abspath(path_a)), "summary_path": os.path.abspath(path_a), "run_id": a.get("run_id")},
        "run_b": {"dir": os.path.dirname(os.path.abspath(path_b)), "summary_path": os.path.abspath(path_b), "run_id": b.get("run_id")},
        "delta": {
            "chapters_total": _delta_number(a.get("chapters_total"), b.get("chapters_total")),
            "violations_total": _delta_number(a.get("violations_total"), b.get("violations_total")),
            "compliance_rate_high_confidence": _delta_number(comp_a.get("compliance_rate_high_confidence"), comp_b.get("compliance_rate_high_confidence")),
            "compliance_rate_any_violation": _delta_number(comp_a.get("compliance_rate_any_violation"), comp_b.get("compliance_rate_any_violation")),
            "chapters_with_high_confidence_violation": _delta_number(
                comp_a.get("chapters_with_high_confidence_violation"), comp_b.get("chapters_with_high_confidence_violation")
            ),
            "chapters_with_any_violation": _delta_number(comp_a.get("chapters_with_any_violation"), comp_b.get("chapters_with_any_violation")),
            "violations_by_confidence": _delta_map(a.get("violations_by_confidence"), b.get("violations_by_confidence")),
            "violations_by_layer": _delta_map(a.get("violations_by_layer"), b.get("violations_by_layer")),
            "score_overall_mean": _delta_number(score_a.get("mean"), score_b.get("mean")),
        },
        "score_dimensions": {},
        "notes": [],
    }

    # Deltas over runs of different sizes are not like-for-like; flag it.
    if _common.as_float(a.get("chapters_total")) != _common.as_float(b.get("chapters_total")):
        out["notes"].append("chapters_total differs; compare deltas with caution.")

    dims_a = a.get("score_dimensions", {})
    dims_b = b.get("score_dimensions", {})
    if dims_a or dims_b:
        out["score_dimensions"] = _delta_dim_means(dims_a, dims_b)

    # Report always goes to stdout; the file copy below is optional.
    out_json = json.dumps(out, ensure_ascii=False, sort_keys=True) + "\n"
    sys.stdout.write(out_json)

    if out_path:
        out_dir = os.path.dirname(os.path.abspath(out_path))
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)
        try:
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(out_json)
        except Exception as e:
            _die(f"failed to write to {out_path}: {e}", 1)
133
+
134
+
135
+ if __name__ == "__main__":
136
+ try:
137
+ main()
138
+ except SystemExit:
139
+ raise
140
+ except Exception as e:
141
+ sys.stderr.write(f"compare-regression-runs.sh: unexpected error: {e}\n")
142
+ raise SystemExit(2)