novel-writer-cli 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +103 -0
- package/agents/chapter-writer.md +142 -0
- package/agents/character-weaver.md +117 -0
- package/agents/consistency-auditor.md +85 -0
- package/agents/plot-architect.md +128 -0
- package/agents/quality-judge.md +232 -0
- package/agents/style-analyzer.md +109 -0
- package/agents/style-refiner.md +97 -0
- package/agents/summarizer.md +128 -0
- package/agents/world-builder.md +161 -0
- package/dist/__tests__/character-voice.test.js +445 -0
- package/dist/__tests__/commit-prototype-pollution.test.js +45 -0
- package/dist/__tests__/engagement.test.js +382 -0
- package/dist/__tests__/foreshadow-visibility.test.js +131 -0
- package/dist/__tests__/hook-ledger.test.js +1028 -0
- package/dist/__tests__/naming-lint.test.js +132 -0
- package/dist/__tests__/narrative-health-injection.test.js +359 -0
- package/dist/__tests__/next-step-prejudge-guardrails.test.js +325 -0
- package/dist/__tests__/next-step-title-fix.test.js +153 -0
- package/dist/__tests__/platform-profile.test.js +274 -0
- package/dist/__tests__/promise-ledger.test.js +189 -0
- package/dist/__tests__/readability-lint.test.js +209 -0
- package/dist/__tests__/text-utils.test.js +39 -0
- package/dist/__tests__/title-policy.test.js +147 -0
- package/dist/advance.js +75 -0
- package/dist/character-voice.js +805 -0
- package/dist/checkpoint.js +126 -0
- package/dist/cli.js +563 -0
- package/dist/cliche-lint.js +515 -0
- package/dist/commit.js +1460 -0
- package/dist/consistency-auditor.js +684 -0
- package/dist/engagement.js +687 -0
- package/dist/errors.js +7 -0
- package/dist/fingerprint.js +16 -0
- package/dist/foreshadow-visibility.js +214 -0
- package/dist/fs-utils.js +68 -0
- package/dist/hook-ledger.js +721 -0
- package/dist/hook-policy.js +107 -0
- package/dist/instruction-gates.js +51 -0
- package/dist/instructions.js +406 -0
- package/dist/latest-summary-loader.js +29 -0
- package/dist/lock.js +121 -0
- package/dist/naming-lint.js +531 -0
- package/dist/ner.js +73 -0
- package/dist/next-step.js +408 -0
- package/dist/novel-ask.js +270 -0
- package/dist/output.js +9 -0
- package/dist/platform-constraints.js +518 -0
- package/dist/platform-profile.js +325 -0
- package/dist/prejudge-guardrails.js +370 -0
- package/dist/project.js +40 -0
- package/dist/promise-ledger.js +723 -0
- package/dist/readability-lint.js +555 -0
- package/dist/safe-parse.js +36 -0
- package/dist/safe-path.js +29 -0
- package/dist/scoring-weights.js +290 -0
- package/dist/steps.js +60 -0
- package/dist/text-utils.js +18 -0
- package/dist/title-policy.js +251 -0
- package/dist/type-guards.js +6 -0
- package/dist/validate.js +131 -0
- package/docs/user/README.md +17 -0
- package/docs/user/guardrails.md +179 -0
- package/docs/user/interactive-gates.md +124 -0
- package/docs/user/novel-cli.md +289 -0
- package/docs/user/ops.md +123 -0
- package/docs/user/quick-start.md +97 -0
- package/docs/user/spec-system.md +166 -0
- package/docs/user/storylines.md +144 -0
- package/package.json +48 -0
- package/schemas/README.md +18 -0
- package/schemas/character-voice-drift.schema.json +135 -0
- package/schemas/character-voice-profiles.schema.json +141 -0
- package/schemas/engagement-metrics.schema.json +38 -0
- package/schemas/hook-ledger.schema.json +108 -0
- package/schemas/platform-profile.schema.json +235 -0
- package/schemas/promise-ledger.schema.json +97 -0
- package/scripts/calibrate-quality-judge.sh +91 -0
- package/scripts/compare-regression-runs.sh +86 -0
- package/scripts/lib/_common.py +131 -0
- package/scripts/lib/calibrate_quality_judge.py +312 -0
- package/scripts/lib/compare_regression_runs.py +142 -0
- package/scripts/lib/run_regression.py +621 -0
- package/scripts/lint-blacklist.sh +201 -0
- package/scripts/lint-cliche.sh +370 -0
- package/scripts/lint-readability.sh +404 -0
- package/scripts/query-foreshadow.sh +252 -0
- package/scripts/run-ner.sh +669 -0
- package/scripts/run-regression.sh +122 -0
- package/skills/cli-step/SKILL.md +158 -0
- package/skills/continue/SKILL.md +348 -0
- package/skills/continue/references/context-contracts.md +169 -0
- package/skills/continue/references/continuity-checks.md +187 -0
- package/skills/continue/references/file-protocols.md +64 -0
- package/skills/continue/references/foreshadowing.md +130 -0
- package/skills/continue/references/gate-decision.md +53 -0
- package/skills/continue/references/periodic-maintenance.md +46 -0
- package/skills/novel-writing/SKILL.md +77 -0
- package/skills/novel-writing/references/quality-rubric.md +140 -0
- package/skills/novel-writing/references/style-guide.md +145 -0
- package/skills/start/SKILL.md +458 -0
- package/skills/start/references/quality-review.md +86 -0
- package/skills/start/references/setting-update.md +44 -0
- package/skills/start/references/vol-planning.md +61 -0
- package/skills/start/references/vol-review.md +58 -0
- package/skills/status/SKILL.md +116 -0
- package/skills/status/references/sample-output.md +60 -0
- package/templates/ai-blacklist.json +79 -0
- package/templates/brief-template.md +46 -0
- package/templates/genre-weight-profiles.json +90 -0
- package/templates/novel-ask/example.answer.json +12 -0
- package/templates/novel-ask/example.question.json +51 -0
- package/templates/platform-profile.json +148 -0
- package/templates/style-profile-template.json +58 -0
- package/templates/web-novel-cliche-lint.json +41 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""QualityJudge calibration against human-labeled dataset (M3).
|
|
2
|
+
|
|
3
|
+
Extracted from scripts/calibrate-quality-judge.sh heredoc.
|
|
4
|
+
Shared helpers imported from _common.py (same directory).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import math
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
|
|
13
|
+
|
|
14
|
+
import _common
|
|
15
|
+
|
|
16
|
+
# Prefix attached to every fatal message so operators can trace failures
# back to the wrapper shell script that invokes this module.
_SCRIPT = "calibrate-quality-judge.sh"


def _die(msg: str, exit_code: int = 1) -> None:
    """Abort the process via _common.die, prefixing *msg* with the script name."""
    prefixed = f"{_SCRIPT}: {msg}"
    _common.die(prefixed, exit_code)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _load_json(path: str) -> Any:
    """Parse the JSON document at *path*, aborting with a fatal error on failure."""
    try:
        parsed = _common.load_json(path)
    except Exception as exc:
        # Any read or parse problem is fatal for calibration input.
        _die(f"invalid JSON at {path}: {exc}", 1)
    else:
        return parsed
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _iter_jsonl(path: str) -> Iterable[Tuple[int, Dict[str, Any]]]:
    """Yield ``(line_no, record)`` pairs from the JSONL file at *path*.

    Blank lines and lines starting with ``#`` are skipped.  A malformed
    line, a non-object record, or any read failure aborts via ``_die``.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            for line_no, raw in enumerate(handle, start=1):
                text = raw.strip()
                # Skip empty lines and comment lines in one guard.
                if not text or text.startswith("#"):
                    continue
                try:
                    record = json.loads(text)
                except Exception as exc:
                    _die(f"invalid JSONL at {path}:{line_no}: {exc}", 1)
                if not isinstance(record, dict):
                    _die(f"JSONL record must be an object at {path}:{line_no}", 1)
                yield line_no, record
    except FileNotFoundError:
        _die(f"labels file not found: {path}", 1)
    except SystemExit:
        # _die exits via SystemExit; let it propagate untouched.
        raise
    except Exception as exc:
        _die(f"failed to read labels file {path}: {exc}", 1)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _pearson(x: Sequence[float], y: Sequence[float]) -> Optional[float]:
    """Pearson correlation coefficient of paired samples, or None when undefined.

    Returns None for mismatched lengths, fewer than two points, or when
    either series has zero variance (denominator would be zero).
    """
    n = len(x)
    if n != len(y) or n < 2:
        return None
    mx = sum(x) / n
    my = sum(y) / n
    cov = sum((a - mx) * (b - my) for a, b in zip(x, y))
    var_x = sum((a - mx) * (a - mx) for a in x)
    var_y = sum((b - my) * (b - my) for b in y)
    if var_x <= 0.0 or var_y <= 0.0:
        return None
    return cov / math.sqrt(var_x * var_y)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _linear_fit(x: Sequence[float], y: Sequence[float]) -> Optional[Dict[str, float]]:
    """Least-squares line ``y ~ slope * x + intercept``.

    Returns None for mismatched lengths, fewer than two points, or a
    degenerate x-distribution (zero spread).
    """
    n = len(x)
    if n != len(y) or n < 2:
        return None
    mx = sum(x) / n
    my = sum(y) / n
    sxx = sum((a - mx) * (a - mx) for a in x)
    sxy = sum((a - mx) * (b - my) for a, b in zip(x, y))
    if sxx <= 0.0:
        return None
    slope = sxy / sxx
    return {"slope": slope, "intercept": my - slope * mx}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _clamp(v: float, lo: float = 1.0, hi: float = 5.0) -> float:
    """Clamp *v* into [lo, hi]; defaults match the 1-5 scoring scale."""
    upper_bounded = min(hi, v)
    return max(lo, upper_bounded)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _safe_round(v: Optional[float], ndigits: int = 4) -> Optional[float]:
    """Round *v* to *ndigits* decimal places, passing None through untouched."""
    return None if v is None else round(float(v), ndigits)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def main() -> None:
    """Calibrate QualityJudge thresholds against a human-labeled dataset.

    CLI: ``<project_dir> <labels.jsonl> [out.json]``

    Matches human chapter labels (JSONL) to judge evaluation files under
    ``<project_dir>/evaluations``, then reports correlation/error metrics
    and heuristic threshold suggestions as one JSON object on stdout
    (optionally also written to ``out.json``).
    """
    # Fix: fail with a clear usage message via _die instead of an
    # IndexError traceback when too few CLI arguments are supplied.
    if len(sys.argv) < 3:
        _die("usage: <project_dir> <labels.jsonl> [out.json]", 2)
    project_dir = sys.argv[1]
    labels_path = sys.argv[2]
    out_path = sys.argv[3].strip() if len(sys.argv) > 3 else ""

    # --- Load and validate human labels (keyed by chapter number) ---
    label_records: Dict[int, Dict[str, Any]] = {}
    label_line_by_chapter: Dict[int, int] = {}

    for line_no, obj in _iter_jsonl(labels_path):
        chapter = obj.get("chapter")
        if not isinstance(chapter, int) or chapter < 1:
            _die(f"labels record missing valid chapter at {labels_path}:{line_no}", 1)
        schema_version = obj.get("schema_version")
        if schema_version != 1:
            _die(
                f"unsupported schema_version at {labels_path}:{line_no} (expected 1, got {schema_version})",
                1,
            )
        human_scores = obj.get("human_scores")
        if not isinstance(human_scores, dict) or _common.as_float(human_scores.get("overall")) is None:
            _die(f"labels record missing human_scores.overall at {labels_path}:{line_no}", 1)
        if chapter in label_records:
            _die(
                f"duplicate chapter {chapter} in labels (lines {label_line_by_chapter[chapter]} and {line_no})",
                1,
            )
        label_records[chapter] = obj
        label_line_by_chapter[chapter] = line_no

    if not label_records:
        _die("labels file has no records", 1)

    # --- Locate judge evaluation files ---
    eval_dir = os.path.join(project_dir, "evaluations")
    if not os.path.isdir(eval_dir):
        _die(f"evaluations/ not found under project dir: {project_dir}", 1)

    eval_file_list = _common.find_eval_files(eval_dir)
    eval_files: Dict[int, str] = {ch: path for ch, path in eval_file_list}

    matched_chapters: List[int] = []
    # Chapters with no eval file OR no extractable judge overall score.
    missing_eval_chapters: List[int] = []

    human_overall: List[float] = []
    judge_overall: List[float] = []
    judge_overall_source: Dict[int, str] = {}

    # Per-dimension paired samples: dim -> (human scores, judge scores).
    dim_pairs: Dict[str, Tuple[List[float], List[float]]] = {}

    for chapter in sorted(label_records.keys()):
        eval_path = eval_files.get(chapter)
        if not eval_path:
            missing_eval_chapters.append(chapter)
            continue

        eval_obj = _load_json(eval_path)
        if not isinstance(eval_obj, dict):
            _die(f"eval JSON must be an object at {eval_path}", 1)

        human = _common.as_float(label_records[chapter]["human_scores"]["overall"])
        judge = _common.extract_overall(eval_obj)
        if human is None:
            _die(f"labels human_scores.overall not a number for chapter {chapter}", 1)
        if judge is None:
            missing_eval_chapters.append(chapter)
            continue

        # Track which field provided judge overall.
        src = "unknown"
        if _common.as_float(eval_obj.get("overall_final")) is not None:
            src = "overall_final"
        elif _common.as_float(_common.extract_eval_used(eval_obj).get("overall_final")) is not None:
            src = "eval_used.overall_final"
        elif _common.as_float(_common.extract_eval_used(eval_obj).get("overall")) is not None:
            src = "eval_used.overall"
        elif _common.as_float(eval_obj.get("overall")) is not None:
            src = "overall"
        else:
            meta = eval_obj.get("metadata")
            if isinstance(meta, dict) and isinstance(meta.get("judges"), dict) and _common.as_float(meta["judges"].get("overall_final")) is not None:
                src = "metadata.judges.overall_final"
        judge_overall_source[chapter] = src

        matched_chapters.append(chapter)
        human_overall.append(float(human))
        judge_overall.append(float(judge))

        # Collect per-dimension human scores (everything but "overall").
        human_scores = label_records[chapter].get("human_scores")
        human_dims: Dict[str, float] = {}
        if isinstance(human_scores, dict):
            for k, v in human_scores.items():
                if k == "overall":
                    continue
                n = _common.as_float(v)
                if n is not None:
                    human_dims[str(k)] = float(n)

        judge_dims = _common.extract_dimension_scores(eval_obj)
        for dim_key, human_dim_score in human_dims.items():
            judge_dim_score = judge_dims.get(dim_key)
            if judge_dim_score is None:
                continue
            xs, ys = dim_pairs.setdefault(dim_key, ([], []))
            xs.append(float(human_dim_score))
            ys.append(float(judge_dim_score))

    if len(matched_chapters) < 2:
        _die(
            f"need at least 2 matched chapters to compute Pearson (matched={len(matched_chapters)})",
            1,
        )

    # --- Overall agreement metrics (errors are judge minus human) ---
    r_overall = _pearson(human_overall, judge_overall)
    fit = _linear_fit(judge_overall, human_overall)  # human ~ slope * judge + intercept

    errors = [j - h for h, j in zip(human_overall, judge_overall)]
    mae = sum(abs(e) for e in errors) / len(errors)
    rmse = math.sqrt(sum(e * e for e in errors) / len(errors))
    bias = sum(errors) / len(errors)

    # pause_for_user_force_rewrite is implicit (<2.0), no threshold to calibrate
    default_thresholds = {"pass": 4.0, "polish": 3.5, "revise": 3.0, "pause_for_user": 2.0}

    suggestions: Dict[str, Any] = {"defaults": default_thresholds, "methods": {}}

    # Method 1: shift thresholds by mean judge-human bias.
    shifted = {k: _safe_round(_clamp(v + bias), 3) for k, v in default_thresholds.items()}
    suggestions["methods"]["shift_by_bias"] = {
        "bias_judge_minus_human": _safe_round(bias, 4),
        "suggested_thresholds": shifted,
        "note": "若 judge 整体偏高(bias>0),建议上调阈值;偏低(bias<0)则下调。仅作启发式建议。",
    }

    # Method 2: linear fit inversion: find judge thresholds that map to target human thresholds.
    if fit is not None and abs(float(fit["slope"])) > 1e-6:
        slope = float(fit["slope"])
        intercept = float(fit["intercept"])
        inv = {}
        for k, human_t in default_thresholds.items():
            inv[k] = _safe_round(_clamp((human_t - intercept) / slope), 3)
        suggestions["methods"]["linear_fit_inverse"] = {
            "fit_human_equals_slope_times_judge_plus_intercept": {
                "slope": _safe_round(slope, 6),
                "intercept": _safe_round(intercept, 6),
            },
            "suggested_thresholds": inv,
            "note": "基于最小二乘线性拟合的反解(用 judge 预测 human)。样本少/分布窄时可能不稳定,仅作建议。",
        }
    else:
        suggestions["methods"]["linear_fit_inverse"] = {
            "fit_human_equals_slope_times_judge_plus_intercept": None,
            "suggested_thresholds": None,
            "note": "judge 分布过窄或样本不足,无法稳定拟合。",
        }

    # --- Per-dimension Pearson report ---
    dims_report: Dict[str, Any] = {}
    for dim_key in sorted(dim_pairs.keys()):
        xs, ys = dim_pairs[dim_key]
        r = _pearson(xs, ys)
        dims_report[dim_key] = {"n": len(xs), "pearson_r": _safe_round(r, 4)}

    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    out: Dict[str, Any] = {
        "schema_version": 1,
        "generated_at": now,
        "project": {
            "path": os.path.abspath(project_dir),
            "evaluations_dir": os.path.join(os.path.abspath(project_dir), "evaluations"),
        },
        "labels": {
            "path": os.path.abspath(labels_path),
            "records": len(label_records),
        },
        "alignment": {
            "matched_chapters": matched_chapters,
            "missing_eval_chapters": missing_eval_chapters,
            "judge_overall_source_by_chapter": {str(k): v for k, v in sorted(judge_overall_source.items())},
        },
        "overall": {
            "n": len(matched_chapters),
            "pearson_r": _safe_round(r_overall, 4),
            "human_mean": _safe_round(sum(human_overall) / len(human_overall), 4),
            "judge_mean": _safe_round(sum(judge_overall) / len(judge_overall), 4),
            "mae": _safe_round(mae, 4),
            "rmse": _safe_round(rmse, 4),
            "bias_judge_minus_human": _safe_round(bias, 4),
        },
        "dimensions": dims_report,
        "threshold_suggestions": suggestions,
    }

    out_json = json.dumps(out, ensure_ascii=False, sort_keys=True) + "\n"
    sys.stdout.write(out_json)

    if out_path:
        out_dir = os.path.dirname(os.path.abspath(out_path))
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)
        try:
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(out_json)
        except Exception as e:
            _die(f"failed to write report to {out_path}: {e}", 1)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
if __name__ == "__main__":
    # Surface unexpected failures as a stable, script-prefixed stderr line
    # with exit status 2, while deliberate SystemExit codes pass through.
    try:
        main()
    except SystemExit:
        raise
    except Exception as exc:
        sys.stderr.write(f"calibrate-quality-judge.sh: unexpected error: {exc}\n")
        raise SystemExit(2)
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Compare two archived regression-run summaries (M3).
|
|
2
|
+
|
|
3
|
+
Extracted from the heredoc in scripts/compare-regression-runs.sh.
|
|
4
|
+
Reuses helpers from _common to avoid duplication.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
import sys
|
|
10
|
+
from typing import Any, Dict, Optional
|
|
11
|
+
|
|
12
|
+
import _common
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _die(msg: str, exit_code: int = 1) -> None:
    """Abort via _common.die, prefixing *msg* with the wrapper script name."""
    prefixed = f"compare-regression-runs.sh: {msg}"
    _common.die(prefixed, exit_code)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load_json(path: str) -> Any:
    """Parse the JSON file at *path*, aborting with a fatal error on failure."""
    try:
        parsed = _common.load_json(path)
    except Exception as exc:
        # Any read or parse problem is fatal for comparison input.
        _die(f"invalid JSON at {path}: {exc}", 1)
    else:
        return parsed
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _delta_number(a: Any, b: Any) -> Optional[float]:
    """Return ``b - a`` as floats, or None when either side is non-numeric."""
    first = _common.as_float(a)
    second = _common.as_float(b)
    if first is None or second is None:
        return None
    return second - first
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _delta_dim_means(a: Any, b: Any) -> Dict[str, Optional[float]]:
    """Delta on score_dimensions: extract 'mean' from {n, mean} objects."""
    left = a if isinstance(a, dict) else {}
    right = b if isinstance(b, dict) else {}
    deltas: Dict[str, Optional[float]] = {}
    for key in sorted(set(list(left.keys()) + list(right.keys()))):
        entry_a = left.get(key)
        entry_b = right.get(key)
        mean_a = _common.as_float(entry_a.get("mean")) if isinstance(entry_a, dict) else None
        mean_b = _common.as_float(entry_b.get("mean")) if isinstance(entry_b, dict) else None
        if mean_a is None and mean_b is None:
            # Dimension reported numerically by neither run: omit entirely.
            continue
        if mean_a is None or mean_b is None:
            # Present on one side only: keep the key but flag as incomparable.
            deltas[key] = None
        else:
            deltas[key] = round(float(mean_b - mean_a), 6)
    return deltas
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _delta_map(a: Any, b: Any) -> Dict[str, Optional[float]]:
    """Per-key numeric delta (b - a) over two flat {key: number} mappings.

    Keys absent or non-numeric on both sides are dropped; keys numeric on
    only one side map to None.
    """
    left = a if isinstance(a, dict) else {}
    right = b if isinstance(b, dict) else {}
    all_keys = {str(k) for k in left} | {str(k) for k in right}
    deltas: Dict[str, Optional[float]] = {}
    for key in sorted(all_keys):
        va = _common.as_float(left.get(key))
        vb = _common.as_float(right.get(key))
        if va is None and vb is None:
            continue
        deltas[key] = None if (va is None or vb is None) else float(vb - va)
    return deltas
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def main() -> None:
    """Compare two archived regression-run summaries.

    CLI: ``<summary_a.json> <summary_b.json> [out.json]``

    Emits a single JSON object of deltas (B minus A) on stdout and,
    optionally, writes the same JSON to ``out.json``.
    """
    # Fix: fail with a clear usage message via _die instead of an
    # IndexError traceback when too few CLI arguments are supplied.
    if len(sys.argv) < 3:
        _die("usage: <summary_a.json> <summary_b.json> [out.json]", 2)
    path_a = sys.argv[1]
    path_b = sys.argv[2]
    out_path = sys.argv[3].strip() if len(sys.argv) > 3 else ""

    a = _load_json(path_a)
    b = _load_json(path_b)
    if not isinstance(a, dict) or not isinstance(b, dict):
        _die("both summaries must be JSON objects", 1)

    # Nested sections default to {} when absent or malformed so every
    # delta below degrades to None rather than raising.
    comp_a = a.get("compliance") if isinstance(a.get("compliance"), dict) else {}
    comp_b = b.get("compliance") if isinstance(b.get("compliance"), dict) else {}

    score_a = a.get("score_overall") if isinstance(a.get("score_overall"), dict) else {}
    score_b = b.get("score_overall") if isinstance(b.get("score_overall"), dict) else {}

    out = {
        "schema_version": 1,
        "generated_at": _common.iso_utc_now(),
        "run_a": {"dir": os.path.dirname(os.path.abspath(path_a)), "summary_path": os.path.abspath(path_a), "run_id": a.get("run_id")},
        "run_b": {"dir": os.path.dirname(os.path.abspath(path_b)), "summary_path": os.path.abspath(path_b), "run_id": b.get("run_id")},
        "delta": {
            "chapters_total": _delta_number(a.get("chapters_total"), b.get("chapters_total")),
            "violations_total": _delta_number(a.get("violations_total"), b.get("violations_total")),
            "compliance_rate_high_confidence": _delta_number(comp_a.get("compliance_rate_high_confidence"), comp_b.get("compliance_rate_high_confidence")),
            "compliance_rate_any_violation": _delta_number(comp_a.get("compliance_rate_any_violation"), comp_b.get("compliance_rate_any_violation")),
            "chapters_with_high_confidence_violation": _delta_number(
                comp_a.get("chapters_with_high_confidence_violation"), comp_b.get("chapters_with_high_confidence_violation")
            ),
            "chapters_with_any_violation": _delta_number(comp_a.get("chapters_with_any_violation"), comp_b.get("chapters_with_any_violation")),
            "violations_by_confidence": _delta_map(a.get("violations_by_confidence"), b.get("violations_by_confidence")),
            "violations_by_layer": _delta_map(a.get("violations_by_layer"), b.get("violations_by_layer")),
            "score_overall_mean": _delta_number(score_a.get("mean"), score_b.get("mean")),
        },
        "score_dimensions": {},
        "notes": [],
    }

    # Deltas between runs of different sizes are not apples-to-apples.
    if _common.as_float(a.get("chapters_total")) != _common.as_float(b.get("chapters_total")):
        out["notes"].append("chapters_total differs; compare deltas with caution.")

    dims_a = a.get("score_dimensions", {})
    dims_b = b.get("score_dimensions", {})
    if dims_a or dims_b:
        out["score_dimensions"] = _delta_dim_means(dims_a, dims_b)

    out_json = json.dumps(out, ensure_ascii=False, sort_keys=True) + "\n"
    sys.stdout.write(out_json)

    if out_path:
        out_dir = os.path.dirname(os.path.abspath(out_path))
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)
        try:
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(out_json)
        except Exception as e:
            _die(f"failed to write to {out_path}: {e}", 1)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
    # Surface unexpected failures as a stable, script-prefixed stderr line
    # with exit status 2, while deliberate SystemExit codes pass through.
    try:
        main()
    except SystemExit:
        raise
    except Exception as exc:
        sys.stderr.write(f"compare-regression-runs.sh: unexpected error: {exc}\n")
        raise SystemExit(2)
|