patina-cli 3.11.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. package/.patina.default.yaml +29 -29
  2. package/CHANGELOG.md +53 -0
  3. package/NOTICE +21 -0
  4. package/README.md +117 -224
  5. package/README_JA.md +134 -77
  6. package/README_KR.md +132 -74
  7. package/README_ZH.md +137 -80
  8. package/SKILL.md +11 -20
  9. package/artifacts/rebaseline-2025/README.md +147 -0
  10. package/artifacts/rebaseline-2025/human-controls.public.jsonl +250 -0
  11. package/artifacts/rebaseline-2025/intake.example.jsonl +2 -0
  12. package/artifacts/rebaseline-2025/intake.local.example.jsonl +25 -0
  13. package/artifacts/rebaseline-2025/prompts.template.jsonl +7 -0
  14. package/artifacts/rebaseline-2025/sources.ko-public.jsonl +39 -0
  15. package/assets/brand/patina-badge.svg +18 -0
  16. package/assets/brand/patina-mark.svg +8 -0
  17. package/assets/demo/README.md +79 -0
  18. package/core/scoring.md +12 -12
  19. package/core/standalone-prompt.md +3 -1
  20. package/core/stylometry.md +93 -22
  21. package/docs/API.md +1554 -0
  22. package/docs/AUTHENTICATION.md +50 -26
  23. package/docs/AUTHENTICATION_KR.md +54 -29
  24. package/docs/BRANDING.md +9 -8
  25. package/docs/CLI.md +55 -14
  26. package/docs/COOKBOOK.md +8 -21
  27. package/docs/DEMO.md +32 -5
  28. package/docs/EXIT-CODES.md +2 -3
  29. package/docs/FALSE-POSITIVES.md +63 -0
  30. package/docs/FAQ.md +9 -1
  31. package/docs/FAQ_KR.md +3 -1
  32. package/docs/FLAG-PARITY.md +33 -47
  33. package/docs/ISSUE-WAVES.md +57 -0
  34. package/docs/PATTERNS-EN.md +67 -3
  35. package/docs/PATTERNS-JA.md +68 -2
  36. package/docs/PATTERNS-KO.md +70 -7
  37. package/docs/PATTERNS-ZH.md +67 -3
  38. package/docs/PATTERNS.md +5 -5
  39. package/docs/RESEARCH-DOCS-PLATFORM.md +54 -0
  40. package/docs/ROADMAP.md +46 -66
  41. package/docs/TRANSLATIONESE-KO.md +51 -0
  42. package/docs/audits/2026-05-deep-research.md +3 -1
  43. package/docs/benchmarks/README.md +51 -0
  44. package/docs/benchmarks/detector-comparison.json +69 -9
  45. package/docs/benchmarks/detector-comparison.md +10 -5
  46. package/docs/benchmarks/katfish-ko-latest.json +657 -0
  47. package/docs/benchmarks/katfish-ko-latest.md +77 -0
  48. package/docs/benchmarks/latest.json +1183 -108
  49. package/docs/benchmarks/latest.md +84 -60
  50. package/docs/benchmarks/lexicon-freshness-en-2026-05-22.json +1121 -0
  51. package/docs/benchmarks/lexicon-freshness-en-2026-05-22.md +136 -0
  52. package/docs/benchmarks/rebaseline-latest.json +381 -0
  53. package/docs/benchmarks/rebaseline-latest.md +121 -0
  54. package/docs/benchmarks/register-stratified-latest.json +164 -0
  55. package/docs/benchmarks/register-stratified-latest.md +99 -0
  56. package/docs/benchmarks/register-stratified.md +43 -0
  57. package/docs/integrations/github-action.md +44 -11
  58. package/docs/integrations/playground.md +58 -0
  59. package/docs/integrations/pre-commit.md +5 -5
  60. package/docs/integrations/release.md +5 -3
  61. package/docs/integrations/static-sites.md +83 -0
  62. package/docs/research/2025-rebaseline-plan.md +71 -2
  63. package/docs/research/2026-rebaseline.md +102 -0
  64. package/docs/research/adversarial-mps.md +41 -0
  65. package/docs/research/ai-human-metrics.md +35 -23
  66. package/docs/research/human-eval-panel.md +42 -0
  67. package/docs/research/judge-agreement.md +24 -0
  68. package/docs/research/ko-2025-corpus-sources.md +135 -0
  69. package/docs/research/lexicon-freshness-audit.md +64 -0
  70. package/docs/research/zh-ja-lexicon-calibration.md +60 -0
  71. package/docs/social/patina-launch-copy.md +173 -100
  72. package/docs/social/patina-launch-execution.md +94 -0
  73. package/docs/social/patina-launch-korean-first.md +83 -0
  74. package/docs/social/signs-of-ai-writing.md +26 -0
  75. package/docs/social/signs-of-ai-writing_KR.md +26 -0
  76. package/lexicon/ai-en.md +21 -24
  77. package/lexicon/ai-ja.md +158 -0
  78. package/lexicon/ai-ko.md +9 -9
  79. package/lexicon/ai-zh.md +158 -0
  80. package/lexicon/provenance/ai-en.json +970 -0
  81. package/lexicon/provenance/ai-ja.json +542 -0
  82. package/lexicon/provenance/ai-ko.json +866 -0
  83. package/lexicon/provenance/ai-zh.json +542 -0
  84. package/package.json +49 -8
  85. package/patterns/en-communication.md +5 -0
  86. package/patterns/en-content.md +5 -0
  87. package/patterns/en-filler.md +5 -0
  88. package/patterns/en-language.md +29 -1
  89. package/patterns/en-structure.md +5 -0
  90. package/patterns/en-style.md +5 -0
  91. package/patterns/en-viral-hook.md +42 -2
  92. package/patterns/ja-communication.md +5 -0
  93. package/patterns/ja-content.md +5 -0
  94. package/patterns/ja-filler.md +5 -0
  95. package/patterns/ja-language.md +33 -1
  96. package/patterns/ja-structure.md +12 -0
  97. package/patterns/ja-style.md +5 -0
  98. package/patterns/ja-viral-hook.md +41 -2
  99. package/patterns/ko-communication.md +5 -0
  100. package/patterns/ko-content.md +5 -0
  101. package/patterns/ko-filler.md +5 -0
  102. package/patterns/ko-language.md +33 -1
  103. package/patterns/ko-structure.md +25 -6
  104. package/patterns/ko-style.md +5 -0
  105. package/patterns/ko-viral-hook.md +38 -2
  106. package/patterns/zh-communication.md +5 -0
  107. package/patterns/zh-content.md +5 -0
  108. package/patterns/zh-filler.md +5 -0
  109. package/patterns/zh-language.md +37 -1
  110. package/patterns/zh-structure.md +12 -0
  111. package/patterns/zh-style.md +5 -0
  112. package/patterns/zh-viral-hook.md +38 -2
  113. package/playground/README.md +55 -0
  114. package/playground/analytics.js +4 -0
  115. package/playground/analyzer.js +883 -0
  116. package/playground/app.js +157 -0
  117. package/playground/data/lexicons.js +343 -0
  118. package/playground/index.html +138 -0
  119. package/playground/styles.css +267 -0
  120. package/profiles/namuwiki.md +111 -0
  121. package/scripts/adversarial-mps-report.mjs +201 -0
  122. package/scripts/badge-json.mjs +79 -0
  123. package/scripts/benchmark-report.mjs +56 -9
  124. package/scripts/check-release-metadata.mjs +0 -2
  125. package/scripts/detector-comparison.mjs +7 -7
  126. package/scripts/generate-playground-data.mjs +77 -0
  127. package/scripts/katfish-calibration.mjs +464 -0
  128. package/scripts/lexicon-freshness.mjs +485 -0
  129. package/scripts/lint.mjs +1 -1
  130. package/scripts/precommit-score.mjs +4 -3
  131. package/scripts/prose-score.mjs +81 -5
  132. package/scripts/rebaseline-intake.mjs +242 -0
  133. package/scripts/rebaseline-score.mjs +268 -0
  134. package/scripts/rebaseline-summary.mjs +773 -0
  135. package/scripts/rebaseline-web-collect.mjs +410 -0
  136. package/scripts/update-benchmark-ranges.mjs +1 -0
  137. package/src/api.js +69 -105
  138. package/src/auth.js +50 -2
  139. package/src/backends/claude-cli.js +19 -4
  140. package/src/backends/codex-cli.js +19 -3
  141. package/src/backends/contract.js +230 -1
  142. package/src/backends/gemini-cli.js +18 -5
  143. package/src/backends/index.js +87 -12
  144. package/src/backends/kimi-cli.js +161 -0
  145. package/src/cli.js +577 -567
  146. package/src/commands/doctor.js +2 -2
  147. package/src/config.js +29 -0
  148. package/src/errors.js +53 -1
  149. package/src/features/discourse-tells.js +68 -0
  150. package/src/features/index.js +82 -8
  151. package/src/features/lexicon.js +40 -6
  152. package/src/features/markup-leakage.js +69 -0
  153. package/src/features/segment.js +41 -0
  154. package/src/features/signal-strength.js +81 -0
  155. package/src/features/stylometry.js +231 -1
  156. package/src/features/translationese.js +127 -0
  157. package/src/loader.js +76 -0
  158. package/src/logger.js +22 -23
  159. package/src/model-defaults.js +55 -0
  160. package/src/ouroboros.js +31 -0
  161. package/src/output.js +102 -90
  162. package/src/prompt-builder.js +103 -68
  163. package/src/providers.js +51 -4
  164. package/src/scoring.js +210 -2
  165. package/src/security.js +75 -0
  166. package/tests/fixtures/live-quality/en/public-docs-01.md +26 -0
  167. package/tests/fixtures/live-quality/ko/public-docs-01.md +26 -0
  168. package/tests/fixtures/suspect-zones/expected-ranges.json +207 -16
  169. package/tests/fixtures/suspect-zones/ja/ai/ja-ai-04-lexicon.md +11 -0
  170. package/tests/fixtures/suspect-zones/ja/natural/ja-nat-04-lexicon-cold.md +11 -0
  171. package/tests/fixtures/suspect-zones/ko/ai/ko-ai-02.md +4 -5
  172. package/tests/fixtures/suspect-zones/ko/ai/ko-ai-07-ko-diagnostic.md +11 -0
  173. package/tests/fixtures/suspect-zones/zh/ai/zh-ai-04-lexicon.md +11 -0
  174. package/tests/fixtures/suspect-zones/zh/natural/zh-nat-04-lexicon-cold.md +11 -0
  175. package/tests/quality/README.md +188 -11
  176. package/tests/quality/adversarial-mps/fixtures.jsonl +10 -0
  177. package/tests/quality/benchmark.mjs +39 -1
  178. package/tests/quality/dogfood.mjs +5 -3
  179. package/tests/quality/live-fixtures.jsonl +2 -0
  180. package/tests/quality/live-quality.mjs +596 -0
  181. package/tests/quality/ranking-metrics.mjs +136 -0
  182. package/tests/quality/rebaseline-manifest.example.jsonl +5 -0
  183. package/vercel.json +53 -0
  184. package/SKILL-MAX.md +0 -455
  185. package/docs/internal/HARNESS.md +0 -14
  186. package/docs/internal/README.md +0 -14
  187. package/docs/internal/WARP.md +0 -23
  188. package/patina-max/SKILL.md +0 -523
  189. package/patina-max/composite.py +0 -457
  190. package/src/cache.js +0 -106
  191. package/src/commands/init.js +0 -208
  192. package/src/manifest.js +0 -162
  193. package/src/max-mode.js +0 -207
@@ -14,6 +14,7 @@ Outputs:
14
14
  - A markdown table per language (accuracy, precision, recall, F1, confusion matrix)
15
15
  - A list of any misclassified fixtures with their feature values
16
16
  - `tests/quality/results.json` — full per-fixture log (gitignored)
17
+ - `docs/benchmarks/README.md` — report index, refresh commands, and public-claim rules
17
18
  - `docs/benchmarks/latest.md` / `latest.json` when run via `npm run benchmark:report`
18
19
  - `docs/benchmarks/detector-comparison.md` / `.json` when run via `npm run benchmark:compare`
19
20
 
@@ -23,34 +24,193 @@ Every fixture under `tests/fixtures/suspect-zones/{lang}/{ai|natural}/*.md`
23
24
  carries an `expected_hot` label in its frontmatter. The benchmark runs
24
25
  `analyzeText()` (defined in `src/features/index.js`) on the body and
25
26
  compares the predicted hot/cold decision against that label. The decision
26
- follows the 3-signal OR rule from `core/stylometry.md` §16:
27
+ follows the 4-signal OR rule from `core/stylometry.md` §16:
27
28
 
28
29
  ```
29
30
  paragraph is SUSPECT iff
30
31
  burstiness_band == "low" OR
31
32
  MATTR_band == "low" OR
32
- lexicon_density > threshold
33
+ (lexicon_density > threshold AND lexicon_min_hits is satisfied) OR
34
+ koDiagnostics.hot == true
33
35
  ```
34
36
 
37
+ `burstiness_band` is only assigned when a paragraph has at least three
38
+ sentences; two-sentence CV is recorded for diagnostics but is not stable enough
39
+ to classify a paragraph by itself. For ko/zh/ja, a single lexicon hit is also
40
+ only an audit hint; the default hot threshold requires at least two CJK hits
41
+ to make the paragraph hot by itself.
42
+
43
+ For `lang=ko`, `analyzeText()` also records Korean diagnostic fields:
44
+ `spacing`, `comma`, and `posDiversity` (a suffix-class proxy, not a morphology
45
+ analyzer). They only affect the hot/cold decision through the conservative
46
+ `koDiagnostics` composite: at least four sentences, at least 20 eojeols, fewer than one comma per sentence, regular eojeol length (`CV <= 0.38`), and low suffix-class diversity (`classDiversity <= 0.26`).
47
+
35
48
  Per-language metrics use `expected_hot=true` as the positive class.
36
49
 
50
+ ## Opt-in live rewrite quality
51
+
52
+ `npm run quality:live` runs the live-quality runner without calling a model by
53
+ default. The default path scores fixture inputs and marks the live rewrite step
54
+ as skipped, so it is safe for local smoke checks and CI dry-runs.
55
+
56
+ ```bash
57
+ npm run quality:live
58
+ npm run quality:live -- --json
59
+ ```
60
+
61
+ To run actual rewrites, opt in explicitly with an OpenAI-compatible provider.
62
+ Use `PATINA_LIVE_*` so this stays a deliberate local/manual probe rather than a
63
+ per-PR network dependency:
64
+
65
+ ```bash
66
+ PATINA_LIVE=1 \
67
+ PATINA_LIVE_PROVIDER=gemini \
68
+ PATINA_LIVE_API_KEY=... \
69
+ npm run quality:live -- --language ko --limit 1
70
+ ```
71
+
72
+ Supported live settings:
73
+
74
+ - `PATINA_LIVE_PROVIDER` — provider preset (`openai`, `gemini`, `groq`,
75
+ `kimi`, `moonshot`, `together`).
76
+ - `PATINA_LIVE_API_KEY` — live-run key; falls back to the provider key or
77
+ `PATINA_API_KEY`.
78
+ - `PATINA_LIVE_MODEL` / `PATINA_LIVE_API_BASE` / `PATINA_LIVE_TIMEOUT_MS`.
79
+
80
+ The fixture set lives in `tests/fixtures/live-quality/{en,ko}/*.md` with YAML
81
+ frontmatter (`fixture_id`, `language`, optional `profile`, `anchors`,
82
+ `expected_focus`) plus the body text. The legacy
83
+ `tests/quality/live-fixtures.jsonl` remains loadable via `--fixtures`.
84
+
85
+ Live reports are structured JSON or Markdown with:
86
+
87
+ - `schema_version`, redacted settings, and policy floors.
88
+ - `before_score` / `after_score` from model-graded `scoreText`.
89
+ - `mps` from `scoreMPS`.
90
+ - `fidelity` from `scoreFidelity`.
91
+ - `pass`, `warn`, `error`, or `skipped` per fixture.
92
+
93
+ A live rewrite passes when `after_score <= 30`, MPS is at least 70, fidelity is
94
+ at least 70, and the AI score improved. Missing credentials, provider failures,
95
+ schema failures, and MPS/fidelity floor violations are `error` and exit
96
+ nonzero; AI-score target misses remain `warn` so the report is still usable.
97
+ Keep this out of mandatory CI unless the live model path is deliberately
98
+ allowed, because LLM output is non-deterministic and may incur provider cost.
99
+
100
+ ## Adversarial MPS fixtures
101
+
102
+ `npm run quality:adversarial-mps` validates a small, repo-owned fixture set
103
+ where explicit meaning anchors are preserved but AI-like wording remains. This
104
+ guards against treating MPS as a humanness score.
105
+
106
+ ```bash
107
+ npm run quality:adversarial-mps
108
+ node scripts/adversarial-mps-report.mjs --check --json
109
+ ```
110
+
111
+ Inputs live in `tests/quality/adversarial-mps/fixtures.jsonl`; the report is
112
+ written to `docs/research/adversarial-mps.md`. The gate is:
113
+
114
+ - anchor-MPS proxy ≥90;
115
+ - deterministic AI score ≥60;
116
+ - no private or scraped source text.
117
+
118
+ If this gate passes, the case is intentionally adversarial: meaning survived,
119
+ but style still needs work. Ouroboros selection should prefer candidates that
120
+ pass MPS and lower the AI score, rather than letting high MPS hide
121
+ recurring AI markers.
122
+
123
+ ## 2025+ rebaseline manifest
124
+
125
+ `npm run benchmark:rebaseline` validates the public JSONL manifest scaffold and
126
+ prints matrix coverage. It does not collect text from vendors, call external
127
+ detectors, or turn a small sample into a headline claim.
128
+
129
+ ```bash
130
+ npm run benchmark:rebaseline
131
+ npm run benchmark:rebaseline:report
132
+ node scripts/rebaseline-summary.mjs --input tests/quality/rebaseline-manifest.example.jsonl --json
133
+ npm run benchmark:rebaseline:intake -- --input artifacts/rebaseline-2025/intake.example.jsonl --dry-run
134
+ npm run benchmark:rebaseline:intake -- --input artifacts/rebaseline-2025/intake.local.example.jsonl --dry-run --require-source-review
135
+ npm run benchmark:rebaseline:web -- --target-per-register 50 --max-per-source 12 --collected-at 2026-05-22
136
+ npm run benchmark:rebaseline:score -- --input artifacts/rebaseline-2025/private/web-human-controls.generated.private.jsonl --output artifacts/rebaseline-2025/human-controls.public.jsonl --scored-at 2026-05-22
137
+ node scripts/rebaseline-summary.mjs --input artifacts/rebaseline-2025/human-controls.public.jsonl --json
138
+ ```
139
+
140
+ Each row records the source metadata needed by
141
+ `docs/research/2025-rebaseline-plan.md`: `sample_id`, `language`, `class`,
142
+ `register`, `model_family`, `provider`, `model`, `generated_at`, `prompt_id`,
143
+ `decoding`, `postprocess`, `redistribution`, and `text_hash`. Full `text` is
144
+ allowed only for redistributable rows (`repo-ok`, `redistributable`, public
145
+ license values). Private or vendor-copied rows must stay metadata-only and use
146
+ hashes.
147
+
148
+ For local/private corpus intake, use `npm run benchmark:rebaseline:intake`.
149
+ It computes missing `text_hash` values and writes a public manifest that strips
150
+ full text from non-redistributable rows while preserving the full row in the
151
+ gitignored private output. Use `--require-source-review` before pilot reports so
152
+ non-public rows must explain their redistribution status through `source_review`
153
+ or `reviewer_notes`. The tracked `artifacts/rebaseline-2025/intake.example.jsonl`
154
+ fixture and `artifacts/rebaseline-2025/intake.local.example.jsonl` 25-row
155
+ template are smoke checks only; real corpus rows stay local until a license
156
+ review says otherwise.
157
+
158
+ `artifacts/rebaseline-2025/human-controls.public.jsonl` is the first tracked
159
+ web-sourced Korean human-control candidate manifest. It is metadata/hash-only:
160
+ no raw source text is committed. Its deterministic outcome fields are register-stratified false-positive
161
+ evidence; public catch-rate claims require positive AI-like rows and claim-cell coverage, now provided by `rebaseline-2026.scored.public.jsonl` for KO+EN.
162
+
163
+ The #155 report is claim-ready only when the process gate is satisfied: scored outcome rows, at least three generator families across at least two languages, n≥100 per claim cell, and confidence intervals. The checked-in 2026 manifest now satisfies that gate for KO+EN.
164
+
165
+ `npm run benchmark:rebaseline:report` refreshes
166
+ `docs/benchmarks/rebaseline-latest.md` and `.json`. Use `tests/quality/rebaseline-manifest.example.jsonl` for a BLOCKED smoke fixture; use `artifacts/rebaseline-2025/rebaseline-2026.scored.public.jsonl` for the current READY public report.
167
+
168
+ ## Score vs signal strength
169
+
170
+ The pre-commit prose gate keeps the older, conservative score semantics:
171
+
172
+ ```text
173
+ score = hot_paragraphs / total_paragraphs * 100
174
+ ```
175
+
176
+ That binary ratio decides pass/fail because it is stable for CI. The report also
177
+ prints two diagnostics:
178
+
179
+ - `signal` — average paragraph intensity of the strongest deterministic trigger:
180
+ how far burstiness or MATTR is inside its low band, how far lexicon density
181
+ is over the threshold, or how strong the Korean diagnostic composite is.
182
+ - `pattern hits` — count of pattern-pack watch terms found in the stripped prose.
183
+ This is diagnostic only; it helps reviewers see pattern-level cleanup that may
184
+ not change the binary hot-paragraph ratio.
185
+
186
+ Treat both as editing diagnostics, not separate authorship verdicts or CI gates.
187
+ The prose gate uses the default deterministic thresholds and the current
188
+ Markdown pattern packs. Runtime scoring may use project config thresholds, so
189
+ compare `signal` values within the same entrypoint rather than across tools.
190
+
191
+ Report person-written paragraphs that cross the gate through the false-positive
192
+ form: <https://github.com/devswha/patina/issues/new?template=false_positive.yml>.
193
+ Include the exact paragraph, language/register, score output, and whether the
194
+ sample can become a public fixture.
195
+
37
196
  ## What it does NOT measure
38
197
 
39
198
  - LLM-based scoring (`src/scoring.js`). The LLM is non-deterministic by
40
199
  design and adds API cost / latency, so it stays out of this layer.
41
200
  A separate live-mode benchmark would be its own follow-up.
42
- - Rewrite quality (does the rewritten text read better?). That requires
43
- human or LLM grading and lives in `tests/e2e/quality-test.js`.
44
- That script is opt-in because it shells out to OpenCode:
201
+ - Mandatory rewrite quality gates. Live rewrite quality lives in
202
+ `tests/quality/live-quality.mjs` and remains opt-in because it can shell out
203
+ to OpenCode:
45
204
 
46
205
  ```bash
47
- OPENCODE_AVAILABLE=1 node tests/e2e/quality-test.js
206
+ OPENCODE_AVAILABLE=1 npm run quality:live -- --limit 1
48
207
  ```
49
208
 
50
- The script uses `opencode/hy3-preview-free` by default. Override it with
209
+ The scaffold uses `opencode/hy3-preview-free` by default. Override it with
51
210
  `OPENCODE_MODEL=<provider/model>` when testing another OpenCode model.
52
- - AUROC against a ranked score — the current decision is binary
53
- (hot/cold), so we report accuracy + F1 instead.
211
+ - Generalized model-era detector claims. The report now includes
212
+ `signal_score` ranking diagnostics (ROC-AUC, PR-AUC, best-F1 threshold), but
213
+ those numbers are still limited to the checked-in fixture corpus.
54
214
 
55
215
  ## Extending the corpus
56
216
 
@@ -111,11 +271,28 @@ in `.patina.default.yaml` (`stylometry.burstiness.bands`,
111
271
  classification. Sweep against this benchmark + your own corpus and
112
272
  update thresholds; the shipped values come from the v3.5.1 / v3.7
113
273
  calibration documented in `core/stylometry.md` §13 §16.
274
+ `stylometry.ko_diagnostics.bands` controls the ko-only composite. The private
275
+ KatFish calibration command below reports aggregate catch-rate and FP deltas
276
+ without committing external raw text:
277
+
278
+ ```bash
279
+ npm run benchmark:katfish-ko -- --write --basename katfish-ko-latest
280
+ ```
281
+
282
+ Treat that report as a KO diagnostic calibration artifact, not as a broad public
283
+ performance claim.
284
+
285
+ `npm run benchmark:report` also records a diagnostic `signal_score` sweep. The
286
+ prediction rule is `signal_score >= threshold`, and the PR-AUC value is average
287
+ precision over descending score groups. Use it to compare tuning candidates, not
288
+ as an authorship verdict.
114
289
 
115
290
  ## Languages
116
291
 
117
292
  Currently runs on all supported pattern-pack languages: `ko`, `en`, `zh`, and
118
293
  `ja`. Chinese and Japanese use a deterministic character-token fallback because
119
294
  normal prose often has no whitespace; ko/en keep whitespace tokenization.
120
- Language-specific zh/ja lexicons are still future work, so current zh/ja
121
- fixtures are mainly burstiness/MATTR regression coverage.
295
+ Korean additionally emits dependency-free spacing/comma/suffix-diversity
296
+ diagnostics and a conservative ko-only composite detector.
297
+ zh/ja now include high-precision AI-lexicon fixtures as well as
298
+ burstiness/MATTR regression coverage.
@@ -0,0 +1,10 @@
1
+ {"id":"adv-mps-ko-01","lang":"ko","register":"marketing","original":"μ›Œν¬μŠ€νŽ˜μ΄μŠ€λŠ” 회의둝, ν•  일, ν”„λ‘œμ νŠΈ 일정을 ν•œ κ³³μ—μ„œ κ΄€λ¦¬ν•œλ‹€. ν…œν”Œλ¦Ώμ€ 30개이며 λ³΅μ œν•΄μ„œ μˆ˜μ •ν•  수 μžˆλ‹€.","rewritten":"이 μ›Œν¬μŠ€νŽ˜μ΄μŠ€λŠ” 회의둝과 ν•  일, ν”„λ‘œμ νŠΈ 일정을 ν†΅ν•©μ μœΌλ‘œ 관리할 수 μžˆλŠ” ν˜μ‹ μ μΈ 생산성 μ†”λ£¨μ…˜μ΄μž 핡심 κ°€μΉ˜λ‘œ μžλ¦¬λ§€κΉ€ν•©λ‹ˆλ‹€. λ‹€μ–‘ν•œ 업무 λ§₯λ½μ—μ„œ ν…œν”Œλ¦Ώ 30개λ₯Ό ν™œμš©ν•  수 있으며, μ‚¬μš©μžλŠ” 이λ₯Ό λ³΅μ œν•΄μ„œ μˆ˜μ •ν•¨μœΌλ‘œμ¨ 업무 νš¨μœ¨μ„±μ„ κ·ΉλŒ€ν™”ν•  수 μžˆμŠ΅λ‹ˆλ‹€.","anchors":["회의둝","ν•  일","ν”„λ‘œμ νŠΈ 일정","ν…œν”Œλ¦Ώ 30개","λ³΅μ œν•΄μ„œ μˆ˜μ •"],"register_note":"Preserves facts but re-adds clustered marketing language."}
2
+ {"id":"adv-mps-ko-02","lang":"ko","register":"technical","original":"배치 μž‘μ—…μ€ 맀일 02μ‹œμ— μ‹€ν–‰λœλ‹€. μ‹€νŒ¨ν•˜λ©΄ μŠ¬λž™ μ•Œλ¦Όμ„ 보내고 μž¬μ‹œλ„λŠ” μ„Έ 번으둜 μ œν•œν•œλ‹€.","rewritten":"λ³Έ 배치 μž‘μ—…μ€ 맀일 02μ‹œμ— μ•ˆμ •μ μœΌλ‘œ μ‹€ν–‰λ˜λ„λ‘ μ„€κ³„λ˜μ–΄ 있으며, μ‹€νŒ¨ μƒν™©μ—μ„œλŠ” μŠ¬λž™ μ•Œλ¦Όμ„ 톡해 즉각적인 λŒ€μ‘μ„ μ§€μ›ν•©λ‹ˆλ‹€. λ˜ν•œ μž¬μ‹œλ„λŠ” μ„Έ 번으둜 μ œν•œν•˜λŠ” 체계적인 운영 λ°©μ‹μœΌλ‘œ 전체 운영 신뒰성을 ν–₯μƒμ‹œν‚€λŠ” 핡심 μ›μΉ™μœΌλ‘œ μžλ¦¬λ§€κΉ€ν•©λ‹ˆλ‹€.","anchors":["맀일 02μ‹œ","μŠ¬λž™ μ•Œλ¦Ό","μž¬μ‹œλ„λŠ” μ„Έ 번"],"register_note":"Operational facts preserved; AI register remains."}
3
+ {"id":"adv-mps-ko-03","lang":"ko","register":"academic","original":"μ‹€ν—˜μ—λŠ” μ €μž₯μ†Œ 60κ°œκ°€ 포함됐닀. 평균 μ„€μ • μ‹œκ°„μ€ 72μ‹œκ°„μ—μ„œ 10λΆ„μœΌλ‘œ μ€„μ—ˆκ³ , ν‘œλ³Έμ΄ μž‘μ•„ μΌλ°˜ν™”μ—λŠ” μ£Όμ˜κ°€ ν•„μš”ν•˜λ‹€.","rewritten":"λ³Έ μ‹€ν—˜μ€ μ €μž₯μ†Œ 60개λ₯Ό λŒ€μƒμœΌλ‘œ μˆ˜ν–‰λ˜μ—ˆμœΌλ©°, 평균 μ„€μ • μ‹œκ°„μ΄ 72μ‹œκ°„μ—μ„œ 10λΆ„μœΌλ‘œ κ°μ†Œν–ˆλ‹€λŠ” μ μ—μ„œ 후속 λ…Όμ˜μ˜ 핡심 기반이자 μ€‘μš”ν•œ 의미λ₯Ό μ§€λ‹™λ‹ˆλ‹€. λ‹€λ§Œ ν‘œλ³Έμ΄ μž‘κΈ° λ•Œλ¬Έμ— κ²°κ³Όλ₯Ό μΌλ°˜ν™”ν•˜λŠ” λ°μ—λŠ” μ‹ μ€‘ν•œ 접근이 ν•„μš”ν•˜λ‹€κ³  ν•  수 μžˆμŠ΅λ‹ˆλ‹€.","anchors":["μ €μž₯μ†Œ 60개","72μ‹œκ°„μ—μ„œ 10λΆ„","ν‘œλ³Έμ΄ μž‘","μΌλ°˜ν™”"],"register_note":"High MPS with clustered academic packaging."}
4
+ {"id":"adv-mps-ko-04","lang":"ko","register":"product-doc","original":"λŒ€μ‹œλ³΄λ“œλŠ” CSV 내보내기λ₯Ό μ§€μ›ν•œλ‹€. ν•„ν„°λŠ” νŒ€, κΈ°κ°„, μƒνƒœ μ„Έ κ°€μ§€λ‹€.","rewritten":"이 λŒ€μ‹œλ³΄λ“œλŠ” CSV 내보내기λ₯Ό μ§€μ›ν•¨μœΌλ‘œμ¨ 데이터 ν™œμš©μ„±μ„ λ†’μ΄λŠ” 데 κΈ°μ—¬ν•©λ‹ˆλ‹€. λ˜ν•œ νŒ€, κΈ°κ°„, μƒνƒœ μ„Έ κ°€μ§€ ν•„ν„°λ₯Ό μ œκ³΅ν•˜μ—¬ μ‚¬μš©μžκ°€ λ‹€μ–‘ν•œ κ΄€μ μ—μ„œ 정보λ₯Ό 효율적으둜 탐색할 수 μžˆλŠ” 업무 μƒνƒœκ³„μ™€ 핡심 운영 양상을 μ œκ³΅ν•©λ‹ˆλ‹€.","anchors":["CSV 내보내기","νŒ€","κΈ°κ°„","μƒνƒœ","μ„Έ κ°€μ§€ ν•„ν„°"],"register_note":"Product-doc facts preserved; support/efficiency wording recurs."}
5
+ {"id":"adv-mps-ko-05","lang":"ko","register":"policy","original":"μ‹ μ²­ 기간은 6μ›” 1일뢀터 6μ›” 14μΌκΉŒμ§€λ‹€. κ°œμΈμ€ 온라인 μ–‘μ‹μœΌλ‘œ μ ‘μˆ˜ν•˜κ³ , κ²°κ³ΌλŠ” 7μ›” 3일에 κ³΅κ°œλœλ‹€.","rewritten":"λ³Έ μ‹ μ²­ 기간은 6μ›” 1일뢀터 6μ›” 14μΌκΉŒμ§€λ‘œ 운영되며, κ°œμΈμ€ 온라인 양식을 톡해 μ ‘μˆ˜ν•  수 μžˆμŠ΅λ‹ˆλ‹€. κ²°κ³ΌλŠ” 7μ›” 3일에 곡개될 μ˜ˆμ •μœΌλ‘œ, μ‹ μ²­μžλŠ” ν•΄λ‹Ή 일정을 사전에 ν™•μΈν•˜λŠ” 것이 핡심이며 μ•ˆμ •μ μΈ μ ‘μˆ˜ μš΄μ˜μ— μ€‘μš”ν•œ 의미λ₯Ό μ§€λ‹™λ‹ˆλ‹€.","anchors":["6μ›” 1일뢀터 6μ›” 14μΌκΉŒμ§€","온라인 양식","7μ›” 3일"],"register_note":"Dates preserved; officialese packaging remains."}
6
+ {"id":"adv-mps-en-01","lang":"en","register":"marketing","original":"The app imports invoices, groups them by client, and exports a CSV summary at the end of each month.","rewritten":"The app provides a seamless workflow that imports invoices, groups them by client, and exports a CSV summary at the end of each month. This streamlined experience empowers teams to unlock more actionable monthly reporting without changing the underlying billing process.","anchors":["imports invoices","groups them by client","exports a CSV summary","end of each month"],"register_note":"Meaning preserved with dense AI-favored vocabulary."}
7
+ {"id":"adv-mps-en-02","lang":"en","register":"technical","original":"The cache expires after 24 hours. Users can force a manual refresh when debugging stale responses.","rewritten":"The cache is designed as a robust framework that expires after 24 hours while still enabling users to force a manual refresh when debugging stale responses. This approach offers a scalable and thoughtful balance between performance and developer control.","anchors":["expires after 24 hours","manual refresh","debugging stale responses"],"register_note":"Exact controls preserved; AI-like abstraction added."}
8
+ {"id":"adv-mps-en-03","lang":"en","register":"academic","original":"The survey covered 42 maintainers. Twenty-nine said review latency was the main blocker, but the sample was self-selected.","rewritten":"The survey covered 42 maintainers and surfaced a compelling insight: 29 respondents identified review latency as the main blocker. However, because the sample was self-selected, the findings should be interpreted through a nuanced and ethical research lens.","anchors":["42 maintainers","29 respondents","review latency","self-selected"],"register_note":"Numbers and caveat preserved with AI-signature phrasing."}
9
+ {"id":"adv-mps-en-04","lang":"en","register":"support","original":"Password reset links expire in 15 minutes. If a user requests another link, the older link stops working.","rewritten":"Password reset links expire in 15 minutes, creating a secure and user-friendly experience. If a user requests another link, the older link stops working, which helps align the reset workflow with modern account-safety expectations.","anchors":["expire in 15 minutes","requests another link","older link stops working"],"register_note":"Security behavior preserved; packaged UX framing added."}
10
+ {"id":"adv-mps-en-05","lang":"en","register":"strategy","original":"The team will cut weekly planning from 90 minutes to 45 minutes and keep Friday demos unchanged.","rewritten":"The team will streamline weekly planning from 90 minutes to 45 minutes while keeping Friday demos unchanged. This targeted adjustment can accelerate decision-making, bolster alignment, and create a more sustainable operating rhythm without disrupting the existing demo cadence.","anchors":["weekly planning","90 minutes to 45 minutes","Friday demos unchanged"],"register_note":"Schedule facts preserved; AI-favored strategy language remains."}
@@ -16,6 +16,8 @@ import yaml from 'js-yaml';
16
16
 
17
17
  import { analyzeText } from '../../src/features/index.js';
18
18
  import { loadLexicon } from '../../src/features/lexicon.js';
19
+ import { summarizeSignalStrength } from '../../src/features/signal-strength.js';
20
+ import { summarizeRanking } from './ranking-metrics.mjs';
19
21
 
20
22
  const __dirname = dirname(fileURLToPath(import.meta.url));
21
23
  const REPO_ROOT = resolve(__dirname, '../..');
@@ -89,6 +91,29 @@ function summarize(m) {
89
91
  };
90
92
  }
91
93
 
94
+ function rankingRecords(fixtures) {
95
+ return fixtures.map((fixture) => ({
96
+ score: fixture.signal_score,
97
+ expected: fixture.expected_hot,
98
+ }));
99
+ }
100
+
101
+ function summarizeRankingByLanguage(fixtures) {
102
+ const byLanguage = {};
103
+ for (const fixture of fixtures) {
104
+ byLanguage[fixture.lang] ||= [];
105
+ byLanguage[fixture.lang].push({
106
+ score: fixture.signal_score,
107
+ expected: fixture.expected_hot,
108
+ });
109
+ }
110
+ return Object.fromEntries(
111
+ Object.entries(byLanguage)
112
+ .sort(([a], [b]) => a.localeCompare(b))
113
+ .map(([lang, records]) => [lang, summarizeRanking(records)])
114
+ );
115
+ }
116
+
92
117
  function round(n, digits = 3) {
93
118
  return Math.round(n * 10 ** digits) / 10 ** digits;
94
119
  }
@@ -108,6 +133,7 @@ function wilsonInterval(successes, n, z = 1.959963984540054) {
108
133
  function detectorHot(result) {
109
134
  return {
110
135
  burstiness: result.paragraphs.some((p) => p.burstiness?.band === 'low'),
136
+ koDiagnostics: result.paragraphs.some((p) => p.koDiagnostics?.hot),
111
137
  mattr: result.paragraphs.some((p) => p.mattr?.band === 'low'),
112
138
  lexicon: result.paragraphs.some((p) => p.lexicon?.hot),
113
139
  };
@@ -116,6 +142,7 @@ function detectorHot(result) {
116
142
  function emptyDetectorMetrics() {
117
143
  return {
118
144
  burstiness: emptyMetrics(),
145
+ koDiagnostics: emptyMetrics(),
119
146
  mattr: emptyMetrics(),
120
147
  lexicon: emptyMetrics(),
121
148
  };
@@ -214,6 +241,10 @@ function main() {
214
241
  mattr_band: p.mattr?.band,
215
242
  lexicon_density: round(p.lexicon?.density ?? 0),
216
243
  lexicon_hits: p.lexicon?.hits ?? [],
244
+ ko_diagnostics_hot: Boolean(p.koDiagnostics?.hot),
245
+ ko_diagnostics_reasons: p.koDiagnostics?.reasons ?? [],
246
+ ko_diagnostics_strength: round(p.koDiagnostics?.strength ?? 0),
247
+ signal_score: round(summarizeSignalStrength(result.paragraphs)),
217
248
  };
218
249
  const pinned = expectedRanges[meta.fixture_id];
219
250
  if (!pinned) {
@@ -253,7 +284,7 @@ function main() {
253
284
  const overallCi = wilsonInterval(totalCorrect, totalCount);
254
285
 
255
286
  const results = {
256
- schemaVersion: 2,
287
+ schemaVersion: 3,
257
288
  fixtureSchemaVersion: FIXTURE_SCHEMA_VERSION,
258
289
  nodeVersion: process.version,
259
290
  generatedAt: new Date().toISOString(),
@@ -267,6 +298,12 @@ function main() {
267
298
  confidence_method: 'Wilson score interval, 95%',
268
299
  },
269
300
  perLanguage: summary,
301
+ ranking: {
302
+ note: 'Signal-score ranking over the checked-in fixture corpus; diagnostic only, not a public generalization claim.',
303
+ score: 'signal_score from the strongest deterministic paragraph trigger, averaged per fixture',
304
+ overall: summarizeRanking(rankingRecords(fixtureLog)),
305
+ perLanguage: summarizeRankingByLanguage(fixtureLog),
306
+ },
270
307
  fixtures: fixtureLog,
271
308
  };
272
309
 
@@ -277,6 +314,7 @@ function main() {
277
314
  if (!quiet) {
278
315
  console.log(`# Quality benchmark — ${fixtureLog.length} fixtures`);
279
316
  console.log(`Overall accuracy: ${(overallAccuracy * 100).toFixed(1)}%`);
317
+ console.log(`Signal ROC-AUC: ${results.ranking.overall.roc_auc.toFixed(3)} · PR-AUC: ${results.ranking.overall.pr_auc.toFixed(3)} · best-F1 threshold: ${results.ranking.overall.bestF1.threshold}`);
280
318
  console.log();
281
319
  console.log('| lang | n | accuracy | precision | recall | f1 | TP | FP | FN | TN |');
282
320
  console.log('|------|---|----------|-----------|--------|----|----|----|----|----|');
@@ -21,6 +21,8 @@ const TARGETS = [
21
21
  { file: 'README_ZH.md', lang: 'zh' },
22
22
  { file: 'README_JA.md', lang: 'ja' },
23
23
  { file: 'docs/FAQ.md', lang: 'en' },
24
+ { file: 'docs/social/signs-of-ai-writing.md', lang: 'en' },
25
+ { file: 'docs/social/signs-of-ai-writing_KR.md', lang: 'ko' },
24
26
  { file: 'SKILL.md', lang: 'ko' },
25
27
  ];
26
28
 
@@ -31,10 +33,10 @@ function scoreFile({ file, lang }) {
31
33
 
32
34
  const rows = TARGETS.map(scoreFile);
33
35
  console.log('# Dogfood docs score');
34
- console.log('| file | lang | paragraphs | hot | score | threshold |');
35
- console.log('|---|---|---:|---:|---:|---:|');
36
+ console.log('| file | lang | paragraphs | hot | score | signal | pattern hits | threshold |');
37
+ console.log('|---|---|---:|---:|---:|---:|---:|---:|');
36
38
  for (const r of rows) {
37
- console.log(`| ${r.file} | ${r.lang} | ${r.paragraphCount} | ${r.hotCount} | ${r.score.toFixed(1)} | ${THRESHOLD} |`);
39
+ console.log(`| ${r.file} | ${r.lang} | ${r.paragraphCount} | ${r.hotCount} | ${r.score.toFixed(1)} | ${r.signalScore.toFixed(1)} | ${r.patternHits} | ${THRESHOLD} |`);
38
40
  }
39
41
 
40
42
  const failures = rows.filter((r) => r.score > THRESHOLD);
@@ -0,0 +1,2 @@
1
+ {"fixture_id":"en-coffee-public-docs-01","language":"en","register":"public-docs","source_type":"synthetic-ai","model_family":"fixture","prompt_id":"live-quality-v1","redistribution":"repo-ok","facts":["coffee","Paris","Tokyo","coffee shops","climate change"],"text":"Coffee has emerged as a pivotal cultural phenomenon that has fundamentally transformed social interactions across the globe. This beloved beverage serves as a catalyst for community building, fosters meaningful connections, and facilitates cross-cultural dialogue. From the bustling cafés of Paris to the serene tea houses repurposed for coffee in Tokyo, this remarkable journey showcases the innovative spirit of human culinary exploration.\n\nThe proliferation of coffee shops in urban centers has created unprecedented opportunities for social engagement. Patrons from diverse backgrounds converge in these spaces, united by their shared appreciation for this aromatic brew. Furthermore, the ritual of coffee consumption has transcended mere sustenance, evolving into a cornerstone of modern social etiquette.\n\nIndustry experts agree that the coffee sector will continue its growth trajectory. Despite challenges related to climate change and supply chain disruptions, the future remains bright. This beverage will maintain its position as an indispensable component of global culture."}
2
+ {"fixture_id":"ko-coffee-public-docs-01","language":"ko","register":"public-docs","source_type":"synthetic-ai","model_family":"fixture","prompt_id":"live-quality-v1","redistribution":"repo-ok","facts":["컀피","μ„œμšΈ","λΆ€μ‚°","κΈ°ν›„ λ³€ν™”","곡급망"],"text":"μ»€ν”ΌλŠ” ν˜„λŒ€ μ‚¬νšŒμ  μƒν˜Έμž‘μš©μ„ 근본적으둜 λ³€ν™”μ‹œν‚¨ 핡심적인 λ¬Έν™” ν˜„μƒμœΌλ‘œ μžλ¦¬λ§€κΉ€ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. 이 μŒλ£ŒλŠ” 곡동체 ν˜•μ„±μ„ μ΄‰μ§„ν•˜κ³  의미 μžˆλŠ” 연결을 κ°€λŠ₯ν•˜κ²Œ ν•˜λ©°, λ‹€μ–‘ν•œ λ¬Έν™”κΆŒ μ‚¬μ΄μ˜ λŒ€ν™”λ₯Ό ν™œμ„±ν™”ν•˜λŠ” μ€‘μš”ν•œ 맀개체둜 κΈ°λŠ₯ν•©λ‹ˆλ‹€. μ„œμšΈμ˜ λ²ˆν™”ν•œ 카페 거리뢀터 λΆ€μ‚°μ˜ μ‘°μš©ν•œ λ‘œμŠ€ν„°λ¦¬κΉŒμ§€, 컀피 λ¬Έν™”λŠ” μΈκ°„μ˜ 창의적 식문화λ₯Ό λ³΄μ—¬μ£ΌλŠ” λŒ€ν‘œμ μΈ μ‚¬λ‘€μž…λ‹ˆλ‹€.\n\nλ„μ‹œ μ€‘μ‹¬λΆ€μ—μ„œ 컀피 전문점이 ν™•μ‚°λ˜λ©΄μ„œ μ‚¬νšŒμ  μ°Έμ—¬λ₯Ό μœ„ν•œ μ „λ‘€ μ—†λŠ” κΈ°νšŒκ°€ 창좜되고 μžˆμŠ΅λ‹ˆλ‹€. λ‹€μ–‘ν•œ 배경의 고객듀은 이 ν–₯기둜운 μŒλ£Œμ— λŒ€ν•œ κ³΅ν†΅λœ μ„ ν˜Έλ₯Ό λ°”νƒ•μœΌλ‘œ ν•œ 곡간에 λͺ¨μž…λ‹ˆλ‹€. λ‚˜μ•„κ°€ 컀피 μ†ŒλΉ„ μ˜λ‘€λŠ” λ‹¨μˆœν•œ κΈ°ν˜Έμ‹ν’ˆμ„ λ„˜μ–΄ ν˜„λŒ€μ  μƒν™œμ–‘μ‹μ˜ μ€‘μš”ν•œ ꡬ성 μš”μ†Œλ‘œ μ§„ν™”ν–ˆμŠ΅λ‹ˆλ‹€.\n\n업계 전문가듀은 컀피 산업이 μ•žμœΌλ‘œλ„ μ„±μž₯ ꢀ도λ₯Ό μœ μ§€ν•  것이라고 λ΄…λ‹ˆλ‹€. κΈ°ν›„ 변화와 곡급망 λΆˆμ•ˆμ΄λΌλŠ” κ³Όμ œκ°€ μ‘΄μž¬ν•˜μ§€λ§Œ, μ‹œμž₯의 λ―Έλž˜λŠ” μ—¬μ „νžˆ 밝닀고 ν‰κ°€λ©λ‹ˆλ‹€. μ»€ν”ΌλŠ” 세계 λ¬Έν™”μ˜ ν•„μˆ˜μ μΈ μš”μ†Œλ‘œ 계속 μžλ¦¬ν•  κ²ƒμž…λ‹ˆλ‹€."}