patina-cli 3.11.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.patina.default.yaml +29 -29
- package/CHANGELOG.md +53 -0
- package/NOTICE +21 -0
- package/README.md +117 -224
- package/README_JA.md +134 -77
- package/README_KR.md +132 -74
- package/README_ZH.md +137 -80
- package/SKILL.md +11 -20
- package/artifacts/rebaseline-2025/README.md +147 -0
- package/artifacts/rebaseline-2025/human-controls.public.jsonl +250 -0
- package/artifacts/rebaseline-2025/intake.example.jsonl +2 -0
- package/artifacts/rebaseline-2025/intake.local.example.jsonl +25 -0
- package/artifacts/rebaseline-2025/prompts.template.jsonl +7 -0
- package/artifacts/rebaseline-2025/sources.ko-public.jsonl +39 -0
- package/assets/brand/patina-badge.svg +18 -0
- package/assets/brand/patina-mark.svg +8 -0
- package/assets/demo/README.md +79 -0
- package/core/scoring.md +12 -12
- package/core/standalone-prompt.md +3 -1
- package/core/stylometry.md +93 -22
- package/docs/API.md +1554 -0
- package/docs/AUTHENTICATION.md +50 -26
- package/docs/AUTHENTICATION_KR.md +54 -29
- package/docs/BRANDING.md +9 -8
- package/docs/CLI.md +55 -14
- package/docs/COOKBOOK.md +8 -21
- package/docs/DEMO.md +32 -5
- package/docs/EXIT-CODES.md +2 -3
- package/docs/FALSE-POSITIVES.md +63 -0
- package/docs/FAQ.md +9 -1
- package/docs/FAQ_KR.md +3 -1
- package/docs/FLAG-PARITY.md +33 -47
- package/docs/ISSUE-WAVES.md +57 -0
- package/docs/PATTERNS-EN.md +67 -3
- package/docs/PATTERNS-JA.md +68 -2
- package/docs/PATTERNS-KO.md +70 -7
- package/docs/PATTERNS-ZH.md +67 -3
- package/docs/PATTERNS.md +5 -5
- package/docs/RESEARCH-DOCS-PLATFORM.md +54 -0
- package/docs/ROADMAP.md +46 -66
- package/docs/TRANSLATIONESE-KO.md +51 -0
- package/docs/audits/2026-05-deep-research.md +3 -1
- package/docs/benchmarks/README.md +51 -0
- package/docs/benchmarks/detector-comparison.json +69 -9
- package/docs/benchmarks/detector-comparison.md +10 -5
- package/docs/benchmarks/katfish-ko-latest.json +657 -0
- package/docs/benchmarks/katfish-ko-latest.md +77 -0
- package/docs/benchmarks/latest.json +1183 -108
- package/docs/benchmarks/latest.md +84 -60
- package/docs/benchmarks/lexicon-freshness-en-2026-05-22.json +1121 -0
- package/docs/benchmarks/lexicon-freshness-en-2026-05-22.md +136 -0
- package/docs/benchmarks/rebaseline-latest.json +381 -0
- package/docs/benchmarks/rebaseline-latest.md +121 -0
- package/docs/benchmarks/register-stratified-latest.json +164 -0
- package/docs/benchmarks/register-stratified-latest.md +99 -0
- package/docs/benchmarks/register-stratified.md +43 -0
- package/docs/integrations/github-action.md +44 -11
- package/docs/integrations/playground.md +58 -0
- package/docs/integrations/pre-commit.md +5 -5
- package/docs/integrations/release.md +5 -3
- package/docs/integrations/static-sites.md +83 -0
- package/docs/research/2025-rebaseline-plan.md +71 -2
- package/docs/research/2026-rebaseline.md +102 -0
- package/docs/research/adversarial-mps.md +41 -0
- package/docs/research/ai-human-metrics.md +35 -23
- package/docs/research/human-eval-panel.md +42 -0
- package/docs/research/judge-agreement.md +24 -0
- package/docs/research/ko-2025-corpus-sources.md +135 -0
- package/docs/research/lexicon-freshness-audit.md +64 -0
- package/docs/research/zh-ja-lexicon-calibration.md +60 -0
- package/docs/social/patina-launch-copy.md +173 -100
- package/docs/social/patina-launch-execution.md +94 -0
- package/docs/social/patina-launch-korean-first.md +83 -0
- package/docs/social/signs-of-ai-writing.md +26 -0
- package/docs/social/signs-of-ai-writing_KR.md +26 -0
- package/lexicon/ai-en.md +21 -24
- package/lexicon/ai-ja.md +158 -0
- package/lexicon/ai-ko.md +9 -9
- package/lexicon/ai-zh.md +158 -0
- package/lexicon/provenance/ai-en.json +970 -0
- package/lexicon/provenance/ai-ja.json +542 -0
- package/lexicon/provenance/ai-ko.json +866 -0
- package/lexicon/provenance/ai-zh.json +542 -0
- package/package.json +49 -8
- package/patterns/en-communication.md +5 -0
- package/patterns/en-content.md +5 -0
- package/patterns/en-filler.md +5 -0
- package/patterns/en-language.md +29 -1
- package/patterns/en-structure.md +5 -0
- package/patterns/en-style.md +5 -0
- package/patterns/en-viral-hook.md +42 -2
- package/patterns/ja-communication.md +5 -0
- package/patterns/ja-content.md +5 -0
- package/patterns/ja-filler.md +5 -0
- package/patterns/ja-language.md +33 -1
- package/patterns/ja-structure.md +12 -0
- package/patterns/ja-style.md +5 -0
- package/patterns/ja-viral-hook.md +41 -2
- package/patterns/ko-communication.md +5 -0
- package/patterns/ko-content.md +5 -0
- package/patterns/ko-filler.md +5 -0
- package/patterns/ko-language.md +33 -1
- package/patterns/ko-structure.md +25 -6
- package/patterns/ko-style.md +5 -0
- package/patterns/ko-viral-hook.md +38 -2
- package/patterns/zh-communication.md +5 -0
- package/patterns/zh-content.md +5 -0
- package/patterns/zh-filler.md +5 -0
- package/patterns/zh-language.md +37 -1
- package/patterns/zh-structure.md +12 -0
- package/patterns/zh-style.md +5 -0
- package/patterns/zh-viral-hook.md +38 -2
- package/playground/README.md +55 -0
- package/playground/analytics.js +4 -0
- package/playground/analyzer.js +883 -0
- package/playground/app.js +157 -0
- package/playground/data/lexicons.js +343 -0
- package/playground/index.html +138 -0
- package/playground/styles.css +267 -0
- package/profiles/namuwiki.md +111 -0
- package/scripts/adversarial-mps-report.mjs +201 -0
- package/scripts/badge-json.mjs +79 -0
- package/scripts/benchmark-report.mjs +56 -9
- package/scripts/check-release-metadata.mjs +0 -2
- package/scripts/detector-comparison.mjs +7 -7
- package/scripts/generate-playground-data.mjs +77 -0
- package/scripts/katfish-calibration.mjs +464 -0
- package/scripts/lexicon-freshness.mjs +485 -0
- package/scripts/lint.mjs +1 -1
- package/scripts/precommit-score.mjs +4 -3
- package/scripts/prose-score.mjs +81 -5
- package/scripts/rebaseline-intake.mjs +242 -0
- package/scripts/rebaseline-score.mjs +268 -0
- package/scripts/rebaseline-summary.mjs +773 -0
- package/scripts/rebaseline-web-collect.mjs +410 -0
- package/scripts/update-benchmark-ranges.mjs +1 -0
- package/src/api.js +69 -105
- package/src/auth.js +50 -2
- package/src/backends/claude-cli.js +19 -4
- package/src/backends/codex-cli.js +19 -3
- package/src/backends/contract.js +230 -1
- package/src/backends/gemini-cli.js +18 -5
- package/src/backends/index.js +87 -12
- package/src/backends/kimi-cli.js +161 -0
- package/src/cli.js +577 -567
- package/src/commands/doctor.js +2 -2
- package/src/config.js +29 -0
- package/src/errors.js +53 -1
- package/src/features/discourse-tells.js +68 -0
- package/src/features/index.js +82 -8
- package/src/features/lexicon.js +40 -6
- package/src/features/markup-leakage.js +69 -0
- package/src/features/segment.js +41 -0
- package/src/features/signal-strength.js +81 -0
- package/src/features/stylometry.js +231 -1
- package/src/features/translationese.js +127 -0
- package/src/loader.js +76 -0
- package/src/logger.js +22 -23
- package/src/model-defaults.js +55 -0
- package/src/ouroboros.js +31 -0
- package/src/output.js +102 -90
- package/src/prompt-builder.js +103 -68
- package/src/providers.js +51 -4
- package/src/scoring.js +210 -2
- package/src/security.js +75 -0
- package/tests/fixtures/live-quality/en/public-docs-01.md +26 -0
- package/tests/fixtures/live-quality/ko/public-docs-01.md +26 -0
- package/tests/fixtures/suspect-zones/expected-ranges.json +207 -16
- package/tests/fixtures/suspect-zones/ja/ai/ja-ai-04-lexicon.md +11 -0
- package/tests/fixtures/suspect-zones/ja/natural/ja-nat-04-lexicon-cold.md +11 -0
- package/tests/fixtures/suspect-zones/ko/ai/ko-ai-02.md +4 -5
- package/tests/fixtures/suspect-zones/ko/ai/ko-ai-07-ko-diagnostic.md +11 -0
- package/tests/fixtures/suspect-zones/zh/ai/zh-ai-04-lexicon.md +11 -0
- package/tests/fixtures/suspect-zones/zh/natural/zh-nat-04-lexicon-cold.md +11 -0
- package/tests/quality/README.md +188 -11
- package/tests/quality/adversarial-mps/fixtures.jsonl +10 -0
- package/tests/quality/benchmark.mjs +39 -1
- package/tests/quality/dogfood.mjs +5 -3
- package/tests/quality/live-fixtures.jsonl +2 -0
- package/tests/quality/live-quality.mjs +596 -0
- package/tests/quality/ranking-metrics.mjs +136 -0
- package/tests/quality/rebaseline-manifest.example.jsonl +5 -0
- package/vercel.json +53 -0
- package/SKILL-MAX.md +0 -455
- package/docs/internal/HARNESS.md +0 -14
- package/docs/internal/README.md +0 -14
- package/docs/internal/WARP.md +0 -23
- package/patina-max/SKILL.md +0 -523
- package/patina-max/composite.py +0 -457
- package/src/cache.js +0 -106
- package/src/commands/init.js +0 -208
- package/src/manifest.js +0 -162
- package/src/max-mode.js +0 -207
package/tests/quality/README.md
CHANGED
|
@@ -14,6 +14,7 @@ Outputs:
|
|
|
14
14
|
- A markdown table per language (accuracy, precision, recall, F1, confusion matrix)
|
|
15
15
|
- A list of any misclassified fixtures with their feature values
|
|
16
16
|
- `tests/quality/results.json` — full per-fixture log (gitignored)
|
|
17
|
+
- `docs/benchmarks/README.md` — report index, refresh commands, and public-claim rules
|
|
17
18
|
- `docs/benchmarks/latest.md` / `latest.json` when run via `npm run benchmark:report`
|
|
18
19
|
- `docs/benchmarks/detector-comparison.md` / `.json` when run via `npm run benchmark:compare`
|
|
19
20
|
|
|
@@ -23,34 +24,193 @@ Every fixture under `tests/fixtures/suspect-zones/{lang}/{ai|natural}/*.md`
|
|
|
23
24
|
carries an `expected_hot` label in its frontmatter. The benchmark runs
|
|
24
25
|
`analyzeText()` (defined in `src/features/index.js`) on the body and
|
|
25
26
|
compares the predicted hot/cold decision against that label. The decision
|
|
26
|
-
follows the
|
|
27
|
+
follows the 4-signal OR rule from `core/stylometry.md` §16:
|
|
27
28
|
|
|
28
29
|
```
|
|
29
30
|
paragraph is SUSPECT iff
|
|
30
31
|
burstiness_band == "low" OR
|
|
31
32
|
MATTR_band == "low" OR
|
|
32
|
-
lexicon_density > threshold
|
|
33
|
+
(lexicon_density > threshold AND lexicon_min_hits is satisfied) OR
|
|
34
|
+
koDiagnostics.hot == true
|
|
33
35
|
```
|
|
34
36
|
|
|
37
|
+
`burstiness_band` is only assigned when a paragraph has at least three
|
|
38
|
+
sentences; two-sentence CV is recorded for diagnostics but is not stable enough
|
|
39
|
+
to classify a paragraph by itself. For ko/zh/ja, a single lexicon hit is also
|
|
40
|
+
only an audit hint; the default hot threshold requires at least two CJK hits
|
|
41
|
+
to make the paragraph hot by itself.
|
|
42
|
+
|
|
43
|
+
For `lang=ko`, `analyzeText()` also records Korean diagnostic fields:
|
|
44
|
+
`spacing`, `comma`, and `posDiversity` (a suffix-class proxy, not a morphology
|
|
45
|
+
analyzer). They only affect the hot/cold decision through the conservative
|
|
46
|
+
`koDiagnostics` composite: at least four sentences, at least 20 eojeols, fewer than one comma per sentence, regular eojeol length (`CV <= 0.38`), and low suffix-class diversity (`classDiversity <= 0.26`).
|
|
47
|
+
|
|
35
48
|
Per-language metrics use `expected_hot=true` as the positive class.
|
|
36
49
|
|
|
50
|
+
## Opt-in live rewrite quality
|
|
51
|
+
|
|
52
|
+
`npm run quality:live` runs the live-quality runner without calling a model by
|
|
53
|
+
default. The default path scores fixture inputs and marks the live rewrite step
|
|
54
|
+
as skipped, so it is safe for local smoke checks and CI dry-runs.
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
npm run quality:live
|
|
58
|
+
npm run quality:live -- --json
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
To run actual rewrites, opt in explicitly with an OpenAI-compatible provider.
|
|
62
|
+
Use `PATINA_LIVE_*` so this stays a deliberate local/manual probe rather than a
|
|
63
|
+
per-PR network dependency:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
PATINA_LIVE=1 \
|
|
67
|
+
PATINA_LIVE_PROVIDER=gemini \
|
|
68
|
+
PATINA_LIVE_API_KEY=... \
|
|
69
|
+
npm run quality:live -- --language ko --limit 1
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Supported live settings:
|
|
73
|
+
|
|
74
|
+
- `PATINA_LIVE_PROVIDER` — provider preset (`openai`, `gemini`, `groq`,
|
|
75
|
+
`kimi`, `moonshot`, `together`).
|
|
76
|
+
- `PATINA_LIVE_API_KEY` — live-run key; falls back to the provider key or
|
|
77
|
+
`PATINA_API_KEY`.
|
|
78
|
+
- `PATINA_LIVE_MODEL` / `PATINA_LIVE_API_BASE` / `PATINA_LIVE_TIMEOUT_MS`.
|
|
79
|
+
|
|
80
|
+
The fixture set lives in `tests/fixtures/live-quality/{en,ko}/*.md` with YAML
|
|
81
|
+
frontmatter (`fixture_id`, `language`, optional `profile`, `anchors`,
|
|
82
|
+
`expected_focus`) plus the body text. The legacy
|
|
83
|
+
`tests/quality/live-fixtures.jsonl` remains loadable via `--fixtures`.
|
|
84
|
+
|
|
85
|
+
Live reports are structured JSON or Markdown with:
|
|
86
|
+
|
|
87
|
+
- `schema_version`, redacted settings, and policy floors.
|
|
88
|
+
- `before_score` / `after_score` from model-graded `scoreText`.
|
|
89
|
+
- `mps` from `scoreMPS`.
|
|
90
|
+
- `fidelity` from `scoreFidelity`.
|
|
91
|
+
- `pass`, `warn`, `error`, or `skipped` per fixture.
|
|
92
|
+
|
|
93
|
+
A live rewrite passes when `after_score <= 30`, MPS is at least 70, fidelity is
|
|
94
|
+
at least 70, and the AI score improved. Missing credentials, provider failures,
|
|
95
|
+
schema failures, and MPS/fidelity floor violations are `error` and exit
|
|
96
|
+
nonzero; AI-score target misses remain `warn` so the report is still usable.
|
|
97
|
+
Keep this out of mandatory CI unless the live model path is deliberately
|
|
98
|
+
allowed, because LLM output is non-deterministic and may incur provider cost.
|
|
99
|
+
|
|
100
|
+
## Adversarial MPS fixtures
|
|
101
|
+
|
|
102
|
+
`npm run quality:adversarial-mps` validates a small, repo-owned fixture set
|
|
103
|
+
where explicit meaning anchors are preserved but AI-like wording remains. This
|
|
104
|
+
guards against treating MPS as a humanness score.
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
npm run quality:adversarial-mps
|
|
108
|
+
node scripts/adversarial-mps-report.mjs --check --json
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Inputs live in `tests/quality/adversarial-mps/fixtures.jsonl`; the report is
|
|
112
|
+
written to `docs/research/adversarial-mps.md`. The gate is:
|
|
113
|
+
|
|
114
|
+
- anchor-MPS proxy ≥90;
|
|
115
|
+
- deterministic AI score ≥60;
|
|
116
|
+
- no private or scraped source text.
|
|
117
|
+
|
|
118
|
+
If this gate passes, the case is intentionally adversarial: meaning survived,
|
|
119
|
+
but style still needs work. Ouroboros selection should prefer candidates that
|
|
120
|
+
pass MPS and lower the AI score, rather than letting high MPS hide
|
|
121
|
+
recurring AI markers.
|
|
122
|
+
|
|
123
|
+
## 2025+ rebaseline manifest
|
|
124
|
+
|
|
125
|
+
`npm run benchmark:rebaseline` validates the public JSONL manifest scaffold and
|
|
126
|
+
prints matrix coverage. It does not collect text from vendors, call external
|
|
127
|
+
detectors, or turn a small sample into a headline claim.
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
npm run benchmark:rebaseline
|
|
131
|
+
npm run benchmark:rebaseline:report
|
|
132
|
+
node scripts/rebaseline-summary.mjs --input tests/quality/rebaseline-manifest.example.jsonl --json
|
|
133
|
+
npm run benchmark:rebaseline:intake -- --input artifacts/rebaseline-2025/intake.example.jsonl --dry-run
|
|
134
|
+
npm run benchmark:rebaseline:intake -- --input artifacts/rebaseline-2025/intake.local.example.jsonl --dry-run --require-source-review
|
|
135
|
+
npm run benchmark:rebaseline:web -- --target-per-register 50 --max-per-source 12 --collected-at 2026-05-22
|
|
136
|
+
npm run benchmark:rebaseline:score -- --input artifacts/rebaseline-2025/private/web-human-controls.generated.private.jsonl --output artifacts/rebaseline-2025/human-controls.public.jsonl --scored-at 2026-05-22
|
|
137
|
+
node scripts/rebaseline-summary.mjs --input artifacts/rebaseline-2025/human-controls.public.jsonl --json
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Each row records the source metadata needed by
|
|
141
|
+
`docs/research/2025-rebaseline-plan.md`: `sample_id`, `language`, `class`,
|
|
142
|
+
`register`, `model_family`, `provider`, `model`, `generated_at`, `prompt_id`,
|
|
143
|
+
`decoding`, `postprocess`, `redistribution`, and `text_hash`. Full `text` is
|
|
144
|
+
allowed only for redistributable rows (`repo-ok`, `redistributable`, public
|
|
145
|
+
license values). Private or vendor-copied rows must stay metadata-only and use
|
|
146
|
+
hashes.
|
|
147
|
+
|
|
148
|
+
For local/private corpus intake, use `npm run benchmark:rebaseline:intake`.
|
|
149
|
+
It computes missing `text_hash` values and writes a public manifest that strips
|
|
150
|
+
full text from non-redistributable rows while preserving the full row in the
|
|
151
|
+
gitignored private output. Use `--require-source-review` before pilot reports so
|
|
152
|
+
non-public rows must explain their redistribution status through `source_review`
|
|
153
|
+
or `reviewer_notes`. The tracked `artifacts/rebaseline-2025/intake.example.jsonl`
|
|
154
|
+
fixture and `artifacts/rebaseline-2025/intake.local.example.jsonl` 25-row
|
|
155
|
+
template are smoke checks only; real corpus rows stay local until a license
|
|
156
|
+
review says otherwise.
|
|
157
|
+
|
|
158
|
+
`artifacts/rebaseline-2025/human-controls.public.jsonl` is the first tracked
|
|
159
|
+
web-sourced Korean human-control candidate manifest. It is metadata/hash-only:
|
|
160
|
+
no raw source text is committed. Its deterministic outcome fields are register-stratified false-positive
|
|
161
|
+
evidence; public catch-rate claims require positive AI-like rows and claim-cell coverage, now provided by `rebaseline-2026.scored.public.jsonl` for KO+EN.
|
|
162
|
+
|
|
163
|
+
The #155 report is claim-ready only when the process gate is satisfied: scored outcome rows, at least three generator families across at least two languages, n≥100 per claim cell, and confidence intervals. The checked-in 2026 manifest now satisfies that gate for KO+EN.
|
|
164
|
+
|
|
165
|
+
`npm run benchmark:rebaseline:report` refreshes
|
|
166
|
+
`docs/benchmarks/rebaseline-latest.md` and `.json`. Use `tests/quality/rebaseline-manifest.example.jsonl` for a BLOCKED smoke fixture; use `artifacts/rebaseline-2025/rebaseline-2026.scored.public.jsonl` for the current READY public report.
|
|
167
|
+
|
|
168
|
+
## Score vs signal strength
|
|
169
|
+
|
|
170
|
+
The pre-commit prose gate keeps the older, conservative score semantics:
|
|
171
|
+
|
|
172
|
+
```text
|
|
173
|
+
score = hot_paragraphs / total_paragraphs * 100
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
That binary ratio decides pass/fail because it is stable for CI. The report also
|
|
177
|
+
prints two diagnostics:
|
|
178
|
+
|
|
179
|
+
- `signal` — average paragraph intensity of the strongest deterministic trigger:
|
|
180
|
+
how far burstiness or MATTR is inside its low band, how far lexicon density
|
|
181
|
+
is over the threshold, or how strong the Korean diagnostic composite is.
|
|
182
|
+
- `pattern hits` — count of pattern-pack watch terms found in the stripped prose.
|
|
183
|
+
This is diagnostic only; it helps reviewers see pattern-level cleanup that may
|
|
184
|
+
not change the binary hot-paragraph ratio.
|
|
185
|
+
|
|
186
|
+
Treat both as editing diagnostics, not separate authorship verdicts or CI gates.
|
|
187
|
+
The prose gate uses the default deterministic thresholds and the current
|
|
188
|
+
Markdown pattern packs. Runtime scoring may use project config thresholds, so
|
|
189
|
+
compare `signal` values within the same entrypoint rather than across tools.
|
|
190
|
+
|
|
191
|
+
Report person-written paragraphs that cross the gate through the false-positive
|
|
192
|
+
form: <https://github.com/devswha/patina/issues/new?template=false_positive.yml>.
|
|
193
|
+
Include the exact paragraph, language/register, score output, and whether the
|
|
194
|
+
sample can become a public fixture.
|
|
195
|
+
|
|
37
196
|
## What it does NOT measure
|
|
38
197
|
|
|
39
198
|
- LLM-based scoring (`src/scoring.js`). The LLM is non-deterministic by
|
|
40
199
|
design and adds API cost / latency, so it stays out of this layer.
|
|
41
200
|
A separate live-mode benchmark would be its own follow-up.
|
|
42
|
-
-
|
|
43
|
-
|
|
44
|
-
|
|
201
|
+
- Mandatory rewrite quality gates. Live rewrite quality lives in
|
|
202
|
+
`tests/quality/live-quality.mjs` and remains opt-in because it can shell out
|
|
203
|
+
to OpenCode:
|
|
45
204
|
|
|
46
205
|
```bash
|
|
47
|
-
OPENCODE_AVAILABLE=1
|
|
206
|
+
OPENCODE_AVAILABLE=1 npm run quality:live -- --limit 1
|
|
48
207
|
```
|
|
49
208
|
|
|
50
|
-
The
|
|
209
|
+
The scaffold uses `opencode/hy3-preview-free` by default. Override it with
|
|
51
210
|
`OPENCODE_MODEL=<provider/model>` when testing another OpenCode model.
|
|
52
|
-
-
|
|
53
|
-
|
|
211
|
+
- Generalized model-era detector claims. The report now includes
|
|
212
|
+
`signal_score` ranking diagnostics (ROC-AUC, PR-AUC, best-F1 threshold), but
|
|
213
|
+
those numbers are still limited to the checked-in fixture corpus.
|
|
54
214
|
|
|
55
215
|
## Extending the corpus
|
|
56
216
|
|
|
@@ -111,11 +271,28 @@ in `.patina.default.yaml` (`stylometry.burstiness.bands`,
|
|
|
111
271
|
classification. Sweep against this benchmark + your own corpus and
|
|
112
272
|
update thresholds; the shipped values come from the v3.5.1 / v3.7
|
|
113
273
|
calibration documented in `core/stylometry.md` §13 §16.
|
|
274
|
+
`stylometry.ko_diagnostics.bands` controls the ko-only composite. The private
|
|
275
|
+
KatFish calibration command below reports aggregate catch-rate and FP deltas
|
|
276
|
+
without committing external raw text:
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
npm run benchmark:katfish-ko -- --write --basename katfish-ko-latest
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
Treat that report as a KO diagnostic calibration artifact, not as a broad public
|
|
283
|
+
performance claim.
|
|
284
|
+
|
|
285
|
+
`npm run benchmark:report` also records a diagnostic `signal_score` sweep. The
|
|
286
|
+
prediction rule is `signal_score >= threshold`, and the PR-AUC value is average
|
|
287
|
+
precision over descending score groups. Use it to compare tuning candidates, not
|
|
288
|
+
as an authorship verdict.
|
|
114
289
|
|
|
115
290
|
## Languages
|
|
116
291
|
|
|
117
292
|
Currently runs on all supported pattern-pack languages: `ko`, `en`, `zh`, and
|
|
118
293
|
`ja`. Chinese and Japanese use a deterministic character-token fallback because
|
|
119
294
|
normal prose often has no whitespace; ko/en keep whitespace tokenization.
|
|
120
|
-
|
|
121
|
-
|
|
295
|
+
Korean additionally emits dependency-free spacing/comma/suffix-diversity
|
|
296
|
+
diagnostics and a conservative ko-only composite detector.
|
|
297
|
+
zh/ja now include high-precision AI-lexicon fixtures as well as
|
|
298
|
+
burstiness/MATTR regression coverage.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{"id":"adv-mps-ko-01","lang":"ko","register":"marketing","original":"μν¬μ€νμ΄μ€λ νμλ‘, ν μΌ, νλ‘μ νΈ μΌμ μ ν κ³³μμ κ΄λ¦¬νλ€. ν
νλ¦Ώμ 30κ°μ΄λ©° 볡μ ν΄μ μμ ν μ μλ€.","rewritten":"μ΄ μν¬μ€νμ΄μ€λ νμλ‘κ³Ό ν μΌ, νλ‘μ νΈ μΌμ μ ν΅ν©μ μΌλ‘ κ΄λ¦¬ν μ μλ νμ μ μΈ μμ°μ± μ루μ
μ΄μ ν΅μ¬ κ°μΉλ‘ μ리맀κΉν©λλ€. λ€μν μ
무 λ§₯λ½μμ ν
νλ¦Ώ 30κ°λ₯Ό νμ©ν μ μμΌλ©°, μ¬μ©μλ μ΄λ₯Ό 볡μ ν΄μ μμ ν¨μΌλ‘μ¨ μ
무 ν¨μ¨μ±μ κ·Ήλνν μ μμ΅λλ€.","anchors":["νμλ‘","ν μΌ","νλ‘μ νΈ μΌμ ","ν
νλ¦Ώ 30κ°","볡μ ν΄μ μμ "],"register_note":"Preserves facts but re-adds clustered marketing language."}
|
|
2
|
+
{"id":"adv-mps-ko-02","lang":"ko","register":"technical","original":"λ°°μΉ μμ
μ λ§€μΌ 02μμ μ€νλλ€. μ€ν¨νλ©΄ μ¬λ μλ¦Όμ 보λ΄κ³ μ¬μλλ μΈ λ²μΌλ‘ μ ννλ€.","rewritten":"λ³Έ λ°°μΉ μμ
μ λ§€μΌ 02μμ μμ μ μΌλ‘ μ€νλλλ‘ μ€κ³λμ΄ μμΌλ©°, μ€ν¨ μν©μμλ μ¬λ μλ¦Όμ ν΅ν΄ μ¦κ°μ μΈ λμμ μ§μν©λλ€. λν μ¬μλλ μΈ λ²μΌλ‘ μ ννλ 체κ³μ μΈ μ΄μ λ°©μμΌλ‘ μ 체 μ΄μ μ λ’°μ±μ ν₯μμν€λ ν΅μ¬ μμΉμΌλ‘ μ리맀κΉν©λλ€.","anchors":["λ§€μΌ 02μ","μ¬λ μλ¦Ό","μ¬μλλ μΈ λ²"],"register_note":"Operational facts preserved; AI register remains."}
|
|
3
|
+
{"id":"adv-mps-ko-03","lang":"ko","register":"academic","original":"μ€νμλ μ μ₯μ 60κ°κ° ν¬ν¨λλ€. νκ· μ€μ μκ°μ 72μκ°μμ 10λΆμΌλ‘ μ€μκ³ , νλ³Έμ΄ μμ μΌλ°νμλ μ£Όμκ° νμνλ€.","rewritten":"λ³Έ μ€νμ μ μ₯μ 60κ°λ₯Ό λμμΌλ‘ μνλμμΌλ©°, νκ· μ€μ μκ°μ΄ 72μκ°μμ 10λΆμΌλ‘ κ°μνλ€λ μ μμ νμ λ
Όμμ ν΅μ¬ κΈ°λ°μ΄μ μ€μν μλ―Έλ₯Ό μ§λλλ€. λ€λ§ νλ³Έμ΄ μκΈ° λλ¬Έμ κ²°κ³Όλ₯Ό μΌλ°ννλ λ°μλ μ μ€ν μ κ·Όμ΄ νμνλ€κ³ ν μ μμ΅λλ€.","anchors":["μ μ₯μ 60κ°","72μκ°μμ 10λΆ","νλ³Έμ΄ μ","μΌλ°ν"],"register_note":"High MPS with clustered academic packaging."}
|
|
4
|
+
{"id":"adv-mps-ko-04","lang":"ko","register":"product-doc","original":"λμ보λλ CSV λ΄λ³΄λ΄κΈ°λ₯Ό μ§μνλ€. νν°λ ν, κΈ°κ°, μν μΈ κ°μ§λ€.","rewritten":"μ΄ λμ보λλ CSV λ΄λ³΄λ΄κΈ°λ₯Ό μ§μν¨μΌλ‘μ¨ λ°μ΄ν° νμ©μ±μ λμ΄λ λ° κΈ°μ¬ν©λλ€. λν ν, κΈ°κ°, μν μΈ κ°μ§ νν°λ₯Ό μ 곡νμ¬ μ¬μ©μκ° λ€μν κ΄μ μμ μ 보λ₯Ό ν¨μ¨μ μΌλ‘ νμν μ μλ μ
무 μνκ³μ ν΅μ¬ μ΄μ μμμ μ 곡ν©λλ€.","anchors":["CSV λ΄λ³΄λ΄κΈ°","ν","κΈ°κ°","μν","μΈ κ°μ§ νν°"],"register_note":"Product-doc facts preserved; support/efficiency wording recurs."}
|
|
5
|
+
{"id":"adv-mps-ko-05","lang":"ko","register":"policy","original":"μ μ² κΈ°κ°μ 6μ 1μΌλΆν° 6μ 14μΌκΉμ§λ€. κ°μΈμ μ¨λΌμΈ μμμΌλ‘ μ μνκ³ , κ²°κ³Όλ 7μ 3μΌμ 곡κ°λλ€.","rewritten":"λ³Έ μ μ² κΈ°κ°μ 6μ 1μΌλΆν° 6μ 14μΌκΉμ§λ‘ μ΄μλλ©°, κ°μΈμ μ¨λΌμΈ μμμ ν΅ν΄ μ μν μ μμ΅λλ€. κ²°κ³Όλ 7μ 3μΌμ 곡κ°λ μμ μΌλ‘, μ μ²μλ ν΄λΉ μΌμ μ μ¬μ μ νμΈνλ κ²μ΄ ν΅μ¬μ΄λ©° μμ μ μΈ μ μ μ΄μμ μ€μν μλ―Έλ₯Ό μ§λλλ€.","anchors":["6μ 1μΌλΆν° 6μ 14μΌκΉμ§","μ¨λΌμΈ μμ","7μ 3μΌ"],"register_note":"Dates preserved; officialese packaging remains."}
|
|
6
|
+
{"id":"adv-mps-en-01","lang":"en","register":"marketing","original":"The app imports invoices, groups them by client, and exports a CSV summary at the end of each month.","rewritten":"The app provides a seamless workflow that imports invoices, groups them by client, and exports a CSV summary at the end of each month. This streamlined experience empowers teams to unlock more actionable monthly reporting without changing the underlying billing process.","anchors":["imports invoices","groups them by client","exports a CSV summary","end of each month"],"register_note":"Meaning preserved with dense AI-favored vocabulary."}
|
|
7
|
+
{"id":"adv-mps-en-02","lang":"en","register":"technical","original":"The cache expires after 24 hours. Users can force a manual refresh when debugging stale responses.","rewritten":"The cache is designed as a robust framework that expires after 24 hours while still enabling users to force a manual refresh when debugging stale responses. This approach offers a scalable and thoughtful balance between performance and developer control.","anchors":["expires after 24 hours","manual refresh","debugging stale responses"],"register_note":"Exact controls preserved; AI-like abstraction added."}
|
|
8
|
+
{"id":"adv-mps-en-03","lang":"en","register":"academic","original":"The survey covered 42 maintainers. Twenty-nine said review latency was the main blocker, but the sample was self-selected.","rewritten":"The survey covered 42 maintainers and surfaced a compelling insight: 29 respondents identified review latency as the main blocker. However, because the sample was self-selected, the findings should be interpreted through a nuanced and ethical research lens.","anchors":["42 maintainers","29 respondents","review latency","self-selected"],"register_note":"Numbers and caveat preserved with AI-signature phrasing."}
|
|
9
|
+
{"id":"adv-mps-en-04","lang":"en","register":"support","original":"Password reset links expire in 15 minutes. If a user requests another link, the older link stops working.","rewritten":"Password reset links expire in 15 minutes, creating a secure and user-friendly experience. If a user requests another link, the older link stops working, which helps align the reset workflow with modern account-safety expectations.","anchors":["expire in 15 minutes","requests another link","older link stops working"],"register_note":"Security behavior preserved; packaged UX framing added."}
|
|
10
|
+
{"id":"adv-mps-en-05","lang":"en","register":"strategy","original":"The team will cut weekly planning from 90 minutes to 45 minutes and keep Friday demos unchanged.","rewritten":"The team will streamline weekly planning from 90 minutes to 45 minutes while keeping Friday demos unchanged. This targeted adjustment can accelerate decision-making, bolster alignment, and create a more sustainable operating rhythm without disrupting the existing demo cadence.","anchors":["weekly planning","90 minutes to 45 minutes","Friday demos unchanged"],"register_note":"Schedule facts preserved; AI-favored strategy language remains."}
|
|
@@ -16,6 +16,8 @@ import yaml from 'js-yaml';
|
|
|
16
16
|
|
|
17
17
|
import { analyzeText } from '../../src/features/index.js';
|
|
18
18
|
import { loadLexicon } from '../../src/features/lexicon.js';
|
|
19
|
+
import { summarizeSignalStrength } from '../../src/features/signal-strength.js';
|
|
20
|
+
import { summarizeRanking } from './ranking-metrics.mjs';
|
|
19
21
|
|
|
20
22
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
21
23
|
const REPO_ROOT = resolve(__dirname, '../..');
|
|
@@ -89,6 +91,29 @@ function summarize(m) {
|
|
|
89
91
|
};
|
|
90
92
|
}
|
|
91
93
|
|
|
94
|
+
function rankingRecords(fixtures) {
|
|
95
|
+
return fixtures.map((fixture) => ({
|
|
96
|
+
score: fixture.signal_score,
|
|
97
|
+
expected: fixture.expected_hot,
|
|
98
|
+
}));
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
function summarizeRankingByLanguage(fixtures) {
|
|
102
|
+
const byLanguage = {};
|
|
103
|
+
for (const fixture of fixtures) {
|
|
104
|
+
byLanguage[fixture.lang] ||= [];
|
|
105
|
+
byLanguage[fixture.lang].push({
|
|
106
|
+
score: fixture.signal_score,
|
|
107
|
+
expected: fixture.expected_hot,
|
|
108
|
+
});
|
|
109
|
+
}
|
|
110
|
+
return Object.fromEntries(
|
|
111
|
+
Object.entries(byLanguage)
|
|
112
|
+
.sort(([a], [b]) => a.localeCompare(b))
|
|
113
|
+
.map(([lang, records]) => [lang, summarizeRanking(records)])
|
|
114
|
+
);
|
|
115
|
+
}
|
|
116
|
+
|
|
92
117
|
function round(n, digits = 3) {
|
|
93
118
|
return Math.round(n * 10 ** digits) / 10 ** digits;
|
|
94
119
|
}
|
|
@@ -108,6 +133,7 @@ function wilsonInterval(successes, n, z = 1.959963984540054) {
|
|
|
108
133
|
function detectorHot(result) {
|
|
109
134
|
return {
|
|
110
135
|
burstiness: result.paragraphs.some((p) => p.burstiness?.band === 'low'),
|
|
136
|
+
koDiagnostics: result.paragraphs.some((p) => p.koDiagnostics?.hot),
|
|
111
137
|
mattr: result.paragraphs.some((p) => p.mattr?.band === 'low'),
|
|
112
138
|
lexicon: result.paragraphs.some((p) => p.lexicon?.hot),
|
|
113
139
|
};
|
|
@@ -116,6 +142,7 @@ function detectorHot(result) {
|
|
|
116
142
|
function emptyDetectorMetrics() {
|
|
117
143
|
return {
|
|
118
144
|
burstiness: emptyMetrics(),
|
|
145
|
+
koDiagnostics: emptyMetrics(),
|
|
119
146
|
mattr: emptyMetrics(),
|
|
120
147
|
lexicon: emptyMetrics(),
|
|
121
148
|
};
|
|
@@ -214,6 +241,10 @@ function main() {
|
|
|
214
241
|
mattr_band: p.mattr?.band,
|
|
215
242
|
lexicon_density: round(p.lexicon?.density ?? 0),
|
|
216
243
|
lexicon_hits: p.lexicon?.hits ?? [],
|
|
244
|
+
ko_diagnostics_hot: Boolean(p.koDiagnostics?.hot),
|
|
245
|
+
ko_diagnostics_reasons: p.koDiagnostics?.reasons ?? [],
|
|
246
|
+
ko_diagnostics_strength: round(p.koDiagnostics?.strength ?? 0),
|
|
247
|
+
signal_score: round(summarizeSignalStrength(result.paragraphs)),
|
|
217
248
|
};
|
|
218
249
|
const pinned = expectedRanges[meta.fixture_id];
|
|
219
250
|
if (!pinned) {
|
|
@@ -253,7 +284,7 @@ function main() {
|
|
|
253
284
|
const overallCi = wilsonInterval(totalCorrect, totalCount);
|
|
254
285
|
|
|
255
286
|
const results = {
|
|
256
|
-
schemaVersion:
|
|
287
|
+
schemaVersion: 3,
|
|
257
288
|
fixtureSchemaVersion: FIXTURE_SCHEMA_VERSION,
|
|
258
289
|
nodeVersion: process.version,
|
|
259
290
|
generatedAt: new Date().toISOString(),
|
|
@@ -267,6 +298,12 @@ function main() {
|
|
|
267
298
|
confidence_method: 'Wilson score interval, 95%',
|
|
268
299
|
},
|
|
269
300
|
perLanguage: summary,
|
|
301
|
+
ranking: {
|
|
302
|
+
note: 'Signal-score ranking over the checked-in fixture corpus; diagnostic only, not a public generalization claim.',
|
|
303
|
+
score: 'signal_score from the strongest deterministic paragraph trigger, averaged per fixture',
|
|
304
|
+
overall: summarizeRanking(rankingRecords(fixtureLog)),
|
|
305
|
+
perLanguage: summarizeRankingByLanguage(fixtureLog),
|
|
306
|
+
},
|
|
270
307
|
fixtures: fixtureLog,
|
|
271
308
|
};
|
|
272
309
|
|
|
@@ -277,6 +314,7 @@ function main() {
|
|
|
277
314
|
if (!quiet) {
|
|
278
315
|
console.log(`# Quality benchmark — ${fixtureLog.length} fixtures`);
|
|
279
316
|
console.log(`Overall accuracy: ${(overallAccuracy * 100).toFixed(1)}%`);
|
|
317
|
+
console.log(`Signal ROC-AUC: ${results.ranking.overall.roc_auc.toFixed(3)} · PR-AUC: ${results.ranking.overall.pr_auc.toFixed(3)} · best-F1 threshold: ${results.ranking.overall.bestF1.threshold}`);
|
|
280
318
|
console.log();
|
|
281
319
|
console.log('| lang | n | accuracy | precision | recall | f1 | TP | FP | FN | TN |');
|
|
282
320
|
console.log('|------|---|----------|-----------|--------|----|----|----|----|----|');
|
|
@@ -21,6 +21,8 @@ const TARGETS = [
|
|
|
21
21
|
{ file: 'README_ZH.md', lang: 'zh' },
|
|
22
22
|
{ file: 'README_JA.md', lang: 'ja' },
|
|
23
23
|
{ file: 'docs/FAQ.md', lang: 'en' },
|
|
24
|
+
{ file: 'docs/social/signs-of-ai-writing.md', lang: 'en' },
|
|
25
|
+
{ file: 'docs/social/signs-of-ai-writing_KR.md', lang: 'ko' },
|
|
24
26
|
{ file: 'SKILL.md', lang: 'ko' },
|
|
25
27
|
];
|
|
26
28
|
|
|
@@ -31,10 +33,10 @@ function scoreFile({ file, lang }) {
|
|
|
31
33
|
|
|
32
34
|
const rows = TARGETS.map(scoreFile);
|
|
33
35
|
console.log('# Dogfood docs score');
|
|
34
|
-
console.log('| file | lang | paragraphs | hot | score | threshold |');
|
|
35
|
-
console.log('
|
|
36
|
+
console.log('| file | lang | paragraphs | hot | score | signal | pattern hits | threshold |');
|
|
37
|
+
console.log('|---|---|---:|---:|---:|---:|---:|---:|');
|
|
36
38
|
for (const r of rows) {
|
|
37
|
-
console.log(`| ${r.file} | ${r.lang} | ${r.paragraphCount} | ${r.hotCount} | ${r.score.toFixed(1)} | ${THRESHOLD} |`);
|
|
39
|
+
console.log(`| ${r.file} | ${r.lang} | ${r.paragraphCount} | ${r.hotCount} | ${r.score.toFixed(1)} | ${r.signalScore.toFixed(1)} | ${r.patternHits} | ${THRESHOLD} |`);
|
|
38
40
|
}
|
|
39
41
|
|
|
40
42
|
const failures = rows.filter((r) => r.score > THRESHOLD);
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
{"fixture_id":"en-coffee-public-docs-01","language":"en","register":"public-docs","source_type":"synthetic-ai","model_family":"fixture","prompt_id":"live-quality-v1","redistribution":"repo-ok","facts":["coffee","Paris","Tokyo","coffee shops","climate change"],"text":"Coffee has emerged as a pivotal cultural phenomenon that has fundamentally transformed social interactions across the globe. This beloved beverage serves as a catalyst for community building, fosters meaningful connections, and facilitates cross-cultural dialogue. From the bustling cafés of Paris to the serene tea houses repurposed for coffee in Tokyo, this remarkable journey showcases the innovative spirit of human culinary exploration.\n\nThe proliferation of coffee shops in urban centers has created unprecedented opportunities for social engagement. Patrons from diverse backgrounds converge in these spaces, united by their shared appreciation for this aromatic brew. Furthermore, the ritual of coffee consumption has transcended mere sustenance, evolving into a cornerstone of modern social etiquette.\n\nIndustry experts agree that the coffee sector will continue its growth trajectory. Despite challenges related to climate change and supply chain disruptions, the future remains bright. This beverage will maintain its position as an indispensable component of global culture."}
|
|
2
|
+
{"fixture_id":"ko-coffee-public-docs-01","language":"ko","register":"public-docs","source_type":"synthetic-ai","model_family":"fixture","prompt_id":"live-quality-v1","redistribution":"repo-ok","facts":["컀νΌ","μμΈ","λΆμ°","κΈ°ν λ³ν","곡κΈλ§"],"text":"컀νΌλ νλ μ¬νμ μνΈμμ©μ κ·Όλ³Έμ μΌλ‘ λ³νμν¨ ν΅μ¬μ μΈ λ¬Έν νμμΌλ‘ μ리맀κΉνκ³ μμ΅λλ€. μ΄ μλ£λ 곡λ체 νμ±μ μ΄μ§νκ³ μλ―Έ μλ μ°κ²°μ κ°λ₯νκ² νλ©°, λ€μν λ¬ΈνκΆ μ¬μ΄μ λνλ₯Ό νμ±ννλ μ€μν λ§€κ°μ²΄λ‘ κΈ°λ₯ν©λλ€. μμΈμ λ²νν μΉ΄ν 거리λΆν° λΆμ°μ μ‘°μ©ν λ‘μ€ν°λ¦¬κΉμ§, μ»€νΌ λ¬Ένλ μΈκ°μ μ°½μμ μλ¬Ένλ₯Ό 보μ¬μ£Όλ λνμ μΈ μ¬λ‘μ
λλ€.\n\nλμ μ€μ¬λΆμμ μ»€νΌ μ λ¬Έμ μ΄ νμ°λλ©΄μ μ¬νμ μ°Έμ¬λ₯Ό μν μ λ‘ μλ κΈ°νκ° μ°½μΆλκ³ μμ΅λλ€. λ€μν λ°°κ²½μ κ³ κ°λ€μ μ΄ ν₯κΈ°λ‘μ΄ μλ£μ λν 곡ν΅λ μ νΈλ₯Ό λ°νμΌλ‘ ν 곡κ°μ λͺ¨μ
λλ€. λμκ° μ»€νΌ μλΉ μλ‘λ λ¨μν κΈ°νΈμνμ λμ΄ νλμ μνμμμ μ€μν κ΅¬μ± μμλ‘ μ§ννμ΅λλ€.\n\nμ
κ³ μ λ¬Έκ°λ€μ μ»€νΌ μ°μ
μ΄ μμΌλ‘λ μ±μ₯ κΆ€λλ₯Ό μ μ§ν κ²μ΄λΌκ³ λ΄
λλ€. κΈ°ν λ³νμ 곡κΈλ§ λΆμμ΄λΌλ κ³Όμ κ° μ‘΄μ¬νμ§λ§, μμ₯μ λ―Έλλ μ¬μ ν λ°λ€κ³ νκ°λ©λλ€. 컀νΌλ μΈκ³ λ¬Ένμ νμμ μΈ μμλ‘ κ³μ μ리ν κ²μ
λλ€."}
|