patina-cli 3.11.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.patina.default.yaml +29 -29
- package/CHANGELOG.md +53 -0
- package/NOTICE +21 -0
- package/README.md +117 -224
- package/README_JA.md +134 -77
- package/README_KR.md +132 -74
- package/README_ZH.md +137 -80
- package/SKILL.md +11 -20
- package/artifacts/rebaseline-2025/README.md +147 -0
- package/artifacts/rebaseline-2025/human-controls.public.jsonl +250 -0
- package/artifacts/rebaseline-2025/intake.example.jsonl +2 -0
- package/artifacts/rebaseline-2025/intake.local.example.jsonl +25 -0
- package/artifacts/rebaseline-2025/prompts.template.jsonl +7 -0
- package/artifacts/rebaseline-2025/sources.ko-public.jsonl +39 -0
- package/assets/brand/patina-badge.svg +18 -0
- package/assets/brand/patina-mark.svg +8 -0
- package/assets/demo/README.md +79 -0
- package/core/scoring.md +12 -12
- package/core/standalone-prompt.md +3 -1
- package/core/stylometry.md +93 -22
- package/docs/API.md +1554 -0
- package/docs/AUTHENTICATION.md +50 -26
- package/docs/AUTHENTICATION_KR.md +54 -29
- package/docs/BRANDING.md +9 -8
- package/docs/CLI.md +55 -14
- package/docs/COOKBOOK.md +8 -21
- package/docs/DEMO.md +32 -5
- package/docs/EXIT-CODES.md +2 -3
- package/docs/FALSE-POSITIVES.md +63 -0
- package/docs/FAQ.md +9 -1
- package/docs/FAQ_KR.md +3 -1
- package/docs/FLAG-PARITY.md +33 -47
- package/docs/ISSUE-WAVES.md +57 -0
- package/docs/PATTERNS-EN.md +67 -3
- package/docs/PATTERNS-JA.md +68 -2
- package/docs/PATTERNS-KO.md +70 -7
- package/docs/PATTERNS-ZH.md +67 -3
- package/docs/PATTERNS.md +5 -5
- package/docs/RESEARCH-DOCS-PLATFORM.md +54 -0
- package/docs/ROADMAP.md +46 -66
- package/docs/TRANSLATIONESE-KO.md +51 -0
- package/docs/audits/2026-05-deep-research.md +3 -1
- package/docs/benchmarks/README.md +51 -0
- package/docs/benchmarks/detector-comparison.json +69 -9
- package/docs/benchmarks/detector-comparison.md +10 -5
- package/docs/benchmarks/katfish-ko-latest.json +657 -0
- package/docs/benchmarks/katfish-ko-latest.md +77 -0
- package/docs/benchmarks/latest.json +1183 -108
- package/docs/benchmarks/latest.md +84 -60
- package/docs/benchmarks/lexicon-freshness-en-2026-05-22.json +1121 -0
- package/docs/benchmarks/lexicon-freshness-en-2026-05-22.md +136 -0
- package/docs/benchmarks/rebaseline-latest.json +381 -0
- package/docs/benchmarks/rebaseline-latest.md +121 -0
- package/docs/benchmarks/register-stratified-latest.json +164 -0
- package/docs/benchmarks/register-stratified-latest.md +99 -0
- package/docs/benchmarks/register-stratified.md +43 -0
- package/docs/integrations/github-action.md +44 -11
- package/docs/integrations/playground.md +58 -0
- package/docs/integrations/pre-commit.md +5 -5
- package/docs/integrations/release.md +5 -3
- package/docs/integrations/static-sites.md +83 -0
- package/docs/research/2025-rebaseline-plan.md +71 -2
- package/docs/research/2026-rebaseline.md +102 -0
- package/docs/research/adversarial-mps.md +41 -0
- package/docs/research/ai-human-metrics.md +35 -23
- package/docs/research/human-eval-panel.md +42 -0
- package/docs/research/judge-agreement.md +24 -0
- package/docs/research/ko-2025-corpus-sources.md +135 -0
- package/docs/research/lexicon-freshness-audit.md +64 -0
- package/docs/research/zh-ja-lexicon-calibration.md +60 -0
- package/docs/social/patina-launch-copy.md +173 -100
- package/docs/social/patina-launch-execution.md +94 -0
- package/docs/social/patina-launch-korean-first.md +83 -0
- package/docs/social/signs-of-ai-writing.md +26 -0
- package/docs/social/signs-of-ai-writing_KR.md +26 -0
- package/lexicon/ai-en.md +21 -24
- package/lexicon/ai-ja.md +158 -0
- package/lexicon/ai-ko.md +9 -9
- package/lexicon/ai-zh.md +158 -0
- package/lexicon/provenance/ai-en.json +970 -0
- package/lexicon/provenance/ai-ja.json +542 -0
- package/lexicon/provenance/ai-ko.json +866 -0
- package/lexicon/provenance/ai-zh.json +542 -0
- package/package.json +49 -8
- package/patterns/en-communication.md +5 -0
- package/patterns/en-content.md +5 -0
- package/patterns/en-filler.md +5 -0
- package/patterns/en-language.md +29 -1
- package/patterns/en-structure.md +5 -0
- package/patterns/en-style.md +5 -0
- package/patterns/en-viral-hook.md +42 -2
- package/patterns/ja-communication.md +5 -0
- package/patterns/ja-content.md +5 -0
- package/patterns/ja-filler.md +5 -0
- package/patterns/ja-language.md +33 -1
- package/patterns/ja-structure.md +12 -0
- package/patterns/ja-style.md +5 -0
- package/patterns/ja-viral-hook.md +41 -2
- package/patterns/ko-communication.md +5 -0
- package/patterns/ko-content.md +5 -0
- package/patterns/ko-filler.md +5 -0
- package/patterns/ko-language.md +33 -1
- package/patterns/ko-structure.md +25 -6
- package/patterns/ko-style.md +5 -0
- package/patterns/ko-viral-hook.md +38 -2
- package/patterns/zh-communication.md +5 -0
- package/patterns/zh-content.md +5 -0
- package/patterns/zh-filler.md +5 -0
- package/patterns/zh-language.md +37 -1
- package/patterns/zh-structure.md +12 -0
- package/patterns/zh-style.md +5 -0
- package/patterns/zh-viral-hook.md +38 -2
- package/playground/README.md +55 -0
- package/playground/analytics.js +4 -0
- package/playground/analyzer.js +883 -0
- package/playground/app.js +157 -0
- package/playground/data/lexicons.js +343 -0
- package/playground/index.html +138 -0
- package/playground/styles.css +267 -0
- package/profiles/namuwiki.md +111 -0
- package/scripts/adversarial-mps-report.mjs +201 -0
- package/scripts/badge-json.mjs +79 -0
- package/scripts/benchmark-report.mjs +56 -9
- package/scripts/check-release-metadata.mjs +0 -2
- package/scripts/detector-comparison.mjs +7 -7
- package/scripts/generate-playground-data.mjs +77 -0
- package/scripts/katfish-calibration.mjs +464 -0
- package/scripts/lexicon-freshness.mjs +485 -0
- package/scripts/lint.mjs +1 -1
- package/scripts/precommit-score.mjs +4 -3
- package/scripts/prose-score.mjs +81 -5
- package/scripts/rebaseline-intake.mjs +242 -0
- package/scripts/rebaseline-score.mjs +268 -0
- package/scripts/rebaseline-summary.mjs +773 -0
- package/scripts/rebaseline-web-collect.mjs +410 -0
- package/scripts/update-benchmark-ranges.mjs +1 -0
- package/src/api.js +69 -105
- package/src/auth.js +50 -2
- package/src/backends/claude-cli.js +19 -4
- package/src/backends/codex-cli.js +19 -3
- package/src/backends/contract.js +230 -1
- package/src/backends/gemini-cli.js +18 -5
- package/src/backends/index.js +87 -12
- package/src/backends/kimi-cli.js +161 -0
- package/src/cli.js +577 -567
- package/src/commands/doctor.js +2 -2
- package/src/config.js +29 -0
- package/src/errors.js +53 -1
- package/src/features/discourse-tells.js +68 -0
- package/src/features/index.js +82 -8
- package/src/features/lexicon.js +40 -6
- package/src/features/markup-leakage.js +69 -0
- package/src/features/segment.js +41 -0
- package/src/features/signal-strength.js +81 -0
- package/src/features/stylometry.js +231 -1
- package/src/features/translationese.js +127 -0
- package/src/loader.js +76 -0
- package/src/logger.js +22 -23
- package/src/model-defaults.js +55 -0
- package/src/ouroboros.js +31 -0
- package/src/output.js +102 -90
- package/src/prompt-builder.js +103 -68
- package/src/providers.js +51 -4
- package/src/scoring.js +210 -2
- package/src/security.js +75 -0
- package/tests/fixtures/live-quality/en/public-docs-01.md +26 -0
- package/tests/fixtures/live-quality/ko/public-docs-01.md +26 -0
- package/tests/fixtures/suspect-zones/expected-ranges.json +207 -16
- package/tests/fixtures/suspect-zones/ja/ai/ja-ai-04-lexicon.md +11 -0
- package/tests/fixtures/suspect-zones/ja/natural/ja-nat-04-lexicon-cold.md +11 -0
- package/tests/fixtures/suspect-zones/ko/ai/ko-ai-02.md +4 -5
- package/tests/fixtures/suspect-zones/ko/ai/ko-ai-07-ko-diagnostic.md +11 -0
- package/tests/fixtures/suspect-zones/zh/ai/zh-ai-04-lexicon.md +11 -0
- package/tests/fixtures/suspect-zones/zh/natural/zh-nat-04-lexicon-cold.md +11 -0
- package/tests/quality/README.md +188 -11
- package/tests/quality/adversarial-mps/fixtures.jsonl +10 -0
- package/tests/quality/benchmark.mjs +39 -1
- package/tests/quality/dogfood.mjs +5 -3
- package/tests/quality/live-fixtures.jsonl +2 -0
- package/tests/quality/live-quality.mjs +596 -0
- package/tests/quality/ranking-metrics.mjs +136 -0
- package/tests/quality/rebaseline-manifest.example.jsonl +5 -0
- package/vercel.json +53 -0
- package/SKILL-MAX.md +0 -455
- package/docs/internal/HARNESS.md +0 -14
- package/docs/internal/README.md +0 -14
- package/docs/internal/WARP.md +0 -23
- package/patina-max/SKILL.md +0 -523
- package/patina-max/composite.py +0 -457
- package/src/cache.js +0 -106
- package/src/commands/init.js +0 -208
- package/src/manifest.js +0 -162
- package/src/max-mode.js +0 -207
package/docs/PATTERNS-ZH.md
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
This page expands the Chinese pattern packs into a browsable reference. It is generated from `patterns/zh-*.md`, so the numbers, names, watch words, fire conditions, and examples mirror the source pattern files.
|
|
4
4
|
|
|
5
|
-
- Rewrite-capable patterns:
|
|
6
|
-
- Score/audit-only viral-hook patterns:
|
|
5
|
+
- Rewrite-capable patterns: 33
|
|
6
|
+
- Score/audit-only viral-hook patterns: 9
|
|
7
7
|
- Main selector: [PATTERNS.md](PATTERNS.md)
|
|
8
8
|
|
|
9
9
|
## Pattern Index
|
|
@@ -23,6 +23,7 @@ This page expands the Chinese pattern packs into a browsable reference. It is ge
|
|
|
23
23
|
| 11 | rewrite | 同义词循环替换 | [zh-language.md](../patterns/zh-language.md) |
|
|
24
24
|
| 12 | rewrite | 冗长的介词结构 | [zh-language.md](../patterns/zh-language.md) |
|
|
25
25
|
| 32 | rewrite | “更”比较副词滥用 | [zh-language.md](../patterns/zh-language.md) |
|
|
26
|
+
| 33 | rewrite | 定义式隐喻等式("X 是 Z 的[抽象名词]") | [zh-language.md](../patterns/zh-language.md) |
|
|
26
27
|
| 13 | rewrite | 过度使用连接词/过渡词 | [zh-style.md](../patterns/zh-style.md) |
|
|
27
28
|
| 14 | rewrite | 加粗滥用 | [zh-style.md](../patterns/zh-style.md) |
|
|
28
29
|
| 15 | rewrite | 内联标题列表 | [zh-style.md](../patterns/zh-style.md) |
|
|
@@ -50,6 +51,7 @@ This page expands the Chinese pattern packs into a browsable reference. It is ge
|
|
|
50
51
|
| VH-6 | score/audit only | 伪统计引用 | [zh-viral-hook.md](../patterns/zh-viral-hook.md) |
|
|
51
52
|
| VH-7 | score/audit only | 头衔堆叠式权威 | [zh-viral-hook.md](../patterns/zh-viral-hook.md) |
|
|
52
53
|
| VH-8 | score/audit only | 未来自我 / 拟亲密二人称承诺 | [zh-viral-hook.md](../patterns/zh-viral-hook.md) |
|
|
54
|
+
| VH-9 | score/audit only | 格言式收尾 / 独立判断短句(伪深刻"金句") | [zh-viral-hook.md](../patterns/zh-viral-hook.md) |
|
|
53
55
|
|
|
54
56
|
## 内容模式
|
|
55
57
|
|
|
@@ -252,7 +254,7 @@ Example after:
|
|
|
252
254
|
- Source: [zh-language.md](../patterns/zh-language.md)
|
|
253
255
|
- Type: rewrite-capable pattern
|
|
254
256
|
- Watch words: 更具体、更高效、更全面、更深入、更清晰、更积极、更系统、更有效、更稳健、更广泛、更优质、更可持续、更具战略性
|
|
255
|
-
- Fire condition: 同一文档中出现2个以上“更 +
|
|
257
|
+
- Fire condition: 同一文档中出现2个以上“更 + 形容词/副词/抽象名词化短语”,且没有明确比较对象、基准或指标。若同一段落中只有1处“更X”,但同时出现“系统推进”“全面提升”“战略框架”“协同机制”等正式AI标记,也可以触发。
|
|
256
258
|
- Example files: [failure](../examples/zh-32-failure-01.md) · [success](../examples/zh-32-success-01.md)
|
|
257
259
|
|
|
258
260
|
Example before:
|
|
@@ -263,6 +265,36 @@ Example after:
|
|
|
263
265
|
|
|
264
266
|
> 下一阶段先做三件事:把里程碑拆到每周,给每个任务指定负责人,重新核对预算表。产品、运营和法务每周二开30分钟会,专门处理跨部门卡住的事项。
|
|
265
267
|
|
|
268
|
+
### 33. 定义式隐喻等式("X 是 Z 的[抽象名词]")
|
|
269
|
+
|
|
270
|
+
- Source: [zh-language.md](../patterns/zh-language.md)
|
|
271
|
+
- Type: rewrite-capable pattern
|
|
272
|
+
|
|
273
|
+
**注意词汇(抽象名词标记):** 签名、印记、注脚、形状、轮廓、语言、货币、架构、骨架、引擎、心跳、脉搏、命脉、底色、基石、基因/DNA、底层逻辑、本质、缩影
|
|
274
|
+
|
|
275
|
+
**问题:** AI 中文喜欢用一个系动词判断句把两个东西强行划等号——"X 是 Z 的[抽象名词]"——借宏大的隐喻制造深刻感。"对称是信任的架构""尴尬是你主动选择的那条梯度留下的可见印记"之类的句子听上去很有哲理,实则只是把一个抽象概念套在另一个上面,没有给出任何可验证的支撑。它常常单独成句,充当段落的"金句"。
|
|
276
|
+
|
|
277
|
+
**与 #8(系动词回避)的区分(务必不要混淆):** #8 标记的是**回避"是"**——把"X 是 Y"写成"X 充当 Y""X 起到 Y 的作用""X 相当于 Y",修改方向是**改回"是"**。#33 恰好相反:它标记的是已经用了"是"、却把"是"膨胀成"X 是 Z 的[抽象名词]"这种**夸张的隐喻等式**,修改方向是**拆掉这个等式、换成具体陈述**。一个是缺"是",一个是"是"被滥用造句,两者不是同一回事。
|
|
278
|
+
|
|
279
|
+
**触发条件:** 同一文档/段落中出现 2 处以上“X 是 Z 的[抽象名词]”形式的判断句(抽象名词取自上方标记词或同类),这些等式依赖宏大隐喻而不是具体证据成立。单次出现只记作 audit 提示,不直接进入 rewrite。
|
|
280
|
+
|
|
281
|
+
- Example files: [failure](../examples/zh-33-failure-01.md) · [success](../examples/zh-33-success-01.md)
|
|
282
|
+
**排除条件:**
|
|
283
|
+
- 字面或技术性定义("水是万能溶剂""线粒体是细胞的能量工厂""哈希表是一种以键值映射为基础的数据结构")
|
|
284
|
+
- 约定俗成的固定说法或事实性等同("北京是中国的首都""黄河是中华民族的母亲河"作为既有惯用语)
|
|
285
|
+
- 后文用具体事实、数据或机制真正撑起这个等式的等同关系(不是空泛比喻,而是有论据的论断)
|
|
286
|
+
|
|
287
|
+
**Semantic Risk:** MEDIUM
|
|
288
|
+
**Preservation Note:** 拆解隐喻等式时,先判断作者到底想说什么真实主张:是"Z 依赖 X"、"X 能预测 Z",还是"X 是 Z 的前提条件"?保留这个真实关系和它的方向、强度,只去掉"是…的[抽象名词]"这层装饰;若原文除了比喻之外没有任何可保留的信息,就改成一个具体的事实、动作或观察。
|
|
289
|
+
|
|
290
|
+
**修改前:**
|
|
291
|
+
> 对称是信任的架构。在产品设计里,一致性是用户体验的命脉,而留白则是高级感的语言。
|
|
292
|
+
|
|
293
|
+
**修改后:**
|
|
294
|
+
> 界面左右对齐、控件间距统一时,用户更容易预判下一步操作,投诉里"找不到按钮"的比例下降了。我们在两版改版里都验证过这一点。
|
|
295
|
+
|
|
296
|
+
---
|
|
297
|
+
|
|
266
298
|
## 风格模式
|
|
267
299
|
|
|
268
300
|
### 13. 过度使用连接词/过渡词
|
|
@@ -528,6 +560,7 @@ Example after:
|
|
|
528
560
|
- Type: rewrite-capable pattern
|
|
529
561
|
- Watch words: 这是一个~的事实、可以被认为是、作为一种~的方式、被~所~、对于~来说、基于~的原因、关于~的问题、通过~来实现
|
|
530
562
|
- Fire condition: 同一段落中出现2个以上翻译腔表达。单次使用可容忍——正常中文偶尔也会出现。
|
|
563
|
+
- Context guard: issue #352 — when translationese appears around an em dash, colon, slash, or comma splice, do not replace only the punctuation; rebuild the clause relation in natural Chinese, e.g. `无 TUI 设置` → `不用 TUI 就完成全自动安装...`.
|
|
531
564
|
- Example files: [failure](../examples/zh-26-failure-01.md) · [success](../examples/zh-26-success-01.md)
|
|
532
565
|
|
|
533
566
|
Example before:
|
|
@@ -704,3 +737,34 @@ Detection example:
|
|
|
704
737
|
> 改写前:朋友,先收藏这篇。一年后的你一定会感谢现在的自己。
|
|
705
738
|
>
|
|
706
739
|
> 改写后:如果你下个月要做计划,可以保存这份清单。
|
|
740
|
+
|
|
741
|
+
### Viral 9. 格言式收尾 / 独立判断短句(伪深刻"金句")
|
|
742
|
+
|
|
743
|
+
- Source: [zh-viral-hook.md](../patterns/zh-viral-hook.md)
|
|
744
|
+
|
|
745
|
+
**关注词汇:**(结构性模式——按形式判断,不看具体词汇)
|
|
746
|
+
|
|
747
|
+
**问题:** 一句很短(约 10 字以内)、语法完整的判断句被单独放在一行或一段,借这种"留白 + 收束"的排版制造庄重感和深刻感——典型的伪深刻"金句""mic drop"。一篇里出现 2 句及以上,或每段都用这样一句收尾时,就成了 AI 网红长文的强信号。它靠的是**形式**(独立、短、斩钉截铁),而不是某个特定词汇。
|
|
748
|
+
|
|
749
|
+
**触发条件:** 出现独立成行/成段、语法完整、约 10 字以内的判断式短句,且其作用是修辞性收束(金句、点题),而非传递新信息。一篇出现 1 句即可记为低信号,2 句及以上信号显著。
|
|
750
|
+
|
|
751
|
+
**严重度标尺:**
|
|
752
|
+
- Low:全篇仅 1 句独立格言短句。
|
|
753
|
+
- Medium:出现 2 句独立格言短句。
|
|
754
|
+
- High:出现 3 句及以上,或几乎每个段落都以一句这样的短句收尾。
|
|
755
|
+
|
|
756
|
+
**排除条件:**
|
|
757
|
+
- 诗、歌词、韵文
|
|
758
|
+
- 本就简短的备忘、通知、提示、对问题的一两句回答
|
|
759
|
+
- 有意为之且后文有具体支撑的格言(不是空泛点题,而是引出论据)
|
|
760
|
+
- 对话、引语
|
|
761
|
+
- 标题、小标题
|
|
762
|
+
|
|
763
|
+
**Semantic Risk:** LOW —— 仅评分,重写时不动。
|
|
764
|
+
**Preservation Note:** 该模式默认仅评分不重写;若用户明确要求降调,保留这句话真正想表达的判断或重点,把它并回相邻段落或补上具体支撑,而不是简单删掉,以免抹掉作者的核心观点。
|
|
765
|
+
**改写前 / 改写后示例(手动降信号):**
|
|
766
|
+
> 改写前:增长是结果,不是目标。
|
|
767
|
+
> 真正重要的从来不是数字。
|
|
768
|
+
> 对称会变成陷阱。
|
|
769
|
+
>
|
|
770
|
+
> 改写后:我们今年没有把月活当成首要指标,而是盯着留存率和复购,因为一味追规模上次让我们烧掉了两个季度的预算。
|
package/docs/PATTERNS.md
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
# Pattern Catalog
|
|
2
2
|
|
|
3
|
-
Patina ships
|
|
3
|
+
Patina ships 168 pattern entries across four languages. The language-specific references below expand each pack with pattern numbers, names, watch words, fire conditions, source links, and examples.
|
|
4
4
|
|
|
5
5
|
| Language | Reference | Rewrite-capable patterns | Score/audit-only viral-hook patterns |
|
|
6
6
|
|----------|-----------|--------------------------|--------------------------------------|
|
|
7
|
-
| Korean | [PATTERNS-KO.md](PATTERNS-KO.md) |
|
|
8
|
-
| English | [PATTERNS-EN.md](PATTERNS-EN.md) |
|
|
9
|
-
| Chinese | [PATTERNS-ZH.md](PATTERNS-ZH.md) |
|
|
10
|
-
| Japanese | [PATTERNS-JA.md](PATTERNS-JA.md) |
|
|
7
|
+
| Korean | [PATTERNS-KO.md](PATTERNS-KO.md) | 33 | 9 |
|
|
8
|
+
| English | [PATTERNS-EN.md](PATTERNS-EN.md) | 33 | 9 |
|
|
9
|
+
| Chinese | [PATTERNS-ZH.md](PATTERNS-ZH.md) | 33 | 9 |
|
|
10
|
+
| Japanese | [PATTERNS-JA.md](PATTERNS-JA.md) | 33 | 9 |
|
|
11
11
|
|
|
12
12
|
## Notes
|
|
13
13
|
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Docs Platform RFC
|
|
2
|
+
|
|
3
|
+
Status: investigation only; no docs-site implementation in this issue.
|
|
4
|
+
Sources checked: 2026-05-20 UTC.
|
|
5
|
+
|
|
6
|
+
Patina currently ships flat Markdown plus generated API/pattern reference pages. That remains the lowest-maintenance option for the next release. A docs site becomes worthwhile when search, localized navigation, or versioned release docs become more valuable than keeping every document directly browsable in GitHub.
|
|
7
|
+
|
|
8
|
+
## Options compared
|
|
9
|
+
|
|
10
|
+
| Option | i18n | Search | Versioning | Deployment effort | Notes |
|
|
11
|
+
|---|---|---|---|---|---|
|
|
12
|
+
| Docusaurus | Strong built-in locale workflow and translated docs routes | Official Algolia DocSearch integration; local search through community plugins | Built-in docs versioning | Medium: React app, sidebars, build pipeline, deploy to Pages/Vercel/Netlify | Best if Patina wants versioned docs and a conventional OSS docs portal. |
|
|
13
|
+
| Astro Starlight | Strong i18n routing for content collections | Built-in Pagefind search in the Starlight docs stack | No first-class versioned-docs workflow comparable to Docusaurus; can model versions as content sections | Medium-low: Astro app plus content migration | Best fit if Patina wants a lightweight multilingual docs site with modern content authoring and fewer React conventions. |
|
|
14
|
+
| MkDocs + Material | Theme-level language customization is built in; multi-language sites usually need plugin/workflow choices | Built-in client search through MkDocs/Material | Usually handled with `mike` or deploy aliases | Medium: Python toolchain plus theme/plugin choices | Mature Markdown docs stack with solid navigation. Good for Python-heavy teams; less natural for this Node-first repo. |
|
|
15
|
+
| GitHub Pages with current Markdown | No routed i18n beyond separate files | GitHub search only; no site search UX | Tags/branches only, no docs-version UX | Low: already works | Best current default. Keeps maintenance near zero but does not solve navigation/search. |
|
|
16
|
+
|
|
17
|
+
## Source links
|
|
18
|
+
|
|
19
|
+
- Docusaurus docs: https://docusaurus.io/docs
|
|
20
|
+
- Docusaurus i18n: https://docusaurus.io/docs/i18n/introduction
|
|
21
|
+
- Docusaurus versioning: https://docusaurus.io/docs/versioning
|
|
22
|
+
- Astro Starlight docs: https://starlight.astro.build/
|
|
23
|
+
- Starlight i18n: https://starlight.astro.build/guides/i18n/
|
|
24
|
+
- Starlight search: https://starlight.astro.build/guides/search/
|
|
25
|
+
- MkDocs: https://www.mkdocs.org/
|
|
26
|
+
- Material for MkDocs: https://squidfunk.github.io/mkdocs-material/
|
|
27
|
+
- Material search: https://squidfunk.github.io/mkdocs-material/setup/setting-up-site-search/
|
|
28
|
+
- `mike` MkDocs versioning: https://github.com/jimporter/mike
|
|
29
|
+
- GitHub Pages: https://docs.github.com/pages
|
|
30
|
+
|
|
31
|
+
## Recommendation
|
|
32
|
+
|
|
33
|
+
Do **not** migrate immediately. Keep GitHub-rendered Markdown until one of these triggers is true:
|
|
34
|
+
|
|
35
|
+
1. search becomes a recurring support problem;
|
|
36
|
+
2. localized docs need language-specific navigation rather than separate `README_*` files;
|
|
37
|
+
3. a release process requires versioned docs for multiple supported CLI versions.
|
|
38
|
+
|
|
39
|
+
When a site is justified, start with **Astro Starlight** unless versioned docs are the primary requirement. Starlight is the best fit for Patina's current content shape: Markdown-first, multilingual, and lightweight. Choose **Docusaurus** instead if versioning becomes mandatory before the migration starts.
|
|
40
|
+
|
|
41
|
+
## Effort estimate
|
|
42
|
+
|
|
43
|
+
| Phase | Estimate | Work |
|
|
44
|
+
|---|---:|---|
|
|
45
|
+
| RFC acceptance + URL plan | 0.5 day | Pick `/docs` vs custom domain, define canonical source of truth, decide whether README remains the landing page. |
|
|
46
|
+
| Minimal Starlight proof of concept | 1-2 days | Add app scaffold, import 8-12 key docs, configure i18n nav/search, deploy preview. |
|
|
47
|
+
| Full migration | 3-5 days | Move docs into site collections, preserve links/anchors, add redirects, wire CI build, update README links. |
|
|
48
|
+
| Versioned docs with Docusaurus | 4-7 days | Same as above plus sidebar/version policy and release tagging workflow. |
|
|
49
|
+
|
|
50
|
+
## Non-goals
|
|
51
|
+
|
|
52
|
+
- No docs-site dependency is added by this RFC.
|
|
53
|
+
- No README or existing Markdown page is moved.
|
|
54
|
+
- No custom domain, Pages workflow, or search index is created here.
|
package/docs/ROADMAP.md
CHANGED
|
@@ -13,12 +13,14 @@ This roadmap focuses on two things:
|
|
|
13
13
|
- Public scope: Korean, English, Chinese, Japanese AI-writing pattern rewriting
|
|
14
14
|
- Current benchmark layer:
|
|
15
15
|
- deterministic stylometry/lexicon benchmark: `npm run benchmark`
|
|
16
|
+
- adversarial MPS fixture gate: `npm run quality:adversarial-mps`
|
|
17
|
+
- 2026 rebaseline status: [`docs/research/2026-rebaseline.md`](research/2026-rebaseline.md)
|
|
16
18
|
- Current public calibration claim:
|
|
17
|
-
-
|
|
18
|
-
-
|
|
19
|
-
-
|
|
20
|
-
- Current distribution
|
|
21
|
-
- package
|
|
19
|
+
- 2026-05-22 modern-model catch: 67.3% [63.5-71.0%], n=600 across KO+EN × GPT/Claude/Gemini
|
|
20
|
+
- human-control false positives: 16.0% [11.6-21.7%], n=200 across KO+EN
|
|
21
|
+
- per-cell results: `docs/benchmarks/rebaseline-latest.md`
|
|
22
|
+
- Current distribution:
|
|
23
|
+
- npm package `patina-cli` is the public distribution channel; repo metadata currently targets `4.0.0` and should be verified with `npm run release:check` before publishing.
|
|
22
24
|
|
|
23
25
|
## 0. Positioning principles
|
|
24
26
|
|
|
@@ -58,14 +60,16 @@ Avoid overclaiming:
|
|
|
58
60
|
Goal: make claims easier to verify and harder to dismiss.
|
|
59
61
|
|
|
60
62
|
- Publish a short benchmark report generated from `tests/quality/results.json`.
|
|
61
|
-
-
|
|
63
|
+
- Keep ROC-AUC / PR-AUC and threshold sweep diagnostics current in the deterministic benchmark report.
|
|
62
64
|
- Split reports by language, class, and register.
|
|
63
65
|
- Add a visible warning that scores measure AI-likeness, not authorship.
|
|
64
66
|
- Link [`docs/research/ai-human-metrics.md`](research/ai-human-metrics.md) from README.
|
|
67
|
+
- Keep the adversarial MPS report current so high meaning preservation cannot hide unchanged AI-like style.
|
|
65
68
|
|
|
66
69
|
Acceptance criteria:
|
|
67
70
|
|
|
68
71
|
- `npm run benchmark` still passes.
|
|
72
|
+
- `npm run quality:adversarial-mps` still passes.
|
|
69
73
|
- Benchmark output includes current binary metrics plus ranked/threshold metrics.
|
|
70
74
|
- README claims are traceable to a specific benchmark report or spec section.
|
|
71
75
|
|
|
@@ -250,66 +254,42 @@ Do not lead with “bypass AI detectors.” Lead with:
|
|
|
250
254
|
|
|
251
255
|
## 5. Immediate next actions
|
|
252
256
|
|
|
253
|
-
Last triaged: 2026-05
|
|
257
|
+
Last triaged: 2026-06-05, after closing the public launch tracker and moving launch posting notes out of GitHub issues.
|
|
254
258
|
|
|
255
259
|
Current GitHub issue inventory:
|
|
256
260
|
|
|
257
|
-
-
|
|
258
|
-
- Open
|
|
259
|
-
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
-
|
|
265
|
-
-
|
|
266
|
-
-
|
|
267
|
-
-
|
|
268
|
-
- Launch
|
|
269
|
-
-
|
|
270
|
-
-
|
|
271
|
-
-
|
|
272
|
-
-
|
|
273
|
-
-
|
|
274
|
-
-
|
|
275
|
-
-
|
|
276
|
-
-
|
|
277
|
-
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
1. Add zh/ja AI-lexicon files for stylometry overlap detection. Issue: [#104](https://github.com/devswha/patina/issues/104).
|
|
294
|
-
2. Let CLI backends participate in standalone MAX. Issue: [#141](https://github.com/devswha/patina/issues/141).
|
|
295
|
-
3. Reduce MAX prompt weight by opting into a minimal default. Issue: [#143](https://github.com/devswha/patina/issues/143).
|
|
296
|
-
4. Add MAX pane-liveness watchdogs for dead tmux panes. Issue: [#144](https://github.com/devswha/patina/issues/144).
|
|
297
|
-
5. Make `patina auth login <backend>` launch the real login flow. Issue: [#186](https://github.com/devswha/patina/issues/186).
|
|
298
|
-
6. Add JSDoc public exports and publish generated API reference. Issue: [#191](https://github.com/devswha/patina/issues/191).
|
|
299
|
-
7. Tie this roadmap to a GitHub Project board or Milestones. Issue: [#195](https://github.com/devswha/patina/issues/195).
|
|
300
|
-
|
|
301
|
-
### Wave 2 — research calibration, parked unless explicitly scheduled
|
|
302
|
-
|
|
303
|
-
These are medium-priority but research-heavy. Keep them out of the critical path until release blockers and the core quality loop have stable owner time.
|
|
304
|
-
|
|
305
|
-
1. Re-baseline AI catch rate against 2025+ models. Issue: [#155](https://github.com/devswha/patina/issues/155).
|
|
306
|
-
2. Run the lexicon freshness audit with per-entry corpus provenance. Issue: [#160](https://github.com/devswha/patina/issues/160).
|
|
307
|
-
3. Establish quarterly pattern-freshness review with corpus refresh and emerging-pattern triage. Issue: [#165](https://github.com/devswha/patina/issues/165).
|
|
308
|
-
|
|
309
|
-
### Wave 3 — low-priority parked ecosystem and research
|
|
310
|
-
|
|
311
|
-
Do not start these until npm publication, action v1, and medium MAX/auth/docs work have a stable release path:
|
|
312
|
-
|
|
313
|
-
- False-positive and benchmark calibration: [#99](https://github.com/devswha/patina/issues/99), [#156](https://github.com/devswha/patina/issues/156), [#157](https://github.com/devswha/patina/issues/157), [#158](https://github.com/devswha/patina/issues/158), [#159](https://github.com/devswha/patina/issues/159), [#163](https://github.com/devswha/patina/issues/163).
|
|
314
|
-
- Documentation site exploration: [#199](https://github.com/devswha/patina/issues/199).
|
|
315
|
-
- Editor/platform integrations and distribution experiments: [#206](https://github.com/devswha/patina/issues/206), [#207](https://github.com/devswha/patina/issues/207), [#208](https://github.com/devswha/patina/issues/208), [#209](https://github.com/devswha/patina/issues/209), [#210](https://github.com/devswha/patina/issues/210), [#211](https://github.com/devswha/patina/issues/211), [#212](https://github.com/devswha/patina/issues/212).
|
|
261
|
+
- Open issues: 9.
|
|
262
|
+
- Open PRs: 0.
|
|
263
|
+
- Open priority split: 0 high, 1 medium, 8 low, and 0 without priority labels.
|
|
264
|
+
- No current high-priority issue.
|
|
265
|
+
|
|
266
|
+
Campaign state:
|
|
267
|
+
|
|
268
|
+
- Merged campaign PRs: #281, #287, #288, #289, #290, #292, #293.
|
|
269
|
+
- Concurrent cleanup merged during the final gate: #294 closed #291.
|
|
270
|
+
- Final review blocker cleanup: #295.
|
|
271
|
+
- Launch Wave 1 badge work: #297 closed #282; companion patina-action#1 added `badge-json` / `badge-branch`.
|
|
272
|
+
- Launch Wave 2 support work: #299 closed #285; the experimental share-card generator from #283 has since been removed from the CLI surface.
|
|
273
|
+
- Launch Wave 3 static playground work closes #208 and targets <https://patina.vibetip.help/> for the try-it-now URL.
|
|
274
|
+
- Launch execution prep: Korean-first channel drafts live in `docs/social/patina-launch-korean-first.md` and score 0.0%; `docs/social/patina-launch-copy.md` scores 6.3% after the KO diagnostic scoring update. Launch posting/deferral notes are maintainer-owned operational bookkeeping and should be tracked outside public GitHub issues.
|
|
275
|
+
- Rebaseline claim pass: `npm run benchmark:rebaseline:report` refreshes `docs/benchmarks/rebaseline-latest.{md,json}` from the #155 claim-ready sanitized manifest (800 rows, no raw text).
|
|
276
|
+
- KO/2025+ corpus prep: `docs/research/ko-2025-corpus-sources.md` records usable Korean sources; `artifacts/rebaseline-2025/intake.local.example.jsonl` provides the 25-row pilot skeleton; `artifacts/rebaseline-2025/sources.ko-public.jsonl` inventories public Korean web sources; and `artifacts/rebaseline-2025/human-controls.public.jsonl` tracks 250 scored, hash-only web human-control candidates (n=50 for each tracked register). `npm run benchmark:rebaseline:web` collects raw text into ignored private rows, and `npm run benchmark:rebaseline:score` refreshes deterministic outcome fields without copying raw text.
|
|
277
|
+
- KO register pilot: `npm run benchmark:register-pilot -- --write --basename register-stratified-latest` refreshes false positives by register without committing raw text; the expanded current pilot shows 42/250 predicted-hot human-control rows, split by register for threshold work.
|
|
278
|
+
- KO KatFish calibration: `npm run benchmark:katfish-ko -- --write --basename katfish-ko-latest` reports aggregate-only private KatFish metrics; current KO diagnostics raise the catch rate to 74.8%, up from 58.9% for Patina without KO diagnostics, while public-web human-control FP stays at 42/250.
|
|
279
|
+
- Launch feedback prep: the false-positive issue form now captures text origin, redistribution, fired paragraph, score output, and expected behavior.
|
|
280
|
+
- Growth nudge prep: the one-time CLI star reminder from #305 has since been removed to keep stderr operational-only.
|
|
281
|
+
- README demo prep: #306 adds the first-screen terminal GIF, try-it-now
|
|
282
|
+
playground link, translated README references, and re-recording notes; #308
|
|
283
|
+
localizes README hero GIF references so English uses `patina-demo-en.gif`,
|
|
284
|
+
Korean uses `patina-demo-ko.gif`, and ZH/JA intentionally fall back to EN.
|
|
285
|
+
- Closed or verified during the campaign: #99, #104, #155, #156, #157, #160, #303, #165, #186, #191, #199, #209, #210, #286, #304, #305, #306, #308.
|
|
286
|
+
- Kept open with explicit blocker comments or pending external action: #158, #159, #206, #207, #211, #212, #284, #307, #324.
|
|
287
|
+
- Legacy bot/harness notes were removed from the public repo; restart autonomous bot work only from a fresh, tracked design if it becomes necessary.
|
|
288
|
+
|
|
289
|
+
Next recommended order:
|
|
290
|
+
|
|
291
|
+
1. Prepare #307 awesome-list discovery submissions only as candidate copy/checklists; maintainer-owned external submissions should stay outside automated repo changes.
|
|
292
|
+
2. Re-implement #324 only when live credentialed quality checks are worth the larger local-runner investment.
|
|
293
|
+
3. Treat low-priority research/ecosystem items (#158, #159, #206, #207, #211, #212, #284) as parked until evaluator budget, reviewer pool, redistributable corpus, external repo, hosting, or governance prerequisites exist; keep new campaign PRs short-lived.
|
|
294
|
+
|
|
295
|
+
Detailed wave grouping lives in `docs/ISSUE-WAVES.md` so launch/growth, completed profile work, evaluation-gated research, and parked ecosystem work can move independently without re-triage.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Korean translationese (번역투 / calque) detection
|
|
2
|
+
|
|
3
|
+
`src/features/translationese.js` adds a deterministic, auditable signal for
|
|
4
|
+
Korean **calques** — grammatical Korean that reads as translated-from-English.
|
|
5
|
+
The stylometry + AI-lexicon signals catch *structure* and *AI vocabulary*; they
|
|
6
|
+
do **not** catch lexical translationese (e.g. "커맨드 기둥" for "command pillars",
|
|
7
|
+
"~에 의해" passives, "당신" for "you"). This catalog fills that gap.
|
|
8
|
+
|
|
9
|
+
## Design notes
|
|
10
|
+
|
|
11
|
+
- **Precision first.** Most of these constructions also appear in good native
|
|
12
|
+
Korean (formal/technical prose especially). So the signal is **density-gated**:
|
|
13
|
+
it only reports `hot` when both an absolute floor (≥4 hits) **and** a
|
|
14
|
+
per-prose-sentence density (≥0.5) are met. A single "~에 의해" on its own is not treated as evidence.
|
|
15
|
+
- **Advisory, not a verdict.** `analyzeText` surfaces it as `translationese`
|
|
16
|
+
but deliberately keeps it **out of the document `hot` decision**, so it cannot
|
|
17
|
+
regress benchmark false positives. The SKILL / callers act on it (e.g. the
|
|
18
|
+
rewrite loop), and the audit surface can display it.
|
|
19
|
+
- **ko-only** for now (calques are language-specific).
|
|
20
|
+
- Each rule ships a `before → after` example (enforced by a unit test).
|
|
21
|
+
|
|
22
|
+
## Catalog (before → after)
|
|
23
|
+
|
|
24
|
+
| id | tell | before | after |
|
|
25
|
+
| --- | --- | --- | --- |
|
|
26
|
+
| `noun-calque` | 직역 명사구 (pillar/layer 류) | 세 가지 **커맨드 기둥**을 설치합니다. | 핵심 커맨드 세 가지를 설치합니다. |
|
|
27
|
+
| `dummy-subject` | 가주어 "그것은/이것은" (it is) | **그것은** 매우 중요하다. | 매우 중요하다. |
|
|
28
|
+
| `direct-address-you` | "당신" 직접 호칭 (you) | **당신은** 이것을 설정할 수 있습니다. | 이건 설정할 수 있다. |
|
|
29
|
+
| `passive-e-uihae` | "~에 의해" 피동 (by-passive) | 작업은 에이전트**에 의해** 처리됩니다. | 에이전트가 작업을 처리합니다. |
|
|
30
|
+
| `have-overuse` | "~을 가지고 있다" (have) | 이 도구는 유연성을 **가지고 있습니다**. | 이 도구는 유연합니다. |
|
|
31
|
+
| `one-of` | "~중 하나" (one of the) | 가장 빠른 도구 **중 하나입니다**. | 손꼽히게 빠릅니다. |
|
|
32
|
+
| `provides` | "~을 제공합니다" (provides) | 다양한 기능을 **제공합니다**. | 여러 기능을 쓸 수 있다. |
|
|
33
|
+
| `as-follows` | "다음과 같습니다" (as follows) | 사용법은 **다음과 같습니다**. | 사용법은 이렇다. |
|
|
34
|
+
| `make-easy` | "~하게 만들어 준다" (make it ~) | 설치를 쉽게 **만들어 줍니다**. | 설치가 쉬워진다. |
|
|
35
|
+
|
|
36
|
+
## Output shape
|
|
37
|
+
|
|
38
|
+
```js
|
|
39
|
+
analyzeText(text, { lang: 'ko' }).translationese
|
|
40
|
+
// → { count, density, sentences, byRule:[{id,label,strong,count,example}],
|
|
41
|
+
// hits:[...], hot, thresholds:{count,density} }
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Limitations / next
|
|
45
|
+
|
|
46
|
+
- Calques are **lexical**; the structural stylometric classifier cannot learn
|
|
47
|
+
them (proven separately). This is a rule catalog, like patina's pattern packs.
|
|
48
|
+
- The catalog is intentionally small and conservative; expand with corpus
|
|
49
|
+
evidence and keep the density gate to protect precision.
|
|
50
|
+
- Not wired into `hot` yet — promote only after validating no FP regression on
|
|
51
|
+
the benchmark / a diverse human ko control set.
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# patina 저장소 심층 분석 보고서
|
|
2
2
|
|
|
3
|
+
> **Archive note:** 이 보고서는 MAX mode, hosted backend, `--card`, inline `--api-key`, `--save-run`, response cache 제거 전 스냅샷이다. 아래 CLI surface 관찰은 당시 기준이며 현재 도움말/문서의 source of truth가 아니다.
|
|
4
|
+
|
|
3
5
|
## 경영진 요약
|
|
4
6
|
|
|
5
7
|
`devswha/patina`는 방향이 분명한 저장소다. 패턴 기반으로 AI 글쓰기 흔적을 탐지하고, 감사 가능한 방식으로 다시 쓰기를 수행하며, 한국어·영어·중국어·일본어에 대해 총 126개 패턴을 운영한다. 공개 문서와 코드 기준으로 보면, 이 도구는 standalone Node CLI이자 Anthropic Claude Code·OpenAI Codex CLI 계열 워크플로에 연결되는 스킬형 도구로 설계되어 있고, rewrite·audit·score·diff·ouroboros·MAX 실행 경로를 이미 갖추고 있다. 또한 backend와 provider를 분리했고, 패턴 팩·프로필·voice·scoring 문서를 프롬프트에 명시적으로 주입하는 구조라 추적 가능성도 좋다. citeturn18view0turn18view1turn26view6turn15view0turn14view2turn23view13
|
|
@@ -51,7 +53,7 @@ flowchart LR
|
|
|
51
53
|
| combined score | `combinedScore`는 `ai-likeness`와 `100-fidelity`를 가중합으로 계산하며 profile별 weight도 config에 있다. | 계산해 놓고도 현재 제어 로직에서 핵심 decision variable로 쓰이지 않는다. design intent와 runtime control이 어긋나 있다. citeturn13view4turn23view10turn27view5 |
|
|
52
54
|
| MAX orchestration | MAX는 각 후보를 생성한 뒤 `scoreText`와 `scoreMPS`를 모두 `models[0]`으로 재채점하고, MPS 원문은 `prompt.split('## Input Text')[1]`로 뽑는다. | 평가 편향과 파싱 취약성이 동시에 존재한다. 생성 모델이 여러 개인데 평가기가 사실상 하나이며, source text 전달도 brittle하다. citeturn28view1turn28view3 |
|
|
53
55
|
| Ouroboros | Ouroboros는 target score, plateau, fidelity floor, MPS floor, rollback을 갖고 있지만, 중단 판단은 `currentScore`, `delta`, floor 위주다. `combined`는 로그에만 남는다. | iterative refinement를 하면서도 실제 결정은 단순 AI-likeness 중심이라, 의미 보존과 자연스러움 사이 균형을 충분히 활용하지 못한다. citeturn14view8turn27view0turn27view3turn27view5 |
|
|
54
|
-
| multilingual | 패턴 카탈로그는 4개 언어를 지원하지만, README의 stylometric/AI-lexicon 설명은 EN
|
|
56
|
+
| multilingual | 패턴 카탈로그는 4개 언어를 지원하지만, README의 stylometric/AI-lexicon 설명은 EN 88개, KO 102개, ZH/JA 각 60개를 명시한다. | zh/ja에 대해 pattern 지원은 있어도 stylometry/lexicon calibration은 약할 가능성이 높다. 특히 paraphrase·translation·summarization처럼 semantic-invariant 과제는 탐지가 더 어렵다는 외부 연구와도 맞물린다. citeturn18view0turn18view4turn33view1 |
|
|
55
57
|
| testing/CI | `npm test` 엔트리포인트는 현재 `node --test tests/e2e/*.test.js`만 가리킨다. | e2e 중심 smoke는 가능해도, 점수식·파서·패턴 스키마·Unicode edge case를 잡는 unit/property/quality benchmark 층이 약하다. citeturn44view0turn44view1 |
|
|
56
58
|
| security | 공개 help는 `--api-key <key>`와 여러 provider 환경변수를 직접 안내한다. | 명령행 인자는 shell history나 process list에 노출될 수 있어 비밀정보 전달 수단으로 바람직하지 않다. GitHub 문서와 CWE도 command line 전달을 피하라고 권고한다. citeturn45view0turn45view1turn41search1turn41search3 |
|
|
57
59
|
| reproducibility/UX | 현재 help에는 `--json`, `--save-run`, prompt hash, manifest, config hash 같은 실험 기록 옵션이 보이지 않는다. | README가 score variance를 인정하고 있어도 사용자가 어떤 model/provider/pattern set으로 결과가 나왔는지 재현하기 어렵다. 또한 MAX의 model naming과 provider naming 체계가 분리돼 초심자에게 혼동을 줄 수 있다. citeturn18view3turn23view13turn45view0turn45view1 |
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Benchmark reports
|
|
2
|
+
|
|
3
|
+
This directory stores checked-in benchmark summaries. They are useful for
|
|
4
|
+
regression review, release notes, and public claims only when the matching gate
|
|
5
|
+
says the evidence is ready.
|
|
6
|
+
|
|
7
|
+
## Files
|
|
8
|
+
|
|
9
|
+
| File | Source command | Use |
|
|
10
|
+
|---|---|---|
|
|
11
|
+
| `latest.md` / `latest.json` | `npm run benchmark:report` | Deterministic suspect-zone fixture benchmark for KO / EN / ZH / JA, including `signal_score` ROC-AUC / PR-AUC diagnostics. |
|
|
12
|
+
| `detector-comparison.md` / `.json` | `npm run benchmark:compare` | Manual/offline comparison protocol for third-party detectors. |
|
|
13
|
+
| `rebaseline-latest.md` / `.json` | `npm run benchmark:rebaseline:report` | #155 claim-ready 2026 modern-model rebaseline summary (800 hash-only rows; KO+EN × GPT/Claude/Gemini plus human controls). |
|
|
14
|
+
| `katfish-ko-latest.md` / `.json` | `npm run benchmark:katfish-ko -- --write --basename katfish-ko-latest` | Aggregate-only private KatFish calibration for the Korean diagnostic layer; raw rows stay ignored/private. |
|
|
15
|
+
|
|
16
|
+
## Refresh
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
npm run benchmark:report
|
|
20
|
+
npm run benchmark:compare
|
|
21
|
+
npm run benchmark:rebaseline:report
|
|
22
|
+
npm run benchmark:katfish-ko -- --write --basename katfish-ko-latest
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Use `npm run benchmark` for the fast fixture classifier smoke check. Use
|
|
26
|
+
`npm run quality:live` only when you want the opt-in rewrite-quality scaffold;
|
|
27
|
+
by default it does not call a model.
|
|
28
|
+
|
|
29
|
+
## Public-claim rule
|
|
30
|
+
|
|
31
|
+
Do not copy numbers into README, launch copy, or social posts unless the report
|
|
32
|
+
itself contains the required evidence. The rebaseline report must stay
|
|
33
|
+
`BLOCKED` until it has scored outcome rows, n≥100 per claim cell, at least two
|
|
34
|
+
languages, at least three generator families, and confidence intervals.
|
|
35
|
+
|
|
36
|
+
The `latest` report's ranking diagnostics are regression evidence for the
|
|
37
|
+
checked-in fixtures only. They help compare thresholds and signal changes, but
|
|
38
|
+
they are not a general claim that patina detects authorship or current model
|
|
39
|
+
families.
|
|
40
|
+
|
|
41
|
+
## False-positive loop
|
|
42
|
+
|
|
43
|
+
If a person-written paragraph is flagged too aggressively, collect it through
|
|
44
|
+
the false-positive form:
|
|
45
|
+
|
|
46
|
+
<https://github.com/devswha/patina/issues/new?template=false_positive.yml>
|
|
47
|
+
|
|
48
|
+
A useful report includes the exact paragraph that fired, language/register,
|
|
49
|
+
score output, and whether the sample can become a public fixture. Private or
|
|
50
|
+
vendor-copied text should stay out of the repository; use metadata and hashes
|
|
51
|
+
instead.
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"reportVersion": 1,
|
|
3
|
-
"generatedAt": "2026-05-
|
|
4
|
-
"fixtureCount":
|
|
5
|
-
"benchmarkGeneratedAt": "2026-05-
|
|
6
|
-
"note": "Offline comparison
|
|
3
|
+
"generatedAt": "2026-05-21T15:02:19.159Z",
|
|
4
|
+
"fixtureCount": 39,
|
|
5
|
+
"benchmarkGeneratedAt": "2026-05-21T15:02:19.140Z",
|
|
6
|
+
"note": "Offline comparison protocol. Built-in Patina row uses deterministic suspect-zone analyzer; third-party rows are manual opt-in only.",
|
|
7
7
|
"manualInput": null,
|
|
8
8
|
"detectors": [
|
|
9
9
|
{
|
|
@@ -11,17 +11,17 @@
|
|
|
11
11
|
"name": "Patina deterministic suspect-zone analyzer",
|
|
12
12
|
"kind": "in-tree",
|
|
13
13
|
"mode": "offline",
|
|
14
|
-
"threshold": "burstiness low OR MATTR low OR lexicon density > threshold"
|
|
14
|
+
"threshold": "burstiness low OR MATTR low OR lexicon density > threshold with min hits OR koDiagnostics hot"
|
|
15
15
|
}
|
|
16
16
|
],
|
|
17
17
|
"summaries": {
|
|
18
18
|
"patina-deterministic": {
|
|
19
|
-
"tp":
|
|
19
|
+
"tp": 21,
|
|
20
20
|
"fp": 0,
|
|
21
21
|
"fn": 0,
|
|
22
|
-
"tn":
|
|
23
|
-
"total":
|
|
24
|
-
"fixtureCount":
|
|
22
|
+
"tn": 18,
|
|
23
|
+
"total": 39,
|
|
24
|
+
"fixtureCount": 39,
|
|
25
25
|
"coverage": 1,
|
|
26
26
|
"accuracy": 1,
|
|
27
27
|
"precision": 1,
|
|
@@ -198,6 +198,18 @@
|
|
|
198
198
|
"source": "tests/quality/benchmark.mjs",
|
|
199
199
|
"notes": null
|
|
200
200
|
},
|
|
201
|
+
{
|
|
202
|
+
"fixture_id": "ja-ai-04-lexicon",
|
|
203
|
+
"lang": "ja",
|
|
204
|
+
"class": "ai",
|
|
205
|
+
"detector": "patina-deterministic",
|
|
206
|
+
"expected_hot": true,
|
|
207
|
+
"predicted_hot": true,
|
|
208
|
+
"correct": true,
|
|
209
|
+
"score": 1,
|
|
210
|
+
"source": "tests/quality/benchmark.mjs",
|
|
211
|
+
"notes": null
|
|
212
|
+
},
|
|
201
213
|
{
|
|
202
214
|
"fixture_id": "ja-nat-01",
|
|
203
215
|
"lang": "ja",
|
|
@@ -234,6 +246,18 @@
|
|
|
234
246
|
"source": "tests/quality/benchmark.mjs",
|
|
235
247
|
"notes": null
|
|
236
248
|
},
|
|
249
|
+
{
|
|
250
|
+
"fixture_id": "ja-nat-04-lexicon-cold",
|
|
251
|
+
"lang": "ja",
|
|
252
|
+
"class": "natural",
|
|
253
|
+
"detector": "patina-deterministic",
|
|
254
|
+
"expected_hot": false,
|
|
255
|
+
"predicted_hot": false,
|
|
256
|
+
"correct": true,
|
|
257
|
+
"score": 0,
|
|
258
|
+
"source": "tests/quality/benchmark.mjs",
|
|
259
|
+
"notes": null
|
|
260
|
+
},
|
|
237
261
|
{
|
|
238
262
|
"fixture_id": "ko-ai-01",
|
|
239
263
|
"lang": "ko",
|
|
@@ -306,6 +330,18 @@
|
|
|
306
330
|
"source": "tests/quality/benchmark.mjs",
|
|
307
331
|
"notes": null
|
|
308
332
|
},
|
|
333
|
+
{
|
|
334
|
+
"fixture_id": "ko-ai-07-ko-diagnostic",
|
|
335
|
+
"lang": "ko",
|
|
336
|
+
"class": "ai",
|
|
337
|
+
"detector": "patina-deterministic",
|
|
338
|
+
"expected_hot": true,
|
|
339
|
+
"predicted_hot": true,
|
|
340
|
+
"correct": true,
|
|
341
|
+
"score": 1,
|
|
342
|
+
"source": "tests/quality/benchmark.mjs",
|
|
343
|
+
"notes": null
|
|
344
|
+
},
|
|
309
345
|
{
|
|
310
346
|
"fixture_id": "ko-nat-01",
|
|
311
347
|
"lang": "ko",
|
|
@@ -402,6 +438,18 @@
|
|
|
402
438
|
"source": "tests/quality/benchmark.mjs",
|
|
403
439
|
"notes": null
|
|
404
440
|
},
|
|
441
|
+
{
|
|
442
|
+
"fixture_id": "zh-ai-04-lexicon",
|
|
443
|
+
"lang": "zh",
|
|
444
|
+
"class": "ai",
|
|
445
|
+
"detector": "patina-deterministic",
|
|
446
|
+
"expected_hot": true,
|
|
447
|
+
"predicted_hot": true,
|
|
448
|
+
"correct": true,
|
|
449
|
+
"score": 1,
|
|
450
|
+
"source": "tests/quality/benchmark.mjs",
|
|
451
|
+
"notes": null
|
|
452
|
+
},
|
|
405
453
|
{
|
|
406
454
|
"fixture_id": "zh-nat-01",
|
|
407
455
|
"lang": "zh",
|
|
@@ -437,6 +485,18 @@
|
|
|
437
485
|
"score": 0,
|
|
438
486
|
"source": "tests/quality/benchmark.mjs",
|
|
439
487
|
"notes": null
|
|
488
|
+
},
|
|
489
|
+
{
|
|
490
|
+
"fixture_id": "zh-nat-04-lexicon-cold",
|
|
491
|
+
"lang": "zh",
|
|
492
|
+
"class": "natural",
|
|
493
|
+
"detector": "patina-deterministic",
|
|
494
|
+
"expected_hot": false,
|
|
495
|
+
"predicted_hot": false,
|
|
496
|
+
"correct": true,
|
|
497
|
+
"score": 0,
|
|
498
|
+
"source": "tests/quality/benchmark.mjs",
|
|
499
|
+
"notes": null
|
|
440
500
|
}
|
|
441
501
|
]
|
|
442
502
|
}
|